diff --git a/.github/workflows/cppcheck.yml b/.github/workflows/cppcheck.yml
new file mode 100644
index 00000000..e4a8d9af
--- /dev/null
+++ b/.github/workflows/cppcheck.yml
@@ -0,0 +1,39 @@
+name: Static Analysis
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+jobs:
+  cppcheck:
+    name: Run Cppcheck
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          lfs: true
+
+      - name: Install cppcheck
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cppcheck
+
+      - name: Run cppcheck
+        run: |
+          mkdir -p cppcheck-report
+          cppcheck --enable=all --inconclusive --quiet \
+            --output-file=cppcheck-report/cppcheck.txt \
+            $GITHUB_WORKSPACE/framework/src/ \
+            -I $GITHUB_WORKSPACE/include/ \
+            -I $GITHUB_WORKSPACE/framework/include/
+          cat cppcheck-report/cppcheck.txt
+
+      - name: Upload cppcheck report artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: cppcheck-report
+          path: cppcheck-report/cppcheck.txt
diff --git a/MODULE.bazel b/MODULE.bazel
index 63282ba8..0fec2bc3 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -14,6 +14,17 @@ bazel_dep(name = "rules_python", version = "0.37.2")
 bazel_dep(name = "platforms", version = "0.0.10")
 bazel_dep(name = "googletest", version = "1.15.2")
 bazel_dep(name = "apple_support", version = "1.17.1", repo_name = "build_bazel_apple_support")
+bazel_dep(name = "curl", version = "8.8.0")
+bazel_dep(name = "nlohmann_json", version = "3.11.3")
+bazel_dep(name = "hedron_compile_commands", dev_dependency = True)
+bazel_dep(name = "flatbuffers", version = "24.3.25")
+
+# Hedron's Compile Commands Extractor for Bazel
+git_override(
+    module_name = "hedron_compile_commands",
+    remote = "https://github.com/hedronvision/bazel-compile-commands-extractor.git",
+    commit = "4f28899228fb3ad0126897876f147ca15026151e",
+)
 
 # Use archive_override to patch rules_foreign_cc to default to specific cmake version
 archive_override(
diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock
index 52f43e79..f210eeb0 100644
--- a/MODULE.bazel.lock
+++ b/MODULE.bazel.lock
@@ -4,6 +4,7 @@
   "https://bcr.bazel.build/bazel_registry.json": "8a28e4aff06ee60aed2a8c281907fb8bcbf3b753c91fb5a5c57da3215d5b3497",
   "https://bcr.bazel.build/modules/abseil-cpp/20210324.2/MODULE.bazel": "7cd0312e064fde87c8d1cd79ba06c876bd23630c83466e9500321be55c96ace2",
   "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/MODULE.bazel": "70390338f7a5106231d20620712f7cccb659cd0e9d073d1991c038eb9fc57589",
+  "https://bcr.bazel.build/modules/abseil-cpp/20220623.1/MODULE.bazel": "73ae41b6818d423a11fd79d95aedef1258f304448193d4db4ff90e5e7a0f076c",
   "https://bcr.bazel.build/modules/abseil-cpp/20230125.1/MODULE.bazel": "89047429cb0207707b2dface14ba7f8df85273d484c2572755be4bab7ce9c3a0",
   "https://bcr.bazel.build/modules/abseil-cpp/20230802.0.bcr.1/MODULE.bazel": "1c8cec495288dccd14fdae6e3f95f772c1c91857047a098fad772034264cc8cb",
   "https://bcr.bazel.build/modules/abseil-cpp/20230802.0/MODULE.bazel": "d253ae36a8bd9ee3c5955384096ccb6baf16a1b1e93e858370da0a3b94f77c16",
@@ -12,7 +13,21 @@
   "https://bcr.bazel.build/modules/apple_support/1.15.1/MODULE.bazel": "a0556fefca0b1bb2de8567b8827518f94db6a6e7e7d632b4c48dc5f865bc7c85",
   "https://bcr.bazel.build/modules/apple_support/1.17.1/MODULE.bazel": "655c922ab1209978a94ef6ca7d9d43e940cd97d9c172fb55f94d91ac53f8610b",
   "https://bcr.bazel.build/modules/apple_support/1.17.1/source.json": "6b2b8c74d14e8d485528a938e44bdb72a5ba17632b9e14ef6e68a5ee96c8347f",
+  "https://bcr.bazel.build/modules/apple_support/1.3.1/MODULE.bazel":
"6d04819e9f8775a6eabe3c232585454d5393c6c4600029d063566a4f2326a600", "https://bcr.bazel.build/modules/apple_support/1.5.0/MODULE.bazel": "50341a62efbc483e8a2a6aec30994a58749bd7b885e18dd96aa8c33031e558ef", + "https://bcr.bazel.build/modules/aspect_bazel_lib/1.29.2/MODULE.bazel": "3ca4ed580f4d7e7e47c0b4f2e4799e8c895a2e59e7fab922078cdd5fb631095b", + "https://bcr.bazel.build/modules/aspect_bazel_lib/1.31.2/MODULE.bazel": "7bee702b4862612f29333590f4b658a5832d433d6f8e4395f090e8f4e85d442f", + "https://bcr.bazel.build/modules/aspect_bazel_lib/1.39.0/MODULE.bazel": "4b9135560d1b9f9520b85739da72de105fc919346c83c874ebf0789794075340", + "https://bcr.bazel.build/modules/aspect_bazel_lib/1.40.0/MODULE.bazel": "eac4cf71482009e142804f72b2b102fb7e9812c326702d8b7206385b24f7805f", + "https://bcr.bazel.build/modules/aspect_bazel_lib/1.40.0/source.json": "035a1023f17bde54c2a695158e473ab23619c659099e69264dd9d52a9475b7b1", + "https://bcr.bazel.build/modules/aspect_rules_esbuild/0.15.0/MODULE.bazel": "35508f042286d2074b080df9d88b5faa2b97f98558b68101a39be8e5ab2837b1", + "https://bcr.bazel.build/modules/aspect_rules_esbuild/0.15.0/source.json": "be3ba638076fdb3f369b86b399990500f45c1c8251526a72d882d12d13d81ae5", + "https://bcr.bazel.build/modules/aspect_rules_js/1.29.2/MODULE.bazel": "1b8f06192f8372e33139e1f6aa97e1d56295eb5134b288d70a750d4d13a37736", + "https://bcr.bazel.build/modules/aspect_rules_js/1.34.1/MODULE.bazel": "d86fd8dcc3e09d17df383e994b3adf87ed645b065c604eb4560404603c46fa8d", + "https://bcr.bazel.build/modules/aspect_rules_js/1.34.1/source.json": "ecb32e41d9a1e1dc84525141bbfdc7155b8b71048b0a4e3f2740ed31febda874", + "https://bcr.bazel.build/modules/aspect_rules_ts/1.4.5/MODULE.bazel": "9e6520f1aa823e7f707968124e1bbe87598ec5495df3162d0749fa19a29973bb", + "https://bcr.bazel.build/modules/aspect_rules_ts/1.4.5/source.json": "40b03d827dd656b775318fe205a54481219d4729a67bf8293e6706a8a41ab2cd", + "https://bcr.bazel.build/modules/bazel_features/0.1.0/MODULE.bazel": "47011d645b0f949f42ee67f2e8775188a9cf4a0a1528aa2fa4952f2fd00906fd", "https://bcr.bazel.build/modules/bazel_features/1.1.1/MODULE.bazel": "27b8c79ef57efe08efccbd9dd6ef70d61b4798320b8d3c134fd571f78963dbcd", "https://bcr.bazel.build/modules/bazel_features/1.10.0/MODULE.bazel": "f75e8807570484a99be90abcd52b5e1f390362c258bcb73106f4544957a48101", "https://bcr.bazel.build/modules/bazel_features/1.11.0/MODULE.bazel": "f9382337dd5a474c3b7d334c2f83e50b6eaedc284253334cf823044a26de03e8", @@ -20,6 +35,7 @@ "https://bcr.bazel.build/modules/bazel_features/1.19.0/source.json": "d7bf14517c1b25b9d9c580b0f8795fceeae08a7590f507b76aace528e941375d", "https://bcr.bazel.build/modules/bazel_features/1.9.1/MODULE.bazel": "8f679097876a9b609ad1f60249c49d68bfab783dd9be012faf9d82547b14815a", "https://bcr.bazel.build/modules/bazel_skylib/1.0.3/MODULE.bazel": "bcb0fd896384802d1ad283b4e4eb4d718eebd8cb820b0a2c3a347fb971afd9d8", + "https://bcr.bazel.build/modules/bazel_skylib/1.1.1/MODULE.bazel": "1add3e7d93ff2e6998f9e118022c84d163917d912f5afafb3058e3d2f1545b5e", "https://bcr.bazel.build/modules/bazel_skylib/1.2.0/MODULE.bazel": "44fe84260e454ed94ad326352a698422dbe372b21a1ac9f3eab76eb531223686", "https://bcr.bazel.build/modules/bazel_skylib/1.2.1/MODULE.bazel": "f35baf9da0efe45fa3da1696ae906eea3d615ad41e2e3def4aeb4e8bc0ef9a7a", "https://bcr.bazel.build/modules/bazel_skylib/1.3.0/MODULE.bazel": "20228b92868bf5cfc41bda7afc8a8ba2a543201851de39d990ec957b513579c5", @@ -29,15 +45,33 @@ "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/MODULE.bazel": 
"8fdee2dbaace6c252131c00e1de4b165dc65af02ea278476187765e1a617b917", "https://bcr.bazel.build/modules/bazel_skylib/1.7.1/MODULE.bazel": "3120d80c5861aa616222ec015332e5f8d3171e062e3e804a2a0253e1be26e59b", "https://bcr.bazel.build/modules/bazel_skylib/1.7.1/source.json": "f121b43eeefc7c29efbd51b83d08631e2347297c95aac9764a701f2a6a2bb953", + "https://bcr.bazel.build/modules/boringssl/0.0.0-20211025-d4f1ab9/MODULE.bazel": "6ee6353f8b1a701fe2178e1d925034294971350b6d3ac37e67e5a7d463267834", + "https://bcr.bazel.build/modules/boringssl/0.0.0-20211025-d4f1ab9/source.json": "323bafff99739f6aba35b69a84f0bc04ddb4540a46c1694355f60f073dff3001", "https://bcr.bazel.build/modules/buildozer/7.1.2/MODULE.bazel": "2e8dd40ede9c454042645fd8d8d0cd1527966aa5c919de86661e62953cd73d84", "https://bcr.bazel.build/modules/buildozer/7.1.2/source.json": "c9028a501d2db85793a6996205c8de120944f50a0d570438fcae0457a5f9d1f8", + "https://bcr.bazel.build/modules/c-ares/1.15.0/MODULE.bazel": "ba0a78360fdc83f02f437a9e7df0532ad1fbaa59b722f6e715c11effebaa0166", + "https://bcr.bazel.build/modules/c-ares/1.15.0/source.json": "5e3ed991616c5ec4cc09b0893b29a19232de4a1830eb78c567121bfea87453f7", + "https://bcr.bazel.build/modules/curl/8.8.0/MODULE.bazel": "7da3b3e79b0b4ee8f8c95d640bc6ad7b430ce66ef6e9c9d2bc29b3b5ef85f6fe", + "https://bcr.bazel.build/modules/curl/8.8.0/source.json": "d7d138b6878cf38891692fee0649ace35357fd549b425614d571786f054374d4", + "https://bcr.bazel.build/modules/flatbuffers/24.3.25/MODULE.bazel": "2794b084ee385ecd08a22fd90614b93851508ceb7a97e63da399886dedbc696c", + "https://bcr.bazel.build/modules/flatbuffers/24.3.25/source.json": "0cea4d62612a34154ffe0208a85f9f197edbb1f8f37a8855ec4aa722fea69276", + "https://bcr.bazel.build/modules/gazelle/0.26.0/MODULE.bazel": "6bf5f61b15648e7e35db25fb23cef6b4164fc71c3064ac42ecacafcb6d02abe6", + "https://bcr.bazel.build/modules/gazelle/0.32.0/MODULE.bazel": "b499f58a5d0d3537f3cf5b76d8ada18242f64ec474d8391247438bf04f58c7b8", + "https://bcr.bazel.build/modules/gazelle/0.32.0/source.json": "ef7e2d5194a004d902f5a745eb8f466c90b63a539e9d59311197b87e4d1caee7", "https://bcr.bazel.build/modules/google_benchmark/1.8.2/MODULE.bazel": "a70cf1bba851000ba93b58ae2f6d76490a9feb74192e57ab8e8ff13c34ec50cb", "https://bcr.bazel.build/modules/googletest/1.11.0/MODULE.bazel": "3a83f095183f66345ca86aa13c58b59f9f94a2f81999c093d4eeaa2d262d12f4", "https://bcr.bazel.build/modules/googletest/1.14.0.bcr.1/MODULE.bazel": "22c31a561553727960057361aa33bf20fb2e98584bc4fec007906e27053f80c6", "https://bcr.bazel.build/modules/googletest/1.14.0/MODULE.bazel": "cfbcbf3e6eac06ef9d85900f64424708cc08687d1b527f0ef65aa7517af8118f", "https://bcr.bazel.build/modules/googletest/1.15.2/MODULE.bazel": "6de1edc1d26cafb0ea1a6ab3f4d4192d91a312fd2d360b63adaa213cd00b2108", "https://bcr.bazel.build/modules/googletest/1.15.2/source.json": "dbdda654dcb3a0d7a8bc5d0ac5fc7e150b58c2a986025ae5bc634bb2cb61f470", + "https://bcr.bazel.build/modules/grpc/1.41.0/MODULE.bazel": "5bcbfc2b274dabea628f0649dc50c90cf36543b1cfc31624832538644ad1aae8", + "https://bcr.bazel.build/modules/grpc/1.48.1/MODULE.bazel": "3ca31ff176210449f280cb7765b59f3c6497abe10fa6f888de7b7bf00de53176", + "https://bcr.bazel.build/modules/grpc/1.48.1/source.json": "fb95df9c53c0a004f6681fa0e4a87d7b8c85c2182a73ada28c06339dbee78e42", "https://bcr.bazel.build/modules/libpfm/4.11.0/MODULE.bazel": "45061ff025b301940f1e30d2c16bea596c25b176c8b6b3087e92615adbd52902", + "https://bcr.bazel.build/modules/mbedtls/3.6.0/MODULE.bazel": 
"8e380e4698107c5f8766264d4df92e36766248447858db28187151d884995a09", + "https://bcr.bazel.build/modules/mbedtls/3.6.0/source.json": "1dbe7eb5258050afcc3806b9d43050f71c6f539ce0175535c670df606790b30c", + "https://bcr.bazel.build/modules/nlohmann_json/3.11.3/MODULE.bazel": "87023db2f55fc3a9949c7b08dc711fae4d4be339a80a99d04453c4bb3998eefc", + "https://bcr.bazel.build/modules/nlohmann_json/3.11.3/source.json": "296c63a90c6813e53b3812d24245711981fc7e563d98fe15625f55181494488a", "https://bcr.bazel.build/modules/platforms/0.0.10/MODULE.bazel": "8cb8efaf200bdeb2150d93e162c40f388529a25852b332cec879373771e48ed5", "https://bcr.bazel.build/modules/platforms/0.0.10/source.json": "f22828ff4cf021a6b577f1bf6341cb9dcd7965092a439f64fc1bb3b7a5ae4bd5", "https://bcr.bazel.build/modules/platforms/0.0.4/MODULE.bazel": "9b328e31ee156f53f3c416a64f8491f7eb731742655a47c9eec4703a71644aee", @@ -51,10 +85,12 @@ "https://bcr.bazel.build/modules/protobuf/24.4/MODULE.bazel": "7bc7ce5f2abf36b3b7b7c8218d3acdebb9426aeb35c2257c96445756f970eb12", "https://bcr.bazel.build/modules/protobuf/24.4/source.json": "ace4b8c65d4cfe64efe544f09fc5e5df77faf3a67fbb29c5341e0d755d9b15d6", "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0", + "https://bcr.bazel.build/modules/protobuf/3.19.2/MODULE.bazel": "532ffe5f2186b69fdde039efe6df13ba726ff338c6bc82275ad433013fa10573", "https://bcr.bazel.build/modules/protobuf/3.19.6/MODULE.bazel": "9233edc5e1f2ee276a60de3eaa47ac4132302ef9643238f23128fea53ea12858", "https://bcr.bazel.build/modules/pybind11_bazel/2.11.1/MODULE.bazel": "88af1c246226d87e65be78ed49ecd1e6f5e98648558c14ce99176da041dc378e", "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/MODULE.bazel": "e6f4c20442eaa7c90d7190d8dc539d0ab422f95c65a57cc59562170c58ae3d34", "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/source.json": "6900fdc8a9e95866b8c0d4ad4aba4d4236317b5c1cd04c502df3f0d33afed680", + "https://bcr.bazel.build/modules/re2/2021-09-01/MODULE.bazel": "bcb6b96f3b071e6fe2d8bed9cc8ada137a105f9d2c5912e91d27528b3d123833", "https://bcr.bazel.build/modules/re2/2023-09-01/MODULE.bazel": "cb3d511531b16cfc78a225a9e2136007a48cf8a677e4264baeab57fe78a80206", "https://bcr.bazel.build/modules/re2/2024-07-02/MODULE.bazel": "0eadc4395959969297cbcf31a249ff457f2f1d456228c67719480205aa306daa", "https://bcr.bazel.build/modules/re2/2024-07-02/source.json": "547d0111a9d4f362db32196fef805abbf3676e8d6afbe44d395d87816c1130ca", @@ -65,7 +101,13 @@ "https://bcr.bazel.build/modules/rules_cc/0.0.9/MODULE.bazel": "836e76439f354b89afe6a911a7adf59a6b2518fafb174483ad78a2a2fde7b1c5", "https://bcr.bazel.build/modules/rules_cc/0.1.0/MODULE.bazel": "2fef03775b9ba995ec543868840041cc69e8bc705eb0cb6604a36eee18c87d8b", "https://bcr.bazel.build/modules/rules_cc/0.1.0/source.json": "8a4e832d75e073ab56c74dd77008cf7a81e107dec4544019eb1eefc1320d55be", + "https://bcr.bazel.build/modules/rules_go/0.33.0/MODULE.bazel": "a2b11b64cd24bf94f57454f53288a5dacfe6cb86453eee7761b7637728c1910c", + "https://bcr.bazel.build/modules/rules_go/0.34.0/MODULE.bazel": "20240361d6ff5cb752121af8c64aa41adc5a72ade59c90040606070e1690be09", + "https://bcr.bazel.build/modules/rules_go/0.41.0/MODULE.bazel": "55861d8e8bb0e62cbd2896f60ff303f62ffcb0eddb74ecb0e5c0cbe36fc292c8", + "https://bcr.bazel.build/modules/rules_go/0.41.0/source.json": "a46e5f523176e3bd60b1c9cfdcb6c878b9cd14c21fe1a563c4ba0e6d0e7c4dd8", "https://bcr.bazel.build/modules/rules_java/4.0.0/MODULE.bazel": 
"5a78a7ae82cd1a33cef56dc578c7d2a46ed0dca12643ee45edbb8417899e6f74", + "https://bcr.bazel.build/modules/rules_java/5.1.0/MODULE.bazel": "324b6478b0343a3ce7a9add8586ad75d24076d6d43d2f622990b9c1cfd8a1b15", + "https://bcr.bazel.build/modules/rules_java/5.3.5/MODULE.bazel": "a4ec4f2db570171e3e5eb753276ee4b389bae16b96207e9d3230895c99644b86", "https://bcr.bazel.build/modules/rules_java/6.3.0/MODULE.bazel": "a97c7678c19f236a956ad260d59c86e10a463badb7eb2eda787490f4c969b963", "https://bcr.bazel.build/modules/rules_java/7.1.0/MODULE.bazel": "30d9135a2b6561c761bd67bd4990da591e6bdc128790ce3e7afd6a3558b2fb64", "https://bcr.bazel.build/modules/rules_java/7.6.5/MODULE.bazel": "481164be5e02e4cab6e77a36927683263be56b7e36fef918b458d7a8a1ebadb1", @@ -77,6 +119,9 @@ "https://bcr.bazel.build/modules/rules_license/0.0.3/MODULE.bazel": "627e9ab0247f7d1e05736b59dbb1b6871373de5ad31c3011880b4133cafd4bd0", "https://bcr.bazel.build/modules/rules_license/0.0.7/MODULE.bazel": "088fbeb0b6a419005b89cf93fe62d9517c0a2b8bb56af3244af65ecfe37e7d5d", "https://bcr.bazel.build/modules/rules_license/0.0.7/source.json": "355cc5737a0f294e560d52b1b7a6492d4fff2caf0bef1a315df5a298fca2d34a", + "https://bcr.bazel.build/modules/rules_nodejs/5.8.2/MODULE.bazel": "6bc03c8f37f69401b888023bf511cb6ee4781433b0cb56236b2e55a21e3a026a", + "https://bcr.bazel.build/modules/rules_nodejs/5.8.3/MODULE.bazel": "9fac1897d2067a37693e47f48e11cdb386a455902313c85e9e46fe0aaaa2e4e1", + "https://bcr.bazel.build/modules/rules_nodejs/5.8.3/source.json": "adc580471187345e43dd874d951a84d2256455fbeaedca539174f1e4ab49f9a4", "https://bcr.bazel.build/modules/rules_pkg/0.7.0/MODULE.bazel": "df99f03fc7934a4737122518bb87e667e62d780b610910f0447665a7e2be62dc", "https://bcr.bazel.build/modules/rules_pkg/0.7.0/source.json": "c2557066e0c0342223ba592510ad3d812d4963b9024831f7f66fd0584dd8c66c", "https://bcr.bazel.build/modules/rules_proto/4.0.0/MODULE.bazel": "a7a7b6ce9bee418c1a760b3d84f83a299ad6952f9903c67f19e4edd964894e06", @@ -92,10 +137,16 @@ "https://bcr.bazel.build/modules/rules_python/0.37.2/MODULE.bazel": "b5ffde91410745750b6c13be1c5dc4555ef5bc50562af4a89fd77807fdde626a", "https://bcr.bazel.build/modules/rules_python/0.37.2/source.json": "af5c224d27ec98a612b4dcbdc481e02502cd5a4b49d87f0093200a10a35383e9", "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c", + "https://bcr.bazel.build/modules/rules_swift/1.2.0/MODULE.bazel": "9559e7b880723a274845b92bc760bb2d4c9f9f562388155e357f05932b941789", + "https://bcr.bazel.build/modules/rules_swift/1.2.0/source.json": "c9344551abbd8544e128be8130277da6cd2f54a7a40182700b15c1fb8adb9f81", + "https://bcr.bazel.build/modules/stardoc/0.5.0/MODULE.bazel": "f9f1f46ba8d9c3362648eea571c6f9100680efc44913618811b58cc9c02cd678", "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8", "https://bcr.bazel.build/modules/stardoc/0.5.3/MODULE.bazel": "c7f6948dae6999bf0db32c1858ae345f112cacf98f174c7a8bb707e41b974f1c", + "https://bcr.bazel.build/modules/stardoc/0.5.4/MODULE.bazel": "6569966df04610b8520957cb8e97cf2e9faac2c0309657c537ab51c16c18a2a4", "https://bcr.bazel.build/modules/stardoc/0.6.2/MODULE.bazel": "7060193196395f5dd668eda046ccbeacebfd98efc77fed418dbe2b82ffaa39fd", "https://bcr.bazel.build/modules/stardoc/0.6.2/source.json": "d2ff8063b63b4a85e65fe595c4290f99717434fa9f95b4748a79a7d04dfed349", + "https://bcr.bazel.build/modules/upb/0.0.0-20211020-160625a/MODULE.bazel": 
"6cced416be2dc5b9c05efd5b997049ba795e5e4e6fafbe1624f4587767638928", + "https://bcr.bazel.build/modules/upb/0.0.0-20220602-e5f2601/MODULE.bazel": "84a1b5fc76719c2841759d150637cca2fdc19abccc680d6d02614def044379de", "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43", "https://bcr.bazel.build/modules/upb/0.0.0-20230516-61a97ef/MODULE.bazel": "c0df5e35ad55e264160417fd0875932ee3c9dda63d9fccace35ac62f45e1b6f9", "https://bcr.bazel.build/modules/upb/0.0.0-20230516-61a97ef/source.json": "b2150404947339e8b947c6b16baa39fa75657f4ddec5e37272c7b11c7ab533bc", @@ -134,6 +185,11490 @@ ] } }, + "@@aspect_bazel_lib~//lib:extensions.bzl%toolchains": { + "general": { + "bzlTransitiveDigest": "cyiMvevu77OGi9zTS4peB4cqYPpX4JK6nZW3vbnEjMI=", + "usagesDigest": "MqlTLnt+KowkMXXH3DzwrO4g8VlBc9gSrGIl9NEh+S4=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "copy_directory_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "darwin_amd64" + } + }, + "copy_directory_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "darwin_arm64" + } + }, + "copy_directory_freebsd_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "freebsd_amd64" + } + }, + "copy_directory_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "linux_amd64" + } + }, + "copy_directory_linux_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "linux_arm64" + } + }, + "copy_directory_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_platform_repo", + "attributes": { + "platform": "windows_amd64" + } + }, + "copy_directory_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_directory_toolchain.bzl", + "ruleClassName": "copy_directory_toolchains_repo", + "attributes": { + "user_repository_name": "copy_directory" + } + }, + "copy_to_directory_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "darwin_amd64" + } + }, + "copy_to_directory_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "darwin_arm64" + } + }, + "copy_to_directory_freebsd_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "freebsd_amd64" + } + }, + "copy_to_directory_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "linux_amd64" + } + }, + "copy_to_directory_linux_arm64": { + "bzlFile": 
"@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "linux_arm64" + } + }, + "copy_to_directory_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_platform_repo", + "attributes": { + "platform": "windows_amd64" + } + }, + "copy_to_directory_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:copy_to_directory_toolchain.bzl", + "ruleClassName": "copy_to_directory_toolchains_repo", + "attributes": { + "user_repository_name": "copy_to_directory" + } + }, + "jq_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_platform_repo", + "attributes": { + "platform": "darwin_amd64", + "version": "1.6" + } + }, + "jq_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_platform_repo", + "attributes": { + "platform": "darwin_arm64", + "version": "1.6" + } + }, + "jq_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_platform_repo", + "attributes": { + "platform": "linux_amd64", + "version": "1.6" + } + }, + "jq_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_platform_repo", + "attributes": { + "platform": "windows_amd64", + "version": "1.6" + } + }, + "jq": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_host_alias_repo", + "attributes": {} + }, + "jq_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:jq_toolchain.bzl", + "ruleClassName": "jq_toolchains_repo", + "attributes": { + "user_repository_name": "jq" + } + }, + "yq_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "darwin_amd64", + "version": "4.25.2" + } + }, + "yq_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "darwin_arm64", + "version": "4.25.2" + } + }, + "yq_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "linux_amd64", + "version": "4.25.2" + } + }, + "yq_linux_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "linux_arm64", + "version": "4.25.2" + } + }, + "yq_linux_s390x": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "linux_s390x", + "version": "4.25.2" + } + }, + "yq_linux_ppc64le": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "linux_ppc64le", + "version": "4.25.2" + } + }, + "yq_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_platform_repo", + "attributes": { + "platform": "windows_amd64", + "version": "4.25.2" + } + }, + "yq": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_host_alias_repo", + "attributes": {} + }, + "yq_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:yq_toolchain.bzl", + "ruleClassName": "yq_toolchains_repo", + "attributes": { + 
"user_repository_name": "yq" + } + }, + "coreutils_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_platform_repo", + "attributes": { + "platform": "darwin_amd64", + "version": "0.0.16" + } + }, + "coreutils_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_platform_repo", + "attributes": { + "platform": "darwin_arm64", + "version": "0.0.16" + } + }, + "coreutils_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_platform_repo", + "attributes": { + "platform": "linux_amd64", + "version": "0.0.16" + } + }, + "coreutils_linux_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_platform_repo", + "attributes": { + "platform": "linux_arm64", + "version": "0.0.16" + } + }, + "coreutils_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_platform_repo", + "attributes": { + "platform": "windows_amd64", + "version": "0.0.16" + } + }, + "coreutils_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:coreutils_toolchain.bzl", + "ruleClassName": "coreutils_toolchains_repo", + "attributes": { + "user_repository_name": "coreutils" + } + }, + "expand_template_darwin_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "darwin_amd64" + } + }, + "expand_template_darwin_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "darwin_arm64" + } + }, + "expand_template_freebsd_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "freebsd_amd64" + } + }, + "expand_template_linux_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "linux_amd64" + } + }, + "expand_template_linux_arm64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "linux_arm64" + } + }, + "expand_template_windows_amd64": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_platform_repo", + "attributes": { + "platform": "windows_amd64" + } + }, + "expand_template_toolchains": { + "bzlFile": "@@aspect_bazel_lib~//lib/private:expand_template_toolchain.bzl", + "ruleClassName": "expand_template_toolchains_repo", + "attributes": { + "user_repository_name": "expand_template" + } + } + }, + "recordedRepoMappingEntries": [ + [ + "aspect_bazel_lib~", + "aspect_bazel_lib", + "aspect_bazel_lib~" + ], + [ + "aspect_bazel_lib~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_bazel_lib~", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, + "@@aspect_rules_esbuild~//esbuild:extensions.bzl%esbuild": { + "general": { + "bzlTransitiveDigest": "V7gqrTgzsNF5oyT9MPb6/wXsP0JTzzlsNO813vDYL3o=", + "usagesDigest": "MXYZ9socGXzRzqMvkGiWBTmlQNiIg/0jV3eix/hm9IM=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + 
"esbuild_darwin-x64": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild:repositories.bzl", + "ruleClassName": "esbuild_repositories", + "attributes": { + "esbuild_version": "0.16.7", + "platform": "darwin-x64" + } + }, + "esbuild_darwin-arm64": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild:repositories.bzl", + "ruleClassName": "esbuild_repositories", + "attributes": { + "esbuild_version": "0.16.7", + "platform": "darwin-arm64" + } + }, + "esbuild_linux-x64": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild:repositories.bzl", + "ruleClassName": "esbuild_repositories", + "attributes": { + "esbuild_version": "0.16.7", + "platform": "linux-x64" + } + }, + "esbuild_linux-arm64": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild:repositories.bzl", + "ruleClassName": "esbuild_repositories", + "attributes": { + "esbuild_version": "0.16.7", + "platform": "linux-arm64" + } + }, + "esbuild_win32-x64": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild:repositories.bzl", + "ruleClassName": "esbuild_repositories", + "attributes": { + "esbuild_version": "0.16.7", + "platform": "win32-x64" + } + }, + "esbuild_toolchains": { + "bzlFile": "@@aspect_rules_esbuild~//esbuild/private:toolchains_repo.bzl", + "ruleClassName": "toolchains_repo", + "attributes": { + "esbuild_version": "0.16.7", + "user_repository_name": "esbuild" + } + }, + "npm__esbuild_0.16.7": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "esbuild", + "version": "0.16.7", + "root_package": "", + "link_workspace": "", + "link_packages": {}, + "integrity": "sha512-P6OBFYFSQOGzfApqCeYKqfKRRbCIRsdppTXFo4aAvtiW3o8TTyiIplBvHJI171saPAiy3WlawJHCveJVIOIx1A==", + "url": "", + "commit": "", + "patch_args": [ + "-p0" + ], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__esbuild_0.16.7__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "esbuild", + "version": "0.16.7", + "dev": false, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": {}, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "", + "package_visibility": [ + "//visibility:public" + ] + } + } + }, + "recordedRepoMappingEntries": [ + [ + "aspect_bazel_lib~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_bazel_lib~", + "bazel_tools", + "bazel_tools" + ], + [ + "aspect_rules_esbuild~", + "aspect_rules_js", + "aspect_rules_js~" + ], + [ + "aspect_rules_esbuild~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_rules_js~", + "aspect_bazel_lib", + "aspect_bazel_lib~" + ], + [ + "aspect_rules_js~", + "bazel_features", + "bazel_features~" + ], + [ + "aspect_rules_js~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_rules_js~", + "bazel_tools", + "bazel_tools" + ], + [ + "bazel_features~", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, + "@@aspect_rules_js~//npm:extensions.bzl%npm": { + "general": { + "bzlTransitiveDigest": "w6lgYO0RBRKMcZfpeusmQItcjQEtzi3yrl4pr7bCcxY=", + "usagesDigest": "X+Ly8Gee3CATehlJzM3ECpcqMkB382icTycKvOy8CIU=", + "recordedFileInputs": { + "@@flatbuffers~//pnpm-lock.yaml": 
"130fab1c4307b9bdac43bf88332f2311209a33aead012922d7a97e58a50b7de4", + "@@flatbuffers~//.npmrc": "d94d573d5aa644cdd09ff46d9b9c5e9b59185533420308c9a55ad5dc3176f22b" + }, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "npm": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_translate_lock.bzl", + "ruleClassName": "npm_translate_lock_rule", + "attributes": { + "pnpm_lock": "@@flatbuffers~//:pnpm-lock.yaml", + "update_pnpm_lock": false, + "npmrc": "@@flatbuffers~//:.npmrc", + "use_home_npmrc": false, + "patches": {}, + "patch_args": {}, + "custom_postinstalls": {}, + "package_visibility": {}, + "prod": false, + "public_hoist_packages": {}, + "dev": false, + "no_optional": false, + "lifecycle_hooks": { + "*": [ + "preinstall", + "install", + "postinstall" + ] + }, + "lifecycle_hooks_envs": {}, + "lifecycle_hooks_execution_requirements": { + "*": [ + "no-sandbox" + ] + }, + "bins": {}, + "verify_node_modules_ignored": "@@flatbuffers~//:.bazelignore", + "external_repository_action_cache": ".aspect/rules/external_repository_action_cache", + "link_workspace": "", + "root_package": ".", + "additional_file_contents": {}, + "repositories_bzl_filename": "repositories.bzl", + "defs_bzl_filename": "defs.bzl", + "generate_bzl_library_targets": false, + "data": [], + "preupdate": [], + "quiet": true, + "update_pnpm_lock_node_toolchain_prefix": "nodejs", + "npm_package_target_name": "{dirname}" + } + }, + "npm__at_aashutoshrathi_word-wrap__1.2.6": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@aashutoshrathi/word-wrap", + "version": "1.2.6", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==", + "url": "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_aashutoshrathi_word-wrap__1.2.6__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@aashutoshrathi/word-wrap", + "version": "1.2.6", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_android-arm64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/android-arm64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-B8JbS61bEunhfx8kasogFENgQfr/dIp+ggYXwTqdbMAgGDhRa3AaPpQMuQU0rNxDLECj6FhDzk1cF9WHMVwrtA==", + "url": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + 
"lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_android-arm64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/android-arm64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/android-arm64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_android-arm__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/android-arm", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-31E2lxlGM1KEfivQl8Yf5aYU/mflz9g06H6S15ITUFQueMFtFjESRMoDSkvMo8thYvLBax+VKTPlpnx+sPicOA==", + "url": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_android-arm__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/android-arm", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/android-arm": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_android-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/android-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-rdqqYfRIn4jWOp+lzQttYMa2Xar3OK9Yt2fhOhzFXqg0rVWEfSclJvZq5fZslnz6ypHvVf3CT7qyf0A5pM682A==", + "url": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_android-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/android-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/android-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + 
"no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_darwin-arm64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/darwin-arm64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-RQw9DemMbIq35Bprbboyf8SmOr4UXsRVxJ97LgB55VKKeJOOdvsIPy0nFyF2l8U+h4PtBx/1kRf0BelOYCiQcw==", + "url": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_darwin-arm64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/darwin-arm64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/darwin-arm64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_darwin-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/darwin-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-3sur80OT9YdeZwIVgERAysAbwncom7b4bCI2XKLjMfPymTud7e/oY4y+ci1XVp5TfQp/bppn7xLw1n/oSQY3/Q==", + "url": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_darwin-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/darwin-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/darwin-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_freebsd-arm64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/freebsd-arm64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-WAnPJSDattvS/XtPCTj1tPoTxERjcTpH6HsMr6ujTT+X6rylVe8ggxk8pVxzf5U1wh5sPODpawNicF5ta/9Tmw==", + "url": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.8.tgz", + "commit": 
"", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_freebsd-arm64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/freebsd-arm64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/freebsd-arm64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_freebsd-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/freebsd-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-ICvZyOplIjmmhjd6mxi+zxSdpPTKFfyPPQMQTK/w+8eNK6WV01AjIztJALDtwNNfFhfZLux0tZLC+U9nSyA5Zg==", + "url": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_freebsd-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/freebsd-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/freebsd-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-arm64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-arm64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-z1zMZivxDLHWnyGOctT9JP70h0beY54xDDDJt4VpTX+iwA77IFsE1vCXWmprajJGa+ZYSqkSbRQ4eyLCpCmiCQ==", + "url": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-arm64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-arm64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + 
"@esbuild/linux-arm64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-arm__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-arm", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-H4vmI5PYqSvosPaTJuEppU9oz1dq2A7Mr2vyg5TF9Ga+3+MGgBdGzcyBP7qK9MrwFQZlvNyJrvz6GuCaj3OukQ==", + "url": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-arm__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-arm", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-arm": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-ia32__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-ia32", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-1a8suQiFJmZz1khm/rDglOc8lavtzEMRo0v6WhPgxkrjcU0LkHj+TwBrALwoz/OtMExvsqbbMI0ChyelKabSvQ==", + "url": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-ia32__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-ia32", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-ia32": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-loong64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-loong64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": 
"sha512-fHZWS2JJxnXt1uYJsDv9+b60WCc2RlvVAy1F76qOLtXRO+H4mjt3Tr6MJ5l7Q78X8KgCFudnTuiQRBhULUyBKQ==", + "url": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-loong64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-loong64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-loong64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-mips64el__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-mips64el", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Wy/z0EL5qZYLX66dVnEg9riiwls5IYnziwuju2oUiuxVc+/edvqXa04qNtbrs0Ukatg5HEzqT94Zs7J207dN5Q==", + "url": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-mips64el__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-mips64el", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-mips64el": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-ppc64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-ppc64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-ETaW6245wK23YIEufhMQ3HSeHO7NgsLx8gygBVldRHKhOlD1oNeNy/P67mIh1zPn2Hr2HLieQrt6tWrVwuqrxg==", + "url": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-ppc64__0.19.8__links": { + "bzlFile": 
"@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-ppc64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-ppc64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-riscv64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-riscv64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-T2DRQk55SgoleTP+DtPlMrxi/5r9AeFgkhkZ/B0ap99zmxtxdOixOMI570VjdRCs9pE4Wdkz7JYrsPvsl7eESg==", + "url": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-riscv64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-riscv64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-riscv64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-s390x__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-s390x", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-NPxbdmmo3Bk7mbNeHmcCd7R7fptJaczPYBaELk6NcXxy7HLNyWwCyDJ/Xx+/YcNH7Im5dHdx9gZ5xIwyliQCbg==", + "url": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-s390x__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-s390x", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-s390x": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_linux-x64__0.19.8": { + "bzlFile": 
"@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/linux-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-lytMAVOM3b1gPypL2TRmZ5rnXl7+6IIk8uB3eLsV1JwcizuolblXRrc5ShPrO9ls/b+RTp+E6gbsuLWHWi2zGg==", + "url": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_linux-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/linux-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/linux-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_netbsd-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/netbsd-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-hvWVo2VsXz/8NVt1UhLzxwAfo5sioj92uo0bCfLibB0xlOmimU/DeAEsQILlBQvkhrGjamP0/el5HU76HAitGw==", + "url": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_netbsd-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/netbsd-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/netbsd-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_openbsd-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/openbsd-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-/7Y7u77rdvmGTxR83PgaSvSBJCC2L3Kb1M/+dmSIvRvQPXXCuC97QAwMugBNG0yGcbEGfFBH7ojPzAOxfGNkwQ==", + "url": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + 
"install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_openbsd-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/openbsd-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/openbsd-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_sunos-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/sunos-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-9Lc4s7Oi98GqFA4HzA/W2JHIYfnXbUYgekUP/Sm4BG9sfLjyv6GKKHKKVs83SMicBF2JwAX6A1PuOLMqpD001w==", + "url": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_sunos-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/sunos-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/sunos-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_win32-arm64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/win32-arm64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-rq6WzBGjSzihI9deW3fC2Gqiak68+b7qo5/3kmB6Gvbh/NYPA0sJhrnp7wgV4bNwjqM+R2AApXGxMO7ZoGhIJg==", + "url": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_win32-arm64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/win32-arm64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/win32-arm64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + 
"package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_win32-ia32__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/win32-ia32", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-AIAbverbg5jMvJznYiGhrd3sumfwWs8572mIJL5NQjJa06P8KfCPWZQ0NwZbPQnbQi9OWSZhFVSUWjjIrn4hSw==", + "url": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_win32-ia32__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/win32-ia32", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/win32-ia32": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_esbuild_win32-x64__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@esbuild/win32-x64", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-bfZ0cQ1uZs2PqpulNL5j/3w+GDhP36k1K5c38QdQg+Swy51jFZWWeIkteNsufkQxp986wnqRRsb/bHbY1WQ7TA==", + "url": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_esbuild_win32-x64__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@esbuild/win32-x64", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@esbuild/win32-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_eslint-community_eslint-utils__4.4.0__eslint_8.55.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@eslint-community/eslint-utils", + "version": "4.4.0_eslint_8.55.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-1/sA4dwrzBAyeUoQ6oxahHKmrZvsnLCg4RfxW3ZFGGmQkSNQPFNLV9CUEFQP1x9EYXHTo5p6xdhZM1Ne9p/AfA==", + "url": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + 
"custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_eslint-community_eslint-utils__4.4.0__eslint_8.55.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@eslint-community/eslint-utils", + "version": "4.4.0_eslint_8.55.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "eslint": "8.55.0", + "eslint-visitor-keys": "3.4.3" + }, + "transitive_closure": { + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "eslint": [ + "8.55.0" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + "7.0.3" + ], + "debug": [ + "4.3.4" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "natural-compare": [ + "1.4.0" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "ms": [ + "2.1.2" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + 
"fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_eslint-community_regexpp__4.10.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@eslint-community/regexpp", + "version": "4.10.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Cu96Sd2By9mCNTx2iyKOmq10v22jUVQv0lQnlGNy16oE9589yE+QADPbrMGCkA51cKZSg3Pu/aTJVTGfL/qjUA==", + "url": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.10.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_eslint-community_regexpp__4.10.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@eslint-community/regexpp", + "version": "4.10.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@eslint-community/regexpp": [ + "4.10.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_eslint_eslintrc__2.1.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@eslint/eslintrc", + "version": "2.1.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-269Z39MS6wVJtsoUl10L60WdkhJVdPG24Q4eZTH3nnF6lpvSShEK3wQjDX9JRWAUPvPh7COouPpU9IrqaZFvtQ==", + "url": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-2.1.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_eslint_eslintrc__2.1.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@eslint/eslintrc", + "version": "2.1.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "ajv": "6.12.6", + "debug": "4.3.4", + "espree": "9.6.1", + "globals": "13.23.0", + "ignore": "5.3.0", + "import-fresh": "3.3.0", + "js-yaml": "4.1.0", + "minimatch": "3.1.2", + "strip-json-comments": "3.1.1" + }, + "transitive_closure": { + "@eslint/eslintrc": 
[ + "2.1.4" + ], + "ajv": [ + "6.12.6" + ], + "debug": [ + "4.3.4" + ], + "espree": [ + "9.6.1" + ], + "globals": [ + "13.23.0" + ], + "ignore": [ + "5.3.0" + ], + "import-fresh": [ + "3.3.0" + ], + "js-yaml": [ + "4.1.0" + ], + "minimatch": [ + "3.1.2" + ], + "strip-json-comments": [ + "3.1.1" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ], + "type-fest": [ + "0.20.2" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "ms": [ + "2.1.2" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_eslint_js__8.55.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@eslint/js", + "version": "8.55.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-qQfo2mxH5yVom1kacMtZZJFVdW+E70mqHMJvVg6WTLo+VBuQJ4TojZlfWBjK0ve5BdEeNAVxOsl/nvNMpJOaJA==", + "url": "https://registry.npmjs.org/@eslint/js/-/js-8.55.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_eslint_js__8.55.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@eslint/js", + "version": "8.55.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@eslint/js": [ + "8.55.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_humanwhocodes_config-array__0.11.13": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@humanwhocodes/config-array", + "version": "0.11.13", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-JSBDMiDKSzQVngfRjOdFXgFfklaXI4K9nLF49Auh21lmBWRLIK3+xTErTWD4KU54pb6coM6ESE7Awz/FNU3zgQ==", + "url": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.13.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_humanwhocodes_config-array__0.11.13__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@humanwhocodes/config-array", + "version": "0.11.13", + "dev": true, + "root_package": "", + 
"link_packages": {}, + "deps": { + "@humanwhocodes/object-schema": "2.0.1", + "debug": "4.3.4", + "minimatch": "3.1.2" + }, + "transitive_closure": { + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "debug": [ + "4.3.4" + ], + "minimatch": [ + "3.1.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "ms": [ + "2.1.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_humanwhocodes_module-importer__1.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@humanwhocodes/module-importer", + "version": "1.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", + "url": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_humanwhocodes_module-importer__1.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@humanwhocodes/module-importer", + "version": "1.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@humanwhocodes/module-importer": [ + "1.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_humanwhocodes_object-schema__2.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@humanwhocodes/object-schema", + "version": "2.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-dvuCeX5fC9dXgJn9t+X5atfmgQAzUOWqS1254Gh0m6i8wKd10ebXkfNKiRK+1GWi/yTvvLDHpoxLr0xxxeslWw==", + "url": "https://registry.npmjs.org/@humanwhocodes/object-schema/-/object-schema-2.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_humanwhocodes_object-schema__2.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@humanwhocodes/object-schema", + "version": "2.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@humanwhocodes/object-schema": [ + "2.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + 
"npm__at_nodelib_fs.scandir__2.1.5": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@nodelib/fs.scandir", + "version": "2.1.5", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "url": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_nodelib_fs.scandir__2.1.5__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@nodelib/fs.scandir", + "version": "2.1.5", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "1.2.0" + }, + "transitive_closure": { + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_nodelib_fs.stat__2.0.5": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@nodelib/fs.stat", + "version": "2.0.5", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "url": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_nodelib_fs.stat__2.0.5__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@nodelib/fs.stat", + "version": "2.0.5", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@nodelib/fs.stat": [ + "2.0.5" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_nodelib_fs.walk__1.2.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@nodelib/fs.walk", + "version": "1.2.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "url": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + 
"lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_nodelib_fs.walk__1.2.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@nodelib/fs.walk", + "version": "1.2.8", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "1.15.0" + }, + "transitive_closure": { + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_types_json-schema__7.0.15": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@types/json-schema", + "version": "7.0.15", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", + "url": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_types_json-schema__7.0.15__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@types/json-schema", + "version": "7.0.15", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@types/json-schema": [ + "7.0.15" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_types_node__20.10.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@types/node", + "version": "20.10.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "@types/node" + ] + }, + "integrity": "sha512-D08YG6rr8X90YB56tSIuBaddy/UXAA9RKJoFvrsnogAum/0pmjkgi4+2nx96A330FmioegBWmEYQ+syqCFaveg==", + "url": "https://registry.npmjs.org/@types/node/-/node-20.10.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_types_node__20.10.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@types/node", + "version": "20.10.4", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "@types/node" + ] + }, + "deps": { + "undici-types": "5.26.5" + }, + "transitive_closure": { + "@types/node": [ + "20.10.4" + ], + "undici-types": [ + 
"5.26.5" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_types_semver__7.5.6": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@types/semver", + "version": "7.5.6", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-dn1l8LaMea/IjDoHNd9J52uBbInB796CDffS6VdIxvqYCPSG0V0DzHp76GpaWnlhg88uYyPbXCDIowa86ybd5A==", + "url": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_types_semver__7.5.6__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@types/semver", + "version": "7.5.6", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@types/semver": [ + "7.5.6" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_eslint-plugin__6.13.2__-1224903089": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/eslint-plugin", + "version": "6.13.2_-1224903089", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "@typescript-eslint/eslint-plugin" + ] + }, + "integrity": "sha512-3+9OGAWHhk4O1LlcwLBONbdXsAhLjyCFogJY/cWy2lxdVJ2JrcTF2pTGMaLl2AE7U1l31n8Py4a8bx5DLf/0dQ==", + "url": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_eslint-plugin__6.13.2__-1224903089__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/eslint-plugin", + "version": "6.13.2_-1224903089", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "@typescript-eslint/eslint-plugin" + ] + }, + "deps": { + "@eslint-community/regexpp": "4.10.0", + "@typescript-eslint/parser": "6.13.2_1796040679", + "@typescript-eslint/scope-manager": "6.13.2", + "@typescript-eslint/type-utils": "6.13.2_1796040679", + "@typescript-eslint/utils": "6.13.2_1796040679", + "@typescript-eslint/visitor-keys": "6.13.2", + "debug": "4.3.4", + "eslint": "8.55.0", + "graphemer": "1.4.0", + "ignore": "5.3.0", + "natural-compare": "1.4.0", + "semver": "7.5.4", + "ts-api-utils": "1.0.3_typescript_5.3.3", + "typescript": "5.3.3" + }, + "transitive_closure": { + "@typescript-eslint/eslint-plugin": [ + "6.13.2_-1224903089" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@typescript-eslint/parser": 
[ + "6.13.2_1796040679" + ], + "@typescript-eslint/scope-manager": [ + "6.13.2" + ], + "@typescript-eslint/type-utils": [ + "6.13.2_1796040679" + ], + "@typescript-eslint/utils": [ + "6.13.2_1796040679" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "debug": [ + "4.3.4" + ], + "eslint": [ + "8.55.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "natural-compare": [ + "1.4.0" + ], + "semver": [ + "7.5.4" + ], + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "typescript": [ + "5.3.3" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ], + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + "7.0.3" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "5.1.2", + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + 
"run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ], + "ms": [ + "2.1.2" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "@types/json-schema": [ + "7.0.15" + ], + "@types/semver": [ + "7.5.6" + ], + "@typescript-eslint/typescript-estree": [ + "6.13.2_typescript_5.3.3" + ], + "globby": [ + "11.1.0" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_parser__6.13.2__1796040679": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/parser", + "version": "6.13.2_1796040679", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "@typescript-eslint/parser" + ] + }, + "integrity": "sha512-MUkcC+7Wt/QOGeVlM8aGGJZy1XV5YKjTpq9jK6r6/iLsGXhBVaGP5N0UYvFsu9BFlSpwY9kMretzdBH01rkRXg==", + "url": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_parser__6.13.2__1796040679__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/parser", + "version": "6.13.2_1796040679", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "@typescript-eslint/parser" + ] + }, + "deps": { + "@typescript-eslint/scope-manager": "6.13.2", + "@typescript-eslint/types": "6.13.2", + "@typescript-eslint/typescript-estree": "6.13.2_typescript_5.3.3", + "@typescript-eslint/visitor-keys": "6.13.2", + "debug": "4.3.4", + "eslint": "8.55.0", + "typescript": "5.3.3" + }, + "transitive_closure": { + "@typescript-eslint/parser": [ + "6.13.2_1796040679" + ], + "@typescript-eslint/scope-manager": [ + "6.13.2" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "@typescript-eslint/typescript-estree": [ + "6.13.2_typescript_5.3.3" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "debug": [ + "4.3.4" + ], + "eslint": [ + "8.55.0" + ], + "typescript": [ + "5.3.3" + ], + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + 
"7.0.3" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "5.1.2", + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "natural-compare": [ + "1.4.0" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ], + "ms": [ + "2.1.2" + ], + "globby": [ + "11.1.0" + ], + "semver": [ + "7.5.4" + ], + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": 
false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_scope-manager__6.13.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/scope-manager", + "version": "6.13.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-CXQA0xo7z6x13FeDYCgBkjWzNqzBn8RXaE3QVQVIUm74fWJLkJkaHmHdKStrxQllGh6Q4eUGyNpMe0b1hMkXFA==", + "url": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_scope-manager__6.13.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/scope-manager", + "version": "6.13.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@typescript-eslint/types": "6.13.2", + "@typescript-eslint/visitor-keys": "6.13.2" + }, + "transitive_closure": { + "@typescript-eslint/scope-manager": [ + "6.13.2" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_type-utils__6.13.2__1796040679": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/type-utils", + "version": "6.13.2_1796040679", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Qr6ssS1GFongzH2qfnWKkAQmMUyZSyOr0W54nZNU1MDfo+U4Mv3XveeLZzadc/yq8iYhQZHYT+eoXJqnACM1tw==", + "url": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_type-utils__6.13.2__1796040679__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/type-utils", + "version": "6.13.2_1796040679", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@typescript-eslint/typescript-estree": "6.13.2_typescript_5.3.3", + "@typescript-eslint/utils": "6.13.2_1796040679", + "debug": "4.3.4", + "eslint": "8.55.0", + "ts-api-utils": "1.0.3_typescript_5.3.3", + "typescript": "5.3.3" + }, + "transitive_closure": { + "@typescript-eslint/type-utils": [ + "6.13.2_1796040679" + ], + "@typescript-eslint/typescript-estree": [ + "6.13.2_typescript_5.3.3" + ], + "@typescript-eslint/utils": [ + "6.13.2_1796040679" + ], + "debug": [ + "4.3.4" + ], + 
"eslint": [ + "8.55.0" + ], + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "typescript": [ + "5.3.3" + ], + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + "7.0.3" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "5.1.2", + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "natural-compare": [ + "1.4.0" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ], + "ms": [ + "2.1.2" + ], + 
"@types/json-schema": [ + "7.0.15" + ], + "@types/semver": [ + "7.5.6" + ], + "@typescript-eslint/scope-manager": [ + "6.13.2" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "semver": [ + "7.5.4" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "globby": [ + "11.1.0" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_types__6.13.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/types", + "version": "6.13.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-7sxbQ+EMRubQc3wTfTsycgYpSujyVbI1xw+3UMRUcrhSy+pN09y/lWzeKDbvhoqcRbHdc+APLs/PWYi/cisLPg==", + "url": "https://registry.npmjs.org/@typescript-eslint/types/-/types-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_types__6.13.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/types", + "version": "6.13.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "@typescript-eslint/types": [ + "6.13.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_typescript-estree__6.13.2__typescript_5.3.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/typescript-estree", + "version": "6.13.2_typescript_5.3.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-SuD8YLQv6WHnOEtKv8D6HZUzOub855cfPnPMKvdM/Bh1plv1f7Q/0iFUDLKKlxHcEstQnaUU4QZskgQq74t+3w==", + "url": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_typescript-estree__6.13.2__typescript_5.3.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/typescript-estree", + "version": "6.13.2_typescript_5.3.3", + "dev": true, + 
"root_package": "", + "link_packages": {}, + "deps": { + "@typescript-eslint/types": "6.13.2", + "@typescript-eslint/visitor-keys": "6.13.2", + "debug": "4.3.4", + "globby": "11.1.0", + "is-glob": "4.0.3", + "semver": "7.5.4", + "ts-api-utils": "1.0.3_typescript_5.3.3", + "typescript": "5.3.3" + }, + "transitive_closure": { + "@typescript-eslint/typescript-estree": [ + "6.13.2_typescript_5.3.3" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "debug": [ + "4.3.4" + ], + "globby": [ + "11.1.0" + ], + "is-glob": [ + "4.0.3" + ], + "semver": [ + "7.5.4" + ], + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "typescript": [ + "5.3.3" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ], + "is-extglob": [ + "2.1.1" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "ignore": [ + "5.3.0" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "glob-parent": [ + "5.1.2" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "path-type": [ + "4.0.0" + ], + "ms": [ + "2.1.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_utils__6.13.2__1796040679": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/utils", + "version": "6.13.2_1796040679", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-b9Ptq4eAZUym4idijCRzl61oPCwwREcfDI8xGk751Vhzig5fFZR9CyzDz4Sp/nxSLBYxUPyh4QdIDqWykFhNmQ==", + "url": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_utils__6.13.2__1796040679__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/utils", + "version": "6.13.2_1796040679", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@eslint-community/eslint-utils": "4.4.0_eslint_8.55.0", + "@types/json-schema": "7.0.15", + "@types/semver": "7.5.6", + "@typescript-eslint/scope-manager": "6.13.2", + "@typescript-eslint/types": "6.13.2", + "@typescript-eslint/typescript-estree": "6.13.2_typescript_5.3.3", + "eslint": "8.55.0", + "semver": "7.5.4" + }, + "transitive_closure": { + "@typescript-eslint/utils": [ + "6.13.2_1796040679" + ], + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "@types/json-schema": [ + "7.0.15" + ], + "@types/semver": [ + "7.5.6" + ], + "@typescript-eslint/scope-manager": [ + "6.13.2" + ], + 
"@typescript-eslint/types": [ + "6.13.2" + ], + "@typescript-eslint/typescript-estree": [ + "6.13.2_typescript_5.3.3" + ], + "eslint": [ + "8.55.0" + ], + "semver": [ + "7.5.4" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + "7.0.3" + ], + "debug": [ + "4.3.4" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "5.1.2", + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "natural-compare": [ + "1.4.0" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "ms": [ + "2.1.2" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + 
"1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ], + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "globby": [ + "11.1.0" + ], + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "typescript": [ + "5.3.3" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_typescript-eslint_visitor-keys__6.13.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@typescript-eslint/visitor-keys", + "version": "6.13.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-OGznFs0eAQXJsp+xSd6k/O1UbFi/K/L7WjqeRoFE7vadjAF9y0uppXhYNQNEqygjou782maGClOoZwPqF0Drlw==", + "url": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-6.13.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_typescript-eslint_visitor-keys__6.13.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@typescript-eslint/visitor-keys", + "version": "6.13.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@typescript-eslint/types": "6.13.2", + "eslint-visitor-keys": "3.4.3" + }, + "transitive_closure": { + "@typescript-eslint/visitor-keys": [ + "6.13.2" + ], + "@typescript-eslint/types": [ + "6.13.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__at_ungap_structured-clone__1.2.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "@ungap/structured-clone", + "version": "1.2.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==", + "url": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.2.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__at_ungap_structured-clone__1.2.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "@ungap/structured-clone", + "version": "1.2.0", + "dev": true, + "root_package": "", + 
"link_packages": {}, + "deps": {}, + "transitive_closure": { + "@ungap/structured-clone": [ + "1.2.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__acorn-jsx__5.3.2__acorn_8.11.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "acorn-jsx", + "version": "5.3.2_acorn_8.11.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", + "url": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__acorn-jsx__5.3.2__acorn_8.11.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "acorn-jsx", + "version": "5.3.2_acorn_8.11.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "acorn": "8.11.2" + }, + "transitive_closure": { + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "acorn": [ + "8.11.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__acorn__8.11.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "acorn", + "version": "8.11.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-nc0Axzp/0FILLEVsm4fNwLCwMttvhEI263QtVPQcbpfZZ3ts0hLsZGOpE6czNlid7CJ9MlyH8reXkpsf3YUY4w==", + "url": "https://registry.npmjs.org/acorn/-/acorn-8.11.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__acorn__8.11.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "acorn", + "version": "8.11.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "acorn": [ + "8.11.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ajv__6.12.6": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ajv", + "version": "6.12.6", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "url": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "commit": "", + "patch_args": [], + "patches": [], + 
"custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__ajv__6.12.6__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ajv", + "version": "6.12.6", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "fast-deep-equal": "3.1.3", + "fast-json-stable-stringify": "2.1.0", + "json-schema-traverse": "0.4.1", + "uri-js": "4.4.1" + }, + "transitive_closure": { + "ajv": [ + "6.12.6" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ansi-regex__5.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ansi-regex", + "version": "5.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "url": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__ansi-regex__5.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ansi-regex", + "version": "5.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "ansi-regex": [ + "5.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ansi-styles__4.3.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ansi-styles", + "version": "4.3.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "url": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__ansi-styles__4.3.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ansi-styles", + "version": "4.3.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "color-convert": "2.0.1" + }, + "transitive_closure": { + "ansi-styles": [ + "4.3.0" + ], + "color-convert": [ + 
"2.0.1" + ], + "color-name": [ + "1.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__argparse__2.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "argparse", + "version": "2.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "url": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__argparse__2.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "argparse", + "version": "2.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "argparse": [ + "2.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__array-union__2.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "array-union", + "version": "2.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "url": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__array-union__2.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "array-union", + "version": "2.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "array-union": [ + "2.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__balanced-match__1.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "balanced-match", + "version": "1.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "url": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", 
+ "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__balanced-match__1.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "balanced-match", + "version": "1.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "balanced-match": [ + "1.0.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__brace-expansion__1.1.11": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "brace-expansion", + "version": "1.1.11", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "url": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__brace-expansion__1.1.11__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "brace-expansion", + "version": "1.1.11", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "balanced-match": "1.0.2", + "concat-map": "0.0.1" + }, + "transitive_closure": { + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__braces__3.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "braces", + "version": "3.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "url": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__braces__3.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "braces", + "version": "3.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "fill-range": "7.0.1" + }, + "transitive_closure": { + "braces": [ + "3.0.2" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + 
"//visibility:public" + ] + } + }, + "npm__callsites__3.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "callsites", + "version": "3.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "url": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__callsites__3.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "callsites", + "version": "3.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "callsites": [ + "3.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__chalk__4.1.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "chalk", + "version": "4.1.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "url": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__chalk__4.1.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "chalk", + "version": "4.1.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "ansi-styles": "4.3.0", + "supports-color": "7.2.0" + }, + "transitive_closure": { + "chalk": [ + "4.1.2" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__color-convert__2.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "color-convert", + "version": "2.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "url": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": 
"", + "generate_bzl_library_targets": false + } + }, + "npm__color-convert__2.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "color-convert", + "version": "2.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "color-name": "1.1.4" + }, + "transitive_closure": { + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__color-name__1.1.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "color-name", + "version": "1.1.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "url": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__color-name__1.1.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "color-name", + "version": "1.1.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "color-name": [ + "1.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__concat-map__0.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "concat-map", + "version": "0.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "url": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__concat-map__0.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "concat-map", + "version": "0.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "concat-map": [ + "0.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__cross-spawn__7.0.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "cross-spawn", + "version": "7.0.3", + "root_package": "", + 
"link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "url": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__cross-spawn__7.0.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "cross-spawn", + "version": "7.0.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "path-key": "3.1.1", + "shebang-command": "2.0.0", + "which": "2.0.2" + }, + "transitive_closure": { + "cross-spawn": [ + "7.0.3" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__debug__4.3.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "debug", + "version": "4.3.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "url": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__debug__4.3.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "debug", + "version": "4.3.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "ms": "2.1.2" + }, + "transitive_closure": { + "debug": [ + "4.3.4" + ], + "ms": [ + "2.1.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__deep-is__0.1.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "deep-is", + "version": "0.1.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", + "url": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__deep-is__0.1.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + 
"package": "deep-is", + "version": "0.1.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "deep-is": [ + "0.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__dir-glob__3.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "dir-glob", + "version": "3.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "url": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__dir-glob__3.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "dir-glob", + "version": "3.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "path-type": "4.0.0" + }, + "transitive_closure": { + "dir-glob": [ + "3.0.1" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__doctrine__3.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "doctrine", + "version": "3.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-yS+Q5i3hBf7GBkd4KG8a7eBNNWNGLTaEwwYWUijIYM7zrlYDM0BFXHjjPWlWZ1Rg7UaddZeIDmi9jF3HmqiQ2w==", + "url": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__doctrine__3.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "doctrine", + "version": "3.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "esutils": "2.0.3" + }, + "transitive_closure": { + "doctrine": [ + "3.0.0" + ], + "esutils": [ + "2.0.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__esbuild__0.19.8": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "esbuild", + "version": "0.19.8", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "esbuild" + ] + }, + "integrity": "sha512-l7iffQpT2OrZfH2rXIp7/FkmaeZM0vxbxN9KfiCwGYuZqzMg/JdvX26R31Zxn/Pxvsrg3Y9N6XTcnknqDyyv4w==", + "url": 
"https://registry.npmjs.org/esbuild/-/esbuild-0.19.8.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [ + "preinstall", + "install", + "postinstall" + ], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__esbuild__0.19.8__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "esbuild", + "version": "0.19.8", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "esbuild" + ] + }, + "deps": { + "@esbuild/android-arm": "0.19.8", + "@esbuild/android-arm64": "0.19.8", + "@esbuild/android-x64": "0.19.8", + "@esbuild/darwin-arm64": "0.19.8", + "@esbuild/darwin-x64": "0.19.8", + "@esbuild/freebsd-arm64": "0.19.8", + "@esbuild/freebsd-x64": "0.19.8", + "@esbuild/linux-arm": "0.19.8", + "@esbuild/linux-arm64": "0.19.8", + "@esbuild/linux-ia32": "0.19.8", + "@esbuild/linux-loong64": "0.19.8", + "@esbuild/linux-mips64el": "0.19.8", + "@esbuild/linux-ppc64": "0.19.8", + "@esbuild/linux-riscv64": "0.19.8", + "@esbuild/linux-s390x": "0.19.8", + "@esbuild/linux-x64": "0.19.8", + "@esbuild/netbsd-x64": "0.19.8", + "@esbuild/openbsd-x64": "0.19.8", + "@esbuild/sunos-x64": "0.19.8", + "@esbuild/win32-arm64": "0.19.8", + "@esbuild/win32-ia32": "0.19.8", + "@esbuild/win32-x64": "0.19.8" + }, + "transitive_closure": { + "esbuild": [ + "0.19.8" + ], + "@esbuild/android-arm": [ + "0.19.8" + ], + "@esbuild/android-arm64": [ + "0.19.8" + ], + "@esbuild/android-x64": [ + "0.19.8" + ], + "@esbuild/darwin-arm64": [ + "0.19.8" + ], + "@esbuild/darwin-x64": [ + "0.19.8" + ], + "@esbuild/freebsd-arm64": [ + "0.19.8" + ], + "@esbuild/freebsd-x64": [ + "0.19.8" + ], + "@esbuild/linux-arm": [ + "0.19.8" + ], + "@esbuild/linux-arm64": [ + "0.19.8" + ], + "@esbuild/linux-ia32": [ + "0.19.8" + ], + "@esbuild/linux-loong64": [ + "0.19.8" + ], + "@esbuild/linux-mips64el": [ + "0.19.8" + ], + "@esbuild/linux-ppc64": [ + "0.19.8" + ], + "@esbuild/linux-riscv64": [ + "0.19.8" + ], + "@esbuild/linux-s390x": [ + "0.19.8" + ], + "@esbuild/linux-x64": [ + "0.19.8" + ], + "@esbuild/netbsd-x64": [ + "0.19.8" + ], + "@esbuild/openbsd-x64": [ + "0.19.8" + ], + "@esbuild/sunos-x64": [ + "0.19.8" + ], + "@esbuild/win32-arm64": [ + "0.19.8" + ], + "@esbuild/win32-ia32": [ + "0.19.8" + ], + "@esbuild/win32-x64": [ + "0.19.8" + ] + }, + "lifecycle_build_target": true, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [ + "no-sandbox" + ], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__escape-string-regexp__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "escape-string-regexp", + "version": "4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "url": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + 
"npm__escape-string-regexp__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "escape-string-regexp", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "escape-string-regexp": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__eslint-scope__7.2.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "eslint-scope", + "version": "7.2.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-dOt21O7lTMhDM+X9mB4GX+DZrZtCUJPL/wlcTqxyrx5IvO0IYtILdtrQGQp+8n5S0gwSVmOf9NQrjMOgfQZlIg==", + "url": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-7.2.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__eslint-scope__7.2.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "eslint-scope", + "version": "7.2.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "esrecurse": "4.3.0", + "estraverse": "5.3.0" + }, + "transitive_closure": { + "eslint-scope": [ + "7.2.2" + ], + "esrecurse": [ + "4.3.0" + ], + "estraverse": [ + "5.3.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__eslint-visitor-keys__3.4.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "eslint-visitor-keys", + "version": "3.4.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", + "url": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__eslint-visitor-keys__3.4.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "eslint-visitor-keys", + "version": "3.4.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "eslint-visitor-keys": [ + "3.4.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__eslint__8.55.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + 
"attributes": { + "package": "eslint", + "version": "8.55.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "eslint" + ] + }, + "integrity": "sha512-iyUUAM0PCKj5QpwGfmCAG9XXbZCWsqP/eWAWrG/W0umvjuLRBECwSFdt+rCntju0xEH7teIABPwXpahftIaTdA==", + "url": "https://registry.npmjs.org/eslint/-/eslint-8.55.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__eslint__8.55.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "eslint", + "version": "8.55.0", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "eslint" + ] + }, + "deps": { + "@eslint-community/eslint-utils": "4.4.0_eslint_8.55.0", + "@eslint-community/regexpp": "4.10.0", + "@eslint/eslintrc": "2.1.4", + "@eslint/js": "8.55.0", + "@humanwhocodes/config-array": "0.11.13", + "@humanwhocodes/module-importer": "1.0.1", + "@nodelib/fs.walk": "1.2.8", + "@ungap/structured-clone": "1.2.0", + "ajv": "6.12.6", + "chalk": "4.1.2", + "cross-spawn": "7.0.3", + "debug": "4.3.4", + "doctrine": "3.0.0", + "escape-string-regexp": "4.0.0", + "eslint-scope": "7.2.2", + "eslint-visitor-keys": "3.4.3", + "espree": "9.6.1", + "esquery": "1.5.0", + "esutils": "2.0.3", + "fast-deep-equal": "3.1.3", + "file-entry-cache": "6.0.1", + "find-up": "5.0.0", + "glob-parent": "6.0.2", + "globals": "13.23.0", + "graphemer": "1.4.0", + "ignore": "5.3.0", + "imurmurhash": "0.1.4", + "is-glob": "4.0.3", + "is-path-inside": "3.0.3", + "js-yaml": "4.1.0", + "json-stable-stringify-without-jsonify": "1.0.1", + "levn": "0.4.1", + "lodash.merge": "4.6.2", + "minimatch": "3.1.2", + "natural-compare": "1.4.0", + "optionator": "0.9.3", + "strip-ansi": "6.0.1", + "text-table": "0.2.0" + }, + "transitive_closure": { + "eslint": [ + "8.55.0" + ], + "@eslint-community/eslint-utils": [ + "4.4.0_eslint_8.55.0" + ], + "@eslint-community/regexpp": [ + "4.10.0" + ], + "@eslint/eslintrc": [ + "2.1.4" + ], + "@eslint/js": [ + "8.55.0" + ], + "@humanwhocodes/config-array": [ + "0.11.13" + ], + "@humanwhocodes/module-importer": [ + "1.0.1" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "@ungap/structured-clone": [ + "1.2.0" + ], + "ajv": [ + "6.12.6" + ], + "chalk": [ + "4.1.2" + ], + "cross-spawn": [ + "7.0.3" + ], + "debug": [ + "4.3.4" + ], + "doctrine": [ + "3.0.0" + ], + "escape-string-regexp": [ + "4.0.0" + ], + "eslint-scope": [ + "7.2.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ], + "espree": [ + "9.6.1" + ], + "esquery": [ + "1.5.0" + ], + "esutils": [ + "2.0.3" + ], + "fast-deep-equal": [ + "3.1.3" + ], + "file-entry-cache": [ + "6.0.1" + ], + "find-up": [ + "5.0.0" + ], + "glob-parent": [ + "6.0.2" + ], + "globals": [ + "13.23.0" + ], + "graphemer": [ + "1.4.0" + ], + "ignore": [ + "5.3.0" + ], + "imurmurhash": [ + "0.1.4" + ], + "is-glob": [ + "4.0.3" + ], + "is-path-inside": [ + "3.0.3" + ], + "js-yaml": [ + "4.1.0" + ], + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ], + "levn": [ + "0.4.1" + ], + "lodash.merge": [ + "4.6.2" + ], + "minimatch": [ + "3.1.2" + ], + "natural-compare": [ + "1.4.0" + ], + "optionator": [ + "0.9.3" + ], + "strip-ansi": [ + "6.0.1" + ], + "text-table": [ + "0.2.0" + ], + "ansi-regex": [ + "5.0.1" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + 
"deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "argparse": [ + "2.0.1" + ], + "is-extglob": [ + "2.1.1" + ], + "type-fest": [ + "0.20.2" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ], + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "json-buffer": [ + "3.0.1" + ], + "estraverse": [ + "5.3.0" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "esrecurse": [ + "4.3.0" + ], + "ms": [ + "2.1.2" + ], + "path-key": [ + "3.1.1" + ], + "shebang-command": [ + "2.0.0" + ], + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ], + "ansi-styles": [ + "4.3.0" + ], + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ], + "color-convert": [ + "2.0.1" + ], + "color-name": [ + "1.1.4" + ], + "fast-json-stable-stringify": [ + "2.1.0" + ], + "json-schema-traverse": [ + "0.4.1" + ], + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "@humanwhocodes/object-schema": [ + "2.0.1" + ], + "import-fresh": [ + "3.3.0" + ], + "strip-json-comments": [ + "3.1.1" + ], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__espree__9.6.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "espree", + "version": "9.6.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-oruZaFkjorTpF32kDSI5/75ViwGeZginGGy2NoOSg3Q9bnwlnmDm4HLnkl0RE3n+njDXR037aY1+x58Z/zFdwQ==", + "url": "https://registry.npmjs.org/espree/-/espree-9.6.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__espree__9.6.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "espree", + "version": "9.6.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "acorn": "8.11.2", + "acorn-jsx": "5.3.2_acorn_8.11.2", + "eslint-visitor-keys": "3.4.3" + }, + "transitive_closure": { + "espree": [ + "9.6.1" + ], + "acorn": [ + "8.11.2" + ], + "acorn-jsx": [ + "5.3.2_acorn_8.11.2" + ], + "eslint-visitor-keys": [ + "3.4.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + 
"lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__esquery__1.5.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "esquery", + "version": "1.5.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-YQLXUplAwJgCydQ78IMJywZCceoqk1oH01OERdSAJc/7U2AylwjhSCLDEtqwg811idIS/9fIU5GjG73IgjKMVg==", + "url": "https://registry.npmjs.org/esquery/-/esquery-1.5.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__esquery__1.5.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "esquery", + "version": "1.5.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "estraverse": "5.3.0" + }, + "transitive_closure": { + "esquery": [ + "1.5.0" + ], + "estraverse": [ + "5.3.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__esrecurse__4.3.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "esrecurse", + "version": "4.3.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", + "url": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__esrecurse__4.3.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "esrecurse", + "version": "4.3.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "estraverse": "5.3.0" + }, + "transitive_closure": { + "esrecurse": [ + "4.3.0" + ], + "estraverse": [ + "5.3.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__estraverse__5.3.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "estraverse", + "version": "5.3.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "url": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + 
"extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__estraverse__5.3.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "estraverse", + "version": "5.3.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "estraverse": [ + "5.3.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__esutils__2.0.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "esutils", + "version": "2.0.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", + "url": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__esutils__2.0.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "esutils", + "version": "2.0.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "esutils": [ + "2.0.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fast-deep-equal__3.1.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fast-deep-equal", + "version": "3.1.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", + "url": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fast-deep-equal__3.1.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fast-deep-equal", + "version": "3.1.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "fast-deep-equal": [ + "3.1.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fast-glob__3.3.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fast-glob", + "version": "3.3.2", + "root_package": "", + "link_workspace": "flatbuffers~", + 
"link_packages": {}, + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "url": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fast-glob__3.3.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fast-glob", + "version": "3.3.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@nodelib/fs.stat": "2.0.5", + "@nodelib/fs.walk": "1.2.8", + "glob-parent": "5.1.2", + "merge2": "1.4.1", + "micromatch": "4.0.5" + }, + "transitive_closure": { + "fast-glob": [ + "3.3.2" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "glob-parent": [ + "5.1.2" + ], + "merge2": [ + "1.4.1" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "is-glob": [ + "4.0.3" + ], + "is-extglob": [ + "2.1.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fast-json-stable-stringify__2.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fast-json-stable-stringify", + "version": "2.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", + "url": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fast-json-stable-stringify__2.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fast-json-stable-stringify", + "version": "2.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "fast-json-stable-stringify": [ + "2.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fast-levenshtein__2.0.6": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fast-levenshtein", + "version": "2.0.6", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": 
"sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", + "url": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fast-levenshtein__2.0.6__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fast-levenshtein", + "version": "2.0.6", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "fast-levenshtein": [ + "2.0.6" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fastq__1.15.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fastq", + "version": "1.15.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==", + "url": "https://registry.npmjs.org/fastq/-/fastq-1.15.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fastq__1.15.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fastq", + "version": "1.15.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "reusify": "1.0.4" + }, + "transitive_closure": { + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__file-entry-cache__6.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "file-entry-cache", + "version": "6.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-7Gps/XWymbLk2QLYK4NzpMOrYjMhdIxXuIvy2QBsLE6ljuodKvdkWs/cpyJJ3CVIVpH0Oi1Hvg1ovbMzLdFBBg==", + "url": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__file-entry-cache__6.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "file-entry-cache", + "version": "6.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "flat-cache": "3.2.0" + }, + "transitive_closure": { + "file-entry-cache": [ + "6.0.1" + ], + 
"flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "minimatch": [ + "3.1.2" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "json-buffer": [ + "3.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fill-range__7.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fill-range", + "version": "7.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "url": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fill-range__7.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fill-range", + "version": "7.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "to-regex-range": "5.0.1" + }, + "transitive_closure": { + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__find-up__5.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "find-up", + "version": "5.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "url": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__find-up__5.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "find-up", + "version": "5.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "locate-path": "6.0.0", + "path-exists": "4.0.0" + }, + "transitive_closure": { + "find-up": [ + "5.0.0" + ], + "locate-path": [ + "6.0.0" + ], + "path-exists": [ + "4.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + 
"bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__flat-cache__3.2.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "flat-cache", + "version": "3.2.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-CYcENa+FtcUKLmhhqyctpclsq7QF38pKjZHsGNiSQF5r4FtoKDWabFDl3hzaEQMvT1LHEysw5twgLvpYYb4vbw==", + "url": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.2.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__flat-cache__3.2.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "flat-cache", + "version": "3.2.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "flatted": "3.2.9", + "keyv": "4.5.4", + "rimraf": "3.0.2" + }, + "transitive_closure": { + "flat-cache": [ + "3.2.0" + ], + "flatted": [ + "3.2.9" + ], + "keyv": [ + "4.5.4" + ], + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "minimatch": [ + "3.1.2" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ], + "json-buffer": [ + "3.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__flatted__3.2.9": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "flatted", + "version": "3.2.9", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-36yxDn5H7OFZQla0/jFJmbIKTdZAQHngCedGxiMmpNfEZM0sdEeT+WczLQrjK6D7o2aiyLYDnkw0R3JK0Qv1RQ==", + "url": "https://registry.npmjs.org/flatted/-/flatted-3.2.9.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__flatted__3.2.9__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "flatted", + "version": "3.2.9", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "flatted": [ + "3.2.9" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__fs.realpath__1.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "fs.realpath", + "version": "1.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + 
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "url": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__fs.realpath__1.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "fs.realpath", + "version": "1.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "fs.realpath": [ + "1.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__glob-parent__5.1.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "glob-parent", + "version": "5.1.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "url": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__glob-parent__5.1.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "glob-parent", + "version": "5.1.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "is-glob": "4.0.3" + }, + "transitive_closure": { + "glob-parent": [ + "5.1.2" + ], + "is-glob": [ + "4.0.3" + ], + "is-extglob": [ + "2.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__glob-parent__6.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "glob-parent", + "version": "6.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "url": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__glob-parent__6.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "glob-parent", + "version": "6.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "is-glob": "4.0.3" + }, + "transitive_closure": { + "glob-parent": [ + 
"6.0.2" + ], + "is-glob": [ + "4.0.3" + ], + "is-extglob": [ + "2.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__glob__7.2.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "glob", + "version": "7.2.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "url": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__glob__7.2.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "glob", + "version": "7.2.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "fs.realpath": "1.0.0", + "inflight": "1.0.6", + "inherits": "2.0.4", + "minimatch": "3.1.2", + "once": "1.4.0", + "path-is-absolute": "1.0.1" + }, + "transitive_closure": { + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "minimatch": [ + "3.1.2" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__globals__13.23.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "globals", + "version": "13.23.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-XAmF0RjlrjY23MA51q3HltdlGxUpXPvg0GioKiD9X6HD28iMjo2dKC8Vqwm7lne4GNr78+RHTfliktR6ZH09wA==", + "url": "https://registry.npmjs.org/globals/-/globals-13.23.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__globals__13.23.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "globals", + "version": "13.23.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "type-fest": "0.20.2" + }, + "transitive_closure": { + "globals": [ + "13.23.0" + ], + "type-fest": [ + "0.20.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__globby__11.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + 
"attributes": { + "package": "globby", + "version": "11.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", + "url": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__globby__11.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "globby", + "version": "11.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "array-union": "2.1.0", + "dir-glob": "3.0.1", + "fast-glob": "3.3.2", + "ignore": "5.3.0", + "merge2": "1.4.1", + "slash": "3.0.0" + }, + "transitive_closure": { + "globby": [ + "11.1.0" + ], + "array-union": [ + "2.1.0" + ], + "dir-glob": [ + "3.0.1" + ], + "fast-glob": [ + "3.3.2" + ], + "ignore": [ + "5.3.0" + ], + "merge2": [ + "1.4.1" + ], + "slash": [ + "3.0.0" + ], + "@nodelib/fs.stat": [ + "2.0.5" + ], + "@nodelib/fs.walk": [ + "1.2.8" + ], + "glob-parent": [ + "5.1.2" + ], + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ], + "is-glob": [ + "4.0.3" + ], + "is-extglob": [ + "2.1.1" + ], + "@nodelib/fs.scandir": [ + "2.1.5" + ], + "fastq": [ + "1.15.0" + ], + "reusify": [ + "1.0.4" + ], + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ], + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__graphemer__1.4.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "graphemer", + "version": "1.4.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", + "url": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__graphemer__1.4.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "graphemer", + "version": "1.4.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "graphemer": [ + "1.4.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__has-flag__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "has-flag", + "version": 
"4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "url": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__has-flag__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "has-flag", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "has-flag": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ignore__5.3.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ignore", + "version": "5.3.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-g7dmpshy+gD7mh88OC9NwSGTKoc3kyLAZQRU1mt53Aw/vnvfXnbC+F/7F7QoYVKbV+KNvJx8wArewKy1vXMtlg==", + "url": "https://registry.npmjs.org/ignore/-/ignore-5.3.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__ignore__5.3.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ignore", + "version": "5.3.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "ignore": [ + "5.3.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__import-fresh__3.3.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "import-fresh", + "version": "3.3.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==", + "url": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__import-fresh__3.3.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "import-fresh", + "version": "3.3.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "parent-module": "1.0.1", + "resolve-from": "4.0.0" + }, + "transitive_closure": { + "import-fresh": [ + "3.3.0" + 
], + "parent-module": [ + "1.0.1" + ], + "resolve-from": [ + "4.0.0" + ], + "callsites": [ + "3.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__imurmurhash__0.1.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "imurmurhash", + "version": "0.1.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", + "url": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__imurmurhash__0.1.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "imurmurhash", + "version": "0.1.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "imurmurhash": [ + "0.1.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__inflight__1.0.6": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "inflight", + "version": "1.0.6", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "url": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__inflight__1.0.6__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "inflight", + "version": "1.0.6", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "once": "1.4.0", + "wrappy": "1.0.2" + }, + "transitive_closure": { + "inflight": [ + "1.0.6" + ], + "once": [ + "1.4.0" + ], + "wrappy": [ + "1.0.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__inherits__2.0.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "inherits", + "version": "2.0.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "url": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "commit": "", + "patch_args": [], + 
"patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__inherits__2.0.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "inherits", + "version": "2.0.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "inherits": [ + "2.0.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__is-extglob__2.1.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "is-extglob", + "version": "2.1.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "url": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__is-extglob__2.1.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "is-extglob", + "version": "2.1.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "is-extglob": [ + "2.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__is-glob__4.0.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "is-glob", + "version": "4.0.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "url": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__is-glob__4.0.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "is-glob", + "version": "4.0.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "is-extglob": "2.1.1" + }, + "transitive_closure": { + "is-glob": [ + "4.0.3" + ], + "is-extglob": [ + "2.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__is-number__7.0.0": { + "bzlFile": 
"@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "is-number", + "version": "7.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "url": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__is-number__7.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "is-number", + "version": "7.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "is-number": [ + "7.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__is-path-inside__3.0.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "is-path-inside", + "version": "3.0.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ==", + "url": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-3.0.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__is-path-inside__3.0.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "is-path-inside", + "version": "3.0.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "is-path-inside": [ + "3.0.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__isexe__2.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "isexe", + "version": "2.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", + "url": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__isexe__2.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "isexe", + "version": "2.0.0", + "dev": true, + 
"root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "isexe": [ + "2.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__js-yaml__4.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "js-yaml", + "version": "4.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "url": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__js-yaml__4.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "js-yaml", + "version": "4.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "argparse": "2.0.1" + }, + "transitive_closure": { + "js-yaml": [ + "4.1.0" + ], + "argparse": [ + "2.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__json-buffer__3.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "json-buffer", + "version": "3.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", + "url": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__json-buffer__3.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "json-buffer", + "version": "3.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "json-buffer": [ + "3.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__json-schema-traverse__0.4.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "json-schema-traverse", + "version": "0.4.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "url": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "commit": "", + "patch_args": [], + 
"patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__json-schema-traverse__0.4.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "json-schema-traverse", + "version": "0.4.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "json-schema-traverse": [ + "0.4.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__json-stable-stringify-without-jsonify__1.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "json-stable-stringify-without-jsonify", + "version": "1.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", + "url": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__json-stable-stringify-without-jsonify__1.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "json-stable-stringify-without-jsonify", + "version": "1.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "json-stable-stringify-without-jsonify": [ + "1.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__keyv__4.5.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "keyv", + "version": "4.5.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", + "url": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__keyv__4.5.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "keyv", + "version": "4.5.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "json-buffer": "3.0.1" + }, + "transitive_closure": { + "keyv": [ + "4.5.4" + ], + "json-buffer": [ + "3.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + 
"lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__levn__0.4.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "levn", + "version": "0.4.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", + "url": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__levn__0.4.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "levn", + "version": "0.4.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "prelude-ls": "1.2.1", + "type-check": "0.4.0" + }, + "transitive_closure": { + "levn": [ + "0.4.1" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__locate-path__6.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "locate-path", + "version": "6.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "url": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__locate-path__6.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "locate-path", + "version": "6.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "p-locate": "5.0.0" + }, + "transitive_closure": { + "locate-path": [ + "6.0.0" + ], + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__lodash.merge__4.6.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "lodash.merge", + "version": "4.6.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", + "url": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + 
"npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__lodash.merge__4.6.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "lodash.merge", + "version": "4.6.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "lodash.merge": [ + "4.6.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__lru-cache__6.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "lru-cache", + "version": "6.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "url": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__lru-cache__6.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "lru-cache", + "version": "6.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "yallist": "4.0.0" + }, + "transitive_closure": { + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__merge2__1.4.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "merge2", + "version": "1.4.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "url": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__merge2__1.4.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "merge2", + "version": "1.4.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "merge2": [ + "1.4.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__micromatch__4.0.5": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + 
"attributes": { + "package": "micromatch", + "version": "4.0.5", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA==", + "url": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__micromatch__4.0.5__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "micromatch", + "version": "4.0.5", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "braces": "3.0.2", + "picomatch": "2.3.1" + }, + "transitive_closure": { + "micromatch": [ + "4.0.5" + ], + "braces": [ + "3.0.2" + ], + "picomatch": [ + "2.3.1" + ], + "fill-range": [ + "7.0.1" + ], + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__minimatch__3.1.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "minimatch", + "version": "3.1.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "url": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__minimatch__3.1.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "minimatch", + "version": "3.1.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "brace-expansion": "1.1.11" + }, + "transitive_closure": { + "minimatch": [ + "3.1.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ms__2.1.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ms", + "version": "2.1.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", + "url": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, 
+ "npm__ms__2.1.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ms", + "version": "2.1.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "ms": [ + "2.1.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__natural-compare__1.4.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "natural-compare", + "version": "1.4.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", + "url": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__natural-compare__1.4.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "natural-compare", + "version": "1.4.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "natural-compare": [ + "1.4.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__once__1.4.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "once", + "version": "1.4.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "url": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__once__1.4.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "once", + "version": "1.4.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "wrappy": "1.0.2" + }, + "transitive_closure": { + "once": [ + "1.4.0" + ], + "wrappy": [ + "1.0.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__optionator__0.9.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "optionator", + "version": "0.9.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": 
"sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==", + "url": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__optionator__0.9.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "optionator", + "version": "0.9.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "@aashutoshrathi/word-wrap": "1.2.6", + "deep-is": "0.1.4", + "fast-levenshtein": "2.0.6", + "levn": "0.4.1", + "prelude-ls": "1.2.1", + "type-check": "0.4.0" + }, + "transitive_closure": { + "optionator": [ + "0.9.3" + ], + "@aashutoshrathi/word-wrap": [ + "1.2.6" + ], + "deep-is": [ + "0.1.4" + ], + "fast-levenshtein": [ + "2.0.6" + ], + "levn": [ + "0.4.1" + ], + "prelude-ls": [ + "1.2.1" + ], + "type-check": [ + "0.4.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__p-limit__3.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "p-limit", + "version": "3.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "url": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__p-limit__3.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "p-limit", + "version": "3.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "yocto-queue": "0.1.0" + }, + "transitive_closure": { + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__p-locate__5.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "p-locate", + "version": "5.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "url": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__p-locate__5.0.0__links": { + "bzlFile": 
"@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "p-locate", + "version": "5.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "p-limit": "3.1.0" + }, + "transitive_closure": { + "p-locate": [ + "5.0.0" + ], + "p-limit": [ + "3.1.0" + ], + "yocto-queue": [ + "0.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__parent-module__1.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "parent-module", + "version": "1.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "url": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__parent-module__1.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "parent-module", + "version": "1.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "callsites": "3.1.0" + }, + "transitive_closure": { + "parent-module": [ + "1.0.1" + ], + "callsites": [ + "3.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__path-exists__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "path-exists", + "version": "4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", + "url": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__path-exists__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "path-exists", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "path-exists": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__path-is-absolute__1.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "path-is-absolute", + "version": "1.0.1", + "root_package": "", + 
"link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "url": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__path-is-absolute__1.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "path-is-absolute", + "version": "1.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "path-is-absolute": [ + "1.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__path-key__3.1.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "path-key", + "version": "3.1.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "url": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__path-key__3.1.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "path-key", + "version": "3.1.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "path-key": [ + "3.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__path-type__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "path-type", + "version": "4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", + "url": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__path-type__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "path-type", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "path-type": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + 
"lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__picomatch__2.3.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "picomatch", + "version": "2.3.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "url": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__picomatch__2.3.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "picomatch", + "version": "2.3.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "picomatch": [ + "2.3.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__prelude-ls__1.2.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "prelude-ls", + "version": "1.2.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", + "url": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__prelude-ls__1.2.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "prelude-ls", + "version": "1.2.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "prelude-ls": [ + "1.2.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__punycode__2.3.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "punycode", + "version": "2.3.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "url": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + 
"npm__punycode__2.3.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "punycode", + "version": "2.3.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "punycode": [ + "2.3.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__queue-microtask__1.2.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "queue-microtask", + "version": "1.2.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "url": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__queue-microtask__1.2.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "queue-microtask", + "version": "1.2.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "queue-microtask": [ + "1.2.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__resolve-from__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "resolve-from", + "version": "4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "url": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__resolve-from__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "resolve-from", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "resolve-from": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__reusify__1.0.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "reusify", + "version": "1.0.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": 
"sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "url": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__reusify__1.0.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "reusify", + "version": "1.0.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "reusify": [ + "1.0.4" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__rimraf__3.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "rimraf", + "version": "3.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "url": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__rimraf__3.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "rimraf", + "version": "3.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "glob": "7.2.3" + }, + "transitive_closure": { + "rimraf": [ + "3.0.2" + ], + "glob": [ + "7.2.3" + ], + "fs.realpath": [ + "1.0.0" + ], + "inflight": [ + "1.0.6" + ], + "inherits": [ + "2.0.4" + ], + "minimatch": [ + "3.1.2" + ], + "once": [ + "1.4.0" + ], + "path-is-absolute": [ + "1.0.1" + ], + "wrappy": [ + "1.0.2" + ], + "brace-expansion": [ + "1.1.11" + ], + "balanced-match": [ + "1.0.2" + ], + "concat-map": [ + "0.0.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__run-parallel__1.2.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "run-parallel", + "version": "1.2.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "url": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__run-parallel__1.2.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": 
"npm_import_links", + "attributes": { + "package": "run-parallel", + "version": "1.2.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "queue-microtask": "1.2.3" + }, + "transitive_closure": { + "run-parallel": [ + "1.2.0" + ], + "queue-microtask": [ + "1.2.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__semver__7.5.4": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "semver", + "version": "7.5.4", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", + "url": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__semver__7.5.4__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "semver", + "version": "7.5.4", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "lru-cache": "6.0.0" + }, + "transitive_closure": { + "semver": [ + "7.5.4" + ], + "lru-cache": [ + "6.0.0" + ], + "yallist": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__shebang-command__2.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "shebang-command", + "version": "2.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "url": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__shebang-command__2.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "shebang-command", + "version": "2.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "shebang-regex": "3.0.0" + }, + "transitive_closure": { + "shebang-command": [ + "2.0.0" + ], + "shebang-regex": [ + "3.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__shebang-regex__3.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "shebang-regex", + "version": "3.0.0", + "root_package": "", + 
"link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "url": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__shebang-regex__3.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "shebang-regex", + "version": "3.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "shebang-regex": [ + "3.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__slash__3.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "slash", + "version": "3.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "url": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__slash__3.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "slash", + "version": "3.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "slash": [ + "3.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__strip-ansi__6.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "strip-ansi", + "version": "6.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "url": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__strip-ansi__6.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "strip-ansi", + "version": "6.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "ansi-regex": "5.0.1" + }, + "transitive_closure": { + "strip-ansi": [ + "6.0.1" + ], + "ansi-regex": [ + "5.0.1" + ] + }, + 
"lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__strip-json-comments__3.1.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "strip-json-comments", + "version": "3.1.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "url": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__strip-json-comments__3.1.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "strip-json-comments", + "version": "3.1.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "strip-json-comments": [ + "3.1.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__supports-color__7.2.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "supports-color", + "version": "7.2.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "url": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__supports-color__7.2.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "supports-color", + "version": "7.2.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "has-flag": "4.0.0" + }, + "transitive_closure": { + "supports-color": [ + "7.2.0" + ], + "has-flag": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__text-table__0.2.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "text-table", + "version": "0.2.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==", + "url": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + 
"npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__text-table__0.2.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "text-table", + "version": "0.2.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "text-table": [ + "0.2.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__to-regex-range__5.0.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "to-regex-range", + "version": "5.0.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "url": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__to-regex-range__5.0.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "to-regex-range", + "version": "5.0.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "is-number": "7.0.0" + }, + "transitive_closure": { + "to-regex-range": [ + "5.0.1" + ], + "is-number": [ + "7.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__ts-api-utils__1.0.3__typescript_5.3.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "ts-api-utils", + "version": "1.0.3_typescript_5.3.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-wNMeqtMz5NtwpT/UZGY5alT+VoKdSsOOP/kqHFcUW1P/VRhH2wJ48+DN2WwUliNbQ976ETwDL0Ifd2VVvgonvg==", + "url": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-1.0.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__ts-api-utils__1.0.3__typescript_5.3.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "ts-api-utils", + "version": "1.0.3_typescript_5.3.3", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "typescript": "5.3.3" + }, + "transitive_closure": { + "ts-api-utils": [ + "1.0.3_typescript_5.3.3" + ], + "typescript": [ + "5.3.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + 
"npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__type-check__0.4.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "type-check", + "version": "0.4.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", + "url": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__type-check__0.4.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "type-check", + "version": "0.4.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "prelude-ls": "1.2.1" + }, + "transitive_closure": { + "type-check": [ + "0.4.0" + ], + "prelude-ls": [ + "1.2.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__type-fest__0.20.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "type-fest", + "version": "0.20.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "url": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__type-fest__0.20.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "type-fest", + "version": "0.20.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "type-fest": [ + "0.20.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__typescript__5.3.3": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "typescript", + "version": "5.3.3", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": { + "": [ + "typescript" + ] + }, + "integrity": "sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==", + "url": "https://registry.npmjs.org/typescript/-/typescript-5.3.3.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } 
+ }, + "npm__typescript__5.3.3__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "typescript", + "version": "5.3.3", + "dev": true, + "root_package": "", + "link_packages": { + "": [ + "typescript" + ] + }, + "deps": {}, + "transitive_closure": { + "typescript": [ + "5.3.3" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__undici-types__5.26.5": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "undici-types", + "version": "5.26.5", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "url": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__undici-types__5.26.5__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "undici-types", + "version": "5.26.5", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "undici-types": [ + "5.26.5" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__uri-js__4.4.1": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "uri-js", + "version": "4.4.1", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", + "url": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__uri-js__4.4.1__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "uri-js", + "version": "4.4.1", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "punycode": "2.3.1" + }, + "transitive_closure": { + "uri-js": [ + "4.4.1" + ], + "punycode": [ + "2.3.1" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__which__2.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "which", + "version": "2.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + 
"integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "url": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__which__2.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "which", + "version": "2.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": { + "isexe": "2.0.0" + }, + "transitive_closure": { + "which": [ + "2.0.2" + ], + "isexe": [ + "2.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__wrappy__1.0.2": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "wrappy", + "version": "1.0.2", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "url": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__wrappy__1.0.2__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "wrappy", + "version": "1.0.2", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "wrappy": [ + "1.0.2" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__yallist__4.0.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "yallist", + "version": "4.0.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "url": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__yallist__4.0.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "yallist", + "version": "4.0.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "yallist": [ + "4.0.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + 
"npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + }, + "npm__yocto-queue__0.1.0": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_rule", + "attributes": { + "package": "yocto-queue", + "version": "0.1.0", + "root_package": "", + "link_workspace": "flatbuffers~", + "link_packages": {}, + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "url": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "commit": "", + "patch_args": [], + "patches": [], + "custom_postinstall": "", + "npm_auth": "", + "npm_auth_basic": "", + "npm_auth_username": "", + "npm_auth_password": "", + "lifecycle_hooks": [], + "extra_build_content": "", + "generate_bzl_library_targets": false + } + }, + "npm__yocto-queue__0.1.0__links": { + "bzlFile": "@@aspect_rules_js~//npm/private:npm_import.bzl", + "ruleClassName": "npm_import_links", + "attributes": { + "package": "yocto-queue", + "version": "0.1.0", + "dev": true, + "root_package": "", + "link_packages": {}, + "deps": {}, + "transitive_closure": { + "yocto-queue": [ + "0.1.0" + ] + }, + "lifecycle_build_target": false, + "lifecycle_hooks_env": [], + "lifecycle_hooks_execution_requirements": [], + "bins": {}, + "npm_translate_lock_repo": "npm", + "package_visibility": [ + "//visibility:public" + ] + } + } + }, + "recordedRepoMappingEntries": [ + [ + "aspect_bazel_lib~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_bazel_lib~", + "bazel_tools", + "bazel_tools" + ], + [ + "aspect_rules_js~", + "aspect_bazel_lib", + "aspect_bazel_lib~" + ], + [ + "aspect_rules_js~", + "bazel_features", + "bazel_features~" + ], + [ + "aspect_rules_js~", + "bazel_skylib", + "bazel_skylib~" + ], + [ + "aspect_rules_js~", + "bazel_tools", + "bazel_tools" + ], + [ + "bazel_features~", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, "@@platforms//host:extension.bzl%host_platform": { "general": { "bzlTransitiveDigest": "xelQcPZH8+tmuOHVjL9vDxMnnQNMlwj0SlvgoqBkm4U=", @@ -151,35 +11686,6 @@ "recordedRepoMappingEntries": [] } }, - "@@protobuf~//:non_module_deps.bzl%non_module_deps": { - "general": { - "bzlTransitiveDigest": "n42CE1R95fa5ddK2PVwgWYAZfG476FzMuRvz0zo5gs8=", - "usagesDigest": "1JwsUDre7ljlZoaD2WfcvUlKnXUonmxIKAVBQ82j6Ig=", - "recordedFileInputs": {}, - "recordedDirentsInputs": {}, - "envVariables": {}, - "generatedRepoSpecs": { - "utf8_range": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": { - "urls": [ - "https://github.com/protocolbuffers/utf8_range/archive/de0b4a8ff9b5d4c98108bdfe723291a33c52c54f.zip" - ], - "strip_prefix": "utf8_range-de0b4a8ff9b5d4c98108bdfe723291a33c52c54f", - "sha256": "5da960e5e5d92394c809629a03af3c7709d2d3d0ca731dacb3a9fb4bf28f7702" - } - } - }, - "recordedRepoMappingEntries": [ - [ - "protobuf~", - "bazel_tools", - "bazel_tools" - ] - ] - } - }, "@@pybind11_bazel~//:internal_configure.bzl%internal_configure_extension": { "general": { "bzlTransitiveDigest": "CyAKLVVonohnkTSqg9II/HA7M49sOlnMkgMHL3CmDuc=", @@ -597,6 +12103,68 @@ ] } }, + "@@rules_go~//go:extensions.bzl%go_sdk": { + "general": { + "bzlTransitiveDigest": "6OpUR/yglzmu6OR0l9BvoXNEmRETCk2i9/mg6yhIbMA=", + "usagesDigest": "d+jWsKUXmjXLstb8Ps8lKcqQSS92aURSbdbgcoFp7Ao=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "go_default_sdk": { + "bzlFile": "@@rules_go~//go/private:sdk.bzl", + 
"ruleClassName": "go_download_sdk_rule", + "attributes": { + "goos": "", + "goarch": "", + "sdks": {}, + "urls": [ + "https://dl.google.com/go/{}" + ], + "version": "1.20.2" + } + }, + "go_host_compatible_sdk_label": { + "bzlFile": "@@rules_go~//go/private:extensions.bzl", + "ruleClassName": "host_compatible_toolchain", + "attributes": { + "toolchain": "@go_default_sdk//:ROOT" + } + }, + "go_toolchains": { + "bzlFile": "@@rules_go~//go/private:sdk.bzl", + "ruleClassName": "go_multiple_toolchains", + "attributes": { + "prefixes": [ + "_0000_go_default_sdk_" + ], + "geese": [ + "" + ], + "goarchs": [ + "" + ], + "sdk_repos": [ + "go_default_sdk" + ], + "sdk_types": [ + "remote" + ], + "sdk_versions": [ + "1.20.2" + ] + } + } + }, + "recordedRepoMappingEntries": [ + [ + "rules_go~", + "bazel_tools", + "bazel_tools" + ] + ] + } + }, "@@rules_jvm_external~//:extensions.bzl%maven": { "general": { "bzlTransitiveDigest": "ZZwUwwzxkACVpF3u5nup1ClQKp1WEF5TLy//fGjPiKU=", @@ -1804,32 +13372,93 @@ ] } }, - "@@rules_jvm_external~//:non-module-deps.bzl%non_module_deps": { + "@@rules_nodejs~//nodejs:extensions.bzl%node": { "general": { - "bzlTransitiveDigest": "ZOivBbbZUakRexeLO/N26oX4Bcph6HHnqNmfxt7yoCc=", - "usagesDigest": "53kHAQcKNmL0k7OtizNBnaTWq84lbKdGYv7383Wp/fc=", + "bzlTransitiveDigest": "KOk+Te5m8n3d0B9F5+lgyrzLbtEzqeqWset0MugBbOY=", + "usagesDigest": "Hpfezedx02zaAfjLSaFZ52QtTY1hd57RMXgRknmjzA0=", "recordedFileInputs": {}, "recordedDirentsInputs": {}, "envVariables": {}, "generatedRepoSpecs": { - "io_bazel_rules_kotlin": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", + "nodejs_linux_amd64": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", "attributes": { - "sha256": "946747acdbeae799b085d12b240ec346f775ac65236dfcf18aa0cd7300f6de78", - "urls": [ - "https://github.com/bazelbuild/rules_kotlin/releases/download/v1.7.0-RC-2/rules_kotlin_release.tgz" - ] + "platform": "linux_amd64", + "node_version": "16.20.0" + } + }, + "nodejs_linux_arm64": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "linux_arm64", + "node_version": "16.20.0" + } + }, + "nodejs_linux_s390x": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "linux_s390x", + "node_version": "16.20.0" + } + }, + "nodejs_linux_ppc64le": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "linux_ppc64le", + "node_version": "16.20.0" + } + }, + "nodejs_darwin_amd64": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "darwin_amd64", + "node_version": "16.20.0" + } + }, + "nodejs_darwin_arm64": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "darwin_arm64", + "node_version": "16.20.0" + } + }, + "nodejs_windows_amd64": { + "bzlFile": "@@rules_nodejs~//nodejs:repositories.bzl", + "ruleClassName": "node_repositories", + "attributes": { + "platform": "windows_amd64", + "node_version": "16.20.0" + } + }, + "nodejs": { + "bzlFile": "@@rules_nodejs~//nodejs/private:nodejs_repo_host_os_alias.bzl", + "ruleClassName": "nodejs_repo_host_os_alias", + "attributes": { + "user_node_repository_name": "nodejs" + } + }, + "nodejs_host": { + "bzlFile": 
"@@rules_nodejs~//nodejs/private:nodejs_repo_host_os_alias.bzl", + "ruleClassName": "nodejs_repo_host_os_alias", + "attributes": { + "user_node_repository_name": "nodejs" + } + }, + "nodejs_toolchains": { + "bzlFile": "@@rules_nodejs~//nodejs/private:toolchains_repo.bzl", + "ruleClassName": "toolchains_repo", + "attributes": { + "user_node_repository_name": "nodejs" } } }, - "recordedRepoMappingEntries": [ - [ - "rules_jvm_external~", - "bazel_tools", - "bazel_tools" - ] - ] + "recordedRepoMappingEntries": [] } }, "@@rules_python~//python/private/pypi:pip.bzl%pip_internal": { @@ -4116,35 +15745,6 @@ ] ] } - }, - "@@upb~//:non_module_deps.bzl%non_module_deps": { - "general": { - "bzlTransitiveDigest": "n42CE1R95fa5ddK2PVwgWYAZfG476FzMuRvz0zo5gs8=", - "usagesDigest": "jUN0s3TyKWQVNLdkIwSzKkk73kEAiVZpjP3qSq+wCWA=", - "recordedFileInputs": {}, - "recordedDirentsInputs": {}, - "envVariables": {}, - "generatedRepoSpecs": { - "utf8_range": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": { - "urls": [ - "https://github.com/protocolbuffers/utf8_range/archive/de0b4a8ff9b5d4c98108bdfe723291a33c52c54f.zip" - ], - "strip_prefix": "utf8_range-de0b4a8ff9b5d4c98108bdfe723291a33c52c54f", - "sha256": "5da960e5e5d92394c809629a03af3c7709d2d3d0ca731dacb3a9fb4bf28f7702" - } - } - }, - "recordedRepoMappingEntries": [ - [ - "upb~", - "bazel_tools", - "bazel_tools" - ] - ] - } } } } diff --git a/framework/src/vx_context.cpp b/framework/src/vx_context.cpp index affb901d..084ad36c 100644 --- a/framework/src/vx_context.cpp +++ b/framework/src/vx_context.cpp @@ -34,6 +34,8 @@ vx_char targetModules[][VX_MAX_TARGET_NAME] = { #endif "openvx-c_model", "openvx-onnxRT", + "openvx-ai-server", + "openvx-liteRT", }; const vx_char extensions[] = diff --git a/include/VX/vx_corevx_ext.h b/include/VX/vx_corevx_ext.h index 2180b30e..7469c5be 100644 --- a/include/VX/vx_corevx_ext.h +++ b/include/VX/vx_corevx_ext.h @@ -1,6 +1,6 @@ /** * @file vx_corevx_ext.h - * @brief Extensions enabled for corevs + * @brief Extensions enabled for corevx * @version 0.1 * @date 2024-12-15 * @@ -13,6 +13,24 @@ #include #include +#ifdef __cplusplus +#include + +/*! \brief A character array (string) type. + * \note This is a C++ string type. It is not a C string. + * \ingroup group_basic_features + */ +using vx_string = std::string; +#endif /* __cplusplus */ + +/*! \brief The type enumeration lists additional types to extend the known types in OpenVX. + * \ingroup group_basic_features + */ +enum vx_type_ext_e +{ + VX_TYPE_STRING = 0x818, /*!< \brief A \ref vx_string. */ +}; + /*! \brief Define Edge AI Vendor ID * \ingroup group_basic_features */ @@ -30,6 +48,14 @@ enum vx_kernel_ext_e * \brief The ONNX Runtime CPU Inference kernel. */ VX_KERNEL_ORT_CPU_INF = VX_KERNEL_BASE(VX_ID_EDGE_AI, VX_LIBRARY_KHR_BASE) + 0x1, + /*! + * \brief The AI Model Server Chatbot kernel. + */ + VX_KERNEL_AIS_CHATBOT = VX_KERNEL_BASE(VX_ID_EDGE_AI, VX_LIBRARY_KHR_BASE) + 0x2, + /*! + * \brief The LiteRT CPU Inference kernel. + */ + VX_KERNEL_LITERT_CPU_INF = VX_KERNEL_BASE(VX_ID_EDGE_AI, VX_LIBRARY_KHR_BASE) + 0x3, }; /*! \brief addtitional tensor attributes. 
diff --git a/kernels/ai_server/BUILD b/kernels/ai_server/BUILD new file mode 100644 index 00000000..274f9687 --- /dev/null +++ b/kernels/ai_server/BUILD @@ -0,0 +1,20 @@ +cc_library( + name = "llm_kernels", + srcs = glob([ + "*.cpp", + ]), + hdrs = glob([ + "*.h", + "*.hpp", + ]), + includes = [ + ".", + "//framework/include" + ], + deps = [ + "//:corevx", + "@curl//:curl", + "@nlohmann_json//:json" + ], + visibility = ["//visibility:public"] +) \ No newline at end of file diff --git a/kernels/ai_server/chatbot.hpp b/kernels/ai_server/chatbot.hpp new file mode 100644 index 00000000..617c7c8b --- /dev/null +++ b/kernels/ai_server/chatbot.hpp @@ -0,0 +1,110 @@ +/** + * @file chatbot.hpp + * @brief Kernel for AI Model Server Chatbot + * @version 0.1 + * @date 2025-04-04 + * + * @copyright Copyright (c) 2025 + * + */ +#include +#include +#include +#include +#include + +#define DEFAULT_MODEL "gpt-4o-mini" +#define SERVER_URL "http://localhost:8000" +#define API_KEY "hardcoded-api-key" + +class RemoteModelClient +{ +private: + // Helper function for non-streaming response + static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) + { + size_t totalSize = size * nmemb; + ((std::string *)userp)->append((char *)contents, totalSize); + return totalSize; + } + +public: + // kernel function (non-streaming) + vx_status AiServerQuery(const std::string &input_text, std::string &output_text, const std::string &api_path) + { + CURL *curl = curl_easy_init(); + if (!curl) + return VX_FAILURE; + + nlohmann::json request_json = { + {"model", DEFAULT_MODEL}, + {"messages", {{{"role", "user"}, {"content", input_text}}}}, + {"max_tokens", 100}, + {"stream", false}}; + + std::string request_payload = request_json.dump(); + std::string response_string; + std::string api_url = std::string(SERVER_URL) + api_path; + + struct curl_slist *headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, ("Authorization: Bearer " + std::string(API_KEY)).c_str()); + + curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, request_payload.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_string); + + CURLcode res = curl_easy_perform(curl); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK) + return VX_FAILURE; + + auto json_response = nlohmann::json::parse(response_string); + output_text = json_response["choices"][0]["message"]["content"]; + + return VX_SUCCESS; + } + + // kernel function (streaming) + vx_status AiServerQueryStream(const std::string &input_text, std::string &output_text, const std::string &api_path) + { + CURL *curl = curl_easy_init(); + if (!curl) + return VX_FAILURE; + + nlohmann::json request_json = { + {"model", DEFAULT_MODEL}, + {"messages", {{{"role", "user"}, {"content", input_text}}}}, + {"max_tokens", 100}, + {"stream", true}}; + + std::string request_payload = request_json.dump(); + std::string response_chunk; + std::string api_url = std::string(SERVER_URL) + api_path; + + struct curl_slist *headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, ("Authorization: Bearer " + std::string(API_KEY)).c_str()); + + curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, 
request_payload.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_chunk); + + CURLcode res = curl_easy_perform(curl); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK) + return VX_FAILURE; + + // Just return raw streamed response (newline-delimited JSON chunks) + output_text = response_chunk; + return VX_SUCCESS; + } +}; diff --git a/kernels/liteRT/BUILD b/kernels/liteRT/BUILD new file mode 100644 index 00000000..e9783d06 --- /dev/null +++ b/kernels/liteRT/BUILD @@ -0,0 +1,21 @@ + +cc_library( + name = "liteRT_kernels", + srcs = glob([ + "*.cpp", + ]), + hdrs = glob([ + "*.h", + "*.hpp", + ]), + includes = [ + ".", + "//framework/include", + ], + deps = [ + "//:corevx", + "//third_party:tflite", + "//third_party:tflite-hdrs", + ], + visibility = ["//visibility:public"] +) \ No newline at end of file diff --git a/kernels/liteRT/tflite.hpp b/kernels/liteRT/tflite.hpp new file mode 100644 index 00000000..c6c24e2b --- /dev/null +++ b/kernels/liteRT/tflite.hpp @@ -0,0 +1,255 @@ +/** + * @file tflite.hpp + * @brief + * @version 0.1 + * @date 2025-04-19 + * + * @copyright Copyright (c) 2025 + * + */ +#include +#include +#include + +#include "tensorflow/lite/core/interpreter_builder.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/optional_debug_tools.h" + +#define TFLITE_MINIMAL_CHECK(x) \ + if (!(x)) \ + { \ + fprintf(stderr, "Error at %s:%d\n", __FILE__, __LINE__); \ + return VX_FAILURE; \ + } + +/** + * @brief Class to run TFLite models + * + */ +class TFLiteRunner +{ +public: + /** + * @brief TFLiteRunner Constructor + */ + TFLiteRunner() : modelLoaded(false) {}; + + /** + * @brief Initialize the TFLite interpreter (load the model) + * @param filename Path to the ONNX model file + * @return VX_SUCCESS on success, VX_FAILURE otherwise + */ + vx_status init(std::string &filename) + { + TFLITE_MINIMAL_CHECK(false == filename.empty()) + + if (!modelLoaded) + { + // Load model + model = tflite::FlatBufferModel::BuildFromFile(filename.c_str()); + TFLITE_MINIMAL_CHECK(model != nullptr); + + // Build the interpreter with the InterpreterBuilder. + // Note: all Interpreters should be built with the InterpreterBuilder, + // which allocates memory for the Interpreter and does various set up + // tasks so that the Interpreter can read the provided model. 
+ tflite::ops::builtin::BuiltinOpResolver resolver; + tflite::InterpreterBuilder builder(*model, resolver); + builder(&interpreter); + TFLITE_MINIMAL_CHECK(interpreter != nullptr); + + printf("=== Pre-invoke Interpreter State ===\n"); + tflite::PrintInterpreterState(interpreter.get()); + } + + return VX_SUCCESS; + } + + /** + * @brief Validate input/output parameters + * @param inputDims Input tensor dimensions + * @param outputDims Output tensor dimensions + * @return VX_SUCCESS on success, VX_FAILURE otherwise + */ + vx_status validate(std::vector> &inputDims, std::vector> &outputDims) + { + vx_status status = VX_SUCCESS; + + // Validate input dimensions + if (inputDims.size() != interpreter->inputs().size()) + { + fprintf(stderr, "Mismatch in number of input tensors: expected %zu, got %zu\n", + inputDims.size(), interpreter->inputs().size()); + return VX_FAILURE; + } + + for (std::size_t i = 0; i < interpreter->inputs().size(); ++i) + { + TfLiteTensor *input_tensor = interpreter->tensor(interpreter->inputs()[i]); + if (input_tensor == nullptr) + { + fprintf(stderr, "Input tensor at index %zu is null.\n", i); + return VX_FAILURE; + } + + // Get the shape of the input tensor + std::vector tensor_shape(input_tensor->dims->size); + for (int j = 0; j < input_tensor->dims->size; ++j) + { + tensor_shape[j] = input_tensor->dims->data[j]; + } + + // Compare with the expected shape + if (tensor_shape != inputDims[i]) + { + fprintf(stderr, "Mismatch in input tensor %zu shape: expected {", i); + for (size_t dim : inputDims[i]) + fprintf(stderr, "%zu,", dim); + fprintf(stderr, "} but got {"); + for (size_t dim : tensor_shape) + fprintf(stderr, "%zu,", dim); + fprintf(stderr, "}\n"); + return VX_FAILURE; + } + } + + // Validate output dimensions + if (outputDims.size() != interpreter->outputs().size()) + { + fprintf(stderr, "Mismatch in number of output tensors: expected %zu, got %zu\n", + outputDims.size(), interpreter->outputs().size()); + return VX_FAILURE; + } + + for (std::size_t i = 0; i < interpreter->outputs().size(); ++i) + { + TfLiteTensor *output_tensor = interpreter->tensor(interpreter->outputs()[i]); + if (output_tensor == nullptr) + { + fprintf(stderr, "Output tensor at index %zu is null.\n", i); + return VX_FAILURE; + } + + // Get the shape of the output tensor + std::vector tensor_shape(output_tensor->dims->size); + for (int j = 0; j < output_tensor->dims->size; ++j) + { + tensor_shape[j] = output_tensor->dims->data[j]; + } + + // Compare with the expected shape + if (tensor_shape != outputDims[i]) + { + fprintf(stderr, "Mismatch in output tensor %zu shape: expected {", i); + for (size_t dim : outputDims[i]) + fprintf(stderr, "%zu,", dim); + fprintf(stderr, "} but got {"); + for (size_t dim : tensor_shape) + fprintf(stderr, "%zu,", dim); + fprintf(stderr, "}\n"); + return VX_FAILURE; + } + } + + return status; + } + + /** + * @brief Allocate memory for input and output tensors + * @param inputTensors Input tensors + * @param outputTensors Output tensors + * @return VX_SUCCESS on success, VX_FAILURE otherwise + */ + vx_status allocate(std::vector> &inputTensors, std::vector> &outputTensors) + { + vx_status status = VX_SUCCESS; + + // Fill input buffers + // TODO(user): Insert code to fill input tensors. 
+ // Note: The buffer of the input tensor with index `i` of type T can + // be accessed with `T* input = interpreter->typed_input_tensor(i);` + for (std::size_t i = 0; i < interpreter->inputs().size(); ++i) + { + status = bindMemory(interpreter->inputs()[i], inputTensors[i].first, inputTensors[i].second); + } + + // Read output buffers + // TODO(user): Insert getting data out code. + // Note: The buffer of the output tensor with index `i` of type T can + // be accessed with `T* output = interpreter->typed_output_tensor(i);` + for (std::size_t i = 0; i < interpreter->outputs().size(); ++i) + { + status |= bindMemory(interpreter->outputs()[i], outputTensors[i].first, outputTensors[i].second); + } + + // Allocate tensor buffers. + TFLITE_MINIMAL_CHECK(interpreter->AllocateTensors() == kTfLiteOk); + + return status; + } + + /** + * @brief Run the kernel (execute the model) + * @param inputTensors Input tensors + * @param outputTensosrs Output tensors + * @return VX_SUCCESS on success, VX_FAILURE otherwise + */ + vx_status run() + { + // Run inference + TFLITE_MINIMAL_CHECK(interpreter->Invoke() == kTfLiteOk); + printf("\n\n=== Post-invoke Interpreter State ===\n"); + tflite::PrintInterpreterState(interpreter.get()); + return VX_SUCCESS; + } + +private: + bool modelLoaded = false; + std::unique_ptr model; + // Pointer to the TFLite interpreter + std::unique_ptr interpreter; + + /** + * @brief Bind pre-allocated memory to a tensor + * @param tensor_index Index of the tensor to bind + * @param pre_allocated_memory Pointer to the pre-allocated memory + * @param size_in_bytes Size of the pre-allocated memory in bytes + * @return VX_SUCCESS on success, VX_FAILURE otherwise + */ + vx_status bindMemory(int tensor_index, void* pre_allocated_memory, size_t size_in_bytes) + { + vx_status status = VX_SUCCESS; + + // Get the tensor + TfLiteTensor* tensor = interpreter->tensor(tensor_index); + + // Check if the tensor exists + if (tensor == nullptr) + { + fprintf(stderr, "Tensor at index %d does not exist.\n", tensor_index); + status = VX_FAILURE; + } + + // Ensure the tensor type and size match your pre-allocated memory + if (VX_SUCCESS == status && + tensor->bytes != size_in_bytes) + { + fprintf(stderr, "Pre-allocated memory size (%ld) does not match tensor size (%ld).\n", + size_in_bytes, tensor->bytes); + status = VX_FAILURE; + } + + if (VX_SUCCESS == status) + { + // Bind the pre-allocated memory to the tensor + TFLITE_MINIMAL_CHECK(kTfLiteOk == interpreter->SetCustomAllocationForTensor( + tensor_index, + {pre_allocated_memory, size_in_bytes}, + kTfLiteCustomAllocationFlagsSkipAlignCheck)); + } + + return status; + } +}; diff --git a/targets/ai_server/BUILD b/targets/ai_server/BUILD new file mode 100644 index 00000000..6ee430a1 --- /dev/null +++ b/targets/ai_server/BUILD @@ -0,0 +1,32 @@ + +cc_library( + name = "ai-server", + srcs = glob([ + "*.cpp", + "*.h", + ]), + includes = [ + ".", + "//framework/include", + "//kernels/ai-server", + ], + deps = [ + "//:corevx", + "//kernels/ai_server:llm_kernels" + ], + visibility = ["//visibility:public"] +) + +cc_shared_library( + name = "openvx-ai-server", + deps = [ + ":ai-server", + ], + visibility = ["//visibility:public"] +) + +cc_import( + name = "imported_openvx_ai_server", + shared_library = ":openvx-ai-server", + visibility = ["//visibility:public"] +) diff --git a/targets/ai_server/vx_chatbot.cpp b/targets/ai_server/vx_chatbot.cpp new file mode 100644 index 00000000..6e3ab048 --- /dev/null +++ b/targets/ai_server/vx_chatbot.cpp @@ -0,0 +1,108 @@ +/** + 
* @file vx_chatbot.cpp + * @brief OpenVX Interface Into AI Model Server + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ +#include +#include +#include + +#include +#include +#include +#include + +#include "chatbot.hpp" +#include "vx_internal.h" + +// Create an instance of ORT runner +static const std::shared_ptr kernel = std::make_shared(); + +static std::unordered_map api_map = { + {"chat", "/v1/chat/completions"}, +}; + +class VxRemoteModelClient +{ +private: + static vx_status store_vx_string_to_array(vx_array arr, const vx_string &in) + { + vx_status status = vxTruncateArray(arr, 0); // clear existing contents + if (status != VX_SUCCESS) + return status; + + return vxAddArrayItems(arr, in.size(), in.data(), sizeof(char)); + } + + static vx_status load_vx_string_from_array(vx_array arr, vx_string &out) + { + vx_size size = 0; + vx_status status = vxQueryArray(arr, VX_ARRAY_ATTRIBUTE_NUMITEMS, &size, sizeof(size)); + if (status != VX_SUCCESS || size == 0) + return VX_FAILURE; + + out.resize(size); // allocate space directly in std::string + status = vxCopyArrayRange(arr, 0, size, sizeof(char), out.data(), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + return status; + } + +public: + static constexpr vx_param_description_t kernelParams[] = { + {VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED}, // Parameter 0: Input text + {VX_OUTPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED}, // Parameter 1: Output text + }; + + static vx_status VX_CALLBACK init(vx_node node, const vx_reference parameters[], vx_uint32 num) + { + (void)node; + (void)parameters; + (void)num; + return VX_SUCCESS; + } + + static vx_status VX_CALLBACK validate(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) + { + (void)node; + (void)parameters; + (void)num; + (void)metas; + return VX_SUCCESS; + } + + static vx_status VX_CALLBACK run(vx_node node, const vx_reference *parameters, vx_uint32 num) + { + (void)node; + (void)parameters; + (void)num; + vx_status status = VX_SUCCESS; + vx_string input_text, output_text; + + status = load_vx_string_from_array((vx_array)parameters[0], input_text); + status |= kernel->AiServerQuery( + input_text, // Input text + output_text, // Output text + api_map["chat"]); // API path + status |= store_vx_string_to_array((vx_array)parameters[1], output_text); + + return status; + } +}; + +/** + * @brief Ai Model Server Chatbot Kernel description structure + */ +vx_kernel_description_t chatbot_kernel = { + VX_KERNEL_AIS_CHATBOT, // Unique kernel ID + "remote.model.chat", // Kernel name + VxRemoteModelClient::run, // Kernel execution function + const_cast(VxRemoteModelClient::kernelParams), + dimof(VxRemoteModelClient::kernelParams), // Number of parameters + VxRemoteModelClient::validate, // Kernel validation function + nullptr, + nullptr, + VxRemoteModelClient::init, // Kernel initialization function + nullptr}; \ No newline at end of file diff --git a/targets/ai_server/vx_interface.cpp b/targets/ai_server/vx_interface.cpp new file mode 100644 index 00000000..c4b3e3a8 --- /dev/null +++ b/targets/ai_server/vx_interface.cpp @@ -0,0 +1,258 @@ +/** + * @file vx_interface.cpp + * @brief AI Model Server Target Interface + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ + +/*! + * \file + * \brief The AI Model Server Target Interface + */ + +#include + +#include "vx_internal.h" +#include "vx_interface.h" + +static const vx_char name[VX_MAX_TARGET_NAME] = "corevx.ai.server"; + +/*! 
\brief Declares the list of all supported base kernels. + * \ingroup group_implementation + * \note This is the list of all supported base kernels! It must at least + * match the OpenVX 1.0 Specification. + */ +static vx_kernel_description_t *target_kernels[] = + { + &chatbot_kernel}; + +/*! \brief Declares the number of base supported kernels. + * \ingroup group_implementation + */ +static vx_uint32 num_target_kernels = dimof(target_kernels); + +/******************************************************************************/ +/* EXPORTED FUNCTIONS */ +/******************************************************************************/ +extern "C" vx_status vxTargetInit(vx_target target) +{ + if (target) + { + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_ORT; + } + return target->initializeTarget(target_kernels, num_target_kernels); +} + +extern "C" vx_status vxTargetDeinit(vx_target target) +{ + return target->deinitializeTarget(); +} + +extern "C" vx_status vxTargetSupports(vx_target target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + vx_char targetKernelName[VX_MAX_KERNEL_NAME]; + vx_char *kernel; + vx_char def[8] = "default"; + + if (target->kernels[k]) + { + strncpy(targetKernelName, target->kernels[k]->name, VX_MAX_KERNEL_NAME); + kernel = strtok(targetKernelName, ":"); + if (kernel == nullptr) + { + kernel = def; + } + + if (strncmp(kernelName, kernel, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) + *pIndex = k; + break; + } + } + } + } + return status; +} + +extern "C" vx_action vxTargetProcess(vx_target target, vx_node nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + (void)target; + + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + vx_context context = vxGetContext((vx_reference)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->context->targets[nodes[n]->affinity]->name); + + if (context->perf_enabled) + Osal::startCapture(&nodes[n]->perf); + + if (nodes[n]->is_replicated == vx_true_e) + { + vx_size num_replicas = 0; + vx_uint32 param; + vx_uint32 num_parameters = nodes[n]->kernel->signature.num_parameters; + vx_reference parameters[VX_INT_MAX_PARAMS] = {nullptr}; + + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + vx_size numItems = 0; + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + numItems = pyr->numLevels; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + numItems = arr->num_items; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + + if (num_replicas == 0) + num_replicas = numItems; + else if (numItems != num_replicas) + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + } + else + { + parameters[param] = nodes[n]->parameters[param]; + } + } + + if (status == VX_SUCCESS) + { + vx_size replica; + for (replica = 0; replica < 
num_replicas; ++replica) + { + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)pyr->levels[replica]; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)arr->items[replica]; + } + } + } + + status = nodes[n]->kernel->function((vx_node)nodes[n], + parameters, + num_parameters); + } + } + } + else + { + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + } + + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + + if (context->perf_enabled) + Osal::stopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH, "kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +extern "C" vx_status vxTargetVerify(vx_target target, vx_node node) +{ + vx_status status = VX_SUCCESS; + (void)target; + (void)node; + + return status; +} + +extern "C" vx_kernel vxTargetAddKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + VX_PRINT(VX_ZONE_INFO, "Entered %s\n", __func__); + vx_uint32 k = 0u; + vx_kernel kernel = nullptr; + Osal::semWait(&target->lock); + + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + if (target->kernels[k] == nullptr || target->kernels[k]->enabled == vx_false_e) + { + target->kernels[k] = reinterpret_cast(Reference::createReference(target->context, VX_TYPE_KERNEL, VX_INTERNAL, target->context)); + target->kernels[k]->initializeKernel(enumeration, func_ptr, name, + nullptr, numParams, + validate, input, output, + initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, target->kernels[k]->name); + target->num_kernels++; + kernel = target->kernels[k]; + break; + } + kernel = nullptr; + } + Osal::semPost(&target->lock); + + return kernel; +} diff --git a/targets/ai_server/vx_interface.h b/targets/ai_server/vx_interface.h new file mode 100644 index 00000000..7d35de14 --- /dev/null +++ b/targets/ai_server/vx_interface.h @@ -0,0 +1,17 @@ +/** + * @file vx_interface.h + * @brief AI Model Server Target Interface + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ +#ifndef OPENVX_INTERFACE_H +#define OPENVX_INTERFACE_H + +#include + +extern vx_kernel_description_t chatbot_kernel; + +#endif /* OPENVX_INTERFACE_H */ diff --git a/targets/liteRT/BUILD b/targets/liteRT/BUILD new file mode 100644 index 00000000..6bf9f710 --- /dev/null +++ b/targets/liteRT/BUILD @@ -0,0 +1,29 @@ + +cc_library( + name = "liteRT", + srcs = glob([ + "*.cpp", + "*.h", + ]), + includes = [ + ".", + "//framework/include", + ], + 
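+    # NOTE: Bazel's `includes` attribute takes package-relative include paths,
+    # not labels, so "//framework/include" above may not resolve as intended;
+    # the header search path may need to come from the `//:corevx` dependency instead.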
deps = [ + "//:corevx", + "//kernels/liteRT:liteRT_kernels", + ], + visibility = ["//visibility:public"] +) + +cc_shared_library( + name = "openvx-liteRT", + deps = [":liteRT"], + visibility = ["//visibility:public"] +) + +cc_import( + name = "imported_openvx_liteRT", + shared_library = ":openvx-liteRT", + visibility = ["//visibility:public"] +) \ No newline at end of file diff --git a/targets/liteRT/vx_interface.cpp b/targets/liteRT/vx_interface.cpp new file mode 100644 index 00000000..52335621 --- /dev/null +++ b/targets/liteRT/vx_interface.cpp @@ -0,0 +1,258 @@ +/** + * @file vx_interface.cpp + * @brief TFLITE Runtime Target Interface + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ + +/*! + * \file + * \brief The TFLITE-RT Target Interface + */ + +#include + +#include "vx_internal.h" +#include "vx_interface.h" + +static const vx_char name[VX_MAX_TARGET_NAME] = "corevx.tflite.rt"; + +/*! \brief Declares the list of all supported base kernels. + * \ingroup group_implementation + * \note This is the list of all supported base kernels! It must at least + * match the OpenVX 1.0 Specification. + */ +static vx_kernel_description_t *target_kernels[] = + { + &tflite_cpu_inf_kernel}; + +/*! \brief Declares the number of base supported kernels. + * \ingroup group_implementation + */ +static vx_uint32 num_target_kernels = dimof(target_kernels); + +/******************************************************************************/ +/* EXPORTED FUNCTIONS */ +/******************************************************************************/ +extern "C" vx_status vxTargetInit(vx_target target) +{ + if (target) + { + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_ORT; + } + return target->initializeTarget(target_kernels, num_target_kernels); +} + +extern "C" vx_status vxTargetDeinit(vx_target target) +{ + return target->deinitializeTarget(); +} + +extern "C" vx_status vxTargetSupports(vx_target target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + vx_char targetKernelName[VX_MAX_KERNEL_NAME]; + vx_char *kernel; + vx_char def[8] = "default"; + + if (target->kernels[k]) + { + strncpy(targetKernelName, target->kernels[k]->name, VX_MAX_KERNEL_NAME); + kernel = strtok(targetKernelName, ":"); + if (kernel == nullptr) + { + kernel = def; + } + + if (strncmp(kernelName, kernel, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) + *pIndex = k; + break; + } + } + } + } + return status; +} + +extern "C" vx_action vxTargetProcess(vx_target target, vx_node nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + (void)target; + + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + vx_context context = vxGetContext((vx_reference)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->context->targets[nodes[n]->affinity]->name); + + if (context->perf_enabled) + Osal::startCapture(&nodes[n]->perf); + + if (nodes[n]->is_replicated == vx_true_e) + { + vx_size num_replicas = 0; + vx_uint32 param; + vx_uint32 num_parameters = 
nodes[n]->kernel->signature.num_parameters; + vx_reference parameters[VX_INT_MAX_PARAMS] = {nullptr}; + + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + vx_size numItems = 0; + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + numItems = pyr->numLevels; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + numItems = arr->num_items; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + + if (num_replicas == 0) + num_replicas = numItems; + else if (numItems != num_replicas) + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + } + else + { + parameters[param] = nodes[n]->parameters[param]; + } + } + + if (status == VX_SUCCESS) + { + vx_size replica; + for (replica = 0; replica < num_replicas; ++replica) + { + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)pyr->levels[replica]; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)arr->items[replica]; + } + } + } + + status = nodes[n]->kernel->function((vx_node)nodes[n], + parameters, + num_parameters); + } + } + } + else + { + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + } + + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + + if (context->perf_enabled) + Osal::stopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH, "kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +extern "C" vx_status vxTargetVerify(vx_target target, vx_node node) +{ + vx_status status = VX_SUCCESS; + (void)target; + (void)node; + + return status; +} + +extern "C" vx_kernel vxTargetAddKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + VX_PRINT(VX_ZONE_INFO, "Entered %s\n", __func__); + vx_uint32 k = 0u; + vx_kernel kernel = nullptr; + Osal::semWait(&target->lock); + + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + if (target->kernels[k] == nullptr || target->kernels[k]->enabled == vx_false_e) + { + target->kernels[k] = reinterpret_cast(Reference::createReference(target->context, VX_TYPE_KERNEL, VX_INTERNAL, target->context)); + target->kernels[k]->initializeKernel(enumeration, func_ptr, name, + nullptr, numParams, + validate, input, output, + initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", 
target->name, k, target->kernels[k]->name); + target->num_kernels++; + kernel = target->kernels[k]; + break; + } + kernel = nullptr; + } + Osal::semPost(&target->lock); + + return kernel; +} diff --git a/targets/liteRT/vx_interface.h b/targets/liteRT/vx_interface.h new file mode 100644 index 00000000..ec0e7d4b --- /dev/null +++ b/targets/liteRT/vx_interface.h @@ -0,0 +1,17 @@ +/** + * @file vx_interface.h + * @brief TFLITE Runtime Target Interface + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ +#ifndef OPENVX_INTERFACE_H +#define OPENVX_INTERFACE_H + +#include + +extern vx_kernel_description_t tflite_cpu_inf_kernel; + +#endif /* OPENVX_INTERFACE_H */ diff --git a/targets/liteRT/vx_litert_inf.cpp b/targets/liteRT/vx_litert_inf.cpp new file mode 100644 index 00000000..43168350 --- /dev/null +++ b/targets/liteRT/vx_litert_inf.cpp @@ -0,0 +1,280 @@ +/** + * @file vx_ort_inf.cpp + * @brief OpenVX Interface Into LiteRT + * @version 0.1 + * @date 2025-01-20 + * + * @copyright Copyright (c) 2025 + * + */ +#include + +#include +#include +#include +#include + +#include "tflite.hpp" +#include "vx_internal.h" + +// Create an instance of ORT runner +static const std::shared_ptr kernel = std::make_shared(); + +class VxLiteRTRunner +{ +public: + static constexpr vx_param_description_t kernelParams[] = { + {VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED}, // Parameter 0: Model path + {VX_INPUT, VX_TYPE_OBJECT_ARRAY, VX_PARAMETER_STATE_REQUIRED}, // Parameter 1: Input tensors + {VX_OUTPUT, VX_TYPE_OBJECT_ARRAY, VX_PARAMETER_STATE_REQUIRED} // Parameter 2: Output tensors + }; + + // Initialization function + static vx_status VX_CALLBACK litertInitWrapper(vx_node node, const vx_reference parameters[], vx_uint32 num) + { + vx_status status = VX_SUCCESS; + std::string modelPath; + // Get the tensor pointers, total size of each, and cache them in a vector of pairs + std::vector> inputTensors; + std::vector> outputTensors; + // Get the tensor dimensions + std::vector> inputDims; + std::vector> outputDims; + + if (nullptr == node || + nullptr == parameters || + num != dimof(kernelParams)) + { + status = VX_FAILURE; + } + + if (VX_SUCCESS == status) + { + // Get the model path from the first parameter + vx_array array = (vx_array)parameters[0]; + status = readStringFromVxArray(array, modelPath); + + if (VX_SUCCESS == status) + { + VX_PRINT(VX_ZONE_INFO, "Reading from model path: %s\n", modelPath.c_str()); + // Initialize the kernel with the model path + status |= kernel->init(modelPath); + } + } + + if (VX_SUCCESS == status) + { + // Process input tensors + status = processTensors((vx_object_array)parameters[1], inputTensors); + // Process output tensors if input processing was successful + status |= processTensors((vx_object_array)parameters[2], outputTensors); + } + + if (VX_SUCCESS == status) + { + // Bind the input and output tensors + status = kernel->allocate(inputTensors, outputTensors); + } + + if (VX_SUCCESS == status) + { + // Get the input tensor dimensions from the tensors + status = processTensorDims(reinterpret_cast(parameters[1]), inputDims); + // Get the output tensor dimensions from the tensors + status = processTensorDims(reinterpret_cast(parameters[2]), outputDims); + } + + if (VX_SUCCESS == status) + { + // Call the validate member function + status = kernel->validate(inputDims, outputDims); + } + + return status; + } + + // Validation function + static vx_status VX_CALLBACK litertValidateWrapper(vx_node node, const vx_reference parameters[], 
vx_uint32 num, vx_meta_format metas[]) + { + vx_status status = VX_SUCCESS; + + if (nullptr == node || + nullptr == parameters || + num != dimof(kernelParams) || + nullptr == metas) + { + std::cerr << "Error: Invalid parameters during validation!" << std::endl; + status = VX_FAILURE; + } + + if (VX_SUCCESS == status) + { + // Retrieve the kernel instance from the node's local data + if (!kernel) + { + std::cerr << "Error: Kernel instance is null during validation!" << std::endl; + status = VX_FAILURE; + } + } + + if (VX_SUCCESS == status) + { + vx_object_array outputObjArr = reinterpret_cast(parameters[2]); + vx_size numItems = 0; + vx_enum itemType = VX_TYPE_TENSOR; + + status = vxQueryObjectArray(outputObjArr, VX_OBJECT_ARRAY_NUMITEMS, &numItems, sizeof(numItems)); + status |= vxSetMetaFormatAttribute(metas[2], VX_OBJECT_ARRAY_NUMITEMS, &numItems, sizeof(numItems)); + status |= vxSetMetaFormatAttribute(metas[2], VX_OBJECT_ARRAY_ITEMTYPE, &itemType, sizeof(vx_enum)); + } + + return status; + } + + // Execution function + static vx_status VX_CALLBACK litertRunWrapper(vx_node node, const vx_reference *parameters, vx_uint32 num) + { + vx_status status = VX_SUCCESS; + + if (nullptr == node || + nullptr == parameters || + num != dimof(kernelParams)) + { + status = VX_FAILURE; + } + + if (VX_SUCCESS == status) + { + // Retrieve the kernel instance from the node's local data + if (!kernel) + { + std::cerr << "Error: Kernel instance is null during execution!" << std::endl; + status = VX_FAILURE; + } + } + + if (VX_SUCCESS == status) + { + // Call the run member function + status = kernel->run(); + } + + return status; + } + +private: + /** + * @brief Helper function to read a string from a VX char array + * + * @param[in] array openvx char array to read from + * @param[out] str Output string containing the read data + * @return vx_status VX_SUCCESS on success, otherwise an error code + */ + static vx_status readStringFromVxArray(vx_array array, std::string &str) + { + vx_status status = VX_SUCCESS; + vx_size num_items = 0u, stride = 0u; + vx_map_id map_id = 0; + void *ptr = nullptr; + + status = vxQueryArray(array, VX_ARRAY_ATTRIBUTE_NUMITEMS, &num_items, sizeof(num_items)); + status |= vxMapArrayRange(array, 0, num_items, &map_id, &stride, &ptr, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, VX_NOGAP_X); + + if (VX_SUCCESS == status) + { + str = std::string(static_cast(ptr)); + status |= vxUnmapArrayRange(array, map_id); + } + + return status; + } + + /** + * @brief Helper function to process tensor dimensions from an object array + * + * @param[in] objArr Object array containing tensors + * @param[out] dims Vector of vectors containing tensor dimensions + * @return vx_status VX_SUCCESS on success, otherwise an error code + */ + static vx_status processTensorDims(vx_object_array objArr, std::vector> &dims) + { + vx_status status = VX_SUCCESS; + vx_size numItems = 0, numDims = 0; + std::vector tensorDims; + + status = vxQueryObjectArray(objArr, VX_OBJECT_ARRAY_NUMITEMS, &numItems, sizeof(numItems)); + + for (vx_uint32 i = 0; i < numItems && status == VX_SUCCESS; ++i) + { + vx_tensor tensor = reinterpret_cast(vxGetObjectArrayItem(objArr, i)); + status |= vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &numDims, sizeof(numDims)); + tensorDims.resize(numDims); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, tensorDims.data(), sizeof(vx_size) * tensorDims.size()); + + if (VX_SUCCESS != status) + { + std::cerr << "Error: Unable to query tensor in " << __func__ << " " << status << std::endl; + break; + } + 
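+            // Shape query succeeded: record this tensor's dimensions so they can
+            // be checked against the model's expected shapes in TFLiteRunner::validate().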
dims.push_back(tensorDims); + } + return status; + } + + /** + * @brief Helper function to process tensors from an object array + * + * @param[in] objArr Object array containing tensors + * @param[out] tensors Vector of pairs containing tensor data and size + * @return vx_status VX_SUCCESS on success, otherwise an error code + */ + static vx_status processTensors(vx_object_array objArr, std::vector> &tensors) + { + vx_status status = VX_SUCCESS; + vx_size numItems = 0; + vxQueryObjectArray(objArr, VX_OBJECT_ARRAY_NUMITEMS, &numItems, sizeof(numItems)); + + for (vx_uint32 i = 0; i < numItems && status == VX_SUCCESS; ++i) + { + vx_tensor tensor = (vx_tensor)vxGetObjectArrayItem(objArr, i); + vx_size dims[VX_MAX_TENSOR_DIMENSIONS]; + vx_size stride[VX_MAX_TENSOR_DIMENSIONS]; + vx_size viewStart[VX_MAX_TENSOR_DIMENSIONS] = {0}; + void *ptr = nullptr; + vx_size numDims = 0, size = 0; + vx_map_id map_id = 0; + + status |= vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &numDims, sizeof(numDims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, dims, sizeof(dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_STRIDE, stride, sizeof(stride)); + status |= vxQueryTensor(tensor, VX_TENSOR_TOTAL_SIZE, &size, sizeof(size)); + status |= vxMapTensorPatch(tensor, numDims, viewStart, dims, &map_id, stride, &ptr, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + if (VX_SUCCESS != status) + { + std::cerr << "Error: Unable to prep tensor in " << __func__ << ", status: " << status << std::endl; + break; + } + + tensors.emplace_back((float *)ptr, size); + status |= vxUnmapTensorPatch(tensor, map_id); + } + return status; + } +}; + +/** + * @brief LiteRT CPU Inference Kernel description structure + */ +vx_kernel_description_t tflite_cpu_inf_kernel = + { + VX_KERNEL_LITERT_CPU_INF, // Unique kernel ID + "tflite.cpu.runner", // Kernel name + VxLiteRTRunner::litertRunWrapper, // Kernel execution function + const_cast(VxLiteRTRunner::kernelParams), + dimof(VxLiteRTRunner::kernelParams), // Number of parameters + VxLiteRTRunner::litertValidateWrapper, // Kernel validation function + nullptr, + nullptr, + VxLiteRTRunner::litertInitWrapper, // Kernel initialization function + nullptr}; \ No newline at end of file diff --git a/tests/integration_test/BUILD b/tests/integration_test/BUILD index 539aec0d..a1ab40d6 100644 --- a/tests/integration_test/BUILD +++ b/tests/integration_test/BUILD @@ -25,4 +25,33 @@ # "//tests/raw:models", # ], # size = "small" -# ) \ No newline at end of file +# ) + +cc_test( + name = "test_tflite", + srcs = [ + "test_tflite.cpp" + ], + includes = [ + "include", + "framework/include" + ], + deps = [ + "//:corevx", + "@googletest//:gtest_main", + "//targets/c_model:imported_openvx_c_model", + "//targets/debug:imported_openvx_debug", + "//targets/extras:imported_openvx_extras", + "//targets/opencl:imported_openvx_opencl", + "//targets/liteRT:imported_openvx_liteRT", + ], + linkopts = select({ + "@platforms//os:linux": ["-Wl,-rpath,$ORIGIN"], + "@platforms//os:macos": ["-Wl,-rpath,@executable_path"], + "//conditions:default": [], + }), + data = [ + "//tests/raw:models", + ], + size = "small" +) \ No newline at end of file diff --git a/tests/integration_test/test_tflite.cpp b/tests/integration_test/test_tflite.cpp new file mode 100644 index 00000000..cfe70b9e --- /dev/null +++ b/tests/integration_test/test_tflite.cpp @@ -0,0 +1,133 @@ +/** + * @file test_tflite.cpp + * @brief Test TFLite Target + * @version 0.1 + * @date 2025-04-26 + * + * @copyright Copyright (c) 2025 + * + */ +#include +#include +#include 
+#include + +#include + +#include "vx_internal.h" + +class TFLiteIntegrationTest : public ::testing::Test +{ +protected: + vx_context context; + vx_graph graph; + std::string model_path = "./tests/raw/matmul_model.tflite"; + + void SetUp() override + { + // Initialize OpenVX context + context = vxCreateContext(); + ASSERT_EQ(vxGetStatus(context), VX_SUCCESS); + } + + void TearDown() override + { + vxReleaseGraph(&graph); + vxReleaseContext(&context); + } +}; + +TEST_F(TFLiteIntegrationTest, TfliteMatMul) +{ + const vx_size numDims = 2u; + vx_size inputADims[] = {3, 4}; + vx_size inputBDims[] = {4, 3}; + vx_size outputDims[] = {3, 3}; + + // Create input tensors + vx_tensor input_a = vxCreateTensor(context, numDims, inputADims, VX_TYPE_FLOAT32, 0); + vx_tensor input_b = vxCreateTensor(context, numDims, inputBDims, VX_TYPE_FLOAT32, 0); + vx_tensor output_c = vxCreateTensor(context, numDims, outputDims, VX_TYPE_FLOAT32, 0); + ASSERT_EQ(vxGetStatus(input_a), VX_SUCCESS); + ASSERT_EQ(vxGetStatus(input_b), VX_SUCCESS); + ASSERT_EQ(vxGetStatus(output_c), VX_SUCCESS); + + // Query tensor strides + vx_size inputAStride[numDims]; + vx_size inputBStride[numDims]; + vx_size outputStride[numDims]; + ASSERT_EQ(VX_SUCCESS, vxQueryTensor(input_a, VX_TENSOR_STRIDE, inputAStride, sizeof(inputAStride))); + ASSERT_EQ(VX_SUCCESS, vxQueryTensor(input_b, VX_TENSOR_STRIDE, inputBStride, sizeof(inputBStride))); + ASSERT_EQ(VX_SUCCESS, vxQueryTensor(output_c, VX_TENSOR_STRIDE, outputStride, sizeof(outputStride))); + + // Create object arrays for inputs and outputs + vx_object_array input_tensors = vxCreateObjectArrayWithType(context, VX_TYPE_TENSOR); + vx_object_array output_tensors = vxCreateObjectArrayWithType(context, VX_TYPE_TENSOR); + ASSERT_EQ(vxGetStatus(input_tensors), VX_SUCCESS); + ASSERT_EQ(vxGetStatus(output_tensors), VX_SUCCESS); + + // Set object array with items + ASSERT_EQ(VX_SUCCESS, vxSetObjectArrayItem(input_tensors, 0, (vx_reference)input_a)); + ASSERT_EQ(VX_SUCCESS, vxSetObjectArrayItem(input_tensors, 1, (vx_reference)input_b)); + ASSERT_EQ(VX_SUCCESS, vxSetObjectArrayItem(output_tensors, 0, (vx_reference)output_c)); + ASSERT_EQ(input_a, (vx_tensor)vxGetObjectArrayItem(input_tensors, 0)); + ASSERT_EQ(input_b, (vx_tensor)vxGetObjectArrayItem(input_tensors, 1)); + ASSERT_EQ(output_c, (vx_tensor)vxGetObjectArrayItem(output_tensors, 0)); + + // Create model path array + vx_array model_path_array = vxCreateArray(context, VX_TYPE_CHAR, model_path.length() + 1); + ASSERT_EQ(vxGetStatus(model_path_array), VX_SUCCESS); + ASSERT_EQ(VX_SUCCESS, vxAddArrayItems(model_path_array, model_path.length() + 1, model_path.c_str(), sizeof(char))); + + // Create graph + graph = vxCreateGraph(context); + ASSERT_EQ(vxGetStatus(graph), VX_SUCCESS); + + // Get tflite kernel + vx_kernel kernel = vxGetKernelByEnum(context, VX_KERNEL_LITERT_CPU_INF); + ASSERT_EQ(vxGetStatus(kernel), VX_SUCCESS); + + // Create node + vx_node node = vxCreateGenericNode(graph, kernel); + ASSERT_EQ(vxGetStatus(node), VX_SUCCESS); + + // Set node parameters + ASSERT_EQ(VX_SUCCESS, vxSetParameterByIndex(node, 0, (vx_reference)model_path_array)); + ASSERT_EQ(VX_SUCCESS, vxSetParameterByIndex(node, 1, (vx_reference)input_tensors)); + ASSERT_EQ(VX_SUCCESS, vxSetParameterByIndex(node, 2, (vx_reference)output_tensors)); + + // Verify graph + ASSERT_EQ(vxVerifyGraph(graph), VX_SUCCESS); + + // Fill input data + vx_float32 input_data_a[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + vx_float32 input_data_b[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12}; + vx_size viewStart[VX_MAX_TENSOR_DIMENSIONS] = {0}; + + ASSERT_EQ(VX_SUCCESS, vxCopyTensorPatch((vx_tensor)vxGetObjectArrayItem(input_tensors, 0), numDims, viewStart, inputADims, inputAStride, input_data_a, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST)); + ASSERT_EQ(VX_SUCCESS, vxCopyTensorPatch((vx_tensor)vxGetObjectArrayItem(input_tensors, 1), numDims, viewStart, inputBDims, inputBStride, input_data_b, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST)); + + // Process graph + ASSERT_EQ(vxProcessGraph(graph), VX_SUCCESS); + + // Read output + float output_data[9]; + ASSERT_EQ(VX_SUCCESS, vxCopyTensorPatch((vx_tensor)vxGetObjectArrayItem(output_tensors, 0), numDims, viewStart, outputDims, outputStride, output_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST)); + + // Validate results + float expected[9] = {70, 80, 90, 158, 184, 210, 246, 288, 330}; + for (vx_uint8 i = 0; i < 9; i++) + { + EXPECT_NEAR(output_data[i], expected[i], 1e-5); + } + + // Cleanup + vxReleaseTensor(&input_a); + vxReleaseTensor(&input_b); + vxReleaseTensor(&output_c); + vxReleaseArray(&model_path_array); + vxReleaseObjectArray(&input_tensors); + vxReleaseObjectArray(&output_tensors); + vxReleaseKernel(&kernel); + vxReleaseNode(&node); +} \ No newline at end of file diff --git a/tests/raw/BUILD b/tests/raw/BUILD index 462eb091..07741d0b 100644 --- a/tests/raw/BUILD +++ b/tests/raw/BUILD @@ -15,6 +15,7 @@ filegroup( name = "models", srcs = glob([ "*.onnx", + "*.tflite", ]), visibility = ["//visibility:public"], ) \ No newline at end of file diff --git a/tests/raw/matmul_model.tflite b/tests/raw/matmul_model.tflite new file mode 100644 index 00000000..d920cbc9 Binary files /dev/null and b/tests/raw/matmul_model.tflite differ diff --git a/tests/raw/tf.py b/tests/raw/tf.py new file mode 100644 index 00000000..5220446a --- /dev/null +++ b/tests/raw/tf.py @@ -0,0 +1,26 @@ +""" +TensorFlow Lite Model Conversion Example +This script demonstrates how to convert a TensorFlow model to TensorFlow Lite format. +It includes a simple matrix multiplication model and shows how to save the converted model. 
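+
+Usage (assuming TensorFlow is installed): running `python tf.py` writes
+matmul_model.tflite to the current working directory; run it from tests/raw/
+so the generated model is picked up by the `models` filegroup used by the
+TFLite integration test.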
+""" +import tensorflow as tf + +# Create a simple MatMul model +class MatMulModel(tf.Module): + @tf.function(input_signature=[ + tf.TensorSpec(shape=[3, 4], dtype=tf.float32), + tf.TensorSpec(shape=[4, 3], dtype=tf.float32) + ]) + def matmul(self, a, b): + return tf.matmul(a, b) + +# Instantiate the model +model = MatMulModel() + +# Convert to TFLite +converter = tf.lite.TFLiteConverter.from_concrete_functions([model.matmul.get_concrete_function()]) +tflite_model = converter.convert() + +# Save the TFLite model +with open("matmul_model.tflite", "wb") as f: + f.write(tflite_model) \ No newline at end of file diff --git a/third_party/BUILD b/third_party/BUILD index fe8628c3..5d45505f 100644 --- a/third_party/BUILD +++ b/third_party/BUILD @@ -1,3 +1,20 @@ """ BUILD file for build defs for third party deps -""" \ No newline at end of file +""" + +cc_import( + name = "tflite", + shared_library = select({ + "@platforms//os:linux": "tflite-hdrs/libtensorflowlite.so", + "@platforms//os:macos": "tflite-hdrs/libtensorflowlite.dylib", + }), + visibility = ["//visibility:public"], +) + +cc_library( + name = "tflite-hdrs", + hdrs = glob(["tflite-hdrs/**/*.h"]), + includes = ["tflite-hdrs"], + deps = ["@flatbuffers"], + visibility = ["//visibility:public"], +) \ No newline at end of file diff --git a/third_party/patch/onnx.patch b/third_party/patch/onnx.patch index ac25ff81..8a7deae3 100644 --- a/third_party/patch/onnx.patch +++ b/third_party/patch/onnx.patch @@ -22,4 +22,24 @@ index 9219f16be0..e1559bd3da 100644 +google_nsync;https://github.com/amikhail48/nsync/archive/refs/tags/1.29.3.zip;1cdfb3b740dadf9a6cc6d6b65976d31f9d9c2900 googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349 #xnnpack 2024.09.04 - googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A \ No newline at end of file + googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A +diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h +index af6f52090a..37ae94f1ae 100644 +--- onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h ++++ onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h +@@ -6,10 +6,11 @@ + #include "sqnbitgemm.h" + #include "sqnbitgemm_kernel_avx_common.h" + ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Warray-bounds" + + MLAS_FORCEINLINE void +@@ -1044,6 +1050,7 @@ MlasQ4Int8TileGemmKernelBlkLen32Avx2( + BiasPtr += BiasPtr != nullptr ? 1 : 0; + SumPtr += 1; + } + } // m + return CountM; + } ++#pragma GCC diagnostic pop diff --git a/third_party/tflite-hdrs/libtensorflowlite.dylib b/third_party/tflite-hdrs/libtensorflowlite.dylib new file mode 100755 index 00000000..60a3827e Binary files /dev/null and b/third_party/tflite-hdrs/libtensorflowlite.dylib differ diff --git a/third_party/tflite-hdrs/libtensorflowlite.so b/third_party/tflite-hdrs/libtensorflowlite.so new file mode 100755 index 00000000..03fb0b88 Binary files /dev/null and b/third_party/tflite-hdrs/libtensorflowlite.so differ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_api.h b/third_party/tflite-hdrs/tensorflow/c/c_api.h new file mode 100644 index 00000000..9812b0a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_api.h @@ -0,0 +1,1667 @@ +/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_H_ +#define TENSORFLOW_C_C_API_H_ + +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_attrtype.h" +#include "tensorflow/c/tf_buffer.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/c/tf_tstring.h" + +// -------------------------------------------------------------------------- +// C API for TensorFlow. +// +// The API leans towards simplicity and uniformity instead of convenience +// since most usage will be by language specific wrappers. +// +// Conventions: +// * We use the prefix TF_ for everything in the API. +// * Objects are always passed around as pointers to opaque structs +// and these structs are allocated/deallocated via the API. +// * TF_Status holds error information. It is an object type +// and therefore is passed around as a pointer to an opaque +// struct as mentioned above. +// * Every call that has a TF_Status* argument clears it on success +// and fills it with error info on failure. +// * unsigned char is used for booleans (instead of the 'bool' type). +// In C++ bool is a keyword while in C99 bool is a macro defined +// in stdbool.h. It is possible for the two to be inconsistent. +// For example, neither the C99 nor the C++11 standard force a byte +// size on the bool type, so the macro defined in stdbool.h could +// be inconsistent with the bool keyword in C++. Thus, the use +// of stdbool.h is avoided and unsigned char is used instead. +// * size_t is used to represent byte sizes of objects that are +// materialized in the address space of the calling process. +// * int is used as an index into arrays. +// * Deletion functions are safe to call on nullptr. +// +// Questions left to address: +// * Might at some point need a way for callers to provide their own Env. +// * Maybe add TF_TensorShape that encapsulates dimension info. +// +// Design decisions made: +// * Backing store for tensor memory has an associated deallocation +// function. This deallocation function will point to client code +// for tensors populated by the client. So the client can do things +// like shadowing a numpy array. +// * We do not provide TF_OK since it is not strictly necessary and we +// are not optimizing for convenience. +// * We make assumption that one session has one graph. This should be +// fine since we have the ability to run sub-graphs. +// * We could allow NULL for some arguments (e.g., NULL options arg). +// However since convenience is not a primary goal, we don't do this. +// * Devices are not in this API. Instead, they are created/used internally +// and the API just provides high level controls over the number of +// devices of each type. 
+ +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// TF_Version returns a string describing version information of the +// TensorFlow library. TensorFlow uses semantic versioning. +TF_CAPI_EXPORT extern const char* TF_Version(void); + +// Parsing a serialized TensorProto into a TF_Tensor. +TF_CAPI_EXPORT extern void TF_TensorFromProto(const TF_Buffer* from, + TF_Tensor* to, TF_Status* status); + +// -------------------------------------------------------------------------- +// Used to return strings across the C API. The caller does not take ownership +// of the underlying data pointer and is not responsible for freeing it. +typedef struct TF_StringView { + const char* data; + size_t len; +} TF_StringView; + +// -------------------------------------------------------------------------- +// TF_SessionOptions holds options that can be passed during session creation. +typedef struct TF_SessionOptions TF_SessionOptions; + +// Return a new options object. +TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions(void); + +// Set the target in TF_SessionOptions.options. +// target can be empty, a single entry, or a comma separated list of entries. +// Each entry is in one of the following formats : +// "local" +// ip:port +// host:port +TF_CAPI_EXPORT extern void TF_SetTarget(TF_SessionOptions* options, + const char* target); + +// Set the config in TF_SessionOptions.options. +// config should be a serialized tensorflow.ConfigProto proto. +// If config was not parsed successfully as a ConfigProto, record the +// error information in *status. +TF_CAPI_EXPORT extern void TF_SetConfig(TF_SessionOptions* options, + const void* proto, size_t proto_len, + TF_Status* status); + +// Destroy an options object. +TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*); + +// TODO(jeff,sanjay): +// - export functions to set Config fields + +// -------------------------------------------------------------------------- +// The new graph construction API, still under development. + +// Represents a computation graph. Graphs may be shared between sessions. +// Graphs are thread-safe when used as directed below. +typedef struct TF_Graph TF_Graph; + +// Return a new graph object. +TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph(void); + +// Destroy an options object. Graph will be deleted once no more +// TFSession's are referencing it. +TF_CAPI_EXPORT extern void TF_DeleteGraph(TF_Graph*); + +// Operation being built. The underlying graph must outlive this. +typedef struct TF_OperationDescription TF_OperationDescription; + +// Operation that has been added to the graph. Valid until the graph is +// deleted -- in particular adding a new operation to the graph does not +// invalidate old TF_Operation* pointers. +typedef struct TF_Operation TF_Operation; + +// Represents a specific input of an operation. +typedef struct TF_Input { + TF_Operation* oper; + int index; // The index of the input within oper. +} TF_Input; + +// Represents a specific output of an operation. +typedef struct TF_Output { + TF_Operation* oper; + int index; // The index of the output within oper. +} TF_Output; + +// TF_Function is a grouping of operations with defined inputs and outputs. +// Once created and added to graphs, functions can be invoked by creating an +// operation whose operation type matches the function name. +typedef struct TF_Function TF_Function; + +// Function definition options. 
TODO(iga): Define and implement +typedef struct TF_FunctionOptions TF_FunctionOptions; + +// Sets the shape of the Tensor referenced by `output` in `graph` to +// the shape described by `dims` and `num_dims`. +// +// If the number of dimensions is unknown, `num_dims` must be set to +// -1 and `dims` can be null. If a dimension is unknown, the +// corresponding entry in the `dims` array must be -1. +// +// This does not overwrite the existing shape associated with `output`, +// but merges the input shape with the existing shape. For example, +// setting a shape of [-1, 2] with an existing shape [2, -1] would set +// a final shape of [2, 2] based on shape merging semantics. +// +// Returns an error into `status` if: +// * `output` is not in `graph`. +// * An invalid shape is being set (e.g., the shape being set +// is incompatible with the existing shape). +TF_CAPI_EXPORT extern void TF_GraphSetTensorShape(TF_Graph* graph, + TF_Output output, + const int64_t* dims, + const int num_dims, + TF_Status* status); + +// Returns the number of dimensions of the Tensor referenced by `output` +// in `graph`. +// +// If the number of dimensions in the shape is unknown, returns -1. +// +// Returns an error into `status` if: +// * `output` is not in `graph`. +TF_CAPI_EXPORT extern int TF_GraphGetTensorNumDims(TF_Graph* graph, + TF_Output output, + TF_Status* status); + +// Returns the shape of the Tensor referenced by `output` in `graph` +// into `dims`. `dims` must be an array large enough to hold `num_dims` +// entries (e.g., the return value of TF_GraphGetTensorNumDims). +// +// If the number of dimensions in the shape is unknown or the shape is +// a scalar, `dims` will remain untouched. Otherwise, each element of +// `dims` will be set corresponding to the size of the dimension. An +// unknown dimension is represented by `-1`. +// +// Returns an error into `status` if: +// * `output` is not in `graph`. +// * `num_dims` does not match the actual number of dimensions. +TF_CAPI_EXPORT extern void TF_GraphGetTensorShape(TF_Graph* graph, + TF_Output output, + int64_t* dims, int num_dims, + TF_Status* status); + +// Creates a new operation - see `TF_NewOperation` for more details. +// +// The lock for `graph` must be held when calling this function. +// +// Unless implementing advanced behavior, like custom gradient functions, you +// most likely need to call `TF_NewOperation` instead. +TF_CAPI_EXPORT extern TF_OperationDescription* TF_NewOperationLocked( + TF_Graph* graph, const char* op_type, const char* oper_name); + +// Operation will only be added to *graph when TF_FinishOperation() is +// called (assuming TF_FinishOperation() does not return an error). +// *graph must not be deleted until after TF_FinishOperation() is +// called. +TF_CAPI_EXPORT extern TF_OperationDescription* TF_NewOperation( + TF_Graph* graph, const char* op_type, const char* oper_name); + +// Specify the device for `desc`. Defaults to empty, meaning unconstrained. +TF_CAPI_EXPORT extern void TF_SetDevice(TF_OperationDescription* desc, + const char* device); + +// The calls to TF_AddInput and TF_AddInputList must match (in number, +// order, and type) the op declaration. For example, the "Concat" op +// has registration: +// REGISTER_OP("Concat") +// .Input("concat_dim: int32") +// .Input("values: N * T") +// .Output("output: T") +// .Attr("N: int >= 2") +// .Attr("T: type"); +// that defines two inputs, "concat_dim" and "values" (in that order). 
+// You must use TF_AddInput() for the first input (since it takes a +// single tensor), and TF_AddInputList() for the second input (since +// it takes a list, even if you were to pass a list with a single +// tensor), as in: +// TF_OperationDescription* desc = TF_NewOperation(graph, "Concat", "c"); +// TF_Output concat_dim_input = {...}; +// TF_AddInput(desc, concat_dim_input); +// TF_Output values_inputs[5] = {{...}, ..., {...}}; +// TF_AddInputList(desc, values_inputs, 5); + +// For inputs that take a single tensor. +TF_CAPI_EXPORT extern void TF_AddInput(TF_OperationDescription* desc, + TF_Output input); + +// For inputs that take a list of tensors. +// inputs must point to TF_Output[num_inputs]. +TF_CAPI_EXPORT extern void TF_AddInputList(TF_OperationDescription* desc, + const TF_Output* inputs, + int num_inputs); + +// Call once per control input to `desc`. +TF_CAPI_EXPORT extern void TF_AddControlInput(TF_OperationDescription* desc, + TF_Operation* input); + +// Request that `desc` be co-located on the device where `op` +// is placed. +// +// Use of this is discouraged since the implementation of device placement is +// subject to change. Primarily intended for internal libraries +TF_CAPI_EXPORT extern void TF_ColocateWith(TF_OperationDescription* desc, + TF_Operation* op); + +// Call some TF_SetAttr*() function for every attr that is not +// inferred from an input and doesn't have a default value you wish to +// keep. + +// `value` must point to a string of length `length` bytes. +TF_CAPI_EXPORT extern void TF_SetAttrString(TF_OperationDescription* desc, + const char* attr_name, + const void* value, size_t length); +// `values` and `lengths` each must have lengths `num_values`. +// `values[i]` must point to a string of length `lengths[i]` bytes. +TF_CAPI_EXPORT extern void TF_SetAttrStringList(TF_OperationDescription* desc, + const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrInt(TF_OperationDescription* desc, + const char* attr_name, int64_t value); +TF_CAPI_EXPORT extern void TF_SetAttrIntList(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrFloat(TF_OperationDescription* desc, + const char* attr_name, float value); +TF_CAPI_EXPORT extern void TF_SetAttrFloatList(TF_OperationDescription* desc, + const char* attr_name, + const float* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrBool(TF_OperationDescription* desc, + const char* attr_name, + unsigned char value); +TF_CAPI_EXPORT extern void TF_SetAttrBoolList(TF_OperationDescription* desc, + const char* attr_name, + const unsigned char* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrType(TF_OperationDescription* desc, + const char* attr_name, + TF_DataType value); +TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc, + const char* attr_name, + const TF_DataType* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrPlaceholder(TF_OperationDescription* desc, + const char* attr_name, + const char* placeholder); + +// Set a 'func' attribute to the specified name. +// `value` must point to a string of length `length` bytes. +TF_CAPI_EXPORT extern void TF_SetAttrFuncName(TF_OperationDescription* desc, + const char* attr_name, + const char* value, size_t length); + +// Set `num_dims` to -1 to represent "unknown rank". Otherwise, +// `dims` points to an array of length `num_dims`. 
`dims[i]` must be +// >= -1, with -1 meaning "unknown dimension". +TF_CAPI_EXPORT extern void TF_SetAttrShape(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* dims, int num_dims); +// `dims` and `num_dims` must point to arrays of length `num_shapes`. +// Set `num_dims[i]` to -1 to represent "unknown rank". Otherwise, +// `dims[i]` points to an array of length `num_dims[i]`. `dims[i][j]` +// must be >= -1, with -1 meaning "unknown dimension". +TF_CAPI_EXPORT extern void TF_SetAttrShapeList(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* const* dims, + const int* num_dims, + int num_shapes); +// `proto` must point to an array of `proto_len` bytes representing a +// binary-serialized TensorShapeProto. +TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProto( + TF_OperationDescription* desc, const char* attr_name, const void* proto, + size_t proto_len, TF_Status* status); +// `protos` and `proto_lens` must point to arrays of length `num_shapes`. +// `protos[i]` must point to an array of `proto_lens[i]` bytes +// representing a binary-serialized TensorShapeProto. +TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProtoList( + TF_OperationDescription* desc, const char* attr_name, + const void* const* protos, const size_t* proto_lens, int num_shapes, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_SetAttrTensor(TF_OperationDescription* desc, + const char* attr_name, + TF_Tensor* value, + TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrTensorList(TF_OperationDescription* desc, + const char* attr_name, + TF_Tensor* const* values, + int num_values, + TF_Status* status); + +// `proto` should point to a sequence of bytes of length `proto_len` +// representing a binary serialization of an AttrValue protocol +// buffer. +TF_CAPI_EXPORT extern void TF_SetAttrValueProto(TF_OperationDescription* desc, + const char* attr_name, + const void* proto, + size_t proto_len, + TF_Status* status); + +// Adds this operation to the graph - see `TF_FinishOperation` for more details. +// +// The lock for `graph` must be held when calling this function. +// +// Unless implementing advanced behavior, like custom gradient functions, you +// most likely need to call `TF_FinishOperation` instead. +TF_CAPI_EXPORT extern TF_Operation* TF_FinishOperationLocked( + TF_OperationDescription* desc, TF_Status* status); + +// If this function succeeds: +// * *status is set to an OK value, +// * a TF_Operation is added to the graph, +// * a non-null value pointing to the added operation is returned -- +// this value is valid until the underlying graph is deleted. +// Otherwise: +// * *status is set to a non-OK value, +// * the graph is not modified, +// * a null value is returned. +// In either case, it deletes `desc`. +TF_CAPI_EXPORT extern TF_Operation* TF_FinishOperation( + TF_OperationDescription* desc, TF_Status* status); + +// TF_Operation functions. Operations are immutable once created, so +// these are all query functions. 
+ +TF_CAPI_EXPORT extern const char* TF_OperationName(TF_Operation* oper); +TF_CAPI_EXPORT extern const char* TF_OperationOpType(TF_Operation* oper); +TF_CAPI_EXPORT extern const char* TF_OperationDevice(TF_Operation* oper); + +TF_CAPI_EXPORT extern int TF_OperationNumOutputs(TF_Operation* oper); +TF_CAPI_EXPORT extern TF_DataType TF_OperationOutputType(TF_Output oper_out); +TF_CAPI_EXPORT extern int TF_OperationOutputListLength(TF_Operation* oper, + const char* arg_name, + TF_Status* status); + +TF_CAPI_EXPORT extern int TF_OperationNumInputs(TF_Operation* oper); +TF_CAPI_EXPORT extern TF_DataType TF_OperationInputType(TF_Input oper_in); +TF_CAPI_EXPORT extern int TF_OperationInputListLength(TF_Operation* oper, + const char* arg_name, + TF_Status* status); + +// In this code: +// TF_Output producer = TF_OperationInput(consumer); +// There is an edge from producer.oper's output (given by +// producer.index) to consumer.oper's input (given by consumer.index). +TF_CAPI_EXPORT extern TF_Output TF_OperationInput(TF_Input oper_in); + +// Get list of all inputs of a specific operation. `inputs` must point to +// an array of length at least `max_inputs` (ideally set to +// TF_OperationNumInputs(oper)). Beware that a concurrent +// modification of the graph can increase the number of inputs of +// an operation. +TF_CAPI_EXPORT extern void TF_OperationAllInputs(TF_Operation* oper, + TF_Output* inputs, + int max_inputs); + +// Get the number of current consumers of a specific output of an +// operation. Note that this number can change when new operations +// are added to the graph. +TF_CAPI_EXPORT extern int TF_OperationOutputNumConsumers(TF_Output oper_out); + +// Get list of all current consumers of a specific output of an +// operation. `consumers` must point to an array of length at least +// `max_consumers` (ideally set to +// TF_OperationOutputNumConsumers(oper_out)). Beware that a concurrent +// modification of the graph can increase the number of consumers of +// an operation. Returns the number of output consumers (should match +// TF_OperationOutputNumConsumers(oper_out)). +TF_CAPI_EXPORT extern int TF_OperationOutputConsumers(TF_Output oper_out, + TF_Input* consumers, + int max_consumers); + +// Get the number of control inputs to an operation. +TF_CAPI_EXPORT extern int TF_OperationNumControlInputs(TF_Operation* oper); + +// Get list of all control inputs to an operation. `control_inputs` must +// point to an array of length `max_control_inputs` (ideally set to +// TF_OperationNumControlInputs(oper)). Returns the number of control +// inputs (should match TF_OperationNumControlInputs(oper)). +TF_CAPI_EXPORT extern int TF_OperationGetControlInputs( + TF_Operation* oper, TF_Operation** control_inputs, int max_control_inputs); + +// Get the number of operations that have `*oper` as a control input. +// Note that this number can change when new operations are added to +// the graph. +TF_CAPI_EXPORT extern int TF_OperationNumControlOutputs(TF_Operation* oper); + +// Get the list of operations that have `*oper` as a control input. +// `control_outputs` must point to an array of length at least +// `max_control_outputs` (ideally set to +// TF_OperationNumControlOutputs(oper)). Beware that a concurrent +// modification of the graph can increase the number of control +// outputs. Returns the number of control outputs (should match +// TF_OperationNumControlOutputs(oper)). 
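+//
+// A sketch of the count-then-fill pattern shared by these list getters
+// (`oper` is assumed to be a valid TF_Operation*; <stdlib.h> provides malloc):
+//   int n = TF_OperationNumControlOutputs(oper);
+//   TF_Operation** ctl = (TF_Operation**)malloc(n * sizeof(TF_Operation*));
+//   int filled = TF_OperationGetControlOutputs(oper, ctl, n);
+//   for (int i = 0; i < filled; ++i) {
+//     /* inspect ctl[i], e.g. TF_OperationName(ctl[i]) */
+//   }
+//   free(ctl);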
+TF_CAPI_EXPORT extern int TF_OperationGetControlOutputs( + TF_Operation* oper, TF_Operation** control_outputs, + int max_control_outputs); + +// TF_AttrMetadata describes the value of an attribute on an operation. +typedef struct TF_AttrMetadata { + // A boolean: 1 if the attribute value is a list, 0 otherwise. + unsigned char is_list; + + // Length of the list if is_list is true. Undefined otherwise. + int64_t list_size; + + // Type of elements of the list if is_list != 0. + // Type of the single value stored in the attribute if is_list == 0. + TF_AttrType type; + + // Total size the attribute value. + // The units of total_size depend on is_list and type. + // (1) If type == TF_ATTR_STRING and is_list == 0 + // then total_size is the byte size of the string + // valued attribute. + // (2) If type == TF_ATTR_STRING and is_list == 1 + // then total_size is the cumulative byte size + // of all the strings in the list. + // (3) If type == TF_ATTR_SHAPE and is_list == 0 + // then total_size is the number of dimensions + // of the shape valued attribute, or -1 + // if its rank is unknown. + // (4) If type == TF_ATTR_SHAPE and is_list == 1 + // then total_size is the cumulative number + // of dimensions of all shapes in the list. + // (5) Otherwise, total_size is undefined. + int64_t total_size; +} TF_AttrMetadata; + +// Returns metadata about the value of the attribute `attr_name` of `oper`. +TF_CAPI_EXPORT extern TF_AttrMetadata TF_OperationGetAttrMetadata( + TF_Operation* oper, const char* attr_name, TF_Status* status); + +// Fills in `value` with the value of the attribute `attr_name`. `value` must +// point to an array of length at least `max_length` (ideally set to +// TF_AttrMetadata.total_size from TF_OperationGetAttrMetadata(oper, +// attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrString(TF_Operation* oper, + const char* attr_name, + void* value, + size_t max_length, + TF_Status* status); + +// Get the list of strings in the value of the attribute `attr_name`. Fills in +// `values` and `lengths`, each of which must point to an array of length at +// least `max_values`. +// +// The elements of values will point to addresses in `storage` which must be at +// least `storage_size` bytes in length. Ideally, max_values would be set to +// TF_AttrMetadata.list_size and `storage` would be at least +// TF_AttrMetadata.total_size, obtained from TF_OperationGetAttrMetadata(oper, +// attr_name). +// +// Fails if storage_size is too small to hold the requested number of strings. +TF_CAPI_EXPORT extern void TF_OperationGetAttrStringList( + TF_Operation* oper, const char* attr_name, void** values, size_t* lengths, + int max_values, void* storage, size_t storage_size, TF_Status* status); + +TF_CAPI_EXPORT extern void TF_OperationGetAttrInt(TF_Operation* oper, + const char* attr_name, + int64_t* value, + TF_Status* status); + +// Fills in `values` with the value of the attribute `attr_name` of `oper`. +// `values` must point to an array of length at least `max_values` (ideally set +// TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, +// attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrIntList(TF_Operation* oper, + const char* attr_name, + int64_t* values, + int max_values, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_OperationGetAttrFloat(TF_Operation* oper, + const char* attr_name, + float* value, + TF_Status* status); + +// Fills in `values` with the value of the attribute `attr_name` of `oper`. 
+// `values` must point to an array of length at least `max_values` (ideally set +// to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, +// attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrFloatList(TF_Operation* oper, + const char* attr_name, + float* values, + int max_values, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_OperationGetAttrBool(TF_Operation* oper, + const char* attr_name, + unsigned char* value, + TF_Status* status); + +// Fills in `values` with the value of the attribute `attr_name` of `oper`. +// `values` must point to an array of length at least `max_values` (ideally set +// to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, +// attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrBoolList(TF_Operation* oper, + const char* attr_name, + unsigned char* values, + int max_values, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_OperationGetAttrType(TF_Operation* oper, + const char* attr_name, + TF_DataType* value, + TF_Status* status); + +// Fills in `values` with the value of the attribute `attr_name` of `oper`. +// `values` must point to an array of length at least `max_values` (ideally set +// to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, +// attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrTypeList(TF_Operation* oper, + const char* attr_name, + TF_DataType* values, + int max_values, + TF_Status* status); + +// Fills in `value` with the value of the attribute `attr_name` of `oper`. +// `values` must point to an array of length at least `num_dims` (ideally set to +// TF_Attr_Meta.size from TF_OperationGetAttrMetadata(oper, attr_name)). +TF_CAPI_EXPORT extern void TF_OperationGetAttrShape(TF_Operation* oper, + const char* attr_name, + int64_t* value, + int num_dims, + TF_Status* status); + +// Fills in `dims` with the list of shapes in the attribute `attr_name` of +// `oper` and `num_dims` with the corresponding number of dimensions. On return, +// for every i where `num_dims[i]` > 0, `dims[i]` will be an array of +// `num_dims[i]` elements. A value of -1 for `num_dims[i]` indicates that the +// i-th shape in the list is unknown. +// +// The elements of `dims` will point to addresses in `storage` which must be +// large enough to hold at least `storage_size` int64_ts. Ideally, `num_shapes` +// would be set to TF_AttrMetadata.list_size and `storage_size` would be set to +// TF_AttrMetadata.total_size from TF_OperationGetAttrMetadata(oper, +// attr_name). +// +// Fails if storage_size is insufficient to hold the requested shapes. +TF_CAPI_EXPORT extern void TF_OperationGetAttrShapeList( + TF_Operation* oper, const char* attr_name, int64_t** dims, int* num_dims, + int num_shapes, int64_t* storage, int storage_size, TF_Status* status); + +// Sets `value` to the binary-serialized TensorShapeProto of the value of +// `attr_name` attribute of `oper`. +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProto( + TF_Operation* oper, const char* attr_name, TF_Buffer* value, + TF_Status* status); + +// Fills in `values` with binary-serialized TensorShapeProto values of the +// attribute `attr_name` of `oper`. `values` must point to an array of length at +// least `num_values` (ideally set to TF_AttrMetadata.list_size from +// TF_OperationGetAttrMetadata(oper, attr_name)). 
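+//
+// A sketch of the usual "metadata first, then fetch" pattern for these
+// getters, assuming `oper` has a shape-valued attribute named "shape"
+// (the attribute name is illustrative; error handling abbreviated):
+//   TF_AttrMetadata m = TF_OperationGetAttrMetadata(oper, "shape", status);
+//   if (TF_GetCode(status) == TF_OK && m.type == TF_ATTR_SHAPE &&
+//       !m.is_list && m.total_size >= 0) {
+//     int64_t* dims = (int64_t*)malloc(m.total_size * sizeof(int64_t));
+//     TF_OperationGetAttrShape(oper, "shape", dims, (int)m.total_size, status);
+//     free(dims);
+//   }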
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProtoList( + TF_Operation* oper, const char* attr_name, TF_Buffer** values, + int max_values, TF_Status* status); + +// Gets the TF_Tensor valued attribute of `attr_name` of `oper`. +// +// Allocates a new TF_Tensor which the caller is expected to take +// ownership of (and can deallocate using TF_DeleteTensor). +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensor(TF_Operation* oper, + const char* attr_name, + TF_Tensor** value, + TF_Status* status); + +// Fills in `values` with the TF_Tensor values of the attribute `attr_name` of +// `oper`. `values` must point to an array of TF_Tensor* of length at least +// `max_values` (ideally set to TF_AttrMetadata.list_size from +// TF_OperationGetAttrMetadata(oper, attr_name)). +// +// The caller takes ownership of all the non-null TF_Tensor* entries in `values` +// (which can be deleted using TF_DeleteTensor(values[i])). +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorList(TF_Operation* oper, + const char* attr_name, + TF_Tensor** values, + int max_values, + TF_Status* status); + +// Sets `output_attr_value` to the binary-serialized AttrValue proto +// representation of the value of the `attr_name` attr of `oper`. +TF_CAPI_EXPORT extern void TF_OperationGetAttrValueProto( + TF_Operation* oper, const char* attr_name, TF_Buffer* output_attr_value, + TF_Status* status); + +// Get the number of attributes the operation has. +TF_CAPI_EXPORT extern int TF_OperationGetNumAttrs(TF_Operation* oper); + +// Get the length of the name of the ith attribute, or -1 if there is not an +// ith attribute. +TF_CAPI_EXPORT extern int TF_OperationGetAttrNameLength(TF_Operation* oper, + int i); + +// Get the name of the ith attribute. output should have the size of +// TF_OperationGetAttrNameLength(oper, i). +TF_CAPI_EXPORT extern void TF_OperationGetAttrName(TF_Operation* oper, int i, + char* output, + TF_Status* status); + +// Returns the operation in the graph with `oper_name`. Returns nullptr if +// no operation found. +TF_CAPI_EXPORT extern TF_Operation* TF_GraphOperationByName( + TF_Graph* graph, const char* oper_name); + +// Iterate through the operations of a graph. To use: +// size_t pos = 0; +// TF_Operation* oper; +// while ((oper = TF_GraphNextOperation(graph, &pos)) != nullptr) { +// DoSomethingWithOperation(oper); +// } +TF_CAPI_EXPORT extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, + size_t* pos); + +// Write out a serialized representation of `graph` (as a GraphDef protocol +// message) to `output_graph_def` (allocated by TF_NewBuffer()). +// `output_graph_def`'s underlying buffer will be freed when TF_DeleteBuffer() +// is called. +// +// May fail on very large graphs in the future. +TF_CAPI_EXPORT extern void TF_GraphToGraphDef(TF_Graph* graph, + TF_Buffer* output_graph_def, + TF_Status* status); + +// Returns the serialized OpDef proto with name `op_name`, or a bad status if no +// such op exists. This can return OpDefs of functions copied into the graph. +TF_CAPI_EXPORT extern void TF_GraphGetOpDef(TF_Graph* graph, + const char* op_name, + TF_Buffer* output_op_def, + TF_Status* status); + +// Returns the serialized VersionDef proto for this graph. +TF_CAPI_EXPORT extern void TF_GraphVersions(TF_Graph* graph, + TF_Buffer* output_version_def, + TF_Status* status); + +// TF_ImportGraphDefOptions holds options that can be passed to +// TF_GraphImportGraphDef. 
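+//
+// A roundtrip sketch: serialize `src_graph` and import it into `dst_graph`
+// under a prefix (`src_graph`, `dst_graph`, and `status` are assumed to
+// exist; error handling abbreviated):
+//   TF_Buffer* gdef = TF_NewBuffer();
+//   TF_GraphToGraphDef(src_graph, gdef, status);
+//   TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
+//   TF_ImportGraphDefOptionsSetPrefix(opts, "imported");
+//   TF_GraphImportGraphDef(dst_graph, gdef, opts, status);
+//   TF_DeleteImportGraphDefOptions(opts);
+//   TF_DeleteBuffer(gdef);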
+typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions; + +TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions( + void); +TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions( + TF_ImportGraphDefOptions* opts); + +// Set the prefix to be prepended to the names of nodes in `graph_def` that will +// be imported into `graph`. `prefix` is copied and has no lifetime +// requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix( + TF_ImportGraphDefOptions* opts, const char* prefix); + +// Set the execution device for nodes in `graph_def`. +// Only applies to nodes where a device was not already explicitly specified. +// `device` is copied and has no lifetime requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetDefaultDevice( + TF_ImportGraphDefOptions* opts, const char* device); + +// Set whether to uniquify imported operation names. If true, imported operation +// names will be modified if their name already exists in the graph. If false, +// conflicting names will be treated as an error. Note that this option has no +// effect if a prefix is set, since the prefix will guarantee all names are +// unique. Defaults to false. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyNames( + TF_ImportGraphDefOptions* opts, unsigned char uniquify_names); + +// If true, the specified prefix will be modified if it already exists as an +// operation name or prefix in the graph. If false, a conflicting prefix will be +// treated as an error. This option has no effect if no prefix is specified. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyPrefix( + TF_ImportGraphDefOptions* opts, unsigned char uniquify_prefix); + +// Set any imported nodes with input `src_name:src_index` to have that input +// replaced with `dst`. `src_name` refers to a node in the graph to be imported, +// `dst` references a node already existing in the graph being imported into. +// `src_name` is copied and has no lifetime requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddInputMapping( + TF_ImportGraphDefOptions* opts, const char* src_name, int src_index, + TF_Output dst); + +// Set any imported nodes with control input `src_name` to have that input +// replaced with `dst`. `src_name` refers to a node in the graph to be imported, +// `dst` references an operation already existing in the graph being imported +// into. `src_name` is copied and has no lifetime requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsRemapControlDependency( + TF_ImportGraphDefOptions* opts, const char* src_name, TF_Operation* dst); + +// Cause the imported graph to have a control dependency on `oper`. `oper` +// should exist in the graph being imported into. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddControlDependency( + TF_ImportGraphDefOptions* opts, TF_Operation* oper); + +// Add an output in `graph_def` to be returned via the `return_outputs` output +// parameter of TF_GraphImportGraphDef(). If the output is remapped via an input +// mapping, the corresponding existing tensor in `graph` will be returned. +// `oper_name` is copied and has no lifetime requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOutput( + TF_ImportGraphDefOptions* opts, const char* oper_name, int index); + +// Returns the number of return outputs added via +// TF_ImportGraphDefOptionsAddReturnOutput(). 
+TF_CAPI_EXPORT extern int TF_ImportGraphDefOptionsNumReturnOutputs( + const TF_ImportGraphDefOptions* opts); + +// Add an operation in `graph_def` to be returned via the `return_opers` output +// parameter of TF_GraphImportGraphDef(). `oper_name` is copied and has no +// lifetime requirements. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOperation( + TF_ImportGraphDefOptions* opts, const char* oper_name); + +// Returns the number of return operations added via +// TF_ImportGraphDefOptionsAddReturnOperation(). +TF_CAPI_EXPORT extern int TF_ImportGraphDefOptionsNumReturnOperations( + const TF_ImportGraphDefOptions* opts); + +// TF_ImportGraphDefResults holds results that are generated by +// TF_GraphImportGraphDefWithResults(). +typedef struct TF_ImportGraphDefResults TF_ImportGraphDefResults; + +// Fetches the return outputs requested via +// TF_ImportGraphDefOptionsAddReturnOutput(). The number of fetched outputs is +// returned in `num_outputs`. The array of return outputs is returned in +// `outputs`. `*outputs` is owned by and has the lifetime of `results`. +TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsReturnOutputs( + TF_ImportGraphDefResults* results, int* num_outputs, TF_Output** outputs); + +// Fetches the return operations requested via +// TF_ImportGraphDefOptionsAddReturnOperation(). The number of fetched +// operations is returned in `num_opers`. The array of return operations is +// returned in `opers`. `*opers` is owned by and has the lifetime of `results`. +TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsReturnOperations( + TF_ImportGraphDefResults* results, int* num_opers, TF_Operation*** opers); + +// Fetches any input mappings requested via +// TF_ImportGraphDefOptionsAddInputMapping() that didn't appear in the GraphDef +// and weren't used as input to any node in the imported graph def. The number +// of fetched mappings is returned in `num_missing_unused_input_mappings`. The +// array of each mapping's source node name is returned in `src_names`, and the +// array of each mapping's source index is returned in `src_indexes`. +// +// `*src_names`, `*src_indexes`, and the memory backing each string in +// `src_names` are owned by and have the lifetime of `results`. +TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsMissingUnusedInputMappings( + TF_ImportGraphDefResults* results, int* num_missing_unused_input_mappings, + const char*** src_names, int** src_indexes); + +// Deletes a results object returned by TF_GraphImportGraphDefWithResults(). +TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefResults( + TF_ImportGraphDefResults* results); + +// Import the graph serialized in `graph_def` into `graph`. Returns nullptr and +// a bad status on error. Otherwise, returns a populated +// TF_ImportGraphDefResults instance. The returned instance must be deleted via +// TF_DeleteImportGraphDefResults(). 
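+//
+// A sketch of requesting return outputs and reading them back, assuming
+// `graph`, `graph_def`, and `status` exist and the GraphDef contains a node
+// named "out" (the node name is illustrative; error handling abbreviated):
+//   TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
+//   TF_ImportGraphDefOptionsAddReturnOutput(opts, "out", 0);
+//   TF_ImportGraphDefResults* results =
+//       TF_GraphImportGraphDefWithResults(graph, graph_def, opts, status);
+//   if (TF_GetCode(status) == TF_OK) {
+//     int num = 0;
+//     TF_Output* outs = NULL;
+//     TF_ImportGraphDefResultsReturnOutputs(results, &num, &outs);
+//     /* outs[0] now refers to "out:0" in `graph`; owned by `results`. */
+//     TF_DeleteImportGraphDefResults(results);
+//   }
+//   TF_DeleteImportGraphDefOptions(opts);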
+TF_CAPI_EXPORT extern TF_ImportGraphDefResults* +TF_GraphImportGraphDefWithResults(TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, + TF_Status* status); + +// Has the same behavior as TF_GraphImportGraphDefWithResults, but instead of +// taking in a serialized tensorflow::GraphDef, it takes in a *pointer* to the +// C++ *in memory representation* of the GraphDef, stored in `graph_def->data` +TF_CAPI_EXPORT extern TF_ImportGraphDefResults* +TF_GraphImportGraphDefWithResultsNoSerialization( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Status* status); + +// Import the graph serialized in `graph_def` into `graph`. +// Convenience function for when only return outputs are needed. +// +// `num_return_outputs` must be the number of return outputs added (i.e. the +// result of TF_ImportGraphDefOptionsNumReturnOutputs()). If +// `num_return_outputs` is non-zero, `return_outputs` must be of length +// `num_return_outputs`. Otherwise it can be null. +TF_CAPI_EXPORT extern void TF_GraphImportGraphDefWithReturnOutputs( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Output* return_outputs, + int num_return_outputs, TF_Status* status); + +// Import the graph serialized in `graph_def` into `graph`. +// Convenience function for when no results are needed. +TF_CAPI_EXPORT extern void TF_GraphImportGraphDef( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Status* status); + +// Adds a copy of function `func` and optionally its gradient function `grad` +// to `g`. Once `func`/`grad` is added to `g`, it can be called by creating +// an operation using the function's name. +// Any changes to `func`/`grad` (including deleting it) done after this method +// returns, won't affect the copy of `func`/`grad` in `g`. +// If `func` or `grad` are already in `g`, TF_GraphCopyFunction has no +// effect on them, but can establish the function->gradient relationship +// between them if `func` does not already have a gradient. If `func` already +// has a gradient different from `grad`, an error is returned. +// +// `func` must not be null. +// If `grad` is null and `func` is not in `g`, `func` is added without a +// gradient. +// If `grad` is null and `func` is in `g`, TF_GraphCopyFunction is a noop. +// `grad` must have appropriate signature as described in the doc of +// GradientDef in tensorflow/core/framework/function.proto. +// +// If successful, status is set to OK and `func` and `grad` are added to `g`. +// Otherwise, status is set to the encountered error and `g` is unmodified. +TF_CAPI_EXPORT extern void TF_GraphCopyFunction(TF_Graph* g, + const TF_Function* func, + const TF_Function* grad, + TF_Status* status); + +// Returns the number of TF_Functions registered in `g`. +TF_CAPI_EXPORT extern int TF_GraphNumFunctions(TF_Graph* g); + +// Fills in `funcs` with the TF_Function* registered in `g`. +// `funcs` must point to an array of TF_Function* of length at least +// `max_func`. In usual usage, max_func should be set to the result of +// TF_GraphNumFunctions(g). In this case, all the functions registered in +// `g` will be returned. Else, an unspecified subset. +// +// If successful, returns the number of TF_Function* successfully set in +// `funcs` and sets status to OK. The caller takes ownership of +// all the returned TF_Functions. They must be deleted with TF_DeleteFunction. 
+// On error, returns 0, sets status to the encountered error, and the contents +// of funcs will be undefined. +TF_CAPI_EXPORT extern int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs, + int max_func, TF_Status* status); + +// Note: The following function may fail on very large protos in the future. + +TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper, + TF_Buffer* output_node_def, + TF_Status* status); + +typedef struct TF_WhileParams { + // The number of inputs to the while loop, i.e. the number of loop variables. + // This is the size of cond_inputs, body_inputs, and body_outputs. + const int ninputs; + + // The while condition graph. The inputs are the current values of the loop + // variables. The output should be a scalar boolean. + TF_Graph* const cond_graph; + const TF_Output* const cond_inputs; + TF_Output cond_output; + + // The loop body graph. The inputs are the current values of the loop + // variables. The outputs are the updated values of the loop variables. + TF_Graph* const body_graph; + const TF_Output* const body_inputs; + TF_Output* const body_outputs; + + // Unique null-terminated name for this while loop. This is used as a prefix + // for created operations. + const char* name; +} TF_WhileParams; + +// Creates a TF_WhileParams for creating a while loop in `g`. `inputs` are +// outputs that already exist in `g` used as initial values for the loop +// variables. +// +// The returned TF_WhileParams will have all fields initialized except +// `cond_output`, `body_outputs`, and `name`. The `body_outputs` buffer will be +// allocated to size `ninputs`. The caller should build `cond_graph` and +// `body_graph` starting from the inputs, and store the final outputs in +// `cond_output` and `body_outputs`. +// +// If `status` is OK, the caller must call either TF_FinishWhile or +// TF_AbortWhile on the returned TF_WhileParams. If `status` isn't OK, the +// returned TF_WhileParams is not valid, and the caller should not call +// TF_FinishWhile() or TF_AbortWhile(). +// +// Missing functionality (TODO): +// - Gradients +// - Reference-type inputs +// - Directly referencing external tensors from the cond/body graphs (this is +// possible in the Python API) +TF_CAPI_EXPORT extern TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, + int ninputs, + TF_Status* status); + +// Builds the while loop specified by `params` and returns the output tensors of +// the while loop in `outputs`. `outputs` should be allocated to size +// `params.ninputs`. +// +// `params` is no longer valid once this returns. +// +// Either this or TF_AbortWhile() must be called after a successful +// TF_NewWhile() call. +TF_CAPI_EXPORT extern void TF_FinishWhile(const TF_WhileParams* params, + TF_Status* status, + TF_Output* outputs); + +// Frees `params`s resources without building a while loop. `params` is no +// longer valid after this returns. Either this or TF_FinishWhile() must be +// called after a successful TF_NewWhile() call. +TF_CAPI_EXPORT extern void TF_AbortWhile(const TF_WhileParams* params); + +// Adds operations to compute the partial derivatives of sum of `y`s w.r.t `x`s, +// i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2... +// +// `dx` are used as initial gradients (which represent the symbolic partial +// derivatives of some loss function `L` w.r.t. `y`). +// `dx` must be nullptr or have size `ny`. +// If `dx` is nullptr, the implementation will use dx of `OnesLike` for all +// shapes in `y`. +// The partial derivatives are returned in `dy`. 
`dy` should be allocated to +// size `nx`. +// +// Gradient nodes are automatically named under the "gradients/" prefix. To +// guarantee name uniqueness, subsequent calls to the same graph will +// append an incremental tag to the prefix: "gradients_1/", "gradients_2/", ... +// See TF_AddGradientsWithPrefix, which provides a means to specify a custom +// name prefix for operations added to a graph to compute the gradients. +// +// WARNING: This function does not yet support all the gradients that python +// supports. See +// https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md +// for instructions on how to add C++ more gradients. +TF_CAPI_EXPORT void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, + TF_Output* x, int nx, TF_Output* dx, + TF_Status* status, TF_Output* dy); + +// Adds operations to compute the partial derivatives of sum of `y`s w.r.t `x`s, +// i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2... +// This is a variant of TF_AddGradients that allows to caller to pass a custom +// name prefix to the operations added to a graph to compute the gradients. +// +// `dx` are used as initial gradients (which represent the symbolic partial +// derivatives of some loss function `L` w.r.t. `y`). +// `dx` must be nullptr or have size `ny`. +// If `dx` is nullptr, the implementation will use dx of `OnesLike` for all +// shapes in `y`. +// The partial derivatives are returned in `dy`. `dy` should be allocated to +// size `nx`. +// `prefix` names the scope into which all gradients operations are being added. +// `prefix` must be unique within the provided graph otherwise this operation +// will fail. If `prefix` is nullptr, the default prefixing behaviour takes +// place, see TF_AddGradients for more details. +// +// WARNING: This function does not yet support all the gradients that python +// supports. See +// https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md +// for instructions on how to add C++ more gradients. +TF_CAPI_EXPORT void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, + TF_Output* y, int ny, + TF_Output* x, int nx, + TF_Output* dx, TF_Status* status, + TF_Output* dy); + +// Create a TF_Function from a TF_Graph +// +// Params: +// fn_body - the graph whose operations (or subset of whose operations) will be +// converted to TF_Function. +// fn_name - the name of the new TF_Function. Should match the operation +// name (OpDef.name) regexp [A-Z][A-Za-z0-9_.\\-/]*. +// If `append_hash_to_fn_name` is false, `fn_name` must be distinct +// from other function and operation names (at least those +// registered in graphs where this function will be used). +// append_hash_to_fn_name - Must be 0 or 1. If set to 1, the actual name +// of the function will be `fn_name` appended with +// '_'. +// If set to 0, the function's name will be `fn_name`. +// num_opers - `num_opers` contains the number of elements in the `opers` array +// or a special value of -1 meaning that no array is given. +// The distinction between an empty array of operations and no +// array of operations is necessary to distinguish the case of +// creating a function with no body (e.g. identity or permutation) +// and the case of creating a function whose body contains all +// the nodes in the graph (except for the automatic skipping, see +// below). +// opers - Array of operations to become the body of the function or null. 
+// - If no array is given (`num_opers` = -1), all the +// operations in `fn_body` will become part of the function +// except operations referenced in `inputs`. These operations +// must have a single output (these operations are typically +// placeholders created for the sole purpose of representing +// an input. We can relax this constraint if there are +// compelling use cases). +// - If an array is given (`num_opers` >= 0), all operations +// in it will become part of the function. In particular, no +// automatic skipping of dummy input operations is performed. +// ninputs - number of elements in `inputs` array +// inputs - array of TF_Outputs that specify the inputs to the function. +// If `ninputs` is zero (the function takes no inputs), `inputs` +// can be null. The names used for function inputs are normalized +// names of the operations (usually placeholders) pointed to by +// `inputs`. These operation names should start with a letter. +// Normalization will convert all letters to lowercase and +// non-alphanumeric characters to '_' to make resulting names match +// the "[a-z][a-z0-9_]*" pattern for operation argument names. +// `inputs` cannot contain the same tensor twice. +// noutputs - number of elements in `outputs` array +// outputs - array of TF_Outputs that specify the outputs of the function. +// If `noutputs` is zero (the function returns no outputs), `outputs` +// can be null. `outputs` can contain the same tensor more than once. +// output_names - The names of the function's outputs. `output_names` array +// must either have the same length as `outputs` +// (i.e. `noutputs`) or be null. In the former case, +// the names should match the regular expression for ArgDef +// names - "[a-z][a-z0-9_]*". In the latter case, +// names for outputs will be generated automatically. +// opts - various options for the function, e.g. XLA's inlining control. +// description - optional human-readable description of this function. +// status - Set to OK on success and an appropriate error on failure. +// +// Note that when the same TF_Output is listed as both an input and an output, +// the corresponding function's output will equal to this input, +// instead of the original node's output. +// +// Callers must also satisfy the following constraints: +// - `inputs` cannot refer to TF_Outputs within a control flow context. For +// example, one cannot use the output of "switch" node as input. +// - `inputs` and `outputs` cannot have reference types. Reference types are +// not exposed through C API and are being replaced with Resources. We support +// reference types inside function's body to support legacy code. Do not +// use them in new code. +// - Every node in the function's body must have all of its inputs (including +// control inputs). In other words, for every node in the body, each input +// must be either listed in `inputs` or must come from another node in +// the body. In particular, it is an error to have a control edge going from +// a node outside of the body into a node in the body. This applies to control +// edges going from nodes referenced in `inputs` to nodes in the body when +// the former nodes are not in the body (automatically skipped or not +// included in explicitly specified body). +// +// Returns: +// On success, a newly created TF_Function instance. It must be deleted by +// calling TF_DeleteFunction. +// +// On failure, null. 
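+//
+// A sketch using the whole graph as the function body, with one placeholder
+// input and one output (`fn_body`, `input_op`, `output_op`, and `status` are
+// assumed to exist; names are illustrative, error handling abbreviated):
+//   TF_Output inputs[1] = {{input_op, 0}};
+//   TF_Output outputs[1] = {{output_op, 0}};
+//   TF_Function* fn = TF_GraphToFunction(
+//       fn_body, "my_fn", /*append_hash_to_fn_name=*/0,
+//       /*num_opers=*/-1, /*opers=*/NULL,
+//       /*ninputs=*/1, inputs, /*noutputs=*/1, outputs,
+//       /*output_names=*/NULL, /*opts=*/NULL, "example function", status);
+//   if (TF_GetCode(status) == TF_OK) {
+//     /* e.g. TF_GraphCopyFunction(other_graph, fn, NULL, status); */
+//     TF_DeleteFunction(fn);
+//   }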
+TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction( + const TF_Graph* fn_body, const char* fn_name, + unsigned char append_hash_to_fn_name, int num_opers, + const TF_Operation* const* opers, int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, const char* const* output_names, + const TF_FunctionOptions* opts, const char* description, TF_Status* status); + +// Similar to TF_GraphToFunction but allows specifying control outputs of the +// function. +// +// The arguments of TF_GraphToFunction have the same meaning, but the new +// arguments are as follows: +// +// ncontrol_outputs: Number of control outputs of the function. +// control_outputs: vector of TF_Operation objects to be marked as control +// outputs of the function. Operations marked as control outputs are +// guaranteed to execute. +// control_output_names: Optional. If not nullptr, vector of strings, one +// per control output, with their names to be added to the function's +// OpDef. +TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunctionWithControlOutputs( + const TF_Graph* fn_body, const char* fn_name, + unsigned char append_hash_to_fn_name, int num_opers, + const TF_Operation* const* opers, int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, const char* const* output_names, + int ncontrol_outputs, const TF_Operation* const* control_outputs, + const char* const* control_output_names, const TF_FunctionOptions* opts, + const char* description, TF_Status* status); + +// Returns the name of the graph function. +// The return value points to memory that is only usable until the next +// mutation to *func. +TF_CAPI_EXPORT extern const char* TF_FunctionName(TF_Function* func); + +// Write out a serialized representation of `func` (as a FunctionDef protocol +// message) to `output_func_def` (allocated by TF_NewBuffer()). +// `output_func_def`'s underlying buffer will be freed when TF_DeleteBuffer() +// is called. +// +// May fail on very large graphs in the future. +TF_CAPI_EXPORT extern void TF_FunctionToFunctionDef(TF_Function* func, + TF_Buffer* output_func_def, + TF_Status* status); + +// Construct and return the function whose FunctionDef representation is +// serialized in `proto`. `proto_len` must equal the number of bytes +// pointed to by `proto`. +// Returns: +// On success, a newly created TF_Function instance. It must be deleted by +// calling TF_DeleteFunction. +// +// On failure, null. +TF_CAPI_EXPORT extern TF_Function* TF_FunctionImportFunctionDef( + const void* proto, size_t proto_len, TF_Status* status); + +// Sets function attribute named `attr_name` to value stored in `proto`. +// If this attribute is already set to another value, it is overridden. +// `proto` should point to a sequence of bytes of length `proto_len` +// representing a binary serialization of an AttrValue protocol +// buffer. +TF_CAPI_EXPORT extern void TF_FunctionSetAttrValueProto(TF_Function* func, + const char* attr_name, + const void* proto, + size_t proto_len, + TF_Status* status); + +// Sets `output_attr_value` to the binary-serialized AttrValue proto +// representation of the value of the `attr_name` attr of `func`. +// If `attr_name` attribute is not present, status is set to an error. +TF_CAPI_EXPORT extern void TF_FunctionGetAttrValueProto( + TF_Function* func, const char* attr_name, TF_Buffer* output_attr_value, + TF_Status* status); + +// Frees the memory used by the `func` struct. +// TF_DeleteFunction is a noop if `func` is null. 
+// Deleting a function does not remove it from any graphs it was copied to. +TF_CAPI_EXPORT extern void TF_DeleteFunction(TF_Function* func); + +// Attempts to evaluate `output`. This will only be possible if `output` doesn't +// depend on any graph inputs (this function is safe to call if this isn't the +// case though). +// +// If the evaluation is successful, this function returns true and `output`s +// value is returned in `result`. Otherwise returns false. An error status is +// returned if something is wrong with the graph or input. Note that this may +// return false even if no error status is set. +TF_CAPI_EXPORT extern unsigned char TF_TryEvaluateConstant(TF_Graph* graph, + TF_Output output, + TF_Tensor** result, + TF_Status* status); + +// TODO(josh11b): Register OpDef, available to all operations added +// to this graph. + +// -------------------------------------------------------------------------- +// API for driving Graph execution. + +typedef struct TF_Session TF_Session; + +// Return a new execution session with the associated graph, or NULL on +// error. Does not take ownership of any input parameters. +// +// *`graph` must be a valid graph (not deleted or nullptr). `graph` will be +// kept alive for the lifetime of the returned TF_Session. New nodes can still +// be added to `graph` after this call. +TF_CAPI_EXPORT extern TF_Session* TF_NewSession(TF_Graph* graph, + const TF_SessionOptions* opts, + TF_Status* status); + +// This function creates a new TF_Session (which is created on success) using +// `session_options`, and then initializes state (restoring tensors and other +// assets) using `run_options`. +// +// Any NULL and non-NULL value combinations for (`run_options, `meta_graph_def`) +// are valid. +// +// - `export_dir` must be set to the path of the exported SavedModel. +// - `tags` must include the set of tags used to identify one MetaGraphDef in +// the SavedModel. +// - `graph` must be a graph newly allocated with TF_NewGraph(). +// +// If successful, populates `graph` with the contents of the Graph and +// `meta_graph_def` with the MetaGraphDef of the loaded model. +TF_CAPI_EXPORT extern TF_Session* TF_LoadSessionFromSavedModel( + const TF_SessionOptions* session_options, const TF_Buffer* run_options, + const char* export_dir, const char* const* tags, int tags_len, + TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status); + +// Close a session. +// +// Contacts any other processes associated with the session, if applicable. +// May not be called after TF_DeleteSession(). +TF_CAPI_EXPORT extern void TF_CloseSession(TF_Session*, TF_Status* status); + +// Destroy a session object. +// +// Even if error information is recorded in *status, this call discards all +// local resources associated with the session. The session may not be used +// during or after this call (and the session drops its reference to the +// corresponding graph). +TF_CAPI_EXPORT extern void TF_DeleteSession(TF_Session*, TF_Status* status); + +// Run the graph associated with the session starting with the supplied inputs +// (inputs[0,ninputs-1] with corresponding values in input_values[0,ninputs-1]). +// +// Any NULL and non-NULL value combinations for (`run_options`, +// `run_metadata`) are valid. +// +// - `run_options` may be NULL, in which case it will be ignored; or +// non-NULL, in which case it must point to a `TF_Buffer` containing the +// serialized representation of a `RunOptions` protocol buffer. 
+// - `run_metadata` may be NULL, in which case it will be ignored; or +// non-NULL, in which case it must point to an empty, freshly allocated +// `TF_Buffer` that may be updated to contain the serialized representation +// of a `RunMetadata` protocol buffer. +// +// The caller retains ownership of `input_values` (which can be deleted using +// TF_DeleteTensor). The caller also retains ownership of `run_options` and/or +// `run_metadata` (when not NULL) and should manually call TF_DeleteBuffer on +// them. +// +// On success, the tensors corresponding to outputs[0,noutputs-1] are placed in +// output_values[]. Ownership of the elements of output_values[] is transferred +// to the caller, which must eventually call TF_DeleteTensor on them. +// +// On failure, output_values[] contains NULLs. +TF_CAPI_EXPORT extern void TF_SessionRun( + TF_Session* session, + // RunOptions + const TF_Buffer* run_options, + // Input tensors + const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs, + // Output tensors + const TF_Output* outputs, TF_Tensor** output_values, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // RunMetadata + TF_Buffer* run_metadata, + // Output status + TF_Status*); + +// Set up the graph with the intended feeds (inputs) and fetches (outputs) for a +// sequence of partial run calls. +// +// On success, returns a handle that is used for subsequent PRun calls. The +// handle should be deleted with TF_DeletePRunHandle when it is no longer +// needed. +// +// On failure, out_status contains a tensorflow::Status with an error +// message. *handle is set to nullptr. +TF_CAPI_EXPORT extern void TF_SessionPRunSetup( + TF_Session*, + // Input names + const TF_Output* inputs, int ninputs, + // Output names + const TF_Output* outputs, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // Output handle + const char** handle, + // Output status + TF_Status*); + +// Continue to run the graph with additional feeds and fetches. The +// execution state is uniquely identified by the handle. +TF_CAPI_EXPORT extern void TF_SessionPRun( + TF_Session*, const char* handle, + // Input tensors + const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs, + // Output tensors + const TF_Output* outputs, TF_Tensor** output_values, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // Output status + TF_Status*); + +// Deletes a handle allocated by TF_SessionPRunSetup. +// Once called, no more calls to TF_SessionPRun should be made. +TF_CAPI_EXPORT extern void TF_DeletePRunHandle(const char* handle); + +// -------------------------------------------------------------------------- +// The deprecated session API. Please switch to the above instead of +// TF_ExtendGraph(). This deprecated API can be removed at any time without +// notice. 
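+//
+// For reference, a minimal sketch of the TF_Session flow recommended above,
+// assuming `graph` contains operations named "input" and "output" and that
+// `in_tensor` was created with TF_NewTensor (names are illustrative; error
+// handling abbreviated):
+//   TF_SessionOptions* opts = TF_NewSessionOptions();
+//   TF_Session* session = TF_NewSession(graph, opts, status);
+//   TF_Output feed = {TF_GraphOperationByName(graph, "input"), 0};
+//   TF_Output fetch = {TF_GraphOperationByName(graph, "output"), 0};
+//   TF_Tensor* out_tensor = NULL;
+//   TF_SessionRun(session, /*run_options=*/NULL,
+//                 &feed, &in_tensor, 1,
+//                 &fetch, &out_tensor, 1,
+//                 /*target_opers=*/NULL, 0,
+//                 /*run_metadata=*/NULL, status);
+//   if (TF_GetCode(status) == TF_OK) TF_DeleteTensor(out_tensor);
+//   TF_CloseSession(session, status);
+//   TF_DeleteSession(session, status);
+//   TF_DeleteSessionOptions(opts);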
+ +typedef struct TF_DeprecatedSession TF_DeprecatedSession; + +TF_CAPI_EXPORT extern TF_DeprecatedSession* TF_NewDeprecatedSession( + const TF_SessionOptions*, TF_Status* status); +TF_CAPI_EXPORT extern void TF_CloseDeprecatedSession(TF_DeprecatedSession*, + TF_Status* status); +TF_CAPI_EXPORT extern void TF_DeleteDeprecatedSession(TF_DeprecatedSession*, + TF_Status* status); +TF_CAPI_EXPORT extern void TF_Reset(const TF_SessionOptions* opt, + const char** containers, int ncontainers, + TF_Status* status); +// Treat the bytes proto[0,proto_len-1] as a serialized GraphDef and +// add the nodes in that GraphDef to the graph for the session. +// +// Prefer use of TF_Session and TF_GraphImportGraphDef over this. +TF_CAPI_EXPORT extern void TF_ExtendGraph(TF_DeprecatedSession*, + const void* proto, size_t proto_len, + TF_Status*); + +// See TF_SessionRun() above. +TF_CAPI_EXPORT extern void TF_Run(TF_DeprecatedSession*, + const TF_Buffer* run_options, + const char** input_names, TF_Tensor** inputs, + int ninputs, const char** output_names, + TF_Tensor** outputs, int noutputs, + const char** target_oper_names, int ntargets, + TF_Buffer* run_metadata, TF_Status*); + +// See TF_SessionPRunSetup() above. +TF_CAPI_EXPORT extern void TF_PRunSetup(TF_DeprecatedSession*, + const char** input_names, int ninputs, + const char** output_names, int noutputs, + const char** target_oper_names, + int ntargets, const char** handle, + TF_Status*); + +// See TF_SessionPRun above. +TF_CAPI_EXPORT extern void TF_PRun(TF_DeprecatedSession*, const char* handle, + const char** input_names, TF_Tensor** inputs, + int ninputs, const char** output_names, + TF_Tensor** outputs, int noutputs, + const char** target_oper_names, int ntargets, + TF_Status*); + +typedef struct TF_DeviceList TF_DeviceList; + +// Lists all devices in a TF_Session. +// +// Caller takes ownership of the returned TF_DeviceList* which must eventually +// be freed with a call to TF_DeleteDeviceList. +TF_CAPI_EXPORT extern TF_DeviceList* TF_SessionListDevices(TF_Session* session, + TF_Status* status); + +// Lists all devices in a TF_Session. +// +// Caller takes ownership of the returned TF_DeviceList* which must eventually +// be freed with a call to TF_DeleteDeviceList. +TF_CAPI_EXPORT extern TF_DeviceList* TF_DeprecatedSessionListDevices( + TF_DeprecatedSession* session, TF_Status* status); + +// Deallocates the device list. +TF_CAPI_EXPORT extern void TF_DeleteDeviceList(TF_DeviceList* list); + +// Counts the number of elements in the device list. +TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list); + +// Retrieves the full name of the device (e.g. /job:worker/replica:0/...) +// The return value will be a pointer to a null terminated string. The caller +// must not modify or delete the string. It will be deallocated upon a call to +// TF_DeleteDeviceList. +// +// If index is out of bounds, an error code will be set in the status object, +// and a null pointer will be returned. +TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list, + int index, + TF_Status* status); + +// Retrieves the type of the device at the given index. +// +// The caller must not modify or delete the string. It will be deallocated upon +// a call to TF_DeleteDeviceList. +// +// If index is out of bounds, an error code will be set in the status object, +// and a null pointer will be returned. 
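+//
+// A device enumeration sketch (assumes a valid `session` and `status`;
+// error handling abbreviated):
+//   TF_DeviceList* devices = TF_SessionListDevices(session, status);
+//   int count = TF_DeviceListCount(devices);
+//   for (int i = 0; i < count; ++i) {
+//     const char* dev_name = TF_DeviceListName(devices, i, status);
+//     const char* dev_type = TF_DeviceListType(devices, i, status);
+//     /* e.g. "/job:localhost/replica:0/task:0/device:CPU:0" and "CPU" */
+//   }
+//   TF_DeleteDeviceList(devices);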
+TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list, + int index, + TF_Status* status); + +// Retrieve the amount of memory associated with a given device. +// +// If index is out of bounds, an error code will be set in the status object, +// and -1 will be returned. +TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes( + const TF_DeviceList* list, int index, TF_Status* status); + +// Retrieve the incarnation number of a given device. +// +// If index is out of bounds, an error code will be set in the status object, +// and 0 will be returned. +TF_CAPI_EXPORT extern uint64_t TF_DeviceListIncarnation( + const TF_DeviceList* list, int index, TF_Status* status); + +// -------------------------------------------------------------------------- +// Load plugins containing custom ops and kernels + +// TF_Library holds information about dynamically loaded TensorFlow plugins. +typedef struct TF_Library TF_Library; + +// Load the library specified by library_filename and register the ops and +// kernels present in that library. +// +// Pass "library_filename" to a platform-specific mechanism for dynamically +// loading a library. The rules for determining the exact location of the +// library are platform-specific and are not documented here. +// +// On success, place OK in status and return the newly created library handle. +// The caller owns the library handle. +// +// On failure, place an error status in status and return NULL. +TF_CAPI_EXPORT extern TF_Library* TF_LoadLibrary(const char* library_filename, + TF_Status* status); + +// Get the OpList of OpDefs defined in the library pointed by lib_handle. +// +// Returns a TF_Buffer. The memory pointed to by the result is owned by +// lib_handle. The data in the buffer will be the serialized OpList proto for +// ops defined in the library. +TF_CAPI_EXPORT extern TF_Buffer TF_GetOpList(TF_Library* lib_handle); + +// Frees the memory associated with the library handle. +// Does NOT unload the library. +TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle); + +// Get the OpList of all OpDefs defined in this address space. +// Returns a TF_Buffer, ownership of which is transferred to the caller +// (and can be freed using TF_DeleteBuffer). +// +// The data in the buffer will be the serialized OpList proto for ops registered +// in this address space. +TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList(void); + +// TF_ApiDefMap encapsulates a collection of API definitions for an operation. +// +// This object maps the name of a TensorFlow operation to a description of the +// API to generate for it, as defined by the ApiDef protocol buffer ( +// https://www.tensorflow.org/code/tensorflow/core/framework/api_def.proto) +// +// The ApiDef messages are typically used to generate convenience wrapper +// functions for TensorFlow operations in various language bindings. +typedef struct TF_ApiDefMap TF_ApiDefMap; + +// Creates a new TF_ApiDefMap instance. +// +// Params: +// op_list_buffer - TF_Buffer instance containing serialized OpList +// protocol buffer. (See +// https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto +// for the OpList proto definition). +// status - Set to OK on success and an appropriate error on failure. +TF_CAPI_EXPORT extern TF_ApiDefMap* TF_NewApiDefMap(TF_Buffer* op_list_buffer, + TF_Status* status); + +// Deallocates a TF_ApiDefMap. +TF_CAPI_EXPORT extern void TF_DeleteApiDefMap(TF_ApiDefMap* apimap); + +// Add ApiDefs to the map. 
+// +// `text` corresponds to a text representation of an ApiDefs protocol message. +// (https://www.tensorflow.org/code/tensorflow/core/framework/api_def.proto). +// +// The provided ApiDefs will be merged with existing ones in the map, with +// precedence given to the newly added version in case of conflicts with +// previous calls to TF_ApiDefMapPut. +TF_CAPI_EXPORT extern void TF_ApiDefMapPut(TF_ApiDefMap* api_def_map, + const char* text, size_t text_len, + TF_Status* status); + +// Returns a serialized ApiDef protocol buffer for the TensorFlow operation +// named `name`. +TF_CAPI_EXPORT extern TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, + const char* name, + size_t name_len, + TF_Status* status); + +// -------------------------------------------------------------------------- +// Kernel definition information. + +// Returns a serialized KernelList protocol buffer containing KernelDefs for all +// registered kernels. +TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status); + +// Returns a serialized KernelList protocol buffer containing KernelDefs for all +// kernels registered for the operation named `name`. +TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( + const char* name, TF_Status* status); + +// Update edge, switch input/ output in a node +TF_CAPI_EXPORT extern void TF_UpdateEdge(TF_Graph* graph, TF_Output new_src, + TF_Input dst, TF_Status* status); + +// -------------------------------------------------------------------------- +// In-process TensorFlow server functionality, for use in distributed training. +// A Server instance encapsulates a set of devices and a Session target that +// can participate in distributed training. A server belongs to a cluster +// (specified by a ClusterSpec), and corresponds to a particular task in a +// named job. The server can communicate with any other server in the same +// cluster. + +// In-process TensorFlow server. +typedef struct TF_Server TF_Server; + +// Creates a new in-process TensorFlow server configured using a serialized +// ServerDef protocol buffer provided via `proto` and `proto_len`. +// +// The server will not serve any requests until TF_ServerStart is invoked. +// The server will stop serving requests once TF_ServerStop or +// TF_DeleteServer is invoked. +TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto, + size_t proto_len, + TF_Status* status); + +// Starts an in-process TensorFlow server. +TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status); + +// Stops an in-process TensorFlow server. +TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status); + +// Blocks until the server has been successfully stopped (via TF_ServerStop or +// TF_ServerClose). +TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status); + +// Returns the target string that can be provided to TF_SetTarget() to connect +// a TF_Session to `server`. +// +// The returned string is valid only until TF_DeleteServer is invoked. +TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server); + +// Destroy an in-process TensorFlow server, frees memory. If server is running +// it will be stopped and joined. +TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); + +// Register a listener method that processes printed messages. +// +// If any listeners are registered, the print operator will call all listeners +// with the printed messages and immediately return without writing to the +// logs. 
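+//
+// A registration sketch (the listener below is an illustrative placeholder
+// and uses <stdio.h>):
+//   static void MyPrintListener(const char* msg) {
+//     fprintf(stderr, "[tf print] %s\n", msg);
+//   }
+//   /* later, e.g. during program setup: */
+//   TF_RegisterLogListener(MyPrintListener);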
+TF_CAPI_EXPORT extern void TF_RegisterLogListener( + void (*listener)(const char*)); + +// Register a FileSystem plugin from filename `plugin_filename`. +// +// On success, place OK in status. +// On failure, place an error status in status. +TF_CAPI_EXPORT extern void TF_RegisterFilesystemPlugin( + const char* plugin_filename, TF_Status* status); + +// Apis that are corresponding to python c api. -------------------- + +// Add control input to `op`. +TF_CAPI_EXPORT extern void TF_AddOperationControlInput(TF_Graph* graph, + TF_Operation* op, + TF_Operation* input); + +// Changes an attr value in the node_def Protocol Buffer and sets a status upon +// completion. +TF_CAPI_EXPORT extern void TF_SetAttr(TF_Graph* graph, TF_Operation* op, + const char* attr_name, + TF_Buffer* attr_value_proto, + TF_Status* status); + +// Clears the attr in the node_def Protocol Buffer and sets a status upon +// completion. +TF_CAPI_EXPORT extern void TF_ClearAttr(TF_Graph* graph, TF_Operation* op, + const char* attr_name, + TF_Status* status); + +// Sets the experimental_type` field in the node_def Protocol Buffer. +TF_CAPI_EXPORT extern void TF_SetFullType(TF_Graph* graph, TF_Operation* op, + const TF_Buffer* full_type_proto); + +// Set the requested device for `graph`. +TF_CAPI_EXPORT extern void TF_SetRequestedDevice(TF_Graph* graph, + TF_Operation* op, + const char* device); + +// Remove all the control inputs from `op` in `graph`. +TF_CAPI_EXPORT extern void TF_RemoveAllControlInputs(TF_Graph* graph, + TF_Operation* op); + +// Set if `graph` requires shape inference functions. +TF_CAPI_EXPORT extern void TF_SetRequireShapeInferenceFns(TF_Graph* graph, + bool require); + +// Extends `session` with any new operations added to its associated graph. +// Usually this happens automatically in TF_SessionRun. After this is called, +// TF_SessionRun will no longer extend the session on every call. +// +// We expose this here to allow fine-grained synchronization in multi-threaded +// workloads, which is required since the Python implementation depends on the +// above mutation methods. This allows us to prevent modifications to nodes in +// the graph after the session has been made aware of them. +TF_CAPI_EXPORT extern void TF_ExtendSession(TF_Session* session, + TF_Status* status); + +// Returns the serialized CppShapeInferenceResult::HandleData proto for +// `output` if its a resource or variant tensor, or otherwise returns the empty +// string. +TF_CAPI_EXPORT extern TF_Buffer* TF_GetHandleShapeAndType(TF_Graph* graph, + TF_Output output); + +// Sets `output` based on `proto`, which should be a serialized +// CppShapeInferenceResult::HandleData proto. `output` should be a resource +// or variant tensor. +// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string +// because I couldn't get SWIG to work otherwise. +TF_CAPI_EXPORT extern void TF_SetHandleShapeAndType(TF_Graph* graph, + TF_Output output, + const void* proto, + size_t proto_len, + TF_Status* status); + +// This method is used to add a new input edge to 'dst', which must be a While +// op. The While op's "T" attribute must have already been updated to include +// the new edge. This is used to construct tf.while_loop gradients. 
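+//
+// A small sketch of the mutation helpers above (assumes `graph`, `op`, and
+// `ctl_op` exist; the device string is illustrative):
+//   TF_SetRequestedDevice(graph, op, "/device:CPU:0");
+//   TF_AddOperationControlInput(graph, op, ctl_op);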
+TF_CAPI_EXPORT extern void TF_AddWhileInputHack(TF_Graph* graph, + TF_Output new_src, + TF_Operation* dst, + TF_Status* status); + +// ---------------------------------------------------------------- + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_C_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_api_experimental.h b/third_party/tflite-hdrs/tensorflow/c/c_api_experimental.h new file mode 100644 index 00000000..abae68cf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_api_experimental.h @@ -0,0 +1,324 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_EXPERIMENTAL_H_ +#define TENSORFLOW_C_C_API_EXPERIMENTAL_H_ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +// -------------------------------------------------------------------------- +// Experimental C API for TensorFlow. +// +// The API here is subject to changes in the future. +// -------------------------------------------------------------------------- + +#ifdef __cplusplus +extern "C" { +#endif + +// When `enable` is true, set +// tensorflow.ConfigProto.OptimizerOptions.global_jit_level to ON_1, and also +// set XLA flag values to prepare for XLA compilation. Otherwise set +// global_jit_level to OFF. +// +// This and the next API are syntax sugar over TF_SetConfig(), and is used by +// clients that cannot read/write the tensorflow.ConfigProto proto. +// TODO: Migrate to TF_CreateConfig() below. +TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options, + unsigned char enable); + +// Set XLA's internal BuildXlaOpsPassFlags.tf_xla_enable_lazy_compilation to the +// value of 'enabled'. Also returns the original value of that flag. +// +// Use in tests to allow XLA to fallback to TF classic. This has global effect. +TF_CAPI_EXPORT unsigned char TF_SetXlaEnableLazyCompilation( + unsigned char enable); +TF_CAPI_EXPORT unsigned char TF_SetTfXlaCpuGlobalJit(unsigned char enable); + +// Sets XLA's auto jit mode according to the specified string, which is parsed +// as if passed in XLA_FLAGS. This has global effect. +TF_CAPI_EXPORT void TF_SetXlaAutoJitMode(const char* mode); + +// Returns whether the single GPU or general XLA auto jit optimizations are +// enabled through MarkForCompilationPassFlags. +TF_CAPI_EXPORT unsigned char TF_GetXlaAutoJitEnabled(); + +// Sets XLA's minimum cluster size. This has global effect. +TF_CAPI_EXPORT void TF_SetXlaMinClusterSize(int size); + +// Gets/Sets TF/XLA flag for whether(true) or not(false) to disable constant +// folding. This is for testing to ensure that XLA is being tested rather than +// Tensorflow's CPU implementation through constant folding. 
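+//
+// Illustrative test-only sketch (non-normative): disable constant folding for
+// the duration of a test, then restore the previous value.
+//
+//   unsigned char prev = TF_GetXlaConstantFoldingDisabled();
+//   TF_SetXlaConstantFoldingDisabled(1);
+//   /* ... run the XLA-backed test ... */
+//   TF_SetXlaConstantFoldingDisabled(prev);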
+TF_CAPI_EXPORT unsigned char TF_GetXlaConstantFoldingDisabled(); +TF_CAPI_EXPORT void TF_SetXlaConstantFoldingDisabled( + unsigned char should_enable); + +// Create a serialized tensorflow.ConfigProto proto, where: +// +// a) ConfigProto.optimizer_options.global_jit_level is set to ON_1 if +// `enable_xla_compilation` is non-zero, and OFF otherwise. +// b) ConfigProto.gpu_options.allow_growth is set to `gpu_memory_allow_growth`. +// c) ConfigProto.device_count is set to `num_cpu_devices`. +TF_CAPI_EXPORT extern TF_Buffer* TF_CreateConfig( + unsigned char enable_xla_compilation, unsigned char gpu_memory_allow_growth, + unsigned int num_cpu_devices); + +// Create a serialized tensorflow.RunOptions proto, where RunOptions.trace_level +// is set to FULL_TRACE if `enable_full_trace` is non-zero, and NO_TRACE +// otherwise. +TF_CAPI_EXPORT extern TF_Buffer* TF_CreateRunOptions( + unsigned char enable_full_trace); + +// Returns the graph content in a human-readable format, with length set in +// `len`. The format is subject to change in the future. +// The returned string is heap-allocated, and caller should call free() on it. +TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph, + size_t* len); + +// Returns the function content in a human-readable format, with length set in +// `len`. The format is subject to change in the future. +// The returned string is heap-allocated, and caller should call free() on it. +// +// Do not return const char*, because some foreign language binding +// (e.g. swift) cannot then call free() on the returned pointer. +TF_CAPI_EXPORT extern char* TF_FunctionDebugString(TF_Function* func, + size_t* len); + +// On success, dequeues a tensor from a TF-managed FifoQueue given by +// `tensor_id`, associated with `session`. There must be a graph node named +// "fifo_queue_dequeue_", to be executed by this API call. + +// Caller must call TF_DeleteTensor() over the returned tensor. If the queue is +// empty, this call is blocked. +// +// Tensors are enqueued via the corresponding TF enqueue op. +// TODO(hongm): Add support for `timeout_ms`. +TF_CAPI_EXPORT extern TF_Tensor* TF_DequeueNamedTensor(TF_Session* session, + int tensor_id, + TF_Status* status); + +// On success, enqueues `tensor` into a TF-managed FifoQueue given by +// `tensor_id`, associated with `session`. There must be a graph node named +// "fifo_queue_enqueue_", to be executed by this API call. It reads +// from a placeholder node "arg_tensor_enqueue_". +// +// `tensor` is still owned by the caller. This call will be blocked if the queue +// has reached its capacity, and will be unblocked when the queued tensors again +// drop below the capacity due to dequeuing. +// +// Tensors are dequeued via the corresponding TF dequeue op. +// TODO(hongm): Add support for `timeout_ms`. +TF_CAPI_EXPORT extern void TF_EnqueueNamedTensor(TF_Session* session, + int tensor_id, + TF_Tensor* tensor, + TF_Status* status); +// Create a serialized tensorflow.ServerDef proto. 
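+//
+// Illustrative usage sketch (non-normative); the text proto below is a
+// made-up placeholder:
+//
+//   TF_Status* s = TF_NewStatus();
+//   TF_Buffer* server_def = TFE_GetServerDef("cluster { /* ... */ }", s);
+//   if (TF_GetCode(s) == TF_OK) {
+//     /* consume server_def->data / server_def->length */
+//     TF_DeleteBuffer(server_def);
+//   }
+//   TF_DeleteStatus(s);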
+TF_Buffer* TFE_GetServerDef(const char* text_proto, TF_Status* status); + +TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, + const char* errMsg); + +// TF_NewCheckpointReader() return the CheckpointReader that can be use to +// investigate or load the variable from the checkpoint file +typedef struct TF_CheckpointReader TF_CheckpointReader; +TF_CAPI_EXPORT extern TF_CheckpointReader* TF_NewCheckpointReader( + const char* filename, TF_Status* status); +TF_CAPI_EXPORT extern void TF_DeleteCheckpointReader( + TF_CheckpointReader* reader); +TF_CAPI_EXPORT extern int TF_CheckpointReaderHasTensor( + TF_CheckpointReader* reader, const char* name); +// Get the variable name at the given index +TF_CAPI_EXPORT extern const char* TF_CheckpointReaderGetVariable( + TF_CheckpointReader* reader, int index); +// Get the number of variable in the checkpoint +TF_CAPI_EXPORT extern int TF_CheckpointReaderSize(TF_CheckpointReader* reader); +// Get the DataType of a variable +TF_CAPI_EXPORT extern TF_DataType TF_CheckpointReaderGetVariableDataType( + TF_CheckpointReader* reader, const char* name); +// Read the shape of a variable and write to `dims` +TF_CAPI_EXPORT extern void TF_CheckpointReaderGetVariableShape( + TF_CheckpointReader* reader, const char* name, int64_t* dims, int num_dims, + TF_Status* status); +// Get the number of dimension of a variable +TF_CAPI_EXPORT extern int TF_CheckpointReaderGetVariableNumDims( + TF_CheckpointReader* reader, const char* name); +// Load the weight of a variable +TF_CAPI_EXPORT extern TF_Tensor* TF_CheckpointReaderGetTensor( + TF_CheckpointReader* reader, const char* name, TF_Status* status); + +// TF_NewAttrBuilder() returns an object that you can set attributes on as +// though it were an op. This allows querying properties of that op for +// type-checking purposes like if the op will run on a particular device type. +typedef struct TF_AttrBuilder TF_AttrBuilder; +TF_CAPI_EXPORT extern TF_AttrBuilder* TF_NewAttrBuilder(const char* op_name); +TF_CAPI_EXPORT extern void TF_DeleteAttrBuilder(TF_AttrBuilder* builder); +TF_CAPI_EXPORT extern void TF_AttrBuilderSetType(TF_AttrBuilder* builder, + const char* attr_name, + TF_DataType value); +TF_CAPI_EXPORT extern void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, + const char* attr_name, + const TF_DataType* values, + int num_values); + +// Checks the tensorflow::NodeDef built via the methods above to see if it can +// run on device_type. +TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice( + TF_AttrBuilder* builder, const char* device_type, TF_Status* status); + +// For argument number input_index, fetch the corresponding number_attr that +// needs to be updated with the argument length of the input list. +// Returns nullptr if there is any problem like op_name is not found, or the +// argument does not support this attribute type. +TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput( + const char* op_name, int input_index, TF_Status* status); + +// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined +// if the status is not ok. +TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type, + TF_Status* status); + +// Platform specific initialization routine. Very few platforms actually require +// this to be called. +TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv); + +// Platform-specific implementation to return an unused port. (This should used +// in tests only.) 
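+//
+// Illustrative test-only sketch (non-normative):
+//
+//   int port = TF_PickUnusedPortOrDie();
+//   /* e.g. format a "localhost:<port>" target for an in-process server */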
+TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(void); + +// Fast path method that makes constructing a single scalar tensor require less +// overhead and copies. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromScalar( + TF_DataType data_type, void* data, size_t len, TF_Status* status); + +// Specify the server_def that enables collective ops. +// This is different to the above function in that it doesn't create remote +// contexts, and remotely executing ops is not possible. It just enables +// communication for collective ops. +TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx, + const void* proto, + size_t proto_len, + TF_Status* status); + +// Aborts all ongoing collectives with the specified status. After abortion, +// subsequent collectives will error with this status immediately. To reset the +// collectives, create a new EagerContext. +// +// This is intended to be used when a peer failure is detected. +TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, + TF_Status* status); + +// Checks the health of collective ops peers. Explicit health check is needed in +// multi worker collective ops to detect failures in the cluster. If a peer is +// down, collective ops may hang. +TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth( + TFE_Context* ctx, const char* task, int64_t timeout_in_ms, + TF_Status* status); + +// Information about the shape of a Tensor and its type. +struct TF_ShapeAndType { + // Number of dimensions. -1 indicates unknown rank. + int num_dims; + // Array of dimensions. -1 indicates unknown dim. + int64_t* dims; + // The data type. May be 0 to denote unknown type. + TF_DataType dtype; +}; + +typedef struct TF_ShapeAndType TF_ShapeAndType; + +// A list of TF_ShapeAndType elements.. +struct TF_ShapeAndTypeList { + int num_items; + TF_ShapeAndType* items; +}; +typedef struct TF_ShapeAndTypeList TF_ShapeAndTypeList; + +// API for manipulating TF_ShapeAndTypeList objects. +// +TF_CAPI_EXPORT extern TF_ShapeAndTypeList* TF_NewShapeAndTypeList( + int num_shapes); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetShape( + TF_ShapeAndTypeList* shape_list, int index, const int64_t* dims, + int num_dims); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetUnknownShape( + TF_ShapeAndTypeList* shape_list, int index); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetDtype( + TF_ShapeAndTypeList* shape_list, int index, TF_DataType dtype); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeList( + TF_ShapeAndTypeList* shape_list); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeListArray( + TF_ShapeAndTypeList** shape_list_array, int num_items); + +// Infer shapes for the given `op`. The arguments mimic the arguments of the +// `shape_inference::InferenceContext` constructor. Note the following: +// - The inputs of the `op` are not used for shape inference. So, it is +// OK to not have the inputs properly set in `op`. See `input_tensors` +// if you want shape inference to consider the input tensors of the +// op for shape inference. +// - The types need not be set in `input_shapes` as it is not used. +// - The number of `input_tensors` should be the same as the number of items +// in `input_shapes`. +// +// The results are returned in `output_shapes` and +// `output_resource_shapes_and_types`. The caller is responsible for freeing the +// memory in these buffers by calling `TF_DeleteShapeAndTypeList`. 
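+//
+// Illustrative usage sketch (non-normative): infer the output shape of an op
+// with a single [2, 3] input. `op` (a TFE_Op*) and `status` are assumed to
+// already exist; unused in/out parameters are passed as NULL here.
+//
+//   TF_ShapeAndTypeList* input_shapes = TF_NewShapeAndTypeList(1);
+//   const int64_t dims[2] = {2, 3};
+//   TF_ShapeAndTypeListSetShape(input_shapes, 0, dims, 2);
+//   TF_ShapeAndTypeList* output_shapes = NULL;
+//   TFE_InferShapes(op, input_shapes, /*input_tensors=*/NULL, NULL, NULL,
+//                   &output_shapes, NULL, status);
+//   TF_DeleteShapeAndTypeList(input_shapes);
+//   TF_DeleteShapeAndTypeList(output_shapes);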
+TF_CAPI_EXPORT extern void TFE_InferShapes( + TFE_Op* op, TF_ShapeAndTypeList* input_shapes, TF_Tensor** input_tensors, + TF_ShapeAndTypeList* input_tensor_as_shapes, + TF_ShapeAndTypeList** input_resource_shapes_and_types, + TF_ShapeAndTypeList** output_shapes, + TF_ShapeAndTypeList*** output_resource_shapes_and_types, TF_Status* status); + +TF_CAPI_EXPORT extern void +TF_ImportGraphDefOptionsSetValidateColocationConstraints( + TF_ImportGraphDefOptions* opts, unsigned char enable); + +// Load the library specified by library_filename and register the pluggable +// device and related kernels present in that library. This function is not +// supported on embedded on mobile and embedded platforms and will fail if +// called. +// +// Pass "library_filename" to a platform-specific mechanism for dynamically +// loading a library. The rules for determining the exact location of the +// library are platform-specific and are not documented here. +// +// On success, returns the newly created library handle and places OK in status. +// The caller owns the library handle. +// +// On failure, returns nullptr and places an error status in status. +TF_CAPI_EXPORT extern TF_Library* TF_LoadPluggableDeviceLibrary( + const char* library_filename, TF_Status* status); + +// Frees the memory associated with the library handle. +// Does NOT unload the library. +TF_CAPI_EXPORT extern void TF_DeletePluggableDeviceLibraryHandle( + TF_Library* lib_handle); + +// Removes `func_name` from `g`. If `func_name` is not in `g`, an error will be +// returned. +TF_CAPI_EXPORT extern void TF_GraphRemoveFunction(TF_Graph* g, + const char* func_name, + TF_Status* status); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_C_API_EXPERIMENTAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_api_internal.h b/third_party/tflite-hdrs/tensorflow/c/c_api_internal.h new file mode 100644 index 00000000..15d279b6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_api_internal.h @@ -0,0 +1,227 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_INTERNAL_H_ +#define TENSORFLOW_C_C_API_INTERNAL_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/c/c_api.h" + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +#include "tensorflow/core/framework/op_gen_lib.h" +#endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +class Device; +class DeviceMgr; +class ServerInterface; +} // namespace tensorflow + +// Internal structures used by the C API. These are likely to change and should +// not be depended on. + +struct TF_SessionOptions { + tensorflow::SessionOptions options; +}; + +struct TF_DeprecatedSession { + tensorflow::Session* session; +}; + +struct TF_Library { + void* lib_handle; + TF_Buffer op_list; +}; + +struct TF_Graph { + TF_Graph(); + + mutable tensorflow::mutex mu; + tensorflow::Graph graph TF_GUARDED_BY(mu); + + // Runs shape inference. + tensorflow::ShapeRefiner refiner TF_GUARDED_BY(mu); + + // Maps from name of an operation to the Node* in 'graph'. + std::unordered_map name_map + TF_GUARDED_BY(mu); + + // The keys of this map are all the active sessions using this graph. Each + // value records whether the graph has been mutated since the corresponding + // session has been run (this is detected in RecordMutation function). If the + // string is empty, no mutation has occurred. Otherwise the string is a + // description of the mutation suitable for returning to the user. + // + // Sessions are added to this map in TF_NewSession, and removed in + // TF_DeleteSession. + // TF_Graph may only / must be deleted when + // sessions.size() == 0 && delete_requested == true + // + // TODO(b/74949947): mutations currently trigger a warning instead of a bad + // status, this should be reverted when possible. + tensorflow::gtl::FlatMap sessions + TF_GUARDED_BY(mu); + bool delete_requested TF_GUARDED_BY(mu); // set true by TF_DeleteGraph + + // Used to link graphs contained in TF_WhileParams to the parent graph that + // will eventually contain the full while loop. 
+ TF_Graph* parent; + TF_Output* parent_inputs; +}; + +struct TF_OperationDescription { + TF_OperationDescription(TF_Graph* g, const char* op_type, + const char* node_name) + : node_builder(node_name, op_type, g->graph.op_registry()), graph(g) {} + + tensorflow::NodeBuilder node_builder; + TF_Graph* graph; + std::set colocation_constraints; +}; + +struct TF_Operation { + tensorflow::Node node; + + private: + ~TF_Operation() = default; +}; + +struct TF_Session { + TF_Session(tensorflow::Session* s, TF_Graph* g); + + tensorflow::Session* session; + TF_Graph* const graph; + + tensorflow::mutex mu TF_ACQUIRED_AFTER(TF_Graph::mu); + int last_num_graph_nodes; + + // If true, TF_SessionRun and similar methods will call + // ExtendSessionGraphHelper before running the graph (this is the default + // public behavior). Can be set to false if the caller needs to call + // ExtendSessionGraphHelper manually. + std::atomic extend_before_run; +}; + +struct TF_ImportGraphDefOptions { + tensorflow::ImportGraphDefOptions opts; + + // Backing memory for TensorId fields in opts. + // TODO(skyewm): it'd be better if ImportGraphDefOptions owned this. + std::vector tensor_id_data; +}; + +struct TF_ImportGraphDefResults { + std::vector return_tensors; + std::vector return_nodes; + std::vector missing_unused_key_names; + std::vector missing_unused_key_indexes; + + // Backing memory for missing_unused_key_names values. + std::vector missing_unused_key_names_data; +}; + +struct TF_DeviceList { + std::vector response; +}; + +struct TF_Function { + tensorflow::FunctionRecord* record; +}; + +struct TF_ApiDefMap { + explicit TF_ApiDefMap(const tensorflow::OpList& op_list) + : +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) + api_def_map(op_list), +#endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) + update_docs_called(false) { + } + +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) + tensorflow::ApiDefMap api_def_map TF_GUARDED_BY(lock); +#endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) + bool update_docs_called TF_GUARDED_BY(lock); + tensorflow::mutex lock; +}; + +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +struct TF_Server { + TF_Server(std::unique_ptr server); + + const tensorflow::string target; + std::unique_ptr server; +}; +#endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) + +namespace tensorflow { + +// Set the shapes and types of the output's handle. +// +// The lengths of the arrays pointed to by `shapes`, `ranks`, and `types` must +// all be equal to `num_shapes_and_types`. If `ranks[i] != -1`, (i.e., if the +// rank is known), then it must be equal to the length of `shapes[i]`; if +// `ranks[i] == 1`, then `shapes[i]` may be nullptr. +// +// TODO(akshayka): Implement a corresponding getter method. +void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output, + int num_shapes_and_types, + const int64_t** shapes, + const int* ranks, + const TF_DataType* types, + TF_Status* status); + +void RecordMutation(TF_Graph* graph, const TF_Operation& op, + const char* mutation_type) + TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu); + +bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) + TF_LOCKS_EXCLUDED(session->graph->mu, session->mu); + +std::string getTF_OutputDebugString(TF_Output node); + +// Set whether to propagate assigned device information when constructing a new +// Graph from a GraphDef. By default assigned device information is not copied +// and is re-computed by the runtime. 
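+//
+// Illustrative usage sketch (non-normative); `graph`, `graph_def` and `status`
+// are assumed to already exist:
+//
+//   TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
+//   tensorflow::TF_ImportGraphDefOptionsSetPropagateDeviceSpec(opts, 1);
+//   TF_GraphImportGraphDef(graph, graph_def, opts, status);
+//   TF_DeleteImportGraphDefOptions(opts);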
+inline void TF_ImportGraphDefOptionsSetPropagateDeviceSpec( + TF_ImportGraphDefOptions* opts, unsigned char propagate_device_spec) { + opts->opts.propagate_device_spec = propagate_device_spec; +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_C_C_API_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_api_macros.h b/third_party/tflite-hdrs/tensorflow/c/c_api_macros.h new file mode 100644 index 00000000..d73546ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_api_macros.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_MACROS_H_ +#define TENSORFLOW_C_C_API_MACROS_H_ + +#ifdef SWIG +#define TF_CAPI_EXPORT +#else +#if defined(_WIN32) +#ifdef TF_COMPILE_LIBRARY +#define TF_CAPI_EXPORT __declspec(dllexport) +#else +#define TF_CAPI_EXPORT __declspec(dllimport) +#endif // TF_COMPILE_LIBRARY +#else +#ifdef TF_CAPI_WEAK +#define TF_CAPI_EXPORT \ + __attribute__((visibility("default"))) __attribute((weak)) +#else +#define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // TF_CAPI_WEAK +#endif // _WIN32 +#endif // SWIG + +// TF_Bool is the C API typedef for unsigned char, while TF_BOOL is +// the datatype for boolean tensors. +#ifndef TF_Bool +#define TF_Bool unsigned char +#endif // TF_Bool + +// Macro used to calculate struct size for maintaining ABI stability across +// different struct implementations. +#ifndef TF_OFFSET_OF_END +#define TF_OFFSET_OF_END(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif // TF_OFFSET_OF_END + +#endif // TENSORFLOW_C_C_API_MACROS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_api_macros_internal.h b/third_party/tflite-hdrs/tensorflow/c/c_api_macros_internal.h new file mode 100644 index 00000000..b2bc61d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_api_macros_internal.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_API_MACROS_INTERNAL_H_ +#define TENSORFLOW_C_C_API_MACROS_INTERNAL_H_ + +#ifdef __cplusplus +#include "tensorflow/core/platform/status.h" + +// Macro to verify that the field `struct_size` of STRUCT_OBJ is initialized. 
+// `struct_size` is used for struct member compatibility check between core TF +// and plug-ins with the same C API minor version. More info here: +// https://github.com/tensorflow/community/blob/master/rfcs/20200612-stream-executor-c-api/C_API_versioning_strategy.md +#define TF_VALIDATE_STRUCT_SIZE(STRUCT_NAME, STRUCT_OBJ, SIZE_VALUE_NAME) \ + do { \ + if (STRUCT_OBJ.struct_size == 0) { \ + return tensorflow::Status(absl::StatusCode::kFailedPrecondition, \ + "Expected initialized `" #STRUCT_NAME \ + "` structure with `struct_size` field " \ + "set to " #SIZE_VALUE_NAME \ + ". Found `struct_size` = 0."); \ + } \ + } while (0) + +// Macro to verify that the field NAME of STRUCT_OBJ is not null. +#define TF_VALIDATE_NOT_NULL(STRUCT_NAME, STRUCT_OBJ, NAME) \ + do { \ + if (STRUCT_OBJ.NAME == 0) { \ + return tensorflow::Status(absl::StatusCode::kFailedPrecondition, \ + "'" #NAME "' field in " #STRUCT_NAME \ + " must be set."); \ + } \ + } while (0) + +#endif // __cplusplus +#endif // TENSORFLOW_C_C_API_MACROS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_op_requires.h b/third_party/tflite-hdrs/tensorflow/c/c_op_requires.h new file mode 100644 index 00000000..1a515bb1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_op_requires.h @@ -0,0 +1,51 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_OP_REQUIRES_H_ +#define TENSORFLOW_C_C_OP_REQUIRES_H_ + +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Convenience macros for asserting and handling exceptional conditions, for +// C structs, including `TF_OpKernelContext`, `TF_Status`, etc. This is analogus +// to the macros in tensorflow/core/framework/op_requires.h. This is provided +// for plugin OpKernel developer's convenience. + +#define C_OPKERNELCONTEXT_REQUIRES_OK(CTX, C_STATUS, __VA_ARGS__) \ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + ::tensorflow::Set_TF_Status_from_Status(C_STATUS, _s); \ + TF_OpKernelContext_Failure(CTX, C_STATUS); \ + TF_DeleteStatus(C_STATUS); \ + return; \ + } \ + } while (0) + +#define TF_CLEANUP_AND_RETURN_IF_ERROR(C_STATUS, BUFFER, __VA_ARGS__) \ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (TF_PREDICT_TRUE(!_s.ok())) { \ + TF_DeleteStatus(C_STATUS); \ + TF_DeleteBuffer(BUFFER); \ + return _s; \ + } \ + } while (0) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_C_OP_REQUIRES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/c_test_util.h b/third_party/tflite-hdrs/tensorflow/c/c_test_util.h new file mode 100644 index 00000000..7eeb1ee5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/c_test_util.h @@ -0,0 +1,165 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_C_TEST_UTIL_H_ +#define TENSORFLOW_C_C_TEST_UTIL_H_ + +#include "tensorflow/c/c_api.h" + +#include +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/test.h" + +using ::tensorflow::string; + +typedef std::unique_ptr + unique_tensor_ptr; + +TF_Tensor* BoolTensor(int32_t v); + +// Create a tensor with values of type TF_INT8 provided by `values`. +TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values); + +// Create a tensor with values of type TF_INT32 provided by `values`. +TF_Tensor* Int32Tensor(const int64_t* dims, int num_dims, + const int32_t* values); + +// Create 1 dimensional tensor with values from `values` +TF_Tensor* Int32Tensor(const std::vector& values); + +TF_Tensor* Int32Tensor(int32_t v); + +TF_Tensor* DoubleTensor(double v); + +TF_Tensor* FloatTensor(float v); + +TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, + const char* name = "feed", + TF_DataType dtype = TF_INT32, + const std::vector& dims = {}); + +TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, + const char* name = "const"); + +TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + +TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + +TF_Operation* ScalarConst(double v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + +TF_Operation* ScalarConst(float v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + +TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "add"); + +TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "add"); + +TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r, + TF_Graph* graph, TF_Operation* ctrl_op, + TF_Status* s, const char* name = "add"); + +TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, + const char* name = "add"); + +TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "min"); + +TF_Operation* Mul(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "mul"); + +// If `op_device` is non-empty, set the created op on that device. 
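+//
+// Illustrative usage sketch (non-normative); `l`, `r`, `graph` and `s` are
+// assumed to already exist:
+//
+//   TF_Operation* m = MinWithDevice(
+//       l, r, graph, "/job:localhost/replica:0/task:0/device:CPU:0", s);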
+TF_Operation* MinWithDevice(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + const string& op_device, TF_Status* s, + const char* name = "min"); + +TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s, + const char* name = "neg"); + +TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s); + +TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype, + TF_Graph* graph, TF_Status* s); + +// Split `input` along the first dimension into 3 tensors +TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s, + const char* name = "split3"); + +bool IsPlaceholder(const tensorflow::NodeDef& node_def); + +bool IsScalarConst(const tensorflow::NodeDef& node_def, int v); + +bool IsAddN(const tensorflow::NodeDef& node_def, int n); + +bool IsNeg(const tensorflow::NodeDef& node_def, const string& input); + +bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def); + +bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def); + +bool GetFunctionDef(TF_Function* func, tensorflow::FunctionDef* func_def); + +bool GetAttrValue(TF_Operation* oper, const char* attr_name, + tensorflow::AttrValue* attr_value, TF_Status* s); + +// Returns a sorted vector of std::pair from +// graph_def.library().gradient() +std::vector> GetGradDefs( + const tensorflow::GraphDef& graph_def); + +// Returns a sorted vector of names contained in `grad_def` +std::vector GetFuncNames(const tensorflow::GraphDef& graph_def); + +class CSession { + public: + CSession(TF_Graph* graph, TF_Status* s, bool use_XLA = false); + explicit CSession(TF_Session* session); + + ~CSession(); + + void SetInputs(std::vector> inputs); + void SetOutputs(std::initializer_list outputs); + void SetOutputs(const std::vector& outputs); + void SetTargets(std::initializer_list targets); + + void Run(TF_Status* s); + + void CloseAndDelete(TF_Status* s); + + TF_Tensor* output_tensor(int i) { return output_values_[i]; } + + TF_Session* mutable_session() { return session_; } + + private: + void DeleteInputValues(); + void ResetOutputValues(); + + TF_Session* session_; + std::vector inputs_; + std::vector input_values_; + std::vector outputs_; + std::vector output_values_; + std::vector targets_; +}; + +#endif // TENSORFLOW_C_C_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/checkpoint_reader.h b/third_party/tflite-hdrs/tensorflow/c/checkpoint_reader.h new file mode 100644 index 00000000..75008ffa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/checkpoint_reader.h @@ -0,0 +1,83 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_CHECKPOINT_READER_H_ +#define TENSORFLOW_C_CHECKPOINT_READER_H_ + +#include +#include + +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/util/tensor_slice_reader.h" + +namespace tensorflow { +namespace checkpoint { + +class TensorSliceReader; + +// A wrapper around BundleReader (for V2 checkpoints) and +// checkpoint::TensorSliceReader (for V1), that is more easily SWIG wrapped for +// other languages. +// +// The class currently only interacts with single-slice (i.e., non-partitioned) +// variables. +class CheckpointReader { + public: + CheckpointReader(const string& filename, TF_Status* status); + + bool HasTensor(const string& name) const; + const string DebugString() const; + + // Returns a map from variable names to their shapes. Slices of a partitioned + // tensor are combined into a single entry. + const TensorSliceReader::VarToShapeMap& GetVariableToShapeMap() const; + + // Returns a map from variable names to their data types. Slices of a + // partitioned tensor are combined into a single entry. + const TensorSliceReader::VarToDataTypeMap& GetVariableToDataTypeMap() const; + + // Attempts to look up the tensor named "name" and stores the found result in + // "out_tensor". + void GetTensor(const string& name, + std::unique_ptr* out_tensor, + TF_Status* out_status) const; + + private: + // Uses "v2_reader_" to build "var name -> shape" and "var name -> data type" + // maps; both owned by caller. + // REQUIRES: "v2_reader_ != nullptr && v2_reader_.status().ok()". + std::pair, + std::unique_ptr > + BuildV2VarMaps(); + + // Invariant: exactly one of "reader_" and "v2_reader_" is non-null. + std::unique_ptr reader_; + std::unique_ptr v2_reader_; + + std::unique_ptr var_to_shape_map_; + std::unique_ptr var_to_data_type_map_; + + CheckpointReader(const CheckpointReader&) = delete; + void operator=(const CheckpointReader&) = delete; +}; + +} // namespace checkpoint +} // namespace tensorflow + +#endif // TENSORFLOW_C_CHECKPOINT_READER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/conversion_macros.h b/third_party/tflite-hdrs/tensorflow/c/conversion_macros.h new file mode 100644 index 00000000..d1f99b7b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/conversion_macros.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_CONVERSION_MACROS_H_ +#define TENSORFLOW_C_CONVERSION_MACROS_H_ + +#define DEFINE_CONVERSION_FUNCTIONS(cpp_impl, wrapper) \ + inline cpp_impl *unwrap(wrapper *w) { \ + return reinterpret_cast(w); \ + } \ + \ + inline const cpp_impl *unwrap(const wrapper *w) { \ + return reinterpret_cast(w); \ + } \ + \ + inline wrapper *wrap(cpp_impl *i) { return reinterpret_cast(i); } \ + inline const wrapper *wrap(const cpp_impl *i) { \ + return reinterpret_cast(i); \ + } + +#endif // TENSORFLOW_C_CONVERSION_MACROS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/abstract_context.h b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_context.h new file mode 100644 index 00000000..4bf6ff9b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_context.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ + +#include + +#include "tensorflow/c/eager/abstract_function.h" +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { + +// Abstract interface to a context. +// +// This serves as a factory for creating `AbstractOperation`s and for +// registering traced functions. +// Operations creation within a context can only be executed in that context +// (for now at least). +// Implementations of the context may contain some state e.g. an execution +// environment, a traced representation etc. +class AbstractContext { + protected: + enum AbstractContextKind { kGraph, kMlir, kEager, kTfrt, kTape, kOpHandler }; + explicit AbstractContext(AbstractContextKind kind) : kind_(kind) {} + virtual ~AbstractContext() {} + + public: + AbstractContextKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage its own + // lifetime through ref counting. Thus clients MUST call Release() in order to + // destroy an instance of this class. + virtual void Release() = 0; + + // Creates an operation builder and ties it to this context. + // The returned object can be used for setting operation's attributes, + // adding inputs and finally executing (immediately or lazily as in tracing) + // it in this context. + virtual AbstractOperation* CreateOperation() = 0; + + // Registers a function with this context, after this the function is + // available to be called/referenced by its name in this context. + virtual absl::Status RegisterFunction(AbstractFunction*) = 0; + // Remove a function. 'func' argument is the name of a previously added + // FunctionDef. The name is in fdef.signature.name. 
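+  //
+  // Illustrative sketch (non-normative); `ctx` is an AbstractContext* and
+  // `fn` an AbstractFunction* whose FunctionDef is named "my_fn":
+  //
+  //   absl::Status s = ctx->RegisterFunction(fn);
+  //   /* ... trace or execute operations that call "my_fn" ... */
+  //   s = ctx->RemoveFunction("my_fn");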
+ virtual absl::Status RemoveFunction(const string& func) = 0; + + private: + const AbstractContextKind kind_; +}; + +namespace internal { +struct AbstractContextDeleter { + void operator()(AbstractContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractContextPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/abstract_function.h b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_function.h new file mode 100644 index 00000000..7bc8f8bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_function.h @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ + +#include "absl/status/statusor.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/intrusive_ptr.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class FunctionRecord; + +// A traced function: this hides the complexity of converting the serialized +// representation between various supported formats e.g. FunctionDef and Mlir +// function. +class AbstractFunction : public core::RefCounted { + protected: + enum AbstractFunctionKind { kGraph, kMlir }; + explicit AbstractFunction(AbstractFunctionKind kind) : kind_(kind) {} + + public: + // Returns which subclass is this instance of. + AbstractFunctionKind getKind() const { return kind_; } + + // Returns the AbstractFunction as a FunctionDef. + virtual absl::Status GetFunctionDef(const FunctionDef**) = 0; + + // Returns a shared reference to the wrapped function. + virtual absl::StatusOr> + GetFunctionRecord() = 0; + + private: + const AbstractFunctionKind kind_; +}; + +using AbstractFunctionPtr = + tensorflow::core::IntrusivePtr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/abstract_op_attrs.h b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_op_attrs.h new file mode 100644 index 00000000..e799552a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_op_attrs.h @@ -0,0 +1,54 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_OP_ATTRS_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_OP_ATTRS_H_ + +#include "absl/container/inlined_vector.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Attributes of an op. +class AbstractOpAttrs { + protected: + enum AbstractOpAttrsKind { kEager, kTfrt }; + explicit AbstractOpAttrs(AbstractOpAttrsKind kind) : kind_(kind) {} + + public: + // Returns which subclass is this instance of. + AbstractOpAttrsKind getKind() const { return kind_; } + virtual ~AbstractOpAttrs() = default; + + // Returns the AbstractFunction as a FunctionDef. + virtual void GetNameAttrList( + tensorflow::NameAttrList* name_and_attrs) const = 0; + + virtual bool GetInt(absl::string_view, int64_t* result) const = 0; + virtual bool GetFloat(absl::string_view attr_name, float* result) const = 0; + virtual bool GetBool(absl::string_view attr_name, bool* result) const = 0; + virtual bool GetType(absl::string_view attr_name, DataType* result) const = 0; + virtual absl::Status GetTypeList( + absl::string_view attr_name, + absl::InlinedVector* type_list) const = 0; + + private: + const AbstractOpAttrsKind kind_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_OP_ATTRS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/abstract_operation.h b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_operation.h new file mode 100644 index 00000000..95142210 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_operation.h @@ -0,0 +1,172 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Abstract interface to an operation. +// This interface allows building and executing an operation in either +// tracing or immediate execution mode. +class AbstractOperation { + protected: + enum AbstractOperationKind { + kGraph, + kMlir, + kEager, + kTfrt, + kTape, + kOpHandler + }; + explicit AbstractOperation(AbstractOperationKind kind) : kind_(kind) {} + virtual ~AbstractOperation() {} + + public: + AbstractOperationKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. 
+ // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus this must be allocated on the heap and + // clients MUST call Release() in order to destroy an instance of this class. + virtual void Release() = 0; + + virtual absl::Status Reset(const char* op, const char* raw_device_name) = 0; + + virtual const string& Name() const = 0; + + // Returns the operation's device name. + // + // The value returned may be different from the one set by SetDeviceName, but + // it will be compatible with it: the name will be updated by device placement + // logic to refer to the specific device chosen. + // + // Example: If one calls `op->SetDeviceName("/device:GPU")`, the value + // returned by DeviceName should be "/device:GPU:*" until a particular GPU is + // chosen for the operation by the device placement logic in the + // executor. After that, the value returned by DeviceName will be a full + // device name such as "/job:localhost/replica:0/task:0/device:GPU:1". + virtual const string& DeviceName() const = 0; + + // Sets the operation device name. + // + // The given `name` must be parseable by DeviceNameUtils::ParseFullName, and + // the result will be used as a constraint for device placement. See the + // documentation for DeviceName for more details. + // + // The value will override the previous value - that is, no "merging" of + // existing and given constraints will be performed. + virtual absl::Status SetDeviceName(const char* name) = 0; + + virtual absl::Status AddInput(AbstractTensorHandle* input) = 0; + virtual absl::Status AddInputList( + absl::Span inputs) = 0; + virtual absl::Status Execute(absl::Span retvals, + int* num_retvals) = 0; + + virtual absl::Status SetAttrString(const char* attr_name, const char* data, + size_t length) = 0; + virtual absl::Status SetAttrInt(const char* attr_name, int64_t value) = 0; + virtual absl::Status SetAttrFloat(const char* attr_name, float value) = 0; + virtual absl::Status SetAttrBool(const char* attr_name, bool value) = 0; + virtual absl::Status SetAttrType(const char* attr_name, DataType value) = 0; + virtual absl::Status SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) = 0; + virtual absl::Status SetAttrShape(const char* attr_name, + const PartialTensorShape shape); + virtual absl::Status SetAttrFunction(const char* attr_name, + const AbstractOperation* value) = 0; + virtual absl::Status SetAttrFunctionName(const char* attr_name, + const char* value, + size_t length) = 0; + virtual absl::Status SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) = 0; + virtual absl::Status SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) = 0; + virtual absl::Status SetAttrStringList(const char* attr_name, + absl::Span values); + virtual absl::Status SetAttrFloatList(const char* attr_name, + const float* values, + int num_values) = 0; + virtual absl::Status SetAttrIntList(const char* attr_name, + const int64_t* values, + int num_values) = 0; + virtual absl::Status SetAttrTypeList(const char* attr_name, + const DataType* values, + int num_values) = 0; + virtual absl::Status SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) = 0; + virtual absl::Status SetAttrShapeList(const char* attr_name, + const int64_t** dims, + const int* num_dims, + int num_values) = 0; + virtual 
absl::Status SetAttrFunctionList( + const char* attr_name, absl::Span values) = 0; + + private: + const AbstractOperationKind kind_; +}; + +// TODO(b/193656009): Defining these in a cc file causes linker errors with +// fastbuild. +inline absl::Status AbstractOperation::SetAttrShape( + const char* attr_name, const PartialTensorShape shape) { + return SetAttrShape(attr_name, shape.dim_sizes().data(), shape.dims()); +} + +inline absl::Status AbstractOperation::SetAttrStringList( + const char* attr_name, absl::Span values) { + std::vector raw_strs; + std::vector lengths; + raw_strs.reserve(values.size()); + lengths.reserve(values.size()); + for (const auto& s : values) { + raw_strs.emplace_back(s.data()); + lengths.emplace_back(s.size()); + } + return SetAttrStringList(attr_name, + reinterpret_cast(raw_strs.data()), + lengths.data(), values.size()); +} + +namespace internal { +struct AbstractOperationDeleter { + void operator()(AbstractOperation* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractOperationPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/abstract_tensor_handle.h b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_tensor_handle.h new file mode 100644 index 00000000..4a40b1c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/abstract_tensor_handle.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ + +#include + +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Abstract interface to a Tensor handle in either tracing or immediate +// execution mode. +class AbstractTensorHandle : public core::RefCounted { + protected: + enum AbstractTensorHandleKind { kGraph, kMlir, kEager, kTfrt, kCustomDevice }; + explicit AbstractTensorHandle(AbstractTensorHandleKind kind) : kind_(kind) {} + ~AbstractTensorHandle() override {} + + public: + // Returns tensor dtype. + virtual tensorflow::DataType DataType() const = 0; + + // Returns the status of the tensor handle. If it is a tfrt::TensorHandle, + // the tensor handle can be an error and return non-OK status. + virtual absl::Status TensorHandleStatus() const; + + // Returns tensor shape. If tensor has unknown rank, shape remains untouched. + virtual absl::Status Shape(tensorflow::PartialTensorShape* shape) const = 0; + + // Returns tensor (full) type. 
+ // While there is no immediate plan to deprecate dtype and shape in favor + // of only using full type type information, this is a future possibility. + // + // Note that map_dtype_to_child_of_tensor() from core/framework/types.h + // can be used to set a FullTypeDef based on dtype in a derived class if + // appropriate. + virtual tensorflow::FullTypeDef FullType() const = 0; + + // The default debug string includes a shape, dtype and FullType. + // Implementations are free to override it with something more informative. + virtual std::string DebugString() const; + + AbstractTensorHandleKind getKind() const { return kind_; } + + private: + const AbstractTensorHandleKind kind_; +}; + +namespace internal { +struct AbstractTensorHandleDeleter { + void operator()(AbstractTensorHandle* p) const { + if (p != nullptr) { + p->Unref(); + } + } +}; +} // namespace internal + +// TODO(b/185908092): Make AbstractTensorHandlePtr an IntrusivePtr. +using AbstractTensorHandlePtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api.h new file mode 100644 index 00000000..7f458ac5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api.h @@ -0,0 +1,448 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_C_API_H_ +#define TENSORFLOW_C_EAGER_C_API_H_ + +// C API extensions to experiment with eager execution of kernels. +// WARNING: Unlike tensorflow/c/c_api.h, the API here is not guaranteed to be +// stable and can change without notice. + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TFE_ContextOptions TFE_ContextOptions; + +// Return a new options object. +TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions(void); + +// Set the config in TF_ContextOptions.options. +// config should be a serialized tensorflow.ConfigProto proto. +// If config was not parsed successfully as a ConfigProto, record the +// error information in *status. +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig( + TFE_ContextOptions* options, const void* proto, size_t proto_len, + TF_Status* status); + +// Controls how to act when we try to run an operation on a given device but +// some input tensors are not on that device. +// LINT.IfChange +// Note: Keep in sync with internal copy of enum in eager/context.h. +typedef enum TFE_ContextDevicePlacementPolicy { + // Running operations with input tensors on the wrong device will fail. + TFE_DEVICE_PLACEMENT_EXPLICIT = 0, + // Copy the tensor to the right device but log a warning. + TFE_DEVICE_PLACEMENT_WARN = 1, + // Silently copy the tensor, which has a performance cost since the operation + // will be blocked till the copy completes. 
This is the default placement + // policy. + TFE_DEVICE_PLACEMENT_SILENT = 2, + // Placement policy which silently copies int32 tensors but not other dtypes. + TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3, +} TFE_ContextDevicePlacementPolicy; +// LINT.ThenChange(//tensorflow/c/eager/immediate_execution_context.h) + +// Sets the default execution mode (sync/async). Note that this can be +// overridden per thread using TFE_ContextSetExecutorForThread. +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, + unsigned char enable); + +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( + TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); + +// Destroy an options object. +TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*); + +// "Context" under which operations/functions are executed. It encapsulates +// things like the available devices, resource manager etc. +// TFE_Context must outlive all tensor handles created using it. In other +// words, TFE_DeleteContext() must be called after all tensor handles have +// been deleted (with TFE_DeleteTensorHandle). +// +// TODO(ashankar): Merge with TF_Session? +typedef struct TFE_Context TFE_Context; + +TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext( + const TFE_ContextOptions* opts, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx); +TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, + TF_Status* status); + +// Clears the internal caches in the TFE context. Useful when reseeding random +// ops. +TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx); + +// Sets a thread-local device placement policy. After this call, other calls to +// TFE_Execute in the same thread will use the device policy specified here +// instead of the device policy used to construct the context. This has no +// effect on the device policy used by other program threads. +TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalDevicePlacementPolicy( + TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy); + +// Returns the device placement policy to be used by this context in the current +// thread. +TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy +TFE_ContextGetDevicePlacementPolicy(TFE_Context* ctx); + +// A tensorflow.ServerDef specifies remote workers (in addition to the current +// workers name). Operations created in this context can then be executed on +// any of these remote workers by setting an appropriate device. +// +// If the following is set, all servers identified by the +// ServerDef must be up when the context is created. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status); + +// A handle to a tensor on a device. +// +// Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape, +// type etc. Unlike a TF_Tensor, a TFE_TensorHandle may refer to such tensors +// placed in the memory of different devices or remote address spaces. +typedef struct TFE_TensorHandle TFE_TensorHandle; + +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, + TF_Status* status); +// Indicates that the caller will not be using `h` any more. +TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h); +// This function will block till the operation that produces `h` has completed. 
+TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h, + TF_Status* status); +TF_CAPI_EXPORT extern int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h, + TF_Status* status); +// This function will block till the operation that produces `h` has completed. +TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, + int dim_index, + TF_Status* status); + +// Returns the device of the operation that produced `h`. If `h` was produced by +// a copy, returns the destination device of the copy. Note that the returned +// device name is not always the device holding the tensor handle's memory. If +// you want the latter, use TFE_TensorHandleBackingDeviceName. This function +// will block till the operation that produces `h` has completed. +TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName( + TFE_TensorHandle* h, TF_Status* status); + +// Returns the name of the device in whose memory `h` resides. +// +// This function will block till the operation that produces `h` has completed. +TF_CAPI_EXPORT extern const char* TFE_TensorHandleBackingDeviceName( + TFE_TensorHandle* h, TF_Status* status); + +// Return a pointer to a new TFE_TensorHandle that shares the underlying tensor +// with `h`. On success, `status` is set to OK. On failure, `status` reflects +// the error and a nullptr is returned. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopySharingTensor( + TFE_TensorHandle* h, TF_Status* status); + +// This function will block till the operation that produces `h` has +// completed. The memory returned might alias the internal memory used by +// TensorFlow. Hence, callers should not mutate this memory (for example by +// modifying the memory region pointed to by TF_TensorData() on the returned +// TF_Tensor). +TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, + TF_Status* status); + +// Create a new TFE_TensorHandle with the same contents as 'h' but placed +// in the memory of the device name 'device_name'. +// If source and destination are the same device, then this creates a new handle +// that shares the underlying buffer. Otherwise, it currently requires at least +// one of the source or destination devices to be CPU (i.e., for the source or +// destination tensor to be placed in host memory). +// If async execution is enabled, the copy may be enqueued and the call will +// return "non-ready" handle. Else, this function returns after the copy has +// been done. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice( + TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name, + TF_Status* status); + +// Debugging/Profiling information for TFE_TensorHandle +// +// TFE_TensorDebugInfo contains information useful for debugging and +// profiling tensors. +typedef struct TFE_TensorDebugInfo TFE_TensorDebugInfo; + +// Retrieves TFE_TensorDebugInfo for `handle`. +// If TFE_TensorHandleTensorDebugInfo succeeds, `status` is set to OK and caller +// is responsible for deleting returned TFE_TensorDebugInfo. +// If TFE_TensorHandleTensorDebugInfo fails, `status` is set to appropriate +// error and nullptr is returned. This function can block till the operation +// that produces `handle` has completed. +TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( + TFE_TensorHandle* h, TF_Status* status); + +// Deletes `debug_info`. 
+TF_CAPI_EXPORT extern void TFE_DeleteTensorDebugInfo( + TFE_TensorDebugInfo* debug_info); + +// Returns the number of dimensions used to represent the tensor on its device. +// The number of dimensions used to represent the tensor on device can be +// different from the number returned by TFE_TensorHandleNumDims. +// The return value was current at the time of TFE_TensorDebugInfo creation. +TF_CAPI_EXPORT extern int TFE_TensorDebugInfoOnDeviceNumDims( + TFE_TensorDebugInfo* debug_info); + +// Returns the number of elements in dimension `dim_index`. +// Tensor representation on device can be transposed from its representation +// on host. The data contained in dimension `dim_index` on device +// can correspond to the data contained in another dimension in on-host +// representation. The dimensions are indexed using the standard TensorFlow +// major-to-minor order (slowest varying dimension first), +// not the XLA's minor-to-major order. +// On-device dimensions can be padded. TFE_TensorDebugInfoOnDeviceDim returns +// the number of elements in a dimension after padding. +// The return value was current at the time of TFE_TensorDebugInfo creation. +TF_CAPI_EXPORT extern int64_t TFE_TensorDebugInfoOnDeviceDim( + TFE_TensorDebugInfo* debug_info, int dim_index); + +// Description of the TensorFlow op to execute. +// +// Assumes that the provided 'ctx' outlives the returned TFE_Op, i.e., +// TFE_DeleteOp() is called before TFE_DeleteContext(). +// +// Very similar to TF_OperationDescription with some differences: +// (1) TF_Output or TFE_TensorHandle* as arguments to TF_AddInput, +// TF_AddInputList +// (2) TF_ColocateWith, TF_AddControlInput etc. do not make sense. +// (3) Implementation detail: Avoid use of NodeBuilder/NodeDefBuilder since +// the additional sanity checks there seem unnecessary; +typedef struct TFE_Op TFE_Op; + +TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, + const char* op_or_function_name, + TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); + +// Returns the op or function name `op` will execute. +// +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetName(const TFE_Op* op, + TF_Status* status); +TF_CAPI_EXPORT extern TFE_Context* TFE_OpGetContext(const TFE_Op* op, + TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name, + TF_Status* status); +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(const TFE_Op* op, + TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, + TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_OpAddInputList(TFE_Op* op, + TFE_TensorHandle** inputs, + int num_inputs, + TF_Status* status); + +// Fetches the current number of inputs attached to `op`. +// +// Does not use the operation's definition to determine how many inputs should +// be attached. It is intended for use with TFE_OpGetFlatInput to inspect an +// already-finalized operation. +// +// Note that TFE_OpGetFlatInputCount and TFE_OpGetFlatInput operate on a flat +// sequence of inputs, unlike TFE_OpGetInputLength (for getting the length of a +// particular named input list, which may only be part of the op's inputs). +TF_CAPI_EXPORT extern int TFE_OpGetFlatInputCount(const TFE_Op* op, + TF_Status* status); +// Returns a borrowed reference to one of `op`'s inputs. 
Use +// `TFE_TensorHandleCopySharingTensor` to make a new reference. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_OpGetFlatInput(const TFE_Op* op, + int index, + TF_Status* status); + +TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, + const char* attr_name, + unsigned char* is_list, + TF_Status* status); +// Get an attribute type given an op name; a fusion of TFE_NewOp and +// TFE_OpGetAttrType for use from Python without the overhead of the individual +// calls and memory management of TFE_Op. +TF_CAPI_EXPORT extern TF_AttrType TFE_OpNameGetAttrType( + TFE_Context* ctx, const char* op_or_function_name, const char* attr_name, + unsigned char* is_list, TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, + const char* attr_name, + const void* value, + size_t length); +TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, + int64_t value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, + float value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, + unsigned char value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, + TF_DataType value); +// If the number of dimensions is unknown, `num_dims` must be set to +// -1 and `dims` can be null. If a dimension is unknown, the +// corresponding entry in the `dims` array must be -1. +TF_CAPI_EXPORT extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, + const int64_t* dims, + const int num_dims, + TF_Status* out_status); + +// Sets the attribute attr_name to be a function specified by 'function'. +// +// TODO(ashankar,iga): Add this functionality to the C API for graph +// construction. Perhaps we want an AttrValueMap equivalent in the C API? +TF_CAPI_EXPORT extern void TFE_OpSetAttrFunction(TFE_Op* op, + const char* attr_name, + const TFE_Op* value); + +TF_CAPI_EXPORT void TFE_OpSetAttrFunctionName(TFE_Op* op, const char* attr_name, + const char* data, size_t length); + +TF_CAPI_EXPORT extern void TFE_OpSetAttrTensor(TFE_Op* op, + const char* attr_name, + TF_Tensor* tensor, + TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, + const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, + const char* attr_name, + const int64_t* values, + int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op, + const char* attr_name, + const float* values, + int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op, + const char* attr_name, + const unsigned char* values, + int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op, + const char* attr_name, + const TF_DataType* values, + int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList( + TFE_Op* op, const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values, TF_Status* out_status); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFunctionList(TFE_Op* op, + const char* attr_name, + const TFE_Op** value, + int num_values); + +// Returns the length (number of tensors) of the input argument `input_name` +// found in the provided `op`. +TF_CAPI_EXPORT extern int TFE_OpGetInputLength(TFE_Op* op, + const char* input_name, + TF_Status* status); + +// Returns the length (number of tensors) of the output argument `output_name` +// found in the provided `op`. 
+TF_CAPI_EXPORT extern int TFE_OpGetOutputLength(TFE_Op* op, + const char* output_name, + TF_Status* status); + +// Execute the operation defined by 'op' and return handles to computed +// tensors in `retvals`. +// +// 'retvals' must point to a pre-allocated array of TFE_TensorHandle* and +// '*num_retvals' should be set to the size of this array. It is an error if +// the size of 'retvals' is less than the number of outputs. This call sets +// *num_retvals to the number of outputs. +// +// If async execution is enabled, the call may simply enqueue the execution +// and return "non-ready" handles in `retvals`. Note that any handles contained +// in 'op' should not be mutated till the kernel execution actually finishes. +// +// For sync execution, if any of the inputs to `op` are not ready, this call +// will block till they become ready and then return when the kernel execution +// is done. +// TODO(agarwal): change num_retvals to int from int*. +TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, + int* num_retvals, TF_Status* status); + +// Add a function (serialized FunctionDef protocol buffer) to ctx so +// that it can be invoked using TFE_Execute. +TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef( + TFE_Context* ctx, const char* serialized_function_def, size_t size, + TF_Status* status); + +// Adds a function (created from TF_GraphToFunction or +// TF_FunctionImportFunctionDef) to the context, allowing it to be executed with +// TFE_Execute by creating an op with the same name as the function. +TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx, + TF_Function* function, + TF_Status* status); + +// Removes a function from the context. Once removed, you can no longer +// TFE_Execute it or TFE_Execute any TFE_Op which has it as an attribute or any +// other function which calls it as an attribute. +TF_CAPI_EXPORT extern void TFE_ContextRemoveFunction(TFE_Context* ctx, + const char* name, + TF_Status* status); + +// Checks whether a function is registered under `name`. +TF_CAPI_EXPORT unsigned char TFE_ContextHasFunction(TFE_Context* ctx, + const char* name); + +// Enables tracing of RunMetadata on the ops executed from this context. +TF_CAPI_EXPORT extern void TFE_ContextEnableRunMetadata(TFE_Context* ctx); + +// Disables tracing of RunMetadata on the ops executed from this context. +TF_CAPI_EXPORT extern void TFE_ContextDisableRunMetadata(TFE_Context* ctx); + +// Populates the passed-in buffer with a serialized RunMetadata protocol buffer +// containing any run metadata information accumulated so far and clears this +// information. +// If async mode is enabled, this call blocks till all currently pending ops are +// done. +TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx, + TF_Buffer* buf, + TF_Status* status); + +// Some TF ops need a step container to be set to limit the lifetime of some +// resources (mostly TensorArray and Stack, used in while loop gradients in +// graph mode). Calling this on a context tells it to start a step. +TF_CAPI_EXPORT extern void TFE_ContextStartStep(TFE_Context* ctx); + +// Ends a step. When there is no active step (that is, every started step has +// been ended) step containers will be cleared. Note: it is not safe to call +// TFE_ContextEndStep while ops that rely on the step container may be running. 
+TF_CAPI_EXPORT extern void TFE_ContextEndStep(TFE_Context* ctx); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#ifdef __cplusplus +// A workaround to ease conversion to and from numpy objects and +// TFE_TensorHandle's. +// +// TODO(ashankar): Figure out an alternative scheme that precludes the need for +// these API-boundary breaking methods. +namespace tensorflow { +class Tensor; +} // namespace tensorflow + +TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t, + TF_Status* status); +#endif + +#endif // TENSORFLOW_C_EAGER_C_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental.h new file mode 100644 index 00000000..ab50b470 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental.h @@ -0,0 +1,797 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_ +#define TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_ + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Resets `op_to_reset` with `op_or_function_name` and `raw_device_name`. This +// is for performance optimization by reusing an exiting unused op rather than +// creating a new op every time. If `raw_device_name` is `NULL` or empty, it +// does not set the device name. If it's not `NULL`, then it attempts to parse +// and set the device name. It's effectively `TFE_OpSetDevice`, but it is faster +// than separately calling it because if the existing op has the same +// `raw_device_name`, it skips parsing and just leave as it is. +TF_CAPI_EXPORT extern void TFE_OpReset(TFE_Op* op_to_reset, + const char* op_or_function_name, + const char* raw_device_name, + TF_Status* status); + +// Enables only graph collection in RunMetadata on the functions executed from +// this context. +TF_CAPI_EXPORT extern void TFE_ContextEnableGraphCollection(TFE_Context* ctx); + +// Disables only graph collection in RunMetadata on the functions executed from +// this context. +TF_CAPI_EXPORT extern void TFE_ContextDisableGraphCollection(TFE_Context* ctx); + +// TODO(fishx): Move these monitoring APIs into a separate file. +// ----------------------------------------------------------------------------- +// Monitoring Counter APIs. +// These APIs de-templated monitoring Counter for swig. + +typedef struct TFE_MonitoringCounterCell TFE_MonitoringCounterCell; + +// Atomically increments the value of the cell. The value must be non-negative. +TF_CAPI_EXPORT extern void TFE_MonitoringCounterCellIncrementBy( + TFE_MonitoringCounterCell* cell, int64_t value); + +// Retrieves the current value of the cell. +TF_CAPI_EXPORT extern int64_t TFE_MonitoringCounterCellValue( + TFE_MonitoringCounterCell* cell); + +// APIs for Counter without label. 
+typedef struct TFE_MonitoringCounter0 TFE_MonitoringCounter0; +// Returns a new Counter metric object. The caller should manage lifetime of +// the object. Using duplicate metric name will crash the program with fatal +// error. +TF_CAPI_EXPORT extern TFE_MonitoringCounter0* TFE_MonitoringNewCounter0( + const char* name, TF_Status* status, const char* description); +// Deletes the Counter object. +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteCounter0( + TFE_MonitoringCounter0* counter); +// Retrieves the cell from the Counter object. The Counter object will manage +// lifetime of the cell. +TF_CAPI_EXPORT extern TFE_MonitoringCounterCell* TFE_MonitoringGetCellCounter0( + TFE_MonitoringCounter0* counter); + +// APIs for Counter with 1 label. +typedef struct TFE_MonitoringCounter1 TFE_MonitoringCounter1; +TF_CAPI_EXPORT extern TFE_MonitoringCounter1* TFE_MonitoringNewCounter1( + const char* name, TF_Status* status, const char* description, + const char* label1); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteCounter1( + TFE_MonitoringCounter1* counter); +TF_CAPI_EXPORT extern TFE_MonitoringCounterCell* TFE_MonitoringGetCellCounter1( + TFE_MonitoringCounter1* counter, const char* label1); + +// APIs for Counter with 2 labels. +typedef struct TFE_MonitoringCounter2 TFE_MonitoringCounter2; +TF_CAPI_EXPORT extern TFE_MonitoringCounter2* TFE_MonitoringNewCounter2( + const char* name, TF_Status* status, const char* description, + const char* label1, const char* label2); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteCounter2( + TFE_MonitoringCounter2* counter); +TF_CAPI_EXPORT extern TFE_MonitoringCounterCell* TFE_MonitoringGetCellCounter2( + TFE_MonitoringCounter2* counter, const char* label1, const char* label2); + +// ----------------------------------------------------------------------------- +// Monitoring Gauge APIs. +// These APIs de-templated monitoring Gauge for swig. + +typedef struct TFE_MonitoringIntGaugeCell TFE_MonitoringIntGaugeCell; + +// Atomically set the value of the cell. +TF_CAPI_EXPORT extern void TFE_MonitoringIntGaugeCellSet( + TFE_MonitoringIntGaugeCell* cell, int64_t value); + +// Retrieves the current value of the cell. +TF_CAPI_EXPORT extern int64_t TFE_MonitoringIntGaugeCellValue( + TFE_MonitoringIntGaugeCell* cell); + +// APIs for Int Gauge without label. +typedef struct TFE_MonitoringIntGauge0 TFE_MonitoringIntGauge0; +TF_CAPI_EXPORT extern TFE_MonitoringIntGauge0* TFE_MonitoringNewIntGauge0( + const char* name, TF_Status* out_status, const char* description); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge0( + TFE_MonitoringIntGauge0* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell* +TFE_MonitoringGetCellIntGauge0(TFE_MonitoringIntGauge0* gauge); + +// APIs for Int Gauge with 1 label. +typedef struct TFE_MonitoringIntGauge1 TFE_MonitoringIntGauge1; +TF_CAPI_EXPORT extern TFE_MonitoringIntGauge1* TFE_MonitoringNewIntGauge1( + const char* name, TF_Status* out_status, const char* description, + const char* label1); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge1( + TFE_MonitoringIntGauge1* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell* +TFE_MonitoringGetCellIntGauge1(TFE_MonitoringIntGauge1* gauge, + const char* label1); + +// APIs for Int Gauge with 2 label. 
+typedef struct TFE_MonitoringIntGauge2 TFE_MonitoringIntGauge2; +TF_CAPI_EXPORT extern TFE_MonitoringIntGauge2* TFE_MonitoringNewIntGauge2( + const char* name, TF_Status* out_status, const char* description, + const char* label1, const char* label2); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteIntGauge2( + TFE_MonitoringIntGauge2* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringIntGaugeCell* +TFE_MonitoringGetCellIntGauge2(TFE_MonitoringIntGauge2* gauge, + const char* label1, const char* label2); + +typedef struct TFE_MonitoringStringGaugeCell TFE_MonitoringStringGaugeCell; +TF_CAPI_EXPORT extern void TFE_MonitoringStringGaugeCellSet( + TFE_MonitoringStringGaugeCell* cell, const char* value); +// Retrieves the string value and saves it in the buffer. +TF_CAPI_EXPORT extern const void TFE_MonitoringStringGaugeCellValue( + TFE_MonitoringStringGaugeCell* cell, TF_Buffer* buf); + +// APIs for String Gauge without label. +typedef struct TFE_MonitoringStringGauge0 TFE_MonitoringStringGauge0; +TF_CAPI_EXPORT extern TFE_MonitoringStringGauge0* TFE_MonitoringNewStringGauge0( + const char* name, TF_Status* out_status, const char* description); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge0( + TFE_MonitoringStringGauge0* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell* +TFE_MonitoringGetCellStringGauge0(TFE_MonitoringStringGauge0* gauge); + +// APIs for String Gauge with 1 label. +typedef struct TFE_MonitoringStringGauge1 TFE_MonitoringStringGauge1; +TF_CAPI_EXPORT extern TFE_MonitoringStringGauge1* TFE_MonitoringNewStringGauge1( + const char* name, TF_Status* out_status, const char* description, + const char* label1); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge1( + TFE_MonitoringStringGauge1* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell* +TFE_MonitoringGetCellStringGauge1(TFE_MonitoringStringGauge1* gauge, + const char* label1); + +// APIs for String Gauge with 2 label. +typedef struct TFE_MonitoringStringGauge2 TFE_MonitoringStringGauge2; +TF_CAPI_EXPORT extern TFE_MonitoringStringGauge2* TFE_MonitoringNewStringGauge2( + const char* name, TF_Status* out_status, const char* description, + const char* label1, const char* label2); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge2( + TFE_MonitoringStringGauge2* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell* +TFE_MonitoringGetCellStringGauge2(TFE_MonitoringStringGauge2* gauge, + const char* label1, const char* label2); + +// APIs for String Gauge with 3 labels. +typedef struct TFE_MonitoringStringGauge3 TFE_MonitoringStringGauge3; +TF_CAPI_EXPORT extern TFE_MonitoringStringGauge3* TFE_MonitoringNewStringGauge3( + const char* name, TF_Status* out_status, const char* description, + const char* label1, const char* label2, const char* label3); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge3( + TFE_MonitoringStringGauge3* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell* +TFE_MonitoringGetCellStringGauge3(TFE_MonitoringStringGauge3* gauge, + const char* label1, const char* label2, + const char* label3); + +// APIs for String Gauge with 4 labels. 
+typedef struct TFE_MonitoringStringGauge4 TFE_MonitoringStringGauge4; +TF_CAPI_EXPORT extern TFE_MonitoringStringGauge4* TFE_MonitoringNewStringGauge4( + const char* name, TF_Status* out_status, const char* description, + const char* label1, const char* label2, const char* label3, + const char* label4); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteStringGauge4( + TFE_MonitoringStringGauge4* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringStringGaugeCell* +TFE_MonitoringGetCellStringGauge4(TFE_MonitoringStringGauge4* gauge, + const char* label1, const char* label2, + const char* label3, const char* label4); + +typedef struct TFE_MonitoringBoolGaugeCell TFE_MonitoringBoolGaugeCell; +TF_CAPI_EXPORT extern void TFE_MonitoringBoolGaugeCellSet( + TFE_MonitoringBoolGaugeCell* cell, bool value); +TF_CAPI_EXPORT extern bool TFE_MonitoringBoolGaugeCellValue( + TFE_MonitoringBoolGaugeCell* cell); + +// APIs for Bool Gauge without label. +typedef struct TFE_MonitoringBoolGauge0 TFE_MonitoringBoolGauge0; +TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge0* TFE_MonitoringNewBoolGauge0( + const char* name, TF_Status* out_status, const char* description); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge0( + TFE_MonitoringBoolGauge0* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell* +TFE_MonitoringGetCellBoolGauge0(TFE_MonitoringBoolGauge0* gauge); + +// APIs for Bool Gauge with 1 label. +typedef struct TFE_MonitoringBoolGauge1 TFE_MonitoringBoolGauge1; +TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge1* TFE_MonitoringNewBoolGauge1( + const char* name, TF_Status* out_status, const char* description, + const char* label1); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge1( + TFE_MonitoringBoolGauge1* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell* +TFE_MonitoringGetCellBoolGauge1(TFE_MonitoringBoolGauge1* gauge, + const char* label1); + +// APIs for Bool Gauge with 2 label. +typedef struct TFE_MonitoringBoolGauge2 TFE_MonitoringBoolGauge2; +TF_CAPI_EXPORT extern TFE_MonitoringBoolGauge2* TFE_MonitoringNewBoolGauge2( + const char* name, TF_Status* out_status, const char* description, + const char* label1, const char* label2); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBoolGauge2( + TFE_MonitoringBoolGauge2* gauge); +TF_CAPI_EXPORT extern TFE_MonitoringBoolGaugeCell* +TFE_MonitoringGetCellBoolGauge2(TFE_MonitoringBoolGauge2* gauge, + const char* label1, const char* label2); + +// ----------------------------------------------------------------------------- +// Monitoring Sampler APIs. +// These APIs de-templated monitoring Sampler for swig. + +typedef struct TFE_MonitoringSamplerCell TFE_MonitoringSamplerCell; + +// Atomically add the value of the cell. +TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellAdd( + TFE_MonitoringSamplerCell* cell, double value); + +// Retrieves the current value of the cell. The return value is a HistogramProto +// saved in the buffer. +TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellValue( + TFE_MonitoringSamplerCell* cell, TF_Buffer* buf); + +// APIs for sampler buckets +typedef struct TFE_MonitoringBuckets TFE_MonitoringBuckets; +TF_CAPI_EXPORT extern TFE_MonitoringBuckets* +TFE_MonitoringNewExponentialBuckets(double scale, double growth_factor, + int bucket_count); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteBuckets( + TFE_MonitoringBuckets* buckets); + +// APIs for Sampler without label. 
+typedef struct TFE_MonitoringSampler0 TFE_MonitoringSampler0; +TF_CAPI_EXPORT extern TFE_MonitoringSampler0* TFE_MonitoringNewSampler0( + const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status, + const char* description); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler0( + TFE_MonitoringSampler0* sampler); +TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler0( + TFE_MonitoringSampler0* sampler); + +// APIs for Sampler with 1 label. +typedef struct TFE_MonitoringSampler1 TFE_MonitoringSampler1; +TF_CAPI_EXPORT extern TFE_MonitoringSampler1* TFE_MonitoringNewSampler1( + const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status, + const char* description, const char* label1); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler1( + TFE_MonitoringSampler1* sampler); +TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler1( + TFE_MonitoringSampler1* sampler, const char* label1); + +// APIs for Sampler with 2 label. +typedef struct TFE_MonitoringSampler2 TFE_MonitoringSampler2; +TF_CAPI_EXPORT extern TFE_MonitoringSampler2* TFE_MonitoringNewSampler2( + const char* name, TFE_MonitoringBuckets* buckets, TF_Status* out_status, + const char* description, const char* label1, const char* label2); +TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler2( + TFE_MonitoringSampler2* sampler); +TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2( + TFE_MonitoringSampler2* sampler, const char* label1, const char* label2); + +// Sets whether to use TFRT +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetTfrt(TFE_ContextOptions*, + bool use_tfrt); + +// Returns the context_id from the EagerContext which is used by the +// EagerService to maintain consistency between client and worker. The +// context_id is initialized with a dummy value and is later set when the worker +// is initialized (either locally or remotely). The context_id can change during +// the process lifetime although this should cause the worker to be +// reinitialized (e.g. cleared caches) as well. +TF_CAPI_EXPORT extern uint64_t TFE_GetContextId(TFE_Context* ctx); + +// ----------------------------------------------------------------------------- +// Cancellation APIs. 
+ +typedef struct TFE_CancellationManager TFE_CancellationManager; +typedef int64_t TFE_CancellationToken; +typedef struct TFE_CancelCallback { + void (*callback)(void* context); + void* context; +} TFE_CancelCallback; +TF_CAPI_EXPORT extern TFE_CancellationManager* TFE_NewCancellationManager(); +TF_CAPI_EXPORT extern bool TFE_CancellationManagerIsCancelled( + TFE_CancellationManager*); +TF_CAPI_EXPORT extern bool TFE_CancellationManagerIsCancelling( + TFE_CancellationManager*); +TF_CAPI_EXPORT extern void TFE_CancellationManagerStartCancel( + TFE_CancellationManager*); +TF_CAPI_EXPORT extern TFE_CancellationToken TFE_CancellationManagerGetToken( + TFE_CancellationManager*); +TF_CAPI_EXPORT extern bool TFE_CancellationManagerRegisterCallback( + TFE_CancellationManager*, TFE_CancellationToken token, + const TFE_CancelCallback* c_callback, const char* callback_name); +TF_CAPI_EXPORT extern bool TFE_CancellationManagerDeregisterCallback( + TFE_CancellationManager*, TFE_CancellationToken token); +TF_CAPI_EXPORT extern bool TFE_CancellationManagerTryDeregisterCallback( + TFE_CancellationManager*, TFE_CancellationToken token); +TF_CAPI_EXPORT extern void TFE_DeleteCancellationManager( + TFE_CancellationManager*); + +// Associates the given `cancellation_manager` with `op`, so that invoking +// `TFE_CancellationManagerStartCancel(cancellation_manager)` will cancel the +// execution of `op`. +typedef struct TFE_CancellationManager TFE_CancellationManager; +TF_CAPI_EXPORT extern void TFE_OpSetCancellationManager( + TFE_Op* op, TFE_CancellationManager* cancellation_manager, + TF_Status* status); + +// ----------------------------------------------------------------------------- +// Eager Executor APIs. +typedef struct TFE_Executor TFE_Executor; + +// Creates a new eager Executor. Nodes in one executor are guaranteed to be +// executed in sequence. Assigning nodes to different executors allows executing +// nodes in parallel. +// in_flight_nodes_limit: when is_async is true, this value controls the +// maximum number of in flight async nodes. Enqueuing of additional async ops +// after the limit is reached blocks until some inflight nodes finishes. +// The effect is bounding the memory held by inflight TensorHandles that are +// referenced by the inflight nodes. +// A recommended value has not been established. +// A value of 0 removes the limit, which is the behavior of TensorFlow 2.11. +// When is_async is false, the value is ignored. +TF_CAPI_EXPORT extern TFE_Executor* TFE_NewExecutor( + bool is_async, bool enable_streaming_enqueue, int in_flight_nodes_limit); + +// Deletes the eager Executor without waiting for enqueued nodes. Please call +// TFE_ExecutorWaitForAllPendingNodes before calling this API if you want to +// make sure all nodes are finished. +TF_CAPI_EXPORT extern void TFE_DeleteExecutor(TFE_Executor*); + +// Returns true if the executor is in async mode. +TF_CAPI_EXPORT extern bool TFE_ExecutorIsAsync(TFE_Executor*); + +// Causes the calling thread to block till all ops dispatched in this executor +// have been executed. Note that "execution" here refers to kernel execution / +// scheduling of copies, etc. Similar to sync execution, it doesn't guarantee +// that lower level device queues (like GPU streams) have been flushed. +// +// This call may not block for execution of ops enqueued concurrently with this +// call. 
+TF_CAPI_EXPORT extern void TFE_ExecutorWaitForAllPendingNodes( + TFE_Executor*, TF_Status* status); + +// When an error happens, any pending operations are discarded, and newly issued +// ops return an error. This call clears the error state and re-enables +// execution of newly issued ops. +// +// Note that outputs of discarded ops remain in a corrupt state and should not +// be used for future calls. +// TODO(agarwal): mark the affected handles and raise errors if they are used. +TF_CAPI_EXPORT extern void TFE_ExecutorClearError(TFE_Executor*); + +// Sets a custom Executor for the current thread. All nodes created by this +// thread will be added to this Executor. It will override the current executor. +TF_CAPI_EXPORT extern void TFE_ContextSetExecutorForThread(TFE_Context*, + TFE_Executor*); + +// Returns the Executor for the current thread. +TF_CAPI_EXPORT extern TFE_Executor* TFE_ContextGetExecutorForThread( + TFE_Context*); + +// ----------------------------------------------------------------------------- +// Dynamic cluster API. + +// Update an existing context with a new set of servers defined in a ServerDef +// proto. Servers can be added to and removed from the list of remote workers +// in the context. A New set of servers identified by the ServerDef must be up +// when the context is updated. +// +// This API is for experimental usage and may be subject to change. +TF_CAPI_EXPORT extern void TFE_ContextUpdateServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status); + +// This API is for experimental usage and may be subject to change. +TF_CAPI_EXPORT extern void TFE_ContextUpdateServerDefWithTimeout( + TFE_Context* ctx, int keep_alive_secs, const void* proto, size_t proto_len, + int64_t init_timeout_in_ms, TF_Status* status); + +// This API is for experimental usage and may be subject to change. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDefWithTimeout( + TFE_Context* ctx, int keep_alive_secs, const void* proto, size_t proto_len, + int64_t init_timeout_in_ms, TF_Status* status, + bool clear_existing_contexts); + +// Set server def with retries and timeout. This is helpful for fault-tolerant +// initial connection in high-preemption environments, such as +// ParameterServerStrategy training. +// This API is for experimental usage and may be subject to change. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDefWithTimeoutAndRetries( + TFE_Context* ctx, int keep_alive_secs, const void* proto, size_t proto_len, + int64_t init_timeout_in_ms, int retries, TF_Status* status, + bool clear_existing_contexts); + +// Checks whether a remote worker is alive or not. This will return true even if +// the context doesn't exist on the remote worker. +TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, + const char* worker_name, + TF_Status* status); + +// Sync pending nodes in local executors (including the context default executor +// and thread executors) and streaming requests to remote executors, and get the +// combined status. +TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, + TF_Status* status); + +// This function will block till the operation that produces `h` has +// completed. This is only valid on local TFE_TensorHandles. The pointer +// returned will be on the device in which the TFE_TensorHandle resides (so e.g. +// for a GPU tensor this will return a pointer to GPU memory). The pointer is +// only guaranteed to be valid until TFE_DeleteTensorHandle is called on this +// TensorHandle. 
Only supports POD data types. +TF_CAPI_EXPORT extern void* TFE_TensorHandleDevicePointer(TFE_TensorHandle*, + TF_Status*); + +// This function will block till the operation that produces `h` has +// completed. This is only valid on local TFE_TensorHandles. Returns the size in +// bytes of the memory pointed to by the device pointer returned above. +TF_CAPI_EXPORT extern size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle*, + TF_Status*); + +// Creates a new TensorHandle from memory residing in the physical device +// device_name. Takes ownership of the memory, and will call deleter to release +// it after TF no longer needs it or in case of error. +// +// Custom devices must use TFE_NewCustomDeviceTensorHandle instead. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( + TFE_Context* ctx, const char* device_name, TF_DataType, const int64_t* dims, + int num_dims, void* data, size_t len, + void (*deallocator)(void* data, size_t len, void* arg), + void* deallocator_arg, TF_Status* status); + +// Retrieves the address space (i.e. job, replia, task) of the local host and +// saves it in the buffer. +TF_CAPI_EXPORT extern void TFE_HostAddressSpace(TFE_Context* ctx, + TF_Buffer* buf); + +// APIs for generically dealing with op attributes (e.g. when forwarding them +// through custom device implementations). +// +// TODO(allenl): Currently these are black boxes, but we should have some way to +// inspect values. This would let people e.g. copy over most attributes and then +// modify some based on their values. + +// A reference to an op's name -> attribute mapping +typedef struct TFE_OpAttrs TFE_OpAttrs; + +// Fetch a reference to `op`'s attributes. The returned reference is only valid +// while `op` is alive. +TF_CAPI_EXPORT extern const TFE_OpAttrs* TFE_OpGetAttrs(const TFE_Op* op); +// Add attributes in `attrs` to `op`. +// +// Does not overwrite or update existing attributes, but adds new ones. +TF_CAPI_EXPORT extern void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs); + +// Serialize `attrs` as a tensorflow::NameAttrList protocol buffer (into `buf`), +// containing the op name and a map of its attributes. +TF_CAPI_EXPORT extern void TFE_OpAttrsSerialize(const TFE_OpAttrs* attrs, + TF_Buffer* buf, + TF_Status* status); + +// Set an op's attribute from a serialized AttrValue protocol buffer. +// +// Analogous to TF_SetAttrValueProto for building graph operations. +TF_CAPI_EXPORT extern void TFE_OpSetAttrValueProto(const TFE_Op* op, + const char* attr_name, + const void* proto, + size_t proto_len, + TF_Status* status); + +// TODO(b/166642410): It would be nice, for custom devices and for other users, +// to have a non-string representation of devices (TF_Device) extracted from +// tensors/ops/etc. and usable in APIs like OpSetDevice/ResetOp/etc. + +#define TFE_CUSTOM_DEVICE_VERSION 4 + +// Struct to be filled in. Functions are required except where indicated. +typedef struct TFE_CustomDevice { + int version = TFE_CUSTOM_DEVICE_VERSION; + // Method to copy a tensor to the custom device. + TFE_TensorHandle* (*copy_tensor_to_device)(TFE_Context* context, + TFE_TensorHandle* tensor, + TF_Status* status, + void* device_info); + + // Method to copy a tensor from the custom device to a target device. + TFE_TensorHandle* (*copy_tensor_from_device)(TFE_Context* context, + TFE_TensorHandle* tensor, + const char* target_device_name, + TF_Status* status, + void* device_info); + + // Method to execute an operation. 
+ // + // Arguments provide enough information to reconstruct the original `TFE_Op`, + // or construct a transformed version, by inspecting the passed `op`. + // + // TFE_OpGetDevice(op) records the original placement of the operation. It may + // be an empty string if no device was explicitly requested, but will + // otherwise be the name of this custom device. Ops are placed onto a custom + // device if any of their inputs are on that custom device, but custom devices + // are free to set a bad status in order to require explicit placement. + void (*execute)(const TFE_Op* op, int* num_outputs, + TFE_TensorHandle** outputs, TF_Status* s, void* device_info); + + // Method to delete a device. + void (*delete_device)(void* device_info); + + // Implements TFE_CreatePackedTensorHandle when one of `handles` is on this + // custom device. + // + // Many devices will want to simply return an "unimplemented" status + // here. This is the default behavior if `pack` is null when passed to + // TFE_RegisterCustomDevice. + TFE_TensorHandle* (*pack)(TFE_Context* context, TFE_TensorHandle** handles, + int num_handles, TF_Status* s, + void* device_info) = nullptr; + + // Pins the op to `device` based on inputs to `op`. Returns true + // signifying to pin to the current custom device. Returns false + // to pin to the physical device. + // + // This function is guaranteed to be called only when all of the custom-device + // inputs are on this device. + bool (*shall_pin_to_this_device)(const TFE_Op* op, TF_Status* s) = nullptr; +} TFE_CustomDevice; + +// Registers a custom device for use with eager execution. +// +// Eager operations may be placed on this device, e.g. `with +// tf.device("CUSTOM"):` from Python if `device_name` for this call is +// "/job:localhost/replica:0/task:0/device:CUSTOM:0". +// +// The custom device defines copy operations for moving TensorHandles on and +// off, and an execution operation for named operations. Often execution will +// simply wrap op execution on one or more physical devices. +// +// device_info is an opaque caller-defined type stored with the custom device +// which is passed to the functions referenced in the TFE_CustomDevice struct +// `device` (execute, delete_device, etc.). It can for example contain the +// names of wrapped devices. +// +// There are currently no graph semantics implemented for registered custom +// devices, so executing tf.functions which contain operations placed on the +// custom devices will fail. +// +// `device_name` must not name an existing physical or custom device. It must +// follow the format: +// +// /job:/replica:/task:/device:: +// +// If the device is successfully registered, `status` is set to TF_OK. Otherwise +// the device is not usable. In case of a bad status, `device.delete_device` is +// still called on `device_info` (i.e. the caller does not retain ownership). +// +// This API is highly experimental, and in particular is expected to change when +// it starts supporting operations with attributes and when tf.function support +// is added. +TF_CAPI_EXPORT extern void TFE_RegisterCustomDevice(TFE_Context* ctx, + TFE_CustomDevice device, + const char* device_name, + void* device_info, + TF_Status* status); + +// Returns whether `device_name` maps to a registered custom device. +TF_CAPI_EXPORT extern bool TFE_IsCustomDevice(TFE_Context* ctx, + const char* device_name); + +// Struct to be filled in to define a custom device tensor handle. Fields are +// required except where indicated. 
+typedef struct TFE_CustomDeviceTensorHandleMethods { + int version = TFE_CUSTOM_DEVICE_VERSION; + + // Computes the rank of the tensor handle. + // + // Shapes are specified via callbacks because retrieving the shape of a tensor + // is a blocking operation for async eager; custom devices should avoid + // retrieving shapes of tensors they wrap until the custom device tensor's + // shape is explicitly requested where possible. + int (*num_dims)(void* data, TF_Status* status); + + // Computes the axis length at `dim_index`. + int64_t (*dim)(void* data, int dim_index, TF_Status* status); + + void (*deallocator)(void* data); + + // Summarizes the value of this tensor. The caller takes ownership of the + // returned buffer. If `status` is not TF_OK, instead returns a null pointer. + // + // Does not include the shape and dtype of the tensor (which is generally + // appended later), but should include any information specific to this custom + // device which would be useful for debugging. + // + // Optional. If null, defaults to resolving the TFE_TensorHandle into a + // TF_Tensor and summarizing that. + TF_Buffer* (*summarize)(void* data, TF_Status* status) = nullptr; +} TFE_CustomDeviceTensorHandle; + +// Creates a new TensorHandle from memory residing in a custom device. Takes +// ownership of the memory pointed to by `tensor_handle_data`, and calls +// `methods.deallocator` to release it after TF no longer needs it or in case of +// an error. +// +// This call is similar to `TFE_NewTensorHandleFromDeviceMemory`, but supports +// custom devices instead of physical devices and does not require blocking +// waiting for exact shapes. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewCustomDeviceTensorHandle( + TFE_Context*, const char* device_name, TF_DataType, void* data, + TFE_CustomDeviceTensorHandle methods, TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_ContextGetFunctionDef(TFE_Context* ctx, + const char* function_name, + TF_Buffer* buf, + TF_Status* status); + +// Get GraphDebugInfo containing stack traces mapping to node names +TF_CAPI_EXPORT extern void TFE_ContextGetGraphDebugInfo( + TFE_Context* ctx, const char* function_name, TF_Buffer* buf, + TF_Status* status); + +// Extracts a TF_Function from the context. +// Must call TF_DeleteFunction on the returned value. +TF_CAPI_EXPORT extern TF_Function* TFE_ContextGetFunction(TFE_Context* ctx, + const char* name, + TF_Status* status); + +// Allocate and return a new Tensor on the host. +// +// The caller must set the Tensor values by writing them to the pointer returned +// by TF_TensorData with length TF_TensorByteSize. +TF_CAPI_EXPORT extern TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, + TF_DataType dtype, + const int64_t* dims, + int num_dims, + TF_Status* status); + +// Given a Tensor, wrap it with a TensorHandle +// +// Similar to TFE_NewTensorHandle, but includes a pointer to the TFE_Context. +// The context should be identical to that of the Tensor. +TF_CAPI_EXPORT TFE_TensorHandle* TFE_NewTensorHandleFromTensor( + TFE_Context* ctx, TF_Tensor* t, TF_Status* status); + +// Create a packed TensorHandle with the given list of TensorHandles. +// If `handles` are on the same device, assign the same device to the packed +// handle; if `handles` are on different deivces, assign a CompositeDevice to +// it. 
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle( + TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles, + TF_Status* status); + +// Configure soft device placement policy for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Configure device placement policy logging for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Enables running eager ops as function. +TF_CAPI_EXPORT void TFE_ContextSetRunEagerOpAsFunction(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Enables rewrite jit_compile functions. +TF_CAPI_EXPORT void TFE_ContextSetJitCompileRewrite(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Returns the device type of the operation that produced `h`. +TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceType( + TFE_TensorHandle* h, TF_Status* status); + +// Returns the device ID of the operation that produced `h`. +TF_CAPI_EXPORT extern int TFE_TensorHandleDeviceID(TFE_TensorHandle* h, + TF_Status* status); + +// Returns the status for the tensor handle. In TFRT, a tensor handle can carry +// error info if error happens. If so, the status will be set with the error +// info. If not, status will be set as OK. +TF_CAPI_EXPORT extern void TFE_TensorHandleGetStatus(TFE_TensorHandle* h, + TF_Status* status); + +// Get a comma-separated list of op names executed in graph functions dispatched +// to `ctx`. This feature is currently only enabled for TFRT debug builds, for +// performance and simplicity reasons. +TF_CAPI_EXPORT extern void TFE_GetExecutedOpNames(TFE_Context* ctx, + TF_Buffer* buf, + TF_Status* status); + +// Set logical devices to the context's device manager. +// If logical devices are already configured at context initialization +// through TFE_ContextOptions, this method should not be called. +TF_CAPI_EXPORT extern void TFE_SetLogicalCpuDevices(TFE_Context* ctx, + int num_cpus, + const char* prefix, + TF_Status* status); + +// Set configuration key and value using coordination service. +// If coordination service is enabled, the key-value will be stored on the +// leader and become accessible to all workers in the cluster. +// Currently, a config key can only be set with one value, and subsequently +// setting the same key will lead to errors. +// +// Note that the key-values are only expected to be used for cluster +// configuration data, and should not be used for storing a large amount of data +// or being accessed very frequently. +TF_CAPI_EXPORT extern void TFE_InsertConfigKeyValue(TFE_Context* ctx, + const char* key, + const char* value, + TF_Status* status); + +// Get configuration key and value using coordination service. +// The config key must be set before getting its value. Getting value of +// non-existing config keys will result in errors. +// If `timeout_in_ms=0`, this call will block until the key-value is set or the +// worker shuts down. +TF_CAPI_EXPORT extern void TFE_GetConfigKeyValue(TFE_Context* ctx, + const char* key, + int64_t timeout_in_ms, + TF_Buffer* value_buf, + TF_Status* status); + +// Delete configuration key-value. If `key` is a directory, recursively clean up +// all key-values under the path specified by `key`. 
+TF_CAPI_EXPORT extern void TFE_DeleteConfigKeyValue(TFE_Context* ctx, + const char* key, + TF_Status* status); + +// Report error (specified by error_code and error_message) to other tasks in +// the cluster. +TF_CAPI_EXPORT extern void TFE_ReportErrorToCluster(TFE_Context* ctx, + int error_code, + const char* error_message, + TF_Status* status); + +// Get task states from the Coordination Service. +TF_CAPI_EXPORT extern void TFE_GetTaskStates(TFE_Context* ctx, + const TF_Buffer& tasks, + void* states, TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_WaitAtBarrier(TFE_Context* ctx, + const char* barrier_id, + int64_t barrier_timeout_in_ms, + TF_Status* status); + +TF_CAPI_EXPORT extern void TFE_InitializeLocalOnlyContext(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental_reader.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental_reader.h new file mode 100644 index 00000000..71c2e465 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_experimental_reader.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License");; +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ +#define TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ + +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Test only exports of the monitoring Cell Reader API which allows tests to +// read current values from streamz counters defined in other modules. +// +// The code under test will have created streamz counters like this: +// auto* streamz = tensorflow::monitoring::Counter<1>::New("name", +// "description", "label"); +// and then incremented that counter for various values of label: +// streamz->GetCell("label-value")->IncrementBy(1); +// +// The test code can then read and test the value of that counter: +// +// auto* reader = TFE_MonitoringNewCounterReader("name"); +// test(); +// int64_t value = TFE_MonitoringReadCounter1(reader, "label-value"); + +// Opaque handle to a reader. +typedef struct TFE_MonitoringCounterReader TFE_MonitoringCounterReader; + +// Returns a handle to be used for reading values from streamz counter. The +// counter can have been created with any number of labels. +TF_CAPI_EXPORT extern TFE_MonitoringCounterReader* +TFE_MonitoringNewCounterReader(const char* name); + +// Reads the value of a counter that was created with 0 labels. +TF_CAPI_EXPORT extern int64_t TFE_MonitoringReadCounter0( + TFE_MonitoringCounterReader*); + +// Reads the value of specific cell of a counter that was created with 1 label. 
+TF_CAPI_EXPORT extern int64_t TFE_MonitoringReadCounter1( + TFE_MonitoringCounterReader*, const char* label_value); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_internal.h new file mode 100644 index 00000000..eff96826 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_internal.h @@ -0,0 +1,43 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ + +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/tfe_cancellation_manager_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_executor_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_monitoring_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_op_attrs_internal.h" // IWYU pragma: export +#include "tensorflow/c/eager/tfe_tensor_debug_info_internal.h" // IWYU pragma: export + +// TODO(b/154564140): Move this to its own header. This requires splitting +// c_api_experimental.h +struct TFE_ContextOptions { + TF_SessionOptions session_options; + // true if async execution is enabled. + bool async = false; + TFE_ContextDevicePlacementPolicy device_placement_policy{ + TFE_DEVICE_PLACEMENT_SILENT}; + // If true, use TFRT backend + bool use_tfrt = false; + // Whether to run elementary eager ops wrapped in a call op. + bool run_eager_op_as_function = false; + // Whether to rewrite jit_compile functions. + bool jit_compile_rewrite = false; +}; + +#endif // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_remote_test_util.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_remote_test_util.h new file mode 100644 index 00000000..6d9edb65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_remote_test_util.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ +#define TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ + +// Run a function containing a MatMul op and check its output. +// If heavy_load_on_streaming_rpc is true, send some rpc requests before the one +// which creates a remote input, to simulate a scenario that the remote input +// is not ready when we start running an op or a function. +void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, + bool heavy_load_on_streaming_rpc, + bool remote_func_outputs = false, + bool has_packed_input = false); + +#endif // TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_test_util.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_test_util.h new file mode 100644 index 00000000..ff5b0736 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_test_util.h @@ -0,0 +1,174 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ +#define TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +// Return a tensor handle containing a float scalar +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, float value); + +// Return a tensor handle containing a int scalar +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, int value); + +// Return a tensor handle containing a bool scalar +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, bool value); + +// Return a tensor handle containing a tstring scalar +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, + const tensorflow::tstring& value); + +// Return a tensor handle containing a 2x2 matrix of doubles +TFE_TensorHandle* DoubleTestMatrixTensorHandle(TFE_Context* ctx); + +// Return a tensor handle containing a 2x2 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle(TFE_Context* ctx); + +// Return a tensor handle containing 2D matrix containing given data and +// dimensions +TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, + float data[], int64_t dims[], + int num_dims); + +// Get a Matrix TensorHandle with given float values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsFloat(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims); + +// Get a Matrix TensorHandle with given int values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsInt(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims); + +// Return a tensor handle with given type, values and dimensions. 
+template <typename T, TF_DataType datatype> +TFE_TensorHandle* TestTensorHandleWithDims(TFE_Context* ctx, const T* data, + const int64_t* dims, int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = TFE_AllocateHostTensor(ctx, datatype, dims, num_dims, status); + memcpy(TF_TensorData(t), data, TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +// Return a scalar tensor handle with the given value. +template <typename T, TF_DataType datatype> +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, const T value) { + T data[] = {value}; + return TestTensorHandleWithDims<T, datatype>(ctx, data, nullptr, 0); +} + +// Return a tensor handle containing a 100x100 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx); + +// Return a tensor handle containing a 3x2 matrix of doubles +TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(TFE_Context* ctx); + +// Return a tensor handle containing a 3x2 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx); + +// Return a variable handle referring to a variable with the given initial value +// on the given device. +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name = ""); + +// Return an add op adding `a` and `b`. +TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); + +// Return a matmul op multiplying `a` by `b`. +TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); + +// Return an identity op. +TFE_Op* IdentityOp(TFE_Context* ctx, TFE_TensorHandle* a); + +// Return a shape op fetching the shape of `a`. +TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a); + +// Return an allreduce op adding up input tensor `in` from `group_size` workers. +TFE_Op* AllReduceOp(TFE_Context* ctx, TFE_TensorHandle* in, int group_size); + +// Return a SendOp op `op_name` with send input tensor `in` and attributes +// `send_device`, `recv_device`, and `send_device_incarnation` set. +TFE_Op* SendOp(TFE_Context* ctx, TFE_TensorHandle* in, + const std::string& op_name, const std::string& send_device, + const std::string& recv_device, + tensorflow::uint64 send_device_incarnation); + +// Return a RecvOp op `op_name` with the attributes `send_device`, +// `recv_device`, and `send_device_incarnation` set. +TFE_Op* RecvOp(TFE_Context* ctx, const std::string& op_name, + const std::string& send_device, const std::string& recv_device, + tensorflow::uint64 send_device_incarnation); + +// Return a 1-D INT32 tensor containing a single value 1. +TFE_TensorHandle* TestAxisTensorHandle(TFE_Context* ctx); + +// Return an op taking the minimum of `input` along the `axis` dimension. +TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, + TFE_TensorHandle* axis); + +// If there is a device of type `device_type`, returns true +// and sets 'device_name' accordingly. +// `device_type` must be either "GPU" or "TPU". +bool GetDeviceName(TFE_Context* ctx, tensorflow::string* device_name, + const char* device_type); + +// Create a ServerDef with the given `job_name` and add `num_tasks` tasks in it. +tensorflow::ServerDef GetServerDef(const tensorflow::string& job_name, + int num_tasks); + +// Create a ServerDef with job name "localhost" and add `num_tasks` tasks in it.
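// A sketch of combining these helpers in a test: build two 2x2 float handles
// with TestTensorHandleWithDims, wrap them in MatMulOp, and execute eagerly.
// `ctx` is assumed to be a live TFE_Context; the <float, TF_FLOAT> arguments
// correspond to the template parameters of the helper above.
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_test_util.h"

void ExampleMatMul(TFE_Context* ctx) {
  TF_Status* status = TF_NewStatus();
  float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
  int64_t dims[] = {2, 2};
  TFE_TensorHandle* a =
      TestTensorHandleWithDims<float, TF_FLOAT>(ctx, data, dims, 2);
  TFE_TensorHandle* b =
      TestTensorHandleWithDims<float, TF_FLOAT>(ctx, data, dims, 2);

  TFE_Op* matmul = MatMulOp(ctx, a, b);
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);

  TFE_DeleteOp(matmul);
  TFE_DeleteTensorHandle(a);
  TFE_DeleteTensorHandle(b);
  TFE_DeleteTensorHandle(retvals[0]);
  TF_DeleteStatus(status);
}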
+tensorflow::ServerDef GetServerDef(int num_tasks); + +// Create a multi-client ServerDef with the given `job_name`, add `num_tasks` +// tasks and `num_virtual_gpus` virtual GPUs in it. +tensorflow::ServerDef GetMultiClientServerDef(const std::string& job_name, + int num_tasks, + int num_virtual_gpus = 0); + +// Create a variable handle with name `variable_name` on a device with name +// `device_name`. +TFE_TensorHandle* CreateVarHandle(TFE_Context* ctx, + const tensorflow::string& device_name, + const tensorflow::string& variable_name); + +// Create a variable with value `value` and name `variable_name` on a device +// with name `device_name`. +TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name, + const tensorflow::string& variable_name); + +TFE_Context* CreateContext(const std::string& serialized_server_def, + bool isolate_session_state, + int64_t init_timeout_in_ms); + +tensorflow::ServerDef ReplaceTaskInServerDef( + const tensorflow::ServerDef& server_def, int task_index); + +void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index, + const std::string& host, int port); + +std::vector ListDeviceNames(TFE_Context* ctx); + +#endif // TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental.h new file mode 100644 index 00000000..41228f07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental.h @@ -0,0 +1,153 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_H_ +#define TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_H_ + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ============================================================================= +// Unified Execution APIs for Eager and tracing backends. +// ============================================================================= + +// ----------------------------------------------------------------------------- +// Core APIs +// ----------------------------------------------------------------------------- + +// A TF_ExecutionContext stores knowledge about how to execute an operation. +// E.g. it could know whether we're in eager mode or graph mode, keeps track +// of gradient tapes, etc. +typedef struct TF_ExecutionContext TF_ExecutionContext; + +// A TF_AbstractTensor is an input to an operation. E.g. it could be a union +// type of eager and graph tensors. It is also the result of executing an +// operation. +typedef struct TF_AbstractTensor TF_AbstractTensor; + +// A TF_AbstractOp is the metadata we need to execute an operation. E.g. this +// could contain the op type and other attributes. 
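// A sketch of driving these types in eager mode, using functions declared
// later in this header: wrap two eager handles, execute AddV2, and unwrap the
// result. Error handling and most cleanup are elided for brevity; `a` and `b`
// are assumed to be live TFE_TensorHandles of matching dtype.
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_unified_experimental.h"

TFE_TensorHandle* ExampleUnifiedAdd(TFE_ContextOptions* opts,
                                    TFE_TensorHandle* a, TFE_TensorHandle* b,
                                    TF_Status* s) {
  TF_ExecutionContext* ctx = TF_NewEagerExecutionContext(opts, s);
  TF_AbstractTensor* at = TF_CreateAbstractTensorFromEagerTensor(a, s);
  TF_AbstractTensor* bt = TF_CreateAbstractTensorFromEagerTensor(b, s);

  TF_AbstractOp* add = TF_NewAbstractOp(ctx);
  TF_AbstractOpSetOpType(add, "AddV2", s);

  TF_OutputList* outputs = TF_NewOutputList();
  TF_OutputListSetNumOutputs(outputs, 1, s);
  TF_AbstractTensor* inputs[] = {at, bt};
  TF_ExecuteOperation(add, 2, inputs, outputs, s);

  TF_AbstractTensor* result = TF_OutputListGet(outputs, 0);
  TFE_TensorHandle* out = TF_AbstractTensorGetEagerTensor(result, s);

  // Cleanup of the abstract tensors, output list, and execution context is
  // elided here; the upstream unit tests show the full lifetime handling.
  TF_DeleteAbstractOp(add);
  return out;
}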
+typedef struct TF_AbstractOp TF_AbstractOp; + +// Stores a function representation that can be used for execution or for +// setting functional attributes of other composite ops e.g. control flow. +typedef struct TF_AbstractFunction TF_AbstractFunction; + +// This allows the client to swap the implementation of the tracing engine. +// Any future call to TF_CreateFunction will use the implementation defined +// here. +void TF_SetTracingImplementation(const char* name, TF_Status*); + +// Creates a new TensorFlow function. A Function is an execution context, and as +// such it can trace operations through TF_ExecuteOperation. After completing +// tracing, a function can be obtained by TF_FinalizeFunction. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* status); + +// Creates a context for eager execution of operations. +TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions*, + TF_Status* s); +void TF_DeleteExecutionContext(TF_ExecutionContext*); + +// Represents a (partially-defined) shape. +typedef struct TF_Shape { + int num_dims; // Must be >= -1; -1 represents unknown rank. + int64_t* dim_sizes; +} TF_Shape; + +// Add a new parameter to a TensorFlow Function. +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Shape shape, + TF_Status* s); + +// Create an operation suitable to use with the provided context. The operation +// requires its type (e.g. "AddV2") to be set independently. +TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* ctx); +void TF_DeleteAbstractOp(TF_AbstractOp*); + +// TODO(srbs): Add APIs for specifying attrs etc. +// `op_type` must outlive `op`. +void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, + TF_Status* s); +// `op_name` must outlive `op`. +void TF_AbstractOpSetOpName(TF_AbstractOp* op, const char* const op_name, + TF_Status* s); +// `attr_name` must outlive `op`. +void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, + TF_DataType value, TF_Status* s); + +void TF_DeleteAbstractTensor(TF_AbstractTensor*); + +// TF_OutputList holds the list of TF_AbstractTensor that results from executing +// an operation, or provided to create a function. +// When executing an operation in an eager context, the expected number of +// outputs must be set beforehand with `TF_OutputListSetNumOutputs`. +typedef struct TF_OutputList TF_OutputList; +TF_OutputList* TF_NewOutputList(); +void TF_DeleteOutputList(TF_OutputList* o); +// Prepare tracing to the expected number of output for an operation. +void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status*); +// Return the number of outputs in the list. +int TF_OutputListNumOutputs(TF_OutputList* o); +// Return the `i`th output in the list. +TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i); +// Append a tensor at the end of the output list, growing its size by one. +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status*); + +// TF_ExecuteOperation will, if in eager mode, execute, if in graph mode, maybe +// capture some inputs and then add a node in the graph. The output tensors are +// returned through the provided TF_OutputList. +// Any active tape will observe the effects of this execution. +void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, + TF_AbstractTensor* const* inputs, TF_OutputList* o, + TF_Status* s); + +// Creates a new TF_AbstractFunction from the current tracing states in the +// context. 
The provided `ctx` is consumed by this API call and deleted. +// The returned TF_AbstractFunction must be deleted by the client, +// TODO(aminim): clarify the contract on the state of the context after this +// call. +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList*, TF_Status*); + +void TF_DeleteAbstractFunction(TF_AbstractFunction*); + +// Register the function with the given context. This is particularly useful for +// making a function available to an eager context. +void TF_ExecutionContextRegisterFunction(TF_ExecutionContext*, + TF_AbstractFunction*, TF_Status*); + +// ----------------------------------------------------------------------------- +// APIs specific to Eager modes +// ----------------------------------------------------------------------------- + +// Temporary APIs till we figure out how to create scalar valued Eager +// tensors and how to get value out of eager abstract tensors. +TF_AbstractTensor* TF_CreateAbstractTensorFromEagerTensor(TFE_TensorHandle* t, + TF_Status* s); +TFE_TensorHandle* TF_AbstractTensorGetEagerTensor(TF_AbstractTensor* at, + TF_Status* s); +TFE_Context* TF_ExecutionContextGetTFEContext(TF_ExecutionContext*, + TF_Status* s); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental_internal.h new file mode 100644 index 00000000..872b9081 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -0,0 +1,138 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Represents the results of the execution of an operation. +struct OutputList { + std::vector outputs; + int expected_num_outputs = -1; +}; + +namespace tracing { + +// ============================================================================= +// Implementation detail for the unified execution APIs for Eager and tracing +// backends (graph/MLIR). 
+// +// This defines a set of abstract classes that are intended to provide the +// functionality of the opaque C types exposed in the public APIs defined in the +// `c_api_unified_experimental.h` header. +// ============================================================================= + +// Represents either a MlirTensor or a GraphTensor. +// This base class does not expose any public methods other than to distinguish +// which subclass it actually is. The user is responsible to use the right +// type of AbstractTensor in their context (do not pass an MlirTensor to a +// GraphContext and vice-versa). +class TracingTensorHandle : public AbstractTensorHandle { + protected: + explicit TracingTensorHandle(AbstractTensorHandleKind kind) + : AbstractTensorHandle(kind) {} + + public: + // For LLVM style RTTI. + static bool classof(const AbstractTensorHandle* ptr) { + return ptr->getKind() == kGraph || ptr->getKind() == kMlir; + } +}; + +// An abstract operation describes an operation by its type, name, and +// attributes. It can be "executed" by the context with some input tensors. +// It is allowed to reusing the same abstract operation for multiple execution +// on a given context, with the same or different input tensors. +class TracingOperation : public AbstractOperation { + protected: + explicit TracingOperation(AbstractOperationKind kind) + : AbstractOperation(kind) {} + + public: + // Sets the name of the operation: this is an optional identifier that is + // not intended to carry semantics and preserved/propagated without + // guarantees. + virtual absl::Status SetOpName(const char* op_name) = 0; + + // For LLVM style RTTI. + static bool classof(const AbstractOperation* ptr) { + return ptr->getKind() == kGraph || ptr->getKind() == kMlir; + } +}; + +namespace internal { +struct TracingOperationDeleter { + void operator()(TracingOperation* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using TracingOperationPtr = + std::unique_ptr; + +// This holds the context for the execution: dispatching operations either to an +// MLIR implementation or to a graph implementation. +class TracingContext : public AbstractContext { + protected: + explicit TracingContext(AbstractContextKind kind) : AbstractContext(kind) {} + + public: + // Add a function parameter and return the corresponding tensor. + virtual absl::Status AddParameter(DataType dtype, + const PartialTensorShape& shape, + TracingTensorHandle**) = 0; + + // Finalize this context and make a function out of it. The context is in a + // invalid state after this call and must be destroyed. + virtual absl::Status Finalize(OutputList* outputs, AbstractFunction**) = 0; + + // For LLVM style RTTI. 
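// A sketch of using the LLVM-style `classof` hooks above to narrow an
// AbstractOperation to a TracingOperation before calling a tracing-only
// method such as SetOpName; eager operations are simply passed through.
#include "absl/status/status.h"
#include "tensorflow/c/eager/abstract_operation.h"
#include "tensorflow/c/eager/c_api_unified_experimental_internal.h"

absl::Status MaybeSetOpName(tensorflow::AbstractOperation* op,
                            const char* name) {
  using tensorflow::tracing::TracingOperation;
  if (!TracingOperation::classof(op)) {
    return absl::OkStatus();  // Not tracing: nothing to name.
  }
  return static_cast<TracingOperation*>(op)->SetOpName(name);
}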
+ static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kGraph || ptr->getKind() == kMlir; + } +}; + +typedef TracingContext* (*FactoryFunction)(const char* fn_name, TF_Status*); +absl::Status SetDefaultTracingEngine(const char* name); +void RegisterTracingEngineFactory(const ::tensorflow::string& name, + FactoryFunction factory); +} // namespace tracing + +DEFINE_CONVERSION_FUNCTIONS(AbstractContext, TF_ExecutionContext) +DEFINE_CONVERSION_FUNCTIONS(AbstractTensorHandle, TF_AbstractTensor) +DEFINE_CONVERSION_FUNCTIONS(AbstractFunction, TF_AbstractFunction) +DEFINE_CONVERSION_FUNCTIONS(AbstractOperation, TF_AbstractOp) +DEFINE_CONVERSION_FUNCTIONS(OutputList, TF_OutputList) +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_C_API_UNIFIED_EXPERIMENTAL_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/custom_device_testutil.h b/third_party/tflite-hdrs/tensorflow/c/eager/custom_device_testutil.h new file mode 100644 index 00000000..a7c60080 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/custom_device_testutil.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_CUSTOM_DEVICE_TESTUTIL_H_ +#define TENSORFLOW_C_EAGER_CUSTOM_DEVICE_TESTUTIL_H_ + +// A simple logging device to test custom device registration. +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" + +void RegisterLoggingDevice(TFE_Context* context, const char* name, + bool strict_scope_placement, bool* arrived_flag, + bool* executed_flag, TF_Status* status); +void AllocateLoggingDevice(const char* name, bool* arrived_flag, + bool* executed_flag, TFE_CustomDevice** device, + void** device_info); +TFE_TensorHandle* UnpackTensorHandle(TFE_TensorHandle* logged_tensor_handle, + TF_Status* status); + +#endif // TENSORFLOW_C_EAGER_CUSTOM_DEVICE_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/dlpack.h b/third_party/tflite-hdrs/tensorflow/c/eager/dlpack.h new file mode 100644 index 00000000..8c85dee6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/dlpack.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_DLPACK_H_ +#define TENSORFLOW_C_EAGER_DLPACK_H_ + +#include "tensorflow/c/eager/c_api.h" + +namespace tensorflow { + +// PyCapsule name for DLPack Tensor +const char* const kDlTensorCapsuleName = "dltensor"; + +// Converts eager tensor handle to DLPack (DLManagedTensor*), and return the +// void* for further PyCapsule construction. +TF_CAPI_EXPORT extern void* TFE_HandleToDLPack(TFE_TensorHandle* h, + TF_Status* status); + +// Converts DLPack (DLManagedTensor*) to eager tensor handle. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_HandleFromDLPack(void* dlm, + TF_Status* status, + TFE_Context* ctx); + +// Calls the destructor of DLManagedTensor, used in the destructor of PyCapsule. +TF_CAPI_EXPORT extern void TFE_CallDLManagedTensorDeleter(void* dlm_ptr); +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_DLPACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/gradient_checker.h b/third_party/tflite-hdrs/tensorflow/c/eager/gradient_checker.h new file mode 100644 index 00000000..d64ad448 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/gradient_checker.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_GRADIENT_CHECKER_H_ +#define TENSORFLOW_C_EAGER_GRADIENT_CHECKER_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/unified_api_testutil.h" + +namespace tensorflow { +namespace gradients { + +/* Returns numerical grad inside `dtheta_approx` given `forward` model and + * parameter specified by `input_index`. + * + * I.e. if y = and w = inputs[input_index], + * this will calculate dy/dw numerically. + * + * `use_function` indicates whether to use graph mode(true) or eager(false). + * + * `numerical_grad` is the pointer to the AbstractTensorHandle* which will + * hold the numerical gradient data at the end of the function. + */ +absl::Status CalcNumericalGrad(AbstractContext* ctx, Model forward, + absl::Span inputs, + int input_index, bool use_function, + AbstractTensorHandle** numerical_grad); + +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_GRADIENT_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/gradients.h b/third_party/tflite-hdrs/tensorflow/c/eager/gradients.h new file mode 100644 index 00000000..88c1df24 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/gradients.h @@ -0,0 +1,178 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_GRADIENTS_H_ +#define TENSORFLOW_C_EAGER_GRADIENTS_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/tape.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" + +namespace tensorflow { +namespace gradients { + +// =============== Experimental C++ API for computing gradients =============== + +// Sample gradient function: +// +// class AddGradientFunction : public GradientFunction { +// public: +// Status Compute(Context* ctx, +// absl::Span grad_inputs, +// absl::Span grad_outputs) override { +// grad_outputs[0] = grad_inputs[0]; +// grad_outputs[1] = grad_inputs[0]; +// grad_outputs[0]->Ref(); +// grad_outputs[1]->Ref(); +// return OkStatus(); +// } +// ~AddGradientFunction() override {} +// }; +// +// GradientFunction* AddRegisterer(const ForwardOperation& op) { +// // More complex gradient functions can use inputs/attrs etc. from the +// // forward `op`. +// return new AddGradientFunction; +// } +// +// Status RegisterGradients(GradientRegistry* registry) { +// return registry->Register("Add", AddRegisterer); +// } +class GradientFunction { + public: + virtual absl::Status Compute( + AbstractContext* ctx, + absl::Span grad_outputs, + absl::Span grad_inputs) = 0; + virtual ~GradientFunction() {} +}; + +// Metadata from the forward operation that is made available to the +// gradient registerer to instantiate a GradientFunction. +struct ForwardOperation { + public: + string op_name; + std::vector inputs; + std::vector outputs; + std::vector skip_input_indices; + AttrBuilder attrs; +}; + +using GradientFunctionFactory = + std::function; + +// Map from op name to a `GradientFunctionFactory`. +class GradientRegistry { + public: + absl::Status Register(const string& op, + GradientFunctionFactory gradient_function_factory); + absl::Status Lookup( + const ForwardOperation& op, + std::unique_ptr* gradient_function) const; + + private: + absl::flat_hash_map registry_; +}; + +// TODO(srbs): Figure out if we can avoid declaring this in the public header. +// Wrapper for a tensor output of an operation executing under a tape. +// +// `GetID` returns a unique id for the wrapped tensor which is used to maintain +// a map (`tensorflow::eager::TensorTape`) from the wrapped tensor to the id of +// the op that produced it (or -1 if this tensor was watched using +// `GradientTape::Watch`.) The op_id is simply a unique index assigned to each +// op executed under the tape. A separate map (`tensorflow::eager::OpTape`) +// maintains the map from `op_id` to a `OpTapeEntry` which stores the `op_type`, +// inputs and outputs and the gradient function These data structures combined +// allow us to trace the data dependencies between operations and hence compute +// gradients. +// +// `ZerosLike` is not expected to be called and returns a nullptr. The creation +// of default zeros grads is handled by the `DefaultGradientFunction` registered +// for each op. 
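// A sketch of a gradient function written against the GradientFunction and
// GradientRegistry declarations above, using the absl::Status signature; the
// absl::Span element types (AbstractTensorHandle* const for incoming grads,
// AbstractTensorHandle* for outgoing grads) are assumed here, consistent with
// the sample in the comment block.
#include "absl/status/status.h"
#include "tensorflow/c/eager/gradients.h"

namespace tensorflow {
namespace gradients {

// Gradient of Identity: pass the incoming gradient straight through.
class IdentityGradientFunction : public GradientFunction {
 public:
  absl::Status Compute(
      AbstractContext* ctx,
      absl::Span<AbstractTensorHandle* const> grad_outputs,
      absl::Span<AbstractTensorHandle*> grad_inputs) override {
    grad_inputs[0] = grad_outputs[0];
    grad_inputs[0]->Ref();
    return absl::OkStatus();
  }
};

GradientFunction* IdentityRegisterer(const ForwardOperation& op) {
  return new IdentityGradientFunction;
}

absl::Status RegisterIdentityGradient(GradientRegistry* registry) {
  return registry->Register("Identity", IdentityRegisterer);
}

}  // namespace gradients
}  // namespace tensorflow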
+// TODO(srbs): We need to define `ZerosLike` here to keep the compiler happy. +// Figure out a way to avoid this. +// TODO(srbs): Should ZerosLike check-fail instead of returning nullptr? +class TapeTensor { + public: + explicit TapeTensor(AbstractTensorHandle* handle); + TapeTensor(const TapeTensor& other); + ~TapeTensor(); + + int64_t GetID() const; + tensorflow::DataType GetDType() const; + + AbstractTensorHandle* ZerosLike() const; + + AbstractTensorHandle* GetHandle() const; + + private: + AbstractTensorHandle* handle_; +}; + +// A tracing/immediate-execution agnostic tape. +// +// Gradient functions defined for this tape must support handling null incoming +// gradients. +class Tape : protected eager::GradientTape { + public: + using GradientTape::GradientTape; + // Returns whether the tape is persistent, i.e., whether the tape will hold + // onto its internal state after a call to `ComputeGradient`. + using GradientTape::IsPersistent; + + // Adds this tensor to the list of watched tensors. + // + // This is a no-op if the tensor is already being watched either from an + // earlier call to `GradientTape::Watch` or being an output of an op with + // watched inputs. + void Watch(const AbstractTensorHandle*); + // Records an operation with given inputs and outputs + // on the tape and marks all its outputs as watched if at + // least one input of the op is watched and has a trainable dtype. + // op_name is optional and is used for debugging only. + void RecordOperation(absl::Span inputs, + absl::Span outputs, + GradientFunction* gradient_function, + const string& op_name = ""); + // Returns whether any tensor in a list of tensors is being watched and has + // a trainable dtype. + bool ShouldRecord( + absl::Span tensors) const; + // Unwatches this tensor on the tape. Mainly used for cleanup when deleting + // eager tensors. + void DeleteTrace(const AbstractTensorHandle*); + + // Consumes the internal state of the tape (so cannot be called more than + // once unless the tape is persistent) and produces the gradient of the target + // tensors with respect to the source tensors. The output gradients are used + // if not empty and not null. The result is populated with one tensor per + // target element. + absl::Status ComputeGradient( + AbstractContext* ctx, absl::Span targets, + absl::Span sources, + absl::Span output_gradients, + absl::Span result); +}; + +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_GRADIENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/gradients_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/gradients_internal.h new file mode 100644 index 00000000..93c2d36b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/gradients_internal.h @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_GRADIENTS_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_GRADIENTS_INTERNAL_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +namespace internal { + +// Helper functions which delegate to `AbstractOperation`, update +// the state of the ForwardOperation and call the tape as appropriate. +// These APIs are mainly to facilitate testing and are subject to change. + +// Records the op name in the `ForwardOperation`. +absl::Status Reset(AbstractOperation*, const char* op, + const char* raw_device_name, ForwardOperation*); + +// Records the inputs in the `ForwardOperation`. +absl::Status AddInput(AbstractOperation*, AbstractTensorHandle*, + ForwardOperation*); +absl::Status AddInputList(AbstractOperation*, + absl::Span inputs, + ForwardOperation*); + +// Sets the attrs in the `ForwardOperation`. +absl::Status SetAttrString(AbstractOperation*, const char* attr_name, + const char* data, size_t length, ForwardOperation*); +absl::Status SetAttrInt(AbstractOperation*, const char* attr_name, + int64_t value, ForwardOperation*); +absl::Status SetAttrFloat(AbstractOperation*, const char* attr_name, + float value, ForwardOperation*); +absl::Status SetAttrBool(AbstractOperation*, const char* attr_name, bool value, + ForwardOperation*); +absl::Status SetAttrType(AbstractOperation*, const char* attr_name, + DataType value, ForwardOperation*); +absl::Status SetAttrShape(AbstractOperation*, const char* attr_name, + const int64_t* dims, const int num_dims, + ForwardOperation*); +absl::Status SetAttrFunction(AbstractOperation*, const char* attr_name, + const AbstractOperation* value, ForwardOperation*); +absl::Status SetAttrFunctionName(AbstractOperation*, const char* attr_name, + const char* value, size_t length, + ForwardOperation*); +absl::Status SetAttrTensor(AbstractOperation*, const char* attr_name, + AbstractTensorInterface* tensor, ForwardOperation*); +absl::Status SetAttrStringList(AbstractOperation*, const char* attr_name, + const void* const* values, const size_t* lengths, + int num_values, ForwardOperation*); +absl::Status SetAttrFloatList(AbstractOperation*, const char* attr_name, + const float* values, int num_values, + ForwardOperation*); +absl::Status SetAttrIntList(AbstractOperation*, const char* attr_name, + const int64_t* values, int num_values, + ForwardOperation*); +absl::Status SetAttrTypeList(AbstractOperation*, const char* attr_name, + const DataType* values, int num_values, + ForwardOperation*); +absl::Status SetAttrBoolList(AbstractOperation*, const char* attr_name, + const unsigned char* values, int num_values, + ForwardOperation*); +absl::Status SetAttrShapeList(AbstractOperation*, const char* attr_name, + const int64_t** dims, const int* num_dims, + int num_values, ForwardOperation*); +absl::Status SetAttrFunctionList(AbstractOperation*, const char* attr_name, + absl::Span values, + ForwardOperation*); + +// Make the call to `Tape::RecordOperation`. 
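// A sketch of the intended call sequence for these helpers: reset the op, add
// inputs, set attrs, then Execute (declared just below), which also records
// the op on the tape. The span element type for `retvals`
// (AbstractTensorHandle*) is assumed; errors propagate via TF_RETURN_IF_ERROR.
#include "absl/status/status.h"
#include "tensorflow/c/eager/gradients_internal.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {
namespace gradients {
namespace internal {

absl::Status TapedMatMul(AbstractContext* ctx, AbstractTensorHandle* a,
                         AbstractTensorHandle* b, AbstractTensorHandle** out,
                         Tape* tape, const GradientRegistry& registry) {
  AbstractOperationPtr op(ctx->CreateOperation());
  ForwardOperation forward_op;
  TF_RETURN_IF_ERROR(
      Reset(op.get(), "MatMul", /*raw_device_name=*/nullptr, &forward_op));
  TF_RETURN_IF_ERROR(AddInput(op.get(), a, &forward_op));
  TF_RETURN_IF_ERROR(AddInput(op.get(), b, &forward_op));
  TF_RETURN_IF_ERROR(
      SetAttrBool(op.get(), "transpose_a", false, &forward_op));

  AbstractTensorHandle* retvals[1] = {nullptr};
  int num_retvals = 1;
  TF_RETURN_IF_ERROR(Execute(op.get(), ctx,
                             absl::Span<AbstractTensorHandle*>(retvals, 1),
                             &num_retvals, &forward_op, tape, registry));
  *out = retvals[0];
  return absl::OkStatus();
}

}  // namespace internal
}  // namespace gradients
}  // namespace tensorflow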
+absl::Status Execute(AbstractOperation*, AbstractContext*, + absl::Span retvals, + int* num_retvals, ForwardOperation*, Tape*, + const GradientRegistry&); + +} // namespace internal +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_GRADIENTS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/graph_function.h b/third_party/tflite-hdrs/tensorflow/c/eager/graph_function.h new file mode 100644 index 00000000..b15d1b4b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/graph_function.h @@ -0,0 +1,53 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_GRAPH_FUNCTION_H_ +#define TENSORFLOW_C_EAGER_GRAPH_FUNCTION_H_ + +#include "tensorflow/c/eager/abstract_function.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/refcount.h" +namespace tensorflow { +namespace tracing { +namespace graph { +using tensorflow::AbstractFunction; +// Thin wrapper around a FunctionDef. +class GraphFunction : public AbstractFunction { + public: + explicit GraphFunction(FunctionDef fdef); + ~GraphFunction() override; + + // GraphFunction maybe stay alive for the duration of the returned + // FunctionDef. + absl::Status GetFunctionDef(const FunctionDef** fdef) override; + + // Returns a shared reference to the wrapped function. + absl::StatusOr> GetFunctionRecord() + override { + return func_record_.GetNewRef(); + } + + // For LLVM style RTTI. + static bool classof(const AbstractFunction* ptr) { + return ptr->getKind() == kGraph; + } + + private: + core::RefCountPtr func_record_; +}; +} // namespace graph +} // namespace tracing +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_GRAPH_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_context.h b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_context.h new file mode 100644 index 00000000..216fcfe9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_context.h @@ -0,0 +1,294 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ + +#include +#include +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/immediate_execution_distributed_manager.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +class EagerExecutor; +class EagerContext; +class CustomDevice; +class CustomDeviceOpHandler; +class Device; + +// LINT.IfChange +// Note: Keep in sync with exported copy of enum in eager/c_api.h. +enum ContextDevicePlacementPolicy { + // Running operations with input tensors on the wrong device will fail. + DEVICE_PLACEMENT_EXPLICIT = 0, + // Copy the tensor to the right device but log a warning. + DEVICE_PLACEMENT_WARN = 1, + // Silently copy the tensor, which has a performance cost since the operation + // will be blocked till the copy completes. This is the default policy. + DEVICE_PLACEMENT_SILENT = 2, + // Placement policy which silently copies int32 tensors but not other dtypes. + DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3, +}; +// LINT.ThenChange(//tensorflow/c/eager/c_api.h) + +// Abstract interface to a context. +// +// A context is responsible for creating key objects such as Tensors, +// TensorHandles & Operations. +class ImmediateExecutionContext : public AbstractContext { + public: + // Optimized scalar creation functions + virtual AbstractTensorInterface* CreateInt64Scalar(int64_t value) = 0; + virtual AbstractTensorInterface* CreateUint64Scalar(uint64 value) = 0; + virtual AbstractTensorInterface* CreateInt32Scalar(int32_t value) = 0; + virtual AbstractTensorInterface* CreateFloatScalar(float value) = 0; + virtual AbstractTensorInterface* CreateDoubleScalar(double value) = 0; + virtual AbstractTensorInterface* CreateHalfScalar(Eigen::half value) = 0; + virtual AbstractTensorInterface* CreateStringScalar(tstring value) = 0; + virtual AbstractTensorInterface* CreateComplex128Scalar(complex128 value) = 0; + virtual AbstractTensorInterface* CreateBoolScalar(bool value) = 0; + + // Tensor creation functions + virtual AbstractTensorInterface* CreateTensor( + DataType dtype, absl::Span dim_sizes) = 0; + + typedef void (*MemoryReleaser)(void* data, size_t len, void* arg); + + // Create a tensor instance from the given data buffer and description. + // `memory_releaser` will be called on destruction, and it's responsible for + // cleaning up the underlying buffer. 
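// A sketch of wrapping an externally owned buffer with the CreateTensor
// overload declared just below: the MemoryReleaser callback frees the buffer
// once the runtime no longer needs the tensor.
#include <cstdlib>

#include "tensorflow/c/eager/immediate_execution_context.h"

namespace {

void FreeHeapBuffer(void* data, size_t /*len*/, void* /*arg*/) {
  std::free(data);
}

tensorflow::AbstractTensorInterface* WrapFloatBuffer(
    tensorflow::ImmediateExecutionContext* ctx, int64_t rows, int64_t cols) {
  const size_t len = rows * cols * sizeof(float);
  void* data = std::malloc(len);  // Ownership passes to the tensor.
  const int64_t dims[] = {rows, cols};
  return ctx->CreateTensor(tensorflow::DT_FLOAT, dims, /*num_dims=*/2, data,
                           len, &FreeHeapBuffer,
                           /*memory_releaser_arg=*/nullptr);
}

}  // namespace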
+ virtual AbstractTensorInterface* CreateTensor( + DataType dtype, const int64_t* dims, int num_dims, void* data, size_t len, + MemoryReleaser memory_releaser, void* memory_releaser_arg) = 0; + + // Create a handle to wrap and manage a Tensor + virtual ImmediateExecutionTensorHandle* CreateLocalHandle( + AbstractTensorInterface* t) = 0; + // Copy the handle to another device. + virtual ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, + absl::Status* status) = 0; + + // Create an operation to perform op execution + ImmediateExecutionOperation* CreateOperation() override = 0; + + // Returns whether the runtime is backed by TFRT or the legacy TF Eager + // Runtime. This is necessary to decouple runtime-dependent + // code that is layered on top of the runtime. + virtual bool UsesTFRT() = 0; + + // List attributes of available devices + virtual void ListDevices(std::vector* devices) = 0; + + // Add `devices` into context's device manager. Context's device manager + // will take ownership and maintain devices' lifetime. + virtual absl::Status AddDevices( + std::vector> devices) = 0; + + // Block until all pending nodes are finished. + virtual absl::Status AsyncWait() = 0; + + // Add a function (serialized FunctionDef protocol buffer) so that it can + // be executed as an op. Return error if the function with the same name + // already exists. + virtual absl::Status AddFunctionDef(const FunctionDef& fdef) = 0; + + // Notifies about the function removal. + virtual absl::Status AddRemoveFunctionNotifier( + const string& func, std::function notifier) = 0; + + // Same as `AddFunctionDef`, but additionally saves the `stack_traces` under + // the key of the function definition name (to be retrieved during function + // instantiation). + virtual absl::Status AddFunctionDefWithStackTraces( + const FunctionDef& fdef, const StackTracesMap& stack_traces) = 0; + + // Find and return a added function by its name. + virtual const FunctionDef* FindFunctionDef(const string& name) const = 0; + + // Find and return a function record added by its name. + virtual core::RefCountPtr FindRecord( + const string& name) const = 0; + + // Return the ParsedName of Host CPU device. + virtual const DeviceNameUtils::ParsedName& HostCPUParsedName() const = 0; + virtual const string& HostCPUName() const = 0; + + // Configure soft device placement policy. + virtual void SetAllowSoftPlacement(bool enable) = 0; + + // Configure device placement policy logging. + virtual void SetLogDevicePlacement(bool enable) = 0; + + // Enables running eager ops as functions. + virtual void SetRunEagerOpAsFunction(bool enable) = 0; + + // Enables rewriting jit_compile functions. + virtual void SetJitCompileRewrite(bool enable) = 0; + + // Sets the device placement policy for the current thread. + virtual void SetThreadLocalDevicePlacementPolicy( + ContextDevicePlacementPolicy policy) = 0; + // Returns the device placement policy for the current thread. + virtual ContextDevicePlacementPolicy GetDevicePlacementPolicy() const = 0; + + // Configure graph collection in RunMetadata. + virtual void SetShouldStoreGraphs(bool value) = 0; + + // Return the collected RunMetadata. This method will transfer the ownership + // to the caller. + virtual std::unique_ptr ExportRunMetadata() = 0; + + // For LLVM style RTTI. 
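// A sketch of registering a FunctionDef with the context and looking it up
// again through the methods above; constructing a meaningful FunctionDef is
// elided, only the registration round-trip is shown.
#include "absl/status/status.h"
#include "tensorflow/c/eager/immediate_execution_context.h"
#include "tensorflow/core/framework/function.pb.h"
#include "tensorflow/core/platform/errors.h"

absl::Status RegisterAndCheck(tensorflow::ImmediateExecutionContext* ctx,
                              const tensorflow::FunctionDef& fdef) {
  TF_RETURN_IF_ERROR(ctx->AddFunctionDef(fdef));
  const tensorflow::FunctionDef* found =
      ctx->FindFunctionDef(fdef.signature().name());
  if (found == nullptr) {
    return absl::InternalError("FunctionDef was not registered.");
  }
  return absl::OkStatus();
}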
+ static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kEager || ptr->getKind() == kTfrt; + } + + //===--------------------------------------------------------------------===// + // Experimental Custom Device. + //===--------------------------------------------------------------------===// + virtual CustomDeviceOpHandler& GetCustomDeviceOpHandler() = 0; + + // Returns whether `device_name` is registered as a custom device. + virtual bool IsCustomDevice(const string& device_name) = 0; + + // Register a custom device. It will return error is the device name is + // already registered. + // TODO(tfrt-devs): Remove this method. Let caller register it directly into + // CustomDeviceOpHandler. + virtual absl::Status RegisterCustomDevice( + const string& name, std::unique_ptr device) = 0; + + // Return FunctionLibraryDefinition. Transformations need to use it to use it + // to invoke MLIR compiler passes. + virtual FunctionLibraryDefinition* FuncLibDef() = 0; + + // Resets the global rendezvous used for functions. + virtual void ResetGlobalRendezvousForFunction() = 0; + + //===--------------------------------------------------------------------===// + // Following are features in current TF Eager Runtime. + // TODO(tfrt-devs): Figure out a way to deprecate following features after + // migrated to TFRT. + //===--------------------------------------------------------------------===// + // Clear pending nodes in thread executors and kernel caches. + virtual void ClearCachesAndThreadExecutors() = 0; + + // Initialize the step resource container for a training step. This is used + // in current TF runtime. For tfrt, it is used by fallback op handler. + virtual void StartStep() = 0; + // Destroy the step resource container for a training step. + virtual void EndStep() = 0; + + // Return the Eager Executor for current thread. Please note that Eager + // Executor is only used in current TF but not in TFRT. + virtual EagerExecutor& Executor() = 0; + // Update the Eager Executor for current thread. + virtual void SetExecutorForThread(EagerExecutor* executor) = 0; + + // Return a list of local tensorflow::Device*. + // TODO(tfrt-devs): We shouldn't expose legacy device in this API. + virtual std::vector ListLocalTfDevices() = 0; + + // Return a list of all tensorflow::Device*. + virtual std::vector ListAllTfDevices() = 0; + + //===--------------------------------------------------------------------===// + // Following are helper functions to assist integrating TFRT with current + // TF eager runtime. + // TODO(b/172877902): These helper functions are currently used to support + // PyFuncOp on TFRT, and might be useful for ops that directly use low + // level TF APIs. Remove/replace the following functions when TFRT native + // ops are implemented. + //===--------------------------------------------------------------------===// + // Create an abstract tensor handle from tensorflow::Tensor. + virtual ImmediateExecutionTensorHandle* CreateLocalHandleFromTFTensor( + tensorflow::Tensor& t, const char* d_name) = 0; + + // Convert a TFRT TensorHandle to tensorflow::TensorHandle. + virtual ImmediateExecutionTensorHandle* TFTensorHandleFromInterface( + ImmediateExecutionTensorHandle* handle) = 0; + + virtual std::vector GetLoggedOpsTestonly() { return {}; } + + // Get a list of the names of functions that have been registered. 
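// A small debugging sketch built on the query methods above (ListFunctionNames
// is declared just below): log every registered function name and whether a
// given device name refers to a custom device. The std::string element type of
// the returned vector is assumed.
#include <string>

#include "tensorflow/c/eager/immediate_execution_context.h"
#include "tensorflow/core/platform/logging.h"

void LogContextState(tensorflow::ImmediateExecutionContext* ctx,
                     const std::string& device_name) {
  for (const std::string& name : ctx->ListFunctionNames()) {
    LOG(INFO) << "Registered function: " << name;
  }
  LOG(INFO) << device_name << " is a custom device: "
            << (ctx->IsCustomDevice(device_name) ? "yes" : "no");
}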
+ virtual std::vector ListFunctionNames() = 0; + + struct CacheStats { + int64_t kernel_cache_size; + int64_t device_cache_size; + std::map func_kernel_cache_entries; + int64_t local_rendezvous_cache_active_size; + }; + virtual CacheStats GetCacheStats() = 0; + + //===--------------------------------------------------------------------===// + // Distributed runtime related functions. + //===--------------------------------------------------------------------===// +#if !defined(IS_MOBILE_PLATFORM) + // Set up a multi-client distributed execution environment. Must be called on + // all tasks in the cluster. + // This call internally coordinates with other tasks to initialize the eager + // context and TF server for multi-client execution. + virtual absl::Status EnableCollectiveOps(const ServerDef& server_def) = 0; + + // Set a distributed manager that helps set up, update, and check liveness + // of member tasks in the cluster. + virtual void SetDistributedManager( + std::unique_ptr distributed) = 0; + + virtual ImmediateExecutionDistributedManager* GetDistributedManager() = 0; +#endif // !IS_MOBILE_PLATFORM + + protected: + explicit ImmediateExecutionContext(AbstractContextKind kind) + : AbstractContext(kind) {} + ~ImmediateExecutionContext() override {} +}; + +namespace internal { +struct ImmediateExecutionContextDeleter { + void operator()(ImmediateExecutionContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using ImmediateContextPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_distributed_manager.h b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_distributed_manager.h new file mode 100644 index 00000000..f4f4f093 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_distributed_manager.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_DISTRIBUTED_MANAGER_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_DISTRIBUTED_MANAGER_H_ + +#include +#include + +#include "tensorflow/core/platform/status.h" + +namespace tsl { +class CoordinationServiceAgent; +} + +namespace tensorflow { +class ImmediateExecutionContext; +class ServerDef; +class WorkerEnv; +class WorkerCacheInterface; + +class ImmediateExecutionDistributedManager { + public: + virtual ~ImmediateExecutionDistributedManager() {} + + // Set up distributed execution environment on local and remote tasks. + // When `reset_context` is true, initialize new cluster context state based + // on cluster configurations provided in `server_def`; otherwise, update + // existing context state with the provided `server_def`. 
Contexts created + // on remote tasks will be considered stale and garbage collected after + // `keep_alive_secs` of inactivity. + virtual absl::Status SetOrUpdateServerDef( + const ServerDef& server_def, bool reset_context, int keep_alive_secs, + int64_t init_timeout_in_ms, int retries, + bool clear_existing_contexts = false) = 0; + + // Initializes context for the local worker and no contexts will be created + // for remote workers. Currently this only works for resetting context. + // TODO(b/289445025): Consider removing this when we find a proper fix. + virtual absl::Status InitializeLocalOnlyContext(const ServerDef& server_def, + int keep_alive_secs) = 0; + + // Set up a multi-client distributed execution environment. Must be called + // on all tasks in the cluster. This call internally coordinates with other + // tasks to initialize the eager context and TF server for multi-client + // execution. + virtual absl::Status EnableCollectiveOps(const ServerDef& server_def) = 0; + + // Check if the remote task is alive. + virtual absl::Status CheckRemoteAlive(const std::string& remote_task_name, + bool* is_alive) = 0; + + // Get pointer to the coordination service agent instance. + virtual tsl::CoordinationServiceAgent* GetCoordinationServiceAgent() = 0; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_DISTRIBUTED_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_operation.h b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_operation.h new file mode 100644 index 00000000..fb76af9d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_operation.h @@ -0,0 +1,104 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ + +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +struct TFE_Op; + +namespace tensorflow { + +class ImmediateExecutionContext; +class AbstractOpAttrs; + +// Abstract interface to an operation. +class ImmediateExecutionOperation : public AbstractOperation { + public: + virtual void Clear() = 0; + + // Returns the inputs of this op. 
+ virtual absl::Span GetInputs() + const = 0; + virtual absl::Status SetInput(size_t index, + ImmediateExecutionTensorHandle* input) = 0; + + virtual ImmediateExecutionContext* GetContext() const = 0; + + // Following two methods are used to support custom device. + // Return true if the inputs contain custom device tensor handle. It means + // that the argument need to be handled by a custom device. + virtual bool HasCustomDeviceInput() const = 0; + + virtual const tensorflow::OpDef* OpDef() const = 0; + + virtual absl::Status InputLength(const char* input_name, int* length) = 0; + virtual absl::Status OutputLength(const char* output_name, int* length) = 0; + + // Set stack trace to be used for potential async error reporting. + virtual void SetStackTrace(ManagedStackTrace stack_trace) = 0; + + virtual const tensorflow::AbstractOpAttrs* GetOpAttrs() const = 0; + virtual void AddAttrs(const AbstractOpAttrs* op_attrs) = 0; + + virtual void SetCancellationManager( + CancellationManager* cancellation_manager) = 0; + + // Returns the stack trace set by `SetStackTrace` if exists. + virtual absl::optional GetStackTrace() = 0; + + virtual void SetStepId(int64_t step_id) = 0; + + // For LLVM style RTTI. + static bool classof(const AbstractOperation* ptr) { + return ptr->getKind() == kEager || ptr->getKind() == kTfrt; + } + + protected: + explicit ImmediateExecutionOperation(AbstractOperationKind kind) + : AbstractOperation(kind) {} + ~ImmediateExecutionOperation() override {} +}; + +namespace internal { +struct ImmediateExecutionOperationDeleter { + void operator()(ImmediateExecutionOperation* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using ImmediateOpPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_tensor_handle.h b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_tensor_handle.h new file mode 100644 index 00000000..61fc0fe8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ + +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Abstract interface to a TensorHandle. +// +// A TensorHandle is management class around a Tensor which may track additional +// metadata and synchronization. +// +// This allows us to hide concrete implementations of TensorHandle from header +// files. 
The interface lists the common functionality that must be provided by +// any concrete implementation. However, in cases where the true concrete class +// is needed a static_cast can be applied. +class ImmediateExecutionTensorHandle : public AbstractTensorHandle { + public: + // Returns number of dimensions. + virtual absl::Status NumDims(int* num_dims) const = 0; + // Returns number of elements across all dimensions. + virtual absl::Status NumElements(int64_t* num_elements) const = 0; + // Returns size of specified dimension + // + // -1 indicates an unknown axis length; this is unreachable for most standard + // ImmediateExecutionTensorHandles, but comes up for example when computing + // the shape of a parallel tensor with component shapes differing across + // devices. + virtual absl::Status Dim(int dim_index, int64_t* dim) const = 0; + + // Returns the device which created the handle. + virtual const char* DeviceName(absl::Status* status) const = 0; + // Returns the device where the tensor was placed. + virtual const char* BackingDeviceName(absl::Status* status) const = 0; + // Returns the device type which created the handle. + virtual const char* DeviceType(absl::Status* status) const = 0; + // Returns the device ID which created the handle. + virtual int DeviceId(absl::Status* status) const = 0; + // Returns a tensor for the handle. If tensor is remote, it will be copied. + virtual AbstractTensorInterface* Resolve(absl::Status* status) = 0; + + std::string DebugString() const override; + + // Returns a Boolean hint indicating whether callers should prefer + // `SummarizeValue` to resolving this handle and formatting the tensor. + // + // For example some tensor handles may represent distributed values, in which + // case placement information is lost when resolving the handle. + // + // If false, a caller might implement pretty-printing by resolving and + // iterating over the resulting tensor. This may still be viable if resolving + // the handle loses information, but `SummarizeValue` would be more precise. + virtual bool PreferCustomSummarizer() const { return false; } + + // Returns a string which summarizes the value of this TensorHandle, for + // debugging. Does not include a shape or dtype. + // + // Included in the default implementation of DebugString. + virtual absl::Status SummarizeValue(std::string& summary) const; + + // For LLVM style RTTI. + static bool classof(const AbstractTensorHandle* ptr) { + return ptr->getKind() == kEager || ptr->getKind() == kTfrt; + } + + protected: + explicit ImmediateExecutionTensorHandle(AbstractTensorHandleKind kind) + : AbstractTensorHandle(kind) {} + ~ImmediateExecutionTensorHandle() override {} +}; + +namespace internal { +struct ImmediateExecutionTensorHandleDeleter { + void operator()(ImmediateExecutionTensorHandle* p) const { + if (p != nullptr) { + p->Unref(); + } + } +}; +} // namespace internal + +using ImmediateTensorHandlePtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device.h b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device.h new file mode 100644 index 00000000..b8e571b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device.h @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_H_ +#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_H_ + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" + +namespace tensorflow { +namespace parallel_device { + +// Allocate a parallel device named `device_name` which forwards operations to +// `underlying_devices`, maintaining "parallel tensors" with components placed +// on each underlying device. +// +// For example if `device_name` is +// "/job:localhost/replica:0/task:0/device:CUSTOM:0" +// and `underlying_devices` is +// {"/job:localhost/replica:0/task:0/device:GPU:0", +// "/job:localhost/replica:0/task:0/device:GPU:1"} +// Then executing an operation on CUSTOM:0 will execute it on GPU:0 and GPU:1. +// +// Implicit copies onto `device_name` are allowed, replicating the value once +// per device in `underlying_devices`. Implicit copies off of the device throw +// an error. +// +// All component tensors must have the same dtype. Currently they must also have +// the same shape, although this requirement may be relaxed in the future. +// +// `device_name` must not name an existing physical or custom device (see +// the documentation for TFE_RegisterCustomDevice for more information). +// +// Tensors may be copied on or off the device explicitly using +// TPUReplicatedInput and TPUReplicatedOutput respectively. For example, with +// two component devices, running `x = TPUReplicatedInput(inputs=[a, b])` on the +// parallel device creates a parallel tensor `x` with `a` on the first of +// `underlying_devices` and `b` on the second. Running `a_unpacked, b_unpacked = +// TPUReplicatedOutput(input=x, num_replicas=2)` un-packs the parallel tensor +// into its components. +// +// The filled `device` struct and the allocated `device_info` struct may be +// passed to TFE_RegisterCustomDevice. The `device_name` arguments must match. +void AllocateParallelDevice(const char* device_name, + const char* const* underlying_devices, + int num_underlying_devices, + TFE_CustomDevice* device, void** device_info); + +} // namespace parallel_device +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_lib.h new file mode 100644 index 00000000..03845d15 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -0,0 +1,299 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_LIB_H_ +#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_LIB_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "absl/types/variant.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/safe_ptr.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace parallel_device { + +using TensorHandlePtr = tensorflow::Safe_TFE_TensorHandlePtr; + +class ParallelTensor; +class DeviceThread; + +// Forwards operations to `devices`, maintaining ParallelTensor with components +// placed on each underlying device. +class ParallelDevice { + public: + // Eager async execution is only supported when remote eager is not in use + // (b/157523095). + explicit ParallelDevice(const std::vector& devices, + bool is_async = false, int in_flight_nodes_limit = 0); + + ~ParallelDevice(); + + // Helper to copy a tensor handle from another device once for each component + // of the ParallelDevice. + // + // Sets a bad status and returns a nullptr if `tensor` is already on the + // ParallelDevice, or if the individual copies fail. + std::unique_ptr CopyToParallelDevice(TFE_Context* context, + TFE_TensorHandle* tensor, + TF_Status* status) const; + + // Construct a parallel tensor consisting of the scalar values from `values`. + template + std::unique_ptr ScalarsFromSequence( + absl::Span values, TFE_Context* context, + TF_Status* status) const; + + // A parallel tensor with scalar integers numbering component devices. + std::unique_ptr DeviceIDs(TFE_Context* context, + TF_Status* status) const; + + // The number of devices operations run on. + size_t num_underlying_devices() const { return underlying_devices_.size(); } + + // The devices operations run on. + const std::vector& underlying_devices() const { + return underlying_devices_; + } + + // Takes a description of a single operation being executed on the + // ParallelDevice, and in turn runs one operation per component device with + // its corresponding inputs from the input ParallelTensors. Wraps the + // resulting per-device and per-output TFE_TensorHandles into one + // ParallelTensor per output of the original operation. + // + // Attributes are forwarded to executed operations unmodified. + // + // The returned optional has a value if and only if `status` evaluates to + // TF_OK. Bad statuses are forwarded from underlying `TFE_Execute` calls, or + // if sanity checks on dtypes/metadata fail. 
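+  //
+  // Illustrative call sequence (a sketch only; `context`, `device`, `x`, `y`
+  // and `forwarded_attrs` are hypothetical, and error handling is elided):
+  //
+  //   TF_Status* status = TF_NewStatus();
+  //   std::vector<ParallelTensor*> inputs{x.get(), y.get()};
+  //   auto outputs = device.Execute(context, inputs, "Mul",
+  //                                 /*attributes=*/forwarded_attrs,
+  //                                 /*expected_max_outputs=*/1, status);
+  //   if (TF_GetCode(status) == TF_OK && outputs.has_value()) {
+  //     ParallelTensor* product = (*outputs)[0].get();  // one component per device
+  //   }
+  //   TF_DeleteStatus(status);
+  //
+  // where `forwarded_attrs` would typically be the TFE_OpAttrs captured from
+  // the op being rewritten onto this parallel device.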
+ absl::optional>> Execute( + TFE_Context* context, const std::vector& inputs, + const char* operation_name, const TFE_OpAttrs* attributes, + int expected_max_outputs, TF_Status* status) const; + + // A non-blocking version of `Execute`. After each call, `Join` must be called + // before `StartExecute` is called again. Using `StartExecute` with `Join` + // allows the caller to schedule computation on multiple ParallelDevices + // without sequencing those operations (first call `StartExecute` on each + // parallel device, then call `Join` on each; even if some of the `Join`s + // return a bad status the caller must run all of the `Join`s or any future + // `StartExecute`s will deadlock). + // + // If `is_async=false` (constructor argument), `cancellation_manager` must + // live until `Join` finishes. If `is_async=true` it must live until `Join` is + // followed by `TFE_ContextAsyncWait` to clear pending operations. It will be + // used to cancel all other operations if any fails. + // + // Set step_id to configure the step id used for rendezvous creation. step id + // of value -1 is reserved for global rendezvous and should not be set here. + void StartExecute(TFE_Context* context, + const std::vector& inputs, + const char* operation_name, const TFE_OpAttrs* attributes, + int expected_max_outputs, + CancellationManager& cancellation_manager, + std::optional step_id = std::nullopt) const; + + void StartExecute(TFE_Context* context, + const std::vector>& inputs, + const char* operation_name, const TFE_OpAttrs* attributes, + int expected_max_outputs, + CancellationManager& cancellation_manager, + std::optional step_id = std::nullopt) const; + + // Blocks until the previous `StartExecute` has run `TFE_Execute` on each + // device. If is_async=false (constructor argument) this means the ops have + // run and have results. If is_async=true it means that all of the + // device-specific executors have scheduled the op. + // + // Accepts inferred shapes for outputs (`expected_output_shapes`), which if + // fully defined will avoid querying the shapes of the underlying + // TensorHandles when ParallelTensor::Shape is called. This allows async + // computation to continue without blocking. + // + // The return status and value is the same as `Execute`. + absl::optional>> Join( + const std::vector& expected_output_shapes, + TF_Status* status) const; + + void AsyncWait(TFE_Context* context, TF_Status* status) const; + + // Device strings for component devices that only include a + // worker/task/replica if any of those differ across components. Useful for + // printing debug messages. + std::vector SummarizeDeviceNames() const; + + private: + // A sequence of device names, indicating which devices replicated operations + // are forwarded to. + const std::vector underlying_devices_; + // A sequence of thread wrappers, one per device, for executing operations in + // parallel. + // + // Conceptually this is a thread pool with one thread per device. It requires + // less synchronization than a thread pool would for this task, since Execute + // acquires each thread in order (and so only one Execute will schedule + // blocking collective operations at a time), and avoids some dynamic + // allocation/scheduling. + // + // TODO(allenl): Keep a map from outer thread to list of inner threads rather + // than a single list of threads so aliased nested parallel devices don't + // re-use a thread. + std::vector> device_threads_; + // A cancellation manager to use if the caller does not provide one. 
When ops + // are executed asynchronously this must outlive the queued op, so it can't be + // function-local to Execute. + mutable std::unique_ptr default_cancellation_manager_; +}; + +// Contains a tuple of tensors, one on each of the `underlying_devices_` of the +// ParallelDevice. +class ParallelTensor { + public: + // Construct a ParallelTensor from TensorHandles placed on the component + // devices of a ParallelDevice. If called, ParallelTensor::Shape inspects + // `components` to determine a shape. + static std::unique_ptr FromTensorHandles( + const ParallelDevice& parallel_device, + std::vector components, TF_Status* status); + // Uses the provided shape without additional checks, which avoids blocking + // when ParallelTensor::Shape is called. + static std::unique_ptr FromTensorHandles( + const ParallelDevice& parallel_device, + std::vector components, absl::Span shape, + TF_Status* status); + + size_t num_tensors() const { return tensors_.size(); } + TFE_TensorHandle* tensor(size_t index) const { return tensors_[index].get(); } + + // If the `shape` argument to `FromTensorHandles` is specified, returns that. + // + // Otherwise if all of the tensors have the same shape, returns that via the + // `shape` output argument. This blocks waiting for async tensors, may return + // a delayed bad status encountered during async execution, and will return a + // bad status unless all tensors have the same shape. + absl::Status Shape(const std::vector** shape) const; + TF_DataType dtype() const { return dtype_; } + + // Sets its output argument to a summary of the values of this tensor on every + // component device. + absl::Status SummarizeValue(std::string& summary); + + std::vector release_tensors() { return std::move(tensors_); } + + std::vector tensors() const { + std::vector result; + result.reserve(tensors_.size()); + for (const TensorHandlePtr& tensor : tensors_) { + result.emplace_back(tensor.get()); + } + return result; + } + + private: + ParallelTensor(const ParallelDevice& device, + std::vector tensors, + absl::Span shape, const TF_DataType dtype) + : device_(device), + tensors_(std::move(tensors)), + shape_(std::vector(shape.begin(), shape.end())), + dtype_(dtype) {} + ParallelTensor(const ParallelDevice& device, + std::vector tensors, const TF_DataType dtype) + : device_(device), + tensors_(std::move(tensors)), + shape_(absl::nullopt), + dtype_(dtype) {} + + const ParallelDevice& device_; + std::vector tensors_; + // Parallel tensors are immutable but compute their shape lazily unless it is + // provided on construction. The optional has a value if the lazy computation + // has been completed or the shape was provided on construction. 
+  mutable absl::optional<std::vector<int64_t>> shape_;
+  const TF_DataType dtype_;
+};
+
+template <typename DataType>
+std::unique_ptr<ParallelTensor> ParallelDevice::ScalarsFromSequence(
+    absl::Span<const DataType> values, TFE_Context* context,
+    TF_Status* status) const {
+  std::vector<TensorHandlePtr> components;
+  components.reserve(underlying_devices_.size());
+
+  if (values.size() != num_underlying_devices()) {
+    TF_SetStatus(
+        status, TF_INVALID_ARGUMENT,
+        "Number of values did not match number of underlying devices.");
+    return nullptr;
+  }
+  TF_DataType datatype_enum(
+      static_cast<TF_DataType>(DataTypeToEnum<DataType>().value));
+  for (int device_index = 0; device_index < num_underlying_devices();
+       ++device_index) {
+    auto device_value = absl::make_unique<DataType>();
+    *device_value = values[device_index];
+    std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)> tensor(
+        TF_NewTensor(
+            datatype_enum, /*dims=*/nullptr, /*num_dims=*/0,
+            device_value.release(), sizeof(DataType),
+            [](void* data, size_t, void* arg) {
+              delete reinterpret_cast<DataType*>(data);
+            },
+            nullptr),
+        TF_DeleteTensor);
+    // TODO(allenl): Here and when executing regular operations, we could hold
+    // on to one TFE_Op per device and just call TFE_ResetOp to avoid parsing
+    // device names repeatedly.
+    std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> const_op(
+        TFE_NewOp(context, "Const", status), TFE_DeleteOp);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    TFE_OpSetDevice(const_op.get(), underlying_devices_[device_index].c_str(),
+                    status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    TFE_OpSetAttrType(const_op.get(), "dtype", datatype_enum);
+    TFE_TensorHandle* device_handle;
+    int num_outputs = 1;
+    TFE_Execute(const_op.get(), &device_handle, &num_outputs, status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    components.emplace_back(device_handle);
+  }
+  return ParallelTensor::FromTensorHandles(*this, std::move(components),
+                                           status);
+}
+
+}  // namespace parallel_device
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_LIB_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_testlib.h b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_testlib.h
new file mode 100644
index 00000000..d55a23bd
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/eager/parallel_device/parallel_device_testlib.h
@@ -0,0 +1,172 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ +#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/parallel_device/parallel_device.h" +#include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace parallel_device { + +// A helper for performing common operations on variables. A much more +// restricted stand-in for tf.Variable in Python. +class Variable { + public: + // Construct a Variable from a resource-dtype TFE_TensorHandle and an + // indication of the dtype of the variable's value. + // + // Note that creating this resource-dtype handle can fail, so `Create` is a + // separate static method which returns a status. + Variable(TFE_TensorHandle* handle, TF_DataType type) + : handle_(handle), type_(type) {} + + // Helper for constructing a resource handle and wrapping it in a `Variable` + // object. + static Variable* Create(TFE_Context* context, TF_DataType type, + const int64_t* dims, const int num_dims, + const char* device, TF_Status* status); + // Dereferences the backing buffer for the variable. Note that since this can + // fail (it runs operations), it must be called explicitly and the resulting + // `status` checked. + void Destroy(TFE_Context* context, TF_Status* status); + + // Reads from the variable. + TensorHandlePtr Read(TFE_Context* context, TF_Status* status); + // Assigns a new value to the variable. + void Assign(TFE_Context* context, TFE_TensorHandle* value, TF_Status* status); + // Adds `value` to the existing value of the variable. + void AssignAdd(TFE_Context* context, TFE_TensorHandle* value, + TF_Status* status); + + private: + // Helper for running any single-argument assignment ops (Assign, AssignAdd, + // AssignSub, ...). + void GeneralAssignment(const char* op_name, TFE_Context* context, + TFE_TensorHandle* value, TF_Status* status); + + // The a handle for the resource-dtype tensor pointing to the variable's + // buffer. + TFE_TensorHandle* handle_; + // The dtype of the variable's buffer (input dtype for assignments, output + // dtype of read operations). + TF_DataType type_; +}; + +// Creates a TFE_TensorHandle with value `v`. +TensorHandlePtr FloatTensorHandle(float v, TF_Status* status); + +// Creates a rank-one TFE_TensorHandle with value `v`. +TensorHandlePtr VectorFloatTensorHandle(const std::vector& v, + TF_Status* status); + +// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle. +template +void ExtractPerDeviceValues( + TFE_Context* context, TFE_TensorHandle* input, + std::array* components, TF_Status* status); + +// Helper to pack `num_replicas` TFE_TensorHandles into one parallel handle. +template +TensorHandlePtr CreatePerDeviceValues( + TFE_Context* context, + const std::array& components, + const char* device, TF_Status* status); + +TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, + TFE_TensorHandle* second, TF_Status* status); + +// Assert that `handle` is equal to `expected_value`. 
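+// For example (a sketch; `handle` would come from an earlier TFE_Execute call):
+//
+//   ExpectScalarEq<float>(handle, 3.f);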
+template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value); + +template +void RegisterParallelDevice( + TFE_Context* context, const char* device_name, + const std::array& underlying_devices, + TF_Status* status); + +// Create and modify a variable placed on a parallel device which composes +// `first_device` and `second_device`. +void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, + const char* second_device); + +// Implementations of templated functions ****************************** + +template +TensorHandlePtr CreatePerDeviceValues( + TFE_Context* context, + const std::array& components, + const char* device, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "TPUReplicatedInput", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrInt(op.get(), "N", num_replicas); + for (int i = 0; i < num_replicas; ++i) { + TFE_OpAddInput(op.get(), components[i], status); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return TensorHandlePtr(result_handle); +} + +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr actual_value( + TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); + ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + ASSERT_EQ(TF_TensorType(actual_value.get()), + static_cast(DataTypeToEnum().value)); + EXPECT_EQ(expected_value, + *static_cast(TF_TensorData(actual_value.get()))); +} + +template +void RegisterParallelDevice( + TFE_Context* context, const char* device_name, + const std::array& underlying_devices, + TF_Status* status) { + TFE_CustomDevice device; + void* device_info; + tensorflow::parallel_device::AllocateParallelDevice( + device_name, underlying_devices.data(), underlying_devices.size(), + &device, &device_info); + TFE_RegisterCustomDevice(context, device, device_name, device_info, status); +} + +} // namespace parallel_device +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tape.h b/third_party/tflite-hdrs/tensorflow/c/eager/tape.h new file mode 100644 index 00000000..7ed8025b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tape.h @@ -0,0 +1,1168 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TAPE_H_ +#define TENSORFLOW_C_EAGER_TAPE_H_ + +// Language-agnostic gradient tape. Does not perform backpropagation, just +// maintains the data structures required to do so. 
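+//
+// Rough shape of how a language binding drives the tape (a sketch; the
+// Gradient/BackwardFunction/TapeTensor types, the ids, and the callbacks below
+// are supplied by the binding and are hypothetical here):
+//
+//   GradientTape<Gradient, BackwardFunction, TapeTensor> tape(/*persistent=*/false);
+//   tape.Watch(x_id);
+//   // After running y = op(x), record the op:
+//   tape.RecordOperation("Square", {y_info}, {x_id}, {DT_FLOAT},
+//                        backward_function_getter, backward_function_deleter);
+//   std::vector<Gradient*> result(1);
+//   absl::Status s = tape.ComputeGradient(
+//       vspace, /*target_tensor_ids=*/{y_id}, /*source_tensor_ids=*/{x_id},
+//       /*sources_that_are_targets=*/{}, /*output_gradients=*/{},
+//       absl::MakeSpan(result));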
+ +#include +#include +#include +#include + +#include "tensorflow/core/config/flag_defs.h" +#include "tensorflow/core/config/flags.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace eager { + +// Represents an entry in the tape. +template +struct OpTapeEntry { + string op_type; + std::vector output_tensor_info; + std::vector input_tensor_id; + + // TODO(apassos) consider narrowing down this interface. + BackwardFunction* backward_function; + + // Should be called before deleting the backward function. TODO(apassos) use + // unique_ptrs to ensure this happens. + std::function backward_function_deleter; +}; + +// Map from tensor_id to internally-defined operation-id of the operation which +// produced this tensor. A value of -1 means that the tensor was directly +// watched and not the result of any operation in the tape. +using TensorTape = std::unordered_map; + +// Map from operation-id to tape entry. +template +using OpTape = + std::unordered_map>; + +// Operations the tape needs to perform on tensors to do backpropagation. Named +// "vspace" because a subset of these are related to a vector space, such as +// adding gradients, getting zeroes, etc. Currently cannot be implemented +// without using tensorflow python code, hence left unspecified here. +// +// Gradient is the type returned by gradient functions. In Python TF it's either +// Tensor or IndexedSlices or None, which here we map to nullptr. Gradients need +// to allow their size to be computed and they need to be passable to a backward +// function and deleted (as the backprop code creates lots of gradients the user +// is not interested in). +// +// BackwardFunction needs to be a closure which stores intermediate activations +// from the forward computation and calls a vector-jacobian product function +// (also known as adjoint function) to compute, given downstream gradients, +// upstream gradients. +// +// TODO(apassos) provide concrete template instantiations for TFE_TensorHandle +// specialization, which is blocked by quite a few things needing to loop back +// into python now. +template +class VSpace { + public: + virtual ~VSpace() {} + + // Returns the number of elements in the gradient tensor. + virtual int64_t NumElements(Gradient* tensor) const = 0; + + // Consumes references to the tensors in the gradient_tensors list and returns + // a tensor with the result. + virtual Gradient* AggregateGradients( + gtl::ArraySlice gradient_tensors) const = 0; + + // Calls the passed-in backward function. + // + // `unneeded_gradients` contains sorted list of input indices for which a + // gradient is not required. + virtual absl::Status CallBackwardFunction( + const string& op_type, BackwardFunction* backward_function, + const std::vector& unneeded_gradients, + gtl::ArraySlice output_gradients, + absl::Span result) const = 0; + + // Builds a tensor filled with ones with the same shape and dtype as `t`. + virtual absl::Status BuildOnesLike(const TapeTensor& t, + Gradient** result) const = 0; + + // Looks up the ID of a Gradient. + virtual int64_t TensorId(Gradient* tensor) const = 0; + + // Converts a Gradient to a TapeTensor. 
+ virtual TapeTensor TapeTensorFromGradient(Gradient* gradient) const = 0; + + // Marks the following gradient as a result so it's not consumed by backward + // functions. + virtual void MarkAsResult(Gradient* gradient) const = 0; + + // Deletes the input tensor. + virtual void DeleteGradient(Gradient* gradient) const = 0; +}; + +// Traces the execution of operations, doing eager garbage collection, and +// exporting a full trace so other code can do backpropagation. Not thread-safe. +template +class GradientTape { + public: + // If `persistent` is true, GradientTape will not eagerly delete backward + // functions (and hence the tensors they keep alive). Instead, everything + // is deleted in ~GradientTape. Persistent GradientTapes are useful when + // users want to compute multiple gradients over the same tape. + explicit GradientTape(bool persistent) : persistent_(persistent) {} + ~GradientTape() { + for (const auto& pair : op_tape_) { + pair.second.backward_function_deleter(pair.second.backward_function); + } + } + + // Returns whether any tensor in a list of tensors is being watched and has + // a trainable dtype. + bool ShouldRecord(absl::Span tensor_ids, + absl::Span dtypes) const; + + // Adds this tensor to the list of watched tensors. + // + // This is a no-op if the tensor is already being watched either from an + // earlier call to `GradientTape::Watch` or being an output of an op with + // watched inputs. + void Watch(int64_t tensor_id); + + // Records an operation with inputs `input_tensor_id` and outputs + // `output_tensors` on the tape and marks all its outputs as watched if at + // least one input of the op is watched and has trainable dtype. + // + // op_type is used to decide which of the incoming gradients can be left as + // nullptr instead of building zeros when build_default_zeros_grads == true. + void RecordOperation( + const string& op_type, const std::vector& output_tensors, + absl::Span input_tensor_id, + absl::Span input_dtypes, + const std::function& backward_function_getter, + const std::function& backward_function_deleter); + + void DeleteTrace(int64_t tensor_id); + + // Consumes the internal state of the tape (so cannot be called more than + // once) and produces the gradient of the target tensors with respect to the + // source tensors. The output gradients are used if not empty and not + // null. The result is populated with one tensor per target element. + // When running backward functions, builds zeros-like tensors for + // incoming grads which are nullptrs, unless `build_default_zeros_grads` + // is set to false. + absl::Status ComputeGradient( + const VSpace& vspace, + const absl::Span target_tensor_ids, + const absl::Span source_tensor_ids, + const std::unordered_map& sources_that_are_targets, + gtl::ArraySlice output_gradients, absl::Span result, + bool build_default_zeros_grads = true); + + // Whether the tape is persistent. See ctor for detailed description. + bool IsPersistent() const { return persistent_; } + + private: + TensorTape tensor_tape_; + OpTape op_tape_; + int64_t next_op_id_{0}; + + // Map from tensor id to number of remaining usages (i.e. how many entries in + // the tape refer to it); to aid in tape garbage collection. + std::unordered_map tensor_usage_; + + // If false, all activations are deleted in the first call to ComputeGradient. + // Else, only when this is destructed. + bool persistent_; +}; + +// Describes a callback for special-cased and more efficient jvp computation. 
+// +// Could just be a simple typedef in ForwardAccumulator, but MSVC chokes on +// that. +template +class ForwardFunction + : public std::function&, + std::vector*, bool)> { + public: + template + explicit ForwardFunction(lambda_type lambda) + : std::function&, + std::vector*, bool)>(lambda) {} +}; + +// Computes Jacobian-vector products using forward-mode automatic +// differentiation. +// +// While GradientTape's RecordOperation is trivial, ForwardAccumulator's +// Accumulate runs the gradient computation immediately. +// +// Keeps references to Tensors watched via Watch and computed in Accumulate +// corresponding to output_tensors, and releases these references in its +// destructor. However, waiting until the destructor runs loses the memory +// efficiency of forward-mode autodiff. Instead, language bindings should call +// DeleteGradient as soon as a Tensor which was `Watch`ed or was an output +// Tensor passed to Accumulate goes out of scope. +// +// Not thread-safe. +template +class ForwardAccumulator { + public: + // Does not take ownership of `vspace`, which must outlive the + // ForwardAccumulator. + explicit ForwardAccumulator( + const VSpace& vspace, + bool use_batch) + : vspace_(vspace), use_batch_(use_batch) { + call_state_.emplace(nullptr, false); + } + + virtual ~ForwardAccumulator() { + for (auto accumulated : accumulated_gradients_) { + vspace_.DeleteGradient(accumulated.second); + } + } + + // Tell the forward accumulator to watch tensor_id, with a Tensor tangent + // vector `tangent` of matching shape and dtype. Tangents are the "vector" in + // "Jacobian-vector product"; `Watch`ing a new Tensor and immediately calling + // FetchJVP for it would return `tangent`. + void Watch(int64_t tensor_id, Gradient* tangent); + + // Removes the gradient associated with tensor_id. Should be called when the + // Tensor associated with `tensor_id` is deleted. + void DeleteGradient(int64_t tensor_id); + + // Runs forward autodiff. Should be called whenever a new operation is + // available and the accumulator is active. + // + // Like GradientTape::RecordOperation, this method takes the operation type + // `op_type` (e.g. "Add"), the operation's inputs (`input_tensors`, + // `input_tensor_id`, and `input_dtypes`; the latter two are somewhat + // redundant but taken as arguments to avoid repeatedly fetching these values + // between calls to ShouldRecord and Accumulator), and its outputs + // (`output_tensors`). + // + // If provided, a non-null `forward_function` will be used instead of the + // backward function (`backward_function_getter` / + // `backward_function_deleter`) to compute jvps for this operation. If + // `forward_function` is null, a GradientTape is used on the backward function + // to compute the jvp, which will waste computation when executing eagerly. + // + // Unlike GradientTape::RecordOperation, Accumulate runs gradient computation + // immediately. It stores the results, which feed into Accumulate for future + // operations and may be fetched by calling FetchJVP. ForwardAccumulator + // maintains a reference to these JVPs: if an `output_tensors` Tensor is + // deleted, `DeleteGradient` should be called as soon as possible to free the + // (now inaccessible) corresponding JVPs, but ForwardAccumulator's destructor + // will release remaining references. + // + // This method is not thread-safe (and in general ForwardAccumulator is not + // thread-safe). 
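+  //
+  // Illustrative call sequence (a sketch; the ids, tangents and the
+  // getter/deleter callbacks are hypothetical and come from the language
+  // binding):
+  //
+  //   ForwardAccumulator<Gradient, BackwardFunction, TapeTensor> acc(
+  //       vspace, /*use_batch=*/false);
+  //   acc.Watch(x_id, x_tangent);  // seed the "vector" in the JVP
+  //   // After executing y = Square(x), report the op:
+  //   acc.Accumulate("Square", {x_info}, {y_info}, {x_id}, {DT_FLOAT},
+  //                  /*forward_function=*/nullptr, getter, deleter);
+  //   Gradient* jvp = acc.FetchJVP(y_id);  // borrowed; may be nullptr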
+ absl::Status Accumulate( + const string& op_type, const std::vector& input_tensors, + const std::vector& output_tensors, + absl::Span input_tensor_id, + absl::Span input_dtypes, + const ForwardFunction* forward_function, + const std::function& backward_function_getter, + const std::function& backward_function_deleter); + + // Returns true if `Accumulate` is active somewhere above on the stack and + // there isn't an intervening PushState. This is useful for ordering + // ForwardAccumulators, where more deeply nested accumulators should not see + // computations from less deeply nested accumulators. + bool BusyAccumulating() const { return call_state_.top().accumulating; } + + // Fetches the current Jacobian-vector product associated with `tensor_id`, or + // a nullptr if none is available. + // + // Returns a borrowed reference, i.e. does not run VSpace::MarkAsResult on its + // return value. The caller should increment the reference count before + // deleting the ForwardAccumulator or calling DeleteGradient if keeping a + // persistent reference to a non-null result. + Gradient* FetchJVP(int64_t tensor_id); + + // Indicates whether the forward accumulator should run on an operation with + // the specified inputs and dtypes. + bool ShouldRecord(absl::Span tensor_ids, + absl::Span dtypes); + + // Temporarily push or pop transient state for this accumulator. + // + // Allows an accumulator which is currently processing an operation to + // temporarily reset its state. Without pushing and popping, accumulators + // ignore operations executed as a direct result of their own jvp + // computations. + void PushState() { call_state_.emplace(nullptr, false); } + void PopState() { call_state_.pop(); } + + private: + // Helper for Accumulate: uses a GradientTape to compute forward gradients + // from a backward gradient function. Fills `out_grads` corresponding to + // `output_tensors`. `out_grads` must not be null. + // + // Executes the backward function in order to trace its gradient, which will + // waste computation if executing eagerly (when graph building the unneeded + // computation is pruned). Temporarily sets `backward_tape` so that + // Accumulate will forward op executions to the tape while the backward + // function is running; this effectively adds the backward tape to the active + // set (but does not require complicated callbacks to the language bindings). + absl::Status ForwardpropFromTape( + const string& op_type, const std::vector& output_tensors, + const std::function& backward_function_getter, + const std::function& backward_function_deleter, + const std::vector& in_grads, absl::Span out_grads); + + // Maps from tensor IDs to corresponding JVPs. + std::unordered_map accumulated_gradients_; + // Not owned; provides operations on Tensors which are currently only + // available in language bindings (e.g. Python). + const VSpace& vspace_; + + // Decides if tangents are vectorized or not + bool use_batch_; + + struct AccumulatorCallState { + AccumulatorCallState( + GradientTape* backward_tape, + bool accumulating) + : backward_tape(backward_tape), accumulating(accumulating) {} + // Set temporarily while in the Accumulate method; if backward_tape is not + // nullptr then we forward op executions to it so Accumulate can compute a + // backward pass on its backward function. + // + // Not owned by the ForwardAccumulator. The method which sets + // `backward_tape` keeps ownership. 
+ GradientTape* backward_tape; + // While the Accumulate method is running (accumulating is True), any op + // executions not forwarded to backward_tape should be ignored. + bool accumulating; + }; + // A deque-backed stack, whose element references are not invalidated by + // pushes and pops at the back. + std::stack call_state_; +}; + +// Template instantiations here + +inline bool IsDtypeTrainable(DataType dtype) { + switch (dtype) { + case DT_HALF: + case DT_BFLOAT16: + case DT_FLOAT: + case DT_DOUBLE: + case DT_COMPLEX64: + case DT_COMPLEX128: + case DT_RESOURCE: + case DT_VARIANT: + return true; + case DT_QINT8: + case DT_QINT16: + case DT_QINT32: + case DT_QUINT8: + case DT_QUINT16: + return tensorflow::flags::Global() + .enable_quantized_dtypes_training.value(); + default: + return false; + } +} + +template +bool GradientTape::ShouldRecord( + absl::Span tensor_ids, + absl::Span dtypes) const { + CHECK_EQ(tensor_ids.size(), dtypes.size()); + for (int i = 0; i < tensor_ids.size(); ++i) { + if (tensor_tape_.find(tensor_ids[i]) != tensor_tape_.end()) { + if (IsDtypeTrainable(dtypes[i])) { + return true; + } + } + } + return false; +} + +template +void GradientTape::Watch( + int64_t tensor_id) { + tensor_tape_.emplace(tensor_id, -1); +} + +template +void GradientTape::RecordOperation( + const string& op_type, const std::vector& output_tensors, + absl::Span input_tensor_id, + absl::Span input_dtypes, + const std::function& backward_function_getter, + const std::function& backward_function_deleter) { + if (!ShouldRecord(input_tensor_id, input_dtypes)) { + return; + } + std::vector ids; + ids.reserve(input_tensor_id.size()); + for (int64_t i : input_tensor_id) { + tensor_usage_[i]++; + ids.push_back(i); + } + const int64_t op_id = next_op_id_++; + std::vector tensors; + tensors.reserve(output_tensors.size()); + for (const TapeTensor& o : output_tensors) { + // Note: the tensor can have already been watched and hence be in the tape, + // so we cannot check that we're inserting it here. + tensor_tape_[o.GetID()] = op_id; + tensor_usage_[o.GetID()] = 1; + tensors.push_back(o); + } + op_tape_[op_id] = OpTapeEntry{ + op_type, std::move(tensors), std::move(ids), backward_function_getter(), + backward_function_deleter}; +} + +template +void GradientTape::DeleteTrace( + int64_t tensor_id) { + auto it = tensor_usage_.find(tensor_id); + if (it == tensor_usage_.end()) { + return; + } + it->second--; + if (it->second != 0) { + return; + } + tensor_usage_.erase(it); + auto tensor_op_it = tensor_tape_.find(tensor_id); + if (tensor_op_it == tensor_tape_.end()) { + return; + } + const int64_t op_id = tensor_op_it->second; + if (op_id == -1) { + // Do not delete watched tensors. + return; + } + tensor_tape_.erase(tensor_op_it); + auto op_it = op_tape_.find(op_id); + CHECK(op_it != op_tape_.end()); + for (const auto& output : op_it->second.output_tensor_info) { + if (tensor_usage_.find(output.GetID()) != tensor_usage_.end()) { + // Found a usage for an output, so cannot delete the op. + return; + } + } + for (int64_t id : op_it->second.input_tensor_id) { + DeleteTrace(id); + } + op_it->second.backward_function_deleter(op_it->second.backward_function); + op_tape_.erase(op_it); +} + +// Terminology: +// +// - op: a possibly composite operation, which has an entry in the tape +// - target: dy in dx/dy +// - source: dx in dx/dy +// - tensor: one of the many inputs or outputs of an operation +// +// Below here we do the gradient algorithm. 
It works as follows: +// +// First we filter the tape to just the subset of operations we want to +// differentiate. In the process of doing so we count how many times each Tensor +// is used as an input to an op (so we know when we're done computing gradients +// for that Tensor). We also count, for each tape entry, how many of its output +// Tensors need gradients to be computed (Tensors which are not used do not need +// any gradients to be computed). +// +// Finally, we start a backprop stack with a set of tape entries for which we +// have all gradients available. This set usually is a subset of the set of +// targets (not all since targets which have outputs in the tape will not have +// gradients available initially). +// +// Then we repeatedly pop an entry from the stack, run its backprop, and update +// the gradients of its inputs. Once we have computed all gradients for a single +// input we can mark this input as done, and this can trigger adding an entry to +// the stack if all outputs of that entry are now done. +// +// When the stack is empty we have gradients for all tensors we're interested +// in. + +namespace { + +template +struct BackpropInitialState { + OpTape op_tape; + + // Map from tensor ID to how many references still exist for this tensor in + // the tape. + std::unordered_map tensor_usage_counts; + + // Maps from op ID to how many output tensors of this op still need to have + // their gradients computed. + std::unordered_map op_missing_tensor; +}; + +// If `persistent_tape` is true, op_tape is not changed and none of the +// backwards functions are deleted. +// If `persistent_tape` is false, op_tape is cleared and backwards functions +// not needed for gradient computation are deleted. Backwards functions that +// are needed, are copied and returned in BackpropInitialState. +template +BackpropInitialState PrepareBackprop( + absl::Span target, const TensorTape& tensor_tape, + OpTape* op_tape, + const std::unordered_set& sources_set, bool persistent_tape) { + std::vector tensor_stack; + tensor_stack.reserve(target.size()); + for (auto t : target) { + tensor_stack.push_back(t); + } + BackpropInitialState result; + while (!tensor_stack.empty()) { + int64_t tensor_id = tensor_stack.back(); + tensor_stack.pop_back(); + auto op_id_it = tensor_tape.find(tensor_id); + if (op_id_it == tensor_tape.end()) { + continue; + } + int64_t op_id = op_id_it->second; + auto op_it = op_tape->find(op_id); + auto result_op_it = result.op_tape.find(op_id); + if (op_id == -1 || op_it == op_tape->end() || + result_op_it != result.op_tape.end()) { + continue; + } + CHECK(result.op_tape.emplace(op_id, op_it->second).second); + for (auto it : op_it->second.input_tensor_id) { + auto count_it = result.tensor_usage_counts.find(it); + if (count_it != result.tensor_usage_counts.end()) { + count_it->second++; + } else { + result.tensor_usage_counts[it] = 1; + if (tensor_tape.find(it) != tensor_tape.end()) { + tensor_stack.push_back(it); + } + } + } + if (!persistent_tape) { + op_tape->erase(op_it); + } + } + for (auto& pair : result.tensor_usage_counts) { + auto it = tensor_tape.find(pair.first); + if (it != tensor_tape.end() && it->second != -1) { + result.op_missing_tensor[it->second] += 1; + } + } + if (!persistent_tape) { + // Call destructors for all unneeded gradient functions and + // clear the op_tape. We can clear the tape because ownership of + // backward functions that will be used for gradient computation + // has been transferred to `result`. 
+ for (const auto& op_pair : *op_tape) { + op_pair.second.backward_function_deleter( + op_pair.second.backward_function); + } + op_tape->clear(); + } + return result; +} + +template +std::vector InitialStack( + const OpTape& op_tape, + const std::unordered_map& op_missing_tensor) { + std::vector result; + for (auto& op_entry : op_tape) { + if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) { + result.push_back(op_entry.first); + } + } + return result; +} + +template +absl::Status InitialGradients( + const VSpace& vspace, + absl::Span target_tensor_ids, + const std::unordered_map& sources_that_are_targets, + gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, + const OpTape& op_tape, + std::unordered_map>* result) { + for (int i = 0, end = target_tensor_ids.size(); i < end; ++i) { + const int64_t id = target_tensor_ids[i]; + if (output_gradients.empty() || output_gradients[i] == nullptr) { + auto tensor_it = tensor_tape.find(id); + if (tensor_it != tensor_tape.end() && tensor_it->second != -1) { + auto op_it = op_tape.find(tensor_it->second); + if (op_it == op_tape.end()) { + return errors::Internal( + "Internal state of the gradient tape is invalid: " + "failed to find operation producing a tensor"); + } + bool found = false; + for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) { + if (op_it->second.output_tensor_info[j].GetID() == id) { + found = true; + Gradient* ones_like = nullptr; + TF_RETURN_IF_ERROR(vspace.BuildOnesLike( + op_it->second.output_tensor_info[j], &ones_like)); + (*result)[id].push_back(ones_like); + break; + } + } + if (!found) { + return errors::Internal( + "Internal state of the gradient tape is invalid: " + "none of operations outputs match expected tensor"); + } + } else { + // This target tensor was not generated by any operation recorded on + // the tape, so no gradient needs to be computed from it unless this + // target is also a source. + auto source_tensor = sources_that_are_targets.find(id); + if (source_tensor != sources_that_are_targets.end()) { + Gradient* ones_like = nullptr; + TF_RETURN_IF_ERROR( + vspace.BuildOnesLike(source_tensor->second, &ones_like)); + (*result)[id].push_back(ones_like); + } + } + } else { + (*result)[id].push_back(output_gradients[i]); + } + } + return absl::OkStatus(); +} + +// TODO(agarwal): use an automatic mechanism for handling None arguments to +// gradient functions. +// +// Some gradient functions can accept None arguments for gradients. The +// following maps the operation name to the indices at which the corresponding +// gradient function can accept None values. e.g. FusedBatchNorm outputs 5 +// values and hence receives 5 gradient values during backprop. However the +// gradient function uses only the first of those values and ignores the rest. +// The entry, "FusedBatchNorm": [1, 2, 3, 4], indicates that only the gradient +// corresponding to index 0 is used, and the gradient values at indices 1-4 are +// ignored (and hence can be None). The backprop algorithm can then leverage +// this by not constructing zeros to pass for those indices. 
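+// For example, the backprop loop below performs lookups of this form (sketch):
+//
+//   auto it = FunctionsAcceptingNoneForIndicesMap()->find("FusedBatchNorm");
+//   const bool may_be_none =
+//       it != FunctionsAcceptingNoneForIndicesMap()->end() &&
+//       it->second.find(/*output_index=*/2) != it->second.end();  // true
+//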
+std::unordered_map>* +FunctionsAcceptingNoneForIndicesMap() { + static auto* const m = + new std::unordered_map>({ + {"SoftmaxCrossEntropyWithLogits", {1}}, + {"SparseSoftmaxCrossEntropyWithLogits", {1}}, + {"FusedBatchNorm", {1, 2, 3, 4}}, + }); + return m; +} + +} // namespace + +// If over kMinAggregateCount gradients are accumulated and the total +// memory consumption is over kMinAggregateBytes, do an early aggregation +// so as to release the gradient tensor to save memory. +constexpr int kMinAggregateCount = 4; +constexpr int kMinAggregateBytes = 128 * 1024 * 1024; + +template +absl::Status +GradientTape::ComputeGradient( + const VSpace& vspace, + const absl::Span target_tensor_ids, + const absl::Span source_tensor_ids, + const std::unordered_map& sources_that_are_targets, + gtl::ArraySlice output_gradients, absl::Span result, + bool build_default_zeros_grads) { + std::unordered_set sources_set(source_tensor_ids.begin(), + source_tensor_ids.end()); + BackpropInitialState state = PrepareBackprop( + target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_); + std::vector op_stack = + InitialStack(state.op_tape, state.op_missing_tensor); + std::unordered_map> gradients; + absl::Status s = InitialGradients(vspace, target_tensor_ids, + sources_that_are_targets, output_gradients, + tensor_tape_, state.op_tape, &gradients); + auto cleanup = gtl::MakeCleanup([this, &state]() { + if (!persistent_) { + // Release all backprop functions + for (const auto& pair : state.op_tape) { + pair.second.backward_function_deleter(pair.second.backward_function); + } + } + }); + if (!s.ok()) { + return s; + } + + std::unordered_map gradients_size; + // TODO(apassos) multiple threads could be dequeuing from op_stack at the same + // time, for better CPU backprop performance. + VLOG(1) << "Initial stack:"; + if (VLOG_IS_ON(1)) { + for (auto t : op_stack) { + VLOG(1) << " " << t; + } + } + while (!op_stack.empty()) { + const int64_t op = op_stack.back(); + VLOG(1) << "Popped " << op; + op_stack.pop_back(); + auto op_it = state.op_tape.find(op); + if (op_it == state.op_tape.end()) { + // It is possible for ops to end up on the stack if they are unrelated to + // the target; we should just skip them. 
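[Editor's note: a standalone sketch of the lookup that ComputeGradient performs below when building zero_indices, i.e. deciding which missing output gradients must be materialized as zeros versus left as null for ops listed in FunctionsAcceptingNoneForIndicesMap above. The map contents here copy the FusedBatchNorm entry; the loop bound of 5 outputs is illustrative.]

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

int main() {
  // Same shape as the map above: op name -> output indices whose incoming
  // gradient may be left as None/nullptr.
  const std::unordered_map<std::string, std::unordered_set<int>> accepts_none =
      {{"FusedBatchNorm", {1, 2, 3, 4}}};
  const std::string op = "FusedBatchNorm";
  for (int i = 0; i < 5; ++i) {
    auto it = accepts_none.find(op);
    const bool need_zeros =
        it == accepts_none.end() || it->second.count(i) == 0;
    std::cout << "output " << i
              << (need_zeros ? ": materialize zeros\n" : ": may stay null\n");
  }
}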
+ continue; + } + auto trace = std::move(op_it->second); + state.op_tape.erase(op_it); + std::vector out_gradients; + out_gradients.reserve(trace.output_tensor_info.size()); + std::vector unneeded_gradients; + for (int i = 0, end = trace.input_tensor_id.size(); i < end; i++) { + const auto& in_tensor_id = trace.input_tensor_id[i]; + if (tensor_tape_.find(in_tensor_id) == tensor_tape_.end() && + sources_set.find(in_tensor_id) == sources_set.end()) { + unneeded_gradients.push_back(i); + } + } + + bool any_gradient_nonzero = false; + std::vector zero_indices; + for (int i = 0, end = trace.output_tensor_info.size(); i < end; ++i) { + const int64_t id = trace.output_tensor_info[i].GetID(); + auto grad_it = gradients.find(id); + if (grad_it == gradients.end()) { + out_gradients.push_back(nullptr); + if (build_default_zeros_grads) { + auto func_name_it = + FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); + if (func_name_it == FunctionsAcceptingNoneForIndicesMap()->end() || + func_name_it->second.find(i) == func_name_it->second.end()) { + zero_indices.push_back(i); + } + } + } else { + any_gradient_nonzero = true; + Gradient* new_gradients = nullptr; + if (grad_it->second.size() == 1) { + new_gradients = grad_it->second.at(0); + } else { + new_gradients = vspace.AggregateGradients(grad_it->second); + } + if (sources_set.find(grad_it->first) == sources_set.end()) { + gradients.erase(grad_it); + } else { + grad_it->second.clear(); + grad_it->second.push_back(new_gradients); + vspace.MarkAsResult(new_gradients); + } + out_gradients.push_back(new_gradients); + } + } + VLOG(1) << "Calling gradient function for '" << trace.op_type << "'"; + std::vector in_gradients(trace.input_tensor_id.size()); + DCHECK(build_default_zeros_grads || zero_indices.empty()); + if (any_gradient_nonzero) { + for (const auto i : zero_indices) { + out_gradients[i] = trace.output_tensor_info[i].ZerosLike(); + } + absl::Status s; + s = vspace.CallBackwardFunction(trace.op_type, trace.backward_function, + unneeded_gradients, out_gradients, + absl::MakeSpan(in_gradients)); + if (!persistent_) { + trace.backward_function_deleter(trace.backward_function); + } + if (!s.ok()) { + return s; + } + } else { + if (!persistent_) { + trace.backward_function_deleter(trace.backward_function); + } + for (Gradient* grad : out_gradients) { + if (grad != nullptr) { + vspace.DeleteGradient(grad); + } + } + } + for (int i = 0, end = in_gradients.size(); i < end; ++i) { + const int64_t id = trace.input_tensor_id[i]; + if (in_gradients[i] != nullptr) { + auto& unaggregated_grads = gradients[id]; + unaggregated_grads.push_back(in_gradients[i]); + if (unaggregated_grads.size() > kMinAggregateCount) { + auto size_it = gradients_size.find(id); + int64_t size; + if (size_it == gradients_size.end()) { + size = vspace.NumElements(unaggregated_grads[0]); + gradients_size.emplace(id, size); + } else { + size = size_it->second; + } + if (unaggregated_grads.size() * size * 4 > kMinAggregateBytes) { + Gradient* grad = vspace.AggregateGradients(unaggregated_grads); + unaggregated_grads.clear(); + unaggregated_grads.push_back(grad); + } + } + } + auto usage_count_it = state.tensor_usage_counts.find(id); + if (usage_count_it == state.tensor_usage_counts.end()) { + VLOG(1) << "Tensor " << id << " not used"; + continue; + } + usage_count_it->second--; + if (usage_count_it->second > 0) { + VLOG(1) << "Tensor " << id << " usage count " << usage_count_it->second; + continue; + } + auto tape_it = tensor_tape_.find(id); + if (tape_it == tensor_tape_.end()) { + 
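[Editor's note: a standalone sketch of the arithmetic behind the early-aggregation check above; the `* 4` mirrors the code, which apparently assumes 4-byte (e.g. float32) elements, and the gradient count and size are made-up values.]

#include <cstdint>
#include <iostream>

int main() {
  constexpr int kMinAggregateCount = 4;
  constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
  const int64_t pending_grads = 5;            // unaggregated gradients for one tensor
  const int64_t elements_per_grad = 8000000;  // ~32 MB each at 4 bytes/element
  const bool aggregate_early =
      pending_grads > kMinAggregateCount &&
      pending_grads * elements_per_grad * 4 > kMinAggregateBytes;
  std::cout << std::boolalpha << aggregate_early << "\n";  // true: ~160 MB pending
}

When the check fires, the pending gradients are collapsed into one tensor so the others can be released, trading a little extra aggregation work for lower peak memory.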
VLOG(1) << "Tensor " << id + << " has no associated op. Deleting gradient"; + auto grad_it = gradients.find(id); + if (grad_it != gradients.end()) { + for (auto g : grad_it->second) { + vspace.DeleteGradient(g); + } + gradients.erase(grad_it); + } + continue; + } + const int64_t op_id = tape_it->second; + if (op_id == -1) { + VLOG(1) << "Tensor " << id << " is source"; + continue; + } + auto missing_it = state.op_missing_tensor.find(op_id); + if (missing_it != state.op_missing_tensor.end()) { + missing_it->second--; + VLOG(1) << "Op " << op_id << " missing " << missing_it->second + << " output gradients"; + if (missing_it->second == 0) { + op_stack.insert(op_stack.begin(), op_id); + } + } + } + } + if (!state.op_tape.empty()) { + return tensorflow::errors::Internal("Invalid tape state."); + } + if (result.size() != source_tensor_ids.size()) { + return errors::Internal("Expected result Span to be of size ", + source_tensor_ids.size(), " found ", result.size(), + " in call to Tape::ComputeGradient."); + } + std::unordered_set used_gradient_ids(source_tensor_ids.size()); + for (int i = 0; i < source_tensor_ids.size(); i++) { + int64_t tensor_id = source_tensor_ids[i]; + auto grad_it = gradients.find(tensor_id); + if (grad_it == gradients.end()) { + result[i] = nullptr; + } else { + if (grad_it->second.size() > 1) { + Gradient* grad = vspace.AggregateGradients(grad_it->second); + grad_it->second.clear(); + grad_it->second.push_back(grad); + } + result[i] = grad_it->second[0]; + used_gradient_ids.insert(tensor_id); + } + } + VLOG(1) << "Final gradients size: " + << gradients.size() - used_gradient_ids.size(); + for (const auto& grad_pair : gradients) { + if (used_gradient_ids.find(grad_pair.first) == used_gradient_ids.end()) { + for (const auto& g : grad_pair.second) { + vspace.DeleteGradient(g); + } + } + } + return absl::OkStatus(); +} + +template +bool ForwardAccumulator::ShouldRecord( + absl::Span tensor_ids, + absl::Span dtypes) { + if (call_state_.top().backward_tape != nullptr) { + // If we're forwarding Accumulate calls to backward_tape's RecordOperation, + // we should also delegate ShouldRecord. 
+ return call_state_.top().backward_tape->ShouldRecord(tensor_ids, dtypes); + } + if (call_state_.top().accumulating) { + return false; + } + for (int i = 0; i < tensor_ids.size(); ++i) { + if (accumulated_gradients_.find(tensor_ids[i]) != + accumulated_gradients_.end()) { + if (IsDtypeTrainable(dtypes[i])) { + return true; + } + } + } + return false; +} + +template +absl::Status +ForwardAccumulator::ForwardpropFromTape( + const string& op_type, const std::vector& output_tensors, + const std::function& backward_function_getter, + const std::function& backward_function_deleter, + const std::vector& in_grads, absl::Span out_grads) { + /* This function is approximately equivalent to this Python code: + + forwardprop_aids = tf.ones_like(output_tensors) + with tf.GradientTape() as g: + g.watch(forwardprop_aids) + grad = backward_function(forwardprop_aids) + forward_grads = g.gradient(grad, forwardprop_aids, output_gradients=in_grads) + accumulated_gradients_[ID(output_tensors)] = forward_grads + */ + std::unique_ptr> tape( + new GradientTape(false)); + AccumulatorCallState& call_state = call_state_.top(); + call_state.backward_tape = tape.get(); + auto pop_backward_tape = + gtl::MakeCleanup([&call_state] { call_state.backward_tape = nullptr; }); + std::vector forwardprop_aids; + std::vector sources; + std::unordered_set sources_set; + sources.reserve(output_tensors.size()); + for (const TapeTensor& output_tensor : output_tensors) { + // Ownership of `aid` transferred to CallBackwardFunction below. + Gradient* aid; + if (output_tensor.GetDType() == tensorflow::DT_VARIANT) { + // Note: Needs to be zeros rather than ones since there's currently no + // ones_like for variants. + aid = output_tensor.ZerosLike(); + } else { + // TODO(allenl): Figure out why using zeros_like everywhere causes issues + // for some gradient functions and if there's another way to work around + // it (e.g. conds instead of ifs). The value shouldn't really matter. + TF_RETURN_IF_ERROR(vspace_.BuildOnesLike(output_tensor, &aid)); + } + if (TF_PREDICT_FALSE(aid == nullptr)) { + return tensorflow::errors::Internal( + "Failed to create ones tensor for tensor ", output_tensor.GetID(), + " with dtype ", output_tensor.GetDType()); + } + forwardprop_aids.push_back(aid); + int64_t aid_id = vspace_.TensorId(aid); + sources.push_back(aid_id); + sources_set.insert(aid_id); + tape->Watch(aid_id); + } + std::vector grad(in_grads.size()); + auto delete_grad = gtl::MakeCleanup([&grad, this] { + for (Gradient* tensor : grad) { + this->vspace_.DeleteGradient(tensor); + } + }); + { + std::vector unneeded_gradients; + std::unique_ptr> + backward_function(backward_function_getter(), + backward_function_deleter); + TF_RETURN_IF_ERROR(vspace_.CallBackwardFunction( + op_type, backward_function.get(), unneeded_gradients, forwardprop_aids, + absl::MakeSpan(grad))); + } + + // Stop the tape from recording + pop_backward_tape.release()(); + + std::vector targets; + std::vector used_in_grads; + // We may end up with slightly fewer elements than we reserve, but grad.size() + // should be a reasonably tight upper bound. 
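[Editor's note: a standalone numeric illustration of the double-backprop trick in the Python-equivalent comment above, for the single function y = x*x at x = 3 with input tangent v = 1. The lambda stands in for the op's backward (VJP) function; nothing here is TensorFlow API.]

#include <iostream>

int main() {
  const double x = 3.0;  // primal input
  const double v = 1.0;  // input tangent (plays the role of in_grads)
  // VJP of y = x * x: maps an output cotangent u to 2 * x * u.
  auto backward = [&](double u) { return 2.0 * x * u; };
  // The backward function is linear in the forwardprop aid u, so its
  // derivative with respect to u can be read off by evaluating it at 1.
  const double d_backward_d_aid = backward(1.0);
  const double jvp = d_backward_d_aid * v;  // gradient of backward w.r.t. the aid, scaled by in_grad
  std::cout << jvp << "\n";                 // prints 6 = dy/dx * v at x = 3
}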
+ targets.reserve(grad.size()); + used_in_grads.reserve(grad.size()); + std::unordered_map sources_that_are_targets; + for (int grad_index = 0, end = grad.size(); grad_index < end; ++grad_index) { + Gradient* grad_tensor = grad[grad_index]; + if (grad_tensor != nullptr) { + int64_t tensor_id = vspace_.TensorId(grad_tensor); + targets.push_back(tensor_id); + if (sources_set.find(tensor_id) != sources_set.end()) { + sources_that_are_targets.emplace( + tensor_id, vspace_.TapeTensorFromGradient(grad_tensor)); + } + Gradient* in_grad = in_grads[grad_index]; + if (in_grad != nullptr) { + // ComputeGradient steals a reference + vspace_.MarkAsResult(in_grad); + } + used_in_grads.push_back(in_grad); + } + } + + return tape->ComputeGradient(vspace_, targets, sources, + sources_that_are_targets, used_in_grads, + out_grads); +} + +template +absl::Status +ForwardAccumulator::Accumulate( + const string& op_type, const std::vector& input_tensors, + const std::vector& output_tensors, + absl::Span input_tensor_id, + absl::Span input_dtypes, + const ForwardFunction* forward_function, + const std::function& backward_function_getter, + const std::function& backward_function_deleter) { + if (call_state_.top().backward_tape != nullptr) { + // If backward_tape is not null, then this call to Accumulate is the result + // of a still-active call to Accumulate which is running operations. We + // forward these operations to backward_tape so the outer Accumulate call + // can do its work. + // + // Rather than re-entering and delegating Accumulate like this, we could + // instead allow ForwardAccumulator some control over the current tape set + // (so it can deactivate itself and activate its GradientTape). Currently + // that is managed by the language binding and would require relatively + // messy callbacks. + call_state_.top().backward_tape->RecordOperation( + op_type, output_tensors, input_tensor_id, input_dtypes, + backward_function_getter, backward_function_deleter); + return absl::OkStatus(); + } + if (!ShouldRecord(input_tensor_id, input_dtypes)) { + return absl::OkStatus(); + } + + // We may need to allocate zero inputs for trainable dtypes we don't have JVPs + // for. Make sure they get cleaned up. + std::vector new_zeros; + auto delete_new_zeros = gtl::MakeCleanup([&new_zeros, this] { + for (Gradient* tensor : new_zeros) { + this->vspace_.DeleteGradient(tensor); + } + }); + std::vector in_grads; + in_grads.reserve(input_tensors.size()); + for (int target_index = 0; target_index < input_tensors.size(); + ++target_index) { + const auto current_grad = + accumulated_gradients_.find(input_tensors[target_index].GetID()); + if (current_grad == accumulated_gradients_.end()) { + if (IsDtypeTrainable(input_tensors[target_index].GetDType())) { + // ForwardAccumulator defaults to zeros for unwatched Tensors, unlike + // GradientTape which uses ones. + Gradient* zero = input_tensors[target_index].ZerosLike(); + new_zeros.push_back(zero); + in_grads.push_back(zero); + } else { + in_grads.push_back(nullptr); + } + } else { + in_grads.push_back(current_grad->second); + } + } + + // Avoid infinite recursion. Whichever forward function we run, it'll end up + // executing ops, and we don't want to watch those with this accumulator. + call_state_.emplace(nullptr, true); + auto pop_call_state = gtl::MakeCleanup([this] { this->call_state_.pop(); }); + + std::vector forward_grads; + if (forward_function == nullptr) { + // We have no special-cased forward gradient. 
Fall back to running the + // backward function under a gradient tape. + forward_grads.resize(output_tensors.size()); + TF_RETURN_IF_ERROR(ForwardpropFromTape( + op_type, output_tensors, backward_function_getter, + backward_function_deleter, in_grads, absl::MakeSpan(forward_grads))); + } else { + TF_RETURN_IF_ERROR( + (*forward_function)(in_grads, &forward_grads, use_batch_)); + } + for (int i = 0; i < forward_grads.size(); ++i) { + if (forward_grads[i] != nullptr) { + int64_t tensor_id = output_tensors[i].GetID(); + auto existing = accumulated_gradients_.find(tensor_id); + if (existing != accumulated_gradients_.end()) { + // This is a somewhat odd case to be in, since it means we have two + // operations which supposedly both created the same Tensor. It comes up + // in recompute_grad, where the gradients have the same value. However, + // only the original gradient is connected to everything else, so we + // should still use that. + vspace_.DeleteGradient(forward_grads[i]); + } else { + accumulated_gradients_[output_tensors[i].GetID()] = forward_grads[i]; + } + } + } + return absl::OkStatus(); +} + +template +void ForwardAccumulator::Watch( + int64_t tensor_id, Gradient* tangent) { + typename std::unordered_map::iterator existing = + accumulated_gradients_.find(tensor_id); + vspace_.MarkAsResult(tangent); + if (existing == accumulated_gradients_.end()) { + accumulated_gradients_.emplace(tensor_id, tangent); + } else { + std::array to_aggregate; + to_aggregate[0] = tangent; + to_aggregate[1] = existing->second; + // AggregateGradients steals a reference to each of its arguments. We + // MarkAsResult on `tangent` above so we don't steal a reference to it. + existing->second = vspace_.AggregateGradients(to_aggregate); + } +} + +template +void ForwardAccumulator::DeleteGradient( + int64_t tensor_id) { + auto existing = accumulated_gradients_.find(tensor_id); + if (existing != accumulated_gradients_.end()) { + vspace_.DeleteGradient(existing->second); + accumulated_gradients_.erase(existing); + } +} + +template +Gradient* ForwardAccumulator::FetchJVP( + int64_t tensor_id) { + auto lookup = accumulated_gradients_.find(tensor_id); + if (lookup == accumulated_gradients_.end()) { + return nullptr; + } else { + return lookup->second; + } +} + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TAPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_cancellation_manager_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_cancellation_manager_internal.h new file mode 100644 index 00000000..6fdecd78 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_cancellation_manager_internal.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/core/framework/cancellation.h" + +struct TFE_CancellationManager; +typedef struct TFE_CancellationManager TFE_CancellationManager; + +namespace tensorflow { +DEFINE_CONVERSION_FUNCTIONS(tensorflow::CancellationManager, + TFE_CancellationManager); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::CancellationManager*, + TFE_CancellationManager*); +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_CANCELLATION_MANAGER_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_context_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_context_internal.h new file mode 100644 index 00000000..1f203531 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_context_internal.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/immediate_execution_context.h" + +// Wraps a pointer to a context implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying context object. Instead, call +// TFE_DeleteContext who calls Release() on the context pointer and deletes +// the TFE_Context structure. +typedef struct TFE_Context TFE_Context; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionContext, TFE_Context); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_executor_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_executor_internal.h new file mode 100644 index 00000000..7f55532a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_executor_internal.h @@ -0,0 +1,39 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
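[Editor's note: a hedged sketch of how the wrap/unwrap helpers generated by DEFINE_CONVERSION_FUNCTIONS for TFE_Context above are typically used inside the C API layer. The example namespace and function names are made up; only the generated tensorflow::wrap/unwrap calls come from the header.]

#include "tensorflow/c/eager/tfe_context_internal.h"

namespace example {
// unwrap/wrap are generated by DEFINE_CONVERSION_FUNCTIONS above.
tensorflow::ImmediateExecutionContext* Unwrapped(TFE_Context* ctx) {
  return tensorflow::unwrap(ctx);
}
TFE_Context* Wrapped(tensorflow::ImmediateExecutionContext* ctx) {
  return tensorflow::wrap(ctx);
}
}  // namespace example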
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ + +#include + +#include "tensorflow/core/common_runtime/eager/eager_executor.h" + +struct TFE_Executor { + explicit TFE_Executor(bool async, bool enable_streaming_enqueue, + int in_flight_nodes_limit) + : owned_executor(new tensorflow::EagerExecutor( + async, enable_streaming_enqueue, in_flight_nodes_limit)) {} + + explicit TFE_Executor(tensorflow::EagerExecutor* executor) + : owned_executor(nullptr), unowned_executor(executor) {} + + tensorflow::EagerExecutor* executor() { + return owned_executor == nullptr ? unowned_executor : owned_executor.get(); + } + + std::unique_ptr owned_executor; + tensorflow::EagerExecutor* unowned_executor; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_EXECUTOR_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_internal.h new file mode 100644 index 00000000..e33eaa23 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_internal.h @@ -0,0 +1,152 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/platform/types.h" + +struct TFE_MonitoringCounterCell { + tensorflow::monitoring::CounterCell cell; +}; + +template +struct TFE_MonitoringCounter { + template + TFE_MonitoringCounter(const char* name, const char* description, + LabelDesc&&... label) { + counter = absl::WrapUnique(tensorflow::monitoring::Counter::New( + name, description, label...)); + } + + std::unique_ptr> counter; +}; + +struct TFE_MonitoringCounter0 : TFE_MonitoringCounter<0> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; +struct TFE_MonitoringCounter1 : TFE_MonitoringCounter<1> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; +struct TFE_MonitoringCounter2 : TFE_MonitoringCounter<2> { + using TFE_MonitoringCounter::TFE_MonitoringCounter; +}; + +struct TFE_MonitoringIntGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; +struct TFE_MonitoringStringGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; +struct TFE_MonitoringBoolGaugeCell { + tensorflow::monitoring::GaugeCell cell; +}; + +template +struct TFE_MonitoringGauge { + template + TFE_MonitoringGauge(const char* name, const char* description, + LabelDesc&&... 
label) { + gauge = absl::WrapUnique( + tensorflow::monitoring::Gauge::New( + name, description, label...)); + } + + std::unique_ptr> gauge; +}; + +struct TFE_MonitoringIntGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringIntGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringIntGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringStringGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge3 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringStringGauge4 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringBoolGauge0 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringBoolGauge1 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; +struct TFE_MonitoringBoolGauge2 : TFE_MonitoringGauge { + using TFE_MonitoringGauge::TFE_MonitoringGauge; +}; + +struct TFE_MonitoringBuckets { + explicit TFE_MonitoringBuckets( + std::function(void)> + fn) { + create_buckets = fn; + } + + std::function(void)> + create_buckets; +}; + +struct TFE_MonitoringSamplerCell { + tensorflow::monitoring::SamplerCell cell; +}; + +template +struct TFE_MonitoringSampler { + template + TFE_MonitoringSampler( + const char* name, + std::unique_ptr buckets, + const char* description, LabelDesc&&... label) { + sampler = absl::WrapUnique(tensorflow::monitoring::Sampler::New( + {name, description, label...}, std::move(buckets))); + } + + std::unique_ptr> sampler; +}; + +struct TFE_MonitoringSampler0 : TFE_MonitoringSampler<0> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; +struct TFE_MonitoringSampler1 : TFE_MonitoringSampler<1> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; +struct TFE_MonitoringSampler2 : TFE_MonitoringSampler<2> { + using TFE_MonitoringSampler::TFE_MonitoringSampler; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_MONITORING_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_reader_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_reader_internal.h new file mode 100644 index 00000000..3c63e672 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_monitoring_reader_internal.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
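[Editor's note: a hedged sketch of the underlying tensorflow::monitoring counter API that the TFE_MonitoringCounter wrappers earlier in tfe_monitoring_internal.h delegate to. The metric path "/example/requests" and the label value are made up.]

#include "tensorflow/core/lib/monitoring/counter.h"

void CountRequest() {
  // One label descriptor ("status"); the metric path is invented for this sketch.
  static auto* const requests = tensorflow::monitoring::Counter<1>::New(
      "/example/requests", "Requests seen by the example.", "status");
  requests->GetCell("ok")->IncrementBy(1);
}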
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ + +#include + +#include "tensorflow/core/lib/monitoring/cell_reader.h" + +struct TFE_MonitoringCounterReader { + explicit TFE_MonitoringCounterReader(const char* name) { + counter = std::make_unique< + ::tensorflow::monitoring::testing::CellReader>(name); + } + template + int64_t Read(const LabelType&... labels); + std::unique_ptr<::tensorflow::monitoring::testing::CellReader> + counter; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_attrs_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_attrs_internal.h new file mode 100644 index 00000000..24e3692a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_attrs_internal.h @@ -0,0 +1,39 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/abstract_op_attrs.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/attr_value.pb.h" + +// An equivalent of a tensorflow::NameAttrList protocol buffer, but used in ways +// that sometimes do not require serialization. +typedef struct TFE_OpAttrs TFE_OpAttrs; + +typedef struct TFE_Context TFE_Context; +typedef struct TFE_Op TFE_Op; + +namespace tensorflow { +DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOpAttrs, TFE_OpAttrs); + +// Set an AttrValue on the op. Doesn't handle the list types. +void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, + const tensorflow::AttrValue& default_value, + const char* attr_name, TF_Status* status); +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_OP_ATTRS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_internal.h new file mode 100644 index 00000000..3fe94d35 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_op_internal.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
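[Editor's note: a hedged sketch of the testing-side CellReader that TFE_MonitoringCounterReader above wraps, reading back the made-up metric from the counter sketch earlier; the function name is illustrative.]

#include <cstdint>

#include "tensorflow/core/lib/monitoring/cell_reader.h"

int64_t ReadOkRequests() {
  static auto* const reader =
      new tensorflow::monitoring::testing::CellReader<int64_t>(
          "/example/requests");
  return reader->Read("ok");  // value of the cell labelled "ok"
}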
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" + +// Wraps a pointer to an operation implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying operation object. Instead, call +// TFE_DeleteOp who calls Release() on the operation pointer and deletes +// the TFE_Op structure. +typedef struct TFE_Op TFE_Op; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation, TFE_Op); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation*, TFE_Op*); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensor_debug_info_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensor_debug_info_internal.h new file mode 100644 index 00000000..0c570660 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensor_debug_info_internal.h @@ -0,0 +1,30 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +struct TFE_TensorDebugInfo { + explicit TFE_TensorDebugInfo(const std::vector& dims) + : dev_dims(dims) {} + + // Fully-padded, minor-to-major. + std::vector dev_dims; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_TENSOR_DEBUG_INFO_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensorhandle_internal.h b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensorhandle_internal.h new file mode 100644 index 00000000..308e8c24 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tfe_tensorhandle_internal.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" + +// Wraps a pointer to a tensor handle implementation. +// +// WARNING: Since the underlying object could be ref-counted a user of this +// interface cannot destruct the underlying handle object. Instead, call +// TFE_DeleteTensorHandle who calls Release() on the handle pointer and deletes +// the TFE_TensorHandle structure. +typedef struct TFE_TensorHandle TFE_TensorHandle; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle, + TFE_TensorHandle); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle*, + TFE_TensorHandle*); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/tracing_utils.h b/third_party/tflite-hdrs/tensorflow/c/eager/tracing_utils.h new file mode 100644 index 00000000..1c336322 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/tracing_utils.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_TRACING_UTILS_H_ +#define TENSORFLOW_C_EAGER_TRACING_UTILS_H_ + +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { +namespace tracing { +absl::Status MaybeSetOpName(AbstractOperation*, const char* op_name); +} // namespace tracing +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_TRACING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/eager/unified_api_testutil.h b/third_party/tflite-hdrs/tensorflow/c/eager/unified_api_testutil.h new file mode 100644 index 00000000..2df18c13 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/eager/unified_api_testutil.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_UNIFIED_API_TESTUTIL_H_ +#define TENSORFLOW_C_EAGER_UNIFIED_API_TESTUTIL_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Builds and returns a `TracingContext` using the default tracing impl. +AbstractContext* BuildFunction(const char* fn_name); + +// Creates parameters (placeholders) in the tracing `ctx` using the shape and +// dtype of `inputs`. +absl::Status CreateParamsForInputs( + AbstractContext* ctx, absl::Span inputs, + std::vector* params); + +// A callable that takes tensor inputs and returns zero or more tensor outputs. +using Model = std::function, + absl::Span)>; + +// Runs `model` maybe wrapped in a function call op. This can be thought as +// being equivalent to the following python code. +// +// if use_function: +// outputs = tf.function(model)(inputs) +// else: +// outputs = model(inputs) +absl::Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + bool use_function); + +absl::Status BuildImmediateExecutionContext(bool use_tfrt, + AbstractContext** ctx); + +// Return a tensor handle with given type, values and dimensions. +template +absl::Status TestTensorHandleWithDims(AbstractContext* ctx, const T* data, + const int64_t* dims, int num_dims, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager = + TestTensorHandleWithDims(eager_ctx, data, dims, num_dims); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return absl::OkStatus(); +} + +// Return a scalar tensor handle with given value. +template +absl::Status TestScalarTensorHandle(AbstractContext* ctx, const T value, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager = + TestScalarTensorHandle(eager_ctx, value); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return absl::OkStatus(); +} + +// Places data from `t` into *result_tensor. +absl::Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor); +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_UNIFIED_API_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/env.h b/third_party/tflite-hdrs/tensorflow/c/env.h new file mode 100644 index 00000000..ac6a9e32 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/env.h @@ -0,0 +1,212 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
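[Editor's note: a hedged sketch of how the unified-API test helpers above fit together. The <float, TF_FLOAT> template arguments are an assumption (the template parameter list was collapsed out of the declaration above), and the function name is made up; error propagation uses TF_RETURN_IF_ERROR as the header itself does.]

#include "tensorflow/c/eager/unified_api_testutil.h"

absl::Status MakeScalarAndFetch(TF_Tensor** out) {
  tensorflow::AbstractContext* ctx = nullptr;
  TF_RETURN_IF_ERROR(
      tensorflow::BuildImmediateExecutionContext(/*use_tfrt=*/false, &ctx));
  tensorflow::AbstractTensorHandle* x = nullptr;
  // Assumed template arguments: element type float, dtype TF_FLOAT.
  TF_RETURN_IF_ERROR(
      (tensorflow::TestScalarTensorHandle<float, TF_FLOAT>(ctx, 2.0f, &x)));
  return tensorflow::GetValue(x, out);  // copies the scalar into *out
}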
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_ENV_H_ +#define TENSORFLOW_C_ENV_H_ + +#include +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_file_statistics.h" +#include "tensorflow/c/tf_status.h" + +// -------------------------------------------------------------------------- +// C API for tensorflow::Env. + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_WritableFileHandle TF_WritableFileHandle; +typedef struct TF_StringStream TF_StringStream; +typedef struct TF_Thread TF_Thread; + +typedef struct TF_ThreadOptions { + // Thread stack size to use (in bytes), zero implies that the system default + // will be used. + size_t stack_size; + + // Guard area size to use near thread stacks to use (in bytes), zero implies + // that the system default will be used. + size_t guard_size; + + // The NUMA node to use, -1 implies that there should be no NUMA affinity for + // this thread. + int numa_node; +} TF_ThreadOptions; + +// Creates the specified directory. Typical status code are: +// * TF_OK - successfully created the directory +// * TF_ALREADY_EXISTS - directory already exists +// * TF_PERMISSION_DENIED - dirname is not writable +TF_CAPI_EXPORT extern void TF_CreateDir(const char* dirname, TF_Status* status); + +// Deletes the specified directory. Typical status codes are: +// * TF_OK - successfully deleted the directory +// * TF_FAILED_PRECONDITION - the directory is not empty +TF_CAPI_EXPORT extern void TF_DeleteDir(const char* dirname, TF_Status* status); + +// Deletes the specified directory and all subdirectories and files underneath +// it. This is accomplished by traversing the directory tree rooted at dirname +// and deleting entries as they are encountered. +// +// If dirname itself is not readable or does not exist, *undeleted_dir_count is +// set to 1, *undeleted_file_count is set to 0 and an appropriate status (e.g. +// TF_NOT_FOUND) is returned. +// +// If dirname and all its descendants were successfully deleted, TF_OK is +// returned and both error counters are set to zero. +// +// Otherwise, while traversing the tree, undeleted_file_count and +// undeleted_dir_count are updated if an entry of the corresponding type could +// not be deleted. The returned error status represents the reason that any one +// of these entries could not be deleted. +// +// Typical status codes: +// * TF_OK - dirname exists and we were able to delete everything underneath +// * TF_NOT_FOUND - dirname doesn't exist +// * TF_PERMISSION_DENIED - dirname or some descendant is not writable +// * TF_UNIMPLEMENTED - some underlying functions (like Delete) are not +// implemented +TF_CAPI_EXPORT extern void TF_DeleteRecursively(const char* dirname, + int64_t* undeleted_file_count, + int64_t* undeleted_dir_count, + TF_Status* status); + +// Obtains statistics for the given path. If status is TF_OK, *stats is +// updated, otherwise it is not touched. 
+TF_CAPI_EXPORT extern void TF_FileStat(const char* filename, + TF_FileStatistics* stats, + TF_Status* status); + +// Creates or truncates the given filename and returns a handle to be used for +// appending data to the file. If status is TF_OK, *handle is updated and the +// caller is responsible for freeing it (see TF_CloseWritableFile). +TF_CAPI_EXPORT extern void TF_NewWritableFile(const char* filename, + TF_WritableFileHandle** handle, + TF_Status* status); + +// Closes the given handle and frees its memory. If there was a problem closing +// the file, it is indicated by status. Memory is freed in any case. +TF_CAPI_EXPORT extern void TF_CloseWritableFile(TF_WritableFileHandle* handle, + TF_Status* status); + +// Syncs content of the handle to the filesystem. Blocks waiting for the +// filesystem to indicate that the content has been persisted. +TF_CAPI_EXPORT extern void TF_SyncWritableFile(TF_WritableFileHandle* handle, + TF_Status* status); + +// Flush local buffers to the filesystem. If the process terminates after a +// successful flush, the contents may still be persisted, since the underlying +// filesystem may eventually flush the contents. If the OS or machine crashes +// after a successful flush, the contents may or may not be persisted, depending +// on the implementation. +TF_CAPI_EXPORT extern void TF_FlushWritableFile(TF_WritableFileHandle* handle, + TF_Status* status); + +// Appends the given bytes to the file. Any failure to do so is indicated in +// status. +TF_CAPI_EXPORT extern void TF_AppendWritableFile(TF_WritableFileHandle* handle, + const char* data, + size_t length, + TF_Status* status); + +// Deletes the named file and indicates whether successful in *status. +TF_CAPI_EXPORT extern void TF_DeleteFile(const char* filename, + TF_Status* status); + +// Retrieves the next item from the given TF_StringStream and places a pointer +// to it in *result. If no more items are in the list, *result is set to NULL +// and false is returned. +// +// Ownership of the items retrieved with this function remains with the library. +// Item points are invalidated after a call to TF_StringStreamDone. +TF_CAPI_EXPORT extern bool TF_StringStreamNext(TF_StringStream* list, + const char** result); + +// Frees the resources associated with given string list. All pointers returned +// by TF_StringStreamNext are invalid after this call. +TF_CAPI_EXPORT extern void TF_StringStreamDone(TF_StringStream* list); + +// Retrieves the list of children of the given directory. You can iterate +// through the list with TF_StringStreamNext. The caller is responsible for +// freeing the list (see TF_StringStreamDone). +TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename, + TF_Status* status); + +// Retrieves a list of directory names on the local machine that may be used for +// temporary storage. You can iterate through the list with TF_StringStreamNext. +// The caller is responsible for freeing the list (see TF_StringStreamDone). +TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void); + +// Creates a temporary file name with an extension. +// The caller is responsible for freeing the returned pointer. +TF_CAPI_EXPORT extern char* TF_GetTempFileName(const char* extension); + +// Returns the number of nanoseconds since the Unix epoch. +TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void); + +// Returns the number of microseconds since the Unix epoch. +TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void); + +// Returns the number of seconds since the Unix epoch. 
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void); + +// Populates a TF_ThreadOptions struct with system-default values. +TF_CAPI_EXPORT extern void TF_DefaultThreadOptions(TF_ThreadOptions* options); + +// Returns a new thread that is running work_func and is identified +// (for debugging/performance-analysis) by thread_name. +// +// The given param (which may be null) is passed to work_func when the thread +// starts. In this way, data may be passed from the thread back to the caller. +// +// Caller takes ownership of the result and must call TF_JoinThread on it +// eventually. +TF_CAPI_EXPORT extern TF_Thread* TF_StartThread(const TF_ThreadOptions* options, + const char* thread_name, + void (*work_func)(void*), + void* param); + +// Waits for the given thread to finish execution, then deletes it. +TF_CAPI_EXPORT extern void TF_JoinThread(TF_Thread* thread); + +// \brief Load a dynamic library. +// +// Pass "library_filename" to a platform-specific mechanism for dynamically +// loading a library. The rules for determining the exact location of the +// library are platform-specific and are not documented here. +// +// On success, place OK in status and return the newly created library handle. +// Otherwise returns nullptr and set error status. +TF_CAPI_EXPORT extern void* TF_LoadSharedLibrary(const char* library_filename, + TF_Status* status); + +// \brief Get a pointer to a symbol from a dynamic library. +// +// "handle" should be a pointer returned from a previous call to +// TF_LoadLibraryFromEnv. On success, place OK in status and return a pointer to +// the located symbol. Otherwise returns nullptr and set error status. +TF_CAPI_EXPORT extern void* TF_GetSymbolFromLibrary(void* handle, + const char* symbol_name, + TF_Status* status); + +#ifdef __cplusplus +} +#endif + +#endif // TENSORFLOW_C_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/filesystem_interface.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/filesystem_interface.h new file mode 100644 index 00000000..13fd7632 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/filesystem_interface.h @@ -0,0 +1,1125 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_FILESYSTEM_INTERFACE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_FILESYSTEM_INTERFACE_H_ + +#include +#include + +#include "tensorflow/c/tf_file_statistics.h" +#include "tensorflow/c/tf_status.h" + +/// This is the interop header between core TensorFlow and modular filesystem +/// plugins (see initial RFC https://github.com/tensorflow/community/pull/101). +/// +/// Both core TensorFlow and every plugin will use this header. The associated +/// `.cc` file is only used by core TensorFlow to implement checking needed for +/// plugin registration and ensuring API and ABI compatibility. 
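[Editor's note: before the filesystem plugin interface continues, a hedged usage sketch for the thread helpers declared in env.h above. The thread name, message, and wrapper function are made up; the TF_* calls and signatures are the ones declared in the header.]

#include <cstdio>

#include "tensorflow/c/env.h"

static void Work(void* param) {
  std::printf("%s\n", static_cast<const char*>(param));
}

void RunOneThread() {
  TF_ThreadOptions options;
  TF_DefaultThreadOptions(&options);  // system defaults for stack/guard/NUMA
  static const char kMessage[] = "hello from worker";
  TF_Thread* thread = TF_StartThread(&options, "example-thread", Work,
                                     const_cast<char*>(kMessage));
  TF_JoinThread(thread);  // blocks until Work returns, then frees the thread
}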
Plugin authors +/// don't need to read the `.cc` file but they should consult every section of +/// this file to ensure a compliant plugin can be built and that the plugin can +/// be used without recompilation in the widest range of TensorFlow versions. +/// +/// The header is divided into sections, as follows: +/// 1. Opaque plugin private data structures and wrappers for type safety; +/// 2. Function tables for plugin functionality; +/// 3. Versioning metadata; +/// 4. Plugin registration API and the DSO entry point. + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// SECTION 1. Opaque data structures to hold plugin specific data +/// ---------------------------------------------------------------------------- +/// +/// The following data structures incorporate a `void*` that is opaque to +/// TensorFlow but can be used by each filesystem plugin to represent internal +/// data. +/// +/// We prefer to have these structures instead of passing `void*` into +/// method signatures to have some type of type safety: for example, operations +/// that are only valid on random access files have a `TF_RandomAccessFile` +/// argument. +/// +/// Lifetime: The wrapper data structures are owned by core TensorFlow. The data +/// pointed to by the `void*` members is always owned by the plugin. The plugin +/// will provide functions to call to allocate and deallocate this data (see +/// next sections) and core TensorFlow ensures to call these at the proper time. +/// +/// Plugins will never receive a `TF_*` pointer that is `nullptr`. Core +/// TensorFlow will never touch the `void*` wrapped by these structures, except +/// to initialize it as `nullptr`. + +typedef struct TF_RandomAccessFile { + void* plugin_file; +} TF_RandomAccessFile; + +typedef struct TF_WritableFile { + void* plugin_file; +} TF_WritableFile; + +typedef struct TF_ReadOnlyMemoryRegion { + void* plugin_memory_region; +} TF_ReadOnlyMemoryRegion; + +typedef struct TF_Filesystem { + void* plugin_filesystem; +} TF_Filesystem; + +typedef struct TF_TransactionToken { + void* token; + TF_Filesystem* owner; +} TF_TransactionToken; + +// The named union is needed here (as opposed to +// inside the `TF_Filesystem_Option_Value` struct) +// as MSVC does not recognize `typeof`. +typedef union TF_Filesystem_Option_Value_Union { + int64_t int_val; + double real_val; + struct { + char* buf; + int buf_length; + } buffer_val; +} TF_Filesystem_Option_Value_Union; + +typedef struct TF_Filesystem_Option_Value { + int type_tag; // type of values in the values union + int num_values; // number of values + TF_Filesystem_Option_Value_Union* + values; // owned (plugins must make a copy if storing this) +} TF_Filesystem_Option_Value; + +typedef enum TF_Filesystem_Option_Type { + TF_Filesystem_Option_Type_Int = 0, + TF_Filesystem_Option_Type_Real, + TF_Filesystem_Option_Type_Buffer, + TF_Filesystem_Num_Option_Types, // must always be the last item +} TF_Filesystem_Option_Type; + +typedef struct TF_Filesystem_Option { + char* name; // null terminated, owned + char* description; // null terminated, owned + int per_file; // bool actually, but bool is not a C type + TF_Filesystem_Option_Value* value; // owned +} TF_Filesystem_Option; + +/// SECTION 2. 
Function tables for functionality provided by plugins +/// ---------------------------------------------------------------------------- +/// +/// The following data structures represent the function tables for operations +/// that plugins provide (some are mandatory, some are optional, with or without +/// a default implementation). +/// +/// Each plugin implements the operations that are supported and TensorFlow will +/// properly handle the cases when an operation is not supported (i.e., return +/// the corresponding `Status` value). +/// +/// REQUIRED OPERATIONS: All required operations are marked as such, including +/// operations which are conditionally required. If the presence of an operation +/// `foo` requires operation `bar` to be present, this is specified in `foo`. If +/// the entire set of operations in a table is not provided, use `nullptr` for +/// the struct pointer (e.g., when a file type is not supported). +/// +/// DEFAULT IMPLEMENTATIONS: Some operations have default implementations that +/// TensorFlow uses in case the plugin doesn't supply its own version. An +/// operation `foo` might have a default implementation which uses `bar` and +/// `foobar`. If the plugin supplies `bar` and `foobar`, TensorFlow can use the +/// default implementation of `foo`. +/// +/// During plugin loading, plugins will call the registration function provided +/// by this interface, supplying values for each of these structures. Core +/// TensorFlow checks that the plugin supplies all mandatory operations and +/// then copies these tables to a different memory location, marking the new +/// operation tables as read-only. Once a plugin is loaded, none of these +/// operation pointers may change. +/// +/// There are 4 function tables: one for each of the 3 file objects in +/// TensorFlow (i.e., `RandomAccessFile`, `WritableFile`, +/// `ReadOnlyMemoryRegion`) and one for all the operations a `Filesystem` +/// implements. Each of them is in a 1-to-1 correspondence with the wrapper +/// structures from the first section: these tables only contain function +/// pointers that operate on the corresponding data. Thus, the first argument of +/// each of these functions is a pointer to the paired struct and this argument +/// can be used to track state in between calls (from an object oriented point +/// of view, this can be viewed as a "vtable" for a "class" -- that is the +/// corresponding struct above --; the first argument is in place of `this`). +/// +/// Except where noted otherwise, all pointer arguments are owned by core +/// TensorFlow and are guaranteed to not be `nullptr`. +/// +/// All path-like arguments are null terminated `char*` strings. Plugins can +/// assume that before any function using path arguments is invoked, the path is +/// made canonical by calling the function provided by `translate_name` or a +/// default implementation of that (supplied by core TensorFlow). +/// +/// The only time the pointer to the `TF_*` structures from section 1 is not +/// marked `const` in these functions is when these function are either +/// allocating or deallocating the plugin specific data. That is, in the 4 +/// `cleanup` functions (one for each data structure), the `init` function for +/// `TF_Filesystem` and the `new_*` methods of `TF_FilesystemOps` to initialize +/// the 3 types of files. In all other cases, there is no need to modify the +/// address of the opaque data pointer, hence the wrapper pointer is marked +/// `const`. 
+///
+/// For consistency, the arguments of all these functions follow the same
+/// pattern: first we have the opaque pointer argument ("this" above), then the
+/// input arguments, then the in-out arguments (if any) and we finish the
+/// argument list with the out arguments. We only use the return type for an out
+/// parameter if that is a plain C type, as this ensures ABI compatibility
+/// (returning structures has issues in case compiler options affect
+/// optimizations such as RVO). If a status needs to be returned from these
+/// methods, the last argument is always a `TF_Status *` (or an array of such
+/// pointers) owned by core TensorFlow and guaranteed to not be `nullptr`.
+///
+/// To ensure ABI and API compatibility, we have out-of-band data that is used
+/// by both core TensorFlow and the plugin at load time. We don't include this
+/// data in the structures here to prevent cases when padding/packing enabled by
+/// different compiler options breaks compatibility. For more details about how
+/// this is used, please consult the next sections. Here we just wrap these
+/// tables in lint warnings so that changes here cause changes to the versioning
+/// data as well. Here is a short summary of what changes are allowed:
+///   * adding a new method at the end of a table is allowed at any time;
+///   * any other change to these tables is only allowed on a major TensorFlow
+///     version change (e.g., from 2.x to 3.0). This is provided as an escape
+///     hatch to allow cleaning up these tables. Since any of these changes
+///     break ABI compatibility and cause all plugins to be recompiled, these
+///     types of changes should be extremely rare.
+///
+/// The next section details this as well as some corner cases that are out of
+/// scope for now.
+
+// LINT.IfChange
+typedef struct TF_RandomAccessFileOps {
+  /// Releases resources associated with `*file`.
+  ///
+  /// Requires that `*file` is not used in any concurrent or subsequent
+  /// operations.
+  ///
+  /// This operation must be provided. See "REQUIRED OPERATIONS" above.
+  void (*cleanup)(TF_RandomAccessFile* file);
+
+  /// Reads up to `n` bytes from `*file` starting at `offset`.
+  ///
+  /// The output is in `buffer`, core TensorFlow owns the buffer and guarantees
+  /// that at least `n` bytes are available.
+  ///
+  /// Returns number of bytes read or -1 in case of error. Because of this
+  /// constraint and the fact that `ssize_t` is not defined in `stdint.h`/C++
+  /// standard, the return type is `int64_t`.
+  ///
+  /// This is thread safe.
+  ///
+  /// Note: the `buffer` argument is NOT a null terminated string!
+  ///
+  /// Plugins:
+  ///   * Must set `status` to `TF_OK` if exactly `n` bytes have been read.
+  ///   * Must set `status` to `TF_OUT_OF_RANGE` if fewer than `n` bytes have
+  ///     been read due to EOF.
+  ///   * Must return -1 for any other error and must set `status` to any
+  ///     other value to provide more information about the error.
+  int64_t (*read)(const TF_RandomAccessFile* file, uint64_t offset, size_t n,
+                  char* buffer, TF_Status* status);
+} TF_RandomAccessFileOps;
+// LINT.ThenChange(:random_access_file_ops_version)
+
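+// Illustrative sketch (not part of the interface): a plugin backed by local
+// POSIX files might implement `read` roughly as follows, assuming it keeps an
+// open file descriptor in a hypothetical `PosixFile` struct behind
+// `file->plugin_file`. Error handling is abbreviated; a real plugin would map
+// `errno` to a more specific `TF_Code`.
+//
+//   static int64_t Read(const TF_RandomAccessFile* file, uint64_t offset,
+//                       size_t n, char* buffer, TF_Status* status) {
+//     const PosixFile* posix_file = (const PosixFile*)file->plugin_file;
+//     ssize_t read_bytes = pread(posix_file->fd, buffer, n, (off_t)offset);
+//     if (read_bytes < 0) {
+//       TF_SetStatus(status, TF_UNKNOWN, "pread failed");
+//       return -1;
+//     }
+//     if ((size_t)read_bytes < n) {
+//       TF_SetStatus(status, TF_OUT_OF_RANGE, "Read fewer bytes than requested");
+//     } else {
+//       TF_SetStatus(status, TF_OK, "");
+//     }
+//     return read_bytes;
+//   }
+
+// LINT.IfChange
+typedef struct TF_WritableFileOps {
+  /// Releases resources associated with `*file`.
+  ///
+  /// Requires that `*file` is not used in any concurrent or subsequent
+  /// operations.
+  ///
+  /// This operation must be provided. See "REQUIRED OPERATIONS" above.
+  void (*cleanup)(TF_WritableFile* file);
+
+  /// Appends `buffer` of size `n` to `*file`.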
+ /// + /// Core TensorFlow owns `buffer` and guarantees at least `n` bytes of storage + /// that can be used to write data. + /// + /// Note: the `buffer` argument is NOT a null terminated string! + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if exactly `n` bytes have been written. + /// * Must set `status` to `TF_RESOURCE_EXHAUSTED` if fewer than `n` bytes + /// have been written, potentially due to quota/disk space. + /// * Might use any other error value for `status` to signal other errors. + void (*append)(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status); + + /// Returns the current write position in `*file`. + /// + /// Plugins should ensure that the implementation is idempotent, 2 identical + /// calls result in the same answer. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` and return current position if no error. + /// * Must set `status` to any other value and return -1 in case of error. + int64_t (*tell)(const TF_WritableFile* file, TF_Status* status); + + /// Flushes `*file` and syncs contents to filesystem. + /// + /// This call might not block, and when it returns the contents might not have + /// been fully persisted. + /// + /// DEFAULT IMPLEMENTATION: No op. + void (*flush)(const TF_WritableFile* file, TF_Status* status); + + /// Syncs contents of `*file` with the filesystem. + /// + /// This call should block until filesystem confirms that all buffers have + /// been flushed and persisted. + /// + /// DEFAULT IMPLEMENTATION: No op. + void (*sync)(const TF_WritableFile* file, TF_Status* status); + + /// Closes `*file`. + /// + /// Flushes all buffers and deallocates all resources. + /// + /// Calling `close` must not result in calling `cleanup`. + /// + /// Core TensorFlow will never call `close` twice. + void (*close)(const TF_WritableFile* file, TF_Status* status); +} TF_WritableFileOps; +// LINT.ThenChange(:writable_file_ops_version) + +// LINT.IfChange +typedef struct TF_ReadOnlyMemoryRegionOps { + /// Releases resources associated with `*region`. + /// + /// Requires that `*region` is not used in any concurrent or subsequent + /// operations. + /// + /// This operation must be provided. See "REQUIRED OPERATIONS" above. + void (*cleanup)(TF_ReadOnlyMemoryRegion* region); + + /// Returns a pointer to the memory region. + /// + /// This operation must be provided. See "REQUIRED OPERATIONS" above. + const void* (*data)(const TF_ReadOnlyMemoryRegion* region); + + /// Returns the length of the memory region in bytes. + /// + /// This operation must be provided. See "REQUIRED OPERATIONS" above. + uint64_t (*length)(const TF_ReadOnlyMemoryRegion* region); +} TF_ReadOnlyMemoryRegionOps; +// LINT.ThenChange(:read_only_memory_region_ops_version) + +// LINT.IfChange +typedef struct TF_FilesystemOps { + /// Acquires all resources used by the filesystem. + /// + /// This operation must be provided. See "REQUIRED OPERATIONS" above. + void (*init)(TF_Filesystem* filesystem, TF_Status* status); + + /// Releases all resources used by the filesystem + /// + /// NOTE: TensorFlow does not unload DSOs. Thus, the only way a filesystem + /// won't be registered anymore is if this function gets called by core + /// TensorFlow and the `TF_Filesystem*` object is destroyed. However, due to + /// registration being done in a static instance of `Env`, the destructor of + /// `FileSystem` is never called (see + /// https://github.com/tensorflow/tensorflow/issues/27535). In turn, this + /// function will never be called. 
There are plans to refactor registration + /// and fix this. + /// + /// TODO(b/139060984): After all filesystems are converted, revisit note. + /// + /// This operation must be provided. See "REQUIRED OPERATIONS" above. + void (*cleanup)(TF_Filesystem* filesystem); + + /// Creates a new random access read-only file from given `path`. + /// + /// After this call `file` may be concurrently accessed by multiple threads. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `file` was updated. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to an + /// existing file or one of the parent entries in `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` points to a + /// directory or if it is invalid (e.g., malformed, or has a parent entry + /// which is a file). + /// * Might use any other error value for `status` to signal other errors. + /// + /// REQUIREMENTS: If plugins implement this, they must also provide a filled + /// `TF_RandomAccessFileOps` table. See "REQUIRED OPERATIONS" above. + void (*new_random_access_file)(const TF_Filesystem* filesystem, + const char* path, TF_RandomAccessFile* file, + TF_Status* status); + + /// Creates an object to write to a file with the specified `path`. + /// + /// If the file already exists, it is deleted and recreated. The `file` object + /// must only be accessed by one thread at a time. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `file` was updated. + /// * Must set `status` to `TF_NOT_FOUND` if one of the parents entries in + /// `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` points to a + /// directory or if it is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// REQUIREMENTS: If plugins implement this, they must also provide a filled + /// `TF_WritableFileOps` table. See "REQUIRED OPERATIONS" above. + void (*new_writable_file)(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); + + /// Creates an object to append to a file with the specified `path`. + /// + /// If the file doesn't exists, it is first created with empty contents. + /// The `file` object must only be accessed by one thread at a time. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `file` was updated. + /// * Must set `status` to `TF_NOT_FOUND` if one of the parents entries in + /// `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` points to a + /// directory or if it is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// REQUIREMENTS: If plugins implement this, they must also provide a filled + /// `TF_WritableFileOps` table. See "REQUIRED OPERATIONS" above. + void (*new_appendable_file)(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); + + /// Creates a read-only region of memory from contents of `path`. + /// + /// After this call `region` may be concurrently accessed by multiple threads. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `region` was updated. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to an + /// existing file or one of the parent entries in `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` points to a + /// directory or if it is invalid. + /// * Must set `status` to `TF_INVALID_ARGUMENT` if `path` points to an + /// empty file. 
+ /// * Might use any other error value for `status` to signal other errors. + /// + /// REQUIREMENTS: If plugins implement this, they must also provide a filled + /// `TF_ReadOnlyMemoryRegionOps` table. See "REQUIRED OPERATIONS" above. + void (*new_read_only_memory_region_from_file)(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status); + + /// Creates the directory specified by `path`, assuming parent exists. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if directory was created. + /// * Must set `status` to `TF_NOT_FOUND` if one of the parents entries in + /// `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid. + /// * Must set `status` to `TF_ALREADY_EXISTS` if `path` already exists. + /// * Might use any other error value for `status` to signal other errors. + void (*create_dir)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Creates the directory specified by `path` and all needed ancestors. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if directory was created. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid or + /// if it exists but is not a directory. + /// * Might use any other error value for `status` to signal other errors. + /// + /// NOTE: The requirements specify that `TF_ALREADY_EXISTS` is not returned if + /// directory exists. Similarly, `TF_NOT_FOUND` is not be returned, as the + /// missing directory entry and all its descendants will be created by the + /// plugin. + /// + /// DEFAULT IMPLEMENTATION: Creates directories one by one. Needs + /// `path_exists`, `is_directory`, and `create_dir`. + void (*recursively_create_dir)(const TF_Filesystem* filesystem, + const char* path, TF_Status* status); + + /// Deletes the file specified by `path`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if file was deleted. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` points to a + /// directory or if it is invalid. + /// * Might use any other error value for `status` to signal other errors. + void (*delete_file)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Deletes the empty directory specified by `path`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if directory was deleted. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` does not point + /// to a directory, if `path` is invalid, or if directory is not empty. + /// * Might use any other error value for `status` to signal other errors. + void (*delete_dir)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Deletes the directory specified by `path` and all its contents. + /// + /// This is accomplished by traversing directory tree rooted at `path` and + /// deleting entries as they are encountered, from leaves to root. Each plugin + /// is free to choose a different approach which obtains similar results. + /// + /// On successful deletion, `status` must be `TF_OK` and `*undeleted_files` + /// and `*undeleted_dirs` must be 0. On unsuccessful deletion, `status` must + /// be set to the reason why one entry couldn't be removed and the proper + /// count must be updated. 
If the deletion is unsuccessful because the + /// traversal couldn't start, `*undeleted_files` must be set to 0 and + /// `*undeleted_dirs` must be set to 1. + /// + /// TODO(b/139060984): After all filesystems are converted, consider + /// invariant about `*undeleted_files` and `*undeleted_dirs`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if directory was deleted. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Does a BFS traversal of tree rooted at `path`, + /// deleting entries as needed. Needs `path_exists`, `get_children`, + /// `is_directory`, `delete_file`, and `delete_dir`. + void (*delete_recursively)(const TF_Filesystem* filesystem, const char* path, + uint64_t* undeleted_files, + uint64_t* undeleted_dirs, TF_Status* status); + + /// Renames the file given by `src` to that in `dst`. + /// + /// Replaces `dst` if it exists. In case of error, both `src` and `dst` keep + /// the same state as before the call. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if rename was completed. + /// * Must set `status` to `TF_NOT_FOUND` if one of the parents entries in + /// either `src` or `dst` doesn't exist or if the specified `src` path + /// doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if either `src` or + /// `dst` is a directory or if either of them is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Copies file and deletes original. Needs + /// `copy_file`. and `delete_file`. + void (*rename_file)(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); + + /// Copies the file given by `src` to that in `dst`. + /// + /// Similar to `rename_file`, but both `src` and `dst` exist after this call + /// with the same contents. In case of error, both `src` and `dst` keep the + /// same state as before the call. + /// + /// If `dst` is a directory, creates a file with the same name as the source + /// inside the target directory. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if rename was completed. + /// * Must set `status` to `TF_NOT_FOUND` if one of the parents entries in + /// either `src` or `dst` doesn't exist or if the specified `src` path + /// doesn't exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if either `src` or + /// `dst` is a directory or if either of them is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Reads from `src` and writes to `dst`. Needs + /// `new_random_access_file` and `new_writable_file`. + void (*copy_file)(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); + + /// Checks if `path` exists. + /// + /// Note that this doesn't differentiate between files and directories. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `path` exists. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to a + /// filesystem entry. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid. + /// * Might use any other error value for `status` to signal other errors. + void (*path_exists)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Checks if all values in `paths` exist in the filesystem. 
+ /// + /// Returns `true` if and only if calling `path_exists` on each entry in + /// `paths` would set `status` to `TF_OK`. + /// + /// Caller guarantees that: + /// * `paths` has exactly `num_files` entries. + /// * `statuses` is either null or an array of `num_files` non-null elements + /// of type `TF_Status*`. + /// + /// If `statuses` is not null, plugins must fill each element with detailed + /// status for each file, as if calling `path_exists` on each one. Core + /// TensorFlow initializes the `statuses` array and plugins must use + /// `TF_SetStatus` to set each element instead of directly assigning. + /// + /// DEFAULT IMPLEMENTATION: Checks existence of every file. Needs + /// `path_exists`. + bool (*paths_exist)(const TF_Filesystem* filesystem, char** paths, + int num_files, TF_Status** statuses); + + /// Obtains statistics for the given `path`. + /// + /// Updates `stats` only if `status` is set to `TF_OK`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `path` exists. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to a + /// filesystem entry. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid. + /// * Might use any other error value for `status` to signal other errors. + void (*stat)(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); + + /// Checks whether the given `path` is a directory or not. + /// + /// If `status` is not `TF_OK`, returns `false`, otherwise returns the same + /// as the `is_directory` member of a `TF_FileStatistics` that would be used + /// on the equivalent call of `stat`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `path` exists. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to a + /// filesystem entry. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Gets statistics about `path`. Needs `stat`. + bool (*is_directory)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Returns the size of the file given by `path`. + /// + /// If `status` is not `TF_OK`, return value is undefined. Otherwise, returns + /// the same as `length` member of a `TF_FileStatistics` that would be used on + /// the equivalent call of `stat`. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `path` exists. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to a + /// filesystem entry. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is invalid or + /// points to a directory. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Gets statistics about `path`. Needs `stat`. + int64_t (*get_file_size)(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); + + /// Translates `uri` to a filename for the filesystem + /// + /// A filesystem is registered for a specific scheme and all of the methods + /// should work with URIs. Hence, each filesystem needs to be able to + /// translate from an URI to a path on the filesystem. For example, this + /// function could translate `fs:///path/to/a/file` into `/path/to/a/file`, if + /// implemented by a filesystem registered to handle the `fs://` scheme. + /// + /// A new `char*` buffer must be allocated by this method. Core TensorFlow + /// manages the lifetime of the buffer after the call. 
Thus, all callers of + /// this method must take ownership of the returned pointer. + /// + /// The implementation should clean up paths, including but not limited to, + /// removing duplicate `/`s, and resolving `..` and `.`. + /// + /// Plugins must not return `nullptr`. Returning empty strings is allowed. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// This function will be called by core TensorFlow to clean up all path + /// arguments for all other methods in the filesystem API. + /// + /// DEFAULT IMPLEMENTATION: Uses `io::CleanPath` and `io::ParseURI`. + char* (*translate_name)(const TF_Filesystem* filesystem, const char* uri); + + /// Finds all entries in the directory given by `path`. + /// + /// The returned entries are paths relative to `path`. + /// + /// Plugins must allocate `entries` to hold all names that need to be returned + /// and return the size of `entries`. Caller takes ownership of `entries` + /// after the call. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `entries` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if all children were returned. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to a + /// filesystem entry or if one of the parents entries in `path` doesn't + /// exist. + /// * Must set `status` to `TF_FAILED_PRECONDITION` if one of the parent + /// entries in `path` is not a directory, or if `path` is a file. + /// * Might use any other error value for `status` to signal other errors. + int (*get_children)(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); + + /// Finds all entries matching the regular expression given by `glob`. + /// + /// Pattern must match the entire entry name, not just a substring. + /// + /// pattern: { term } + /// term: + /// '*': matches any sequence of non-'/' characters + /// '?': matches a single non-'/' character + /// '[' [ '^' ] { match-list } ']': + /// matches any single character (not) on the list + /// c: matches character c (c != '*', '?', '\\', '[') + /// '\\' c: matches character c + /// character-range: + /// c: matches character c (c != '\\', '-', ']') + /// '\\' c: matches character c + /// lo '-' hi: matches character c for lo <= c <= hi + /// + /// Implementations must allocate `entries` to hold all names that need to be + /// returned and return the size of `entries`. Caller takes ownership of + /// `entries` after the call. + /// + /// In case of error, the implementations must set `status` to a value + /// different than `TF_OK`, free any memory that might have been allocated for + /// `entries` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if all matches were returned. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: Scans the directory tree (in parallel if possible) + /// and fills `*entries`. 
Needs `get_children` and `is_directory`.
+  int (*get_matching_paths)(const TF_Filesystem* filesystem, const char* glob,
+                            char*** entries, TF_Status* status);
+
+  /// Flushes any filesystem cache currently in memory.
+  ///
+  /// DEFAULT IMPLEMENTATION: No op.
+  void (*flush_caches)(const TF_Filesystem* filesystem);
+
+  /// Starts a new transaction.
+  ///
+  /// An opaque transaction token is returned in `token`. Ownership of the token
+  /// remains with the filesystem. The token will be freed in the
+  /// `end_transaction` call and any access to the token after that is invalid.
+  ///
+  /// In case of error, plugins must set `status` to a value different than
+  /// `TF_OK`, free memory allocated for `token` and return -1.
+  ///
+  /// The allocation and freeing of memory must happen via the functions sent to
+  /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo`
+  /// structure in Section 4).
+  ///
+  /// Plugins:
+  ///   * Must set `status` to `TF_OK` if the transaction was successfully
+  ///     started.
+  ///   * Must set `status` to `TF_FAILED_PRECONDITION` if multiple transactions
+  ///     are not supported.
+  ///   * Might use any other error value for `status` to signal other errors.
+  int (*start_transaction)(const TF_Filesystem* filesystem,
+                           TF_TransactionToken** token, TF_Status* status);
+
+  /// Ends the transaction and frees the `token`. Any access to the token after
+  /// that will be invalid.
+  ///
+  /// In case of error, plugins must set `status` to a value different than
+  /// `TF_OK`, free memory allocated for `token` and return -1.
+  ///
+  /// The allocation and freeing of memory must happen via the functions sent to
+  /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo`
+  /// structure in Section 4).
+  ///
+  /// Plugins:
+  ///   * Must set `status` to `TF_OK` if the transaction was successfully
+  ///     finalized.
+  ///   * Must set `status` to `TF_NOT_FOUND` if the token is invalid/not found.
+  ///   * Might use any other error value for `status` to signal other errors.
+  int (*end_transaction)(const TF_Filesystem* filesystem,
+                         TF_TransactionToken* token, TF_Status* status);
+
+  /// Adds the file/directory in the `path` to the transaction in `token`. It is
+  /// a valid operation to add a path that doesn't exist yet to a transaction.
+  ///
+  /// In case of error, plugins must set `status` to a value different than
+  /// `TF_OK`, free memory allocated for `token` and return -1.
+  ///
+  /// The allocation and freeing of memory must happen via the functions sent to
+  /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo`
+  /// structure in Section 4).
+  ///
+  /// Plugins:
+  ///   * Must set `status` to `TF_OK` if the path was added to the transaction
+  ///     successfully.
+  ///   * Must set `status` to `TF_NOT_FOUND` if `token` is invalid.
+  ///   * Must set `status` to `TF_FAILED_PRECONDITION` if the file/directory is
+  ///     in another transaction and multiple transactions are not supported.
+  ///   * Might use any other error value for `status` to signal other errors.
+  int (*add_to_transaction)(const TF_Filesystem* filesystem, const char* path,
+                            TF_TransactionToken* token, TF_Status* status);
+
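+  // Illustrative sketch (not part of the interface): from the plugin's point
+  // of view, core TensorFlow (or a direct caller) might drive these transaction
+  // hooks roughly as follows. The `fs` pointer and path are placeholders.
+  //
+  //   TF_TransactionToken* token = NULL;
+  //   ops->start_transaction(fs, &token, status);
+  //   if (TF_GetCode(status) == TF_OK) {
+  //     ops->add_to_transaction(fs, "/some/path", token, status);
+  //     // ... perform operations that should belong to the transaction ...
+  //     ops->end_transaction(fs, token, status);  // also frees `token`
+  //   }
+
+  /// Returns transaction token for file/directory in the `path`. Note that path
+  /// may not exist yet but still might be part of a transaction.
+  ///
+  /// Transaction token is returned in `token`. Ownership of the token remains
+  /// with the filesystem. The token will be freed in the `end_transaction` call
+  /// and any access to the token after that is invalid.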
+ /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if a transaction for path is found + /// * Must set `status` to `TF_NOT_FOUND` if `path` is not part of any + /// transaction + /// * Must set `status` to `TF_FAILED_PRECONDITION` if `path` is + /// not in this filesystem. + /// * Might use any other error value for `status` to signal other errors. + int (*get_transaction_for_path)(const TF_Filesystem* filesystem, + const char* path, TF_TransactionToken** token, + TF_Status* status); + + /// Returns transaction token for `path` if it is part of a transaction else + /// starts a new transaction and adds `path` to that transaction + /// + /// Transaction token is returned in `token`. Ownership of the token is in + /// filesystem. Token will be freed in `end_transaction` call and any access + /// to token after that is invalid. + /// + /// In case of error, plugins must set `status` to a value different than + /// `TF_OK`, free memory allocated for `token` and return -1. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if transaction found or successfuly + /// started. + /// * Must set `status` to `TF_NOT_FOUND` if `path` doesn't point to this + /// filesystem + /// * Must set `status` to `TF_FAILED_PRECONDITION` if file/directory is + /// not in any transaction and multiple transactions are not supported. + /// * Might use any other error value for `status` to signal other errors. + int (*get_or_start_transaction_for_path)(const TF_Filesystem* filesystem, + const char* path, + TF_TransactionToken** token, + TF_Status* status); + + /// Decodes transaction token in `token` to human readable format for + /// debugging. + /// + /// A new `char*` buffer must be allocated by this method. Core TensorFlow + /// manages the lifetime of the buffer after the call. Thus, all callers of + /// this method must take ownership of the returned pointer. + /// + /// Plugins must not return `nullptr`. Returning empty strings is allowed. + /// + /// The allocation and freeing of memory must happen via the functions sent to + /// core TensorFlow upon registration (see the `TF_FilesystemPluginInfo` + /// structure in Section 4). + /// + /// DEFAULT IMPLEMENTATION: Dump token and owner address. + char* (*decode_transaction_token)(const TF_Filesystem* filesystem, + const TF_TransactionToken* token); + + /// Returns pointer to an array of available configuration options and their + /// current/default values in `options` and number of options in array in + /// `num_options`. Ownership of the array is transferred to caller and the + /// caller is responsible of freeing the buffers using respective file systems + /// allocation API. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `options` and `num_options` set. + /// If there is no configurable option, `num_options` should be 0. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: return 0 options and `TF_OK`. 
+ void (*get_filesystem_configuration)(const TF_Filesystem* filesystem, + TF_Filesystem_Option** options, + int* num_options, TF_Status* status); + + /// Updates filesystem configuration with options passed in `options`. It can + /// contain full set of options supported by the filesystem or just a subset + /// of them. Ownership of options and buffers therein belongs to the caller + /// and any buffers need to be allocated through filesystem allocation API. + /// Filesystems may choose to ignore configuration errors but should at least + /// display a warning or error message to warn the users. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if options are updated. + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: return `TF_NOT_FOUND`. + void (*set_filesystem_configuration)(const TF_Filesystem* filesystem, + const TF_Filesystem_Option* options, + int num_options, TF_Status* status); + + /// Returns the value of the filesystem option given in `key` in `option`. + /// Valid values of the `key` are returned by + /// `get_file_system_configuration_keys` call. Ownership of the + /// `option` is transferred to caller. Buffers therein should be allocated and + /// freed by the relevant filesystems allocation API. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `option` is set + /// * Must set `status` to `TF_NOT_FOUND` if the key is invalid + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: return `TF_NOT_FOUND`. + void (*get_filesystem_configuration_option)(const TF_Filesystem* filesystem, + const char* key, + TF_Filesystem_Option** option, + TF_Status* status); + + /// Sets the value of the filesystem option given in `key` to value in + /// `option`. Valid values of the `key` are returned by + /// `get_file_system_configuration_keys` call. Ownership of the `option` and + /// the `key` belogs to the caller. Buffers therein should be allocated and + /// freed by the filesystems allocation API. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` if `option` is set/updated + /// * Must set `status` to `TF_NOT_FOUND` if the key is invalid + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: return `TF_NOT_FOUND`. + void (*set_filesystem_configuration_option)( + const TF_Filesystem* filesystem, const TF_Filesystem_Option* option, + TF_Status* status); + + /// Returns a list of valid configuration keys in `keys` array and number of + /// keys in `num_keys`. Ownership of the buffers in `keys` are transferred to + /// caller and needs to be freed using relevant filesystem allocation API. + /// + /// Plugins: + /// * Must set `status` to `TF_OK` on success. If there are no configurable + /// keys, `num_keys` should be set to 0 + /// * Might use any other error value for `status` to signal other errors. + /// + /// DEFAULT IMPLEMENTATION: return `TF_OK` and `num_keys`=0. + void (*get_filesystem_configuration_keys)(const TF_Filesystem* filesystem, + char** keys, int* num_keys, + TF_Status* status); +} TF_FilesystemOps; +// LINT.ThenChange(:filesystem_ops_version) + +/// SECTION 3. 
ABI and API compatibility
+/// ----------------------------------------------------------------------------
+///
+/// In this section we define constants and macros to record versioning
+/// information for each of the structures in section 2: ABI and API versions
+/// and the number of functions in each of the function tables (which is
+/// determined automatically, so it is ignored for the rest of this comment).
+///
+/// Since filesystem plugins are outside of TensorFlow's code tree, they are not
+/// tied to TensorFlow releases and should have their own versioning metadata
+/// in addition to the data discussed in this section. Each plugin author can
+/// use a custom scheme, but it should only relate to changes in plugin code.
+/// This section only touches metadata related to the versioning of this
+/// interface that is shared by all possible plugins.
+///
+/// The API number increases whenever we break API compatibility while still
+/// maintaining ABI compatibility. This happens only in the following cases:
+///   1. A new method is added _at the end_ of the function table.
+///   2. Preconditions or postconditions for one operation in these function
+///      tables change. Note that only core TensorFlow is able to impose these
+///      invariants (i.e., guarantee the preconditions before calling the
+///      operation and check the postconditions after the operation returns).
+///      If plugins need additional invariants, they should be checked on the
+///      plugin side and the `status` out variable should be updated accordingly
+///      (e.g., to include plugin version information that relates to the
+///      condition change).
+///
+/// All other changes to the data structures (e.g., method removal, method
+/// reordering, argument reordering, adding or removing arguments, changing the
+/// type or the constness of a parameter, etc.) result in an ABI breakage.
+/// Thus, we should not do any of these types of changes, except, potentially,
+/// when we are releasing a new major version of TensorFlow. This is an escape
+/// hatch, to be used rarely, preferably only to clean up these structures.
+/// Whenever we do these changes, the ABI number must be increased.
+///
+/// The next section details how this metadata is used at plugin registration to
+/// only load compatible plugins and discard all others.
+
+// LINT.IfChange(random_access_file_ops_version)
+constexpr int TF_RANDOM_ACCESS_FILE_OPS_API = 0;
+constexpr int TF_RANDOM_ACCESS_FILE_OPS_ABI = 0;
+constexpr size_t TF_RANDOM_ACCESS_FILE_OPS_SIZE =
+    sizeof(TF_RandomAccessFileOps);
+// LINT.ThenChange()
+
+// LINT.IfChange(writable_file_ops_version)
+constexpr int TF_WRITABLE_FILE_OPS_API = 0;
+constexpr int TF_WRITABLE_FILE_OPS_ABI = 0;
+constexpr size_t TF_WRITABLE_FILE_OPS_SIZE = sizeof(TF_WritableFileOps);
+// LINT.ThenChange()
+
+// LINT.IfChange(read_only_memory_region_ops_version)
+constexpr int TF_READ_ONLY_MEMORY_REGION_OPS_API = 0;
+constexpr int TF_READ_ONLY_MEMORY_REGION_OPS_ABI = 0;
+constexpr size_t TF_READ_ONLY_MEMORY_REGION_OPS_SIZE =
+    sizeof(TF_ReadOnlyMemoryRegionOps);
+// LINT.ThenChange()
+
+// LINT.IfChange(filesystem_ops_version)
+constexpr int TF_FILESYSTEM_OPS_API = 0;
+constexpr int TF_FILESYSTEM_OPS_ABI = 0;
+constexpr size_t TF_FILESYSTEM_OPS_SIZE = sizeof(TF_FilesystemOps);
+// LINT.ThenChange()
+
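+// Illustrative sketch (not part of the interface): the metadata above is
+// compared at plugin load time against the values the plugin was compiled
+// with, which are recorded in the `TF_FilesystemPluginOps` struct defined in
+// the next section. A simplified version of such a check, assuming the actual
+// logic lives in core TensorFlow, could look like:
+//
+//   bool FilesystemOpsAbiCompatible(const TF_FilesystemPluginOps* ops) {
+//     // An ABI mismatch means the plugin cannot be loaded; API and size
+//     // differences can be handled more leniently (e.g., with a warning).
+//     return ops->filesystem_ops_abi == TF_FILESYSTEM_OPS_ABI;
+//   }
+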
+/// SECTION 4. Plugin registration and initialization
+/// ----------------------------------------------------------------------------
+///
+/// In this section we define the API used by core TensorFlow to initialize a
+/// filesystem provided by a plugin. That is, we define the following:
+///   * `TF_InitPlugin` function: must be present in the plugin shared object as
+///     it will be called by core TensorFlow when the filesystem plugin is
+///     loaded;
+///   * `TF_FilesystemPluginOps` struct: used to transfer information between
+///     plugins and core TensorFlow about the operations provided and metadata;
+///   * `TF_FilesystemPluginInfo` struct: similar to the above structure, but
+///     collects information about all the file schemes that the plugin provides
+///     support for, as well as about the plugin's memory handling routines;
+///   * `TF_SetFilesystemVersionMetadata` function: must be called by plugins in
+///     their `TF_InitPlugin` to record the versioning information the plugins
+///     are compiled against.
+///
+/// The `TF_InitPlugin` function is used by plugins to set up the data
+/// structures that implement this interface, as presented in Section 2. To
+/// avoid having plugin shared objects call back into symbols defined in core
+/// TensorFlow, `TF_InitPlugin` has a `TF_FilesystemPluginInfo` argument which
+/// the plugin must fill (using `TF_SetFilesystemVersionMetadata` for the
+/// metadata and setting up all the supported operations and the URI schemes
+/// that are supported).
+
+/// This structure incorporates the operations defined in Section 2 and the
+/// metadata defined in Section 3, allowing plugins to define different ops
+/// for different URI schemes.
+///
+/// Every URI scheme is a string such as "fs" for URIs of the form
+/// "fs:///path/to/file". For local filesystems (i.e., when the URI is
+/// "/path/to/file"), the scheme must be "". The scheme must never be `nullptr`.
+///
+/// Every plugin fills this in `TF_InitPlugin`, using the allocator passed as
+/// argument to allocate memory. After `TF_InitPlugin` finishes, core
+/// TensorFlow uses the information present in this structure to initialize
+/// filesystems for the URI schemes that the plugin requests.
+///
+/// All pointers defined in this structure point to memory allocated by the DSO
+/// using an allocator provided by core TensorFlow when calling `TF_InitPlugin`.
+///
+/// IMPORTANT: To maintain binary compatibility, the layout of this structure
+/// must not change! In the unlikely case that a new type of file needs to be
+/// supported, add the new ops and metadata at the end of the structure.
+typedef struct TF_FilesystemPluginOps {
+  char* scheme;
+  int filesystem_ops_abi;
+  int filesystem_ops_api;
+  size_t filesystem_ops_size;
+  TF_FilesystemOps* filesystem_ops;
+  int random_access_file_ops_abi;
+  int random_access_file_ops_api;
+  size_t random_access_file_ops_size;
+  TF_RandomAccessFileOps* random_access_file_ops;
+  int writable_file_ops_abi;
+  int writable_file_ops_api;
+  size_t writable_file_ops_size;
+  TF_WritableFileOps* writable_file_ops;
+  int read_only_memory_region_ops_abi;
+  int read_only_memory_region_ops_api;
+  size_t read_only_memory_region_ops_size;
+  TF_ReadOnlyMemoryRegionOps* read_only_memory_region_ops;
+} TF_FilesystemPluginOps;
+
+/// This structure gathers together all the operations provided by the plugin.
+///
+/// Plugins must provide exactly `num_schemes` elements in the `ops` array.
+///
+/// Since memory that is allocated by the DSO gets transferred to core
+/// TensorFlow, we need to provide a way for the allocation and deallocation to
+/// match. This is why this structure also defines `plugin_memory_allocate` and
+/// `plugin_memory_free` members.
+///
+/// All memory allocated by the plugin that will be owned by core TensorFlow
+/// must be allocated using the allocator in this structure. Core TensorFlow
+/// will use the deallocator to free this memory once it no longer needs it.
+///
+/// IMPORTANT: To maintain binary compatibility, the layout of this structure
+/// must not change! In the unlikely case that new global operations must be
+/// provided, add them at the end of the structure.
+typedef struct TF_FilesystemPluginInfo {
+  size_t num_schemes;
+  TF_FilesystemPluginOps* ops;
+  void* (*plugin_memory_allocate)(size_t size);
+  void (*plugin_memory_free)(void* ptr);
+} TF_FilesystemPluginInfo;
+
+/// Convenience function for setting the versioning metadata.
+///
+/// The argument is guaranteed to not be `nullptr`.
+///
+/// We want this to be defined in the plugin's memory space and we guarantee
+/// that core TensorFlow will never call this.
+static inline void TF_SetFilesystemVersionMetadata(
+    TF_FilesystemPluginOps* ops) {
+  ops->filesystem_ops_abi = TF_FILESYSTEM_OPS_ABI;
+  ops->filesystem_ops_api = TF_FILESYSTEM_OPS_API;
+  ops->filesystem_ops_size = TF_FILESYSTEM_OPS_SIZE;
+  ops->random_access_file_ops_abi = TF_RANDOM_ACCESS_FILE_OPS_ABI;
+  ops->random_access_file_ops_api = TF_RANDOM_ACCESS_FILE_OPS_API;
+  ops->random_access_file_ops_size = TF_RANDOM_ACCESS_FILE_OPS_SIZE;
+  ops->writable_file_ops_abi = TF_WRITABLE_FILE_OPS_ABI;
+  ops->writable_file_ops_api = TF_WRITABLE_FILE_OPS_API;
+  ops->writable_file_ops_size = TF_WRITABLE_FILE_OPS_SIZE;
+  ops->read_only_memory_region_ops_abi = TF_READ_ONLY_MEMORY_REGION_OPS_ABI;
+  ops->read_only_memory_region_ops_api = TF_READ_ONLY_MEMORY_REGION_OPS_API;
+  ops->read_only_memory_region_ops_size = TF_READ_ONLY_MEMORY_REGION_OPS_SIZE;
+}
+
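+// Illustrative sketch (not part of the interface): a minimal `TF_InitPlugin`
+// for a plugin that registers a single hypothetical "myfs" scheme and only
+// supports random access reads might look roughly like this. The
+// `plugin_memory_*` helpers and the `My*` functions are placeholders the
+// plugin would define itself.
+//
+//   void TF_InitPlugin(TF_FilesystemPluginInfo* info) {
+//     info->plugin_memory_allocate = plugin_memory_allocate;
+//     info->plugin_memory_free = plugin_memory_free;
+//     info->num_schemes = 1;
+//     info->ops = (TF_FilesystemPluginOps*)plugin_memory_allocate(
+//         info->num_schemes * sizeof(info->ops[0]));
+//
+//     TF_FilesystemPluginOps* ops = &info->ops[0];
+//     TF_SetFilesystemVersionMetadata(ops);
+//     // NOTE: per the ownership rules above, this string should come from the
+//     // registered allocator; strdup is shown only for brevity.
+//     ops->scheme = strdup("myfs");
+//
+//     ops->filesystem_ops = (TF_FilesystemOps*)plugin_memory_allocate(
+//         TF_FILESYSTEM_OPS_SIZE);
+//     ops->filesystem_ops->init = MyFilesystemInit;
+//     ops->filesystem_ops->cleanup = MyFilesystemCleanup;
+//     ops->filesystem_ops->new_random_access_file = MyNewRandomAccessFile;
+//
+//     ops->random_access_file_ops = (TF_RandomAccessFileOps*)
+//         plugin_memory_allocate(TF_RANDOM_ACCESS_FILE_OPS_SIZE);
+//     ops->random_access_file_ops->cleanup = MyRandomAccessCleanup;
+//     ops->random_access_file_ops->read = MyRandomAccessRead;
+//   }
+
+/// Initializes a TensorFlow plugin.
+///
+/// Must be implemented by the plugin DSO. It is called by the TensorFlow
+/// runtime.
+///
+/// Filesystem plugins can be loaded on demand by users via
+/// `Env::LoadLibrary` or during TensorFlow's startup if they are on certain
+/// paths (although this has a security risk if two plugins register for the
+/// same filesystem and the malicious one loads before the legitimate one -
+/// but we consider this to be something that users should care about and
+/// manage themselves). In both of these cases, core TensorFlow looks for
+/// the `TF_InitPlugin` symbol and calls this function.
+///
+/// For every filesystem URI scheme that this plugin supports, the plugin must
+/// add one `TF_FilesystemPluginOps` entry in `plugin_info->ops` and call
+/// `TF_SetFilesystemVersionMetadata` for that entry.
+///
+/// Plugins must also initialize `plugin_info->plugin_memory_allocate` and
+/// `plugin_info->plugin_memory_free` to ensure memory allocated by the plugin
+/// is freed in a compatible way.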
+TF_CAPI_EXPORT extern void TF_InitPlugin(TF_FilesystemPluginInfo* plugin_info); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_FILESYSTEM_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem.h new file mode 100644 index 00000000..a19ee27d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem.h @@ -0,0 +1,210 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/core/platform/file_statistics.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/file_system.h" + +/// This file builds classes needed to hold a filesystem implementation in the +/// modular world. Once all TensorFlow filesystems are converted to use the +/// plugin based approach, this file will replace the one in core/platform and +/// the names will lose the `Modular` part. Until that point, the `Modular*` +/// classes here are experimental and subject to breaking changes. +/// For documentation on these methods, consult `core/platform/filesystem.h`. + +namespace tensorflow { + +// TODO(b/143949615): After all filesystems are converted, this file will be +// moved to core/platform, and this class can become a singleton and replace the +// need for `Env::Default()`. At that time, we might decide to remove the need +// for `Env::Default()` altogether, but that's a different project, not in +// scope for now. I'm just mentioning this here as that transition will mean +// removal of the registration part from `Env` and adding it here instead: we +// will need tables to hold for each scheme the function tables that implement +// the needed functionality instead of the current `FileSystemRegistry` code in +// `core/platform/env.cc`. 
+class ModularFileSystem final : public FileSystem {
+ public:
+  ModularFileSystem(
+      std::unique_ptr<TF_Filesystem> filesystem,
+      std::unique_ptr<const TF_FilesystemOps> filesystem_ops,
+      std::unique_ptr<const TF_RandomAccessFileOps> random_access_file_ops,
+      std::unique_ptr<const TF_WritableFileOps> writable_file_ops,
+      std::unique_ptr<const TF_ReadOnlyMemoryRegionOps>
+          read_only_memory_region_ops,
+      std::function<void*(size_t)> plugin_memory_allocate,
+      std::function<void(void*)> plugin_memory_free)
+      : filesystem_(std::move(filesystem)),
+        ops_(std::move(filesystem_ops)),
+        random_access_file_ops_(std::move(random_access_file_ops)),
+        writable_file_ops_(std::move(writable_file_ops)),
+        read_only_memory_region_ops_(std::move(read_only_memory_region_ops)),
+        plugin_memory_allocate_(std::move(plugin_memory_allocate)),
+        plugin_memory_free_(std::move(plugin_memory_free)) {}
+
+  ~ModularFileSystem() override { ops_->cleanup(filesystem_.get()); }
+
+  TF_USE_FILESYSTEM_METHODS_WITH_NO_TRANSACTION_SUPPORT;
+
+  absl::Status NewRandomAccessFile(
+      const std::string& fname, TransactionToken* token,
+      std::unique_ptr<RandomAccessFile>* result) override;
+  absl::Status NewWritableFile(const std::string& fname,
+                               TransactionToken* token,
+                               std::unique_ptr<WritableFile>* result) override;
+  absl::Status NewAppendableFile(
+      const std::string& fname, TransactionToken* token,
+      std::unique_ptr<WritableFile>* result) override;
+  absl::Status NewReadOnlyMemoryRegionFromFile(
+      const std::string& fname, TransactionToken* token,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
+  absl::Status FileExists(const std::string& fname,
+                          TransactionToken* token) override;
+  bool FilesExist(const std::vector<std::string>& files,
+                  TransactionToken* token,
+                  std::vector<absl::Status>* status) override;
+  absl::Status GetChildren(const std::string& dir, TransactionToken* token,
+                           std::vector<std::string>* result) override;
+  absl::Status GetMatchingPaths(const std::string& pattern,
+                                TransactionToken* token,
+                                std::vector<std::string>* results) override;
+  absl::Status DeleteFile(const std::string& fname,
+                          TransactionToken* token) override;
+  absl::Status DeleteRecursively(const std::string& dirname,
+                                 TransactionToken* token,
+                                 int64_t* undeleted_files,
+                                 int64_t* undeleted_dirs) override;
+  absl::Status DeleteDir(const std::string& dirname,
+                         TransactionToken* token) override;
+  absl::Status RecursivelyCreateDir(const std::string& dirname,
+                                    TransactionToken* token) override;
+  absl::Status CreateDir(const std::string& dirname,
+                         TransactionToken* token) override;
+  absl::Status Stat(const std::string& fname, TransactionToken* token,
+                    FileStatistics* stat) override;
+  absl::Status IsDirectory(const std::string& fname,
+                           TransactionToken* token) override;
+  absl::Status GetFileSize(const std::string& fname, TransactionToken* token,
+                           uint64* file_size) override;
+  absl::Status RenameFile(const std::string& src, const std::string& target,
+                          TransactionToken* token) override;
+  absl::Status CopyFile(const std::string& src, const std::string& target,
+                        TransactionToken* token) override;
+  std::string TranslateName(const std::string& name) const override;
+  void FlushCaches(TransactionToken* token) override;
+  absl::Status SetOption(const std::string& name,
+                         const std::vector<std::string>& values) override;
+  absl::Status SetOption(const std::string& name,
+                         const std::vector<int64_t>& values) override;
+  absl::Status SetOption(const std::string& name,
+                         const std::vector<double>& values) override;
+
+ private:
+  std::unique_ptr<TF_Filesystem> filesystem_;
+  std::unique_ptr<const TF_FilesystemOps> ops_;
+  std::unique_ptr<const TF_RandomAccessFileOps> random_access_file_ops_;
+  std::unique_ptr<const TF_WritableFileOps> writable_file_ops_;
+  std::unique_ptr<const TF_ReadOnlyMemoryRegionOps>
+      read_only_memory_region_ops_;
+  std::function<void*(size_t)> plugin_memory_allocate_;
+  std::function<void(void*)> plugin_memory_free_;
+  ModularFileSystem(const ModularFileSystem&) = delete;
+  void operator=(const ModularFileSystem&) = delete;
+};
+
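+// Illustrative sketch (not part of this header): once a plugin DSO has been
+// registered, e.g. via `RegisterFilesystemPlugin` declared below, files under
+// the plugin's scheme are expected to be reachable through the usual
+// `FileSystem`/`Env` entry points. The DSO path, scheme and object name are
+// placeholders.
+//
+//   TF_CHECK_OK(tensorflow::RegisterFilesystemPlugin("/path/to/libmyfs.so"));
+//   std::unique_ptr<tensorflow::RandomAccessFile> file;
+//   TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(
+//       "myfs://bucket/object", &file));
+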
+class ModularRandomAccessFile final : public RandomAccessFile {
+ public:
+  ModularRandomAccessFile(const std::string& filename,
+                          std::unique_ptr<TF_RandomAccessFile> file,
+                          const TF_RandomAccessFileOps* ops)
+      : filename_(filename), file_(std::move(file)), ops_(ops) {}
+
+  ~ModularRandomAccessFile() override { ops_->cleanup(file_.get()); }
+
+  absl::Status Read(uint64 offset, size_t n, StringPiece* result,
+                    char* scratch) const override;
+  absl::Status Name(StringPiece* result) const override;
+
+ private:
+  std::string filename_;
+  std::unique_ptr<TF_RandomAccessFile> file_;
+  const TF_RandomAccessFileOps* ops_;  // not owned
+  ModularRandomAccessFile(const ModularRandomAccessFile&) = delete;
+  void operator=(const ModularRandomAccessFile&) = delete;
+};
+
+class ModularWritableFile final : public WritableFile {
+ public:
+  ModularWritableFile(const std::string& filename,
+                      std::unique_ptr<TF_WritableFile> file,
+                      const TF_WritableFileOps* ops)
+      : filename_(filename), file_(std::move(file)), ops_(ops) {}
+
+  ~ModularWritableFile() override { ops_->cleanup(file_.get()); }
+
+  absl::Status Append(StringPiece data) override;
+  absl::Status Close() override;
+  absl::Status Flush() override;
+  absl::Status Sync() override;
+  absl::Status Name(StringPiece* result) const override;
+  absl::Status Tell(int64_t* position) override;
+
+ private:
+  std::string filename_;
+  std::unique_ptr<TF_WritableFile> file_;
+  const TF_WritableFileOps* ops_;  // not owned
+  ModularWritableFile(const ModularWritableFile&) = delete;
+  void operator=(const ModularWritableFile&) = delete;
+};
+
+class ModularReadOnlyMemoryRegion final : public ReadOnlyMemoryRegion {
+ public:
+  ModularReadOnlyMemoryRegion(std::unique_ptr<TF_ReadOnlyMemoryRegion> region,
+                              const TF_ReadOnlyMemoryRegionOps* ops)
+      : region_(std::move(region)), ops_(ops) {}
+
+  ~ModularReadOnlyMemoryRegion() override { ops_->cleanup(region_.get()); };
+
+  const void* data() override { return ops_->data(region_.get()); }
+  uint64 length() override { return ops_->length(region_.get()); }
+
+ private:
+  std::unique_ptr<TF_ReadOnlyMemoryRegion> region_;
+  const TF_ReadOnlyMemoryRegionOps* ops_;  // not owned
+  ModularReadOnlyMemoryRegion(const ModularReadOnlyMemoryRegion&) = delete;
+  void operator=(const ModularReadOnlyMemoryRegion&) = delete;
+};
+
+// Registers a filesystem plugin so that core TensorFlow can use it.
+absl::Status RegisterFilesystemPlugin(const std::string& dso_path);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem_registration.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem_registration.h
new file mode 100644
index 00000000..e119857f
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/modular_filesystem_registration.h
@@ -0,0 +1,34 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_REGISTRATION_H_
+#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_REGISTRATION_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/c/experimental/filesystem/filesystem_interface.h"
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+namespace filesystem_registration {
+
+// Implementation for filesystem registration
+//
+// Don't call this directly. Instead call `RegisterFilesystemPlugin`.
+// Exposed only for static registration of local filesystems.
+absl::Status RegisterFilesystemPluginImpl(const TF_FilesystemPluginInfo* info);
+
+}  // namespace filesystem_registration
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_MODULAR_FILESYSTEM_REGISTRATION_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/cleanup.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/cleanup.h
new file mode 100644
index 00000000..cc7a7451
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/cleanup.h
@@ -0,0 +1,109 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// MakeCleanup(f) returns an RAII cleanup object that calls 'f' in its
+// destructor. The easiest way to use MakeCleanup is with a lambda argument,
+// capturing the return value in an 'auto' local variable. Most users will not
+// need more sophisticated syntax than that.
+//
+// Example:
+//   void func() {
+//     FILE* fp = fopen("data.txt", "r");
+//     if (fp == nullptr) return;
+//     auto fp_cleaner = gtl::MakeCleanup([fp] { fclose(fp); });
+//     // No matter what, fclose(fp) will happen.
+//     DataObject d;
+//     while (ReadDataObject(fp, &d)) {
+//       if (d.IsBad()) {
+//         LOG(ERROR) << "Bad Data";
+//         return;
+//       }
+//       PushGoodData(d);
+//     }
+//   }
+//
+// You can use Cleanup<F> directly, instead of using MakeCleanup and auto,
+// but there's rarely a reason to do that.
+//
+// You can call 'release()' on a Cleanup object to cancel the cleanup.
+
+#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_CLEANUP_H_
+#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_CLEANUP_H_
+
+#include <type_traits>
+#include <utility>
+
+namespace tf_gcs_filesystem {
+
+// A move-only RAII object that calls a stored cleanup functor when
+// destroyed. Cleanup<F> is the return type of gtl::MakeCleanup(F).
+template <typename F>
+class Cleanup {
+ public:
+  Cleanup() : released_(true), f_() {}
+
+  template <typename G>
+  explicit Cleanup(G&& f)          // NOLINT
+      : f_(std::forward<G>(f)) {}  // NOLINT(build/c++11)
+
+  Cleanup(Cleanup&& src)  // NOLINT
+      : released_(src.is_released()), f_(src.release()) {}
+
+  // Implicitly move-constructible from any compatible Cleanup<G>.
+  // The source will be released as if src.release() were called.
+  // A moved-from Cleanup can be safely destroyed or reassigned.
+ template + Cleanup(Cleanup&& src) // NOLINT + : released_(src.is_released()), f_(src.release()) {} + + // Assignment to a Cleanup object behaves like destroying it + // and making a new one in its place, analogous to unique_ptr + // semantics. + Cleanup& operator=(Cleanup&& src) { // NOLINT + if (!released_) f_(); + released_ = src.released_; + f_ = src.release(); + return *this; + } + + ~Cleanup() { + if (!released_) f_(); + } + + // Releases the cleanup function instead of running it. + // Hint: use c.release()() to run early. + F release() { + released_ = true; + return std::move(f_); + } + + bool is_released() const { return released_; } + + private: + static_assert(!std::is_reference::value, "F must not be a reference"); + + bool released_ = false; + F f_; +}; + +template ::type> +Cleanup MakeCleanup(F&& f) { + return Cleanup(std::forward(f)); +} + +} // namespace tf_gcs_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_CLEANUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h new file mode 100644 index 00000000..c0347faa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h @@ -0,0 +1,191 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_EXPIRING_LRU_CACHE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_EXPIRING_LRU_CACHE_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/c/env.h" +#include "tensorflow/c/tf_status.h" + +namespace tf_gcs_filesystem { + +/// \brief An LRU cache of string keys and arbitrary values, with configurable +/// max item age (in seconds) and max entries. +/// +/// This class is thread safe. +template +class ExpiringLRUCache { + public: + /// A `max_age` of 0 means that nothing is cached. A `max_entries` of 0 means + /// that there is no limit on the number of entries in the cache (however, if + /// `max_age` is also 0, the cache will not be populated). + ExpiringLRUCache(uint64_t max_age, size_t max_entries, + std::function timer_seconds = TF_NowSeconds) + : max_age_(max_age), + max_entries_(max_entries), + timer_seconds_(timer_seconds) {} + + /// Insert `value` with key `key`. This will replace any previous entry with + /// the same key. + void Insert(const std::string& key, const T& value) { + if (max_age_ == 0) { + return; + } + absl::MutexLock lock(&mu_); + InsertLocked(key, value); + } + + // Delete the entry with key `key`. Return true if the entry was found for + // `key`, false if the entry was not found. In both cases, there is no entry + // with key `key` existed after the call. 
+ bool Delete(const std::string& key) { + absl::MutexLock lock(&mu_); + return DeleteLocked(key); + } + + /// Look up the entry with key `key` and copy it to `value` if found. Returns + /// true if an entry was found for `key`, and its timestamp is not more than + /// max_age_ seconds in the past. + bool Lookup(const std::string& key, T* value) { + if (max_age_ == 0) { + return false; + } + absl::MutexLock lock(&mu_); + return LookupLocked(key, value); + } + + typedef std::function ComputeFunc; + + /// Look up the entry with key `key` and copy it to `value` if found. If not + /// found, call `compute_func`. If `compute_func` set `status` to `TF_OK`, + /// store a copy of the output parameter in the cache, and another copy in + /// `value`. + void LookupOrCompute(const std::string& key, T* value, + const ComputeFunc& compute_func, TF_Status* status) { + if (max_age_ == 0) { + return compute_func(key, value, status); + } + + // Note: we hold onto mu_ for the rest of this function. In practice, this + // is okay, as stat requests are typically fast, and concurrent requests are + // often for the same file. Future work can split this up into one lock per + // key if this proves to be a significant performance bottleneck. + absl::MutexLock lock(&mu_); + if (LookupLocked(key, value)) { + return TF_SetStatus(status, TF_OK, ""); + } + compute_func(key, value, status); + if (TF_GetCode(status) == TF_OK) { + InsertLocked(key, *value); + } + } + + /// Clear the cache. + void Clear() { + absl::MutexLock lock(&mu_); + cache_.clear(); + lru_list_.clear(); + } + + /// Accessors for cache parameters. + uint64_t max_age() const { return max_age_; } + size_t max_entries() const { return max_entries_; } + + private: + struct Entry { + /// The timestamp (seconds) at which the entry was added to the cache. + uint64_t timestamp; + + /// The entry's value. + T value; + + /// A list iterator pointing to the entry's position in the LRU list. + std::list::iterator lru_iterator; + }; + + bool LookupLocked(const std::string& key, T* value) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + auto it = cache_.find(key); + if (it == cache_.end()) { + return false; + } + lru_list_.erase(it->second.lru_iterator); + if (timer_seconds_() - it->second.timestamp > max_age_) { + cache_.erase(it); + return false; + } + *value = it->second.value; + lru_list_.push_front(it->first); + it->second.lru_iterator = lru_list_.begin(); + return true; + } + + void InsertLocked(const std::string& key, const T& value) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + lru_list_.push_front(key); + Entry entry{timer_seconds_(), value, lru_list_.begin()}; + auto insert = cache_.insert(std::make_pair(key, entry)); + if (!insert.second) { + lru_list_.erase(insert.first->second.lru_iterator); + insert.first->second = entry; + } else if (max_entries_ > 0 && cache_.size() > max_entries_) { + cache_.erase(lru_list_.back()); + lru_list_.pop_back(); + } + } + + bool DeleteLocked(const std::string& key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + auto it = cache_.find(key); + if (it == cache_.end()) { + return false; + } + lru_list_.erase(it->second.lru_iterator); + cache_.erase(it); + return true; + } + + /// The maximum age of entries in the cache, in seconds. A value of 0 means + /// that no entry is ever placed in the cache. + const uint64_t max_age_; + + /// The maximum number of entries in the cache. A value of 0 means there is no + /// limit on entry count. + const size_t max_entries_; + + /// The callback to read timestamps. 
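As a usage sketch of the `ExpiringLRUCache` declared above (upstream the class is templated on the value type `T` and `timer_seconds` is a `uint64_t()` callable; those template arguments are stripped in this flattened diff), with an injected clock so the `max_age` expiry is deterministic:

```cpp
#include <cstdint>
#include <string>

#include "tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h"

// Sketch only: Insert/Lookup with a fake time source so the expiry behavior
// described above can be observed without waiting.
void ExpiringCacheExample() {
  uint64_t now = 0;  // fake clock, in seconds
  tf_gcs_filesystem::ExpiringLRUCache<std::string> cache(
      /*max_age=*/10, /*max_entries=*/2,
      /*timer_seconds=*/[&now] { return now; });

  cache.Insert("gs://bucket/object", "cached-stat");
  std::string value;
  bool hit = cache.Lookup("gs://bucket/object", &value);  // true, "cached-stat"

  now += 11;                                         // older than max_age
  hit = cache.Lookup("gs://bucket/object", &value);  // false: entry expired
  (void)hit;
}
```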
+ std::function timer_seconds_; + + /// Guards access to the cache and the LRU list. + absl::Mutex mu_; + + /// The cache (a map from string key to Entry). + std::map cache_ ABSL_GUARDED_BY(mu_); + + /// The LRU list of entries. The front of the list identifies the most + /// recently accessed entry. + std::list lru_list_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace tf_gcs_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_EXPIRING_LRU_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h new file mode 100644 index 00000000..c7781f52 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h @@ -0,0 +1,117 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + ==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_FILESYSTEM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_FILESYSTEM_H_ + +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/synchronization/mutex.h" +#include "google/cloud/storage/client.h" +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h" +#include "tensorflow/c/tf_status.h" + +void ParseGCSPath(const std::string& fname, bool object_empty_ok, + std::string* bucket, std::string* object, TF_Status* status); + +namespace tf_random_access_file { +void Cleanup(TF_RandomAccessFile* file); +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status); +} // namespace tf_random_access_file + +namespace tf_writable_file { +void Cleanup(TF_WritableFile* file); +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status); +int64_t Tell(const TF_WritableFile* file, TF_Status* status); +void Flush(const TF_WritableFile* file, TF_Status* status); +void Sync(const TF_WritableFile* file, TF_Status* status); +void Close(const TF_WritableFile* file, TF_Status* status); +} // namespace tf_writable_file + +namespace tf_read_only_memory_region { +void Cleanup(TF_ReadOnlyMemoryRegion* region); +const void* Data(const TF_ReadOnlyMemoryRegion* region); +uint64_t Length(const TF_ReadOnlyMemoryRegion* region); +} // namespace tf_read_only_memory_region + +namespace tf_gcs_filesystem { +typedef struct GcsFileStat { + TF_FileStatistics base; + int64_t generation_number; +} GcsFileStat; + +typedef struct GCSFile { + google::cloud::storage::Client gcs_client; // owned + bool compose; + absl::Mutex block_cache_lock; + std::shared_ptr file_block_cache + ABSL_GUARDED_BY(block_cache_lock); + uint64_t block_size; // Reads smaller 
than block_size will trigger a read + // of block_size. + std::unique_ptr> stat_cache; + GCSFile(google::cloud::storage::Client&& gcs_client); + // This constructor is used for testing purpose only. + GCSFile(google::cloud::storage::Client&& gcs_client, bool compose, + uint64_t block_size, size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries); +} GCSFile; + +// This function is used to initialize a filesystem without the need of setting +// manually environement variables. +void InitTest(TF_Filesystem* filesystem, bool compose, uint64_t block_size, + size_t max_bytes, uint64_t max_staleness, + uint64_t stat_cache_max_age, size_t stat_cache_max_entries, + TF_Status* status); + +void Init(TF_Filesystem* filesystem, TF_Status* status); +void Cleanup(TF_Filesystem* filesystem); +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status); +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); +} // namespace tf_gcs_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_FILESYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h new file mode 100644 index 00000000..dfe182e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
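A short sketch of calling the `ParseGCSPath` helper declared at the top of gcs_filesystem.h above:

```cpp
#include <string>

#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h"
#include "tensorflow/c/tf_status.h"

// Sketch only: splits a GCS URI into its bucket and object components and
// checks the status reported by the parser.
bool SplitGcsUri() {
  std::string bucket, object;
  TF_Status* status = TF_NewStatus();
  ParseGCSPath("gs://my-bucket/models/ckpt.index", /*object_empty_ok=*/false,
               &bucket, &object, status);
  bool ok = TF_GetCode(status) == TF_OK;
  // On success: bucket == "my-bucket", object == "models/ckpt.index".
  TF_DeleteStatus(status);
  return ok;
}
```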
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ + +#include +#include +#include + +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. + TempFile(const std::string& temp_file_name, std::ios::openmode mode); + TempFile(TempFile&& rhs); + ~TempFile() override; + const std::string getName() const; + bool truncate(); + + private: + const std::string name_; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h new file mode 100644 index 00000000..7e674722 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h @@ -0,0 +1,269 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_RAM_FILE_BLOCK_CACHE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_RAM_FILE_BLOCK_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/synchronization/mutex.h" +#include "absl/synchronization/notification.h" +#include "tensorflow/c/env.h" +#include "tensorflow/c/logging.h" +#include "tensorflow/c/tf_status.h" + +namespace tf_gcs_filesystem { + +/// \brief An LRU block cache of file contents, keyed by {filename, offset}. +/// +/// This class should be shared by read-only random access files on a remote +/// filesystem (e.g. GCS). +class RamFileBlockCache { + public: + /// The callback executed when a block is not found in the cache, and needs to + /// be fetched from the backing filesystem. This callback is provided when the + /// cache is constructed. It returns total bytes read ( -1 in case of errors + /// ). The `status` should be `TF_OK` as long as the read from the remote + /// filesystem succeeded (similar to the semantics of the read(2) system + /// call). 
+ typedef std::function + BlockFetcher; + + RamFileBlockCache(size_t block_size, size_t max_bytes, uint64_t max_staleness, + BlockFetcher block_fetcher, + std::function timer_seconds = TF_NowSeconds) + : block_size_(block_size), + max_bytes_(max_bytes), + max_staleness_(max_staleness), + block_fetcher_(block_fetcher), + timer_seconds_(timer_seconds), + pruning_thread_(nullptr, + [](TF_Thread* thread) { TF_JoinThread(thread); }) { + if (max_staleness_ > 0) { + TF_ThreadOptions thread_options; + TF_DefaultThreadOptions(&thread_options); + pruning_thread_.reset( + TF_StartThread(&thread_options, "TF_prune_FBC", PruneThread, this)); + } + TF_VLog(1, "GCS file block cache is %s.\n", + (IsCacheEnabled() ? "enabled" : "disabled")); + } + + ~RamFileBlockCache() { + if (pruning_thread_) { + stop_pruning_thread_.Notify(); + // Destroying pruning_thread_ will block until Prune() receives the above + // notification and returns. + pruning_thread_.reset(); + } + } + + /// Read `n` bytes from `filename` starting at `offset` into `buffer`. It + /// returns total bytes read ( -1 in case of errors ). This method will set + /// `status` to: + /// + /// 1) The error from the remote filesystem, if the read from the remote + /// filesystem failed. + /// 2) `TF_FAILED_PRECONDITION` if the read from the remote filesystem + /// succeeded, + /// but the read returned a partial block, and the LRU cache contained a + /// block at a higher offset (indicating that the partial block should have + /// been a full block). + /// 3) `TF_OUT_OF_RANGE` if the read from the remote filesystem succeeded, but + /// the file contents do not extend past `offset` and thus nothing was + /// placed in `out`. + /// 4) `TF_OK` otherwise (i.e. the read succeeded, and at least one byte was + /// placed + /// in `buffer`). + /// + /// Caller is responsible for allocating memory for `buffer`. + /// `buffer` will be left unchanged in case of errors. + int64_t Read(const std::string& filename, size_t offset, size_t n, + char* buffer, TF_Status* status); + + // Validate the given file signature with the existing file signature in the + // cache. Returns true if the signature doesn't change or the file doesn't + // exist before. If the signature changes, update the existing signature with + // the new one and remove the file from cache. + bool ValidateAndUpdateFileSignature(const std::string& filename, + int64_t file_signature) + ABSL_LOCKS_EXCLUDED(mu_); + + /// Remove all cached blocks for `filename`. + void RemoveFile(const std::string& filename) ABSL_LOCKS_EXCLUDED(mu_); + + /// Remove all cached data. + void Flush() ABSL_LOCKS_EXCLUDED(mu_); + + /// Accessors for cache parameters. + size_t block_size() const { return block_size_; } + size_t max_bytes() const { return max_bytes_; } + uint64_t max_staleness() const { return max_staleness_; } + + /// The current size (in bytes) of the cache. + size_t CacheSize() const ABSL_LOCKS_EXCLUDED(mu_); + + // Returns true if the cache is enabled. If false, the BlockFetcher callback + // is always executed during Read. + bool IsCacheEnabled() const { return block_size_ > 0 && max_bytes_ > 0; } + + // We can not pass a lambda with capture as a function pointer to + // `TF_StartThread`, so we have to wrap `Prune` inside a static function. + static void PruneThread(void* param) { + auto ram_file_block_cache = static_cast(param); + ram_file_block_cache->Prune(); + } + + private: + /// The size of the blocks stored in the LRU cache, as well as the size of the + /// reads from the underlying filesystem. 
+ const size_t block_size_; + /// The maximum number of bytes (sum of block sizes) allowed in the LRU cache. + const size_t max_bytes_; + /// The maximum staleness of any block in the LRU cache, in seconds. + const uint64_t max_staleness_; + /// The callback to read a block from the underlying filesystem. + const BlockFetcher block_fetcher_; + /// The callback to read timestamps. + const std::function timer_seconds_; + + /// \brief The key type for the file block cache. + /// + /// The file block cache key is a {filename, offset} pair. + typedef std::pair Key; + + /// \brief The state of a block. + /// + /// A block begins in the CREATED stage. The first thread will attempt to read + /// the block from the filesystem, transitioning the state of the block to + /// FETCHING. After completing, if the read was successful the state should + /// be FINISHED. Otherwise the state should be ERROR. A subsequent read can + /// re-fetch the block if the state is ERROR. + enum class FetchState { + CREATED, + FETCHING, + FINISHED, + ERROR, + }; + + /// \brief A block of a file. + /// + /// A file block consists of the block data, the block's current position in + /// the LRU cache, the timestamp (seconds since epoch) at which the block + /// was cached, a coordination lock, and state & condition variables. + /// + /// Thread safety: + /// The iterator and timestamp fields should only be accessed while holding + /// the block-cache-wide mu_ instance variable. The state variable should only + /// be accessed while holding the Block's mu lock. The data vector should only + /// be accessed after state == FINISHED, and it should never be modified. + /// + /// In order to prevent deadlocks, never grab the block-cache-wide mu_ lock + /// AFTER grabbing any block's mu lock. It is safe to grab mu without locking + /// mu_. + struct Block { + /// The block data. + std::vector data; + /// A list iterator pointing to the block's position in the LRU list. + std::list::iterator lru_iterator; + /// A list iterator pointing to the block's position in the LRA list. + std::list::iterator lra_iterator; + /// The timestamp (seconds since epoch) at which the block was cached. + uint64_t timestamp; + /// Mutex to guard state variable + absl::Mutex mu; + /// The state of the block. + FetchState state ABSL_GUARDED_BY(mu) = FetchState::CREATED; + /// Wait on cond_var if state is FETCHING. + absl::CondVar cond_var; + }; + + /// \brief The block map type for the file block cache. + /// + /// The block map is an ordered map from Key to Block. + typedef std::map> BlockMap; + + /// Prune the cache by removing files with expired blocks. + void Prune() ABSL_LOCKS_EXCLUDED(mu_); + + bool BlockNotStale(const std::shared_ptr& block) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + /// Look up a Key in the block cache. + std::shared_ptr Lookup(const Key& key) ABSL_LOCKS_EXCLUDED(mu_); + + void MaybeFetch(const Key& key, const std::shared_ptr& block, + TF_Status* status) ABSL_LOCKS_EXCLUDED(mu_); + + /// Trim the block cache to make room for another entry. + void Trim() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + /// Update the LRU iterator for the block at `key`. + void UpdateLRU(const Key& key, const std::shared_ptr& block, + TF_Status* status) ABSL_LOCKS_EXCLUDED(mu_); + + /// Remove all blocks of a file, with mu_ already held. + void RemoveFile_Locked(const std::string& filename) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + /// Remove the block `entry` from the block map and LRU list, and update the + /// cache size accordingly. 
+ void RemoveBlock(BlockMap::iterator entry) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + /// The cache pruning thread that removes files with expired blocks. + std::unique_ptr> pruning_thread_; + + /// Notification for stopping the cache pruning thread. + absl::Notification stop_pruning_thread_; + + /// Guards access to the block map, LRU list, and cached byte count. + mutable absl::Mutex mu_; + + /// The block map (map from Key to Block). + BlockMap block_map_ ABSL_GUARDED_BY(mu_); + + /// The LRU list of block keys. The front of the list identifies the most + /// recently accessed block. + std::list lru_list_ ABSL_GUARDED_BY(mu_); + + /// The LRA (least recently added) list of block keys. The front of the list + /// identifies the most recently added block. + /// + /// Note: blocks are added to lra_list_ only after they have successfully been + /// fetched from the underlying block store. + std::list lra_list_ ABSL_GUARDED_BY(mu_); + + /// The combined number of bytes in all of the cached blocks. + size_t cache_size_ ABSL_GUARDED_BY(mu_) = 0; + + // A filename->file_signature map. + std::map file_signature_map_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace tf_gcs_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_RAM_FILE_BLOCK_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/copy_file.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/copy_file.h new file mode 100644 index 00000000..d7c2f970 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/copy_file.h @@ -0,0 +1,32 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_COPY_FILE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_COPY_FILE_H_ + +#include + +namespace tf_posix_filesystem { + +// Transfers up to `size` bytes from `dst_fd` to `src_fd`. +// +// This method uses `sendfile` if available (i.e., linux 2.6.33 or later) or an +// intermediate buffer if not. +// +// Returns number of bytes transferred or -1 on failure. +int CopyFileContents(int dst_fd, int src_fd, off_t size); + +} // namespace tf_posix_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_COPY_FILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.h new file mode 100644 index 00000000..0a444ef8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
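Pulling the pieces of ram_file_block_cache.h above together, a minimal sketch that feeds the cache from an in-memory fetcher. The fetcher's parameter list mirrors the `BlockFetcher` typedef above (its template arguments are stripped in this flattened diff); the file name and contents are illustrative.

```cpp
#include <algorithm>
#include <cstring>
#include <string>

#include "tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h"
#include "tensorflow/c/tf_status.h"

// Sketch only: serves reads from a fixed in-memory string so the caching
// behavior can be exercised without a remote filesystem.
int64_t FakeFetch(const std::string& filename, size_t offset, size_t n,
                  char* buffer, TF_Status* status) {
  static const std::string kContents = "hello, block cache";
  TF_SetStatus(status, TF_OK, "");
  if (offset >= kContents.size()) return 0;  // nothing past EOF
  size_t copied = std::min(n, kContents.size() - offset);
  std::memcpy(buffer, kContents.data() + offset, copied);
  return static_cast<int64_t>(copied);
}

void BlockCacheExample() {
  // 4-byte blocks, at most 16 cached bytes, no staleness bound (no pruning).
  tf_gcs_filesystem::RamFileBlockCache cache(
      /*block_size=*/4, /*max_bytes=*/16, /*max_staleness=*/0, FakeFetch);

  char out[8];
  TF_Status* status = TF_NewStatus();
  int64_t read = cache.Read("gs://bucket/object", /*offset=*/0, sizeof(out),
                            out, status);
  (void)read;  // 8 on success; `status` carries the codes documented above
  TF_DeleteStatus(status);
}
```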
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_H_ + +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" + +// Initialize the POSIX filesystem. +// +// In general, the `TF_InitPlugin` symbol doesn't need to be exposed in a header +// file, since the plugin registration will look for the symbol in the DSO file +// that provides the filesystem functionality. However, the POSIX filesystem +// needs to be statically registered in some tests and utilities for building +// the API files at the time of creating the pip package. Hence, we need to +// expose this function so that this filesystem can be statically registered +// when needed. +void TF_InitPlugin(TF_FilesystemPluginInfo* info); + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_helper.h b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_helper.h new file mode 100644 index 00000000..612366ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_helper.h @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_HELPER_H_ + +#include <dirent.h> +#include <sys/types.h> + +namespace tf_posix_filesystem { + +// Copies up to `size` bytes of `src` to `dst`, creating the destination if needed. +// +// Callers should pass the size of `src` in `size` and the permissions of `src` in +// `mode`. The latter is only used if `dst` needs to be created. +int TransferFileContents(const char* src, const char* dst, mode_t mode, + off_t size); + +// Returns true only if `entry` points to an entry other than `.` or `..`. +// +// This is a filter for `scandir`.
+int RemoveSpecialDirectoryEntries(const struct dirent* entry); + +} // namespace tf_posix_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_POSIX_POSIX_FILESYSTEM_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/array_grad.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/array_grad.h new file mode 100644 index 00000000..3dcf98b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/array_grad.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +GradientFunction* IdentityNRegisterer(const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/grad_test_helper.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/grad_test_helper.h new file mode 100644 index 00000000..84761f96 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/grad_test_helper.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
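A small sketch of the `scandir` filter usage described for `RemoveSpecialDirectoryEntries` above; the listing function itself is illustrative.

```cpp
#include <dirent.h>

#include <cstdlib>

#include "tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_helper.h"

// Sketch only: lists a directory's children while skipping "." and "..".
int CountChildren(const char* path) {
  struct dirent** entries = nullptr;
  int n = scandir(path, &entries,
                  tf_posix_filesystem::RemoveSpecialDirectoryEntries, alphasort);
  if (n < 0) return -1;  // errno describes the failure
  for (int i = 0; i < n; ++i) {
    // entries[i]->d_name is the name of a child of `path`.
    free(entries[i]);
  }
  free(entries);
  return n;
}
```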
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_GRAD_TEST_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_GRAD_TEST_HELPER_H_ + +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/unified_api_testutil.h" + +namespace tensorflow { +namespace gradients { +namespace internal { + +void CompareNumericalAndAutodiffGradients( + Model model, Model grad_model, AbstractContext* ctx, + absl::Span inputs, bool use_function, + double abs_error = 1e-2); + +void CheckTensorValue(AbstractTensorHandle* t, absl::Span manuals, + absl::Span dims, double abs_error = 1e-2); + +Model BuildGradModel(Model forward, GradientRegistry registry); + +} // namespace internal +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_GRAD_TEST_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/math_grad.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/math_grad.h new file mode 100644 index 00000000..e26ee899 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/math_grad.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { + +GradientFunction* AddRegisterer(const ForwardOperation& op); +GradientFunction* ExpRegisterer(const ForwardOperation& op); +GradientFunction* MatMulRegisterer(const ForwardOperation& op); +GradientFunction* SqrtRegisterer(const ForwardOperation& op); +GradientFunction* NegRegisterer(const ForwardOperation& op); +GradientFunction* SubRegisterer(const ForwardOperation& op); +GradientFunction* MulRegisterer(const ForwardOperation& op); +GradientFunction* Log1pRegisterer(const ForwardOperation& op); +GradientFunction* DivNoNanRegisterer(const ForwardOperation& op); + +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/nn_grad.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/nn_grad.h new file mode 100644 index 00000000..2a635f54 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/nn_grad.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +GradientFunction* ReluRegisterer(const ForwardOperation& op); +GradientFunction* SparseSoftmaxCrossEntropyWithLogitsRegisterer( + const ForwardOperation& op); +GradientFunction* BiasAddRegisterer(const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/not_differentiable.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/not_differentiable.h new file mode 100644 index 00000000..7167340a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/not_differentiable.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NOT_DIFFERENTIABLE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NOT_DIFFERENTIABLE_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +// Ignores `grad_outputs` and sets all entries in grad_inputs to nullptr. +class NotDifferentiableGradientFunction : public GradientFunction { + absl::Status Compute(AbstractContext* ctx, + absl::Span grad_outputs, + absl::Span grad_inputs) override; +}; +// Shorthand for registry->Register(op, new NotDifferentiableGradientFunction) +absl::Status RegisterNotDifferentiable(GradientRegistry* registry, + const string& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NOT_DIFFERENTIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_context.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_context.h new file mode 100644 index 00000000..f92c35f2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_context.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
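To show how the gradient registerers above are meant to be consumed, a sketch that fills a `GradientRegistry`; the `Register` signature is assumed from `tensorflow/c/eager/gradients.h`, and "Shape" is only an example of an op treated as not differentiable.

```cpp
#include "absl/status/status.h"
#include "tensorflow/c/eager/gradients.h"
#include "tensorflow/c/experimental/gradients/math_grad.h"
#include "tensorflow/c/experimental/gradients/nn_grad.h"
#include "tensorflow/c/experimental/gradients/not_differentiable.h"

// Sketch only: maps op names to the gradient-function factories declared above.
absl::Status RegisterExampleGradients(
    tensorflow::gradients::GradientRegistry* registry) {
  namespace g = tensorflow::gradients;
  absl::Status s = registry->Register("MatMul", g::MatMulRegisterer);
  if (!s.ok()) return s;
  s = registry->Register("Relu", g::ReluRegisterer);
  if (!s.ok()) return s;
  // Ops without a gradient go through the NotDifferentiable shorthand.
  return g::RegisterNotDifferentiable(registry, "Shape");
}
```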
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ + +#include "absl/status/status.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_function.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/experimental/gradients/tape/tape_operation.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gradients { +class TapeContext : public AbstractContext { + public: + explicit TapeContext(AbstractContext*, Tape*, const GradientRegistry&); + void Release() override; + TapeOperation* CreateOperation() override; + absl::Status RegisterFunction(AbstractFunction*) override; + absl::Status RemoveFunction(const string& func) override; + // For LLVM style RTTI. + static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kTape; + } + ~TapeContext() override; + + private: + AbstractContext* parent_ctx_; // Not owned. + Tape* tape_; + const GradientRegistry& registry_; +}; +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_operation.h b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_operation.h new file mode 100644 index 00000000..8f447440 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/gradients/tape/tape_operation.h @@ -0,0 +1,94 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gradients { +class TapeOperation : public AbstractOperation { + public: + explicit TapeOperation(AbstractOperation*, Tape*, const GradientRegistry&); + void Release() override; + absl::Status Reset(const char* op, const char* raw_device_name) override; + const string& Name() const override; + const string& DeviceName() const override; + absl::Status SetDeviceName(const char* name) override; + absl::Status AddInput(AbstractTensorHandle* input) override; + absl::Status AddInputList( + absl::Span inputs) override; + absl::Status Execute(absl::Span retvals, + int* num_retvals) override; + absl::Status SetAttrString(const char* attr_name, const char* data, + size_t length) override; + absl::Status SetAttrInt(const char* attr_name, int64_t value) override; + absl::Status SetAttrFloat(const char* attr_name, float value) override; + absl::Status SetAttrBool(const char* attr_name, bool value) override; + absl::Status SetAttrType(const char* attr_name, DataType value) override; + absl::Status SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) override; + absl::Status SetAttrFunction(const char* attr_name, + const AbstractOperation* value) override; + absl::Status SetAttrFunctionName(const char* attr_name, const char* value, + size_t length) override; + absl::Status SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) override; + absl::Status SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) override; + absl::Status SetAttrFloatList(const char* attr_name, const float* values, + int num_values) override; + absl::Status SetAttrIntList(const char* attr_name, const int64_t* values, + int num_values) override; + absl::Status SetAttrTypeList(const char* attr_name, const DataType* values, + int num_values) override; + absl::Status SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) override; + absl::Status SetAttrShapeList(const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values) override; + absl::Status SetAttrFunctionList( + const char* attr_name, + absl::Span values) override; + AbstractOperation* GetBackingOperation(); + // For LLVM style RTTI. 
+ static bool classof(const AbstractOperation* ptr) { + return ptr->getKind() == kTape; + } + ~TapeOperation() override; + + private: + AbstractOperation* parent_op_; + ForwardOperation forward_op_; + Tape* tape_; + const GradientRegistry& registry_; +}; + +} // namespace gradients +} // namespace tensorflow +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler.h b/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler.h new file mode 100644 index 00000000..0a293c66 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler.h @@ -0,0 +1,294 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_H_ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_buffer.h" +#include "tensorflow/c/tf_status.h" + +// -------------------------------------------------------------------------- +// C API for Graph. The API is under active development and eventually +// should allow registering a plugin graph optimizer with TensorFlow. +// +// Conventions: +// * Struct prefix indicates whether struct fields should be filled by the +// plugin or core implementation: +// * Struct that should be filled by the plugin: `TP_OptimizerConfigs`, +// `TP_Optimizer`, `TP_OptimizerRegistrationParams` +// * Struct that should be filled by the proper: `TF_GrapplerItem`, +// `TF_GraphProperties`, `TF_FunctionLibraryDefinition` +// * We use `struct_size` for version checking. It should be set both by +// core and the plugin. +// * For example, `TF_InitGraph` function receives +// `TP_OptimizerRegistrationParams*` as input with `struct_size` +// populated by core. The plugin is responsible for setting +// `struct_size` as well, along with all other fields. +// * Refer to "TensorFlow Versioning Strategy" section at +// https://github.com/tensorflow/community/pull/257/files. +// * Note that the API is still under active development and doesn't have +// versioning guarantees yet. +// * `void* ext` is a free-form field that can be populated by +// a plugin in `TP_*` structs or potential future extension points . +// +// Example usage: +// +// /* Sample TensorFlow code below, exact implementation might differ. */ +// // Version checking uses `struct_size`. It should be set both by core +// // and the plugin. 
+// TP_OptimizerRegistrationParams params{ +// TP_OPTIMIZER_REGISTRATION_PARAMS_STRUCT_SIZE}; +// TP_Optimizer optimizer{TP_OPTIMIZER_STRUCT_SIZE}; +// TP_OptimizerConfigs configs{TP_OPTIMIZER_CONFIGS_STRUCT_SIZE}; +// params.optimizer = &optimizer; +// params.configs = &configs; +// +// /* Plugin code below */ +// void TF_InitGraph(TP_OptimizerRegistrationParams* params, +// TF_Status* status) { +// params->struct_size = TP_OPTIMIZER_REGISTRATION_PARAMS_STRUCT_SIZE; +// params->device_type = "MY_DEVICE"; +// +// // Disable certain optimizer. +// params->optimizer_configs->struct_size = +// TP_OPTIMIZER_CONFIGS_STRUCT_SIZE; params->optimizer_configs->remapping = +// TF_TriState_Off; +// +// // Set functions to create a new optimizer. +// params->optimizer->struct_size = TP_OPTIMIZER_STRUCT_SIZE; +// params->optimizer->create_func = (My_optimizer::create_func); +// } + +#define GO_MAJOR 0 +#define GO_MINOR 0 +#define GO_PATCH 1 + +#ifdef __cplusplus +extern "C" { +#endif + +// TF_TriState is the C API typedef for tri-state. +typedef enum TF_TriState { + TF_TriState_Default = 0, + TF_TriState_Off, + TF_TriState_On, +} TF_TriState; + +// TF_GrapplerItem represents a combination of a graph, one of more fetch nodes, +// and potentially a set of nodes to feed. +typedef struct TF_GrapplerItem TF_GrapplerItem; + +// Flags indicating whether existing optimizers should be turned off. +// It's optional for plugin to set functions to return true/false. If not +// set, proper uses configuration set by user. +typedef struct TP_OptimizerConfigs { + size_t struct_size; + void* ext; // reserved for future use + TF_TriState disable_model_pruning; + TF_TriState implementation_selector; + TF_TriState function_optimization; + TF_TriState common_subgraph_elimination; + TF_TriState arithmetic_optimization; + TF_TriState debug_stripper; + TF_TriState constant_folding; + TF_TriState shape_optimization; + TF_TriState auto_mixed_precision; + TF_TriState auto_mixed_precision_onednn_bfloat16; + TF_TriState auto_mixed_precision_mkl; + TF_TriState pin_to_host_optimization; + TF_TriState layout_optimizer; + TF_TriState remapping; + TF_TriState loop_optimization; + TF_TriState dependency_optimization; + TF_TriState auto_parallel; + TF_TriState memory_optimization; + TF_TriState scoped_allocator_optimization; +} TP_OptimizerConfigs; + +#define TP_OPTIMIZER_CONFIGS_STRUCT_SIZE \ + TF_OFFSET_OF_END(TP_OptimizerConfigs, scoped_allocator_optimization) + +// Struct for Optimizer. Plugin authors must provide an optimize function. +// Creation and deletion functions are optional. +typedef struct TP_Optimizer { + size_t struct_size; + void* ext; // reserved for future use + + // [Optional] + // Create function for optimizer. + void* (*create_func)(); + + // Optimizer function for optimizer. The first param is an optimizer created + // by create_func. The second param is input graph. The third param is + // GrapplerItem. The fourth param is output graph. + void (*optimize_func)(void*, const TF_Buffer*, const TF_GrapplerItem*, + TF_Buffer*, TF_Status*); + + // [Optional] + // Destroy function for optimizer. If Create function is provided, destroy + // function is must. + void (*destroy_func)(void*); +} TP_Optimizer; + +#define TP_OPTIMIZER_STRUCT_SIZE TF_OFFSET_OF_END(TP_Optimizer, destroy_func) + +typedef struct TP_OptimizerRegistrationParams { + size_t struct_size; + void* ext; // reserved for future use + + // Graph C API version. 
+ int32_t major_version; + int32_t minor_version; + int32_t patch_version; + + // Backend device type supported by the optimizer. + const char* device_type; + TP_OptimizerConfigs* optimizer_configs; // output, set by plugin + TP_Optimizer* optimizer; // output, set by plugin +} TP_OptimizerRegistrationParams; + +#define TP_OPTIMIZER_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(TP_OptimizerRegistrationParams, optimizer) + +// TF_InitGraph is used to do graph optimizer registration. +// Plugin should implement TF_InitGraph to register graph optimizers. +void TF_InitGraph(TP_OptimizerRegistrationParams* params, TF_Status* status); + +// Get a set of node names that must be preserved. They can not be transformed +// or removed during the graph transformation. This includes feed and fetch +// nodes, keep_ops, init_ops. Fills in `num_values` and `storage_size`, they +// will be used in `TF_GetNodesToPreserveList`. +TF_CAPI_EXPORT extern void TF_GetNodesToPreserveListSize( + const TF_GrapplerItem* item, int* num_values, size_t* storage_size, + TF_Status* status); + +// Get a set of node names that must be preserved. They can not be transformed +// or removed during the graph transformation. This includes feed and fetch +// nodes, keep_ops, init_ops. Fills in `values` and `lengths`, each of which +// must point to an array of length at least `num_values`. +// +// The elements of values will point to addresses in `storage` which must be at +// least `storage_size` bytes in length. `num_values` and `storage` can be +// obtained from TF_GetNodesToPreserveSize +// +// Fails if storage_size is too small to hold the requested number of strings. +TF_CAPI_EXPORT extern void TF_GetNodesToPreserveList( + const TF_GrapplerItem* item, char** values, size_t* lengths, int num_values, + void* storage, size_t storage_size, TF_Status* status); + +// Get a set of node names for fetch nodes. Fills in `values` and `lengths`, +// they will be used in `TF_GetFetchNodesList` +TF_CAPI_EXPORT extern void TF_GetFetchNodesListSize(const TF_GrapplerItem* item, + int* num_values, + size_t* storage_size, + TF_Status* status); + +// Get a set of node names for fetch nodes. Fills in `values` and `lengths`, +// each of which must point to an array of length at least `num_values`. +// +// The elements of values will point to addresses in `storage` which must be at +// least `storage_size` bytes in length. `num_values` and `storage` can be +// obtained from TF_GetFetchNodesSize +// +// Fails if storage_size is too small to hold the requested number of strings. +TF_CAPI_EXPORT extern void TF_GetFetchNodesList(const TF_GrapplerItem* item, + char** values, size_t* lengths, + int num_values, void* storage, + size_t storage_size, + TF_Status* status); + +// Infer OpInfo::TensorProperties for graph nodes inputs/outputs. +// +// Typical use case, is to infer tensor properties from a graph, before doing +// optimization pass. Nodes modified during optimization pass have to be +// invalidated, to prevent further incorrect optimizations based on wrong shape +// and data type properties. +typedef struct TF_GraphProperties TF_GraphProperties; + +// Create GraphProperties. The item must outlive the properties. +TF_CAPI_EXPORT extern TF_GraphProperties* TF_NewGraphProperties( + const TF_GrapplerItem* item); + +// Delete GraphProperties. +TF_CAPI_EXPORT extern void TF_DeleteGraphProperties( + TF_GraphProperties* graph_properties); + +// Infer tensor shapes through abstract interpretation. 
+// If assume_valid_feeds is true, it can help infer shapes in the fanout of fed +// nodes. This may cause incorrectness in graph analyses, but is useful for +// simulation or scheduling. +// If aggressive_shape_inference is true, nodes are executed on the host to +// identify output values when possible and does other aggressive strategies. +// This may cause incorrectness in graph analyses, but is useful for simulation +// or scheduling. +// If include_input_tensor_values is true, the values of constant +// tensors will included in the input properties. +// If include_output_tensor_values is true, the values of constant tensors will +// be included in the output properties. +TF_CAPI_EXPORT extern void TF_InferStatically( + TF_GraphProperties* graph_properties, TF_Bool assume_valid_feeds, + TF_Bool aggressive_shape_inference, TF_Bool include_input_tensor_values, + TF_Bool include_output_tensor_values, TF_Status* s); + +// Get the size of input OpInfo::TensorProperties given node name. +TF_CAPI_EXPORT extern void TF_GetInputPropertiesListSize( + TF_GraphProperties* graph_properties, const char* name, int* num_values, + TF_Status* status); + +// Get the size of output OpInfo::TensorProperties given node name. +TF_CAPI_EXPORT extern void TF_GetOutputPropertiesListSize( + TF_GraphProperties* graph_properties, const char* name, int* num_values, + TF_Status* status); + +// Get a list of input OpInfo::TensorProperties given node name. +// Return the serialized list `properties`. +TF_CAPI_EXPORT extern void TF_GetInputPropertiesList( + TF_GraphProperties* graph_properties, const char* name, + TF_Buffer** properties, int num_values, TF_Status* status); + +// Get a list of output OpInfo::TensorProperties given node name. +// Return the serialized list `properties`. +TF_CAPI_EXPORT extern void TF_GetOutputPropertiesList( + TF_GraphProperties* graph_properties, const char* name, + TF_Buffer** properties, int num_values, TF_Status* status); + +// Helper to maintain a map between function names in a given +// FunctionDefLibrary and function definitions. +// Typical use case, is to look up an OpDef by type name. +typedef struct TF_FunctionLibraryDefinition TF_FunctionLibraryDefinition; + +// Create NewFunctionLibraryDefinition. +TF_CAPI_EXPORT extern TF_FunctionLibraryDefinition* +TF_NewFunctionLibraryDefinition(const TF_Buffer* graph_buf, TF_Status* status); + +// Delete NewFunctionLibraryDefinition. +TF_CAPI_EXPORT extern void TF_DeleteFunctionLibraryDefinition( + TF_FunctionLibraryDefinition* fn_lib); + +// Shorthand for calling LookUp to get the OpDef from FunctionLibraryDefinition +// given op name. The returned OpDef is represented by TF_Buffer. +TF_CAPI_EXPORT extern void TF_LookUpOpDef(TF_FunctionLibraryDefinition* fn_lib, + const char* name, TF_Buffer* buf, + TF_Status* s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler_internal.h b/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler_internal.h new file mode 100644 index 00000000..799d3bef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/grappler/grappler_internal.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
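Complementing the in-header example above (which stops at `create_func`), a sketch of the plugin side with a pass-through `optimize_func`; "MY_DEVICE" and the optimizer bodies are illustrative only.

```cpp
#include <cstdlib>
#include <cstring>

#include "tensorflow/c/experimental/grappler/grappler.h"

namespace {

// Sketch only: opaque plugin state and a no-op optimizer.
void* CreateOptimizer() { return new int(0); }
void DestroyOptimizer(void* optimizer) { delete static_cast<int*>(optimizer); }

// Copies the serialized input GraphDef to the output buffer unchanged.
void OptimizeGraph(void* optimizer, const TF_Buffer* input_graph,
                   const TF_GrapplerItem* item, TF_Buffer* output_graph,
                   TF_Status* status) {
  (void)optimizer;
  (void)item;
  void* copy = std::malloc(input_graph->length);
  std::memcpy(copy, input_graph->data, input_graph->length);
  output_graph->data = copy;
  output_graph->length = input_graph->length;
  output_graph->data_deallocator = [](void* data, size_t) { std::free(data); };
  TF_SetStatus(status, TF_OK, "");
}

}  // namespace

void TF_InitGraph(TP_OptimizerRegistrationParams* params, TF_Status* status) {
  params->struct_size = TP_OPTIMIZER_REGISTRATION_PARAMS_STRUCT_SIZE;
  params->device_type = "MY_DEVICE";  // illustrative backend name
  params->optimizer_configs->struct_size = TP_OPTIMIZER_CONFIGS_STRUCT_SIZE;
  params->optimizer->struct_size = TP_OPTIMIZER_STRUCT_SIZE;
  params->optimizer->create_func = CreateOptimizer;
  params->optimizer->optimize_func = OptimizeGraph;
  params->optimizer->destroy_func = DestroyOptimizer;
  TF_SetStatus(status, TF_OK, "");
}
```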
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Classes and utilities that work with Graph C API for internal use.
+// This includes functions used for optimizer registration and interfaces needed
+// for testing.
+
+#ifndef TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_INTERNAL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_INTERNAL_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/experimental/grappler/grappler.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Plugin initialization function that a device plugin
+// must define.
+typedef void (*TFInitGraphPluginFn)(TP_OptimizerRegistrationParams* const,
+                                    TF_Status* const);
+
+// Registers Graph optimizers.
+Status InitGraphPlugin(void* dso_handle);
+
+// Allow registering a graph optimizer using a function (used for
+// testing).
+Status InitGraphPlugin(TFInitGraphPluginFn init_fn);
+
+struct GrapplerItem;
+class Cluster;
+
+struct TFStatusDeleter {
+  void operator()(TF_Status* s) const { TF_DeleteStatus(s); }
+};
+using OwnedTFStatus = std::unique_ptr<TF_Status, TFStatusDeleter>;
+
+struct TFBufferDeleter {
+  void operator()(TF_Buffer* buf) const { TF_DeleteBuffer(buf); }
+};
+using OwnedTFBuffer = std::unique_ptr<TF_Buffer, TFBufferDeleter>;
+
+class CGraphOptimizer : public CustomGraphOptimizer {
+ public:
+  explicit CGraphOptimizer(TP_Optimizer optimizer, const char* device_type)
+      : optimizer_(optimizer), device_type_(device_type) {
+    if (optimizer.create_func != nullptr) {
+      c_optimizer_ = (*optimizer_.create_func)();
+    } else {
+      c_optimizer_ = nullptr;
+    }
+  }
+  std::string name() const override { return "PluggableGraphOptimizer"; }
+  bool UsesFunctionLibrary() const override { return false; }
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return OkStatus();
+  }
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph_def) override;
+
+  ~CGraphOptimizer() override {
+    if (optimizer_.destroy_func != nullptr) {
+      (*optimizer_.destroy_func)(c_optimizer_);
+    }
+  }
+
+ private:
+  TP_Optimizer optimizer_;
+  std::string device_type_;
+  void* c_optimizer_;
+};
+
+// Registration function to register a CGraphOptimizer along with plugin configs
+// and device type.
+void CGraphOptimizerRegister( + const PluginGraphOptimizerRegistry::Creator& creator, + const TP_OptimizerConfigs tp_configs, const char* device_type); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRAPPLER_GRAPPLER_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/c_api.h b/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/c_api.h new file mode 100644 index 00000000..036d33dc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/c_api.h @@ -0,0 +1,156 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/kernels.h" +#include "tensorflow/c/kernels_experimental.h" +#include "tensorflow/c/tf_buffer.h" +#include "tensorflow/c/tf_status.h" +#include "xla/pjrt/c/pjrt_c_api.h" + +// -------------------------------------------------------------------------- +// C API for device. The API is under active development and eventually +// should allow registering a plugin device with TensorFlow. + +#ifdef __cplusplus +extern "C" { +#endif + +// TF_Device is a C wrapper to the C++ TF Device class. This is to be passed +// through TF_OpKernelContext, and is opaque to plugin. +typedef struct TF_Device TF_Device; + +typedef struct TF_VariableInfo TF_VariableInfo; + +// Returns a `TF_Device` pointer, which actually points to a C++ `Device`. +// Currently we only allow `NextPluggableDevice` to be casted as `TF_Device`, +// but in theory every this is a C API for every kind of device. +TF_CAPI_EXPORT extern TF_Device* TF_GetDevice(TF_OpKernelContext* ctx); + +// -------------------------- Resource --------------------------------------- +// Create a `tensorflow::PluginResource` to the ResourceMgr provided by the +// `ctx`. The `tensorflow::PluginResource` wraps a resource by plugin (as a +// opaque pointer, since TensorFlow cannot parse it). `delete_func` is needed +// for ResourceMgr to clean up the resource. `status` will be set. +TF_CAPI_EXPORT extern void TF_CreatePluginResource( + TF_OpKernelContext* ctx, const char* container_name, + const char* plugin_resource_name, void* plugin_resource, + void (*delete_func)(void*), TF_Status* status); + +// If the ResourceMgr provided by the `ctx` has a resource +// `plugin_resource_name`, returns it in `*result_plugin_resource`. Otherwise, +// invokes create_func to create the resource. `delete_func` is needed for +// ResourceMgr to clean up the resource. `status` will be set. If `status` is +// not OK, `*result_plugin_resource` will be set as nullptr. +// +// Caller does not take ownership of the `plugin_resource`. 
+TF_CAPI_EXPORT extern void TF_LookupOrCreatePluginResource( + TF_OpKernelContext* ctx, const char* container_name, + const char* plugin_resource_name, void** result_plugin_resource, + void* (*create_func)(void*), void* create_func_args, + void (*delete_func)(void*), TF_Status* status); + +// ------------------------- VariableInfo ------------------------------------ +TF_CAPI_EXPORT extern TF_VariableInfo* TF_CreateVariableInfoFromContext( + TF_OpKernelContext* ctx, int index, TF_Status* status); + +TF_CAPI_EXPORT extern void TF_LockVariableInfos(TF_VariableInfo** vars, + int num_vars, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_AllocateTempForVariableInfo( + TF_OpKernelContext* ctx, TF_VariableInfo* var_info, TF_Status* status); + +TF_CAPI_EXPORT extern TF_Tensor* TF_GetTensorFromVariableInfo( + TF_VariableInfo* var_info, TF_Status* status); + +TF_CAPI_EXPORT extern void TF_DeleteVariableInfo(TF_VariableInfo* var_info); + +// --------------------- Coordination service -------------------------------- +// Returns a not owning pointer to the coordination service agent, which is +// opaque to plugin. Plugin OpKernels need to use the accompanying C APIs to +// access coordination service functionalities. +TF_CAPI_EXPORT extern TF_CoordinationServiceAgent* +TF_GetCoordinationServiceAgent(TF_OpKernelContext* ctx); + +// Returns true if the coordination service agent has been initialized. +TF_CAPI_EXPORT extern bool TF_CoordinationServiceIsInitialized( + TF_CoordinationServiceAgent* agent); + +TF_CAPI_EXPORT extern void TF_CoordinationServiceInsertKeyValue( + const char* key, int64_t key_size, const char* value, int64_t value_size, + TF_CoordinationServiceAgent* agent, TF_Status* status); + +// Obtains key-value from coordination service agent. The returned `TF_Buffer` +// is a newly allocated buffer to hold the string key-value, and caller is +// responsible for managing the lifetime. If error, `status` will be set and a +// nullptr will be returned. +TF_CAPI_EXPORT extern TF_Buffer* TF_CoordinationServiceGetKeyValue( + const char* key, int64_t key_size, TF_CoordinationServiceAgent* agent, + TF_Status* status); + +TF_CAPI_EXPORT extern TF_Buffer* TF_CoordinationServiceGetKeyValueWithTimeout( + const char* key, int64_t key_size, int64_t timeout_seconds, + TF_CoordinationServiceAgent* agent, TF_Status* status); + +TF_CAPI_EXPORT extern TF_Buffer* TF_CoordinationServiceTryGetKeyValue( + const char* key, int64_t key_size, TF_CoordinationServiceAgent* agent, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_CoordinationServiceDeleteKeyValue( + const char* key, int64_t key_size, TF_CoordinationServiceAgent* agent, + TF_Status* status); + +// ---------------------------- PJRT ----------------------------------------- +// Passes the pointer to a vector of PJRT_NamedValue and number of options to +// set options for creating a PJRT client. Passes nullptr for create_options and +// 0 for num_options if no options need to be set. You can use +// ConvertToPjRtNamedValueList in +// tensorflow/compiler/xla/pjrt/c/pjrt_c_api_helpers.h to generate the options. +TF_CAPI_EXPORT extern void TF_CreateAndSetPjRtCApiClient( + const char* device_type, TF_Status* status, PJRT_NamedValue* create_options, + int num_options); + +// Resets the PjRt client for a device. After this, `TF_GetPjRtCClient` will +// returns an error for that device. +TF_CAPI_EXPORT extern void TF_ResetPjRtCClient(const char* device_type, + TF_Status* status); + +// Gets the `PJRT_Client*` stored in TF global ResourceManager. 
+TF_CAPI_EXPORT extern PJRT_Client* TF_GetPjRtCClient(const char* device_type,
+                                                     TF_Status* status);
+
+// Gets the `PJRT_Buffer*` stored in the tensor. The status will contain error
+// if the tensor does not have a `PjRtCApiBuffer`.
+TF_CAPI_EXPORT extern PJRT_Buffer* TF_GetPjRtCBuffer(TF_Tensor* c_tensor,
+                                                     TF_Status* status);
+
+// Creates a `PjRtCApiBuffer` with the `PJRT_Buffer*` passed in and set to the
+// tensor.
+TF_CAPI_EXPORT extern void TF_CreatePjRtBuffer(TF_Tensor* c_tensor,
+                                               PJRT_Buffer* c_buffer,
+                                               const char* device_type,
+                                               TF_Status* status);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h b/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h
new file mode 100644
index 00000000..c2378b68
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h
@@ -0,0 +1,39 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_TENSOR_PJRT_BUFFER_UTIL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_TENSOR_PJRT_BUFFER_UTIL_H_
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/pjrt_c_api_client.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+absl::StatusOr<PJRT_Buffer*> GetPjRtCBufferFromTensor(const Tensor* tensor);
+
+absl::Status SetPjRtCBufferToTensor(PJRT_Buffer* c_buffer,
+                                    xla::PjRtCApiClient* c_api_client,
+                                    Tensor* tensor);
+
+absl::StatusOr<xla::PjRtCApiClient*> GetPjRtCApiClient(
+    const DeviceType& device_type);
+
+absl::Status ResetPjRtClient(const DeviceType& device_type);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_TENSOR_PJRT_BUFFER_UTIL_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/array_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/array_ops.h
new file mode 100644
index 00000000..0af99e9f
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/array_ops.h
@@ -0,0 +1,70 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
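The coordination-service getters declared in next_pluggable_device/c_api.h above return caller-owned TF_Buffer objects. A minimal sketch of the lookup-and-release pattern, assuming a valid `agent` obtained from TF_GetCoordinationServiceAgent (the key is a placeholder):

  TF_Status* status = TF_NewStatus();
  const char key[] = "worker/0/ready";  // placeholder key
  TF_Buffer* value =
      TF_CoordinationServiceGetKeyValue(key, sizeof(key) - 1, agent, status);
  if (TF_GetCode(status) == TF_OK && value != nullptr) {
    // value->data / value->length hold the stored bytes; the caller must
    // release the buffer when done.
    TF_DeleteBuffer(value);
  }
  TF_DeleteStatus(status);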
+==============================================================================*/
+
+// This file is MACHINE GENERATED! Do not edit.
+
+#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_
+#define TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_
+
+#include "absl/status/status.h"
+#include "absl/types/span.h"
+#include "tensorflow/c/eager/abstract_context.h"
+#include "tensorflow/c/eager/abstract_tensor_handle.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+namespace ops {
+
+// Return a tensor with the same shape and contents as the input tensor or
+// value.
+absl::Status Identity(AbstractContext* ctx, AbstractTensorHandle* const input,
+                      AbstractTensorHandle** output, const char* name = nullptr,
+                      const char* raw_device_name = nullptr);
+
+// Returns a list of tensors with the same shapes and contents as the input
+absl::Status IdentityN(AbstractContext* ctx,
+                       absl::Span<AbstractTensorHandle* const> input,
+                       absl::Span<AbstractTensorHandle*> output,
+                       const char* name = nullptr,
+                       const char* raw_device_name = nullptr);
+
+// Returns a tensor of zeros with the same shape and type as x.
+absl::Status ZerosLike(AbstractContext* ctx, AbstractTensorHandle* const x,
+                       AbstractTensorHandle** y, const char* name = nullptr,
+                       const char* raw_device_name = nullptr);
+
+// Returns the shape of a tensor.
+absl::Status Shape(AbstractContext* ctx, AbstractTensorHandle* const input,
+                   AbstractTensorHandle** output, DataType out_type = DT_INT32,
+                   const char* name = nullptr,
+                   const char* raw_device_name = nullptr);
+
+// Inserts a dimension of 1 into a tensor's shape.
+absl::Status ExpandDims(AbstractContext* ctx, AbstractTensorHandle* const input,
+                        AbstractTensorHandle* const dim,
+                        AbstractTensorHandle** output,
+                        const char* name = nullptr,
+                        const char* raw_device_name = nullptr);
+
+// Returns a tensor of ones with the same shape and type as x.
+absl::Status OnesLike(AbstractContext* ctx, AbstractTensorHandle* const x,
+                      AbstractTensorHandle** y, const char* name = nullptr,
+                      const char* raw_device_name = nullptr);
+
+}  // namespace ops
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/case_format.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/case_format.h
new file mode 100644
index 00000000..f8255f6a
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/case_format.h
@@ -0,0 +1,46 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CASE_FORMAT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CASE_FORMAT_H_ + +#include "tensorflow/core/platform/str_util.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +// Conversion routines between upper/lower camel/snake case formats, e.g.: +// "lowerCamelCase" +// "lower_snake_case" +// "UpperCamelCase" +// "UPPER_SNAKE_CASE" +// +// The input format is automatically detected. +// The delimiter must be specified if it is other than an underscore ('_') +// for conversion either *to* or *from* snake case. +// +// Leading and trailing delimiters are supported, e.g.: +// "__OneTwo__" (in camel case) <==> "__ONE_TWO__" (in snake case) +// +// Note: performance not yet tested. +string toLowerCamel(const string &s, const char delimiter = '_'); +string toLowerSnake(const string &s, const char delimiter = '_'); +string toUpperCamel(const string &s, const char delimiter = '_'); +string toUpperSnake(const string &s, const char delimiter = '_'); + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CASE_FORMAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/controller.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/controller.h new file mode 100644 index 00000000..e152efeb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/controller.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
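A short sketch of the case_format.h helpers above, assuming the default '_' delimiter (the inputs and expected results are illustrative, inferred from the format descriptions in the header comment):

  using tensorflow::generator::toUpperCamel;
  using tensorflow::generator::toUpperSnake;
  const std::string camel = toUpperCamel("add_v2");  // expected "AddV2"
  const std::string snake = toUpperSnake("AddV2");   // expected "ADD_V2"

The input format is detected automatically, so the same call works whether the argument arrives in camel or snake case.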
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CONTROLLER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CONTROLLER_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/common/path_config.h" +#include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include "tensorflow/c/experimental/ops/gen/model/op_spec.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_gen_lib.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +class Controller { + public: + explicit Controller(PathConfig path_config, Env* env = Env::Default()); + virtual ~Controller(); + const void WriteFile(const string& file_path, const SourceCode& code) const; + const std::vector& GetModelOps() const; + + private: + void InitializeOpApi(); + void BuildModel(); + + // Data model: Ops to generate + std::vector operators_; + + // Configuration + Env* env_; + PathConfig path_config_; + + // Initialized TensorFlow Op/API definitions + OpList op_list_; + ApiDefMap* api_def_map_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_CONTROLLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/path_config.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/path_config.h new file mode 100644 index 00000000..ce29063b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/path_config.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_PATH_CONFIG_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_PATH_CONFIG_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +struct PathConfig { + string output_path; + std::vector op_names; + std::vector api_dirs; + string tf_prefix_dir; + string tf_root_dir; + string tf_output_dir; + + explicit PathConfig() = default; + explicit PathConfig(const string &output_dir, const string &source_dir, + const string &api_dir_list, + const std::vector op_names); +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_PATH_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/source_code.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/source_code.h new file mode 100644 index 00000000..df1aa90a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/source_code.h @@ -0,0 +1,54 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_SOURCE_CODE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_SOURCE_CODE_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +class SourceCode { + public: + string Render() const; + void SetSpacesPerIndent(int spaces_per_indent) { + spaces_per_indent_ = spaces_per_indent; + } + + void AddLineWithIndent(const string &line); + void AddLineWithoutIndent(const string &line); + void AddBlankLine(); + void IncreaseIndent(); + void DecreaseIndent(); + + private: + struct Line { + int indent; + string text; + }; + + void ValidateAndAddLine(int indent_level, const string &raw_line); + + int spaces_per_indent_ = 2; + int current_indent_ = 0; + std::vector lines_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_SOURCE_CODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/view_util.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/view_util.h new file mode 100644 index 00000000..7ab437a9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/common/view_util.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_VIEW_UTIL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_VIEW_UTIL_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +string Call(const string &function, std::vector arguments); +string Call(const string &object, const string &method, + std::vector arguments, const char *oper = "->"); +string Quoted(const string &s); + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_COMMON_VIEW_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h new file mode 100644 index 00000000..0a7b08cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_CPP_GENERATOR_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_CPP_GENERATOR_H_ + +#include "tensorflow/c/experimental/ops/gen/common/controller.h" +#include "tensorflow/c/experimental/ops/gen/common/path_config.h" +#include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +class CppGenerator { + public: + explicit CppGenerator(cpp::CppConfig cpp_config, PathConfig path_config); + SourceCode HeaderFileContents() const; + SourceCode SourceFileContents() const; + string HeaderFileName() const; + string SourceFileName() const; + void WriteHeaderFile() const; + void WriteSourceFile() const; + + private: + SourceCode GenerateOneFile(cpp::RendererContext::Mode mode) const; + + Controller controller_; + cpp::CppConfig cpp_config_; + PathConfig path_config_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_CPP_GENERATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h new file mode 100644 index 00000000..fa7571d9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_CONFIG_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_CONFIG_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +struct CppConfig { + string category; + string unit; + std::vector namespaces; + + explicit CppConfig() = default; + explicit CppConfig(const string &category, + const string &name_space = "tensorflow::ops"); +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.h new file mode 100644 index 00000000..4bfc3f92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_FILE_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_FILE_RENDERER_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class CppFileRenderer : public Renderer { + public: + explicit CppFileRenderer(RendererContext context, + const std::vector &ops); + void Render(); + + private: + GuardRenderer guard_; + NamespaceRenderer name_space_; + IncludeRenderer includes_; + std::vector ops_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_CPP_FILE_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h new file mode 100644 index 00000000..a45fe89a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
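Tying the generator pieces above together, a hypothetical driver would build a PathConfig and a CppConfig and hand them to CppGenerator. This is only a sketch; every path, API directory, and op name below is a placeholder:

  using tensorflow::generator::CppGenerator;
  using tensorflow::generator::PathConfig;
  using tensorflow::generator::cpp::CppConfig;

  // Placeholder arguments: output dir, source dir, api_def dir list, op names.
  PathConfig path_config("/tmp/ops_out", "tensorflow",
                         "tensorflow/core/api_def/base_api",
                         {"Identity", "ZerosLike"});
  CppConfig cpp_config("array");  // namespace defaults to "tensorflow::ops"

  CppGenerator generator(cpp_config, path_config);
  generator.WriteHeaderFile();
  generator.WriteSourceFile();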
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_GUARD_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_GUARD_RENDERER_H_ + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class GuardRenderer : public Renderer { + public: + explicit GuardRenderer(RendererContext context); + + void Open(); + void Close(); + + private: + string guard_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_GUARD_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h new file mode 100644 index 00000000..e43715a6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_INCLUDE_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_INCLUDE_RENDERER_H_ + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class IncludeRenderer : public Renderer { + public: + explicit IncludeRenderer(RendererContext context); + + string SelfHeaderPath() const; + void SelfHeader(); + void Headers(); + + private: + void Include(const string &tf_file_path); +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_INCLUDE_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h new file mode 100644 index 00000000..fd8ccf95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_NAMESPACE_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_NAMESPACE_RENDERER_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class NamespaceRenderer : public Renderer { + public: + explicit NamespaceRenderer(RendererContext context); + + void Open(); + void Close(); +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_NAMESPACE_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h new file mode 100644 index 00000000..9131cc94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_COMMENT_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_COMMENT_RENDERER_H_ + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class OpCommentRenderer : public Renderer { + public: + explicit OpCommentRenderer(RendererContext context, OpView op); + + void Render(); + + private: + OpView op_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_COMMENT_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h new file mode 100644 index 00000000..98c3b0d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_IMPLEMENTATION_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_IMPLEMENTATION_RENDERER_H_ + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class OpImplementationRenderer : public Renderer { + public: + explicit OpImplementationRenderer(RendererContext context, OpView op); + void Render(); + + private: + void RenderInitialization(); + void RenderExecutionListOp(); + void RenderExecutionMultipleOutputs(); + void RenderExecutionZeroOutputs(); + void RenderExecutionSingleOutput(); + + OpView op_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_IMPLEMENTATION_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h new file mode 100644 index 00000000..3360e14e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h @@ -0,0 +1,44 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_RENDERER_H_ + +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class OpRenderer : public Renderer { + public: + explicit OpRenderer(RendererContext context, OpView op); + void Render(); + + private: + OpView op_; + OpCommentRenderer comment_; + + string Signature() const; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h new file mode 100644 index 00000000..b6168b19 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h @@ -0,0 +1,100 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_H_ + +#include "absl/strings/string_view.h" +#include "absl/strings/substitute.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class Renderer { + public: + explicit Renderer(RendererContext context); + + protected: + // Append a blank line. + Renderer &BlankLine(); + + // Append a line of source code, left-justified (not indented). + // Use for preprocessors directives ("#include"), namespaces, etc. + Renderer &CodeLine(const string &text); + template + Renderer CodeLine(absl::string_view text, const Args &...args) { + return CodeLine(absl::Substitute(text, args...)); + } + + // Append a multiline string of source code, left-justified (not indented). + // Note: Trims leading/trailing whitespace including newlines, making this + // method convenient for multiline raw strings. + // Newlines ('\n') are allowed/expected. 
+ Renderer &CodeLines(const string &text); + template + Renderer CodeLines(absl::string_view text, const Args &...args) { + return CodeLines(absl::Substitute(text, args...)); + } + + // Indent and append a C++ statement. + // Note: do *not* include a trailing semicolon in the statement text. + Renderer &Statement(const string &text); + template + Renderer Statement(absl::string_view text, const Args &...args) { + return Statement(absl::Substitute(text, args...)); + } + + // Indent and append a call to a TF method returning a Status to check. + // Note: do *not* include a trailing semicolon in the statement text. + Renderer &TFStatement(const string &text); + template + Renderer TFStatement(absl::string_view text, const Args &...args) { + return TFStatement(absl::Substitute(text, args...)); + } + + // Indent and append a C++ single-line style comment (using '//'). + Renderer &CommentLine(const string &text = ""); + template + Renderer CommentLine(absl::string_view text, const Args &...args) { + return CommentLine(absl::Substitute(text, args...)); + } + + // Append a line of code which starts a new block: trailing with '{') and + // indenting. + Renderer &BlockOpen(const string &text); + template + Renderer BlockOpen(absl::string_view text, const Args &...args) { + return BlockOpen(absl::Substitute(text, args...)); + } + + // Append a line of code ending a block: unindenting and adding '}'. + // Note: optional trailing text is often a comment, e.g. '// namespace xyz'. + Renderer &BlockClose(const string &text = ""); + template + Renderer BlockClose(absl::string_view text, const Args &...args) { + return BlockClose(absl::Substitute(text, args...)); + } + + protected: + RendererContext context_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h new file mode 100644 index 00000000..c0eb03e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_CONTEXT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_CONTEXT_H_ + +#include "tensorflow/c/experimental/ops/gen/common/path_config.h" +#include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +struct RendererContext { + enum Mode { kHeader = 0, kSource }; + + Mode mode; + SourceCode &code; + CppConfig cpp_config; + PathConfig path_config; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_type_view.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_type_view.h new file mode 100644 index 00000000..d071f62c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_type_view.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_TYPE_VIEW_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_TYPE_VIEW_H_ + +#include "tensorflow/c/experimental/ops/gen/model/arg_type.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class ArgTypeView { + public: + explicit ArgTypeView(ArgType arg_type); + + string TypeName() const; + + private: + ArgType arg_type_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_TYPE_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_view.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_view.h new file mode 100644 index 00000000..49085d3a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/arg_view.h @@ -0,0 +1,47 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
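Concrete renderers derive from the Renderer class declared in renderer.h above, since its helpers are protected. A minimal sketch of a hypothetical subclass emitting one stub function (the class name and the emitted function are illustrative, not part of the generator):

  class StatusStubRenderer : public tensorflow::generator::cpp::Renderer {
   public:
    explicit StatusStubRenderer(tensorflow::generator::cpp::RendererContext context)
        : Renderer(context) {}
    void Render() {
      CommentLine("Auto-generated stub.");
      BlockOpen("absl::Status Stub(AbstractContext* ctx)");
      Statement("return absl::OkStatus()");  // no trailing ';', per Statement()
      BlockClose();                          // emits the closing '}'
    }
  };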
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_VIEW_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_VIEW_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/cpp/views/arg_type_view.h" +#include "tensorflow/c/experimental/ops/gen/model/arg_spec.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class ArgView { + public: + explicit ArgView(ArgSpec arg); + + string VariableName() const; + string SetterMethod() const; + std::vector SetterArgs() const; + int Position() const; + + bool IsList() const; + + private: + ArgSpec arg_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ARG_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/attr_view.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/attr_view.h new file mode 100644 index 00000000..70149aa6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/attr_view.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ATTR_VIEW_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ATTR_VIEW_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/model/attr_spec.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class AttrView { + public: + explicit AttrView(AttrSpec attr) : attr_(attr) {} + + string VariableName() const; + string VariableType() const; + string AttrNameString() const; + string VariableStrLen() const; + string VariableSpanData() const; + string VariableSpanLen() const; + string DefaultValue() const; + string InputArg(bool with_default_value) const; + string SetterMethod() const; + std::vector SetterArgs() const; + + private: + AttrSpec attr_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_ATTR_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_argument_view.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_argument_view.h new file mode 100644 index 00000000..ff3e2b51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_argument_view.h @@ -0,0 +1,46 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_ARGUMENT_VIEW_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_ARGUMENT_VIEW_H_ + +#include "tensorflow/c/experimental/ops/gen/model/arg_spec.h" +#include "tensorflow/c/experimental/ops/gen/model/attr_spec.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class OpArgumentView { + public: + explicit OpArgumentView(ArgSpec arg); + explicit OpArgumentView(AttrSpec attr); + explicit OpArgumentView(string type, string var, string def = ""); + + string Declaration() const; + string Initializer() const; + bool HasDefaultValue() const; + + private: + string type_name_; + string variable_name_; + string default_value_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_ARGUMENT_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_view.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_view.h new file mode 100644 index 00000000..35b8858b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/cpp/views/op_view.h @@ -0,0 +1,63 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_VIEW_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_VIEW_H_ + +#include + +#include "tensorflow/c/experimental/ops/gen/cpp/views/arg_view.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/attr_view.h" +#include "tensorflow/c/experimental/ops/gen/cpp/views/op_argument_view.h" +#include "tensorflow/c/experimental/ops/gen/model/op_spec.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { +namespace cpp { + +class OpView { + public: + explicit OpView(OpSpec op); + + const std::vector &Inputs() const; + const std::vector &Outputs() const; + const std::vector &Attributes() const; + const std::vector &AllArguments() const; + + int NumInputs() const; + int NumOutputs() const; + ArgView OnlyInput() const; + ArgView OnlyOutput() const; + + string FunctionName() const; + string VariableName() const; + string OpNameString() const; + string Summary() const; + std::vector Description() const; + bool IsListOp() const; + + private: + OpSpec op_; + std::vector input_args_; + std::vector output_args_; + std::vector argument_attrs_; + std::vector all_arguments_; +}; + +} // namespace cpp +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_VIEWS_OP_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_spec.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_spec.h new file mode 100644 index 00000000..d18c0d62 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_spec.h @@ -0,0 +1,53 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_SPEC_H_ + +#include "tensorflow/c/experimental/ops/gen/model/arg_type.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +// An input or output argument to an Op. +// +// Essentially, this represents an OpDef::ArgDef and its context within the Op. 
+class ArgSpec { + public: + ArgSpec() = default; + ArgSpec(const ArgSpec& other) = default; + static ArgSpec CreateInput(const OpDef::ArgDef& arg_def, int position); + static ArgSpec CreateOutput(const OpDef::ArgDef& arg_def, int position); + + const string& name() const { return name_; } + const string& description() const { return description_; } + const ArgType arg_type() const { return arg_type_; } + const int position() const { return position_; } + + private: + explicit ArgSpec(const OpDef::ArgDef& arg_def, ArgType arg_type, + int position); + + string name_; + string description_; + ArgType arg_type_; + int position_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_type.h new file mode 100644 index 00000000..df3b9e94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/arg_type.h @@ -0,0 +1,55 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_TYPE_H_ + +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +// Type information of an Op argument (ArgSpec).. +// +// This represents the type information with OpDef::ArgDef and any type-related +// context. +class ArgType { + public: + ArgType() = default; + ArgType(const ArgType& other) = default; + static ArgType CreateInput(const OpDef::ArgDef& arg_def); + static ArgType CreateInputRef(const OpDef::ArgDef& arg_def); + static ArgType CreateOutput(const OpDef::ArgDef& arg_def); + + const tensorflow::DataType data_type() const { return data_type_; } + const string type_attr_name() const { return type_attr_name_; } + const bool is_read_only() const { return kind_ == kInput; } + const bool is_list() const { return is_list_; } + + private: + enum Kind { kInput = 0, kInputRef, kOutput }; + + explicit ArgType(const OpDef::ArgDef& arg_def, Kind kind); + + Kind kind_; + tensorflow::DataType data_type_; + string type_attr_name_; + bool is_list_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ARG_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/attr_spec.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/attr_spec.h new file mode 100644 index 00000000..8c9488bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/attr_spec.h @@ -0,0 +1,55 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ATTR_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ATTR_SPEC_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +// An attribute for an Op, such as an input/output type or for passing options. +// +// Essentially, this represents an OpDef::AttrDef and its context within the Op. +class AttrSpec { + public: + AttrSpec() = default; + AttrSpec(const AttrSpec& other) = default; + static AttrSpec Create(const OpDef::AttrDef& attr_def); + + const string& name() const { return name_; } + const string& description() const { return description_; } + const string& full_type() const { return full_type_; } + const string& base_type() const { return base_type_; } + const AttrValue& default_value() const { return default_value_; } + const bool is_list() const { return is_list_; } + + private: + explicit AttrSpec(const OpDef::AttrDef& attr_def); + + string name_; + string description_; + string full_type_; + string base_type_; + AttrValue default_value_; + bool is_list_; +}; + +} // namespace generator +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_ATTR_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/op_spec.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/op_spec.h new file mode 100644 index 00000000..986ece00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/gen/model/op_spec.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_OP_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_OP_SPEC_H_ + +#include +#include + +#include "tensorflow/c/experimental/ops/gen/model/arg_spec.h" +#include "tensorflow/c/experimental/ops/gen/model/attr_spec.h" +#include "tensorflow/core/framework/api_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace generator { + +// An Op. +// +// Essentially, this represents an OpDef and any necessary context (e.g ApiDef). 
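+// Usage sketch (illustrative, not from the upstream header): assuming the
+// generator has already looked up an OpDef `op_def` and a matching ApiDef
+// `api_def`, an OpSpec is built once and then read through its accessors:
+//
+//   OpSpec spec = OpSpec::Create(op_def, api_def);
+//   for (const ArgSpec& arg : spec.Inputs()) {
+//     // e.g. emit one generated parameter per input argument
+//   }
+//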
+class OpSpec {
+ public:
+  static OpSpec Create(const OpDef& op_def, const ApiDef& api_def);
+
+  const string& name() const { return name_; }
+  const string& summary() const { return summary_; }
+  const string& description() const { return description_; }
+  const std::vector<ArgSpec>& Inputs() const { return input_args_; }
+  const std::vector<ArgSpec>& Outputs() const { return output_args_; }
+  const std::vector<AttrSpec>& Attributes() const { return argument_attrs_; }
+
+ private:
+  explicit OpSpec(const OpDef& op_def, const ApiDef& api_def);
+
+ private:
+  string name_;
+  string summary_;
+  string description_;
+  std::vector<ArgSpec> input_args_;
+  std::vector<ArgSpec> output_args_;
+  std::vector<AttrSpec> argument_attrs_;
+  std::map<string, AttrSpec> type_attrs_;
+};
+
+}  // namespace generator
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_MODEL_OP_SPEC_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/io_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/io_ops.h
new file mode 100644
index 00000000..939c8536
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/io_ops.h
@@ -0,0 +1,50 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file is MACHINE GENERATED! Do not edit.
+
+#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_IO_OPS_H_
+#define TENSORFLOW_C_EXPERIMENTAL_OPS_IO_OPS_H_
+
+#include "absl/status/status.h"
+#include "absl/types/span.h"
+#include "tensorflow/c/eager/abstract_context.h"
+#include "tensorflow/c/eager/abstract_tensor_handle.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+namespace ops {
+
+// Restores tensors from a V2 checkpoint.
+absl::Status RestoreV2(AbstractContext* ctx, AbstractTensorHandle* const prefix,
+                       AbstractTensorHandle* const tensor_names,
+                       AbstractTensorHandle* const shape_and_slices,
+                       absl::Span<AbstractTensorHandle*> tensors,
+                       absl::Span<DataType> dtypes, const char* name = nullptr,
+                       const char* raw_device_name = nullptr);
+
+// Saves tensors in V2 checkpoint format.
+absl::Status SaveV2(AbstractContext* ctx, AbstractTensorHandle* const prefix,
+                    AbstractTensorHandle* const tensor_names,
+                    AbstractTensorHandle* const shape_and_slices,
+                    absl::Span<AbstractTensorHandle* const> tensors,
+                    const char* name = nullptr,
+                    const char* raw_device_name = nullptr);
+
+}  // namespace ops
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_OPS_IO_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/math_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/math_ops.h
new file mode 100644
index 00000000..c33c89fd
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/math_ops.h
@@ -0,0 +1,107 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file is MACHINE GENERATED! Do not edit. + +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ + +#include "absl/status/status.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace ops { + +// Returns x * y element-wise. +absl::Status Mul(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle* const y, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Returns the complex conjugate of a complex number. +absl::Status Conj(AbstractContext* ctx, AbstractTensorHandle* const input, + AbstractTensorHandle** output, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Returns x + y element-wise. +absl::Status AddV2(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle* const y, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Multiply the matrix "a" by the matrix "b". +absl::Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, + AbstractTensorHandle* const b, + AbstractTensorHandle** product, bool transpose_a = false, + bool transpose_b = false, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes numerical negative value element-wise. +absl::Status Neg(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle** y, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes the sum of elements across dimensions of a tensor. +absl::Status Sum(AbstractContext* ctx, AbstractTensorHandle* const input, + AbstractTensorHandle* const reduction_indices, + AbstractTensorHandle** output, bool keep_dims = false, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Returns x - y element-wise. +absl::Status Sub(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle* const y, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Returns x / y element-wise. +absl::Status Div(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle* const y, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Returns 0 if the denominator is zero. +absl::Status DivNoNan(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle* const y, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes exponential of x element-wise. \\(y = e^x\\). +absl::Status Exp(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle** y, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes square root of x element-wise. 
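+// Usage sketch (illustrative, not from the upstream header): every op in this
+// file follows the same calling convention, e.g. for the Sqrt declared below,
+// assuming a valid `AbstractContext* ctx` and an input handle `x`:
+//
+//   AbstractTensorHandle* y = nullptr;
+//   absl::Status s = tensorflow::ops::Sqrt(ctx, x, &y);
+//   if (!s.ok()) { /* handle the error */ }
+//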
+absl::Status Sqrt(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle** y, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes the gradient for the sqrt of `x` wrt its input. +absl::Status SqrtGrad(AbstractContext* ctx, AbstractTensorHandle* const y, + AbstractTensorHandle* const dy, AbstractTensorHandle** z, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes natural logarithm of (1 + x) element-wise. +absl::Status Log1p(AbstractContext* ctx, AbstractTensorHandle* const x, + AbstractTensorHandle** y, const char* name = nullptr, + const char* raw_device_name = nullptr); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/nn_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/nn_ops.h new file mode 100644 index 00000000..0006267f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/nn_ops.h @@ -0,0 +1,69 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file is MACHINE GENERATED! Do not edit. + +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ + +#include "absl/status/status.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace ops { + +// Computes softmax cross entropy cost and gradients to backpropagate. +absl::Status SparseSoftmaxCrossEntropyWithLogits( + AbstractContext* ctx, AbstractTensorHandle* const features, + AbstractTensorHandle* const labels, AbstractTensorHandle** loss, + AbstractTensorHandle** backprop, const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes rectified linear gradients for a Relu operation. +absl::Status ReluGrad(AbstractContext* ctx, + AbstractTensorHandle* const gradients, + AbstractTensorHandle* const features, + AbstractTensorHandle** backprops, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Computes rectified linear: `max(features, 0)`. +absl::Status Relu(AbstractContext* ctx, AbstractTensorHandle* const features, + AbstractTensorHandle** activations, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Adds `bias` to `value`. +absl::Status BiasAdd(AbstractContext* ctx, AbstractTensorHandle* const value, + AbstractTensorHandle* const bias, + AbstractTensorHandle** output, + const char* data_format = "NHWC", + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// The backward operation for "BiasAdd" on the "bias" tensor. 
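+// Usage sketch (illustrative, not from the upstream header): BiasAddGrad below
+// reduces the incoming gradient `out_backprop` over the non-bias dimensions,
+// using the same `data_format` that was passed to the forward BiasAdd:
+//
+//   AbstractTensorHandle* bias_grad = nullptr;
+//   absl::Status s = tensorflow::ops::BiasAddGrad(
+//       ctx, out_backprop, &bias_grad, /*data_format=*/"NHWC");
+//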
+absl::Status BiasAddGrad(AbstractContext* ctx, + AbstractTensorHandle* const out_backprop, + AbstractTensorHandle** output, + const char* data_format = "NHWC", + const char* name = nullptr, + const char* raw_device_name = nullptr); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/ops/resource_variable_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/resource_variable_ops.h new file mode 100644 index 00000000..02b42bf4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/ops/resource_variable_ops.h @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file is MACHINE GENERATED! Do not edit. + +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_RESOURCE_VARIABLE_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_RESOURCE_VARIABLE_OPS_H_ + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace ops { + +// Creates a handle to a Variable resource. +absl::Status VarHandleOp(AbstractContext* ctx, AbstractTensorHandle** resource, + DataType dtype, const PartialTensorShape shape, + const char* container = "", + const char* shared_name = "", + absl::Span allowed_devices = {}, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Reads the value of a variable. +absl::Status ReadVariableOp(AbstractContext* ctx, + AbstractTensorHandle* const resource, + AbstractTensorHandle** value, DataType dtype, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Assigns a new value to a variable. +absl::Status AssignVariableOp(AbstractContext* ctx, + AbstractTensorHandle* const resource, + AbstractTensorHandle* const value, + bool validate_shape = false, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +// Deletes the resource specified by the handle. 
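+// Usage sketch (illustrative, not from the upstream header): a typical
+// variable lifecycle built from the ops in this file, assuming a valid
+// `AbstractContext* ctx` and a value handle `value` of dtype DT_FLOAT:
+//
+//   tensorflow::PartialTensorShape shape;  // unknown shape, for the sketch
+//   AbstractTensorHandle* var = nullptr;
+//   absl::Status s = tensorflow::ops::VarHandleOp(ctx, &var, DT_FLOAT, shape);
+//   if (s.ok()) s = tensorflow::ops::AssignVariableOp(ctx, var, value);
+//   AbstractTensorHandle* read = nullptr;
+//   if (s.ok()) s = tensorflow::ops::ReadVariableOp(ctx, var, &read, DT_FLOAT);
+//   if (s.ok()) s = tensorflow::ops::DestroyResourceOp(ctx, var);
+//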
+absl::Status DestroyResourceOp(AbstractContext* ctx, + AbstractTensorHandle* const resource, + bool ignore_lookup_error = true, + const char* name = nullptr, + const char* raw_device_name = nullptr); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_RESOURCE_VARIABLE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.h b/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.h new file mode 100644 index 00000000..bf8dbc49 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.h @@ -0,0 +1,178 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_H_ +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_status.h" + +// C API for Pluggable Profiler. The API is under active development and +// eventually should allow registering a profiler with TensorFlow. +// +// Conventions: +// * Struct prefix indicates whether struct fields should be filled by the +// plug-in or core TensorFlow implementation: +// * TF_: Set/filled by core, unless marked otherwise. +// * TP_: Set/filled by plug-in, unless marked otherwise. +// * This prefix rule only applies to structures. Enumerations and methods +// are all prefixed with TP_. +// * Structs begin with two fields: +// * size_t struct_size: Stores the unpadded size of the struct. +// * void* ext: A reserved field that may be populated by a plugin in TP_* +// structs or potential future extension points in TF_ structs. Must be set +// to zero by default if it unused. +// * We use struct_size for version checking by both core and plug-in. +// * It is exempt from the TF/TP rule above and must be set both by core and +// plug-in. +// * It can be checked programmatically to determine which struct fields are +// available in the structure. +// * When a member is added to a struct, the struct size definition must be +// updated to use the new last member of the struct. +// +// Example usage: +// /* Sample TensorFlow code below, exact implementation might differ. */ +// // Version checking uses `struct_size`. It is exempt from the `TF/TP` rule +// // above and should be set both by core and the plugin." +// +// /* Plugin code below */ +// void profiler_start(const TP_Profiler* profiler, TF_Status* status) { +// /* Enable profiler */ +// ... +// } +// +// void profiler_stop(const TP_Profiler* profiler, TF_Status* status) { +// /* Disable Profiler */ +// ... +// } +// +// void profiler_collect_data_xspace(const TP_Profiler* profiler, uint8_t* +// buffer, size_t* size_in_bytes, TF_Status* status) { +// /* Plugin generates Xspace based on collected profiler data. 
*/ +// Xspace xspace = get_my_xspace(); +// size_t buffer_size_in_bytes = *size_in_bytes; +// *size_in_bytes = xspace.ByteSizeLong(); /* get the size of Xspace */ +// if (buffer == nullptr) { +// /* TensorFlow will first get the size of Xspace, then allocate the big +// enough buffer and pass it to the plugin for retrieving Xspace. */ +// return; +// } +// bool success = xspace.SerializeToArray(buffer, buffer_size_in_bytes); +// } +// +// void TF_InitProfiler(TF_ProfilerRegistrationParams* params, TF_Status* +// status) { +// *params = { TF_PROFILER_REGISTRATION_PARAMS_STRUCT_SIZE }; +// params->profiler->struct_size = TP_PROFILER_STRUCT_SIZE; +// params->profiler_fns->struct_size = TP_PROFILER_FNS_STRUCT_SIZE; +// +// params->profiler->type = "MyDeviceType"; +// +// params->profiler_fns->start = profiler_start; +// params->profiler_fns->stop = profiler_stop; +// params->profiler_fns->collect_data_xspace = profiler_collect_data_xspace; +// params->destroy_profiler = profiler_destroy_profiler; +// params->destroy_profiler_fns = profiler_destroy_profiler_fns; +// } + +#define TP_MAJOR 0 +#define TP_MINOR 0 +#define TP_PATCH 1 + +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// TP_Profiler holds a pointer to device type filed by the plug-in. +typedef struct TP_Profiler { + size_t struct_size; + void* ext; // free-form data set by plugin. + const char* device_type; + + // The struct size must be updated when adding new members. +#define TP_PROFILER_STRUCT_SIZE TF_OFFSET_OF_END(TP_Profiler, device_type) +} TP_Profiler; + +// -------------------------------------------------------------------------- +// TP_ProfilerFns holds the profiler interface function pointers filled by the +// plug-in. +typedef struct TP_ProfilerFns { + size_t struct_size; + + void* ext; // reserved for future use. + // Starts profiling. + void (*start)(const TP_Profiler* profiler, TF_Status* status); + // Stops profiling. + void (*stop)(const TP_Profiler* profiler, TF_Status* status); + + // Saves collected profile data into XSpace and serializes it to the buffer. + // - If `buffer` is null, returns the required buffer size in `size_in_bytes`. + // - If `buffer` is not null and `size_in_bytes` is the required buffer size, + // `buffer` is populated with profile data in serialized XSpace format. + // + // Only the first call with a non-null `buffer` following successful calls to + // start and stop might return data. Subsequent calls might return empty data + // unless start and stop are successfully called again. + void (*collect_data_xspace)(const TP_Profiler* profiler, uint8_t* buffer, + size_t* size_in_bytes, TF_Status* status); + + // The struct size must be updated when adding new members. +#define TP_PROFILER_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(TP_ProfilerFns, collect_data_xspace) +} TP_ProfilerFns; + +// TF_ProfilerRegistrationParams holds the pointers to TP_Profiler and +// TP_ProfilerFns, the memory of TP_Profiler and TP_ProfilerFns is owned by Core +// TensorFlow and populated by the plug-in. +typedef struct TF_ProfilerRegistrationParams { + size_t struct_size; + void* ext; // reserved for future use + + // TensorFlow Profiler C API version. + int32_t major_version; + int32_t minor_version; + int32_t patch_version; + + // [in/out] Memory owned by core but attributes within are populated by the + // plugin. + TP_Profiler* profiler; + // [in/out] Memory owned by core but attributes within are populated by the + // plugin. 
+ TP_ProfilerFns* profiler_fns; + // [out] Pointer to plugin's `TP_Profiler` clean up function. + // Cleans up fields inside `TP_Profiler` that were allocated + // by the plugin. `profiler` itself must not be deleted by the plugin. + void (*destroy_profiler)(TP_Profiler* profiler); + // [out] Pointer to plugin's `TP_ProfilerFns` clean up function. + // Cleans up fields inside `TP_ProfilerFns` that were allocated + // by the plugin. `profiler_fns` itself must not be deleted by the plugin. + void (*destroy_profiler_fns)(TP_ProfilerFns* profiler_fns); + + // The struct size must be updated when adding new members. +#define TF_PROFILER_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(TF_ProfilerRegistrationParams, destroy_profiler_fns) +} TF_ProfilerRegistrationParams; + +// TF_InitProfiler to do profiler registration. +// Plug-in should implement TF_InitProfiler to register the profiler. +void TF_InitProfiler(TF_ProfilerRegistrationParams* params, TF_Status* status); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h b/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h new file mode 100644 index 00000000..55af07ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_INTERNAL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_INTERNAL_H_ +#include "tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/profiler/lib/profiler_interface.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tsl/profiler/protobuf/profiler_options.pb.h" + +namespace tensorflow { +namespace profiler { + +// Plugin initialization function that a device plugin must define. Returns +// a TF_Status output specifying whether the initialization is successful. +using TFInitProfilerFn = void (*)(TF_ProfilerRegistrationParams* const, + TF_Status* const); + +// Registers plugin's profiler to TensorFlow's profiler registry. 
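+// Usage sketch (illustrative, not from the upstream header): core TensorFlow
+// is expected to resolve the plug-in's TF_InitProfiler symbol from the shared
+// library (the dlsym lookup and `plugin_lib_handle` here are assumptions of
+// the sketch) and pass it to InitPluginProfiler declared below:
+//
+//   void* sym = dlsym(plugin_lib_handle, "TF_InitProfiler");
+//   auto init_fn = reinterpret_cast<TFInitProfilerFn>(sym);
+//   absl::Status s = tensorflow::profiler::InitPluginProfiler(init_fn);
+//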
+absl::Status InitPluginProfiler(TFInitProfilerFn init_fn); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_PLUGGABLE_PROFILER_PLUGGABLE_PROFILER_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/concrete_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/concrete_function.h new file mode 100644 index 00000000..0fc60557 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" + +namespace tensorflow { + +// ConcreteFunctions correspond to an instance of a tf.function with a known set +// of inputs (either through get_concrete_function) or an input_signature. +// ConcreteFunction attempts to preserve the user-facing semantics of the +// tf.function python API and can take a limited set of types as arguments +// (to be modeled in tensorflow::Value), not just Tensors. +// +// SavedModelAPI's ConcreteFunctions' lifetimes are bound to the SavedModel they +// are loaded from, since they retain pointers to the TensorHandles owned by the +// SavedModel, and the FunctionDef of the SavedModel. +// +// Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock +// TFRT integration with TF Serving. Do not add more virtual implementations of +// this class. Eventually we want to remove this virtual base class indirection +// and have only a single implementation. +class ConcreteFunction { + public: + virtual ~ConcreteFunction() = default; + + // This method returns the "Call" Op used to execute the function. + virtual absl::Status MakeCallOp( + absl::Span inputs, + ImmediateOpPtr* out) const = 0; + + virtual const FunctionMetadata& GetFunctionMetadata() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_CONCRETE_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/function_metadata.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/function_metadata.h new file mode 100644 index 00000000..8499288f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/function_metadata.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ + +namespace tensorflow { + +class FunctionMetadata { + // TODO(bmzhao): Fill in with fields as necessary +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/restore_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/restore_ops.h new file mode 100644 index 00000000..5a0ec2bc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/restore_ops.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_RESTORE_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_RESTORE_OPS_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace internal { + +// TODO(bmzhao): Add a function to restore multiple tensors in one call. + +// Restores a single non-partioned tensorhandle of dtype `dtype`, using +// checkpoint at `prefix`, with a value stored in `checkpoint_key`. +absl::Status SingleRestore(ImmediateExecutionContext* ctx, + const std::string& prefix, + const std::string& checkpoint_key, DataType dtype, + ImmediateTensorHandlePtr* out); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_RESTORE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h new file mode 100644 index 00000000..ee01935b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H_ + +#include "absl/status/status.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace internal { + +// Executes a VarHandleOp using `ctx`, and fills `handle` with the DT_RESOURCE +// TensorHandle associated with the variable. This is equivalent to creating an +// unitialized TF2 tf.Variable. +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 +absl::Status CreateUninitializedResourceVariable( + ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + const char* raw_device_name, ImmediateTensorHandlePtr* handle); + +// Executes an AssignVariableOp using `ctx`, assigning the variable associated +// with `variable_handle` with `value`. `dtype` must be the datatype of the +// underlying variable for `variable_handle`. Note that it is illegal to assign +// a variable to a Tensor with a different dtype than what the variable was +// created with. +absl::Status AssignVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, + ImmediateExecutionTensorHandle* value); + +// Executes a ReadVariableOp using `ctx`. This reads the underlying variable +// value of `variable_handle` and copies the value to `output`. `dtype` must be +// the dtype of the variable associated with `variable_handle`. +absl::Status ReadVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateTensorHandlePtr* output); + +// Executes DestroyResourceOp on `handle`, using `ctx`. This is equivalent to +// the cleanup that occurs in a tf.Variable's EagerResourceDeleter: +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L289-L290 +absl::Status DestroyResource(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* handle); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/asset.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/asset.h new file mode 100644 index 00000000..4f4bff86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/asset.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class Asset : public TensorHandleConvertible { + public: + static absl::Status Create(ImmediateExecutionContext* ctx, + const std::string& saved_model_dir, + const std::string& asset_filename, + std::unique_ptr* output); + + // Asset is movable, but not copyable. + Asset(Asset&& other) = default; + Asset& operator=(Asset&& other) = default; + + ~Asset() override = default; + + private: + explicit Asset(ImmediateTensorHandlePtr handle); + Asset(const Asset&) = delete; + Asset& operator=(const Asset&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/constant.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/constant.h new file mode 100644 index 00000000..0d89cf37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/constant.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_CONSTANT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_CONSTANT_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// This class corresponds to python's tf.constant, which is effectively a +// TensorHandle explicitly initialized to some value. 
+// For now this doesn't do much beyond wrap Context's CreateLocalHandle method, +// and offer a subclass of TensorHandleConvertible. Note that similar to +// the python's eager mode logic, we bypass calling the "Const" op: +// https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/python/framework/constant_op.py#L301 +class Constant : public TensorHandleConvertible { + public: + static absl::Status Create(ImmediateExecutionContext* ctx, + AbstractTensorInterface* tensor, + std::unique_ptr* output); + + // RevivedConstant is movable, but not copyable. + Constant(Constant&& other) = default; + Constant& operator=(Constant&& other) = default; + + ~Constant() override = default; + + private: + explicit Constant(ImmediateTensorHandlePtr handle); + Constant(const Constant&) = delete; + Constant& operator=(const Constant&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_CONSTANT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h new file mode 100644 index 00000000..810a42ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h @@ -0,0 +1,90 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// FlatTensorFunction models a TF2 eager runtime view of a callable function, +// taking + returning flat lists of tensors, including any captures. +// Effectively, it is a thin wrapper around a FunctionDef owned by the +// EagerContext, and any TensorHandle captures associated with the function. The +// MakeCallOp method handles the logic of marshaling captures after the user +// provided inputs automatically. +// Note(bmzhao): This class is mainly intended to house low-level reusable +// function logic between SignatureDefFunction and ConcreteFunction, which +// present higher level interfaces. This type does *not* hold any "function +// metadata". 
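+// Usage sketch (illustrative, not from the upstream header): assuming a
+// FunctionDef* `fdef` from the loaded SavedModel, its captured handles
+// `captures`, and an `ImmediateExecutionContext* ctx`:
+//
+//   std::unique_ptr<FlatTensorFunction> fn;
+//   absl::Status s = FlatTensorFunction::Create(fdef, captures, ctx, &fn);
+//   ImmediateOpPtr call_op;
+//   if (s.ok()) s = fn->MakeCallOp(user_inputs, &call_op);  // captures are
+//                                                           // appended by the
+//                                                           // call itself
+//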
+class FlatTensorFunction {
+ public:
+  // Factory for creating a FlatTensorFunction.
+  //
+  // Params:
+  //  function_def - The function_def associated with the created
+  //                 FlatTensorFunction. FlatTensorFunction will register this
+  //                 function_def with `ctx` on creation, and de-register it on
+  //                 destruction. function_def must be non-null, but
+  //                 otherwise has no lifetime requirements.
+  //  captures - The captured TensorHandles associated with this
+  //             FlatTensorFunction. FlatTensorFunction will participate in
+  //             ownership of the handles (it explicitly increments the
+  //             refcount of each handle, and will decrement them on
+  //             destruction).
+  //  ctx - A handle to the Tensorflow runtime. This MUST be non-null and
+  //        outlive TFConcreteFunction.
+  //  out - The output FlatTensorFunction.
+  static absl::Status Create(
+      const FunctionDef* function_def,
+      std::vector<ImmediateExecutionTensorHandle*> captures,
+      ImmediateExecutionContext* ctx, std::unique_ptr<FlatTensorFunction>* out);
+
+  // This method creates a "Call" Op used to execute the function.
+  absl::Status MakeCallOp(absl::Span<AbstractTensorHandle* const> inputs,
+                          ImmediateOpPtr* out) const;
+
+  ~FlatTensorFunction();
+
+ private:
+  FlatTensorFunction(const std::string& name,
+                     std::vector<ImmediateTensorHandlePtr> captures,
+                     ImmediateExecutionContext* ctx);
+
+  FlatTensorFunction(const FlatTensorFunction&) = delete;
+  FlatTensorFunction& operator=(const FlatTensorFunction&) = delete;
+
+  // Name of the FunctionDef corresponding to this TFConcreteFunction
+  std::string name_;
+  std::vector<ImmediateTensorHandlePtr> captures_;
+  ImmediateExecutionContext* ctx_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h
new file mode 100644
index 00000000..07a4e185
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h
@@ -0,0 +1,65 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// Container for objects during the revival step in SavedModel's loading. +// Notably, resources and functions can be in a state where they reference +// other resources/functions that have not been constructed yet. We collect +// *all* objects in a partially valid state here, then properly initialize +// resources and functions. Implementation-wise, PartiallyRevivedObjects +// contains maps keyed by the node number of the SavedObjectGraph, and map to an +// object of the corresponding type. So, if node 2 in the object graph is a +// variable, PartiallyRevivedObjects.variables[2] exists, and corresponds to a +// tensorflow::Variable object. The only exception to this is the +// "signatures_map", which is keyed by the "signature" key +// (https://github.com/tensorflow/tensorflow/blob/372918decee7f558b3c194b04f77c20dcc679a31/tensorflow/core/protobuf/meta_graph.proto#L89), +// and maps to the SignatureDefFunction node in the SavedObjectGraph. +struct PartiallyRevivedObjects { + gtl::FlatMap> variables; + gtl::FlatMap> assets; + gtl::FlatMap> constants; + gtl::FlatMap concrete_functions; + gtl::FlatMap signature_def_functions; + gtl::FlatMap restored_resources; + gtl::FlatMap signatures_map; + + absl::Status Build(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + RevivedObjects* revived); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h new file mode 100644 index 00000000..691a591c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// RestoredResource represents a TF2 "Resource" object loaded from a savedmodel, +// analogous to the Python _RestoredResource object: +// https://github.com/tensorflow/tensorflow/blob/fda326e542ca67534e8411edb180e8760a4828b7/tensorflow/python/saved_model/load.py#L481 +// TF2 resource objects typically extend TrackableResource: +// https://github.com/tensorflow/tensorflow/blob/fda326e542ca67534e8411edb180e8760a4828b7/tensorflow/python/training/tracking/tracking.py#L285 +// and are expected to implement "_create_resource", "_initialize", and +// "_destroy_resource" functions: +// https://github.com/tensorflow/tensorflow/blob/139ba9c5284799beafdd1d7f895127cf00e7c48f/tensorflow/python/training/tracking/tracking.py#L262-L281 +class RestoredResource : TensorHandleConvertible { + public: + // Note(bmzhao): RestoredResource stores non-owning pointers to its associated + // functions because SavedModel internally owns all functions and objects in + // the RevivedObjects struct (which owns all functions). One alternative would + // be to have RevivedObjects store shared_ptr instead, and + // change RestoredResource's constructor take shared_ptr. + // To keep things simple, I've stuck to raw pointers for now. + // + // Params: + // device - The device string associated with the SavedResource + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/saved_object_graph.proto#L182 + // Conceptually, this is the same device used in CapturableResource: + // https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/python/training/tracking/tracking.py#L222-L225 + // Implementation-wise, it is device used when invoking the + // create_resource function to produce the resource_handle + // associated with the object: + // https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/python/training/tracking/tracking.py#L246-L247 + // create_resource - Non owning pointer to the create_resource function + // associated with this object. Must be NON-NULL. + // initialize - Non owning pointer to the initialize function associated with + // this object. Must be NON-NULL. + // destroy_resource - Non owning pointer to the destroy_resource function + // associated with this object. 
Ideally this should be + // NON-NULL, but in order to support models saved prior to + // https://github.com/tensorflow/tensorflow/commit/3c806101f57768e479f8646e7518bbdff1632ca3 + // we allow null here. This will, however, leak resources. + RestoredResource(const std::string& device, + TFConcreteFunction* create_resource, + TFConcreteFunction* initialize, + TFConcreteFunction* destroy_resource, + ImmediateTensorHandlePtr resource_handle); + + absl::Status Initialize() const; + + // RestoredResource is movable, but not copyable. + RestoredResource(RestoredResource&& other) = default; + RestoredResource& operator=(RestoredResource&& other) = default; + + ~RestoredResource() override; + + private: + std::string device_; + TFConcreteFunction* create_resource_; + TFConcreteFunction* initialize_; + TFConcreteFunction* destroy_resource_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h new file mode 100644 index 00000000..48d00308 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ + +#include + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" + +namespace tensorflow { + +// All "Resources" should have these 3 saved functions: +// https://github.com/tensorflow/tensorflow/blob/86dc281333d7d277ddc1882f2bca4b17e7ec40e5/tensorflow/python/training/tracking/tracking.py#L277-L281 +struct RestoredResourceRevivalState { + std::string device; + TFConcreteFunctionRevivalState* create_resource = nullptr; + TFConcreteFunctionRevivalState* initialize = nullptr; + TFConcreteFunctionRevivalState* destroy_resource = nullptr; + ImmediateTensorHandlePtr resource_handle = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h new file mode 100644 index 00000000..0f09c743 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h @@ -0,0 +1,92 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { + +// A container for revived saved model objects. +// +// Most of the objects will be revived from nodes in the object graph, and for +// those objects this container provides a map from node id to the revived +// objects. +// +// For objects that have to be revived but are not part of the object graph, +// this container provides a place where the objects can be stored so they are +// available to the runtime. 
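For illustration only, here is a minimal, hypothetical sketch of how loading code might exercise the RevivedObjectContainer declared just below. `DummyFunction` and `ExampleUsage` are stand-ins invented for this sketch (a real caller would use a revived type such as TFConcreteFunction); the only assumed API is the container's own Insert/Find contract from this header.

```cpp
// Sketch only: exercises RevivedObjectContainer's Insert/Find contract.
#include <memory>

#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h"

namespace {

struct DummyFunction {};  // Hypothetical stand-in for a revived object type.

void ExampleUsage() {
  tensorflow::RevivedObjectContainer<DummyFunction> functions;

  // An object revived from node 7 of the SavedObjectGraph.
  functions.Insert(std::make_unique<DummyFunction>(), /*node_id=*/7);

  // An object that is needed at runtime but has no node in the object graph.
  functions.Insert(std::make_unique<DummyFunction>());

  DummyFunction* by_id = functions.Find(7);     // non-null
  DummyFunction* missing = functions.Find(42);  // nullptr
  (void)by_id;
  (void)missing;
}

}  // namespace
```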
+template <typename T> +class RevivedObjectContainer { + public: + // Insert an object that is not related to a node id. This usually means the + // object was not referenced by the object_graph, but is needed by other + // objects. + void Insert(std::unique_ptr<T> object) { + objects_.push_back(std::move(object)); + } + + // Insert an object that is tied to the given object graph node id. + void Insert(std::unique_ptr<T> object, int node_id) { + objects_by_id_[node_id] = object.get(); + Insert(std::move(object)); + } + + // Find an object by the object graph node id. + // Returns nullptr if there is no such object. + T* Find(int node_id) { + auto it = objects_by_id_.find(node_id); + return it == objects_by_id_.end() ? nullptr : it->second; + } + + private: + std::vector<std::unique_ptr<T>> objects_; + absl::flat_hash_map<int, T*> objects_by_id_; +}; + +// RevivedObjects is mainly used as a container for all the "state" owned by +// SavedModel. It stores all non-"user object" nodes from a SavedModel +// (https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/core/protobuf/saved_object_graph.proto#L57-L62) +// in a "fully constructed" state. It is effectively a strongly typed map, where +// each member is a map from the node id in the SavedObjectGraph's nodes +// (https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/core/protobuf/saved_object_graph.proto#L25-L29) +// to the revived object of the corresponding type. +struct RevivedObjects { + // Order of declaration is important here: we want the RestoredResources to be + // freed after TFConcreteFunctions, for example. + gtl::FlatMap<int, std::unique_ptr<Variable>> variables; + gtl::FlatMap<int, std::unique_ptr<Asset>> assets; + gtl::FlatMap<int, std::unique_ptr<Constant>> constants; + gtl::FlatMap<int, std::unique_ptr<TFSignatureDefFunction>> + signature_def_functions; + RevivedObjectContainer<TFConcreteFunction> concrete_functions; + gtl::FlatMap<int, RestoredResource> restored_resources; + gtl::FlatMap<std::string, int> signatures_map; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h new file mode 100644 index 00000000..4c2c874e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TENSORHANDLE_CONVERTIBLE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TENSORHANDLE_CONVERTIBLE_H_ + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" + +namespace tensorflow { + +// A common interface for objects that can be converted to a TensorHandle. +// Examples of objects that implement this include Variables, Constants, Assets, +// etc.
This is used to convert captured objects into a ConcreteFunction's +// captured TensorHandles: +// https://github.com/tensorflow/tensorflow/blob/676a68963ea4b64fe479b9cede06aa8f5b290ab8/tensorflow/python/saved_model/load.py#L229-L240 +class TensorHandleConvertible { + public: + explicit TensorHandleConvertible(ImmediateTensorHandlePtr handle) + : handle_(std::move(handle)) {} + + ImmediateExecutionTensorHandle* handle() { return handle_.get(); } + + // TensorHandleConvertible is movable, but not copyable. + TensorHandleConvertible(TensorHandleConvertible&& other) = default; + TensorHandleConvertible& operator=(TensorHandleConvertible&& other) = default; + + virtual ~TensorHandleConvertible() = default; + + protected: + TensorHandleConvertible(const TensorHandleConvertible&) = delete; + TensorHandleConvertible& operator=(const TensorHandleConvertible&) = delete; + ImmediateTensorHandlePtr handle_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TENSORHANDLE_CONVERTIBLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h new file mode 100644 index 00000000..669d77b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// TF Eager Runtime-based implementation of a "ConcreteFunction" loaded from a +// saved model. +class TFConcreteFunction : public ConcreteFunction { + public: + // Factory function for creating a TFConcreteFunction. + // + // Params: + // function_def - The function_def associated with the created + // TFConcreteFunction. TFConcreteFunction will register this + // function_def with `ctx` on creation, and de-register it on + // destruction. 
function_def must be non-null, but + // otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // TFConcreteFunction. + // metadata - The FunctionMetadata associated with this TFConcreteFunction. + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFConcreteFunction. + // out - The output TFConcreteFunction. + static absl::Status Create( + const FunctionDef* function_def, + std::vector<ImmediateExecutionTensorHandle*> captures, + FunctionMetadata metadata, ImmediateExecutionContext* ctx, + std::unique_ptr<TFConcreteFunction>* out); + + // This method returns the "Call" Op used to execute the function. + absl::Status MakeCallOp(absl::Span<AbstractTensorHandle* const> inputs, + ImmediateOpPtr* out) const override; + + const FunctionMetadata& GetFunctionMetadata() const override; + + ~TFConcreteFunction() override = default; + + private: + TFConcreteFunction(std::unique_ptr<FlatTensorFunction> func, + FunctionMetadata metadata); + + TFConcreteFunction(const TFConcreteFunction&) = delete; + TFConcreteFunction& operator=(const TFConcreteFunction&) = delete; + + std::unique_ptr<FlatTensorFunction> func_; + FunctionMetadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h new file mode 100644 index 00000000..3dd7a6ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// TFConcreteFunctionRevivalState wraps the state needed for building a +// TF_ConcreteFunction. This is mainly used in PartiallyRevivedObjects, which +// wraps partially constructed Function and Resource objects. +struct TFConcreteFunctionRevivalState { + // Index of the node in the SavedObjectGraph it was loaded from. + int node_id; + + // Pointer to the original functiondef. fdef_ is guaranteed to be + // non-null.
+ const FunctionDef* fdef; + + // TensorHandle captures for this function + std::vector<ImmediateExecutionTensorHandle*> captures; + + // SavedConcreteFunction contains much of the metadata of the expected "types" + // of the inputs and outputs of a function. + // Note(bmzhao): saved_concrete_func_ is guaranteed to be non-null. + const SavedConcreteFunction* saved_concrete_func; + + // This field is only present on TF2 ConcreteFunctions, and is useful for + // determining the original argument *names* of the function (since the + // "canonicalized_input_signature" may append extra uniquifying integers). + // However, SavedBareConcreteFunctions do not have a FunctionSpec. + // Note(bmzhao): if function_spec_.has_value(), *function_spec_ is guaranteed + // to be non-null. + absl::optional<const FunctionSpec*> function_spec; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h new file mode 100644 index 00000000..c9b98189 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// This is the TF eager runtime implementation of SignatureDefFunction (separate
The user-facing API of SignatureDefFunctions +// and their semantic differences from ConcreteFunction are described here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/cc/saved_model/experimental/public/signature_def_function.h#L30-L59 +// Additional implementation notes are available here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/c/experimental/saved_model/core/signature_def_function.h#L31-L48 +class TFSignatureDefFunction : public SignatureDefFunction { + public: + // Factory function for creating a TFSignatureDefFunction. + // + // Params: + // function_def - The function_def associated with the created + // TFSignatureDefFunction. TFSignatureDefFunction will + // register this function_def with `ctx` on creation, and + // de-register it on destruction. function_def must be + // non-null, but otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // TFConcreteFunction. + // metadata - FunctionMetadata associated with this TFSignatureDefFunction. + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFSignatureDefFunction. + // out - The output TFSignatureDefFunction. + static absl::Status Create( + const FunctionDef* function_def, + std::vector captures, + SignatureDefFunctionMetadata metadata, ImmediateExecutionContext* ctx, + std::unique_ptr* out); + + // This method creates a "Call" Op used to execute the function. + absl::Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const override; + + const SignatureDefFunctionMetadata& GetFunctionMetadata() const override; + + ~TFSignatureDefFunction() override = default; + + private: + TFSignatureDefFunction(std::unique_ptr func, + SignatureDefFunctionMetadata metadata); + + TFSignatureDefFunction(const TFSignatureDefFunction&) = delete; + TFSignatureDefFunction& operator=(const TFSignatureDefFunction&) = delete; + + std::unique_ptr func_; + SignatureDefFunctionMetadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h new file mode 100644 index 00000000..ac1b20e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// TFSignatureDefFunctionRevivalState wraps the state needed for building a +// SignatureDefFunction. This is mainly used in PartiallyRevivedObjects, which +// wraps partially constructed Function and Resource objects. +struct TFSignatureDefFunctionRevivalState { + // Index of the node in the SavedObjectGraph it was loaded from. + int node_id = 0; + + // Pointer to the original functiondef. fdef_ is guaranteed to be + // non-null. + const FunctionDef* fdef = nullptr; + + // SavedConcreteFunction contains much of the metadata of the expected "types" + // of the inputs and outputs of a function. + // Note(bmzhao): saved_concrete_func_ is guaranteed to be non-null. + const SavedConcreteFunction* saved_concrete_func = nullptr; + + // The name of the SignatureDef key. + std::string signature_key; + + // TensorHandle captures for this function + std::vector<ImmediateExecutionTensorHandle*> captures; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/variable.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/variable.h new file mode 100644 index 00000000..5a9ad51a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/revived_types/variable.h @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_VARIABLE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_VARIABLE_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +class Variable : public TensorHandleConvertible { + public: + // Creates an uninitialized resource variable. Note that a caller must + // call "assign" to associate a value with the variable. + static absl::Status CreateUninitialized( + ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + absl::optional name, const char* raw_device_name, + const std::vector& component_devices, + std::unique_ptr* output); + + // The dtype of the underlying variable. + DataType dtype(); + + // The shape of the underlying variable. + TensorShape shape(); + + // Updates the variable's contents with `handle`. + absl::Status Assign(ImmediateExecutionTensorHandle* handle); + + // Reads the value of the variable, and stores it in `out` + absl::Status ReadValue(ImmediateTensorHandlePtr* out); + + // Variable is movable, but not copyable. + Variable(Variable&& other) = default; + Variable& operator=(Variable&& other) = default; + + ~Variable() override; + + private: + Variable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + absl::optional name, ImmediateTensorHandlePtr handle); + Variable(const Variable& variable) = delete; + Variable& operator=(const Variable&) = delete; + + std::string name_; + DataType dtype_; + TensorShape shape_; + + // ctx_ must outlive Variable. + ImmediateExecutionContext* ctx_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_VARIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_api.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_api.h new file mode 100644 index 00000000..1fd56822 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_api.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/cc/saved_model/bundle_v2.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock +// TFRT integration with TF Serving. Do not add more virtual implementations of +// this class. Eventually we want to remove this virtual base class indirection +// and have only a single implementation. +class SavedModelAPI { + public: + // Retrieve a function from the TF2 SavedModel, using the "path" to a function + // in a TF2 savedmodel. + // + // Note: `function` is a double pointer, so that implementations are + // able to return a pointer to an internal member. + virtual absl::Status GetFunction(const std::string& function_path, + ConcreteFunction** function) = 0; + + // Retrieve a list of child functions from a SavedModel given a starting node. + // 0 is the root node. + virtual absl::Status GetFunctions( + int node_id, + absl::flat_hash_map* functions) = 0; + + // Retrieve a SignatureDefFunction from a SavedModel, using the key of the + // SignatureDef map: + // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 + virtual absl::Status GetSignatureDefFunction( + const std::string& signature_def_key, + SignatureDefFunction** function) = 0; + + virtual SavedModelV2Bundle* GetBundle() = 0; + + virtual ~SavedModelAPI() = default; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_utils.h new file mode 100644 index 00000000..9a6108db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/saved_model_utils.h @@ -0,0 +1,120 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ + +// Some internal utility functions for the SavedModelAPI, factored out into a +// separately unit-testable header. 
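As a usage sketch for the SavedModelAPI interface above (assumptions: the TFSavedModelAPI::Load factory and ConcreteFunction::MakeCallOp signatures that appear later in this diff, an already-initialized ImmediateExecutionContext, and a hypothetical function path "my_function"), a client might look roughly like this:

```cpp
#include <memory>
#include <string>

#include "absl/status/status.h"
#include "absl/types/optional.h"
#include "tensorflow/c/eager/immediate_execution_context.h"
#include "tensorflow/c/eager/immediate_execution_operation.h"
#include "tensorflow/c/experimental/saved_model/core/concrete_function.h"
#include "tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h"

namespace {

// Loads a SavedModel from `dir` and builds (but does not run) a call op for
// one of its ConcreteFunctions. "my_function" is a hypothetical object-graph
// path; real models expose their own function names.
absl::Status LoadAndPrepareCall(tensorflow::ImmediateExecutionContext* ctx,
                                const std::string& dir) {
  std::unique_ptr<tensorflow::TFSavedModelAPI> saved_model;
  absl::Status status = tensorflow::TFSavedModelAPI::Load(
      dir, /*tags=*/absl::nullopt, ctx, &saved_model);
  if (!status.ok()) return status;

  tensorflow::ConcreteFunction* fn = nullptr;
  status = saved_model->GetFunction("my_function", &fn);
  if (!status.ok()) return status;

  // MakeCallOp assembles the eager "call" operation for the function;
  // executing the op (and feeding inputs) is runtime-specific and omitted.
  tensorflow::ImmediateOpPtr call_op;
  return fn->MakeCallOp(/*inputs=*/{}, &call_op);
}

}  // namespace
```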
+ +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { +namespace internal { + +// Load a TensorProto into a tensorflow::Constant. This is similar to the +// constant loading logic in python: +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/saved_model/load.py#L437 +absl::Status TensorProtoToConstant(ImmediateExecutionContext* ctx, + const TensorProto& proto, + std::unique_ptr* output); + +// Creates a tensorflow::Variable from a SavedVariable. This is similar to the +// logic in: +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/saved_model/load.py#L407 +// Note that the caller **must assign a value** to the loaded variable. +absl::Status LoadSavedVariable(ImmediateExecutionContext* ctx, + const SavedVariable& variable, + std::unique_ptr* output); + +absl::Status LoadSavedAsset(ImmediateExecutionContext* ctx, + const SavedAsset& asset, + const std::string& saved_model_dir, + absl::Span assets, + std::unique_ptr* output); + +// Creates a TFConcreteFunction from a SavedConcreteFunction. +absl::Status LoadTFConcreteFunction( + const SavedConcreteFunction& saved_concrete_function, + const FunctionDef* function_def, + const std::unordered_map>& + captured_objects, + ImmediateExecutionContext* ctx, std::unique_ptr* out); + +// Flattens `signature` into a vector of TensorSpecProto pointers back into +// `signature`. `signature` must outlive flattened_specs. `signature` must also +// be the input or output signature of a SavedConcreteFunction (i.e. "nested +// structures of tensorspecs"). +absl::Status FlattenSignature( + const StructuredValue& signature, + std::vector* flattened_specs); + +// Find the node id in `object_graph` at location `path`. `path` must be +// a dot-delimited string of object names relative to the root object. If no +// object is found, returns absl::nullopt. +absl::optional FindNodeAtPath(absl::string_view path, + const SavedObjectGraph& object_graph); + +// Maps each node in `graphdef` to its corresponding Attribute Map. +// Callers must ensure that `graphdef` outlives the returned map. +gtl::FlatMap +NodeToAttrMap(const tensorflow::GraphDef& graphdef); + +// Maps the name of each FunctionDef in `library` to its corresponding +// FunctionDef. Callers must ensure `library` outlives the returned map. 
+gtl::FlatMap +FunctionNameToFunctionDefMap(const FunctionDefLibrary& library); + +// Finds the "signatures" object in the object graph, and fills a mapping of +// each signature's name to the corresponding function's node in the object +// graph. +absl::Status GetSignaturesMap(const SavedObjectGraph& saved_objects, + gtl::FlatMap* signatures_map); + +// Validates the `saved_function`. +absl::Status ValidateSingleConcreteFunction( + const SavedFunction& saved_function); + +// Walks through the SavedObjectGraph in metagraph, and restores all nodes +// (except "UserDefinedObjects") with their corresponding type in +// "PartiallyRevivedObjects". +absl::Status PartiallyReviveSavedModelObjects( + const MetaGraphDef& metagraph, ImmediateExecutionContext* context, + const std::string& directory, PartiallyRevivedObjects* objects); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function.h new file mode 100644 index 00000000..71e6a432 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +namespace tensorflow { + +// See tensorflow/cc/experimental/saved_model/public/signature_def_function.h +// for SignatureDefFunction's intended user-facing semantics. +// This class is the "implementation" C++ part of the C++/C/C++ sandwich for +// a SignatureDefFunction. +// Note(bmzhao): Implementation-wise, SignatureDefFunctions are always saved as +// a "BareConcreteFunction", w/o a FunctionSpec, rather than a SavedFunction: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/saved_object_graph.proto#L60 +// Additionally they are guaranteed to be children of the .signatures attribute +// of the root object, where the child object "name" is the signature_def key: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/python/saved_model/signature_serialization.py#L181-L230 +// One of the critical requirements of SignatureDef functions is that their +// inputs and outputs are "named". For example, a `.signatures` function: +// a. 
Requires users to pass: kwargs of all inputs: +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L119-L126 +// b. Returns a dictionary of named outputs. +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L153-L161 +// Since SignatureDefFunctions do not have FunctionSpecs, but guarantee the +// dictionary of inputs/outputs, we can parse these dictionaries' keys to obtain +// the input/output names of the SignatureDef: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/meta_graph.proto#L318-L321 +class SignatureDefFunction { + public: + virtual ~SignatureDefFunction() = default; + + // Creates a "Call" Op used to execute the function. + virtual absl::Status MakeCallOp( + absl::Span inputs, + ImmediateOpPtr* out) const = 0; + + virtual const SignatureDefFunctionMetadata& GetFunctionMetadata() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h new file mode 100644 index 00000000..e9cc0b11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#include +#include + +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +// SignatureDefParam represents a named Tensor input or output to a +// SignatureDefFunction. 
+class SignatureDefParam { + public: + SignatureDefParam(std::string name, TensorSpec spec); + + const std::string& name() const; + + const TensorSpec& spec() const; + + private: + std::string name_; + TensorSpec spec_; +}; + +class SignatureDefFunctionMetadata { + public: + SignatureDefFunctionMetadata() = default; + SignatureDefFunctionMetadata(std::vector arguments, + std::vector returns); + + const std::vector& arguments() const; + const std::vector& returns() const; + + private: + std::vector arguments_; + std::vector returns_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tensor_spec.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tensor_spec.h new file mode 100644 index 00000000..dcdff890 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tensor_spec.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +// Note(bmzhao): TensorSpec deliberately does not store the "name" from a +// TensorSpecProto. From edloper@, "Names should really be associated with +// parameters, not the tensors inside those parameters. This would be +// inconsistent with the corresponding Python class, but I don't think that's +// necessarily a problem. If it turns out later that we really need a name +// attribute here, we can always add it back in; but let's see how far we can +// get without it." +class TensorSpec { + public: + // Constructs a scalar, DT_FLOAT TensorSpec + TensorSpec(); + + TensorSpec(PartialTensorShape shape, DataType dtype); + + explicit TensorSpec(const TensorSpecProto& proto); + + const PartialTensorShape& shape() const; + DataType dtype() const; + + private: + PartialTensorShape shape_; + DataType dtype_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/test_utils.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/test_utils.h new file mode 100644 index 00000000..f3e6548d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/test_utils.h @@ -0,0 +1,79 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TEST_UTILS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TEST_UTILS_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace testing { + +// Creates a DeviceMgr suitable for local tests. +std::unique_ptr<DeviceMgr> CreateTestingDeviceMgr(); + +// Creates an EagerContext suitable for local tests. Does not take ownership +// of `device_mgr`. +EagerContextPtr CreateTestingEagerContext(DeviceMgr* device_mgr); + +// Converts a tensorflow::DataTypeSet to std::vector<DataType>. +// This is useful for tests using GTest's ::testing::ValuesIn, since +// DataTypeSet doesn't fulfill all the constraints of an STL-like iterable. +std::vector<DataType> DataTypeSetToVector(DataTypeSet set); + +// Returns a vector of shapes intended to be "interesting" test cases. +// Currently, this returns scalar, 1D vector, 2D matrix, and 4D tensor shapes. +std::vector<std::vector<int64_t>> InterestingShapes(); + +// Returns a TensorHandle of `dtype` and `shape`, filled with `value`. +// `dtype` must be an integer dtype, float, or double. +// If a TensorHandle cannot be created successfully, this function will +// CHECK fail. This should only be used for testing purposes. +ImmediateTensorHandlePtr CreateTensorHandle(ImmediateExecutionContext* ctx, + DataType dtype, + absl::Span<const int64_t> shape, + int8_t value); + +// Fills a numeric tensor's buffer with `value`. +// dtype must be any integer dtype, float, or double. +void FillNumericTensorBuffer(DataType dtype, size_t num_elements, void* buffer, + int8_t value); + +// Checks that the underlying data is equal for the buffers of two numeric +// tensors. +// Note: The caller must check that the dtypes and sizes of the underlying +// buffers are the same before calling this. +// dtype must be any integer dtype, float, or double. +void CheckBufferDataIsEqual(DataType dtype, int64_t num_elements, void* a, + void* b); + +// Converts a TensorHandle to a Tensor, and dies if unsuccessful. This should +// only be used for testing purposes.
+AbstractTensorPtr TensorHandleToTensor(ImmediateExecutionTensorHandle* handle); + +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.h new file mode 100644 index 00000000..e3fbfefe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_CONCRETE_FUNCTION_TEST_PROTOS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_CONCRETE_FUNCTION_TEST_PROTOS_H_ + +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { +namespace testing { + +// Returns a StructuredValue corresponding to the serialized InputSignature of a +// tf.function with 0 inputs +StructuredValue ZeroArgInputSignature(); + +// Returns a StructuredValue corresponding to the serialized InputSignature of a +// tf.function with 1 input +StructuredValue SingleArgInputSignature(); + +// Returns a StructuredValue corresponding to the serialized InputSignature of a +// tf.function with 3 inputs +StructuredValue ThreeArgInputSignature(); + +// Returns a StructuredValue corresponding to the serialized OutputSignature of +// a tf.function with no return values +StructuredValue ZeroReturnOutputSignature(); + +// Returns a StructuredValue corresponding to the serialized OutputSignature of +// a tf.function with a single tensor output +StructuredValue SingleReturnOutputSignature(); + +// Returns a StructuredValue corresponding to the serialized OutputSignature of +// a tf.function with three tensor outputs +StructuredValue ThreeReturnOutputSignature(); + +} // namespace testing +} // namespace tensorflow +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_CONCRETE_FUNCTION_TEST_PROTOS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h new file mode 100644 index 00000000..17c71258 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_API_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/cc/saved_model/bundle_v2.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// An implementation of the SavedModelAPI using the TF Eager runtime. See +// https://github.com/tensorflow/community/blob/master/rfcs/20200218-tf-c-saved-model.md +// Conceptually, there are many differences between a tf.function and +// a FunctionDef is executed by the C API. +// 1. A tf.function is polymorphic, meaning it can correspond to multiple +// ConcreteFunctions (of differing shapes, python arguments, etc). A +// FunctionDef corresponds to a single ConcreteFunction. +// 2. A tf.function can take arbitrary python inputs, whereas the FunctionDef +// only accepts tensors. +// 3. A tf.function is a closure that can contain captured inputs, whereas +// FunctionDefs loaded from SavedModels are "functional" (all inputs are +// explicitly passed as arguments). +// The SavedModelAPI only supports loading tf.functions annotated with input +// signatures so that we ensure that there is a 1:1 mapping between tf.function +// -> FunctionDef, and have a guarantee that all inputs are tensors. 
+// (https://github.com/tensorflow/tensorflow/blob/2b96f3662bd776e277f86997659e61046b56c315/tensorflow/python/eager/def_function.py#L1167-L1171), +class TFSavedModelAPI : public SavedModelAPI { + public: + absl::Status GetFunction(const std::string& function_path, + ConcreteFunction** function) override; + + absl::Status GetFunctions( + int node_id, + absl::flat_hash_map* functions) override; + + absl::Status GetSignatureDefFunction( + const std::string& signature_def_key, + SignatureDefFunction** function) override; + + static absl::Status Load( + const std::string& directory, + const absl::optional>& tags, + ImmediateExecutionContext* context, + std::unique_ptr* out); + + ~TFSavedModelAPI() override = default; + + absl::Status GetVariable(const std::string& variable_path, + Variable** variable); + + SavedModelV2Bundle* GetBundle() override; + + private: + TFSavedModelAPI(const std::string& directory, SavedModelV2Bundle bundle, + RevivedObjects revived_objects); + + std::string directory_; + SavedModelV2Bundle bundle_; + RevivedObjects revived_objects_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SAVED_MODEL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h new file mode 100644 index 00000000..66e0a8f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +struct TF_ConcreteFunctionList { + std::vector list; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h new file mode 100644 index 00000000..bc36b0c6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/concrete_function_type.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/concrete_function.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +// It doesn't make sense to wrap tensorflow::ConcreteFunction* in a separate +// struct, since the lifetime of the struct and the raw pointer it wraps would +// be different. Therefore TF_ConcreteFunction* = tensorflow::ConcreteFunction*. +typedef struct TF_ConcreteFunction TF_ConcreteFunction; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ConcreteFunction, TF_ConcreteFunction) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h new file mode 100644 index 00000000..40f05f91 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/function_metadata_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/function_metadata.h" + +typedef struct TF_FunctionMetadata TF_FunctionMetadata; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::FunctionMetadata, TF_FunctionMetadata) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_FUNCTION_METADATA_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h new file mode 100644 index 00000000..380c3703 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" + +// Internal structures used by the SavedModel C API. These are likely to change +// and should not be depended on. + +typedef struct TF_SavedModel TF_SavedModel; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SavedModelAPI, TF_SavedModel) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h new file mode 100644 index 00000000..fa6d0f65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunctionMetadata, + TF_SignatureDefFunctionMetadata) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h new file mode 100644 index 00000000..ca44dc43 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" + +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunction, + TF_SignatureDefFunction) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h new file mode 100644 index 00000000..6f535110 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefParamList TF_SignatureDefParamList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(std::vector, + TF_SignatureDefParamList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h new file mode 100644 index 00000000..fd634bcd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefParam TF_SignatureDefParam; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefParam, TF_SignatureDefParam) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h new file mode 100644 index 00000000..7284c8a8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" + +typedef struct TF_TensorSpec TF_TensorSpec; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::TensorSpec, TF_TensorSpec) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h new file mode 100644 index 00000000..68f1ece2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ + +// IWYU pragma: begin_exports +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" +// IWYU pragma: end_exports + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function.h new file mode 100644 index 00000000..ff8a2459 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a Function loaded from a SavedModel. +// TODO(bmzhao): Work together w/srbs@ to make sure this composes w/the +// C++ Unified Eager/Graph API's AbstractFunction +typedef struct TF_ConcreteFunction TF_ConcreteFunction; + +// Returns FunctionMetadata associated with `func`. Metadata's lifetime is +// bound to `func`, which is bound to the TF_SavedModel it was loaded from. +TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( + TF_ConcreteFunction* func); + +// Returns a TFE_Op suitable for executing this function. Caller must provide +// all function inputs in `inputs`, and must not add any additional inputs on +// the returned op. (i.e. don't call TFE_OpAddInput or TFE_OpAddInputList). +// The caller is responsible for deleting the returned TFE_Op. If op +// construction fails, `status` will be non-OK and the returned pointer will be +// null. 
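+//
+// Example usage (an illustrative sketch, not a normative recipe; it assumes
+// the TFE_* eager C API from tensorflow/c/eager/c_api.h and that `func`,
+// `inputs`, and `num_inputs` were obtained elsewhere):
+//
+//   TF_Status* status = TF_NewStatus();
+//   TFE_Op* call_op =
+//       TF_ConcreteFunctionMakeCallOp(func, inputs, num_inputs, status);
+//   if (TF_GetCode(status) == TF_OK) {
+//     TFE_TensorHandle* retvals[1] = {nullptr};  // size to the function's arity
+//     int num_retvals = 1;
+//     // Do not call TFE_OpAddInput here; all inputs were already provided.
+//     TFE_Execute(call_op, retvals, &num_retvals, status);
+//     TFE_DeleteOp(call_op);  // the caller owns the returned op
+//   }
+//   TF_DeleteStatus(status);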
+// TODO(bmzhao): Remove this function in a subsequent change; Design + implement +// a Function Execution interface for ConcreteFunction that accepts a tagged +// union of types (tensorflow::Value). This effectively requires moving much of +// the implementation of function.py/def_function.py to C++, and exposing a +// high-level API here. A strawman for what this interface could look like: +// TF_Value* TF_ExecuteFunction(TFE_Context*, TF_ConcreteFunction*, TF_Value* +// inputs, int num_inputs, TF_Status* status); +TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionMakeCallOp( + TF_ConcreteFunction* func, TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function_list.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function_list.h new file mode 100644 index 00000000..e3554675 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/concrete_function_list.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that is acts like a list of TF_ConcreteFunction pointers. +typedef struct TF_ConcreteFunctionList TF_ConcreteFunctionList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_ConcreteFunctionListSize( + TF_ConcreteFunctionList* list); + +// Returns the `i`th TF_ConcreteFunction in the list. +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_ConcreteFunctionListGet( + TF_ConcreteFunctionList* list, int i); + +// Deletes `list`. +TF_CAPI_EXPORT extern void TF_DeleteConcreteFunctionList( + TF_ConcreteFunctionList* list); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/function_metadata.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/function_metadata.h new file mode 100644 index 00000000..83ca3c73 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/function_metadata.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ + +#include "tensorflow/c/c_api_macros.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type used to store any metadata associated with a function. +typedef struct TF_FunctionMetadata TF_FunctionMetadata; + +// TODO(bmzhao): Add getters for fields as we determine what metadata +// we want to expose. + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/saved_model_api.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/saved_model_api.h new file mode 100644 index 00000000..cef7fe86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/saved_model_api.h @@ -0,0 +1,107 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/c/tf_status.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type representing a Tensorflow "SavedModel" +// (https://www.tensorflow.org/guide/saved_model) that we always pass by pointer +// to achieve ABI stability. +typedef struct TF_SavedModel TF_SavedModel; + +// Load a SavedModel from `dirname`. We expect the SavedModel to contain a +// single Metagraph (as for those exported from TF2's `tf.saved_model.save`). +// +// Params: +// dirname - A directory filepath that the SavedModel is at. +// ctx - A TFE_Context containing optional load/TF runtime options. +// `ctx` must outlive the returned TF_SavedModel pointer. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a newly created +// TF_SavedModel instance. It must be deleted by calling TF_DeleteSavedModel. 
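+//
+// Example usage (an illustrative sketch; it assumes the TFE_* eager C API and
+// a hypothetical "/path/to/saved_model" directory and function path):
+//
+//   TF_Status* status = TF_NewStatus();
+//   TFE_ContextOptions* opts = TFE_NewContextOptions();
+//   TFE_Context* ctx = TFE_NewContext(opts, status);
+//   TFE_DeleteContextOptions(opts);
+//   TF_SavedModel* model =
+//       TF_LoadSavedModel("/path/to/saved_model", ctx, status);
+//   if (TF_GetCode(status) == TF_OK) {
+//     TF_ConcreteFunction* fn = TF_GetSavedModelConcreteFunction(
+//         model, "my_module.my_fn", status);
+//     // ... use `fn` only while `model` is alive ...
+//     TF_DeleteSavedModel(model);
+//   }
+//   TFE_DeleteContext(ctx);
+//   TF_DeleteStatus(status);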
+TF_CAPI_EXPORT extern TF_SavedModel* TF_LoadSavedModel(const char* dirname, + TFE_Context* ctx, + TF_Status* status); + +// Load a SavedModel from `dirname`. +// +// Params: +// dirname - A directory filepath that the SavedModel is at. +// ctx - A TFE_Context containing optional load/TF runtime options. +// `ctx` must outlive the returned TF_SavedModel pointer. +// tags - char* array of SavedModel tags. We will load the metagraph matching +// the tags. +// tags_len - number of elements in the `tags` array. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a newly created +// TF_SavedModel instance. It must be deleted by calling TF_DeleteSavedModel. +TF_CAPI_EXPORT extern TF_SavedModel* TF_LoadSavedModelWithTags( + const char* dirname, TFE_Context* ctx, const char* const* tags, + int tags_len, TF_Status* status); + +// Deletes a TF_SavedModel, and frees any resources owned by it. +TF_CAPI_EXPORT extern void TF_DeleteSavedModel(TF_SavedModel* model); + +// Retrieve a function from the TF2 SavedModel via function path. +// +// Params: +// model - The TF2 SavedModel to load a function from. +// function_path - A string containing the path from the root saved python +// object to a tf.function method. +// TODO(bmzhao): Add a detailed example of this with a +// python tf.module before moving this out of experimental. +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a +// TF_ConcreteFunction instance. The lifetime of this instance is +// "conceptually" bound to `model`. Once `model` is deleted, all +// `TF_ConcreteFunctions` retrieved from it are invalid, and have been deleted. +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelConcreteFunction( + TF_SavedModel* model, const char* function_path, TF_Status* status); + +// Retrieve a function from the TF SavedModel via a SignatureDef key. +// +// Params: +// model - The SavedModel to load a function from. +// signature_def_key - The string key of the SignatureDef map of a SavedModel: +// https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 +// status - Set to OK on success and an appropriate error on failure. +// Returns: +// If status is not OK, returns nullptr. Otherwise, returns a +// TF_SignatureDefFunction instance. Once `model` is deleted, all +// `TF_SignatureDefFunctions` retrieved from it are invalid, and have been +// deleted. +TF_CAPI_EXPORT extern TF_SignatureDefFunction* +TF_GetSavedModelSignatureDefFunction(TF_SavedModel* model, + const char* signature_def_key, + TF_Status* status); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function.h new file mode 100644 index 00000000..16471fdc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +// Returns FunctionMetadata associated with `func`. Metadata's lifetime is +// bound to `func`, which is bound to the TF_SavedModel it was loaded from. +TF_CAPI_EXPORT extern TF_SignatureDefFunctionMetadata* +TF_SignatureDefFunctionGetMetadata(TF_SignatureDefFunction* func); + +// Returns a TFE_Op suitable for executing this function. Caller must provide +// all function inputs in `inputs`, and must not add any additional inputs on +// the returned op. (i.e. don't call TFE_OpAddInput or TFE_OpAddInputList). +// The caller is responsible for deleting the returned TFE_Op. If op +// construction fails, `status` will be non-OK and the returned pointer will be +// null. +TF_CAPI_EXPORT extern TFE_Op* TF_SignatureDefFunctionMakeCallOp( + TF_SignatureDefFunction* func, TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h new file mode 100644 index 00000000..b7a7f67e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +// Retrieves the arguments of the SignatureDefFunction. The caller is not +// responsible for freeing the returned pointer. +TF_CAPI_EXPORT extern const TF_SignatureDefParamList* +TF_SignatureDefFunctionMetadataArgs( + const TF_SignatureDefFunctionMetadata* list); + +// Retrieves the returns of the SignatureDefFunction. The caller is not +// responsible for freeing the returned pointer. +TF_CAPI_EXPORT extern const TF_SignatureDefParamList* +TF_SignatureDefFunctionMetadataReturns( + const TF_SignatureDefFunctionMetadata* list); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param.h new file mode 100644 index 00000000..82993d7f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that containing metadata of an input/output of a +// TF_SignatureDefFunction loaded from a SavedModel. +typedef struct TF_SignatureDefParam TF_SignatureDefParam; + +// Returns the name of the given parameter. The caller is not responsible for +// freeing the returned char*. +TF_CAPI_EXPORT extern const char* TF_SignatureDefParamName( + const TF_SignatureDefParam* param); + +// Returns the TensorSpec associated with the given parameter. The caller is +// not reponsible for freeing the returned TF_TensorSpec*. 
+TF_CAPI_EXPORT extern const TF_TensorSpec* TF_SignatureDefParamTensorSpec( + const TF_SignatureDefParam* param); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h new file mode 100644 index 00000000..0cb3a0d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that containing metadata of an input/output of a +// ConcreteFunction loaded from a SavedModel. +typedef struct TF_SignatureDefParamList TF_SignatureDefParamList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_SignatureDefParamListSize( + const TF_SignatureDefParamList* list); + +// Returns the `i`th TF_SignatureDefParam in the list. +TF_CAPI_EXPORT extern const TF_SignatureDefParam* TF_SignatureDefParamListGet( + const TF_SignatureDefParamList* list, int i); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/tensor_spec.h b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/tensor_spec.h new file mode 100644 index 00000000..82972ef7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/saved_model/public/tensor_spec.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_shape.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type corresponding to TensorSpec +typedef struct TF_TensorSpec TF_TensorSpec; + +// Returns the dtype associated with the TensorSpec. +TF_CAPI_EXPORT extern TF_DataType TF_TensorSpecDataType( + const TF_TensorSpec* spec); + +// Returns the shape associated with the TensorSpec. The returned Shape is not +// owned by the caller. Caller must not call TF_DeleteShape on the returned +// shape. +TF_CAPI_EXPORT extern const TF_Shape* TF_TensorSpecShape( + const TF_TensorSpec* spec); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor.h b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor.h new file mode 100644 index 00000000..eebbae6c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor.h @@ -0,0 +1,536 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_status.h" + +// -------------------------------------------------------------------------- +// C API for StreamExecutor. The API is under active development and eventually +// should allow registering a pluggable device with TensorFlow. +// +// Conventions: +// * Struct prefix indicates whether struct fields should be filled by the +// plugin or core implementation: +// * SE_ : set/filled by core unless explicitly marked otherwise. +// * SP_ : set/filled by plugin unless explicitly marked otherwise. +// * We use `struct_size` for version checking. It is exempt from the `SE/SP` +// rule above and should be set both by core and the plugin. +// * For example, `create_device` function receives `SP_Device*` as input +// with `struct_size` populated by core. The plugin is responsible for +// setting `struct_size` as well, along with all other fields. +// * Refer to "TensorFlow Versioning Strategy" section at +// https://github.com/tensorflow/community/pull/257/files. +// * Note that the API is still under active development and doesn't have +// versioning guarantees yet. +// * `void* ext` is a free-form field that can be populated by +// a plugin in `SP_*` structs or potential future extension points in `SE_` +// structs. 
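+//
+// A concrete illustration of the `struct_size` convention (a sketch, assuming
+// TF_OFFSET_OF_END(Type, member) from tensorflow/c/c_api_macros.h evaluates to
+// the offset of `member` plus its size):
+//
+//   // Plugin-side check before reading a field that a (possibly older) core
+//   // runtime might not have populated.
+//   void use_device(const SP_Device* device) {
+//     if (device->struct_size >= SP_DEVICE_STRUCT_SIZE) {
+//       // Every field up to and including `pci_bus_id` was filled in
+//       // (individual optional fields may still be null).
+//     }
+//   }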
+// +// Example usage: +// +// /* Sample TensorFlow code below, exact implementation might differ. */ +// // Version checking uses `struct_size`. It is exempt from the `SE/SP` rule +// // above and should be set both by core and the plugin." +// SP_Device device { SP_DEVICE_STRUCT_SIZE }; +// SE_CreateDeviceParams params { SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE } ; +// params.device = &device; +// +// /* Plugin code below */ +// constexpr char DEVICE_NAME[] = "MY_DEVICE"; +// constexpr char DEVICE_TYPE[] = "GPU"; +// +// void create_device(const SP_Platform* platform, +// SE_CreateDeviceParams* params, TF_Status* status) { +// // Custom actions based on TensorFlow's view of SP_Device. +// OnTFDeviceView(params->device->struct_size); +// params->device = { SP_DEVICE_STRUCT_SIZE }; +// params->device->device_handle = get_my_device_handle(device->ordinal); +// params->device->ordinal = params->ordinal; +// ... +// } +// +// void destroy_device(const SP_Platform* platform, SP_Device* device) { +// delete_my_device_handle(device->device_handle); +// } +// +// void SE_InitPlugin( +// SE_PlatformRegistrationParams* params, +// TF_Status* status) { +// params->platform = { SP_PLATFORM_STRUCT_SIZE }; +// // Values such as `name` and `type` must outlive SE_InitPlugin call. +// params->platform->name = DEVICE_NAME; +// params->platform->type = DEVICE_TYPE; +// params->platform_fns->get_device_count = get_device_count; +// params->platform_fns->create_device = create_device; +// params->platform_fns->destroy_device = destroy_device; +// ... +// } + +#define SE_MAJOR 0 +#define SE_MINOR 0 +#define SE_PATCH 1 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct SP_Stream_st* SP_Stream; +typedef struct SP_Event_st* SP_Event; +typedef struct SP_Timer_st* SP_Timer; +// Takes `callback_arg` passed to `host_callback` as the first argument. +typedef void (*SE_StatusCallbackFn)(void* const, TF_Status* const); + +typedef struct SP_TimerFns { + size_t struct_size; + void* ext; // reserved for future use + uint64_t (*nanoseconds)(SP_Timer timer); +} SP_TimerFns; + +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, nanoseconds) + +typedef struct SP_AllocatorStats { + size_t struct_size; + int64_t num_allocs; + int64_t bytes_in_use; + int64_t peak_bytes_in_use; + int64_t largest_alloc_size; + + int8_t has_bytes_limit; + int64_t bytes_limit; + + int64_t bytes_reserved; + int64_t peak_bytes_reserved; + + int8_t has_bytes_reservable_limit; + int64_t bytes_reservable_limit; + + int64_t largest_free_block_bytes; +} SP_AllocatorStats; + +#define SP_ALLOCATORSTATS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) + +// Potential states for an SP_Event. If `poll_for_status` returns anything aside +// from kPending or kComplete, an error has occurred; kUnknown is a bad state. +typedef enum SE_EventStatus { + SE_EVENT_UNKNOWN, + SE_EVENT_ERROR, + SE_EVENT_PENDING, + SE_EVENT_COMPLETE, +} SE_EventStatus; + +// Memory allocation information. +// This matches DeviceMemoryBase defined here: +// https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/compiler/xla/stream_executor/device_memory.h;l=57 +typedef struct SP_DeviceMemoryBase { + size_t struct_size; + void* ext; // Reserved for future use + // Platform-dependent value representing allocated memory. + // Note that the pointer does not have to be to the virtual address itself. + void* opaque; + uint64_t size; // Size in bytes of this allocation. 
+ uint64_t payload; // Value for plugin's use +} SP_DeviceMemoryBase; + +#define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_DeviceMemoryBase, payload) + +typedef struct SP_Device { + size_t struct_size; + void* ext; // free-form data set by plugin + int32_t ordinal; // device index + + // Device vendor can store handle to their device representation + // here. + void* device_handle; + + // [Optional] + // Device hardware name. Used for printing. + // Must be null-terminated. + const char* hardware_name; + + // [Optional] + // Device vendor name. Used for printing. + // Must be null-terminated. + const char* device_vendor; + + // [Optional] + // Returns the PCI bus identifier for this device, of the form + // [domain]:[bus]:[device].[function] + // where domain number is usually 0000. + // Example: 0000:00:02.1 + // For more information see: + // https://en.wikipedia.org/wiki/PCI_configuration_space + // https://www.oreilly.com/library/view/linux-device-drivers/0596005903/ch12.html + // Used for printing. Must be null-terminated. + const char* pci_bus_id; +} SP_Device; + +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, pci_bus_id) + +typedef struct SE_CreateDeviceParams { + size_t struct_size; + void* ext; // reserved for future use + int32_t ordinal; // device index + + SP_Device* device; // Input/output, struct_size set by TF for plugin to read. + // Subsequently plugin fills the entire struct. +} SE_CreateDeviceParams; + +#define SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceParams, device) + +typedef struct SP_DeviceFns { + size_t struct_size; + void* ext; // reserved for future use + + // [Optional] + // Returns the NUMA node associated with this device, for use in + // determining socket locality. If the NUMA node could not be determined, -1 + // is returned. + // Negative values are treated as "unset". + int32_t (*get_numa_node)(const SP_Device* device); + + // [Optional] + // Device's memory bandwidth in bytes/sec. (This is for reads/writes to/from + // the device's own memory, not for transfers between the host and device.) + // Negative values are treated as "unset". + int64_t (*get_memory_bandwidth)(const SP_Device* device); + + // [Optional] + // Estimate of average number of floating point operations per second for + // this device * 10e-9. + // Negative values are treated as "unset". + double (*get_gflops)(const SP_Device* device); +} SP_DeviceFns; + +#define SP_DEVICE_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_DeviceFns, get_gflops) + +typedef struct SE_CreateDeviceFnsParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_DeviceFns* device_fns; // output, to be filled by plugin +} SE_CreateDeviceFnsParams; + +#define SE_CREATE_DEVICE_FNS_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceFnsParams, device_fns) + +typedef struct SP_StreamExecutor { + size_t struct_size; + void* ext; // reserved for future use + + /*** ALLOCATION CALLBACKS ***/ + // Synchronously allocates `size` bytes on the underlying platform and returns + // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, + // nullptr is returned. + // `memory_space` is reserved for a potential future usage and should be set + // to 0. + void (*allocate)(const SP_Device* device, uint64_t size, int64_t memory_space, + SP_DeviceMemoryBase* mem); + + // Deallocate the device memory previously allocated via this interface. + // Deallocation of a nullptr-representative value is permitted. 
+ void (*deallocate)(const SP_Device* device, SP_DeviceMemoryBase* memory); + + // Allocates a region of host memory and registers it with the platform API. + // Memory allocated in this manner is required for use in asynchronous memcpy + // operations, such as `memcpy_dtoh`. + void* (*host_memory_allocate)(const SP_Device* device, uint64_t size); + + // Deallocates a region of host memory allocated by `host_memory_allocate`. + void (*host_memory_deallocate)(const SP_Device* device, void* mem); + + // Allocates unified memory space of the given size, if supported. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void* (*unified_memory_allocate)(const SP_Device* device, uint64_t bytes); + + // Deallocates unified memory space previously allocated with + // `unified_memory_allocate`. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void (*unified_memory_deallocate)(const SP_Device* device, void* location); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + SP_AllocatorStats* stats); + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, int64_t* free, + int64_t* total); + + /*** STREAM CALLBACKS ***/ + // Creates SP_Stream. This call should also allocate stream + // resources on the underlying platform and initializes its + // internals. + void (*create_stream)(const SP_Device* device, SP_Stream* stream, + TF_Status* status); + + // Destroys SP_Stream and deallocates any underlying resources. + void (*destroy_stream)(const SP_Device* device, SP_Stream stream); + + // Causes `dependent` to not begin execution until `other` has finished its + // last-enqueued work. + void (*create_stream_dependency)(const SP_Device* device, SP_Stream dependent, + SP_Stream other, TF_Status* status); + + // Without blocking the device, retrieve the current stream status. + void (*get_stream_status)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + + /*** EVENT CALLBACKS ***/ + // Create SP_Event. Performs platform-specific allocation and initialization + // of an event. + void (*create_event)(const SP_Device* device, SP_Event* event, + TF_Status* status); + + // Destroy SE_Event and perform any platform-specific deallocation and + // cleanup of an event. + void (*destroy_event)(const SP_Device* device, SP_Event event); + + // Requests the current status of the event from the underlying platform. + SE_EventStatus (*get_event_status)(const SP_Device* device, SP_Event event); + // Inserts the specified event at the end of the specified stream. + void (*record_event)(const SP_Device* device, SP_Stream stream, + SP_Event event, TF_Status* status); + + // Wait for the specified event at the end of the specified stream. + void (*wait_for_event)(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status); + + /*** TIMER CALLBACKS ***/ + // Creates SP_Timer. Allocates timer resources on the underlying platform + // and initializes its internals, setting `timer` output variable. Sets + // values in `timer_fns` struct. 
+ void (*create_timer)(const SP_Device* device, SP_Timer* timer, + TF_Status* status); + + // Destroy timer and deallocates timer resources on the underlying platform. + void (*destroy_timer)(const SP_Device* device, SP_Timer timer); + + // Records a start event for an interval timer. + void (*start_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + // Records a stop event for an interval timer. + void (*stop_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + /*** MEMCPY CALLBACKS ***/ + // Enqueues a memcpy operation onto stream, with a host destination location + // `host_dst` and a device memory source, with target size `size`. + void (*memcpy_dtoh)(const SP_Device* device, SP_Stream stream, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a host memory source, with target size `size`. + void (*memcpy_htod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, const void* host_src, + uint64_t size, TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a device memory source, with target size `size`. + void (*memcpy_dtod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the device source to the host destination. + void (*sync_memcpy_dtoh)(const SP_Device* device, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the host source to the device destination. + void (*sync_memcpy_htod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is copied from the + // device source to the device destination. + void (*sync_memcpy_dtod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Causes the host code to synchronously wait for the event to complete. + void (*block_host_for_event)(const SP_Device* device, SP_Event event, + TF_Status* status); + + // [Optional] + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous device + // operations enqueued on the stream before this program point. + // If not set, then corresponding functionality will be implemented + // by registering an event on the `stream` and waiting for it using + // `block_host_for_event`. + void (*block_host_until_done)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + + // Synchronizes all activity occurring in the StreamExecutor's context (most + // likely a whole device). + void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); + + // Zero out `size` bytes starting at the location. + void (*mem_zero)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* location, uint64_t size, + TF_Status* status); + + // Set the 8-bit patterns starting at the location with `size` bytes. 
+ void (*memset)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* location, uint8_t pattern, uint64_t size, + TF_Status* status); + + // Set the 32-bit patterns starting at the location with `size` bytes. + void (*memset32)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* location, uint32_t pattern, + uint64_t size, TF_Status* status); + + // Enqueues on a stream a user-specified function to be run on the host. + // `callback_arg` should be passed as the first argument to `callback_fn`. + TF_Bool (*host_callback)(const SP_Device* device, SP_Stream stream, + SE_StatusCallbackFn callback_fn, void* callback_arg); +} SP_StreamExecutor; + +#define SP_STREAMEXECUTOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) + +typedef struct SE_CreateStreamExecutorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_StreamExecutor* stream_executor; // output, to be filled by plugin +} SE_CreateStreamExecutorParams; + +#define SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateStreamExecutorParams, stream_executor) + +typedef struct SP_Platform { + size_t struct_size; + + void* ext; // free-form data set by plugin + + // Platform name (also referred to as subtype), for example MY_DEVICE. + // The name must start with a capital letter and consist of + // capital letters and underscores. + // Must be null-terminated. + const char* name; + + // Device type name, for example GPU. Must be null-terminated. + // The name must start with a capital letter and consist of + // capital letters and underscores. + const char* type; + + // Whether this platform supports unified memory. + // Unified memory is a single memory address space accessible from any device. + TF_Bool supports_unified_memory; + + // Whether to wrap allocator for this device with an allocator that uses BFC + // (best-fit with coalescing) strategy. + TF_Bool use_bfc_allocator; + + // Whether to force the memory allocations to grow over time instead of + // allocating it all at once. When this is set to true, the value of + // allow_growth is ignored. + TF_Bool force_memory_growth; +} SP_Platform; + +#define SP_PLATFORM_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_Platform, force_memory_growth) + +typedef struct SP_PlatformFns { + size_t struct_size; + + void* ext; // reserved for future use + + // Callbacks for getting device count + void (*get_device_count)(const SP_Platform* platform, int* device_count, + TF_Status* status); + // Callbacks for creating/destroying SP_Device. + void (*create_device)(const SP_Platform* platform, + SE_CreateDeviceParams* params, TF_Status* status); + + // Clean up fields inside SP_Device that were allocated + // by the plugin. `device` itself should not be deleted here. + void (*destroy_device)(const SP_Platform* platform, SP_Device* device); + + // Callbacks for creating/destroying SP_DeviceFns. + void (*create_device_fns)(const SP_Platform* platform, + SE_CreateDeviceFnsParams* params, + TF_Status* status); + + // Clean up fields inside SP_DeviceFns that were allocated + // by the plugin. `device_fns` itself should not be deleted here. + void (*destroy_device_fns)(const SP_Platform* platform, + SP_DeviceFns* device_fns); + + // Callbacks for creating/destroying SP_StreamExecutor. + void (*create_stream_executor)(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, + TF_Status* status); + // Clean up fields inside SP_StreamExecutor that were allocated + // by the plugin. 
`stream_executor` itself should not be deleted here. + void (*destroy_stream_executor)(const SP_Platform* platform, + SP_StreamExecutor* stream_executor); + + // Callbacks for creating/destroying SP_TimerFns. + void (*create_timer_fns)(const SP_Platform* platform, SP_TimerFns* timer, + TF_Status* status); + + void (*destroy_timer_fns)(const SP_Platform* platform, + SP_TimerFns* timer_fns); +} SP_PlatformFns; + +#define SP_PLATFORM_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_PlatformFns, destroy_timer_fns) + +typedef struct SE_PlatformRegistrationParams { + size_t struct_size; + void* ext; // reserved for future use + + // StreamExecutor C API version. + int32_t major_version; + int32_t minor_version; + int32_t patch_version; + + SP_Platform* platform; // output, set by plugin + SP_PlatformFns* platform_fns; // output, set by plugin + // Clean up fields inside SP_Platform that were allocated + // by the plugin. `platform` itself should not be deleted here. + void (*destroy_platform)(SP_Platform* platform); // out, set by plugin + void (*destroy_platform_fns)( + SP_PlatformFns* platform_fns); // out, set by plugin +} SE_PlatformRegistrationParams; + +#define SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform_fns) + +void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_internal.h new file mode 100644 index 00000000..b8217ea3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_internal.h @@ -0,0 +1,336 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Classes and utilities that work with StreamExecutor C API for internal use. +// This includes functions used for device registration and interfaces needed +// for testing. 
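+//
+// Example (an illustrative sketch; `MyTestInitPlugin` is a hypothetical plugin
+// entry point with the SE_InitPlugin signature declared in stream_executor.h):
+//
+//   void MyTestInitPlugin(SE_PlatformRegistrationParams* params,
+//                         TF_Status* status) {
+//     // Fill params->platform and params->platform_fns here, following the
+//     // SE_InitPlugin example in stream_executor.h.
+//   }
+//
+//   std::string device_type, platform_name;
+//   absl::Status s = stream_executor::InitStreamExecutorPlugin(
+//       &MyTestInitPlugin, &device_type, &platform_name);
+//   // On success, `device_type` and `platform_name` name the registered
+//   // platform.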
+#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+
+#include
+#include
+#include
+#include
+
+#include "absl/functional/any_invocable.h"
+#include "absl/status/status.h"
+#include "tensorflow/c/experimental/stream_executor/stream_executor.h"
+#include "tensorflow/c/tf_status.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/event.h"
+#include "xla/stream_executor/executor_cache.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_common.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace stream_executor {
+
+// Plugin initialization function that a device plugin
+// must define.
+typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams* const,
+                               TF_Status* const);
+
+// Registers StreamExecutor platform. `device_type` and `platform_name` are
+// output parameters.
+absl::Status InitStreamExecutorPlugin(void* dso_handle,
+                                      std::string* device_type,
+                                      std::string* platform_name);
+
+// Allow registering a StreamExecutor plugin using a function (used for
+// testing).
+absl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
+                                      std::string* device_type,
+                                      std::string* platform_name);
+
+// Converts DeviceMemoryBase to a C struct.
+inline SP_DeviceMemoryBase DeviceMemoryBaseToC(const DeviceMemoryBase* mem) {
+  SP_DeviceMemoryBase device_memory_base{SP_DEVICE_MEMORY_BASE_STRUCT_SIZE};
+  // `opaque` field inside SP_DeviceMemoryBase is not const.
+  // Therefore, we need to cast away the constness before setting it.
+  device_memory_base.opaque = const_cast<void*>(mem->opaque());
+  device_memory_base.size = mem->size();
+  device_memory_base.payload = mem->payload();
+  return device_memory_base;
+}
+
+// This file implements core stream executor base classes in terms of
+// the C API defined in stream_executor.h. A class "CSomething" represents a
+// "Something" that can be manipulated via calls in the C interface.
+class CPlatform : public Platform {
+ public:
+  explicit CPlatform(SP_Platform platform,
+                     void (*destroy_platform)(SP_Platform*),
+                     SP_PlatformFns platform_fns,
+                     void (*destroy_platform_fns)(SP_PlatformFns*),
+                     SP_DeviceFns device_fns, SP_StreamExecutor stream_executor,
+                     SP_TimerFns timer_fns);
+  ~CPlatform() override;
+
+  Id id() const override { return const_cast<int*>(&plugin_id_value_); }
+  const std::string& Name() const override { return name_; }
+  int VisibleDeviceCount() const override {
+    int visible_device_count = 0;
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    platform_fns_.get_device_count(&platform_, &visible_device_count,
+                                   c_status.get());
+    if (TF_GetCode(c_status.get()) != TF_OK) {
+      LOG(ERROR) << TF_Message(c_status.get());
+      return 0;
+    }
+    return visible_device_count;
+  }
+  bool UseBfcAllocator() const { return platform_.use_bfc_allocator; }
+  bool ForceMemoryGrowth() const { return platform_.force_memory_growth; }
+  absl::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+  absl::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+  absl::StatusOr<StreamExecutor*> FindExisting(int ordinal) override;
+
+ private:
+  // Returns a device constructed with the ordinal without
+  // looking in or storing to the Platform's executor cache.
+  // Ownership IS transferred to the caller.
+  absl::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      int ordinal);
+
+  SP_Platform platform_;
+  void (*destroy_platform_)(SP_Platform*);
+  SP_PlatformFns platform_fns_;
+  void (*destroy_platform_fns_)(SP_PlatformFns*);
+  SP_DeviceFns device_fns_;
+  SP_StreamExecutor stream_executor_;
+  SP_TimerFns timer_fns_;
+  const std::string name_;
+  int plugin_id_value_;
+  stream_executor::ExecutorCache executor_cache_;
+};
+
+class CEvent : public Event {
+ public:
+  CEvent(SP_Device* device, SP_StreamExecutor* stream_executor)
+      : device_(device),
+        stream_executor_(stream_executor),
+        event_handle_(nullptr) {}
+  ~CEvent() override { Destroy(); }
+
+  Event::Status PollForStatus() override {
+    SE_EventStatus event_status =
+        stream_executor_->get_event_status(device_, event_handle_);
+
+    switch (event_status) {
+      case SE_EVENT_ERROR:
+        return Event::Status::kError;
+      case SE_EVENT_PENDING:
+        return Event::Status::kPending;
+      case SE_EVENT_COMPLETE:
+        return Event::Status::kComplete;
+      default:
+        return Event::Status::kUnknown;
+    }
+  }
+
+  absl::Status Create() {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    stream_executor_->create_event(device_, &event_handle_, c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+
+  absl::Status Record(SP_Stream stream_handle) {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    stream_executor_->record_event(device_, stream_handle, event_handle_,
+                                   c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+
+  void Destroy() {
+    if (event_handle_ != nullptr) {
+      stream_executor_->destroy_event(device_, event_handle_);
+      event_handle_ = nullptr;
+    }
+  }
+
+  SP_Event Handle() { return event_handle_; }
+
+ private:
+  SP_Device* device_;
+  SP_StreamExecutor* stream_executor_;
+  SP_Event event_handle_;
+};
+
+class CStream : public StreamCommon {
+ public:
+  CStream(SP_Device* device, SP_StreamExecutor* stream_executor,
+          StreamExecutor* executor)
+      : StreamCommon(executor),
+        device_(device),
+        stream_executor_(stream_executor),
+        stream_handle_(nullptr) {}
+  ~CStream() override {
+    BlockHostUntilDone().IgnoreError();
+    parent()->DeallocateStream(this);
+    Destroy();
+  }
+
+  absl::Status Create() {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    stream_executor_->create_stream(device_, &stream_handle_, c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+
+  void Destroy() {
+    if (stream_handle_ != nullptr) {
+      stream_executor_->destroy_stream(device_, stream_handle_);
+      stream_handle_ = nullptr;
+    }
+  }
+  absl::Status RefreshStatus() override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    stream_executor_->get_stream_status(device_, stream_handle_,
+                                        c_status.get());
+    absl::Status status = tensorflow::StatusFromTF_Status(c_status.get());
+    CheckStatus(status);
+    return status;
+  }
+
+  absl::Status RecordEvent(Event* event) override {
+    return static_cast<CEvent*>(event)->Record(stream_handle_);
+  }
+
+  absl::Status BlockHostUntilDone() override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_Stream stream_handle = Handle();
+
+    // If `block_host_until_done` is set, use it.
+    if (stream_executor_->block_host_until_done != nullptr) {
+      stream_executor_->block_host_until_done(device_, stream_handle,
+                                              c_status.get());
+      return tensorflow::StatusFromTF_Status(c_status.get());
+    }
+    // Create and record an event and then wait for it.
+    SP_Event event_handle;
+    stream_executor_->create_event(device_, &event_handle, c_status.get());
+    TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+    stream_executor_->record_event(device_, stream_handle, event_handle,
+                                   c_status.get());
+    absl::Status s = tensorflow::StatusFromTF_Status(c_status.get());
+    if (!s.ok()) {
+      stream_executor_->destroy_event(device_, event_handle);
+      return s;
+    }
+    stream_executor_->block_host_for_event(device_, event_handle,
+                                           c_status.get());
+    stream_executor_->destroy_event(device_, event_handle);
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+
+  absl::Status WaitFor(Stream* other) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_Stream other_handle = static_cast<CStream*>(other)->Handle();
+    stream_executor_->create_stream_dependency(device_, stream_handle_,
+                                               other_handle, c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status WaitFor(Event* event) override {
+    SP_Event event_handle = static_cast<CEvent*>(event)->Handle();
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    stream_executor_->wait_for_event(device_, stream_handle_, event_handle,
+                                     c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status MemZero(DeviceMemoryBase* location, uint64_t size) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_DeviceMemoryBase device_mem = DeviceMemoryBaseToC(location);
+    stream_executor_->mem_zero(device_, stream_handle_, &device_mem, size,
+                               c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status Memset32(DeviceMemoryBase* location, uint32_t pattern,
+                        uint64_t size) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_DeviceMemoryBase device_mem = DeviceMemoryBaseToC(location);
+    stream_executor_->memset32(device_, stream_handle_, &device_mem, pattern,
+                               size, c_status.get());
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status Memcpy(DeviceMemoryBase* gpu_dst, const void* host_src,
+                      uint64_t size) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst);
+    stream_executor_->memcpy_htod(device_, stream_handle_, &device_mem_dst,
+                                  host_src, size, c_status.get());
+    if (TF_GetCode(c_status.get()) != TF_OK) {
+      LOG(ERROR) << TF_Message(c_status.get());
+    }
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status Memcpy(DeviceMemoryBase* gpu_dst,
+                      const DeviceMemoryBase& gpu_src, uint64_t size) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst);
+    SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src);
+    stream_executor_->memcpy_dtod(device_, stream_handle_, &device_mem_dst,
+                                  &device_mem_src, size, c_status.get());
+    if (TF_GetCode(c_status.get()) != TF_OK) {
+      LOG(ERROR) << TF_Message(c_status.get());
+    }
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
+  absl::Status Memcpy(void* host_dst, const DeviceMemoryBase& gpu_src,
+                      uint64_t size) override {
+    tensorflow::TF_StatusPtr c_status(TF_NewStatus());
+    SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src);
+    stream_executor_->memcpy_dtoh(device_, stream_handle_, host_dst,
+                                  &device_mem_src, size, c_status.get());
+    if (TF_GetCode(c_status.get()) != TF_OK) {
+      LOG(ERROR) << TF_Message(c_status.get());
+    }
+    return tensorflow::StatusFromTF_Status(c_status.get());
+  }
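+
+  // On the plugin side, `host_callback` (see SP_StreamExecutor) is expected to
+  // enqueue host work on the stream and eventually invoke
+  // `callback_fn(callback_arg, status)` exactly once. A hypothetical,
+  // synchronous plugin implementation is sketched here; the name and the
+  // status-ownership convention are assumptions, not part of this header:
+  //
+  //   TF_Bool MyHostCallback(const SP_Device* device, SP_Stream stream,
+  //                          SE_StatusCallbackFn callback_fn,
+  //                          void* callback_arg) {
+  //     TF_Status* status = TF_NewStatus();
+  //     callback_fn(callback_arg, status);
+  //     TF_Bool ok = TF_GetCode(status) == TF_OK;
+  //     TF_DeleteStatus(status);
+  //     return ok;
+  //   }
+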
+  // Wrapper that allows passing std::function across C API.
+  struct HostCallbackContext {
+    absl::AnyInvocable<absl::Status() &&> callback;
+  };
+
+  // This wrapper allows calling `HostCallbackContext::callback` across C API.
+  // This function matches `SE_StatusCallbackFn` signature and will be passed as
+  // `callback_fn` to `host_callback` in `SP_StreamExecutor`.
+  static void HostCallbackTrampoline(void* ctx, TF_Status* status) {
+    HostCallbackContext* host_ctx = static_cast<HostCallbackContext*>(ctx);
+    absl::Status s = std::move(host_ctx->callback)();
+    tsl::Set_TF_Status_from_Status(status, s);
+    delete host_ctx;
+  }
+  absl::Status DoHostCallbackWithStatus(
+      absl::AnyInvocable<absl::Status() &&> callback) override {
+    HostCallbackContext* ctx = new HostCallbackContext{std::move(callback)};
+    if (stream_executor_->host_callback(device_, stream_handle_,
+                                        &HostCallbackTrampoline, ctx)) {
+      return absl::OkStatus();
+    }
+    return absl::InternalError("Failed to host callback.");
+  }
+  SP_Stream Handle() { return stream_handle_; }
+
+ private:
+  SP_Device* device_;
+  SP_StreamExecutor* stream_executor_;
+  SP_Stream stream_handle_;
+};
+
+}  // namespace stream_executor
+#endif  // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_test_util.h b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_test_util.h
new file mode 100644
index 00000000..0bebf6f4
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/experimental/stream_executor/stream_executor_test_util.h
@@ -0,0 +1,56 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_TEST_UTIL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_TEST_UTIL_H_ + +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" + +struct SP_Stream_st { + explicit SP_Stream_st(int id) : stream_id(id) {} + int stream_id; +}; + +struct SP_Event_st { + explicit SP_Event_st(int id) : event_id(id) {} + int event_id; +}; + +struct SP_Timer_st { + explicit SP_Timer_st(int id) : timer_id(id) {} + int timer_id; +}; + +namespace stream_executor { +namespace test_util { + +constexpr int kDeviceCount = 2; +constexpr char kDeviceName[] = "MY_DEVICE"; +constexpr char kDeviceType[] = "GPU"; + +void PopulateDefaultStreamExecutor(SP_StreamExecutor* se); +void PopulateDefaultDeviceFns(SP_DeviceFns* device_fns); +void PopulateDefaultTimerFns(SP_TimerFns* timer_fns); +void PopulateDefaultPlatform(SP_Platform* platform, + SP_PlatformFns* platform_fns); +void PopulateDefaultPlatformRegistrationParams( + SE_PlatformRegistrationParams* const params); + +void DestroyPlatform(SP_Platform* platform); +void DestroyPlatformFns(SP_PlatformFns* platform_fns); + +} // namespace test_util +} // namespace stream_executor + +#endif // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/kernels.h b/third_party/tflite-hdrs/tensorflow/c/kernels.h new file mode 100644 index 00000000..fd7f99cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/kernels.h @@ -0,0 +1,543 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_KERNELS_H_ +#define TENSORFLOW_C_KERNELS_H_ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" +#include "tensorflow/c/tf_buffer.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" + +// Required for IS_MOBILE_PLATFORM definition +#include "tsl/platform/platform.h" // IWYU pragma: keep + +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_Tensor TF_Tensor; + +// -------------------------------------------------------------------------- +// C API for TensorFlow Kernels. +// +// This API allows developers to register custom kernel implementations for +// TensorFlow. +// +// See c_api.h header comments for a discussion about API conventions. +// +// Users wishing to extend TensorFlow with new kernels will call +// `TF_NewKernelBuilder`. 
The resulting kernel builder can be registered with +// `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided +// kernels when necessary. + +typedef struct TF_KernelBuilder TF_KernelBuilder; +typedef struct TF_OpKernelConstruction TF_OpKernelConstruction; +typedef struct TF_OpKernelContext TF_OpKernelContext; +typedef struct TF_AsyncOpKernelDoneCallback TF_AsyncOpKernelDoneCallback; + +// Run callback function for async kernel. +TF_CAPI_EXPORT extern void TF_RunAsyncOpKernelDoneCallback( + TF_AsyncOpKernelDoneCallback*); + +// TF_InitKernel to do op/kernel registration. +// Plugin should implement TF_InitKernel to register kernels. This function +// should register all kernels in a plugin. +void TF_InitKernel(); + +// Allocates a new kernel builder and returns a pointer to it. +// +// If non-null, TensorFlow will call create_func when it needs to instantiate +// the kernel. The pointer returned by create_func will be passed to +// compute_func and delete_func, thereby functioning as a "this" pointer for +// referring to kernel instances. +// +// The TF_OpKernelConstruction pointer passed to create_func is owned by +// TensorFlow and will be deleted once create_func returns. It must not be used +// after this. +// +// When TensorFlow needs to perform a computation with this kernel, it will +// call compute_func. This function will receive the pointer returned by +// create_func (or null if no create_func was provided), along with the inputs +// to the computation. +// +// The TF_OpKernelContext pointer received by compute_func is owned by +// TensorFlow and will be deleted once compute_func returns. It must not be used +// after this. +// +// Finally, when TensorFlow no longer needs the kernel, it will call +// delete_func if one is provided. This function will receive the pointer +// returned in `create_func` or nullptr if no `create_func` was provided. +// +// The caller should pass the result of this function to +// TF_RegisterKernelBuilder, which will take ownership of the pointer. If, for +// some reason, the kernel builder will not be registered, the caller should +// delete it with TF_DeleteKernelBuilder. +TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewKernelBuilder( + const char* op_name, const char* device_name, + void* (*create_func)(TF_OpKernelConstruction*), + void (*compute_func)(void*, TF_OpKernelContext*), + void (*delete_func)(void*)); + +// Allocates a new kernel builder and returns a pointer to it. +// +// It is similar as TF_NewKernelBuilder, except compute_async_func. +// It creates an AsyncOpKernel, and performs async computation through +// compute_async_func. +TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewAsyncKernelBuilder( + const char* op_name, const char* device_name, + void* (*create_func)(TF_OpKernelConstruction*), + void (*compute_async_func)(void*, TF_OpKernelContext*, + TF_AsyncOpKernelDoneCallback* done), + void (*delete_func)(void*)); + +// Specifies that this kernel's attribute only supports the given type. +TF_CAPI_EXPORT extern void TF_KernelBuilder_TypeConstraint( + TF_KernelBuilder* kernel_builder, const char* attr_name, + const TF_DataType type, TF_Status* status); + +// Specify that this kernel requires/provides an input/output arg +// in host memory (instead of the default, device memory). +TF_CAPI_EXPORT extern void TF_KernelBuilder_HostMemory( + TF_KernelBuilder* kernel_builder, const char* arg_name); + +// Specify a priority number for this kernel. 
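+// For example, a plugin might build and register a kernel as sketched below;
+// the op, device, attribute and helper names are hypothetical and error
+// handling is omitted:
+//
+//   TF_Status* status = TF_NewStatus();
+//   TF_KernelBuilder* builder = TF_NewKernelBuilder(
+//       "MyOp", "MY_DEVICE", &MyCreateKernel, &MyComputeKernel,
+//       &MyDeleteKernel);
+//   TF_KernelBuilder_TypeConstraint(builder, "T", TF_FLOAT, status);
+//   TF_KernelBuilder_Priority(builder, 1);
+//   TF_RegisterKernelBuilder("MyOp", builder, status);
+//   TF_DeleteStatus(status);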
+TF_CAPI_EXPORT extern void TF_KernelBuilder_Priority( + TF_KernelBuilder* kernel_builder, int32_t priority_number); + +// Specify a label for this kernel. +TF_CAPI_EXPORT extern void TF_KernelBuilder_Label( + TF_KernelBuilder* kernel_builder, const char* label); + +// Register the given kernel builder with the TensorFlow runtime. If +// registration fails, the given status will be populated. +// +// This call takes ownership of the `builder` pointer. +TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name, + TF_KernelBuilder* builder, + TF_Status* status); + +// Register the given kernel builder with the TensorFlow runtime. If +// registration fails, the given status will be populated. +// +// This method is the same as TF_RegisterKernelBuilder except it takes in a +// serialized KernelDef, and uses it for registration, instead of building a new +// one. Users can choose to not provide a serialized KernelDef and in that case +// it's identical to TF_RegisterKernelBuilder. +TF_CAPI_EXPORT extern void TF_RegisterKernelBuilderWithKernelDef( + const char* serialized_kernel_def, const char* name, + TF_KernelBuilder* builder, TF_Status* status); + +// Deletes the given TF_KernelBuilder. This should be called only if the kernel +// builder is not registered with TensorFlow via TF_RegisterKernelBuilder. +TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder); + +// -------------------------------------------------------------------------- +// OpKernelContext routines + +// TF_GetStream returns the SP_Stream available in ctx. +// This function returns a stream only for devices registered using the +// StreamExecutor C API +// (tensorflow/c/experimental/stream_executor/stream_executor.h). It will return +// nullptr and set error status in all other cases. +// Experimental: this function doesn't have compatibility guarantees and subject +// to change at any time. +TF_CAPI_EXPORT extern SP_Stream TF_GetStream(TF_OpKernelContext* ctx, + TF_Status* status); + +// TF_NumInputs returns the number of inputs available in ctx. +TF_CAPI_EXPORT extern int TF_NumInputs(TF_OpKernelContext* ctx); + +// TF_NumOutputs returns the number of outputs to be placed in *ctx by the +// kernel. +TF_CAPI_EXPORT extern int TF_NumOutputs(TF_OpKernelContext* ctx); + +// Retrieves the ith input from ctx. If TF_GetCode(status) is TF_OK, *tensor is +// populated and its ownership is passed to the caller. In any other case, +// *tensor is not modified. +// +// If i < 0 or i >= TF_NumInputs(ctx), *status is set to TF_OUT_OF_RANGE. +TF_CAPI_EXPORT extern void TF_GetInput(TF_OpKernelContext* ctx, int i, + TF_Tensor** tensor, TF_Status* status); + +typedef struct { + size_t struct_size; + void* priv; // Not used, for possible extension. + int start; // output + int stop; // output + TF_Status* status; // output +} TF_InputRange_Args; +const size_t TF_InputRange_Args_STRUCT_SIZE = + TF_OFFSET_OF_END(TF_InputRange_Args, status); + +// Retrieves the start and stop indices, given the input name. Equivalent to +// OpKernel::InputRange(). `args` will contain the result indices and status. +TF_CAPI_EXPORT extern void TF_InputRange(TF_OpKernelContext* ctx, + const char* name, + TF_InputRange_Args* args); + +// Returns the data type of the index-th input. If index < 0 or index >= +// TF_NumInputs(ctx), the program aborts. +TF_CAPI_EXPORT extern TF_DataType TF_InputDatatype(TF_OpKernelContext* ctx, + int index); + +// Sets the ith output of ctx to tensor. 
If TF_GetCode(status) is anything but +// TF_OK, ctx is left unmodified. +// +// If i < 0 or i >= TF_NumOutputs(ctx), *status is set to TF_OUT_OF_RANGE. +TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i, + const TF_Tensor* tensor, + TF_Status* status); + +// Retrieves the ith output from ctx. If TF_GetCode(status) is TF_OK, *tensor is +// populated and its ownership is passed to the caller. In any other case, +// *tensor is not modified. +// +// If i < 0 or i >= TF_NumOutputs(ctx), *status is set to TF_OUT_OF_RANGE. +TF_CAPI_EXPORT extern TF_Tensor* TF_GetMutableOutput(TF_OpKernelContext* ctx, + int i, TF_Status* status); + +// Retrieves a serialized FunctionDefLibrary. Status will be set. +TF_CAPI_EXPORT extern void TF_GetSerializedFunctionDefLibrary( + TF_OpKernelContext* ctx, TF_Buffer* serialized_function_def_library, + TF_Status* status); + +// Retrieves a serialized ConfigProto. Status will be set. +TF_CAPI_EXPORT extern void TF_GetSerializedConfigProto( + TF_OpKernelContext* ctx, TF_Buffer* serialized_config_proto, + TF_Status* status); + +// Retrieves a serialized ResourceHandleProto. Status will be set. +TF_CAPI_EXPORT extern void TF_GetSerializedResourceHandleProto( + TF_OpKernelContext* ctx, int i, TF_Buffer* serialized_resource_handle_proto, + TF_Status* status); + +// Notifies the given OpKernelConstruction that kernel construction has failed. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_Failure( + TF_OpKernelConstruction* ctx, TF_Status* status); + +// Notifies the given OpKernelContext that the kernel's compute function has +// failed. +TF_CAPI_EXPORT extern void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, + TF_Status* status); + +// Returns the expected output data type of the ith output. If i < 0 or +// i >= TF_NumOutputs(ctx), the program aborts. +TF_CAPI_EXPORT extern TF_DataType TF_ExpectedOutputDataType( + TF_OpKernelContext* ctx, int i); + +// Returns true if the ith input is allocated in host memory. If i < 0 or i >= +// TF_NumInputs(ctx), the program aborts. +TF_CAPI_EXPORT extern bool TF_IsHostMemoryInput(TF_OpKernelContext* ctx, int i, + TF_Status* status); + +// Returns true if the ith output is allocated in host memory. If i < 0 or i >= +// TF_NumOutputs(ctx), the program aborts. +TF_CAPI_EXPORT extern bool TF_IsHostMemoryOutput(TF_OpKernelContext* ctx, int i, + TF_Status* status); + +// Returns the step ID of the given context. +TF_CAPI_EXPORT extern int64_t TF_StepId(TF_OpKernelContext* ctx); + +// Returns the serialized NodeDef protocol buffer for the kernel +TF_CAPI_EXPORT extern TF_Buffer* TF_OpKernelConstruction_GetNodeDef( + TF_OpKernelConstruction* ctx, TF_Status* status); + +// Returns the frame ID of the given context. +TF_CAPI_EXPORT extern uint64_t TF_GetFrameId(TF_OpKernelContext* ctx); + +// Returns the Iter ID of the given context. +TF_CAPI_EXPORT extern int64_t TF_GetIterId(TF_OpKernelContext* ctx); + +// Returns the Step ID of the given context. +TF_CAPI_EXPORT extern int64_t TF_GetStepId(TF_OpKernelContext* ctx); + +// Returns the Device ID of the device that the context possesses. Returns the +// PlatformDeviceId if a mapping between between TfDeviceId and PlatformDeviceId +// is set; otherwise returns the id in the device name. Please refer to +// tensorflow/compiler/xla/tsl/framework/device_id.h for more details. +// For mobile or slim build, returns the id in the device name. 
+TF_CAPI_EXPORT extern int TF_GetDeviceId(TF_OpKernelContext* ctx); + +// Returns the Device Name of the device that the context possesses. +// +// The returned TF_StringView's underlying string is owned by the OpKernel and +// has the same lifetime as the OpKernel. +TF_CAPI_EXPORT TF_StringView TF_GetDeviceName(TF_OpKernelContext* ctx); + +#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +// Returns the rendezvous in the context. Not supported on mobile. +TF_CAPI_EXPORT TF_RendezvousThunk TF_GetRendezvous(TF_OpKernelContext* ctx); +#endif + +// Returns the graph def version of the given context. +TF_CAPI_EXPORT extern int TF_GetGraphDefVersion(TF_OpKernelContext* ctx); + +// Returns the name of the OpKernel. +// +// The returned TF_StringView's underlying string is owned by the OpKernel and +// has the same lifetime as the OpKernel. +TF_CAPI_EXPORT extern TF_StringView TF_GetOpKernelName(TF_OpKernelContext* ctx); + +// Returns the default container of the resource manager in OpKernelContext. +// +// The returned TF_StringView's underlying string is owned by the OpKernel and +// has the same lifetime as the OpKernel. +TF_CAPI_EXPORT extern TF_StringView TF_GetResourceMgrDefaultContainerName( + TF_OpKernelContext* ctx); + +// Returns the name of the requested input at `index` from the OpKernel. +// +// The returned TF_StringView's underlying string is owned by the OpKernel and +// has the same lifetime as the OpKernel. +TF_CAPI_EXPORT extern TF_StringView TF_GetOpKernelRequestedInput( + TF_OpKernelContext* ctx, size_t index); + +// Get the list_size and total_size of the attribute `attr_name` of `oper`. +// list_size - the length of the list. +// total_size - total size of the list. +// (1) If attr_type == TF_ATTR_STRING +// then total_size is the cumulative byte size +// of all the strings in the list. +// (3) If attr_type == TF_ATTR_SHAPE +// then total_size is the number of dimensions +// of the shape valued attribute, or -1 +// if its rank is unknown. +// (4) If attr_type == TF_ATTR_SHAPE +// then total_size is the cumulative number +// of dimensions of all shapes in the list. +// (5) Otherwise, total_size is undefined. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrSize( + TF_OpKernelConstruction* ctx, const char* attr_name, int32_t* list_size, + int32_t* total_size, TF_Status* status); + +// Interprets the named kernel construction attribute as a TF_DataType and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// TF_DataType, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrType( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* val, + TF_Status* status); + +// Interprets the named kernel construction attribute as int32_t and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// int32, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt32( + TF_OpKernelConstruction* ctx, const char* attr_name, int32_t* val, + TF_Status* status); + +// Interprets the named kernel construction attribute as int64_t and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// int64, *status is populated with an error. 
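+// For example, inside a kernel's create_func (the attribute name "N" is
+// hypothetical):
+//
+//   int64_t n = 0;
+//   TF_OpKernelConstruction_GetAttrInt64(ctx, "N", &n, status);
+//   if (TF_GetCode(status) != TF_OK) {
+//     TF_OpKernelConstruction_Failure(ctx, status);
+//   }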
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt64( + TF_OpKernelConstruction* ctx, const char* attr_name, int64_t* val, + TF_Status* status); + +// Interprets the named kernel construction attribute as float and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// float, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrFloat( + TF_OpKernelConstruction* ctx, const char* attr_name, float* val, + TF_Status* status); + +// Interprets the named kernel construction attribute as bool and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// bool, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrBool( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Bool* val, + TF_Status* status); + +// Interprets the named kernel construction attribute as string and +// places it into *val. `val` must +// point to an array of length at least `max_length` (ideally set to +// total_size from TF_OpKernelConstruction_GetAttrSize(ctx, +// attr_name, list_size, total_size)). *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// string, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrString( + TF_OpKernelConstruction* ctx, const char* attr_name, char* val, + size_t max_length, TF_Status* status); + +// Interprets the named kernel construction attribute as tensor and places it +// into *val. Allocates a new TF_Tensor which the caller is expected to take +// ownership of (and can deallocate using TF_DeleteTensor). *status is set to +// TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// tensor, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrTensor( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Tensor** val, + TF_Status* status); + +// Interprets the named kernel construction attribute as a TF_DataType array and +// places it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` (ideally set +// to list_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size)). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrTypeList( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* vals, + int max_vals, TF_Status* status); + +// Interprets the named kernel construction attribute as int32_t array and +// places it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` (ideally set +// to list_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size)). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt32List( + TF_OpKernelConstruction* ctx, const char* attr_name, int32_t* vals, + int max_vals, TF_Status* status); + +// Interprets the named kernel construction attribute as int64_t array and +// places it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` (ideally set +// to list_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size)). 
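+// For example, querying the list length first and then fetching the values
+// (the attribute name "values" is hypothetical; error handling and free()
+// are omitted):
+//
+//   int32_t list_size = 0;
+//   int32_t total_size = 0;
+//   TF_OpKernelConstruction_GetAttrSize(ctx, "values", &list_size,
+//                                       &total_size, status);
+//   int64_t* vals = (int64_t*)malloc(list_size * sizeof(int64_t));
+//   TF_OpKernelConstruction_GetAttrInt64List(ctx, "values", vals, list_size,
+//                                            status);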
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrInt64List( + TF_OpKernelConstruction* ctx, const char* attr_name, int64_t* vals, + int max_vals, TF_Status* status); + +// Interprets the named kernel construction attribute as float array and +// places it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` (ideally set +// to list_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size)). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrFloatList( + TF_OpKernelConstruction* ctx, const char* attr_name, float* vals, + int max_vals, TF_Status* status); + +// Interprets the named kernel construction attribute as bool array and +// places it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` (ideally set +// to list_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size)). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrBoolList( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Bool* vals, + int max_vals, TF_Status* status); + +// Interprets the named kernel construction attribute as string array and fills +// in `vals` and `lengths`, each of which must point to an array of length at +// least `max_values`. *status is set to TF_OK. The elements of values will +// point to addresses in `storage` which must be at least `storage_size` bytes +// in length. Ideally, max_values would be set to list_size and `storage` would +// be at least total_size, obtained from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, list_size, +// total_size). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrStringList( + TF_OpKernelConstruction* ctx, const char* attr_name, char** values, + size_t* lengths, int max_values, void* storage, size_t storage_size, + TF_Status* status); + +// Interprets the named kernel construction attribute as tensor array and places +// it into *vals. *status is set to TF_OK. +// `vals` must point to an array of length at least `max_values` +// (ideally set to list_size from TF_OpKernelConstruction_GetAttrSize(ctx, +// attr_name, list_size, total_size)). +// +// The caller takes ownership of all the non-null TF_Tensor* entries in `vals` +// (which can be deleted using TF_DeleteTensor(vals[i])). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrTensorList( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Tensor** vals, + int max_values, TF_Status* status); + +// Interprets the named kernel construction attribute as a +// tensorflow::NameAttrList and returns the serialized proto as TF_Buffer. +// `status` will be set. The caller takes ownership of the returned TF_Buffer +// (if not null) and is responsible for managing its lifetime. +TF_CAPI_EXPORT extern TF_Buffer* TF_OpKernelConstruction_GetAttrFunction( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Status* status); + +// Return true if the kernel construction has the attr_name +TF_CAPI_EXPORT extern bool TF_OpKernelConstruction_HasAttr( + TF_OpKernelConstruction* ctx, const char* attr_name, TF_Status* status); + +// Returns the unique operation name for this OpKernel. +TF_CAPI_EXPORT extern TF_StringView TF_OpKernelConstruction_GetName( + TF_OpKernelConstruction* ctx); + +// Allocates Tensor for output at given index. Caller takes ownership of +// returned TF_Tensor and should deallocate it using TF_DeleteTensor(tensor). 
+// +// This function should be used to allocate outputs inside kernel +// compute function. +TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, + int index, TF_DataType dtype, + const int64_t* dims, int num_dims, + size_t len, TF_Status* status); + +// Tries to forward one of the inputs given in input_indices to +// output[output_index]. If none of the given inputs can be forwarded, calls +// allocate_output() to allocate a new output buffer. The index of the +// forwarded input will be assign to output argument forwarded_input (if it's +// not nullptr). If no inputs are forwarded, forwarded_input will be assigned +// -1. +TF_CAPI_EXPORT TF_Tensor* TF_ForwardInputOrAllocateOutput( + TF_OpKernelContext* context, const int* candidate_input_indices, + int num_candidate_input_indices, int output_index, + const int64_t* output_dims, int output_num_dims, int* forwarded_input, + TF_Status* status); + +// Allocates a temporary Tensor of the specified type and shape. The +// Tensor must not be used after kernel construction is +// complete. +// +// num_dims must equal the size of array dims +TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTemp( + TF_OpKernelContext* context, TF_DataType dtype, const int64_t* dims, + int num_dims, TF_AllocatorAttributes* alloc_attrs, TF_Status* status); + +// Used by OpKernel implementations to track actively running deferred ops. +// +// A deferred op is one whose Compute method returns (or whose ComputeAsync +// method invokes the callback) when work is scheduled onto a device. At that +// point, we don't know when the work will actually complete (or if it has +// already completed) on the device. These functions allow the executor to +// track the status of deferred ops and act accordingly. +// +// Deferred OpKernel implementations must use these methods to get two +// functions. It then must call these two functions in pairs, before and after +// device execution, respectively. +TF_CAPI_EXPORT extern void TF_IncNumDeferredOps(TF_OpKernelContext* context); +TF_CAPI_EXPORT extern void TF_DecNumDeferredOps(TF_OpKernelContext* context); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/kernels/tensor_shape_utils.h b/third_party/tflite-hdrs/tensorflow/c/kernels/tensor_shape_utils.h new file mode 100644 index 00000000..27167b39 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/kernels/tensor_shape_utils.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file contains shape utilities to be used by kernels and is not part of +// the C API. As such, it is subject to change at any time. 
+ +#ifndef TENSORFLOW_C_KERNELS_TENSOR_SHAPE_UTILS_H_ +#define TENSORFLOW_C_KERNELS_TENSOR_SHAPE_UTILS_H_ + +#include + +#include "tensorflow/c/tf_tensor.h" + +namespace tensorflow { + +// The following are utils for the shape of a TF_Tensor type. +// These functions may later be subsumed by the methods for a +// TF_TensorShape type. + +// Returns a string representation of the TF_Tensor shape. +std::string ShapeDebugString(TF_Tensor* tensor); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_KERNELS_TENSOR_SHAPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/kernels_experimental.h b/third_party/tflite-hdrs/tensorflow/c/kernels_experimental.h new file mode 100644 index 00000000..2f93e6b2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/kernels_experimental.h @@ -0,0 +1,192 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_KERNELS_EXPERIMENTAL_H_ +#define TENSORFLOW_C_KERNELS_EXPERIMENTAL_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/kernels.h" + +// -------------------------------------------------------------------------- +// Experimental kernel C API for TensorFlow. +// +// The API here is subject to changes in the future. +// -------------------------------------------------------------------------- + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_VariableInputLockHolder TF_VariableInputLockHolder; + +// Expose higher level Assignment operation for Pluggable vendors to implement +// in the plugin for Training. The API takes in the context with indices for +// the input and value tensors. It also accepts the copy callback provided by +// pluggable vendor to do the copying of the tensors. The caller takes ownership +// of the `source` and `dest` tensors and is responsible for freeing them with +// TF_DeleteTensor. This function will return an error when the following +// conditions are met: +// 1. `validate_shape` is set to `true` +// 2. The variable is initialized +// 3. The shape of the value tensor doesn't match the shape of the variable +// tensor. +TF_CAPI_EXPORT extern void TF_AssignVariable( + TF_OpKernelContext* ctx, int input_index, int value_index, + bool validate_shape, + void (*copyFunc)(TF_OpKernelContext* ctx, TF_Tensor* source, + TF_Tensor* dest), + TF_Status* status); + +// Expose higher level Assignment operation for Pluggable vendors to implement +// in the plugin for Training on ref variables. The API takes in the context +// with indices for the input and value tensors. It also accepts the copy +// callback provided by pluggable vendor to do the copying of the tensors. The +// caller takes ownership of the `source` and `dest` tensors and is responsible +// for freeing them with TF_DeleteTensor. 
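+// The copy callback passed below has the plain signature
+// void(TF_OpKernelContext*, TF_Tensor*, TF_Tensor*); a hypothetical plugin
+// implementation (the name is a placeholder) might simply enqueue a
+// device-side copy:
+//
+//   void MyCopyFunc(TF_OpKernelContext* ctx, TF_Tensor* source,
+//                   TF_Tensor* dest) {
+//     /* enqueue a copy of `source` into `dest` on the plugin's stream for
+//        `ctx` */
+//   }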
+TF_CAPI_EXPORT extern void TF_AssignRefVariable( + TF_OpKernelContext* ctx, int input_ref_index, int output_ref_index, + int value_index, bool use_locking, bool validate_shape, + void (*copyFunc)(TF_OpKernelContext* ctx, TF_Tensor* source, + TF_Tensor* dest), + TF_Status* status); + +// Expose higher level AssignUpdate operation for Pluggable vendors to implement +// in the plugin for Training. The API takes in the context with indices for the +// input and value tensors. It also accepts the copy callback provided by +// pluggable vendor to do the copying of the tensors and the update callback to +// apply the arithmetic operation. The caller takes ownership of the `source`, +// `dest`, `tensor` and `value` tensors and is responsible for freeing them with +// TF_DeleteTensor. +TF_CAPI_EXPORT extern void TF_AssignUpdateVariable( + TF_OpKernelContext* ctx, int input_index, int value_index, int Op, + int isVariantType, + void (*copyFunc)(TF_OpKernelContext* ctx, TF_Tensor* source, + TF_Tensor* dest), + void (*updateFunc)(TF_OpKernelContext* ctx, TF_Tensor* tensor, + TF_Tensor* value, int Op), + TF_Status* status); + +// Expose higher level temporary variable operator for Pluggable vendors to +// implement in the plugin for managing temporary variables. The API takes in +// the context with indices for the input and value tensors. It also accepts the +// allocator provided by pluggable vendor to do the allocate_temp of the +// tensors. The caller takes ownership of temporary variables and is responsible +// for freeing them with TF_DestroyTemporaryVariable. This function will return +// an error when the following conditions are met: +// 1. Cannot allocate a new temporary variable +// 2. Calling plugin allocator failed +TF_CAPI_EXPORT extern void TF_TemporaryVariable( + TF_OpKernelContext* ctx, TF_DataType dtype, const int64_t* dims, + int num_dims, TF_StringView* var_name, + void (*plugin_allocator)(TF_OpKernelContext*, TF_Tensor*, TF_DataType, + const int64_t*, int, TF_Status*), + TF_Status* tf_status); + +// Expose higher level temporary variable operator for Pluggable vendors to +// implement in the plugin for destroying temporary variables. The API takes in +// the context with indices for the input and variable name. This function will +// return an error when either of the following conditions is met: +// 1. `input data type` is not ref type +// 2. Cannot find temporary variable by name in arguments +TF_CAPI_EXPORT extern void TF_DestroyTemporaryVariable(TF_OpKernelContext* ctx, + const int index, + TF_StringView* var_name, + TF_Status* tf_status); + +// This is a helper function which acquires mutexes in-order to provide +// thread-safe way of performing weights update during the optimizer op. It +// returns an opaque LockHolder handle back to plugin. This handle is passed to +// the Release API for releasing the locks when the weight update is done. The +// caller takes ownership of the `source` and `dest` tensors and is responsible +// for freeing them with TF_DeleteTensor. +TF_CAPI_EXPORT extern void TF_MaybeLockVariableInputMutexesInOrder( + TF_OpKernelContext* ctx, bool do_lock, bool sparse, const int* const inputs, + size_t len, + void (*copyFunc)(TF_OpKernelContext* ctx, TF_Tensor* source, + TF_Tensor* dest), + TF_VariableInputLockHolder** lockHolder, TF_Status* status); + +// This interface returns `out` tensor which is updated corresponding to the +// variable passed with input index. 
The caller takes ownership of the `source` +// and `dest` tensors and is responsible for freeing them with TF_DeleteTensor. +TF_CAPI_EXPORT extern void TF_GetInputTensorFromVariable( + TF_OpKernelContext* ctx, int input, bool lock_held, bool isVariantType, + bool sparse, + void (*copyFunc)(TF_OpKernelContext* ctx, TF_Tensor* source, + TF_Tensor* dest), + TF_Tensor** out, TF_Status* status); + +// This interface forwards the reference from input to the output tensors +// corresponding to the indices provided with `input_index` and `output_index` +TF_CAPI_EXPORT extern void TF_OpKernelContext_ForwardRefInputToRefOutput( + TF_OpKernelContext* ctx, int32_t input_index, int32_t output_index); + +// The API releases the opaque lock handle returned with +// `TF_MaybeLockVariableInputMutexesInOrder` API +TF_CAPI_EXPORT extern void TF_ReleaseVariableInputLockHolder( + TF_VariableInputLockHolder* lockHolder); + +// Allows plugin to get TF_Tensor when passed its input_name +TF_CAPI_EXPORT extern void TF_GetInputByName(TF_OpKernelContext* ctx, + const char* inputName, + TF_Tensor** tensor, + TF_Status* status); + +// Interprets the named kernel construction attribute as a shape attribute and +// fills in `vals` with the size of each dimension. `vals` must point to an +// array of length at least `max_values` (ideally set to total_size from +// TF_OpKernelConstruction_GetAttrSize(ctx, attr_name, &list_size, +// &total_size)). +TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrTensorShape( + TF_OpKernelConstruction* ctx, const char* attr_name, int64_t* dims, + size_t num_dims, TF_Status* status); + +TF_CAPI_EXPORT extern bool TF_IsRefInput(TF_OpKernelContext* ctx, int i, + TF_Status* status); + +#ifndef IS_MOBILE_PLATFORM +// Expose higher level AddN operation for Pluggable vendors to implement +// in the plugin for Variant data types. The API takes in the context and a +// callback provided by pluggable vendor to do a Binary Add operation on the +// tensors unwrapped from the Variant tensors. The caller takes ownership of the +// `a`, `b` and `out` tensors and is responsible for freeing them with +// TF_DeleteTensor. +TF_CAPI_EXPORT extern void TF_AddNVariant( + TF_OpKernelContext* ctx, + void (*binary_add_func)(TF_OpKernelContext* ctx, TF_Tensor* a, TF_Tensor* b, + TF_Tensor* out), + TF_Status* status); + +// Expose higher level ZerosLike operation for Pluggable vendors to implement +// in the plugin for Variant data types. The API takes in the context and a +// callback provided by pluggable vendor to do a ZerosLike operation on the +// tensors unwrapped from the Variant tensors. The caller takes ownership of the +// `input` and `out` tensors and is responsible for freeing them with +// TF_DeleteTensor. +TF_CAPI_EXPORT extern void TF_ZerosLikeVariant( + TF_OpKernelContext* ctx, + void (*zeros_like_func)(TF_OpKernelContext* ctx, TF_Tensor* input, + TF_Tensor* out), + TF_Status* status); + +typedef struct TF_CoordinationServiceAgent TF_CoordinationServiceAgent; + +#endif // IS_MOBILE_PLATFORM + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_KERNELS_EXPERIMENTAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/logging.h b/third_party/tflite-hdrs/tensorflow/c/logging.h new file mode 100644 index 00000000..9583777b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/logging.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_LOGGING_H_ +#define TENSORFLOW_C_LOGGING_H_ + +#include "tensorflow/c/c_api_macros.h" + +// -------------------------------------------------------------------------- +// C API for tensorflow::Logging. + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum TF_LogLevel { + TF_INFO = 0, + TF_WARNING = 1, + TF_ERROR = 2, + TF_FATAL = 3, +} TF_LogLevel; + +TF_CAPI_EXPORT extern void TF_Log(TF_LogLevel level, const char* fmt, ...); +TF_CAPI_EXPORT extern void TF_VLog(int level, const char* fmt, ...); +TF_CAPI_EXPORT extern void TF_DVLog(int level, const char* fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif // TENSORFLOW_C_LOGGING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/ops.h b/third_party/tflite-hdrs/tensorflow/c/ops.h new file mode 100644 index 00000000..5d3a1e89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/ops.h @@ -0,0 +1,364 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Routines for registering new ops and for implementing op shape inference +// functions. +// +// This API is alpha software and is subject to change. +// +// REGISTRATION +// ------------ +// +// In order to register a new op, create a new TF_OpDefinitionBuilder: +// +// TF_OpDefinitionBuilder* builder = TF_NewOpDefinitionBuilder("OpName"); +// +// Inputs, outputs and attributes can be added to the builder with the +// corresponding functions, e.g. +// +// TF_OpDefinitionBuilderAddInput(builder, "input1: int32"); +// TF_OpDefinitionBuilderAddOutput(builder, "output1: int64"); +// TF_OpDefinitionBuilderAddAttr(builder, "attr: int32"); +// +// The builder may then be registered with TensorFlow using the +// TF_RegisterOpDefinition function. E.g. +// +// TF_Status* status = TF_NewStatus(); +// TF_RegisterOpDefinition(builder, &status); +// if (TF_GetCode(status) != TF_OK) { +// // handle error +// } +// +// SHAPE INFERENCE +// --------------- +// +// You can provide a shape inference function that TensorFlow will call when it +// wants to understand the shape of outputs that the op will produce. Use the +// TF_OpDefinitionBuilderSetShapeInferenceFunction function to register a shape +// inference function pointer with TensorFlow. 
The following is an example of a +// very simple shape inference function: +// +// void identity_shape_fn(TF_ShapeInferenceContext* ctx, TF_Status* status) { +// TF_ShapeHandle* input = TF_NewShapeHandle(); +// TF_ShapeInferenceContextGetInput(ctx, 0, input, status); +// if (TF_GetCode(status) == TF_OK) { +// TF_ShapeInferenceContextSetOutput(ctx, 0, input, status); +// } +// TF_DeleteShapeHandle(input); +// } +// +// The following code registers the inference function with TensorFlow: +// +// TF_OpDefinitionBuilderSetShapeInferenceFunction(builder, &identity_shape_fn); +// +// For more details about shape inference, see the documentation for +// TF_OpDefinitionBuilderSetShapeInferenceFunction. + +#ifndef TENSORFLOW_C_OPS_H_ +#define TENSORFLOW_C_OPS_H_ + +#include +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct TF_DimensionHandle; +struct TF_OpDefinitionBuilder; +struct TF_ShapeHandle; +struct TF_ShapeInferenceContext; + +// Returns a newly allocated op definition builder for the given op name. The +// returned builder may be customized with the `TF_OpDefinitionBuilder...` +// functions and then registered with TensorFlow with TF_RegisterOpDefinition. +// +// The returned pointer is either freed by a call to TF_RegisterOpDefinition, or +// can be manually deleted by TF_DeleteOpDefinitionBuilder if it is never +// registered. +TF_CAPI_EXPORT extern TF_OpDefinitionBuilder* TF_NewOpDefinitionBuilder( + const char* op_name); + +// Registers the given op builder with TensorFlow. Indicates success or +// otherwise in the given status. +// +// `builder` is freed whether the op was successfully registered or not. You +// must call either this function or TF_DeleteOpDefinitionBuilder to free the +// builder, but never both. +TF_CAPI_EXPORT extern void TF_RegisterOpDefinition( + TF_OpDefinitionBuilder* builder, TF_Status* status); + +// Frees the given op definition builder. You must call either this function or +// TF_RegisterOpDefinition to free the builder, but never both. +TF_CAPI_EXPORT extern void TF_DeleteOpDefinitionBuilder( + TF_OpDefinitionBuilder* builder); + +//---------------------------------------------------- +// Attribute functions. + +// Adds an attr to the given TF_OpDefinitionBuilder. The spec has +// format ":" or ":=" +// where matches regexp [a-zA-Z][a-zA-Z0-9_]*. +// By convention, names containing only capital letters are reserved for +// attributes whose values can be inferred by the operator implementation if not +// supplied by the user. If the attribute name contains characters other than +// capital letters, the operator expects the user to provide the attribute value +// at operation runtime. +// +// can be: +// "string", "int", "float", "bool", "type", "shape", or "tensor" +// "numbertype", "realnumbertype", "quantizedtype" +// (meaning "type" with a restriction on valid values) +// "{int32,int64}" or {realnumbertype,quantizedtype,string}" +// (meaning "type" with a restriction containing unions of value types) +// "{\"foo\", \"bar\n baz\"}", or "{'foo', 'bar\n baz'}" +// (meaning "string" with a restriction on valid values) +// "list(string)", ..., "list(tensor)", "list(numbertype)", ... 
+// (meaning lists of the above types) +// "int >= 2" (meaning "int" with a restriction on valid values) +// "list(string) >= 2", "list(int) >= 2" +// (meaning "list(string)" / "list(int)" with length at least 2) +// , if included, should use the Proto text format +// of . For lists use [a, b, c] format. +// +// Note that any attr specifying the length of an input or output will +// get a default minimum of 1 unless the >= # syntax is used. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderAddAttr( + TF_OpDefinitionBuilder* builder, const char* attr_spec); + +// Adds an input to this TF_OpDefinitionBuilder. +// The spec has form ":" or ":Ref()" +// where matches regexp [a-z][a-z0-9_]* and can be: +// * For a single tensor: +// * For a sequence of tensors with the same type: * +// * For a sequence of tensors with different types: +// Where: +// is either one of "float", "int32", "string", ... +// or the name of an attr (see TF_OpDefinitionBuilderAddAttr) +// with type "type". +// is the name of an attr with type "int". +// is the name of an attr with type "list(type)". +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderAddInput( + TF_OpDefinitionBuilder* builder, const char* input_spec); + +// Adds an output to this TF_OpDefinitionBuilder. +// The spec has form ":" or ":Ref()" +// where matches regexp [a-z][a-z0-9_]* and can be: +// * For a single tensor: +// * For a sequence of tensors with the same type: * +// * For a sequence of tensors with different types: +// Where: +// is either one of "float", "int32", "string", ... +// or the name of an attr (see TF_OpDefinitionBuilderAddAttr) +// with type "type". +// is the name of an attr with type "int". +// is the name of an attr with type "list(type)". +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderAddOutput( + TF_OpDefinitionBuilder* builder, const char* output_spec); + +// Sets the commutative property for the op built by the given builder. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderSetIsCommutative( + TF_OpDefinitionBuilder* builder, bool is_commutative); + +// Sets the is_aggregate property of the builder to the given value. +// +// If is_aggregate is true, then the operation produced by this builder accepts +// N >= 2 inputs and produces 1 output all of the same type. Should be +// associative and commutative, and produce output with the same shape as the +// input. The optimizer may replace an aggregate op taking input from multiple +// devices with a tree of aggregate ops that aggregate locally within each +// device (and possibly within groups of nearby devices) before communicating. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderSetIsAggregate( + TF_OpDefinitionBuilder* builder, bool is_aggregate); + +// Sets the is_stateful property of the builder to the given value. +// +// The op built by this builder is stateful if its behavior depends on some +// state beyond its input tensors (e.g. variable reading op) or if it has a +// side-effect (e.g. printing or asserting ops). Equivalently, stateless ops +// must always produce the same output for the same input and have no +// side-effects. +// +// By default Ops may be moved between devices. Stateful ops should either not +// be moved, or should only be moved if that state can also be moved (e.g. via +// some sort of save / restore). Stateful ops are guaranteed to never be +// optimized away by Common Subexpression Elimination (CSE). 
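+//
+// As an illustrative sketch (not from the original header), an op that draws
+// random values would typically be marked stateful so that CSE never merges
+// two of its instances; "MyRandomOp" is a hypothetical name:
+//
+//   TF_OpDefinitionBuilder* b = TF_NewOpDefinitionBuilder("MyRandomOp");
+//   TF_OpDefinitionBuilderAddOutput(b, "sample: float");
+//   TF_OpDefinitionBuilderSetIsStateful(b, true);
+//   TF_Status* s = TF_NewStatus();
+//   TF_RegisterOpDefinition(b, s);  // also frees `b`
+//   TF_DeleteStatus(s);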
+TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderSetIsStateful( + TF_OpDefinitionBuilder* builder, bool is_stateful); + +// Sets the allows_uninitialized_input property of the operation built by this +// builder. +// +// By default, all inputs to an Op must be initialized Tensors. Ops that may +// initialize tensors for the first time should set this field to true, to allow +// the Op to take an uninitialized Tensor as input. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderSetAllowsUninitializedInput( + TF_OpDefinitionBuilder* builder, bool allows_uninitialized_input); + +// Adds a deprecation warning for the given op. This indicates to the user that +// `version` is the first TensorFlow GraphDef version for which the operation is +// deprecated. `explanation` should contain the reason for the deprecation and +// what to use instead. +// +// This function is only an indicator that the operation may disappear in a +// version of TensorFlow after `version`. It does not affect op registration. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderDeprecated( + TF_OpDefinitionBuilder* builder, int version, const char* explanation); + +// Sets the shape inference function for the op. +TF_CAPI_EXPORT extern void TF_OpDefinitionBuilderSetShapeInferenceFunction( + TF_OpDefinitionBuilder* builder, + void (*shape_inference_func)(TF_ShapeInferenceContext* ctx, + TF_Status* status)); + +//---------------------------------------------------- +// Functions for TF_ShapeInferenceContext. +// +// Functions for implementing shape inference functions. TensorFlow uses these +// functions to determine the shape of tensors produced by an operation without +// having to actually run the operation. If an operation chooses to provide a +// shape inference function, it will be invoked by TensorFlow as needed. +// +// When invoked by TensorFlow, the shape inference function is provided with a +// TF_ShapeInferenceContext pointer. The function's implementation will use the +// accessor and mutator functions with names beginning with +// TF_ShapeInferenceContext to examine the input state and determine the output +// shape. + +// Returns the number of inputs in the given shape inference context. +TF_CAPI_EXPORT extern int64_t TF_ShapeInferenceContextNumInputs( + TF_ShapeInferenceContext* ctx); + +// Returns a newly allocated shape handle. The shapes represented by these +// handles may be queried or mutated with the corresponding +// TF_ShapeInferenceContext... functions. +TF_CAPI_EXPORT extern TF_ShapeHandle* TF_NewShapeHandle(); + +// Places the ith input of the given shape inference context into the given +// shape handle, or returns a status other than TF_OK indicating why the input +// could not be retrieved +// (for example, if i < 0 || i >= TF_ShapeInferenceContextNumInputs(ctx)). +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextGetInput( + TF_ShapeInferenceContext* ctx, int i, TF_ShapeHandle* handle, + TF_Status* status); + +// Places the given shape handle into the `i`th output position of the given +// context. Internally, the shape handle is copied; the caller may subsequently +// delete `handle`. +TF_CAPI_EXPORT +extern void TF_ShapeInferenceContextSetOutput(TF_ShapeInferenceContext* ctx, + int i, TF_ShapeHandle* handle, + TF_Status* status); + +// Returns a newly-allocated scalar shape handle. The returned handle should +// be freed with TF_DeleteShapeHandle. 
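+//
+// For example, a shape inference function for an op that always produces a
+// scalar output could be written as follows (illustrative sketch only):
+//
+//   void scalar_shape_fn(TF_ShapeInferenceContext* ctx, TF_Status* status) {
+//     TF_ShapeHandle* s = TF_ShapeInferenceContextScalar(ctx);
+//     TF_ShapeInferenceContextSetOutput(ctx, 0, s, status);
+//     TF_DeleteShapeHandle(s);  // SetOutput copies the handle internally.
+//   }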
+TF_CAPI_EXPORT extern TF_ShapeHandle* TF_ShapeInferenceContextScalar( + TF_ShapeInferenceContext* ctx); + +// Returns a newly-allocate shape handle representing a vector of the given +// size. The returned handle should be freed with TF_DeleteShapeHandle. +TF_CAPI_EXPORT extern TF_ShapeHandle* TF_ShapeInferenceContextVectorFromSize( + TF_ShapeInferenceContext* ctx, size_t size); + +// Returns a newly allocated dimension handle. It must be freed with +// TF_DeleteDimensionHandle. +TF_CAPI_EXPORT extern TF_DimensionHandle* TF_NewDimensionHandle(); + +// Interprets the named shape inference context attribute as a TF_DataType and +// places it into *val. *status is set to TF_OK. +// +// If the attribute could not be found or could not be interpreted as +// TF_DataType, *status is populated with an error. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContext_GetAttrType( + TF_ShapeInferenceContext* ctx, const char* attr_name, TF_DataType* val, + TF_Status* status); + +// Returns the rank of the shape represented by the given handle. +TF_CAPI_EXPORT extern int64_t TF_ShapeInferenceContextRank( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* handle); + +// Returns 1 if `handle` has a known rank, 0 otherwise. +TF_CAPI_EXPORT extern int TF_ShapeInferenceContextRankKnown( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* handle); + +// If has rank , or its rank is unknown, return OK and return the +// shape with asserted rank in <*result>. Otherwise an error is placed into +// `status`. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextWithRank( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* handle, int64_t rank, + TF_ShapeHandle* result, TF_Status* status); + +// If has rank at least , or its rank is unknown, return OK and +// return the shape with asserted rank in <*result>. Otherwise an error is +// placed into `status`. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextWithRankAtLeast( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* handle, int64_t rank, + TF_ShapeHandle* result, TF_Status* status); + +// If has rank at most , or its rank is unknown, return OK and +// return the shape with asserted rank in <*result>. Otherwise an error is +// placed into `status`. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextWithRankAtMost( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* handle, int64_t rank, + TF_ShapeHandle* result, TF_Status* status); + +// Places a handle to the ith dimension of the given shape into *result. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextDim( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* shape_handle, int64_t i, + TF_DimensionHandle* result); + +// Returns in <*result> a sub-shape of , with dimensions +// [start:end]. and can be negative, to index from the end of the +// shape. and are set to the rank of if > rank of +// . +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextSubshape( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* shape_handle, int64_t start, + int64_t end, TF_ShapeHandle* result, TF_Status* status); + +// Places an unknown shape in all outputs for the given inference context. Used +// for shape inference functions with ops whose output shapes are unknown. +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextSetUnknownShape( + TF_ShapeInferenceContext* ctx, TF_Status* status); + +// Returns whether the given handle represents a known dimension. +TF_CAPI_EXPORT extern int TF_DimensionHandleValueKnown( + TF_DimensionHandle* dim_handle); + +// Returns the value of the given dimension. 
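+//
+// For example, a shape inference function might read the size of the first
+// dimension of input 0 like this (illustrative sketch; error handling
+// omitted):
+//
+//   TF_ShapeHandle* in = TF_NewShapeHandle();
+//   TF_DimensionHandle* d = TF_NewDimensionHandle();
+//   TF_ShapeInferenceContextGetInput(ctx, 0, in, status);
+//   TF_ShapeInferenceContextDim(ctx, in, 0, d);
+//   int64_t n = TF_DimensionHandleValueKnown(d) ? TF_DimensionHandleValue(d) : -1;
+//   TF_DeleteDimensionHandle(d);
+//   TF_DeleteShapeHandle(in);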
+TF_CAPI_EXPORT extern int64_t TF_DimensionHandleValue( + TF_DimensionHandle* dim_handle); + +// Returns in <*result> the result of appending the dimensions of to +// those of . +TF_CAPI_EXPORT extern void TF_ShapeInferenceContextConcatenateShapes( + TF_ShapeInferenceContext* ctx, TF_ShapeHandle* first, + TF_ShapeHandle* second, TF_ShapeHandle* result, TF_Status* status); + +// Frees the given shape handle. +TF_CAPI_EXPORT extern void TF_DeleteShapeHandle(TF_ShapeHandle* handle); + +// Frees the given dimension handle. +TF_CAPI_EXPORT extern void TF_DeleteDimensionHandle(TF_DimensionHandle* handle); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/python_api.h b/third_party/tflite-hdrs/tensorflow/c/python_api.h new file mode 100644 index 00000000..043b7668 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/python_api.h @@ -0,0 +1,82 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_PYTHON_API_H_ +#define TENSORFLOW_C_PYTHON_API_H_ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/core/framework/full_type.pb.h" + +// These functions can be removed without notice. They exist to facilitate some +// refactoring of graph construction code in the Python API. + +namespace tensorflow { + +void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input); + +// Changes an attr value in the node_def Protocol Buffer and sets a status upon +// completion. +void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name, + TF_Buffer* attr_value_proto, TF_Status* status); + +// Clears the attr in the node_def Protocol Buffer and sets a status upon +// completion. +void ClearAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name, + TF_Status* status); + +// Sets the experimental_type` field in the node_def Protocol Buffer. +void SetFullType(TF_Graph* graph, TF_Operation* op, + const TF_Buffer* full_type_proto); + +void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device); + +// Updates 'dst' to consume 'new_src'. +void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst, + TF_Status* status); + +// Extends `session` with any new operations added to its associated graph. +// Usually this happens automatically in TF_SessionRun. After this is called, +// TF_SessionRun will no longer extend the session on every call. +// +// We expose this here to allow fine-grained synchronization in multi-threaded +// workloads, which is required since the Python implementation depends on the +// above mutation methods. This allows us to prevent modifications to nodes in +// the graph after the session has been made aware of them. 
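+//
+// An illustrative call sequence from a language binding (sketch only; `graph`,
+// `new_src`, `dst`, `session` and `status` are caller-provided):
+//
+//   UpdateEdge(graph, new_src, dst, status);  // mutate the graph first
+//   ExtendSession(session, status);           // then sync the session once
+//   // subsequent TF_SessionRun calls will not re-extend the session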
+void ExtendSession(TF_Session* session, TF_Status* status);
+
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if it's a resource or variant tensor, or otherwise returns the
+// empty string.
+std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output);
+
+// Sets `output` based on `proto`, which should be a serialized
+// CppShapeInferenceResult::HandleData proto. `output` should be a resource
+// or variant tensor.
+// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string
+// because I couldn't get SWIG to work otherwise.
+void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
+                           size_t proto_len, TF_Status* status);
+
+// This method is used to add a new input edge to 'dst', which must be a While
+// op. The While op's "T" attribute must have already been updated to include
+// the new edge. This is used to construct tf.while_loop gradients.
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/third_party/tflite-hdrs/tensorflow/c/safe_ptr.h b/third_party/tflite-hdrs/tensorflow/c/safe_ptr.h
new file mode 100644
index 00000000..8d8b8141
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/c/safe_ptr.h
@@ -0,0 +1,68 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_SAFE_PTR_H_
+#define TENSORFLOW_C_SAFE_PTR_H_
+
+#include <memory>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/eager/c_api.h"
+
+namespace tensorflow {
+namespace detail {
+
+struct TFTensorDeleter {
+  void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); }
+};
+
+struct TFETensorHandleDeleter {
+  void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); }
+};
+
+struct TFStatusDeleter {
+  void operator()(TF_Status* p) const { TF_DeleteStatus(p); }
+};
+
+struct TFBufferDeleter {
+  void operator()(TF_Buffer* p) const { TF_DeleteBuffer(p); }
+};
+
+}  // namespace detail
+
+// Safe containers for an owned TF_Tensor. On destruction, the tensor will be
+// deleted by TF_DeleteTensor.
+using Safe_TF_TensorPtr = std::unique_ptr<TF_Tensor, detail::TFTensorDeleter>;
+Safe_TF_TensorPtr make_safe(TF_Tensor* tensor);
+
+// Safe containers for an owned TFE_TensorHandle. On destruction, the handle
+// will be deleted by TFE_DeleteTensorHandle.
+using Safe_TFE_TensorHandlePtr =
+    std::unique_ptr<TFE_TensorHandle, detail::TFETensorHandleDeleter>;
+Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle);
+
+// Safe containers for an owned TF_Status. On destruction, the handle
+// will be deleted by TF_DeleteStatus.
+using Safe_TF_StatusPtr = std::unique_ptr<TF_Status, detail::TFStatusDeleter>;
+Safe_TF_StatusPtr make_safe(TF_Status* status);
+
+// Safe containers for an owned TF_Buffer. On destruction, the handle
+// will be deleted by TF_DeleteBuffer.
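+//
+// As an illustrative sketch, the aliases declared in this header are used like
+// this (error handling omitted):
+//
+//   Safe_TF_StatusPtr status = make_safe(TF_NewStatus());
+//   Safe_TF_BufferPtr buffer = make_safe(TF_NewBuffer());
+//   // ... pass status.get() / buffer.get() to C API calls ...
+//   // Both objects are released automatically when they go out of scope.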
+using Safe_TF_BufferPtr = std::unique_ptr; +Safe_TF_BufferPtr make_safe(TF_Buffer* buffer); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_SAFE_PTR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tensor_interface.h b/third_party/tflite-hdrs/tensorflow/c/tensor_interface.h new file mode 100644 index 00000000..0b352f56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tensor_interface.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TENSOR_INTERFACE_H_ +#define TENSORFLOW_C_TENSOR_INTERFACE_H_ + +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Abstract interface to a Tensor. +// +// This allows us to hide concrete implementations of Tensor from header +// files. The interface lists the common functionality that must be provided by +// any concrete implementation. However, in cases where the true concrete class +// is needed a static_cast can be applied. +class AbstractTensorInterface { + public: + // Release any underlying resources, including the interface object. + virtual void Release() = 0; + + // Returns tensor dtype. + virtual DataType Type() const = 0; + // Returns number of dimensions. + virtual int NumDims() const = 0; + // Returns size of specified dimension + virtual int64_t Dim(int dim_index) const = 0; + // Returns number of elements across all dimensions. + virtual int64_t NumElements() const = 0; + // Return size in bytes of the Tensor + virtual size_t ByteSize() const = 0; + // Returns a pointer to tensor data + virtual void* Data() const = 0; + + // Returns if the tensor is aligned + virtual bool IsAligned() const = 0; + // Returns if their is sole ownership of this Tensor and thus it can be moved. + virtual bool CanMove() const = 0; + + virtual std::string SummarizeValue() const = 0; + + protected: + virtual ~AbstractTensorInterface() {} +}; + +namespace internal { +struct AbstractTensorInterfaceDeleter { + void operator()(AbstractTensorInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractTensorPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_TENSOR_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_attrtype.h b/third_party/tflite-hdrs/tensorflow/c/tf_attrtype.h new file mode 100644 index 00000000..0c1545db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_attrtype.h @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_TF_ATTRTYPE_H_ +#define TENSORFLOW_C_TF_ATTRTYPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// TF_AttrType describes the type of the value of an attribute on an operation. +typedef enum TF_AttrType { + TF_ATTR_STRING = 0, + TF_ATTR_INT = 1, + TF_ATTR_FLOAT = 2, + TF_ATTR_BOOL = 3, + TF_ATTR_TYPE = 4, + TF_ATTR_SHAPE = 5, + TF_ATTR_TENSOR = 6, + TF_ATTR_PLACEHOLDER = 7, + TF_ATTR_FUNC = 8, +} TF_AttrType; + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_ATTRTYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_buffer.h b/third_party/tflite-hdrs/tensorflow/c/tf_buffer.h new file mode 100644 index 00000000..71a9aef8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_buffer.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_BUFFER_H_ +#define TENSORFLOW_C_TF_BUFFER_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// TF_Buffer holds a pointer to a block of data and its associated length. +// Typically, the data consists of a serialized protocol buffer, but other data +// may also be held in a buffer. +// +// By default, TF_Buffer itself does not do any memory management of the +// pointed-to block. If need be, users of this struct should specify how to +// deallocate the block by setting the `data_deallocator` function pointer. +typedef struct TF_Buffer { + const void* data; + size_t length; + void (*data_deallocator)(void* data, size_t length); +} TF_Buffer; + +// Makes a copy of the input and sets an appropriate deallocator. Useful for +// passing in read-only, input protobufs. +TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto, + size_t proto_len); + +// Useful for passing *out* a protobuf. 
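+// For example, to receive a serialized GraphDef (illustrative sketch;
+// TF_GraphToGraphDef is declared in c_api.h, not in this header):
+//
+//   TF_Buffer* out = TF_NewBuffer();
+//   TF_GraphToGraphDef(graph, out, status);
+//   // ... read out->data / out->length ...
+//   TF_DeleteBuffer(out);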
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer(void); + +TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*); + +TF_CAPI_EXPORT extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_BUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_buffer_internal.h b/third_party/tflite-hdrs/tensorflow/c/tf_buffer_internal.h new file mode 100644 index 00000000..85436f42 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_buffer_internal.h @@ -0,0 +1,45 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_BUFFER_INTERNAL_H_ +#define TENSORFLOW_C_TF_BUFFER_INTERNAL_H_ + +#include + +#include "tensorflow/c/tf_buffer.h" +#include "tensorflow/core/platform/protobuf.h" // IWYU pragma: keep +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +absl::Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in, + TF_Buffer* out); + +absl::Status BufferToMessage(const TF_Buffer* in, + tensorflow::protobuf::MessageLite* out); + +namespace internal { + +struct TF_BufferDeleter { + void operator()(TF_Buffer* buf) const { TF_DeleteBuffer(buf); } +}; + +} // namespace internal + +using TF_BufferPtr = std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_TF_BUFFER_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_datatype.h b/third_party/tflite-hdrs/tensorflow/c/tf_datatype.h new file mode 100644 index 00000000..9a9eaadc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_datatype.h @@ -0,0 +1,75 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_DATATYPE_H_ +#define TENSORFLOW_C_TF_DATATYPE_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// TF_DataType holds the type for a scalar value. E.g., one slot in a tensor. +// The enum values here are identical to corresponding values in types.proto. +typedef enum TF_DataType { + TF_FLOAT = 1, + TF_DOUBLE = 2, + TF_INT32 = 3, // Int32 tensors are always in 'host' memory. 
+ TF_UINT8 = 4, + TF_INT16 = 5, + TF_INT8 = 6, + TF_STRING = 7, + TF_COMPLEX64 = 8, // Single-precision complex + TF_COMPLEX = 8, // Old identifier kept for API backwards compatibility + TF_INT64 = 9, + TF_BOOL = 10, + TF_QINT8 = 11, // Quantized int8 + TF_QUINT8 = 12, // Quantized uint8 + TF_QINT32 = 13, // Quantized int32 + TF_BFLOAT16 = 14, // Float32 truncated to 16 bits. + TF_QINT16 = 15, // Quantized int16 + TF_QUINT16 = 16, // Quantized uint16 + TF_UINT16 = 17, + TF_COMPLEX128 = 18, // Double-precision complex + TF_HALF = 19, + TF_RESOURCE = 20, + TF_VARIANT = 21, + TF_UINT32 = 22, + TF_UINT64 = 23, + TF_FLOAT8_E5M2 = 24, // 5 exponent bits, 2 mantissa bits. + TF_FLOAT8_E4M3FN = 25, // 4 exponent bits, 3 mantissa bits, finite-only, with + // 2 NaNs (0bS1111111). + // TODO - b/299182407: Leaving room for remaining float8 types. + // TF_FLOAT8_E4M3FNUZ = 26, + // TF_FLOAT8_E4M3B11FNUZ = 27, + // TF_FLOAT8_E5M2FNUZ = 28, + TF_INT4 = 29, + TF_UINT4 = 30, +} TF_DataType; + +// TF_DataTypeSize returns the sizeof() for the underlying type corresponding +// to the given TF_DataType enum value. Returns 0 for variable length types +// (eg. TF_STRING) or on failure. +TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_DATATYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_file_statistics.h b/third_party/tflite-hdrs/tensorflow/c/tf_file_statistics.h new file mode 100644 index 00000000..117d9501 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_file_statistics.h @@ -0,0 +1,34 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_FILE_STATISTICS_H_ +#define TENSORFLOW_C_TF_FILE_STATISTICS_H_ + +#include + +typedef struct TF_FileStatistics { + // The length of the file in bytes. + int64_t length; + // The last modified time in nanoseconds. + int64_t mtime_nsec; + // Whether the name refers to a directory. + bool is_directory; +} TF_FileStatistics; + +// TODO(b/139060984): `tensorflow::FileStatistics` from +// `core/platform/file_statistics.h` is a duplicate of this so maybe try to +// remove duplication later? + +#endif // TENSORFLOW_C_TF_FILE_STATISTICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_shape.h b/third_party/tflite-hdrs/tensorflow/c/tf_shape.h new file mode 100644 index 00000000..f218d05e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_shape.h @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/c_api_macros.h" + +#ifndef TENSORFLOW_C_TF_SHAPE_H_ +#define TENSORFLOW_C_TF_SHAPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// An opaque type corresponding to a shape in tensorflow. In the future, +// we may expose the ABI of TF_Shape for performance reasons. +typedef struct TF_Shape TF_Shape; + +// Return a new, unknown rank shape object. The caller is responsible for +// calling TF_DeleteShape to deallocate and destroy the returned shape. +TF_CAPI_EXPORT extern TF_Shape* TF_NewShape(); + +// Returns the rank of `shape`. If `shape` has unknown rank, returns -1. +TF_CAPI_EXPORT extern int TF_ShapeDims(const TF_Shape* shape); + +// Returns the `d`th dimension of `shape`. If `shape` has unknown rank, +// invoking this function is undefined behavior. Returns -1 if dimension is +// unknown. +TF_CAPI_EXPORT extern int64_t TF_ShapeDimSize(const TF_Shape* shape, int d); + +// Deletes `shape`. +TF_CAPI_EXPORT extern void TF_DeleteShape(TF_Shape* shape); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_SHAPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_shape_internal.h b/third_party/tflite-hdrs/tensorflow/c/tf_shape_internal.h new file mode 100644 index 00000000..fe977264 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_shape_internal.h @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ +#define TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/core/framework/tensor_shape.h" + +typedef struct TF_Shape TF_Shape; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::PartialTensorShape, TF_Shape); + +} + +#endif // TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_status.h b/third_party/tflite-hdrs/tensorflow/c/tf_status.h new file mode 100644 index 00000000..8979e42c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_status.h @@ -0,0 +1,98 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_STATUS_H_ +#define TENSORFLOW_C_TF_STATUS_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "xla/tsl/c/tsl_status.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TSL_Status TF_Status; + +// -------------------------------------------------------------------------- +// TF_Code holds an error code. The enum values here are identical to +// corresponding values in error_codes.proto. +typedef TSL_Code TF_Code; +// LINT.IfChange +#define TF_OK TSL_OK +#define TF_CANCELLED TSL_CANCELLED +#define TF_UNKNOWN TSL_UNKNOWN +#define TF_INVALID_ARGUMENT TSL_INVALID_ARGUMENT +#define TF_DEADLINE_EXCEEDED TSL_DEADLINE_EXCEEDED +#define TF_NOT_FOUND TSL_NOT_FOUND +#define TF_ALREADY_EXISTS TSL_ALREADY_EXISTS +#define TF_PERMISSION_DENIED TSL_PERMISSION_DENIED +#define TF_UNAUTHENTICATED TSL_UNAUTHENTICATED +#define TF_RESOURCE_EXHAUSTED TSL_RESOURCE_EXHAUSTED +#define TF_FAILED_PRECONDITION TSL_FAILED_PRECONDITION +#define TF_ABORTED TSL_ABORTED +#define TF_OUT_OF_RANGE TSL_OUT_OF_RANGE +#define TF_UNIMPLEMENTED TSL_UNIMPLEMENTED +#define TF_INTERNAL TSL_INTERNAL +#define TF_UNAVAILABLE TSL_UNAVAILABLE +#define TF_DATA_LOSS TSL_DATA_LOSS +// LINT.ThenChange(//tensorflow/python/py_exception_registry_wrapper.cc) + +// -------------------------------------------------------------------------- + +// Return a new status object. +TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void); + +// Delete a previously created status object. +TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*); + +// Record in *s. Any previous information is lost. +// A common use is to clear a status: TF_SetStatus(s, TF_OK, ""); +TF_CAPI_EXPORT extern void TF_SetStatus(TF_Status* s, TF_Code code, + const char* msg); + +// Record as a payload in *s. The previous payload having the +// same key (if any) is overwritten. Payload will not be added if the Status +// is OK. +TF_CAPI_EXPORT void TF_SetPayload(TF_Status* s, const char* key, + const char* value); + +// Iterates over the stored payloads and calls the `visitor(key, value)` +// callable for each one. `key` and `value` is only usable during the callback. +// `capture` will be passed to the callback without modification. +#define TF_PayloadVisitor TSL_PayloadVisitor +TF_CAPI_EXPORT extern void TF_ForEachPayload(const TF_Status* s, + TF_PayloadVisitor visitor, + void* capture); + +// Convert from an I/O error code (e.g., errno) to a TF_Status value. +// Any previous information is lost. Prefer to use this instead of TF_SetStatus +// when the error comes from I/O operations. +TF_CAPI_EXPORT extern void TF_SetStatusFromIOError(TF_Status* s, int error_code, + const char* context); + +// Return the code record in *s. +TF_CAPI_EXPORT extern TF_Code TF_GetCode(const TF_Status* s); + +// Return a pointer to the (null-terminated) error message in *s. The +// return value points to memory that is only usable until the next +// mutation to *s. Always returns an empty string if TF_GetCode(s) is +// TF_OK. 
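+//
+// A typical (illustrative) call pattern, where SomeApiCall stands in for any
+// C API function that reports errors through a TF_Status out-parameter:
+//
+//   TF_Status* s = TF_NewStatus();
+//   SomeApiCall(..., s);
+//   if (TF_GetCode(s) != TF_OK) {
+//     fprintf(stderr, "error: %s\n", TF_Message(s));
+//   }
+//   TF_DeleteStatus(s);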
+TF_CAPI_EXPORT extern const char* TF_Message(const TF_Status* s); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_STATUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_status_helper.h b/third_party/tflite-hdrs/tensorflow/c/tf_status_helper.h new file mode 100644 index 00000000..ce833c39 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_status_helper.h @@ -0,0 +1,74 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H_ +#define TENSORFLOW_C_TF_STATUS_HELPER_H_ + +#include +#include + +#include "tensorflow/c/tf_status.h" +#include "tsl/platform/status.h" + +namespace tsl { +// Set the attribute of "tf_status" from the attributes of "status". +void Set_TF_Status_from_Status(TF_Status* tf_status, + const absl::Status& status); + +// Returns a "status" from "tf_status". +absl::Status StatusFromTF_Status(const TF_Status* tf_status); +} // namespace tsl + +namespace tensorflow { +using tsl::Set_TF_Status_from_Status; // NOLINT +using tsl::StatusFromTF_Status; // NOLINT + +namespace internal { +struct TF_StatusDeleter { + void operator()(TF_Status* tf_status) const { TF_DeleteStatus(tf_status); } +}; +} // namespace internal + +using TF_StatusPtr = std::unique_ptr; + +} // namespace tensorflow + +#define TF_STATUS_ASSIGN_OR_RETURN(lhs, rexpr, c_status) \ + _TF_STATUS_ASSIGN_OR_RETURN_IMPL( \ + _TF_STATUS_CONCAT(_status_or_value, __COUNTER__), lhs, rexpr, c_status); + +#define _TF_STATUS_ASSIGN_OR_RETURN_IMPL(statusor, lhs, rexpr, c_status) \ + auto statusor = (rexpr); \ + if (!statusor.ok()) { \ + tensorflow::Set_TF_Status_from_Status(c_status, statusor.status()); \ + return; \ + } \ + lhs = std::move(*statusor) + +#define TF_STATUS_RETURN_IF_ERROR(rexpr, c_status) \ + _TF_STATUS_RETURN_IF_ERROR_IMPL(_TF_STATUS_CONCAT(_status, __COUNTER__), \ + rexpr, c_status); + +#define _TF_STATUS_RETURN_IF_ERROR_IMPL(status, rexpr, c_status) \ + auto status = (rexpr); \ + if (!status.ok()) { \ + tensorflow::Set_TF_Status_from_Status(c_status, status); \ + return; \ + } + +#define _TF_STATUS_CONCAT(x, y) _TF_STATUS_CONCAT_IMPL(x, y) +#define _TF_STATUS_CONCAT_IMPL(x, y) x##y + +#endif // TENSORFLOW_C_TF_STATUS_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_status_internal.h b/third_party/tflite-hdrs/tensorflow/c/tf_status_internal.h new file mode 100644 index 00000000..4aa273fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_status_internal.h @@ -0,0 +1,23 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_STATUS_INTERNAL_H_ +#define TENSORFLOW_C_TF_STATUS_INTERNAL_H_ + +#include "xla/tsl/c/tsl_status_internal.h" + +typedef struct TSL_Status TF_Status; + +#endif // TENSORFLOW_C_TF_STATUS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_tensor.h b/third_party/tflite-hdrs/tensorflow/c/tf_tensor.h new file mode 100644 index 00000000..b2855d28 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_tensor.h @@ -0,0 +1,161 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_TENSOR_H_ +#define TENSORFLOW_C_TF_TENSOR_H_ + +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Allocator Attributes used for tensor allocation. +typedef struct TF_AllocatorAttributes { + size_t struct_size; + // Set boolean to 1 for CPU allocation, else 0. + TF_Bool on_host; +} TF_AllocatorAttributes; + +#define TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE \ + TF_OFFSET_OF_END(TF_AllocatorAttributes, on_host) + +// -------------------------------------------------------------------------- +// TF_Tensor holds a multi-dimensional array of elements of a single data type. +// For all types other than TF_STRING, the data buffer stores elements +// in row major order. E.g. if data is treated as a vector of TF_DataType: +// +// element 0: index (0, ..., 0) +// element 1: index (0, ..., 1) +// ... +// +// The format for TF_STRING tensors is: +// start_offset: array[uint64] +// data: byte[...] +// +// The string length (as a varint, start_offset[i + 1] - start_offset[i]), +// followed by the contents of the string is encoded at data[start_offset[i]]. +// TF_StringEncode and TF_StringDecode facilitate this encoding. + +typedef struct TF_Tensor TF_Tensor; + +// Return a new tensor that holds the bytes data[0,len-1]. +// +// The data will be deallocated by a subsequent call to TF_DeleteTensor via: +// (*deallocator)(data, len, deallocator_arg) +// Clients must provide a custom deallocator function so they can pass in +// memory managed by something like numpy. +// +// May return NULL (and invoke the deallocator) if the provided data buffer +// (data, len) is inconsistent with a tensor of the given TF_DataType +// and the shape specified by (dima, num_dims). 
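+//
+// For example, wrapping caller-owned memory that TensorFlow must not free
+// (illustrative sketch; the no-op deallocator is a common pattern for memory
+// whose lifetime is managed elsewhere):
+//
+//   void noop_dealloc(void* data, size_t len, void* arg) {}
+//
+//   int64_t dims[1] = {4};
+//   float values[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+//   TF_Tensor* t = TF_NewTensor(TF_FLOAT, dims, 1, values, sizeof(values),
+//                               noop_dealloc, NULL);
+//   // ... use t ...
+//   TF_DeleteTensor(t);  // invokes noop_dealloc; `values` remains valid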
+TF_CAPI_EXPORT extern TF_Tensor* TF_NewTensor( + TF_DataType, const int64_t* dims, int num_dims, void* data, size_t len, + void (*deallocator)(void* data, size_t len, void* arg), + void* deallocator_arg); + +// Returns the alignment, in bytes, required for allocating aligned tensors. +// +// This can be used in combination with TF_NewTensor to manually manage +// memory while ensuring the resulting tensors satisfy TensorFlow's +// memory alignment preferences. +TF_CAPI_EXPORT extern size_t TF_TensorDefaultAlignment(); + +// Allocate and return a new Tensor. +// +// This function is an alternative to TF_NewTensor and should be used when +// memory is allocated to pass the Tensor to the C API. The allocated memory +// satisfies TensorFlow's memory alignment preferences and should be preferred +// over calling malloc and free. +// +// The caller must set the Tensor values by writing them to the pointer returned +// by TF_TensorData with length TF_TensorByteSize. +TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTensor(TF_DataType, + const int64_t* dims, + int num_dims, size_t len); + +// Deletes `tensor` and returns a new TF_Tensor with the same content if +// possible. Returns nullptr and leaves `tensor` untouched if not. +TF_CAPI_EXPORT extern TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor); + +// Destroy a tensor. +TF_CAPI_EXPORT extern void TF_DeleteTensor(TF_Tensor*); + +// Return the type of a tensor element. +TF_CAPI_EXPORT extern TF_DataType TF_TensorType(const TF_Tensor*); + +// Set a new shape for the Tensor. +TF_CAPI_EXPORT extern void TF_SetShape(TF_Tensor* tensor, const int64_t* dims, + int num_dims); + +// Return the number of dimensions that the tensor has. +TF_CAPI_EXPORT extern int TF_NumDims(const TF_Tensor*); + +// Return the length of the tensor in the "dim_index" dimension. +// REQUIRES: 0 <= dim_index < TF_NumDims(tensor) +TF_CAPI_EXPORT extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index); + +// Return the size of the underlying data in bytes. +TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*); + +// Return a pointer to the underlying data buffer. +TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*); + +// Returns the number of elements in the tensor. +TF_CAPI_EXPORT extern int64_t TF_TensorElementCount(const TF_Tensor* tensor); + +// Copy the internal data representation of `from` to `to`. `new_dims` and +// `num_new_dims` specify the new shape of the `to` tensor, `type` specifies its +// data type. On success, *status is set to TF_OK and the two tensors share the +// same data buffer. +// +// This call requires that the `from` tensor and the given type and shape (dims +// and num_dims) are "compatible" (i.e. they occupy the same number of bytes). +// Specifically, given from_type_size = TF_DataTypeSize(TF_TensorType(from)): +// +// ShapeElementCount(dims, num_dims) * TF_DataTypeSize(type) +// +// must equal +// +// TF_TensorElementCount(from) * from_type_size +// +// where TF_ShapeElementCount would be the number of elements in a tensor with +// the given shape. +// +// In addition, this function requires: +// * TF_DataTypeSize(TF_TensorType(from)) != 0 +// * TF_DataTypeSize(type) != 0 +// +// If any of the requirements are not met, *status is set to +// TF_INVALID_ARGUMENT. +TF_CAPI_EXPORT extern void TF_TensorBitcastFrom(const TF_Tensor* from, + TF_DataType type, TF_Tensor* to, + const int64_t* new_dims, + int num_new_dims, + TF_Status* status); + +// Returns bool iff this tensor is aligned. 
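+//
+// Illustrative sketch: allocating a tensor and filling it in place (error
+// handling omitted):
+//
+//   int64_t dims[2] = {2, 3};
+//   TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, 6 * sizeof(float));
+//   float* data = (float*)TF_TensorData(t);
+//   for (int64_t i = 0; i < TF_TensorElementCount(t); ++i) data[i] = 0.0f;
+//   bool aligned = TF_TensorIsAligned(t);  // expected to hold for
+//                                          // TF_AllocateTensor output
+//   TF_DeleteTensor(t);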
+TF_CAPI_EXPORT extern bool TF_TensorIsAligned(const TF_Tensor*); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_tensor_helper.h b/third_party/tflite-hdrs/tensorflow/c/tf_tensor_helper.h new file mode 100644 index 00000000..b77d5a78 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_tensor_helper.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_TENSOR_HELPER_H_ +#define TENSORFLOW_C_TF_TENSOR_HELPER_H_ + +#include + +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class Tensor; + +absl::Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); + +TF_Tensor* TF_TensorFromTensor(const Tensor& src, absl::Status* status); + +TF_Tensor* TF_TensorFromTensorShallow(const Tensor& src, absl::Status* status); + +namespace internal { + +struct TFTensorDeleter { + void operator()(TF_Tensor* tf_tensor) const { TF_DeleteTensor(tf_tensor); } +}; + +} // namespace internal + +// Struct that wraps TF_Tensor to delete once out of scope. +using TF_TensorPtr = std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_TF_TENSOR_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_tensor_internal.h b/third_party/tflite-hdrs/tensorflow/c/tf_tensor_internal.h new file mode 100644 index 00000000..61bceee5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_tensor_internal.h @@ -0,0 +1,136 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_TENSOR_INTERNAL_H_ +#define TENSORFLOW_C_TF_TENSOR_INTERNAL_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/c/tf_tensor_helper.h" // IWYU pragma: export +#include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/status.h" + +// Internal structures used by the C API. These are likely to change and should +// not be depended on. 
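+//
+// As an illustrative sketch (not part of the original header), the conversion
+// helpers pulled in from tf_tensor_helper.h above are typically used like
+// this; `cpp_tensor` is a hypothetical tensorflow::Tensor:
+//
+//   absl::Status status;
+//   tensorflow::TF_TensorPtr c_tensor(
+//       tensorflow::TF_TensorFromTensor(cpp_tensor, &status));
+//   if (status.ok()) {
+//     // hand c_tensor.get() across the C API boundary
+//   }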
+ +// This struct forms part of the C API's public interface. It must strictly be +// passed to or returned from C functions *by pointer*. Otherwise, changes to +// its internal structure will break the C API's binary interface. +typedef struct TF_Tensor { + tensorflow::AbstractTensorInterface* tensor; +} TF_Tensor; + +class TF_ManagedBuffer : public tensorflow::TensorBuffer { + public: + TF_ManagedBuffer(void* data, size_t len, + void (*deallocator)(void* data, size_t len, void* arg), + void* deallocator_arg, bool owns_memory) + : TensorBuffer(data), + len_(len), + deallocator_(deallocator), + deallocator_arg_(deallocator_arg), + owns_memory_(owns_memory) {} + + ~TF_ManagedBuffer() override { + (*deallocator_)(data(), len_, deallocator_arg_); + } + + size_t size() const override { return len_; } + TensorBuffer* root_buffer() override { return this; } + void FillAllocationDescription( + tensorflow::AllocationDescription* proto) const override { + int64_t rb = size(); + proto->set_requested_bytes(rb); + proto->set_allocator_name(tensorflow::cpu_allocator()->Name()); + } + + bool OwnsMemory() const override { return owns_memory_; } + + private: + const size_t len_; + void (*const deallocator_)(void* data, size_t len, void* arg); + void* const deallocator_arg_; + bool owns_memory_; +}; + +namespace tensorflow { + +class TensorCApi { + public: + static TensorBuffer* Buffer(const Tensor& tensor) { return tensor.buf_; } + static Tensor MakeTensor(TF_DataType type, const TensorShape& shape, + TensorBuffer* buf) { + return Tensor(static_cast(type), shape, buf); + } +}; + +// Allocates tensor data buffer using specified allocator. +// `operation` is a name for this operation. +void* allocate_tensor(const char* operation, size_t len, Allocator* allocator); + +// Deallocates tensor data buffer. +// Defaults to deallocating using CPU allocator. You can pass pointer to +// a different Allocator as `arg`. +void deallocate_buffer(void* data, size_t len, void* arg); + +class TensorInterface : public AbstractTensorInterface { + public: + TensorInterface() {} + explicit TensorInterface(tensorflow::Tensor t) : tensor_(std::move(t)) {} + ~TensorInterface() override {} + + void Release() override; + + DataType Type() const override; + int NumDims() const override; + int64_t Dim(int dim_index) const override; + int64_t NumElements() const override; + size_t ByteSize() const override; + void* Data() const override; + bool IsAligned() const override; + bool CanMove() const override; + std::string SummarizeValue() const override; + + void SetShape(const int64_t* dims, int num_dims); + absl::Status ToTensor(tensorflow::Tensor* dst) const; + absl::Status BitcastFrom(const TensorInterface& from, DataType type, + const int64_t* new_dims, int num_new_dims); + absl::Status FromProto(const tensorflow::TensorProto& from); + + tensorflow::Tensor& Tensor() { return tensor_; } + + private: + tensorflow::Tensor tensor_; +}; + +inline Tensor& TensorFromInterface(AbstractTensorInterface* tensor) { + return down_cast(tensor)->Tensor(); +} + +AbstractTensorInterface* TensorInterfaceFromTensor(const Tensor& src, + absl::Status* status); + +} // namespace tensorflow + +#endif // TENSORFLOW_C_TF_TENSOR_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/c/tf_tstring.h b/third_party/tflite-hdrs/tensorflow/c/tf_tstring.h new file mode 100644 index 00000000..876fd5f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/c/tf_tstring.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_TF_TSTRING_H_ +#define TENSORFLOW_C_TF_TSTRING_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/ctstring.h" + +#ifdef __cplusplus +extern "C" { +#endif + +TF_CAPI_EXPORT extern void TF_StringInit(TF_TString *t); + +TF_CAPI_EXPORT extern void TF_StringCopy(TF_TString *dst, const char *src, + size_t size); + +TF_CAPI_EXPORT extern void TF_StringAssignView(TF_TString *dst, const char *src, + size_t size); + +TF_CAPI_EXPORT extern const char *TF_StringGetDataPointer( + const TF_TString *tstr); + +TF_CAPI_EXPORT extern TF_TString_Type TF_StringGetType(const TF_TString *str); + +TF_CAPI_EXPORT extern size_t TF_StringGetSize(const TF_TString *tstr); + +TF_CAPI_EXPORT extern size_t TF_StringGetCapacity(const TF_TString *str); + +TF_CAPI_EXPORT extern void TF_StringDealloc(TF_TString *tstr); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_TSTRING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/client/client_session.h b/third_party/tflite-hdrs/tensorflow/cc/client/client_session.h new file mode 100644 index 00000000..9dc790d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/client/client_session.h @@ -0,0 +1,164 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_CLIENT_CLIENT_SESSION_H_ +#define TENSORFLOW_CC_CLIENT_CLIENT_SESSION_H_ + +#include +#include +#include +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/public/session_options.h" + +namespace tsl { +namespace thread { +struct ThreadPoolOptions; +} +} // namespace tsl + +namespace tensorflow { + +namespace thread { +using tsl::thread::ThreadPoolOptions; +} + +/// @addtogroup core +/// @{ + +/// A `ClientSession` object lets the caller drive the evaluation of the +/// TensorFlow graph constructed with the C++ API. +/// +/// Example: +/// +/// Scope root = Scope::NewRootScope(); +/// auto a = Placeholder(root, DT_INT32); +/// auto c = Add(root, a, {41}); +/// +/// ClientSession session(root); +/// std::vector outputs; +/// +/// Status s = session.Run({ {a, {1}} }, {c}, &outputs); +/// if (!s.ok()) { ... 
} +class ClientSession { + public: + /// A data type to represent feeds to a Run call. + /// + /// This is a map of `Output` objects returned by op-constructors to the value + /// to feed them with. See `Input::Initializer` for details on what can be + /// used as feed values. + typedef std::unordered_map FeedType; + + /// Create a new session to evaluate the graph contained in `scope` by + /// connecting to the TensorFlow runtime specified by `target`. + ClientSession(const Scope& scope, const string& target); + + /// Same as above, but use the empty string ("") as the target specification. + explicit ClientSession(const Scope& scope); + + /// Create a new session, configuring it with `session_options`. + ClientSession(const Scope& scope, const SessionOptions& session_options); + + ~ClientSession(); + + /// Evaluate the tensors in `fetch_outputs`. The values are returned as + /// `Tensor` objects in `outputs`. The number and order of `outputs` will + /// match `fetch_outputs`. + absl::Status Run(const std::vector& fetch_outputs, + std::vector* outputs) const; + + /// Same as above, but use the mapping in `inputs` as feeds. + absl::Status Run(const FeedType& inputs, + const std::vector& fetch_outputs, + std::vector* outputs) const; + + /// Same as above. Additionally runs the operations ins `run_outputs`. + absl::Status Run(const FeedType& inputs, + const std::vector& fetch_outputs, + const std::vector& run_outputs, + std::vector* outputs) const; + + /// Use `run_options` to turn on performance profiling. `run_metadata`, if not + /// null, is filled in with the profiling results. + absl::Status Run(const RunOptions& run_options, const FeedType& inputs, + const std::vector& fetch_outputs, + const std::vector& run_outputs, + std::vector* outputs, + RunMetadata* run_metadata) const; + + /// Same as above. Additionally allows user to provide custom threadpool + /// implementation via ThreadPoolOptions. + absl::Status Run(const RunOptions& run_options, const FeedType& inputs, + const std::vector& fetch_outputs, + const std::vector& run_outputs, + std::vector* outputs, RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) const; + + /// \brief A handle to a subgraph, created with + /// `ClientSession::MakeCallable()`. + typedef int64_t CallableHandle; + + /// \brief Creates a `handle` for invoking the subgraph defined by + /// `callable_options`. + /// NOTE: This API is still experimental and may change. + absl::Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle); + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. + /// + /// The order of tensors in `feed_tensors` must match the order of names in + /// `CallableOptions::feed()` and the order of tensors in `fetch_tensors` will + /// match the order of names in `CallableOptions::fetch()` when this subgraph + /// was created. + /// NOTE: This API is still experimental and may change. + absl::Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata); + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. + /// + /// The order of tensors in `feed_tensors` must match the order of names in + /// `CallableOptions::feed()` and the order of tensors in `fetch_tensors` will + /// match the order of names in `CallableOptions::fetch()` when this subgraph + /// was created. 
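// --- Annotation (not part of the vendored headers): illustrative sketch ---
// A minimal end-to-end use of ClientSession::Run with a feed, following the
// example in the class comment above. The op wrappers (Placeholder, Add) come
// from the generated ops headers; the concrete graph and values are
// illustrative assumptions.
#include <vector>
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/tensor.h"

absl::Status RunAddExample() {
  using namespace tensorflow;
  Scope root = Scope::NewRootScope();
  auto a = ops::Placeholder(root, DT_INT32);
  auto c = ops::Add(root, a, {41});

  ClientSession session(root);
  std::vector<Tensor> outputs;
  // Feed `a` with the scalar 1 and fetch `c`; outputs[0] holds 42 on success.
  return session.Run({{a, {1}}}, {c}, &outputs);
}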
+ /// NOTE: This API is still experimental and may change. + absl::Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata, + const thread::ThreadPoolOptions& options); + + /// \brief Releases resources associated with the given `handle` in this + /// session. + /// NOTE: This API is still experimental and may change. + absl::Status ReleaseCallable(CallableHandle handle); + + private: + class Impl; + std::unique_ptr impl_; + Impl* impl() { return impl_.get(); } + const Impl* impl() const { return impl_.get(); } +}; + +/// @} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CC_CLIENT_CLIENT_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime.h new file mode 100644 index 00000000..711a38c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ + +#include + +#include "tensorflow/c/eager/c_api_experimental.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// Runtime represents an opaque instance of a Tensorflow runtime, with its own +// resources, threadpools, etc. Clients are expected to construct a Runtime +// object through tensorflow::cc::RuntimeBuilder::Build, after setting any +// relevant configuration options. Many Tensorflow functions take a reference to +// the runtime as an argument (eg: tensorflow::cc::SavedModelAPI::Load), and +// may have different implementations depending on the runtime. For many of +// these Runtime-attached objects (such as tensorflow::cc::TensorHandle), the +// Runtime must outlive these objects. +class Runtime { + public: + // Runtime is movable, but not copyable. + Runtime(Runtime&&) = default; + Runtime& operator=(Runtime&&) = default; + + private: + friend class RuntimeBuilder; + friend class SavedModelAPI; + friend class TensorHandle; + + // Wraps a TFE_Context. Takes ownership of ctx. + explicit Runtime(TFE_Context* ctx) : ctx_(ctx) {} + + // Deletes the currently wrapped TFE_Context, swaps it with ctx, + // and takes ownership of ctx. + void Reset(TFE_Context* ctx) { ctx_.reset(ctx); } + + // Returns the TFE_Context that this object wraps. This object + // retains ownership of the pointer. 
+  TFE_Context* GetTFEContext() const { return ctx_.get(); }
+
+  // Runtime is not copyable
+  Runtime(const Runtime&) = delete;
+  Runtime& operator=(const Runtime&) = delete;
+
+  struct TFEContextDeleter {
+    void operator()(TFE_Context* p) const { TFE_DeleteContext(p); }
+  };
+  std::unique_ptr<TFE_Context, TFEContextDeleter> ctx_;
+};
+
+}  // namespace cc
+}  // namespace experimental
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_
diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime_builder.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime_builder.h
new file mode 100644
index 00000000..737e06cb
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/runtime_builder.h
@@ -0,0 +1,86 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_
+#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_
+
+#include <memory>
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/cc/experimental/base/public/runtime.h"
+#include "tensorflow/cc/experimental/base/public/status.h"
+
+namespace tensorflow {
+namespace experimental {
+namespace cc {
+
+// RuntimeBuilder is a builder used to construct a tensorflow::cc::Runtime.
+// Use this to set configuration options, like threadpool size, etc.
+class RuntimeBuilder {
+ public:
+  RuntimeBuilder() : options_(TFE_NewContextOptions()) {}
+
+  // If `use_tfrt` is true, we will use the new Tensorflow Runtime
+  // (https://blog.tensorflow.org/2020/04/tfrt-new-tensorflow-runtime.html) as
+  // our runtime implementation.
+  RuntimeBuilder& SetUseTFRT(bool use_tfrt);
+
+  // Build a Tensorflow Runtime.
+  //
+  // Params:
+  //  status - Set to OK on success and an appropriate error on failure.
+  // Returns:
+  //  If status is not OK, returns nullptr. Otherwise, returns a
+  //  unique_ptr<tensorflow::cc::Runtime>.
+  std::unique_ptr<Runtime> Build(Status* status);
+
+  // RuntimeBuilder is movable, but not copyable.
+  RuntimeBuilder(RuntimeBuilder&&) = default;
+  RuntimeBuilder& operator=(RuntimeBuilder&&) = default;
+
+ private:
+  // RuntimeBuilder is not copyable
+  RuntimeBuilder(const RuntimeBuilder&) = delete;
+  RuntimeBuilder& operator=(const RuntimeBuilder&) = delete;
+
+  struct TFEContextOptionsDeleter {
+    void operator()(TFE_ContextOptions* p) const {
+      TFE_DeleteContextOptions(p);
+    }
+  };
+  std::unique_ptr<TFE_ContextOptions, TFEContextOptionsDeleter> options_;
+};
+
+inline RuntimeBuilder& RuntimeBuilder::SetUseTFRT(bool use_tfrt) {
+  TFE_ContextOptionsSetTfrt(options_.get(), use_tfrt);
+  return *this;
+}
+
+inline std::unique_ptr<Runtime> RuntimeBuilder::Build(Status* status) {
+  TFE_Context* result = TFE_NewContext(options_.get(), status->GetTFStatus());
+  if (!status->ok()) {
+    return nullptr;
+  }
+  // We can't use std::make_unique here because of its interaction with a
+  // private constructor: https://abseil.io/tips/134
+  return std::unique_ptr<Runtime>(new Runtime(result));
+}
+
+}  // namespace cc
+}  // namespace experimental
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/status.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/status.h
new file mode 100644
index 00000000..98c8cf6c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/status.h
@@ -0,0 +1,96 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_
+#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/c/tf_status.h"
+
+namespace tensorflow {
+namespace experimental {
+namespace cc {
+
+// Status is a wrapper around an error code and an optional error message.
+// The set of error codes are defined here:
+// https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/c/tf_status.h#L39-L60
+// Many Tensorflow APIs return a Status, or take a Status as an out parameter.
+// Clients should check for status.ok() after calling these APIs, and either
+// handle or propagate the error appropriately.
+// TODO(bmzhao): Add a detailed code example before moving out of experimental.
+class Status {
+ public:
+  // Create a success status
+  Status() : status_(TF_NewStatus()) {}
+
+  // Return the status code
+  TF_Code code() const;
+
+  // Returns the error message in Status.
+  std::string message() const;
+
+  // Returns true if the status is OK (i.e. code() == TF_OK).
+  bool ok() const;
+
+  // Record <code, msg> in Status. Any previous information is lost.
+  // A common use is to clear a status: SetStatus(TF_OK, "");
+  void SetStatus(TF_Code code, const std::string& msg);
+
+  // Status is movable, but not copyable.
+ Status(Status&&) = default; + Status& operator=(Status&&) = default; + + private: + friend class RuntimeBuilder; + friend class Runtime; + friend class SavedModelAPI; + friend class TensorHandle; + + // Wraps a TF_Status*, and takes ownership of it. + explicit Status(TF_Status* status) : status_(status) {} + + // Status is not copyable + Status(const Status&) = delete; + Status& operator=(const Status&) = delete; + + // Returns the TF_Status that this object wraps. This object + // retains ownership of the pointer. + TF_Status* GetTFStatus() const { return status_.get(); } + + struct TFStatusDeleter { + void operator()(TF_Status* p) const { TF_DeleteStatus(p); } + }; + std::unique_ptr status_; +}; + +inline TF_Code Status::code() const { return TF_GetCode(status_.get()); } + +inline std::string Status::message() const { + return std::string(TF_Message(status_.get())); +} + +inline bool Status::ok() const { return code() == TF_OK; } + +inline void Status::SetStatus(TF_Code code, const std::string& msg) { + TF_SetStatus(status_.get(), code, msg.c_str()); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensor.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensor.h new file mode 100644 index 00000000..7aab1cce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensor.h @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ + +#include +#include + +#include +#include +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/cc/experimental/base/public/status.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// Tensor represents an n-dimensional array of values. +class Tensor { + public: + using DeleterCallback = std::function; + + // Constructs a Tensor from user provided buffer. + // + // Params: + // dtype - The dtype of the tensor's data. + // shape - A shape vector, where each element corresponds to the size of + // the tensor's corresponding dimension. + // data - Pointer to a buffer of memory to construct a Tensor out of. + // len - The length (in bytes) of `data` + // deleter - A std::function to be called when the Tensor no longer needs the + // memory in `data`. This can be used to free `data`, or + // perhaps decrement a refcount associated with `data`, etc. + // status - Set to OK on success and an error on failure. + // Returns: + // If an error occurred, status->ok() will be false, and the returned + // Tensor must not be used. + // TODO(bmzhao): Add Runtime as an argument to this function so we can swap to + // a TFRT backed tensor. 
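// --- Annotation (not part of the vendored headers): illustrative sketch ---
// Tensor::FromBuffer (declared just below) adopts caller-provided memory and
// invokes the DeleterCallback once the runtime no longer needs it. A minimal
// sketch, assuming a heap-allocated float buffer; the function name, shape,
// and values are illustrative assumptions.
#include <cstdlib>
#include <vector>
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/cc/experimental/base/public/status.h"
#include "tensorflow/cc/experimental/base/public/tensor.h"

tensorflow::experimental::cc::Tensor MakeTensorFromHeapBuffer(
    tensorflow::experimental::cc::Status* status) {
  using tensorflow::experimental::cc::Tensor;
  constexpr size_t kNumElements = 6;
  float* data = static_cast<float*>(std::malloc(kNumElements * sizeof(float)));
  for (size_t i = 0; i < kNumElements; ++i) data[i] = static_cast<float>(i);
  // The deleter runs when the Tensor releases the buffer.
  auto deleter = [](void* memory, size_t /*len*/) { std::free(memory); };
  Tensor t = Tensor::FromBuffer(TF_FLOAT, /*shape=*/{2, 3}, data,
                                kNumElements * sizeof(float), deleter, status);
  // Callers must check status->ok() before using `t`.
  return t;
}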
+ // TODO(bmzhao): Add benchmarks on overhead for this function; we can + // consider using int64_t* + length rather than vector. + static Tensor FromBuffer(TF_DataType dtype, const std::vector& shape, + void* data, size_t len, DeleterCallback deleter, + Status* status); + + // TODO(bmzhao): In the case we construct a tensor from non-owned memory, + // we should offer a way to deep copy the tensor into a new tensor, which + // owns the underlying memory. This could be a .deepcopy()/clone() method. + + // TODO(bmzhao): In the future, we want to relax the non-copyability + // constraint. To do so, we can add a C API function that acts like + // CopyFrom: + // https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/core/framework/tensor.h#L301-L311 + + // Tensor is movable, but not copyable + Tensor(Tensor&&) = default; + Tensor& operator=(Tensor&&) = default; + + // Returns the number of dimensions in the tensor. Can be -1, which represents + // unknown rank. + int dims() const; + + // Returns the number of elements in dimension `d`. + // REQUIRES: `0 <= d < dims()` + int64_t dim_size(int d) const; + + // Returns a pointer to the underlying data buffer. + void* data() const; + + // Returns the data type of the tensor. + TF_DataType dtype() const; + + // Returns the number of elements in the tensor. For a tensor with a partially + // defined shape, -1 means not fully defined. + int64_t num_elements() const; + + // Returns the size of the underlying data in bytes. + size_t num_bytes() const; + + private: + friend class TensorHandle; + friend class Runtime; + + // Wraps a TF_Tensor. Takes ownership of handle. + explicit Tensor(TF_Tensor* tensor) : tensor_(tensor) {} + + // Tensor is not copyable + Tensor(const Tensor&) = delete; + Tensor& operator=(const Tensor&) = delete; + + // Returns the underlying TF_Tensor that this object wraps. + // This object retains ownership of the pointer. + TF_Tensor* GetTFTensor() const { return tensor_.get(); } + + struct DeleterStruct { + std::function deleter; + }; + + static void DeleterFunction(void* memory, size_t len, void* deleter_struct) { + DeleterStruct* deleter = reinterpret_cast(deleter_struct); + deleter->deleter(memory, len); + delete deleter; + } + + struct TFTensorDeleter { + void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); } + }; + std::unique_ptr tensor_; +}; + +inline void* Tensor::data() const { return TF_TensorData(tensor_.get()); } + +inline int Tensor::dims() const { return TF_NumDims(tensor_.get()); } + +inline int64_t Tensor::dim_size(int d) const { + return TF_Dim(tensor_.get(), d); +} + +inline TF_DataType Tensor::dtype() const { + return TF_TensorType(tensor_.get()); +} + +inline int64_t Tensor::num_elements() const { + return TF_TensorElementCount(tensor_.get()); +} + +inline size_t Tensor::num_bytes() const { + return TF_TensorByteSize(tensor_.get()); +} + +inline Tensor Tensor::FromBuffer(TF_DataType dtype, + const std::vector& shape, void* data, + size_t len, DeleterCallback deleter, + Status* status) { + // Credit to apassos@ for this technique: + // Despite the fact that our API takes a std::function deleter, we are able + // to maintain ABI stability because: + // 1. Only a function pointer is sent across the C API (&DeleterFunction) + // 2. DeleterFunction is defined in the same build artifact that constructed + // the std::function (so there isn't confusion about std::function ABI). + // Note that 2. 
is satisfied by the fact that this is a header-only API, where + // the function implementations are inline. + + DeleterStruct* deleter_struct = new DeleterStruct{deleter}; + TF_Tensor* tensor = TF_NewTensor(dtype, shape.data(), shape.size(), data, len, + &DeleterFunction, deleter_struct); + if (tensor == nullptr) { + status->SetStatus(TF_INVALID_ARGUMENT, + "Failed to create tensor for input buffer"); + return Tensor(nullptr); + } + return Tensor(tensor); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensorhandle.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensorhandle.h new file mode 100644 index 00000000..99453ee7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/public/tensorhandle.h @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ + +#include +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// An opaque representation of a tensor computed/managed by the Tensorflow +// runtime (tensorflow:cc::Runtime). Unlike a tensor, a Tensorhandle may refer +// to tensors placed in memory of different devices or remote address spaces. +// Note that tensorflow::cc::Runtime MUST outlive all TensorHandles created +// from it. +class TensorHandle { + public: + // Unwraps a Tensor from the given TensorHandle. If an error occurred, + // status->ok() will be false, and the returned Tensor must not be used. + Tensor Resolve(Status* status); + + // Constructs a TensorHandle from a Tensor. If an error occurred, + // status->ok() will be false, and the returned TensorHandle must not be used. + static TensorHandle FromTensor(const Tensor& tensor, const Runtime& runtime, + Status* status); + + // TensorHandle is movable, and not copyable + TensorHandle(TensorHandle&&) = default; + TensorHandle& operator=(TensorHandle&&) = default; + + private: + // Wraps a TFE_TensorHandle. Takes ownership of handle. + explicit TensorHandle(TFE_TensorHandle* handle) : handle_(handle) {} + + // TensorHandle is not copyable + TensorHandle(const TensorHandle&) = delete; + TensorHandle& operator=(const TensorHandle&) = delete; + + // Returns the underlying TFE_TensorHandle that this object wraps. + // This object retains ownership of the pointer. 
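// --- Annotation (not part of the vendored headers): illustrative sketch ---
// A sketch of moving a host Tensor onto the runtime as a TensorHandle and
// resolving it back. It assumes the hypothetical MakeTensorFromHeapBuffer
// helper from the sketch further above and a Runtime built via RuntimeBuilder;
// all names are illustrative.
#include <memory>
#include "tensorflow/cc/experimental/base/public/runtime.h"
#include "tensorflow/cc/experimental/base/public/runtime_builder.h"
#include "tensorflow/cc/experimental/base/public/status.h"
#include "tensorflow/cc/experimental/base/public/tensor.h"
#include "tensorflow/cc/experimental/base/public/tensorhandle.h"

void TensorHandleRoundTrip() {
  namespace cc = tensorflow::experimental::cc;
  cc::Status status;
  cc::RuntimeBuilder builder;
  std::unique_ptr<cc::Runtime> runtime = builder.Build(&status);
  if (!status.ok()) return;

  cc::Tensor host_tensor = MakeTensorFromHeapBuffer(&status);
  if (!status.ok()) return;

  // The Runtime must outlive every TensorHandle created from it.
  cc::TensorHandle handle =
      cc::TensorHandle::FromTensor(host_tensor, *runtime, &status);
  if (!status.ok()) return;

  cc::Tensor resolved = handle.Resolve(&status);
  (void)resolved;  // Check status.ok() before reading resolved.data().
}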
+ TFE_TensorHandle* GetTFETensorHandle() const { return handle_.get(); } + + // Deletes the currently wrapped TFE_TensorHandle, and swaps it with handle, + // and takes ownership of handle. + void Reset(TFE_TensorHandle* handle) { handle_.reset(handle); } + + struct TFETensorHandleDeleter { + void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); } + }; + std::unique_ptr handle_; +}; + +inline Tensor TensorHandle::Resolve(Status* status) { + TF_Tensor* tensor = + TFE_TensorHandleResolve(handle_.get(), status->GetTFStatus()); + if (!status->ok()) { + return Tensor(nullptr); + } + return Tensor(tensor); +} + +inline TensorHandle TensorHandle::FromTensor(const Tensor& tensor, + const Runtime& runtime, + Status* status) { + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandleFromTensor( + runtime.GetTFEContext(), tensor.GetTFTensor(), status->GetTFStatus()); + if (!status->ok()) { + return TensorHandle(nullptr); + } + return TensorHandle(tensor_handle); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h new file mode 100644 index 00000000..1e649d5d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_TESTS_TENSOR_TYPES_TEST_UTIL_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_TESTS_TENSOR_TYPES_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/tf_datatype.h" + +namespace tensorflow { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic +// tests via GoogleTest's TypedTests: +// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests +struct FloatType { + using type = float; + static constexpr TF_DataType kDType = TF_FLOAT; +}; + +struct DoubleType { + using type = double; + static constexpr TF_DataType kDType = TF_DOUBLE; +}; + +struct Int32Type { + using type = int32_t; + static constexpr TF_DataType kDType = TF_INT32; +}; + +struct UINT8Type { + using type = uint8_t; + static constexpr TF_DataType kDType = TF_UINT8; +}; + +struct INT8Type { + using type = int8_t; + static constexpr TF_DataType kDType = TF_INT8; +}; + +struct INT64Type { + using type = int64_t; + static constexpr TF_DataType kDType = TF_INT64; +}; + +struct UINT16Type { + using type = uint16_t; + static constexpr TF_DataType kDType = TF_UINT16; +}; + +struct UINT32Type { + using type = uint32_t; + static constexpr TF_DataType kDType = TF_UINT32; +}; + +struct UINT64Type { + using type = uint64_t; + static constexpr TF_DataType kDType = TF_UINT64; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_TESTS_TENSOR_TYPES_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/load.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/load.h new file mode 100644 index 00000000..6775f73b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/load.h @@ -0,0 +1,108 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_LIBEXPORT_LOAD_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_LIBEXPORT_LOAD_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/trackable_object_graph.pb.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { +namespace libexport { + +// A low-level representation of a SavedModel. +// +// This class should only ever be a thin wrapper around disk (or other storage) +// access for a SavedModel. Higher level functionality should be layered on top +// by other functions and classes. +// +// In the future, this class can also provide a mechanism for automatic version +// migration. This will allow the calling code to always work against the most +// recent version of SavedModel. +class TFPackage { + public: + // Load a SavedModel, parsing the associated protobuf for later access. + static absl::StatusOr Load(const std::string& path); + + // Reads and returns a checkpoint key associated with a variable. 
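// --- Annotation (not part of the vendored headers): illustrative sketch ---
// The dtype structs above pair a TF_DataType constant with its C++ type so a
// single GoogleTest typed test can cover many dtypes. A minimal sketch of how
// such a test list might be wired up; the suite name and test body are
// illustrative assumptions.
#include <gtest/gtest.h>
#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h"

namespace {

template <typename DtypeStruct>
class DtypePairingTest : public ::testing::Test {};

using DtypeList =
    ::testing::Types<tensorflow::FloatType, tensorflow::DoubleType,
                     tensorflow::Int32Type, tensorflow::INT64Type>;
TYPED_TEST_SUITE(DtypePairingTest, DtypeList);

TYPED_TEST(DtypePairingTest, TypeSizeMatchesDtypeSize) {
  // TF_DataTypeSize comes from tensorflow/c/tf_datatype.h.
  EXPECT_EQ(sizeof(typename TypeParam::type),
            TF_DataTypeSize(TypeParam::kDType));
}

}  // namespace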
+ // + // The variable is identified by the index in the object graph node list. + // + // RestoreV2 is the operation that will ultimately be responsible for reading + // and restoring the variable(s)' values. Variable values are indexed in the + // checkpoint files by "checkpoint keys". These keys along with dtype and + // shape / slice information allow RestoreV2 to look up a variable's value in + // the SavedModel and restore it into a tensor. + absl::StatusOr GetVariableCheckpointKey(int index); + + // Retrieves the object graph from the SavedModel. + // + // For now, we're returning the object graph directly (i.e. the parsed proto) + // rather than adding abstraction on top. We may later find we would like an + // intermediate abstraction layer to make traversal easier, but for now the + // extra complexity doesn't seem justified. Regardless of what we choose, + // that logic should live outside this class; this class should continue to + // have the clearly-defined, singular responsibility of reading and parsing + // the low-level, serialized format. + const SavedObjectGraph& GetObjectGraph(); + + // Retrieves a specific GraphDef node by name. + // + // GraphDef nodes are stored as a repeating list of nodes. At module load + // time, a module may have constants that need to be restored. To restore + // these constants, they are looked up in the GraphDef's nodes by their name. + // Since we may need to load many constants, we create a hash map of these + // names to their corresponding nodes at load time in order to look them up + // in constant time. + absl::StatusOr GetGraphDefNode(std::string name); + + // Returns a list of function defs in the SavedModel. + const protobuf::RepeatedPtrField& GetFunctionDefs(); + + // Returns a BundleReader for reading variable values. + // + // This TFPackage retains ownership of the underlying reader. + tensorflow::BundleReader* GetVariableReader() { + return variable_reader_.get(); + } + + // Returns whether or not we found a valid checkpoint when loading the + // package. + bool HasCheckpoint() { return has_checkpoint_; } + + // Returns the path to the variables file. + const std::string GetVariablesFilepath() const { return variables_filepath_; } + + private: + SavedModel saved_model_proto_; + TrackableObjectGraph trackable_object_graph_; + std::unique_ptr variable_reader_; + std::string variables_filepath_; + bool has_checkpoint_; + absl::flat_hash_map graph_def_nodes_by_name_; +}; + +} // namespace libexport +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_LIBEXPORT_LOAD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/save.h b/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/save.h new file mode 100644 index 00000000..382f4645 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/experimental/libexport/save.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
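// --- Annotation (not part of the vendored headers): illustrative sketch ---
// A sketch of loading a SavedModel with TFPackage (declared above) and walking
// a few of its accessors; the export directory is a placeholder and error
// handling is reduced to early returns.
#include <string>
#include "tensorflow/cc/experimental/libexport/load.h"

void InspectSavedModel(const std::string& export_dir) {
  using tensorflow::libexport::TFPackage;
  absl::StatusOr<TFPackage> package = TFPackage::Load(export_dir);
  if (!package.ok()) return;

  const tensorflow::SavedObjectGraph& graph = package->GetObjectGraph();
  (void)graph;

  if (package->HasCheckpoint()) {
    // The BundleReader is owned by the TFPackage; do not delete it.
    tensorflow::BundleReader* reader = package->GetVariableReader();
    (void)reader;
  }
}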
+==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_LIBEXPORT_SAVE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_LIBEXPORT_SAVE_H_ + +#include + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace libexport { + +// Writes a saved model to disk. +// +// Writes a saved model to the given `export_dir`. +TF_EXPORT Status Save(const std::string& export_dir); + +} // namespace libexport +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_EXPORT_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen.h b/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen.h new file mode 100644 index 00000000..7b348365 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen.h @@ -0,0 +1,34 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_ +#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_ + +#include + +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_gen_lib.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace cc_op { +/// Result is written to files dot_h and dot_cc. +void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map, + const string& dot_h_fname, const string& dot_cc_fname); + +} // namespace cc_op +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen_util.h b/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen_util.h new file mode 100644 index 00000000..4e3272c7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/cc_op_gen_util.h @@ -0,0 +1,148 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_UTIL_H_ +#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/api_def.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/op_def_util.h" +#include "tensorflow/core/framework/op_gen_lib.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace cc_op { + +absl::StatusOr LoadOpsAndApiDefs( + OpList& ops, bool include_internal, + const std::vector& api_def_dirs); + +// Converts: +// bazel-out/.../(bin|genfiles)/(external/YYY/)?XX +// to: XX. +string GetPath(absl::string_view dot_h_fname); + +// Converts: some/path/to/file.xx +// to: file +// (note that suffix is removed) +string GetFilename(absl::string_view path); + +// Converts: +// cc/ops/gen_foo_ops.h +// to: +// CC_OPS_GEN_FOO_OPS_H_ +string ToGuard(absl::string_view path); + +// Converts: some_name_xyz +// to: Some Name Xyz +string ToTitle(absl::string_view name); + +// Change: Into: +// ABC /// ABC +// /// +// DEF /// DEF +string MakeComment(absl::string_view text, absl::string_view indent); + +string PrintString(absl::string_view str); + +string PrintTensorShape(const TensorShapeProto& shape_proto); + +template +string PrintArray(int64_t num_elts, const T* array) { + string ret; + for (int64_t i = 0; i < num_elts; ++i) { + if (i > 0) strings::StrAppend(&ret, ", "); + strings::StrAppend(&ret, array[i]); + } + return ret; +} + +string PrintTensor(const TensorProto& tensor_proto); + +string PrintTensorProto(const TensorProto& proto); + +string PrintAttrValue(absl::string_view, const AttrValue& attr_value); + +bool IsEmptyList(const AttrValue::ListValue& list); + +string ToCamelCase(absl::string_view str); + +string SeparateNamespaces(absl::string_view str); + +// Returns a pair. The string is the C++ type name to be used for +// attr_type when defining an object of that type. The bool is a flag to +// indicate whether to treat the type as const when accepting the C++ type as an +// argument to a function. +std::pair AttrTypeName(absl::string_view attr_type); + +absl::string_view ListElementTypeName(absl::string_view attr_type); + +bool IsCPPKeyword(absl::string_view name); + +string AvoidCPPKeywords(absl::string_view name); + +void InferArgAttributes(const OpDef::ArgDef& arg, + std::unordered_map* inferred_attrs); + +void InferOpAttributes( + const OpDef& op_def, + std::unordered_map* inferred_input_attrs); + +bool ArgIsList(const OpDef::ArgDef& arg); + +bool HasOptionalAttrs( + const ApiDef& api_def, + const std::unordered_map& inferred_input_attrs); + +struct OpInfo { + // graph_op_def: The OpDef used by the runtime, has the names that + // must be used when calling NodeBuilder. + // interface_op_def: The OpDef used in the interface in the generated + // code, with possibly overridden names and defaults. 
+ OpInfo(const OpDef& graph_op_def, const ApiDef& api_def, + const std::vector& aliases); + OpInfo(const OpDef& graph_op_def, const ApiDef& api_def); + string GetOpAttrStruct() const; + string GetConstructorDecl(absl::string_view op_name_prefix, + bool include_attr) const; + + string op_name; + std::vector arg_types; + std::vector arg_names; + std::vector output_types; + std::vector output_names; + std::vector is_list_output; + bool has_optional_attrs; + string comment; + + const OpDef& graph_op_def; + const ApiDef& api_def; + const std::vector& aliases; + // Map from type attribute to corresponding original argument name. + std::unordered_map inferred_input_attrs; +}; + +} // namespace cc_op +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h b/third_party/tflite-hdrs/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h new file mode 100644 index 00000000..c11c9635 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_FUZZING_CC_OP_FUZZ_GEN_H_ +#define TENSORFLOW_CC_FRAMEWORK_FUZZING_CC_OP_FUZZ_GEN_H_ + +#include "tensorflow/cc/framework/cc_op_gen_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_gen_lib.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace cc_op { + +// String with single fuzzer file content. +string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable); + +// Do we have all we need to create a fuzzer +bool OpFuzzingIsOk(const OpInfo& op_info); + +} // namespace cc_op +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_FUZZING_CC_OP_FUZZ_GEN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/grad_op_registry.h b/third_party/tflite-hdrs/tensorflow/cc/framework/grad_op_registry.h new file mode 100644 index 00000000..b0847844 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/grad_op_registry.h @@ -0,0 +1,77 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ +#define TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ + +#include +#include +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { +namespace ops { + +/// GradFunc is the signature for all gradient functions in GradOpRegistry. +/// Implementations should add operations to compute the gradient outputs of +/// 'op' (returned in 'grad_outputs') using 'scope' and 'grad_inputs'. +typedef absl::Status (*GradFunc)(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs); + +/// GradOpRegistry maintains a static registry of gradient functions. +/// Gradient functions are indexed in the registry by the forward op name (i.e. +/// "MatMul" -> MatMulGrad func). +class GradOpRegistry { + public: + /// Registers 'func' as the gradient function for 'op'. + /// Returns true if registration was successful, check fails otherwise. + bool Register(const string& op, GradFunc func); + + /// Sets 'func' to the gradient function for 'op' and returns Status OK if + /// the gradient function for 'op' exists in the registry. + /// Note that 'func' can be null for ops that have registered no-gradient with + /// the registry. + /// Returns error status otherwise. + absl::Status Lookup(const string& op, GradFunc* func) const; + + /// Returns a pointer to the global gradient function registry. + static GradOpRegistry* Global(); + + private: + std::unordered_map registry_; +}; + +} // namespace ops + +// Macros used to define gradient functions for ops. +#define REGISTER_GRADIENT_OP(name, fn) \ + REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, fn) + +#define REGISTER_NO_GRADIENT_OP(name) \ + REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, nullptr) + +#define REGISTER_GRADIENT_OP_UNIQ_HELPER(ctr, name, fn) \ + REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn) + +#define REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn) \ + static bool unused_ret_val_##ctr = \ + ::tensorflow::ops::GradOpRegistry::Global()->Register(name, fn) + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/gradient_checker.h b/third_party/tflite-hdrs/tensorflow/cc/framework/gradient_checker.h new file mode 100644 index 00000000..20b6545f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/gradient_checker.h @@ -0,0 +1,65 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_ +#define TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +/// Returns in 'max_error' the maximum element-wise error for dy/dx between the +/// computed and numeric Jacobian matrices where 'xs' and 'ys' are tensors. +/// X_T and Y_T are the c++ types for the x and y tensors, and JAC_T is a +/// real-valued type to store the Jacobian derivatives dy/dx. +/// This function adds operations to the graph associated with 'scope'. +/// +/// Examples: +/// if y = Square(x), where x (and so y) are DT_FLOAT, +/// should be +/// +/// if y = Square(x), where x (and so y) are DT_DOUBLE, +/// should be +/// +/// if y = Square(x), where x (and so y) are DT_COMPLEX64, +/// should be +/// Note that JAC_T is always real-valued, and should be an appropriate +/// precision to host the partial derivatives for dy/dx +/// +/// if y = ComplexAbs(x) where x is DT_COMPLEX64 (so y is DT_FLOAT) +/// should be +/// +/// if y = Complex(x, x) where x is DT_FLOAT (so y is DT_COMPLEX64) +/// should be +template +absl::Status ComputeGradientError(const Scope& scope, const OutputList& xs, + const std::vector& x_shapes, + const OutputList& ys, + const std::vector& y_shapes, + JAC_T* max_error); + +/// Overload of ComputeGradientError which takes an initial value for 'x'. +template +absl::Status ComputeGradientError(const Scope& scope, const Output& x, + const Tensor& x_init_value, const Output& y, + const TensorShape& y_shape, JAC_T* max_error); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/gradients.h b/third_party/tflite-hdrs/tensorflow/cc/framework/gradients.h new file mode 100644 index 00000000..c79269fd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/gradients.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_ +#define TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { + +/// NOTE: This API is a work in progress and will likely be changing frequently. +/// +/// Given initial gradients 'grad_inputs' (which represent the symbolic partial +/// derivatives of some loss function 'L' w.r.t 'outputs'), adds gradient nodes +/// to the graph associated with 'scope', which compute (and return in +/// 'grad_outputs') the symbolic partial derivatives of 'L' w.r.t 'inputs'. 
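// --- Annotation (not part of the vendored headers): illustrative sketch ---
// A sketch of the GradFunc / REGISTER_GRADIENT_OP machinery declared in
// grad_op_registry.h above, using a made-up op name "MySquare" whose gradient
// is dy * 2 * x; the op and its gradient exist only to illustrate the
// registration pattern.
#include <vector>
#include "tensorflow/cc/framework/grad_op_registry.h"
#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/standard_ops.h"

namespace tensorflow {
namespace ops {
namespace {

absl::Status MySquareGrad(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
  // dL/dx = dL/dy * 2 * x
  auto two = Cast(scope, Const(scope, 2), op.input(0).type());
  auto dx = Mul(scope, grad_inputs[0], Mul(scope, two, op.input(0)));
  grad_outputs->push_back(dx);
  return scope.status();
}
REGISTER_GRADIENT_OP("MySquare", MySquareGrad);

}  // namespace
}  // namespace ops
}  // namespace tensorflow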
+absl::Status AddSymbolicGradients(const Scope& scope, + const std::vector& outputs, + const std::vector& inputs, + const std::vector& grad_inputs, + std::vector* grad_outputs); + +// Same as above, but uses 'OnesLike' for all shapes in +// 'outputs' as grad_inputs. +absl::Status AddSymbolicGradients(const Scope& scope, + const std::vector& outputs, + const std::vector& inputs, + std::vector* grad_outputs); + +/// Returns a sentinel Output that represents 'no gradient' (i.e. no gradient +/// flows along some graph edge during backpropagation). +/// Can be returned in 'grad_outputs' by an invocation of 'AddSymbolicGradients' +/// (note that gradient flow through an Output can be stopped through the use of +/// the StopGradient node). +Output NoGradient(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/ops.h b/third_party/tflite-hdrs/tensorflow/cc/framework/ops.h new file mode 100644 index 00000000..e856e311 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/ops.h @@ -0,0 +1,304 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_OPS_H_ +#define TENSORFLOW_CC_FRAMEWORK_OPS_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { + +/// @defgroup core Core Tensorflow API + +class Output; + +/// @addtogroup core +/// @{ + +/// Represents a node in the computation graph. +class Operation { + public: + Operation() : node_(nullptr) {} + explicit Operation(Node* n); + + int32 num_inputs() const { return node_->num_inputs(); } + DataType input_type(int32_t o) const { return node_->input_type(o); } + Output input(int32_t i) const; + + int32 num_outputs() const { return node_->num_outputs(); } + DataType output_type(int32_t o) const { return node_->output_type(o); } + Output output(int32_t i) const; + + Node* node() const { return node_; } + + uint64 hash(int32_t index) const; + + bool operator==(const Operation& other) const { return node_ == other.node_; } + + private: + typedef std::vector> Inputs; + static Inputs GetInputs(Node* node); + + Inputs inputs_; + Node* node_; +}; + +/// Represents a tensor value produced by an Operation. 
+class Output { + public: + Output() = default; + explicit Output(Node* n) : op_(n) {} + Output(Node* n, int32_t index) : op_(n), index_(index) {} + Output(const Operation& op, int32_t index) : op_(op), index_(index) {} + + Operation op() const { return op_; } + Node* node() const { return op().node(); } + int32 index() const { return index_; } + DataType type() const { return op_.output_type(index_); } + std::string name() const { + return strings::StrCat(node()->name(), ":", index()); + } + bool operator==(const Output& other) const { + return op_ == other.op_ && index_ == other.index_; + } + + uint64 hash() const { return op_.hash(index_); } + + private: + Operation op_ = Operation(nullptr); + int32 index_ = 0; +}; + +/// Hash class that can be used for e.g. storing Outputs in an unordered_map +struct OutputHash { + std::size_t operator()(const Output& output) const { + return Hash64Combine(std::hash()(output.node()), + std::hash()(output.index())); + } +}; + +/// Represents a tensor value that can be used as an operand to an Operation. +class Input { + public: + /// Initializer enables constructing an Input object from various kinds of C++ + /// constants such as simple primitive constants and nested initializer lists + /// representing a multi-dimensional array. Initializer constructors are all + /// templates, so the aforementioned kinds of C++ constants can be used to + /// construct an Initializer. Initializer stores the value it got constructed + /// with in a Tensor object. + struct Initializer { + /// Construct from a scalar value of an arithmetic type or a type that can + /// be converted to a string (eg. a string literal). + template ::value || + std::is_convertible::value>::type> + Initializer(const T& v) { // NOLINT(runtime/explicit) + typedef typename RealType::type RealT; + Tensor t(DataTypeToEnum::v(), TensorShape()); + t.flat()(0) = RealT(v); + tensor = t; + } + + Initializer(const Tensor& t) : tensor(t) {} // NOLINT(runtime/explicit) + + /// Construct from a scalar value and an explicit shape + template ::value || + std::is_convertible::value>::type> + Initializer(const T& v, const TensorShape& shape) { + typedef typename RealType::type RealT; + Tensor t(DataTypeToEnum::v(), shape); + for (int64_t i = 0; i < t.NumElements(); ++i) { + t.flat()(i) = RealT(v); + } + tensor = t; + } + + /// Construct from a initializer list of scalars (a one-dimensional tensor). + template ::value || + std::is_convertible::value>::type> + Initializer( + const std::initializer_list& v) { // NOLINT(runtime/explicit) + typedef typename RealType::type RealT; + Tensor t(DataTypeToEnum::v(), + TensorShape{static_cast(v.size())}); + std::copy_n(v.begin(), v.size(), t.flat().data()); + tensor = t; + } + + /// Construct from a initializer list of scalars and an explicit shape. + template ::value || + std::is_convertible::value>::type> + Initializer(const std::initializer_list& v, const TensorShape& shape) { + typedef typename RealType::type RealT; + Tensor t(DataTypeToEnum::v(), shape); + if (t.NumElements() != static_cast(v.size())) { + status = absl::InvalidArgumentError(absl::StrCat( + "Cannot construct a tensor with ", t.NumElements(), + " from an initializer list with ", v.size(), " elements")); + return; + } + std::copy_n(v.begin(), v.size(), t.flat().data()); + tensor = t; + } + + /// Construct a multi-dimensional tensor from a nested initializer + /// list. 
Note that C++ syntax allows nesting of arbitrarily typed + /// initializer lists, so such invalid initializers cannot be disallowed at + /// compile time. This function performs checks to make sure that the nested + /// initializer list is indeed a valid multi-dimensional tensor. + Initializer(const std::initializer_list& v); + + // START_SKIP_DOXYGEN + template ::value> + struct RealType { + typedef tstring type; + }; + + template + struct RealType { + typedef T type; + }; + // END_SKIP_DOXYGEN + + TensorProto AsTensorProto() { + TensorProto tensor_proto; + if (tensor.NumElements() > 1) { + tensor.AsProtoTensorContent(&tensor_proto); + } else { + tensor.AsProtoField(&tensor_proto); + } + return tensor_proto; + } + + absl::Status status; + Tensor tensor; + }; + + /// All of Input's constructors are implicit. Input can be implicitly + /// constructed from the following objects : + /// * Output: This is so that the output of an Operation can be directly used + /// as the input to a op wrapper, which takes Inputs. + /// * A scalar, or a multi-dimensional tensor specified as a recursive + /// initializer list. This enables directly passing constants as + /// inputs to op wrappers. + /// * A Tensor object. + Input(const Output& o) : output_(o) {} // NOLINT(runtime/explicit) + + template ::value || + std::is_convertible::value>::type> + Input(const T& v) // NOLINT(runtime/explicit) + : Input(Initializer(v)) {} + + Input(const Initializer& init) // NOLINT(runtime/explicit) + : status_(init.status), + tensor_(init.tensor) {} + + Input(const Tensor& t) // NOLINT(runtime/explicit) + : status_(absl::OkStatus()), tensor_(t) {} + + Input(const std::initializer_list& + init) { // NOLINT(runtime/explicit) + for (const auto& i : init) { + if (!i.status.ok()) { + status_ = i.status; + return; + } + } + tensor_ = Initializer(init).tensor; + } + + /// Constructor specifying a node name, index and datatype. This should only + /// be used for specifying a backward edge, needed by control flow. + Input(const std::string& name, int32_t i, DataType dt) + : node_name_(name), index_(i), data_type_(dt) {} + + Node* node() const { return output_.node(); } + std::string node_name() const { return node_name_; } + int32 index() const { return node_name_.empty() ? output_.index() : index_; } + DataType data_type() const { return data_type_; } + absl::Status status() const { return status_; } + const Tensor& tensor() const { return tensor_; } + + private: + absl::Status status_; + Output output_ = Output(Operation(nullptr), 0); + Tensor tensor_; + const std::string node_name_ = ""; + int32 index_ = 0; + DataType data_type_ = DT_INVALID; +}; + +/// A type for representing the output of ops that produce more than one output, +/// or a list of tensors. +typedef std::vector OutputList; + +/// A type for representing the input to ops that require a list of tensors. +class InputList { + public: + /// Implicitly convert a list of outputs to a list of inputs. This is useful + /// to write code such as ops::Concat(ops::Split(x, 4)). 
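// --- Annotation (not part of the vendored headers): illustrative sketch ---
// Examples of the implicit conversions documented above: scalars, nested
// initializer lists, and op outputs all convert to Input, and an OutputList
// converts to an InputList. The specific ops and shapes are illustrative.
#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/standard_ops.h"

void InputConversionExamples() {
  using namespace tensorflow;
  using namespace tensorflow::ops;
  Scope root = Scope::NewRootScope();

  // Scalar and nested-initializer-list Inputs (both become Const tensors).
  auto c0 = Const(root, 42);
  auto c1 = Const(root, { {1, 2}, {3, 4} });

  // An Output is also an Input, so op results feed ops directly.
  auto sum = Add(root, c0, c0);

  // A list of Outputs (OutputList) converts implicitly to an InputList.
  auto parts = Split(root, /*split_dim=*/0, c1, /*num_split=*/2);
  auto merged = Concat(root, parts.output, /*axis=*/0);
  (void)sum;
  (void)merged;
}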
+  InputList(const OutputList& out) {  // NOLINT(runtime/explicit)
+    for (auto const& x : out) {
+      inputs_.push_back(x);
+    }
+  }
+
+  InputList(
+      const std::initializer_list<Input>& inputs)  // NOLINT(runtime/explicit)
+      : inputs_(inputs.begin(), inputs.end()) {}
+
+  InputList(const absl::Span<const Input>& inputs)  // NOLINT(runtime/explicit)
+      : inputs_(inputs.begin(), inputs.end()) {}
+
+  InputList(
+      const std::initializer_list<Output>& out) {  // NOLINT(runtime/explicit)
+    for (auto const& x : out) {
+      inputs_.push_back(x);
+    }
+  }
+
+  typename std::vector<Input>::iterator begin() { return inputs_.begin(); }
+  typename std::vector<Input>::iterator end() { return inputs_.end(); }
+  typename std::vector<Input>::const_iterator begin() const {
+    return inputs_.begin();
+  }
+  typename std::vector<Input>::const_iterator end() const {
+    return inputs_.end();
+  }
+
+ private:
+  std::vector<Input> inputs_;
+};
+
+/// @}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_FRAMEWORK_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/scope.h b/third_party/tflite-hdrs/tensorflow/cc/framework/scope.h
new file mode 100644
index 00000000..9b8896e4
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/cc/framework/scope.h
@@ -0,0 +1,270 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#define TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+class Graph;
+class GraphDef;
+class NodeBuilder;
+struct CompositeOpScopes;
+
+/// @addtogroup core
+/// @{
+
+/// A `Scope` object represents a set of related TensorFlow ops that have the
+/// same properties such as a common name prefix.
+///
+/// A Scope object is a container for TensorFlow Op properties. Op constructors
+/// get a Scope object as a mandatory first argument and the constructed op
+/// acquires the properties in the object.
+///
+/// A simple example:
+///
+///     using namespace ops;
+///     Scope root = Scope::NewRootScope();
+///     auto c1 = Const(root, { {1, 1} });
+///     auto m = MatMul(root, c1, { {41}, {1} });
+///     GraphDef gdef;
+///     Status s = root.ToGraphDef(&gdef);
+///     if (!s.ok()) { ... }
+///
+/// Scope hierarchy:
+///
+/// The Scope class provides various With<> functions that create a new scope.
+/// The new scope typically has one property changed while other properties are
+/// inherited from the parent scope.
+/// NewSubScope(name) method appends `name` to the prefix of names for ops
+/// created within the scope, and WithOpName() changes the suffix which
+/// otherwise defaults to the type of the op.
+/// +/// Name examples: +/// +/// Scope root = Scope::NewRootScope(); +/// Scope linear = root.NewSubScope("linear"); +/// // W will be named "linear/W" +/// auto W = Variable(linear.WithOpName("W"), +/// {2, 2}, DT_FLOAT); +/// // b will be named "linear/b_3" +/// int idx = 3; +/// auto b = Variable(linear.WithOpName("b_", idx), +/// {2}, DT_FLOAT); +/// auto x = Const(linear, {...}); // name: "linear/Const" +/// auto m = MatMul(linear, x, W); // name: "linear/MatMul" +/// auto r = BiasAdd(linear, m, b); // name: "linear/BiasAdd" +/// +/// Scope lifetime: +/// +/// A new scope is created by calling Scope::NewRootScope. This creates some +/// resources that are shared by all the child scopes that inherit from this +/// scope, directly or transitively. For instance, a new scope creates a new +/// Graph object to which operations are added when the new scope or its +/// children are used by an Op constructor. The new scope also has a Status +/// object which will be used to indicate errors by Op-constructor functions +/// called on any child scope. The Op-constructor functions have to check the +/// scope's status by calling the ok() method before proceeding to construct the +/// op. +/// +/// Thread safety: +/// +/// A `Scope` object is NOT thread-safe. Threads cannot concurrently call +/// op-constructor functions on the same `Scope` object. +class Scope { + public: + Scope(const Scope& other); + ~Scope(); + Scope& operator=(const Scope& other); + + // The following functions are for users making graphs. They return brand new + // scopes, or scopes derived from an existing scope object. + + /// Return a new scope. + /// This creates a new graph and all operations constructed in this graph + /// should use the returned object as the "root" scope. + static Scope NewRootScope(); + + /// Return a new scope. Ops created with this scope will have + /// `name/child_scope_name` as the prefix. The actual name will be unique + /// in the current scope. All other properties are inherited from the current + /// scope. If `child_scope_name` is empty, the `/` is elided. + Scope NewSubScope(const string& child_scope_name) const; + + /// Return a new scope. All ops created within the returned scope will have + /// names of the form `name/StrCat(fragments...)[_suffix]` + template + Scope WithOpName(Ty... fragments) const { + return WithOpNameImpl(absl::StrCat(fragments...)); + } + + /// Return a new scope. All ops created within the returned scope will have as + /// control dependencies the union of operations in the control_deps vector + /// and the control dependencies of the current scope. + Scope WithControlDependencies(absl::Span control_deps) const; + /// Same as above, but convenient to add control dependency on the operation + /// producing the control_dep output. + Scope WithControlDependencies(const Output& control_dep) const; + + /// Return a new scope. All ops created within the returned scope will have no + /// control dependencies on other operations. + Scope WithNoControlDependencies() const; + + /// Return a new scope. All ops created within the returned scope will have + /// the device field set to 'device'. + Scope WithDevice(const string& device) const; + + /// Returns a new scope. All ops created within the returned scope will have + /// their assigned device set to `assigned_device`. + Scope WithAssignedDevice(const string& assigned_device) const; + + /// Returns a new scope. All ops created within the returned scope will have + /// their _XlaCluster attribute set to `xla_cluster`. 
+ Scope WithXlaCluster(const string& xla_cluster) const; + + /// Return a new scope. All ops created within the returned scope will be + /// co-located on the device where op is placed. + /// NOTE: This function is intended to be use internal libraries only for + /// controlling placement of ops on to devices. Public use is not encouraged + /// because the implementation of device placement is subject to change. + Scope ColocateWith(const Operation& op) const; + /// Convenience function for above. + Scope ColocateWith(const Output& out) const { return ColocateWith(out.op()); } + /// Clear all colocation constraints. + Scope ClearColocation() const; + + /// Return a new scope. The op-constructor functions taking the returned scope + /// as the scope argument will exit as soon as an error is detected, instead + /// of setting the status on the scope. + Scope ExitOnError() const; + + /// Return a new scope. All ops created with the new scope will have + /// kernel_label as the value for their '_kernel' attribute; + Scope WithKernelLabel(const string& kernel_label) const; + + // The following functions are for scope object consumers. + + /// Return a unique name, using default_name if an op name has not been + /// specified. + string GetUniqueNameForOp(const string& default_name) const; + + /// Update the status on this scope. + /// Note: The status object is shared between all children of this scope. + /// If the resulting status is not OkStatus() and exit_on_error_ is set on + /// this scope, this function exits by calling LOG(FATAL). + void UpdateStatus(const absl::Status& s) const; + + // START_SKIP_DOXYGEN + + /// Update the builder with properties accumulated in this scope. Does not set + /// status(). + // TODO(skyewm): NodeBuilder is not part of public API + void UpdateBuilder(NodeBuilder* builder) const; + // END_SKIP_DOXYGEN + + CompositeOpScopes GetCompositeOpScopes(const string& composite_op_name) const; + + bool ok() const; + + // TODO(skyewm): Graph is not part of public API + Graph* graph() const; + + // TODO(skyewm): Graph is not part of public API + std::shared_ptr graph_as_shared_ptr() const; + + absl::Status status() const; + + /// If status() is ok, convert the Graph object stored in this scope + /// to a GraphDef proto and return an ok Status. Otherwise, return the error + /// status as is without performing GraphDef conversion. If + /// `include_debug_info` is true, populate the `debug_info` field of the + /// GraphDef from stack traces in this Graph. + absl::Status ToGraphDef(GraphDef* gdef, + bool include_debug_info = false) const; + + // START_SKIP_DOXYGEN + + /// If status() is OkStatus(), construct a Graph object using `opts` as the + /// GraphConstructorOptions, and return Status::OK if graph construction was + /// successful. Otherwise, return the error status. + // TODO(josh11b, keveman): Make this faster; right now it converts + // Graph->GraphDef->Graph. This cleans up the graph (e.g. adds + // edges from the source and to the sink node, resolves back edges + // by name), and makes sure the resulting graph is valid. + absl::Status ToGraph( + Graph* g, GraphConstructorOptions opts = GraphConstructorOptions{}) const; + + // Calls AddNode() using this scope's ShapeRefiner. This exists in the public + // API to prevent custom op wrappers from needing access to shape_refiner.h or + // scope_internal.h. 
+ // TODO(skyewm): remove this from public API + absl::Status DoShapeInference(Node* node) const; + + // Creates a new root scope that causes all DoShapeInference() calls to return + // OkStatus() (on the returned scope and any subscopes). Used for testing. + // TODO(skyewm): fix tests that still require this and eventually remove, or + // at least remove from public API + static Scope DisabledShapeInferenceScope(); + // END_SKIP_DOXYGEN + + const std::vector& control_deps() const; + + // START_SKIP_DOXYGEN + class Impl; + Impl* impl() { return impl_.get(); } + const Impl* impl() const { return impl_.get(); } + // END_SKIP_DOXYGEN + + private: + Scope WithOpNameImpl(const string& op_name) const; + + friend class InternalScope; + std::unique_ptr impl_; + explicit Scope(Impl*); +}; + +/// A helper struct to hold the scopes that would be used by a function +/// constructing a composite op. +struct CompositeOpScopes { + /// Scope to be used for creating the local ops (primitive or other composite + /// ops). + Scope child; + /// Scope to be used for creating the last op. + Scope last; +}; + +// Creates a node of the given operation, with the given inputs, and assigns the +// result to output. This does not support the ability to add additional +// attributes. +absl::Status CreateOutputWithScope(string op_name, + absl::Span inputs, + const Scope& scope, Output* output); +/// @} + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_SCOPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/scope_internal.h b/third_party/tflite-hdrs/tensorflow/cc/framework/scope_internal.h new file mode 100644 index 00000000..0cf6af68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/scope_internal.h @@ -0,0 +1,134 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_ +#define TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { + +class ShapeRefiner; + +// NewInternalScope returns a new scope which doesn't take ownership of +// graph, status, name_map, and refiner. +// This is intended to enable the C API (which are used by other language +// bindings) to create a Scope and access C++ functionality (i.e. gradients). +// +// Shape inference is disabled if `refiner` is nullptr. +Scope NewInternalScope(Graph* graph, absl::Status* status, + ShapeRefiner* refiner); + +class Scope::Impl { + public: + // A NameMap is used to keep track of suffixes for names used in a scope. A + // name that has not been used so far in a scope will get no suffix. Later + // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes + // can share the same NameMap. For instance, a new scope created using + // WithControlDependencies() would share the same NameMap with the parent. 
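+  //
+  // As an illustration of the suffixing rule above, successive calls with the
+  // same default name hand out names like the following (the op name "Add" is
+  // just an example):
+  //
+  //   scope.GetUniqueNameForOp("Add");  // -> "Add"
+  //   scope.GetUniqueNameForOp("Add");  // -> "Add_1"
+  //   scope.GetUniqueNameForOp("Add");  // -> "Add_2"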
+ typedef std::unordered_map NameMap; + + Impl(const std::shared_ptr& graph, + const std::shared_ptr& status, + const std::shared_ptr& name_map, + const std::shared_ptr& refiner); + + const string& name() const { return name_; } + const std::vector& control_deps() const { return control_deps_; } + + private: + friend class Scope; + + // Tag types to choose the constructor to dispatch. + struct Tags { + enum class ScopeName; + enum class OpName; + enum class ControlDeps; + enum class Device; + enum class SingleUseScope; + enum class ExitOnError; + enum class KernelLabel; + enum class Colocate; + enum class AssignedDevice; + enum class XlaCluster; + }; + + Impl(Graph* graph, absl::Status* status, NameMap* name_map, + ShapeRefiner* refiner, bool disable_shape_inference); + Impl(const Scope& other, Tags::ScopeName, const string& name, + bool copy_names); + Impl(const Scope& other, Tags::OpName, const string& name, + const string& op_name); + Impl(const Scope& other, Tags::ControlDeps, + std::vector control_deps, bool clear_control_deps); + Impl(const Scope& other, Tags::Device, const string& device); + Impl(const Scope& other, Tags::SingleUseScope, const string& op_name); + Impl(const Scope& other, Tags::ExitOnError); + Impl(const Scope& other, Tags::KernelLabel, const string& kernel_label); + Impl(const Scope& other, Tags::Colocate, const Operation& colocate_with_op, + bool clear_colocations); + Impl(const Scope& other, Tags::AssignedDevice, const string& assigned_device); + Impl(const Scope& other, Tags::XlaCluster, const string& xla_cluster); + + std::unordered_set GetColocationConstraints( + const Operation& colocate_with_op) const; + + // Helper functions to get a unique names. + string GetUniqueName(const string& prefix, bool check_single_use) const; + string GetNameForOp(const string& default_name) const; + + bool single_use_scope() const { return scope_used_ != nullptr; } + + // The graph, status, and name maps are shared by all child scopes + // created from a single 'root' scope. A root scope is created by calling the + // Scope::NewRootScope function, which creates a new graph, a new status and + // the name maps. + std::shared_ptr graph_ = nullptr; + std::shared_ptr status_ = nullptr; + std::shared_ptr name_map_ = nullptr; + std::shared_ptr refiner_ = nullptr; + + // If scope_used_ is not nullptr, op_name_ should be empty and + // GetUniqueNameForOp can only be called once on this scope. More calls to + // GetUniqueNameForOp will cause an error status to be set on this scope. + std::shared_ptr scope_used_ = nullptr; + + const std::vector control_deps_; + + // The fully-qualified name of this scope (i.e. includes any parent scope + // names). + const string name_ = ""; + const string op_name_ = ""; + const bool exit_on_error_ = false; + const string kernel_label_ = ""; + const string device_ = ""; + const string assigned_device_ = ""; + const string xla_cluster_ = ""; + const std::unordered_set colocation_constraints_; + + // If true, Scope::DoShapeInference() always returns Status:OK(). + // TODO(skyewm): remove this when possible + const bool disable_shape_inference_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/testutil.h b/third_party/tflite-hdrs/tensorflow/cc/framework/testutil.h new file mode 100644 index 00000000..2464b491 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/testutil.h @@ -0,0 +1,49 @@ +/* Copyright 2016 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_ +#define TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { +namespace test { + +/// Computes the outputs listed in 'tensors', returns the tensors in 'out'. +void GetTensors(const Scope& scope, OutputList tensors, + std::vector* out); + +// Computes the outputs listed in 'tensors', returns the tensors in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. +void GetTensors(const Scope& scope, const std::vector& assign_vars, + const OutputList& tensors, std::vector* out); + +/// Computes the output 'tensor', returning the resulting tensor in 'out'. +void GetTensor(const Scope& scope, Output tensor, Tensor* out); + +// Computes the output 'tensor', returning the resulting tensor in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. +void GetTensor(const Scope& scope, const std::vector& assign_vars, + Output tensor, Tensor* out); + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/framework/while_gradients.h b/third_party/tflite-hdrs/tensorflow/cc/framework/while_gradients.h new file mode 100644 index 00000000..1f31de15 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/framework/while_gradients.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_ +#define TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/graph/while_context.h" + +// Utility functions for constructing while loop gradients + +namespace tensorflow { + +// Adds the gradient computation for the while loop associated with +// `while_ctx`. `grad_inputs` are the partial derivatives w.r.t. the loop +// outputs, i.e. the exit nodes. The partial derivatives w.r.t. the loop +// inputs, i.e. the input loop vars, are returned in `grad_outputs`. 
+// `grad_inputs` and `grad_outputs` are both in loop-variable order, as defined +// by the original inputs to BuildWhileLoop(). +// TODO(skyewm): maybe comment on NoGradient once it's supported +absl::Status AddWhileLoopGradient(WhileContext* while_ctx, const Scope& scope, + const std::vector& grad_inputs, + std::vector* grad_outputs); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_helper.h b/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_helper.h new file mode 100644 index 00000000..2a50d648 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_helper.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_GRADIENTS_GRAD_HELPER_H_ +#define TENSORFLOW_CC_GRADIENTS_GRAD_HELPER_H_ + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { + +// Helper function for reduction ops. +// +// input_shape: 1-D Tensor, the shape of the Tensor being reduced. +// axes: 1-D Tensor, the reduction axes. +// Note that the reduction indices are in the range +// -rank(input_shape), rank(input_shape) +// returns a 1-D Tensor, the output shape as if keep_dims were set to True. +Output ReducedShapeHelper(const Scope& scope, const Output& input_shape, + const Output& reduction_axes); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_GRADIENTS_GRAD_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_testutil.h b/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_testutil.h new file mode 100644 index 00000000..acde3075 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/gradients/grad_testutil.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_ +#define TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { +namespace test { + +/// Calls the gradient function registered for 'op', adding gradient operations +/// to the graph associated with 'scope'. Gradient outputs for each 'op' input +/// are returned in 'grad_outputs'. 
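+///
+/// A minimal usage sketch (assuming generated op wrappers such as ops::Const,
+/// ops::Square and ops::OnesLike, plus the TF_ASSERT_OK test macro; the names
+/// here are illustrative):
+///
+///     Scope scope = Scope::NewRootScope();
+///     auto x = ops::Const(scope, 2.0f);
+///     Output y = ops::Square(scope, x);
+///     std::vector<Output> grad_outputs;
+///     TF_ASSERT_OK(test::CallGradFunction(
+///         scope, y.op(), {ops::OnesLike(scope, y)}, &grad_outputs));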
+absl::Status CallGradFunction(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs); + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/ops/const_op.h b/third_party/tflite-hdrs/tensorflow/cc/ops/const_op.h new file mode 100644 index 00000000..9c888701 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/ops/const_op.h @@ -0,0 +1,87 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_OPS_CONST_OP_H_ +#define TENSORFLOW_CC_OPS_CONST_OP_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/graph/node_builder.h" + +namespace tensorflow { +namespace ops { + +/// @defgroup const_op Const Op +/// @{ + +Output Const(const Scope& scope, const Input::Initializer& val); + +Output ConstFromProto(const Scope& scope, const TensorProto& proto); + +NodeBuilder::NodeOut AsNodeOut(const Scope& scope, const Input& inp); + +template +Output Const(const Scope& scope, const Input::Initializer& val) { + auto orig_const_output = Const(scope, val); + if (!scope.ok()) return Output(); + + typedef typename Input::Initializer::RealType::type DstT; + + if (val.tensor.dtype() == DataTypeToEnum::v()) { + return orig_const_output; + } + if (val.tensor.NumElements() == 0) { + Tensor t(DataTypeToEnum::v(), val.tensor.shape()); + return Const(scope, Input::Initializer(t)); + } + + // TODO(keveman): Refactor Cast op's kernel implementation such that the code + // can be directly called here instead of adding the Cast op to the graph. 
+ auto orig_const = AsNodeOut(scope, orig_const_output); + const auto cast_op_name = scope.GetUniqueNameForOp("Cast"); + + auto cast_builder = NodeBuilder(cast_op_name, "Cast") + .Input(orig_const) + .Attr("DstT", DataTypeToEnum::v()); + scope.UpdateBuilder(&cast_builder); + Node* ret; + scope.UpdateStatus(cast_builder.Finalize(scope.graph(), &ret)); + if (!scope.ok()) return Output(); + scope.UpdateStatus(scope.DoShapeInference(ret)); + return Output(ret, 0); +} + +template +Output Const(const Scope& scope, const T& v, const TensorShape shape) { + return Const(scope, Input::Initializer(v, shape)); +} + +template +Output Const(const Scope& scope, const std::initializer_list& v, + const TensorShape shape) { + return Const(scope, Input::Initializer(v, shape)); +} + +std::vector AsNodeOutList(const Scope& scope, + const InputList& inp); + +/// }@ + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_CC_OPS_CONST_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/ops/standard_ops.h b/third_party/tflite-hdrs/tensorflow/cc/ops/standard_ops.h new file mode 100644 index 00000000..98f53010 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/ops/standard_ops.h @@ -0,0 +1,40 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_OPS_STANDARD_OPS_H_ +#define TENSORFLOW_CC_OPS_STANDARD_OPS_H_ + +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/candidate_sampling_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/control_flow_ops.h" +#include "tensorflow/cc/ops/data_flow_ops.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/io_ops.h" +#include "tensorflow/cc/ops/linalg_ops.h" +#include "tensorflow/cc/ops/logging_ops.h" +#include "tensorflow/cc/ops/lookup_ops.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/no_op.h" +#include "tensorflow/cc/ops/parsing_ops.h" +#include "tensorflow/cc/ops/random_ops.h" +#include "tensorflow/cc/ops/sparse_ops.h" +#include "tensorflow/cc/ops/state_ops.h" +#include "tensorflow/cc/ops/string_ops.h" +#include "tensorflow/cc/ops/training_ops.h" +#include "tensorflow/cc/ops/user_ops.h" + +#endif // TENSORFLOW_CC_OPS_STANDARD_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/ops/while_loop.h b/third_party/tflite-hdrs/tensorflow/cc/ops/while_loop.h new file mode 100644 index 00000000..5a1a45da --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/ops/while_loop.h @@ -0,0 +1,80 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#define TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+
+namespace tensorflow {
+namespace ops {
+
+// Function that takes cond graph inputs and returns cond graph boolean output.
+// 'output' need not be set if an error is returned.
+typedef std::function<absl::Status(const Scope&,
+                                   const std::vector<Output>& inputs,
+                                   Output* output)>
+    CondGraphBuilderFn;
+
+// Function that takes body graph inputs and returns body graph outputs.
+// 'outputs' need not be populated if an error is returned.
+typedef std::function<absl::Status(const Scope&,
+                                   const std::vector<Output>& inputs,
+                                   std::vector<Output>* outputs)>
+    BodyGraphBuilderFn;
+
+// Constructs a while loop.
+//
+// Arguments:
+// * scope: used to construct the while loop.
+// * inputs: the initial values of the loop variables. Must be non-empty.
+// * cond: a function that builds the condition graph of the loop. Takes the
+//   current loop variables as inputs and returns a scalar boolean Output
+//   indicating whether the loop should continue.
+// * body: a function that builds the body graph of the loop. Takes the current
+//   loop variables as inputs and returns the updated loop variables.
+// * frame_name: the frame name to use for this while loop. This should be a
+//   unique name. This will be used as a prefix for created operations.
+// * outputs: output param that returns final loop variable outputs in non-error
+//   case. Must be non-null and empty.
+// * create_while_ctx: if true, a WhileContext is created and populated for this
+//   loop. See core/graph/while_context.h for more details on
+//   WhileContexts. This is set to false for loops used as part of gradient
+//   computations, since they're part of the gradient for a loop in the
+//   forward-pass.
+//   TODO(skyewm): revisit this. Should we create WhileContexts for all loops,
+//   even if we don't need them?
+// * cond_output: if non-null, the output of the predicate is returned. This
+//   will always be a LoopCond node.
+//
+// Returns an error if the while loop could not be fully constructed.
+//
+// TODO(skyewm): clean up partially-constructed loop in error case
+// TODO(skyewm): create public interface to this method
+absl::Status BuildWhileLoop(const Scope& scope,
+                            const std::vector<Output>& inputs,
+                            const CondGraphBuilderFn& cond,
+                            const BodyGraphBuilderFn& body,
+                            const string& frame_name, OutputList* outputs,
+                            bool create_while_ctx = true,
+                            Output* cond_output = nullptr);
+
+}  // namespace ops
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_OPS_WHILE_LOOP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/bundle_v2.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/bundle_v2.h
new file mode 100644
index 00000000..ec85d14f
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/bundle_v2.h
@@ -0,0 +1,90 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helpers for loading the persistent representation of a SavedModelV2. +// Please note that this is depended on by code that does not make use of +// the full runtime and its dependencies should be restricted. + +#ifndef TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_ +#define TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/trackable_object_graph.pb.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { + +/// Represents a version 2 SavedModel that is loaded from storage (but not yet +/// loaded into an executable in-memory representation). +class SavedModelV2Bundle { + public: + using RestoreObjectsCallback = std::function; + + /// Loads persistent representations for a SavedModelV2 from the specified + /// export directory. + static absl::Status Load(const std::string& export_dir, + SavedModelV2Bundle* bundle); + + /// MetaGraphDef from the loaded SavedModel. + MetaGraphDef& meta_graph_def() { return meta_graph_def_; } + + /// SavedObjectGraph from the MetaGraphDef. + const SavedObjectGraph& saved_object_graph() { + return meta_graph_def().object_graph_def(); + } + + /// TrackableObjectGraph loaded from the variable_reader() checkpoint. + TrackableObjectGraph& trackable_object_graph() { + return trackable_object_graph_; + } + + /// BundleReader for accessing the variables bundle. + BundleReader* variable_reader() { return variable_reader_.get(); } + + /// The GraphDebugInfo (or nullptr if none). + GraphDebugInfo* debug_info() { return debug_info_.get(); } + + /// Restores objects, invoking the callback with the node id in the + /// saved_object_graph() and the corresponding TrackableObject from the + /// trackable_object_graph(). The callback may use the variable_reader() but + /// must not modify the underlying saved_object_graph(). + absl::Status VisitObjectsToRestore(RestoreObjectsCallback callback); + + private: + absl::Status RecurseObjectsToRestore( + const SavedObject* saved_object, int saved_object_node_id, + const TrackableObjectGraph::TrackableObject* trackable_object, + std::string object_name, + absl::flat_hash_set* seen_trackable_node_ids, + RestoreObjectsCallback callback); + + MetaGraphDef meta_graph_def_; + TrackableObjectGraph trackable_object_graph_; + std::unique_ptr variable_reader_; + std::unique_ptr debug_info_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/constants.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/constants.h new file mode 100644 index 00000000..e8a267e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/constants.h @@ -0,0 +1,82 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_ + +namespace tensorflow { + +// SavedModel assets directory. +inline constexpr char kSavedModelAssetsDirectory[] = "assets"; + +// SavedModel assets.extra directory. +inline constexpr char kSavedModelAssetsExtraDirectory[] = "assets.extra"; + +// SavedModel assets key for graph collection-def. +inline constexpr char kSavedModelAssetsKey[] = "saved_model_assets"; + +/// SavedModel legacy init op collection key. Used in v1 SavedModels. +inline constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; + +/// SavedModel main op collection key. Used in v1 SavedModels. +inline constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; + +// CollectionDef key for the SavedModel train op. +// Not exported while export_all_saved_models is experimental. +inline constexpr char kSavedModelTrainOpKey[] = "saved_model_train_op"; + +// Schema version for SavedModel. +inline constexpr int kSavedModelSchemaVersion = 1; + +// SavedModel proto filename prefix. +inline constexpr char kSavedModelFilenamePrefix[] = "saved_model"; +// SavedModel proto filename. +inline constexpr char kSavedModelFilenamePb[] = "saved_model.pb"; + +// SavedModel chunked proto filename. +inline constexpr char kSavedModelFilenameCpb[] = "saved_model.cpb"; + +// SavedModel text format proto filename. +inline constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; + +// Subdirectory where debugging related files are written. +inline constexpr char kSavedModelDebugDirectory[] = "debug"; + +// File name for GraphDebugInfo protocol buffer which corresponds to the +// SavedModel. +inline constexpr char kSavedModelDebugInfoFilenamePb[] = + "saved_model_debug_info.pb"; + +// Directory in which to save the SavedModel variables. +inline constexpr char kSavedModelVariablesDirectory[] = "variables"; + +// SavedModel variables filename. +inline constexpr char kSavedModelVariablesFilename[] = "variables"; + +// SavedModel SignatureDef keys for the initialization and train ops. Used in +// V2 SavedModels. +inline constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op"; +inline constexpr char kSavedModelTrainOpSignatureKey[] = + "__saved_model_train_op"; + +// Key in the TensorBundle for the object graph proto. +inline constexpr char kObjectGraphProtoKey[] = "_CHECKPOINTABLE_OBJECT_GRAPH"; + +// Filename for the FingerprintDef protocol buffer. 
+inline constexpr char kFingerprintFilenamePb[] = "fingerprint.pb"; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function.h new file mode 100644 index 00000000..1adaf70b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// ConcreteFunction is an executable "function" loaded from a SavedModelAPI. +class ConcreteFunction final { + public: + // TODO(bmzhao): Adding ConcreteFunction::Run in subsequent CL, since + // it depends on tensorflow::cc::Tensor and tensorflow::cc::TensorHandle + + // Returns FunctionMetadata associated with this ConcreteFunction. + const FunctionMetadata* GetFunctionMetadata(); + + private: + friend class SavedModelAPI; + friend class ConcreteFunctionList; + + // TODO(bmzhao): Consider adding a macro for wrapping/unwrapping + // when moving out of experimental. + static ConcreteFunction* wrap(TF_ConcreteFunction* p) { + return reinterpret_cast(p); + } + static TF_ConcreteFunction* unwrap(ConcreteFunction* p) { + return reinterpret_cast(p); + } +}; + +inline const FunctionMetadata* ConcreteFunction::GetFunctionMetadata() { + return FunctionMetadata::wrap(TF_ConcreteFunctionGetMetadata(unwrap(this))); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h new file mode 100644 index 00000000..88cb779e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// ConcreteFunctionList helps convert an opaque pointer to an array of +// ConcreteFunction pointers to a std::vector. +class ConcreteFunctionList { + public: + // Converts this object to a std::vector + std::vector ToVector(); + + private: + friend class SavedModelAPI; + // Wraps a TF_ConcreteFunctionList. Takes ownership of list. + explicit ConcreteFunctionList(TF_ConcreteFunctionList* list) : list_(list) {} + + struct TFConcreteFunctionListDeleter { + void operator()(TF_ConcreteFunctionList* p) const { + TF_DeleteConcreteFunctionList(p); + } + }; + std::unique_ptr list_; +}; + +inline std::vector ConcreteFunctionList::ToVector() { + int size = TF_ConcreteFunctionListSize(list_.get()); + std::vector result; + result.reserve(size); + for (int i = 0; i < size; ++i) { + result.push_back( + ConcreteFunction::wrap(TF_ConcreteFunctionListGet(list_.get(), i))); + } + return result; +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/function_metadata.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/function_metadata.h new file mode 100644 index 00000000..11e1a860 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/function_metadata.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/public/function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// FunctionMetadata stores additional function information, including +// optional signaturedef feeds/fetches (for TF1-based ConcreteFunctions), +// a valid function path (for TF2-based ConcreteFunctions), and +// the types + number of inputs and outputs. +class FunctionMetadata final { + // TODO(bmzhao): Add getters here as necessary. + private: + friend class ConcreteFunction; + static FunctionMetadata* wrap(TF_FunctionMetadata* p) { + return reinterpret_cast(p); + } + static TF_FunctionMetadata* unwrap(FunctionMetadata* p) { + return reinterpret_cast(p); + } +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/saved_model_api.h new file mode 100644 index 00000000..9d30a4a2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -0,0 +1,155 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" +#include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" +#include "tensorflow/cc/saved_model/experimental/public/signature_def_function.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SavedModelAPI offers a way to load Tensorflow Saved Models +// (https://www.tensorflow.org/guide/saved_model) and execute saved +// tf.functions or legacy SignatureDefs in a TF2-idiomatic fashion. +// See RFC 207 +// (https://github.com/tensorflow/community/blob/master/rfcs/20200218-tf-c-saved-model.md) +// TODO(bmzhao): Add an e2e example here, once ConcreteFunction::Run is added. +class SavedModelAPI { + public: + // Load a SavedModel from `dirname`. + // + // Params: + // saved_model_path - A directory filepath that the SavedModel is at. + // runtime - A runtime used to load SavedModelAPI. `runtime` must outlive the + // returned TF_SavedModel pointer. 
+ // tags - Optional set of tags. If tags = nullptr, we expect the SavedModel + // to contain a single Metagraph (as for those exported from TF2's + // `tf.saved_model.save`). If tags != nullptr, we load the metagraph + // matching the tags: + // https://github.com/tensorflow/tensorflow/blob/428cdeda09aef81e958eeb274b83d27ad635b57b/tensorflow/core/protobuf/meta_graph.proto#L50-L56 + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. + static std::unique_ptr Load( + const std::string& saved_model_path, const Runtime& runtime, + Status* status, const std::unordered_set* tags = nullptr); + + // Retrieve a function from the TF2 SavedModel via function path. + // + // Params: + // function_path - A string containing the path from the root saved python + // object to a tf.function method. + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. Otherwise, returns a + // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer + // is bound to SavedModelAPI it was loaded from. + ConcreteFunction* GetConcreteFunction(const std::string& function_path, + Status* status); + + // Retrieve a function from the TF SavedModel via a SignatureDef key. + // + // Params: + // signature_def_key - String key of SignatureDef map of a SavedModel: + // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 + // status - Set to OK on success and an appropriate error on failure. + // Returns: + // If status is not OK, returns nullptr. Otherwise, returns a + // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer + // is bound to SavedModelAPI it was loaded from. + SignatureDefFunction* GetSignatureDefFunction( + const std::string& function_path, Status* status); + + // SavedModelAPI is movable, but not copyable. 
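+  //
+  // Putting the above together, a minimal load-and-lookup sketch (error
+  // handling elided; `Runtime` and `RuntimeBuilder` come from
+  // tensorflow/cc/experimental/base/public/, and the path and function name
+  // below are illustrative):
+  //
+  //   Status status;
+  //   std::unique_ptr<Runtime> runtime = RuntimeBuilder().Build(&status);
+  //   auto model = SavedModelAPI::Load("/path/to/saved_model", *runtime,
+  //                                    &status);
+  //   ConcreteFunction* fn = model->GetConcreteFunction("my_func", &status);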
+ SavedModelAPI(SavedModelAPI&&) = default; + SavedModelAPI& operator=(SavedModelAPI&&) = default; + + private: + SavedModelAPI(const SavedModelAPI&) = delete; + SavedModelAPI& operator=(const SavedModelAPI&) = delete; + + explicit SavedModelAPI(TF_SavedModel* model) : saved_model_(model) {} + struct TFSavedModelDeleter { + void operator()(TF_SavedModel* p) const { TF_DeleteSavedModel(p); } + }; + std::unique_ptr saved_model_; +}; + +inline std::unique_ptr SavedModelAPI::Load( + const std::string& saved_model_path, const Runtime& runtime, Status* status, + const std::unordered_set* tags) { + TF_SavedModel* saved_model = nullptr; + + if (tags == nullptr) { + saved_model = + TF_LoadSavedModel(saved_model_path.c_str(), runtime.GetTFEContext(), + status->GetTFStatus()); + } else { + std::vector tags_vector; + tags_vector.reserve(tags->size()); + for (const std::string& tag : *tags) { + tags_vector.push_back(tag.c_str()); + } + saved_model = TF_LoadSavedModelWithTags( + saved_model_path.c_str(), runtime.GetTFEContext(), tags_vector.data(), + tags_vector.size(), status->GetTFStatus()); + } + + if (!status->ok()) { + return nullptr; + } + + // We can't use std::make_unique here because of its interaction with a + // private constructor: https://abseil.io/tips/134 + return std::unique_ptr(new SavedModelAPI(saved_model)); +} + +inline ConcreteFunction* SavedModelAPI::GetConcreteFunction( + const std::string& function_path, Status* status) { + TF_ConcreteFunction* function = TF_GetSavedModelConcreteFunction( + saved_model_.get(), function_path.c_str(), status->GetTFStatus()); + if (!status->ok()) { + return nullptr; + } + return ConcreteFunction::wrap(function); +} + +inline SignatureDefFunction* SavedModelAPI::GetSignatureDefFunction( + const std::string& function_path, Status* status) { + TF_SignatureDefFunction* function = TF_GetSavedModelSignatureDefFunction( + saved_model_.get(), function_path.c_str(), status->GetTFStatus()); + if (!status->ok()) { + return nullptr; + } + return SignatureDefFunction::wrap(function); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function.h new file mode 100644 index 00000000..bc72d208 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SignatureDefFunctions are functions that correspond to either: +// "signatures" saved from a TF2 SavedModel APIs: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/save.py#L830-L854 +// Or the "SignatureDefMap" saved from TF1 SavedModel APIs: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/load_v1_in_v2_test.py#L170-L174 +// In both cases, a SignatureDef is serialized as a SignatureDef protobuf: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/core/protobuf/meta_graph.proto#L260-L330 +// and represents a computation defined by a TF subgraph. +// These Signatures were primarily designed to be interoperable with the legacy +// TF 1 Session-based C++ SavedModelBundle loading APIs: +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/cc/saved_model/loader.h#L96-L108 +// SignatureDefFunctions have different semantics from regular TF2 +// ConcreteFunctions, and are mainly intended provide a serving-friendly +// transition point from the TF1 Session API. +// First, SignatureDefFunctions have different calling conventions. +// SignatureDefFunctions' inputs and outputs are constrained to **flattened +// lists of TensorHandles only**. They do not support more exotic input/output +// types (like optionals, generators, etc). Additionally, this flattening means +// they will not preserve the exact interface of the original tf.function they +// were traced from, as things like composite tensors decay into their +// internal dense tensor representation. +// Second, all inputs and outputs are "named", and these names are load bearing +// (eg: they are part of the interface of tensorflow_serving): +// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L21 +// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L39 +// The name of each input/output is stored in the corresponding tf::Argument in +// SignatureDefFunctionMetadata::arguments(). Users must ensure the order of +// TensorHandles passed to the function matches with the order of named +// arguments. Similarly the name of the outputs is stored in +// SignatureDefFunctionMetadata::returns(). +class SignatureDefFunction final { + public: + // Returns FunctionMetadata associated with this ConcreteFunction. + const SignatureDefFunctionMetadata* GetFunctionMetadata(); + + private: + friend class SavedModelAPI; + friend class ConcreteFunctionList; + + // TODO(bmzhao): Consider adding a macro for wrapping/unwrapping + // when moving out of experimental. 
+ static SignatureDefFunction* wrap(TF_SignatureDefFunction* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunction* unwrap(SignatureDefFunction* p) { + return reinterpret_cast(p); + } +}; + +inline const SignatureDefFunctionMetadata* +SignatureDefFunction::GetFunctionMetadata() { + return SignatureDefFunctionMetadata::wrap( + TF_SignatureDefFunctionGetMetadata(unwrap(this))); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h new file mode 100644 index 00000000..6cb01bf1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SignatureDefFunctionMetadata stores additional information on each input +// and output's names, dtypes, and shape. +class SignatureDefFunctionMetadata final { + // TODO(bmzhao): Add getters here as necessary. + private: + friend class SignatureDefFunction; + static SignatureDefFunctionMetadata* wrap( + TF_SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunctionMetadata* unwrap( + SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting.h new file mode 100644 index 00000000..2b232481 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting.h @@ -0,0 +1,47 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_H_ +#define TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_H_ + +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/protobuf/fingerprint.pb.h" + +namespace tensorflow::saved_model::fingerprinting { + +// Creates a FingerprintDef proto from a SavedModel (regular or chunked) and the +// checkpoint meta file (.index) in `export_dir`. +absl::StatusOr CreateFingerprintDef( + absl::string_view export_dir); + +// Loads the `fingerprint.pb` from `export_dir`, returns an error if there is +// none. +absl::StatusOr ReadSavedModelFingerprint( + absl::string_view export_dir); + +// Canonical fingerprinting ID for a SavedModel. +std::string Singleprint(uint64_t graph_def_program_hash, + uint64_t signature_def_hash, + uint64_t saved_object_graph_hash, + uint64_t checkpoint_hash); +std::string Singleprint(const FingerprintDef& fingerprint); +absl::StatusOr Singleprint(absl::string_view export_dir); + +} // namespace tensorflow::saved_model::fingerprinting + +#endif // TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting_utils.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting_utils.h new file mode 100644 index 00000000..306abec8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/fingerprinting_utils.h @@ -0,0 +1,137 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_UTILS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_UTILS_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "riegeli/bytes/fd_reader.h" // from @riegeli +#include "riegeli/records/record_reader.h" // from @riegeli +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/protobuf.h" // IWYU pragma: keep +#include "tensorflow/core/protobuf/fingerprint.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/tools/proto_splitter/chunk.pb.h" + +namespace tensorflow::saved_model::fingerprinting { + +namespace fingerprinting_utils_internal { + +using ::tensorflow::protobuf::Map; +using ::tensorflow::protobuf::Message; +using ::tensorflow::protobuf::RepeatedPtrField; + +// Number of sequential FieldIndex matches of `a` in `b`. (Length of initial +// subsequence.) 
+// Example: `a = {4, 2}`, `b = {4, 2, 1, 3}`, `fieldTagMatches(a, b) == 2` +absl::StatusOr fieldTagMatches( + const RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex>& a, + const RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex>& b); + +// Pull out the relevant data within `chunked_message`. A `chunked_field` is +// relevant if its `field_tags` are an initial subsequence any of the +// `target_fields` in the provided `target_fields_list`. +absl::StatusOr<::tensorflow::proto_splitter::ChunkedMessage> +PruneChunkedMessage( + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + std::vector<::tensorflow::proto_splitter::ChunkInfo> chunks_info, + std::vector> + target_fields_list); + +// Deterministically serializes the proto `message`. +std::string SerializeProto(const Message& message); + +// Uses metadata contained in `chunked_message` to hash fields within the +// data accessed by the `reader` using `chunks_info`. +absl::StatusOr HashFields( + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info, + const RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex>& + field_tags, + Message* merged_message); + +// Gets the field tags for `graph_def`.::tensorflow +inline RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex> +GraphDefFieldTags(); + +// Gets the field tags for `signature_def`. +inline RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex> +SignatureDefFieldTags(); + +// Gets the field tags for `saved_object_graph`. +inline RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex> +SavedObjectGraphFieldTags(); + +// Returns a `SavedModel` containing only fields (up to those) specified by +// `GraphDefFieldTags()`, `SignatureDefFieldTags()`, and +// `SavedObjectGraphFieldTags()`. +absl::StatusOr PrunedSavedModel( + absl::string_view export_dir, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info, + ::tensorflow::proto_splitter::ChunkMetadata& chunk_metadata); + +// Hashes the contents of `message` specified by `field_tags`. +absl::StatusOr HashMessage( + Message* message, + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info, + const RepeatedPtrField<::tensorflow::proto_splitter::FieldIndex>& + field_tags); + +// Hashes the contents of `graph_def`. +absl::StatusOr HashGraphDef( + tensorflow::GraphDef* graph_def, + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info); + +// Hashes the contents of `signature_def`. +absl::StatusOr HashSignatureDef( + const Map& signature_def_map, + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info); + +// Hashes the contents of `saved_object_graph`. 
+absl::StatusOr HashSavedObjectGraph( + tensorflow::SavedObjectGraph* saved_object_graph, + const ::tensorflow::proto_splitter::ChunkedMessage& chunked_message, + riegeli::RecordReader>& reader, + const std::vector<::tensorflow::proto_splitter::ChunkInfo>& chunks_info); + +} // namespace fingerprinting_utils_internal + +// Returns the hash of the checkpoint .index file, 0 if there is none. +uint64_t HashCheckpointIndexFile(absl::string_view model_dir); + +// Creates a FingerprintDef proto from a chunked SavedModel and the checkpoint +// meta file (.index) in `export_dir`. +absl::StatusOr CreateFingerprintDefCpb( + absl::string_view export_dir, std::string cpb_file); + +} // namespace tensorflow::saved_model::fingerprinting + +#endif // TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/image_format/internal_api.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/image_format/internal_api.h new file mode 100644 index 00000000..5c9b13d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/image_format/internal_api.h @@ -0,0 +1,65 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_IMAGE_FORMAT_INTERNAL_API_H_ +#define TENSORFLOW_CC_SAVED_MODEL_IMAGE_FORMAT_INTERNAL_API_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/cord.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +#define IS_OSS false + +namespace tensorflow { +namespace image_format { + +// Reads the SavedModel proto from {file_prefix}{.pb|.cpb}. +// Returns a failure status when the SavedModel file does not exist. +absl::Status ReadSavedModel(const std::string& file_prefix, + SavedModel* saved_model_proto); + +// Writes the SavedModel proto to a file or to string. If the proto is < the +// protobuf maximum size, then it will be serialized as a `.pb` proto binary. +// When larger than the maximum size, the SavedModel proto is destructively +// separated into chunks and written to +// `.cpb` (chunked proto). +// +// Write SavedModel to {file_prefix}{.pb|.cpb}. +absl::Status WriteSavedModel(SavedModel* saved_model_proto, + const std::string& file_prefix); +// Writes the SavedModel proto to std::string +// The bool field record whether it's saved as a chunked protobuf (true) or +// regular protobuf (false) +absl::StatusOr> WriteSavedModelToString( + SavedModel* saved_model_proto); +#if !IS_OSS +absl::StatusOr> WriteSavedModelToCord( + SavedModel* saved_model_proto); +#endif + +// See above. The `debug_max_size` argument can be used to the maximum size to +// less than 2GB for testing purposes. 
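The `image_format` helpers declared above read and write a SavedModel proto that may be split into chunks (`.pb` vs `.cpb`). A small sketch, assuming an on-disk prefix such as `/tmp/model/saved_model`; the prefix strings are illustrative only, and the extension is chosen by the library.

```cpp
// Hedged sketch of the chunked SavedModel read/write helpers declared above.
#include "absl/status/status.h"
#include "tensorflow/cc/saved_model/image_format/internal_api.h"
#include "tensorflow/core/protobuf/saved_model.pb.h"

void RoundTripSavedModel() {
  tensorflow::SavedModel proto;
  absl::Status read_status = tensorflow::image_format::ReadSavedModel(
      "/tmp/model/saved_model", &proto);
  if (!read_status.ok()) return;

  // Writes {prefix}.pb, or {prefix}.cpb when the proto exceeds the protobuf
  // size limit and has to be chunked.
  absl::Status write_status = tensorflow::image_format::WriteSavedModel(
      &proto, "/tmp/model_copy/saved_model");
  (void)write_status;
}
```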
+absl::Status WriteSavedModel(SavedModel* saved_model_proto, + const std::string& file_prefix, + int debug_max_size); + +} // namespace image_format +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_IMAGE_FORMAT_INTERNAL_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader.h new file mode 100644 index 00000000..f549645e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader.h @@ -0,0 +1,150 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// SavedModel loading functions and SavedModelBundle struct. + +#ifndef TENSORFLOW_CC_SAVED_MODEL_LOADER_H_ +#define TENSORFLOW_CC_SAVED_MODEL_LOADER_H_ + +#include +#include + +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { + +/// Represents a SavedModel that is loaded from storage. +class SavedModelBundleInterface { + public: + virtual ~SavedModelBundleInterface(); + + /// Returns the TensorFlow Session that can be used to interact with the + /// SavedModel. + virtual Session* GetSession() const = 0; + + /// Returns a map from signature name to SignatureDef for all signatures in + /// in the SavedModel. + virtual const protobuf::Map& GetSignatures() const = 0; +}; + +/// SavedModel representation once the SavedModel is loaded from storage. +/// +/// NOTE: Prefer to use SavedModelBundleLite in new code, as it consumes less +/// RAM. +struct SavedModelBundle : public SavedModelBundleInterface { + /// A TensorFlow Session does not Close itself on destruction. To avoid + /// resource leaks, we explicitly call Close on Sessions that we create. + ~SavedModelBundle() override { + if (session) { + session->Close().IgnoreError(); + } + } + + SavedModelBundle() = default; + + Session* GetSession() const override { return session.get(); } + const protobuf::Map& GetSignatures() const override { + return meta_graph_def.signature_def(); + } + + std::unique_ptr session; + MetaGraphDef meta_graph_def; + std::unique_ptr debug_info; +}; + +// A version of SavedModelBundle that avoids storing a potentially large +// MetaGraphDef. Prefer to use SavedModelBundleLite in new code. +class SavedModelBundleLite : public SavedModelBundleInterface { + public: + SavedModelBundleLite() = default; + SavedModelBundleLite(SavedModelBundleLite&& other) = default; + SavedModelBundleLite& operator=(SavedModelBundleLite&& other) = default; + + SavedModelBundleLite(std::unique_ptr session, + protobuf::Map signatures) + : session_(std::move(session)), signatures_(std::move(signatures)) {} + + /// A TensorFlow Session does not Close itself on destruction. To avoid + /// resource leaks, we explicitly call Close on Sessions that we create. 
+ ~SavedModelBundleLite() override { + if (session_) { + session_->Close().IgnoreError(); + } + } + + Session* GetSession() const override { return session_.get(); } + const protobuf::Map& GetSignatures() const override { + return signatures_; + } + + private: + std::unique_ptr session_; + protobuf::Map signatures_; +}; + +// Restore variable and resources in the SavedModel export dir for the +// indicated metagraph. +// The recommended way to load a saved model is to call LoadSavedModel, +// which provides an already initialized Metagraph, Session, and DebugInfo. +absl::Status RestoreSession(const RunOptions& run_options, + const MetaGraphDef& meta_graph, + const string& export_dir, + std::unique_ptr* session); + +// Initialize a session which wraps this metagraph. +// The recommended way to load a saved model is to call LoadSavedModel, +// which provides an already initialized Metagraph, Session, and DebugInfo. +absl::Status LoadMetagraphIntoSession(const SessionOptions& session_options, + const MetaGraphDef& meta_graph, + std::unique_ptr* session); + +/// Loads a SavedModel from the specified export directory. The MetaGraphDef +/// to be loaded is identified by the supplied tags, corresponding exactly to +/// the set of tags used at SavedModel build time. Stores a SavedModel bundle in +/// *bundle with a session and the requested MetaGraphDef, if found. +/// +/// NOTE: Prefer the overload that takes a SavedModelBundleLite* in new code. +absl::Status LoadSavedModel(const SessionOptions& session_options, + const RunOptions& run_options, + const string& export_dir, + const std::unordered_set& tags, + SavedModelBundle* bundle); + +/// Loads a SavedModel from the specified export directory. The MetaGraphDef +/// to be loaded is identified by the supplied tags, corresponding exactly to +/// the set of tags used at SavedModel build time. Stores a SavedModel bundle +/// in *bundle with a session created from the requested MetaGraphDef if found. +/// +/// This overload creates a SavedModelBundleLite, which consumes less RAM than +/// an equivalent SavedModelBundle. +absl::Status LoadSavedModel(const SessionOptions& session_options, + const RunOptions& run_options, + const string& export_dir, + const std::unordered_set& tags, + SavedModelBundleLite* bundle); + +/// Checks whether the provided directory could contain a SavedModel. Note that +/// the method does not load any data by itself. If the method returns `false`, +/// the export directory definitely does not contain a SavedModel. If the method +/// returns `true`, the export directory may contain a SavedModel but provides +/// no guarantee that it can be loaded. +bool MaybeSavedModelDirectory(const std::string& export_dir); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_LOADER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader_util.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader_util.h new file mode 100644 index 00000000..9ce3500c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/loader_util.h @@ -0,0 +1,40 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
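`loader.h` above is the classic Session-based loading path. A minimal sketch using the `SavedModelBundleLite` overload follows; the export directory and the feed/fetch tensor names are illustrative assumptions and should be read from the signatures in practice.

```cpp
// Hedged sketch: loading and running a SavedModel via the Session API above.
#include <string>
#include <unordered_set>
#include <vector>

#include "tensorflow/cc/saved_model/loader.h"
#include "tensorflow/cc/saved_model/tag_constants.h"
#include "tensorflow/core/framework/tensor.h"

bool RunModel(const std::string& export_dir, const tensorflow::Tensor& input) {
  tensorflow::SavedModelBundleLite bundle;
  const std::unordered_set<std::string> tags = {tensorflow::kSavedModelTagServe};
  auto status = tensorflow::LoadSavedModel(tensorflow::SessionOptions(),
                                           tensorflow::RunOptions(), export_dir,
                                           tags, &bundle);
  if (!status.ok()) return false;

  // Feed/fetch names below are hypothetical; look them up via GetSignatures().
  std::vector<tensorflow::Tensor> outputs;
  status = bundle.GetSession()->Run({{"serving_default_x:0", input}},
                                    {"StatefulPartitionedCall:0"}, {}, &outputs);
  return status.ok();
}
```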
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_LOADER_UTIL_H_ +#define TENSORFLOW_CC_SAVED_MODEL_LOADER_UTIL_H_ + +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace internal { + +// A SavedModel may store the name of the initialization op to run in the +// in the SignatureDef (v2) or a collection (v1). If an init_op collection +// exists, then the collection must contain exactly one op. +absl::Status GetInitOp(const string& export_dir, + const MetaGraphDef& meta_graph_def, + string* init_op_name); + +absl::Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, + std::vector* asset_file_defs); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_LOADER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/metrics.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/metrics.h new file mode 100644 index 00000000..fa587d60 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/metrics.h @@ -0,0 +1,147 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// APIs for accessing SavedModel and checkpoint metric objects. +// +// In order to collect the data from these metrics, please add the metrics to +// the provided monitoring platform. Unless configured with a user-specified +// monitoring platform, the data is not collected in OSS. + +#ifndef TENSORFLOW_CC_SAVED_MODEL_METRICS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_METRICS_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/protobuf/fingerprint.pb.h" + +namespace tensorflow { +namespace metrics { + +const char kFingerprintFound[] = "FOUND"; +const char kFingerprintNotFound[] = "NOT_FOUND"; +const char kFingerprintError[] = "ERROR"; + +// Returns "/tensorflow/core/saved_model/write/count" cell. This metric +// has 1 field "write_version", which is equal to the +// `tensorflow::libexport::GetWriteVersion` of the protobuf and should be +// incremented when a SavedModel has been successfully written. +monitoring::CounterCell& SavedModelWriteCount(absl::string_view write_version); + +// Returns "/tensorflow/core/saved_model/read/count" cell. 
This metric +// has 1 field "write_version", which is equal to the +// `tensorflow::libexport::GetWriteVersion` of the protobuf, and should be +// incremented when a SavedModel has been successfully read. +monitoring::CounterCell& SavedModelReadCount(absl::string_view write_version); + +// Returns "/tensorflow/core/saved_model/write/api" cell. This metric has 1 +// field "api_label" which corresponds to a SavedModel write API. The cell for +// `foo` should be incremented when the write API `foo` is called. +monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label); + +// Returns "/tensorflow/core/saved_model/read/api" cell. This metric has 1 +// field "api_label" which corresponds to a SavedModel read API. The cell for +// `foo` should be incremented when the read API `foo` is called. +monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label); + +// Returns "/tensorflow/core/saved_model/write/fingerprint" cell, which contains +// the saved_model_checksum of the SM's fingerprint when it is exported. +monitoring::GaugeCell& SavedModelWriteFingerprint(); + +// Returns "/tensorflow/core/saved_model/write/path" cell, which contains +// the saved_model_path of the SM when it is exported. +monitoring::GaugeCell& SavedModelWritePath(); + +// Returns "/tensorflow/core/saved_model/write/path_and_fingerprint" cell, which +// contains the path (saved_model_path) and fingerprint (concatenation of +// graph_def_program_hash, signature_def_hash, saved_object_graph_hash, +// and checkpoint_hash) of the SavedModel when it is exported. +monitoring::GaugeCell& SavedModelWritePathAndSingleprint(); + +// Returns "/tensorflow/core/saved_model/read/fingerprint" cell, wich contains +// the saved_model_checksum of the SM's fingerprint when it is imported. +monitoring::GaugeCell& SavedModelReadFingerprint(); + +// Returns "/tensorflow/core/saved_model/read/path" cell, wich contains +// the saved_model_path of the SM when it is imported. +monitoring::GaugeCell& SavedModelReadPath(); + +// Returns "/tensorflow/core/saved_model/read/path_and_fingerprint" cell, which +// contains the path (saved_model_path) and singleprint (concatenation of +// graph_def_program_hash, signature_def_hash, saved_object_graph_hash, +// and checkpoint_hash) of the SavedModel when it is imported. +monitoring::GaugeCell& SavedModelReadPathAndSingleprint(); + +// Returns the fingerprint as a Json string. +std::string MakeFingerprintJson(FingerprintDef fingerprint_def); + +// Returns canonical string concatenation of path and singleprint. +absl::StatusOr MakeSavedModelPathAndSingleprint( + std::string path, std::string singleprint); + +// Returns path and singleprint as a pair, parsed canonically from the string +// metric. +absl::StatusOr> +ParseSavedModelPathAndSingleprint(std::string path_and_singleprint); + +// Returns string status indicating whether or not the fingerprint.pb file was +// found when loading the SavedModel. +monitoring::GaugeCell& SavedModelFoundFingerprintOnLoad(); + +// Returns "/tensorflow/core/checkpoint/read/read_durations" cell belonging to +// field `api_label`. +monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label); + +// Returns "/tensorflow/core/checkpoint/write/write_durations" cell belonging to +// field `api_label`. +monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label); + +// Returns "/tensorflow/core/checkpoint/write/async_write_durations" cell +// belonging to field `api_label`. 
+monitoring::SamplerCell& AsyncCheckpointWriteDuration( + absl::string_view api_label); + +// Returns "/tensorflow/core/checkpoint/write/training_time_saved" cell +// belonging to field `api_label`. +monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label); + +// Returns "/tensorflow/core/checkpoint/write/checkpoint_size" cell +// belonging to field (`api_label`, `filesize`). +monitoring::CounterCell& CheckpointSize(absl::string_view api_label, + int64_t filesize); + +// Returns "/tensorflow/core/checkpoint/sharding/callback_duration" cell which +// describes how long it took to execute the checkpoint sharding callback in +// microseconds. +monitoring::CounterCell& ShardingCallbackDuration(); + +// Returns "/tensorflow/core/checkpoint/sharding/num_checkpoint_shards_written" +// cell which describes how many checkpoint shard files were written during +// saving. +monitoring::CounterCell& NumCheckpointShardsWritten(); + +// Returns "/tensorflow/core/checkpoint/sharding/callback_description" cell +// which describes the callback used to shard the checkpoint during saving. +monitoring::GaugeCell& ShardingCallbackDescription(); + +} // namespace metrics +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/reader.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/reader.h new file mode 100644 index 00000000..b5e81f9e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/reader.h @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// Functions to read the SavedModel proto, or parts of it. + +#ifndef TENSORFLOW_CC_SAVED_MODEL_READER_H_ +#define TENSORFLOW_CC_SAVED_MODEL_READER_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace tensorflow { +absl::Status ReadSavedModel(absl::string_view export_dir, + SavedModel* saved_model_proto); + +// Finds and returns the MetaGraphDef (within the provided SavedModel) that +// matches the given set of tags. The lifetime of the returned MetaGraphDef is +// the same as the lifetime of `saved_model_proto`. +// +// FindMetaGraphDef returns a failure status when no MetaGraphDef matches the +// provided tags. +absl::StatusOr FindMetaGraphDef( + const std::unordered_set& tags, SavedModel* saved_model_proto); + +// Reads the SavedModel proto from saved_model.pb(txt) in the given directory, +// finds the MetaGraphDef that matches the given set of tags and writes it to +// the `meta_graph_def` parameter. Returns a failure status when the SavedModel +// file does not exist or no MetaGraphDef matches the tags. 
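The metrics accessors above return plain monitoring counter/gauge cells, so call sites simply fetch a cell and update it. A sketch with illustrative `api_label` and write-version values (these strings are assumptions, not values mandated by the header):

```cpp
// Hedged sketch of updating the SavedModel metrics cells declared above.
#include <string>

#include "tensorflow/cc/saved_model/metrics.h"

void RecordSavedModelRead(const std::string& export_dir) {
  // "2" stands in for the SavedModel write version; the API label is arbitrary.
  tensorflow::metrics::SavedModelReadCount("2").IncrementBy(1);
  tensorflow::metrics::SavedModelReadApi("loader_example").IncrementBy(1);
  tensorflow::metrics::SavedModelReadPath().Set(export_dir);
}
```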
+absl::Status ReadMetaGraphDefFromSavedModel(
+    absl::string_view export_dir, const std::unordered_set<string>& tags,
+    MetaGraphDef* meta_graph_def);
+
+// Store debug info from the SavedModel export dir.
+absl::Status ReadSavedModelDebugInfoIfPresent(
+    absl::string_view export_dir,
+    std::unique_ptr<GraphDebugInfo>* debug_info_proto);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_SAVED_MODEL_READER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/signature_constants.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/signature_constants.h
new file mode 100644
index 00000000..7d8c07f5
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/signature_constants.h
@@ -0,0 +1,69 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+
+namespace tensorflow {
+
+/// Key in the signature def map for `default` serving signatures. The default
+/// signature is used in inference requests where a specific signature was not
+/// specified.
+static constexpr char kDefaultServingSignatureDefKey[] = "serving_default";
+
+////////////////////////////////////////////////////////////////////////////////
+/// Classification API constants.
+
+/// Classification inputs.
+static constexpr char kClassifyInputs[] = "inputs";
+
+/// Classification method name used in a SignatureDef.
+static constexpr char kClassifyMethodName[] = "tensorflow/serving/classify";
+
+/// Classification classes output.
+static constexpr char kClassifyOutputClasses[] = "classes";
+
+/// Classification scores output.
+static constexpr char kClassifyOutputScores[] = "scores";
+
+////////////////////////////////////////////////////////////////////////////////
+/// Predict API constants.
+
+/// Predict inputs.
+static constexpr char kPredictInputs[] = "inputs";
+
+/// Predict method name used in a SignatureDef.
+static constexpr char kPredictMethodName[] = "tensorflow/serving/predict";
+
+/// Predict outputs.
+static constexpr char kPredictOutputs[] = "outputs";
+
+////////////////////////////////////////////////////////////////////////////////
+/// Regression API constants.
+
+/// Regression inputs.
+static constexpr char kRegressInputs[] = "inputs";
+
+/// Regression method name used in a SignatureDef.
+static constexpr char kRegressMethodName[] = "tensorflow/serving/regress";
+
+/// Regression outputs.
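`ReadMetaGraphDefFromSavedModel` above is the lighter-weight alternative to a full bundle load when only the MetaGraphDef is needed. A sketch with an assumed export directory and the `serve` tag:

```cpp
// Hedged sketch: reading just the MetaGraphDef for the "serve" tag.
#include <string>
#include <unordered_set>

#include "tensorflow/cc/saved_model/reader.h"

bool CountSignatures(const std::string& export_dir, int* num_signatures) {
  tensorflow::MetaGraphDef meta_graph_def;
  const std::unordered_set<std::string> tags = {"serve"};
  absl::Status status = tensorflow::ReadMetaGraphDefFromSavedModel(
      export_dir, tags, &meta_graph_def);
  if (!status.ok()) return false;
  *num_signatures = meta_graph_def.signature_def_size();
  return true;
}
```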
+static constexpr char kRegressOutputs[] = "outputs"; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/tag_constants.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/tag_constants.h new file mode 100644 index 00000000..68a090e0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/tag_constants.h @@ -0,0 +1,35 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_ + +namespace tensorflow { + +/// Tag for the `gpu` graph. +constexpr char kSavedModelTagGpu[] = "gpu"; + +/// Tag for the `tpu` graph. +constexpr char kSavedModelTagTpu[] = "tpu"; + +/// Tag for the `serving` graph. +constexpr char kSavedModelTagServe[] = "serve"; + +/// Tag for the `training` graph. +constexpr char kSavedModelTagTrain[] = "train"; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/test_utils.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/test_utils.h new file mode 100644 index 00000000..3e131951 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/test_utils.h @@ -0,0 +1,53 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_SAVED_MODEL_TEST_UTILS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_TEST_UTILS_H_ + +#include +#include + +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow::saved_model { + +// TODO(b/229726259) Switch to OSS version after it's available. +// Simple implementation of a proto matcher comparing string representations. +// Only works as ShapeProto's textual representation is deterministic. 
+class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tensorflow::protobuf::Message& expected) + : expected_(expected.DebugString()) {} + + template + bool MatchAndExplain(const Message& p, + ::testing::MatchResultListener*) const { + return p.DebugString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tensorflow::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} + +} // namespace tensorflow::saved_model + +#endif // TENSORFLOW_CC_SAVED_MODEL_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/saved_model/util.h b/third_party/tflite-hdrs/tensorflow/cc/saved_model/util.h new file mode 100644 index 00000000..2489f837 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/saved_model/util.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_SAVED_MODEL_UTIL_H_ +#define TENSORFLOW_CC_SAVED_MODEL_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace tensorflow { +namespace saved_model { + +// Utility functions for SavedModel reading and writing. + +// Returns "WriteVersion" ("1" or "2") of the SavedModel protobuf. If the +// protobuf has exactly one MetaGraphDef, which contains a SavedObjectGraph, it +// is version 2. Else, the protobuf is version 1. +// +// NOTE: The "WriteVersion" does *not* equal the major version of TF. +std::string GetWriteVersion(const SavedModel& saved_model); + +// Get view of string keys of a map. +std::set GetMapKeys( + const ::google::protobuf::Map& map); + +// Get the default input value from signature if it's missing in the request +// inputs. If `is_alias` is set to true, the keys of the `request_inputs` are +// alias names rather than the feed names in the graph. +absl::Status GetInputValues( + const SignatureDef& signature, + const ::google::protobuf::Map& request_inputs, + std::vector>& inputs); + +} // namespace saved_model +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/tools/freeze_saved_model.h b/third_party/tflite-hdrs/tensorflow/cc/tools/freeze_saved_model.h new file mode 100644 index 00000000..8a35bafe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/tools/freeze_saved_model.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_ +#define TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_ + +#include + +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Returns a frozen GraphDef, input tensors, and output tensors from the loaded +// SavedModelBundle. +// `inputs` and `outputs` consist of the union of all inputs and outputs in the +// SignatureDefs in the SavedModelBundle. +// FreezeSavedModel sets `frozen_graph_def` to a GraphDef of all nodes needed by +// `outputs`. All variables in the supplied SavedModelBundle are converted to +// constants, set to the value of the variables, by running the restored Session +// in the SavedModelBundle. +// WARNING: Only the variable checkpoints will be reflected in the frozen +// graph_def. All saved_model assets will be ignored. +absl::Status FreezeSavedModel(const SavedModelBundle& saved_model_bundle, + GraphDef* frozen_graph_def, + std::unordered_set* inputs, + std::unordered_set* outputs); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/training/coordinator.h b/third_party/tflite-hdrs/tensorflow/cc/training/coordinator.h new file mode 100644 index 00000000..2a52d743 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/training/coordinator.h @@ -0,0 +1,136 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_TRAINING_COORDINATOR_H_ +#define TENSORFLOW_CC_TRAINING_COORDINATOR_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "xla/tsl/protobuf/error_codes.pb.h" +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { + +/// The abstract interface for runners which must implement the Join and the +/// IsRunning function. 
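`FreezeSavedModel` above folds the bundle's variables into constants by running the restored Session. A short sketch, assuming a `SavedModelBundle` already loaded with `LoadSavedModel` as in the earlier example:

```cpp
// Hedged sketch: freezing a loaded SavedModelBundle into a standalone GraphDef.
#include <string>
#include <unordered_set>

#include "tensorflow/cc/saved_model/loader.h"
#include "tensorflow/cc/tools/freeze_saved_model.h"
#include "tensorflow/core/framework/graph.pb.h"

absl::Status Freeze(const tensorflow::SavedModelBundle& bundle,
                    tensorflow::GraphDef* frozen_graph_def) {
  std::unordered_set<std::string> inputs, outputs;
  // On success, `inputs`/`outputs` hold the union of all signature inputs
  // and outputs, and `frozen_graph_def` contains only constant variables.
  return tensorflow::FreezeSavedModel(bundle, frozen_graph_def, &inputs,
                                      &outputs);
}
```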
+class RunnerInterface { + public: + virtual ~RunnerInterface() {} + virtual absl::Status Join() = 0; + virtual absl::Status ExportCostGraph(CostGraphDef* cost_graph) const { + return absl::Status(absl::StatusCode::kInvalidArgument, + "No cost model to export."); + } + /// Returns true iff the runner is running, i.e. if it is trying to populate + /// its queue. + virtual bool IsRunning() const = 0; +}; + +/// Coordinator class manages the termination of a collection of QueueRunners. +/// Without a coordinator, QueueRunners have to be joined in a specific order; +/// otherwise the QueueRunner::Join() could sometimes hang. The +/// Coordinator::RequestStop() plays the key role which notifies all running +/// threads under a coordinator to stop. This function could be called by any +/// thread or any client. +/// Usage, in the client: +/// Coordinator coord; +/// std::unique_ptr qr(&coord, ...); +/// qr.Start(session); +/// coord.RegisterRunner(std::move(qr)); +/// /// do some work +/// TF_CHECK_OK(coord.Join()); +/// In each thread of QueueRunner, the coordinator needs to be used as: +/// void Run() { +/// while (!coord->ShouldStop()) { +/// /// do some work +/// if (error) { +/// coord->RequestStop(); +/// coord->ReportStatus(error_status); +/// } +/// } +/// } +class Coordinator { + public: + Coordinator(); + + /// Constructor with a list of error codes which would not be taken as errors + /// in status reporting. + Coordinator(const std::vector& clean_stop_errors); + + /// In the destructor, RequestStop() and Join() would be called. + ~Coordinator(); + + /// Registers a runner, i.e. a unit of running threads which is usually a + /// QueueRunner. It takes the ownership of runner to avoid lifecycle-related + /// problems. Note, the coordinator would not start these threads; they are + /// supposed to be in running state when they are registered here. + absl::Status RegisterRunner(std::unique_ptr runner); + + /// Returns true iff all the registered runners have been stopped. + bool AllRunnersStopped(); + + /// Requests all running threads to stop. + absl::Status RequestStop(); + + /// Returns true if its RequestStop() has been called. + bool ShouldStop(); + + /// Joins all threads, returns OK or the first reported and unexpected status. + absl::Status Join(); + + /// Reports status to the coordinator. This is usually called by threads. + void ReportStatus(const absl::Status& status); + + /// Returns the latest status. + absl::Status GetStatus(); + + /// Returns immediately if the coordinator is stopped or blocks until + /// RequestStop() is called. + void WaitForStop(); + + // Returns the cost graph from stored run metadata in registered runners. 
+ absl::Status ExportCostGraph(CostGraphDef* cost_graph) const; + + private: + std::unordered_set clean_stop_errors_; + condition_variable wait_for_stop_; + + mutex mu_; + bool should_stop_ TF_GUARDED_BY(mu_); + + mutex status_lock_; + absl::Status status_ TF_GUARDED_BY(status_lock_); + + mutable mutex runners_lock_; + std::vector> runners_ + TF_GUARDED_BY(runners_lock_); + + Coordinator(const Coordinator&) = delete; + void operator=(const Coordinator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_TRAINING_COORDINATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/cc/training/queue_runner.h b/third_party/tflite-hdrs/tensorflow/cc/training/queue_runner.h new file mode 100644 index 00000000..3122ff31 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/cc/training/queue_runner.h @@ -0,0 +1,144 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_ +#define TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_ + +#include +#include +#include +#include + +#include "tensorflow/cc/training/coordinator.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" +#include "tensorflow/core/public/session.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { + +/// QueueRunner class imitates the behavior of the python version of QueueRunner +/// which creates a thread for each enqueue op, runs close op on completion. +class QueueRunner : public RunnerInterface { + public: + /// Creates a new QueueRunner from proto. + // TODO(yuefengz): we may want to initialize from queues and ops in the + // future. + static absl::Status New(const QueueRunnerDef& queue_runner_def, + std::unique_ptr* result); + + /// Creates a new QueueRunner with a coordinator, see coordinator.h for usage. + static absl::Status New(const QueueRunnerDef& queue_runner_def, + Coordinator* coord, + std::unique_ptr* result); + + /// Adds a callback that the queue runner will call when it detects an error. + void AddErrorCallback(const std::function& cb); + + /// Delete the previously registered callbacks. + void ClearErrorCallbacks(); + + /// The destructor would join all the threads. + ~QueueRunner(); + + /// Starts the queue runner with the given session. + absl::Status Start(Session* sess); + + /// Starts the queue runner with the given session and sets the run arguments + /// for sess->Run. It also collects and stores the cost model. 
+ absl::Status StartAndCollectCostGraph( + Session* sess, const RunOptions& run_options = RunOptions()); + + /// Starts the queue runner with the given session, and wait for up to the + /// specified time (in milliseconds) for the queues to start to fill up. + absl::Status Start(Session* sess, int wait_for_ms); + absl::Status StartAndCollectCostGraph( + Session* session, int wait_for_ms, + const RunOptions& run_options = RunOptions()); + + /// Requests to stop and runs the cancel op. It would be called in a separate + /// thread when coordinator is set. If there is no coordinator it should be + /// called before calling Join. + void Stop(Session* sess); + + /// Joins all the threads. Returns okay if all threads run successfully; + /// otherwise returns the first captured failure status. + absl::Status Join() final; + + /// Returns the latest status. + absl::Status GetStatus(); + + // Returns the stored cost model. + absl::Status ExportCostGraph(CostGraphDef* cost_graph) const override; + + private: + QueueRunner() : coord_(nullptr), stopped_(false), cg_mu_(nullptr) {} + + // Initializes the instance with the QueueRunnerDef proto. + absl::Status Init(const QueueRunnerDef& queue_runner_def); + + // The Run function for each thread. + void Run(Session* sess, const string& enqueue_op); + + // Updates the internal status; it only keeps OK or the first unexpected error + // status. + void UpdateStatus(const absl::Status& status); + + bool IsQueueClosed(absl::Status status) const { + return queue_closed_exception_types_.count( + static_cast(status.code())) > 0; + } + + bool IsRunning() const override { return !stopped_; } + + void SetRunArgumentsAndCostGraph(const RunOptions& run_options); + + absl::Status RealRun(Session* sess, const string& op, bool update_costs); + + string queue_name_; + std::vector enqueue_op_names_; + string close_op_name_; + string cancel_op_name_; + // code::Code casted to int to avoid a hash function. + std::unordered_set queue_closed_exception_types_; + + std::unique_ptr thread_pool_; + mutex mu_; + int runs_ = 0; + absl::Status status_ TF_GUARDED_BY(mu_); + absl::Status enqueue_status_ TF_GUARDED_BY(mu_); + std::unique_ptr counter_; + + Coordinator* coord_; + + std::atomic stopped_; + + mutex cb_mu_; + std::vector> callbacks_; + + mutable std::unique_ptr cg_mu_; + std::unique_ptr cost_graph_ TF_GUARDED_BY(cg_mu_); + RunOptions run_options_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/aot_only_var_handle_op.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/aot_only_var_handle_op.h new file mode 100644 index 00000000..43a8196e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/aot_only_var_handle_op.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
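The `Coordinator`/`QueueRunner` pair above mirrors the Python queue-runner workflow, and the class comment in `coordinator.h` sketches the same flow. A cleaned-up sketch follows; the `QueueRunnerDef` proto and the running `Session` are assumed to be created elsewhere.

```cpp
// Hedged sketch of driving a QueueRunner through a Coordinator, following the
// usage notes in coordinator.h above. `queue_runner_def` and `session` are
// assumed inputs.
#include <memory>

#include "tensorflow/cc/training/coordinator.h"
#include "tensorflow/cc/training/queue_runner.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/public/session.h"

absl::Status RunQueues(const tensorflow::QueueRunnerDef& queue_runner_def,
                       tensorflow::Session* session) {
  tensorflow::Coordinator coord;

  std::unique_ptr<tensorflow::QueueRunner> qr;
  TF_RETURN_IF_ERROR(
      tensorflow::QueueRunner::New(queue_runner_def, &coord, &qr));
  TF_RETURN_IF_ERROR(qr->Start(session));
  TF_RETURN_IF_ERROR(coord.RegisterRunner(std::move(qr)));

  // ... do training work while the queues fill ...

  TF_RETURN_IF_ERROR(coord.RequestStop());
  return coord.Join();
}
```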
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_AOT_AOT_ONLY_VAR_HANDLE_OP_H_
+#define TENSORFLOW_COMPILER_AOT_AOT_ONLY_VAR_HANDLE_OP_H_
+
+namespace tensorflow {
+namespace tfcompile {
+
+static constexpr const char* const kXlaAotOnlyVarHandleOp =
+    "_XlaAotOnlyVarHandleOp";
+
+}  // namespace tfcompile
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_AOT_AOT_ONLY_VAR_HANDLE_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/benchmark.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/benchmark.h
new file mode 100644
index 00000000..526c76c2
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/benchmark.h
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains benchmark functions used with the code-generated benchmarks that can
+// be used to test a model on android. See also code generation rules in
+// tfcompile.bzl.
+//
+// This is separate from the built-in micro-benchmarks, because we want to:
+// 1. show a binary with minimal dependencies, to show a close-to-lower-bound
+//    binary size.
+// 2. compile on Android.
+#ifndef TENSORFLOW_COMPILER_AOT_BENCHMARK_H_
+#define TENSORFLOW_COMPILER_AOT_BENCHMARK_H_
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tfcompile {
+namespace benchmark {
+
+// Options specifies options for benchmarks of functions generated by tfcompile.
+struct Options {
+  // kDefaultMicros specifies the default time to run the benchmark, and is used
+  // if neither max_iters nor max_micros is set.
+  static constexpr int64_t kDefaultMicros = 3000000;
+
+  int64_t max_iters = 0;   // Maximum iterations to run, ignored if <= 0.
+  int64_t max_micros = 0;  // Maximum microseconds to run, ignored if <= 0.
+};
+
+// Stats holds statistics collected during benchmarking.
+struct Stats {
+  std::vector<int64_t> per_iter_us;  // Per-iteration deltas in us.
+  int64_t total_us;                  // Total time in us.
+
+  Stats() : total_us(0) { per_iter_us.reserve(5000); }
+};
+
+// DumpStatsToStdout printfs to stdout stats in a multi-line human-friendly
+// form.
+void DumpStatsToStdout(const Stats& stats);
+
+// BenchmarkFn is the signature of the function generated by tfcompile.
+typedef std::function<void()> BenchmarkFn;
+
+// Benchmark runs a benchmark of the function `fn`, collecting stats in `stats`.
+// Use `options` to configure benchmarking options.
+void Benchmark(const Options& options, const BenchmarkFn& fn, Stats* stats); + +} // namespace benchmark +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_BENCHMARK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/codegen.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/codegen.h new file mode 100644 index 00000000..993196b1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/codegen.h @@ -0,0 +1,109 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_CODEGEN_H_ +#define TENSORFLOW_COMPILER_AOT_CODEGEN_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/aot/compile.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" + +namespace tensorflow { +namespace tfcompile { + +// CodegenOpts specifies code generation options for the generated header file +// and the generated metadata object file. +struct CodegenOpts { + // The name of the generated C++ class, wrapping the generated function. + string class_name; + + // Target triple for the architecture we're targeting. + string target_triple; + + // Namespaces specifies a list of C++ namespaces to add to the generated + // header. If empty, all symbols will be in the global namespace. + std::vector namespaces; + + // If true, generate name-to-index data for Lookup{Arg,Result}Index methods. + bool gen_name_to_index = false; + + // If true, generate program shape data for the ProgramShape method. + bool gen_program_shape = false; + + // If true, emit a serialized HloProfilePrinterData protobuf that can be used + // to pretty print HLO profile counters. + bool gen_hlo_profile_printer_data = false; + + // If true, sets this executable as an XLA Runtime one. + bool use_xla_runtime = false; +}; + +// Describes a generated metadata object file. +struct MetadataResult { + // These are top level "extern C" declarations that are expected to be visible + // wherever program_shape_access_shim is emitted. + std::vector header_variable_decls; + + // program_shape_access_shim is a C++ expression that constructs the + // xla::ProgramShapeProto instance for the CompileResult passed to + // GenerateMetadata. + string program_shape_access_shim; + + // hlo_profile_printer_data_access_shim is a C++ expression that constructs + // the xla::HloProfilePrinterData instance for the CompileResult passed to + // GenerateMetadata. If the xla::HloProfilePrinterData is null then this is a + // C++ expression that evaluates to nullptr at runtime. + string hlo_profile_printer_data_access_shim; + + // The contents of the object (".o") file. + string object_file_data; +}; + +// Generates a metadata object file according to `opts` and `compile_result`. +// The generated object file is returned via `metadata_result`. 
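`benchmark.h` above drives a timing loop over a tfcompile-generated function. A sketch follows; `MyComputation`/`computation` stand in for the generated class and are purely hypothetical.

```cpp
// Hedged sketch of the AOT benchmark harness declared above.
#include "tensorflow/compiler/aot/benchmark.h"

namespace benchmark = tensorflow::tfcompile::benchmark;

template <typename Computation>  // e.g. the tfcompile-generated class (assumed)
void BenchmarkComputation(Computation& computation) {
  benchmark::Options options;
  options.max_micros = 2000000;  // run for ~2 seconds instead of the 3s default

  benchmark::Stats stats;
  benchmark::Benchmark(options, [&]() { computation.Run(); }, &stats);
  benchmark::DumpStatsToStdout(stats);
}
```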
+absl::Status GenerateMetadata(const CodegenOpts& opts, + const CompileResult& compile_result, + MetadataResult* metadata_result); + +// GenerateHeader uses the meta-information from compile_result to generate a +// C++ header giving access to the function in the generated object file. The +// header includes API usage documentation. +// +// metadata_result is an instance of MetadataResult obtained by a previous +// invocation to GenerateMetadata. +absl::Status GenerateHeader(const CodegenOpts& opts, + const tf2xla::Config& config, + const CompileResult& compile_result, + const MetadataResult& metadata_result, + string* header); + +// ParseCppClass parses `cpp_class` into its `class_name` and `namespaces` +// components. The syntax is [[::],...]. This +// mirrors the C++ syntax for referring to a class, where multiple namespaces +// may precede the class name, separated by double-colons. +absl::Status ParseCppClass(const string& cpp_class, string* class_name, + std::vector* namespaces); + +// ValidateCppIdent returns OK iff ident is a valid C++ identifier. The msg is +// appended to error messages. +absl::Status ValidateCppIdent(absl::string_view ident, absl::string_view msg); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_CODEGEN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/compile.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/compile.h new file mode 100644 index 00000000..9d3ff78a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/compile.h @@ -0,0 +1,56 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_COMPILE_H_ +#define TENSORFLOW_COMPILER_AOT_COMPILE_H_ + +#include +#include + +#include "tensorflow/compiler/aot/flags.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "xla/service/cpu/cpu_compiler.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tfcompile { + +// CompileResult describes the output of CompileGraph, where the object file +// data and meta-information is available in aot. +struct CompileResult { + // Contains object file and meta-info. + std::unique_ptr aot; + xla::ProgramShapeProto program_shape; // Static shape of args and results. + string entry_point; // Name of generated function. + int pointer_size = 0; // Size of a pointer in bytes. +}; + +// CompileGraph compiles the graph_def into an object file containing a function +// that performs the graph operations. +// +// The XLA compilation options are specified in the flags. +absl::Status CompileGraph(GraphDef graph_def, const tf2xla::Config& config, + const MainFlags& flags, + CompileResult* compile_result); + +// The full compilation method, for reuse in a library setting. 
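+//
+// Library-style call sketch (paths and class name are illustrative):
+//
+//   MainFlags flags;
+//   flags.graph = "/path/to/graph.pb";
+//   flags.config = "/path/to/config.pbtxt";
+//   flags.cpp_class = "mynamespace::MyGraph";
+//   TF_RETURN_IF_ERROR(Main(flags));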
+absl::Status Main(const MainFlags& flags); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_COMPILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/embedded_protocol_buffers.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/embedded_protocol_buffers.h new file mode 100644 index 00000000..0af4d4a3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -0,0 +1,92 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines utilities to help "embed" protocol buffers into object +// (".o") files. These C++ binaries and shared objects can link in these .o to +// get access to said protocol buffers at runtime. + +#ifndef TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ +#define TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ + +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { +namespace tfcompile { +using absl::StatusOr; + +// Represents a set of protocol buffers embedded into an object file and +// describes how to access them at runtime. +struct EmbeddedProtocolBuffers { + // Each instance CPPShim describes how to generate C++ code to instantiate a + // protobuf instance from the corresponding static data emitted into the + // object file. + struct CPPShim { + // `expression` is a C++ expression that creates an instance of said + // protocol buffer when executed. + string expression; + + // `variable_decl` is an "extern C" array declaration that is used in + // `expression`. It must be visible wherever `expression` is emitted. + string variable_decl; + }; + + // Each cpp_shim corresponds to one embedded protocol buffer. + std::vector cpp_shims; + + // The contents of the object (".o") file the protocol buffers are embbed in. + // This needs to be linked in to any program that wants to execute any of the + // expressions in `cpp_shims`. + string object_file_data; +}; + +// Describes a protocol buffer to embed into an object file. +struct ProtobufToEmbed { + // `symbol_prefix` is prefix that is guaranteed to be unique across the binary + // or DSO the generated object file will be linked into. + string symbol_prefix; + + // `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++ + // namespace qualified) protocol buffer name. This is only used in + // CPPShim::expression so relatively qualified names are fine as long as + // they're valid wherever CPPShim::expression is emitted. + string qualified_cpp_protobuf_name; + + // `message` is the protocol buffer to be embedded. It is allowed to be + // nullptr, in which case the generated C++ shim expression is just `nullptr`, + // and the generated object file does not define any symbols. 
+ const ::tensorflow::protobuf::MessageLite* message; +}; + +// Embeds a sequence of protocol buffers into an object file. +// +// `target_triple` is the target triple for the target architecture for the +// generated object file. +// +// `protobufs_to_embed` describes the protocol buffers to embed into the +// resulting object file. The C++ shim for protobufs_to_embed[i] is +// cpp_shims[i] in the returned EmbeddedProtocolBuffers instance. The contents +// of all the protocol buffers are embedded into a single .o file whose content +// is stored in the object_file_data field in the returned +// EmbeddedProtocolBuffers instance. +absl::StatusOr CreateEmbeddedProtocolBuffers( + absl::string_view target_triple, + absl::Span protobufs_to_embed); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/flags.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/flags.h new file mode 100644 index 00000000..7b02f172 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/flags.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_FLAGS_H_ +#define TENSORFLOW_COMPILER_AOT_FLAGS_H_ + +#include +#include + +#include "tensorflow/core/util/command_line_flags.h" + +namespace tensorflow { +namespace tfcompile { + +// Flags for the tfcompile binary. See *.cc file for descriptions. + +struct MainFlags { + string graph; + string debug_info; + string debug_info_path_begin_marker; + string config; + bool dump_fetch_nodes = false; + string target_triple; + string target_cpu; + string target_features; + string entry_point; + string cpp_class; + string out_function_object; + string out_metadata_object; + string out_header; + string out_session_module; + string mlir_components; + bool experimental_quantize = false; + + // Sanitizer pass options + bool sanitize_dataflow = false; + string sanitize_abilists_dataflow; + + // C++ codegen options + bool gen_name_to_index = false; + bool gen_program_shape = false; +}; + +// Appends to flag_list a tensorflow::Flag for each field in MainFlags. +void AppendMainFlags(std::vector* flag_list, MainFlags* flags); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/aot/quantize.h b/third_party/tflite-hdrs/tensorflow/compiler/aot/quantize.h new file mode 100644 index 00000000..62f03808 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/aot/quantize.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_QUANTIZE_H_ +#define TENSORFLOW_COMPILER_AOT_QUANTIZE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "xla/hlo/builder/xla_computation.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tfcompile { + +using QuantizeXlaFn = std::function; + +// Set the static quantization function to the `fn` if it hasn't been set. +// Return false if the static function has been set. +bool RegisterQuantizeFn(const QuantizeXlaFn& fn); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_QUANTIZE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/build_xla_ops_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/build_xla_ops_pass.h new file mode 100644 index 00000000..c1219d7c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/build_xla_ops_pass.h @@ -0,0 +1,45 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_BUILD_XLA_OPS_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_BUILD_XLA_OPS_PASS_H_ + +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Replaces TF function calls marked with `_XlaCompiledKernel` with _XlaCompile +// and _XlaRun nodes (which compile and launch, respectively, the corresponding +// HLO module). +class BuildXlaOpsPass : public GraphOptimizationPass { + public: + // If enable_lazy_compilation is not nullopt then *enable_lazy_compilation + // overrides --tf_xla_enable_lazy_compilation flag in deciding whether lazy + // compilation is enabled. 
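+  //
+  // For example, BuildXlaOpsPass(/*enable_lazy_compilation=*/false) disables
+  // lazy compilation regardless of the flag, while a default-constructed
+  // BuildXlaOpsPass() defers to --tf_xla_enable_lazy_compilation.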
+ explicit BuildXlaOpsPass( + std::optional enable_lazy_compilation = std::nullopt) + : enable_lazy_compilation_(enable_lazy_compilation) {} + + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + private: + std::optional enable_lazy_compilation_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_BUILD_XLA_OPS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/clone_constants_for_better_clustering.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/clone_constants_for_better_clustering.h new file mode 100644 index 00000000..ebe51008 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/clone_constants_for_better_clustering.h @@ -0,0 +1,62 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_ +#define TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_ + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { +// Clones small host constants in the graph to make it easier to form larger +// clusters. +// +// This helps us in two ways: +// +// - It reduces dependencies between clusters. Let's say a constant C is used +// by nodes X and Y. If X and Y are put in different clusters (for whatever +// reason) Y's cluster now has to wait for all the operations in X's cluster +// to finish before it starts running. +// +// - It lets us create bigger clusters in multi-GPU benchmarks. Consider the +// following graph: +// +// digraph { +// Const -> GPU_1 +// Const -> GPU_0_Y +// GPU_0_X -> GPU_0_Y +// } +// +// We'd cluster Const and GPU_1 together (and place it on GPU_1), and this +// will block us from clustering GPU_0_X and GPU_0_Y together since that +// would increase the amount of work on GPU 0 waiting on work on GPU 1. +// However, cloning Const into two copies, one for GPU_0_Y and one for GPU_1 +// will let us create one cluster containing {Const/copy_0, GPU_1} and +// another containing {Const/copy_1, GPU_0_X, GPU_0_Y}. +// +// We only clone small host constants now to avoid increasing memory consumption +// too much. Moreover, in practice the constants we have to duplicate are +// things like the `perm` input to `Transpose` and the `size` input to `Slice` +// which tend to be small anyway. 
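+//
+// Standalone invocation sketch (normally the pass is run for you by the
+// optimization pass registry; the names below are illustrative):
+//
+//   GraphOptimizationPassOptions options;
+//   options.graph = &graph;        // std::unique_ptr<Graph>*
+//   options.flib_def = &flib_def;  // FunctionLibraryDefinition*
+//   CloneConstantsForBetterClusteringPass pass;
+//   TF_RETURN_IF_ERROR(pass.Run(options));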
+ +class CloneConstantsForBetterClusteringPass : public GraphOptimizationPass { + public: + CloneConstantsForBetterClusteringPass() = default; + + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/cluster_scoping_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/cluster_scoping_pass.h new file mode 100644 index 00000000..0b0c2ccf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/cluster_scoping_pass.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// This pass adds scopes to nodes in the _XlaInternalScope attribute to guide +// the later clustering passes. A major reason to do this is to prevent the +// clustering from losing critical parallelism in the Tensorflow graph, which +// can incur great performance degradation. +// +// This pass must be run before MarkForCompilationPass, as it stores the +// scoping information that MarkForCompilationPass will need to respect for +// clustering decision. +class ClusterScopingPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_CLUSTER_SCOPING_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/compilability_check_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/compilability_check_util.h new file mode 100644 index 00000000..0d86c22d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/compilability_check_util.h @@ -0,0 +1,340 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_ + +#include + +#include "absl/algorithm/container.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/device_util.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/service/graphcycles/graphcycles.h" +#include "xla/union_find.h" +#include "xla/util.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/memory_types.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/dump_graph.h" + +namespace tensorflow { +// Checks whether a TF node can be compiled or not. "Recursive" as in for call +// and functional while nodes it recursively checks whether the callee functions +// can be compiled. +class RecursiveCompilabilityChecker { + public: + // Contains node name and function name. If the node is not inside a function + // body, function name is an empty string. + struct StackFrame { + std::string name; + std::string function_name; + std::shared_ptr stack_trace; + }; + + // Contains information about uncompilable node inside a function body. + struct UncompilableNodeInfo { + std::string name; + // A list representing a stacktrace from the highest level node in + // increasing call depth to immediate node that fails the + // compilability checker. + std::vector stack_trace; + std::string uncompilable_reason; + }; + + // Aggregates information about what kinds of ops are allowed. + struct OperationFilter { // TODO(lzr): Add AllowEverything() helper. + // Whether resource variable ops are allowed are allowed in callees. We do + // not allow resource variable ops in called functions (either as direct TF + // calls or as higher order control flow ops) because we do not yet model + // their memory effects in jit/resource_operation_safety_analysis. + bool allow_resource_ops_in_called_functions = false; + + // Whether Stack operations are allowed. We avoid auto-clustering Stack + // operations in general because we do not support snapshotting them. + // + // TODO(b/112837194): This restriction can be lifted with some work. + bool allow_stack_ops = false; + + // Whether TensorArray operations are allowed. We avoid auto-clustering + // TensorArray operations in general because we do not support snapshotting + // them. 
+ // + // TODO(b/112837194): This restriction can be lifted with some work. + bool allow_tensor_array_ops = false; + + // Whether stateful RNG ops are allowed. XLA's RNG does not have the same + // seeding behavior as TensorFlow's RNG (b/34749654). So we avoid + // auto-clustering stateful RNG ops. + bool allow_stateful_rng_ops = false; + + // TODO(b/118970344): Whether ControlTrigger ops are allowed. It is unsound + // to cluster ControlTrigger because of how we use deadness analysis. + bool allow_control_trigger = false; + + // Whether it is okay to "cluster" Assert and CheckNumerics by simply + // removing them (they're not removed during clustering, but their + // XlaOpKernel is a no-op kernel). We avoid auto-clustering these ops so + // that the user is not surprised when XLA is implicitly enabled. If the + // user explicitly specifies to use XLA, it is fine to resort to a dummy + // implementation. Currently Assert and CheckNumerics ops have dummy XLA + // implementations. + bool allow_eliding_assert_and_checknumerics_ops = false; + + // Whether ops that produce or consume DT_VARIANT values are allowed. We + // don't auto-cluster these ops because we don't yet support live-in or + // live-out DT_VARIANT values. + bool allow_ops_producing_or_consuming_variant = false; + + // Whether ops known to be slow on XLA-GPU should be considered compilable. + bool allow_slow_ops = false; + + // Whether ops known to have numerical accuracy issues should be considered + // compilable.. + bool allow_inaccurate_ops = false; + + // Require the function to be always compilable, regardless whether some + // control flow branches might be dead for a given input. + bool require_always_compilable = false; + + // Whether string constants are compilable. + bool allow_string_consts = true; + + // Whether to allow the compilation of CollectiveReduceV2Op. + bool allow_collective_reduce_v2 = true; + + // Whether to allow the compilation of WhereOp. + bool allow_where_op = true; + + // Whether to allow the compilation of UniqueOp. Compilation of the UniqueOp + // generates output with bounded dynamic shape that may cause failures with + // auto clustering. + // TODO(b/209813421): Enable tf.unique during + // autoclustering once all failures are rfixed. + bool allow_unique_op = true; + + // Whether ops that are marked as outside compiled are always considered + // compilable. + // TODO(b/191502757): Make this behavior true by default and remove this + // option once inference converter supports outside compilation. + bool allow_outside_compiled = false; + }; + + RecursiveCompilabilityChecker(OperationFilter op_filter, + DeviceType jit_device_type) + : op_filter_(std::move(op_filter)), + jit_device_type_(std::move(jit_device_type)) {} + + using UncompilableNodesMap = + std::map>>; + + // Returns a map where the key is the function identifier(short debug + // string) of the function encapsulating the uncompilable nodes, and the + // value is a pair of NameAttrList of the function and a vector of + // uncompilable node info. When uncompilable node is not inside any + // function call nodes, then key is a ShortDebugString() of an empty + // NameAttrList. + // + // Also, when `node` is inside a function body, users can set + // `node_stack_trace` to provide an additional context for `node`'s + // placement within the outer most graph. 
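+  //
+  // Usage sketch (illustrative; `node` and `lib_runtime` come from the
+  // caller):
+  //
+  //   RecursiveCompilabilityChecker::OperationFilter filter;
+  //   RecursiveCompilabilityChecker checker(filter,
+  //                                         DeviceType(DEVICE_GPU_XLA_JIT));
+  //   auto uncompilable = checker.FindUncompilableNodes(*node, lib_runtime);
+  //   for (const auto& [function_id, nodes_in_function] : uncompilable) {
+  //     // Inspect nodes_in_function.second for per-node reasons.
+  //   }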
+ UncompilableNodesMap FindUncompilableNodes( + const Node& node, FunctionLibraryRuntime* lib_runtime, + const std::vector* node_stack_trace = nullptr) const; + + // Returns true if `node` can be compiled by XLA. + bool IsCompilableNode(const Node& node, + FunctionLibraryRuntime* lib_runtime) const { + std::vector stack_trace; + stack_trace.emplace_back(StackFrameView{node.name(), ""}); + return IsCompilableNode(node, lib_runtime, &stack_trace); + } + + // Returns true if XLA supports this Op, but we don't want to cluster it (ie: + // due to performance or correctness concerns). + bool OpIsInaccurate(const Node& node) const; + bool OpIsSlow(const Node& node) const; + + private: + struct StackFrameView { + absl::string_view name; + absl::string_view function_name; + std::shared_ptr stack_trace; + }; + + bool IsCompilableNode( + const Node& node, FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function = nullptr, + UncompilableNodesMap* uncompilable_nodes = nullptr) const; + bool IsCompilableCall( + const NodeDef& call_def, FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function = nullptr, + UncompilableNodesMap* uncompilable_nodes = nullptr) const; + bool IsCompilableIf(const Node& if_node, FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; + bool IsCompilableWhile(const Node& while_node, + FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; + + // Tests whether 'case_node' is compilable. Every operator in all branches + // must be compilable. + bool IsCompilableCase(const Node& case_node, + FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; + + // Returns compilability of node def retrieved from `node`'s attribute with + // name `attr_name`. 
+ bool ExtractNodeDefAndCheckCompilability( + const Node& node, const std::string& attr_name, + const std::string& call_name, NameAttrList* encapsulating_function, + FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + UncompilableNodesMap* uncompilable_nodes) const; + + bool IsStackOp(const Node& node) const { + const XlaResourceOpInfo* op_info = + GetResourceOpInfoForOp(node.type_string()); + return op_info && op_info->resource_kind() == XlaResourceKind::kStack; + } + + bool IsTensorArrayOp(const Node& node) const { + const XlaResourceOpInfo* op_info = + GetResourceOpInfoForOp(node.type_string()); + return op_info && op_info->resource_kind() == XlaResourceKind::kTensorArray; + } + + bool IsAssertOrCheckNumerics(absl::string_view op_name) const { + return op_name == "Assert" || op_name == "CheckNumerics"; + } + + bool IsStatefulRandomOp(absl::string_view op_name) const { + return op_name == "RandomUniform" || op_name == "RandomShuffle" || + op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" || + op_name == "TruncatedNormal" || op_name == "Multinomial"; + } + + bool OpProducesOrConsumesVariant(const Node& node) const { + auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; }; + return absl::c_any_of(node.input_types(), is_variant) || + absl::c_any_of(node.output_types(), is_variant); + } + + bool HasXLAKernel(const Node& node, + string* uncompilable_reason = nullptr) const; + + static void MaybeMarkUncompilableNode( + const absl::string_view reason, + const std::vector& stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes_map); + + // Make sure we don't recurse infinitely on recursive functions. + const size_t kMaxRecursionDepth = 50; + + const OperationFilter op_filter_; + const DeviceType jit_device_type_; +}; + +RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( + const XlaOpRegistry::DeviceRegistration& registration); + +// Given a FunctionLibraryRuntime and a `function`, returns this function's body +// in `fbody` as well as the indices of its constant and resource arguments. +// `fbody` is owned by `flr`. +// `constant_arg_indices` and `resource_arg_indices` should be empty vector. +// They are sorted in ascending order on this function's return. +absl::Status GetBodyAndConstantsAndResources( + FunctionLibraryRuntime* flr, const NameAttrList& function, + const FunctionBody** fbody, std::vector* constant_arg_indices, + std::vector* resource_arg_indices); + +// Given a NodeDef `node_def` returns true iff `node_def` has kXlaCompileAttr +// set. +bool CanCreateXlaKernel(const NodeDef& node_def); + +// Returns memory types for the input. +// `constant_arg_indices` and `resource_arg_indices` are sorted arrays of +// indices corresponding to constant and resource arguments respectively. +// +// One might wonder, about the case where a compile-time constant argument +// (which must be in host memory) is also used as an input into an op, +// e.g. `Add`, that expects its inputs in device memory. Here is how it +// works now. +// First, what do we mean by "op expects an input in XYZ memory"? +// There are two types of "ops" here: the tf2xla kernel and the HLO +// computation it builds. The tf2xla kernel needs to retrieve the actual +// numeric value of the compile-time constant tensors, so it really expects +// them to be on in host memory. 
However, for other inputs, it refers to them +// using xla::ComputationDataHandle, which is just a symbolic handle that +// xla::ComputationBuilder assigns. How does this handle gets assigned for +// constant arguments? Even constant arguments get an _Arg node in the graph +// instantiated for Function compilation. The tf2xla kernel for constant _Arg +// nodes takes the constant value, converts it to XlaLiteral, and feeds it +// to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This +// constant XlaLiteral is included in the HLO graph, and subsequently, in +// the actual executable, which is copied to the device before being +// executed. Thus, when this executable runs, the constant is available in +// device memory. +tensorflow::MemoryTypeVector GetInputMemoryTypes( + const tensorflow::FunctionBody* fbody, + absl::Span constant_arg_indices, + absl::Span resource_arg_indices); + +// Returns output memory types. +// +// XlaLaunch kernel keeps all outputs (including constants, which it copies), +// in device memory except for resources. +tensorflow::MemoryTypeVector GetOutputMemoryTypes( + const tensorflow::FunctionBody* fbody); + +// Check whether graph can trigger XLA compilation. +bool CanTriggerXlaCompilation(const GraphDef& graph); + +// Returns true iff the node can trigger XLA compilation. +bool NodeCanTriggerXlaCompilation(const NodeDef& node); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis.h new file mode 100644 index 00000000..80fa9a20 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis.h @@ -0,0 +1,99 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// This analyzes a TensorFlow graph to identify nodes which may have partially +// dead inputs (i.e. these nodes may have some dead inputs and some alive +// inputs). +// +// For example, the ADD node in the following graph +// +// V0 PRED0 V1 PRED1 +// | | | | +// v v v v +// SWITCH SWITCH +// | | +// +---+ + ---+ +// | | +// v v +// ADD +// +// can have its inputs independently dead or alive based on the runtime values +// of PRED0 and PRED1. +// +// It is tempting to call this a liveness analysis but I avoided that because +// "liveness" already has other connotations. +class DeadnessAnalysis { + public: + // An opaque representation of a predicate. DeadnessPredicate + // instances that compare equal via operator== represent predicates + // that always evaluate to the same value. 
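+  //
+  // Typical use (sketch): run the analysis over a graph, then compare the
+  // predicates of two output edges for equality.
+  //
+  //   std::unique_ptr<DeadnessAnalysis> deadness;
+  //   TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(graph, &deadness));
+  //   TF_ASSIGN_OR_RETURN(auto p0, deadness->GetPredicateFor(node_a, 0));
+  //   TF_ASSIGN_OR_RETURN(auto p1, deadness->GetPredicateFor(node_b, 0));
+  //   if (p0 == p1) { /* both outputs are alive or dead together */ }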
+ struct DeadnessPredicate { + public: + DeadnessPredicate(const DeadnessPredicate&) = default; + DeadnessPredicate(DeadnessPredicate&&) = default; + + DeadnessPredicate& operator=(const DeadnessPredicate&) = default; + DeadnessPredicate& operator=(DeadnessPredicate&&) = default; + + bool operator==(const DeadnessPredicate& other) const { + return other.pred_ == pred_; + } + + bool operator!=(const DeadnessPredicate& other) const { + return other.pred_ != pred_; + } + + private: + explicit DeadnessPredicate(void* pred) : pred_(pred) {} + + // This is really a Predicate*, but we don't want to expose that + // implementation detail to our clients. `pred_` has pointer equality so we + // can just compare the pointer in operator== and operator!=. + void* pred_; + + friend class DeadnessAnalysis; + }; + + virtual absl::StatusOr GetPredicateFor(Node* n, + int oidx) const = 0; + + // Prints out the internal state of this instance. For debugging purposes + // only. + virtual void Print() const = 0; + virtual ~DeadnessAnalysis(); + + string DebugString(DeadnessPredicate predicate) const; + + // Run the deadness analysis over `graph` and returns an error or a populated + // instance of DeadnessAnalysis in `result`. + static absl::Status Run(const Graph& graph, + std::unique_ptr* result); + + protected: + static DeadnessPredicate MakeDeadnessPredicate(void* pred) { + return DeadnessPredicate(pred); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis_internal.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis_internal.h new file mode 100644 index 00000000..0dc18d3e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/deadness_analysis_internal.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ +#define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/graph/tensor_id.h" + +namespace tensorflow { +namespace deadness_analysis_internal { + +// Returns a map describing the predicate each Tensor was mapped to. For +// testing purposes only. +using PredicateMapTy = absl::flat_hash_map; +absl::Status ComputePredicates(const Graph& graph, + PredicateMapTy* out_predicate_map, + bool enable_optimistic = true); + +} // namespace deadness_analysis_internal +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/defs.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/defs.h new file mode 100644 index 00000000..58bd4bdd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/defs.h @@ -0,0 +1,49 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Provides definitions needed for use of the TensorFlow XLA +// device. + +#ifndef TENSORFLOW_COMPILER_JIT_DEFS_H_ +#define TENSORFLOW_COMPILER_JIT_DEFS_H_ + +namespace tensorflow { + +// Name of attribute used to tag operators for compilation with XLA + +// Implies must-compile semantics: either it will be compiled +// with XLA, or an error will be thrown. +extern const char* const kXlaMustCompileAttr; // "_XlaMustCompile" + +// Implies auto-clustering: tagged nodes will be clustered and compiled with XLA +// on a best-effort basis. +extern const char* const kXlaCompileAttr; // "_XlaCompile" + +// Implies auto-clustering within the given scope. +extern const char* const kXlaScopeAttr; // "_XlaScope" +extern const char* const kXlaInternalScopeAttr; // "_XlaInternalScope" + +// The id of the compiled cluster. +extern const char* const kXlaClusterIdAttr; // "_xla_compile_id" + +[[deprecated("XLA:CPU/GPU devices are deprecated")]] void +RequestXlaDevicesCreation(); + +[[deprecated("XLA:CPU/GPU devices are deprecated")]] bool +XlaDevicesCreationRequired(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEFS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cache.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cache.h new file mode 100644 index 00000000..ad871349 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cache.h @@ -0,0 +1,258 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CACHE_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CACHE_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/device_compilation_cluster_signature.h" +#include "tensorflow/compiler/jit/xla_compile_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace device_compilation_cache_internal { +template +int64_t ExecutableSize(const ExecutableType* executable) { + return 0; +} + +template <> +inline int64_t ExecutableSize( + const xla::LocalExecutable* executable) { + if (executable != nullptr && executable->executable() != nullptr) { + return executable->executable()->SizeOfGeneratedCodeInBytes(); + } + + return 0; +} + +template <> +inline int64_t ExecutableSize( + const xla::PjRtLoadedExecutable* executable) { + if (executable != nullptr) { + return executable->SizeOfGeneratedCodeInBytes(); + } + + return 0; +} +} // namespace device_compilation_cache_internal + +// Cache to store compiled HLO, executables and related metadata keyed by +// `DeviceCompilationClusterSignature`. The cache owns the stored +// CompilationResults and Executables. +// Currently no cache eviction policy is implemented and the cache grows without +// bound. +template +class DeviceCompilationCache { + public: + DeviceCompilationCache() = default; + ~DeviceCompilationCache() = default; + + using Key = DeviceCompilationClusterSignature; + struct Value { + DeviceCompileState compile_state = DeviceCompileState::kUncompiled; + absl::Status compilation_status; + int64_t request_count = 0; + const XlaCompiler::CompilationResult* compilation_result = nullptr; + ExecutableType* executable = nullptr; + }; + + // Returns std::nullopt if value for the supplied key is not found. If a value + // is found, `request_count` is incremented before returning the value. + std::optional Lookup(const Key& key) const; + + // Inserts an empty value if value is not found and returns it. If a value is + // found, `request_count` is incremented before returning the value. + Value LookupOrCreate(const Key& key); + + // Caches `compile_state`, `compilation_status`, `compilation_result` and + // `executable` and associates them with the provided `key`. Takes ownership + // of `compilation_result` and `executable`. Does not increment the + // corresponding `request_count`. Only arguments that are not std::nullopt are + // updated in the cache. + void Store(const Key& key, std::optional compile_state, + std::optional compilation_status, + std::optional> + compilation_result, + std::optional> executable); + + std::string DebugString() const; + + private: + // The value associated with a cache entry. + struct Entry { + mutable mutex mu; + + // The current compilation state for this entry. + DeviceCompileState compile_state TF_GUARDED_BY(mu) = + DeviceCompileState::kUncompiled; + + // The number of times a compilation with this signature has been requested. + int64_t request_count TF_GUARDED_BY(mu) = 0; + + // Did compilation succeed? + absl::Status compilation_status TF_GUARDED_BY(mu); + + // Output of the XlaCompiler. + std::unique_ptr compilation_result + TF_GUARDED_BY(mu); + + // The XLA executable compiled from . May be null if no + // executable has been built. 
+ std::unique_ptr executable TF_GUARDED_BY(mu); + + std::string DebugString() const { + mutex_lock lock(mu); + + int64_t executable_size = + device_compilation_cache_internal::ExecutableSize( + executable.get()); + + int64_t hlo_module_size = 0; + if (compilation_result != nullptr && + compilation_result->computation != nullptr) { + hlo_module_size = + compilation_result->computation->proto().ByteSizeLong(); + } + + return absl::StrCat( + "{compile_state: ", compile_state, ", request_count: ", request_count, + ", compilation_status: ", compilation_status.ToString(), + ", compilation_result?: ", compilation_result != nullptr, + ", hlo_module_size: ", hlo_module_size, " bytes", + ", executable?: ", executable != nullptr, + ", executable_size: ", executable_size, " bytes}"); + } + }; + + mutable mutex compile_cache_mu_; + absl::flat_hash_map, Key::Hash> cache_ + TF_GUARDED_BY(compile_cache_mu_); + + DeviceCompilationCache(const DeviceCompilationCache&) = delete; + void operator=(const DeviceCompilationCache&) = delete; +}; + +template +std::optional::Value> +DeviceCompilationCache::Lookup(const Key& key) const { + // The outer lock protects the existence of the cache entry. It does not + // protect the contents of the cache entry. + Entry* entry; + { + mutex_lock lock(compile_cache_mu_); + // Find cache entry. + auto it = cache_.find(key); + if (it == cache_.cend()) { + return std::nullopt; + } + + entry = it->second.get(); + } + + mutex_lock lock(entry->mu); + Value value = {/*compile_state=*/entry->compile_state, + /*compilation_status=*/entry->compilation_status, + /*request_count=*/++entry->request_count, + /*compilation_result=*/entry->compilation_result.get(), + /*executable=*/entry->executable.get()}; + return value; +} + +template +typename DeviceCompilationCache::Value +DeviceCompilationCache::LookupOrCreate(const Key& key) { + // The outer lock protects the existence of the cache entry. It does not + // protect the contents of the cache entry. + Entry* entry; + { + mutex_lock lock(compile_cache_mu_); + // Emplace empty cache entry if not found. + auto it = cache_.emplace(key, std::make_unique()).first; + entry = it->second.get(); + } + + mutex_lock lock(entry->mu); + Value value = {/*compile_state=*/entry->compile_state, + /*compilation_status=*/entry->compilation_status, + /*request_count=*/++entry->request_count, + /*compilation_result=*/entry->compilation_result.get(), + /*executable=*/entry->executable.get()}; + return value; +} + +template +void DeviceCompilationCache::Store( + const Key& key, std::optional compile_state, + std::optional compilation_status, + std::optional> + compilation_result, + std::optional> executable) { + Entry* entry; + { + mutex_lock lock(compile_cache_mu_); + // Emplace empty cache entry if not found. 
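+    // Note: when `key` is already present, emplace() leaves the map unchanged
+    // and simply returns the existing entry (the freshly allocated Entry
+    // temporary is discarded), so this line doubles as the lookup.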
+ auto it = cache_.emplace(key, std::make_unique()).first; + entry = it->second.get(); + } + + { + mutex_lock lock(entry->mu); + if (compile_state.has_value()) { + entry->compile_state = *compile_state; + } + if (compilation_status.has_value()) { + entry->compilation_status = *compilation_status; + } + if (compilation_result.has_value()) { + entry->compilation_result = std::move(*compilation_result); + } + if (executable.has_value()) { + entry->executable = std::move(*executable); + } + } + + VLOG(4) << "Added/updated cache entry: key=" << key.HumanString() + << ", entry=" << entry->DebugString(); +} + +template +std::string DeviceCompilationCache::DebugString() const { + std::string s = "DeviceCompilationCache {\n"; + { + mutex_lock lock(compile_cache_mu_); + for (const auto& [key, entry] : cache_) { + absl::StrAppend(&s, key.HumanString(), " : ", entry->DebugString(), + ",\n"); + } + } + absl::StrAppend(&s, "}"); + + return s; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cluster_signature.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cluster_signature.h new file mode 100644 index 00000000..4acea2a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_cluster_signature.h @@ -0,0 +1,56 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CLUSTER_SIGNATURE_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CLUSTER_SIGNATURE_H_ + +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" + +namespace tensorflow { + +// Describes the types, shapes and any compile-time constant arguments +// to a kernel. Key that uniquely identifies a compilation output. +struct DeviceCompilationClusterSignature { + // Name of the cluster, built from the function name and it's attributes. + string name; + + // List of args (either as a TensorTypeAndShape or as a Tensor value) + // for compile-time constant arguments to the compilation, ordered by + // argument number. Tensors must be in host memory. + using TensorTypeAndShape = + std::pair>; + absl::InlinedVector, 8> args; + + bool operator==(const DeviceCompilationClusterSignature& other) const; + + struct Hash { + uint64 operator()(const DeviceCompilationClusterSignature& signature) const; + }; + + // Returns a human-readable description of the signature. + string HumanString() const; + + // Builds the signature for a compilation. 
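+  //
+  // For example (sketch; `fn` and `args` come from the surrounding
+  // compilation request):
+  //
+  //   TF_ASSIGN_OR_RETURN(
+  //       auto sig, DeviceCompilationClusterSignature::Build(fn, args));
+  //   VLOG(2) << "Compiling cluster " << sig.HumanString();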
+ static absl::StatusOr Build( + const NameAttrList& function, + absl::Span args); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_CLUSTER_SIGNATURE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_profiler.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_profiler.h new file mode 100644 index 00000000..9f1d9521 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compilation_profiler.h @@ -0,0 +1,101 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_PROFILER_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_PROFILER_H_ + +#include +#include + +#include "tensorflow/compiler/jit/xla_compile_util.h" +#include "tensorflow/core/framework/attr_value.pb.h" + +namespace tensorflow { + +// Tracks statistics for device compilation and uses these to determine whether +// the given cluster should be compiled or not. +class DeviceCompilationProfiler : public ResourceBase { + public: + DeviceCompilationProfiler() = default; + ~DeviceCompilationProfiler() override; + + struct ClusterCompileStats { + // Number of times the cluster has been (re-)compiled. + int64_t compile_count = 0; + + // The number of times this cluster has been executed. + int64_t execution_count = 0; + + // Cumulative time spent compiling the cluster. + int64_t cumulative_compile_time_us = 0; + + // True if we have decided that this cluster is too dynamic (i.e. its shapes + // change too frequently) to profitably JIT compile. Once a cluster is + // tagged megamorphic, it stays megamorphic forever. + bool is_megamorphic = false; + + std::string DebugString() const { + return absl::StrCat( + "DeviceCompilationProfiler::ClusterCompileStats {compile_count=", + compile_count, ", execution_count=", execution_count, + ", cumulative_compile_time_us=", cumulative_compile_time_us, + ", is_megamorphic=", is_megamorphic, "}"); + } + }; + + // Returns the compilation statistics for the given cluster. + absl::StatusOr GetCompileStats( + const NameAttrList& function) const; + + // Determines whether the cluster should be compiled. Creates and inserts an + // entry into stats (also calls `RegisterExecution`) for `function` if it + // doesn't already exist. + virtual bool ShouldCompileCluster(const NameAttrList& function, + DeviceCompileMode compile_mode, + int64_t current_request_count); + + // Registers a cluster execution. Increments the execution count for the given + // cluster and also determines whether the cluster has gone megamorphic (and + // sets the megamorphic bit accordingly). + void RegisterExecution(const NameAttrList& function); + + // Registers a cluster compilation. Increments the compilation count and + // accumulates the compile time for the given cluster. Also broadcasts an + // XlaJitCompilationActivity. 
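+  //
+  // Typical flow (sketch; `function`, `request_count` and the compile step
+  // are supplied by the caller):
+  //
+  //   profiler->RegisterExecution(function);
+  //   if (profiler->ShouldCompileCluster(function, DeviceCompileMode::kLazy,
+  //                                      request_count)) {
+  //     // ... compile the cluster, measuring compile_time_us ...
+  //     TF_RETURN_IF_ERROR(profiler->RegisterCompilation(
+  //         function, compile_time_us, /*used_persistent_cache=*/false));
+  //   }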
+ virtual absl::Status RegisterCompilation(const NameAttrList& function, + int64_t compile_time_us, + bool used_persistent_cache); + + void IncrementOngoingAsyncCompilations(); + void DecrementOngoingAsyncCompilations(); + int64_t GetNumOngoingAsyncCompilations() const; + std::string DebugString() const override; + + private: + mutable mutex mu_; + + // Maps cluster names to compilation statistics for said cluster. + absl::flat_hash_map cluster_compile_stats_ + TF_GUARDED_BY(mu_); + + int64_t num_ongoing_compilations_ TF_GUARDED_BY(mu_) = 0; + + DeviceCompilationProfiler(const DeviceCompilationProfiler&) = delete; + void operator=(const DeviceCompilationProfiler&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_COMPILATION_PROFILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler.h new file mode 100644 index 00000000..1baa7085 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler.h @@ -0,0 +1,504 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/call_once.h" +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/jit/device_compilation_cache.h" +#include "tensorflow/compiler/jit/device_compilation_cluster_signature.h" +#include "tensorflow/compiler/jit/device_compilation_profiler.h" +#include "tensorflow/compiler/jit/device_compiler_client.h" +#include "tensorflow/compiler/jit/device_executable_persistor.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/tf_graph_to_hlo_compiler.h" +#include "tensorflow/compiler/jit/xla_compile_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/local_client.h" +#include "tensorflow/core/framework/metrics.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// Compiles/lowers a given Tensorflow graph/function/cluster into a compiled XLA +// compilation (HLO) using the XlaCompiler and compiles the resulting +// XlaCompilationResult into an `ExecutableType` (eg. xla::LocalExecutable) by +// calling `ClientType` (eg. xla::LocalClient). +// +// Caches the compiled XlaCompilationResult and Executable using a +// DeviceCompilationCache. Compilation is done only when there's a cache miss. +// +// Uses the DeviceExecutablePersistor class for persistence and tries to load a +// serialized executable from disk upon a request for compilation. 
If the +// appropriate executable isn't found on disk, compiles the given Tensorflow +// function/graph/cluster into an XlaCompilationResult (HLO) and +// `ExecutableType` and tries saving/persisting the compiled HLO and executable +// to disk. +// +// Since XLA computations must have static shapes, DeviceCompiler generates a +// new XLA computation for each new set of input shapes. +// TODO(b/255826209): De-templatize once we've moved to Device API completely. +template +class DeviceCompiler : public ResourceBase { + public: + DeviceCompiler( + std::unique_ptr> + persistor, + std::unique_ptr> + compiler_client); + ~DeviceCompiler() override; + + enum class CompileScope { + kOp, + kFunction, + }; + + // Compiles a function into a XlaCompiler::CompilationResult that can be used + // to execute an XLA Computation. Compilation results are cached. Compilation + // is skipped if there is a cache hit. `function` is the name of a Tensorflow + // function to compile. `args` is a description of the arguments to the + // computation. + // + // `compile_mode` controls the behavior of the compilation cache on a cache + // miss. If `compile_mode` is `kLazy` then, based on some profitability + // heuristics, the compilation cache may decide not to compile the cluster at + // this time. In this case it returns null into both `out_compilation_result` + // and `out_executable`. If `compile_mode` is `kStrict` then the compilation + // cache always attempts the compilation on a cache miss. If compilation mode + // is 'kAsync' compilation of the cluster happens in the background while the + // fallback path executes. + // + // The result of compilation is written to `*out_compilation_result`, which + // must be non-null. If `out_executable` is non-null, also builds an + // `ExecutableType` and sets `out_executable` to point to it. The + // resulting executable pointer may be null if the computation has no + // non-constant outputs. + absl::Status CompileIfNeeded( + const XlaCompiler::Options& options, const NameAttrList& function, + const std::vector& args, + const XlaCompiler::CompileOptions& compile_options, + DeviceCompileMode compile_mode, DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable); + + // As above, but for a single op. + absl::Status CompileSingleOpIfNeeded( + const XlaCompiler::Options& options, + const std::vector& args, + const XlaCompiler::CompileOptions& compile_options, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable); + + ClientType* client() const { return compiler_client_->client(); } + const DeviceType& device_type() const { return persistor_->device_type(); } + DeviceCompilationCache* cache() { return cache_.get(); } + DeviceExecutablePersistor* persistor() { + return persistor_.get(); + } + DeviceCompilerClient* compiler_client() { + return compiler_client_.get(); + } + + string DebugString() const override; + + private: + // Common implementation of Compile and CompileSingleOp. The `OpKernelContext` + // parameter is always null for the former. 
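As a usage sketch (not part of the vendored header), a kLazy compile request against the cache, instantiated with the xla::LocalExecutable / xla::LocalClient pair that the class comment gives as an example; all arguments are assumed to be supplied by the surrounding op kernel.

#include <vector>

#include "tensorflow/compiler/jit/device_compiler.h"

namespace tensorflow {

using XlaDeviceCompiler = DeviceCompiler<xla::LocalExecutable, xla::LocalClient>;

absl::Status CompileClusterLazily(
    XlaDeviceCompiler* device_compiler, const XlaCompiler::Options& options,
    const NameAttrList& function,
    const std::vector<XlaCompiler::Argument>& args,
    const XlaCompiler::CompileOptions& compile_options,
    DeviceCompilationProfiler* profiler) {
  const XlaCompiler::CompilationResult* compilation_result = nullptr;
  xla::LocalExecutable* executable = nullptr;
  TF_RETURN_IF_ERROR(device_compiler->CompileIfNeeded(
      options, function, args, compile_options, DeviceCompileMode::kLazy,
      profiler, &compilation_result, &executable));
  if (compilation_result == nullptr) {
    // kLazy declined to compile on this request; the caller falls back to the
    // regular TF executor path.
    VLOG(2) << "Deferring compilation of " << function.name();
  }
  return absl::OkStatus();
}

}  // namespace tensorflow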
+ absl::Status CompileImpl( + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, const NameAttrList& function, + const std::vector& args, CompileScope scope, + DeviceCompileMode compile_mode, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable); + + StatusOr::Value> + CompileStrict( + const DeviceCompilationClusterSignature& sig, + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, + const std::vector& args, + const NameAttrList& function, + typename DeviceCompilationCache::Value cache_value, + CompileScope scope, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, mutex* mu) + TF_EXCLUSIVE_LOCKS_REQUIRED(*mu); + + absl::Status CompileAsynchronous( + const DeviceCompilationClusterSignature& sig, + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, + const std::vector& args, + const NameAttrList& function, CompileScope scope, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler); + + std::unique_ptr> + persistor_; + std::unique_ptr> + compiler_client_; + std::unique_ptr> cache_; + + // Pool of threads for asynchronous compilations. + std::unique_ptr async_compiler_threads_; + + mutex cluster_mutexes_mu_; + absl::flat_hash_map, + DeviceCompilationClusterSignature::Hash> + cluster_mutexes_ TF_GUARDED_BY(cluster_mutexes_mu_); + + DeviceCompiler(const DeviceCompiler&) = delete; + void operator=(const DeviceCompiler&) = delete; +}; + +namespace device_compiler_internal { +// Print something that users can search for to definitively ascertain that XLA +// was used for their TF model. +// Prints only once to avoid spamming LOG(INFO). +inline void LogOnceXlaCompiledFirstCluster() { + static absl::once_flag log_once; + absl::call_once(log_once, [] { + LOG(INFO) << "Compiled cluster using XLA! This line is logged at most " + "once for the lifetime of the process."; + }); +} + +template +inline absl::Status EligibleToPersist(DeviceCompileState compile_state, + const ExecutableType* executable) { + if (compile_state != DeviceCompileState::kCompiled) { + return errors::FailedPrecondition( + "Cache entry to serialize is not compiled."); + } + if (executable == nullptr) { + return errors::FailedPrecondition( + "LocalExecutable not found for cache entry to serialize."); + } + return absl::OkStatus(); +} +} // namespace device_compiler_internal + +template +DeviceCompiler::DeviceCompiler( + std::unique_ptr> + persistor, + std::unique_ptr> + compiler_client) + : persistor_(std::move(persistor)), + compiler_client_(std::move(compiler_client)) { + cache_ = std::make_unique>(); + async_compiler_threads_ = std::make_unique( + tensorflow::Env::Default(), "async_compiler_threads", + kNumAsyncDeviceCompilerThreads); +} + +template +DeviceCompiler::~DeviceCompiler() { + // Since programs are owned by the cache, ensure any use of our programs have + // completed by waiting for all stream executors to complete. + compiler_client_->WaitForProgramsToFinish(); + // Wait for all outstanding compilations to finish. + // Resetting the pointer explicitly in the top level destructor. + // Without this, the pointer would be reset when the AsyncCompilationState + // is destructed, which is dependent on the order of the members in the + // DeviceCompiler class, which is error prone if the order changes. 
+ async_compiler_threads_.reset(); + // TODO(b/110813685): Think about the program ownership model. Programs are + // currently owned by the compilation cache which means we must wait for + // program completion in the destructor. There are multiple compilation caches + // around, which complicates things a little. Perhaps having programs be + // shared_ptrs (an invasive change) would make the model easier to reason + // about? +} + +template +string DeviceCompiler::DebugString() const { + return "DeviceCompiler"; +} + +template +absl::Status DeviceCompiler::CompileIfNeeded( + const XlaCompiler::Options& options, const NameAttrList& function, + const std::vector& args, + const XlaCompiler::CompileOptions& compile_options, + DeviceCompileMode compile_mode, DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable) { + return CompileImpl(compile_options, options, function, args, + CompileScope::kFunction, compile_mode, /*ctx=*/nullptr, + profiler, out_compilation_result, out_executable); +} + +template +absl::Status +DeviceCompiler::CompileSingleOpIfNeeded( + const XlaCompiler::Options& options, + const std::vector& args, + const XlaCompiler::CompileOptions& compile_options, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable) { + const NodeDef& def = ctx->op_kernel().def(); + NameAttrList name; + name.set_name(def.op()); + *name.mutable_attr() = def.attr(); + // Remove the "_class" attribute from the attribute set used to create the + // compilation cache key. This attribute is information for the colocator + // and causes false uniqueness between nodes. + name.mutable_attr()->erase("_class"); + return CompileImpl(compile_options, options, name, args, CompileScope::kOp, + DeviceCompileMode::kStrict, ctx, profiler, + out_compilation_result, out_executable); +} + +template +StatusOr::Value> +DeviceCompiler::CompileStrict( + const DeviceCompilationClusterSignature& sig, + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, + const std::vector& args, + const NameAttrList& function, + typename DeviceCompilationCache::Value cache_value, + CompileScope scope, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, mutex* mu) { + tensorflow::Env* env = tensorflow::Env::Default(); + const uint64 compile_start_us = env->NowMicros(); + + TfGraphToHloCompiler compiler(options); + cache_value.compile_state = DeviceCompileState::kCompiled; + + std::unique_ptr out_executable; + auto out_compilation_result = + std::make_unique(); + + if (scope == CompileScope::kOp) { + cache_value.compilation_status = compiler.CompileSingleOp( + compile_options, ctx, args, out_compilation_result.get()); + } else { + CHECK(scope == CompileScope::kFunction); // Crash OK + cache_value.compilation_status = compiler.Compile( + compile_options, function, args, out_compilation_result.get()); + } + TF_RETURN_IF_ERROR(cache_value.compilation_status); + TF_RET_CHECK(cache_value.executable == nullptr); + TF_RET_CHECK(out_compilation_result->computation != nullptr); + + auto loaded_executable = persistor_->TryToLoadExecutable( + DeviceCompilationClusterSignature::Hash()(sig), sig.HumanString(), + options, *out_compilation_result, compiler_client_.get()); + + if (loaded_executable.has_value()) { + cache_value.compilation_status = loaded_executable->status(); + if (loaded_executable->ok()) { + out_executable = 
*std::move(*loaded_executable); + metrics::UpdatePersistentCacheLoadCount(); + } + } else { + auto built_executable = + compiler_client_->BuildExecutable(options, *out_compilation_result); + TF_RETURN_IF_ERROR(built_executable.status()); + out_executable = *std::move(built_executable); + + TF_RETURN_IF_ERROR( + device_compiler_internal::EligibleToPersist( + cache_value.compile_state, out_executable.get())); + TF_RETURN_IF_ERROR(persistor_->TryToPersistExecutable( + DeviceCompilationClusterSignature::Hash()(sig), sig.HumanString(), + options, *out_compilation_result, *out_executable, + compiler_client_.get())); + } + + cache_value.compilation_result = out_compilation_result.get(); + cache_value.executable = out_executable.get(); + cache_->Store(sig, cache_value.compile_state, cache_value.compilation_status, + std::move(out_compilation_result), std::move(out_executable)); + + const uint64 compile_end_us = env->NowMicros(); + const uint64 compile_time_us = compile_end_us - compile_start_us; + + device_compiler_internal::LogOnceXlaCompiledFirstCluster(); + TF_RETURN_IF_ERROR(profiler->RegisterCompilation( + function, compile_time_us, loaded_executable.has_value())); + return cache_value; +} + +template +absl::Status DeviceCompiler::CompileAsynchronous( + const DeviceCompilationClusterSignature& signature, + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, + const std::vector& args, + const NameAttrList& function, CompileScope scope, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler) { + // Explicitly capture all required data by value for async compilation. + // Update compilation state in cache. + cache_->Store(signature, DeviceCompileState::kCompiling, std::nullopt, + std::nullopt, std::nullopt); + profiler->IncrementOngoingAsyncCompilations(); + // Don't move the above code into the thread function as it synchronously + // updates the async compilation state! + + // When the ThreadPool for the compilation cache is destroyed, it waits for + // compilations to have finished. This means that both 'entry' and 'this' will + // be alive for the duration of the compilation. + // !!Pay attention when additional variables must be captured by this lambda!! + // All values are captured by value. Make sure that all pointer values (like + // entry) do not get freed until the lambda has finished. + const std::string& function_name = function.name(); + async_compiler_threads_->Schedule([=] { + VLOG(2) << "Starting asynchronous compilation of cluster " << function_name + << '.'; + // We don't need to lock mu, but do it anyway to satisfy thread safety + // analysis. + mutex mu; + mutex_lock lock(mu); + auto cache_value = typename DeviceCompilationCache::Value(); + auto s = CompileStrict(signature, compile_options, options, args, function, + cache_value, scope, ctx, profiler, &mu); + VLOG(2) << "Finished asynchronous compililation of cluster " + << function_name << '.'; + profiler->DecrementOngoingAsyncCompilations(); + // Update compilation status in cache. 
+ if (!s.ok()) { + cache_->Store(signature, std::nullopt, s.status(), std::nullopt, + std::nullopt); + } + }); + return absl::OkStatus(); +} + +template +absl::Status DeviceCompiler::CompileImpl( + const XlaCompiler::CompileOptions& compile_options, + const XlaCompiler::Options& options, const NameAttrList& function, + const std::vector& args, CompileScope scope, + DeviceCompileMode compile_mode, OpKernelContext* ctx, + DeviceCompilationProfiler* profiler, + const XlaCompiler::CompilationResult** out_compilation_result, + ExecutableType** out_executable) { + DCHECK_NE(out_executable, nullptr); + VLOG(2) << "DeviceCompiler::Compile " << DebugString(); + + if (VLOG_IS_ON(2)) { + VLOG(2) << "num_inputs=" << args.size(); + for (int i = 0, end = args.size(); i < end; i++) { + VLOG(3) << i << ": " << args[i].HumanString(); + } + } + TF_ASSIGN_OR_RETURN(auto signature, + DeviceCompilationClusterSignature::Build(function, args)); + + // The outer lock protects the existence of the mutex in the map. + mutex* cluster_mutex; + { + mutex_lock lock(cluster_mutexes_mu_); + auto it = + cluster_mutexes_.emplace(signature, std::make_unique()).first; + cluster_mutex = it->second.get(); + } + + profiler->RegisterExecution(function); + + string human_signature; + if (VLOG_IS_ON(2)) { + human_signature = VLOG_IS_ON(3) ? signature.HumanString() : function.name(); + VLOG(2) << "DeviceCompilationClusterSignature: " << human_signature; + } + + // Acquire the cache entry lock and compile, if necessary. + // TODO(phawkins): this locking will need to be restructured when we implement + // cache eviction. + mutex_lock cluster_compile_lock(*cluster_mutex); + auto cache_value = cache_->LookupOrCreate(signature); + + int64_t current_request_count = cache_value.request_count; + VLOG(2) << "Compilation cache entry hit: " + << static_cast(cache_value.compile_state) + << " signature: " << human_signature << " with request count " + << current_request_count; + + DeviceCompileState state = cache_value.compile_state; + *out_compilation_result = nullptr; + *out_executable = nullptr; + + // Check if the requested entry is uncompiled and return an error if + // compilation is disabled. This will raise an error for kLazy even if we have + // not yet hit the compilation threshold and no compilation happens this + // round. This is to avoid non-determanism of when compilation is disallowed, + // for example by changing the threshold. 
+ if (state == DeviceCompileState::kUncompiled && FailOnXlaCompilation()) { + VLOG(1) << "XLA compilation disabled: " << function.name() << "\n" + << absl::StrJoin( + args, "\n", + [](std::string* out, const XlaCompiler::Argument& arg) { + absl::StrAppend(out, " arg: ", arg.HumanString()); + }); + + return errors::Internal("XLA compilation disabled"); + } + + if (state == DeviceCompileState::kUncompiled) { + XLA_SCOPED_LOGGING_TIMER("Compilation of XLA executable"); + if (!profiler->ShouldCompileCluster(function, compile_mode, + current_request_count)) { + VLOG(2) << "Not compiling for signature: " << human_signature; + return absl::OkStatus(); + } else if (compile_mode == DeviceCompileMode::kAsync) { + VLOG(2) << "Queueing asynchronous compilation for signature: " + << human_signature; + TF_RETURN_IF_ERROR(CompileAsynchronous(signature, compile_options, + options, args, function, scope, + ctx, profiler)); + return absl::OkStatus(); + } else { + VLOG(2) << "Instantly compiling for signature: " << human_signature; + TF_ASSIGN_OR_RETURN( + cache_value, + CompileStrict(signature, compile_options, options, args, function, + cache_value, scope, ctx, profiler, cluster_mutex)); + } + } else if (state == DeviceCompileState::kCompiling) { + VLOG(2) << "Ongoing asynchronous compilation for signature: " + << human_signature; + return absl::OkStatus(); + } else if (state == DeviceCompileState::kCompiled) { + VLOG(2) << "Already Compiled for signature: " << human_signature; + } + + TF_RETURN_IF_ERROR(cache_value.compilation_status); + *out_compilation_result = cache_value.compilation_result; + *out_executable = cache_value.executable; + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler_client.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler_client.h new file mode 100644 index 00000000..358cb923 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_compiler_client.h @@ -0,0 +1,76 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_CLIENT_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_CLIENT_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/executable_build_options.h" + +namespace tensorflow { + +template +class DeviceCompilerClient { + public: + DeviceCompilerClient() = default; + virtual ~DeviceCompilerClient() = default; + + // Compiles `result` (HLO) to an `ExecutableType` using `ClientType` and + // returns it. + virtual StatusOr> BuildExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) = 0; + + // Serializes an available `executable` to string using `ClientType` and + // returns it. 
+ virtual absl::StatusOr SerializeExecutable( + const ExecutableType& executable) = 0; + + // Compiles `result` (HLO) to a serializable executable (eg. + // xla::AotCompilationResult) using `ClientType`, serializes it to string and + // returns it. + virtual absl::StatusOr BuildSerializedExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) = 0; + + // Loads `serialized_executable` into an `ExecutableType` using `ClientType`. + virtual StatusOr> LoadExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, + const std::string& serialized_executable) = 0; + + // Waits for the underlying `ClientType` backend's programs to finish + // executing before returning. + virtual void WaitForProgramsToFinish() = 0; + + virtual ClientType* client() const = 0; + + private: + DeviceCompilerClient(const DeviceCompilerClient&) = delete; + void operator=(const DeviceCompilerClient&) = delete; +}; + +// Generates the ExecutableBuildOptions for compilation from HLO to +// executable. +xla::ExecutableBuildOptions GetExecutableBuildOptions( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, int default_device_ordinal); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_COMPILER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_executable_persistor.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_executable_persistor.h new file mode 100644 index 00000000..458441c8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_executable_persistor.h @@ -0,0 +1,404 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_EXECUTABLE_PERSISTOR_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_EXECUTABLE_PERSISTOR_H_ + +#include +#include +#include + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/device_compiler_client.h" +#include "tensorflow/compiler/jit/xla_compilation_cache.pb.h" +#include "tensorflow/compiler/jit/xla_device_compiler_client.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/pjrt/pjrt_client.h" +#include "xla/service/hlo.pb.h" +#include "xla/util.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/lib/strings/proto_serialization.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { + +// Returns the persisted compilation cache file name for the given key. 
+std::string XlaSerializedCacheKeyToFileName(const XlaSerializedCacheKey& key); + +// Offers a way to persist and/or load compiled `ExecutableType`s along with the +// corresponding HLO (`CompilationResult`) to/from `persistent_cache_directory` +// (if one was provided during construction) on disk using `ClientType`. +template +class DeviceExecutablePersistor { + public: + // Configuration for setting up persistence (directory, filename prefix, etc). + struct Config { + Config() = default; + explicit Config(absl::string_view persistent_cache_directory, + bool disable_strict_signature_checks, + absl::string_view persistence_prefix, + bool persistent_cache_directory_read_only) + : persistent_cache_directory(persistent_cache_directory), + disable_strict_signature_checks(disable_strict_signature_checks), + persistence_prefix(persistence_prefix), + persistent_cache_directory_read_only( + persistent_cache_directory_read_only) {} + + explicit Config(absl::string_view persistent_cache_directory, + bool disable_strict_signature_checks, + absl::string_view persistence_prefix) + : persistent_cache_directory(persistent_cache_directory), + disable_strict_signature_checks(disable_strict_signature_checks), + persistence_prefix(persistence_prefix) {} + + // If non-empty, JIT-compiled executables are saved to and loaded from the + // specified file system directory path. + std::string persistent_cache_directory; + + // Disable strict signature checks for entries loaded into the cache from + // external sources. + bool disable_strict_signature_checks = false; + + // The cache persistence prefix to use if serializing/deserialzing entries. + std::string persistence_prefix; + + // Cache is read-only if set to true. + bool persistent_cache_directory_read_only = false; + }; + + DeviceExecutablePersistor(const Config& config, + const DeviceType& device_type); + virtual ~DeviceExecutablePersistor() = default; + + // Returns std::nullopt if persistence is not enabled (i.e. + // `persistent_cache_directory_` is empty) or if the serialized entry is not + // found on disk. Otherwise, loads and returns the serialized executable + // (or returns a status). + // TODO(b/255826209): Take in Signature instead of hash and string once cache + // is refactored. + std::optional>> TryToLoadExecutable( + uint64 signature_hash, const std::string& signature_str, + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + DeviceCompilerClient* client) const; + + // Tries to serialize an already built `executable` and persist it on disk. If + // unable to do so, tries to build a serialized executable using the AOT + // pipeline and persists that to disk. + // TODO(b/255826209): Take in Signature instead hash and string once cache + // is refactored. + virtual absl::Status TryToPersistExecutable( + uint64 signature_hash, const std::string& signature_str, + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + const ExecutableType& executable, + DeviceCompilerClient* client) const; + + const DeviceType& device_type() const { return device_type_; } + const std::string& persistence_prefix() const { return persistence_prefix_; } + const std::string& persistent_cache_directory() const { + return persistent_cache_directory_; + } + + private: + // Returns a cache key proto that identifies an entry in the compilation + // cache. 
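A small sketch (not part of the vendored header) of enabling on-disk persistence through the Config above. The template arguments (the usual ExecutableType/ClientType pair named in the class comment) and the directory, prefix, and device-type values are placeholders chosen for illustration.

#include <memory>

#include "tensorflow/compiler/jit/device_executable_persistor.h"

namespace tensorflow {

using LocalPersistor =
    DeviceExecutablePersistor<xla::LocalExecutable, xla::LocalClient>;

std::unique_ptr<LocalPersistor> MakePersistor() {
  LocalPersistor::Config config(
      /*persistent_cache_directory=*/"/tmp/xla_compile_cache",
      /*disable_strict_signature_checks=*/false,
      /*persistence_prefix=*/"xla",
      /*persistent_cache_directory_read_only=*/false);
  return std::make_unique<LocalPersistor>(config, DeviceType(DEVICE_CPU));
}

}  // namespace tensorflow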
+ XlaSerializedCacheKey BuildSerializedCacheKey( + uint64 signature_hash, const xla::HloModuleProto& hlo_module) const; + + XlaSerializedCacheKey BuildSerializedCacheKey( + uint64 signature_hash, const xla::HloModuleProto& hlo_module, + bool compiled_using_pjrt) const; + + // Serializes the signature and its corresponding entry to a proto message. + absl::StatusOr SerializeEntry( + uint64 signature_hash, const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + const ExecutableType& executable, + DeviceCompilerClient* compiler_client) const; + + // Saves the cache entry in the file directory supplied during the + // construction of this class. Overwrites existing entries. + absl::Status SaveSerializedEntry(const XlaSerializedCacheEntry& entry) const; + + // Tries to read a cache entry given a `key` by searching the file directory + // supplied during the construction of this class. Returns std::nullopt if no + // cache entry is found. + absl::StatusOr> + TryToReadSerializedEntry(const XlaSerializedCacheKey& key) const; + + // Checks if the loaded `entry` matches the expected `key` and `hlo_module`. + absl::Status VerifyLoadedCacheEntry( + const XlaSerializedCacheKey& key, const xla::HloModuleProto& hlo_module, + const XlaSerializedCacheEntry& entry) const; + + std::string GetFilePath(const XlaSerializedCacheKey& key) const; + + const DeviceType device_type_; + const bool disable_strict_signature_checks_; + const std::string persistence_prefix_; + + // If non-empty, JIT-compiled executables are saved to and loaded from the + // specified file system directory path. + const std::string persistent_cache_directory_; + + // Cache is read-only if set to true. + const bool persistent_cache_directory_read_only_; + + DeviceExecutablePersistor(const DeviceExecutablePersistor&) = delete; + void operator=(const DeviceExecutablePersistor&) = delete; +}; + +template +DeviceExecutablePersistor:: + DeviceExecutablePersistor(const Config& config, + const DeviceType& device_type) + : device_type_(device_type), + disable_strict_signature_checks_(config.disable_strict_signature_checks), + persistence_prefix_(config.persistence_prefix), + persistent_cache_directory_(config.persistent_cache_directory), + persistent_cache_directory_read_only_( + config.persistent_cache_directory_read_only) {} + +template +std::string DeviceExecutablePersistor::GetFilePath( + const XlaSerializedCacheKey& key) const { + const std::string file_name = XlaSerializedCacheKeyToFileName(key); + return io::JoinPath(persistent_cache_directory_, file_name); +} + +template +XlaSerializedCacheKey +DeviceExecutablePersistor::BuildSerializedCacheKey( + uint64 signature_hash, const xla::HloModuleProto& hlo_module, + bool compiled_using_pjrt) const { + XlaSerializedCacheKey key; + key.set_signature_fingerprint(signature_hash); + key.set_cluster_fingerprint(DeterministicProtoHash64(hlo_module)); + key.set_device_type(device_type().type_string()); + key.set_prefix(persistence_prefix()); + key.set_compiled_using_pjrt(compiled_using_pjrt); + return key; +} + +template +XlaSerializedCacheKey +DeviceExecutablePersistor::BuildSerializedCacheKey( + uint64 signature_hash, const xla::HloModuleProto& hlo_module) const { + return BuildSerializedCacheKey(signature_hash, hlo_module, false); +} + +// This template specialization sets compiled_using_prjt to true in the cache +// key when the template arguments are PjRtLoadedExecutable and PjRtClient. 
+template <> +inline XlaSerializedCacheKey +DeviceExecutablePersistor:: + BuildSerializedCacheKey(uint64 signature_hash, + const xla::HloModuleProto& hlo_module) const { + return BuildSerializedCacheKey(signature_hash, hlo_module, true); +} + +template +absl::StatusOr> +DeviceExecutablePersistor::TryToReadSerializedEntry( + const XlaSerializedCacheKey& key) const { + Env* env = Env::Default(); + const std::string file_path = GetFilePath(key); + if (!env->FileExists(file_path).ok()) { + return absl::StatusOr>(std::nullopt); + } + + XlaSerializedCacheEntry entry; + TF_RETURN_IF_ERROR(ReadTextOrBinaryProto(env, file_path, &entry)); + return std::optional(entry); +} + +template +absl::Status +DeviceExecutablePersistor::VerifyLoadedCacheEntry( + const XlaSerializedCacheKey& key, const xla::HloModuleProto& hlo_module, + const XlaSerializedCacheEntry& entry) const { + XLA_SCOPED_LOGGING_TIMER(absl::StrCat("Verifying loaded cache entry: ", + hlo_module.entry_computation_name())); + + if (!AreSerializedProtosEqual(key, entry.key())) { + VLOG(2) << "Serialized cache key does not match:\n" + << "got:\n" + << entry.key().DebugString() << "\nexpected:\n" + << key.DebugString() << "\n"; + return errors::InvalidArgument("Serialized cache key does not match."); + } + + // Perform a stricter (slower) check of the snapshot to verify that they + // match exactly. + if (!disable_strict_signature_checks_) { + if (!AreSerializedProtosEqual(hlo_module, entry.hlo_module())) { + VLOG(2) << "HLOs do not match:\n" + << "got:\n" + << hlo_module.DebugString() << "\nexpected:\n" + << entry.hlo_module().DebugString() << "\n"; + return errors::InvalidArgument("Serialized HLO does not match."); + } + } + + if (entry.executable().empty()) { + return errors::InvalidArgument("No binary found in serialized entry."); + } + return absl::OkStatus(); +} + +template +absl::Status +DeviceExecutablePersistor::SaveSerializedEntry( + const XlaSerializedCacheEntry& entry) const { + Env* env = Env::Default(); + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(persistent_cache_directory_)); + + // The cache on the filesystem can be read while we're writing out the proto. + // To prevent reads of partially-written files, we write the proto to a temp + // file, then move it into place once we're done writing. And we warn the + // user if these moves are not known to be atomic. + bool has_atomic_move = false; + env->HasAtomicMove(persistent_cache_directory_, &has_atomic_move) + .IgnoreError(); + if (!has_atomic_move) { + LOG_EVERY_POW_2(WARNING) + << "Filesystem for XLA persistent cache at " + << persistent_cache_directory_ + << " does not support atomic moves. Therefore the persistent cache is " + "racy if you have multiple XLA compilations occurring " + "simultaneously! You have been warned. :)"; + } + + // Write to temp location, then when that completes, atomically move into the + // final location. 
+ std::string temp_path = + io::JoinPath(persistent_cache_directory_, + XlaSerializedCacheKeyToFileName(entry.key())); + if (!env->CreateUniqueFileName(&temp_path, ".tmp")) { + return absl::UnavailableError(absl::StrCat( + "Could not create a unique file inside ", persistent_cache_directory_)); + } + TF_RETURN_IF_ERROR(WriteBinaryProto(env, temp_path, entry)); + return env->RenameFile(temp_path, GetFilePath(entry.key())); +} + +template +absl::StatusOr +DeviceExecutablePersistor::SerializeEntry( + uint64 signature_hash, const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + const ExecutableType& executable, + DeviceCompilerClient* compiler_client) const { + XlaSerializedCacheEntry serialized_entry; + const xla::HloModuleProto& hlo_module = + compilation_result.computation->proto(); + *serialized_entry.mutable_key() = + BuildSerializedCacheKey(signature_hash, hlo_module); + *serialized_entry.mutable_hlo_module() = hlo_module; + + // XLA compiler supports exporting executables as an AOT compilation result + // to avoid running potentially expensive compilation pipeline twice. + // Check if XLA compiler can export available executable. + if (auto serialized_executable = + compiler_client->SerializeExecutable(executable); + serialized_executable.ok()) { + serialized_entry.set_executable(std::move(*serialized_executable)); + return serialized_entry; + } else if (serialized_executable.status().code() == error::UNIMPLEMENTED) { + VLOG(1) << "Executable export is not implemented"; + } else { + return serialized_executable.status(); + } + + TF_ASSIGN_OR_RETURN( + auto serialized_executable, + compiler_client->BuildSerializedExecutable(options, compilation_result)); + serialized_entry.set_executable(std::move(serialized_executable)); + return serialized_entry; +} + +template +std::optional>> +DeviceExecutablePersistor::TryToLoadExecutable( + uint64 signature_hash, const std::string& signature_str, + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + DeviceCompilerClient* compiler_client) const { + if (persistent_cache_directory_.empty()) { + return std::nullopt; + } + + const xla::HloModuleProto& hlo_module = + compilation_result.computation->proto(); + + XlaSerializedCacheKey cache_key = + BuildSerializedCacheKey(signature_hash, hlo_module); + + std::optional serialized_entry; + { + XLA_SCOPED_LOGGING_TIMER( + absl::StrCat("Try loading serialized cache entry:", signature_str)); + TF_ASSIGN_OR_RETURN(serialized_entry, TryToReadSerializedEntry(cache_key)); + } + + if (!serialized_entry.has_value()) { + return std::nullopt; + } + + TF_RETURN_IF_ERROR( + VerifyLoadedCacheEntry(cache_key, hlo_module, *serialized_entry)); + + VLOG(1) << "Loading cached entry for: " << signature_str; + return compiler_client->LoadExecutable(options, compilation_result, + serialized_entry->executable()); +} + +template +absl::Status +DeviceExecutablePersistor::TryToPersistExecutable( + uint64 signature_hash, const std::string& signature_str, + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& compilation_result, + const ExecutableType& executable, + DeviceCompilerClient* client) const { + if (persistent_cache_directory_.empty() || + persistent_cache_directory_read_only_) { + VLOG(1) << "Not persisting executable. 
No `persistent_cache_directory` " + "provided or cache is read-only."; + return absl::OkStatus(); + } + + XLA_SCOPED_LOGGING_TIMER( + absl::StrCat("Serializing and saving cache entry: ", signature_str)); + TF_ASSIGN_OR_RETURN(XlaSerializedCacheEntry serialized_entry, + SerializeEntry(signature_hash, options, + compilation_result, executable, client)); + TF_RETURN_IF_ERROR(SaveSerializedEntry(std::move(serialized_entry))); + VLOG(2) << "XlaSerializedCacheEntry saved for signature: [" << signature_str + << "] with signature hash: " << signature_hash; + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_EXECUTABLE_PERSISTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/device_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_util.h new file mode 100644 index 00000000..745f8730 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/device_util.h @@ -0,0 +1,203 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEVICE_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_DEVICE_UTIL_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/numeric/bits.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/status_macros.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace jit { +class DeviceInfoCache; +class DeviceSet; + +// Instances of DeviceId represent TensorFlow devices as integers. +// +// This helps avoid having to manipulate device names as strings when +// auto-clustering. +class DeviceId { + public: + DeviceId(DeviceId&&) = default; + DeviceId(const DeviceId&) = default; + DeviceId& operator=(const DeviceId&) = default; + + bool operator==(const DeviceId& other) const { return id() == other.id(); } + bool operator!=(const DeviceId& other) const { return !(*this == other); } + + private: + int id_; + + explicit DeviceId(int id) : id_(id) {} + + int id() const { return id_; } + + friend class DeviceInfoCache; + friend class DeviceSet; +}; + +// A set of DeviceIds, represented as a bitmap. +class DeviceSet { + public: + void Insert(DeviceId device_id); + void UnionWith(const DeviceSet& other); + bool IsEmpty() const; + + // Calls `func` on each DeviceId in the set. Stops iterating early if `func` + // return false. + // + // TODO(sanjoy): Change this to take a typed std::function if that's + // performance neutral. + template + void ForEach(FnTy func) const { + // This is really a poor man's iterator, we should consider writing a proper + // iterator if this ends up being used widely. 
+ for (int word_index = 0, end = storage_.size(); word_index < end; + word_index++) { + uint64 word = storage_[word_index]; + while (word != 0) { + uint64 only_lowest_bit_set = word & -word; + // The number of trailing zeros in a non-zero word is the index of the + // least significant 1. + int bit_index = absl::countr_zero(word); + if (!func(DeviceId(word_index * kWordSize + bit_index))) { + return; + } + word ^= only_lowest_bit_set; + } + } + } + + private: + absl::InlinedVector storage_; + + const int kWordSize = 64; +}; + +// Caches some miscellaneous information about TF devices. Thread compatible. +class DeviceInfoCache { + public: + bool IsGpu(DeviceId device) const { return is_gpu_[device.id()]; } + bool IsCpu(DeviceId device) const { return is_cpu_[device.id()]; } + + absl::string_view GetNameFor(DeviceId device) const { + return names_[device.id()]; + } + + absl::StatusOr GetIdFor(absl::string_view name); + + using DeviceRegistration = const XlaOpRegistry::DeviceRegistration; + + DeviceRegistration* GetCompilationDevice(DeviceId device) const { + return id_to_compilation_device_[device.id()]; + } + + absl::StatusOr GetCompilationDevice( + absl::string_view name) { + TF_ASSIGN_OR_RETURN(DeviceId device_id, GetIdFor(name)); + return GetCompilationDevice(device_id); + } + + const DeviceType& GetDeviceTypeFor(DeviceId device) const { + return *id_to_device_type_[device.id()]; + } + + using DeviceTypeConstRef = std::reference_wrapper; + + absl::StatusOr GetDeviceTypeFor( + absl::string_view device_name) { + TF_ASSIGN_OR_RETURN(DeviceId device_id, GetIdFor(device_name)); + return std::cref(*id_to_device_type_[device_id.id()]); + } + + string DebugString(const DeviceSet& device_set) const; + + private: + absl::flat_hash_map name_to_id_; + + // These fields are populated for a device in GetIdFor, *before* we give out a + // DeviceId. + std::vector + id_to_compilation_device_; + std::vector> id_to_device_type_; + std::vector names_; + std::vector is_cpu_; + std::vector is_gpu_; +}; + +} // namespace jit + +// Returns the DeviceType corresponding to 'device'. +absl::Status DeviceNameToDeviceType(const string& device, + DeviceType* device_type); + +// Picks the device for which XLA should compile a cluster that contains +// operations placed in devices in `devices`. For instance a cluster that +// contains operations solely placed on the CPU will be compiled into a CPU +// executable by XLA, whereas a cluster that contains operations placed on the +// CPU and also operations placed on the GPU will be compiled into a GPU +// executable. +// +// Returns a non-OK Status if no unambiguous choice of device exists. +// +// We choose the device using the following rules: +// +// - It is an error for `device_names` to contain more than one device of the +// same type. +// - GPU is preferred over CPU. +// - If `allow_mixing_unknown_and_cpu` is true then unknown devices are +// preferred over CPU. +// - XLA devices count as "unrecognized devices". +// +// This set of rules above implicitly assume that XLA:GPU can compile all +// operations in the cluster that XLA:CPU can compile, and if +// `allow_mixing_unknown_and_cpu` then the unrecognized device can also compile +// all operations in the cluster that XLA:CPU can compile. 
+// +// We provide the `allow_mixing_unknown_and_cpu` knob so that we can do both of +// the following things: +// +// - Let MarkForCompilationPass not inject CPU-placed operations into clusters +// that will run on unknown devices (because the unknown XLA backend may not +// support every operation supported by CPU). +// - Let BuildXlaOpsPass successfully infer a compilation device for a cluster +// that contains nodes placed on both the CPU and on unknown devices. In this +// case it is the responsibility of the optimization pass that injected the +// CPU nodes into the cluster to ensure that these nodes can be compiled by +// the unknown XLA backend. +absl::StatusOr PickDeviceForXla( + const jit::DeviceInfoCache& device_info_cache, + const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu); + +// This is like `PickDeviceForXla` except that it returns nullopt (instead of a +// non-OK Status) if no unambiguous choice of device exists. +// +// We return a failing Status for errors unrelated to the device choice +// algorithm itself. +absl::StatusOr> MaybePickDeviceForXla( + const jit::DeviceInfoCache& device_info_cache, + const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEVICE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h new file mode 100644 index 00000000..0c7729f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -0,0 +1,108 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An optimization pass that groups nodes marked with a common +// kXlaClusterAttr into functions, and replaces the original nodes by +// calls. The calls are annotated with kXlaCompiledKernelAttr. + +#ifndef TENSORFLOW_COMPILER_JIT_ENCAPSULATE_SUBGRAPHS_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_ENCAPSULATE_SUBGRAPHS_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// EncapsulateSubgraphs pass takes all the nodes with the same cluster ID +// (derived from kXlaClusterAttr=ID (kXlaClusterAttr) attribute), puts them into +// a TF function, and replaces the subgraph in the main graph with a call to +// that TF function annotated with kXlaCompiledKernelAttr (_XlaCompiledKernel). +class EncapsulateSubgraphsPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +// A rewriting function to apply to each subgraph during encapsulation. +// 'arg_source_tensors' are the tensors corresponding to the arguments in the +// original source graph (*not* 'graph'). 
+// +// 'graph' is the subgraph. The rewriting may renumber the inputs and outputs; +// 'input_permutation' is a mapping from old argument numbers to new argument +// numbers, whereas 'output_permutation' is the same for outputs. Both +// 'input_permutation' and 'output_permutation' are initialized to the identity +// permutation. 'nodedef' is the NodeDef for the call to the function under +// construction, provided to allow additional attributes to be set. +// The rewrite may also change the NodeDef's operator name, and that +// name will be used as the name of the generated function. +typedef std::function& arg_source_tensors, + std::unique_ptr* graph, std::vector* input_permutation, + std::vector* output_permutation, NodeDef* node_def)> + RewriteSubgraphFn; + +// Transformation that finds subgraphs whose nodes are marked with +// 'group_attribute', splits those subgraphs into functions, and replaces +// the originals with function calls. +// +// 'group_attribute' must be a string valued-attribute that names the new +// functions to introduce. +// +// If 'rewrite_subgraph_fn' is set, it is applied to each subgraph before +// function conversion. +// +// If 'reuse_existing_functions' is set, use an existing function with the +// same name, if any. +// +// TODO(phawkins): currently, some information in control edges +// is not preserved. Suppose you have A and B in the main +// graph, C and D in a subgraph. B and C have control deps from A, D has control +// dep from B. Originally D must run after C, post-transformation this +// dependency is lost. +absl::Status EncapsulateSubgraphsInFunctions( + string group_attribute, const Graph& graph_in, + const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, + std::unique_ptr* graph_out, FunctionLibraryDefinition* library); + +// The attribute that marks function calls produced by the encapsulate +// subgraphs pass and that should in turn be compiled via XlaLaunch operators. +extern const char* const kXlaCompiledKernelAttr; + +// Does `node` have the kXlaCompiledKernelAttr attribute? +bool IsXlaCompiledKernel(const Node& node); + +// Functions produced by the EncapsulateSubgraphs pass have their arguments in +// the order: +// 1) compile-time constant arguments, in host memory, +// 2) other arguments, in device memory. +// 3) resource variable arguments, in host memory. Note that only the resource +// Tensor itself is in host memory; the underlying value may be in device +// memory. +// The functions are annotated with the following attributes that describe how +// many constant and resource arguments there are: + +// Name of the attribute containing the number of constant arguments. +extern const char* const kXlaNumConstantArgsAttr; + +// Name of the attribute containing the number of resource variable arguments. +extern const char* const kXlaNumResourceArgsAttr; + +// Name of the attribute defining whether the cluster has reference variables. +extern const char* const kXlaHasReferenceVarsAttr; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_SUBGRAPHS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_util.h new file mode 100644 index 00000000..7c99763c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_util.h @@ -0,0 +1,155 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file contains some utility functions for encapsulating XLA computation +// in host graph and encapsulating outside compilation in XLA computation. + +#ifndef TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Attribute marking output tensor shapes inferred by XLA. Attribute value is +// a list of PartialTensorShape objects. +extern const char kXlaInferredShapesAttrName[]; + +// Infers output shapes for all nodes in graph `g`. The output shapes will be +// stored in node attribute `kXlaInferredShapesAttrName`. +// +// We have to perform shape inference before encapsulation because after +// encapsulation, some nodes will be encapsulated into function call, and shape +// inference does not handle function call at the moment. +absl::Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g); + +// Attribute indicating that some ops in this node's XLA computation has control +// dependency on this node. Attribute value will always be "true". +extern const char kXlaConnectedToXlaComputationAttrName[]; + +// Attribute indicating that this node has control dependency on some ops in +// this node's XLA computation. Attribute value will always be "true". +extern const char kXlaConnectedFromXlaComputationAttrName[]; + +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. Attribute value will be +// string (original input node name). +extern const char kOutsideCompilationOriginalNodeAttrName[]; + +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. Attribute value will be +// int (src_output for original edge). +extern const char kOutsideCompilationSrcOutputAttrName[]; + +// Attribute indicating that this node has control dependencies on some other +// nodes within the same XLA cluster. Attribute value will be a list of string +// (node names). +extern const char kXlaControlDependenciesWithinXlaClusterAttrName[]; + +// Attribute indicating that this node is an outside compilation node which is +// lifted out of If/While/function node. Attribute value will always be boolean +// value "true". +extern const char kXlaIsLiftedArgAttrName[]; + +// Attribute indicating that this node is a Placeholder node for an _Arg node +// lifted out of If/While/function node. Attribute value will be a string, which +// is the outside compilation cluster name sending the lifted arg node to host. 
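As an illustration (not part of the vendored header) of how the shape attribute described at the top of this header is consumed, a sketch that reads the inferred output shapes back off a node after PerformStaticShapeInferenceBeforeEncapsulation has run; it assumes the standard GetNodeAttr overload for lists of PartialTensorShape.

#include <vector>

#include "tensorflow/compiler/jit/encapsulate_util.h"
#include "tensorflow/core/framework/node_def_util.h"

absl::StatusOr<std::vector<tensorflow::PartialTensorShape>>
InferredOutputShapes(const tensorflow::Node& node) {
  std::vector<tensorflow::PartialTensorShape> shapes;
  TF_RETURN_IF_ERROR(tensorflow::GetNodeAttr(
      node.attrs(), tensorflow::kXlaInferredShapesAttrName, &shapes));
  return shapes;  // shapes[i] is the inferred shape of output i of `node`.
}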
+extern const char kXlaLiftedArgOutsideCompilationAttrName[]; + +// Attribute indicating that this is an IdentityN node receiving inputs for a +// outside compilation Placeholder node (the original outside compilation node +// is moved out of TPU computation, and we left a Placeholder node there). +// Attribute value will be a string, which is the outside compilation cluster +// name for the outside compilation Placeholder node. +extern const char kXlaOutsideCompilationInputsAttrName[]; + +// Attribute indicating that this is a Placeholder node for an _Arg node used in +// outside compilation. We should not move this node out of XLA computation. +// Attribute value will always be boolean value "true". +extern const char kXlaIsPlaceholderForArg[]; + +// Information for XLA computation. +struct XlaClusterInfo { + // Add an explicitly-defined default constructor for this class. + // + // The compiler may delete the default constructor here because + // host_compute_core is a const member whose type (std::map) doesn't + // necessarily have a user provided constructor -- while libc++ and + // libstdc++ 4.8 provide a user defined default constructor, libstdc++ at + // least >= 7.3 does not. See also c++11 [class.ctor] p5. + // + // TODO(klimek): In c++17 we'll be able to initialize host_compute_core + // without losing aggregate initialization, which allows us to get rid of + // the constructor definitions again. + XlaClusterInfo() {} + XlaClusterInfo(const string& cluster_name, + const NameAttrList& func_name_attrs, Node* node, + const std::map& host_compute_core) + : cluster_name(cluster_name), + func_name_attrs(func_name_attrs), + node(node), + host_compute_core(host_compute_core) {} + // XLA cluster name. It might be different from `func_name`. + const string cluster_name; + // Name and attributes of XLA computation function. + const NameAttrList func_name_attrs; + // The XLA computation node in the graph. + Node* node; + // A mapping from outside compilation cluster name to its device assignment. + const std::map host_compute_core; +}; + +// Finds dependencies between outside compilation clusters, including both data +// dependencies and control dependencies. cluster_deps maps the name name of an +// outside compilation cluster to a set of names of outside compilation clusters +// that it depends on. +absl::StatusOr< + std::unique_ptr>>> +OutsideCompilationClusterDependencies( + const Graph* g, const string& outside_compilation_attr_name); + +// Preprocesses edges within the same XLA cluster. It will perform the following +// operations in order: +// +// 0. Remove edges from source node to outside compilation nodes, and edges +// from outside compilation nodes to sink node. +// 1a. For edges between different outside compilation clusters, remove the edge +// and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node +// name" to dst node. +// 1b. For control edges between outside compilation and its XLA computation, +// add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the +// outside compilation node. +// 2. For data edges between different outside compilations, remove the edge +// and create a Placeholder node as dst node's input. +absl::Status PreprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); + +// Postprocesses edges within the same XLA cluster. This function reverts what +// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the +// following operations in order: +// +// 1. 
Remove Placeholder nodes between different outside compilations (created +// in `PreprocessEdgesBetweenOutsideCompilations` step 2). +// 2a. Reconnect control edges between different outside compilations (marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1a). +// Notice that control edges marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here. +// They are handled in `RewriteOutsideCompilationSubgraphFn`. +absl::Status PostprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h new file mode 100644 index 00000000..6301e963 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h @@ -0,0 +1,85 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + ==============================================================================*/ +// Rewrites computations generated by the xla.compile() Python code into +// XlaLaunch nodes. +// +// xla.compile() does two main things: +// a) marks operators that make up an XLA computation with the attribute +// _xla_compile_id=XYZ, where XYZ is a unique key. +// b) adds XlaClusterOutput nodes to represent outputs of the computation. +// These nodes are not marked with the _xla_compile_id attribute. + +#ifndef TENSORFLOW_COMPILER_JIT_ENCAPSULATE_XLA_COMPUTATIONS_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_ENCAPSULATE_XLA_COMPUTATIONS_PASS_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Encapsulates nodes marked with the _xla_compile_id attribute into +// XlaLaunch operators. +class EncapsulateXlaComputationsPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + // The following methods are public only for unit tests. + + // This pass has two stages: + // a) first, we call EncapsulateSubgraphsPass to encapsulate all nodes + // marked with the same _xla_compile_id attribute into functions. These + // functions contain the computations to be passed to XlaLaunch. During + // encapsulation, we sort the arguments into the order expected by + // XlaLaunch. + static absl::Status Encapsulate(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def); + + // b) we rewrite the function calls generated in phase (a) into XlaLaunch + // operators. We also convert the XlaClusterOutput output nodes of the + // function call into the outputs of the XlaLaunch operator. 
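A minimal sketch (not part of the vendored header) of the two-stage flow described in (a) and (b) above, wired together with the static helpers this pass exposes for tests; the single-argument BuildXlaLaunchOps overload it calls is declared immediately below.

#include <memory>

#include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"

absl::Status EncapsulateAndBuildLaunchOps(
    std::unique_ptr<tensorflow::Graph>* graph,
    tensorflow::FunctionLibraryDefinition* flib_def) {
  // Stage (a): group nodes sharing an _xla_compile_id into functions.
  TF_RETURN_IF_ERROR(
      tensorflow::EncapsulateXlaComputationsPass::Encapsulate(graph, flib_def));
  // Stage (b): rewrite the resulting calls into XlaLaunch operators.
  return tensorflow::EncapsulateXlaComputationsPass::BuildXlaLaunchOps(
      graph->get());
}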
+ static absl::Status BuildXlaLaunchOps(Graph* graph); + + struct XlaFunctionInfo { + int variable_start_index = -1; + std::string function_name; + }; + + // We need to introduce this version to adapt to the output of gpu inference + // converter. The single argument overload version calls this function. + // + // When add_edges_to_output_of_downstream_nodes is true, the output edges of + // the xla_launch_node's immediate downstream nodes would be attached to the + // generated xla node. For example, if the original graph is + // StatefulPartitionedCall{_xla_compile_id=1} -> XlaClusterOutput -> NodeA + // The output graph of this function would look like the following when + // add_edges_to_output_of_downstream_nodes is true: + // XlaLaunch -> NodeA + static absl::Status BuildXlaLaunchOps( + Graph* graph, + const std::function(const Node&)>& + is_xla_launch_node, + const std::function(const Node&)>& + get_xla_function_info, + bool add_edges_to_output_of_downstream_nodes); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_XLA_COMPUTATIONS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/extract_outside_compilation_pass.h new file mode 100644 index 00000000..7631ccd0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/extract_outside_compilation_pass.h @@ -0,0 +1,112 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/encapsulate_util.h" +#include "xla/status_macros.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Rewrite function for outside compilation subgraphs. It will perform the +// following steps: +// +// 1. Add a XLA computation key placeholder node (it will be used as input for +// XlaRecvAtHost and XlaSendFromHost); +// 2. Replace all _Arg nodes with one single XlaRecvAtHost node; +// 3. Replace all _Retval nodes with one single XlaSendFromHost node; +// 4. Mark all nodes except key placeholder with attr `xla_cluster_attr_name` +// and `outside_compilation_attr_name`; +// 5. For nodes marked with attr kXlaConnectedToXlaComputationAttrName, add a +// control edge from the node to XlaSendFromHost; for nodes marked with attr +// kXlaConnectedFromXlaComputationAttrName, add a control edge from +// XlaRecvAtHost node to the node; +// 6. Try pruning XlaRecvAtHost/XlaSendFromHost/key placeholder node. +// 7. Add necessary attributes to `node_def`, so we can replace it with a +// XlaHostCompute node later. 
If all input shapes for XlaSendFromHost are +// known, "shapes" attr will be set to the list of input shapes; otherwise +// "shape_inference_graph" attr will be set to shape inference function name. +class RewriteOutsideCompilationSubgraphFn { + public: + RewriteOutsideCompilationSubgraphFn( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, + const string& xla_cluster_name, const string& new_function_name) + : xla_cluster_attr_name_(xla_cluster_attr_name), + outside_compilation_attr_name_(outside_compilation_attr_name), + xla_cluster_name_(xla_cluster_name), + new_function_name_(new_function_name) {} + + absl::Status operator()(const std::vector&, + std::unique_ptr* graph, + std::vector* input_permutation, + std::vector* output_permutation, + NodeDef* node_def); + + private: + string xla_cluster_attr_name_; + string outside_compilation_attr_name_; + string xla_cluster_name_; + string new_function_name_; +}; + +// For an XLA computation function, replace all outside compilations with +// XlaHostCompute nodes. Each outside compilation subgraph will be rewritten by +// `RewriteOutsideCompilationSubgraphFn`, and they will be merged into one +// single host side graph (`host_graph`). +// +// xla_cluster_attr_name and outside_compilation_attr_name: attr name for XLA +// computation and outside compilation. Required for +// `RewriteOutsideCompilationSubgraphFn`. +// xla_cluster_name: XLA cluster name for this XLA computation. We need it +// because XLA cluster name might be different from `func_name`. +// func_name_attrs: they will be used to instantiate the XLA computation func. +// new_func_name: new function name for rewritten XLA computation func. +// host_compute_core: mapping from outside compilation cluster name to XLA +// device assignment. +// fld: FunctionLibraryDefinition object. +// host_graph: Graph object to store host side graph for all outside +// compilations within this XLA computation func. If there is no outside +// compilation, it will be empty. +// shape_inference_graphs: a list of outside compilation shape inference +// function names. These functions need to be rewritten later. +// has_outside_compilation: a bool indicating whether this function has any +// outside compilation nodes. +absl::Status ExtractOutsideCompilationForFunction( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, const string& xla_cluster_name, + const NameAttrList& func_name_attrs, const string& new_func_name, + const string& host_graph_func_name, + const std::map& host_compute_core, FunctionLibraryRuntime* flr, + FunctionLibraryDefinition* fld, std::vector* shape_inference_graphs, + bool* has_outside_compilation); + +// Rewrites XLA computation in `clusters` to replace outside compilation nodes +// with XlaHostCompute, and moves those outside compilations into `g`. If shapes +// of outside compilation outputs cannot be determined now, we will store shape +// inference graph into `fld`. 
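+// An informal sketch of the extraction performed by the functions in this
+// header (node names are hypothetical): an outside compilation subgraph
+//
+//   _Arg -> HostOp -> _Retval
+//
+// is replaced inside the XLA computation by an XlaHostCompute node, while the
+// host side graph roughly becomes
+//
+//   key placeholder -> XlaRecvAtHost -> HostOp -> XlaSendFromHost
+//
+// as described for `RewriteOutsideCompilationSubgraphFn` above.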
+absl::Status ExtractOutsideCompilation( + const string& xla_cluster_attr_name, + const string& outside_compilation_attr_name, + const std::unordered_map& clusters, Graph* g, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + bool* modified); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/flags.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/flags.h new file mode 100644 index 00000000..9dbd6106 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/flags.h @@ -0,0 +1,360 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_ +#define TENSORFLOW_COMPILER_JIT_FLAGS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/types/optional.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace tensorflow { + +struct XlaAutoJitFlag { + // Control compilation of operators into XLA computations on CPU and GPU + // devices. 0 = use ConfigProto setting; -1 = off; 1 = on for things very + // likely to be improved; 2 = on for everything. + // + // If all non-CPU ops in the graph being optimized are placed on a single GPU + // and there is at least one node placed on that GPU then + // `optimization_level_single_gpu` applies. Otherwise + // `optimization_level_general` applies. + // + // Experimental. + int32 optimization_level_single_gpu; + int32 optimization_level_general; +}; + +// Sets the xla_auto_jit_flag based on the given flag string. Supported syntax +// is: +// : sets general and single_gpu setting to the provided number. +// single-gpu(): sets the single_gpu setting to the provided number. +bool SetXlaAutoJitFlagFromFlagString(const string& value); + +// Flags associated with the XLA bridge's mark_for_compilation_pass module. +struct MarkForCompilationPassFlags { + XlaAutoJitFlag xla_auto_jit_flag; + + // Minimum number of operators in an XLA compilation. Ignored for operators + // placed on an XLA device or operators explicitly marked for compilation. + int32 tf_xla_min_cluster_size; + + // Maximum number of operators in an XLA compilation. + int32 tf_xla_max_cluster_size; + + // If non-empty, limit XLA clustering to the following TF operations. + string tf_xla_ops_to_cluster; + + // If non-empty, remove following operations from XLA clustering excludelist. + string tf_xla_cluster_exclude_ops; + + // Dump graphs during XLA compilation. + bool tf_xla_clustering_debug; + + // Enables global JIT compilation for CPU via SessionOptions. + bool tf_xla_cpu_global_jit; + + // "Compiler fuel" for clustering. Only this many ops will be marked as + // eligible for clustering. 
+ int64_t tf_xla_clustering_fuel; + + // If tf_xla_disable_deadness_safety_checks_for_debugging is set to true then + // we do not do deadness related safety checks. This is unsound in general, + // but can be used as a debugging aid. + bool tf_xla_disable_deadness_safety_checks_for_debugging; + + // If tf_xla_disable_resource_variable_safety_checks_for_debugging is set to + // true then we do not do safety checks to preserve TensorFlow's resource + // variable concurrency semantics. This is unsound in general, but can be + // used as a debugging aid. + bool tf_xla_disable_resource_variable_safety_checks_for_debugging; + + // If true names of clustered operations will be computed deterministically + // so that they remain stable from run to run of auto clusteing. + bool tf_xla_deterministic_cluster_names; + + // If non-empty, JIT-compiled executables are saved to and loaded from the + // specified file system directory path. + std::string tf_xla_persistent_cache_directory; + + // If non-empty, the persistent cache will only be used for the specified + // devices (comma separated). Each device type should be able to be converted + // to `DeviceType`. + std::string tf_xla_persistent_cache_device_types; + + bool tf_xla_persistent_cache_read_only; + + // If true, entries loaded into the XLA compile cache will not have their + // signatures checked strictly. This should generally not be disabled except + // for debugging. Defaults to false. + bool tf_xla_disable_strict_signature_checks; + + // Specifies the persistance cache prefix. Default is "xla_compile_cache" + string tf_xla_persistent_cache_prefix; +}; + +// Flags associated with XLA Sparse Core. +struct XlaSparseCoreFlags { + // Max level of division to split input data into minibatches. + int tf_xla_sparse_core_minibatch_max_division_level; + + // Disable table stacking for all the tables passed to the SparseCore + // mid level API. + bool tf_xla_sparse_core_disable_table_stacking; + + // If non-zero, limits the size of the activations for a given table to + // be below these many bytes. + int64_t tf_xla_sparse_core_stacking_mem_limit_bytes; + + // If non-zero, limits the size of any table shard to be below these + // many bytes. + int64_t tf_xla_sparse_core_stacking_table_shard_limit_bytes; +}; + +// Flags associated with the XLA bridge's xla_device module. +struct XlaDeviceFlags { + // Switch the CPU device into "on-demand" mode, where instead of + // auto-clustering ops are compiled one by one just-in-time. + // Enabling this mode by a legacy flag is a temporary mechanism. When this + // feature is battle-tested, we will switch this to be a session option. + bool tf_xla_compile_on_demand; + + // Enables "XLA" devices if this flag is set. + bool tf_xla_enable_xla_devices; +}; + +// Flags common to the _Xla* ops and their kernels. +struct XlaOpsCommonFlags { + // If true, _XlaCompile always refuses to compile the cluster, which means the + // XLA clusters always run in the TF executor. Defaults to false. + bool tf_xla_always_defer_compilation; + // If true, _XlaCompile compiles the cluster asynchronously with respect to + // the main execution. The fallback path is taken while compilation happens. + bool tf_xla_async_compilation; + + class PjRtForSingleDeviceCompilationRollout { + public: + // Allow using Device API (PjRt) for `device_type` in the XlaLaunch op. 
+ // Please note that `enabled_for_xla_launch_` needs to be true in addition + // to the `device_type` being allowed in order to use the Device API for + // single device compilation and execution in the XlaLaunch op. + void AllowForDeviceInXlaLaunch(const DeviceType& device_type) { + xla_launch_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaLaunchForDevice(const DeviceType& device_type) const { + if (!enabled_for_gpu_ && device_type.type_string() == "GPU") return false; + return enabled_for_all_ || + (enabled_for_xla_launch_ && + xla_launch_allowed_devices_.contains(device_type.type_string())); + } + + // Allow using Device API (PjRt) for `device_type` in the XlaCompileOnDemand + // op. Please note that `enabled_for_compile_on_demand_` needs to be true in + // addition to the `device_type` being allowed in order to use the Device + // API for single device compilation and execution in the XlaCompileOnDemand + // op. + void AllowForDeviceInXlaCompileOnDemand(const DeviceType& device_type) { + xla_compile_on_demand_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaCompileOnDemandForDevice( + const DeviceType& device_type) const { + if (!enabled_for_gpu_ && device_type.type_string() == "GPU") return false; + return enabled_for_all_ || + (enabled_for_compile_on_demand_ && + xla_compile_on_demand_allowed_devices_.contains( + device_type.type_string())); + } + + // Allow using Device API (PjRt) for `device_type` in the XlaCompile and + // XlaRun ops. Please note that `enabled_for_compile_and_run_` needs to be + // true in addition to the `device_type` being allowed in order to use the + // Device API for single device compilation and execution in the XlaCompile + // and XlaRun ops. + void AllowForDeviceInXlaCompileAndRun(const DeviceType& device_type) { + xla_compile_and_run_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaCompileAndRunForDevice( + const DeviceType& device_type) const { + if (!enabled_for_gpu_ && device_type.type_string() == "GPU") return false; + return enabled_for_all_ || (enabled_for_compile_and_run_ && + xla_compile_and_run_allowed_devices_.contains( + device_type.type_string())); + } + + bool IsEnabledForGpu() const { return enabled_for_gpu_; } + + // If true, uses Device API (PjRt) for single device compilation and + // execution of functions marked for JIT compilation i.e. jit_compile=True. + // Defaults to false. + bool enabled_for_xla_launch_; + + // If true, uses Device API (PjRt) for compiling and executing ops one by + // one in "on-demand" mode. Defaults to false. + bool enabled_for_compile_on_demand_; + + // If true, uses Device API (PjRt) for compilation and execution when + // auto-clustering is enabled. Defaults to false. + bool enabled_for_compile_and_run_; + + // If true, uses Device API (PjRt) for compilation and execution everywhere + // i.e. for functions marked for JIT compilation, for ops in "on-demand" + // mode and auto-clustering. Defaults to false. + // + // Note that this flag can be overridden by device flag like + // `enabled_for_gpu_` below. + bool enabled_for_all_; + + // If true, enable Device API (PjRt) for TF GPU device. This is a helper + // flag so that individual tests can turn on PjRt for GPU specifically. + // Once the rollout to GPU is complete, this flag can be deprecated. + bool enabled_for_gpu_; + + private: + // Devices for which using Device API (PjRt) is allowed in the XlaLaunch op. + // This can only be modified programmatically. 
+ absl::flat_hash_set xla_launch_allowed_devices_; + // Devices for which using Device API (PjRt) is allowed in the + // XlaCompileOnDemand op. This can only be modified programmatically. + absl::flat_hash_set xla_compile_on_demand_allowed_devices_; + // Devices for which using Device API (PjRt) is allowed in the + // XlaCompile and XlaRun ops. This can only be modified programmatically. + absl::flat_hash_set xla_compile_and_run_allowed_devices_; + } tf_xla_use_device_api; +}; + +// Flags for the XlaCallModule kernel. +struct XlaCallModuleFlags { + // Used by XlaCallModuleOp to specify safety checks to disable. + absl::flat_hash_set disabled_checks; +}; + +// Flags for the build_xla_ops pass. +struct BuildXlaOpsPassFlags { + // Enables lazy compilation for TF/XLA (only when auto-clustering) if true. + // Defaults to true. + bool tf_xla_enable_lazy_compilation; + + // If true then insert Print nodes to print out values produced by XLA + // clusters. Useful for debugging. + bool tf_xla_print_cluster_outputs; + + // If true, insert CheckNumerics nodes for every floating point typed input to + // an XLA cluster. + bool tf_xla_check_cluster_input_numerics; + + // If true, insert CheckNumerics nodes for every floating point typed output + // from an XLA cluster. + bool tf_xla_check_cluster_output_numerics; + + // Disables all constant folding. The primary use for this is for testing to + // guarantee that tests are run on XLA and not on TF's CPU implementation. + bool tf_xla_disable_constant_folding; + + // Disables full embedding pipelining when true. Instead, strict SparseCore + // TensorCore sequencing will be used. + bool tf_xla_disable_full_embedding_pipelining; + + // Force the WhileOps in embedding_pipelining and embedding_sequencing to use + // this many parallel_iterations + int tf_xla_embedding_parallel_iterations; +}; + +// Flags for common MLIR configurations. +struct MlirCommonFlags { + ConfigProto::Experimental::MlirBridgeRollout tf_mlir_enable_mlir_bridge; + + bool tf_mlir_enable_merge_control_flow_pass; + bool tf_mlir_enable_convert_control_to_data_outputs_pass; + bool tf_mlir_enable_composite_tpuexecute_side_effects; + bool tf_mlir_enable_strict_clusters; + bool tf_mlir_enable_tpu_variable_runtime_reformatting_pass; + // TODO(pineapplejuice233): Revisit this flag once the performance impact is verified + // with different local CPU devices settings. + bool tf_mlir_enable_multiple_local_cpu_devices; +}; + +// Flags for the JitRt pipeline -- see tf_jitrt_pipeline.h for details. +struct JitRtFlags { + bool always_specialize; + bool cost_driven_async_parallel_for; + + // Enables tracking of the "live" JitRt queries to, on a crash, identify the + // "query of death". See TfJitRtQueryOfDeathLogger. + bool log_query_of_death; + + // Enable vectorization, which requires tiling and peeling on different ops. + bool vectorize; + + // Enables crash reproducer for JitRt MLIR pass manager. + bool enable_crash_reproducer; +}; + +// Return a pointer to the DumpGraphFlags struct; +// repeated calls return the same pointer. +// This should be called only after Flags::Parse() has returned. + +// Getters for flags structs defined above. The first call to any of these +// parses TF_XLA_FLAGS for all of them. Those functions which return a pointer +// always return the same pointer. 
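+// An informal usage sketch (flag values are examples only): flags can be set
+// through the TF_XLA_FLAGS environment variable before the first getter call,
+// e.g.
+//
+//   TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_min_cluster_size=4"
+//   TF_XLA_FLAGS="--tf_xla_auto_jit=single-gpu(2)"
+//
+// and read or adjusted programmatically afterwards, e.g.
+//
+//   MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+//   int64_t fuel = flags->tf_xla_clustering_fuel;
+//   auto& rollout = GetXlaOpsCommonFlags()->tf_xla_use_device_api;
+//   rollout.enabled_for_xla_launch_ = true;
+//   rollout.AllowForDeviceInXlaLaunch(DeviceType("GPU"));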
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags(); +BuildXlaOpsPassFlags* GetBuildXlaOpsPassFlags(); +XlaSparseCoreFlags* GetXlaSparseCoreFlags(); +XlaDeviceFlags* GetXlaDeviceFlags(); +XlaOpsCommonFlags* GetXlaOpsCommonFlags(); +XlaCallModuleFlags* GetXlaCallModuleFlags(); + +MlirCommonFlags* GetMlirCommonFlags(); + +void ResetJitCompilerFlags(); + +const JitRtFlags& GetJitRtFlags(); + +// Returns the effective MLIR bridge rollout state based on the flags and the +// optional configuration. +ConfigProto::Experimental::MlirBridgeRollout GetMlirBridgeRolloutState( + std::optional config_proto); + +// Appends the flag definitions associated with +// MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`. +// +// Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet. +void AppendMarkForCompilationPassFlags( + std::vector* flag_list); + +// Disables XLA compilation, forces it to return an error message instead. Can +// be used by a server to ensure that JIT compilation is opt-in. +void DisableXlaCompilation(); + +// Enables XLA compilation. Can be used with `DisableXlaCompilation` to +// enable/disable JIT compilation at different stages. +void EnableXlaCompilation(); + +// Returns `false` unless `DisableXlaCompilation` was called. +bool FailOnXlaCompilation(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/force_xla_constants_on_host_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/force_xla_constants_on_host_pass.h new file mode 100644 index 00000000..ae7cf149 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/force_xla_constants_on_host_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_FORCE_XLA_CONSTANTS_ON_HOST_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_FORCE_XLA_CONSTANTS_ON_HOST_PASS_H_ + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// An optimization pass which marks the constants which have to be resolved for +// XLA compilation with `_input_hostmem`. +class ForceXlaConstantsOnHostPass : public GraphOptimizationPass { + public: + ForceXlaConstantsOnHostPass() = default; + + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_FORCE_XLA_CONSTANTS_ON_HOST_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/get_compiler_ir.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/get_compiler_ir.h new file mode 100644 index 00000000..a4352d11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/get_compiler_ir.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ +#define TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +class ProcessFunctionLibraryRuntime; +class Device; +class Tensor; +class TensorHandle; +class EagerContext; + +enum class IrExportStage { + STABLEHLO, + STABLEHLO_SERIALIZED, + HLO, + HLO_NO_METADATA, + HLO_SERIALIZED, + OPTIMIZED_HLO, + OPTIMIZED_HLO_SERIALIZED, + OPTIMIZED_HLO_PROTO_SERIALIZED, + OPTIMIZED_HLO_DOT +}; + +struct ArgShapeAndDType { + TensorShape shape; + DataType dtype; +}; + +enum class CompilerArgSource { + TENSOR_SPEC, + CONCRETE_INPUT, +}; + +// Returns the IR format of the selected stage for a given function `func_name` +// using library runtime `runtime` on a device `dev` with given +// `inputs_arg_shape_and_dtype` and `input_handles`. +absl::StatusOr GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, Device* dev, EagerContext* context, + absl::Span input_arg_shape_and_dtype, + absl::Span input_handles, + CompilerArgSource compiler_arg_source); + +// Returns the IR format of the selected stage for a given function `func_name` +// using library runtime `runtime` on a platform `platform_name` with given +// `inputs_arg_shape_and_dtype` and `input_handles`. +absl::StatusOr GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, absl::string_view platform_name, + EagerContext* context, + absl::Span input_arg_shape_and_dtype, + absl::Span input_handles, + CompilerArgSource compiler_arg_source); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h new file mode 100644 index 00000000..23f54afe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Increases the amount of "dynamism" representable by XLA clusters by rewriting +// the TensorFlow graph. This pass does the following rewrites: +// +// Slice +// ----- +// +// Slice(op, begin, size ) => +// Slice(op, begin, actual_size(op.shape(), size, begin)); +// _XlaCompileTimeConstantInputs={2} +// +// where +// +// actual_size(op_shape, size, begin)[i] = +// size[i] == -1 ? (op_shape[i] - size[i]) +// : size[i] +// +// This pass, combined with jit/partially_decluster_pass, reduces the number of +// unnecessary cluster recompilations in some common cases. After the rewrite +// shown above jit/partially_decluster_pass extracts the actual_size(...) +// computation to outside the XLA cluster, causing the cluster to be versioned +// only on the actual size of the XlaDynamicSlice. This avoids recompilation +// due to superficial changes that don't affect tensor shapes. +// +// Future Work TODO(b/111210515) +// ----------------------------- +// +// In the future we will also translate StridedSlice and Pad a similar way. +class IncreaseDynamismForAutoJitPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/kernels/xla_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/kernels/xla_ops.h new file mode 100644 index 00000000..911b5cae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/kernels/xla_ops.h @@ -0,0 +1,140 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ +#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ + +#include + +#include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/stream_executor_util.h" + +namespace tensorflow { + + +// XlaLocalLaunchBase is almost the same as XlaLocalLaunchOp. +// The only difference is that it does not require arguments to follow +// the "constants, then regular args, then resources" order. +// It takes vectors of constant and resource arguments explicitly. +// It does not have corresponding OpDef because it is never present +// in the GraphDef. +// Currently, it is used by eager runtime. FunctionLibraryRuntime creates +// this kernel when asked to create a kernel for an XLA-compiled function. +// +// `has_ref_vars`: whether the input computation can have reference variables. +// TODO(cheshire): instead derive this information from the input graph. +class XlaLocalLaunchBase : public AsyncOpKernel { + public: + XlaLocalLaunchBase(OpKernelConstruction* ctx, + const std::vector& constants, + const std::vector& resources, + const NameAttrList& function, bool has_ref_vars); + XlaLocalLaunchBase(const XlaLocalLaunchBase&) = delete; + XlaLocalLaunchBase& operator=(const XlaLocalLaunchBase&) = delete; + ~XlaLocalLaunchBase() override = default; + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + protected: + // Indexes of compile-time constant inputs + const std::vector constants_; + // Indexes of resource inputs + const std::vector resources_; + + const NameAttrList function_; + const XlaPlatformInfo platform_info_; + + bool has_ref_vars_; +}; + +// XlaLocalLaunchOp is used to replace a region of the TensorFlow graph +// which will be compiled and executed using XLA. The XlaLocalLaunchOp is +// responsible for handling interactions with the TensorFlow executor. +// Once all inputs are present, and their shapes are known, the op can +// use a 'DeviceCompiler' to compile and execute code which is specific +// to the shapes of input Tensors. +// XlaLocalLaunchOp uses xla::LocalClient::Compile() and +// xla::LocalExecutable::Run(), and passes arguments into/out of XLA in device +// memory. +class XlaLocalLaunchOp : public XlaLocalLaunchBase { + public: + explicit XlaLocalLaunchOp(OpKernelConstruction* ctx); + ~XlaLocalLaunchOp() override; + + private: + XlaLocalLaunchOp(const XlaLocalLaunchOp&) = delete; + void operator=(const XlaLocalLaunchOp&) = delete; +}; + +class XlaCompileOp : public OpKernel { + public: + explicit XlaCompileOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + // Indexes of compile-time constant inputs + const std::vector constants_; + // Indexes of resource inputs + const std::vector resources_; + + const NameAttrList function_; + + XlaPlatformInfo platform_info_; + + const bool must_compile_; + + // Whether the graph has TF reference variables. 
+ const bool has_ref_vars_; + + // cannot_compile_cluster_ is set to true if XLA returns an Unimplemented + // error when compiling the cluster this _XlaCompile is supposed to compile. + // If `cannot_compile_cluster_` is true then we avoid compiling this cluster + // on any future calls to _XlaCompile. + bool cannot_compile_cluster_ TF_GUARDED_BY(cannot_compile_cluster_mu_) = + false; + + mutex cannot_compile_cluster_mu_; +}; + +class XlaRunOp : public OpKernel { + public: + explicit XlaRunOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + const XlaPlatformInfo platform_info_; +}; + +class XlaMergeOp : public OpKernel { + public: + explicit XlaMergeOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass.h new file mode 100644 index 00000000..558912f2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -0,0 +1,63 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An optimization passes that marks nodes that are to be compiled with +// attribute kXlaClusterAttr. Nodes with the same cluster ID will be compiled +// together. + +#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// The attribute that marks nodes to be grouped into functions by the +// encapsulate subgraphs pass. +extern const char* const kXlaClusterAttr; + +// Marks a subset of nodes in the graph which are to be clustered +// with an attribute _XlaCluster= so they are picked up by the +// EncapsulateSubgraphsPass. +class MarkForCompilationPass : public GraphOptimizationPass { + public: + MarkForCompilationPass() = default; + + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + private: + absl::Status RunForTest(const GraphOptimizationPassOptions& options, + bool disable_deadness_analysis, + bool deterministic_cluster_names); + + friend class MarkForCompilationPassTestHelper; +}; + +absl::flat_hash_map>* GetAllowlistTable(); + +namespace testing { +// DO NOT USE IN PRODUCTION. +// +// Resets some internal state to let us write reliable unit tests. +void ResetClusterSequenceNumber(); + +// Return a list of operation that we choose not to put into the allowlist. 
+absl::flat_hash_set GetKnownXLAAllowlistOp(); +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h new file mode 100644 index 00000000..84d24898 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h @@ -0,0 +1,85 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ +#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ + +#include +#include +#include + +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" + +namespace tensorflow { +class MarkForCompilationPassTestHelper { + public: + struct Options { + bool enable_global_jit; + bool disable_deadness_analysis; + bool enable_cluster_scoping; + bool deterministic_cluster_names; + std::string session_name; // ConfigProto.Experimental.SessionMetadata.name + + Options() + : enable_global_jit(true), + disable_deadness_analysis(true), + enable_cluster_scoping(true), + deterministic_cluster_names(false) {} + + Options WithNoGlobalJit() { + Options copy = *this; + copy.enable_global_jit = false; + return copy; + } + + Options WithDeadnessAnalysis() { + Options copy = *this; + copy.disable_deadness_analysis = false; + return copy; + } + + Options WithNoClusterScoping() { + Options copy = *this; + copy.enable_cluster_scoping = false; + return copy; + } + + Options WithDeterministicClusterNames() { + Options copy = *this; + copy.deterministic_cluster_names = true; + return copy; + } + + Options WithSessionName(std::string name) { + Options copy = *this; + copy.session_name = std::move(name); + return copy; + } + }; + + // Runs the MarkForCompilation pass on `graph` after assigning all nodes in + // `graph` to the CPU device. To make testing easier, ignores device + // registration and _XlaCompile attributes. + static absl::Status MarkForCompilation(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def, + Options options = Options()); + + // Like `MarkForCompilation` but creates `flib_def` from the op registry. + static absl::Status MarkForCompilation(std::unique_ptr* graph, + Options options = Options()); +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/node_matchers.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/node_matchers.h new file mode 100644 index 00000000..a0208680 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/node_matchers.h @@ -0,0 +1,251 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Provides a set of matchers for tensorflow nodes. +// +// Example usage: +// +// tensorflow::Node* node = ...; +// EXPECT_THAT(node, NodeWith(Name("name"), Op("op"), +// Inputs(Out(3, NodeWith(Name("input")))))) +// +// Matchable node properties (the expressions that go inside NodeWith(...)) +// are: +// +// - Name(string): matches the node name exactly. We will probably need to +// have this take a string matcher soon in the future. +// +// - Op(string): matches the op exactly. +// +// - AssignedDevice(string): matches the assigned device exactly. +// +// - Inputs(): matches the list of non-control inputs to the node +// exactly (i.e. does not match a suffix or a prefix) where each element +// matches an output of a node (see Out(idx, node) below). +// +// - CtrlDeps(): matches the list of control dependences on the +// node exactly but in any order. +// +// - ConstantValue(tensorflow::Input::Initializer init): matches a Const node +// with the constant value `init`. Implies Op("Const"). +// +// - Attr(name, value): Matches a single attribute with name `name` and value +// `value`. Right now only boolean values are supported. +// +// Overlapping node properties may not be repeated in a single NodeWith(...) +// matcher. E.g. NodeWith(Op("Foo"), Op("Bar")) will CHECK-fail. Since +// ConstantValue implies Op("Const"), a single NodeWith matcher can't have both +// ConstantValue(...) and Op(...). Multiple Attr() values can be combined as +// long as the attribute names are different. +// +// Out(idx, node) matches the `idx`'th output of a node that matches `node`. + +#ifndef TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_ +#define TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_ + +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/ops.h" +#include "xla/test.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +namespace testing { +namespace matchers { + +namespace impl { + +using OutEdge = std::pair; + +// ----------------------------------------------------------------------------- +// Implementation details. + +// Properties that we match on for a particular Node. If a particular property +// is nullopt then any value for it is allowed. 
+class NodeMatcherProperties { + public: + using NodeSeqMatcher = std::vector<::testing::Matcher>; + using InputSeqMatcher = std::vector<::testing::Matcher>; + using AttrKeyValuePair = std::pair>; + + const std::optional& name() const { return name_; } + const std::optional& op() const { return op_; } + const std::optional& assigned_device() const { + return assigned_device_; + } + const std::optional& constant_value() const { + return constant_value_; + } + const std::optional& inputs() const { + return input_matchers_; + } + const std::optional& control_deps() const { + return control_deps_; + } + const std::optional& attr() const { return attr_; } + + void set_name(string name) { + DCHECK(IsEmpty()); + name_ = std::move(name); + } + + void set_op(string op) { + DCHECK(IsEmpty()); + op_ = std::move(op); + } + + void set_assigned_device(string assigned_device) { + DCHECK(IsEmpty()); + assigned_device_ = std::move(assigned_device); + } + + void set_constant_value(Tensor constant_value) { + DCHECK(IsEmpty()); + constant_value_ = std::move(constant_value); + op_ = "Const"; + } + + void set_inputs(InputSeqMatcher inputs) { + DCHECK(IsEmpty()); + input_matchers_ = std::move(inputs); + } + + void set_control_deps(NodeSeqMatcher control_deps) { + DCHECK(IsEmpty()); + control_deps_ = std::move(control_deps); + } + + void set_attr(AttrKeyValuePair attr) { + DCHECK(IsEmpty()); + attr_ = std::move(attr); + } + + bool IsEmpty() const { + return !name().has_value() && !op().has_value() && !inputs().has_value() && + !control_deps().has_value() && !attr().has_value(); + } + + private: + std::optional name_; + std::optional op_; + std::optional assigned_device_; + std::optional constant_value_; + std::optional input_matchers_; + std::optional control_deps_; + std::optional attr_; +}; + +::testing::Matcher NodeWith( + absl::Span props); + +impl::NodeMatcherProperties Inputs( + absl::Span> inputs); + +impl::NodeMatcherProperties CtrlDeps( + absl::Span> control_deps); + +impl::NodeMatcherProperties Attr(std::pair attrs); +impl::NodeMatcherProperties Attr(string name); + +std::pair AttrLiteralHelper( + const std::pair& bool_attr); + +std::pair AttrLiteralHelper( + const std::pair>& int_list_attr); + +std::pair AttrLiteralHelper( + const std::pair>& string_list_attr); +} // namespace impl + +// ----------------------------------------------------------------------------- +// Public interface. + +// Matches a node with name `name`. +impl::NodeMatcherProperties Name(string name); + +// Matches a node with op `op`. +impl::NodeMatcherProperties Op(string op); + +// Matches a node with assigned device `assigned_device`. +impl::NodeMatcherProperties AssignedDevice(string assigned_device); + +// Matches a node with a boolean typed attribute named `name` and with value +// `value`. +template +impl::NodeMatcherProperties Attr(const string& name, ValueTy value) { + return impl::Attr({impl::AttrLiteralHelper({name, value})}); +} + +inline impl::NodeMatcherProperties Attr(const string& name) { + return impl::Attr(name); +} + +// Matches a node with inputs `inputs`. +// +// `inputs` are ordered; `inputs`[i] must match input i. +template +impl::NodeMatcherProperties Inputs(Ts... inputs) { + return impl::Inputs({inputs...}); +} + +// Matches the `idx`'th output of a node that matches `node`. +::testing::Matcher Out(int oidx, + ::testing::Matcher node); + +// Matches the first output of a node that matches `node`. 
+inline ::testing::Matcher Out( + ::testing::Matcher node) { + return Out(0, node); +} + +// Matches a node with control dependences `control_deps`. +// +// `control_deps` are unordered and will match the control deps of a node in any +// order. +template +impl::NodeMatcherProperties CtrlDeps(Ts... control_deps) { + return impl::CtrlDeps({control_deps...}); +} + +// Matches a constant node with value `val`. +impl::NodeMatcherProperties ConstantValue( + const ::tensorflow::Input::Initializer& val); + +// The main gmock matcher. See file comment for example usage. +template +::testing::Matcher NodeWith(Ts... args) { + std::array array = {args...}; + return impl::NodeWith(array); +} + +::testing::Matcher Const( + const ::tensorflow::Input::Initializer& val); +} // namespace matchers + +// If `g` has a node named `name` returns it, otherwise returns null. +Node* FindNodeByName(Graph* g, absl::string_view name); +} // namespace testing + +void PrintTo(const Node* n, ::std::ostream* os); +void PrintTo(Node* n, ::std::ostream* os); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/partially_decluster_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/partially_decluster_pass.h new file mode 100644 index 00000000..18b0091c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/partially_decluster_pass.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// Clones or moves nodes from within a cluster to outside the cluster if +// profitable. There are two reasons why we do this: +// +// - Reducing device-to-host copies. +// - Reducing the number of XLA recompilations. +class PartiallyDeclusterPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_base_device.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_base_device.h new file mode 100644 index 00000000..b2135745 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_base_device.h @@ -0,0 +1,112 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_base.h" + +namespace tensorflow { + +// tensorflow::PjRtBaseDevice replaces the deprecated tensorflow::XlaDevice. +// This accelerator agnostic device is mainly used to store metadata. +class PjRtBaseDevice : public LocalDevice { + public: + // Stores metadata about the PjRtBaseDevice. + class Metadata { + public: + Metadata(const DeviceType& jit_device_type, + std::vector + shape_determination_fns) + : jit_device_type_(jit_device_type), + shape_determination_fns_(std::move(shape_determination_fns)) {} + + // The index of the device on this host. + int device_ordinal() const; + + const DeviceType& jit_device_type() const { return jit_device_type_; } + const XlaShapeLayoutHelpers::ShapeDeterminationFns& + default_shape_determination_fns() const { + return shape_determination_fns_.at(0); + } + + const XlaShapeLayoutHelpers::ShapeDeterminationFns& + shape_determination_fns_at(int i) const { + return shape_determination_fns_[i]; + } + + private: + const DeviceType jit_device_type_; + std::vector + shape_determination_fns_; + + Metadata(const Metadata&) = delete; + void operator=(const Metadata&) = delete; + }; + + struct Options { + // The device name's prefix (e.g., "/task:7") + std::string device_name_prefix; + + // The name of the device (e.g., "TPU") + std::string device_name; + + // The index of the device. + int device_ordinal = -1; + + // The name of the compilation device, also referred to as jit_device_type. + // (e.g., "XLA_CPU_JIT"); + std::string compilation_device_name; + + // A vector of ShapeDeterminationFn (i.e., a bundle of LayoutSelectionFn, + // ShapeRepresentationFn). Each bundle describes how the on-host shapes of + // a) argument and return value, for entry computations b) variables, for + // all computations, should be represented in XLA. Parameters/return values + // will be shaped according to the function pair, and reshaped back to/from + // their declared shapes for computations. Must be non-empty. + std::vector + shape_determination_fns; + + Options(std::string device_name_prefix, std::string device_name, + int device_ordinal, std::string compilation_device_name, + std::vector + shape_determination_fns) + : device_name_prefix(device_name_prefix), + device_name(device_name), + device_ordinal(device_ordinal), + compilation_device_name(compilation_device_name), + shape_determination_fns(shape_determination_fns) {} + }; + + // Creates a new PJRT base device. + PjRtBaseDevice(const SessionOptions& session_options, const Options& options); + + static absl::StatusOr GetMetadataFromDevice( + DeviceBase* device); + + private: + // The metadata of this PjRtBaseDevice. 
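+  // An informal usage sketch (the `device` pointer below is hypothetical; the
+  // exact StatusOr payload type is the one declared for GetMetadataFromDevice
+  // above):
+  //
+  //   TF_ASSIGN_OR_RETURN(auto* metadata,
+  //                       PjRtBaseDevice::GetMetadataFromDevice(device));
+  //   const DeviceType& jit_type = metadata->jit_device_type();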
+ const Metadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_compile_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_compile_util.h new file mode 100644 index 00000000..11645651 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_compile_util.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_COMPILE_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_COMPILE_UTIL_H_ + +#include "tensorflow/compiler/jit/xla_compile_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Compiles a `function` to PjRtLoadedExecutable `executable` with `ctx`. +// The compilation result is output in `compilation_result`. The PJRT client +// used for compilation is output in `client`. The PJRT executable is output in +// `executable`. +absl::Status CompileToPjRtLoadedExecutable( + const OpKernelContext& ctx, const XlaPlatformInfo& platform_info, + const NameAttrList& function, + const std::vector& args, + DeviceCompileMode compile_mode, bool has_ref_vars, + bool may_alias_resource_update, + const XlaCompiler::CompilationResult** compilation_result, + xla::PjRtClient** client, xla::PjRtLoadedExecutable** executable); + +// Similar to the above function but it does not take a OpKernelContext. +// Instead, it takes the following arguments that are obtained from +// OpKernelContext in the above function. +// - `device`: the device used to compile the function. +// - `rm`: the resource manager for DeviceCompiler to store JIT-compiled XLA +// computation. +// - `flr`: the FunctionLibraryRuntime for the `function`. 
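+// An informal usage sketch of the overload declared below (error handling and
+// the setup of `device`, `platform_info`, `function`, `args`, `flr` and `rm`
+// are assumed to exist in the caller):
+//
+//   const XlaCompiler::CompilationResult* compilation_result = nullptr;
+//   xla::PjRtClient* client = nullptr;
+//   xla::PjRtLoadedExecutable* executable = nullptr;
+//   TF_RETURN_IF_ERROR(CompileToPjRtLoadedExecutable(
+//       device, platform_info, function, args, DeviceCompileMode::kStrict,
+//       /*has_ref_vars=*/true, /*may_alias_resource_update=*/true, flr, rm,
+//       &compilation_result, &client, &executable));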
+absl::Status CompileToPjRtLoadedExecutable( + const DeviceBase* device, const XlaPlatformInfo& platform_info, + const NameAttrList& function, + const std::vector& args, + DeviceCompileMode compile_mode, bool has_ref_vars, + bool may_alias_resource_update, FunctionLibraryRuntime* flr, + ResourceMgr* rm, const XlaCompiler::CompilationResult** compilation_result, + xla::PjRtClient** client, xla::PjRtLoadedExecutable** executable); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_COMPILE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_compiler_client.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_compiler_client.h new file mode 100644 index 00000000..8c590b57 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_compiler_client.h @@ -0,0 +1,85 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_COMPILER_CLIENT_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_COMPILER_CLIENT_H_ + +#include +#include +#include + +#include "tensorflow/compiler/jit/device_compiler_client.h" +#include "xla/pjrt/pjrt_client.h" + +namespace tensorflow { + +// Calls into PjRtClient to provide functionality for building, serializing and +// loading PjRtLoadedExecutables. +class PjRtDeviceCompilerClient + : public DeviceCompilerClient { + public: + explicit PjRtDeviceCompilerClient(xla::PjRtClient* client) + : client_(client) {} + + absl::StatusOr> BuildExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) override; + + // Returns a platform-specific serialization of `executable`. The + // serialization is not guaranteed to be stable over time. `executable` must + // have been produced by this client. + absl::StatusOr SerializeExecutable( + const xla::PjRtLoadedExecutable& executable) override; + + // PjRt doesn't support AOT compilation yet. Builds a PjRtLoadedExecutable and + // serializes it to string. + absl::StatusOr BuildSerializedExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) override; + + // Deserializes a serialized executable as produced by + // PjRtExecutable::SerializeExecutable(). `serialized_executable` must have + // been produced by a compiler of the same platform and version as this one. + // + // PjRt doesn't support AOT compilation yet. Loading a serialized executable + // is currently only implemented for TfrtTpuPjrtClient and hence, this + // function doesn't use PjRtClient::LoadSerializedExecutable() and uses + // PjRtClient::DeserializeExecutable() instead. + absl::StatusOr> LoadExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, + const std::string& serialized_executable) override; + + // No-op. PJRT uses futures and waiting for programs to finish isn't + // necessary. 
+ void WaitForProgramsToFinish() override; + + xla::PjRtClient* client() const override { return client_; } + + private: + xla::PjRtClient* const client_; + + PjRtDeviceCompilerClient(const PjRtDeviceCompilerClient&) = delete; + void operator=(const PjRtDeviceCompilerClient&) = delete; +}; + +// Generates CompileOptions for PJRT compilation. +xla::CompileOptions GetPjRtCompileOptions( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_COMPILER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_context.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_context.h new file mode 100644 index 00000000..7637d396 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_device_context.h @@ -0,0 +1,64 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_CONTEXT_H_ + +#include + +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Helper class for managing data transfers between host and accelerator +// devices using PjRt. +class PjRtDeviceContext : public DeviceContext { + public: + explicit PjRtDeviceContext( + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + bool use_pjrt_tensor_buffer = false) + : shape_determination_fns_(std::move(shape_determination_fns)), + use_pjrt_tensor_buffer_(use_pjrt_tensor_buffer) {} + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override; + + bool use_pjrt_tensor_buffer() const { return use_pjrt_tensor_buffer_; } + + private: + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns_; + // Note: we currently assume the PjRtBuffer is a PjRtStreamExecutorBuffer. 
+ bool use_pjrt_tensor_buffer_; +}; + +void PjRtDeviceToDeviceCopy(DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + int dev_to_dev_stream_index, StatusCallback done); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer.h new file mode 100644 index 00000000..0dd496c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer.h @@ -0,0 +1,57 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_H_ + +#include +#include + +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// PjRtTensorBuffer is derived from TensorBuffer, which holds a device memory +// pointer so that legacy TF kernel can access it directly. PjRtTensorBuffer +// also owns a PjRtBuffer for XLA kernel's usage. +class PjRtTensorBuffer : public TensorBuffer { + public: + PjRtTensorBuffer(const void* ptr, size_t expected_size, + std::unique_ptr pjrt_buffer) + : TensorBuffer(const_cast(ptr)), + expected_size_(expected_size), + pjrt_buffer_(std::move(pjrt_buffer)) {} + + size_t size() const override { return expected_size_; } + + TensorBuffer* root_buffer() override { return this; } + + xla::PjRtBuffer* pjrt_buffer() const { return pjrt_buffer_.get(); } + + // TODO(b/288965065): Implement this. + void FillAllocationDescription(AllocationDescription* proto) const override { + proto->set_requested_bytes(static_cast(expected_size_)); + } + + private: + size_t expected_size_; + std::unique_ptr pjrt_buffer_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer_util.h new file mode 100644 index 00000000..f73834b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/pjrt_tensor_buffer_util.h @@ -0,0 +1,56 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_UTIL_H_ + +#include + +#include "absl/status/statusor.h" +#include "tensorflow/compiler/jit/pjrt_tensor_buffer.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { + +// Takes the device memory pointer from the PjRtBuffer and create a Tensor that +// contains a PjRtTensorBuffer. The PjRtTensorBuffer holds the pointer to the +// device memory. It also owns the PjRtBuffer. +// +// TODO(b/289001822): Create a unit test to cover this function. +absl::StatusOr MakeTensorFromPjRtBuffer( + DataType dtype, const TensorShape& shape, + std::unique_ptr pjrt_buffer); + +// For TensorFlow internal use only. +class PjRtTensorBufferUtil { + public: + // Takes the device memory pointer from the PjRtBuffer and create a + // PjRtTensorBuffer. The PjRtTensorBuffer holds the pointer to the device + // memory. It also owns the PjRtBuffer. If output_tensor does not use + // PjRtTensorBuffer and the opaque device memory is the same, update the + // output_tensor->buf_ so that the same device memory will not be double-free. + // Otherwise a new Tensor will be created with the PjRtTensorBuffer. + // + // TODO(b/289001822): Create a unit test to cover this function. + static absl::Status UpdateOrMakeTensorWithPjRtBuffer( + DataType dtype, const TensorShape& shape, + std::unique_ptr pjrt_buffer, Tensor* output_tensor); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_TENSOR_BUFFER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/report_clustering_info_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/report_clustering_info_pass.h new file mode 100644 index 00000000..2ac67bf1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/report_clustering_info_pass.h @@ -0,0 +1,32 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_REPORT_CLUSTERING_INFO_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_REPORT_CLUSTERING_INFO_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// This is not really an optimization pass. It does not change the graph in any +// way; instead it computes a summary of the XLA clusters in the graph and +// broadcasts it via xla_activity_listener. 
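A pass like the one declared next is hooked into the graph optimization pipeline with the registration macro from optimization_registry.h. The following is only an illustrative sketch: the grouping and the phase number 50 are assumptions, not necessarily the values the upstream pass registers with.

#include "tensorflow/compiler/jit/report_clustering_info_pass.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"

namespace tensorflow {

// Illustrative registration; the real grouping and phase may differ.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 50,
                      ReportClusteringInfoPass);

}  // namespace tensorflow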
+class ReportClusteringInfoPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_REPORT_CLUSTERING_INFO_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/resource_operation_safety_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/resource_operation_safety_analysis.h new file mode 100644 index 00000000..eea18fb1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/resource_operation_safety_analysis.h @@ -0,0 +1,69 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ + +#include "xla/service/graphcycles/graphcycles.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// An XLA cluster hoists all resource reads to be beginning of the cluster +// execution and all the resource writes to the end. This means it cannot +// enforce arbitrary ordering dependencies (via control or data edges) between +// resource operations. Since all resource reads happen before all resource +// writes, edges constraining resource writes to happen before resource reads +// are problematic. This analysis returns the set of pairs of resource +// operations that cannot be put in the same cluster because XLA cannot respect +// the dependencies between them in the TensorFlow program. +// +// The restrictions are not transitive: it is fine to put A and C in the same +// cluster even if the returned set contains (A,B) and (B,C). +// +// In other words, if these pairs are seen as edges in an undirected graph of +// the nodes in `g` then auto-clustering is at least as constrained as the graph +// coloring problem on this graph. +// +// +// For instance if we auto-cluster all operations in this TensorFlow graph: +// +// AssignVariablepOp0 -> AssignVariableOp1 +// | +// v +// ReadVariableOp0 -> ReadVariableOp1 +// +// we will lose the AssignVariablepOp1 -> ReadVariableOp0. The ReadVariableOp0 +// -> ReadVariableOp1 and AssignVariableOp0 -> AssignVariableOp1 edges will be +// respected by XlaLaunchOp though because all reads happen before all writes +// with that limited clustering.. +// +// +// NB! The result computed by this analysis assumes that we don't auto-cluster +// back-edges (i.e. the edges from NextIteration to Merge). +// +// NB! The result computed by this analysis assumes that we don't auto-cluster +// functional control flow nodes containing resource operations. +// +// If `resource_ops_to_ignore` is set then nodes for which it returns true are +// ignored (we pretend these nodes are not resource operations). 
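A minimal sketch of invoking the analysis declared immediately below, assuming the template arguments elided in this header dump are std::function<bool(const Node&)> for `resource_ops_to_ignore` and std::vector<std::pair<int, int>> for `result`; `FindUnclusterablePairs` is a hypothetical wrapper and the graph is assumed to come from the caller.

#include <utility>
#include <vector>

#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {

absl::Status FindUnclusterablePairs(const Graph& graph,
                                    const FunctionLibraryDefinition* flib_def) {
  std::vector<std::pair<int, int>> incompatible_pairs;
  // Passing an empty std::function means no resource ops are ignored.
  TF_RETURN_IF_ERROR(ComputeIncompatibleResourceOperationPairs(
      graph, flib_def, /*resource_ops_to_ignore=*/{}, &incompatible_pairs));
  // Each pair names two resource operations that must not be placed in the
  // same XLA cluster, per the contract described in the comment above.
  return absl::OkStatus();
}

}  // namespace tensorflow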
+absl::Status ComputeIncompatibleResourceOperationPairs( + const Graph& g, const FunctionLibraryDefinition* flib_def, + const std::function& + resource_ops_to_ignore, + std::vector>* result); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference.h new file mode 100644 index 00000000..467ecb83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_ +#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +struct InferredShape { + // Shape of the argument tensor. + PartialTensorShape shape; + + // If the argument is a resource variable, the type and shape of the + // variable's value. + DataType handle_type = DT_INVALID; + PartialTensorShape handle_shape; +}; +typedef std::unordered_map> GraphShapeInfo; + +// Infer shapes for all Tensors in a graph, and save them in a map. The vector +// for a Node contains the information about each of its outputs. +// TODO(phawkins): this code does not infer accurate shapes for cyclic graphs. +// `arg_shapes`: user given map from the `index` to shapes of this +// node, where `index` is the `index` attribute of `_Arg` op or `_index` +// attribute of `Placeholder` op. +absl::Status InferShapes(Graph* graph, + const std::map& arg_shapes, + const tensorflow::FunctionLibraryDefinition* fnlib_def, + GraphShapeInfo* shape_info); + +// Merges two InferredShapes. Return an error if the two shapes cannot be +// merged. +absl::StatusOr MergeInferredShapes(const InferredShape& a, + const InferredShape& b); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference_helpers.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference_helpers.h new file mode 100644 index 00000000..d4c81954 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/shape_inference_helpers.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_ +#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_ + +#include + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Helper class to temporarily remove, then replace, the back edges in a +// graph. Simple algorithms for shape inference don't work with cycles, and this +// class can be used to remove cycles before running inference and replace them +// after. Correct usage requires exactly one call to Remove(), followed by any +// number of calls to RemovedEdges() and at most one call to Replace(). The call +// to Replace() is optional if the graph will be discarded without being +// executed, e.g., if it is being used purely for a shape inference pass. +class BackEdgeHelper { + public: + struct BackEdge { + const Edge* edge; + Node* src; + int src_output; + Node* dst; + int dst_input; + }; + + BackEdgeHelper() = default; + // Disallows copy and assign. + BackEdgeHelper(const BackEdgeHelper& other) = delete; + BackEdgeHelper& operator=(const BackEdgeHelper& other) = delete; + + // Temporarily removes all the back edges in graph. + absl::Status Remove(Graph* graph); + + // Gets the list of removed edges. + const std::vector& RemovedEdges() const; + + // Replaces the back edges removed by a prior call to Remove. + absl::Status Replace(); + + private: + Graph* graph_ = nullptr; // not owned + std::vector back_edges_; + // Set once Replace has been called. + bool replaced_ = false; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/test_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/test_util.h new file mode 100644 index 00000000..ec694662 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/test_util.h @@ -0,0 +1,89 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper functions for tests. 
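To illustrate the test helpers declared further down in this header, here is a hedged sketch of driving a graph optimization pass in a unit test; `MyOptimizationPass` is a hypothetical GraphOptimizationPass subclass and the graph construction is elided.

#include <memory>

#include "tensorflow/compiler/jit/test_util.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/graph/graph.h"

namespace tensorflow {

absl::Status RunPassForTest() {
  GraphOptimizationPassWrapper wrapper;
  auto graph = std::make_unique<Graph>(OpRegistry::Global());
  // ... populate `graph` with test nodes here ...

  GraphOptimizationPassOptions options =
      wrapper.CreateGraphOptimizationPassOptions(&graph);

  MyOptimizationPass pass;  // hypothetical pass under test
  return pass.Run(options);
}

}  // namespace tensorflow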
+ +#ifndef TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +// Tests that the shapes in 'shape_info' for the nodes in `graph` match +// `expected_shapes`. Returns an error if there are nodes in `expected_shapes` +// that do not have shape information. Ignores nodes in `graph` that do not have +// `expected_shapes` entries. +absl::Status ShapeAnnotationsMatch( + const Graph& graph, const GraphShapeInfo& shape_info, + std::map> expected_shapes); + +// A helper object to create GraphOptimizationPassOptions. +struct GraphOptimizationPassWrapper { + explicit GraphOptimizationPassWrapper() + : library(OpRegistry::Global(), FunctionDefLibrary()) { + session_options.env = Env::Default(); + } + + // Create GraphOptimizationPassOptions with a graph passed in constructor and + // sensible options. + GraphOptimizationPassOptions CreateGraphOptimizationPassOptions( + std::unique_ptr* graph) { + GraphOptimizationPassOptions options; + options.session_options = &session_options; + options.flib_def = &library; + options.graph = graph; + return options; + } + + FunctionLibraryDefinition library; + SessionOptions session_options; +}; + +// Helps set up devices for unit tests. +class DeviceSetup { + public: + void AddDevicesAndSetUp( + const std::vector& device_names, + const std::optional& fdef = std::nullopt); + Device* GetDevice(const string& device_name); + FunctionLibraryRuntime* flr() { return flr_; } + + private: + FunctionLibraryRuntime* flr_; + std::unique_ptr device_mgr_; + std::unique_ptr lib_def_; + std::unique_ptr pflr_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h new file mode 100644 index 00000000..4750803c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/auto_clustering_test_helper.h @@ -0,0 +1,71 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_TESTS_AUTO_CLUSTERING_TEST_HELPER_H_ +#define TENSORFLOW_COMPILER_JIT_TESTS_AUTO_CLUSTERING_TEST_HELPER_H_ + +#include "absl/status/statusor.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +// Helper to write integration tests and benchmarks for the auto-clustering pass +// pipeline. These tests run auto-clustering on a graphdef and compare a +// summary of the auto-clustering decisions with a "golden" summary. +// +// To create a new test from an TF workload first run the workload with the +// following environment variables set: +// +// TF_DUMP_GRAPH_PREFIX= +// TF_XLA_FLAGS="--tf_xla_clustering_debug" +// +// If auto-clustering is enabled this should produce files named +// before_mark_for_compilation_.pbtxt in the temporary directory. As the +// file name suggests, these are graphdefs that have been dumped right before +// the mark_for_compilation pass. There should be one +// before_mark_for_compilation_.pbtxt for every TF graph that was +// auto-clustered, out of which usually only one is the "main" graph that's +// running training/inference. +// +// Copy the pbtxt for that "main" graph to tensorflow/compiler/jit/tests/ +// (i.e. this directory) and create a corresponding empty .golden_summary file. +// Add the .pbtxt and .golden_summary files to the "data" section of the cc_test +// rule for :auto_clustering_test and then see the comment on update_golden on +// how to auto-generate the .golden_summary file. + +class AutoClusteringTest : public ::testing::Test { + protected: + absl::Status RunAutoClusteringTestWithPbtxt( + absl::string_view pbtxt_file_path, + absl::string_view golden_summary_file_path); + absl::Status RunAutoClusteringTestWithGzippedPbtxt( + absl::string_view gzipped_pbtxt_file_path, + absl::string_view golden_summary_file_path); + + private: + absl::Status RunAutoClusteringTestImpl( + GraphDef graphdef, absl::string_view golden_summary_file_path); +}; + +#if defined(PLATFORM_GOOGLE) +// Reads the GraphDef stored in graph_def_path (which must be a pbtxt file) and +// benchmarks MarkForCompilationPass on this graphdef. +absl::Status BenchmarkMarkForCompilation(absl::string_view graph_def_path, + benchmark::State& state); +#endif // PLATFORM_GOOGLE + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_TESTS_AUTO_CLUSTERING_TEST_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/device_compiler_test_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/device_compiler_test_helper.h new file mode 100644 index 00000000..58e0a034 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/tests/device_compiler_test_helper.h @@ -0,0 +1,104 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_TESTS_DEVICE_COMPILER_TEST_HELPER_H_ +#define TENSORFLOW_COMPILER_JIT_TESTS_DEVICE_COMPILER_TEST_HELPER_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/jit/xla_activity_listener.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +// A listener to inspect the use of XLA's persistent compilation cache entries. +class JitCompilationListener : public XlaActivityListener { + public: + absl::Status Listen( + const XlaAutoClusteringActivity& auto_clustering_activity) override { + return absl::OkStatus(); + } + + absl::Status Listen( + const XlaJitCompilationActivity& jit_compilation_activity) override { + activity_history_.push_back(jit_compilation_activity); + return absl::OkStatus(); + } + + absl::Status Listen( + const XlaOptimizationRemark& optimization_remark) override { + return absl::OkStatus(); + } + + ~JitCompilationListener() override = default; + + absl::Status VerifyPersistentCacheUseListenerHistory( + bool expect_persistent_cache_use) { + for (const auto& activity : activity_history_) { + if (activity.used_persistent_cache() != expect_persistent_cache_use) { + return absl::FailedPreconditionError("Unexpected listener history."); + } + } + return absl::OkStatus(); + } + + std::vector GetListenerHistory() { + return activity_history_; + } + + void ClearListenerHistory() { activity_history_.clear(); } + + private: + std::vector activity_history_; +}; + +// Fixture for testing XLA compilation cache serialization. +class DeviceCompilerSerializeTest : public ::testing::Test { + protected: + DeviceCompilerSerializeTest() { + auto listener = std::make_unique(); + listener_ = listener.get(); + RegisterXlaActivityListener(std::move(listener)); + } + + JitCompilationListener* listener() const { return listener_; } + + // Returns a test graph that will split into two XLA clusters (due to a node + // with _XlaCompile = false). + GraphDef GetTestGraph(const PartialTensorShape& input_shape); + + // Runs the graph using specified batch size both with and without XLA JIT + // compilation. Returns an error if the results between the two do not match. + absl::Status ExecuteWithBatch(const GraphDef& graph, int batch); + + // Adds the suffix "_altered" to the HLO module names of all of the persistent + // XLA compilation cache entries found at the specified directory. If none are + // found, returns NOT_FOUND error. + absl::Status AlterPersistentCacheEntryHloModuleNames( + absl::string_view persistent_cache_dir_path, + absl::string_view file_prefix = "xla_compile_cache"); + + private: + JitCompilationListener* listener_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_TESTS_DEVICE_COMPILER_TEST_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.h new file mode 100644 index 00000000..adc2a74e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.h @@ -0,0 +1,60 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_TF_GRAPH_TO_HLO_COMPILER_H_ +#define TENSORFLOW_COMPILER_JIT_TF_GRAPH_TO_HLO_COMPILER_H_ + +#include +#include + +#include "tensorflow/compiler/jit/tf_to_hlo_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" + +namespace tensorflow { + +class TfGraphToHloCompiler : public TfToHloCompiler { + public: + TfGraphToHloCompiler() = delete; + + explicit TfGraphToHloCompiler(const XlaCompiler::Options& options) + : xla_compiler_(options) {} + + // Compiles a Tensorflow `function` into an HloModuleProto stored in the + // XlaCompilationResult pointed to by `result` by calling + // XlaCompiler::CompileFunction. + absl::Status Compile(const XlaCompiler::CompileOptions& options, + const NameAttrList& function, + absl::Span args, + XlaCompilationResult* result) override; + + // Compiles a Tensorflow single op into an HloModuleProto stored in the + // XlaCompilationResult pointed to by `result` by calling + // XlaCompiler::CompileSingleOp. + absl::Status CompileSingleOp(const XlaCompiler::CompileOptions& options, + const OpKernelContext* ctx, + absl::Span args, + XlaCompilationResult* result) override; + + private: + XlaCompiler xla_compiler_; + + TfGraphToHloCompiler(const TfGraphToHloCompiler&) = delete; + void operator=(const TfGraphToHloCompiler&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_TF_GRAPH_TO_HLO_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_to_hlo_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_to_hlo_compiler.h new file mode 100644 index 00000000..f9937a65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/tf_to_hlo_compiler.h @@ -0,0 +1,52 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_TF_TO_HLO_COMPILER_H_ +#define TENSORFLOW_COMPILER_JIT_TF_TO_HLO_COMPILER_H_ + +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class TfToHloCompiler { + public: + TfToHloCompiler() = default; + virtual ~TfToHloCompiler() = default; + + // Compiles a Tensorflow `function` to an HloModuleProto stored in the + // XlaCompilationResult pointed to by `result`. 
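As a sketch of how the abstract interface declared below pairs with the concrete TfGraphToHloCompiler declared above: callers can own the concrete compiler through the base class and call Compile on it. This assumes the span element type elided in this header dump is XlaCompiler::Argument; the surrounding helper is hypothetical.

#include <memory>

#include "absl/types/span.h"
#include "tensorflow/compiler/jit/tf_graph_to_hlo_compiler.h"
#include "tensorflow/compiler/jit/tf_to_hlo_compiler.h"

namespace tensorflow {

absl::Status CompileFunctionToHlo(
    const XlaCompiler::Options& options,
    const XlaCompiler::CompileOptions& compile_options,
    const NameAttrList& function,
    absl::Span<const XlaCompiler::Argument> args,
    XlaCompilationResult* result) {
  // Hold the concrete compiler through the abstract interface.
  std::unique_ptr<TfToHloCompiler> compiler =
      std::make_unique<TfGraphToHloCompiler>(options);
  return compiler->Compile(compile_options, function, args, result);
}

}  // namespace tensorflow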
+ virtual absl::Status Compile(const XlaCompiler::CompileOptions& options, + const NameAttrList& function, + absl::Span args, + XlaCompilationResult* result) = 0; + + // Compiles a Tensorflow single op to an HloModuleProto stored in the + // XlaCompilationResult pointed to by `result`. + virtual absl::Status CompileSingleOp( + const XlaCompiler::CompileOptions& options, const OpKernelContext* ctx, + absl::Span args, XlaCompilationResult* result) = 0; + + private: + TfToHloCompiler(const TfToHloCompiler&) = delete; + void operator=(const TfToHloCompiler&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_TF_TO_HLO_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info.h new file mode 100644 index 00000000..9294c5e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info.h @@ -0,0 +1,95 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_H_ +#define TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// Information about the state of a variable passed as input to the _XlaCompile +// and _XlaRun operators. Unlocks the resource variable and decrements its +// refcount on destruction. +class VariableInfo { + public: + explicit VariableInfo(int index, absl::string_view name, Var* var, + const std::optional& + definition_stack_trace = std::nullopt); + VariableInfo(VariableInfo&& other); + + VariableInfo& operator=(VariableInfo&& other); + + VariableInfo(const VariableInfo&) = delete; + VariableInfo& operator=(const VariableInfo&) = delete; + + // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator. + // Note that the indices can be different between _XlaCompile and _XlaRun. + int index() const { return index_; } + + // A pointer to the resource variable. May be null if this VariableInfo is + // "empty", i.e. it does not track a resource variable. + Var* var() const { return var_; } + + // Returns the variable name. + absl::string_view name() const { return name_; } + + // Returns true if the resource variable lock was successfully acquired by + // this thread. + bool lock_held() const { return lock_held_; } + void set_lock_held() { lock_held_ = true; } + + // Returns true if the resource variable reader lock was successfully acquired + // by this thread. 
+ bool shared_lock_held() const { return shared_lock_held_; } + void set_shared_lock_held() { shared_lock_held_ = true; } + + bool read_only() const { return read_only_; } + void set_read_only() { read_only_ = true; } + + const std::optional& definition_stack_trace() const { + return definition_stack_trace_; + } + + ~VariableInfo(); + + private: + int index_; + std::string name_; + Var* var_; + std::optional definition_stack_trace_; + + // We can't use a optional here because it confuses the compiler's + // thread safety analysis. Instead we use a boolean flag and release the lock + // in the VariableInfo destructor. + bool lock_held_ = false; + bool shared_lock_held_ = false; + + // Whether this variable is going to be mutated. Left false if the caller + // doesn't provide this information. + bool read_only_ = false; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info_util.h new file mode 100644 index 00000000..ac825d14 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/variable_info_util.h @@ -0,0 +1,93 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// Snapshot of resource variables for a TF kernel invocation, mapping from +// parameter number to values at execution time. If the resource variable is not +// initialized, the value will not be present. +using ResourceVarsSnapshot = absl::flat_hash_map>; + +// Takes a snapshot of the values of resource variable arguments, whose indices +// are specified in `variable_indices` argument. We snapshot tensors that back +// resource variables since concurrent updates may modify the shape, and it is +// important that the shapes used for compilation match the true shapes of the +// buffers. +// +// We snapshot the entire set of resource variables as one atomic operation. +// This models Read->* dependencies between resource variable operations. See +// jit/resource_operation_safety_analysis for details. +absl::Status SnapshotResourceVariables( + OpKernelContext* ctx, absl::Span variable_indices, + absl::Span variable_infos, + ResourceVarsSnapshot* result); + +// Acquires the mutexes for all the variables in `variables` using a +// deadlock-safe protocol (acquire the mutexes in increasing-address order). 
+// +// `variables` is allowed to contain instances that don't track a resource +// variable (i.e. variables[i].var() can be null for some i). +// +// If the variable is read_only(), only acquires reader locks. +absl::Status LockVariables(absl::Span variables) + TF_EXCLUSIVE_LOCK_FUNCTION(); +absl::Status LockVariables(absl::Span variables) + TF_EXCLUSIVE_LOCK_FUNCTION(); + +// Returns a vector of VariableInfo instances for the resource variable inputs, +// given that *all* inputs are in `inputs`. The input indices for the resource +// variable inputs are in `variable_indices`. +// +// When using the VariableInfos generated by this version, all variables would +// be writer-locked. +absl::Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + std::vector* result); + +// variables_updated is a set containing the indices of the variables that are +// going to be mutated. If variables_updated is empty, then in LockVariables all +// variables would only be reader-locked. If variables_updated is null, then we +// consider this information unknown and will acquire writer-lock for all +// variables. +absl::Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + const std::set* variables_updated, + std::vector* result); + +std::vector GetResourceVariableIndicesFromContext(OpKernelContext* ctx); + +absl::Status CreateVariableInfoLookup( + absl::Span variable_args, + absl::flat_hash_map& variable_info_lookup); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_VARIABLE_INFO_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_activity_listener.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_activity_listener.h new file mode 100644 index 00000000..d8be8309 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_activity_listener.h @@ -0,0 +1,77 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_ACTIVITY_LISTENER_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_ACTIVITY_LISTENER_H_ + +#include + +#include "tensorflow/compiler/jit/xla_activity.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +// Broadcast `auto_clustering_activity` to all the registered listeners. +absl::Status BroadcastXlaActivity( + XlaAutoClusteringActivity auto_clustering_activity); + +// Broadcast `jit_compilation_activity` to all the registered listeners. +absl::Status BroadcastXlaActivity( + XlaJitCompilationActivity jit_compilation_activity); + +// Broadcast `jit_compilation_activity` to all the registered listeners. +absl::Status BroadcastOptimizationRemark( + XlaOptimizationRemark optimization_remark); + +// LINT.IfChange +// Called after TensorFlow realizes possible lost performance. 
The parameters in +// this should match all of the values in the XlaOptimizationRemark proto. +absl::Status BroadcastOptimizationRemark( + XlaOptimizationRemark::Warning optimization_warning, + string debug_information); + +// LINT.ThenChange(//tensorflow/compiler/jit/xla_activity.proto) + +// Various components of the system can subclass XlaActivityListener to +// notifications on auto-clustering and JIT compilation events. +// +// Subclasses of XlaActivityListener must be thread safe. +class XlaActivityListener { + public: + // Called after TensorFlow auto-clusters a graph. + virtual absl::Status Listen( + const XlaAutoClusteringActivity& auto_clustering_activity) = 0; + + // Called after TensorFlow JIT compiles an XLA cluster. + virtual absl::Status Listen( + const XlaJitCompilationActivity& jit_compilation_activity) = 0; + + // Called after TensorFlow realizes possible lost performance. + virtual absl::Status Listen( + const XlaOptimizationRemark& optimization_remark) = 0; + + // Called at program exit in best-effort manner to give listeners a chance to + // flush their state. + // + // Default implementation is a no-op. + virtual void Flush(); + + virtual ~XlaActivityListener(); +}; + +// Registers an `XlaActivityListener`, which will be invoked on all subsequent +// `BroadcastXlaActivity` calls. +void RegisterXlaActivityListener(std::unique_ptr listener); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_ACTIVITY_LISTENER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_cluster_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_cluster_util.h new file mode 100644 index 00000000..6fe0b485 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_cluster_util.h @@ -0,0 +1,113 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains utilities for clustering compilable graph nodes via XLA. + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/statusor.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/xla_activity.pb.h" +#include "xla/service/graphcycles/graphcycles.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/algorithm.h" + +namespace tensorflow { + +// The attribute that marks nodes to be grouped into functions by the +// encapsulate subgraphs pass. +extern const char* const kXlaClusterAttr; + +// The attribute that marks certain inputs to a Node as required to be a +// constant at compile time. If this attribute is present then the +// CompileTimeConstantInput information in the corresponding XlaOpKernel is +// ignored. 
+// +// The value for this attribute, if present, has to be a list of strings naming +// the inputs to the node that must be constant. +extern const char* const kXlaCompileTimeConstantInputsAttr; + +using OrderedNodeSet = std::set; + +// Returns true if `node` has a ref tensor input that it forwards to its output. +bool HasForwardedRefInput(const Node& node); + +// Creates a graph representation to enable cycle detection when clustering. +// This representation handles loops in graph by disconnecting each loop from +// the enclosing graph. +// +// Returns true for success and false for valid graphs that we can't handle yet +// (b/127521408). +absl::StatusOr CreateCycleDetectionGraph(const Graph* graph, + xla::GraphCycles* cycles); + +// Returns the XLA cluster in which `node` is placed if it is in an XLA cluster, +// otherwise returns nullopt. +std::optional GetXlaClusterForNode(const Node& node); + +// Removes `node_def` its XLA cluster (by clearing its _XlaCluster attribute). +void RemoveFromXlaCluster(NodeDef* node_def); + +// Removes `node` its XLA cluster (by clearing its _XlaCluster attribute). +void RemoveFromXlaCluster(Node* node); + +// Returns true if `node` has a DT_RESOURCE typed input or output. +bool HasResourceInputOrOutput(const Node& node); + +// Determines the global jit level based on GraphOptimizationPassOptions, +// --tf_xla_auto_jit and whether the graph is a single GPU graph. +OptimizerOptions::GlobalJitLevel GetGlobalJitLevelForGraph( + const GraphOptimizationPassOptions& options); + +// Returns true if `g` is a single-GPU graph. A single-GPU graph uses exactly +// one GPU (and any number of CPUs). +bool IsSingleGpuGraph(const Graph& g); + +// Returns true if it is possible (but not guaranteed) that `n` calls a +// function. +bool MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def); + +// Returns true if `node` an operator that consumes only the shape of its input, +// not the data itself. +bool IsShapeConsumerOp(const Node& node); + +// Computes a clustering summary for `graph`. See documentation on +// `XlaAutoClusteringSummary` for details. +XlaAutoClusteringSummary GetXlaAutoClusteringSummary(const Graph& graph); + +// Returns the set of nodes that have a path to or from nodes that may have ref +// variables as input or output. +// +// We assume each node has a trivial path to itself so the returned set includes +// all of the nodes that have ref variables as input or output. +absl::StatusOr> GetNodesRelatedToRefVariables( + const Graph& graph, FunctionLibraryRuntime* lib_runtime); + +// Deterministically serialized the graph to a byte string. +absl::StatusOr SerializeGraphDeterministic(const Graph& graph); + +// Computes a fingerprint of the given `graph`. The fingerprint can use used to +// check if two graphs are likely the same but should not be relied on +// determining if the graphs are identical. +absl::StatusOr FingerprintGraph(const Graph& graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_on_demand_op.h new file mode 100644 index 00000000..dfe9ddaa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -0,0 +1,77 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// The XlaCompileOnDemandOp is an OpKernel that, when its Compute method is +// called, will generate an xla::Computation and run it asynchronously. + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ + +#include + +#include "tensorflow/compiler/jit/device_compilation_profiler.h" +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/compiler/jit/variable_info_util.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// An OpKernel that compiles an op to an XLA computation and runs it. Unlike +// XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a +// vanilla TensorFlow op as long as the bridge supports it. +class XlaCompileOnDemandOp : public OpKernel { + public: + explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) + : OpKernel(ctx), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {} + void Compute(OpKernelContext* ctx) override; + + private: + absl::Status Compile(const std::vector& args, + OpKernelContext* ctx, + DeviceCompiler** + xla_device_compiler, + DeviceCompilationProfiler** profiler, + const XlaCompiler::CompilationResult** result, + xla::LocalExecutable** executable); + + absl::Status Compile(const std::vector& args, + OpKernelContext* ctx, + DeviceCompiler** pjrt_device_compiler, + DeviceCompilationProfiler** profiler, + const XlaCompiler::CompilationResult** result, + xla::PjRtLoadedExecutable** executable); + + absl::Status Run(const ResourceVarsSnapshot& variable_args, + const XlaCompiler::CompilationResult* result, + const DeviceCompiler* + xla_device_compiler, + xla::LocalExecutable* executable, OpKernelContext* ctx); + + const XlaPlatformInfo platform_info_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_util.h new file mode 100644 index 00000000..d722ba8e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compile_util.h @@ -0,0 +1,67 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ + +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// The number of compiler threads to use for asynchronous device compilation. +inline constexpr int64_t kNumAsyncDeviceCompilerThreads = 10; + +enum class DeviceCompileMode { + kLazy, + kStrict, + kAsync, +}; + +enum class DeviceCompileState { + kUncompiled, + kCompiling, + kCompiled, +}; + +// Creates a single-node graph using the specified `node_def` as the only op +// apart from the arg and retval nodes corresponding to `args` and +// `result_types` respectively. +absl::StatusOr> CreateSingleOpGraph( + const NodeDef& node_def, absl::Span args, + absl::Span result_types); + +// Checks if single device compilation and execution with PJRT is enabled for +// `device_type` in either the XlaLaunch op or the XlaCompileOnDemand op. +bool UsePjRtForSingleDeviceCompilation(const DeviceType& device_type); + +// Gets the resource name of the PjRt DeviceCompiler for `device_type`. +std::string GetPjRtDeviceCompilerResourceName(const DeviceType& device_type); + +// Gets the resource name of the DeviceCompilationProfiler for `device_type` +// when PjRt is used for compilation and execution. +std::string GetPjRtDeviceCompilationProfilerResourceName( + const DeviceType& device_type); + +// Gets the ResourceMgr where the DeviceCompiler is/should be stored for the +// given `device_type`. +absl::StatusOr GetResourceMgrForDeviceCompiler( + const OpKernelContext& ctx, const DeviceType& device_type); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compiler_options_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compiler_options_util.h new file mode 100644 index 00000000..23cb5f86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_compiler_options_util.h @@ -0,0 +1,64 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILER_OPTIONS_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_COMPILER_OPTIONS_UTIL_H_ + +#include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/pjrt_client.h" + +namespace tensorflow { + +// Returns created options for the XLA compiler. +XlaCompiler::Options GenerateCompilerOptions( + const DeviceCompiler& + xla_device_compiler, + const FunctionLibraryRuntime& function_library, DeviceBase* device, + se::Stream* stream, const XlaPlatformInfo& platform_info, + bool has_ref_vars); + +// Returns created options for XLA compiler when TFRT-TPU is used. +XlaCompiler::Options GenerateCompilerOptionsForTfrtTpu( + const DeviceCompiler& + xla_device_compiler, + const FunctionLibraryRuntime& function_library); + +// Returns created options for XLA compiler when PjRt (Device API) is used for +// compilation and execution. +XlaCompiler::Options GenerateCompilerOptionsForPjRt( + const FunctionLibraryRuntime& function_library, + const DeviceBase* device_base, const XlaPlatformInfo& platform_info, + const DeviceCompiler* + pjrt_device_compiler); + +// Returns created options for XLA compiler when PjRt (Device API) is used for +// compilation and execution. +XlaCompiler::Options GenerateCompilerOptionsForPjRt( + const FunctionLibraryDefinition* function_library_def, + int graph_def_version, const DeviceBase* device_base, + const XlaPlatformInfo& platform_info, + const DeviceCompiler* + pjrt_device_compiler); + +// Returns created CompileOptions for XLA compiler. +XlaCompiler::CompileOptions GenerateCompileOptions( + bool has_ref_vars, bool may_alias_resource_update); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_COMPILER_OPTIONS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device.h new file mode 100644 index 00000000..877d208d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device.h @@ -0,0 +1,321 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// The XlaDevice executes a TensorFlow graph using the XLA linear algebra +// runtime. +// +// Operators assigned to an XlaDevice are compiled into XLA computations. +// Tensors on an XlaDevice are thin wrappers around XLA ScopedShapedBuffers. +// +// XlaDevice is instantiated separately for each XLA backend (e.g., CPU or GPU), +// under different names (e.g., XLA_CPU or XLA_GPU). 
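As an illustration of how a backend wires this up, the sketch below fills in the `XlaDevice::Options` struct declared later in this header and constructs a device from it. It is not part of this patch: the device names, the caller-supplied `platform` and `session_options`, and the single default `ShapeDeterminationFns` entry are placeholder assumptions.

```cpp
// Hedged sketch only; "XLA_EXAMPLE"/"XLA_EXAMPLE_JIT" are hypothetical names.
#include <memory>

#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/core/public/session_options.h"

namespace tensorflow {

std::unique_ptr<XlaDevice> MakeExampleXlaDevice(
    const SessionOptions& session_options, se::Platform* platform) {
  XlaDevice::Options options;
  options.platform = platform;  // Not owned; must be non-null.
  options.device_name_prefix = "/job:localhost/replica:0/task:0";
  options.device_name = "XLA_EXAMPLE";
  options.device_ordinal = 0;
  options.compilation_device_name = "XLA_EXAMPLE_JIT";
  options.use_multiple_streams = false;
  // Must be non-empty; a single default-constructed bundle is the simplest case.
  options.shape_determination_fns = {
      XlaShapeLayoutHelpers::ShapeDeterminationFns{}};
  return std::make_unique<XlaDevice>(session_options, options);
}

}  // namespace tensorflow
```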
+ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/client/local_client.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/tfrt/common/async_value_tensor.h" + +namespace tensorflow { + +class XlaDevice : public LocalDevice { + public: + // Given a tensor, sets `xla::Shape*` the shape of tensor's representation + // on device, fully padded. On error, the contents of `xla::Shape*` + // are undefined. + typedef std::function PaddedShapeFn; + + // Wrapper class to store metadata about the XlaDevice, where it can be + // retrieved e.g., when lazily creating the XlaCompilationCache device. + class Metadata { + public: + Metadata(int device_ordinal, se::Platform* platform, + const DeviceType& device_type, + std::vector + shape_determination_fns, + PaddedShapeFn padded_shape_fn, bool use_multiple_streams); + + // The index of the device on this host. + int device_ordinal() const; + + se::Platform* platform() const; + xla::LocalClient* client() const; + const DeviceType& jit_device_type() const; + const XlaShapeLayoutHelpers::ShapeDeterminationFns& + default_shape_determination_fns() const { + return shape_determination_fns_.at(0); + } + const PaddedShapeFn& padded_shape_fn() const { return padded_shape_fn_; } + + bool UseMultipleStreams() const { return use_multiple_streams_; } + + private: + const int device_ordinal_; + const DeviceType device_type_; + se::Platform* platform_; // Not owned. + std::vector + shape_determination_fns_; + PaddedShapeFn padded_shape_fn_; + const bool use_multiple_streams_; + + Metadata(const Metadata&) = delete; + void operator=(const Metadata&) = delete; + }; + + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by `ctx`. + static absl::Status GetMetadata(OpKernelContext* ctx, + const Metadata** metadata); + + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by `ctx`. + static absl::Status GetMetadata(OpKernelConstruction* ctx, + const Metadata** metadata); + + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by + // `device`. + static absl::Status GetMetadataFromDevice( + DeviceBase* device, const XlaDevice::Metadata** metadata); + + struct Options { + // The StreamExecutor platform. Not owned. Must be non-null. + se::Platform* platform = nullptr; + + // The device name's prefix (e.g., "/task:7") + string device_name_prefix; + + // The name of the XLA device (e.g., "XLA_CPU") + string device_name; + + // The number of the device. 
+ int device_ordinal = -1; + + // The name of the compilation device (e.g., "XLA_CPU_JIT"); + string compilation_device_name; + + // If 'use_multiple_streams' is true, we create separate streams for + // compute, host-to-device, and device-to-host communication. + bool use_multiple_streams = false; + + // If true, the XLA devices with the same device ordinal will share the same + // compute stream. Otherwise each XLA device will having their own compute + // streams. + bool use_global_compute_stream = false; + + // A vector of ShapeDeterminationFn (i.e., a bundle of LayoutSelectionFn, + // ShapeRepresentationFn). Each bundle describes how the on-host shapes of + // a) argument and return value, for entry computations b) variables, for + // all computations, should be represented in XLA. Parameters/return values + // will be shaped according to the function pair, and reshaped back to/from + // their declared shapes for computations. Must be non-empty. + std::vector + shape_determination_fns; + + // If padded_shape_fn is empty, a default implementation that returns + // the logical on-device shape without padding is used. + PaddedShapeFn padded_shape_fn; + + // Set of devices to use. This controls which of the devices on the given + // platform will have resources allocated. For GPUs this will be + // filled from visible_gpu_devices list from session configuration. + std::optional> allowed_devices; + }; + + // Creates a new XLA Device. + XlaDevice(const SessionOptions& session_options, const Options& options); + ~XlaDevice() override; + + Allocator* GetAllocator(AllocatorAttributes attr) override + TF_LOCKS_EXCLUDED(mu_); + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + absl::Status Sync() override; + + absl::Status TryGetDeviceContext(DeviceContext** out_context) override + TF_LOCKS_EXCLUDED(mu_); + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override + TF_LOCKS_EXCLUDED(mu_); + + absl::Status MakeTensorFromProto(DeviceContext* device_context, + const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor); + + const Metadata& metadata() { return xla_metadata_; } + + // Ensures the DeviceContext associated with this XlaDevice is created and + // valid (i.e. all streams are ok). If any state is not valid, a new + // DeviceContext will be created. + // + // TODO(b/111859745): The Eager context needs to call this method to recover + // from failures. + absl::Status EnsureDeviceContextOk() TF_LOCKS_EXCLUDED(mu_); + + // Two convenient methods to get the underlying device context. + // Get the default device context, created by the first + // shape_representation_fn. + absl::StatusOr GetDeviceContextDefault(); + // Get the device context given the index. + absl::StatusOr GetDeviceContextWithIndex(int index); + + // Instructs this XlaDevice to set a AcceleratorDeviceInfo, which holds extra + // information for GPU and TPU devices. + absl::Status UseAcceleratorDeviceInfo() TF_LOCKS_EXCLUDED(mu_); + + // Instructs this XlaDevice to return 'sync_on_completion' for + // AllowsSyncOnCompletion(). + void SetAllowsSyncOnCompletion(bool sync_on_completion) + TF_LOCKS_EXCLUDED(mu_); + bool AllowsSyncOnCompletion() const override TF_LOCKS_EXCLUDED(mu_); + + // Installs an error handling callback when RefreshStatus sees !status.ok(). 
+ void SetHandleDeviceErrorCallback(std::function callback); + + absl::Status RefreshStatus() override TF_LOCKS_EXCLUDED(mu_); + + private: + absl::StatusOr GetOrCreateClient() const; + Allocator* GetAllocatorLocked(AllocatorAttributes attr) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + absl::Status EnsureStreamOkLocked(xla::Backend* backend, const string& name, + std::shared_ptr* stream, + bool* stream_was_changed) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Return a vector of device context, ordered by the sequence in the given + // shape_representation_fns. + absl::StatusOr> GetDeviceContextLocked() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Handles error when RefreshStatus sees !status.ok(). + absl::Status HandleDeviceError(); + + mutable mutex mu_; + // The metadata of this XlaDevice. + const Metadata xla_metadata_; + // Which hardware device in the client's platform this XlaDevice controls. + const int device_ordinal_; + // The name/type of this XlaDevice. eg. "XLA_GPU". + const DeviceType device_name_; + // The name of the device that is used to compile Ops for this XlaDevice. + const DeviceType jit_device_name_; + // The platform for this device. + se::Platform* const platform_; // Not owned. + // Intra-op threads to spawn (from SessionOptions). + const int intra_op_parallelism_threads_; + // Memory allocator associated with this device. + Allocator* xla_allocator_ TF_GUARDED_BY(mu_) = nullptr; // Not owned. + std::unique_ptr pjrt_allocator_ TF_GUARDED_BY(mu_); + + // Stream associated with this device. Operations enqueued on this + // stream are executed on the device. Operations include data + // copying back and forth between CPU and the device, and + // computations enqueued by XLA. + std::shared_ptr stream_ TF_GUARDED_BY(mu_); + // If false, only stream_ is valid and all computation and transfers use + // stream_. If true, computation is performed by stream_ and transfers are + // performed by host_to_device/device_to_device stream or borrowing a stream + // for each device to host transfer. + const bool use_multiple_streams_; + // If use_multiple_streams_, host to device transfers are performed using this + // stream. + std::shared_ptr host_to_device_stream_ TF_GUARDED_BY(mu_); + // If use_multiple_streams_, transfers between different devices are performed + // using these streams. + std::vector> device_to_device_streams_ + TF_GUARDED_BY(mu_); + + // See comments in options. + std::vector + shape_determination_fns_; + + // A list of the device context accessed by all users of the XlaDevice, set by + // calls to EnsureDeviceContextOk. The number of device conetexts is based on + // the number of shape representation functions in XlaDevice::Options. If + // accelerator_device_info_ is non-null, this pointer is also filled in to + // that struct. DeviceContext is a ref-counted object. + std::vector device_contexts_ TF_GUARDED_BY(mu_); + + // Holds extra information for GPU and TPU devices, e.g. the device context. + bool use_accelerator_device_info_ TF_GUARDED_BY(mu_) = false; + std::unique_ptr accelerator_device_info_ + TF_GUARDED_BY(mu_); + + // Thread pool used for running closures + std::unique_ptr thread_pool_; + + // True if the device allows XlaDevice::Sync to be called on completion + // regardless of status. + bool sync_on_completion_ TF_GUARDED_BY(mu_) = true; + + // A callback that will be invoked when RefreshStatus sees a status error. + std::function device_error_callback_ TF_GUARDED_BY(mu_); + + // Set of devices to use. 
This controls which of the devices on the given + // platform will have resources allocated. For GPUs this will be + // filled from visible_gpu_devices list from session configuration. + std::optional> allowed_devices_; + + const bool use_global_compute_stream_; + + // A static vector with device_ordinal as its index, describing the global + // compute streams used in each XLA device. It is only used if + // `use_global_compute_stream` in `XlaDevice::Options` is set to true. + static mutex global_mu_; + static std::vector>* global_compute_streams_ + TF_GUARDED_BY(global_mu_); +}; + +// Builds OpKernel registrations on 'device' for the JIT operators +// registered on 'jit_device'. Returns ownership of a XlaDeviceOpRegistrations +// object that encapsulates the kernel registrations. +struct XlaDeviceOpRegistrations { + std::vector> + op_kernel_registrars; +}; + +XlaDeviceOpRegistrations* RegisterXlaDeviceKernels( + const char* device, const char* jit_device, + OpKernel* (*factory)(OpKernelConstruction*), + absl::string_view kernel_class_name); + +XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, + const char* jit_device); + +absl::Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_compiler_client.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_compiler_client.h new file mode 100644 index 00000000..3967897c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_compiler_client.h @@ -0,0 +1,69 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_COMPILER_CLIENT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_COMPILER_CLIENT_H_ + +#include +#include +#include + +#include "tensorflow/compiler/jit/device_compiler_client.h" +#include "xla/client/local_client.h" + +namespace tensorflow { + +class XlaDeviceCompilerClient + : public DeviceCompilerClient { + public: + explicit XlaDeviceCompilerClient(xla::LocalClient* client) + : client_(client) {} + + absl::StatusOr> BuildExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) override; + + // Returns a serialized AOT result obtained by exporting the available + // `executable` using the XlaCompiler. + absl::StatusOr SerializeExecutable( + const xla::LocalExecutable& executable) override; + + // Returns a serialized AOT result obtained by compiling `result` into an AOT + // result. + absl::StatusOr BuildSerializedExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result) override; + + // Loads a serialized AOT result (`serialized_executable`) into an + // xla::LocalExecutable and returns it. 
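A hedged round-trip sketch, not taken from this patch: `options` and `result` are assumed to be outputs of an earlier XlaCompiler compilation, and the blob produced by `BuildSerializedExecutable` is fed straight back through `LoadExecutable`, declared next.

```cpp
// Hedged sketch: serialize an AOT result and load it back as an executable.
#include <memory>
#include <string>

#include "tensorflow/compiler/jit/xla_device_compiler_client.h"
#include "tensorflow/core/platform/statusor.h"  // TF_ASSIGN_OR_RETURN

namespace tensorflow {

absl::Status RoundTripExample(XlaDeviceCompilerClient& compiler_client,
                              const XlaCompiler::Options& options,
                              const XlaCompiler::CompilationResult& result) {
  // Compile directly to a serialized AOT blob...
  TF_ASSIGN_OR_RETURN(std::string serialized,
                      compiler_client.BuildSerializedExecutable(options, result));
  // ...then turn the blob back into something the local client can run.
  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::LocalExecutable> executable,
                      compiler_client.LoadExecutable(options, result, serialized));
  return executable ? absl::OkStatus()
                    : absl::InternalError("load returned null executable");
}

}  // namespace tensorflow
```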
+ absl::StatusOr> LoadExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, + const std::string& serialized_executable) override; + + void WaitForProgramsToFinish() override; + + xla::LocalClient* client() const override { return client_; } + + private: + xla::LocalClient* const client_; + + XlaDeviceCompilerClient(const XlaDeviceCompilerClient&) = delete; + void operator=(const XlaDeviceCompilerClient&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_COMPILER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_context.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_context.h new file mode 100644 index 00000000..4e8a769e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_context.h @@ -0,0 +1,128 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_CONTEXT_H_ + +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/global_data.h" +#include "xla/client/local_client.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// The allocator used for Tensors assigned to the XLA device. The allocator +// ignores the alignment and size of the request and always returns a new, +// empty, XlaTensor. +class XlaDeviceAllocator : public Allocator { + public: + XlaDeviceAllocator(se::StreamExecutor* stream_executor); + ~XlaDeviceAllocator() override; + + string Name() override; + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + std::optional GetStats() override; + bool ClearStats() override; + + private: + // The stream executor of the device. + se::StreamExecutor* stream_executor_; +}; + +// Helper class for managing data transfers between host and XLA devices. 
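A hedged sketch of a single host-to-device transfer through the context declared below; the `device_context`, `device`, and tensor objects are assumed to be created by the surrounding runtime, and the notification is only there to make the asynchronous callback easy to wait on.

```cpp
// Hedged sketch: copy one host tensor to the device and wait for completion.
#include "absl/synchronization/notification.h"
#include "tensorflow/compiler/jit/xla_device_context.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

void CopyToDeviceExample(XlaDeviceContext* device_context, Device* device,
                         const Tensor& cpu_tensor, Tensor* device_tensor) {
  absl::Notification done;
  device_context->CopyCPUTensorToDevice(
      &cpu_tensor, device, device_tensor,
      [&done](const absl::Status& s) {
        if (!s.ok()) LOG(ERROR) << "host->device copy failed: " << s;
        done.Notify();
      },
      /*sync_dst_compute=*/true);
  done.WaitForNotification();  // The callback fires once the copy has completed.
}

}  // namespace tensorflow
```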
+class XlaDeviceContext : public DeviceContext { + public: + explicit XlaDeviceContext( + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, + std::vector> device_to_device_streams, + xla::LocalClient* client, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + thread::ThreadPool* thread_pool); + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override; + + xla::LocalClient* client() const { return client_; } + se::Stream* stream() const override { return stream_.get(); } + se::Stream* host_to_device_stream() const { + return host_to_device_stream_.get(); + } + se::Stream* device_to_device_stream(int index) const { + return device_to_device_streams_.at(index).get(); + } + xla::TransferManager* transfer_manager() const { return transfer_manager_; } + const XlaShapeLayoutHelpers::ShapeDeterminationFns& shape_determination_fns() + const { + return shape_determination_fns_; + } + + // Returns a device-to-device stream, in round-robin fashion. + se::Stream* GetDeviceToDeviceStream(); + + absl::Status ThenExecute(Device* device, stream_executor::Stream* stream, + std::function func) override; + + private: + bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; } + + // The main compute stream of the device, used to synchronize the transfer + // streams if they are set. + std::shared_ptr stream_; + // The stream to use for transferring data from host to device. Can be + // idential to stream_, but must not be nullptr. + std::shared_ptr host_to_device_stream_; + // The stream to use for transferring data from device to host. Can be + // idential to stream_. If nullptr, borrow a stream from backend for each + // transfer request to support out-of-order requests. + std::shared_ptr device_to_host_stream_; + // Streams to use for transferring data directly between different devices, + // e.g., over NVLINK. + std::vector> device_to_device_streams_; + + // For the underlying memory allocator and XLA's TransferManager. + xla::LocalClient* client_; + // Transfer manager, for marshalling data to and from the device. + xla::TransferManager* transfer_manager_; + + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns_; + + // Thread pool used for running closures + thread::ThreadPool* thread_pool_; + + absl::Mutex mu_; + int next_stream_ TF_GUARDED_BY(mu_) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_ops.h new file mode 100644 index 00000000..fdb28446 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_device_ops.h @@ -0,0 +1,256 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Common kernel registrations for XLA devices. + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/constant_op.h" +#include "tensorflow/core/kernels/data/finalize_dataset_op.h" +#include "tensorflow/core/kernels/data/generator_dataset_op.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/kernels/data/optional_ops.h" +#include "tensorflow/core/kernels/data/options_dataset_op.h" +#include "tensorflow/core/kernels/data/prefetch_dataset_op.h" +#include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/function_ops.h" +#include "tensorflow/core/kernels/identity_op.h" +#include "tensorflow/core/kernels/resource_variable_ops.h" +#include "tensorflow/core/kernels/shape_ops.h" +#include "tensorflow/core/kernels/variable_ops.h" + +namespace tensorflow { + +// Dummy OpKernel, used for kernels assigned to an XLA device that should be +// compiled. Should never be called at runtime since such ops should be +// rewritten to a XlaLaunch op. If it is called, it means the placer placed an +// operator on an XLA device but the compiler did not compile it. 
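For context on how the macros defined further down are consumed, here is a hedged sketch of a backend registration file. The device name, the type list, and the use of `XlaLocalLaunchOp` as the launch kernel are placeholder assumptions, not something this patch adds.

```cpp
// Hedged sketch: a hypothetical backend expanding the registration macros
// declared below. DEVICE_XLA_EXAMPLE and kExampleTypes are made-up names.
#include <array>

#include "tensorflow/compiler/jit/kernels/xla_ops.h"  // XlaLocalLaunchOp (assumed)
#include "tensorflow/compiler/jit/xla_device_ops.h"

namespace tensorflow {

const char* const DEVICE_XLA_EXAMPLE = "XLA_EXAMPLE";
constexpr std::array<DataType, 2> kExampleTypes = {DT_FLOAT, DT_INT32};

REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXAMPLE, XlaLocalLaunchOp, kExampleTypes);
REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXAMPLE, kExampleTypes);

}  // namespace tensorflow
```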
+class XlaDeviceDummyOp : public OpKernel { + public: + explicit XlaDeviceDummyOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; +}; + +class XlaAssignVariableOp : public OpKernel { + public: + explicit XlaAssignVariableOp(OpKernelConstruction* c); + void Compute(OpKernelContext* context) override; + + private: + DataType dtype_; +}; + +#define REGISTER_XLA_LAUNCH_KERNEL(DEVICE, KERNEL, TYPES) \ + REGISTER_KERNEL_BUILDER(Name("XlaLaunch") \ + .Device(DEVICE) \ + .HostMemory("constants") \ + .HostMemory("resources"), \ + KERNEL); + +#define REGISTER_XLA_COMPILE_KERNEL(DEVICE, KERNEL, TYPES) \ + REGISTER_KERNEL_BUILDER(Name("_XlaCompile") \ + .Device(DEVICE) \ + .HostMemory("constants") \ + .HostMemory("key") \ + .HostMemory("compilation_successful") \ + .HostMemory("resources"), \ + KERNEL); + +#define REGISTER_XLA_RUN_KERNEL(DEVICE, KERNEL, TYPES) \ + REGISTER_KERNEL_BUILDER(Name("_XlaRun").Device(DEVICE), KERNEL); + +#define REGISTER_XLA_DEVICE_KERNELS(DEVICE, TYPES) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const").Device(DEVICE).TypeConstraint("dtype", TYPES), \ + ConstantOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("VarHandleOp").Device(DEVICE).HostMemory("resource"), VarHandleOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_VarHandlesOp").Device(DEVICE).HostMemory("resources"), \ + ResourceHandlesOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ReadVariableOp").Device(DEVICE).HostMemory("resource"), \ + ReadVariableOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_ReadVariablesOp").Device(DEVICE).HostMemory("resources"), \ + ReadVariablesOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("DestroyResourceOp").Device(DEVICE).HostMemory("resource"), \ + DestroyResourceOp); \ + REGISTER_KERNEL_BUILDER(Name("Shape") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("Shape") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("ShapeN") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeNOp); \ + REGISTER_KERNEL_BUILDER(Name("ShapeN") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeNOp); \ + REGISTER_KERNEL_BUILDER(Name("VariableShape") \ + .Device(DEVICE) \ + .TypeConstraint("out_type") \ + .HostMemory("output") \ + .HostMemory("input"), \ + VariableShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("VariableShape") \ + .Device(DEVICE) \ + .TypeConstraint("out_type") \ + .HostMemory("output") \ + .HostMemory("input"), \ + VariableShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Rank").Device(DEVICE).HostMemory("output").TypeConstraint("T", \ + TYPES), \ + RankOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignVariableOp").Device(DEVICE).HostMemory("resource"), \ + XlaAssignVariableOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp); \ + \ + 
REGISTER_KERNEL_BUILDER( \ + Name(kArgOp).Device(DEVICE).TypeConstraint("T", TYPES), ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(kArgOp).Device(DEVICE).TypeConstraint("T"), ArgOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name(kRetOp).Device(DEVICE).TypeConstraint("T", TYPES), RetvalOp); \ + REGISTER_KERNEL_BUILDER(Name(kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(kDeviceRetOp).Device(DEVICE).TypeConstraint("T"), RetvalOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("RemoteCall").Device(DEVICE).HostMemory("target"), RemoteCallOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("GeneratorDataset").Device(DEVICE).HostMemory("handle"), \ + data::GeneratorDatasetOp); \ + REGISTER_KERNEL_BUILDER(Name("PrefetchDataset") \ + .Device(DEVICE) \ + .HostMemory("buffer_size") \ + .HostMemory("input_dataset") \ + .HostMemory("handle"), \ + data::PrefetchDatasetOp); \ + REGISTER_KERNEL_BUILDER(Name("OptionsDataset") \ + .Device(DEVICE) \ + .HostMemory("input_dataset") \ + .HostMemory("handle"), \ + data::OptionsDatasetOp); \ + REGISTER_KERNEL_BUILDER(Name("FinalizeDataset") \ + .Device(DEVICE) \ + .HostMemory("input_dataset") \ + .HostMemory("handle"), \ + data::FinalizeDatasetOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE), \ + data::IteratorHandleOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("MakeIterator").Device(DEVICE).HostMemory("dataset"), \ + data::MakeIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV2").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV3").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE), \ + data::DeleteIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ + data::IteratorGetNextOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE), \ + data::IteratorGetNextAsOptionalOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE), \ + data::IteratorGetNextOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + data::IteratorToStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + data::IteratorFromStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE), \ + data::OptionalNoneOp); \ + REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE), \ + data::OptionalFromValueOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("OptionalHasValue").Device(DEVICE).HostMemory("has_value"), \ + data::OptionalHasValueOp); \ + REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE), \ + data::OptionalGetValueOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); + +// TODO(b/118881356): currently we do not register the QueueEnqueueMany, +// QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read +// and write 
the tensors they access in order to concatenate them into a batch. +// We would need either to call out to an XLA computation to perform the +// concatenation, or we would need to refactor those kernels so the splitting +// or merging is done in a separate operator that can be compiled. + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_recv_device_context.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_recv_device_context.h new file mode 100644 index 00000000..d6dfc6f1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_recv_device_context.h @@ -0,0 +1,93 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ + +#include "xla/shape.h" +#include "xla/stream_executor/device_memory.h" +#include "xla/stream_executor/stream.h" +#include "tensorflow/core/framework/device_base.h" +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime + +namespace tensorflow { + +// XlaHostRecvDeviceContext is a DeviceContext that is intended to be +// used to transfer from device->host using Rendezvous. It transfers the +// content of `device_memory_base` with `shape` using `stream`. Only +// `CopyDeviceTensorToCPU` method is implemented. The `done_event` is marked as +// Concrete once transfer is completed. +// +// Example usage: +// +// Device device; +// stream_executor::Stream stream(executor); +// Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2})); +// se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; +// xla::Shape shape(xla::F32, {2, 2}, {}, {}) +// tsl::AsyncValueRef> done_event = +// tsl::MakeConstructedAsyncValueRef>(stream.parent()); +// done_event->Init(); +// Tensor dest_cpu_tensor; +// +// XlaHostRecvDeviceContext device_context(&stream, gpu_dst, +// shape, done_event); +// device_context.CopyDeviceTensorToCPUSync( +// &device_tensor, "", &device, &dest_cpu_tensor); + +class XlaHostRecvDeviceContext : public DeviceContext { + public: + XlaHostRecvDeviceContext( + se::Stream* stream, const se::DeviceMemoryBase& device_memory_base, + const xla::Shape& shape, + tsl::AsyncValueRef>& done_event) + : stream_(stream), + device_memory_base_(device_memory_base), + shape_(shape), + done_event_(done_event) {} + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override { + done(errors::Internal("host->device copy not implemented.")); + } + + // Copies `device_memory_base_` with `shape_` into `cpu_tensor`. + // `device_tensor` is unused. 
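The usage example in the class comment above drives the blocking `CopyDeviceTensorToCPUSync` wrapper; a hedged sketch of the asynchronous overload declared next looks roughly like this, reusing the objects set up in that comment.

```cpp
// Hedged sketch: asynchronous device-to-host receive with an explicit callback.
absl::Notification done_copy;
device_context.CopyDeviceTensorToCPU(
    &device_tensor, /*tensor_name=*/"", &device, &dest_cpu_tensor,
    [&done_copy](const absl::Status& s) {
      if (!s.ok()) LOG(ERROR) << "device->host copy failed: " << s;
      done_copy.Notify();
    });
done_copy.WaitForNotification();
```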
+ void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override { + done(errors::Internal("device->device copy not implemented.")); + } + + private: + se::Stream* stream_; // Not owned. + // This is copied rather than a reference or pointer since its lifetime + // is not guaranteed to outlast the original object. Object slicing is + // not an issue here since only DeviceMemoryBase methods/members are used. + const se::DeviceMemoryBase device_memory_base_; + const xla::Shape shape_; + tsl::AsyncValueRef> done_event_; + + XlaHostRecvDeviceContext(const XlaHostRecvDeviceContext&) = delete; + void operator=(const XlaHostRecvDeviceContext&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_send_device_context.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_send_device_context.h new file mode 100644 index 00000000..52ca6125 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_host_send_device_context.h @@ -0,0 +1,90 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ + +#include "xla/shape.h" +#include "xla/stream_executor/device_memory.h" +#include "xla/stream_executor/stream.h" +#include "tensorflow/core/framework/device_base.h" +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime + +namespace tensorflow { + +// XlaHostSendDeviceContext is a DeviceContext that is intended to be +// used to transfer from host->device using Rendezvous. It transfers the +// content of `device_memory_base` with `shape` using `stream`. Only +// `CopyCPUTensorToDevice` method is implemented. The `done_event` is marked as +// Concrete once transfer is completed. 
+// +// Example usage: +// +// Device device; +// stream_executor::Stream stream(executor); +// Tensor cpu_tensor(host_allocator, DT_FLOAT, TensorShape({2, 2})); +// Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2})); +// se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; +// xla::Shape shape(xla::F32, {2, 2}, {}, {}) +// tsl::AsyncValueRef> done_event = +// tsl::MakeConstructedAsyncValueRef>(stream.parent()); +// done_event->Init(); +// +// XlaHostSendDeviceContext device_context(&stream, &gpu_dst, +// shape, done_event); +// device_context.CopyCPUTensorToDeviceSync( +// &cpu_tensor, &device, &device_tensor); + +class XlaHostSendDeviceContext : public DeviceContext { + public: + XlaHostSendDeviceContext( + se::Stream* stream, se::DeviceMemoryBase* device_memory_base, + const xla::Shape& shape, + tsl::AsyncValueRef>& done_event) + : stream_(stream), + device_memory_base_(device_memory_base), + shape_(shape), + done_event_(done_event) {} + + // Copies 'cpu_tensor' to `device_memory_base_` with `shape_`. + // `device_tensor` is unused. + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override { + done(errors::Internal("host->device copy not implemented.")); + } + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override { + done(errors::Internal("device->device copy not implemented.")); + } + + private: + se::Stream* stream_; // Not owned. + se::DeviceMemoryBase* device_memory_base_; // Not owned. + const xla::Shape shape_; + tsl::AsyncValueRef> done_event_; + + XlaHostSendDeviceContext(const XlaHostSendDeviceContext&) = delete; + void operator=(const XlaHostSendDeviceContext&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_kernel_creator.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_kernel_creator.h new file mode 100644 index 00000000..67c843bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_kernel_creator.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ + +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_properties.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class FunctionLibraryRuntime; +class OpKernel; + +class XlaKernelCreator : public CustomKernelCreator { + public: + // Given a NodeDef 'node_def' and the function library runtime 'flr', returns + // true if 'node_def' is a call to a compilable function defined in 'flr', + // with the kXlaCompileAttr set. + bool CanCreateKernel( + const FunctionLibraryRuntime& flr, + const std::shared_ptr& props) const override; + + // Given a supported NodeDef, returns a XlaLaunchOp that computes the node. + absl::Status CreateKernel(FunctionLibraryRuntime* flr, + const std::shared_ptr& props, + std::unique_ptr* kernel) const override; +}; + +bool RegisterLaunchOpCreator(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_launch_util.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_launch_util.h new file mode 100644 index 00000000..5e5128d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_launch_util.h @@ -0,0 +1,267 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains utilities for launching compiled XLA kernels for a KernelContext. + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/pjrt_client.h" +#include "xla/service/shaped_buffer.h" +#include "xla/stream_executor/device_memory_allocator.h" +#include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// Creates a list of updated resource variables. +absl::StatusOr> GatherVariableInfo( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult& compilation_result, + int missing_ctx_input_prefix); + +// Returns pointers to inputs stored in `ctx`. 
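To show how the helpers declared in this header compose, a hedged sketch of a minimal PjRt launch path follows; the `compilation_result`, `pjrt_client`, and `executable` arguments are assumed to come from an earlier compilation step, and resource variables are left out for brevity.

```cpp
// Hedged sketch: gather kernel inputs and run a compiled PjRt executable.
#include <vector>

#include "tensorflow/compiler/jit/xla_launch_util.h"

namespace tensorflow {

absl::Status LaunchExample(OpKernelContext* ctx,
                           const XlaCompiler::CompilationResult& compilation_result,
                           xla::PjRtClient* pjrt_client,
                           xla::PjRtLoadedExecutable* executable) {
  // Raw pointers to every input tensor currently held by the kernel context.
  std::vector<const Tensor*> inputs = InputsFromContext(ctx);
  // No resource variables in this simplified sketch.
  std::vector<VariableInfo> variables;
  // Executes and writes the outputs (and any variable updates) back into ctx.
  return RunPjRtExecutable(inputs, variables, compilation_result, pjrt_client,
                           executable, ctx);
}

}  // namespace tensorflow
```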
+std::vector InputsFromContext(OpKernelContext* ctx); + +absl::StatusOr> GetConstantInputIndicesFromContext( + OpKernelContext* ctx); + +absl::Status SetOutputForConstant( + OpKernelContext* ctx, bool requires_copy_to_device, + const XlaCompiler::CompilationResult* compilation_result, int output_num); + +// Converts input tensors and variables which are parameters of the +// XlaComputation into PjRtBuffers to be fed as input to the +// PjRtLoadedExecutable. +// +// Assumes that the first `num_missing_prefix_ctx_inputs` inputs to the +// compilation_result are missing in `inputs` and adjusts indexing into `inputs` +// accordingly. +// `input_mapping` is a vector that maps from the parameters of the +// XlaComputation to their original argument positions. This can be sourced from +// `XlaCompiler::CompilationResult::input_mapping`. +// `variable_snapshots` is a map of {index of the input to the +// compilation_result -> underlying Tensor the variable is/was pointing to (i.e. +// the value of the variable at the time of lowering/compilation)}. +// +// The obtained PjRtBuffers are populated to `args` vector. +// `non_donatable_input_indices` will also be set, which contains the indices of +// the input that should not be donated to output. +// +// There can be three types of input: 1. Tensor with PjRtTensorBuffer; 2. +// Tensor with AsyncValueTensor; 3. Tensor with raw device mem pointer. +// For case 3, we need to create a PjRtBuffer from the raw device mem pointer, +// and we need to ensure the PjRtBuffer persists till XLA computation is +// complete. Therefore we put the newly created PjRtBuffer into `owned_args`. +// Caller is responsible to ensure `owned_args` lives till the end of XLA +// computation. +absl::Status PreparePjRtExecutableArguments( + int num_missing_prefix_ctx_inputs, const std::vector& input_mapping, + const std::vector& inputs, + const absl::flat_hash_map& variable_snapshots, + xla::PjRtClient* pjrt_client, xla::PjRtDevice* pjrt_device, + bool use_pjrt_tensor_buffer, std::vector* args, + std::vector>* owned_args, + absl::flat_hash_set* non_donatable_input_indices); + +// Populates the OpKernelContext outputs with the outputs of the +// PjRtLoadedExecutable. Requires the `compilation_result` used to build the +// PjRtLoadedExecutable. +// This function only looks at variables that were updated, so `variables` can +// either be all the variables or only the ones that were updated. +// Assumes that the first `num_missing_prefix_ctx_inputs` inputs to the +// compilation_result are missing in `inputs` and adjusts indexing into `inputs` +// accordingly. +absl::Status PopulateCtxOutputsFromPjRtExecutableOutputs( + int num_missing_prefix_ctx_inputs, const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + bool use_pjrt_tensor_buffer, + std::vector>& executable_outputs, + OpKernelContext* ctx); + +// Returns the options used for executing a PjRtLoadedExecutable. +xla::ExecuteOptions GetPjRtExecuteOptions( + const DeviceType& device_type, + absl::flat_hash_set non_donatable_input_indices); + +// Returns the device ordinal from the parsed name of the device. +int GetDeviceOrdinal(const DeviceBase* device); + +// Returns the device type from the OpKernelContext. +DeviceType GetDeviceType(OpKernelContext* ctx); + +// Runs `executable` and populates the outputs in `ctx`. `inputs` and +// `variables` are the input arguments to the computation, usually read from the +// OpKernelContext, `ctx`. 
Requires the device-appropriate `pjrt_client` and the +// `compilation_result` used to build the `executable`. +absl::Status RunPjRtExecutable( + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + xla::PjRtClient* pjrt_client, xla::PjRtLoadedExecutable* executable, + OpKernelContext* ctx); + +// Same as the above function but takes in `updated_variables` and +// `variable_snapshots` which is a map of {index of the input to the +// compilation_result -> underlying Tensor the variable is/was pointing to +// (i.e. the value of the variable at the time of lowering/compilation)}. +// Assumes that the first `num_missing_prefix_ctx_inputs` inputs to the +// compilation_result are missing in `inputs` and adjusts indexing into `inputs` +// accordingly. +absl::Status RunPjRtExecutable( + int num_missing_prefix_ctx_inputs, const std::vector& inputs, + const absl::flat_hash_map& variable_snapshots, + const std::vector& updated_variables, + const XlaCompiler::CompilationResult& compilation_result, + xla::PjRtClient* pjrt_client, xla::PjRtLoadedExecutable* executable, + OpKernelContext* ctx); + +// Similar to the above function but it does not take an OpKernelContext, and +// it returns the output in PjRtBuffers, instead of populating results into +// OpKernelContext. +absl::StatusOr>> RunPjRtExecutable( + int num_missing_prefix_ctx_inputs, const std::vector& inputs, + const absl::flat_hash_map& variable_snapshots, + const std::vector& updated_variables, + const DeviceType& device_type, bool use_pjrt_tensor_buffer, + const XlaCompiler::CompilationResult& compilation_result, + xla::PjRtDevice* device, xla::PjRtClient* pjrt_client, + xla::PjRtLoadedExecutable* executable); + +// Helper class to perform the marshalling of TensorFlow inputs and outputs to +// ShapedBuffers suitable for passing to an XLA computation. +class XlaComputationLaunchContext { + public: + // Create a new launch context. 'allocate_xla_tensors' is true if allocated + // output tensors and variables are always XlaTensors. If false they are + // assumed to be "normal" device pointers. + // If 'use_multiple_streams' is true, tensors may be defined and used on + // multiple streams and so se::Events must be defined and waited for. If + // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true + // because we track inter-stream dependencies through events inside XlaTensor + // objects. + XlaComputationLaunchContext(xla::LocalClient* client, + se::DeviceMemoryAllocator* xla_allocator, + int device_ordinal, bool allocate_xla_tensors, + bool use_multiple_streams); + + // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch + // op. + // Precondition: variables in `variable_args` are locked. + static absl::StatusOr> + BuildXlaCompilerArguments(absl::Span must_be_constant_idxs, + absl::Span inputs, + absl::Span variable_args, + Device* device); + + // Add all inputs within `ctx` as XLA arguments (returned by arguments()). + // `variables` is a map from TensorFlow argument number to resource variable. + // + // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are + // missing and adjusts input indices accordingly. All elements in kernel's + // input_mapping must be greater than or equal to `missing_ctx_input_prefix` + // (in other words, no inputs actually required by the kernel can be missing). 
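A hedged end-to-end sketch of this class, not part of the patch: the client, allocator, compilation result, alias config, and the `xla::ScopedShapedBuffer` produced by the executable are all assumed to be supplied by the surrounding launch machinery, and the actual `xla::LocalExecutable::Run` call is elided.

```cpp
// Hedged sketch: marshal kernel inputs into XLA form and write outputs back.
#include <map>
#include <utility>
#include <vector>

#include "tensorflow/compiler/jit/xla_launch_util.h"
#include "tensorflow/core/platform/statusor.h"  // TF_ASSIGN_OR_RETURN

namespace tensorflow {

absl::Status LegacyLaunchExample(
    OpKernelContext* ctx, xla::LocalClient* client,
    se::DeviceMemoryAllocator* allocator,
    const XlaCompiler::CompilationResult* compilation_result,
    const xla::HloInputOutputAliasConfig& input_output_alias,
    xla::ScopedShapedBuffer execution_output) {
  XlaComputationLaunchContext launch_context(
      client, allocator, /*device_ordinal=*/0,
      /*allocate_xla_tensors=*/true, /*use_multiple_streams=*/false);

  std::map<int, const Tensor*> resource_vars;  // None in this sketch.
  TF_ASSIGN_OR_RETURN(auto execution_inputs,
                      launch_context.PopulateInputs(
                          ctx, compilation_result, resource_vars,
                          /*missing_ctx_input_prefix=*/0, input_output_alias));
  (void)execution_inputs;  // Would be handed to xla::LocalExecutable::Run here.

  // Write the executable's result buffers (and variable updates) back into ctx.
  return launch_context.PopulateOutputs(
      ctx, compilation_result, std::move(execution_output),
      /*missing_ctx_input_prefix=*/0, /*variable_infos=*/{}, input_output_alias,
      resource_vars);
}

}  // namespace tensorflow
```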
+ absl::StatusOr> PopulateInputs( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, + const std::map& resource_vars, + int missing_ctx_input_prefix, + const xla::HloInputOutputAliasConfig& input_output_alias); + + // Given the XLA output in `output`, populate all outputs of `ctx`. Also + // writes out the resource variable updates. + // + // Updates to all resource variables are written in a single atomic operation. + // This models *->Write dependencies between resource variable operations. + // See jit/resource_operation_safety_analysis for details. + // + // + // Assumes that the first `missing_ctx_input_prefix` inputs to the + // compilation_result are missing and adjusts input indices accordingly. + absl::Status PopulateOutputs( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, + xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, + absl::Span variable_infos, + const xla::HloInputOutputAliasConfig& input_output_alias, + const std::map& resource_vars); + + private: + xla::LocalClient* client_; + se::DeviceMemoryAllocator* xla_allocator_; + bool allocate_xla_tensors_; + bool use_multiple_streams_; + int device_ordinal_; +}; + +// A simple TensorBuffer implementation that allows us to create Tensors that +// take ownership of pre-allocated memory. +class XlaTensorBuffer : public TensorBuffer { + public: + XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size, + Allocator* allocator) + : TensorBuffer(const_cast(ptr)), + expected_size_(expected_size), + actual_size_(actual_size), + allocator_(allocator) {} + + ~XlaTensorBuffer() override { + if (data()) { + allocator_->DeallocateRaw(data()); + } + } + + size_t size() const override { return expected_size_; } + + TensorBuffer* root_buffer() override { return this; } + + void FillAllocationDescription(AllocationDescription* proto) const override { + proto->set_requested_bytes(static_cast(expected_size_)); + proto->set_allocator_name(allocator_->Name()); + proto->set_ptr(reinterpret_cast(data())); + if (allocator_->TracksAllocationSizes()) { + auto ab = static_cast(allocator_->AllocatedSize(data())); + proto->set_allocated_bytes(ab); + int64_t id = allocator_->AllocationId(data()); + if (id > 0) { + proto->set_allocation_id(id); + } + if (RefCountIsOne()) { + proto->set_has_single_reference(true); + } + } + } + + private: + size_t expected_size_; + size_t actual_size_; + Allocator* allocator_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_platform_info.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_platform_info.h new file mode 100644 index 00000000..7c5099f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_platform_info.h @@ -0,0 +1,172 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ + +#include +#include +#include + +#include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/pjrt_base_device.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +// Holds some information about the platform on which an +// XlaLaunch/_XlaCompile/_XlaRun op must run on. Provides a common layer of +// abstraction for normal, XLA devices and devices inheriting from +// PjRtBaseDevice. +class XlaPlatformInfo { + public: + XlaPlatformInfo() : device_type_("") {} + XlaPlatformInfo(XlaPlatformInfo&&) = default; + explicit XlaPlatformInfo( + const DeviceType device_type, se::Platform::Id platform_id, + const XlaDevice::Metadata* xla_device_metadata, + const PjRtBaseDevice::Metadata* pjrt_device_metadata, + std::shared_ptr device_allocator) + : device_type_(device_type), + platform_id_(platform_id), + xla_device_metadata_(xla_device_metadata), + pjrt_device_metadata_(pjrt_device_metadata), + device_allocator_(device_allocator) {} + + XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; + + bool UseMultipleStreams() const { + return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); + } + + // Non-null only when run on an XLA device. + std::shared_ptr custom_allocator() const { + return device_allocator_; + } + + DeviceType device_type() const { return device_type_; } + + // This is equal to xla_device_metadata()->platform()->id() if + // xla_device_metadata() is not nullptr. + se::Platform::Id platform_id() const { return platform_id_; } + + // This may be null if the op this XlaPlatformInfo is for was not placed on an + // XLA device. + const XlaDevice::Metadata* xla_device_metadata() const { + return xla_device_metadata_; + } + bool is_on_xla_device() const { return xla_device_metadata() != nullptr; } + + const PjRtBaseDevice::Metadata* pjrt_device_metadata() const { + return pjrt_device_metadata_; + } + + private: + DeviceType device_type_; + se::Platform::Id platform_id_; + + // xla_device_metadata_ lives in the tensorflow::DeviceBase in which the + // XlaLaunch/_XlaCompile/_XlaRun op is placed and thus does not die before the + // XlaLaunch/_XlaCompile/_XlaRun OpKernel. + const XlaDevice::Metadata* xla_device_metadata_; + + // pjrt_device_metadata_ lives in tensorflow::PjRtBaseDevice in which the + // XlaLaunch/XlaCompileOnDemand op is placed and thus does not die before the + // op kernel. + const PjRtBaseDevice::Metadata* pjrt_device_metadata_; + + // If the op associated with this XlaPlatformInfo is placed on an XLA device + // then device_allocator_ is the xla::Backend's memory allocator. If the op + // is placed on a regular CPU or GPU device then device_allocator_ is null. + // The allocator is of unknown provenance; keep it in a shared pointer to + // set an artificial refcount of one. + std::shared_ptr device_allocator_; + + XlaPlatformInfo(const XlaPlatformInfo&) = delete; + void operator=(const XlaPlatformInfo&) = delete; +}; + +// Returns a set containing the device ids contained in visible_device_list or +// nullopt if it is empty. It returns error in case of malformed configuration +// string. 
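A small hedged sketch of the parser declared next; the `"0,2"` string is an arbitrary example of a `visible_device_list` value, and the result type is assumed to be an optional set of device ordinals, matching the comment above.

```cpp
// Hedged sketch: restrict resource allocation to an explicit set of ordinals.
#include <optional>
#include <set>

#include "tensorflow/compiler/jit/xla_platform_info.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

void VisibleDeviceListExample() {
  absl::StatusOr<std::optional<std::set<int>>> ids =
      ParseVisibleDeviceList("0,2");
  if (!ids.ok()) {
    LOG(ERROR) << "malformed visible_device_list: " << ids.status();
    return;
  }
  if (!ids->has_value()) return;  // An empty list means "no restriction".
  for (int ordinal : **ids) {
    VLOG(1) << "allocating resources for ordinal " << ordinal;
  }
}

}  // namespace tensorflow
```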
+absl::StatusOr>> ParseVisibleDeviceList( + absl::string_view visible_device_list); + +// Returns the device type for building a DeviceCompiler from the given platform +// type. +absl::StatusOr GetCompilationDeviceType( + const DeviceType& platform_device_type); + +// Builds a DeviceCompiler that uses xla::LocalClient using `platform_info` and +// `compilation_device_type` (in non-TPU case) and sets *xla_device_compiler to +// point to it. Uses flags from `MarkForCompilationPassFlags` for configuring +// the persistor used in the DeviceCompiler. The platform ID from +// `platform_info` must not be null in CPU case. +absl::Status BuildXlaDeviceCompiler( + DeviceBase* dev, FunctionLibraryRuntime* flr, + const XlaPlatformInfo& platform_info, DeviceType compilation_device_type, + DeviceCompiler** + xla_device_compiler); + +// Fetches a DeviceCompiler from the tfrt_global resource manager (or creates +// one there if not found) that uses xla::PjRtClient using an appropriate +// PjRtClient for `platform_info.device_type()` and sets *pjrt_device_compiler +// to point to it. Also fetches/creates a DeviceCompilationProfiler from/in the +// tfrt_global resource manager for `platform_info.device_type()` and sets +// *profiler to point to it. Uses flags from `MarkForCompilationPassFlags` for +// configuring the persistor used in the DeviceCompiler. Please note that +// non-XLA devices aren't supported yet. This is because: +// 1. PjRtClient doesn't support data transfer for non-XLA devices yet +// 2. Fetching the PjRtClient for non-XLA devices is also not supported yet +absl::Status GetOrCreatePjRtDeviceCompilerAndProfiler( + const OpKernelContext& ctx, const XlaPlatformInfo& platform_info, + FunctionLibraryRuntime* flr, + DeviceCompiler** + pjrt_device_compiler, + DeviceCompilationProfiler** profiler); + +// Same as the above function but takes the resource manager `rm` instead of an +// OpKernelContext. +absl::Status GetOrCreatePjRtDeviceCompilerAndProfiler( + const XlaPlatformInfo& platform_info, ResourceMgr* rm, + FunctionLibraryRuntime* flr, + DeviceCompiler** + pjrt_device_compiler, + DeviceCompilationProfiler** profiler); + +// Returns information about the platform from kernel context. +XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device); + +// Obtains persistent cache directory for executables that target a given device +// based off xla flags. If you shouldn't use persistent caching, returns "". +std::string GetPersistentCacheDirectory( + const DeviceType& compilation_device_type); + +// Returns allocator from platform info if non-null, or populate and return a +// pointer to the allocator adapter with allocator from context. +// +// This is necessary because for XLA devices the underlying TF allocator returns +// dummy tensors. +// +// `stream` parameter is nullable when running on host. +std::shared_ptr GetAllocator( + DeviceBase* device, se::Stream* stream, + const XlaPlatformInfo& platform_info); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tensor.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tensor.h new file mode 100644 index 00000000..91e06ddf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tensor.h @@ -0,0 +1,116 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
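Illustrative sketch (not part of the vendored patch): the free functions above are normally used together from inside an XlaLaunch-style kernel, deriving the XlaPlatformInfo from the device, mapping the platform device type to a compilation device type, then building the XLA DeviceCompiler. The sketch assumes the template arguments stripped from this header are DeviceCompiler<xla::LocalExecutable, xla::LocalClient>, and that the TF_ASSIGN_OR_RETURN / TF_RETURN_IF_ERROR status macros are available; BuildCompilerForKernel is a hypothetical helper name.

// Sketch: building a DeviceCompiler for the device a kernel runs on.
absl::Status BuildCompilerForKernel(tensorflow::OpKernelContext* ctx) {
  const tensorflow::XlaPlatformInfo platform_info =
      tensorflow::XlaPlatformInfoFromDevice(ctx->device());

  TF_ASSIGN_OR_RETURN(
      tensorflow::DeviceType compilation_device_type,
      tensorflow::GetCompilationDeviceType(platform_info.device_type()));

  tensorflow::DeviceCompiler<xla::LocalExecutable, xla::LocalClient>*
      xla_device_compiler = nullptr;  // assumed template arguments
  TF_RETURN_IF_ERROR(tensorflow::BuildXlaDeviceCompiler(
      ctx->device(), ctx->function_library(), platform_info,
      compilation_device_type, &xla_device_compiler));

  // ... compile and run, then release the resource reference as appropriate.
  return absl::OkStatus();
}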
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ + +#include + +#include "absl/memory/memory.h" +#include "xla/client/local_client.h" +#include "xla/service/shaped_buffer.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// The implementation of a Tensor for an XlaDevice. All device tensors are +// actually one of these. +// +// To distinguish between "normal" device tensors and XlaTensors, the raw +// pointer data stored in the TensorBuffer is a tagged pointer. +class XlaTensor { + public: + // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast + // fails. + static XlaTensor* FromTensor(const Tensor* tensor); + + // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in + // which case the returned value is shaped_buffer()->root_buffer(), or a + // normal Tensor in which case the returned value is + // {tensor.tensor_data().data(), tensor.tensor_data().size}. + static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor); + + // Assign the internal ShapedBuffer to new memory for the given dtype and + // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it + // is replaced and the managed memory deallocated. + absl::Status AllocateShapedBuffer(DataType dtype, + const xla::Shape& on_device_shape, + xla::LocalClient* client, + int device_ordinal); + + // Some Tensors can have complex on-device shapes, including tuple shapes. To + // manage the memory for these tensors a ShapedBuffer may be required. + + // Return true if this XlaTensor contains a ShapedBuffer. + bool has_shaped_buffer() const { return shaped_buffer_.has_value(); } + // Return the contained ShapedBuffer. + // REQUIRES: has_shaped_buffer() + const xla::ShapedBuffer& shaped_buffer() const { + CHECK(has_shaped_buffer()); + return *shaped_buffer_; + } + xla::ShapedBuffer& shaped_buffer() { + CHECK(has_shaped_buffer()); + return *shaped_buffer_; + } + // Mutates the XlaTensor to set the ShapedBuffer. + void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { + shaped_buffer_ = std::move(shaped_buffer); + } + + // Adds synchronization events to 'stream' that wait for this tensor to be + // defined on 'stream'. Does nothing if the tensor is already defined on that + // stream. + void WaitForDefinitionEventOnStream(se::Stream* stream); + + // (Re)sets the definition event of the tensor to 'event', and promises that + // the tensor has already been defined on stream. Removes any previous + // definition event or any previous promises about the tensor being defined on + // streams. + // It is legal to reset the definition event of a tensor when overwriting the + // tensor's value (at which point, it is effectively a new tensor once again.) + void ResetDefinitionEvent(std::shared_ptr event, + se::Stream* stream); + + // Refresh the status of streams_defined_on_. 
Return the first not-OK stream's + // status or OK. + absl::Status RefreshStatusOfStreams(); + + // Convert from a raw pointer to an XlaTensor, removing the pointer tag. + static XlaTensor* FromOpaquePointer(void* ptr); + // Convert to a raw pointer from an XlaTensor, adding the pointer tag. + static void* ToOpaquePointer(XlaTensor* tensor); + + private: + // The optional contained ShapedBuffer. + std::optional shaped_buffer_; + // An optional host tensor value. + std::optional host_tensor_; + // An optional event that is triggered when the tensor's content has been + // defined. If this event is nullptr, it is assumed that the tensor's content + // is always defined. + std::shared_ptr definition_event_; + // A list of all streams for which the tensor's content is defined for any + // newly enqueued command. + absl::InlinedVector streams_defined_on_ TF_GUARDED_BY(mu_); + mutex mu_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tpu_device.h b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tpu_device.h new file mode 100644 index 00000000..bb31c65b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/jit/xla_tpu_device.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_TPU_DEVICE_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_TPU_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +void RegisterTpuDeviceToDeviceCopy(); + +void RegisterTpuNodeDevice( + bool tpu_autoclustering, bool tpu_xla_device_failure_closes_chips, + bool tpu_use_substreams_for_cross_tpu_device_transfers); + +void RegisterTpuSystemDevice(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_TPU_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/init_mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/init_mlir.h new file mode 100644 index 00000000..290ef361 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/init_mlir.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
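Illustrative sketch (not part of the vendored patch): a common use of the XlaTensor API above is deciding whether a device-resident Tensor is backed by a ShapedBuffer (possibly with a tuple shape) or by plain device memory. The sketch uses only the accessors declared in xla_tensor.h; the helper name RootDeviceBuffer is hypothetical.

// Sketch: obtain the root device buffer backing a device-resident Tensor.
se::DeviceMemoryBase RootDeviceBuffer(const tensorflow::Tensor& tensor) {
  tensorflow::XlaTensor* xla_tensor =
      tensorflow::XlaTensor::FromTensor(&tensor);
  if (xla_tensor != nullptr && xla_tensor->has_shaped_buffer()) {
    // XLA-managed tensor: the memory lives in a (possibly tuple) ShapedBuffer.
    return xla_tensor->shaped_buffer().root_buffer();
  }
  // Plain device tensor: the raw tensor data is the device memory.
  return tensorflow::XlaTensor::DeviceMemoryFromTensor(tensor);
}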
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ + +namespace tensorflow { + +// Initializer to perform TF's InitMain initialization. +// InitMain also performs flag parsing and '--' is used to separate flags passed +// to it: Flags before the first '--' are parsed by InitMain and argc and argv +// progressed to the flags post. If there is no separator, then no flags are +// parsed by InitMain and argc/argv left unadjusted. +class InitMlir { + public: + InitMlir(int *argc, char ***argv); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_INIT_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/allocation.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/allocation.h new file mode 100644 index 00000000..a82d8c04 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/allocation.h @@ -0,0 +1,158 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// \file +/// +/// Memory management for TF Lite. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_ALLOCATION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_ALLOCATION_H_ + +#include + +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" + +namespace tflite { + +/// A memory allocation handle. This could be a mmap or shared memory. +class Allocation { + public: + using Ptr = std::unique_ptr; + + virtual ~Allocation() {} + + enum class Type { + kMMap, + kFileCopy, + kMemory, + }; + + /// Base pointer of this allocation + virtual const void* base() const = 0; + /// Size in bytes of the allocation + virtual size_t bytes() const = 0; + /// Whether the allocation is valid + virtual bool valid() const = 0; + /// Return the type of the Allocation. + Type type() const { return type_; } + + protected: + Allocation(ErrorReporter* error_reporter, Type type) + : error_reporter_(error_reporter), type_(type) {} + ErrorReporter* error_reporter_; + + private: + const Type type_; +}; + +/// Note that not all platforms support MMAP-based allocation. +/// Use `IsSupported()` to check. +class MMAPAllocation : public Allocation { + public: + /// Loads and maps the provided file to a memory region. + MMAPAllocation(const char* filename, ErrorReporter* error_reporter); + + /// Maps the provided file descriptor to a memory region. + /// Note: The provided file descriptor will be dup'ed for usage; the caller + /// retains ownership of the provided descriptor and should close accordingly. + MMAPAllocation(int fd, ErrorReporter* error_reporter); + + /// Maps the provided file descriptor, with the given offset and length (both + /// in bytes), to a memory region. + /// Note: The provided file descriptor will be dup'ed for usage; the caller + /// retains ownership of the provided descriptor and should close accordingly. 
+ MMAPAllocation(int fd, size_t offset, size_t length, + ErrorReporter* error_reporter); + + ~MMAPAllocation() override; + const void* base() const override; + size_t bytes() const override; + bool valid() const override; + + int fd() const { return mmap_fd_; } + + // The start address of the mmapped buffer. + // This will be base() rounded down to the nearest page boundary. + const void* mmapped_buffer() const { return mmapped_buffer_; } + + // The size of the mmapped buffer. + size_t mmapped_buffer_size() const { return bytes() + offset_in_buffer_; } + + // Offset of mmapped_buffer() in the file referenced by the file descriptor. + size_t mmapped_buffer_offset_in_file() const { + return offset_of_buffer_in_file_; + } + + static bool IsSupported(); + + protected: + // Data required for mmap. + int mmap_fd_ = -1; // mmap file descriptor + const void* mmapped_buffer_; + size_t buffer_size_bytes_ = 0; + // Used when the address to mmap is not page-aligned. + size_t offset_in_buffer_ = 0; + size_t offset_of_buffer_in_file_ = 0; + + private: + // Assumes ownership of the provided `owned_fd` instance. + MMAPAllocation(ErrorReporter* error_reporter, int owned_fd); + + // Assumes ownership of the provided `owned_fd` instance, and uses the given + // offset and length (both in bytes) for memory mapping. + MMAPAllocation(ErrorReporter* error_reporter, int owned_fd, size_t offset, + size_t length); +}; + +class FileCopyAllocation : public Allocation { + public: + /// Loads the provided file into a heap memory region. + FileCopyAllocation(const char* filename, ErrorReporter* error_reporter); + ~FileCopyAllocation() override; + const void* base() const override; + size_t bytes() const override; + bool valid() const override; + + private: + std::unique_ptr copied_buffer_; + size_t buffer_size_bytes_ = 0; +}; + +class MemoryAllocation : public Allocation { + public: + /// Provides a (read-only) view of the provided buffer region as an + /// allocation. + /// Note: The caller retains ownership of `ptr`, and must ensure it remains + /// valid for the lifetime of the class instance. + MemoryAllocation(const void* ptr, size_t num_bytes, + ErrorReporter* error_reporter); + ~MemoryAllocation() override; + const void* base() const override; + size_t bytes() const override; + bool valid() const override; + + private: + const void* buffer_; +#if defined(__x86_64__) && defined(UNDEFINED_BEHAVIOR_SANITIZER) + void* aligned_ptr_ = nullptr; +#endif + size_t buffer_size_bytes_ = 0; +}; + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_ALLOCATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h new file mode 100644 index 00000000..db9715e9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -0,0 +1,148 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
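Illustrative sketch (not part of the vendored patch): the three Allocation subclasses above are alternative ways of getting model bytes into memory, and a loader typically prefers mmap when the platform supports it and falls back to a heap copy otherwise. The helper name LoadModelFile is hypothetical.

#include <memory>

// Sketch: choose the cheapest Allocation the platform supports.
std::unique_ptr<tflite::Allocation> LoadModelFile(
    const char* filename, tflite::ErrorReporter* reporter) {
  std::unique_ptr<tflite::Allocation> allocation;
  if (tflite::MMAPAllocation::IsSupported()) {
    allocation = std::make_unique<tflite::MMAPAllocation>(filename, reporter);
  } else {
    allocation =
        std::make_unique<tflite::FileCopyAllocation>(filename, reporter);
  }
  if (allocation == nullptr || !allocation->valid()) {
    reporter->Report("Failed to load model file %s", filename);
    return nullptr;
  }
  return allocation;
}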
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ + +#include +#include + +#include "absl/strings/str_join.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" + +namespace mlir { +namespace TFL { + +// A config that controls which passes get run as part TFLite converter. +struct PassConfig { + explicit PassConfig(quant::QuantizationSpecs specs) + : quant_specs(std::move(specs)) {} + + // If `emit_builtin_tflite_ops` is true, TF Lite legalization passes will be + // added, which produces TF Lite ops. + bool emit_builtin_tflite_ops = true; + // If `lower_tensor_list_ops` is true, tensorlist ops will be lowered to basic + // TF ops before legalization to TF Lite dialect. + bool lower_tensor_list_ops = false; + // The allowlist of functions that would be preserved after trimming. + llvm::ArrayRef trim_functions_allowlist; + // All information about quantization. + quant::QuantizationSpecs quant_specs; + // If `form_clusters` is true , clusters are formed by grouping consecutive + // ops of the same device, under a `tf_device.launch` op. + bool form_clusters = false; + // If `unfold_batch_matmul` is true, the tf.BatchMatMul is unfolded to a set + // of tfl.fully_connected ops. + bool unfold_batch_matmul = true; + // Whether to outline WhileOp at the end of the pipeline. + bool outline_tf_while = false; + // Whether to do shape inference. + bool shape_inference = true; + // Whether to do TFLite runtime verification. + bool runtime_verification = true; + // Whether to enable TFLite variables or not, this will allow + // mutable variables and produce ReadVariable/AssignVariable ops in TFLite. + bool enable_tflite_variables = false; + // Whether to unfold large splat constant tensors and replace them with + // fill operation. + bool unfold_large_splat_constant = false; + // Whether to run the `GuaranteeAllFuncsOneUsePass` to ensure each function + // has a single use. + bool guarantee_all_funcs_one_use = false; + // Whether to enable the hlo/stablehlo to tf conversion. This also supports + // the case where a saved model contains both TF module and serialized + // StableHLO module. + bool enable_hlo_to_tf_conversion = false; + // Whether to disable the direct hlo/stablehlo to Tensorflow Lite conversion. + // + // This prevents from directly converting from HLO to TFLite without going + // through TF for some of the ops. Some conversions are only supported through + // this path. + bool disable_hlo_to_tfl_conversion = false; + // Whether to enable to use DynamicUpdateSlice op. + bool enable_dynamic_update_slice = false; + // Whether to preserve AssertOp during legalization. + bool preserve_assert_op = false; + // Whether to enable TF->stablehlo passes. + bool enable_stablehlo_conversion = false; + // Whether to convert `tf.TensorList*` to `tfl.custom_op` if they can all + // be supported. + bool legalize_custom_tensor_list_ops = false; + // Whether to convert some tensor types to a lower precision if all values + // within that tensor are within the range of the lower precision. This could + // have side effects e.g. reduced flatbuffer size. Only certain type + // conversions are supported. 
+ bool reduce_type_precision = false; + // Whether to consider this model a quantized model with quantize/dequantize + // ops and to convert kernels to quantized kernels wherever appropriate. + quant::QDQConversionMode qdq_conversion_mode = + quant::QDQConversionMode::kQDQNone; + + // When set to true, StableHLO Quantizer is run. The full configuration for + // the quantizer is at `ConverterFlags::quantization_config`. + bool enable_stablehlo_quantizer = false; + + // Enables the attempt to directly lower composites into tflite ops. + bool enable_composite_direct_lowering = true; + + // Specifies the framework of the original model. + tflite::ConverterFlags::ModelOriginFramework model_origin_framework = + tflite::ConverterFlags::UNSET; + + // When set to true, convert +Inf/-Inf to MIN/MAX float value and output of + // convert only contains finite values. + bool canonicalizing_inf_as_min_max_float = true; +}; + +inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os, + const PassConfig& pass_config) { + return os << "emit_builtin_tflite_ops: " + << pass_config.emit_builtin_tflite_ops + << "\nlower_tensor_list_ops: " << pass_config.lower_tensor_list_ops + << "\ntrim_functions_allowlist: " + << absl::StrJoin(pass_config.trim_functions_allowlist.vec(), ",") + << "\nform_clusters: " << pass_config.form_clusters + << "\nunfold_batch_matmul: " << pass_config.unfold_batch_matmul + << "\noutline_tf_while: " << pass_config.outline_tf_while + << "\nshape_inference: " << pass_config.shape_inference + << "\nruntime_verification: " << pass_config.runtime_verification + << "\nenable_tflite_variables: " + << pass_config.enable_tflite_variables + << "\nunfold_large_splat_constant: " + << pass_config.unfold_large_splat_constant + << "\nguarantee_all_funcs_one_use: " + << pass_config.guarantee_all_funcs_one_use + << "\nenable_hlo_to_tf_conversion: " + << pass_config.enable_hlo_to_tf_conversion + << "\nenable_stablehlo_conversion: " + << pass_config.enable_stablehlo_conversion + << "\nlegalize_custom_tensor_list_ops: " + << pass_config.legalize_custom_tensor_list_ops + << "\nreduce_type_precision: " << pass_config.reduce_type_precision + << "\nconvert_qdq_format: " + << GetQDQQuantModeString(pass_config.qdq_conversion_mode) + << "\nmodel_origin_framework: " + << tflite::ConverterFlags::ModelOriginFramework_Name( + pass_config.model_origin_framework) + << "\n"; +} + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_COMMON_TFL_PASS_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/absl_error_model_builder.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/absl_error_model_builder.h new file mode 100644 index 00000000..c3d76e2b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/absl_error_model_builder.h @@ -0,0 +1,47 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
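Illustrative sketch (not part of the vendored patch): PassConfig is a plain struct, so a converter driver builds one from a QuantizationSpecs value and flips individual flags before handing it to the pass pipeline. The sketch assumes the specs type resolves to mlir::quant::QuantizationSpecs; ConfigureAndDump is a hypothetical helper name.

// Sketch: a typical converter configuration.
void ConfigureAndDump() {
  mlir::quant::QuantizationSpecs specs;      // assumed fully-qualified name
  mlir::TFL::PassConfig pass_config(specs);
  pass_config.lower_tensor_list_ops = true;  // lower tensorlist ops to TF ops
  pass_config.outline_tf_while = true;       // outline WhileOp at the end
  pass_config.unfold_batch_matmul = false;   // keep BatchMatMul as-is
  llvm::errs() << pass_config;               // uses the operator<< above
}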
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_ABSL_ERROR_MODEL_BUILDER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_ABSL_ERROR_MODEL_BUILDER_H_ + +#include + +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" +#include "tensorflow/compiler/mlir/lite/core/model_builder_base.h" + +namespace mlir::TFL { + +// An error reporter that uses absl logging. +class AbslErrorReporter : public tflite::ErrorReporter { + int Report(const char* format, va_list args) override; +}; + +tflite::ErrorReporter* GetAbslErrorReporter(); + +class FlatBufferModelAbslError + : public tflite::impl::FlatBufferModelBase { + public: + // Use stderr_reporter as the default error reporter. + static tflite::ErrorReporter* GetDefaultErrorReporter() { + return GetAbslErrorReporter(); + } + + // Inherit all constructors from FlatBufferModelBase since inherited factory + // methods refer to them. + using FlatBufferModelBase::FlatBufferModelBase; +}; + +} // namespace mlir::TFL + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_ABSL_ERROR_MODEL_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/error_reporter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/error_reporter.h new file mode 100644 index 00000000..79c9fc93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/error_reporter.h @@ -0,0 +1,72 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_ERROR_REPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_ERROR_REPORTER_H_ + +#include + +namespace tflite { + +/// A functor that reports error to supporting system. Invoked similar to +/// printf. +/// +/// Usage: +/// ErrorReporter foo; +/// foo.Report("test %d", 5); +/// or +/// va_list args; +/// foo.Report("test %d", args); // where args is va_list +/// +/// Subclass ErrorReporter to provide another reporting destination. +/// For example, if you have a GUI program, you might redirect to a buffer +/// that drives a GUI error log box. +class ErrorReporter { + public: + virtual ~ErrorReporter() = default; + /// Converts `args` to character equivalents according to `format` string, + /// constructs the error string and report it. + /// Returns number of characters written or zero on success, and negative + /// number on error. + virtual int Report(const char* format, va_list args) = 0; + + /// Converts arguments to character equivalents according to `format` string, + /// constructs the error string and report it. + /// Returns number of characters written or zero on success, and negative + /// number on error. + int Report(const char* format, ...); + + /// Equivalent to `Report` above. The additional `void*` parameter is unused. 
+ /// This method is for compatibility with macros that takes `TfLiteContext`, + /// like TF_LITE_ENSURE and related macros. + int ReportError(void*, const char* format, ...); +}; + +} // namespace tflite + +// You should not make bare calls to the error reporter, instead use the +// TF_LITE_REPORT_ERROR macro, since this allows message strings to be +// stripped when the binary size has to be optimized. If you are looking to +// reduce binary size, define TF_LITE_STRIP_ERROR_STRINGS when compiling and +// every call will be stubbed out, taking no memory. +#ifndef TF_LITE_STRIP_ERROR_STRINGS +#define TF_LITE_REPORT_ERROR(reporter, ...) \ + do { \ + static_cast<::tflite::ErrorReporter*>(reporter)->Report(__VA_ARGS__); \ + } while (false) +#else // TF_LITE_STRIP_ERROR_STRINGS +#define TF_LITE_REPORT_ERROR(reporter, ...) +#endif // TF_LITE_STRIP_ERROR_STRINGS + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_ERROR_REPORTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h new file mode 100644 index 00000000..ed452c90 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h @@ -0,0 +1,428 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/lite/core/c/tflite_types.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +// The namespace tflite_file is for the data structures that define the .tflite +// file format, and code that is tightly coupled with those data structures. +// The .tflite file format is the serialized flatbuffer representation of +// computations on tensors that TF Lite uses for distribution of compiled ML +// models. +namespace tflite_file { + +// This namespace contains functions that transform code and data structures +// that are defined in the flatbuffer serialization format into +// in-memory values that are used by the runtime API, interpreter and compiler. +namespace flatbuffer_conversions { + +using tflite::Operator; + +// Interface class for builtin data allocations. +class BuiltinDataAllocator { + public: + virtual void* Allocate(size_t size, size_t alignment_hint) = 0; + virtual void Deallocate(void* data) = 0; + + // Allocate a structure, but make sure it is a POD structure that doesn't + // require constructors to run. The reason we do this, is that Interpreter's C + // extension part will take ownership so destructors will not be run during + // deallocation. 
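Illustrative sketch (not part of the vendored patch): as the comments above describe, error reporting goes through the TF_LITE_REPORT_ERROR macro so that message strings can be stripped from size-sensitive builds, and alternative destinations are provided by subclassing ErrorReporter. Below is a minimal subclass that accumulates messages in memory; the class name BufferErrorReporter is hypothetical.

#include <cstdarg>
#include <cstdio>
#include <string>

// Sketch: an ErrorReporter that appends formatted messages to a string.
class BufferErrorReporter : public tflite::ErrorReporter {
 public:
  int Report(const char* format, va_list args) override {
    char buffer[512];
    int written = std::vsnprintf(buffer, sizeof(buffer), format, args);
    if (written > 0) log_.append(buffer);
    return written;  // negative on formatting error, per the contract above
  }
  const std::string& log() const { return log_; }

 private:
  std::string log_;
};

// Usage (compiles away if TF_LITE_STRIP_ERROR_STRINGS is defined):
//   BufferErrorReporter reporter;
//   TF_LITE_REPORT_ERROR(&reporter, "tensor %d has an unexpected type", 3);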
+ template + T* AllocatePOD() { + // TODO(b/154346074): Change this to is_trivially_destructible when all + // platform targets support that properly. + static_assert(std::is_pod::value, "Builtin data structure must be POD."); + void* allocated_memory = this->Allocate(sizeof(T), alignof(T)); + return new (allocated_memory) T(); + } + + virtual ~BuiltinDataAllocator() = default; +}; + +// Parse the appropriate data out of the op. +// +// This handles builtin data explicitly as there are flatbuffer schemas. +// If it returns kTfLiteOk, it passes the data out with `builtin_data`. The +// calling function has to pass in an allocator object, and this allocator +// will be called to reserve space for the output data. If the calling +// function's allocator reserves memory on the heap, then it's the calling +// function's responsibility to free it. +// If it returns kTfLiteError, `builtin_data` will be `nullptr`. +absl::Status ParseOpData(const tflite::Operator* op, + tflite::BuiltinOperator op_type, + BuiltinDataAllocator* allocator, void** builtin_data); + +// Converts the tensor data type used in the flat buffer to the representation +// used by the runtime. +absl::Status ConvertTensorType(tflite::TensorType tensor_type, + TfLiteType* type); + +absl::Status ParseAbs(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseAdd(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseAddN(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseArgMax(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseArgMin(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseAssignVariable(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseBatchMatMul(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseBatchToSpaceNd(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseBroadcastArgs(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseBroadcastTo(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseCallOnce(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseCeil(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseCast(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseConcatenation(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseConv2D(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseCos(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseCumsum(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseDepthToSpace(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseDepthwiseConv2D(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseDequantize(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseDiv(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseElu(const 
Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseEmbeddingLookup(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseEqual(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseExp(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseExpandDims(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseFill(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseFloor(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseFloorDiv(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseFloorMod(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseFullyConnected(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseGather(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseGatherNd(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseGreater(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseGreaterEqual(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseHardSwish(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseIf(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseL2Normalization(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLeakyRelu(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLess(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLessEqual(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLog(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLogicalAnd(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLogicalNot(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLogicalOr(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLogistic(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLogSoftmax(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseLSTM(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseMaximum(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseMinimum(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseMirrorPad(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseMul(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseNeg(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseNotEqual(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePack(const Operator* op, 
BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePad(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePadV2(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePool(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePow(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParsePrelu(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseQuantize(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseReadVariable(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseReducer(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseRelu(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseRelu6(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseReshape(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseResizeBilinear(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseResizeNearestNeighbor(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseRound(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseRsqrt(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSelectV2(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseShape(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSin(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSlice(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSoftmax(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSpaceToBatchNd(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSpaceToDepth(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSplit(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSplitV(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); +absl::Status ParseSqueeze(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSqrt(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); +absl::Status ParseSquare(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSquaredDifference(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStridedSlice(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSub(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseSvdf(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseTanh(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseTranspose(const Operator* op, BuiltinDataAllocator* allocator, + 
void** builtin_data); + +absl::Status ParseTransposeConv(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseUnpack(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseUnidirectionalSequenceLSTM(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseVarHandle(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseWhile(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseZerosLike(const Operator* op, BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseBitwiseXor(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseRightShift(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloScatter(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloRngBitGenerator(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloGather(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloReduceWindow(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloPad(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloComposite(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloShiftLeft(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +absl::Status ParseStablehloCase(const Operator* op, + BuiltinDataAllocator* allocator, + void** builtin_data); + +} // namespace flatbuffer_conversions +} // namespace tflite_file + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/verifier.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/verifier.h new file mode 100644 index 00000000..2e24347d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/api/verifier.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// \file +/// +/// Abstract interface for verifying a model. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_VERIFIER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_VERIFIER_H_ + +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" + +namespace tflite { + +/// Abstract interface that verifies whether a given model is legit. +/// It facilitates the use-case to verify and build a model without loading it +/// twice. +/// (See also "tensorflow/lite/tools/verifier.h".) 
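Illustrative sketch (not part of the vendored patch): ParseOpData and the per-op parsers above all write their result through an allocator supplied by the caller, which keeps the parsed builtin data out of the parser's ownership; per the contract above, the caller deallocates `builtin_data` afterwards. A minimal malloc-backed allocator is sketched below; the class name MallocDataAllocator is hypothetical.

#include <cstdlib>

// Sketch: hand ParseOpData a trivial heap-backed allocator.
class MallocDataAllocator
    : public tflite_file::flatbuffer_conversions::BuiltinDataAllocator {
 public:
  void* Allocate(size_t size, size_t alignment_hint) override {
    (void)alignment_hint;  // assume malloc's alignment is sufficient here
    return std::malloc(size);
  }
  void Deallocate(void* data) override { std::free(data); }
};

// Usage, given a `const tflite::Operator* op` from the flatbuffer:
//   MallocDataAllocator allocator;
//   void* builtin_data = nullptr;
//   absl::Status status = tflite_file::flatbuffer_conversions::ParseOpData(
//       op, tflite::BuiltinOperator_CONV_2D, &allocator, &builtin_data);
//   // ... use builtin_data, then allocator.Deallocate(builtin_data);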
+class TfLiteVerifier { + public: + /// Returns true if the model is legit. + virtual bool Verify(const char* data, int length, + ErrorReporter* reporter) = 0; + virtual ~TfLiteVerifier() {} +}; + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_API_VERIFIER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h new file mode 100644 index 00000000..1327162f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h @@ -0,0 +1,670 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// WARNING: Users of TensorFlow Lite should not include this file directly, +/// but should instead include +/// "third_party/tensorflow/lite/c/builtin_op_data.h". +/// Only the TensorFlow Lite implementation itself should include this +/// file directly. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_ + +#include // IWYU pragma: keep +#include +#include + +#include "tensorflow/compiler/mlir/lite/core/c/tflite_types.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// TfLiteReshapeParams can't have dynamic data so we fix the maximum possible +// number of dimensions. +#define TFLITE_RESHAPE_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_CASE_PARAMS_MAX_BRANCHES_COUNT 20 + +// TODO(aselle): Consider using "if this then that" for testing. + +// Useful placeholder to put in otherwise empty structs to avoid size warnings. +typedef struct { + char dummy; +} EmptyStructPlaceholder; + +// IMPORTANT: All new members of structs must be added at the end to ensure +// backwards compatibility. + +// Possible padding types (for convolutions) +typedef enum { + kTfLitePaddingUnknown = 0, + kTfLitePaddingSame, + kTfLitePaddingValid, +} TfLitePadding; + +typedef enum { + kTfLiteMirrorPaddingUnknown = 0, + kTfLiteMirrorPaddingReflect, + kTfLiteMirrorPaddingSymmetric, +} TfLiteMirrorPaddingMode; + +// TODO(b/130259536): We should move this out of builtin_op_data. +typedef struct { + int width; + int height; + int width_offset; + int height_offset; +} TfLitePaddingValues; + +typedef struct { + TfLiteMirrorPaddingMode mode; +} TfLiteMirrorPaddingParams; + +// Possible fused activation functions. +typedef enum { + kTfLiteActNone = 0, + kTfLiteActRelu, + kTfLiteActReluN1To1, // min(max(-1, x), 1) + kTfLiteActRelu6, // min(max(0, x), 6) + kTfLiteActTanh, + kTfLiteActSignBit, + kTfLiteActSigmoid, +} TfLiteFusedActivation; + +typedef struct { + // Parameters for CONV_2D version 1. 
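Illustrative sketch (not part of the vendored patch): TfLiteVerifier lets a loader validate raw model bytes before building a model from them. The sketch below only checks the flatbuffer file identifier; it assumes the standard "TFL3" identifier stored at the usual 4-byte flatbuffers offset, and the class name MinimalVerifier is hypothetical.

#include <cstring>

// Sketch: reject buffers that cannot be TFLite flatbuffers at all.
class MinimalVerifier : public tflite::TfLiteVerifier {
 public:
  bool Verify(const char* data, int length,
              tflite::ErrorReporter* reporter) override {
    // Assumption: the "TFL3" file identifier sits at bytes 4..7 of the buffer.
    if (data == nullptr || length < 8 ||
        std::memcmp(data + 4, "TFL3", 4) != 0) {
      TF_LITE_REPORT_ERROR(reporter, "Buffer is not a TFLite flatbuffer");
      return false;
    }
    return true;
  }
};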
+ TfLitePadding padding; + int stride_width; + int stride_height; + TfLiteFusedActivation activation; + + // Parameters for CONV_2D version 2. + // Note: Version 2 supports dilation values not equal to 1. + int dilation_width_factor; + int dilation_height_factor; + + // Parameters for CONV_2D version 7 or above. + // Used to determine the default value for the quantized bias. + TfLiteType quantized_bias_type; +} TfLiteConvParams; + +typedef struct { + TfLitePadding padding; + int stride_width; + int stride_height; + int stride_depth; + int dilation_width_factor; + int dilation_height_factor; + int dilation_depth_factor; + TfLiteFusedActivation activation; +} TfLiteConv3DParams; + +typedef TfLiteConv3DParams TfLiteConv3DTransposeParams; + +typedef struct { + TfLitePadding padding; + int stride_width; + int stride_height; + int filter_width; + int filter_height; + TfLiteFusedActivation activation; + struct { + TfLitePaddingValues padding; + } computed; +} TfLitePoolParams; + +typedef struct { + // Parameters for DepthwiseConv version 1 or above. + TfLitePadding padding; + int stride_width; + int stride_height; + // `depth_multiplier` is redundant. It's used by CPU kernels in + // TensorFlow 2.0 or below, but ignored in versions above. + // + // The information can be deduced from the shape of input and the shape of + // weights. Since the TFLiteConverter toolchain doesn't support partially + // specified shapes, relying on `depth_multiplier` stops us from supporting + // graphs with dynamic shape tensors. + // + // Note: Some of the delegates (e.g. NNAPI, GPU) are still relying on this + // field. + int depth_multiplier; + TfLiteFusedActivation activation; + // Parameters for DepthwiseConv version 2 or above. + int dilation_width_factor; + int dilation_height_factor; +} TfLiteDepthwiseConvParams; + +typedef struct { + int rank; + TfLiteFusedActivation activation; + + // Parameter for SVDF version 4. + bool asymmetric_quantize_inputs; +} TfLiteSVDFParams; + +typedef struct { + TfLiteFusedActivation activation; + + // Parameter for RNN version 3. + bool asymmetric_quantize_inputs; +} TfLiteRNNParams; + +typedef struct { + bool time_major; + TfLiteFusedActivation activation; + + // Parameter for Sequence RNN version 3. + bool asymmetric_quantize_inputs; +} TfLiteSequenceRNNParams; + +typedef struct { + bool time_major; + TfLiteFusedActivation activation; + bool merge_outputs; + + // Parameter for Bidirectional RNN version 3. + bool asymmetric_quantize_inputs; +} TfLiteBidirectionalSequenceRNNParams; + +typedef enum { + kTfLiteFullyConnectedWeightsFormatDefault = 0, + kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8 = 1, +} TfLiteFullyConnectedWeightsFormat; + +typedef struct { + // Parameters for FullyConnected version 1 or above. + TfLiteFusedActivation activation; + + // Parameters for FullyConnected version 2 or above. + TfLiteFullyConnectedWeightsFormat weights_format; + + // Parameters for FullyConnected version 5 or above. + // If set to true, then the number of dimensions in the input and the output + // tensors are the same. Furthermore, all but the last dimension of the input + // and output shapes will be equal. + bool keep_num_dims; + + // Parameters for FullyConnected version 7 or above. + // If set to true and the weights are quantized, then non constant inputs + // are quantized at evaluation time with asymmetric quantization. + bool asymmetric_quantize_inputs; + + // Parameters for FullyConnected version 10 or above. 
+ // Used to determine the default value for the quantized bias. + TfLiteType quantized_bias_type; +} TfLiteFullyConnectedParams; + +typedef enum { + kTfLiteLshProjectionUnknown = 0, + kTfLiteLshProjectionSparse = 1, + kTfLiteLshProjectionDense = 2, +} TfLiteLSHProjectionType; + +typedef struct { + TfLiteLSHProjectionType type; +} TfLiteLSHProjectionParams; + +typedef struct { + float beta; +} TfLiteSoftmaxParams; + +typedef struct { + int axis; + TfLiteFusedActivation activation; +} TfLiteConcatenationParams; + +typedef struct { + TfLiteFusedActivation activation; + // Parameter added for the version 4. + bool pot_scale_int16; +} TfLiteAddParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteSpaceToBatchNDParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteBatchToSpaceNDParams; + +typedef struct { + bool adj_x; + bool adj_y; + // Parameters for BatchMatMul version 4 or above. + // If set to true and the weights are quantized, then non constant inputs + // are quantized at evaluation time with asymmetric quantization. + bool asymmetric_quantize_inputs; +} TfLiteBatchMatMulParams; + +typedef struct { + TfLiteFusedActivation activation; +} TfLiteMulParams; + +typedef struct { + TfLiteFusedActivation activation; + // Parameter added for the version 5. + bool pot_scale_int16; +} TfLiteSubParams; + +typedef struct { + TfLiteFusedActivation activation; +} TfLiteDivParams; + +typedef struct { + TfLiteFusedActivation activation; +} TfLiteL2NormParams; + +typedef struct { + int radius; + float bias; + float alpha; + float beta; +} TfLiteLocalResponseNormParams; + +typedef enum { + kTfLiteLSTMFullKernel = 0, + kTfLiteLSTMBasicKernel +} TfLiteLSTMKernelType; + +typedef struct { + // Parameters for LSTM version 1. + TfLiteFusedActivation activation; + float cell_clip; + float proj_clip; + + // Parameters for LSTM version 2. + // kTfLiteLSTMBasicKernel is only supported in version 2 or above. + TfLiteLSTMKernelType kernel_type; + + // Parameters for LSTM version 4. + bool asymmetric_quantize_inputs; +} TfLiteLSTMParams; + +typedef struct { + // Parameters needed for the underlying LSTM. + TfLiteFusedActivation activation; + float cell_clip; + float proj_clip; + + // If set to true then the first dimension is time, otherwise batch. + bool time_major; + + // Parameter for unidirectional sequence RNN version 3. + bool asymmetric_quantize_inputs; + + // Parameter for unidirectional sequence RNN version 4. + bool diagonal_recurrent_tensors; +} TfLiteUnidirectionalSequenceLSTMParams; + +typedef struct { + // Parameters supported by version 1: + // Parameters inherited for the LSTM kernel. + TfLiteFusedActivation activation; + float cell_clip; + float proj_clip; + + // If true, store the outputs of both directions in the first output. + bool merge_outputs; + + // Parameters supported by version 2: + // If set to true then the first dimension is time, otherwise batch. + bool time_major; + + // Parameters supported by version 3: + // If set to true, then hybrid ops use asymmetric quantization for inputs. + bool asymmetric_quantize_inputs; +} TfLiteBidirectionalSequenceLSTMParams; + +typedef struct { + bool align_corners; + // half_pixel_centers assumes pixels are of half the actual dimensions, and + // yields more accurate resizes. Corresponds to the same argument for the + // original TensorFlow op in TF2.0. 
+ bool half_pixel_centers; +} TfLiteResizeBilinearParams; + +typedef struct { + bool align_corners; + bool half_pixel_centers; +} TfLiteResizeNearestNeighborParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLitePadParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLitePadV2Params; + +typedef struct { + // These fields are only used in old models for backward compatibility. + // In the current implementation, we use the 2nd input of the op as the shape, + // and these fields are unused. + int32_t shape[TFLITE_RESHAPE_PARAMS_MAX_DIMENSION_COUNT]; + int num_dimensions; +} TfLiteReshapeParams; + +typedef struct { + int ngram_size; + int max_skip_size; + bool include_all_ngrams; +} TfLiteSkipGramParams; + +typedef struct { + int block_size; +} TfLiteSpaceToDepthParams; + +typedef struct { + int block_size; +} TfLiteDepthToSpaceParams; + +typedef struct { + TfLiteType in_data_type; + TfLiteType out_data_type; +} TfLiteCastParams; + +typedef enum { + kTfLiteCombinerTypeSum = 0, + kTfLiteCombinerTypeMean = 1, + kTfLiteCombinerTypeSqrtn = 2, +} TfLiteCombinerType; + +typedef struct { + TfLiteCombinerType combiner; +} TfLiteEmbeddingLookupSparseParams; + +typedef struct { + int axis; + int batch_dims; +} TfLiteGatherParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteTransposeParams; + +typedef struct { + bool keep_dims; +} TfLiteReducerParams; + +typedef struct { + int num_splits; +} TfLiteSplitParams; + +typedef struct { + int num_splits; +} TfLiteSplitVParams; + +typedef struct { + // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. + // For now we will fix the maximum possible number of dimensions. + int32_t squeeze_dims[8]; + int num_squeeze_dims; +} TfLiteSqueezeParams; + +typedef struct { + int begin_mask; + int end_mask; + int ellipsis_mask; + int new_axis_mask; + int shrink_axis_mask; + + // Parameters supported by version 8: + // If true, then the end tensor is an offset of the begin tensor. + bool offset; +} TfLiteStridedSliceParams; + +typedef struct { + TfLiteType output_type; +} TfLiteArgMaxParams; + +typedef struct { + TfLiteType output_type; +} TfLiteArgMinParams; + +typedef struct { + // Parameters supported by version 1: + TfLitePadding padding; + int stride_width; + int stride_height; + + // Parameters supported by version 4: + TfLiteFusedActivation activation; + + // Parameters for TransposeConv version 5 or above. + // Used to determine the default value for the quantized bias. 
+ TfLiteType quantized_bias_type; +} TfLiteTransposeConvParams; + +typedef struct { + bool validate_indices; +} TfLiteSparseToDenseParams; + +typedef struct { + TfLiteType out_type; +} TfLiteShapeParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteRankParams; + +typedef struct { + // Parameters supported by version 1: + float min; + float max; + int num_bits; + + // Parameters supported by version 2: + bool narrow_range; +} TfLiteFakeQuantParams; + +typedef struct { + int values_count; + int axis; +} TfLitePackParams; + +typedef struct { + int axis; +} TfLiteOneHotParams; + +typedef struct { + int num; + int axis; +} TfLiteUnpackParams; + +typedef struct { + float alpha; +} TfLiteLeakyReluParams; + +typedef struct { + TfLiteType index_out_type; +} TfLiteUniqueParams; + +typedef struct { + int seq_dim; + int batch_dim; +} TfLiteReverseSequenceParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteMatrixDiagParams; + +typedef struct { + EmptyStructPlaceholder placeholder; +} TfLiteMatrixSetDiagParams; + +typedef struct { + int then_subgraph_index; + int else_subgraph_index; +} TfLiteIfParams; + +typedef struct { + int cond_subgraph_index; + int body_subgraph_index; +} TfLiteWhileParams; + +typedef struct { + bool exclusive; + bool reverse; +} TfLiteCumsumParams; + +typedef struct { + int init_subgraph_index; +} TfLiteCallOnceParams; + +typedef struct { + int table_id; + TfLiteType key_dtype; + TfLiteType value_dtype; +} TfLiteHashtableParams; + +typedef struct { + const char* container; + const char* shared_name; +} TfLiteVarHandleParams; + +typedef struct { + int seed; + int seed2; +} TfLiteRandomParams; + +typedef struct { + int num_boundaries; + // This points to the memory stored in the model (flatbuffer), + // and is not owned. + const float* boundaries; +} TfLiteBucketizeParams; + +typedef struct { + bool approximate; +} TfLiteGeluParams; + +typedef struct { + int64_t dimension; +} TfLiteStablehloConcatenateParams; + +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#scatter + bool indices_are_sorted; + int64_t + update_window_dims[TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT]; + int num_update_window_dims; + int64_t + inserted_window_dims[TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT]; + int num_inserted_window_dims; + int64_t scatter_dims_to_operand_dims + [TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT]; + int num_scatter_dims_to_operand_dims; + int64_t index_vector_dim; + bool unique_indices; + int update_computation_subgraph_index; +} TfLiteStablehloScatterParams; + +typedef enum { + kTfLiteRngAlgorithmUnknown = 0, + // An algorithm auto-selected by the system according to device type. 
+ kTfLiteRngAlgorithmDefault, + // The Philox algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + kTfLiteRngAlgorithmPhilox, + // The ThreeFry algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + kTfLiteRngAlgorithmThreefry, +} TfLiteRngAlgorithm; + +typedef struct { + TfLiteRngAlgorithm algorithm; +} TfLiteStablehloRngBitGeneratorParams; + +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#gather + int64_t offset_dims[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT]; + int num_offset_dims; + int64_t + collapsed_slice_dims[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT]; + int num_collapsed_slice_dims; + int64_t start_index_map[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT]; + int num_start_index_map; + int64_t index_vector_dim; + int64_t slice_sizes[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT]; + int num_slice_sizes; + bool indices_are_sorted; +} TfLiteStablehloGatherParams; + +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#reduce_window + int64_t window_dimensions + [TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT]; + int64_t + window_strides[TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT]; + int64_t + base_dilations[TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT]; + int64_t window_dilations + [TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT]; + int64_t + padding[2 * TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT]; + int body_subgraph_index; +} TfLiteStablehloReduceWindowParams; + +enum TfLiteReduceWindowFunction { + TfLiteReduceWindowFunctionUnsupported, + TfLiteReduceWindowFunctionAdd, + TfLiteReduceWindowFunctionMul, + TfLiteReduceWindowFunctionMin, + TfLiteReduceWindowFunctionMax, + TfLiteReduceWindowFunctionAll, + TfLiteReduceWindowFunctionAny +}; + +typedef struct { + enum TfLiteReduceWindowFunction reduce_function; +} TfLiteReduceWindowParams; + +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#pad + int64_t edge_padding_low[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; + int64_t edge_padding_high[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; + int64_t interior_padding[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; +} TfLiteStablehloPadParams; + +typedef struct { + const char* name; + int32_t subgraph_index; + int32_t version; + const uint8_t* attributes; + size_t attributes_size; +} TfLiteStablehloCompositeParams; + +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#case + int32_t + branch_subgraph_indices[TFLITE_STABLEHLO_CASE_PARAMS_MAX_BRANCHES_COUNT]; + uint32_t num_branches; +} TfLiteStablehloCaseParams; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/tflite_types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/tflite_types.h new file mode 100644 index 00000000..068facb1 --- /dev/null +++ 
b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/c/tflite_types.h @@ -0,0 +1,90 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file hosts data structures that are needed both for LiteRT and +// Compiler. + +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include "third_party/tensorflow/lite/c/c_api_types.h". +// Only the TensorFlow Lite implementation itself should include this file +// directly. + +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/c/c_api_types.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on + +// IWYU pragma: private, include "third_party/tensorflow/lite/c/c_api_types.h" + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_TFLITE_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_TFLITE_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/// Types supported by tensor +// LINT.IfChange +typedef enum { + kTfLiteNoType = 0, + kTfLiteFloat32 = 1, + kTfLiteInt32 = 2, + kTfLiteUInt8 = 3, + kTfLiteInt64 = 4, + kTfLiteString = 5, + kTfLiteBool = 6, + kTfLiteInt16 = 7, + kTfLiteComplex64 = 8, + kTfLiteInt8 = 9, + kTfLiteFloat16 = 10, + kTfLiteFloat64 = 11, + kTfLiteComplex128 = 12, + kTfLiteUInt64 = 13, + kTfLiteResource = 14, + kTfLiteVariant = 15, + kTfLiteUInt32 = 16, + kTfLiteUInt16 = 17, + kTfLiteInt4 = 18, + kTfLiteBFloat16 = 19, +} TfLiteType; +// LINT.ThenChange(//tensorflow/lite/profiling/proto/model_runtime_info.proto:EdgeDataType) + +/// Legacy. Will be deprecated in favor of `TfLiteAffineQuantization`. +/// If per-layer quantization is specified this field will still be populated in +/// addition to `TfLiteAffineQuantization`. +/// Parameters for asymmetric quantization. Quantized values can be converted +/// back to float using: `real_value = scale * (quantized_value - zero_point)` +typedef struct TfLiteQuantizationParams { + float scale; + int32_t zero_point; +} TfLiteQuantizationParams; + +/// Storage format of each dimension in a sparse tensor. +typedef enum TfLiteDimensionType { + kTfLiteDimDense = 0, + kTfLiteDimSparseCSR, +} TfLiteDimensionType; + +#ifdef __cplusplus +} // extern C +#endif + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_TFLITE_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/macros.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/macros.h new file mode 100644 index 00000000..c18984d3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/macros.h @@ -0,0 +1,50 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This provides utility macros and functions that are inherently platform +// specific or shared across runtime & converter. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_MACROS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_MACROS_H_ + +#ifndef TF_LITE_STATIC_MEMORY +// maximum size of a valid flatbuffer +inline constexpr unsigned int flatbuffer_size_max = 2147483648; +// If none zero then the buffer is stored outside of the flatbuffers, string +inline constexpr char tflite_metadata_buffer_location[] = "buffer_location"; +// field for minimum runtime version, string +inline constexpr char tflite_metadata_min_runtime_version[] = + "min_runtime_version"; +// the stablehlo op version is supported by the tflite runtime +inline constexpr char tflite_supported_stablehlo_version[] = "1.0.0"; +#endif + +// LINT.IfChange(TFLITE_NOINLINE) + +#ifdef _WIN32 +#define TFLITE_NOINLINE __declspec(noinline) +#else +#ifdef __has_attribute +#if __has_attribute(noinline) +#define TFLITE_NOINLINE __attribute__((noinline)) +#else +#define TFLITE_NOINLINE +#endif // __has_attribute(noinline) +#else +#define TFLITE_NOINLINE +#endif // __has_attribute +#endif // _WIN32 + +// LINT.ThenChange(//tensorflow/lite/core/macros.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_MACROS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/model_builder_base.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/model_builder_base.h new file mode 100644 index 00000000..e7892cc0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/core/model_builder_base.h @@ -0,0 +1,614 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// \file +/// +/// Deserialization infrastructure for tflite. Provides functionality +/// to go from a serialized tflite model in flatbuffer format to an +/// in-memory representation of the model. +/// +/// WARNING: Users of TensorFlow Lite should not include this file directly, +/// but should instead include "third_party/tensorflow/lite/model_builder.h". +/// Only the TensorFlow Lite implementation itself should include this +/// file directly. 
+// IWYU pragma: private, include "third_party/tensorflow/lite/model_builder.h" + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_MODEL_BUILDER_BASE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_CORE_MODEL_BUILDER_BASE_H_ + +#include + +#include +#include +#include +#include +#include +#include + +#include "flatbuffers/base.h" // from @flatbuffers +#include "flatbuffers/buffer.h" // from @flatbuffers +#include "flatbuffers/vector.h" // from @flatbuffers +#include "flatbuffers/verifier.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/allocation.h" +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" +#include "tensorflow/compiler/mlir/lite/core/api/verifier.h" +#include "tensorflow/compiler/mlir/lite/core/macros.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace tflite { + +std::unique_ptr GetAllocationFromFile( + const char* filename, ErrorReporter* error_reporter); + +std::unique_ptr GetAllocationFromFile( + int fd, ErrorReporter* error_reporter); + +namespace impl { + +/// An RAII object that represents a read-only tflite model, copied from disk, +/// or mmapped. This uses flatbuffers as the serialization format. +/// +/// NOTE: The current API requires that a FlatBufferModelBase instance be kept +/// alive by the client as long as it is in use by any dependent Interpreter +/// instances. As the FlatBufferModelBase instance is effectively immutable +/// after creation, the client may safely use a single model with multiple +/// dependent Interpreter instances, even across multiple threads (though note +/// that each Interpreter instance is *not* thread-safe). +/// +///
+/// <pre><code>
+/// using namespace tflite;
+/// StderrReporter error_reporter;
+/// auto model = FlatBufferModelBase::BuildFromFile("interesting_model.tflite",
+///                                             &error_reporter);
+/// MyOpResolver resolver;  // You need to subclass OpResolver to provide
+///                         // implementations.
+/// InterpreterBuilder builder(*model, resolver);
+/// std::unique_ptr<Interpreter> interpreter;
+/// if(builder(&interpreter) == kTfLiteOk) {
+///   .. run model inference with interpreter
+/// }
+/// </code></pre>
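FlatBufferModelBase is parameterized on the concrete model class, which is expected to supply GetDefaultErrorReporter() and to let the base's factory methods reach its constructors. The following is a minimal sketch, not part of this header; MyFlatBufferModel and MyStderrReporter are hypothetical names, and the friend/using pattern is an assumption about how the template is meant to be specialized.

#include <cstdarg>
#include <cstdio>

// Hypothetical reporter; ErrorReporter::Report's signature is assumed to be
// int Report(const char* format, va_list args).
class MyStderrReporter : public tflite::ErrorReporter {
 public:
  int Report(const char* format, va_list args) override {
    return std::vfprintf(stderr, format, args);
  }
};

class MyFlatBufferModel
    : public tflite::impl::FlatBufferModelBase<MyFlatBufferModel> {
 public:
  // Used by the Build*/Verify* factories when no reporter is passed in.
  static tflite::ErrorReporter* GetDefaultErrorReporter() {
    static MyStderrReporter reporter;
    return &reporter;
  }

 private:
  // The factories call `new MyFlatBufferModel(...)`, so the base needs access
  // to the inherited (protected) constructors.
  friend class tflite::impl::FlatBufferModelBase<MyFlatBufferModel>;
  using FlatBufferModelBase::FlatBufferModelBase;
};

// e.g. auto model = MyFlatBufferModel::BuildFromFile("interesting_model.tflite");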
+/// +/// OpResolver must be defined to provide your kernel implementations to the +/// interpreter. This is environment specific and may consist of just the +/// builtin ops, or some custom operators you defined to extend tflite. +template +class FlatBufferModelBase { + public: + /// Builds a model based on a file. + /// Caller retains ownership of `error_reporter` and must ensure its lifetime + /// is longer than the FlatBufferModelBase instance. + /// Returns a nullptr in case of failure. + static std::unique_ptr BuildFromFile( + const char* filename, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr model = BuildFromAllocation( + GetAllocationFromFile(filename, error_reporter), error_reporter); +#if FLATBUFFERS_LITTLEENDIAN == 1 + return model; +#else + return ByteConvertModel(std::move(model), error_reporter); +#endif + } + + /// Verifies whether the content of the file is legit, then builds a model + /// based on the file. + /// The extra_verifier argument is an additional optional verifier for the + /// file contents. By default, we always check with tflite::VerifyModelBuffer. + /// If extra_verifier is supplied, the file contents is also checked against + /// the extra_verifier after the check against tflite::VerifyModelBuilder. + /// Caller retains ownership of `error_reporter` and must ensure its lifetime + /// is longer than the FlatBufferModelBase instance. + /// Returns a nullptr in case of failure. + static std::unique_ptr VerifyAndBuildFromFile( + const char* filename, TfLiteVerifier* extra_verifier = nullptr, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr model = VerifyAndBuildFromAllocation( + GetAllocationFromFile(filename, error_reporter), extra_verifier, + error_reporter); +#if FLATBUFFERS_LITTLEENDIAN == 1 + return model; +#else + return ByteConvertModel(std::move(model), error_reporter); +#endif + } + + /// Builds a model based on a file descriptor. + /// Caller retains ownership of `error_reporter` and must ensure its lifetime + /// is longer than the FlatBufferModelBase instance. Caller retains ownership + /// of `fd` and must ensure it is closed after BuildFromFile returns. Returns + /// a nullptr in case of failure. + static std::unique_ptr BuildFromFileDescriptor( + int fd, ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr model = BuildFromAllocation( + GetAllocationFromFile(fd, error_reporter), error_reporter); +#if FLATBUFFERS_LITTLEENDIAN == 1 + return model; +#else + return ByteConvertModel(std::move(model), error_reporter); +#endif + } + + /// Verifies whether the content of the file descriptor is legit, then builds + /// a model based on the file. + /// The extra_verifier argument is an additional optional verifier for the + /// file contents. By default, we always check with tflite::VerifyModelBuffer. + /// If extra_verifier is supplied, the file contents is also checked against + /// the extra_verifier after the check against tflite::VerifyModelBuilder. + /// Caller retains ownership of `error_reporter` and must ensure its lifetime + /// is longer than the FlatBufferModelBase instance. + /// Returns a nullptr in case of failure. 
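The extra_verifier hook lets callers layer their own checks on top of tflite::VerifyModelBuffer. A short sketch, not part of this header, follows; the exact TfLiteVerifier::Verify parameter types are an assumption inferred from the call site in VerifyAndBuildFromAllocation further down, and the size cap is an arbitrary illustration value.

// Sketch only; TfLiteVerifier is declared in core/api/verifier.h and the
// Verify() signature below is assumed.
class SizeCappedVerifier : public tflite::TfLiteVerifier {
 public:
  bool Verify(const char* data, int length,
              tflite::ErrorReporter* reporter) override {
    (void)data;
    if (length > 64 * 1024 * 1024) {  // arbitrary 64 MiB cap
      TF_LITE_REPORT_ERROR(reporter, "Rejecting models larger than 64 MiB");
      return false;
    }
    return true;
  }
};

// e.g., paired with the hypothetical MyFlatBufferModel sketched earlier:
//   SizeCappedVerifier verifier;
//   auto model =
//       MyFlatBufferModel::VerifyAndBuildFromFile("model.tflite", &verifier);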
+ static std::unique_ptr VerifyAndBuildFromFileDescriptor( + int fd, TfLiteVerifier* extra_verifier = nullptr, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr> model = + VerifyAndBuildFromAllocation(GetAllocationFromFile(fd, error_reporter), + extra_verifier, error_reporter); +#if FLATBUFFERS_LITTLEENDIAN == 1 + return model; +#else + return ByteConvertModel(std::move(model), error_reporter); +#endif + } + + /// Builds a model based on a pre-loaded flatbuffer. + /// Caller retains ownership of the buffer and should keep it alive until + /// the returned object is destroyed. Caller also retains ownership of + /// `error_reporter` and must ensure its lifetime is longer than the + /// FlatBufferModelBase instance. + /// Returns a nullptr in case of failure. + /// NOTE: this does NOT validate the buffer so it should NOT be called on + /// invalid/untrusted input. Use VerifyAndBuildFromBuffer in that case + static std::unique_ptr BuildFromBuffer( + const char* caller_owned_buffer, size_t buffer_size, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr allocation( + new MemoryAllocation(caller_owned_buffer, buffer_size, error_reporter)); + return BuildFromAllocation(std::move(allocation), error_reporter); + } + + /// Verifies whether the content of the buffer is legit, then builds a model + /// based on the pre-loaded flatbuffer. + /// The extra_verifier argument is an additional optional verifier for the + /// buffer. By default, we always check with tflite::VerifyModelBuffer. If + /// extra_verifier is supplied, the buffer is checked against the + /// extra_verifier after the check against tflite::VerifyModelBuilder. The + /// caller retains ownership of the buffer and should keep it alive until the + /// returned object is destroyed. Caller retains ownership of `error_reporter` + /// and must ensure its lifetime is longer than the FlatBufferModelBase + /// instance. Returns a nullptr in case of failure. + static std::unique_ptr VerifyAndBuildFromBuffer( + const char* caller_owned_buffer, size_t buffer_size, + TfLiteVerifier* extra_verifier = nullptr, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + std::unique_ptr allocation( + new MemoryAllocation(caller_owned_buffer, buffer_size, error_reporter)); + return VerifyAndBuildFromAllocation(std::move(allocation), extra_verifier, + error_reporter); + } + +#if FLATBUFFERS_LITTLEENDIAN == 0 + + void ByteSwapSerializedModel(std::string* serialized_model, + bool from_big_endian) { + const uint8_t* buffer = + reinterpret_cast(serialized_model->c_str()); + const tflite::Model* input_model = tflite::GetModel(buffer); + ByteSwapTFLiteModel(input_model, from_big_endian); + } + + void ByteSwapBuffer(int8_t tensor_type, size_t buffer_size, uint8_t* buffer, + bool from_big_endian) { + switch (tensor_type) { + case tflite::TensorType_STRING: { + auto bp = reinterpret_cast(buffer); + int num_of_strings = + from_big_endian ? 
bp[0] : flatbuffers::EndianSwap(bp[0]); + for (int i = 0; i < num_of_strings + 2; i++) + bp[i] = flatbuffers::EndianSwap(bp[i]); + break; + } + // 16-bit types + case tflite::TensorType_FLOAT16: + case tflite::TensorType_INT16: + case tflite::TensorType_UINT16: { + auto bp = reinterpret_cast(buffer); + for (int i = 0; i < buffer_size / 2; i++) + bp[i] = flatbuffers::EndianSwap(bp[i]); + break; + } + // 32-bit types + case tflite::TensorType_FLOAT32: + case tflite::TensorType_INT32: + case tflite::TensorType_UINT32: + case tflite::TensorType_COMPLEX64: { + auto bp = reinterpret_cast(buffer); + for (int i = 0; i < buffer_size / 4; i++) + bp[i] = flatbuffers::EndianSwap(bp[i]); + break; + } + // 64-bit types + case tflite::TensorType_INT64: + case tflite::TensorType_FLOAT64: + case tflite::TensorType_UINT64: + case tflite::TensorType_COMPLEX128: { + auto bp = reinterpret_cast(buffer); + for (int i = 0; i < buffer_size / 8; i++) + bp[i] = flatbuffers::EndianSwap(bp[i]); + break; + } + default: + break; + } + } + + void ByteSwapTFLiteModel(const tflite::Model* tfl_model, + bool from_big_endian) { + std::vector buffer_swapped(tfl_model->buffers()->size(), false); + for (size_t subgraph_idx = 0; subgraph_idx < tfl_model->subgraphs()->size(); + subgraph_idx++) { + const tflite::SubGraph* subgraph = + tfl_model->subgraphs()->Get(subgraph_idx); + for (size_t ts_idx = 0; ts_idx < subgraph->tensors()->size(); ts_idx++) { + const tflite::Tensor* tensor = subgraph->tensors()->Get(ts_idx); + if (tensor->buffer() > 0 && + tensor->buffer() < tfl_model->buffers()->size() && + !buffer_swapped[tensor->buffer()]) { + const tflite::Buffer* buffer_ = + (*tfl_model->buffers())[tensor->buffer()]; + if (!buffer_ || !buffer_->data()) continue; + auto* buffer = buffer_->data(); + uint8_t* buff_ = const_cast(buffer->data()); + ByteSwapBuffer(tensor->type(), buffer->size(), buff_, + from_big_endian); + buffer_swapped[tensor->buffer()] = true; + } + } + } + } + + std::unique_ptr ByteConvertModel(std::unique_ptr model, + ErrorReporter* error_reporter, + bool from_big_endian) { + if (model == nullptr) return model; + auto tfl_model = model->GetModel(); + if (tfl_model->subgraphs()->size() == 0) return model; + if (tfl_model->subgraphs()->Get(0)->tensors()->size() == 0) return model; + if (tfl_model->buffers()->size() < 2) return model; + return ByteSwapFlatBufferModelBase(std::move(model), error_reporter, + from_big_endian); + } + + std::unique_ptr ByteSwapFlatBufferModelBase(std::unique_ptr model, + ErrorReporter* error_reporter, + bool from_big_endian) { + FlatBufferModelBase* modelp = model.release(); + auto tflite_model = modelp->GetModel(); + auto copied_model = std::make_unique(); + tflite_model->UnPackTo(copied_model.get(), nullptr); + ByteSwapTFLiteModelT(copied_model.get(), from_big_endian); + std::unique_ptr builder( + new flatbuffers::FlatBufferBuilder()); + auto packed_model = tflite::Model::Pack(*builder, copied_model.get()); + tflite::FinishModelBuffer(*builder, packed_model); + flatbuffers::FlatBufferBuilder* builder_ = builder.release(); + return BuildFromBuffer( + reinterpret_cast(builder_->GetBufferPointer()), + builder_->GetSize(), error_reporter); + } + + void ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt, bool from_big_endian) { + size_t bytes_per_elem = 0; + std::vector buffer_swapped(tfl_modelt->buffers.size(), false); + for (size_t subgraph_idx = 0; subgraph_idx < tfl_modelt->subgraphs.size(); + subgraph_idx++) { + tflite::SubGraphT* subgraph = + tfl_modelt->subgraphs.at(subgraph_idx).get(); + 
for (size_t ts_idx = 0; ts_idx < subgraph->tensors.size(); ts_idx++) { + tflite::TensorT* tensor = subgraph->tensors[ts_idx].get(); + if (tensor->buffer > 0 && tensor->buffer < tfl_modelt->buffers.size() && + !buffer_swapped[tensor->buffer]) { + const auto* buffer = + &(tfl_modelt->buffers[tensor->buffer].get()->data); + if (buffer && buffer->data()) { + uint8_t* buff_ = const_cast(buffer->data()); + ByteSwapBuffer(tensor->type, buffer->size(), buff_, + from_big_endian); + buffer_swapped[tensor->buffer] = true; + } + } + } + } + } + +#endif + + /// Builds a model directly from an allocation. + /// Ownership of the allocation is passed to the model, but the caller + /// retains ownership of `error_reporter` and must ensure its lifetime is + /// longer than the FlatBufferModelBase instance. + /// Returns a nullptr in case of failure (e.g., the allocation is invalid). + static std::unique_ptr BuildFromAllocation( + std::unique_ptr allocation, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + std::unique_ptr model( + new T(std::move(allocation), ValidateErrorReporter(error_reporter))); + if (!model->initialized()) { + model.reset(); + } else { + model->ValidateModelBuffers(error_reporter); + } + return model; + } + + /// Verifies whether the content of the allocation is legit, then builds a + /// model based on the provided allocation. + /// The extra_verifier argument is an additional optional verifier for the + /// buffer. By default, we always check with tflite::VerifyModelBuffer. If + /// extra_verifier is supplied, the buffer is checked against the + /// extra_verifier after the check against tflite::VerifyModelBuilder. + /// Ownership of the allocation is passed to the model, but the caller + /// retains ownership of `error_reporter` and must ensure its lifetime is + /// longer than the FlatBufferModelBase instance. + /// Returns a nullptr in case of failure. + static std::unique_ptr VerifyAndBuildFromAllocation( + std::unique_ptr allocation, + TfLiteVerifier* extra_verifier = nullptr, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + if (!allocation || !allocation->valid()) { + TF_LITE_REPORT_ERROR(error_reporter, + "The model allocation is null/empty"); + return nullptr; + } + + { + // Flatbuffers can only be smaller than 2GB. The file format appends some + // data after the actual flabuffer. We truncate the allocation size to 2GB + // so that the verifier doesn't early exit on us. + size_t allocation_size = + std::min(allocation->bytes(), + static_cast(FLATBUFFERS_MAX_BUFFER_SIZE - 1)); + flatbuffers::Verifier base_verifier( + reinterpret_cast(allocation->base()), allocation_size, + flatbuffers::Verifier::Options()); + if (!VerifyModelBuffer(base_verifier)) { + TF_LITE_REPORT_ERROR(error_reporter, + "The model is not a valid Flatbuffer buffer"); + return nullptr; + } + + if (extra_verifier && + !extra_verifier->Verify(static_cast(allocation->base()), + allocation_size, error_reporter)) { + // The verifier will have already logged an appropriate error message. + return nullptr; + } + } + + return BuildFromAllocation(std::move(allocation), error_reporter); + } + + /// Builds a model directly from a flatbuffer pointer + /// Caller retains ownership of the buffer and should keep it alive until the + /// returned object is destroyed. Caller retains ownership of `error_reporter` + /// and must ensure its lifetime is longer than the FlatBufferModelBase + /// instance. 
Returns a nullptr in case of failure. + static std::unique_ptr BuildFromModel( + const tflite::Model* caller_owned_model_spec, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) { + error_reporter = ValidateErrorReporter(error_reporter); + + if (CheckBufferOutsideModel(caller_owned_model_spec)) { + TF_LITE_REPORT_ERROR(error_reporter, + "The model contains weights not accessible from " + "tflite::Model *, please use other api"); + return nullptr; + } + + std::unique_ptr model(new T(caller_owned_model_spec, error_reporter)); + if (!model->initialized()) { + model.reset(); + } else { + model->ValidateModelBuffers(error_reporter); + } + return model; + } + + // Releases memory or unmaps mmaped memory. + ~FlatBufferModelBase() = default; + + // Copying or assignment is disallowed to simplify ownership semantics. + FlatBufferModelBase(const FlatBufferModelBase&) = delete; + FlatBufferModelBase& operator=(const FlatBufferModelBase&) = delete; + + bool initialized() const { return model_ != nullptr; } + const tflite::Model* operator->() const { return model_; } + const tflite::Model* GetModel() const { return model_; } + ErrorReporter* error_reporter() const { return error_reporter_; } + const Allocation* allocation() const { return allocation_.get(); } + + // Returns the minimum runtime version from the flatbuffer. This runtime + // version encodes the minimum required interpreter version to run the + // flatbuffer model. If the minimum version can't be determined, an empty + // string will be returned. + // Note that the returned minimum version is a lower-bound but not a strict + // lower-bound; ops in the graph may not have an associated runtime version, + // in which case the actual required runtime might be greater than the + // reported minimum. + std::string GetMinimumRuntime() const { + if (!model_ || !model_->metadata()) return ""; + + for (int i = 0; i < model_->metadata()->size(); ++i) { + auto metadata = model_->metadata()->Get(i); + if (metadata->name()->str() == tflite_metadata_min_runtime_version) { + auto buf = metadata->buffer(); + auto* buffer = (*model_->buffers())[buf]; + auto* array = buffer->data(); + // Get the real length of the runtime string, since there might be + // trailing + // '\0's in the buffer. + for (int len = 0; len < array->size(); ++len) { + if (array->data()[len] == '\0') { + return std::string(reinterpret_cast(array->data()), + len); + } + } + // If there is no '\0' in the buffer, this indicates that the flatbuffer + // is malformed. + TF_LITE_REPORT_ERROR( + error_reporter_, + "Min_runtime_version in model metadata is malformed"); + break; + } + } + return ""; + } + + // Return model metadata as a mapping of name & buffer strings. + // See Metadata table in TFLite schema. + std::map ReadAllMetadata() const { + return ReadAllMetadata(model_); + } + + // // Return model metadata as a mapping of name & buffer strings. + // // See Metadata table in TFLite schema. 
+ static std::map ReadAllMetadata( + const ::tflite::Model* model) { + std::map keys_values; + if (!model || !model->metadata() || !model->buffers()) return keys_values; + + for (size_t i = 0; i < model->metadata()->size(); ++i) { + auto metadata = model->metadata()->Get(i); + auto buf = metadata->buffer(); + if (buf >= model->buffers()->size()) continue; + const tflite::Buffer* buffer = (*model->buffers())[buf]; + if (!buffer || !buffer->data()) continue; + const flatbuffers::Vector* array = buffer->data(); + if (!array) continue; + std::string val = std::string( + reinterpret_cast(array->data()), array->size()); + // Skip if key or value of metadata is empty. + if (!metadata->name() || val.empty()) continue; + keys_values[metadata->name()->str()] = val; + } + return keys_values; + } + + // Validates if the FlatBufferModelBase's buffer is well-formed. Specifically, + // it checks if the 0th entry of the model buffers is an empty buffer + // (sentinel). This is a convention so that tensors without a buffer can + // provide 0 as their buffer. NOTE: The function doesn't explicitly fail for + // backward compatibility reasons; it just provides a warning in case of + // failures. + void ValidateModelBuffers(ErrorReporter* error_reporter) { + auto buffers = model_->buffers(); + if (buffers && buffers->size() > 0) { + auto first_buffer = buffers->Get(0); + if (first_buffer && first_buffer->size() != 0) { + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 + // as their buffer. + TF_LITE_REPORT_ERROR( + error_reporter, + "The 0th entry of the model buffer must be an empty buffer."); + } + } + } + + /// Returns true if the model identifier is correct (otherwise false and + /// reports an error). + bool CheckModelIdentifier() const { + if (allocation_->bytes() < 7) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Model provided must have at least 7 bytes to hold identifier.\n"); + return false; + } + if (!tflite::ModelBufferHasIdentifier(allocation_->base())) { + const char* ident = flatbuffers::GetBufferIdentifier(allocation_->base()); + // Suppress unused variable warning. + (void)ident; + TF_LITE_REPORT_ERROR( + error_reporter_, + "Model provided has model identifier '%c%c%c%c', should be '%s'\n", + ident[0], ident[1], ident[2], ident[3], tflite::ModelIdentifier()); + return false; + } + return true; + } + + /// Check If the buffer is stored as part of the Flatbuffer or outside + /// Return false if the buffers are part of the Flatbuffer + static bool CheckBufferOutsideModel(const tflite::Model* model) { + if (!model || !model->metadata()) return false; + + for (int i = 0; i < model->metadata()->size(); ++i) { + auto metadata = model->metadata()->Get(i); + if (metadata->name()->str() == tflite_metadata_buffer_location) { + return true; + } + } + return false; + } + + protected: + /// Loads a model from a given allocation. FlatBufferModelBase will take over + /// the ownership of `allocation`, and delete it in destructor. The ownership + /// of `error_reporter`remains with the caller and must have lifetime at least + /// as much as FlatBufferModelBase. This is to allow multiple models to use + /// the same ErrorReporter instance. 
+ explicit FlatBufferModelBase( + std::unique_ptr allocation, + ErrorReporter* error_reporter = T::GetDefaultErrorReporter()) + : error_reporter_(ValidateErrorReporter(error_reporter)), + allocation_(std::move(allocation)) { + if (!allocation_ || !allocation_->valid() || !CheckModelIdentifier()) { + return; + } + + model_ = ::tflite::GetModel(allocation_->base()); + } + + /// Loads a model from Model flatbuffer. The `model` has to remain alive and + /// unchanged until the end of this flatbuffer model's lifetime. + FlatBufferModelBase(const Model* model, ErrorReporter* error_reporter) + : model_(model), error_reporter_(ValidateErrorReporter(error_reporter)) {} + + static ErrorReporter* ValidateErrorReporter(ErrorReporter* error_reporter) { + return error_reporter ? error_reporter : T::GetDefaultErrorReporter(); + } + + /// Flatbuffer traverser pointer. (Model* is a pointer that is within the + /// allocated memory of the data allocated by allocation's internals. + const tflite::Model* model_ = nullptr; + /// The error reporter to use for model errors and subsequent errors when + /// the interpreter is created + ErrorReporter* error_reporter_; + /// The allocator used for holding memory of the model. Note that this will + /// be null if the client provides a tflite::Model directly. + std::unique_ptr allocation_; +}; + +} // namespace impl + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_CORE_MODEL_BUILDER_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/debug/debug.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/debug/debug.h new file mode 100644 index 00000000..3d7f6fe5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/debug/debug.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_DEBUG_DEBUG_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_DEBUG_DEBUG_H_ + +#include "llvm/Support/raw_ostream.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/debug/debug_options.pb.h" + +namespace tensorflow { + +// Initializes the pass manager with default options that make debugging easier. +// The `out` method parameter is exposed for testing purposes and not intended +// to be specified by client code. 
+void InitPassManager(mlir::PassManager& pm, + const converter::DebugOptions& options, + llvm::raw_ostream& out = llvm::outs()); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_DEBUG_DEBUG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops.h new file mode 100644 index 00000000..fc75816d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops.h @@ -0,0 +1,45 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_H_ + +#include +#include + +namespace tflite { +namespace flex { + +// Whether the given op has been statically allowlisted for flex export. +// +// This static allowlist is formed by the intersection of ops supported by +// TensorFlowMobile on both iOS and Android. As the converter is likely running +// on a host that has the full suite of TensorFlow ops available, we use this +// static allowlist to ensure compatibility when deploying to a mobile device. +// TODO(b/118389105): Automate generation of the allowlisted flex ops. +bool IsAllowlistedFlexOp(const std::string& tensorflow_op_name); + +// Return the list of allowlisted flex ops. +const std::set& GetFlexAllowlist(); + +// Return the list of TF.Text flex ops. +const std::set& GetTFTextFlexAllowlist(); + +// Return the list of SentencePiece flex ops. +const std::set& GetSentencePieceFlexAllowlist(); + +} // namespace flex +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops_internal.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops_internal.h new file mode 100644 index 00000000..420516c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/delegates/flex/allowlisted_flex_ops_internal.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_INTERNAL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_INTERNAL_H_ + +#include + +namespace tflite { +namespace flex { + +// Return true if op_name is a tf.text op need to be supported by flex delegate. +bool IsAllowedTFTextOpForFlex(const std::string& op_name); + +} // namespace flex +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_DELEGATES_FLEX_ALLOWLISTED_FLEX_OPS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.h new file mode 100644 index 00000000..358392ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.h @@ -0,0 +1,133 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_COMMON_OUTLINE_OPERATIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_COMMON_OUTLINE_OPERATIONS_H_ + +#include +#include +#include + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_os_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/UseDefLists.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" + +namespace mlir { +namespace TFL { +namespace common { + +// Returns true if the `op` is a constant-like op or produces none type. +bool IsConstantOrNone(Operation* op); + +// Computes the list of Value(s) referenced by Subgraph Operations that are +// not defined within the Subgraph. Any such Value(s) +// are validly in-scope for the initial Operation. They must be either +// defined above the subgraph or appear as an argument to the containing func. +// These Value(s) are taken to be the arguments of the new raised func. 
+// An operand dependency is a Value referenced anywhere in an Op +// that is defined above the Op. All SSA Values are assigned/defined in a +// BlockArg or as a result of an Operation. +llvm::SmallVector AccumulateOperandsDefinedAbove( + const llvm::SetVector& partition_ops); + +// Similar to `AccumulateOperandsDefinedAbove()`, computes the Value(s) that are +// defined within a Subgraph and referenced in a descendant Operation. These +// Values(s) are to be returned by the new raised function. +llvm::SmallVector AccumulateResultsDefinedWithin( + const llvm::SetVector& partition_ops); + +// Represents a view of a set of mlir Operations that form a subgraph of the +// entire Module's DAG. `Subgraph` can be thought of as segment of sequential +// Operations within a func definition. Additional facts: +// 1. Subgraphs are restricted to a single Block. They do not span +// branching instructions. Thus the subgraph is a simple 1-degree path. +// 2. All Operations in a subgraph belong to the same block in a +// funtion body. +// 3. Function bodies are assumed to have only one block in some places. +class Subgraph { + // Set vector preserves insertion order, must insert Ops in topological order. + public: + const llvm::SetVector partition_ops_; + + // Subgraphs are given a unique incremented integer id based on when + // they were encountered in this pass. + const int subgraph_id_; + + const llvm::StringRef dialect_namespace_; + + Subgraph(const llvm::SetVector partition_ops, int num_subgraphs) + : partition_ops_(partition_ops), + subgraph_id_(num_subgraphs), + func_arguments_(AccumulateOperandsDefinedAbove(partition_ops)), + func_outputs_(AccumulateResultsDefinedWithin(partition_ops)) {} + + const llvm::SmallVector& FuncArguments() const { + // `Value`s in MLIR library are implemented as having "value semantics" + // see "llvm/llvm-project/mlir/include/mlir/IR/Value.h" so copying is fine. + return func_arguments_; + } + const llvm::SmallVector& FuncOutputs() const { return func_outputs_; } + + private: + // Compute once at construction and save as field. + const llvm::SmallVector func_arguments_; + const llvm::SmallVector func_outputs_; +}; + +// Helper data structure for output parameters to `ExtractSubgraphToFunc`. +// `ExtractSubgraphToFunc` adds exactly two "new" `Operations`, a FuncOp and +// a CallOp. Pass these back to the caller for setting more specific attributes +// after graph mutation has taken place. +struct OpsAdded { + mlir::func::FuncOp func_op; + mlir::func::CallOp call_op; +}; + +// Given a `Subgraph` containing a sequence of adjacent `Operations` from +// the `module`, raise these `Operations` (and any ops contained nested within) +// to the body of a new seperate root level function. Replace in their current +// location with a `CallOp` which invokes said `FuncOp`. The inputs to +// this new functions are taken to be the `Values` that appear as operands +// to ops in the subgraph, which are not self-contained within the subgraph. +// The outputs of this function are taken to be the results of ops in the +// subgraph which are referenced as operands outside of the subgraph. +// Also refer to documention of `AccumulateOperandsDefinedAbove` & +// `AccumulateResultsDefinedWithin`. 
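As a usage illustration for the declarations above and below (not part of this header), a pass might drive the outlining roughly as in the sketch that follows; the attribute name applied to the raised function is hypothetical.

// Sketch only: outline `partition_ops` into a new FuncOp and replace them
// with a CallOp, then tag the raised function.
void OutlinePartition(const llvm::SetVector<mlir::Operation*>& partition_ops,
                      mlir::ModuleOp module, mlir::OpBuilder& builder,
                      int next_subgraph_id) {
  using namespace mlir::TFL::common;
  Subgraph subgraph(partition_ops, next_subgraph_id);
  OpsAdded ops_added;
  ExtractSubgraphToFunc(subgraph, builder, module, ops_added);
  // "outlined" is a hypothetical attribute name used for illustration.
  ops_added.func_op->setAttr("outlined", builder.getUnitAttr());
}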
+void ExtractSubgraphToFunc(const Subgraph& subgraph, OpBuilder& builder, + ModuleOp& module, OpsAdded& ops_added); + +} // namespace common +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_COMMON_OUTLINE_OPERATIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/metadata_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/metadata_util.h new file mode 100644 index 00000000..6036c468 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/metadata_util.h @@ -0,0 +1,62 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// \file +/// +/// Functions for serializiation/deserialization of control dependency +/// information to/from model metadata. +/// + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_METADATA_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_METADATA_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/utils/control_edges.h" + +namespace tflite { + +/// Control dependencies for the model is the collection of control dependencies +/// for its subgraphs. +using ModelControlDependencies = std::vector; + +/// Serializes `in` into the returned string. The result is parseable with +/// ParseModelControlDependencies. +std::string SerializeModelControlDependencies( + const ModelControlDependencies& in); + +/// Deserializes `*out` from a character buffer of size `size` at `data`. +/// Returns true iff successful. `*out` needn't be empty before invocation. +/// When returning false, `*out`'s state is undefined. +bool ParseModelControlDependencies(const char* data, size_t size, + ModelControlDependencies* out); + +/// The key under which to store the serialized control dependencies in the +/// model's metadata. +constexpr char kModelControlDependenciesMetadataKey[] = + "model_control_dependencies"; + +/// To allow future changes to the format, serialized control dependency data +/// will contain a version; this constant is the version that will be used for +/// serialization. For deserialization, past versions should remain parseable. +constexpr uint32_t kModelControlDependenciesMetadataVersion = 1; + +inline constexpr char kModelUseStablehloTensorKey[] = "keep_stablehlo_constant"; + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_METADATA_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/rematerializer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/rematerializer.h new file mode 100644 index 00000000..02f66046 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/remat/rematerializer.h @@ -0,0 +1,262 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_REMATERIALIZER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_REMATERIALIZER_H_ + +// This file declares the Rematerializer class, which is used by an MLIR-based +// set of transformations for TFLite IR that lower memory usage by redoing +// operations with small inputs and large outputs instead of keeping the result +// in memory. This class allows us to compactly and efficiently represent the +// (idealized) memory profile of a TFLite graph and simulate the effect of +// re-inserting operations on that memory profile. + +#include +#include +#include +#include +#include + +namespace mlir { +namespace TFL { + +// A class that +// (1) Encodes in concise form the memory requirements of a computational graph +// (2) Allows for the fast simulation of changes to the peak memory requirement +// under rematerialization of intermediate results in the graph +// (3) Implements a greedy algorithm for finding rematerializations of +// intermediate results in that graph to lower peak memory requirements. +class Rematerializer { + public: + Rematerializer() = default; + virtual ~Rematerializer() = default; + + // The type used for memory sizes (in bytes) and differences thereof. + using SizeT = int64_t; + + // The memory profile: The i-th element gives the amount of memory + // that is needed when performing the i-th operation. This is the + // sum of the sizes of + // + // (1) input tensors of that operation, + // (2) output tensors of that operation, + // (3) output tensors of preceding operations that are input tensors + // of subsequent operations. + using MemProfile = std::vector; + + // Used for specifying memory consumption at a certain operation in the + // computational graph. + struct MemSpec { + int op_index; // The index of the operation + SizeT size; // The amount of memory needed in order to execute this + // operation, i.e., the sum of input and output sizes and the + // sizes of outputs of previous operations that are needed as + // inputs of subsequent operations. + explicit MemSpec(int op_index = 0, SizeT size = 0) + : op_index(op_index), size(size) {} + }; + + static bool BySize(const MemSpec& a, const MemSpec& b) { + return std::tie(a.size, a.op_index) < std::tie(b.size, b.op_index); + } + + static bool ByOpIndex(const MemSpec& a, const MemSpec& b) { + return std::tie(a.op_index, a.size) < std::tie(b.op_index, b.size); + } + + // Specifies an elementary rematerialization operation: The operations in + // operations [`begin`, `end`) will be rescheduled before operation `insert`. + // A valid `RematSpec` requires begin <= end <= insert <= number of + // operations. Note that (1) `end` is exclusive -- begin == end signifies a + // trivial RematSpec (no operation will be rescheduled), (2) the + // zero-initialized RematSpec {} is trivial and always valid. 
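To make the intended workflow concrete, here is a minimal sketch, not part of this header, of a derived class that populates the low-level representation with the protected helpers declared further down (AddTensor, AddOperation, AddUse) and then runs the greedy search; sizes and thresholds are arbitrary illustration values.

// Sketch only.
class ToyRematerializer : public mlir::TFL::Rematerializer {
 public:
  ToyRematerializer() {
    // One 1 MiB tensor produced by op 0 and consumed by op 1.
    const int t = AddTensor(/*size=*/1 << 20);
    const int producer = AddOperation(/*is_stateful=*/false);
    const int consumer = AddOperation(/*is_stateful=*/false);
    AddUse(producer, t);
    AddUse(consumer, t);
  }

  void ApplyRemat(const RematSpec& remat) override {
    // A real subclass would clone the high-level ops in
    // [remat.begin, remat.end) before position remat.insert here.
  }
};

// Querying the simulated profile and searching for savings:
//   ToyRematerializer r;
//   Rematerializer::MemSpec peak = r.GetPeakMemory();
//   r.RunGreedyAlgorithm(/*max_cost=*/-1, /*max_block_length=*/1,
//                        /*min_savings=*/1);  // invokes ApplyRemat per find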
+ struct RematSpec { + int begin; + int end; + int insert; + }; + + // Gives the peak memory location and size after inserting operations + // according to `remat` (but doesn't actually insert them.) Ties are broken + // towards later locations. `remat` must be valid (see above). + MemSpec GetPeakMemory(const RematSpec& remat = {}) const; + + // Gives memory profile after inserting operations according to `remat` (but + // doesn't actually insert them). `remat` must be valid (see above). + MemProfile GetMemProfile(const RematSpec& remat = {}) const; + + // Runs the greedy incremental block algorithm: Finds a sequence of + // rematerializations of block size up to max_block_length, each reducing peak + // memory by at least min_savings. If max_cost >= 0, at most max_cost + // operations will be re-inserted. For each rematerialization found, + // ApplyRemat is invoked (which can be used to apply the rematerialization to + // the higher- level representation, e.g., MLIR, flatbuffer, ...) + void RunGreedyAlgorithm(int max_cost, int max_block_length, + SizeT min_savings); + + virtual void ApplyRemat(const RematSpec& remat) {} + + protected: + // Rematerializes the outputs of the operations [`remat.begin`, `remat.end`) + // before operation remat.insert by copying that operation range before + // remat.insert and updating tensor references so that any operation that can + // will make use of the rematerialized outputs rather than the original ones. + // `remat` must be valid (see above). + void Remat(const RematSpec& remat); + + // The protected methods below are to be used by derived classes to create the + // low-level (this class) representation from a high-level one. + + // Creates a new tensor-like object that takes `size` bytes. Returns a + // contiguous increasing index for each new object, starting at 0. + int AddTensor(SizeT size); + + // Creates an operation. If `is_stateful`, the operation (and any block of + // operations containing it) will never be considered for rematerialization. + // Returns a contiguous increasing index for each new object, starting at 0. + int AddOperation(bool is_stateful); + + // The operator with index `ioperation` will be assumed to produce and/or + // consume the tensor with index `itensor`. NoOp if that's already the case. + // The arguments must be valid indices (i.e., obtained with + // `AddOperation`/`AddTensor`). + void AddUse(int ioperation, int itensor); + + // Undoes an AddUse(ioperation, itensor). NoOp if there was no prior `AddUse`. + // The arguments must be valid indices (i.e., obtained with + // `AddOperation`/`AddTensor`). + void DelUse(int ioperation, int itensor); + + private: + // Find the best remat operation that saves at least `min_savings` bytes for a + // block of operators with a length is between [`begin_len`, `end_len`). + // 'Best' means with the highest savings, ties are broken towards shorter + // blocks. + std::tuple FindBestRemat(SizeT min_savings, int begin_len, + int end_len) const; + + // Optimization: Estimate (from above) the remat savings of instruction block + // [begin, end) after operation `peak_location` + SizeT MaxSavings(int begin, int end, int peak_loc) const; + + // If I want to remat ops [begin, end) after the op at operation `peak_loc`, + // find the latest point at which to reinsert them (the op before which to + // insert.) + int FindBestRematPoint(int begin, int end, int peak_loc) const; + + // The memory objects. + struct Tensor { + SizeT size; // The size of the object (in bytes.) 
+    std::vector<int> operations;  // The operations it is used in. This vector
+                                  // is kept sorted + unique.
+
+    // The operation that makes the first use of this tensor.
+    int first_use() const { return *operations.begin(); }
+
+    // The operation that makes the last use of this tensor.
+    int last_use() const { return *operations.rbegin(); }
+  };
+
+  // The operators.
+  struct Operation {
+    bool is_stateful = false;  // Results of an Operation can be rematerialized
+                               // only if `!is_stateful`. This probably should
+                               // be replaced with a more fine-grained
+                               // approach--for example, the results of a "read
+                               // resource variable" operation can be
+                               // rematerialized as long as this doesn't happen
+                               // after the corresponding "write resource
+                               // variable" operation.
+
+    std::vector<int> tensors;  // The tensors that are used (input or output) by
+                               // this operation. They needn't correspond to
+                               // tensors in the TF graph -- we may add fake
+                               // tensors to model memory consumed in addition
+                               // to input and output tensors. This vector is
+                               // kept sorted + unique.
+
+    SizeT alloc = 0;    // The number of bytes that need to be allocated before
+                        // this operation.
+    SizeT dealloc = 0;  // The number of bytes that can be deallocated after
+                        // this operation.
+  };
+
+  // Given the current state of `operations_` and `tensors_`, return a vector of
+  // corrections that transform the current memory profile into the one that we
+  // would get after applying `remat`.
+  //
+  // The memory profile of a sequence of operations is the partial sum of the
+  // sizes of the allocations that are necessary before an operation and the
+  // negative sizes of the deallocations that are possible after the previous
+  // operation.
+  //
+  // If we modify the operation sequence by cloning an operation range, that
+  // memory profile will change--cloning makes it necessary to extend the
+  // lifetime of some tensors, while other tensors can be deallocated early and
+  // rematerialized later.
+  //
+  // This method represents these changes in compact form: It returns a vector
+  // of (position of operation, delta) pairs in lexicographic order; one
+  // obtains the memory profile after `remat` by adding the deltas from any
+  // entries (i, delta) to the i-th entry of the partial sum.
+  //
+  // This allows us to efficiently compute the change to the peak of a memory
+  // profile due to cloning an operation range without having to actually clone
+  // that range and without having to build a profile vector.
+  //
+  // The returned vector has at most 2 entries for each tensor referenced in
+  // [remat.begin, remat.end). There may be multiple entries for a single
+  // operation position; operation positions refer to the sequence *after*
+  // cloning [`remat.begin`, `remat.end`) before `remat.insert`.
+  std::vector<MemSpec> GetDeltas(const RematSpec& remat) const;
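To illustrate the profile definition used in the GetDeltas() comment above, here is a tiny standalone sketch; the sizes are invented and the helper is not part of the header:

#include <cstdint>
#include <vector>

// Illustration only: the memory profile is the partial sum of per-operation
// allocations, with the previous operation's deallocations subtracted before
// the next step -- the same bookkeeping MapMem() below performs incrementally.
std::vector<int64_t> ProfileFromAllocDealloc(const std::vector<int64_t>& alloc,
                                             const std::vector<int64_t>& dealloc) {
  std::vector<int64_t> profile(alloc.size());
  int64_t running = 0;
  for (size_t i = 0; i < alloc.size(); ++i) {
    running += alloc[i];    // memory that must be live to run op i
    profile[i] = running;
    running -= dealloc[i];  // memory that can be freed once op i is done
  }
  return profile;
}
// E.g. alloc = {4, 3, 2} and dealloc = {1, 3, 2} give the profile {4, 6, 5},
// so the peak is 6 bytes at operation 1.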
+
+  // Helper template: Iterates through all `MemSpec`s (i.e., operation
+  // index/memory usage pairs) for the current graph in operation order and
+  // calls `mapper` on them. This is an optimization -- by instantiating with
+  // an appropriate `Mapper`, it allows us to e.g. compute the peak memory
+  // without having to instantiate an actual memory profile vector.
+  template <class Mapper>
+  void MapMem(const Mapper& mapper, const RematSpec& remat) const {
+    const auto deltas = GetDeltas(remat);
+    const auto len = (remat.end - remat.begin);
+    auto idelta = deltas.begin();
+
+    for (MemSpec m; m.op_index < operations_.size() + len; ++m.op_index) {
+      // Are we in the cloned portion of the new operation sequence?
+      // Then all alloc/dealloc information must come from deltas.
+      const bool patch =
+          (m.op_index >= remat.insert) && (m.op_index < remat.insert + len);
+      // Are we past the insertion portion of the new operation sequence?
+      // Then we need to convert indices back to the original sequence.
+      const int shift = (m.op_index >= remat.insert + len) ? len : 0;
+      m.size += patch ? 0 : operations_[m.op_index - shift].alloc;
+      // deltas is sorted by location; apply any corrections to the current
+      // operator.
+      for (; idelta != deltas.end() && idelta->op_index == m.op_index;
+           ++idelta) {
+        m.size += idelta->size;
+      }
+      mapper(m);
+      m.size -= patch ? 0 : operations_[m.op_index - shift].dealloc;
+    }
+  }
+
+  std::vector<Operation> operations_;
+  std::vector<Tensor> tensors_;
+};
+
+}  // namespace TFL
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_REMAT_REMATERIALIZER_H_
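Before moving on to the TAC headers, a hedged usage sketch for the Rematerializer above. Since AddTensor/AddOperation/AddUse are protected, a toy subclass is the natural way to drive the API; the class name, graph shape, sizes, and include path (taken from the header guard) are all invented for illustration.

#include <cstdio>

#include "tensorflow/compiler/mlir/lite/experimental/remat/rematerializer.h"

namespace {

// Illustration only: exposes the protected graph-building API.
class ToyRematerializer : public mlir::TFL::Rematerializer {
 public:
  ToyRematerializer() {
    // Two operations sharing one 1 MiB intermediate tensor.
    const int t0 = AddTensor(/*size=*/1 << 20);
    const int producer = AddOperation(/*is_stateful=*/false);
    const int consumer = AddOperation(/*is_stateful=*/false);
    AddUse(producer, t0);
    AddUse(consumer, t0);
  }

  // A real subclass would update its higher-level representation here
  // (MLIR, flatbuffer, ...), as the header's comment suggests.
  void ApplyRemat(const RematSpec& remat) override {}
};

}  // namespace

int main() {
  ToyRematerializer remat;
  const auto peak = remat.GetPeakMemory();
  std::printf("peak of %lld bytes at operation %d\n",
              static_cast<long long>(peak.size), peak.op_index);
  // Look for rematerializations of blocks of up to 2 ops, each saving at
  // least 1 KiB, re-inserting at most 10 operations in total.
  remat.RunGreedyAlgorithm(/*max_cost=*/10, /*max_block_length=*/2,
                           /*min_savings=*/1 << 10);
  return 0;
}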
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/cost.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/cost.h
new file mode 100644
index 00000000..2d79e8d3
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/cost.h
@@ -0,0 +1,50 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_COST_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_COST_H_
+
+#include
+
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+
+namespace mlir {
+namespace TFL {
+namespace tac {
+
+// Cost attribute string on the TFL dialect.
+constexpr char kCost[] = "tac.cost";
+
+inline void UpdateCost(Operation* op, float cost, OpBuilder* builder) {
+  op->setAttr(kCost, builder->getF32FloatAttr(cost));
+}
+
+// Get the cost annotated with kCost.
+inline bool GetCostOnOp(Operation* op, float* cost) {
+  auto cost_type = op->getAttrOfType<FloatAttr>(kCost);
+  if (cost_type == nullptr) {
+    return false;
+  }
+
+  *cost = cost_type.getValueAsDouble();
+  return true;
+}
+
+}  // namespace tac
+}  // namespace TFL
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_COST_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/subgraph.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/subgraph.h
new file mode 100644
index 00000000..ed61f74c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/subgraph.h
@@ -0,0 +1,52 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_SUBGRAPH_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_SUBGRAPH_H_
+
+#include <optional>
+#include <string>
+
+#include "llvm/ADT/StringRef.h"
+#include "mlir/IR/Operation.h"  // from @llvm-project
+
+namespace mlir {
+namespace TFL {
+namespace tac {
+
+// The interface name is the "hook" between the CallOp and the FuncOps.
+// Take the following example:
+//
+//   call @func_1_CPU {tac.interface_name = "func_1"}
+//
+// "func_1" is the interface name, while "func_1_cpu" is the real
+// implementation. We can have multiple FuncOps like "func_1_cpu" and
+// "func_1_gpu", and they both implement "func_1".
+//
+// The attribute on the FuncOp states what it actually implements, while the
+// attribute on the CallOp states what it is looking for.
+constexpr char kInterfaceNameAttr[] = "tac.interface_name";
+
+inline std::optional<std::string> GetInterFaceName(Operation* op) {
+  auto name_attr = op->getAttrOfType<StringAttr>(kInterfaceNameAttr);
+  if (!name_attr) return std::nullopt;
+  return name_attr.getValue().str();
+}
+
+}  // namespace tac
+}  // namespace TFL
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_SUBGRAPH_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h
new file mode 100644
index 00000000..2f299287
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h
@@ -0,0 +1,150 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_TARGETS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_TARGETS_H_ + +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace tac { + +// Device attribute string on the TFL dialect. +constexpr char kDevice[] = "tac.device"; + +// Inference type. +constexpr char kInferenceType[] = "tac.inference_type"; + +// Inference type. +constexpr char kSkipTargetAnnotation[] = "tac.skip_target_annotation"; + +// TODO(renjieliu): Add more inference types. +enum InferenceType { + UNKNOWN = 0, + FLOAT = 1, + QUANTIZED_INT8 = 2, + QUANTIZED_UINT8 = 3, + HYBRID = 4 +}; + +inline InferenceType GetInferenceTypeEnum(llvm::StringRef inference_type_str) { + if (inference_type_str == "FLOAT") { + return FLOAT; + } else if (inference_type_str == "QUANTIZED_INT8") { + return QUANTIZED_INT8; + } else if (inference_type_str == "QUANTIZED_UINT8") { + return QUANTIZED_UINT8; + } else if (inference_type_str == "HYBRID") { + return HYBRID; + } else { + return UNKNOWN; + } +} + +inline std::string GetInferenceString(InferenceType inference_type) { + if (inference_type == FLOAT) { + return "FLOAT"; + } else if (inference_type == QUANTIZED_INT8) { + return "QUANTIZED_INT8"; + } else if (inference_type == QUANTIZED_UINT8) { + return "QUANTIZED_UINT8"; + } else if (inference_type == HYBRID) { + return "HYBRID"; + } else { + return "UNKNOWN"; + } +} + +// Returns canonical representation for hardware name (All uppercase). +// TODO(b/177376459): Remove this in favor of the string defined by hardwares +// MyHardware::kId. +inline std::string GetCanonicalHardwareName(const std::string& hardware_name) { + std::string name = hardware_name; + std::transform( + name.begin(), name.end(), name.begin(), + [](unsigned char c) -> unsigned char { return std::toupper(c); }); + return name; +} + +// Get the target annotation form the op. +inline std::optional GetTargetAnnotation(Operation* op) { + auto device = op->getAttrOfType(kDevice); + if (device == nullptr || device.getValue().empty()) return std::nullopt; + + return GetCanonicalHardwareName(device.getValue().str()); +} + +// Get inference type attribute from the operation if available. +inline std::optional GetInferenceTypeAnnotation(Operation* op) { + auto inference_type = op->getAttrOfType(kInferenceType); + if (inference_type == nullptr) return std::nullopt; + + llvm::StringRef device_name_str = inference_type.getValue(); + return GetInferenceTypeEnum(device_name_str); +} + +// InferenceDeviceType is a combination of the hardware with inference type. 
+struct InferenceDeviceType { + std::string hardware; + InferenceType inference_type; + + bool operator==(const InferenceDeviceType& other) const { + return (hardware == other.hardware) && + (inference_type == other.inference_type); + } + + bool operator!=(const InferenceDeviceType& other) const { + return !(*this == other); + } + + struct inference_device_type_hash { + size_t operator()(const InferenceDeviceType& p) const { + auto hash1 = std::hash{}(p.hardware); + auto hash2 = std::hash{}(p.inference_type); + return hash1 ^ hash2; + } + }; +}; + +// Get InferenceDeviceType attribute from the operation if available. +inline std::optional GetInferenceDeviceTypeForOp( + Operation* op) { + auto hardware = GetTargetAnnotation(op); + if (!hardware.has_value()) return std::nullopt; + + auto inference_type = GetInferenceTypeAnnotation(op); + if (!inference_type.has_value()) return std::nullopt; + + InferenceDeviceType inference_device_type; + inference_device_type.hardware = hardware.value(); + inference_device_type.inference_type = inference_type.value(); + return inference_device_type; +} + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_TARGETS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h new file mode 100644 index 00000000..741ee5d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h @@ -0,0 +1,94 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_UTILS_H_ + +#include "llvm/Support/Casting.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/CastInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" + +namespace mlir { +namespace TFL { +namespace tac { + +// Returns true if 'op' is non const op. Returns false otherwise or if +// 'op' is null. 
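Stepping back to the targets.h helpers above for a moment (utils.h continues below): an illustrative sketch of how they compose, assuming GetInferenceDeviceTypeForOp returns std::optional<InferenceDeviceType> as its body implies. The GroupByDeviceType helper and its inputs are invented for the example.

#include <unordered_map>
#include <vector>

#include "mlir/IR/Operation.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h"

namespace {

using mlir::TFL::tac::GetInferenceDeviceTypeForOp;
using mlir::TFL::tac::InferenceDeviceType;

// Buckets ops that carry both tac.device and tac.inference_type annotations
// by their (hardware, inference type) pair, using the hash functor the
// struct provides.
std::unordered_map<InferenceDeviceType, std::vector<mlir::Operation*>,
                   InferenceDeviceType::inference_device_type_hash>
GroupByDeviceType(const std::vector<mlir::Operation*>& ops) {
  std::unordered_map<InferenceDeviceType, std::vector<mlir::Operation*>,
                     InferenceDeviceType::inference_device_type_hash>
      grouped;
  for (mlir::Operation* op : ops) {
    if (auto device_type = GetInferenceDeviceTypeForOp(op)) {
      grouped[*device_type].push_back(op);
    }
  }
  return grouped;
}

}  // namespace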
+inline bool IsNonConstOp(Operation* op) { + if (!op) return false; + if (llvm::isa(op)) return false; + if (op->hasTrait()) return false; + if (llvm::isa(op)) return false; + return true; +} + +// Returns true if 'op' is a terminator op, otherwise false. +bool IsTerminatorOp(Operation* op); + +// Returns true if 'op' is not TFL Quant / Dequant op. Returns False otherwise +// or if 'op' is null. +bool NotTFLQuantDequantizeOp(Operation* op); + +// Returns true if it is a shaped type of f32 elements. +inline bool IsF32ShapedType(Type t) { + if (auto shaped_type = mlir::dyn_cast_or_null(t)) { + return shaped_type.getElementType().isF32(); + } + return false; +} + +// Return true when the given element_type is QI8. +inline bool IsQI8Type(Type t) { + auto quantized_type = quant::QuantizedType::getQuantizedElementType(t); + return quantized_type != nullptr && + quantized_type.getStorageTypeIntegralWidth() == 8 && + quantized_type.isSigned(); +} + +// Return true when the given element_type is QUI8. +inline bool IsQUI8Type(Type t) { + auto quantized_type = quant::QuantizedType::getQuantizedElementType(t); + return quantized_type != nullptr && + quantized_type.getStorageTypeIntegralWidth() == 8 && + !quantized_type.isSigned(); +} + +// Return true when the given element_type is QI32. +inline bool IsQI32Type(Type t) { + auto quantized_type = quant::QuantizedType::getQuantizedElementType(t); + return quantized_type != nullptr && + quantized_type.getStorageTypeIntegralWidth() == 32 && + quantized_type.isSigned(); +} + +// Try to guess the inference type of the op. +InferenceType GetInferenceType(Operation* op); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.h new file mode 100644 index 00000000..84264907 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXAMPLES_EXAMPLE_HARDWARE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXAMPLES_EXAMPLE_HARDWARE_H_ + +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/simple_hardware.h" + +namespace mlir { +namespace TFL { +namespace tac { + +class ExampleHardware : public SimpleHardware { + public: + static constexpr char kId[] = "ExampleHardware"; + + mlir::RewritePatternSet GetTransformations( + MLIRContext* context) const override; + + mlir::TypeID GetTypeId() const override { + return mlir::TypeID::get(); + } + + bool IsNotSupportedOp(mlir::Operation* op) const override { return false; } + + float AdvantageOverCPU() const override { return 5.0; } +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXAMPLES_EXAMPLE_HARDWARE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.h new file mode 100644 index 00000000..4a5f5f11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.h @@ -0,0 +1,31 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXECUTION_METADATA_EXPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXECUTION_METADATA_EXPORTER_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace tflite { + +// Returns serialized string for the generated flatbuffer. +std::optional ExportRuntimeMetadata(mlir::ModuleOp module); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_EXECUTION_METADATA_EXPORTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h new file mode 100644 index 00000000..149c2076 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_GPU_HARDWARE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_GPU_HARDWARE_H_ + +#include + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { +namespace tac { +// Gpu hardware class which handles GPU capabilities in TFLite. +// This is used by TAC to get op supported/ op cost estimates on GPU. +class GpuHardware : public TargetHardware { + public: + static constexpr char kId[] = "GPU"; + mlir::RewritePatternSet GetTransformations( + MLIRContext* context) const override; + + mlir::TypeID GetTypeId() const override { + return mlir::TypeID::get(); + } + + double GetHardwareSwitchingCost(const TargetHardware* from, + size_t buffer_size) const override; +}; +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_GPU_HARDWARE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.h new file mode 100644 index 00000000..51c1c117 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +/* NNAPI Hardware Implementation */ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_NNAPI_HARDWARE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_NNAPI_HARDWARE_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/simple_hardware.h" + +namespace mlir { +namespace TFL { +namespace tac { + +class NNAPIHardware : public SimpleHardware { + public: + static constexpr char kId[] = "NNAPI"; + + mlir::RewritePatternSet GetTransformations( + MLIRContext* context) const override; + + mlir::TypeID GetTypeId() const override { + return mlir::TypeID::get(); + } + + bool IsNotSupportedOp(mlir::Operation* op) const override { return false; } + + float AdvantageOverCPU() const override { return 5.0; } +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_NNAPI_HARDWARE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/simple_hardware.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/simple_hardware.h new file mode 100644 index 00000000..ca371544 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/simple_hardware.h @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_SIMPLE_HARDWARE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_SIMPLE_HARDWARE_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h" + +namespace mlir { +namespace TFL { +namespace tac { + +// A simple hardware is an interface makes you add a target backend easily if +// you don't want too much customization. +// +// It allows you to easily specify the ops capabilities (by +// specifying the denylist), the rest ops will be considered supported. Also you +// can also specify the advantage over CPU. +// +// If you need more customization, e.g., if you have your own hardware dialect, +// please consider use TargetHardware directly. +class SimpleHardware : public TargetHardware { + public: + // This is essentially a denylist. + // TODO(renjieliu): Consider whether we want an allowlist for custom op as + // well. + virtual bool IsNotSupportedOp(mlir::Operation* op) const = 0; + + // The larger the value is, the more preferrable over CPU. + // If the value > 1, means the hardware has advantage over CPU. + // If the value < 1, means CPU is more preferred. 
+ // If we specify 10.0, meaning the hardware is 10x faster than CPU. + // The value should be > 0. + // TODO(renjieliu): Consider add an interface for more detailed customization, + // for example, users should be able to specify some ops are preferred and + // some are less preferred. + virtual float AdvantageOverCPU() const = 0; + + private: + bool IsOpSupported(mlir::Operation* op) const override; + + double GetHardwareSwitchingCost(const TargetHardware* from, + size_t buffer_size) const override; + + double GetOpCost(mlir::Operation* op) const override; +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_SIMPLE_HARDWARE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h new file mode 100644 index 00000000..136bb5ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h @@ -0,0 +1,195 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_TARGET_HARDWARE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_TARGET_HARDWARE_H_ + +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace tac { + +// Default fixed values for ops. +constexpr static float kDefaultFixedValuedCost = 1000000.0; + +// This is just fake data. +constexpr static float kCrossHardwareTransferPerByteCost = 5.0f; + +// This is just fake data. +constexpr static float kCrossHardwareTransferFixedCost = 10.f; + +// Interface for an Operation capabilities which should be tied to +// a specific hardware. +// Users should implement the interface and use TargetHardwareOpRegistration +// for registering the operation. +class TargetHardwareOperation { + public: + virtual ~TargetHardwareOperation() = default; + + virtual double GetOpCost(mlir::Operation* op) const = 0; + + virtual bool IsOpSupported(mlir::Operation* op) const = 0; +}; + +// Abstract base class for a hardware. +// To introduce new hardware +// users should implement the interface and use TargetHardwareRegistration +// for registering the hardware. +// Subclasses must implement the pure virtual function interface and +// define static member variable that retrieves string identifying the Target +// Hardware. 
Example, +// class MyType : public TargetHardware { +// public: +// static constexpr char kId[] = "MyHardware"; +// }; +class TargetHardware { + public: + virtual ~TargetHardware() = default; + + // Initializes all TargetHardwareOperation registered for this hardware. + // Users overriding this function, should call the base class method to + // initialize the ops. + virtual bool Init(); + + // Returns the cost of running 'op' on this Hardware. + virtual double GetOpCost(mlir::Operation* op) const; + + // Returns the cost of running the whole function on this hardware. + // By default this is the sum of the cost of individual cost for each op. + virtual double GetFuncCost(func::FuncOp* func) const; + + // Returns true if 'op' can run on this Hardware. + virtual bool IsOpSupported(mlir::Operation* op) const; + + // Switching cost between from hardware and this hardware. + // If both the hardwares are the same, the transfer cost is basically 0. + virtual double GetHardwareSwitchingCost(const TargetHardware* from, + size_t buffer_size) const = 0; + + // Returns a list of all patterns to apply for this hardware. + virtual mlir::RewritePatternSet GetTransformations( + MLIRContext* context) const = 0; + + // Returns TypeId for the provided hardware. + // Usually should be something like mlir::TypeID::get() + virtual mlir::TypeID GetTypeId() const = 0; + + virtual void GetDependentDialects(mlir::DialectRegistry& registry) const {} + + protected: + // All registered hardware ops. + std::vector> hardware_ops_; +}; + +// Returns pointer to the Hardware identified by 'hardware_name'. +// If not found nullptr is returned. +// DEPRECATED: Do not use, prefer GetTargetHardwareFactory instead. +const TargetHardware* GetTargetHardware(const std::string& hardware_name); + +// Returns the factory method for the requested hardware if present. +std::function()> GetTargetHardwareFactory( + const std::string& hardware_name); + +namespace internal { + +void RegisterTargetHardwareFactory( + const std::string& unique_name, const std::string& description, + mlir::TypeID type_id, + std::function()> target_hardware_factory); + +// Registers the provided target hardware factory. +template +void RegisterTargetHardwareFactory( + const std::string& description, + std::function()> target_hardware_factory) { + RegisterTargetHardwareFactory(T::kId, description, mlir::TypeID::get(), + target_hardware_factory); +} + +// DEPRECATED: Do not use, prefer RegisterTargetHardwareOpFactory intstead. +void RegisterTargetHardwareOp( + mlir::TypeID hardware_type, mlir::TypeID op_type, + std::function()> + target_hardware_op_factory); + +void RegisterTargetHardwareOpFactory( + mlir::TypeID hardware_type, mlir::TypeID op_type, + std::function()> + target_hardware_op_factory); +} // namespace internal + +// Register target hardware. +template +struct TargetHardwareRegistration { + TargetHardwareRegistration(const std::string& description, + std::function()> + target_hardware_factory) { + internal::RegisterTargetHardwareFactory(description, + target_hardware_factory); + } +}; + +// Register Op capabilities for specific hardware. +template +struct TargetHardwareOpRegistration { + explicit TargetHardwareOpRegistration( + std::function()> + target_hardware_op_factory) { + // TODO(b/177376459): remove this. 
+ internal::RegisterTargetHardwareOp(mlir::TypeID::get(), + mlir::TypeID::get(), + target_hardware_op_factory); + internal::RegisterTargetHardwareOpFactory(mlir::TypeID::get(), + mlir::TypeID::get(), + target_hardware_op_factory); + } +}; + +//======== util functions ========== + +// Process user specified device specs, will always add CPU if it's not there. +// specified_device_specs: ',' separated, like "GPU,DSP,CPU". +// device_specs: processed device specs enum. +bool ProcessTargetDevices(llvm::ArrayRef specified_device_specs, + std::vector* device_specs); + +// Check whether two hardwares are the same. +inline bool IsSameHardware(const TargetHardware* lhs, + const TargetHardware* rhs) { + return lhs->GetTypeId() == rhs->GetTypeId(); +} + +// Returns the ID identifying 'hardware'. This should match the ID defined +// in the hardware field ID. +// For example, if MyHardware is passed the value returned should match +// MyHardware::kId. +std::string GetHardwareName(const TargetHardware* hardware); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_HARDWARES_TARGET_HARDWARE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.h new file mode 100644 index 00000000..5776891f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_PY_WRAPPER_TAC_WRAPPER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_PY_WRAPPER_TAC_WRAPPER_H_ + +#include +#include +#include +#include + +// Place `` before to avoid build failures in macOS. +#include + +// The empty line above is on purpose as otherwise clang-format will +// automatically move before . +#include + +namespace tflite { + +// Run target-aware-conversion for the given tflite model with the given device +// specs. +// Warning: The API is experimental and subject to change. +bool run_tac(const std::string& model_file_path, + const std::vector& device_specs, + const std::string& model_output_path); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_PY_WRAPPER_TAC_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h new file mode 100644 index 00000000..a40a3b94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_IMPORTER_EXPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_IMPORTER_EXPORTER_H_ + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace tac { + +// Interface for Importing program to TAC (Target Aware Conversion) Module. +// This class is an interface for importing program in TAC. +// See TacModule in how to register it with the module and use it. +class TacImporter { + public: + virtual ~TacImporter() = default; + + // Imports and returns the Module for the imported program. + virtual absl::StatusOr> Import() = 0; +}; + +// Interface for exporting a module. +// Users should implement the interface for exporting the result from TAC +// in their preferred way. +// See TacModule in how to register it with the module and use it. +class TacExporter { + public: + virtual ~TacExporter() = default; + + // Imports and returns the Module for the imported program. + virtual absl::Status Export(mlir::ModuleOp module) = 0; +}; +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_IMPORTER_EXPORTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h new file mode 100644 index 00000000..7733a9bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h @@ -0,0 +1,122 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_MODULE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_MODULE_H_ + +#include +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h" +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h" + +namespace mlir { +namespace TFL { +namespace tac { + +// Main class for using Target Aware Conversion (TAC). +// To run TAC: +// 1) users should create object form this class, with desired options +// (TacModule::Options). +// 2) Use SetImporter/SetExporter to the desired importer +// and exporter. +// 3) Call Run() +// +// The module fetches all TargetHardware backends registered in the binary +// and only create TargetHardware requested in Options. +// +// This class is not thread safe. +class TacModule { + public: + // TAC options. Contains knobs to configure TAC as needed. + struct Options { + // List of names for the requested Target hardware. + std::vector hardware_backends; + // Debug mode. + // This will output different alternative subgraphs in mlir format for debug + // purpose. + bool debug_mode = false; + // Whether to enable inliner passes or not. + bool enable_inliner = false; + // Whether to legalize ops to TFLite ops before exporting. + bool legalize_to_tflite_ops = false; + }; + + virtual ~TacModule() = default; + + explicit TacModule(const Options& options) : options_(options) {} + + void SetImporter(std::unique_ptr importer) { + importer_ = std::move(importer); + } + + void SetExporter(std::unique_ptr exporter) { + exporter_ = std::move(exporter); + } + + // Returns pointer to the TargetHardware that is identified by 'hardware_name' + // Returns NULL If no hardware with this name found. + const tac::TargetHardware* GetTargetHardware( + const std::string& hardware_name) const; + + // Runs the TAC workflow, configured as in the options provided during + // construction. + // SetImporter/SetExporter should be called prior to invoking `Run`. + // Returns Status of the Run. + virtual absl::Status Run(); + + // Returns all available hardware backends registered in this module + // instance. + const std::vector& GetAvailableHardwares() const { + return const_backends_; + } + + // Registers all dialects in 'registry' with the module. + // This to allow clients to register extra dialects required. + void RegisterExtraDialects(mlir::DialectRegistry& registry); + + protected: + // Adds TAC passes to the 'pass_manager'. + virtual void AddTACPass(mlir::OpPassManager* pass_manager, + llvm::ArrayRef device_specs); + + private: + // Runs all TAC passes on the provided module. + absl::Status RunTacPasses(mlir::ModuleOp* module, bool debug_mode = false); + + // Create instances of all registered hardwares. + std::vector> InstantiateBackends(); + + std::unique_ptr importer_; + std::unique_ptr exporter_; + // Owned list of all target hardware backends. + std::vector> backends_; + // Holder for const pointers for the data in 'backends_' + std::vector const_backends_; + // Extra dialects requested by the user. 
+ mlir::DialectRegistry registry_; + + const Options options_; +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_MODULE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h new file mode 100644 index 00000000..ed59787f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h @@ -0,0 +1,73 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TFLITE_IMPORT_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TFLITE_IMPORT_EXPORT_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "llvm/Support/SourceMgr.h" +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h" + +namespace mlir { +namespace TFL { +namespace tac { +// TAC Importer for TFLite. +// This import to MLIR from tflite file or MLIR +class TfLiteImporter : public mlir::TFL::tac::TacImporter { + public: + // Options for configuring the importer. + struct Options { + std::string file_name; + // Whether the input file is an MLIR not tflite file. + bool input_mlir = false; + }; + + explicit TfLiteImporter(const Options& options) : options_(options) {} + + absl::StatusOr> Import() override; + + private: + Options options_; + mlir::MLIRContext context_; + llvm::SourceMgr source_mgr_; + std::unique_ptr source_mgr_handler_; +}; + +// TAC Exporter. It exports the provided Module to a tflite file. +class TfLiteExporter : public mlir::TFL::tac::TacExporter { + public: + // Exporter configuration options. + struct Options { + bool export_runtime_metadata = false; + bool output_mlir = false; + std::string output_file_name; + std::vector target_hardware_backends; + }; + + explicit TfLiteExporter(const Options& options) : options_(options) {} + + absl::Status Export(mlir::ModuleOp module) override; + + private: + Options options_; +}; +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TFLITE_IMPORT_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.h new file mode 100644 index 00000000..86445235 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.h @@ -0,0 +1,64 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_COST_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_COST_MODEL_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" + +namespace mlir { +namespace TFL { +namespace tac { + +// TODO(renjieliu): We need to come up with a better strategy to do cost +// estimatation. Maybe build a big lookup table for all the ops. + +// TODO(renjieliu): We need to consider what's the default value if we cannot +// analyze the cost. + +// ================== Interface ======================== + +// Get the estimated cost for the op under the given hardware spec senario. +float GetCostForOp(Operation* op, const std::string& hardware); + +// Get the estimated cost for the whole function under the given hardware. +float GetCostForFunc(func::FuncOp* func, const std::string& hardware); + +// Get the transfer cost given from & to hardware info. +// We will only calculate for the "necessary" tensor transferred. +// from_graph & to_graph are used to compute the "necessary" tensors. +// from_graph +// / \ \ +// out1 out2 out3 +// \ / +// to_graph +// So only out2 & out3 are counted. +float GetTransferCost(const std::string& from_hardware_str, + const std::string& to_hardware_str, + func::CallOp from_graph, func::CallOp to_graph); + +// Get the cross quantization/dequantization boundary cost. +float GetQuantDequantCost(InferenceType from_inference_type, + InferenceType to_inference_type, + func::CallOp from_graph, func::CallOp to_graph); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_COST_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.h new file mode 100644 index 00000000..e6d77838 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { +namespace tac { + +// Returns true if 'op' is supported to run on 'hardware'. +bool IsSupported(Operation* op, const std::string& hardware); + +// Return proper rewriter patterns for different hardwares. +RewritePatternSet GetHardwareRewritePatterns(MLIRContext* context, + const std::string& hardware); + +// Convert quantized ops to float, this will essentially insert dequantize & +// quantize pair around the op. +void ConvertQuantizedOpToFloat(func::FuncOp func, OpBuilder* builder); + +// This will optimize the quantized ops -> float graph. +void OptimizeQuantizedOpToFloat(func::FuncOp func, MLIRContext* context); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_gpu.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_gpu.h new file mode 100644 index 00000000..9de0e3c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_gpu.h @@ -0,0 +1,34 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_GPU_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_GPU_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace tac { + +// nit: Returns all the gpu suitable transformation patterns. 
+RewritePatternSet GetHardwareRewritePatternsGPU(MLIRContext* context);
+
+}  // namespace tac
+}  // namespace TFL
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_GPU_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h
new file mode 100644
index 00000000..3866d576
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h
@@ -0,0 +1,115 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_PATTERNS_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_PATTERNS_H_
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace mlir {
+namespace TFL {
+namespace tac {
+
+// TODO(renjieliu): add more patterns.
+
+// This basically:
+// Pack => (Concat -> Reshape)
+struct LowerPackIntoConcatReshape : public OpRewritePattern<TFL::PackOp> {
+  using OpRewritePattern<TFL::PackOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::PackOp pack_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+struct SquaredDifference : public OpRewritePattern<TFL::SquaredDifferenceOp> {
+  using OpRewritePattern<TFL::SquaredDifferenceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::SquaredDifferenceOp squared_diff_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+// Unroll split into a bunch of slice ops.
+struct UnrollSplit : public OpRewritePattern<TFL::SplitOp> {
+  using OpRewritePattern<TFL::SplitOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::SplitOp split_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+// Unroll splitv into a bunch of slice ops.
+struct UnrollSplitV : public OpRewritePattern<TFL::SplitVOp> {
+  using OpRewritePattern<TFL::SplitVOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::SplitVOp splitv_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+// Ensure bias for conv2d op.
+struct EnsureBiasForConv2d : public OpRewritePattern<TFL::Conv2DOp> {
+  using OpRewritePattern<TFL::Conv2DOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::Conv2DOp conv_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+// Pad slice to 4d.
+struct PadSlice : public OpRewritePattern<TFL::SliceOp> {
+  using OpRewritePattern<TFL::SliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::SliceOp slice_op,
+                                PatternRewriter& rewriter) const override;
+};
+
+// Fully connected to conv2d.
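(The remaining patterns, starting with the fully-connected-to-conv rewrite below, follow the same shape.) As a hedged sketch of how such patterns are typically gathered and applied: GetHardwareRewritePatternsGPU above presumably populates its set in a similar way, but the exact pattern mix per hardware is a guess, and the greedy-driver entry point name depends on the MLIR revision in use.

#include <utility>

#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "mlir/IR/PatternMatch.h"  // from @llvm-project
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h"

namespace {

// Illustration: build a pattern set from a few of the structs declared above
// and run them on a function.
void ApplyGpuLikePatterns(mlir::func::FuncOp func) {
  mlir::MLIRContext* context = func.getContext();
  mlir::RewritePatternSet patterns(context);
  patterns.add<mlir::TFL::tac::LowerPackIntoConcatReshape,
               mlir::TFL::tac::UnrollSplit, mlir::TFL::tac::UnrollSplitV,
               mlir::TFL::tac::PadSlice>(context);
  // Newer MLIR revisions rename this to applyPatternsGreedily.
  (void)mlir::applyPatternsAndFoldGreedily(func.getOperation(),
                                           std::move(patterns));
}

}  // namespace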
+struct FullyConnectedToConv : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::FullyConnectedOp fc_op, + PatternRewriter& rewriter) const override; +}; + +// Pad concat to 4d. +struct PadConcat : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::ConcatenationOp concat_op, + PatternRewriter& rewriter) const override; +}; + +// Convert reduce mean 4d to avg pool. +struct ReduceMeanToAvgPool : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::MeanOp mean_op, + PatternRewriter& rewriter) const override; +}; + +// Insert Requant ops for reduce_mean. +struct InsertRequantForReduceMean : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::MeanOp mean_op, + PatternRewriter& rewriter) const override; +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_DEVICE_TRANSFORM_PATTERNS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h new file mode 100644 index 00000000..a16b0f77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h @@ -0,0 +1,77 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_PASSES_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.pb.h" + +namespace mlir { +namespace TFL { +namespace tac { +class TacModule; + +// Create an instance of the TargetAnnotationPass. +// TODO(b/177376459): Remove in favor of the one below. +std::unique_ptr> CreateTargetAnnotationPass( + llvm::ArrayRef device_specs); + +// Create and instance of TargetAnnotationPass. +std::unique_ptr> CreateTargetAnnotationPass( + const TacModule* module); + +// Create an instance of the RaiseTargetSubgraphsPass. If `skip_raise_cpu_ops`, +// we skip clustering for CPU ops for better clustering of ops running on other +// ML accelerators. When `ignore_inference_type` is set to true, the inference +// types are set to "NOT_CARE" for better clustering. +std::unique_ptr> CreateRaiseTargetSubgraphsPass( + bool skip_raise_cpu_ops = false, bool ignore_inference_type = false); + +// Create an instance of the AlternativeSubgraphPass. 
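// Illustrative sketch (editorial, not part of the upstream TensorFlow header):
// one way the factory functions above might be wired into a pass pipeline.
// Assumes an MLIR build with "mlir/Pass/PassManager.h" included at file scope;
// whether a given pass nests on functions or runs on the whole module is an
// assumption here, and `BuildTacPipelineSketch` is a hypothetical name.
inline void BuildTacPipelineSketch(mlir::PassManager& pm,
                                   llvm::ArrayRef<std::string> device_specs) {
  // Annotate each op with candidate targets for the given device specs.
  pm.addNestedPass<mlir::func::FuncOp>(
      CreateTargetAnnotationPass(device_specs));
  // Then cluster annotated ops into per-target subgraphs.
  pm.addPass(CreateRaiseTargetSubgraphsPass());
}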
+std::unique_ptr> CreateAlternativeSubgraphPass( + llvm::ArrayRef device_specs); + +// Create an instance of ComputeCostPass. +std::unique_ptr> CreateComputeCostPass(); + +// Create an instance of PickSubgraphsPass. +std::unique_ptr> CreatePickSubgraphsPass(); + +// Create an instance of DeviceTransformGPUPass. +std::unique_ptr> CreateDeviceTransformGPUPass(); + +// Create an instance of GetOpCostPass. +std::unique_ptr> CreateGetOpCostPass(); + +// Create an instance of FoldConstantsToSubgraphPass. +std::unique_ptr> CreateFoldConstantsToSubgraphPass( + bool fold_all_constants); + +// Create an instance of TacFilterPass. +std::unique_ptr> CreateTacFilterPass( + ::third_party::tensorflow::compiler::mlir::lite::experimental::tac:: + TacFilters* tac_filters); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h new file mode 100644 index 00000000..6e61dbe9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h @@ -0,0 +1,90 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_TAC_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_TAC_PASS_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h" +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h" + +namespace mlir { +namespace TFL { +namespace tac { +// An OperationPass<> with access to the TAC module instance that the +// pass is running part of. +// See OperationPass<> comments for all details/restrictions of OperationPass. +// +// When adding new Pass to TAC, users should use this class as the base class +// as it provides access to the TAC module. +template +class TacPass : public OperationPass { + public: + using OperationPass::OperationPass; + explicit TacPass(const TacModule* module) + : OperationPass::OperationPass(mlir::TypeID::get()), + module_(module) {} + + ~TacPass() override = default; + + const TargetHardware* GetTargetHardware( + const std::string& hardware_name) const { + return module_ != nullptr + ? module_->GetTargetHardware(hardware_name) + : mlir::TFL::tac::GetTargetHardware(hardware_name); + } + + protected: + const TacModule* module_ = nullptr; // Not owned. +}; + +// A FunctionPass but with access to TAC module. +// See FunctionPass comments for all details/restrictions of FunctionPass. 
+// +// When adding new Pass to TAC, users should use this class as the base class +// as it provides access to the TAC module. +template +class TacFunctionPass : public TacPass { + public: + using TacPass::TacPass; + + ~TacFunctionPass() override = default; + + mlir::func::FuncOp getFunction() { return getOperation(); } + + virtual void runOnFunction() = 0; + + void runOnOperation() final { + if (!getFunction().isExternal()) runOnFunction(); + } + + protected: + // Returns the derived pass name. + StringRef getName() const override { return llvm::getTypeName(); } + + // A clone method to create a copy of this pass. + std::unique_ptr clonePass() const override { + return std::make_unique(*static_cast(this)); + } +}; + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_TAC_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.h new file mode 100644 index 00000000..049cc186 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_UTILS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_UTILS_UTILS_H_ + +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace tac { + +// Import the file as mlir module, the input maybe flatbuffer or mlir file. +absl::StatusOr> ImportFlatbufferOrMlir( + const std::string& input_filename, bool input_mlir, + bool experimental_prune_unreachable_nodes_unconditionally, + llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + +// Export the module to file, can be either mlir or flatbuffer. +absl::Status ExportFlatbufferOrMlir( + const std::string& output_filename, bool output_mlir, mlir::ModuleOp module, + bool enable_select_tf_ops, + std::optional custom_option_alignment = std::nullopt); + +} // namespace tac +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_UTILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export.h new file mode 100644 index 00000000..27cf2852 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export.h @@ -0,0 +1,62 @@ +/* Copyright 2019 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ + +#include +#include +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" + +namespace tflite { +// Options for exporting to Flatbuffer. +struct FlatbufferExportOptions { + // ConverterFlags proto. The following fields are migrated. + // bool emit_builtin_tflite_ops -> !converter_flags.force_select_tf_ops() + // bool emit_select_tf_ops -> converter_flags.enable_select_tf_ops() + // bool emit_custom_ops -> converter_flags.allow_custom_ops() + // bool allow_all_select_tf_ops -> converter_flags.allow_all_select_tf_ops() + // std::set<> select_user_tf_ops -> converter_flags.select_user_tf_ops() + tflite::ConverterFlags converter_flags; + // When exporting from SavedModel, this will have the requested tags. + std::unordered_set saved_model_tags; + // Metadata key/value pairs to write to the flatbuffer. + std::map metadata; + // OpOrArgNameMapper to convert location of the op to name in flatbuffer. + // If not set, a default mapper will be used. + tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper = nullptr; + // User-specified value of flatbuffer alignment requirement for custom + // options. If specified, the value should be multiplier of 16 (default + // alignment for TFL flatbuffer). + std::optional custom_option_alignment = std::nullopt; +}; + +// Translates the given MLIR `module` into a FlatBuffer and stores the +// serialized flatbuffer into the string. +// Returns true on successful exporting, false otherwise. +bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module, + const FlatbufferExportOptions& options, + std::string* serialized_flatbuffer, + bool serialize_stablehlo_ops = false); +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h new file mode 100644 index 00000000..ba97ede7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h @@ -0,0 +1,34 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_FLAGS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_FLAGS_H_ + +#include + +// These flags are used to control the emission or not of different kinds of ops +// during the flatbuffer translation. +extern bool emit_builtin_tflite_ops; +extern bool emit_select_tf_ops; +extern bool emit_custom_ops; +// The flag to control whether to lower tensorlist ops into TF ops. +extern bool lower_tensor_list_ops; +// The flag to control whether debug info gets stripped on export. +extern bool strip_debug_info; +// The flag to control whether to store constant & custom buffers inside +// flatbuffer +extern bool use_buffer_offset; + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_import.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_import.h new file mode 100644 index 00000000..f0f70114 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_import.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_IMPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_IMPORT_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace tflite { +// Converts a TFLite flatbuffer stored in `buffer` to a MLIR module +// The buffer must live for the duration of the function call, +// The caller receives ownership of the module. +// `base_loc` is used for error reporting and debug info. +// If ordered_output_arrays is not empty, then the imported mlir function will +// only return nodes in ordered_output_arrays in the same order. Returns nullptr +// on failure, and more specific errors will be emitted via the context. +// If `use_external_constant` is true, it will create `tfl.external_const` +// instead of `tfl.const`. +// If `experimental_prune_unreachable_nodes_unconditionally` is true, nodes that +// are not ancestors of the output nodes will be pruned. 
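// Editorial usage sketch (not part of the upstream TensorFlow header) for the
// importer declared below; file reading and error handling are simplified and
// the names (`path`, locals) are hypothetical:
//
//   std::ifstream file(path, std::ios::binary);
//   std::stringstream contents;
//   contents << file.rdbuf();
//   const std::string buffer = contents.str();
//   mlir::MLIRContext context;
//   mlir::OwningOpRef<mlir::ModuleOp> module = tflite::FlatBufferToMlir(
//       buffer, &context,
//       mlir::FileLineColLoc::get(&context, path, /*line=*/0, /*column=*/0));
//   if (!module) { /* import failed; diagnostics were emitted on the context */ }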
+mlir::OwningOpRef FlatBufferToMlir( + absl::string_view buffer, mlir::MLIRContext* context, + mlir::Location base_loc, bool use_external_constant = false, + const std::vector& ordered_input_arrays = {}, + const std::vector& ordered_output_arrays = {}, + bool experimental_prune_unreachable_nodes_unconditionally = false, + bool disable_vhlo_to_stablehlo = false); +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_IMPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_operator.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_operator.h new file mode 100644 index 00000000..f0afe15f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_operator.h @@ -0,0 +1,309 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_OPERATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_OPERATOR_H_ + +#include + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumeBundleQueries.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloTypes.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { + +// duplicated from +// https://github.com/openxla/stablehlo/blob/e5ad51715a11721c78b6748ab5de7945df24b1b8/stablehlo/transforms/StablehloLegalizeToVhlo.cpp#L756 +// so we can create correct vhlo types +class StablehloVhloTypeConverter : public mlir::vhlo::VhloTypeConverter { + public: + StablehloVhloTypeConverter() : mlir::vhlo::VhloTypeConverter() { + addConversion([](mlir::Type type) -> mlir::Type { + if (type.getDialect().getNamespace() == + mlir::vhlo::VhloDialect::getDialectNamespace()) { + return type; + } + return {}; + }); + addConversion([](mlir::stablehlo::TokenType token) -> mlir::Type { + return mlir::vhlo::TokenV1Type::get(token.getContext()); + }); + addBuiltinToVhloConversions(); + } + + mlir::Attribute convertEncoding(mlir::Attribute attr) const final { + // Must be VHLO encoding, or convertible to VHLO encoding. 
+ if (attr.getDialect().getNamespace() == + mlir::vhlo::VhloDialect::getDialectNamespace()) + return attr; + + if (auto stablehloAttr = + mlir::dyn_cast_or_null(attr)) { + return mlir::vhlo::TypeExtensionsV1Attr::get(stablehloAttr.getContext(), + stablehloAttr.getBounds()); + } + + // Was not VHLO encoding, or convertible. + return {}; + } +}; + +// from +// https://github.com/openxla/stablehlo/blob/e5ad51715a11721c78b6748ab5de7945df24b1b8/stablehlo/transforms/VhloLegalizeToStablehlo.cpp#L45C70-L45C70 +class VhloToStablehloTypeConverter : public vhlo::VhloTypeConverter { + public: + VhloToStablehloTypeConverter() : vhlo::VhloTypeConverter() { + addConversion([](Type type) -> Type { return type; }); + addConversion([](vhlo::TokenV1Type token) -> Type { + return stablehlo::TokenType::get(token.getContext()); + }); + addVhloToBuiltinConversions(); + } + + Attribute convertEncoding(Attribute attr) const final { + if (auto vhloAttr = + mlir::dyn_cast_or_null(attr)) { + return stablehlo::TypeExtensionsAttr::get(vhloAttr.getContext(), + vhloAttr.getBounds()); + } + // All encodings supported in StableHLO. + return attr; + } +}; + +// Returns true if the op_code belongs to a stablehlo operation. +bool IsStablehloOp(const tflite::OperatorCodeT &op_code); + +// Returns the MLIR op name for the flatbuffer operator corresponding to +// `op_code`. +std::string GetMlirOpNameFromOpCode(const ::tflite::OperatorCodeT &op_code); + +// Returns the builtin op code for the given MLIR operation on success; emits +// error and returns std::nullopt on failure. +std::optional GetBuiltinOpCode(Operation *mlir_op); + +// Packs the given MLIR operation into a TFLite FlatBuffer operator object. +// Returns the FlatBuffer offset for the operator on success; emits error and +// returns std::nullopt on failure. +std::optional> CreateFlatBufferOperator( + Operation *mlir_op, uint32_t opcode_index, + const std::vector &operands, const std::vector &results, + const std::vector &intermediates, + flatbuffers::FlatBufferBuilder *fbb, + std::optional debug_metadata_index = -1); + +// Populates the array of mlir::NamedAttributes corresponding to the given +// tflite::FlatbufferOptionsUnion. +// We use an out parameter per LLVM convention +void BuiltinOptionsToAttributes( + tflite::BuiltinOptionsUnion op_union, mlir::Builder builder, + // NOLINTNEXTLINE + llvm::SmallVectorImpl &attributes); + +// While the last several tensors could be optional tensors for an tfl op, the +// number of input operands could vary. This function gets the min/max number of +// operands from tflite op name. +llvm::MinMax OperandNumbersMinMax(llvm::StringRef op_name); + +// Populates the `custom_code` and `custom_options` to attributes. +// `custom_code` is used to identify CustomOp. +// `custom_options` are opaque attribute used to store infomations for this +// custom op. +absl::Status CustomOptionsToAttributes( + const std::string &custom_code, const std::vector &custom_options, + mlir::Builder builder, + // NOLINTNEXTLINE + Location loc, llvm::SmallVectorImpl *attributes); + +// TODO(zichuanwei@): Populate Builtin_options_2 manual for now, should automate +// these in the future +void BuiltinOptions2ToAttributes( + tflite::BuiltinOptions2Union op_union, mlir::Builder builder, + llvm::SmallVectorImpl &attributes); + +// Function calls with a non-specialized type will result to a linker error. 
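// Editorial, self-contained analogue (not part of the upstream TensorFlow
// header) of the note above: because only explicit specializations are
// defined, a call with an unspecialized type compiles but fails to link.
// `ToVectorDemo` is a hypothetical name; it relies on <vector> and <cstdint>,
// which are assumed to be available in this header.
template <typename T>
std::vector<T> ToVectorDemo(const int32_t* data, int n);  // declared, never defined

template <>
inline std::vector<int32_t> ToVectorDemo<int32_t>(const int32_t* data, int n) {
  return std::vector<int32_t>(data, data + n);
}
// ToVectorDemo<int32_t>(...) links; ToVectorDemo<float>(...) would produce an
// "undefined reference" at link time, exactly like the GetVector family below.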
+template +inline std::vector GetVector(DenseElementsAttr elements); + +// TODO(zichuanwei@): for each type, we need to make sure the element type +// matches the expected type otherwise an error should be thrown, but for now +// we're just returning empty vector +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(1)) { + auto vec = llvm::to_vector( + llvm::map_range(elements.getValues(), + [&](bool value) -> uint8_t { return value ? 1 : 0; })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(8)) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APInt value) -> int8_t { return value.getSExtValue(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(16)) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APInt value) -> int16_t { return value.getSExtValue(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(32)) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APInt value) -> int32_t { return value.getSExtValue(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(64)) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APInt value) -> int64_t { return value.getSExtValue(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isSignlessInteger(64)) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APInt value) -> uint64_t { return value.getSExtValue(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isF32()) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APFloat value) -> float { return value.convertToFloat(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +template <> +inline std::vector GetVector(DenseElementsAttr elements) { + auto type = elements.getType(); + auto elemType = type.getElementType(); + if (elemType.isF64()) { + auto vec = llvm::to_vector(llvm::map_range( + elements.getValues(), + [&](APFloat value) -> double { return value.convertToFloat(); })); + return std::vector(vec.begin(), vec.end()); + } + + return std::vector(); +} + +// Handles the case when the DenseElementsAttr doesn't exist, and when it +// 
doesn't returns a vector of length `default_size` all with the same value +// `default_value`. +template +static inline std::vector GetOptionalVector( + std::optional elements, int64_t default_size = 0, + int64_t default_value = 0) { + if (elements.has_value()) { + return GetVector(elements.value()); + } + return std::vector(default_size, default_value); +} + +// Handles the case when the ArrayRef doesn't exist, and when it +// doesn't returns a vector of length `default_size` all with the same value +// `default_value`. +template +static inline std::vector GetOptionalVector( + std::optional> values, int64_t default_size = 0, + int64_t default_value = 0) { + if (values.has_value()) { + return std::vector(values->begin(), values->end()); + } + return std::vector(default_size, default_value); +} + +template +static inline std::vector GetVector( + vhlo::TensorV1Attr elements, + mlir::vhlo::VhloTypeConverter &vhlo_type_converter) { + return GetOptionalVector(mlir::DenseIntElementsAttr::getFromRawBuffer( + mlir::cast( + vhlo_type_converter.convertType(elements.getType())), + elements.getData())); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_OPERATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate.h new file mode 100644 index 00000000..f344fc28 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" + +namespace tflite { + +// Translates the given MLIR `module` into a FlatBuffer and stores the +// serialized flatbuffer into the string. This uses OpOrArgLocNameMapper to +// convert location of the op to name in flatbuffer. Returns true if translation +// fails, otherwise returns false. +bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module, + std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, + bool emit_select_tf_ops, + bool emit_custom_ops); + +// Same as the above but with a custom op name mapper. 
+bool MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper); +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h new file mode 100644 index 00000000..6c8f80d4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h @@ -0,0 +1,31 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_FLAGS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_FLAGS_H_ + +#include + +// These flags are used to control the emission or not of different kinds of ops +// during the flatbuffer translation. +extern bool emit_builtin_tflite_ops; +extern bool emit_select_tf_ops; +extern bool emit_custom_ops; +// The flag to control whether to lower tensorlist ops into TF ops. +extern bool lower_tensor_list_ops; +// The flag to control whether debug info gets stripped on export. +extern bool strip_debug_info; + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/ir/tfl_ops.h new file mode 100644 index 00000000..5946ce0f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the MLIR TensorFlow Lite dialect. 
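// Editorial note (not part of the upstream TensorFlow header): before ops from
// this dialect can be built or parsed, the dialect has to be registered with
// the MLIRContext, roughly as follows (`context` is a hypothetical local; the
// typedef TensorFlowLiteDialect appears further down in this header):
//
//   mlir::MLIRContext context;
//   context.loadDialect<mlir::TFL::TensorFlowLiteDialect>();
//   context.loadDialect<mlir::func::FuncDialect>();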
+ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_OPS_H_ + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeSupport.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops_dialect.h.inc" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops_enums.h.inc" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#define GET_ATTRDEF_CLASSES +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops_attrdefs.h.inc" + +namespace mlir { +namespace TFL { + +typedef TFLDialect TensorFlowLiteDialect; + +// The Control type is a token-like value that models control dependencies +class ControlType : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = "tfl.control"; +}; + +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" + +} // end namespace TFL +} // end namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/common.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/common.h new file mode 100644 index 00000000..fd9fdf81 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/common.h @@ -0,0 +1,236 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMMON_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMMON_H_ + +#include +#include +#include + +#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK +#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK +#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK +#endif +#endif + +#include +#include + +#include "fixedpoint/fixedpoint.h" +#include "tensorflow/compiler/mlir/lite/core/macros.h" +#include "tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h" +#include "tensorflow/compiler/mlir/lite/kernels/internal/optimized/neon_check.h" + +// LINT.IfChange + +namespace tflite_migration { + +constexpr int kReverseShift = -1; + +TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier( + int32_t x, int32_t quantized_multiplier, int shift); + +TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier( + int64_t x, int32_t quantized_multiplier, int shift); + +// Single-rounding MultiplyByQuantizedMultiplier +#if TFLITE_SINGLE_ROUNDING +inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp( + int32_t x, int32_t quantized_multiplier, int shift) { + TFLITE_DCHECK_LE(shift, 0); + return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne( + int32_t x, int32_t quantized_multiplier, int shift) { + TFLITE_DCHECK_GE(shift, 0); + return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift); +} + +#ifdef USE_NEON +inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( + int32x4x4_t input_val, int32_t quantized_multiplier, int shift) { + TFLITE_DCHECK(quantized_multiplier >= 0); + + const int right_shift = std::min(-1, shift); + const int left_shift = shift - right_shift; + + const int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + const int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + const int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + int32x4x4_t result; + result.val[0] = vrshlq_s32( + vqdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), + right_shift_dup); + + result.val[1] = vrshlq_s32( + vqdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), + right_shift_dup); + + result.val[2] = vrshlq_s32( + vqdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), + right_shift_dup); + + result.val[3] = vrshlq_s32( + vqdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), + right_shift_dup); + + return result; +} +#endif // USE_NEON +// Double-rounding MultiplyByQuantizedMultiplier +#else +inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp( + int32_t x, int32_t quantized_multiplier, int left_shift) { + using gemmlowp::RoundingDivideByPOT; + using gemmlowp::SaturatingRoundingDoublingHighMul; + return RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne( + int32_t x, int32_t quantized_multiplier, int left_shift) { + using gemmlowp::SaturatingRoundingDoublingHighMul; + return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), + quantized_multiplier); +} + +#ifdef USE_NEON +// Round uses ARM's rounding shift right. 
+inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( + int32x4x4_t input_val, int32_t quantized_multiplier, int shift) { + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); + int32x4x4_t result; + + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[1] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[2] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[3] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), + multiplier_dup), + right_shift_dup); + + return result; +} +#endif // USE_NEON +#endif // TFLITE_SINGLE_ROUNDING + +template +int CountLeadingZeros(T integer_input) { + static_assert(std::is_unsigned::value, + "Only unsigned integer types handled."); + if (integer_input == 0) { + return std::numeric_limits::digits; + } +#if defined(__GNUC__) + if (std::is_same::value) { + return __builtin_clz(integer_input); + } else if (std::is_same::value) { + return __builtin_clzll(integer_input); + } +#endif + const T one_in_leading_positive = static_cast(1) + << (std::numeric_limits::digits - 1); + int leading_zeros = 0; + while (integer_input < one_in_leading_positive) { + integer_input <<= 1; + ++leading_zeros; + } + return leading_zeros; +} + +inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, + int32_t* output_inv_sqrt, + int* output_shift) { + TFLITE_DCHECK_GE(input, 0); + if (input <= 1) { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. + *output_inv_sqrt = std::numeric_limits::max(); + *output_shift = 0; + return; + } + TFLITE_DCHECK_GT(input, 1); + *output_shift = 11; + while (input >= (1 << 29)) { + input /= 4; + ++*output_shift; + } + const unsigned max_left_shift_bits = + CountLeadingZeros(static_cast(input)) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + TFLITE_DCHECK_GE(input, (1 << 27)); + TFLITE_DCHECK_LT(input, (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. 
+ using F3 = FixedPoint; + using F0 = FixedPoint; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = + SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } + // Convert right shift (right is positive) to left shift. + *output_shift *= reverse_shift; +} + +} // namespace tflite_migration + +// LINT.ThenChange(//tensorflow/lite/kernels/internal/common.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h new file mode 100644 index 00000000..3233fe00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h @@ -0,0 +1,63 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMPATIBILITY_MACROS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMPATIBILITY_MACROS_H_ + +#ifndef TFLITE_ABORT +#define TFLITE_ABORT abort() +#endif + +#ifndef TFLITE_ASSERT_FALSE +#if defined(NDEBUG) +#define TFLITE_ASSERT_FALSE (static_cast(0)) +#else +#define TFLITE_ASSERT_FALSE TFLITE_ABORT +#endif +#endif + +// LINT.IfChange + +#ifndef TFLITE_DCHECK +#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_EQ +#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_NE +#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_GE +#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_GT +#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_LE +#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE +#endif + +#ifndef TFLITE_DCHECK_LT +#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? 
(void)0 : TFLITE_ASSERT_FALSE +#endif + +// LINT.ThenChange(//tensorflow/lite/kernels/internal/compatibility.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_COMPATIBILITY_MACROS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/cppmath.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/cppmath.h new file mode 100644 index 00000000..49b66e10 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/cppmath.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_CPPMATH_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_CPPMATH_H_ + +#include + +// LINT.IfChange + +namespace tflite_migration { + +#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \ + (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(__ZEPHYR__) +#define TF_LITE_GLOBAL_STD_PREFIX +#else +#define TF_LITE_GLOBAL_STD_PREFIX std +#endif + +#define DECLARE_STD_GLOBAL_SWITCH1(tf_name, std_name) \ + template \ + inline T tf_name(const T x) { \ + return TF_LITE_GLOBAL_STD_PREFIX::std_name(x); \ + } + +DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round) + +} // namespace tflite_migration + +// LINT.ThenChange(//tensorflow/lite/kernels/internal/cppmath.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_CPPMATH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/optimized/neon_check.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/optimized/neon_check.h new file mode 100644 index 00000000..ec3908d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/optimized/neon_check.h @@ -0,0 +1,32 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_ + +// LINT.IfChange + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include // IWYU pragma: export +#endif + +#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON +#define USE_NEON +#include "NEON_2_SSE.h" // IWYU pragma: export +#endif + +// LINT.ThenChange(//tensorflow/lite/kernels/internal/optimized/neon_check.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/quantization_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/quantization_util.h new file mode 100644 index 00000000..b38391c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/quantization_util.h @@ -0,0 +1,166 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_ + +#include +#include +#include + +namespace tflite_migration { + +// LINT.IfChange + +// Decompose a double multiplier into a Q0.31 int32 representation of its +// significand, and shift representation of its exponent. +// +// Restricted to the case where the multiplier > 1. +void QuantizeMultiplierGreaterThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift); + +// Decompose a double multiplier into a Q0.31 int32 representation of its +// significand, and shift representation of its exponent. +// +// Handles an arbitrary positive multiplier. The 'shift' output-value is +// basically the 'floating-point exponent' of the multiplier: +// Negative for a right-shift (when the multiplier is <1), positive for a +// left-shift (when the multiplier is >1) +void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, + int* shift); + +// Splits a double input value into a returned fraction, and a shift value from +// the exponent, using only bitwise and integer operations to support +// microcontrollers and other environments without floating-point support. +// +// This is designed to be a replacement for how std::frexp() is used within the +// QuantizeMultiplier() function, and so has a different signature than the +// standard version, returning a 64-bit integer rather than a double. This +// result has a maximum value of 1<<31, with the fraction expressed as a +// proportion of that maximum. 
+// +// std::frexp() returns NaNs and infinities unmodified, but since we're +// returning integers that can't represent those values, instead we return +// a shift of std::numeric_limits::max() for all bad numbers, with an int64 +// result of 0 for NaNs, std:numeric_limits::max() for +INFINITY, and +// std::numeric_limits::min() for -INFINITY. Denormalized inputs will +// result in return values that end up truncating some bits at the end, +// reflecting the loss of precision inherent in denormalization. +int64_t IntegerFrExp(double input, int* shift); + +// Converts an integer fraction in the format produced by IntegerFrExp (where +// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an +// IEEE binary64 double format result. The implementation uses only integer and +// bitwise operators, so no floating point hardware support or emulation is +// needed. This is here so quantized operations can run non-time-critical +// preparation calculations on microcontrollers and other platforms without +// float support. +double DoubleFromFractionAndShift(int64_t fraction, int shift); + +// Performs a multiplication of two numbers in double format, using only integer +// and bitwise instructions. This is aimed at supporting housekeeping functions +// for quantized operations on microcontrollers without floating-point hardware. +double IntegerDoubleMultiply(double a, double b); + +// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is +// greater than b. It is implemented using only integer and logical instructions +// so that it can be easily run on microcontrollers for quantized operations. +int IntegerDoubleCompare(double a, double b); + +// This first creates a multiplier in a double equivalent of +// Q(input_integer_bits).(31-input_integer_bits) representation, with extra +// precision in the double's fractional bits. It then splits the result into +// significand and exponent. +void PreprocessSoftmaxScaling(double beta, double input_scale, + int input_integer_bits, + int32_t* quantized_multiplier, int* left_shift); +// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated. + +// Calculate the largest input that will result in a within-bounds intermediate +// result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words, +// it must not overflow before we reduce the value by multiplication by the +// input multiplier. The negative radius is used as the minimum difference in +// Softmax. +int CalculateInputRadius(int input_integer_bits, int input_left_shift, + int total_signed_bits = 31); + +// Converts a floating-point number to an integer. For all inputs x where +// static_cast(x) is legal according to the C++ standard, the result +// is identical to that cast (i.e. the result is x with its fractional part +// truncated whenever that is representable as IntOut). +// +// static_cast would cause undefined behavior for the following cases, which +// have well-defined behavior for this function: +// +// 1. If x is NaN, the result is zero. +// +// 2. If the truncated form of x is above the representable range of IntOut, +// the result is std::numeric_limits::max(). +// +// 3. If the truncated form of x is below the representable range of IntOut, +// the result is std::numeric_limits::min(). +// +// Note that cases #2 and #3 cover infinities as well as finite numbers. +// +// The range of FloatIn must include the range of IntOut, otherwise +// the results are undefined. +// TODO(sfeuz): Replace by absl::SafeCast once available. 
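// Editorial, standalone restatement (not part of the upstream TensorFlow
// header) of the contract documented above, specialized to int8_t so the
// branches are easy to see. `SafeCastToInt8Demo` is a hypothetical name; it
// relies only on <cmath>, <cstdint> and <limits>, assumed to be included above.
inline int8_t SafeCastToInt8Demo(double x) {
  if (std::isnan(x)) return 0;  // case 1: NaN maps to zero
  if (x >= static_cast<double>(std::numeric_limits<int8_t>::max()))
    return std::numeric_limits<int8_t>::max();  // case 2: e.g. 300.7 -> 127
  if (x <= static_cast<double>(std::numeric_limits<int8_t>::min()))
    return std::numeric_limits<int8_t>::min();  // case 3: e.g. -1e9 -> -128
  return static_cast<int8_t>(x);  // in range: truncate toward zero, 3.9 -> 3
}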
+template +IntOut SafeCast(FloatIn x) { + static_assert(!std::numeric_limits::is_integer, + "FloatIn is integer"); + static_assert(std::numeric_limits::is_integer, + "IntOut is not integer"); + static_assert(std::numeric_limits::radix == 2, "IntOut is base 2"); + + // Special case NaN, for which the logic below doesn't work. + if (std::isnan(x)) { + return 0; + } + + // Negative values all clip to zero for unsigned results. + if (!std::numeric_limits::is_signed && x < 0) { + return 0; + } + + // Handle infinities. + if (std::isinf(x)) { + return x < 0 ? std::numeric_limits::min() + : std::numeric_limits::max(); + } + + // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0), + // unless x is zero in which case exp == 0. Note that this implies that the + // magnitude of x is strictly less than 2^exp. + int exp = 0; + std::frexp(x, &exp); + + // Let N be the number of non-sign bits in the representation of IntOut. If + // the magnitude of x is strictly less than 2^N, the truncated version of x + // is representable as IntOut. The only representable integer for which this + // is not the case is kMin for signed types (i.e. -2^N), but that is covered + // by the fall-through below. + if (exp <= std::numeric_limits::digits) { + return x; + } + + // Handle numbers with magnitude >= 2^N. + return x < 0 ? std::numeric_limits::min() + : std::numeric_limits::max(); +} +// LINT.ThenChange(//tensorflow/lite/kernels/internal/quantization_util.h) +} // namespace tflite_migration + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h new file mode 100644 index 00000000..3a602ba9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h @@ -0,0 +1,263 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_ + +// This file is the MLIR copy of runtime_shape as part of the effort to +// decouple TFLite from MLIR. +// LINT.IfChange + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h" + +namespace mlir { + +template +struct Dims { + int sizes[N]; + int strides[N]; +}; + +class RuntimeShape { + public: + // Shapes with dimensions up to 6 are stored directly in the structure, while + // larger shapes are separately allocated. 
+  static constexpr int kMaxSmallSize = 6;
+
+  RuntimeShape& operator=(RuntimeShape const&) = delete;
+
+  RuntimeShape() : size_(0) {}
+
+  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
+    if (dimensions_count > kMaxSmallSize) {
+      dims_pointer_ = new int32_t[dimensions_count];
+    }
+  }
+
+  RuntimeShape(int shape_size, int32_t value) : size_(0) {
+    Resize(shape_size);
+    for (int i = 0; i < shape_size; ++i) {
+      SetDim(i, value);
+    }
+  }
+
+  RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(0) {
+    ReplaceWith(dimensions_count, dims_data);
+  }
+
+  RuntimeShape(const std::initializer_list<int> init_list) : size_(0) {
+    BuildFrom(init_list);
+  }
+
+  // Avoid using this constructor. We should be able to delete it when C++17
+  // rolls out.
+  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_ = new int32_t[size_];
+    }
+    std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
+  }
+
+  bool operator==(const RuntimeShape& comp) const {
+    return this->size_ == comp.size_ &&
+           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
+               0;
+  }
+
+  ~RuntimeShape();
+
+  inline int32_t DimensionsCount() const { return size_; }
+
+  int32_t Dims(int i) const;
+
+  inline void SetDim(int i, int32_t val) {
+    TFLITE_DCHECK_GE(i, 0);
+    TFLITE_DCHECK_LT(i, size_);
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_[i] = val;
+    } else {
+      dims_[i] = val;
+    }
+  }
+
+  inline int32_t* DimsData() {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+  inline const int32_t* DimsData() const {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+  // The caller must ensure that the shape is no bigger than 5-D.
+  inline const int32_t* DimsDataUpTo5D() const { return dims_; }
+
+  inline void Resize(int dimensions_count) {
+    const int32_t old_size = size_;
+    size_ = dimensions_count;
+
+    if (old_size <= kMaxSmallSize) {
+      if (dimensions_count <= kMaxSmallSize) {
+        return;
+      } else {  // Small to big.
+        int32_t* new_big_data = new int32_t[dimensions_count];
+        memcpy(new_big_data, dims_, sizeof(int32_t) * old_size);
+        dims_pointer_ = new_big_data;
+      }
+    } else {
+      if (dimensions_count > kMaxSmallSize && dimensions_count <= old_size) {
+        return;
+      }
+      std::unique_ptr<int32_t[]> old_data(dims_pointer_);
+      if (dimensions_count <= old_size) {  // Big to small.
+        memcpy(dims_, old_data.get(), sizeof(int32_t) * dimensions_count);
+      } else {  // Big to bigger.
+        dims_pointer_ = new int32_t[dimensions_count];
+        memcpy(dims_pointer_, old_data.get(), sizeof(int32_t) * old_size);
+      }
+    }
+  }
+
+  void ReplaceWith(int dimensions_count, const int32_t* dims_data);
+
+  template <typename T>
+  inline void BuildFrom(const T& src_iterable) {
+    const int dimensions_count =
+        std::distance(src_iterable.begin(), src_iterable.end());
+    Resize(dimensions_count);
+    int32_t* data = DimsData();
+    for (auto it : src_iterable) {
+      *data = it;
+      ++data;
+    }
+  }
+
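For orientation, a small hypothetical usage sketch of the API declared in this class (see the remaining members below); it is not part of the header and assumes the accompanying implementation file, which defines Dims(), FlatSize(), and the destructor, is linked in.

#include <cassert>

#include "tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h"

int main() {
  // A 4-D shape fits in the inline storage (kMaxSmallSize == 6).
  mlir::RuntimeShape shape({2, 3, 4, 5});
  assert(shape.DimensionsCount() == 4);
  assert(shape.Dims(1) == 3);
  assert(shape.FlatSize() == 2 * 3 * 4 * 5);

  // Growing past kMaxSmallSize moves the dims into heap storage transparently.
  shape.Resize(8);
  assert(shape.DimensionsCount() == 8);
  return 0;
}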
+  // This will probably be factored out. Old code made substantial use of
+  // 4-D shapes, and so this function is used to extend smaller shapes.
+  // Note that (a) as Dims<4>-dependent code is eliminated, the reliance on
+  // this should be reduced, and (b) some kernels are strictly 4-D, but then
+  // the shapes of their inputs should already be 4-D, so this function
+  // should not be needed.
+  inline static RuntimeShape ExtendedShape(int new_shape_size,
+                                           const RuntimeShape& shape) {
+    return RuntimeShape(new_shape_size, shape, 1);
+  }
+
+  inline void BuildFrom(const std::initializer_list<int> init_list) {
+    BuildFrom<std::initializer_list<int>>(init_list);
+  }
+
+  // Returns the total count of elements, that is the size when flattened
+  // into a vector.
+  int FlatSize() const;
+
+  bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
+
+ private:
+  // For use only by ExtendedShape(), written to guarantee (return-value) copy
+  // elision in C++17.
+  // This creates a shape padded to the desired size with the specified value.
+  RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
+      : size_(0) {
+    // If the following check fails, it is likely because a 4D-only kernel is
+    // being used with an array of larger dimension count.
+    TFLITE_DCHECK_GE(new_shape_size, shape.DimensionsCount());
+    Resize(new_shape_size);
+    const int size_increase = new_shape_size - shape.DimensionsCount();
+    for (int i = 0; i < size_increase; ++i) {
+      SetDim(i, pad_value);
+    }
+    std::memcpy(DimsData() + size_increase, shape.DimsData(),
+                sizeof(int32_t) * shape.DimensionsCount());
+  }
+
+  int32_t size_;
+  union {
+    int32_t dims_[kMaxSmallSize];
+    int32_t* dims_pointer_;
+  };
+};
+
+// Converts inference-style shape to legacy tflite::Dims<4>.
+inline mlir::Dims<4> ToRuntimeDims(const mlir::RuntimeShape& array_shape) {
+  mlir::Dims<4> result;
+  const int dimensions_count = array_shape.DimensionsCount();
+  TFLITE_DCHECK_LE(dimensions_count, 4);
+  int cum_prod = 1;
+  for (int i = 0; i < 4; i++) {
+    const int new_dim =
+        (i < dimensions_count) ? array_shape.Dims(dimensions_count - 1 - i) : 1;
+    result.sizes[i] = new_dim;
+    result.strides[i] = cum_prod;
+    cum_prod *= new_dim;
+  }
+  return result;
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+inline RuntimeShape DimsToShape(const mlir::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
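To make the flattening rule concrete before the definitions that follow, here is a small illustrative sketch (not part of the header) of the index arithmetic computed by the 4-D Offset() overload declared just below.

#include <cassert>

#include "tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h"

int main() {
  const mlir::RuntimeShape shape({2, 3, 4, 5});
  // Offset(shape, i0, i1, i2, i3) == ((i0 * d1 + i1) * d2 + i2) * d3 + i3,
  // i.e. row-major order with the last index varying fastest.
  assert(mlir::Offset(shape, 0, 0, 0, 1) == 1);
  assert(mlir::Offset(shape, 0, 0, 1, 0) == 5);
  assert(mlir::Offset(shape, 1, 2, 3, 4) == ((1 * 3 + 2) * 4 + 3) * 5 + 4);
  return 0;
}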
+// Since tensors with '0' in their shape are valid in TF, these offset
+// functions allow that as long as the corresponding index is also 0. It is
+// up to the calling ops to ensure that they perform verification checks on
+// tensor shapes if they don't support a particular behavior.
+
+inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
+  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
+  TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
+                (i0 >= 0 && i0 < dims_data[0]));
+  TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
+                (i1 >= 0 && i1 < dims_data[1]));
+  TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
+                (i2 >= 0 && i2 < dims_data[2]));
+  TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
+                (i3 >= 0 && i3 < dims_data[3]));
+  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
+                  int i4) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
+  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
+  TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
+                (i0 >= 0 && i0 < dims_data[0]));
+  TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
+                (i1 >= 0 && i1 < dims_data[1]));
+  TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
+                (i2 >= 0 && i2 < dims_data[2]));
+  TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
+                (i3 >= 0 && i3 < dims_data[3]));
+  TFLITE_DCHECK((dims_data[4] == 0 && i4 == 0) ||
+                (i4 >= 0 && i4 < dims_data[4]));
+  return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
+             dims_data[4] +
+         i4;
+}
+
+inline int Offset(const RuntimeShape& shape, int* index) {
+  return Offset(shape, index[0], index[1], index[2], index[3]);
+}
+
+}  // namespace mlir
+
+// LINT.ThenChange(//tensorflow/lite/kernels/internal/runtime_shape.h)
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.h
new file mode 100644
index 00000000..56ba7181
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.h
@@ -0,0 +1,102 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_UTILS_SPARSITY_FORMAT_CONVERTER_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_UTILS_SPARSITY_FORMAT_CONVERTER_H_
+
+#include <vector>
+
+#include "Eigen/Core"  // from @eigen_archive
+#include "tensorflow/compiler/mlir/lite/core/c/tflite_types.h"
+
+namespace tflite_migration {
+namespace internal {
+namespace sparsity {
+
+// LINT.IfChange
+
+// A converter that keeps an internal representation of sparse tensor
+// parameters and converts tensors between dense and sparse formats.
+template <typename T>
+class FormatConverter {
+ public:
+  /*
+   * Creates a dense to sparse converter.
+   * @param shape             Shape of the dense tensor.
+   * @param traversal_order   In what order to traverse all dimensions,
+   *                          including block dimensions.
+ * @param format Whether each dimension in the dense tensor is + * dense or sparse (not in the traversal order). + * @param block_size Size of each block dimension. + * @param block_map Map from block dimension to original tensor + * dimension. + */ + FormatConverter(const std::vector& shape, + const std::vector& traversal_order, + const std::vector& format, + const std::vector& block_size = {}, + const std::vector& block_map = {}); + + const std::vector& GetData() { return data_; } + + const std::vector>& GetDimMetadata() { + return dim_metadata_; + } + + // Method for dense to sparse conversion. Need to call GetData() method to get + // the compressed data. + + void DenseToSparse(const T* src_data); + + // Check if val is equal to zero. + bool IsZero(const T val); + + // Shape of the conceptual dense tensor. + std::vector dense_shape_; + // Shape of the dense tensor with inner blocks reduced. For example, a (4, 4) + // tensor with (2, 2) block has blocked_shape (2, 2). + std::vector blocked_shape_; + // Total number of elements in the dense tensor. + size_t dense_size_; + // Has n(original dimension)+k(block_dimension) elements. + std::vector traversal_order_; + // Format of each dimension in the traversal order. + std::vector format_; + // Size of each block dimension, in the same order as block map. + std::vector block_size_; + // Map from block dimension to the original tensor dimension. + std::vector block_map_; + // Metadata of each dimension in the traversal order. + // Each dimension needs two vectors. For dense dimensions, the first vector + // stores the size of that dimension, and the second vector is empty. For + // sparse dimensions, the first vector stores the segments and the second one + // stores the indices. + std::vector> dim_metadata_; + // Actual buffer holding data after conversion. Could be sparse buffer or + // dense buffer. + std::vector data_; +}; + +extern template class FormatConverter; +extern template class FormatConverter; +extern template class FormatConverter; +extern template class FormatConverter; + +// LINT.ThenChange(//tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h) + +} // namespace sparsity +} // namespace internal +} // namespace tflite_migration + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_INTERNAL_UTILS_SPARSITY_FORMAT_CONVERTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/padding.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/padding.h new file mode 100644 index 00000000..b0dd6daf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/kernels/padding.h @@ -0,0 +1,59 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_PADDING_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_PADDING_H_ + +// LINT.IfChange +#include "tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h" + +namespace tflite_migration { + +// Matching GetWindowedOutputSize in TensorFlow. +inline int ComputeOutSize(TfLitePadding padding, int image_size, + int filter_size, int stride, int dilation_rate = 1) { + int effective_filter_size = (filter_size - 1) * dilation_rate + 1; + + // TODO(b/186448822): This uses 0 since the function has no other way to + // report error case + if (stride == 0) return 0; + + switch (padding) { + case kTfLitePaddingSame: + return (image_size + stride - 1) / stride; + case kTfLitePaddingValid: + return (image_size + stride - effective_filter_size) / stride; + default: + return 0; + } +} + +// It's not guaranteed that padding is symmetric. It's important to keep +// offset for algorithms need all paddings. +inline int ComputePaddingWithOffset(int stride, int dilation_rate, int in_size, + int filter_size, int out_size, + int* offset) { + int effective_filter_size = (filter_size - 1) * dilation_rate + 1; + int total_padding = + ((out_size - 1) * stride + effective_filter_size - in_size); + total_padding = total_padding > 0 ? total_padding : 0; + *offset = total_padding % 2; + return total_padding / 2; +} + +} // namespace tflite_migration + +// LINT.ThenChange(//tensorflow/lite/kernels/padding.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_KERNELS_PADDING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector.h new file mode 100644 index 00000000..f21b0c47 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector.h @@ -0,0 +1,58 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_H_ + +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/metrics/converter_error_data.pb.h" +#include "tensorflow/compiler/mlir/lite/metrics/types_util.h" + +namespace mlir { +namespace TFL { + +// A singleton to store errors collected by the instrumentation. +class ErrorCollector { + using ConverterErrorData = tflite::metrics::ConverterErrorData; + using ConverterErrorDataSet = + std::unordered_set; + + public: + const ConverterErrorDataSet &CollectedErrors() { return collected_errors_; } + + void ReportError(const ConverterErrorData &error) { + collected_errors_.insert(error); + } + + // Clear the set of collected errors. + void Clear() { collected_errors_.clear(); } + + // Returns the global instance of ErrorCollector. 
+ static ErrorCollector* GetErrorCollector(); + + private: + ErrorCollector() {} + + ConverterErrorDataSet collected_errors_; + + static ErrorCollector* error_collector_instance_; +}; + +} // namespace TFL +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h new file mode 100644 index 00000000..e3ac59a2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h @@ -0,0 +1,78 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_INST_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_INST_H_ + +#include +#include +#include +#include + +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/PassInstrumentation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/metrics/converter_error_data.pb.h" +#include "tensorflow/compiler/mlir/lite/metrics/error_collector.h" +#include "tensorflow/compiler/mlir/lite/metrics/types_util.h" + +namespace mlir { +namespace TFL { + +// Collects errors when running the pass manager. +class ErrorCollectorInstrumentation : public PassInstrumentation { + using ConverterErrorData = tflite::metrics::ConverterErrorData; + using ErrorCode = ConverterErrorData::ErrorCode; + + public: + explicit ErrorCollectorInstrumentation(MLIRContext *context); + + private: + // Instrumentation hooks. These hooks don't need to be thread-safe. The pass + // manager runs each pass for the entire module, then it walks through + // each op in the module and runs the pass on them, may be in async mode. + void runBeforePass(Pass *pass, Operation *module) override; + void runAfterPass(Pass *pass, Operation *module) override; + void runAfterPassFailed(Pass *pass, Operation *module) override; + + // The handler to capture error messages. + std::unique_ptr handler_; + // A map from location to op name. + std::unordered_map loc_to_name_; + // Stores the error message for errors without op name and error code. + std::string common_error_message_; + // Name of the running pass. + std::string pass_name_; + // Pointer to the global ErrorCollector instance. + ErrorCollector *error_collector_; +}; + +// Prefix when adding error code as a note in Diagnostic. +constexpr char kErrorCodePrefix[] = "Error code: "; + +// Adds error code to a newly created InFlightDiagnostic. 
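A hypothetical call site for the AttachErrorCode helper defined immediately below. This is a sketch only; the op, the message, and the ERROR_NEEDS_FLEX_OPS enum value are assumptions made for illustration and are not taken from this patch.

#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/metrics/converter_error_data.pb.h"
#include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h"

// Emits an error diagnostic for `op` and tags it with a converter error code
// so that ErrorCollectorInstrumentation can later recover the code from the
// attached note.
inline mlir::LogicalResult ReportUnsupportedOp(mlir::Operation* op) {
  using tflite::metrics::ConverterErrorData;
  return mlir::TFL::AttachErrorCode(
      op->emitOpError("op is not supported by the TFLite runtime"),
      ConverterErrorData::ERROR_NEEDS_FLEX_OPS);
}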
+inline InFlightDiagnostic AttachErrorCode(InFlightDiagnostic &&diag,
+                                          int error_code) {
+  using tflite::metrics::ConverterErrorData;
+  diag.attachNote() << kErrorCodePrefix
+                    << ConverterErrorData::ErrorCode_Name(error_code);
+  return std::move(diag);
+}
+
+}  // namespace TFL
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_INST_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/types_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/types_util.h
new file mode 100644
index 00000000..7fe31a38
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/metrics/types_util.h
@@ -0,0 +1,71 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_METRICS_TYPES_UTIL_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_METRICS_TYPES_UTIL_H_
+
+#include <cstddef>
+#include <functional>
+#include <string>
+
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/metrics/converter_error_data.pb.h"
+
+namespace mlir {
+namespace TFL {
+
+// The hash function for mlir::Location.
+struct LocationHash {
+  std::size_t operator()(const Location& v) const noexcept {
+    return hash_value(v);
+  }
+};
+
+// The hash function for ConverterErrorData.
+struct ConverterErrorDataHash {
+  std::size_t operator()(
+      const tflite::metrics::ConverterErrorData& v) const noexcept {
+    std::size_t hash_result = std::hash<std::string>{}(v.error_message());
+    if (v.has_subcomponent()) {
+      hash_result ^= std::hash<std::string>{}(v.subcomponent()) << 1;
+    }
+    if (v.has_error_code()) {
+      hash_result ^= std::hash<int>{}(v.error_code()) << 2;
+    }
+    if (v.has_operator_() && v.operator_().has_name()) {
+      hash_result ^= std::hash<std::string>{}(v.operator_().name()) << 3;
+    }
+    return hash_result;
+  }
+};
+
+// The comparison function for ConverterErrorData.
+struct ConverterErrorDataComparison {
+  std::size_t operator()(
+      const tflite::metrics::ConverterErrorData& a,
+      const tflite::metrics::ConverterErrorData& b) const noexcept {
+    return ConverterErrorDataHash()(a) == ConverterErrorDataHash()(b);
+  }
+};
+
+// Helper function to create a new ConverterErrorData.
+tflite::metrics::ConverterErrorData NewConverterErrorData(
+    const std::string& pass_name, const std::string& error_message,
+    tflite::metrics::ConverterErrorData::ErrorCode error_code,
+    const std::string& op_name, const Location& location);
+
+}  // namespace TFL
+}  // namespace mlir
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_METRICS_TYPES_UTIL_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/offset_buffer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/offset_buffer.h
new file mode 100644
index 00000000..79e9d3f9
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/offset_buffer.h
@@ -0,0 +1,31 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_OFFSET_BUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_OFFSET_BUFFER_H_ + +#include + +namespace tflite { + +// Check if the model is using custom_option_offset to store custom op +// buffers. When this field is not explicitly set by the user, then FlatBuffer +// will omit the field and interpret this as 0, to ensure this field is +// populated. The flatbuffer exporter will always set it to 1, and it's also not +// a valid buffer offset value. So it's only valid when it's > 1. +inline bool IsValidBufferOffset(const int64_t offset) { return offset > 1; } + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_OFFSET_BUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/converter_python_api.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/converter_python_api.h new file mode 100644 index 00000000..cfcba696 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/converter_python_api.h @@ -0,0 +1,74 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_CONVERTER_PYTHON_API_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_CONVERTER_PYTHON_API_H_ + +#include + +#include +#include + +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" + +namespace tflite { + +// Convert a model represented in `input_contents`. `model_flags_proto` +// describes model parameters. `flags_proto` describes conversion +// parameters (see relevant .protos for more information). Returns a string +// representing the contents of the converted model. When extended_return +// flag is set to true returns a dictionary that contains string representation +// of the converted model and some statistics like arithmetic ops count. +// `debug_info_str` contains the `GraphDebugInfo` proto. +PyObject* Convert(PyObject* model_flags_proto_txt_raw, + PyObject* converter_flags_proto_txt_raw, + PyObject* input_contents_txt_raw, + bool extended_return = false, + PyObject* debug_info_txt_raw = nullptr, + const tensorflow::quantization::PyFunctionLibrary* + quantization_py_function_library = nullptr); + +// Quantize the model with calibration data. 
Throw errors if `fully_quantize` +// is specified by the calibration data are not sufficient to quantize the +// model. +PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, + bool fully_quantize, int inference_type, + int input_data_type, int output_data_type, + bool enable_numeric_verify = false, + bool enable_whole_model_verify = false, + PyObject* op_denylist = nullptr, + PyObject* node_denylist = nullptr, + bool enable_variable_quantization = false, + bool disable_per_channel_for_dense_layers = false, + PyObject* debug_options_proto_txt_raw = nullptr); + +// Sparsifies model to encode sparse tensors with proper format. Throws error if +// sparsification fails. +PyObject* MlirSparsifyModel(PyObject* data); + +// Registers the given custom opdefs to TensorFlow global op registry. +PyObject* RegisterCustomOpdefs(PyObject* list); + +// Returns the collected TFLite conversion errors. +std::vector RetrieveCollectedErrors(); + +// Returns MLIR string dump of the given Flatbuffer model. +std::string FlatBufferFileToMlir(const std::string& model, + bool input_is_filepath); + +// All the exported functions should be listed in +// tensorflow/tools/def_file_filter/symbols_pybind.txt for the Windows build. +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_CONVERTER_PYTHON_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.h new file mode 100644 index 00000000..3164265f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.h @@ -0,0 +1,30 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_FLATBUFFER_TO_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_FLATBUFFER_TO_MLIR_H_ + +#include + +namespace tensorflow { + +// Translates the given FlatBuffer filename or buffer into MLIR and returns +// translated MLIR as string. +std::string FlatBufferFileToMlir(const std::string& model_file_or_buffer, + bool input_is_filepath); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_FLATBUFFER_TO_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h new file mode 100644 index 00000000..a1a73863 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" + +namespace tensorflow { + +// Converts the given GraphDef to a TF Lite FlatBuffer string according to the +// given model flags, converter flags and debug information. Returns error +// status if it fails to convert the input. +absl::Status ConvertGraphDefToTFLiteFlatBuffer( + const tflite::ModelFlags& model_flags, + tflite::ConverterFlags& converter_flags, const GraphDebugInfo& debug_info, + const GraphDef& input, std::string* result); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.h new file mode 100644 index 00000000..f98a3522 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.h @@ -0,0 +1,50 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_ + +#include + +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/stateful_error_reporter.h" + +namespace tflite_migration { +namespace interpreter_wrapper { + +class PythonErrorReporter : public tflite_migration::StatefulErrorReporter { + public: + PythonErrorReporter() = default; + + // Report an error message + int Report(const char* format, va_list args) override; + + // Sets a Python runtime exception with the last error and + // clears the error message buffer. + PyObject* exception(); + + // Gets the last error message and clears the buffer. 
+ std::string message() override; + + private: + std::stringstream buffer_; +}; + +} // namespace interpreter_wrapper +} // namespace tflite_migration +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_utils.h new file mode 100644 index 00000000..8afc03ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_ + +#include + +#include + +namespace mlirlite { +namespace python_utils { + +struct PyDecrefDeleter { + void operator()(PyObject* p) const { Py_DECREF(p); } +}; + +int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length); +PyObject* ConvertToPyString(const char* data, size_t length); + +} // namespace python_utils +} // namespace mlirlite +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h new file mode 100644 index 00000000..9008560f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_JAX_TO_TFL_FLATBUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_JAX_TO_TFL_FLATBUFFER_H_ + +#include + +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Converts the given Jax model to a TF Lite FlatBuffer +// string according to the given model flags, converter flags and tags. 
Returns +// error status if it fails to convert the input. +absl::Status ConvertJaxToTFLiteFlatBuffer( + const std::string& input, const tflite::ModelFlags& model_flags, + tflite::ConverterFlags& converter_flags, string* result); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_JAX_TO_TFL_FLATBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h new file mode 100644 index 00000000..92801047 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ + +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Converts the given saved_model(either v1 or v2) to a TF Lite FlatBuffer +// string according to the given model flags, converter flags and tags. Returns +// error status if it fails to convert the input. +absl::Status ConvertSavedModelToTFLiteFlatBuffer( + const tflite::ModelFlags& model_flags, + tflite::ConverterFlags& converter_flags, string* result, + const quantization::PyFunctionLibrary* quantization_py_function_lib); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h new file mode 100644 index 00000000..de1e33f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ + +#include +#include +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/lite/types.pb.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace internal { + +// Register all custom ops including user specified custom ops. +absl::Status RegisterAllCustomOps( + const tflite::ConverterFlags& converter_flags); + +// Populate quantization specs (or not) given user specified ranges for each +// input arrays. +absl::Status PopulateQuantizationSpecs( + const tflite::ModelFlags& model_flags, + tflite::ConverterFlags& converter_flags, + mlir::quant::QuantizationSpecs* quant_specs, + std::vector* node_names, std::vector* node_dtypes, + std::vector>>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs); + +// Convert imported MLIR file to TfLite flatbuffer. +// This will also run relevant passes as well. +absl::Status ConvertMLIRToTFLiteFlatBuffer( + const tflite::ModelFlags& model_flags, + tflite::ConverterFlags& converter_flags, + std::unique_ptr&& context, + mlir::OwningOpRef module, + const mlir::TFL::PassConfig& pass_config, + const std::unordered_set& saved_model_tags, string* result, + const quantization::PyFunctionLibrary* quantization_py_function_lib); + +// Give a warning for any unused flags that have been specified. +void WarningUnusedFlags(const tflite::ModelFlags& model_flags, + const tflite::ConverterFlags& converter_flags); +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/device_target.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/device_target.h new file mode 100644 index 00000000..01072c50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/device_target.h @@ -0,0 +1,196 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_DEVICE_TARGET_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_DEVICE_TARGET_H_ + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/quantization/numerical_utils.h" + +namespace mlir { +namespace quant { + +class QuantizeContext; + +using AdjacentOperations = llvm::SmallVectorImpl; +using QuantizedMultipliers = llvm::SmallVector; +using QuantizedRanges = llvm::SmallVector; +using ScaleFn = std::function; + +using ScaleDecomposeFn = + std::function; + +static const QuantizedMultiplier kUnitQuantizedMultiplier{1, 0}; + +enum class ScaleConstraintType { + OutputInputSameScale, + OutputInputFreeScale, + CustomScale, +}; + +// Each kernel signature has its own specification for scales. +struct KernelSpec { + // Scale constraint + ScaleConstraintType type; + + // Custom function to derive the scales. Only available when the scale + // constraint is `CustomScale`. + ScaleFn scale_fn; +}; + +class KernelSpecs { + public: + using Signature = llvm::SmallVector; + + // Returns the kernel specification for the kernel signature. + std::optional Find(const Signature& signature) const { + auto spec_it = all_signatures_.find(signature); + if (spec_it != all_signatures_.end()) { + return spec_it->second; + } else { + return std::nullopt; + } + } + + ScaleDecomposeFn GetDecomposeFn() const { return decompose_fn_; } + + // Adds the kernel signature with the kernel specification. + LogicalResult Add(const Signature& signature, const KernelSpec& spec) { + if (all_signatures_.insert({signature, spec}).second) return success(); + return failure(); + } + + KernelSpecs& WithSignature(const KernelSpecs::Signature& signature, + const ScaleFn& fn) { + (void)Add(signature, {ScaleConstraintType::CustomScale, fn}); + return *this; + } + + KernelSpecs& WithImpl(const ScaleDecomposeFn& dfn) { + decompose_fn_ = dfn; + return *this; + } + + private: + // The signature is pattern match based. + struct SignatureInfo : public llvm::DenseMapInfo { + static inline Signature getEmptyKey() { return {}; } + static inline Signature getTombstoneKey() { return {nullptr}; } + static unsigned getHashValue(Signature val) { + return llvm::hash_combine_range(val.begin(), val.end()); + } + static bool isEqual(Signature LHS, Signature RHS) { + if (RHS == getEmptyKey()) return LHS == getEmptyKey(); + if (RHS == getTombstoneKey()) return LHS == getTombstoneKey(); + if (LHS.size() != RHS.size()) return false; + for (auto arg : llvm::zip(LHS, RHS)) { + if (std::get<0>(arg) != std::get<1>(arg)) return false; + } + return true; + } + }; + + // Maps the signature to the kernel spec. 
Note that the matching is + // pattern match based. + llvm::DenseMap all_signatures_; + + // A method to compute the effective multipliers. This is independent on the + // bits of the ports, thus all the signature shares the same here. + ScaleDecomposeFn decompose_fn_; +}; + +class DeviceTarget { + public: + explicit DeviceTarget(MLIRContext* ctx); + + // Retrieves the kernel spec for the quant region op. + std::optional GetKernelSpec( + llvm::StringRef kernel, const KernelSpecs::Signature& signature) const; + + // Retrieves the scale decomposition function for the quant region op. + ScaleDecomposeFn GetDecomposeFn(quantfork::QuantizeRegionOp op) const; + + // converts specification to signature: + // - UniformedQuantizedType -> AnyQuantizedType + // - AnyQuantizedType (int) -> AnyQuantizedType + // - Float -> {} + static void AppendToSignature(Type spec, KernelSpecs::Signature* signature); + + protected: + // Adds the kernel spec with the custom scale function for the kernel. + LogicalResult RegisterKernel(llvm::StringRef kernel, + const KernelSpecs::Signature& signature, + const ScaleFn& fn, const ScaleDecomposeFn& dfn); + + // Adds the kernel spec with the scale constraint type for the kernel. + LogicalResult RegisterKernel(llvm::StringRef kernel, + const KernelSpecs::Signature& signature, + ScaleConstraintType constraint); + + // Adds the kernel with the name. Retrun an existing one if it has been + // added before. + KernelSpecs& RegisterKernel(llvm::StringRef kernel) { return specs_[kernel]; } + + // For "mulmat->add" type of kernels, convert the scales of all the ports to + // multipliers. + static LogicalResult DecomposeMultiplyAccumulateScale( + Operation* op, QuantizedMultipliers* input_multipliers, + QuantizedMultipliers* output_multipliers, QuantizedRanges* output_ranges); + + // For "reshape" type of kernels. + static LogicalResult DecomposeSameScale( + Operation* op, QuantizedMultipliers* input_multipliers, + QuantizedMultipliers* output_multipliers, QuantizedRanges* output_ranges); + + // A set of parameters are required to build the signatures. + FloatType f32_; + IntegerType i8_, i32_; + int64_t i8_min_, i8_max_, i32_min_, i32_max_; + quant::AnyQuantizedType any_, qi8_, qi8n_, qi32_; + + private: + // Maps the kernel names to all the available kernels. + llvm::StringMap specs_; + + // Points to the global MLIRContext. + MLIRContext* ctx_; +}; + +} // namespace quant +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_DEVICE_TARGET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/Passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/Passes.h new file mode 100644 index 00000000..06f4697f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/Passes.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +// +// This file defines all of the passes owned by the quantization dialect. As +// things mature, it is expected that passes specific to certain frontend or +// backend dialects will move to those dialects directly. For now, they are +// incubated here. +// +//===----------------------------------------------------------------------===// + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_PASSES_H_ + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace func { +class FuncOp; +} // namespace func + +namespace quantfork { + +/// Creates a pass that converts quantization simulation operations (i.e. +/// FakeQuant and those like it) to casts into/out of supported QuantizedTypes. +std::unique_ptr> createConvertSimulatedQuantPass(); + +/// Creates a pass that converts constants followed by a qbarrier to a +/// constant whose value is quantized. This is typically one of the last +/// passes done when lowering to express actual quantized arithmetic in a +/// low level representation. Because it modifies the constant, it is +/// destructive and cannot be undone. +std::unique_ptr> createConvertConstPass(); + +//===----------------------------------------------------------------------===// +// Registration +//===----------------------------------------------------------------------===// + +/// Generate the code for registering passes. +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h.inc" + +} // namespace quantfork +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h new file mode 100644 index 00000000..bee081a1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTOPS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTOPS_H_ + +#include "llvm/Support/MathExtras.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOpsDialect.h.inc" +#define GET_OP_CLASSES + +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTOPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h new file mode 100644 index 00000000..bfc6afb8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTIZEUTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTIZEUTILS_H_ + +namespace mlir { +class Attribute; +class Type; + +namespace quant { +class QuantizedType; +class UniformQuantizedType; +} // namespace quant +namespace quantfork { +class UniformQuantizedValueConverter; + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType(). +/// Returns nullptr if the conversion is not supported. On success, stores the +/// converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. 
realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttr(Attribute realValue, + quant::QuantizedType quantizedElementType, + Type &outConvertedType); + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType() and casted to an +/// UniformQuantizedType. Returns nullptr if the conversion is not supported. On +/// success, stores the converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttrUniform(Attribute realValue, + quant::UniformQuantizedType quantizedElementType, + const UniformQuantizedValueConverter &converter, + Type &outConvertedType); +} // namespace quantfork +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_QUANTIZEUTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h new file mode 100644 index 00000000..9257f533 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_MODEL_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/lite/debug/debug_options.pb.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { + +// Quantizes the input model represented as `model_buffer` and writes the result +// to the `output_buffer`. Both `model_buffer` and `output_buffer` should be a +// valid FlatBuffer format for Model supported by TFLite. +// +// The `input_type`, `output_type` and `inference_type` can be float32 / qint8 / +// int8 / int16. +// +// Returns a partially quantized model if `fully_quantize` is false. Returns a +// non-OK status if the quantization fails. 
+// +// When `verify_numeric` is true, the model will have it's original float ops +// and NumericVerify ops to compare output values from the quantized and float +// ops. +// +// When `legacy_float_scale` is true, the quantizer will use float scale instead +// of double, and call TOCO's quantization routines to maintain bit-exactness of +// the values with the TOCO quantizer. +absl::Status QuantizeModel( + absl::string_view model_buffer, const tflite::TensorType &input_type, + const tflite::TensorType &output_type, + const tflite::TensorType &inference_type, + const std::unordered_set &operator_names, + bool disable_per_channel, bool fully_quantize, std::string &output_buffer, + bool verify_numeric = false, bool whole_model_verify = false, + bool legacy_float_scale = true, + const absl::flat_hash_set &denylisted_ops = {}, + const absl::flat_hash_set &denylisted_nodes = {}, + bool enable_variable_quantization = false, + bool disable_per_channel_for_dense_layers = false, + const std::optional + &debug_options = std::nullopt); + +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h new file mode 100644 index 00000000..65d044e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h @@ -0,0 +1,91 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_WEIGHTS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_WEIGHTS_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { + +// Supported resulting types from quantization process. +enum class BufferType { QUANTIZED_INT8, QUANTIZED_FLOAT16 }; + +// Stores information about how to quantize a user-specified custom operation. +// CustomOpInfo contains info of its corresponding CustomOp registered in the +// CustomOpMap. 'quantizable_input_indices' is used to determine which indices +// of the CustomOp are quantizable. 'is_weight_only' is used specify whether the +// custom op is quantized only for storage and dequantized at runtime. +// 'no_side_effect' is used to determine whether the op can be pruned if +// considered as trivially dead. +struct CustomOpInfo { + std::vector quantizable_input_indices; + bool is_weight_only = false; + bool no_side_effect = true; +}; + +using BuiltinOperatorSet = absl::flat_hash_set; +// Map from custom op code to custom op quantization information. 
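A caller-side sketch of the QuantizeModel entry point declared just above. The wrapper name, the include paths, and the choice of a float32 model interface with int8 inference are assumptions for illustration; only the parameters without default values are passed:

    #include <string>

    #include "absl/status/status.h"
    #include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h"
    #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"

    // `model_data` is assumed to hold the bytes of a valid .tflite flatbuffer.
    // Performs full integer quantization with a float32 model interface and
    // int8 inference, writing the quantized flatbuffer into `output_buffer`.
    absl::Status QuantizeToInt8(const std::string& model_data,
                                std::string& output_buffer) {
      return mlir::lite::QuantizeModel(
          model_data, tflite::TensorType_FLOAT32, tflite::TensorType_FLOAT32,
          tflite::TensorType_INT8, /*operator_names=*/{},
          /*disable_per_channel=*/false, /*fully_quantize=*/true,
          output_buffer);
    }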
+using CustomOpMap = std::unordered_map; + +// Applies dynamic range quantization for the given model wehre the input_model +// type is flatbuffer but is converted to MLIR during quantization process and +// then converted back to flatbuffer for return. Note that this is part of +// reaching feature parity with the old quantizer for dynamic range +// quantization, specifically for +// third_party/tensorflow/lite/tools/optimize/quantize_weights.h. +// TODO(b/202468183): Selective quantization + quant debugger support for +// dynamic range quantization for verify_numeric and whole_model_verify flags. +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const tflite::Model* input_model, + const tflite::TensorType& inference_type, + const absl::flat_hash_set& denylisted_ops, + const CustomOpMap& custom_op_map, + int64_t minimum_elements_for_weights = 1024, + bool disable_per_channel = false, bool weight_only_quantization = false, + bool legacy_float_scale = false); + +// Overloading methods to support old quantizer versions API +absl::Status QuantizeWeights(flatbuffers::FlatBufferBuilder* builder, + const tflite::Model* input_model, + int64_t weights_min_num_elements, + bool use_hybrid_evaluation = true); + +absl::Status QuantizeWeights(flatbuffers::FlatBufferBuilder* builder, + const tflite::Model* input_model, + BufferType quant_type = BufferType::QUANTIZED_INT8, + bool use_updated_hybrid_scheme = true); + +absl::Status QuantizeWeights(flatbuffers::FlatBufferBuilder* builder, + const tflite::Model* input_model, + int64_t weights_min_num_elements, + const CustomOpMap& custom_op_map, + bool use_updated_hybrid_scheme = true, + const BuiltinOperatorSet& op_denylist = {}); + +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_QUANTIZE_WEIGHTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/test_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/test_util.h new file mode 100644 index 00000000..8953a384 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/test_util.h @@ -0,0 +1,145 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TEST_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TEST_UTIL_H_ + +#include + +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" + +namespace mlir { +namespace lite { +namespace internal { +// Test model with a single convolution. +// Floating point weights of the model are all integers and lie in +// range[-127, 127]. The weights have been put in such a way that each +// channel has at least one weight as -127 and one weight as 127. +// The activations are all in range: [-128, 127] +// This means all bias computations should result in 1.0 scale. 
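A sketch of weight quantization through the mlir::lite::QuantizeWeights overload declared above that takes a BufferType. The wrapper function, the include paths, and the copy of the builder contents into a string are illustrative assumptions:

    #include <string>

    #include "absl/status/status.h"
    #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
    #include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h"
    #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"

    // `model_data` is assumed to hold a valid .tflite flatbuffer.
    absl::Status QuantizeWeightsToInt8(const std::string& model_data,
                                       std::string& output_buffer) {
      const tflite::Model* input_model = tflite::GetModel(model_data.data());
      flatbuffers::FlatBufferBuilder builder;
      absl::Status status = mlir::lite::QuantizeWeights(
          &builder, input_model, mlir::lite::BufferType::QUANTIZED_INT8,
          /*use_updated_hybrid_scheme=*/true);
      if (!status.ok()) return status;
      // Copy the quantized model out of the builder.
      output_buffer.assign(
          reinterpret_cast<const char*>(builder.GetBufferPointer()),
          builder.GetSize());
      return absl::OkStatus();
    }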
+extern const char* kConvModelWithMinus128Plus127Weights; + +// Test model with single convolution where all weights are integers between +// [0, 10] weights are randomly distributed. It is not guaranteed that min max +// for weights are going to appear in each channel. +// Activations have min = 0, max = 10. +extern const char* kConvModelWith0Plus10Weights; + +// Test model where no bias is in the conv. +extern const char* kConvModelWithNoBias; + +// A floating point model with a single softmax. The input tensor has min +// and max in range [-5, 5], not necessarily -5 or +5. +extern const char* kSingleSoftmaxModelMinMinus5MaxPlus5; + +// A floating point model with a single average pool. The input tensor has min +// and max in range [-5, 5], not necessarily -5 or +5. +extern const char* kSingleAvgPoolModelMinMinus5MaxPlus5; + +// Test model with a weights variable that is shared between a convolution layer +// and an add operation. +extern const char* kModelWithSharedWeights; + +// Test model with Add followed by a reshape. Model has 2 inputs for add. +extern const char* kMultiInputAddWithReshape; + +// Test gather operation with quantized input. +extern const char* kQuantizedWithGather; + +// Test model with a tf.constant input to tf.add. Model has 2 inputs one +// constant and other placeholder. +extern const char* kConstInputAddModel; + +// A float test model with concat that has [0, 5] and [0, 10] for inputs and [0, +// 10] as output. +extern const char* kFloatConcatMax5Max10Max10; + +// Test model with broadcast_to op. +extern const char* kModelWithBroadcastToOp; + +// Test model with a custom op. +extern const char* kModelWithCustomOp; + +// Test model with a argmax op. +extern const char* kModelWithArgMaxOp; + +// Test model with a fully connected op. +extern const char* kModelWithFCOp; + +// Test model with a gather_nd op. +extern const char* kModelWithGatherNDOp; + +// Test model with a Where op. +extern const char* kModelWithWhereOp; + +// Test model with mixed quantizable and un-quantizable ops. +// reshape->custom->custom->squeeze. +extern const char* kModelMixed; + +// Test model with mixed quantizable and +// and un-quantizable ops for +// activations in 16-bit. +extern const char* kModelMixed16x8; + +// Test model with split op. +extern const char* kModelSplit; + +// Test model with pack op. +extern const char* kModelPack; + +// Test model with LSTM op that has layer norm, has projection, without +// peephole, without cifg. +extern const char* kLstmCalibrated; +extern const char* kLstmQuantized; + +// Test model with LSTM op that has peephole, without layer norm, without +// projection, without cifg. +extern const char* kLstmCalibrated2; +extern const char* kLstmQuantized2; + +extern const char* kUnidirectionalSequenceLstmCalibrated; +extern const char* kUnidirectionalSequenceLstmQuantized; + +// Test model with a minimum op. +extern const char* kModelWithMinimumOp; + +// Test model with a maximum op. +extern const char* kModelWithMaximumOp; + +// Test model with a transpose op. +extern const char* kModelWithTranspose; + +// Test model with SVDF op. +extern const char* kSvdfCalibrated; +extern const char* kSvdfQuantized; + +// Test model with an unpack op. +extern const char* kModelWithUnpack; + +// Test QAT model with fc op. +extern const char* kQatModelWithFc; + +// Test calibrated model with resource variables. +extern const char* kModelWithResourceVarsCalibrated; + +// An error reporter that fails on testing. 
+class FailOnErrorReporter : public tflite::ErrorReporter { + public: + int Report(const char* format, va_list args) override; +}; +} // namespace internal +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h new file mode 100644 index 00000000..94742d11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TFL_TO_STD_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TFL_TO_STD_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// Converts all the tfl.quantize/tfl.dequantize ops to the ops in the mlir.quant +// dialect ones in the function. +void ConvertTFLQuantOpsToMlirQuantOps(func::FuncOp func); + +// Converts all the mlir.quant dialect ops to the tfl.quantize/tfl.dequantize +// ops in the function. +void ConvertMlirQuantOpsToTFLQuantOps(func::FuncOp func); + +// A helper class to convert target function to another representation using +// `ConvertForward` function during construction and convert target function +// back to the original representation using `ConvertBackward` function during +// deconstruction. +template +class ScopedOpsConverter { + public: + explicit ScopedOpsConverter(func::FuncOp func) : func_(func) { + ConvertForward(func_); + } + + ScopedOpsConverter(const ScopedOpsConverter&) = delete; + ScopedOpsConverter operator=(const ScopedOpsConverter&) = delete; + ScopedOpsConverter(const ScopedOpsConverter&&) = delete; + ScopedOpsConverter operator=(const ScopedOpsConverter&&) = delete; + + ~ScopedOpsConverter() { ConvertBackward(func_); } + + private: + func::FuncOp func_; +}; + +using ScopedTFLQuantOpsToMlirQuantOpsConverter = + ScopedOpsConverter; +using ScopedMlirQuantOpsToTFLQuantOpsConverter = + ScopedOpsConverter; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TFL_TO_STD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/model_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/model_utils.h new file mode 100644 index 00000000..5841a4c7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/model_utils.h @@ -0,0 +1,60 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
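A short sketch of the RAII aliases defined above in tfl_to_std.h: the converter rewrites tfl.quantize/tfl.dequantize ops into the mlir.quant dialect on construction and restores them on destruction. The surrounding pass logic is assumed:

    #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
    #include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h"

    void RunOnQuantDialect(mlir::func::FuncOp func) {
      // Construction calls ConvertTFLQuantOpsToMlirQuantOps(func).
      mlir::TFL::ScopedTFLQuantOpsToMlirQuantOpsConverter converter(func);
      // ... logic that expects mlir.quant ops goes here ...
    }  // Destruction calls ConvertMlirQuantOpsToTFLQuantOps(func).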
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file is the MLIR copy of part of +// third_party/tensorflow/lite/tools/optimize/model_utils.h as part of the +// effort to decouple TFLite from MLIR. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_MODEL_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_MODEL_UTILS_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { +namespace toco_legacy { + +using std::string; +using tflite::ModelT; +using tflite::OperatorT; +using tflite::TensorT; +using tflite::TensorType; + +// LINT.IfChange(MakeDequantizeOperator) +// Creates a Dequantize OperatorT object. +void MakeDequantizeOperator(ModelT* model, std::unique_ptr* op, + int32_t input, int32_t output); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/model_utils.h:MakeDequantizeOperator) + +// LINT.IfChange(MakeTensor) +// Create a new TensorT object without quantization parameters. +void MakeTensor(const string& name, const std::vector& shape, + const std::vector& shape_signature, + const TensorType& type, std::unique_ptr* tensor); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/model_utils.h:MakeTensor) + +// LINT.IfChange(HasMinMax) +bool HasMinMax(const TensorT* tensor); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/model_utils.h:HasMinMax) + +} // namespace toco_legacy +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_MODEL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h new file mode 100644 index 00000000..7bc80a1b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file is the MLIR copy of part of +// third_party/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +// as part of the effort to decouple TFLite from MLIR. 
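A sketch of MakeTensor and MakeDequantizeOperator declared above in model_utils.h. The element types of the vector and unique_ptr parameters (int32_t and TensorT/OperatorT) are assumed here, and the shape values are arbitrary:

    #include <cstdint>
    #include <memory>
    #include <utility>
    #include <vector>

    #include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/model_utils.h"
    #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"

    // Appends a float32 output tensor and a Dequantize op reading
    // `quantized_tensor_index` to subgraph 0 of `model`.
    void AppendDequantize(tflite::ModelT* model,
                          int32_t quantized_tensor_index) {
      auto& subgraph = model->subgraphs[0];

      std::unique_ptr<tflite::TensorT> dequant_output;  // assumed TensorT
      mlir::lite::toco_legacy::MakeTensor(
          "dequantized", /*shape=*/{1, 224, 224, 3}, /*shape_signature=*/{},
          tflite::TensorType_FLOAT32, &dequant_output);
      subgraph->tensors.push_back(std::move(dequant_output));
      const int32_t output_index =
          static_cast<int32_t>(subgraph->tensors.size()) - 1;

      std::unique_ptr<tflite::OperatorT> dequant_op;  // assumed OperatorT
      mlir::lite::toco_legacy::MakeDequantizeOperator(
          model, &dequant_op, /*input=*/quantized_tensor_index,
          /*output=*/output_index);
      subgraph->operators.push_back(std::move(dequant_op));
    }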
+ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_PORTABLE_TENSOR_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_PORTABLE_TENSOR_UTILS_H_ + +#include + +namespace mlir { +namespace lite { +namespace toco_legacy { + +// LINT.IfChange(portable_symmetric_quantize_floats) +void PortableSymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor); + +void PortableSymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor); +// LINT.ThenChange(//tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h:portable_symmetric_quantize_floats) + +} // namespace toco_legacy +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_PORTABLE_TENSOR_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantization_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantization_utils.h new file mode 100644 index 00000000..bd68ed1c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantization_utils.h @@ -0,0 +1,110 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file is the MLIR copy of part of +// third_party/tensorflow/lite/tools/optimize/quantization_utils.h as part of +// the effort to decouple TFLite from MLIR. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZATION_UTILS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { +namespace toco_legacy { + +using tflite::ModelT; +using tflite::QuantizationParametersT; +using tflite::TensorT; +using tflite::TensorType; + +// LINT.IfChange(num_elements) +// Returns the number of elements in the given tensor. +absl::Status NumElements(const TensorT& tensor, uint64_t* num_elements); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:num_elements) + +// LINT.IfChange(fill_per_channel_min_max) +// Populates the max and min values for per channel quantization. +absl::Status FillPerChannelMinMax(const float* const input, + const std::vector& dimension, + int32_t channel_dim_index, + QuantizationParametersT* quantization_params); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:fill_per_channel_min_max) + +// LINT.IfChange(symmetric_per_channel_quantization) +// Per-channel quantize a tensor at the given index and returns both scales and +// quantized values. 
+// Parameters: +// - tensor is the tensor to be quantized, needed to access associated +// quantization parameters +// - input is the float input data to be quantized. +// - channel_dim_index is the channel index within "dimension". +// dimension[channel_dim_index] gives the number of channels. +// - output_scale is the output scale, the size of which equals the number of +// channels. +// - output_value is the output data, the size of which equals the number of +// inputs. +absl::Status SymmetricPerChannelQuantization(TensorT* tensor, + const float* const input, + int32_t channel_dim_index, + std::vector* output_scales, + std::vector* output_value); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:symmetric_per_channel_quantization) + +// LINT.IfChange(symmetric_per_channel_quantize_values) +// Quantize the values given an array of scales. +void SymmetricPerChannelQuantizeValues(const float* const input, + const std::vector& scales_inv, + const std::vector& dimension, + int32_t channel_dim_index, + std::vector* output_value); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:symmetric_per_channel_quantize_values) + +// LINT.IfChange(symmetric_quantize_tensor) +// Quantizes tensor using symmetric quantization with the min and max elements +// of the tensor. +absl::Status SymmetricQuantizeTensor(ModelT* model, TensorT* tensor); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:symmetric_quantize_tensor) + +// LINT.IfChange(symmetric_quantize_tensor_per_channel) +// Quantizes tensor with per channel. +absl::Status SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor, + int32_t channel_dim_index); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:symmetric_quantize_tensor_per_channel) + +// LINT.IfChange(quantize_tensor_float16) +// Quantizes tensor to float16. +absl::Status QuantizeTensorFloat16(ModelT* model, TensorT* tensor); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:quantize_tensor_float16) + +// LINT.IfChange(add_quantization_params) +absl::Status AddQuantizationParams(const std::vector& scales, + const std::vector& zero_point, + int quantized_dimension, + const uint8_t* buffer_data, + size_t buffer_size, TensorType output_type, + ModelT* model, TensorT* tensor); +// LINT.ThenChange(//tensorflow/lite/tools/optimize/quantization_utils.h:add_quantization_params) + +} // namespace toco_legacy +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.h new file mode 100644 index 00000000..039c18d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.h @@ -0,0 +1,109 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZE_WEIGHTS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZE_WEIGHTS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { +namespace toco_legacy { + +using ::tflite::BuiltinOperator; +using ::tflite::Model; + +// Supported resulting types from quantization process. +enum class BufferType { QUANTIZED_INT8, QUANTIZED_FLOAT16 }; +enum class QuantizerType { OLD_QUANTIZER, MLIR_QUANTIZER }; + +// Stores information about how to quantize a user-specified custom operation. +struct CustomOpInfo { + std::vector quantizable_input_indices; + bool is_hybrid; +}; + +// Map from custom op code to custom op quantization information. +using CustomOpMap = std::unordered_map; + +// This macro is for internal use for conversions requiring previous behavior. +#ifdef TFLITE_USE_PREVIOUS_HYBRID_SCHEME +// Use asymmetric quantized activations and per-channel quantized weights. +constexpr bool kUseUpdatedHybridSchemeDefault = false; +#else +// Use symmetric quantized activations and per-channel quantized weights. +constexpr bool kUseUpdatedHybridSchemeDefault = true; +#endif + +// Quantizes input_model and populates the provided builder with the new model. +// By default only weights tensors weight more than 1024 elements will be +// quantized. +// +// A tflite::Model can be obtained from the builder with: +// const uint8_t* buffer = builder->GetBufferPointer(); +// tflite::Model* model = GetModel(buffer); +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const Model* input_model, + BufferType quant_type = BufferType::QUANTIZED_INT8, + bool use_updated_hybrid_scheme = kUseUpdatedHybridSchemeDefault, + QuantizerType quantizer_type = QuantizerType::OLD_QUANTIZER); + +// Same as above, but only weights with greater than or equal +// weights_min_num_elements elements will be quantized. +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const Model* input_model, + uint64_t weights_min_num_elements, + QuantizerType quantizer_type = QuantizerType::OLD_QUANTIZER); + +// Same as above, but with entry point of quantizing custom ops. +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const Model* input_model, + uint64_t weights_min_num_elements, const CustomOpMap& custom_op_map, + QuantizerType quantizer_type = QuantizerType::OLD_QUANTIZER); + +// Same as above, but if use updated_hybrid_scheme is false, +// use previous quantization scheme. Optional op_denylist argument +// disables hybrid evaluation for provided BuiltinOperators. +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const Model* input_model, + uint64_t weights_min_num_elements, const CustomOpMap& custom_op_map, + bool use_updated_hybrid_scheme, + const absl::flat_hash_set& op_denylist = {}, + QuantizerType quantizer_type = QuantizerType::OLD_QUANTIZER); + +namespace internal { +// If use_hybrid_evaluation is false, will disable using hybrid eval for +// operations that support it. 
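A sketch of the simplest toco_legacy QuantizeWeights overload declared above, followed by reading the result back out of the builder in the way the header comment above describes. The float16 buffer type and the wrapper function are illustrative choices:

    #include <cstdint>

    #include "absl/status/status.h"
    #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
    #include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.h"
    #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"

    absl::Status QuantizeWeightsToFloat16(
        const tflite::Model* input_model,
        flatbuffers::FlatBufferBuilder* builder) {
      absl::Status status = mlir::lite::toco_legacy::QuantizeWeights(
          builder, input_model,
          mlir::lite::toco_legacy::BufferType::QUANTIZED_FLOAT16,
          mlir::lite::toco_legacy::kUseUpdatedHybridSchemeDefault);
      if (!status.ok()) return status;
      // Obtain the quantized tflite::Model from the builder, as noted above.
      const uint8_t* buffer = builder->GetBufferPointer();
      const tflite::Model* quantized_model = tflite::GetModel(buffer);
      (void)quantized_model;  // e.g. hand off to a verifier or serializer
      return absl::OkStatus();
    }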
+// +// We use this internal QuantizeWeights call to test models with hybrid +// evaluation disabled. +absl::Status QuantizeWeights( + flatbuffers::FlatBufferBuilder* builder, const Model* input_model, + uint64_t weights_min_num_elements, bool use_hybrid_evaluation, + QuantizerType quantizer_type = QuantizerType::OLD_QUANTIZER); +} // namespace internal + +} // namespace toco_legacy +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_LITE_TOCO_LEGACY_QUANTIZE_WEIGHTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h new file mode 100644 index 00000000..d938cd2c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/numerical_utils.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ + +#include +#include +#include + +#include "absl/types/optional.h" + +namespace mlir { +namespace quant { + +using QuantizedMultiplier = std::pair; +using QuantizedRange = std::pair; + +// Decompose double precision multiplier to integer multiplier and exponent. +// double_multiplier = int_multiplier * 2 ^ (-31 + exponent) +// int_multiplier will be range of (2^31, 2^30]. +QuantizedMultiplier QuantizeMultiplier(double double_multiplier); + +// Calculate the effective quantized value range for the scale, zero point. The +// range is the minimum range defined by [rmin, rmax] and [qmin, qmax]. +QuantizedRange CalculateQuantizedRange(double scale, int32_t zero_point, + std::optional rmin, + std::optional rmax, int32_t qmin, + int32_t qmax); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_NUMERICAL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_context.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_context.h new file mode 100644 index 00000000..a1f40f86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_context.h @@ -0,0 +1,245 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONTEXT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONTEXT_H_ + +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/device_target.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" + +namespace mlir { +namespace quant { + +static bool EmptyParams(QuantParams p) { return p == quant::QuantizedType(); } + +// The state for each op result during the quantization parameters propagation. +struct QuantState { + // Quantization parameters propagated to an op result. + QuantParams params; + // A flag indicates this state (the params) shouldn't be changed after it is + // initialized. This flag will be set to true if the quantization parameters + // are from the quantization-aware training. + const bool immutable; + + bool IsEmpty() { return EmptyParams(params); } +}; + +// The state for rescaling the propagated quantization parameters. This can be +// on the input side to satisfy the constraint of previous operation, or on the +// output side to satisfy the constraint of the next operation. +struct RequantizeState { + // Sometimes, we have to "requantize" the quantization result to satisfy all + // the constraints. The "requantize" can happen either on the input or output + // of the quantization result. + enum RequantizePosition { + NO_REQUANTIZE, + ON_INPUT, + ON_OUTPUT + } pos = NO_REQUANTIZE; + + // Quantization parameters will be used to add the requantize ops. + QuantParams params; +}; + +// This class manages all the intermediate quantization states. +class QuantizeContext { + public: + QuantizeContext(func::FuncOp func, const DeviceTarget &spec); + + // Returns all the quant region ops. + std::vector GetAllOps(); + + // For each quant region op, propagates its quantization parameters according + // to the kernel specification and also returns the adjacent quant region ops + // which get the new quantization parameters propagated. + LogicalResult Handle(quantfork::QuantizeRegionOp op, + llvm::SmallVectorImpl *new_items, + bool *changed); + + // Updates the port quantization specifications of all the quant region ops + // with the propagation results. + LogicalResult Finalize(); + + // Dumps the states stores in the state manager. + void DumpStates(quantfork::QuantizeRegionOp current_op = {}); + + // Update the quantization parameter for certain result of the op. By this + // method, the quantization parameter is propagated to all the users of the + // result as well. + bool SetResultParams(Operation *op, int index, QuantParams params) { + return states_manager_.SetResultParams(op, index, params); + } + + // Update the quantization parameter for certain operand of the op. By this + // method, the quantization parameter is propagated to the defining op of + // operand as well. 
+ bool SetOperandParams(Operation *op, int index, QuantParams params) { + return states_manager_.SetOperandParams(op, index, params); + } + + // Return the quantization parameter of certain result of the op. + QuantParams GetResultParams(Operation *op, int index) { + return states_manager_.GetResultParams(op, index); + } + + // Return the quantization parameter of certain operand of the op. + QuantParams GetOperandParams(Operation *op, int index) { + return states_manager_.GetOperandParams(op, index); + } + + // Return the signature of the op. + KernelSpecs::Signature GetSignature(quantfork::QuantizeRegionOp op); + + // A heuristic to get quantization parameters satisfies the same scale + // constraints: + // - If there are immutable states, + // - use the single input, or, + // - use the single output, or, + // - use the first one in the collection, + // - use the single input if it is ready, or, + // - use the single output if it is ready, or, + // - use the first ready one in the collection. + QuantParams GetQuantParamsForSameScaleConstraint(Operation *op); + + // Propagate `params` to all the quantizable port of the `op`. The adjacent + // ops, which have the parameters propagated to, are collected by `new_items`, + // so they can be added to the working queue. `changed` is set to true if + // there are any new elements being added to `new_items`. + LogicalResult PropagateQuantParams(Operation *op, const QuantParams params, + AdjacentOperations *new_items, + bool *changed); + + private: + class StatesManager { + public: + // Sets the quantization parameters of the constant result according to its + // content. + // + // Always returns true. + bool SetConstantResultParams(Operation *op); + + // Sets the quantization parameters of the result to a fixed value. If any + // quantization parameters have been propagated, a `requantize` will happen + // on the input of propagated quantization. + // + // Returns true, if the users of the result needs to be added to the + // worklist. + bool SetResultParams(Operation *op, int index, QuantParams params); + + // Sets the quantization parameters of the operand to a fixed value. If any + // quantization parameters have been propagated, a `requantize` will happen + // on the output of propagated quantization. + // + // Returns true, if the defining op of the operand needs to be added to the + // worklist. + bool SetOperandParams(Operation *op, int index, QuantParams params); + + // Returns the quantization parameters of the index-th result of the op. + QuantParams GetResultParams(Operation *op, int index) { + return states_[result_states_[{op, index}]].params; + } + + // Returns the quantization parameters of the index-th operand of the op. + QuantParams GetOperandParams(Operation *op, int index) { + return states_[operand_states_[{op, index}]].params; + } + + private: + friend class QuantizeContext; + + // Uses the type of `val` to set the initial state of the index-th result if + // `as_result` is true or index-th operand if `as_result` is false. The + // state is immutable if the type is a quantized type. Returns the index of + // this new state in the state vector. + int InitializeState(quantfork::QuantizeRegionOp op, int index, + bool as_result); + + // Sets the state of the index-th operand of the op. If this operand is + // cached, uses the cached result without creating new entry in the state + // vector. Otherwise, allocate a new entry in the state vector. 
+ void InitializeOperandState(quantfork::QuantizeRegionOp op, int index, + llvm::DenseMap *cache); + + // Sets the state of the index-th result of the op. If this result is + // cached, uses the cached result without creating new entry in the state + // vector. Otherwise, allocate a new entry in the state vector. + void InitializeResultState(quantfork::QuantizeRegionOp op, int index, + llvm::DenseMap *cache); + + // Returns the state of the index-th operand of the op. + QuantState &GetOperandQuantState(Operation *op, int index) { + return states_[operand_states_[{op, index}]]; + } + + // Returns the state of the index-th result of the op. + QuantState &GetResultQuantState(Operation *op, int index) { + return states_[result_states_[{op, index}]]; + } + + // Returns the state of the index-th operand of the op. + RequantizeState &GetOperandRequantizeState(Operation *op, int index) { + return rescale_states_[operand_states_[{op, index}]]; + } + + // Returns the state of the index-th result of the op. + RequantizeState &GetResultRequantizeState(Operation *op, int index) { + return rescale_states_[result_states_[{op, index}]]; + } + + private: + // This is used to identify an operand or result of an op. The second + // element of this pair is the index of the operand or result. + using OpValue = std::pair; + + // The vector contains all the quantization parameters propagated from the + // defining operations of the value, or from the quantization aware + // training. + std::vector states_; + + // The map contains all the quantization parameters which are required to + // satisfy the same operands and results constraint. The keys of this map + // are the values from `operand_states_` and `result_state_`. + std::unordered_map rescale_states_; + + // Maps of indexes to the propagation state vector from the ops operands, + // results and arguments. + llvm::DenseMap operand_states_; + llvm::DenseMap result_states_; + }; + + func::FuncOp func_; + + DeviceTarget target_spec_; + + StatesManager states_manager_; +}; + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_passes.h new file mode 100644 index 00000000..5c119a65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/quantization_passes.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_PASSES_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace mlir { +namespace quant { + +using OperationToName = std::function; + +// Creates an instance pass to import quantization stats to the operations in +// the function. A custom method to get the name from the op is used because +// different dialect ops might have different ways to assign the name. +std::unique_ptr> CreateImportQuantStatsPass( + OperationToName op_to_name, const std::string& stats_str); + +// Creates an instance pass to import quantization stats to the operations in +// the function. A custom method to get the name from the op is used because +// different dialect ops might have different ways to assign the name. +std::unique_ptr> +CreateImportQuantStatsPassForTFControlDialect(const std::string& stats_str); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h new file mode 100644 index 00000000..c55d59ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h @@ -0,0 +1,60 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Adaptor functions for StableHLO Quantizer. +// Provides simpler interfaces when integrating StableHLO Quantizer into TFLite +// Converter. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" + +namespace tensorflow { + +// Runs quantization on `module_op`. `saved_model_bundle` is required to +// retrieve information about the original model (e.g. signature def mapping) +// because quantization requires exporting the intermediate `ModuleOp` back to +// SavedModel for calibration. Similarly, `saved_model_dir` is required to +// access the assets of the original model. `saved_model_tags` uniquely +// identifies the `MetaGraphDef`. `quantization_config` determines the behavior +// of StableHLO Quantizer. 
`quantization_py_function_lib` contains python +// implementations of certain APIs that are required for calibration. +// `module_op` is the input graph to be quantized and it should contain +// StableHLO ops. +// +// Returns a quantized `ModuleOp` in StableHLO, potentially wrapped inside a +// XlaCallModuleOp. Returns a non-OK status if quantization fails, or any of +// `saved_model_bundle` or `quantization_py_function_lib` is a nullptr. +absl::StatusOr RunQuantization( + const SavedModelBundle* saved_model_bundle, + absl::string_view saved_model_dir, + const std::unordered_set& saved_model_tags, + const stablehlo::quantization::QuantizationConfig& quantization_config, + const tensorflow::quantization::PyFunctionLibrary* + quantization_py_function_lib, + mlir::ModuleOp module_op); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h new file mode 100644 index 00000000..a552cc65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_TENSORFLOW_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_TENSORFLOW_PASSES_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Legalize the tf ops to the quant ops, so the quantization passes can work. +std::unique_ptr> CreateLegalizeTFToQuantPass(); + +// Fallbacks ops that are not supported by TF Quantization to TFLite Flex ops. +std::unique_ptr> CreateFallbackToFlexOpsPass( + const std::string &mode = "DEFAULT"); + +} // namespace TF +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_TENSORFLOW_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/conversion_metadata_generated.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/conversion_metadata_generated.h new file mode 100755 index 00000000..12af129c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/conversion_metadata_generated.h @@ -0,0 +1,672 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_CONVERSIONMETADATA_TFLITE_H_ +#define FLATBUFFERS_GENERATED_CONVERSIONMETADATA_TFLITE_H_ + +#include "flatbuffers/flatbuffers.h" + +// Ensure the included flatbuffers.h is the same version as when this file was +// generated, otherwise it may not be compatible. +static_assert(FLATBUFFERS_VERSION_MAJOR == 24 && + FLATBUFFERS_VERSION_MINOR == 3 && + FLATBUFFERS_VERSION_REVISION == 25, + "Non-compatible flatbuffers version included"); + +namespace tflite { + +struct Environment; +struct EnvironmentBuilder; +struct EnvironmentT; + +struct SparsityBlockSize; +struct SparsityBlockSizeBuilder; +struct SparsityBlockSizeT; + +struct ConversionOptions; +struct ConversionOptionsBuilder; +struct ConversionOptionsT; + +struct ConversionMetadata; +struct ConversionMetadataBuilder; +struct ConversionMetadataT; + +enum ModelType : int32_t { + ModelType_NONE = 0, + ModelType_TF_SAVED_MODEL = 1, + ModelType_KERAS_MODEL = 2, + ModelType_TF_CONCRETE_FUNCTIONS = 3, + ModelType_TF_GRAPH_DEF = 4, + ModelType_TF_SESSION = 5, + ModelType_JAX = 6, + ModelType_MIN = ModelType_NONE, + ModelType_MAX = ModelType_JAX +}; + +inline const ModelType (&EnumValuesModelType())[7] { + static const ModelType values[] = { + ModelType_NONE, + ModelType_TF_SAVED_MODEL, + ModelType_KERAS_MODEL, + ModelType_TF_CONCRETE_FUNCTIONS, + ModelType_TF_GRAPH_DEF, + ModelType_TF_SESSION, + ModelType_JAX + }; + return values; +} + +inline const char * const *EnumNamesModelType() { + static const char * const names[8] = { + "NONE", + "TF_SAVED_MODEL", + "KERAS_MODEL", + "TF_CONCRETE_FUNCTIONS", + "TF_GRAPH_DEF", + "TF_SESSION", + "JAX", + nullptr + }; + return names; +} + +inline const char *EnumNameModelType(ModelType e) { + if (::flatbuffers::IsOutRange(e, ModelType_NONE, ModelType_JAX)) return ""; + const size_t index = static_cast(e); + return EnumNamesModelType()[index]; +} + +enum ModelOptimizationMode : int32_t { + ModelOptimizationMode_PTQ_FLOAT16 = 1001, + ModelOptimizationMode_PTQ_DYNAMIC_RANGE = 1002, + ModelOptimizationMode_PTQ_FULL_INTEGER = 1003, + ModelOptimizationMode_PTQ_INT16 = 1004, + ModelOptimizationMode_QUANTIZATION_AWARE_TRAINING = 2000, + ModelOptimizationMode_RANDOM_SPARSITY = 3001, + ModelOptimizationMode_BLOCK_SPARSITY = 3002, + ModelOptimizationMode_STRUCTURED_SPARSITY = 3003, + ModelOptimizationMode_MIN = ModelOptimizationMode_PTQ_FLOAT16, + ModelOptimizationMode_MAX = ModelOptimizationMode_STRUCTURED_SPARSITY +}; + +inline const ModelOptimizationMode (&EnumValuesModelOptimizationMode())[8] { + static const ModelOptimizationMode values[] = { + ModelOptimizationMode_PTQ_FLOAT16, + ModelOptimizationMode_PTQ_DYNAMIC_RANGE, + ModelOptimizationMode_PTQ_FULL_INTEGER, + ModelOptimizationMode_PTQ_INT16, + ModelOptimizationMode_QUANTIZATION_AWARE_TRAINING, + ModelOptimizationMode_RANDOM_SPARSITY, + ModelOptimizationMode_BLOCK_SPARSITY, + ModelOptimizationMode_STRUCTURED_SPARSITY + }; + return values; +} + +inline const char 
*EnumNameModelOptimizationMode(ModelOptimizationMode e) { + switch (e) { + case ModelOptimizationMode_PTQ_FLOAT16: return "PTQ_FLOAT16"; + case ModelOptimizationMode_PTQ_DYNAMIC_RANGE: return "PTQ_DYNAMIC_RANGE"; + case ModelOptimizationMode_PTQ_FULL_INTEGER: return "PTQ_FULL_INTEGER"; + case ModelOptimizationMode_PTQ_INT16: return "PTQ_INT16"; + case ModelOptimizationMode_QUANTIZATION_AWARE_TRAINING: return "QUANTIZATION_AWARE_TRAINING"; + case ModelOptimizationMode_RANDOM_SPARSITY: return "RANDOM_SPARSITY"; + case ModelOptimizationMode_BLOCK_SPARSITY: return "BLOCK_SPARSITY"; + case ModelOptimizationMode_STRUCTURED_SPARSITY: return "STRUCTURED_SPARSITY"; + default: return ""; + } +} + +struct EnvironmentT : public ::flatbuffers::NativeTable { + typedef Environment TableType; + std::string tensorflow_version{}; + uint32_t api_version = 0; + tflite::ModelType model_type = tflite::ModelType_NONE; +}; + +struct Environment FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef EnvironmentT NativeTableType; + typedef EnvironmentBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TENSORFLOW_VERSION = 4, + VT_API_VERSION = 6, + VT_MODEL_TYPE = 8 + }; + const ::flatbuffers::String *tensorflow_version() const { + return GetPointer(VT_TENSORFLOW_VERSION); + } + uint32_t api_version() const { + return GetField(VT_API_VERSION, 0); + } + tflite::ModelType model_type() const { + return static_cast(GetField(VT_MODEL_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TENSORFLOW_VERSION) && + verifier.VerifyString(tensorflow_version()) && + VerifyField(verifier, VT_API_VERSION, 4) && + VerifyField(verifier, VT_MODEL_TYPE, 4) && + verifier.EndTable(); + } + EnvironmentT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(EnvironmentT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EnvironmentT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct EnvironmentBuilder { + typedef Environment Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_tensorflow_version(::flatbuffers::Offset<::flatbuffers::String> tensorflow_version) { + fbb_.AddOffset(Environment::VT_TENSORFLOW_VERSION, tensorflow_version); + } + void add_api_version(uint32_t api_version) { + fbb_.AddElement(Environment::VT_API_VERSION, api_version, 0); + } + void add_model_type(tflite::ModelType model_type) { + fbb_.AddElement(Environment::VT_MODEL_TYPE, static_cast(model_type), 0); + } + explicit EnvironmentBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateEnvironment( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> tensorflow_version = 0, + uint32_t api_version = 0, + tflite::ModelType model_type = tflite::ModelType_NONE) { + EnvironmentBuilder builder_(_fbb); + builder_.add_model_type(model_type); + builder_.add_api_version(api_version); + builder_.add_tensorflow_version(tensorflow_version); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateEnvironmentDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char 
*tensorflow_version = nullptr, + uint32_t api_version = 0, + tflite::ModelType model_type = tflite::ModelType_NONE) { + auto tensorflow_version__ = tensorflow_version ? _fbb.CreateString(tensorflow_version) : 0; + return tflite::CreateEnvironment( + _fbb, + tensorflow_version__, + api_version, + model_type); +} + +::flatbuffers::Offset CreateEnvironment(::flatbuffers::FlatBufferBuilder &_fbb, const EnvironmentT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SparsityBlockSizeT : public ::flatbuffers::NativeTable { + typedef SparsityBlockSize TableType; + std::vector values{}; +}; + +struct SparsityBlockSize FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SparsityBlockSizeT NativeTableType; + typedef SparsityBlockSizeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES = 4 + }; + const ::flatbuffers::Vector *values() const { + return GetPointer *>(VT_VALUES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_VALUES) && + verifier.VerifyVector(values()) && + verifier.EndTable(); + } + SparsityBlockSizeT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SparsityBlockSizeT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityBlockSizeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SparsityBlockSizeBuilder { + typedef SparsityBlockSize Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_values(::flatbuffers::Offset<::flatbuffers::Vector> values) { + fbb_.AddOffset(SparsityBlockSize::VT_VALUES, values); + } + explicit SparsityBlockSizeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSparsityBlockSize( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> values = 0) { + SparsityBlockSizeBuilder builder_(_fbb); + builder_.add_values(values); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSparsityBlockSizeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *values = nullptr) { + auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateSparsityBlockSize( + _fbb, + values__); +} + +::flatbuffers::Offset CreateSparsityBlockSize(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityBlockSizeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ConversionOptionsT : public ::flatbuffers::NativeTable { + typedef ConversionOptions TableType; + std::vector model_optimization_modes{}; + bool allow_custom_ops = false; + bool enable_select_tf_ops = false; + bool force_select_tf_ops = false; + std::vector> sparsity_block_sizes{}; + ConversionOptionsT() = default; + ConversionOptionsT(const ConversionOptionsT &o); + ConversionOptionsT(ConversionOptionsT&&) FLATBUFFERS_NOEXCEPT = default; + ConversionOptionsT &operator=(ConversionOptionsT o) FLATBUFFERS_NOEXCEPT; +}; + +struct ConversionOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ConversionOptionsT NativeTableType; + typedef ConversionOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MODEL_OPTIMIZATION_MODES = 4, + VT_ALLOW_CUSTOM_OPS = 6, + VT_ENABLE_SELECT_TF_OPS = 8, + VT_FORCE_SELECT_TF_OPS = 10, + VT_SPARSITY_BLOCK_SIZES = 12 + }; + const ::flatbuffers::Vector *model_optimization_modes() const { + return GetPointer *>(VT_MODEL_OPTIMIZATION_MODES); + } + bool allow_custom_ops() const { + return GetField(VT_ALLOW_CUSTOM_OPS, 0) != 0; + } + bool enable_select_tf_ops() const { + return GetField(VT_ENABLE_SELECT_TF_OPS, 0) != 0; + } + bool force_select_tf_ops() const { + return GetField(VT_FORCE_SELECT_TF_OPS, 0) != 0; + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *sparsity_block_sizes() const { + return GetPointer> *>(VT_SPARSITY_BLOCK_SIZES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_MODEL_OPTIMIZATION_MODES) && + verifier.VerifyVector(model_optimization_modes()) && + VerifyField(verifier, VT_ALLOW_CUSTOM_OPS, 1) && + VerifyField(verifier, VT_ENABLE_SELECT_TF_OPS, 1) && + VerifyField(verifier, VT_FORCE_SELECT_TF_OPS, 1) && + VerifyOffset(verifier, VT_SPARSITY_BLOCK_SIZES) && + verifier.VerifyVector(sparsity_block_sizes()) && + verifier.VerifyVectorOfTables(sparsity_block_sizes()) && + verifier.EndTable(); + } + ConversionOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ConversionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ConversionOptionsBuilder { + typedef ConversionOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_model_optimization_modes(::flatbuffers::Offset<::flatbuffers::Vector> model_optimization_modes) { + fbb_.AddOffset(ConversionOptions::VT_MODEL_OPTIMIZATION_MODES, model_optimization_modes); + } + void add_allow_custom_ops(bool allow_custom_ops) { + fbb_.AddElement(ConversionOptions::VT_ALLOW_CUSTOM_OPS, static_cast(allow_custom_ops), 0); + } + void add_enable_select_tf_ops(bool enable_select_tf_ops) { + fbb_.AddElement(ConversionOptions::VT_ENABLE_SELECT_TF_OPS, static_cast(enable_select_tf_ops), 0); + } + void add_force_select_tf_ops(bool force_select_tf_ops) { + fbb_.AddElement(ConversionOptions::VT_FORCE_SELECT_TF_OPS, static_cast(force_select_tf_ops), 0); + } + void 
add_sparsity_block_sizes(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> sparsity_block_sizes) { + fbb_.AddOffset(ConversionOptions::VT_SPARSITY_BLOCK_SIZES, sparsity_block_sizes); + } + explicit ConversionOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConversionOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> model_optimization_modes = 0, + bool allow_custom_ops = false, + bool enable_select_tf_ops = false, + bool force_select_tf_ops = false, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> sparsity_block_sizes = 0) { + ConversionOptionsBuilder builder_(_fbb); + builder_.add_sparsity_block_sizes(sparsity_block_sizes); + builder_.add_model_optimization_modes(model_optimization_modes); + builder_.add_force_select_tf_ops(force_select_tf_ops); + builder_.add_enable_select_tf_ops(enable_select_tf_ops); + builder_.add_allow_custom_ops(allow_custom_ops); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateConversionOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *model_optimization_modes = nullptr, + bool allow_custom_ops = false, + bool enable_select_tf_ops = false, + bool force_select_tf_ops = false, + const std::vector<::flatbuffers::Offset> *sparsity_block_sizes = nullptr) { + auto model_optimization_modes__ = model_optimization_modes ? _fbb.CreateVector(*model_optimization_modes) : 0; + auto sparsity_block_sizes__ = sparsity_block_sizes ? _fbb.CreateVector<::flatbuffers::Offset>(*sparsity_block_sizes) : 0; + return tflite::CreateConversionOptions( + _fbb, + model_optimization_modes__, + allow_custom_ops, + enable_select_tf_ops, + force_select_tf_ops, + sparsity_block_sizes__); +} + +::flatbuffers::Offset CreateConversionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ConversionMetadataT : public ::flatbuffers::NativeTable { + typedef ConversionMetadata TableType; + std::unique_ptr environment{}; + std::unique_ptr options{}; + ConversionMetadataT() = default; + ConversionMetadataT(const ConversionMetadataT &o); + ConversionMetadataT(ConversionMetadataT&&) FLATBUFFERS_NOEXCEPT = default; + ConversionMetadataT &operator=(ConversionMetadataT o) FLATBUFFERS_NOEXCEPT; +}; + +struct ConversionMetadata FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ConversionMetadataT NativeTableType; + typedef ConversionMetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ENVIRONMENT = 4, + VT_OPTIONS = 6 + }; + const tflite::Environment *environment() const { + return GetPointer(VT_ENVIRONMENT); + } + const tflite::ConversionOptions *options() const { + return GetPointer(VT_OPTIONS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_ENVIRONMENT) && + verifier.VerifyTable(environment()) && + VerifyOffset(verifier, VT_OPTIONS) && + verifier.VerifyTable(options()) && + verifier.EndTable(); + } + ConversionMetadataT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ConversionMetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static 
::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionMetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ConversionMetadataBuilder { + typedef ConversionMetadata Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_environment(::flatbuffers::Offset environment) { + fbb_.AddOffset(ConversionMetadata::VT_ENVIRONMENT, environment); + } + void add_options(::flatbuffers::Offset options) { + fbb_.AddOffset(ConversionMetadata::VT_OPTIONS, options); + } + explicit ConversionMetadataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConversionMetadata( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset environment = 0, + ::flatbuffers::Offset options = 0) { + ConversionMetadataBuilder builder_(_fbb); + builder_.add_options(options); + builder_.add_environment(environment); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateConversionMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionMetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +inline EnvironmentT *Environment::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new EnvironmentT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Environment::UnPackTo(EnvironmentT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = tensorflow_version(); if (_e) _o->tensorflow_version = _e->str(); } + { auto _e = api_version(); _o->api_version = _e; } + { auto _e = model_type(); _o->model_type = _e; } +} + +inline ::flatbuffers::Offset Environment::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EnvironmentT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateEnvironment(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateEnvironment(::flatbuffers::FlatBufferBuilder &_fbb, const EnvironmentT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const EnvironmentT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _tensorflow_version = _o->tensorflow_version.empty() ? 
0 : _fbb.CreateString(_o->tensorflow_version); + auto _api_version = _o->api_version; + auto _model_type = _o->model_type; + return tflite::CreateEnvironment( + _fbb, + _tensorflow_version, + _api_version, + _model_type); +} + +inline SparsityBlockSizeT *SparsityBlockSize::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SparsityBlockSizeT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SparsityBlockSize::UnPackTo(SparsityBlockSizeT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } else { _o->values.resize(0); } } +} + +inline ::flatbuffers::Offset SparsityBlockSize::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityBlockSizeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSparsityBlockSize(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSparsityBlockSize(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityBlockSizeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SparsityBlockSizeT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values = _o->values.size() ? _fbb.CreateVector(_o->values) : 0; + return tflite::CreateSparsityBlockSize( + _fbb, + _values); +} + +inline ConversionOptionsT::ConversionOptionsT(const ConversionOptionsT &o) + : model_optimization_modes(o.model_optimization_modes), + allow_custom_ops(o.allow_custom_ops), + enable_select_tf_ops(o.enable_select_tf_ops), + force_select_tf_ops(o.force_select_tf_ops) { + sparsity_block_sizes.reserve(o.sparsity_block_sizes.size()); + for (const auto &sparsity_block_sizes_ : o.sparsity_block_sizes) { sparsity_block_sizes.emplace_back((sparsity_block_sizes_) ? 
new tflite::SparsityBlockSizeT(*sparsity_block_sizes_) : nullptr); } +} + +inline ConversionOptionsT &ConversionOptionsT::operator=(ConversionOptionsT o) FLATBUFFERS_NOEXCEPT { + std::swap(model_optimization_modes, o.model_optimization_modes); + std::swap(allow_custom_ops, o.allow_custom_ops); + std::swap(enable_select_tf_ops, o.enable_select_tf_ops); + std::swap(force_select_tf_ops, o.force_select_tf_ops); + std::swap(sparsity_block_sizes, o.sparsity_block_sizes); + return *this; +} + +inline ConversionOptionsT *ConversionOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ConversionOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ConversionOptions::UnPackTo(ConversionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = model_optimization_modes(); if (_e) { _o->model_optimization_modes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->model_optimization_modes[_i] = static_cast(_e->Get(_i)); } } else { _o->model_optimization_modes.resize(0); } } + { auto _e = allow_custom_ops(); _o->allow_custom_ops = _e; } + { auto _e = enable_select_tf_ops(); _o->enable_select_tf_ops = _e; } + { auto _e = force_select_tf_ops(); _o->force_select_tf_ops = _e; } + { auto _e = sparsity_block_sizes(); if (_e) { _o->sparsity_block_sizes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->sparsity_block_sizes[_i]) { _e->Get(_i)->UnPackTo(_o->sparsity_block_sizes[_i].get(), _resolver); } else { _o->sparsity_block_sizes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->sparsity_block_sizes.resize(0); } } +} + +inline ::flatbuffers::Offset ConversionOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateConversionOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateConversionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ConversionOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _model_optimization_modes = _o->model_optimization_modes.size() ? _fbb.CreateVectorScalarCast(::flatbuffers::data(_o->model_optimization_modes), _o->model_optimization_modes.size()) : 0; + auto _allow_custom_ops = _o->allow_custom_ops; + auto _enable_select_tf_ops = _o->enable_select_tf_ops; + auto _force_select_tf_ops = _o->force_select_tf_ops; + auto _sparsity_block_sizes = _o->sparsity_block_sizes.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->sparsity_block_sizes.size(), [](size_t i, _VectorArgs *__va) { return CreateSparsityBlockSize(*__va->__fbb, __va->__o->sparsity_block_sizes[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateConversionOptions( + _fbb, + _model_optimization_modes, + _allow_custom_ops, + _enable_select_tf_ops, + _force_select_tf_ops, + _sparsity_block_sizes); +} + +inline ConversionMetadataT::ConversionMetadataT(const ConversionMetadataT &o) + : environment((o.environment) ? new tflite::EnvironmentT(*o.environment) : nullptr), + options((o.options) ? 
new tflite::ConversionOptionsT(*o.options) : nullptr) { +} + +inline ConversionMetadataT &ConversionMetadataT::operator=(ConversionMetadataT o) FLATBUFFERS_NOEXCEPT { + std::swap(environment, o.environment); + std::swap(options, o.options); + return *this; +} + +inline ConversionMetadataT *ConversionMetadata::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ConversionMetadataT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ConversionMetadata::UnPackTo(ConversionMetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = environment(); if (_e) { if(_o->environment) { _e->UnPackTo(_o->environment.get(), _resolver); } else { _o->environment = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->environment) { _o->environment.reset(); } } + { auto _e = options(); if (_e) { if(_o->options) { _e->UnPackTo(_o->options.get(), _resolver); } else { _o->options = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->options) { _o->options.reset(); } } +} + +inline ::flatbuffers::Offset ConversionMetadata::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionMetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateConversionMetadata(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateConversionMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const ConversionMetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ConversionMetadataT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _environment = _o->environment ? CreateEnvironment(_fbb, _o->environment.get(), _rehasher) : 0; + auto _options = _o->options ? 
CreateConversionOptions(_fbb, _o->options.get(), _rehasher) : 0; + return tflite::CreateConversionMetadata( + _fbb, + _environment, + _options); +} + +inline const tflite::ConversionMetadata *GetConversionMetadata(const void *buf) { + return ::flatbuffers::GetRoot<tflite::ConversionMetadata>(buf); +} + +inline const tflite::ConversionMetadata *GetSizePrefixedConversionMetadata(const void *buf) { + return ::flatbuffers::GetSizePrefixedRoot<tflite::ConversionMetadata>(buf); +} + +inline bool VerifyConversionMetadataBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer<tflite::ConversionMetadata>(nullptr); +} + +inline bool VerifySizePrefixedConversionMetadataBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer<tflite::ConversionMetadata>(nullptr); +} + +inline void FinishConversionMetadataBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset<tflite::ConversionMetadata> root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedConversionMetadataBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset<tflite::ConversionMetadata> root) { + fbb.FinishSizePrefixed(root); +} + +inline std::unique_ptr<tflite::ConversionMetadataT> UnPackConversionMetadata( + const void *buf, + const ::flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr<tflite::ConversionMetadataT>(GetConversionMetadata(buf)->UnPack(res)); +} + +inline std::unique_ptr<tflite::ConversionMetadataT> UnPackSizePrefixedConversionMetadata( + const void *buf, + const ::flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr<tflite::ConversionMetadataT>(GetSizePrefixedConversionMetadata(buf)->UnPack(res)); +} + +} // namespace tflite + +#endif // FLATBUFFERS_GENERATED_CONVERSIONMETADATA_TFLITE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_conversion_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_conversion_utils.h new file mode 100644 index 00000000..ebf9219f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_conversion_utils.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_CONVERSION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_CONVERSION_UTILS_H_ + +#include "flatbuffers/flatbuffers.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace tflite { + +int8_t ConvertBuiltinCodeToDeprecatedBuiltinCode(BuiltinOperator builtin_code); + +// The following methods are for backward compatibility for the early version +// three, which does not have an extended builtin code.
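An aside on the backward-compatibility helpers introduced by the comment above and declared immediately below: the TFLite schema keeps the original 8-bit builtin-code field alongside the extended 32-bit one, and operators numbered above 127 cannot be represented in the old field, which is why the BuiltinOperator enum later in this diff reserves PLACEHOLDER_FOR_GREATER_OP_CODES = 127. The snippet below is a minimal, hypothetical sketch of that clamping rule, not the vendored implementation; the helper name and local constant exist only for this example.

    // Hypothetical sketch: how an extended builtin code presumably collapses
    // into the legacy int8 field. Not part of the vendored headers.
    #include <algorithm>
    #include <cstdint>

    // Mirrors BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES (= 127) from
    // schema_generated.h; redefined here so the sketch stands alone.
    constexpr int32_t kPlaceholderForGreaterOpCodes = 127;

    inline int8_t ToDeprecatedBuiltinCodeSketch(int32_t extended_builtin_code) {
      // Codes that fit in int8 pass through unchanged; anything larger is
      // clamped to the placeholder so old readers still see a valid value.
      return static_cast<int8_t>(
          std::min(extended_builtin_code, kPlaceholderForGreaterOpCodes));
    }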
+flatbuffers::Offset<OperatorCode> CreateOperatorCode( + flatbuffers::FlatBufferBuilder &_fbb, + BuiltinOperator builtin_code = BuiltinOperator_ADD, + flatbuffers::Offset<flatbuffers::String> custom_code = 0, + int32_t version = 1); + +flatbuffers::Offset<OperatorCode> CreateOperatorCodeDirect( + flatbuffers::FlatBufferBuilder &_fbb, + BuiltinOperator builtin_code = BuiltinOperator_ADD, + const char *custom_code = nullptr, int32_t version = 1); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_CONVERSION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_generated.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_generated.h new file mode 100755 index 00000000..6262406f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_generated.h @@ -0,0 +1,25321 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_ +#define FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_ + +#include "flatbuffers/flatbuffers.h" + +// Ensure the included flatbuffers.h is the same version as when this file was +// generated, otherwise it may not be compatible.
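The comment above and the static_assert that follows pin this generated header to the exact FlatBuffers release it was produced with (24.3.25), so a mismatched flatbuffers.h on the include path fails the build instead of compiling against incompatible headers. When that assert fires it can help to print which headers the build is actually seeing; the throwaway program below is illustrative only and assumes nothing beyond the FLATBUFFERS_VERSION_* macros already used by the assert.

    // Illustrative only: report the FlatBuffers header version on the include path.
    #include <cstdio>
    #include "flatbuffers/flatbuffers.h"  // provides the FLATBUFFERS_VERSION_* macros

    int main() {
      std::printf("flatbuffers headers: %d.%d.%d (schema_generated.h expects 24.3.25)\n",
                  FLATBUFFERS_VERSION_MAJOR, FLATBUFFERS_VERSION_MINOR,
                  FLATBUFFERS_VERSION_REVISION);
      return 0;
    }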
+static_assert(FLATBUFFERS_VERSION_MAJOR == 24 && + FLATBUFFERS_VERSION_MINOR == 3 && + FLATBUFFERS_VERSION_REVISION == 25, + "Non-compatible flatbuffers version included"); + +namespace tflite { + +struct CustomQuantization; +struct CustomQuantizationBuilder; +struct CustomQuantizationT; + +struct BlockwiseQuantization; +struct BlockwiseQuantizationBuilder; +struct BlockwiseQuantizationT; + +struct QuantizationParameters; +struct QuantizationParametersBuilder; +struct QuantizationParametersT; + +struct Int32Vector; +struct Int32VectorBuilder; +struct Int32VectorT; + +struct Uint16Vector; +struct Uint16VectorBuilder; +struct Uint16VectorT; + +struct Uint8Vector; +struct Uint8VectorBuilder; +struct Uint8VectorT; + +struct DimensionMetadata; +struct DimensionMetadataBuilder; +struct DimensionMetadataT; + +struct SparsityParameters; +struct SparsityParametersBuilder; +struct SparsityParametersT; + +struct VariantSubType; +struct VariantSubTypeBuilder; +struct VariantSubTypeT; + +struct Tensor; +struct TensorBuilder; +struct TensorT; + +struct StablehloGatherOptions; +struct StablehloGatherOptionsBuilder; +struct StablehloGatherOptionsT; + +struct StablehloTransposeOptions; +struct StablehloTransposeOptionsBuilder; +struct StablehloTransposeOptionsT; + +struct StablehloDotGeneralOptions; +struct StablehloDotGeneralOptionsBuilder; +struct StablehloDotGeneralOptionsT; + +struct StablehloReduceWindowOptions; +struct StablehloReduceWindowOptionsBuilder; +struct StablehloReduceWindowOptionsT; + +struct StablehloWhileOptions; +struct StablehloWhileOptionsBuilder; +struct StablehloWhileOptionsT; + +struct StablehloSortOptions; +struct StablehloSortOptionsBuilder; +struct StablehloSortOptionsT; + +struct StablehloConcatenateOptions; +struct StablehloConcatenateOptionsBuilder; +struct StablehloConcatenateOptionsT; + +struct StablehloBroadcastInDimOptions; +struct StablehloBroadcastInDimOptionsBuilder; +struct StablehloBroadcastInDimOptionsT; + +struct StablehloCompareOptions; +struct StablehloCompareOptionsBuilder; +struct StablehloCompareOptionsT; + +struct StablehloDynamicSliceOptions; +struct StablehloDynamicSliceOptionsBuilder; +struct StablehloDynamicSliceOptionsT; + +struct StablehloPadOptions; +struct StablehloPadOptionsBuilder; +struct StablehloPadOptionsT; + +struct StablehloIotaOptions; +struct StablehloIotaOptionsBuilder; +struct StablehloIotaOptionsT; + +struct StablehloCustomCallOptions; +struct StablehloCustomCallOptionsBuilder; +struct StablehloCustomCallOptionsT; + +struct StablehloReduceOptions; +struct StablehloReduceOptionsBuilder; +struct StablehloReduceOptionsT; + +struct StablehloSliceOptions; +struct StablehloSliceOptionsBuilder; +struct StablehloSliceOptionsT; + +struct StablehloConvolutionOptions; +struct StablehloConvolutionOptionsBuilder; +struct StablehloConvolutionOptionsT; + +struct StablehloScatterOptions; +struct StablehloScatterOptionsBuilder; +struct StablehloScatterOptionsT; + +struct StablehloCaseOptions; +struct StablehloCaseOptionsBuilder; +struct StablehloCaseOptionsT; + +struct StablehloRngBitGeneratorOptions; +struct StablehloRngBitGeneratorOptionsBuilder; +struct StablehloRngBitGeneratorOptionsT; + +struct Conv2DOptions; +struct Conv2DOptionsBuilder; +struct Conv2DOptionsT; + +struct Conv3DOptions; +struct Conv3DOptionsBuilder; +struct Conv3DOptionsT; + +struct Pool2DOptions; +struct Pool2DOptionsBuilder; +struct Pool2DOptionsT; + +struct DepthwiseConv2DOptions; +struct DepthwiseConv2DOptionsBuilder; +struct DepthwiseConv2DOptionsT; + +struct 
ConcatEmbeddingsOptions; +struct ConcatEmbeddingsOptionsBuilder; +struct ConcatEmbeddingsOptionsT; + +struct LSHProjectionOptions; +struct LSHProjectionOptionsBuilder; +struct LSHProjectionOptionsT; + +struct SVDFOptions; +struct SVDFOptionsBuilder; +struct SVDFOptionsT; + +struct RNNOptions; +struct RNNOptionsBuilder; +struct RNNOptionsT; + +struct SequenceRNNOptions; +struct SequenceRNNOptionsBuilder; +struct SequenceRNNOptionsT; + +struct BidirectionalSequenceRNNOptions; +struct BidirectionalSequenceRNNOptionsBuilder; +struct BidirectionalSequenceRNNOptionsT; + +struct FullyConnectedOptions; +struct FullyConnectedOptionsBuilder; +struct FullyConnectedOptionsT; + +struct SoftmaxOptions; +struct SoftmaxOptionsBuilder; +struct SoftmaxOptionsT; + +struct ConcatenationOptions; +struct ConcatenationOptionsBuilder; +struct ConcatenationOptionsT; + +struct AddOptions; +struct AddOptionsBuilder; +struct AddOptionsT; + +struct MulOptions; +struct MulOptionsBuilder; +struct MulOptionsT; + +struct L2NormOptions; +struct L2NormOptionsBuilder; +struct L2NormOptionsT; + +struct LocalResponseNormalizationOptions; +struct LocalResponseNormalizationOptionsBuilder; +struct LocalResponseNormalizationOptionsT; + +struct LSTMOptions; +struct LSTMOptionsBuilder; +struct LSTMOptionsT; + +struct UnidirectionalSequenceLSTMOptions; +struct UnidirectionalSequenceLSTMOptionsBuilder; +struct UnidirectionalSequenceLSTMOptionsT; + +struct BidirectionalSequenceLSTMOptions; +struct BidirectionalSequenceLSTMOptionsBuilder; +struct BidirectionalSequenceLSTMOptionsT; + +struct ResizeBilinearOptions; +struct ResizeBilinearOptionsBuilder; +struct ResizeBilinearOptionsT; + +struct ResizeNearestNeighborOptions; +struct ResizeNearestNeighborOptionsBuilder; +struct ResizeNearestNeighborOptionsT; + +struct CallOptions; +struct CallOptionsBuilder; +struct CallOptionsT; + +struct PadOptions; +struct PadOptionsBuilder; +struct PadOptionsT; + +struct PadV2Options; +struct PadV2OptionsBuilder; +struct PadV2OptionsT; + +struct ReshapeOptions; +struct ReshapeOptionsBuilder; +struct ReshapeOptionsT; + +struct SpaceToBatchNDOptions; +struct SpaceToBatchNDOptionsBuilder; +struct SpaceToBatchNDOptionsT; + +struct BatchToSpaceNDOptions; +struct BatchToSpaceNDOptionsBuilder; +struct BatchToSpaceNDOptionsT; + +struct SkipGramOptions; +struct SkipGramOptionsBuilder; +struct SkipGramOptionsT; + +struct SpaceToDepthOptions; +struct SpaceToDepthOptionsBuilder; +struct SpaceToDepthOptionsT; + +struct DepthToSpaceOptions; +struct DepthToSpaceOptionsBuilder; +struct DepthToSpaceOptionsT; + +struct SubOptions; +struct SubOptionsBuilder; +struct SubOptionsT; + +struct DivOptions; +struct DivOptionsBuilder; +struct DivOptionsT; + +struct TopKV2Options; +struct TopKV2OptionsBuilder; +struct TopKV2OptionsT; + +struct EmbeddingLookupSparseOptions; +struct EmbeddingLookupSparseOptionsBuilder; +struct EmbeddingLookupSparseOptionsT; + +struct GatherOptions; +struct GatherOptionsBuilder; +struct GatherOptionsT; + +struct TransposeOptions; +struct TransposeOptionsBuilder; +struct TransposeOptionsT; + +struct ExpOptions; +struct ExpOptionsBuilder; +struct ExpOptionsT; + +struct CosOptions; +struct CosOptionsBuilder; +struct CosOptionsT; + +struct ReducerOptions; +struct ReducerOptionsBuilder; +struct ReducerOptionsT; + +struct SqueezeOptions; +struct SqueezeOptionsBuilder; +struct SqueezeOptionsT; + +struct SplitOptions; +struct SplitOptionsBuilder; +struct SplitOptionsT; + +struct SplitVOptions; +struct SplitVOptionsBuilder; +struct SplitVOptionsT; + +struct 
StridedSliceOptions; +struct StridedSliceOptionsBuilder; +struct StridedSliceOptionsT; + +struct LogSoftmaxOptions; +struct LogSoftmaxOptionsBuilder; +struct LogSoftmaxOptionsT; + +struct CastOptions; +struct CastOptionsBuilder; +struct CastOptionsT; + +struct DequantizeOptions; +struct DequantizeOptionsBuilder; +struct DequantizeOptionsT; + +struct MaximumMinimumOptions; +struct MaximumMinimumOptionsBuilder; +struct MaximumMinimumOptionsT; + +struct TileOptions; +struct TileOptionsBuilder; +struct TileOptionsT; + +struct ArgMaxOptions; +struct ArgMaxOptionsBuilder; +struct ArgMaxOptionsT; + +struct ArgMinOptions; +struct ArgMinOptionsBuilder; +struct ArgMinOptionsT; + +struct GreaterOptions; +struct GreaterOptionsBuilder; +struct GreaterOptionsT; + +struct GreaterEqualOptions; +struct GreaterEqualOptionsBuilder; +struct GreaterEqualOptionsT; + +struct LessOptions; +struct LessOptionsBuilder; +struct LessOptionsT; + +struct LessEqualOptions; +struct LessEqualOptionsBuilder; +struct LessEqualOptionsT; + +struct NegOptions; +struct NegOptionsBuilder; +struct NegOptionsT; + +struct SelectOptions; +struct SelectOptionsBuilder; +struct SelectOptionsT; + +struct SliceOptions; +struct SliceOptionsBuilder; +struct SliceOptionsT; + +struct TransposeConvOptions; +struct TransposeConvOptionsBuilder; +struct TransposeConvOptionsT; + +struct ExpandDimsOptions; +struct ExpandDimsOptionsBuilder; +struct ExpandDimsOptionsT; + +struct SparseToDenseOptions; +struct SparseToDenseOptionsBuilder; +struct SparseToDenseOptionsT; + +struct EqualOptions; +struct EqualOptionsBuilder; +struct EqualOptionsT; + +struct NotEqualOptions; +struct NotEqualOptionsBuilder; +struct NotEqualOptionsT; + +struct ShapeOptions; +struct ShapeOptionsBuilder; +struct ShapeOptionsT; + +struct RankOptions; +struct RankOptionsBuilder; +struct RankOptionsT; + +struct PowOptions; +struct PowOptionsBuilder; +struct PowOptionsT; + +struct FakeQuantOptions; +struct FakeQuantOptionsBuilder; +struct FakeQuantOptionsT; + +struct PackOptions; +struct PackOptionsBuilder; +struct PackOptionsT; + +struct LogicalOrOptions; +struct LogicalOrOptionsBuilder; +struct LogicalOrOptionsT; + +struct OneHotOptions; +struct OneHotOptionsBuilder; +struct OneHotOptionsT; + +struct AbsOptions; +struct AbsOptionsBuilder; +struct AbsOptionsT; + +struct HardSwishOptions; +struct HardSwishOptionsBuilder; +struct HardSwishOptionsT; + +struct LogicalAndOptions; +struct LogicalAndOptionsBuilder; +struct LogicalAndOptionsT; + +struct LogicalNotOptions; +struct LogicalNotOptionsBuilder; +struct LogicalNotOptionsT; + +struct UnpackOptions; +struct UnpackOptionsBuilder; +struct UnpackOptionsT; + +struct FloorDivOptions; +struct FloorDivOptionsBuilder; +struct FloorDivOptionsT; + +struct SquareOptions; +struct SquareOptionsBuilder; +struct SquareOptionsT; + +struct ZerosLikeOptions; +struct ZerosLikeOptionsBuilder; +struct ZerosLikeOptionsT; + +struct FillOptions; +struct FillOptionsBuilder; +struct FillOptionsT; + +struct FloorModOptions; +struct FloorModOptionsBuilder; +struct FloorModOptionsT; + +struct RangeOptions; +struct RangeOptionsBuilder; +struct RangeOptionsT; + +struct LeakyReluOptions; +struct LeakyReluOptionsBuilder; +struct LeakyReluOptionsT; + +struct SquaredDifferenceOptions; +struct SquaredDifferenceOptionsBuilder; +struct SquaredDifferenceOptionsT; + +struct MirrorPadOptions; +struct MirrorPadOptionsBuilder; +struct MirrorPadOptionsT; + +struct UniqueOptions; +struct UniqueOptionsBuilder; +struct UniqueOptionsT; + +struct ReverseV2Options; +struct 
ReverseV2OptionsBuilder; +struct ReverseV2OptionsT; + +struct AddNOptions; +struct AddNOptionsBuilder; +struct AddNOptionsT; + +struct GatherNdOptions; +struct GatherNdOptionsBuilder; +struct GatherNdOptionsT; + +struct WhereOptions; +struct WhereOptionsBuilder; +struct WhereOptionsT; + +struct ReverseSequenceOptions; +struct ReverseSequenceOptionsBuilder; +struct ReverseSequenceOptionsT; + +struct MatrixDiagOptions; +struct MatrixDiagOptionsBuilder; +struct MatrixDiagOptionsT; + +struct QuantizeOptions; +struct QuantizeOptionsBuilder; +struct QuantizeOptionsT; + +struct MatrixSetDiagOptions; +struct MatrixSetDiagOptionsBuilder; +struct MatrixSetDiagOptionsT; + +struct IfOptions; +struct IfOptionsBuilder; +struct IfOptionsT; + +struct CallOnceOptions; +struct CallOnceOptionsBuilder; +struct CallOnceOptionsT; + +struct WhileOptions; +struct WhileOptionsBuilder; +struct WhileOptionsT; + +struct NonMaxSuppressionV4Options; +struct NonMaxSuppressionV4OptionsBuilder; +struct NonMaxSuppressionV4OptionsT; + +struct NonMaxSuppressionV5Options; +struct NonMaxSuppressionV5OptionsBuilder; +struct NonMaxSuppressionV5OptionsT; + +struct ScatterNdOptions; +struct ScatterNdOptionsBuilder; +struct ScatterNdOptionsT; + +struct SelectV2Options; +struct SelectV2OptionsBuilder; +struct SelectV2OptionsT; + +struct DensifyOptions; +struct DensifyOptionsBuilder; +struct DensifyOptionsT; + +struct SegmentSumOptions; +struct SegmentSumOptionsBuilder; +struct SegmentSumOptionsT; + +struct BatchMatMulOptions; +struct BatchMatMulOptionsBuilder; +struct BatchMatMulOptionsT; + +struct CumsumOptions; +struct CumsumOptionsBuilder; +struct CumsumOptionsT; + +struct BroadcastToOptions; +struct BroadcastToOptionsBuilder; +struct BroadcastToOptionsT; + +struct Rfft2dOptions; +struct Rfft2dOptionsBuilder; +struct Rfft2dOptionsT; + +struct HashtableOptions; +struct HashtableOptionsBuilder; +struct HashtableOptionsT; + +struct HashtableFindOptions; +struct HashtableFindOptionsBuilder; +struct HashtableFindOptionsT; + +struct HashtableImportOptions; +struct HashtableImportOptionsBuilder; +struct HashtableImportOptionsT; + +struct HashtableSizeOptions; +struct HashtableSizeOptionsBuilder; +struct HashtableSizeOptionsT; + +struct VarHandleOptions; +struct VarHandleOptionsBuilder; +struct VarHandleOptionsT; + +struct ReadVariableOptions; +struct ReadVariableOptionsBuilder; +struct ReadVariableOptionsT; + +struct AssignVariableOptions; +struct AssignVariableOptionsBuilder; +struct AssignVariableOptionsT; + +struct RandomOptions; +struct RandomOptionsBuilder; +struct RandomOptionsT; + +struct BucketizeOptions; +struct BucketizeOptionsBuilder; +struct BucketizeOptionsT; + +struct GeluOptions; +struct GeluOptionsBuilder; +struct GeluOptionsT; + +struct DynamicUpdateSliceOptions; +struct DynamicUpdateSliceOptionsBuilder; +struct DynamicUpdateSliceOptionsT; + +struct UnsortedSegmentProdOptions; +struct UnsortedSegmentProdOptionsBuilder; +struct UnsortedSegmentProdOptionsT; + +struct UnsortedSegmentMaxOptions; +struct UnsortedSegmentMaxOptionsBuilder; +struct UnsortedSegmentMaxOptionsT; + +struct UnsortedSegmentSumOptions; +struct UnsortedSegmentSumOptionsBuilder; +struct UnsortedSegmentSumOptionsT; + +struct ATan2Options; +struct ATan2OptionsBuilder; +struct ATan2OptionsT; + +struct UnsortedSegmentMinOptions; +struct UnsortedSegmentMinOptionsBuilder; +struct UnsortedSegmentMinOptionsT; + +struct SignOptions; +struct SignOptionsBuilder; +struct SignOptionsT; + +struct BitcastOptions; +struct BitcastOptionsBuilder; +struct BitcastOptionsT; 
+ +struct BitwiseXorOptions; +struct BitwiseXorOptionsBuilder; +struct BitwiseXorOptionsT; + +struct RightShiftOptions; +struct RightShiftOptionsBuilder; +struct RightShiftOptionsT; + +struct DilateOptions; +struct DilateOptionsBuilder; +struct DilateOptionsT; + +struct ReduceWindowOptions; +struct ReduceWindowOptionsBuilder; +struct ReduceWindowOptionsT; + +struct OperatorCode; +struct OperatorCodeBuilder; +struct OperatorCodeT; + +struct StableHLOCompositeOptions; +struct StableHLOCompositeOptionsBuilder; +struct StableHLOCompositeOptionsT; + +struct StablehloShiftLeftOptions; +struct StablehloShiftLeftOptionsBuilder; +struct StablehloShiftLeftOptionsT; + +struct Operator; +struct OperatorBuilder; +struct OperatorT; + +struct SubGraph; +struct SubGraphBuilder; +struct SubGraphT; + +struct Buffer; +struct BufferBuilder; +struct BufferT; + +struct Metadata; +struct MetadataBuilder; +struct MetadataT; + +struct TensorMap; +struct TensorMapBuilder; +struct TensorMapT; + +struct SignatureDef; +struct SignatureDefBuilder; +struct SignatureDefT; + +struct Model; +struct ModelBuilder; +struct ModelT; + +enum TensorType : int8_t { + TensorType_FLOAT32 = 0, + TensorType_FLOAT16 = 1, + TensorType_INT32 = 2, + TensorType_UINT8 = 3, + TensorType_INT64 = 4, + TensorType_STRING = 5, + TensorType_BOOL = 6, + TensorType_INT16 = 7, + TensorType_COMPLEX64 = 8, + TensorType_INT8 = 9, + TensorType_FLOAT64 = 10, + TensorType_COMPLEX128 = 11, + TensorType_UINT64 = 12, + TensorType_RESOURCE = 13, + TensorType_VARIANT = 14, + TensorType_UINT32 = 15, + TensorType_UINT16 = 16, + TensorType_INT4 = 17, + TensorType_BFLOAT16 = 18, + TensorType_MIN = TensorType_FLOAT32, + TensorType_MAX = TensorType_BFLOAT16 +}; + +inline const TensorType (&EnumValuesTensorType())[19] { + static const TensorType values[] = { + TensorType_FLOAT32, + TensorType_FLOAT16, + TensorType_INT32, + TensorType_UINT8, + TensorType_INT64, + TensorType_STRING, + TensorType_BOOL, + TensorType_INT16, + TensorType_COMPLEX64, + TensorType_INT8, + TensorType_FLOAT64, + TensorType_COMPLEX128, + TensorType_UINT64, + TensorType_RESOURCE, + TensorType_VARIANT, + TensorType_UINT32, + TensorType_UINT16, + TensorType_INT4, + TensorType_BFLOAT16 + }; + return values; +} + +inline const char * const *EnumNamesTensorType() { + static const char * const names[20] = { + "FLOAT32", + "FLOAT16", + "INT32", + "UINT8", + "INT64", + "STRING", + "BOOL", + "INT16", + "COMPLEX64", + "INT8", + "FLOAT64", + "COMPLEX128", + "UINT64", + "RESOURCE", + "VARIANT", + "UINT32", + "UINT16", + "INT4", + "BFLOAT16", + nullptr + }; + return names; +} + +inline const char *EnumNameTensorType(TensorType e) { + if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_BFLOAT16)) return ""; + const size_t index = static_cast(e); + return EnumNamesTensorType()[index]; +} + +enum QuantizationDetails : uint8_t { + QuantizationDetails_NONE = 0, + QuantizationDetails_CustomQuantization = 1, + QuantizationDetails_BlockwiseQuantization = 2, + QuantizationDetails_MIN = QuantizationDetails_NONE, + QuantizationDetails_MAX = QuantizationDetails_BlockwiseQuantization +}; + +inline const QuantizationDetails (&EnumValuesQuantizationDetails())[3] { + static const QuantizationDetails values[] = { + QuantizationDetails_NONE, + QuantizationDetails_CustomQuantization, + QuantizationDetails_BlockwiseQuantization + }; + return values; +} + +inline const char * const *EnumNamesQuantizationDetails() { + static const char * const names[4] = { + "NONE", + "CustomQuantization", + "BlockwiseQuantization", + 
nullptr + }; + return names; +} + +inline const char *EnumNameQuantizationDetails(QuantizationDetails e) { + if (::flatbuffers::IsOutRange(e, QuantizationDetails_NONE, QuantizationDetails_BlockwiseQuantization)) return ""; + const size_t index = static_cast(e); + return EnumNamesQuantizationDetails()[index]; +} + +template struct QuantizationDetailsTraits { + static const QuantizationDetails enum_value = QuantizationDetails_NONE; +}; + +template<> struct QuantizationDetailsTraits { + static const QuantizationDetails enum_value = QuantizationDetails_CustomQuantization; +}; + +template<> struct QuantizationDetailsTraits { + static const QuantizationDetails enum_value = QuantizationDetails_BlockwiseQuantization; +}; + +template struct QuantizationDetailsUnionTraits { + static const QuantizationDetails enum_value = QuantizationDetails_NONE; +}; + +template<> struct QuantizationDetailsUnionTraits { + static const QuantizationDetails enum_value = QuantizationDetails_CustomQuantization; +}; + +template<> struct QuantizationDetailsUnionTraits { + static const QuantizationDetails enum_value = QuantizationDetails_BlockwiseQuantization; +}; + +struct QuantizationDetailsUnion { + QuantizationDetails type; + void *value; + + QuantizationDetailsUnion() : type(QuantizationDetails_NONE), value(nullptr) {} + QuantizationDetailsUnion(QuantizationDetailsUnion&& u) FLATBUFFERS_NOEXCEPT : + type(QuantizationDetails_NONE), value(nullptr) + { std::swap(type, u.type); std::swap(value, u.value); } + QuantizationDetailsUnion(const QuantizationDetailsUnion &); + QuantizationDetailsUnion &operator=(const QuantizationDetailsUnion &u) + { QuantizationDetailsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; } + QuantizationDetailsUnion &operator=(QuantizationDetailsUnion &&u) FLATBUFFERS_NOEXCEPT + { std::swap(type, u.type); std::swap(value, u.value); return *this; } + ~QuantizationDetailsUnion() { Reset(); } + + void Reset(); + + template + void Set(T&& val) { + typedef typename std::remove_reference::type RT; + Reset(); + type = QuantizationDetailsUnionTraits::enum_value; + if (type != QuantizationDetails_NONE) { + value = new RT(std::forward(val)); + } + } + + static void *UnPack(const void *obj, QuantizationDetails type, const ::flatbuffers::resolver_function_t *resolver); + ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr) const; + + tflite::CustomQuantizationT *AsCustomQuantization() { + return type == QuantizationDetails_CustomQuantization ? + reinterpret_cast(value) : nullptr; + } + const tflite::CustomQuantizationT *AsCustomQuantization() const { + return type == QuantizationDetails_CustomQuantization ? + reinterpret_cast(value) : nullptr; + } + tflite::BlockwiseQuantizationT *AsBlockwiseQuantization() { + return type == QuantizationDetails_BlockwiseQuantization ? + reinterpret_cast(value) : nullptr; + } + const tflite::BlockwiseQuantizationT *AsBlockwiseQuantization() const { + return type == QuantizationDetails_BlockwiseQuantization ? 
+ reinterpret_cast(value) : nullptr; + } +}; + +bool VerifyQuantizationDetails(::flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type); +bool VerifyQuantizationDetailsVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum DimensionType : int8_t { + DimensionType_DENSE = 0, + DimensionType_SPARSE_CSR = 1, + DimensionType_MIN = DimensionType_DENSE, + DimensionType_MAX = DimensionType_SPARSE_CSR +}; + +inline const DimensionType (&EnumValuesDimensionType())[2] { + static const DimensionType values[] = { + DimensionType_DENSE, + DimensionType_SPARSE_CSR + }; + return values; +} + +inline const char * const *EnumNamesDimensionType() { + static const char * const names[3] = { + "DENSE", + "SPARSE_CSR", + nullptr + }; + return names; +} + +inline const char *EnumNameDimensionType(DimensionType e) { + if (::flatbuffers::IsOutRange(e, DimensionType_DENSE, DimensionType_SPARSE_CSR)) return ""; + const size_t index = static_cast(e); + return EnumNamesDimensionType()[index]; +} + +enum SparseIndexVector : uint8_t { + SparseIndexVector_NONE = 0, + SparseIndexVector_Int32Vector = 1, + SparseIndexVector_Uint16Vector = 2, + SparseIndexVector_Uint8Vector = 3, + SparseIndexVector_MIN = SparseIndexVector_NONE, + SparseIndexVector_MAX = SparseIndexVector_Uint8Vector +}; + +inline const SparseIndexVector (&EnumValuesSparseIndexVector())[4] { + static const SparseIndexVector values[] = { + SparseIndexVector_NONE, + SparseIndexVector_Int32Vector, + SparseIndexVector_Uint16Vector, + SparseIndexVector_Uint8Vector + }; + return values; +} + +inline const char * const *EnumNamesSparseIndexVector() { + static const char * const names[5] = { + "NONE", + "Int32Vector", + "Uint16Vector", + "Uint8Vector", + nullptr + }; + return names; +} + +inline const char *EnumNameSparseIndexVector(SparseIndexVector e) { + if (::flatbuffers::IsOutRange(e, SparseIndexVector_NONE, SparseIndexVector_Uint8Vector)) return ""; + const size_t index = static_cast(e); + return EnumNamesSparseIndexVector()[index]; +} + +template struct SparseIndexVectorTraits { + static const SparseIndexVector enum_value = SparseIndexVector_NONE; +}; + +template<> struct SparseIndexVectorTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Int32Vector; +}; + +template<> struct SparseIndexVectorTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Uint16Vector; +}; + +template<> struct SparseIndexVectorTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Uint8Vector; +}; + +template struct SparseIndexVectorUnionTraits { + static const SparseIndexVector enum_value = SparseIndexVector_NONE; +}; + +template<> struct SparseIndexVectorUnionTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Int32Vector; +}; + +template<> struct SparseIndexVectorUnionTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Uint16Vector; +}; + +template<> struct SparseIndexVectorUnionTraits { + static const SparseIndexVector enum_value = SparseIndexVector_Uint8Vector; +}; + +struct SparseIndexVectorUnion { + SparseIndexVector type; + void *value; + + SparseIndexVectorUnion() : type(SparseIndexVector_NONE), value(nullptr) {} + SparseIndexVectorUnion(SparseIndexVectorUnion&& u) FLATBUFFERS_NOEXCEPT : + type(SparseIndexVector_NONE), value(nullptr) + { std::swap(type, u.type); std::swap(value, u.value); } + SparseIndexVectorUnion(const SparseIndexVectorUnion &); + 
SparseIndexVectorUnion &operator=(const SparseIndexVectorUnion &u) + { SparseIndexVectorUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; } + SparseIndexVectorUnion &operator=(SparseIndexVectorUnion &&u) FLATBUFFERS_NOEXCEPT + { std::swap(type, u.type); std::swap(value, u.value); return *this; } + ~SparseIndexVectorUnion() { Reset(); } + + void Reset(); + + template + void Set(T&& val) { + typedef typename std::remove_reference::type RT; + Reset(); + type = SparseIndexVectorUnionTraits::enum_value; + if (type != SparseIndexVector_NONE) { + value = new RT(std::forward(val)); + } + } + + static void *UnPack(const void *obj, SparseIndexVector type, const ::flatbuffers::resolver_function_t *resolver); + ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr) const; + + tflite::Int32VectorT *AsInt32Vector() { + return type == SparseIndexVector_Int32Vector ? + reinterpret_cast(value) : nullptr; + } + const tflite::Int32VectorT *AsInt32Vector() const { + return type == SparseIndexVector_Int32Vector ? + reinterpret_cast(value) : nullptr; + } + tflite::Uint16VectorT *AsUint16Vector() { + return type == SparseIndexVector_Uint16Vector ? + reinterpret_cast(value) : nullptr; + } + const tflite::Uint16VectorT *AsUint16Vector() const { + return type == SparseIndexVector_Uint16Vector ? + reinterpret_cast(value) : nullptr; + } + tflite::Uint8VectorT *AsUint8Vector() { + return type == SparseIndexVector_Uint8Vector ? + reinterpret_cast(value) : nullptr; + } + const tflite::Uint8VectorT *AsUint8Vector() const { + return type == SparseIndexVector_Uint8Vector ? + reinterpret_cast(value) : nullptr; + } +}; + +bool VerifySparseIndexVector(::flatbuffers::Verifier &verifier, const void *obj, SparseIndexVector type); +bool VerifySparseIndexVectorVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum BuiltinOperator : int32_t { + BuiltinOperator_ADD = 0, + BuiltinOperator_AVERAGE_POOL_2D = 1, + BuiltinOperator_CONCATENATION = 2, + BuiltinOperator_CONV_2D = 3, + BuiltinOperator_DEPTHWISE_CONV_2D = 4, + BuiltinOperator_DEPTH_TO_SPACE = 5, + BuiltinOperator_DEQUANTIZE = 6, + BuiltinOperator_EMBEDDING_LOOKUP = 7, + BuiltinOperator_FLOOR = 8, + BuiltinOperator_FULLY_CONNECTED = 9, + BuiltinOperator_HASHTABLE_LOOKUP = 10, + BuiltinOperator_L2_NORMALIZATION = 11, + BuiltinOperator_L2_POOL_2D = 12, + BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION = 13, + BuiltinOperator_LOGISTIC = 14, + BuiltinOperator_LSH_PROJECTION = 15, + BuiltinOperator_LSTM = 16, + BuiltinOperator_MAX_POOL_2D = 17, + BuiltinOperator_MUL = 18, + BuiltinOperator_RELU = 19, + BuiltinOperator_RELU_N1_TO_1 = 20, + BuiltinOperator_RELU6 = 21, + BuiltinOperator_RESHAPE = 22, + BuiltinOperator_RESIZE_BILINEAR = 23, + BuiltinOperator_RNN = 24, + BuiltinOperator_SOFTMAX = 25, + BuiltinOperator_SPACE_TO_DEPTH = 26, + BuiltinOperator_SVDF = 27, + BuiltinOperator_TANH = 28, + BuiltinOperator_CONCAT_EMBEDDINGS = 29, + BuiltinOperator_SKIP_GRAM = 30, + BuiltinOperator_CALL = 31, + BuiltinOperator_CUSTOM = 32, + BuiltinOperator_EMBEDDING_LOOKUP_SPARSE = 33, + BuiltinOperator_PAD = 34, + BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN = 35, + BuiltinOperator_GATHER = 36, + BuiltinOperator_BATCH_TO_SPACE_ND = 37, + BuiltinOperator_SPACE_TO_BATCH_ND = 38, + BuiltinOperator_TRANSPOSE = 39, + BuiltinOperator_MEAN = 40, + BuiltinOperator_SUB = 41, + BuiltinOperator_DIV = 42, + 
BuiltinOperator_SQUEEZE = 43, + BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44, + BuiltinOperator_STRIDED_SLICE = 45, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46, + BuiltinOperator_EXP = 47, + BuiltinOperator_TOPK_V2 = 48, + BuiltinOperator_SPLIT = 49, + BuiltinOperator_LOG_SOFTMAX = 50, + BuiltinOperator_DELEGATE = 51, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM = 52, + BuiltinOperator_CAST = 53, + BuiltinOperator_PRELU = 54, + BuiltinOperator_MAXIMUM = 55, + BuiltinOperator_ARG_MAX = 56, + BuiltinOperator_MINIMUM = 57, + BuiltinOperator_LESS = 58, + BuiltinOperator_NEG = 59, + BuiltinOperator_PADV2 = 60, + BuiltinOperator_GREATER = 61, + BuiltinOperator_GREATER_EQUAL = 62, + BuiltinOperator_LESS_EQUAL = 63, + BuiltinOperator_SELECT = 64, + BuiltinOperator_SLICE = 65, + BuiltinOperator_SIN = 66, + BuiltinOperator_TRANSPOSE_CONV = 67, + BuiltinOperator_SPARSE_TO_DENSE = 68, + BuiltinOperator_TILE = 69, + BuiltinOperator_EXPAND_DIMS = 70, + BuiltinOperator_EQUAL = 71, + BuiltinOperator_NOT_EQUAL = 72, + BuiltinOperator_LOG = 73, + BuiltinOperator_SUM = 74, + BuiltinOperator_SQRT = 75, + BuiltinOperator_RSQRT = 76, + BuiltinOperator_SHAPE = 77, + BuiltinOperator_POW = 78, + BuiltinOperator_ARG_MIN = 79, + BuiltinOperator_FAKE_QUANT = 80, + BuiltinOperator_REDUCE_PROD = 81, + BuiltinOperator_REDUCE_MAX = 82, + BuiltinOperator_PACK = 83, + BuiltinOperator_LOGICAL_OR = 84, + BuiltinOperator_ONE_HOT = 85, + BuiltinOperator_LOGICAL_AND = 86, + BuiltinOperator_LOGICAL_NOT = 87, + BuiltinOperator_UNPACK = 88, + BuiltinOperator_REDUCE_MIN = 89, + BuiltinOperator_FLOOR_DIV = 90, + BuiltinOperator_REDUCE_ANY = 91, + BuiltinOperator_SQUARE = 92, + BuiltinOperator_ZEROS_LIKE = 93, + BuiltinOperator_FILL = 94, + BuiltinOperator_FLOOR_MOD = 95, + BuiltinOperator_RANGE = 96, + BuiltinOperator_RESIZE_NEAREST_NEIGHBOR = 97, + BuiltinOperator_LEAKY_RELU = 98, + BuiltinOperator_SQUARED_DIFFERENCE = 99, + BuiltinOperator_MIRROR_PAD = 100, + BuiltinOperator_ABS = 101, + BuiltinOperator_SPLIT_V = 102, + BuiltinOperator_UNIQUE = 103, + BuiltinOperator_CEIL = 104, + BuiltinOperator_REVERSE_V2 = 105, + BuiltinOperator_ADD_N = 106, + BuiltinOperator_GATHER_ND = 107, + BuiltinOperator_COS = 108, + BuiltinOperator_WHERE = 109, + BuiltinOperator_RANK = 110, + BuiltinOperator_ELU = 111, + BuiltinOperator_REVERSE_SEQUENCE = 112, + BuiltinOperator_MATRIX_DIAG = 113, + BuiltinOperator_QUANTIZE = 114, + BuiltinOperator_MATRIX_SET_DIAG = 115, + BuiltinOperator_ROUND = 116, + BuiltinOperator_HARD_SWISH = 117, + BuiltinOperator_IF = 118, + BuiltinOperator_WHILE = 119, + BuiltinOperator_NON_MAX_SUPPRESSION_V4 = 120, + BuiltinOperator_NON_MAX_SUPPRESSION_V5 = 121, + BuiltinOperator_SCATTER_ND = 122, + BuiltinOperator_SELECT_V2 = 123, + BuiltinOperator_DENSIFY = 124, + BuiltinOperator_SEGMENT_SUM = 125, + BuiltinOperator_BATCH_MATMUL = 126, + BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES = 127, + BuiltinOperator_CUMSUM = 128, + BuiltinOperator_CALL_ONCE = 129, + BuiltinOperator_BROADCAST_TO = 130, + BuiltinOperator_RFFT2D = 131, + BuiltinOperator_CONV_3D = 132, + BuiltinOperator_IMAG = 133, + BuiltinOperator_REAL = 134, + BuiltinOperator_COMPLEX_ABS = 135, + BuiltinOperator_HASHTABLE = 136, + BuiltinOperator_HASHTABLE_FIND = 137, + BuiltinOperator_HASHTABLE_IMPORT = 138, + BuiltinOperator_HASHTABLE_SIZE = 139, + BuiltinOperator_REDUCE_ALL = 140, + BuiltinOperator_CONV_3D_TRANSPOSE = 141, + BuiltinOperator_VAR_HANDLE = 142, + BuiltinOperator_READ_VARIABLE = 143, + BuiltinOperator_ASSIGN_VARIABLE = 144, + 
BuiltinOperator_BROADCAST_ARGS = 145, + BuiltinOperator_RANDOM_STANDARD_NORMAL = 146, + BuiltinOperator_BUCKETIZE = 147, + BuiltinOperator_RANDOM_UNIFORM = 148, + BuiltinOperator_MULTINOMIAL = 149, + BuiltinOperator_GELU = 150, + BuiltinOperator_DYNAMIC_UPDATE_SLICE = 151, + BuiltinOperator_RELU_0_TO_1 = 152, + BuiltinOperator_UNSORTED_SEGMENT_PROD = 153, + BuiltinOperator_UNSORTED_SEGMENT_MAX = 154, + BuiltinOperator_UNSORTED_SEGMENT_SUM = 155, + BuiltinOperator_ATAN2 = 156, + BuiltinOperator_UNSORTED_SEGMENT_MIN = 157, + BuiltinOperator_SIGN = 158, + BuiltinOperator_BITCAST = 159, + BuiltinOperator_BITWISE_XOR = 160, + BuiltinOperator_RIGHT_SHIFT = 161, + BuiltinOperator_STABLEHLO_LOGISTIC = 162, + BuiltinOperator_STABLEHLO_ADD = 163, + BuiltinOperator_STABLEHLO_DIVIDE = 164, + BuiltinOperator_STABLEHLO_MULTIPLY = 165, + BuiltinOperator_STABLEHLO_MAXIMUM = 166, + BuiltinOperator_STABLEHLO_RESHAPE = 167, + BuiltinOperator_STABLEHLO_CLAMP = 168, + BuiltinOperator_STABLEHLO_CONCATENATE = 169, + BuiltinOperator_STABLEHLO_BROADCAST_IN_DIM = 170, + BuiltinOperator_STABLEHLO_CONVOLUTION = 171, + BuiltinOperator_STABLEHLO_SLICE = 172, + BuiltinOperator_STABLEHLO_CUSTOM_CALL = 173, + BuiltinOperator_STABLEHLO_REDUCE = 174, + BuiltinOperator_STABLEHLO_ABS = 175, + BuiltinOperator_STABLEHLO_AND = 176, + BuiltinOperator_STABLEHLO_COSINE = 177, + BuiltinOperator_STABLEHLO_EXPONENTIAL = 178, + BuiltinOperator_STABLEHLO_FLOOR = 179, + BuiltinOperator_STABLEHLO_LOG = 180, + BuiltinOperator_STABLEHLO_MINIMUM = 181, + BuiltinOperator_STABLEHLO_NEGATE = 182, + BuiltinOperator_STABLEHLO_OR = 183, + BuiltinOperator_STABLEHLO_POWER = 184, + BuiltinOperator_STABLEHLO_REMAINDER = 185, + BuiltinOperator_STABLEHLO_RSQRT = 186, + BuiltinOperator_STABLEHLO_SELECT = 187, + BuiltinOperator_STABLEHLO_SUBTRACT = 188, + BuiltinOperator_STABLEHLO_TANH = 189, + BuiltinOperator_STABLEHLO_SCATTER = 190, + BuiltinOperator_STABLEHLO_COMPARE = 191, + BuiltinOperator_STABLEHLO_CONVERT = 192, + BuiltinOperator_STABLEHLO_DYNAMIC_SLICE = 193, + BuiltinOperator_STABLEHLO_DYNAMIC_UPDATE_SLICE = 194, + BuiltinOperator_STABLEHLO_PAD = 195, + BuiltinOperator_STABLEHLO_IOTA = 196, + BuiltinOperator_STABLEHLO_DOT_GENERAL = 197, + BuiltinOperator_STABLEHLO_REDUCE_WINDOW = 198, + BuiltinOperator_STABLEHLO_SORT = 199, + BuiltinOperator_STABLEHLO_WHILE = 200, + BuiltinOperator_STABLEHLO_GATHER = 201, + BuiltinOperator_STABLEHLO_TRANSPOSE = 202, + BuiltinOperator_DILATE = 203, + BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR = 204, + BuiltinOperator_REDUCE_WINDOW = 205, + BuiltinOperator_STABLEHLO_COMPOSITE = 206, + BuiltinOperator_STABLEHLO_SHIFT_LEFT = 207, + BuiltinOperator_STABLEHLO_CBRT = 208, + BuiltinOperator_STABLEHLO_CASE = 209, + BuiltinOperator_MIN = BuiltinOperator_ADD, + BuiltinOperator_MAX = BuiltinOperator_STABLEHLO_CASE +}; + +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[210] { + static const BuiltinOperator values[] = { + BuiltinOperator_ADD, + BuiltinOperator_AVERAGE_POOL_2D, + BuiltinOperator_CONCATENATION, + BuiltinOperator_CONV_2D, + BuiltinOperator_DEPTHWISE_CONV_2D, + BuiltinOperator_DEPTH_TO_SPACE, + BuiltinOperator_DEQUANTIZE, + BuiltinOperator_EMBEDDING_LOOKUP, + BuiltinOperator_FLOOR, + BuiltinOperator_FULLY_CONNECTED, + BuiltinOperator_HASHTABLE_LOOKUP, + BuiltinOperator_L2_NORMALIZATION, + BuiltinOperator_L2_POOL_2D, + BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, + BuiltinOperator_LOGISTIC, + BuiltinOperator_LSH_PROJECTION, + BuiltinOperator_LSTM, + BuiltinOperator_MAX_POOL_2D, + BuiltinOperator_MUL, 
+ BuiltinOperator_RELU, + BuiltinOperator_RELU_N1_TO_1, + BuiltinOperator_RELU6, + BuiltinOperator_RESHAPE, + BuiltinOperator_RESIZE_BILINEAR, + BuiltinOperator_RNN, + BuiltinOperator_SOFTMAX, + BuiltinOperator_SPACE_TO_DEPTH, + BuiltinOperator_SVDF, + BuiltinOperator_TANH, + BuiltinOperator_CONCAT_EMBEDDINGS, + BuiltinOperator_SKIP_GRAM, + BuiltinOperator_CALL, + BuiltinOperator_CUSTOM, + BuiltinOperator_EMBEDDING_LOOKUP_SPARSE, + BuiltinOperator_PAD, + BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, + BuiltinOperator_GATHER, + BuiltinOperator_BATCH_TO_SPACE_ND, + BuiltinOperator_SPACE_TO_BATCH_ND, + BuiltinOperator_TRANSPOSE, + BuiltinOperator_MEAN, + BuiltinOperator_SUB, + BuiltinOperator_DIV, + BuiltinOperator_SQUEEZE, + BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, + BuiltinOperator_STRIDED_SLICE, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, + BuiltinOperator_EXP, + BuiltinOperator_TOPK_V2, + BuiltinOperator_SPLIT, + BuiltinOperator_LOG_SOFTMAX, + BuiltinOperator_DELEGATE, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, + BuiltinOperator_CAST, + BuiltinOperator_PRELU, + BuiltinOperator_MAXIMUM, + BuiltinOperator_ARG_MAX, + BuiltinOperator_MINIMUM, + BuiltinOperator_LESS, + BuiltinOperator_NEG, + BuiltinOperator_PADV2, + BuiltinOperator_GREATER, + BuiltinOperator_GREATER_EQUAL, + BuiltinOperator_LESS_EQUAL, + BuiltinOperator_SELECT, + BuiltinOperator_SLICE, + BuiltinOperator_SIN, + BuiltinOperator_TRANSPOSE_CONV, + BuiltinOperator_SPARSE_TO_DENSE, + BuiltinOperator_TILE, + BuiltinOperator_EXPAND_DIMS, + BuiltinOperator_EQUAL, + BuiltinOperator_NOT_EQUAL, + BuiltinOperator_LOG, + BuiltinOperator_SUM, + BuiltinOperator_SQRT, + BuiltinOperator_RSQRT, + BuiltinOperator_SHAPE, + BuiltinOperator_POW, + BuiltinOperator_ARG_MIN, + BuiltinOperator_FAKE_QUANT, + BuiltinOperator_REDUCE_PROD, + BuiltinOperator_REDUCE_MAX, + BuiltinOperator_PACK, + BuiltinOperator_LOGICAL_OR, + BuiltinOperator_ONE_HOT, + BuiltinOperator_LOGICAL_AND, + BuiltinOperator_LOGICAL_NOT, + BuiltinOperator_UNPACK, + BuiltinOperator_REDUCE_MIN, + BuiltinOperator_FLOOR_DIV, + BuiltinOperator_REDUCE_ANY, + BuiltinOperator_SQUARE, + BuiltinOperator_ZEROS_LIKE, + BuiltinOperator_FILL, + BuiltinOperator_FLOOR_MOD, + BuiltinOperator_RANGE, + BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + BuiltinOperator_LEAKY_RELU, + BuiltinOperator_SQUARED_DIFFERENCE, + BuiltinOperator_MIRROR_PAD, + BuiltinOperator_ABS, + BuiltinOperator_SPLIT_V, + BuiltinOperator_UNIQUE, + BuiltinOperator_CEIL, + BuiltinOperator_REVERSE_V2, + BuiltinOperator_ADD_N, + BuiltinOperator_GATHER_ND, + BuiltinOperator_COS, + BuiltinOperator_WHERE, + BuiltinOperator_RANK, + BuiltinOperator_ELU, + BuiltinOperator_REVERSE_SEQUENCE, + BuiltinOperator_MATRIX_DIAG, + BuiltinOperator_QUANTIZE, + BuiltinOperator_MATRIX_SET_DIAG, + BuiltinOperator_ROUND, + BuiltinOperator_HARD_SWISH, + BuiltinOperator_IF, + BuiltinOperator_WHILE, + BuiltinOperator_NON_MAX_SUPPRESSION_V4, + BuiltinOperator_NON_MAX_SUPPRESSION_V5, + BuiltinOperator_SCATTER_ND, + BuiltinOperator_SELECT_V2, + BuiltinOperator_DENSIFY, + BuiltinOperator_SEGMENT_SUM, + BuiltinOperator_BATCH_MATMUL, + BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES, + BuiltinOperator_CUMSUM, + BuiltinOperator_CALL_ONCE, + BuiltinOperator_BROADCAST_TO, + BuiltinOperator_RFFT2D, + BuiltinOperator_CONV_3D, + BuiltinOperator_IMAG, + BuiltinOperator_REAL, + BuiltinOperator_COMPLEX_ABS, + BuiltinOperator_HASHTABLE, + BuiltinOperator_HASHTABLE_FIND, + BuiltinOperator_HASHTABLE_IMPORT, + BuiltinOperator_HASHTABLE_SIZE, + BuiltinOperator_REDUCE_ALL, 
+ BuiltinOperator_CONV_3D_TRANSPOSE, + BuiltinOperator_VAR_HANDLE, + BuiltinOperator_READ_VARIABLE, + BuiltinOperator_ASSIGN_VARIABLE, + BuiltinOperator_BROADCAST_ARGS, + BuiltinOperator_RANDOM_STANDARD_NORMAL, + BuiltinOperator_BUCKETIZE, + BuiltinOperator_RANDOM_UNIFORM, + BuiltinOperator_MULTINOMIAL, + BuiltinOperator_GELU, + BuiltinOperator_DYNAMIC_UPDATE_SLICE, + BuiltinOperator_RELU_0_TO_1, + BuiltinOperator_UNSORTED_SEGMENT_PROD, + BuiltinOperator_UNSORTED_SEGMENT_MAX, + BuiltinOperator_UNSORTED_SEGMENT_SUM, + BuiltinOperator_ATAN2, + BuiltinOperator_UNSORTED_SEGMENT_MIN, + BuiltinOperator_SIGN, + BuiltinOperator_BITCAST, + BuiltinOperator_BITWISE_XOR, + BuiltinOperator_RIGHT_SHIFT, + BuiltinOperator_STABLEHLO_LOGISTIC, + BuiltinOperator_STABLEHLO_ADD, + BuiltinOperator_STABLEHLO_DIVIDE, + BuiltinOperator_STABLEHLO_MULTIPLY, + BuiltinOperator_STABLEHLO_MAXIMUM, + BuiltinOperator_STABLEHLO_RESHAPE, + BuiltinOperator_STABLEHLO_CLAMP, + BuiltinOperator_STABLEHLO_CONCATENATE, + BuiltinOperator_STABLEHLO_BROADCAST_IN_DIM, + BuiltinOperator_STABLEHLO_CONVOLUTION, + BuiltinOperator_STABLEHLO_SLICE, + BuiltinOperator_STABLEHLO_CUSTOM_CALL, + BuiltinOperator_STABLEHLO_REDUCE, + BuiltinOperator_STABLEHLO_ABS, + BuiltinOperator_STABLEHLO_AND, + BuiltinOperator_STABLEHLO_COSINE, + BuiltinOperator_STABLEHLO_EXPONENTIAL, + BuiltinOperator_STABLEHLO_FLOOR, + BuiltinOperator_STABLEHLO_LOG, + BuiltinOperator_STABLEHLO_MINIMUM, + BuiltinOperator_STABLEHLO_NEGATE, + BuiltinOperator_STABLEHLO_OR, + BuiltinOperator_STABLEHLO_POWER, + BuiltinOperator_STABLEHLO_REMAINDER, + BuiltinOperator_STABLEHLO_RSQRT, + BuiltinOperator_STABLEHLO_SELECT, + BuiltinOperator_STABLEHLO_SUBTRACT, + BuiltinOperator_STABLEHLO_TANH, + BuiltinOperator_STABLEHLO_SCATTER, + BuiltinOperator_STABLEHLO_COMPARE, + BuiltinOperator_STABLEHLO_CONVERT, + BuiltinOperator_STABLEHLO_DYNAMIC_SLICE, + BuiltinOperator_STABLEHLO_DYNAMIC_UPDATE_SLICE, + BuiltinOperator_STABLEHLO_PAD, + BuiltinOperator_STABLEHLO_IOTA, + BuiltinOperator_STABLEHLO_DOT_GENERAL, + BuiltinOperator_STABLEHLO_REDUCE_WINDOW, + BuiltinOperator_STABLEHLO_SORT, + BuiltinOperator_STABLEHLO_WHILE, + BuiltinOperator_STABLEHLO_GATHER, + BuiltinOperator_STABLEHLO_TRANSPOSE, + BuiltinOperator_DILATE, + BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR, + BuiltinOperator_REDUCE_WINDOW, + BuiltinOperator_STABLEHLO_COMPOSITE, + BuiltinOperator_STABLEHLO_SHIFT_LEFT, + BuiltinOperator_STABLEHLO_CBRT, + BuiltinOperator_STABLEHLO_CASE + }; + return values; +} + +inline const char * const *EnumNamesBuiltinOperator() { + static const char * const names[211] = { + "ADD", + "AVERAGE_POOL_2D", + "CONCATENATION", + "CONV_2D", + "DEPTHWISE_CONV_2D", + "DEPTH_TO_SPACE", + "DEQUANTIZE", + "EMBEDDING_LOOKUP", + "FLOOR", + "FULLY_CONNECTED", + "HASHTABLE_LOOKUP", + "L2_NORMALIZATION", + "L2_POOL_2D", + "LOCAL_RESPONSE_NORMALIZATION", + "LOGISTIC", + "LSH_PROJECTION", + "LSTM", + "MAX_POOL_2D", + "MUL", + "RELU", + "RELU_N1_TO_1", + "RELU6", + "RESHAPE", + "RESIZE_BILINEAR", + "RNN", + "SOFTMAX", + "SPACE_TO_DEPTH", + "SVDF", + "TANH", + "CONCAT_EMBEDDINGS", + "SKIP_GRAM", + "CALL", + "CUSTOM", + "EMBEDDING_LOOKUP_SPARSE", + "PAD", + "UNIDIRECTIONAL_SEQUENCE_RNN", + "GATHER", + "BATCH_TO_SPACE_ND", + "SPACE_TO_BATCH_ND", + "TRANSPOSE", + "MEAN", + "SUB", + "DIV", + "SQUEEZE", + "UNIDIRECTIONAL_SEQUENCE_LSTM", + "STRIDED_SLICE", + "BIDIRECTIONAL_SEQUENCE_RNN", + "EXP", + "TOPK_V2", + "SPLIT", + "LOG_SOFTMAX", + "DELEGATE", + "BIDIRECTIONAL_SEQUENCE_LSTM", + "CAST", + "PRELU", + "MAXIMUM", + "ARG_MAX", + 
"MINIMUM", + "LESS", + "NEG", + "PADV2", + "GREATER", + "GREATER_EQUAL", + "LESS_EQUAL", + "SELECT", + "SLICE", + "SIN", + "TRANSPOSE_CONV", + "SPARSE_TO_DENSE", + "TILE", + "EXPAND_DIMS", + "EQUAL", + "NOT_EQUAL", + "LOG", + "SUM", + "SQRT", + "RSQRT", + "SHAPE", + "POW", + "ARG_MIN", + "FAKE_QUANT", + "REDUCE_PROD", + "REDUCE_MAX", + "PACK", + "LOGICAL_OR", + "ONE_HOT", + "LOGICAL_AND", + "LOGICAL_NOT", + "UNPACK", + "REDUCE_MIN", + "FLOOR_DIV", + "REDUCE_ANY", + "SQUARE", + "ZEROS_LIKE", + "FILL", + "FLOOR_MOD", + "RANGE", + "RESIZE_NEAREST_NEIGHBOR", + "LEAKY_RELU", + "SQUARED_DIFFERENCE", + "MIRROR_PAD", + "ABS", + "SPLIT_V", + "UNIQUE", + "CEIL", + "REVERSE_V2", + "ADD_N", + "GATHER_ND", + "COS", + "WHERE", + "RANK", + "ELU", + "REVERSE_SEQUENCE", + "MATRIX_DIAG", + "QUANTIZE", + "MATRIX_SET_DIAG", + "ROUND", + "HARD_SWISH", + "IF", + "WHILE", + "NON_MAX_SUPPRESSION_V4", + "NON_MAX_SUPPRESSION_V5", + "SCATTER_ND", + "SELECT_V2", + "DENSIFY", + "SEGMENT_SUM", + "BATCH_MATMUL", + "PLACEHOLDER_FOR_GREATER_OP_CODES", + "CUMSUM", + "CALL_ONCE", + "BROADCAST_TO", + "RFFT2D", + "CONV_3D", + "IMAG", + "REAL", + "COMPLEX_ABS", + "HASHTABLE", + "HASHTABLE_FIND", + "HASHTABLE_IMPORT", + "HASHTABLE_SIZE", + "REDUCE_ALL", + "CONV_3D_TRANSPOSE", + "VAR_HANDLE", + "READ_VARIABLE", + "ASSIGN_VARIABLE", + "BROADCAST_ARGS", + "RANDOM_STANDARD_NORMAL", + "BUCKETIZE", + "RANDOM_UNIFORM", + "MULTINOMIAL", + "GELU", + "DYNAMIC_UPDATE_SLICE", + "RELU_0_TO_1", + "UNSORTED_SEGMENT_PROD", + "UNSORTED_SEGMENT_MAX", + "UNSORTED_SEGMENT_SUM", + "ATAN2", + "UNSORTED_SEGMENT_MIN", + "SIGN", + "BITCAST", + "BITWISE_XOR", + "RIGHT_SHIFT", + "STABLEHLO_LOGISTIC", + "STABLEHLO_ADD", + "STABLEHLO_DIVIDE", + "STABLEHLO_MULTIPLY", + "STABLEHLO_MAXIMUM", + "STABLEHLO_RESHAPE", + "STABLEHLO_CLAMP", + "STABLEHLO_CONCATENATE", + "STABLEHLO_BROADCAST_IN_DIM", + "STABLEHLO_CONVOLUTION", + "STABLEHLO_SLICE", + "STABLEHLO_CUSTOM_CALL", + "STABLEHLO_REDUCE", + "STABLEHLO_ABS", + "STABLEHLO_AND", + "STABLEHLO_COSINE", + "STABLEHLO_EXPONENTIAL", + "STABLEHLO_FLOOR", + "STABLEHLO_LOG", + "STABLEHLO_MINIMUM", + "STABLEHLO_NEGATE", + "STABLEHLO_OR", + "STABLEHLO_POWER", + "STABLEHLO_REMAINDER", + "STABLEHLO_RSQRT", + "STABLEHLO_SELECT", + "STABLEHLO_SUBTRACT", + "STABLEHLO_TANH", + "STABLEHLO_SCATTER", + "STABLEHLO_COMPARE", + "STABLEHLO_CONVERT", + "STABLEHLO_DYNAMIC_SLICE", + "STABLEHLO_DYNAMIC_UPDATE_SLICE", + "STABLEHLO_PAD", + "STABLEHLO_IOTA", + "STABLEHLO_DOT_GENERAL", + "STABLEHLO_REDUCE_WINDOW", + "STABLEHLO_SORT", + "STABLEHLO_WHILE", + "STABLEHLO_GATHER", + "STABLEHLO_TRANSPOSE", + "DILATE", + "STABLEHLO_RNG_BIT_GENERATOR", + "REDUCE_WINDOW", + "STABLEHLO_COMPOSITE", + "STABLEHLO_SHIFT_LEFT", + "STABLEHLO_CBRT", + "STABLEHLO_CASE", + nullptr + }; + return names; +} + +inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { + if (::flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_STABLEHLO_CASE)) return ""; + const size_t index = static_cast(e); + return EnumNamesBuiltinOperator()[index]; +} + +enum BuiltinOptions : uint8_t { + BuiltinOptions_NONE = 0, + BuiltinOptions_Conv2DOptions = 1, + BuiltinOptions_DepthwiseConv2DOptions = 2, + BuiltinOptions_ConcatEmbeddingsOptions = 3, + BuiltinOptions_LSHProjectionOptions = 4, + BuiltinOptions_Pool2DOptions = 5, + BuiltinOptions_SVDFOptions = 6, + BuiltinOptions_RNNOptions = 7, + BuiltinOptions_FullyConnectedOptions = 8, + BuiltinOptions_SoftmaxOptions = 9, + BuiltinOptions_ConcatenationOptions = 10, + BuiltinOptions_AddOptions = 11, + 
BuiltinOptions_L2NormOptions = 12, + BuiltinOptions_LocalResponseNormalizationOptions = 13, + BuiltinOptions_LSTMOptions = 14, + BuiltinOptions_ResizeBilinearOptions = 15, + BuiltinOptions_CallOptions = 16, + BuiltinOptions_ReshapeOptions = 17, + BuiltinOptions_SkipGramOptions = 18, + BuiltinOptions_SpaceToDepthOptions = 19, + BuiltinOptions_EmbeddingLookupSparseOptions = 20, + BuiltinOptions_MulOptions = 21, + BuiltinOptions_PadOptions = 22, + BuiltinOptions_GatherOptions = 23, + BuiltinOptions_BatchToSpaceNDOptions = 24, + BuiltinOptions_SpaceToBatchNDOptions = 25, + BuiltinOptions_TransposeOptions = 26, + BuiltinOptions_ReducerOptions = 27, + BuiltinOptions_SubOptions = 28, + BuiltinOptions_DivOptions = 29, + BuiltinOptions_SqueezeOptions = 30, + BuiltinOptions_SequenceRNNOptions = 31, + BuiltinOptions_StridedSliceOptions = 32, + BuiltinOptions_ExpOptions = 33, + BuiltinOptions_TopKV2Options = 34, + BuiltinOptions_SplitOptions = 35, + BuiltinOptions_LogSoftmaxOptions = 36, + BuiltinOptions_CastOptions = 37, + BuiltinOptions_DequantizeOptions = 38, + BuiltinOptions_MaximumMinimumOptions = 39, + BuiltinOptions_ArgMaxOptions = 40, + BuiltinOptions_LessOptions = 41, + BuiltinOptions_NegOptions = 42, + BuiltinOptions_PadV2Options = 43, + BuiltinOptions_GreaterOptions = 44, + BuiltinOptions_GreaterEqualOptions = 45, + BuiltinOptions_LessEqualOptions = 46, + BuiltinOptions_SelectOptions = 47, + BuiltinOptions_SliceOptions = 48, + BuiltinOptions_TransposeConvOptions = 49, + BuiltinOptions_SparseToDenseOptions = 50, + BuiltinOptions_TileOptions = 51, + BuiltinOptions_ExpandDimsOptions = 52, + BuiltinOptions_EqualOptions = 53, + BuiltinOptions_NotEqualOptions = 54, + BuiltinOptions_ShapeOptions = 55, + BuiltinOptions_PowOptions = 56, + BuiltinOptions_ArgMinOptions = 57, + BuiltinOptions_FakeQuantOptions = 58, + BuiltinOptions_PackOptions = 59, + BuiltinOptions_LogicalOrOptions = 60, + BuiltinOptions_OneHotOptions = 61, + BuiltinOptions_LogicalAndOptions = 62, + BuiltinOptions_LogicalNotOptions = 63, + BuiltinOptions_UnpackOptions = 64, + BuiltinOptions_FloorDivOptions = 65, + BuiltinOptions_SquareOptions = 66, + BuiltinOptions_ZerosLikeOptions = 67, + BuiltinOptions_FillOptions = 68, + BuiltinOptions_BidirectionalSequenceLSTMOptions = 69, + BuiltinOptions_BidirectionalSequenceRNNOptions = 70, + BuiltinOptions_UnidirectionalSequenceLSTMOptions = 71, + BuiltinOptions_FloorModOptions = 72, + BuiltinOptions_RangeOptions = 73, + BuiltinOptions_ResizeNearestNeighborOptions = 74, + BuiltinOptions_LeakyReluOptions = 75, + BuiltinOptions_SquaredDifferenceOptions = 76, + BuiltinOptions_MirrorPadOptions = 77, + BuiltinOptions_AbsOptions = 78, + BuiltinOptions_SplitVOptions = 79, + BuiltinOptions_UniqueOptions = 80, + BuiltinOptions_ReverseV2Options = 81, + BuiltinOptions_AddNOptions = 82, + BuiltinOptions_GatherNdOptions = 83, + BuiltinOptions_CosOptions = 84, + BuiltinOptions_WhereOptions = 85, + BuiltinOptions_RankOptions = 86, + BuiltinOptions_ReverseSequenceOptions = 87, + BuiltinOptions_MatrixDiagOptions = 88, + BuiltinOptions_QuantizeOptions = 89, + BuiltinOptions_MatrixSetDiagOptions = 90, + BuiltinOptions_HardSwishOptions = 91, + BuiltinOptions_IfOptions = 92, + BuiltinOptions_WhileOptions = 93, + BuiltinOptions_DepthToSpaceOptions = 94, + BuiltinOptions_NonMaxSuppressionV4Options = 95, + BuiltinOptions_NonMaxSuppressionV5Options = 96, + BuiltinOptions_ScatterNdOptions = 97, + BuiltinOptions_SelectV2Options = 98, + BuiltinOptions_DensifyOptions = 99, + BuiltinOptions_SegmentSumOptions = 100, + 
BuiltinOptions_BatchMatMulOptions = 101, + BuiltinOptions_CumsumOptions = 102, + BuiltinOptions_CallOnceOptions = 103, + BuiltinOptions_BroadcastToOptions = 104, + BuiltinOptions_Rfft2dOptions = 105, + BuiltinOptions_Conv3DOptions = 106, + BuiltinOptions_HashtableOptions = 107, + BuiltinOptions_HashtableFindOptions = 108, + BuiltinOptions_HashtableImportOptions = 109, + BuiltinOptions_HashtableSizeOptions = 110, + BuiltinOptions_VarHandleOptions = 111, + BuiltinOptions_ReadVariableOptions = 112, + BuiltinOptions_AssignVariableOptions = 113, + BuiltinOptions_RandomOptions = 114, + BuiltinOptions_BucketizeOptions = 115, + BuiltinOptions_GeluOptions = 116, + BuiltinOptions_DynamicUpdateSliceOptions = 117, + BuiltinOptions_UnsortedSegmentProdOptions = 118, + BuiltinOptions_UnsortedSegmentMaxOptions = 119, + BuiltinOptions_UnsortedSegmentMinOptions = 120, + BuiltinOptions_UnsortedSegmentSumOptions = 121, + BuiltinOptions_ATan2Options = 122, + BuiltinOptions_SignOptions = 123, + BuiltinOptions_BitcastOptions = 124, + BuiltinOptions_BitwiseXorOptions = 125, + BuiltinOptions_RightShiftOptions = 126, + BuiltinOptions_MIN = BuiltinOptions_NONE, + BuiltinOptions_MAX = BuiltinOptions_RightShiftOptions +}; + +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[127] { + static const BuiltinOptions values[] = { + BuiltinOptions_NONE, + BuiltinOptions_Conv2DOptions, + BuiltinOptions_DepthwiseConv2DOptions, + BuiltinOptions_ConcatEmbeddingsOptions, + BuiltinOptions_LSHProjectionOptions, + BuiltinOptions_Pool2DOptions, + BuiltinOptions_SVDFOptions, + BuiltinOptions_RNNOptions, + BuiltinOptions_FullyConnectedOptions, + BuiltinOptions_SoftmaxOptions, + BuiltinOptions_ConcatenationOptions, + BuiltinOptions_AddOptions, + BuiltinOptions_L2NormOptions, + BuiltinOptions_LocalResponseNormalizationOptions, + BuiltinOptions_LSTMOptions, + BuiltinOptions_ResizeBilinearOptions, + BuiltinOptions_CallOptions, + BuiltinOptions_ReshapeOptions, + BuiltinOptions_SkipGramOptions, + BuiltinOptions_SpaceToDepthOptions, + BuiltinOptions_EmbeddingLookupSparseOptions, + BuiltinOptions_MulOptions, + BuiltinOptions_PadOptions, + BuiltinOptions_GatherOptions, + BuiltinOptions_BatchToSpaceNDOptions, + BuiltinOptions_SpaceToBatchNDOptions, + BuiltinOptions_TransposeOptions, + BuiltinOptions_ReducerOptions, + BuiltinOptions_SubOptions, + BuiltinOptions_DivOptions, + BuiltinOptions_SqueezeOptions, + BuiltinOptions_SequenceRNNOptions, + BuiltinOptions_StridedSliceOptions, + BuiltinOptions_ExpOptions, + BuiltinOptions_TopKV2Options, + BuiltinOptions_SplitOptions, + BuiltinOptions_LogSoftmaxOptions, + BuiltinOptions_CastOptions, + BuiltinOptions_DequantizeOptions, + BuiltinOptions_MaximumMinimumOptions, + BuiltinOptions_ArgMaxOptions, + BuiltinOptions_LessOptions, + BuiltinOptions_NegOptions, + BuiltinOptions_PadV2Options, + BuiltinOptions_GreaterOptions, + BuiltinOptions_GreaterEqualOptions, + BuiltinOptions_LessEqualOptions, + BuiltinOptions_SelectOptions, + BuiltinOptions_SliceOptions, + BuiltinOptions_TransposeConvOptions, + BuiltinOptions_SparseToDenseOptions, + BuiltinOptions_TileOptions, + BuiltinOptions_ExpandDimsOptions, + BuiltinOptions_EqualOptions, + BuiltinOptions_NotEqualOptions, + BuiltinOptions_ShapeOptions, + BuiltinOptions_PowOptions, + BuiltinOptions_ArgMinOptions, + BuiltinOptions_FakeQuantOptions, + BuiltinOptions_PackOptions, + BuiltinOptions_LogicalOrOptions, + BuiltinOptions_OneHotOptions, + BuiltinOptions_LogicalAndOptions, + BuiltinOptions_LogicalNotOptions, + BuiltinOptions_UnpackOptions, + 
BuiltinOptions_FloorDivOptions, + BuiltinOptions_SquareOptions, + BuiltinOptions_ZerosLikeOptions, + BuiltinOptions_FillOptions, + BuiltinOptions_BidirectionalSequenceLSTMOptions, + BuiltinOptions_BidirectionalSequenceRNNOptions, + BuiltinOptions_UnidirectionalSequenceLSTMOptions, + BuiltinOptions_FloorModOptions, + BuiltinOptions_RangeOptions, + BuiltinOptions_ResizeNearestNeighborOptions, + BuiltinOptions_LeakyReluOptions, + BuiltinOptions_SquaredDifferenceOptions, + BuiltinOptions_MirrorPadOptions, + BuiltinOptions_AbsOptions, + BuiltinOptions_SplitVOptions, + BuiltinOptions_UniqueOptions, + BuiltinOptions_ReverseV2Options, + BuiltinOptions_AddNOptions, + BuiltinOptions_GatherNdOptions, + BuiltinOptions_CosOptions, + BuiltinOptions_WhereOptions, + BuiltinOptions_RankOptions, + BuiltinOptions_ReverseSequenceOptions, + BuiltinOptions_MatrixDiagOptions, + BuiltinOptions_QuantizeOptions, + BuiltinOptions_MatrixSetDiagOptions, + BuiltinOptions_HardSwishOptions, + BuiltinOptions_IfOptions, + BuiltinOptions_WhileOptions, + BuiltinOptions_DepthToSpaceOptions, + BuiltinOptions_NonMaxSuppressionV4Options, + BuiltinOptions_NonMaxSuppressionV5Options, + BuiltinOptions_ScatterNdOptions, + BuiltinOptions_SelectV2Options, + BuiltinOptions_DensifyOptions, + BuiltinOptions_SegmentSumOptions, + BuiltinOptions_BatchMatMulOptions, + BuiltinOptions_CumsumOptions, + BuiltinOptions_CallOnceOptions, + BuiltinOptions_BroadcastToOptions, + BuiltinOptions_Rfft2dOptions, + BuiltinOptions_Conv3DOptions, + BuiltinOptions_HashtableOptions, + BuiltinOptions_HashtableFindOptions, + BuiltinOptions_HashtableImportOptions, + BuiltinOptions_HashtableSizeOptions, + BuiltinOptions_VarHandleOptions, + BuiltinOptions_ReadVariableOptions, + BuiltinOptions_AssignVariableOptions, + BuiltinOptions_RandomOptions, + BuiltinOptions_BucketizeOptions, + BuiltinOptions_GeluOptions, + BuiltinOptions_DynamicUpdateSliceOptions, + BuiltinOptions_UnsortedSegmentProdOptions, + BuiltinOptions_UnsortedSegmentMaxOptions, + BuiltinOptions_UnsortedSegmentMinOptions, + BuiltinOptions_UnsortedSegmentSumOptions, + BuiltinOptions_ATan2Options, + BuiltinOptions_SignOptions, + BuiltinOptions_BitcastOptions, + BuiltinOptions_BitwiseXorOptions, + BuiltinOptions_RightShiftOptions + }; + return values; +} + +inline const char * const *EnumNamesBuiltinOptions() { + static const char * const names[128] = { + "NONE", + "Conv2DOptions", + "DepthwiseConv2DOptions", + "ConcatEmbeddingsOptions", + "LSHProjectionOptions", + "Pool2DOptions", + "SVDFOptions", + "RNNOptions", + "FullyConnectedOptions", + "SoftmaxOptions", + "ConcatenationOptions", + "AddOptions", + "L2NormOptions", + "LocalResponseNormalizationOptions", + "LSTMOptions", + "ResizeBilinearOptions", + "CallOptions", + "ReshapeOptions", + "SkipGramOptions", + "SpaceToDepthOptions", + "EmbeddingLookupSparseOptions", + "MulOptions", + "PadOptions", + "GatherOptions", + "BatchToSpaceNDOptions", + "SpaceToBatchNDOptions", + "TransposeOptions", + "ReducerOptions", + "SubOptions", + "DivOptions", + "SqueezeOptions", + "SequenceRNNOptions", + "StridedSliceOptions", + "ExpOptions", + "TopKV2Options", + "SplitOptions", + "LogSoftmaxOptions", + "CastOptions", + "DequantizeOptions", + "MaximumMinimumOptions", + "ArgMaxOptions", + "LessOptions", + "NegOptions", + "PadV2Options", + "GreaterOptions", + "GreaterEqualOptions", + "LessEqualOptions", + "SelectOptions", + "SliceOptions", + "TransposeConvOptions", + "SparseToDenseOptions", + "TileOptions", + "ExpandDimsOptions", + "EqualOptions", + "NotEqualOptions", + 
"ShapeOptions", + "PowOptions", + "ArgMinOptions", + "FakeQuantOptions", + "PackOptions", + "LogicalOrOptions", + "OneHotOptions", + "LogicalAndOptions", + "LogicalNotOptions", + "UnpackOptions", + "FloorDivOptions", + "SquareOptions", + "ZerosLikeOptions", + "FillOptions", + "BidirectionalSequenceLSTMOptions", + "BidirectionalSequenceRNNOptions", + "UnidirectionalSequenceLSTMOptions", + "FloorModOptions", + "RangeOptions", + "ResizeNearestNeighborOptions", + "LeakyReluOptions", + "SquaredDifferenceOptions", + "MirrorPadOptions", + "AbsOptions", + "SplitVOptions", + "UniqueOptions", + "ReverseV2Options", + "AddNOptions", + "GatherNdOptions", + "CosOptions", + "WhereOptions", + "RankOptions", + "ReverseSequenceOptions", + "MatrixDiagOptions", + "QuantizeOptions", + "MatrixSetDiagOptions", + "HardSwishOptions", + "IfOptions", + "WhileOptions", + "DepthToSpaceOptions", + "NonMaxSuppressionV4Options", + "NonMaxSuppressionV5Options", + "ScatterNdOptions", + "SelectV2Options", + "DensifyOptions", + "SegmentSumOptions", + "BatchMatMulOptions", + "CumsumOptions", + "CallOnceOptions", + "BroadcastToOptions", + "Rfft2dOptions", + "Conv3DOptions", + "HashtableOptions", + "HashtableFindOptions", + "HashtableImportOptions", + "HashtableSizeOptions", + "VarHandleOptions", + "ReadVariableOptions", + "AssignVariableOptions", + "RandomOptions", + "BucketizeOptions", + "GeluOptions", + "DynamicUpdateSliceOptions", + "UnsortedSegmentProdOptions", + "UnsortedSegmentMaxOptions", + "UnsortedSegmentMinOptions", + "UnsortedSegmentSumOptions", + "ATan2Options", + "SignOptions", + "BitcastOptions", + "BitwiseXorOptions", + "RightShiftOptions", + nullptr + }; + return names; +} + +inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { + if (::flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_RightShiftOptions)) return ""; + const size_t index = static_cast(e); + return EnumNamesBuiltinOptions()[index]; +} + +template struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NONE; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Conv2DOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DepthwiseConv2DOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ConcatEmbeddingsOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LSHProjectionOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Pool2DOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SVDFOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RNNOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FullyConnectedOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SoftmaxOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ConcatenationOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AddOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_L2NormOptions; +}; + +template<> struct 
BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LocalResponseNormalizationOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LSTMOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ResizeBilinearOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CallOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReshapeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SkipGramOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SpaceToDepthOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_EmbeddingLookupSparseOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MulOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PadOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GatherOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BatchToSpaceNDOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SpaceToBatchNDOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TransposeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReducerOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SubOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DivOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SqueezeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SequenceRNNOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_StridedSliceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ExpOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TopKV2Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SplitOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogSoftmaxOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CastOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = 
BuiltinOptions_LessOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NegOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PadV2Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GreaterOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GreaterEqualOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LessEqualOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SelectOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SliceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TransposeConvOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SparseToDenseOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TileOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ExpandDimsOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_EqualOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NotEqualOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ShapeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PowOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ArgMinOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FakeQuantOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PackOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalOrOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_OneHotOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalAndOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalNotOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnpackOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FloorDivOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SquareOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ZerosLikeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FillOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceLSTMOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions 
enum_value = BuiltinOptions_BidirectionalSequenceRNNOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnidirectionalSequenceLSTMOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FloorModOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RangeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ResizeNearestNeighborOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LeakyReluOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SquaredDifferenceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MirrorPadOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AbsOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReverseV2Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AddNOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GatherNdOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CosOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_WhereOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RankOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReverseSequenceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MatrixDiagOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_QuantizeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MatrixSetDiagOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HardSwishOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_IfOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_WhileOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DepthToSpaceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NonMaxSuppressionV4Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NonMaxSuppressionV5Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ScatterNdOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SelectV2Options; 
+}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DensifyOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SegmentSumOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BatchMatMulOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CumsumOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CallOnceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BroadcastToOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Rfft2dOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Conv3DOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableFindOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableImportOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableSizeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_VarHandleOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReadVariableOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AssignVariableOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RandomOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BucketizeOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GeluOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DynamicUpdateSliceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentProdOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentMaxOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentMinOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentSumOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ATan2Options; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SignOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions; +}; + +template struct 
BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NONE; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Conv2DOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DepthwiseConv2DOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ConcatEmbeddingsOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LSHProjectionOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Pool2DOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SVDFOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RNNOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FullyConnectedOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SoftmaxOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ConcatenationOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AddOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_L2NormOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LocalResponseNormalizationOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LSTMOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ResizeBilinearOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CallOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReshapeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SkipGramOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SpaceToDepthOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_EmbeddingLookupSparseOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MulOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PadOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GatherOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BatchToSpaceNDOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SpaceToBatchNDOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TransposeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions 
enum_value = BuiltinOptions_ReducerOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SubOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DivOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SqueezeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SequenceRNNOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_StridedSliceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ExpOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TopKV2Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SplitOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogSoftmaxOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CastOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LessOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NegOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PadV2Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GreaterOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GreaterEqualOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LessEqualOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SelectOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SliceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TransposeConvOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SparseToDenseOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_TileOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ExpandDimsOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_EqualOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NotEqualOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const 
BuiltinOptions enum_value = BuiltinOptions_ShapeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PowOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ArgMinOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FakeQuantOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_PackOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalOrOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_OneHotOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalAndOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalNotOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnpackOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FloorDivOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SquareOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ZerosLikeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FillOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceLSTMOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceRNNOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnidirectionalSequenceLSTMOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_FloorModOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RangeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ResizeNearestNeighborOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LeakyReluOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SquaredDifferenceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MirrorPadOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_AbsOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReverseV2Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = 
BuiltinOptions_AddNOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_GatherNdOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CosOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_WhereOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_RankOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ReverseSequenceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MatrixDiagOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_QuantizeOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_MatrixSetDiagOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HardSwishOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_IfOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_WhileOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DepthToSpaceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NonMaxSuppressionV4Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_NonMaxSuppressionV5Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_ScatterNdOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SelectV2Options; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DensifyOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_SegmentSumOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BatchMatMulOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CumsumOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_CallOnceOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_BroadcastToOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Rfft2dOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_Conv3DOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableFindOptions; +}; + +template<> struct BuiltinOptionsUnionTraits { + static const BuiltinOptions enum_value = BuiltinOptions_HashtableImportOptions; +}; + +template<> struct 
BuiltinOptionsUnionTraits<tflite::HashtableSizeOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_HashtableSizeOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::VarHandleOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_VarHandleOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::ReadVariableOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReadVariableOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::AssignVariableOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AssignVariableOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::RandomOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RandomOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::BucketizeOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BucketizeOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::GeluOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GeluOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::DynamicUpdateSliceOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_DynamicUpdateSliceOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::UnsortedSegmentProdOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentProdOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::UnsortedSegmentMaxOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentMaxOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::UnsortedSegmentMinOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentMinOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::UnsortedSegmentSumOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentSumOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::ATan2OptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ATan2Options;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::SignOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SignOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::BitcastOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BitcastOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::BitwiseXorOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BitwiseXorOptions;
+};
+
+template<> struct BuiltinOptionsUnionTraits<tflite::RightShiftOptionsT> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RightShiftOptions;
+};
+
+struct BuiltinOptionsUnion {
+  BuiltinOptions type;
+  void *value;
+
+  BuiltinOptionsUnion() : type(BuiltinOptions_NONE), value(nullptr) {}
+  BuiltinOptionsUnion(BuiltinOptionsUnion&& u) FLATBUFFERS_NOEXCEPT :
+    type(BuiltinOptions_NONE), value(nullptr)
+    { std::swap(type, u.type); std::swap(value, u.value); }
+  BuiltinOptionsUnion(const BuiltinOptionsUnion &);
+  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u)
+    { BuiltinOptionsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
+  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
+    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  ~BuiltinOptionsUnion() { Reset(); }
+
+  void Reset();
+
+  template <typename T>
+  void Set(T&& val) {
+    typedef typename std::remove_reference<T>::type RT;
+    Reset();
+    type = BuiltinOptionsUnionTraits<RT>::enum_value;
+    if (type != BuiltinOptions_NONE) {
+      value = new RT(std::forward<T>(val));
+    }
+  }
+
+  static void *UnPack(const void *obj, BuiltinOptions type, const ::flatbuffers::resolver_function_t *resolver);
+  ::flatbuffers::Offset<void> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  tflite::Conv2DOptionsT *AsConv2DOptions() {
+    return type == BuiltinOptions_Conv2DOptions ?
+ reinterpret_cast(value) : nullptr; + } + const tflite::Conv2DOptionsT *AsConv2DOptions() const { + return type == BuiltinOptions_Conv2DOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() { + return type == BuiltinOptions_DepthwiseConv2DOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() const { + return type == BuiltinOptions_DepthwiseConv2DOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() { + return type == BuiltinOptions_ConcatEmbeddingsOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() const { + return type == BuiltinOptions_ConcatEmbeddingsOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LSHProjectionOptionsT *AsLSHProjectionOptions() { + return type == BuiltinOptions_LSHProjectionOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LSHProjectionOptionsT *AsLSHProjectionOptions() const { + return type == BuiltinOptions_LSHProjectionOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::Pool2DOptionsT *AsPool2DOptions() { + return type == BuiltinOptions_Pool2DOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::Pool2DOptionsT *AsPool2DOptions() const { + return type == BuiltinOptions_Pool2DOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SVDFOptionsT *AsSVDFOptions() { + return type == BuiltinOptions_SVDFOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SVDFOptionsT *AsSVDFOptions() const { + return type == BuiltinOptions_SVDFOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RNNOptionsT *AsRNNOptions() { + return type == BuiltinOptions_RNNOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RNNOptionsT *AsRNNOptions() const { + return type == BuiltinOptions_RNNOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::FullyConnectedOptionsT *AsFullyConnectedOptions() { + return type == BuiltinOptions_FullyConnectedOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::FullyConnectedOptionsT *AsFullyConnectedOptions() const { + return type == BuiltinOptions_FullyConnectedOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SoftmaxOptionsT *AsSoftmaxOptions() { + return type == BuiltinOptions_SoftmaxOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SoftmaxOptionsT *AsSoftmaxOptions() const { + return type == BuiltinOptions_SoftmaxOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ConcatenationOptionsT *AsConcatenationOptions() { + return type == BuiltinOptions_ConcatenationOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ConcatenationOptionsT *AsConcatenationOptions() const { + return type == BuiltinOptions_ConcatenationOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::AddOptionsT *AsAddOptions() { + return type == BuiltinOptions_AddOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::AddOptionsT *AsAddOptions() const { + return type == BuiltinOptions_AddOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::L2NormOptionsT *AsL2NormOptions() { + return type == BuiltinOptions_L2NormOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::L2NormOptionsT *AsL2NormOptions() const { + return type == BuiltinOptions_L2NormOptions ? 
+ reinterpret_cast(value) : nullptr; + } + tflite::LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() { + return type == BuiltinOptions_LocalResponseNormalizationOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() const { + return type == BuiltinOptions_LocalResponseNormalizationOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LSTMOptionsT *AsLSTMOptions() { + return type == BuiltinOptions_LSTMOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LSTMOptionsT *AsLSTMOptions() const { + return type == BuiltinOptions_LSTMOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ResizeBilinearOptionsT *AsResizeBilinearOptions() { + return type == BuiltinOptions_ResizeBilinearOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ResizeBilinearOptionsT *AsResizeBilinearOptions() const { + return type == BuiltinOptions_ResizeBilinearOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::CallOptionsT *AsCallOptions() { + return type == BuiltinOptions_CallOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::CallOptionsT *AsCallOptions() const { + return type == BuiltinOptions_CallOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReshapeOptionsT *AsReshapeOptions() { + return type == BuiltinOptions_ReshapeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReshapeOptionsT *AsReshapeOptions() const { + return type == BuiltinOptions_ReshapeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SkipGramOptionsT *AsSkipGramOptions() { + return type == BuiltinOptions_SkipGramOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SkipGramOptionsT *AsSkipGramOptions() const { + return type == BuiltinOptions_SkipGramOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SpaceToDepthOptionsT *AsSpaceToDepthOptions() { + return type == BuiltinOptions_SpaceToDepthOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SpaceToDepthOptionsT *AsSpaceToDepthOptions() const { + return type == BuiltinOptions_SpaceToDepthOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() { + return type == BuiltinOptions_EmbeddingLookupSparseOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() const { + return type == BuiltinOptions_EmbeddingLookupSparseOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::MulOptionsT *AsMulOptions() { + return type == BuiltinOptions_MulOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::MulOptionsT *AsMulOptions() const { + return type == BuiltinOptions_MulOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::PadOptionsT *AsPadOptions() { + return type == BuiltinOptions_PadOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::PadOptionsT *AsPadOptions() const { + return type == BuiltinOptions_PadOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::GatherOptionsT *AsGatherOptions() { + return type == BuiltinOptions_GatherOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::GatherOptionsT *AsGatherOptions() const { + return type == BuiltinOptions_GatherOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() { + return type == BuiltinOptions_BatchToSpaceNDOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() const { + return type == BuiltinOptions_BatchToSpaceNDOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SpaceToBatchNDOptionsT *AsSpaceToBatchNDOptions() { + return type == BuiltinOptions_SpaceToBatchNDOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SpaceToBatchNDOptionsT *AsSpaceToBatchNDOptions() const { + return type == BuiltinOptions_SpaceToBatchNDOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::TransposeOptionsT *AsTransposeOptions() { + return type == BuiltinOptions_TransposeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::TransposeOptionsT *AsTransposeOptions() const { + return type == BuiltinOptions_TransposeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReducerOptionsT *AsReducerOptions() { + return type == BuiltinOptions_ReducerOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReducerOptionsT *AsReducerOptions() const { + return type == BuiltinOptions_ReducerOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SubOptionsT *AsSubOptions() { + return type == BuiltinOptions_SubOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SubOptionsT *AsSubOptions() const { + return type == BuiltinOptions_SubOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::DivOptionsT *AsDivOptions() { + return type == BuiltinOptions_DivOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DivOptionsT *AsDivOptions() const { + return type == BuiltinOptions_DivOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SqueezeOptionsT *AsSqueezeOptions() { + return type == BuiltinOptions_SqueezeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SqueezeOptionsT *AsSqueezeOptions() const { + return type == BuiltinOptions_SqueezeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SequenceRNNOptionsT *AsSequenceRNNOptions() { + return type == BuiltinOptions_SequenceRNNOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SequenceRNNOptionsT *AsSequenceRNNOptions() const { + return type == BuiltinOptions_SequenceRNNOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StridedSliceOptionsT *AsStridedSliceOptions() { + return type == BuiltinOptions_StridedSliceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StridedSliceOptionsT *AsStridedSliceOptions() const { + return type == BuiltinOptions_StridedSliceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ExpOptionsT *AsExpOptions() { + return type == BuiltinOptions_ExpOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ExpOptionsT *AsExpOptions() const { + return type == BuiltinOptions_ExpOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::TopKV2OptionsT *AsTopKV2Options() { + return type == BuiltinOptions_TopKV2Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::TopKV2OptionsT *AsTopKV2Options() const { + return type == BuiltinOptions_TopKV2Options ? + reinterpret_cast(value) : nullptr; + } + tflite::SplitOptionsT *AsSplitOptions() { + return type == BuiltinOptions_SplitOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SplitOptionsT *AsSplitOptions() const { + return type == BuiltinOptions_SplitOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LogSoftmaxOptionsT *AsLogSoftmaxOptions() { + return type == BuiltinOptions_LogSoftmaxOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::LogSoftmaxOptionsT *AsLogSoftmaxOptions() const { + return type == BuiltinOptions_LogSoftmaxOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::CastOptionsT *AsCastOptions() { + return type == BuiltinOptions_CastOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::CastOptionsT *AsCastOptions() const { + return type == BuiltinOptions_CastOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::DequantizeOptionsT *AsDequantizeOptions() { + return type == BuiltinOptions_DequantizeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DequantizeOptionsT *AsDequantizeOptions() const { + return type == BuiltinOptions_DequantizeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::MaximumMinimumOptionsT *AsMaximumMinimumOptions() { + return type == BuiltinOptions_MaximumMinimumOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::MaximumMinimumOptionsT *AsMaximumMinimumOptions() const { + return type == BuiltinOptions_MaximumMinimumOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ArgMaxOptionsT *AsArgMaxOptions() { + return type == BuiltinOptions_ArgMaxOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ArgMaxOptionsT *AsArgMaxOptions() const { + return type == BuiltinOptions_ArgMaxOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LessOptionsT *AsLessOptions() { + return type == BuiltinOptions_LessOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LessOptionsT *AsLessOptions() const { + return type == BuiltinOptions_LessOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::NegOptionsT *AsNegOptions() { + return type == BuiltinOptions_NegOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::NegOptionsT *AsNegOptions() const { + return type == BuiltinOptions_NegOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::PadV2OptionsT *AsPadV2Options() { + return type == BuiltinOptions_PadV2Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::PadV2OptionsT *AsPadV2Options() const { + return type == BuiltinOptions_PadV2Options ? + reinterpret_cast(value) : nullptr; + } + tflite::GreaterOptionsT *AsGreaterOptions() { + return type == BuiltinOptions_GreaterOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::GreaterOptionsT *AsGreaterOptions() const { + return type == BuiltinOptions_GreaterOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::GreaterEqualOptionsT *AsGreaterEqualOptions() { + return type == BuiltinOptions_GreaterEqualOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::GreaterEqualOptionsT *AsGreaterEqualOptions() const { + return type == BuiltinOptions_GreaterEqualOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LessEqualOptionsT *AsLessEqualOptions() { + return type == BuiltinOptions_LessEqualOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LessEqualOptionsT *AsLessEqualOptions() const { + return type == BuiltinOptions_LessEqualOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SelectOptionsT *AsSelectOptions() { + return type == BuiltinOptions_SelectOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SelectOptionsT *AsSelectOptions() const { + return type == BuiltinOptions_SelectOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SliceOptionsT *AsSliceOptions() { + return type == BuiltinOptions_SliceOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::SliceOptionsT *AsSliceOptions() const { + return type == BuiltinOptions_SliceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::TransposeConvOptionsT *AsTransposeConvOptions() { + return type == BuiltinOptions_TransposeConvOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::TransposeConvOptionsT *AsTransposeConvOptions() const { + return type == BuiltinOptions_TransposeConvOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SparseToDenseOptionsT *AsSparseToDenseOptions() { + return type == BuiltinOptions_SparseToDenseOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SparseToDenseOptionsT *AsSparseToDenseOptions() const { + return type == BuiltinOptions_SparseToDenseOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::TileOptionsT *AsTileOptions() { + return type == BuiltinOptions_TileOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::TileOptionsT *AsTileOptions() const { + return type == BuiltinOptions_TileOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ExpandDimsOptionsT *AsExpandDimsOptions() { + return type == BuiltinOptions_ExpandDimsOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ExpandDimsOptionsT *AsExpandDimsOptions() const { + return type == BuiltinOptions_ExpandDimsOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::EqualOptionsT *AsEqualOptions() { + return type == BuiltinOptions_EqualOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::EqualOptionsT *AsEqualOptions() const { + return type == BuiltinOptions_EqualOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::NotEqualOptionsT *AsNotEqualOptions() { + return type == BuiltinOptions_NotEqualOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::NotEqualOptionsT *AsNotEqualOptions() const { + return type == BuiltinOptions_NotEqualOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ShapeOptionsT *AsShapeOptions() { + return type == BuiltinOptions_ShapeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ShapeOptionsT *AsShapeOptions() const { + return type == BuiltinOptions_ShapeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::PowOptionsT *AsPowOptions() { + return type == BuiltinOptions_PowOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::PowOptionsT *AsPowOptions() const { + return type == BuiltinOptions_PowOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ArgMinOptionsT *AsArgMinOptions() { + return type == BuiltinOptions_ArgMinOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ArgMinOptionsT *AsArgMinOptions() const { + return type == BuiltinOptions_ArgMinOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::FakeQuantOptionsT *AsFakeQuantOptions() { + return type == BuiltinOptions_FakeQuantOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::FakeQuantOptionsT *AsFakeQuantOptions() const { + return type == BuiltinOptions_FakeQuantOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::PackOptionsT *AsPackOptions() { + return type == BuiltinOptions_PackOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::PackOptionsT *AsPackOptions() const { + return type == BuiltinOptions_PackOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LogicalOrOptionsT *AsLogicalOrOptions() { + return type == BuiltinOptions_LogicalOrOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::LogicalOrOptionsT *AsLogicalOrOptions() const { + return type == BuiltinOptions_LogicalOrOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::OneHotOptionsT *AsOneHotOptions() { + return type == BuiltinOptions_OneHotOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::OneHotOptionsT *AsOneHotOptions() const { + return type == BuiltinOptions_OneHotOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LogicalAndOptionsT *AsLogicalAndOptions() { + return type == BuiltinOptions_LogicalAndOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LogicalAndOptionsT *AsLogicalAndOptions() const { + return type == BuiltinOptions_LogicalAndOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LogicalNotOptionsT *AsLogicalNotOptions() { + return type == BuiltinOptions_LogicalNotOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LogicalNotOptionsT *AsLogicalNotOptions() const { + return type == BuiltinOptions_LogicalNotOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnpackOptionsT *AsUnpackOptions() { + return type == BuiltinOptions_UnpackOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnpackOptionsT *AsUnpackOptions() const { + return type == BuiltinOptions_UnpackOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::FloorDivOptionsT *AsFloorDivOptions() { + return type == BuiltinOptions_FloorDivOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::FloorDivOptionsT *AsFloorDivOptions() const { + return type == BuiltinOptions_FloorDivOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SquareOptionsT *AsSquareOptions() { + return type == BuiltinOptions_SquareOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SquareOptionsT *AsSquareOptions() const { + return type == BuiltinOptions_SquareOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ZerosLikeOptionsT *AsZerosLikeOptions() { + return type == BuiltinOptions_ZerosLikeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ZerosLikeOptionsT *AsZerosLikeOptions() const { + return type == BuiltinOptions_ZerosLikeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::FillOptionsT *AsFillOptions() { + return type == BuiltinOptions_FillOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::FillOptionsT *AsFillOptions() const { + return type == BuiltinOptions_FillOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BidirectionalSequenceLSTMOptionsT *AsBidirectionalSequenceLSTMOptions() { + return type == BuiltinOptions_BidirectionalSequenceLSTMOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BidirectionalSequenceLSTMOptionsT *AsBidirectionalSequenceLSTMOptions() const { + return type == BuiltinOptions_BidirectionalSequenceLSTMOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BidirectionalSequenceRNNOptionsT *AsBidirectionalSequenceRNNOptions() { + return type == BuiltinOptions_BidirectionalSequenceRNNOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BidirectionalSequenceRNNOptionsT *AsBidirectionalSequenceRNNOptions() const { + return type == BuiltinOptions_BidirectionalSequenceRNNOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() { + return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() const { + return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::FloorModOptionsT *AsFloorModOptions() { + return type == BuiltinOptions_FloorModOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::FloorModOptionsT *AsFloorModOptions() const { + return type == BuiltinOptions_FloorModOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RangeOptionsT *AsRangeOptions() { + return type == BuiltinOptions_RangeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RangeOptionsT *AsRangeOptions() const { + return type == BuiltinOptions_RangeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ResizeNearestNeighborOptionsT *AsResizeNearestNeighborOptions() { + return type == BuiltinOptions_ResizeNearestNeighborOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ResizeNearestNeighborOptionsT *AsResizeNearestNeighborOptions() const { + return type == BuiltinOptions_ResizeNearestNeighborOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::LeakyReluOptionsT *AsLeakyReluOptions() { + return type == BuiltinOptions_LeakyReluOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::LeakyReluOptionsT *AsLeakyReluOptions() const { + return type == BuiltinOptions_LeakyReluOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() { + return type == BuiltinOptions_SquaredDifferenceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() const { + return type == BuiltinOptions_SquaredDifferenceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::MirrorPadOptionsT *AsMirrorPadOptions() { + return type == BuiltinOptions_MirrorPadOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::MirrorPadOptionsT *AsMirrorPadOptions() const { + return type == BuiltinOptions_MirrorPadOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::AbsOptionsT *AsAbsOptions() { + return type == BuiltinOptions_AbsOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::AbsOptionsT *AsAbsOptions() const { + return type == BuiltinOptions_AbsOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SplitVOptionsT *AsSplitVOptions() { + return type == BuiltinOptions_SplitVOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SplitVOptionsT *AsSplitVOptions() const { + return type == BuiltinOptions_SplitVOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UniqueOptionsT *AsUniqueOptions() { + return type == BuiltinOptions_UniqueOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UniqueOptionsT *AsUniqueOptions() const { + return type == BuiltinOptions_UniqueOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReverseV2OptionsT *AsReverseV2Options() { + return type == BuiltinOptions_ReverseV2Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReverseV2OptionsT *AsReverseV2Options() const { + return type == BuiltinOptions_ReverseV2Options ? + reinterpret_cast(value) : nullptr; + } + tflite::AddNOptionsT *AsAddNOptions() { + return type == BuiltinOptions_AddNOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::AddNOptionsT *AsAddNOptions() const { + return type == BuiltinOptions_AddNOptions ? 
+ reinterpret_cast(value) : nullptr; + } + tflite::GatherNdOptionsT *AsGatherNdOptions() { + return type == BuiltinOptions_GatherNdOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::GatherNdOptionsT *AsGatherNdOptions() const { + return type == BuiltinOptions_GatherNdOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::CosOptionsT *AsCosOptions() { + return type == BuiltinOptions_CosOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::CosOptionsT *AsCosOptions() const { + return type == BuiltinOptions_CosOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::WhereOptionsT *AsWhereOptions() { + return type == BuiltinOptions_WhereOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::WhereOptionsT *AsWhereOptions() const { + return type == BuiltinOptions_WhereOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RankOptionsT *AsRankOptions() { + return type == BuiltinOptions_RankOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RankOptionsT *AsRankOptions() const { + return type == BuiltinOptions_RankOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReverseSequenceOptionsT *AsReverseSequenceOptions() { + return type == BuiltinOptions_ReverseSequenceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReverseSequenceOptionsT *AsReverseSequenceOptions() const { + return type == BuiltinOptions_ReverseSequenceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::MatrixDiagOptionsT *AsMatrixDiagOptions() { + return type == BuiltinOptions_MatrixDiagOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::MatrixDiagOptionsT *AsMatrixDiagOptions() const { + return type == BuiltinOptions_MatrixDiagOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::QuantizeOptionsT *AsQuantizeOptions() { + return type == BuiltinOptions_QuantizeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::QuantizeOptionsT *AsQuantizeOptions() const { + return type == BuiltinOptions_QuantizeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::MatrixSetDiagOptionsT *AsMatrixSetDiagOptions() { + return type == BuiltinOptions_MatrixSetDiagOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::MatrixSetDiagOptionsT *AsMatrixSetDiagOptions() const { + return type == BuiltinOptions_MatrixSetDiagOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::HardSwishOptionsT *AsHardSwishOptions() { + return type == BuiltinOptions_HardSwishOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::HardSwishOptionsT *AsHardSwishOptions() const { + return type == BuiltinOptions_HardSwishOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::IfOptionsT *AsIfOptions() { + return type == BuiltinOptions_IfOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::IfOptionsT *AsIfOptions() const { + return type == BuiltinOptions_IfOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::WhileOptionsT *AsWhileOptions() { + return type == BuiltinOptions_WhileOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::WhileOptionsT *AsWhileOptions() const { + return type == BuiltinOptions_WhileOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::DepthToSpaceOptionsT *AsDepthToSpaceOptions() { + return type == BuiltinOptions_DepthToSpaceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DepthToSpaceOptionsT *AsDepthToSpaceOptions() const { + return type == BuiltinOptions_DepthToSpaceOptions ? 
+ reinterpret_cast(value) : nullptr; + } + tflite::NonMaxSuppressionV4OptionsT *AsNonMaxSuppressionV4Options() { + return type == BuiltinOptions_NonMaxSuppressionV4Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::NonMaxSuppressionV4OptionsT *AsNonMaxSuppressionV4Options() const { + return type == BuiltinOptions_NonMaxSuppressionV4Options ? + reinterpret_cast(value) : nullptr; + } + tflite::NonMaxSuppressionV5OptionsT *AsNonMaxSuppressionV5Options() { + return type == BuiltinOptions_NonMaxSuppressionV5Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::NonMaxSuppressionV5OptionsT *AsNonMaxSuppressionV5Options() const { + return type == BuiltinOptions_NonMaxSuppressionV5Options ? + reinterpret_cast(value) : nullptr; + } + tflite::ScatterNdOptionsT *AsScatterNdOptions() { + return type == BuiltinOptions_ScatterNdOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ScatterNdOptionsT *AsScatterNdOptions() const { + return type == BuiltinOptions_ScatterNdOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SelectV2OptionsT *AsSelectV2Options() { + return type == BuiltinOptions_SelectV2Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::SelectV2OptionsT *AsSelectV2Options() const { + return type == BuiltinOptions_SelectV2Options ? + reinterpret_cast(value) : nullptr; + } + tflite::DensifyOptionsT *AsDensifyOptions() { + return type == BuiltinOptions_DensifyOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DensifyOptionsT *AsDensifyOptions() const { + return type == BuiltinOptions_DensifyOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::SegmentSumOptionsT *AsSegmentSumOptions() { + return type == BuiltinOptions_SegmentSumOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SegmentSumOptionsT *AsSegmentSumOptions() const { + return type == BuiltinOptions_SegmentSumOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BatchMatMulOptionsT *AsBatchMatMulOptions() { + return type == BuiltinOptions_BatchMatMulOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BatchMatMulOptionsT *AsBatchMatMulOptions() const { + return type == BuiltinOptions_BatchMatMulOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::CumsumOptionsT *AsCumsumOptions() { + return type == BuiltinOptions_CumsumOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::CumsumOptionsT *AsCumsumOptions() const { + return type == BuiltinOptions_CumsumOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::CallOnceOptionsT *AsCallOnceOptions() { + return type == BuiltinOptions_CallOnceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::CallOnceOptionsT *AsCallOnceOptions() const { + return type == BuiltinOptions_CallOnceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BroadcastToOptionsT *AsBroadcastToOptions() { + return type == BuiltinOptions_BroadcastToOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BroadcastToOptionsT *AsBroadcastToOptions() const { + return type == BuiltinOptions_BroadcastToOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::Rfft2dOptionsT *AsRfft2dOptions() { + return type == BuiltinOptions_Rfft2dOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::Rfft2dOptionsT *AsRfft2dOptions() const { + return type == BuiltinOptions_Rfft2dOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::Conv3DOptionsT *AsConv3DOptions() { + return type == BuiltinOptions_Conv3DOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::Conv3DOptionsT *AsConv3DOptions() const { + return type == BuiltinOptions_Conv3DOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::HashtableOptionsT *AsHashtableOptions() { + return type == BuiltinOptions_HashtableOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::HashtableOptionsT *AsHashtableOptions() const { + return type == BuiltinOptions_HashtableOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::HashtableFindOptionsT *AsHashtableFindOptions() { + return type == BuiltinOptions_HashtableFindOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::HashtableFindOptionsT *AsHashtableFindOptions() const { + return type == BuiltinOptions_HashtableFindOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::HashtableImportOptionsT *AsHashtableImportOptions() { + return type == BuiltinOptions_HashtableImportOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::HashtableImportOptionsT *AsHashtableImportOptions() const { + return type == BuiltinOptions_HashtableImportOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::HashtableSizeOptionsT *AsHashtableSizeOptions() { + return type == BuiltinOptions_HashtableSizeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::HashtableSizeOptionsT *AsHashtableSizeOptions() const { + return type == BuiltinOptions_HashtableSizeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::VarHandleOptionsT *AsVarHandleOptions() { + return type == BuiltinOptions_VarHandleOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::VarHandleOptionsT *AsVarHandleOptions() const { + return type == BuiltinOptions_VarHandleOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReadVariableOptionsT *AsReadVariableOptions() { + return type == BuiltinOptions_ReadVariableOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReadVariableOptionsT *AsReadVariableOptions() const { + return type == BuiltinOptions_ReadVariableOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::AssignVariableOptionsT *AsAssignVariableOptions() { + return type == BuiltinOptions_AssignVariableOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::AssignVariableOptionsT *AsAssignVariableOptions() const { + return type == BuiltinOptions_AssignVariableOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RandomOptionsT *AsRandomOptions() { + return type == BuiltinOptions_RandomOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RandomOptionsT *AsRandomOptions() const { + return type == BuiltinOptions_RandomOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BucketizeOptionsT *AsBucketizeOptions() { + return type == BuiltinOptions_BucketizeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BucketizeOptionsT *AsBucketizeOptions() const { + return type == BuiltinOptions_BucketizeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::GeluOptionsT *AsGeluOptions() { + return type == BuiltinOptions_GeluOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::GeluOptionsT *AsGeluOptions() const { + return type == BuiltinOptions_GeluOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? 
+ reinterpret_cast(value) : nullptr; + } + const tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() const { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnsortedSegmentProdOptionsT *AsUnsortedSegmentProdOptions() { + return type == BuiltinOptions_UnsortedSegmentProdOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnsortedSegmentProdOptionsT *AsUnsortedSegmentProdOptions() const { + return type == BuiltinOptions_UnsortedSegmentProdOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnsortedSegmentMaxOptionsT *AsUnsortedSegmentMaxOptions() { + return type == BuiltinOptions_UnsortedSegmentMaxOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnsortedSegmentMaxOptionsT *AsUnsortedSegmentMaxOptions() const { + return type == BuiltinOptions_UnsortedSegmentMaxOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnsortedSegmentMinOptionsT *AsUnsortedSegmentMinOptions() { + return type == BuiltinOptions_UnsortedSegmentMinOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnsortedSegmentMinOptionsT *AsUnsortedSegmentMinOptions() const { + return type == BuiltinOptions_UnsortedSegmentMinOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnsortedSegmentSumOptionsT *AsUnsortedSegmentSumOptions() { + return type == BuiltinOptions_UnsortedSegmentSumOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnsortedSegmentSumOptionsT *AsUnsortedSegmentSumOptions() const { + return type == BuiltinOptions_UnsortedSegmentSumOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ATan2OptionsT *AsATan2Options() { + return type == BuiltinOptions_ATan2Options ? + reinterpret_cast(value) : nullptr; + } + const tflite::ATan2OptionsT *AsATan2Options() const { + return type == BuiltinOptions_ATan2Options ? + reinterpret_cast(value) : nullptr; + } + tflite::SignOptionsT *AsSignOptions() { + return type == BuiltinOptions_SignOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::SignOptionsT *AsSignOptions() const { + return type == BuiltinOptions_SignOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BitcastOptionsT *AsBitcastOptions() { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitcastOptionsT *AsBitcastOptions() const { + return type == BuiltinOptions_BitcastOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::BitwiseXorOptionsT *AsBitwiseXorOptions() const { + return type == BuiltinOptions_BitwiseXorOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::RightShiftOptionsT *AsRightShiftOptions() { + return type == BuiltinOptions_RightShiftOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::RightShiftOptionsT *AsRightShiftOptions() const { + return type == BuiltinOptions_RightShiftOptions ? 
+ reinterpret_cast(value) : nullptr; + } +}; + +bool VerifyBuiltinOptions(::flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); +bool VerifyBuiltinOptionsVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum BuiltinOptions2 : uint8_t { + BuiltinOptions2_NONE = 0, + BuiltinOptions2_StablehloConcatenateOptions = 1, + BuiltinOptions2_StablehloBroadcastInDimOptions = 2, + BuiltinOptions2_StablehloSliceOptions = 3, + BuiltinOptions2_StablehloConvolutionOptions = 4, + BuiltinOptions2_StablehloCustomCallOptions = 5, + BuiltinOptions2_StablehloReduceOptions = 6, + BuiltinOptions2_StablehloScatterOptions = 7, + BuiltinOptions2_StablehloCompareOptions = 8, + BuiltinOptions2_StablehloDynamicSliceOptions = 9, + BuiltinOptions2_StablehloPadOptions = 10, + BuiltinOptions2_StablehloIotaOptions = 11, + BuiltinOptions2_StablehloDotGeneralOptions = 12, + BuiltinOptions2_StablehloReduceWindowOptions = 13, + BuiltinOptions2_StablehloSortOptions = 14, + BuiltinOptions2_StablehloWhileOptions = 15, + BuiltinOptions2_StablehloGatherOptions = 16, + BuiltinOptions2_StablehloTransposeOptions = 17, + BuiltinOptions2_DilateOptions = 18, + BuiltinOptions2_StablehloRngBitGeneratorOptions = 19, + BuiltinOptions2_ReduceWindowOptions = 20, + BuiltinOptions2_StableHLOCompositeOptions = 21, + BuiltinOptions2_StablehloShiftLeftOptions = 22, + BuiltinOptions2_StablehloCaseOptions = 23, + BuiltinOptions2_MIN = BuiltinOptions2_NONE, + BuiltinOptions2_MAX = BuiltinOptions2_StablehloCaseOptions +}; + +inline const BuiltinOptions2 (&EnumValuesBuiltinOptions2())[24] { + static const BuiltinOptions2 values[] = { + BuiltinOptions2_NONE, + BuiltinOptions2_StablehloConcatenateOptions, + BuiltinOptions2_StablehloBroadcastInDimOptions, + BuiltinOptions2_StablehloSliceOptions, + BuiltinOptions2_StablehloConvolutionOptions, + BuiltinOptions2_StablehloCustomCallOptions, + BuiltinOptions2_StablehloReduceOptions, + BuiltinOptions2_StablehloScatterOptions, + BuiltinOptions2_StablehloCompareOptions, + BuiltinOptions2_StablehloDynamicSliceOptions, + BuiltinOptions2_StablehloPadOptions, + BuiltinOptions2_StablehloIotaOptions, + BuiltinOptions2_StablehloDotGeneralOptions, + BuiltinOptions2_StablehloReduceWindowOptions, + BuiltinOptions2_StablehloSortOptions, + BuiltinOptions2_StablehloWhileOptions, + BuiltinOptions2_StablehloGatherOptions, + BuiltinOptions2_StablehloTransposeOptions, + BuiltinOptions2_DilateOptions, + BuiltinOptions2_StablehloRngBitGeneratorOptions, + BuiltinOptions2_ReduceWindowOptions, + BuiltinOptions2_StableHLOCompositeOptions, + BuiltinOptions2_StablehloShiftLeftOptions, + BuiltinOptions2_StablehloCaseOptions + }; + return values; +} + +inline const char * const *EnumNamesBuiltinOptions2() { + static const char * const names[25] = { + "NONE", + "StablehloConcatenateOptions", + "StablehloBroadcastInDimOptions", + "StablehloSliceOptions", + "StablehloConvolutionOptions", + "StablehloCustomCallOptions", + "StablehloReduceOptions", + "StablehloScatterOptions", + "StablehloCompareOptions", + "StablehloDynamicSliceOptions", + "StablehloPadOptions", + "StablehloIotaOptions", + "StablehloDotGeneralOptions", + "StablehloReduceWindowOptions", + "StablehloSortOptions", + "StablehloWhileOptions", + "StablehloGatherOptions", + "StablehloTransposeOptions", + "DilateOptions", + "StablehloRngBitGeneratorOptions", + "ReduceWindowOptions", + "StableHLOCompositeOptions", + "StablehloShiftLeftOptions", + "StablehloCaseOptions", + 
nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameBuiltinOptions2(BuiltinOptions2 e) {
+  if (::flatbuffers::IsOutRange(e, BuiltinOptions2_NONE, BuiltinOptions2_StablehloCaseOptions)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesBuiltinOptions2()[index];
+}
+
+template<typename T> struct BuiltinOptions2Traits {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_NONE;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloConcatenateOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloConcatenateOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloBroadcastInDimOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloBroadcastInDimOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloSliceOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloSliceOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloConvolutionOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloConvolutionOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloCustomCallOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCustomCallOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloReduceOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloReduceOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloScatterOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloScatterOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloCompareOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCompareOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloDynamicSliceOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloDynamicSliceOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloPadOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloPadOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloIotaOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloIotaOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloDotGeneralOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloDotGeneralOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloReduceWindowOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloReduceWindowOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloSortOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloSortOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloWhileOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloWhileOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloGatherOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloGatherOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloTransposeOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloTransposeOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::DilateOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_DilateOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloRngBitGeneratorOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloRngBitGeneratorOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::ReduceWindowOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_ReduceWindowOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StableHLOCompositeOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StableHLOCompositeOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloShiftLeftOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloShiftLeftOptions;
+};
+
+template<> struct BuiltinOptions2Traits<tflite::StablehloCaseOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCaseOptions;
+};
+
+template<typename T> struct BuiltinOptions2UnionTraits {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_NONE;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloConcatenateOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloConcatenateOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloBroadcastInDimOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloBroadcastInDimOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloSliceOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloSliceOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloConvolutionOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloConvolutionOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloCustomCallOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCustomCallOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloReduceOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloReduceOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloScatterOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloScatterOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloCompareOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCompareOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloDynamicSliceOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloDynamicSliceOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloPadOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloPadOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloIotaOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloIotaOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloDotGeneralOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloDotGeneralOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloReduceWindowOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloReduceWindowOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloSortOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloSortOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloWhileOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloWhileOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloGatherOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloGatherOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloTransposeOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloTransposeOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::DilateOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_DilateOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloRngBitGeneratorOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloRngBitGeneratorOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::ReduceWindowOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_ReduceWindowOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StableHLOCompositeOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StableHLOCompositeOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloShiftLeftOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloShiftLeftOptions;
+};
+
+template<> struct BuiltinOptions2UnionTraits<tflite::StablehloCaseOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloCaseOptions;
+};
+
+struct BuiltinOptions2Union {
+  BuiltinOptions2 type;
+  void *value;
+
+  BuiltinOptions2Union() : type(BuiltinOptions2_NONE), value(nullptr) {}
+  BuiltinOptions2Union(BuiltinOptions2Union&& u) FLATBUFFERS_NOEXCEPT :
+    type(BuiltinOptions2_NONE), value(nullptr)
+    {
std::swap(type, u.type); std::swap(value, u.value); }
+  BuiltinOptions2Union(const BuiltinOptions2Union &);
+  BuiltinOptions2Union &operator=(const BuiltinOptions2Union &u)
+    { BuiltinOptions2Union t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
+  BuiltinOptions2Union &operator=(BuiltinOptions2Union &&u) FLATBUFFERS_NOEXCEPT
+    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  ~BuiltinOptions2Union() { Reset(); }
+
+  void Reset();
+
+  template <typename T>
+  void Set(T&& val) {
+    typedef typename std::remove_reference<T>::type RT;
+    Reset();
+    type = BuiltinOptions2UnionTraits<RT>::enum_value;
+    if (type != BuiltinOptions2_NONE) {
+      value = new RT(std::forward<T>(val));
+    }
+  }
+
+  static void *UnPack(const void *obj, BuiltinOptions2 type, const ::flatbuffers::resolver_function_t *resolver);
+  ::flatbuffers::Offset<void> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  tflite::StablehloConcatenateOptionsT *AsStablehloConcatenateOptions() {
+    return type == BuiltinOptions2_StablehloConcatenateOptions ?
+      reinterpret_cast<tflite::StablehloConcatenateOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloConcatenateOptionsT *AsStablehloConcatenateOptions() const {
+    return type == BuiltinOptions2_StablehloConcatenateOptions ?
+      reinterpret_cast<const tflite::StablehloConcatenateOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloBroadcastInDimOptionsT *AsStablehloBroadcastInDimOptions() {
+    return type == BuiltinOptions2_StablehloBroadcastInDimOptions ?
+      reinterpret_cast<tflite::StablehloBroadcastInDimOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloBroadcastInDimOptionsT *AsStablehloBroadcastInDimOptions() const {
+    return type == BuiltinOptions2_StablehloBroadcastInDimOptions ?
+      reinterpret_cast<const tflite::StablehloBroadcastInDimOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloSliceOptionsT *AsStablehloSliceOptions() {
+    return type == BuiltinOptions2_StablehloSliceOptions ?
+      reinterpret_cast<tflite::StablehloSliceOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloSliceOptionsT *AsStablehloSliceOptions() const {
+    return type == BuiltinOptions2_StablehloSliceOptions ?
+      reinterpret_cast<const tflite::StablehloSliceOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloConvolutionOptionsT *AsStablehloConvolutionOptions() {
+    return type == BuiltinOptions2_StablehloConvolutionOptions ?
+      reinterpret_cast<tflite::StablehloConvolutionOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloConvolutionOptionsT *AsStablehloConvolutionOptions() const {
+    return type == BuiltinOptions2_StablehloConvolutionOptions ?
+      reinterpret_cast<const tflite::StablehloConvolutionOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloCustomCallOptionsT *AsStablehloCustomCallOptions() {
+    return type == BuiltinOptions2_StablehloCustomCallOptions ?
+      reinterpret_cast<tflite::StablehloCustomCallOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloCustomCallOptionsT *AsStablehloCustomCallOptions() const {
+    return type == BuiltinOptions2_StablehloCustomCallOptions ?
+      reinterpret_cast<const tflite::StablehloCustomCallOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloReduceOptionsT *AsStablehloReduceOptions() {
+    return type == BuiltinOptions2_StablehloReduceOptions ?
+      reinterpret_cast<tflite::StablehloReduceOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloReduceOptionsT *AsStablehloReduceOptions() const {
+    return type == BuiltinOptions2_StablehloReduceOptions ?
+      reinterpret_cast<const tflite::StablehloReduceOptionsT *>(value) : nullptr;
+  }
+  tflite::StablehloScatterOptionsT *AsStablehloScatterOptions() {
+    return type == BuiltinOptions2_StablehloScatterOptions ?
+      reinterpret_cast<tflite::StablehloScatterOptionsT *>(value) : nullptr;
+  }
+  const tflite::StablehloScatterOptionsT *AsStablehloScatterOptions() const {
+    return type == BuiltinOptions2_StablehloScatterOptions ?
+ reinterpret_cast(value) : nullptr; + } + tflite::StablehloCompareOptionsT *AsStablehloCompareOptions() { + return type == BuiltinOptions2_StablehloCompareOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloCompareOptionsT *AsStablehloCompareOptions() const { + return type == BuiltinOptions2_StablehloCompareOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloDynamicSliceOptionsT *AsStablehloDynamicSliceOptions() { + return type == BuiltinOptions2_StablehloDynamicSliceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloDynamicSliceOptionsT *AsStablehloDynamicSliceOptions() const { + return type == BuiltinOptions2_StablehloDynamicSliceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloPadOptionsT *AsStablehloPadOptions() { + return type == BuiltinOptions2_StablehloPadOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloPadOptionsT *AsStablehloPadOptions() const { + return type == BuiltinOptions2_StablehloPadOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloIotaOptionsT *AsStablehloIotaOptions() { + return type == BuiltinOptions2_StablehloIotaOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloIotaOptionsT *AsStablehloIotaOptions() const { + return type == BuiltinOptions2_StablehloIotaOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloDotGeneralOptionsT *AsStablehloDotGeneralOptions() { + return type == BuiltinOptions2_StablehloDotGeneralOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloDotGeneralOptionsT *AsStablehloDotGeneralOptions() const { + return type == BuiltinOptions2_StablehloDotGeneralOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloReduceWindowOptionsT *AsStablehloReduceWindowOptions() { + return type == BuiltinOptions2_StablehloReduceWindowOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloReduceWindowOptionsT *AsStablehloReduceWindowOptions() const { + return type == BuiltinOptions2_StablehloReduceWindowOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloSortOptionsT *AsStablehloSortOptions() { + return type == BuiltinOptions2_StablehloSortOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloSortOptionsT *AsStablehloSortOptions() const { + return type == BuiltinOptions2_StablehloSortOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloWhileOptionsT *AsStablehloWhileOptions() { + return type == BuiltinOptions2_StablehloWhileOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloWhileOptionsT *AsStablehloWhileOptions() const { + return type == BuiltinOptions2_StablehloWhileOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloGatherOptionsT *AsStablehloGatherOptions() { + return type == BuiltinOptions2_StablehloGatherOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloGatherOptionsT *AsStablehloGatherOptions() const { + return type == BuiltinOptions2_StablehloGatherOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloTransposeOptionsT *AsStablehloTransposeOptions() { + return type == BuiltinOptions2_StablehloTransposeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloTransposeOptionsT *AsStablehloTransposeOptions() const { + return type == BuiltinOptions2_StablehloTransposeOptions ? 
+ reinterpret_cast(value) : nullptr; + } + tflite::DilateOptionsT *AsDilateOptions() { + return type == BuiltinOptions2_DilateOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DilateOptionsT *AsDilateOptions() const { + return type == BuiltinOptions2_DilateOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloRngBitGeneratorOptionsT *AsStablehloRngBitGeneratorOptions() { + return type == BuiltinOptions2_StablehloRngBitGeneratorOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloRngBitGeneratorOptionsT *AsStablehloRngBitGeneratorOptions() const { + return type == BuiltinOptions2_StablehloRngBitGeneratorOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::ReduceWindowOptionsT *AsReduceWindowOptions() { + return type == BuiltinOptions2_ReduceWindowOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::ReduceWindowOptionsT *AsReduceWindowOptions() const { + return type == BuiltinOptions2_ReduceWindowOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StableHLOCompositeOptionsT *AsStableHLOCompositeOptions() { + return type == BuiltinOptions2_StableHLOCompositeOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StableHLOCompositeOptionsT *AsStableHLOCompositeOptions() const { + return type == BuiltinOptions2_StableHLOCompositeOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloShiftLeftOptionsT *AsStablehloShiftLeftOptions() { + return type == BuiltinOptions2_StablehloShiftLeftOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloShiftLeftOptionsT *AsStablehloShiftLeftOptions() const { + return type == BuiltinOptions2_StablehloShiftLeftOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::StablehloCaseOptionsT *AsStablehloCaseOptions() { + return type == BuiltinOptions2_StablehloCaseOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::StablehloCaseOptionsT *AsStablehloCaseOptions() const { + return type == BuiltinOptions2_StablehloCaseOptions ? 
+ reinterpret_cast(value) : nullptr; + } +}; + +bool VerifyBuiltinOptions2(::flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions2 type); +bool VerifyBuiltinOptions2Vector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum StablehloPrecisionConfig : uint32_t { + StablehloPrecisionConfig_DEFAULT = 0, + StablehloPrecisionConfig_HIGH = 1, + StablehloPrecisionConfig_HIGHEST = 2, + StablehloPrecisionConfig_MIN = StablehloPrecisionConfig_DEFAULT, + StablehloPrecisionConfig_MAX = StablehloPrecisionConfig_HIGHEST +}; + +inline const StablehloPrecisionConfig (&EnumValuesStablehloPrecisionConfig())[3] { + static const StablehloPrecisionConfig values[] = { + StablehloPrecisionConfig_DEFAULT, + StablehloPrecisionConfig_HIGH, + StablehloPrecisionConfig_HIGHEST + }; + return values; +} + +inline const char * const *EnumNamesStablehloPrecisionConfig() { + static const char * const names[4] = { + "DEFAULT", + "HIGH", + "HIGHEST", + nullptr + }; + return names; +} + +inline const char *EnumNameStablehloPrecisionConfig(StablehloPrecisionConfig e) { + if (::flatbuffers::IsOutRange(e, StablehloPrecisionConfig_DEFAULT, StablehloPrecisionConfig_HIGHEST)) return ""; + const size_t index = static_cast(e); + return EnumNamesStablehloPrecisionConfig()[index]; +} + +enum StablehloComparisonDirection : uint32_t { + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ = 0, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_NE = 1, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_GE = 2, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_GT = 3, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LE = 4, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LT = 5, + StablehloComparisonDirection_MIN = StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ, + StablehloComparisonDirection_MAX = StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LT +}; + +inline const StablehloComparisonDirection (&EnumValuesStablehloComparisonDirection())[6] { + static const StablehloComparisonDirection values[] = { + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_NE, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_GE, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_GT, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LE, + StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LT + }; + return values; +} + +inline const char * const *EnumNamesStablehloComparisonDirection() { + static const char * const names[7] = { + "STABLEHLO_COMPARISON_DIRECTION_EQ", + "STABLEHLO_COMPARISON_DIRECTION_NE", + "STABLEHLO_COMPARISON_DIRECTION_GE", + "STABLEHLO_COMPARISON_DIRECTION_GT", + "STABLEHLO_COMPARISON_DIRECTION_LE", + "STABLEHLO_COMPARISON_DIRECTION_LT", + nullptr + }; + return names; +} + +inline const char *EnumNameStablehloComparisonDirection(StablehloComparisonDirection e) { + if (::flatbuffers::IsOutRange(e, StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ, StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_LT)) return ""; + const size_t index = static_cast(e); + return EnumNamesStablehloComparisonDirection()[index]; +} + +enum StablehloComparisonType : uint32_t { + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE = 0, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_FLOAT = 1, + 
StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER = 2, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_SIGNED = 3, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_UNSIGNED = 4, + StablehloComparisonType_MIN = StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE, + StablehloComparisonType_MAX = StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_UNSIGNED +}; + +inline const StablehloComparisonType (&EnumValuesStablehloComparisonType())[5] { + static const StablehloComparisonType values[] = { + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_FLOAT, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_SIGNED, + StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_UNSIGNED + }; + return values; +} + +inline const char * const *EnumNamesStablehloComparisonType() { + static const char * const names[6] = { + "STABLEHLO_COMPARISON_TYPE_NOTYPE", + "STABLEHLO_COMPARISON_TYPE_FLOAT", + "STABLEHLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER", + "STABLEHLO_COMPARISON_TYPE_SIGNED", + "STABLEHLO_COMPARISON_TYPE_UNSIGNED", + nullptr + }; + return names; +} + +inline const char *EnumNameStablehloComparisonType(StablehloComparisonType e) { + if (::flatbuffers::IsOutRange(e, StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE, StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_UNSIGNED)) return ""; + const size_t index = static_cast(e); + return EnumNamesStablehloComparisonType()[index]; +} + +enum RngAlgorithm : int8_t { + RngAlgorithm_DEFAULT = 0, + RngAlgorithm_PHILOX = 1, + RngAlgorithm_THREEFRY = 2, + RngAlgorithm_MIN = RngAlgorithm_DEFAULT, + RngAlgorithm_MAX = RngAlgorithm_THREEFRY +}; + +inline const RngAlgorithm (&EnumValuesRngAlgorithm())[3] { + static const RngAlgorithm values[] = { + RngAlgorithm_DEFAULT, + RngAlgorithm_PHILOX, + RngAlgorithm_THREEFRY + }; + return values; +} + +inline const char * const *EnumNamesRngAlgorithm() { + static const char * const names[4] = { + "DEFAULT", + "PHILOX", + "THREEFRY", + nullptr + }; + return names; +} + +inline const char *EnumNameRngAlgorithm(RngAlgorithm e) { + if (::flatbuffers::IsOutRange(e, RngAlgorithm_DEFAULT, RngAlgorithm_THREEFRY)) return ""; + const size_t index = static_cast(e); + return EnumNamesRngAlgorithm()[index]; +} + +enum Padding : int8_t { + Padding_SAME = 0, + Padding_VALID = 1, + Padding_MIN = Padding_SAME, + Padding_MAX = Padding_VALID +}; + +inline const Padding (&EnumValuesPadding())[2] { + static const Padding values[] = { + Padding_SAME, + Padding_VALID + }; + return values; +} + +inline const char * const *EnumNamesPadding() { + static const char * const names[3] = { + "SAME", + "VALID", + nullptr + }; + return names; +} + +inline const char *EnumNamePadding(Padding e) { + if (::flatbuffers::IsOutRange(e, Padding_SAME, Padding_VALID)) return ""; + const size_t index = static_cast(e); + return EnumNamesPadding()[index]; +} + +enum ActivationFunctionType : int8_t { + ActivationFunctionType_NONE = 0, + ActivationFunctionType_RELU = 1, + ActivationFunctionType_RELU_N1_TO_1 = 2, + ActivationFunctionType_RELU6 = 3, + ActivationFunctionType_TANH = 4, + ActivationFunctionType_SIGN_BIT = 5, + ActivationFunctionType_MIN = ActivationFunctionType_NONE, + ActivationFunctionType_MAX = ActivationFunctionType_SIGN_BIT +}; + +inline const ActivationFunctionType (&EnumValuesActivationFunctionType())[6] { + static const ActivationFunctionType values[] = { + ActivationFunctionType_NONE, + 
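// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): the EnumName*/EnumValues*
// helpers above map schema enums to their string names, e.g. for logging;
// out-of-range values return an empty string. Example values are assumptions.
//
//   const char *pad = tflite::EnumNamePadding(tflite::Padding_SAME);        // "SAME"
//   const char *act = tflite::EnumNameActivationFunctionType(
//       tflite::ActivationFunctionType_RELU6);                              // "RELU6"
//   const char *bad = tflite::EnumNamePadding(static_cast<tflite::Padding>(42)); // ""
// ---------------------------------------------------------------------------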
ActivationFunctionType_RELU, + ActivationFunctionType_RELU_N1_TO_1, + ActivationFunctionType_RELU6, + ActivationFunctionType_TANH, + ActivationFunctionType_SIGN_BIT + }; + return values; +} + +inline const char * const *EnumNamesActivationFunctionType() { + static const char * const names[7] = { + "NONE", + "RELU", + "RELU_N1_TO_1", + "RELU6", + "TANH", + "SIGN_BIT", + nullptr + }; + return names; +} + +inline const char *EnumNameActivationFunctionType(ActivationFunctionType e) { + if (::flatbuffers::IsOutRange(e, ActivationFunctionType_NONE, ActivationFunctionType_SIGN_BIT)) return ""; + const size_t index = static_cast(e); + return EnumNamesActivationFunctionType()[index]; +} + +enum LSHProjectionType : int8_t { + LSHProjectionType_UNKNOWN = 0, + LSHProjectionType_SPARSE = 1, + LSHProjectionType_DENSE = 2, + LSHProjectionType_MIN = LSHProjectionType_UNKNOWN, + LSHProjectionType_MAX = LSHProjectionType_DENSE +}; + +inline const LSHProjectionType (&EnumValuesLSHProjectionType())[3] { + static const LSHProjectionType values[] = { + LSHProjectionType_UNKNOWN, + LSHProjectionType_SPARSE, + LSHProjectionType_DENSE + }; + return values; +} + +inline const char * const *EnumNamesLSHProjectionType() { + static const char * const names[4] = { + "UNKNOWN", + "SPARSE", + "DENSE", + nullptr + }; + return names; +} + +inline const char *EnumNameLSHProjectionType(LSHProjectionType e) { + if (::flatbuffers::IsOutRange(e, LSHProjectionType_UNKNOWN, LSHProjectionType_DENSE)) return ""; + const size_t index = static_cast(e); + return EnumNamesLSHProjectionType()[index]; +} + +enum FullyConnectedOptionsWeightsFormat : int8_t { + FullyConnectedOptionsWeightsFormat_DEFAULT = 0, + FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8 = 1, + FullyConnectedOptionsWeightsFormat_MIN = FullyConnectedOptionsWeightsFormat_DEFAULT, + FullyConnectedOptionsWeightsFormat_MAX = FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8 +}; + +inline const FullyConnectedOptionsWeightsFormat (&EnumValuesFullyConnectedOptionsWeightsFormat())[2] { + static const FullyConnectedOptionsWeightsFormat values[] = { + FullyConnectedOptionsWeightsFormat_DEFAULT, + FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8 + }; + return values; +} + +inline const char * const *EnumNamesFullyConnectedOptionsWeightsFormat() { + static const char * const names[3] = { + "DEFAULT", + "SHUFFLED4x16INT8", + nullptr + }; + return names; +} + +inline const char *EnumNameFullyConnectedOptionsWeightsFormat(FullyConnectedOptionsWeightsFormat e) { + if (::flatbuffers::IsOutRange(e, FullyConnectedOptionsWeightsFormat_DEFAULT, FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8)) return ""; + const size_t index = static_cast(e); + return EnumNamesFullyConnectedOptionsWeightsFormat()[index]; +} + +enum LSTMKernelType : int8_t { + LSTMKernelType_FULL = 0, + LSTMKernelType_BASIC = 1, + LSTMKernelType_MIN = LSTMKernelType_FULL, + LSTMKernelType_MAX = LSTMKernelType_BASIC +}; + +inline const LSTMKernelType (&EnumValuesLSTMKernelType())[2] { + static const LSTMKernelType values[] = { + LSTMKernelType_FULL, + LSTMKernelType_BASIC + }; + return values; +} + +inline const char * const *EnumNamesLSTMKernelType() { + static const char * const names[3] = { + "FULL", + "BASIC", + nullptr + }; + return names; +} + +inline const char *EnumNameLSTMKernelType(LSTMKernelType e) { + if (::flatbuffers::IsOutRange(e, LSTMKernelType_FULL, LSTMKernelType_BASIC)) return ""; + const size_t index = static_cast(e); + return EnumNamesLSTMKernelType()[index]; +} + +enum CombinerType : int8_t { 
+  CombinerType_SUM = 0,
+  CombinerType_MEAN = 1,
+  CombinerType_SQRTN = 2,
+  CombinerType_MIN = CombinerType_SUM,
+  CombinerType_MAX = CombinerType_SQRTN
+};
+
+inline const CombinerType (&EnumValuesCombinerType())[3] {
+  static const CombinerType values[] = {
+    CombinerType_SUM,
+    CombinerType_MEAN,
+    CombinerType_SQRTN
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesCombinerType() {
+  static const char * const names[4] = {
+    "SUM",
+    "MEAN",
+    "SQRTN",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameCombinerType(CombinerType e) {
+  if (::flatbuffers::IsOutRange(e, CombinerType_SUM, CombinerType_SQRTN)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesCombinerType()[index];
+}
+
+enum MirrorPadMode : int8_t {
+  MirrorPadMode_REFLECT = 0,
+  MirrorPadMode_SYMMETRIC = 1,
+  MirrorPadMode_MIN = MirrorPadMode_REFLECT,
+  MirrorPadMode_MAX = MirrorPadMode_SYMMETRIC
+};
+
+inline const MirrorPadMode (&EnumValuesMirrorPadMode())[2] {
+  static const MirrorPadMode values[] = {
+    MirrorPadMode_REFLECT,
+    MirrorPadMode_SYMMETRIC
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesMirrorPadMode() {
+  static const char * const names[3] = {
+    "REFLECT",
+    "SYMMETRIC",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameMirrorPadMode(MirrorPadMode e) {
+  if (::flatbuffers::IsOutRange(e, MirrorPadMode_REFLECT, MirrorPadMode_SYMMETRIC)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesMirrorPadMode()[index];
+}
+
+enum ReduceWindowFunction : int32_t {
+  ReduceWindowFunction_UNSUPPORTED = 0,
+  ReduceWindowFunction_ADD = 1,
+  ReduceWindowFunction_MUL = 2,
+  ReduceWindowFunction_MINIMUM = 3,
+  ReduceWindowFunction_MAXIMUM = 4,
+  ReduceWindowFunction_ALL = 5,
+  ReduceWindowFunction_ANY = 6,
+  ReduceWindowFunction_MIN = ReduceWindowFunction_UNSUPPORTED,
+  ReduceWindowFunction_MAX = ReduceWindowFunction_ANY
+};
+
+inline const ReduceWindowFunction (&EnumValuesReduceWindowFunction())[7] {
+  static const ReduceWindowFunction values[] = {
+    ReduceWindowFunction_UNSUPPORTED,
+    ReduceWindowFunction_ADD,
+    ReduceWindowFunction_MUL,
+    ReduceWindowFunction_MINIMUM,
+    ReduceWindowFunction_MAXIMUM,
+    ReduceWindowFunction_ALL,
+    ReduceWindowFunction_ANY
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesReduceWindowFunction() {
+  static const char * const names[8] = {
+    "UNSUPPORTED",
+    "ADD",
+    "MUL",
+    "MINIMUM",
+    "MAXIMUM",
+    "ALL",
+    "ANY",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameReduceWindowFunction(ReduceWindowFunction e) {
+  if (::flatbuffers::IsOutRange(e, ReduceWindowFunction_UNSUPPORTED, ReduceWindowFunction_ANY)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesReduceWindowFunction()[index];
+}
+
+enum CustomOptionsFormat : int8_t {
+  CustomOptionsFormat_FLEXBUFFERS = 0,
+  CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
+  CustomOptionsFormat_MAX = CustomOptionsFormat_FLEXBUFFERS
+};
+
+inline const CustomOptionsFormat (&EnumValuesCustomOptionsFormat())[1] {
+  static const CustomOptionsFormat values[] = {
+    CustomOptionsFormat_FLEXBUFFERS
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesCustomOptionsFormat() {
+  static const char * const names[2] = {
+    "FLEXBUFFERS",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
+  if (::flatbuffers::IsOutRange(e, CustomOptionsFormat_FLEXBUFFERS, CustomOptionsFormat_FLEXBUFFERS)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return
EnumNamesCustomOptionsFormat()[index]; +} + +struct CustomQuantizationT : public ::flatbuffers::NativeTable { + typedef CustomQuantization TableType; + std::vector custom{}; +}; + +struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef CustomQuantizationT NativeTableType; + typedef CustomQuantizationBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_CUSTOM = 4 + }; + const ::flatbuffers::Vector *custom() const { + return GetPointer *>(VT_CUSTOM); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_CUSTOM) && + verifier.VerifyVector(custom()) && + verifier.EndTable(); + } + CustomQuantizationT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(CustomQuantizationT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct CustomQuantizationBuilder { + typedef CustomQuantization Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_custom(::flatbuffers::Offset<::flatbuffers::Vector> custom) { + fbb_.AddOffset(CustomQuantization::VT_CUSTOM, custom); + } + explicit CustomQuantizationBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateCustomQuantization( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> custom = 0) { + CustomQuantizationBuilder builder_(_fbb); + builder_.add_custom(custom); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateCustomQuantizationDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *custom = nullptr) { + if (custom) { _fbb.ForceVectorAlignment(custom->size(), sizeof(uint8_t), 16); } + auto custom__ = custom ? 
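// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): packing a CustomQuantization
// table from a raw byte payload with the Direct helper defined here. The
// 16-byte ForceVectorAlignment call above is what keeps the custom buffer
// aligned inside the finished flatbuffer. The payload bytes are assumptions.
//
//   ::flatbuffers::FlatBufferBuilder fbb;
//   std::vector<uint8_t> payload = {0x01, 0x02, 0x03};
//   auto cq = tflite::CreateCustomQuantizationDirect(fbb, &payload);
//   fbb.Finish(cq);
// ---------------------------------------------------------------------------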
_fbb.CreateVector(*custom) : 0; + return tflite::CreateCustomQuantization( + _fbb, + custom__); +} + +::flatbuffers::Offset CreateCustomQuantization(::flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BlockwiseQuantizationT : public ::flatbuffers::NativeTable { + typedef BlockwiseQuantization TableType; + int32_t scales = 0; + int32_t zero_points = 0; + int32_t block_size = 0; +}; + +struct BlockwiseQuantization FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BlockwiseQuantizationT NativeTableType; + typedef BlockwiseQuantizationBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SCALES = 4, + VT_ZERO_POINTS = 6, + VT_BLOCK_SIZE = 8 + }; + int32_t scales() const { + return GetField(VT_SCALES, 0); + } + int32_t zero_points() const { + return GetField(VT_ZERO_POINTS, 0); + } + int32_t block_size() const { + return GetField(VT_BLOCK_SIZE, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_SCALES, 4) && + VerifyField(verifier, VT_ZERO_POINTS, 4) && + VerifyField(verifier, VT_BLOCK_SIZE, 4) && + verifier.EndTable(); + } + BlockwiseQuantizationT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BlockwiseQuantizationT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BlockwiseQuantizationT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BlockwiseQuantizationBuilder { + typedef BlockwiseQuantization Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_scales(int32_t scales) { + fbb_.AddElement(BlockwiseQuantization::VT_SCALES, scales, 0); + } + void add_zero_points(int32_t zero_points) { + fbb_.AddElement(BlockwiseQuantization::VT_ZERO_POINTS, zero_points, 0); + } + void add_block_size(int32_t block_size) { + fbb_.AddElement(BlockwiseQuantization::VT_BLOCK_SIZE, block_size, 0); + } + explicit BlockwiseQuantizationBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBlockwiseQuantization( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t scales = 0, + int32_t zero_points = 0, + int32_t block_size = 0) { + BlockwiseQuantizationBuilder builder_(_fbb); + builder_.add_block_size(block_size); + builder_.add_zero_points(zero_points); + builder_.add_scales(scales); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBlockwiseQuantization(::flatbuffers::FlatBufferBuilder &_fbb, const BlockwiseQuantizationT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct QuantizationParametersT : public ::flatbuffers::NativeTable { + typedef QuantizationParameters TableType; + std::vector min{}; + std::vector max{}; + std::vector scale{}; + std::vector zero_point{}; + tflite::QuantizationDetailsUnion details{}; + int32_t quantized_dimension = 0; +}; + +struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef QuantizationParametersT NativeTableType; + typedef QuantizationParametersBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MIN = 4, 
+ VT_MAX = 6, + VT_SCALE = 8, + VT_ZERO_POINT = 10, + VT_DETAILS_TYPE = 12, + VT_DETAILS = 14, + VT_QUANTIZED_DIMENSION = 16 + }; + const ::flatbuffers::Vector *min() const { + return GetPointer *>(VT_MIN); + } + const ::flatbuffers::Vector *max() const { + return GetPointer *>(VT_MAX); + } + const ::flatbuffers::Vector *scale() const { + return GetPointer *>(VT_SCALE); + } + const ::flatbuffers::Vector *zero_point() const { + return GetPointer *>(VT_ZERO_POINT); + } + tflite::QuantizationDetails details_type() const { + return static_cast(GetField(VT_DETAILS_TYPE, 0)); + } + const void *details() const { + return GetPointer(VT_DETAILS); + } + template const T *details_as() const; + const tflite::CustomQuantization *details_as_CustomQuantization() const { + return details_type() == tflite::QuantizationDetails_CustomQuantization ? static_cast(details()) : nullptr; + } + const tflite::BlockwiseQuantization *details_as_BlockwiseQuantization() const { + return details_type() == tflite::QuantizationDetails_BlockwiseQuantization ? static_cast(details()) : nullptr; + } + int32_t quantized_dimension() const { + return GetField(VT_QUANTIZED_DIMENSION, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_MIN) && + verifier.VerifyVector(min()) && + VerifyOffset(verifier, VT_MAX) && + verifier.VerifyVector(max()) && + VerifyOffset(verifier, VT_SCALE) && + verifier.VerifyVector(scale()) && + VerifyOffset(verifier, VT_ZERO_POINT) && + verifier.VerifyVector(zero_point()) && + VerifyField(verifier, VT_DETAILS_TYPE, 1) && + VerifyOffset(verifier, VT_DETAILS) && + VerifyQuantizationDetails(verifier, details(), details_type()) && + VerifyField(verifier, VT_QUANTIZED_DIMENSION, 4) && + verifier.EndTable(); + } + QuantizationParametersT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(QuantizationParametersT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +template<> inline const tflite::CustomQuantization *QuantizationParameters::details_as() const { + return details_as_CustomQuantization(); +} + +template<> inline const tflite::BlockwiseQuantization *QuantizationParameters::details_as() const { + return details_as_BlockwiseQuantization(); +} + +struct QuantizationParametersBuilder { + typedef QuantizationParameters Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_min(::flatbuffers::Offset<::flatbuffers::Vector> min) { + fbb_.AddOffset(QuantizationParameters::VT_MIN, min); + } + void add_max(::flatbuffers::Offset<::flatbuffers::Vector> max) { + fbb_.AddOffset(QuantizationParameters::VT_MAX, max); + } + void add_scale(::flatbuffers::Offset<::flatbuffers::Vector> scale) { + fbb_.AddOffset(QuantizationParameters::VT_SCALE, scale); + } + void add_zero_point(::flatbuffers::Offset<::flatbuffers::Vector> zero_point) { + fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point); + } + void add_details_type(tflite::QuantizationDetails details_type) { + fbb_.AddElement(QuantizationParameters::VT_DETAILS_TYPE, static_cast(details_type), 0); + } + void add_details(::flatbuffers::Offset details) { + fbb_.AddOffset(QuantizationParameters::VT_DETAILS, details); + } + void add_quantized_dimension(int32_t quantized_dimension) { + 
fbb_.AddElement(QuantizationParameters::VT_QUANTIZED_DIMENSION, quantized_dimension, 0); + } + explicit QuantizationParametersBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateQuantizationParameters( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> min = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> max = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> scale = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> zero_point = 0, + tflite::QuantizationDetails details_type = tflite::QuantizationDetails_NONE, + ::flatbuffers::Offset details = 0, + int32_t quantized_dimension = 0) { + QuantizationParametersBuilder builder_(_fbb); + builder_.add_quantized_dimension(quantized_dimension); + builder_.add_details(details); + builder_.add_zero_point(zero_point); + builder_.add_scale(scale); + builder_.add_max(max); + builder_.add_min(min); + builder_.add_details_type(details_type); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateQuantizationParametersDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *min = nullptr, + const std::vector *max = nullptr, + const std::vector *scale = nullptr, + const std::vector *zero_point = nullptr, + tflite::QuantizationDetails details_type = tflite::QuantizationDetails_NONE, + ::flatbuffers::Offset details = 0, + int32_t quantized_dimension = 0) { + auto min__ = min ? _fbb.CreateVector(*min) : 0; + auto max__ = max ? _fbb.CreateVector(*max) : 0; + auto scale__ = scale ? _fbb.CreateVector(*scale) : 0; + auto zero_point__ = zero_point ? 
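// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): per-tensor affine quantization
// metadata via the Direct helper, leaving the optional details union empty.
// Element types (float scale, int64 zero_point) follow the tflite schema;
// the numeric values are assumptions.
//
//   ::flatbuffers::FlatBufferBuilder fbb;
//   std::vector<float>   scale      = {0.0078125f};
//   std::vector<int64_t> zero_point = {0};
//   auto qp = tflite::CreateQuantizationParametersDirect(
//       fbb, /*min=*/nullptr, /*max=*/nullptr, &scale, &zero_point,
//       tflite::QuantizationDetails_NONE, /*details=*/0,
//       /*quantized_dimension=*/0);
// ---------------------------------------------------------------------------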
_fbb.CreateVector(*zero_point) : 0; + return tflite::CreateQuantizationParameters( + _fbb, + min__, + max__, + scale__, + zero_point__, + details_type, + details, + quantized_dimension); +} + +::flatbuffers::Offset CreateQuantizationParameters(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Int32VectorT : public ::flatbuffers::NativeTable { + typedef Int32Vector TableType; + std::vector values{}; +}; + +struct Int32Vector FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Int32VectorT NativeTableType; + typedef Int32VectorBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES = 4 + }; + const ::flatbuffers::Vector *values() const { + return GetPointer *>(VT_VALUES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_VALUES) && + verifier.VerifyVector(values()) && + verifier.EndTable(); + } + Int32VectorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Int32VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Int32VectorBuilder { + typedef Int32Vector Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_values(::flatbuffers::Offset<::flatbuffers::Vector> values) { + fbb_.AddOffset(Int32Vector::VT_VALUES, values); + } + explicit Int32VectorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateInt32Vector( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> values = 0) { + Int32VectorBuilder builder_(_fbb); + builder_.add_values(values); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateInt32VectorDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *values = nullptr) { + auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateInt32Vector( + _fbb, + values__); +} + +::flatbuffers::Offset CreateInt32Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Uint16VectorT : public ::flatbuffers::NativeTable { + typedef Uint16Vector TableType; + std::vector values{}; +}; + +struct Uint16Vector FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Uint16VectorT NativeTableType; + typedef Uint16VectorBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES = 4 + }; + const ::flatbuffers::Vector *values() const { + return GetPointer *>(VT_VALUES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_VALUES) && + verifier.VerifyVector(values()) && + verifier.EndTable(); + } + Uint16VectorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Uint16VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Uint16VectorBuilder { + typedef Uint16Vector Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_values(::flatbuffers::Offset<::flatbuffers::Vector> values) { + fbb_.AddOffset(Uint16Vector::VT_VALUES, values); + } + explicit Uint16VectorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUint16Vector( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> values = 0) { + Uint16VectorBuilder builder_(_fbb); + builder_.add_values(values); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateUint16VectorDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *values = nullptr) { + if (values) { _fbb.ForceVectorAlignment(values->size(), sizeof(uint16_t), 4); } + auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateUint16Vector( + _fbb, + values__); +} + +::flatbuffers::Offset CreateUint16Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Uint8VectorT : public ::flatbuffers::NativeTable { + typedef Uint8Vector TableType; + std::vector values{}; +}; + +struct Uint8Vector FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Uint8VectorT NativeTableType; + typedef Uint8VectorBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES = 4 + }; + const ::flatbuffers::Vector *values() const { + return GetPointer *>(VT_VALUES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_VALUES) && + verifier.VerifyVector(values()) && + verifier.EndTable(); + } + Uint8VectorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Uint8VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Uint8VectorBuilder { + typedef Uint8Vector Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_values(::flatbuffers::Offset<::flatbuffers::Vector> values) { + fbb_.AddOffset(Uint8Vector::VT_VALUES, values); + } + explicit Uint8VectorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUint8Vector( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> values = 0) { + Uint8VectorBuilder builder_(_fbb); + builder_.add_values(values); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateUint8VectorDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *values = nullptr) { + if (values) { _fbb.ForceVectorAlignment(values->size(), sizeof(uint8_t), 4); } + auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateUint8Vector( + _fbb, + values__); +} + +::flatbuffers::Offset CreateUint8Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DimensionMetadataT : public ::flatbuffers::NativeTable { + typedef DimensionMetadata TableType; + tflite::DimensionType format = tflite::DimensionType_DENSE; + int32_t dense_size = 0; + tflite::SparseIndexVectorUnion array_segments{}; + tflite::SparseIndexVectorUnion array_indices{}; +}; + +struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DimensionMetadataT NativeTableType; + typedef DimensionMetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FORMAT = 4, + VT_DENSE_SIZE = 6, + VT_ARRAY_SEGMENTS_TYPE = 8, + VT_ARRAY_SEGMENTS = 10, + VT_ARRAY_INDICES_TYPE = 12, + VT_ARRAY_INDICES = 14 + }; + tflite::DimensionType format() const { + return static_cast(GetField(VT_FORMAT, 0)); + } + int32_t dense_size() const { + return GetField(VT_DENSE_SIZE, 0); + } + tflite::SparseIndexVector array_segments_type() const { + return static_cast(GetField(VT_ARRAY_SEGMENTS_TYPE, 0)); + } + const void *array_segments() const { + return GetPointer(VT_ARRAY_SEGMENTS); + } + template const T *array_segments_as() const; + const tflite::Int32Vector *array_segments_as_Int32Vector() const { + return array_segments_type() == tflite::SparseIndexVector_Int32Vector ? static_cast(array_segments()) : nullptr; + } + const tflite::Uint16Vector *array_segments_as_Uint16Vector() const { + return array_segments_type() == tflite::SparseIndexVector_Uint16Vector ? static_cast(array_segments()) : nullptr; + } + const tflite::Uint8Vector *array_segments_as_Uint8Vector() const { + return array_segments_type() == tflite::SparseIndexVector_Uint8Vector ? static_cast(array_segments()) : nullptr; + } + tflite::SparseIndexVector array_indices_type() const { + return static_cast(GetField(VT_ARRAY_INDICES_TYPE, 0)); + } + const void *array_indices() const { + return GetPointer(VT_ARRAY_INDICES); + } + template const T *array_indices_as() const; + const tflite::Int32Vector *array_indices_as_Int32Vector() const { + return array_indices_type() == tflite::SparseIndexVector_Int32Vector ? static_cast(array_indices()) : nullptr; + } + const tflite::Uint16Vector *array_indices_as_Uint16Vector() const { + return array_indices_type() == tflite::SparseIndexVector_Uint16Vector ? static_cast(array_indices()) : nullptr; + } + const tflite::Uint8Vector *array_indices_as_Uint8Vector() const { + return array_indices_type() == tflite::SparseIndexVector_Uint8Vector ? 
static_cast(array_indices()) : nullptr; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FORMAT, 1) && + VerifyField(verifier, VT_DENSE_SIZE, 4) && + VerifyField(verifier, VT_ARRAY_SEGMENTS_TYPE, 1) && + VerifyOffset(verifier, VT_ARRAY_SEGMENTS) && + VerifySparseIndexVector(verifier, array_segments(), array_segments_type()) && + VerifyField(verifier, VT_ARRAY_INDICES_TYPE, 1) && + VerifyOffset(verifier, VT_ARRAY_INDICES) && + VerifySparseIndexVector(verifier, array_indices(), array_indices_type()) && + verifier.EndTable(); + } + DimensionMetadataT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DimensionMetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +template<> inline const tflite::Int32Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Int32Vector(); +} + +template<> inline const tflite::Uint16Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Uint16Vector(); +} + +template<> inline const tflite::Uint8Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Uint8Vector(); +} + +template<> inline const tflite::Int32Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Int32Vector(); +} + +template<> inline const tflite::Uint16Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Uint16Vector(); +} + +template<> inline const tflite::Uint8Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Uint8Vector(); +} + +struct DimensionMetadataBuilder { + typedef DimensionMetadata Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_format(tflite::DimensionType format) { + fbb_.AddElement(DimensionMetadata::VT_FORMAT, static_cast(format), 0); + } + void add_dense_size(int32_t dense_size) { + fbb_.AddElement(DimensionMetadata::VT_DENSE_SIZE, dense_size, 0); + } + void add_array_segments_type(tflite::SparseIndexVector array_segments_type) { + fbb_.AddElement(DimensionMetadata::VT_ARRAY_SEGMENTS_TYPE, static_cast(array_segments_type), 0); + } + void add_array_segments(::flatbuffers::Offset array_segments) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_SEGMENTS, array_segments); + } + void add_array_indices_type(tflite::SparseIndexVector array_indices_type) { + fbb_.AddElement(DimensionMetadata::VT_ARRAY_INDICES_TYPE, static_cast(array_indices_type), 0); + } + void add_array_indices(::flatbuffers::Offset array_indices) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_INDICES, array_indices); + } + explicit DimensionMetadataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDimensionMetadata( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::DimensionType format = tflite::DimensionType_DENSE, + int32_t dense_size = 0, + tflite::SparseIndexVector array_segments_type = tflite::SparseIndexVector_NONE, + ::flatbuffers::Offset array_segments = 0, + tflite::SparseIndexVector array_indices_type = tflite::SparseIndexVector_NONE, + ::flatbuffers::Offset 
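// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): a single dense dimension entry;
// a sparse dimension would instead populate array_segments/array_indices
// through the SparseIndexVector union (Int32Vector, Uint16Vector or
// Uint8Vector above). The dense_size value is an assumption.
//
//   ::flatbuffers::FlatBufferBuilder fbb;
//   auto dense_dim = tflite::CreateDimensionMetadata(
//       fbb, tflite::DimensionType_DENSE, /*dense_size=*/4);
// ---------------------------------------------------------------------------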
array_indices = 0) { + DimensionMetadataBuilder builder_(_fbb); + builder_.add_array_indices(array_indices); + builder_.add_array_segments(array_segments); + builder_.add_dense_size(dense_size); + builder_.add_array_indices_type(array_indices_type); + builder_.add_array_segments_type(array_segments_type); + builder_.add_format(format); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDimensionMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SparsityParametersT : public ::flatbuffers::NativeTable { + typedef SparsityParameters TableType; + std::vector traversal_order{}; + std::vector block_map{}; + std::vector> dim_metadata{}; + SparsityParametersT() = default; + SparsityParametersT(const SparsityParametersT &o); + SparsityParametersT(SparsityParametersT&&) FLATBUFFERS_NOEXCEPT = default; + SparsityParametersT &operator=(SparsityParametersT o) FLATBUFFERS_NOEXCEPT; +}; + +struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SparsityParametersT NativeTableType; + typedef SparsityParametersBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TRAVERSAL_ORDER = 4, + VT_BLOCK_MAP = 6, + VT_DIM_METADATA = 8 + }; + const ::flatbuffers::Vector *traversal_order() const { + return GetPointer *>(VT_TRAVERSAL_ORDER); + } + const ::flatbuffers::Vector *block_map() const { + return GetPointer *>(VT_BLOCK_MAP); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *dim_metadata() const { + return GetPointer> *>(VT_DIM_METADATA); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TRAVERSAL_ORDER) && + verifier.VerifyVector(traversal_order()) && + VerifyOffset(verifier, VT_BLOCK_MAP) && + verifier.VerifyVector(block_map()) && + VerifyOffset(verifier, VT_DIM_METADATA) && + verifier.VerifyVector(dim_metadata()) && + verifier.VerifyVectorOfTables(dim_metadata()) && + verifier.EndTable(); + } + SparsityParametersT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SparsityParametersT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SparsityParametersBuilder { + typedef SparsityParameters Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_traversal_order(::flatbuffers::Offset<::flatbuffers::Vector> traversal_order) { + fbb_.AddOffset(SparsityParameters::VT_TRAVERSAL_ORDER, traversal_order); + } + void add_block_map(::flatbuffers::Offset<::flatbuffers::Vector> block_map) { + fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); + } + void add_dim_metadata(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> dim_metadata) { + fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); + } + explicit SparsityParametersBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSparsityParameters( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> traversal_order = 0, + 
::flatbuffers::Offset<::flatbuffers::Vector> block_map = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> dim_metadata = 0) { + SparsityParametersBuilder builder_(_fbb); + builder_.add_dim_metadata(dim_metadata); + builder_.add_block_map(block_map); + builder_.add_traversal_order(traversal_order); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSparsityParametersDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *traversal_order = nullptr, + const std::vector *block_map = nullptr, + const std::vector<::flatbuffers::Offset> *dim_metadata = nullptr) { + auto traversal_order__ = traversal_order ? _fbb.CreateVector(*traversal_order) : 0; + auto block_map__ = block_map ? _fbb.CreateVector(*block_map) : 0; + auto dim_metadata__ = dim_metadata ? _fbb.CreateVector<::flatbuffers::Offset>(*dim_metadata) : 0; + return tflite::CreateSparsityParameters( + _fbb, + traversal_order__, + block_map__, + dim_metadata__); +} + +::flatbuffers::Offset CreateSparsityParameters(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct VariantSubTypeT : public ::flatbuffers::NativeTable { + typedef VariantSubType TableType; + std::vector shape{}; + tflite::TensorType type = tflite::TensorType_FLOAT32; + bool has_rank = false; +}; + +struct VariantSubType FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef VariantSubTypeT NativeTableType; + typedef VariantSubTypeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SHAPE = 4, + VT_TYPE = 6, + VT_HAS_RANK = 8 + }; + const ::flatbuffers::Vector *shape() const { + return GetPointer *>(VT_SHAPE); + } + tflite::TensorType type() const { + return static_cast(GetField(VT_TYPE, 0)); + } + bool has_rank() const { + return GetField(VT_HAS_RANK, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_SHAPE) && + verifier.VerifyVector(shape()) && + VerifyField(verifier, VT_TYPE, 1) && + VerifyField(verifier, VT_HAS_RANK, 1) && + verifier.EndTable(); + } + VariantSubTypeT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(VariantSubTypeT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const VariantSubTypeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct VariantSubTypeBuilder { + typedef VariantSubType Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_shape(::flatbuffers::Offset<::flatbuffers::Vector> shape) { + fbb_.AddOffset(VariantSubType::VT_SHAPE, shape); + } + void add_type(tflite::TensorType type) { + fbb_.AddElement(VariantSubType::VT_TYPE, static_cast(type), 0); + } + void add_has_rank(bool has_rank) { + fbb_.AddElement(VariantSubType::VT_HAS_RANK, static_cast(has_rank), 0); + } + explicit VariantSubTypeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateVariantSubType( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> shape = 0, + tflite::TensorType type = tflite::TensorType_FLOAT32, + bool has_rank = false) { + 
VariantSubTypeBuilder builder_(_fbb); + builder_.add_shape(shape); + builder_.add_has_rank(has_rank); + builder_.add_type(type); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateVariantSubTypeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *shape = nullptr, + tflite::TensorType type = tflite::TensorType_FLOAT32, + bool has_rank = false) { + auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; + return tflite::CreateVariantSubType( + _fbb, + shape__, + type, + has_rank); +} + +::flatbuffers::Offset CreateVariantSubType(::flatbuffers::FlatBufferBuilder &_fbb, const VariantSubTypeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TensorT : public ::flatbuffers::NativeTable { + typedef Tensor TableType; + std::vector shape{}; + tflite::TensorType type = tflite::TensorType_FLOAT32; + uint32_t buffer = 0; + std::string name{}; + std::unique_ptr quantization{}; + bool is_variable = false; + std::unique_ptr sparsity{}; + std::vector shape_signature{}; + bool has_rank = false; + std::vector> variant_tensors{}; + TensorT() = default; + TensorT(const TensorT &o); + TensorT(TensorT&&) FLATBUFFERS_NOEXCEPT = default; + TensorT &operator=(TensorT o) FLATBUFFERS_NOEXCEPT; +}; + +struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TensorT NativeTableType; + typedef TensorBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SHAPE = 4, + VT_TYPE = 6, + VT_BUFFER = 8, + VT_NAME = 10, + VT_QUANTIZATION = 12, + VT_IS_VARIABLE = 14, + VT_SPARSITY = 16, + VT_SHAPE_SIGNATURE = 18, + VT_HAS_RANK = 20, + VT_VARIANT_TENSORS = 22 + }; + const ::flatbuffers::Vector *shape() const { + return GetPointer *>(VT_SHAPE); + } + tflite::TensorType type() const { + return static_cast(GetField(VT_TYPE, 0)); + } + uint32_t buffer() const { + return GetField(VT_BUFFER, 0); + } + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + const tflite::QuantizationParameters *quantization() const { + return GetPointer(VT_QUANTIZATION); + } + bool is_variable() const { + return GetField(VT_IS_VARIABLE, 0) != 0; + } + const tflite::SparsityParameters *sparsity() const { + return GetPointer(VT_SPARSITY); + } + const ::flatbuffers::Vector *shape_signature() const { + return GetPointer *>(VT_SHAPE_SIGNATURE); + } + bool has_rank() const { + return GetField(VT_HAS_RANK, 0) != 0; + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *variant_tensors() const { + return GetPointer> *>(VT_VARIANT_TENSORS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_SHAPE) && + verifier.VerifyVector(shape()) && + VerifyField(verifier, VT_TYPE, 1) && + VerifyField(verifier, VT_BUFFER, 4) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyOffset(verifier, VT_QUANTIZATION) && + verifier.VerifyTable(quantization()) && + VerifyField(verifier, VT_IS_VARIABLE, 1) && + VerifyOffset(verifier, VT_SPARSITY) && + verifier.VerifyTable(sparsity()) && + VerifyOffset(verifier, VT_SHAPE_SIGNATURE) && + verifier.VerifyVector(shape_signature()) && + VerifyField(verifier, VT_HAS_RANK, 1) && + VerifyOffset(verifier, VT_VARIANT_TENSORS) && + verifier.VerifyVector(variant_tensors()) && + verifier.VerifyVectorOfTables(variant_tensors()) && + verifier.EndTable(); + } + TensorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TensorT *_o, const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TensorBuilder { + typedef Tensor Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_shape(::flatbuffers::Offset<::flatbuffers::Vector> shape) { + fbb_.AddOffset(Tensor::VT_SHAPE, shape); + } + void add_type(tflite::TensorType type) { + fbb_.AddElement(Tensor::VT_TYPE, static_cast(type), 0); + } + void add_buffer(uint32_t buffer) { + fbb_.AddElement(Tensor::VT_BUFFER, buffer, 0); + } + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(Tensor::VT_NAME, name); + } + void add_quantization(::flatbuffers::Offset quantization) { + fbb_.AddOffset(Tensor::VT_QUANTIZATION, quantization); + } + void add_is_variable(bool is_variable) { + fbb_.AddElement(Tensor::VT_IS_VARIABLE, static_cast(is_variable), 0); + } + void add_sparsity(::flatbuffers::Offset sparsity) { + fbb_.AddOffset(Tensor::VT_SPARSITY, sparsity); + } + void add_shape_signature(::flatbuffers::Offset<::flatbuffers::Vector> shape_signature) { + fbb_.AddOffset(Tensor::VT_SHAPE_SIGNATURE, shape_signature); + } + void add_has_rank(bool has_rank) { + fbb_.AddElement(Tensor::VT_HAS_RANK, static_cast(has_rank), 0); + } + void add_variant_tensors(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors) { + fbb_.AddOffset(Tensor::VT_VARIANT_TENSORS, variant_tensors); + } + explicit TensorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTensor( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> shape = 0, + tflite::TensorType type = tflite::TensorType_FLOAT32, + uint32_t buffer = 0, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + ::flatbuffers::Offset quantization = 0, + bool is_variable = false, + ::flatbuffers::Offset sparsity = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> shape_signature = 0, + bool has_rank = false, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors = 0) { + TensorBuilder builder_(_fbb); + builder_.add_variant_tensors(variant_tensors); + builder_.add_shape_signature(shape_signature); + builder_.add_sparsity(sparsity); + builder_.add_quantization(quantization); + builder_.add_name(name); + builder_.add_buffer(buffer); + builder_.add_shape(shape); + builder_.add_has_rank(has_rank); + builder_.add_is_variable(is_variable); + builder_.add_type(type); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateTensorDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *shape = nullptr, + tflite::TensorType type = tflite::TensorType_FLOAT32, + uint32_t buffer = 0, + const char *name = nullptr, + ::flatbuffers::Offset quantization = 0, + bool is_variable = false, + ::flatbuffers::Offset sparsity = 0, + const std::vector *shape_signature = nullptr, + bool has_rank = false, + const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr) { + auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; + auto name__ = name ? _fbb.CreateString(name) : 0; + auto shape_signature__ = shape_signature ? 
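// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): a minimal float tensor built
// with the Direct helper; buffer 0 conventionally refers to the empty
// sentinel buffer, and quantization/sparsity stay unset. The shape and
// tensor name are assumptions.
//
//   ::flatbuffers::FlatBufferBuilder fbb;
//   std::vector<int32_t> shape = {1, 224, 224, 3};
//   auto tensor = tflite::CreateTensorDirect(
//       fbb, &shape, tflite::TensorType_FLOAT32, /*buffer=*/0, "input");
// ---------------------------------------------------------------------------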
_fbb.CreateVector(*shape_signature) : 0; + auto variant_tensors__ = variant_tensors ? _fbb.CreateVector<::flatbuffers::Offset>(*variant_tensors) : 0; + return tflite::CreateTensor( + _fbb, + shape__, + type, + buffer, + name__, + quantization, + is_variable, + sparsity, + shape_signature__, + has_rank, + variant_tensors__); +} + +::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloGatherOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloGatherOptions TableType; + std::vector offset_dims{}; + std::vector collapsed_slice_dims{}; + std::vector start_index_map{}; + int64_t index_vector_dim = 0; + std::vector slice_sizes{}; + bool indices_are_sorted = false; +}; + +struct StablehloGatherOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloGatherOptionsT NativeTableType; + typedef StablehloGatherOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_OFFSET_DIMS = 4, + VT_COLLAPSED_SLICE_DIMS = 6, + VT_START_INDEX_MAP = 8, + VT_INDEX_VECTOR_DIM = 10, + VT_SLICE_SIZES = 12, + VT_INDICES_ARE_SORTED = 14 + }; + const ::flatbuffers::Vector *offset_dims() const { + return GetPointer *>(VT_OFFSET_DIMS); + } + const ::flatbuffers::Vector *collapsed_slice_dims() const { + return GetPointer *>(VT_COLLAPSED_SLICE_DIMS); + } + const ::flatbuffers::Vector *start_index_map() const { + return GetPointer *>(VT_START_INDEX_MAP); + } + int64_t index_vector_dim() const { + return GetField(VT_INDEX_VECTOR_DIM, 0); + } + const ::flatbuffers::Vector *slice_sizes() const { + return GetPointer *>(VT_SLICE_SIZES); + } + bool indices_are_sorted() const { + return GetField(VT_INDICES_ARE_SORTED, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_OFFSET_DIMS) && + verifier.VerifyVector(offset_dims()) && + VerifyOffset(verifier, VT_COLLAPSED_SLICE_DIMS) && + verifier.VerifyVector(collapsed_slice_dims()) && + VerifyOffset(verifier, VT_START_INDEX_MAP) && + verifier.VerifyVector(start_index_map()) && + VerifyField(verifier, VT_INDEX_VECTOR_DIM, 8) && + VerifyOffset(verifier, VT_SLICE_SIZES) && + verifier.VerifyVector(slice_sizes()) && + VerifyField(verifier, VT_INDICES_ARE_SORTED, 1) && + verifier.EndTable(); + } + StablehloGatherOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloGatherOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloGatherOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloGatherOptionsBuilder { + typedef StablehloGatherOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_offset_dims(::flatbuffers::Offset<::flatbuffers::Vector> offset_dims) { + fbb_.AddOffset(StablehloGatherOptions::VT_OFFSET_DIMS, offset_dims); + } + void add_collapsed_slice_dims(::flatbuffers::Offset<::flatbuffers::Vector> collapsed_slice_dims) { + fbb_.AddOffset(StablehloGatherOptions::VT_COLLAPSED_SLICE_DIMS, collapsed_slice_dims); + } + void add_start_index_map(::flatbuffers::Offset<::flatbuffers::Vector> start_index_map) { + fbb_.AddOffset(StablehloGatherOptions::VT_START_INDEX_MAP, start_index_map); + } + void add_index_vector_dim(int64_t index_vector_dim) { + 
fbb_.AddElement(StablehloGatherOptions::VT_INDEX_VECTOR_DIM, index_vector_dim, 0); + } + void add_slice_sizes(::flatbuffers::Offset<::flatbuffers::Vector> slice_sizes) { + fbb_.AddOffset(StablehloGatherOptions::VT_SLICE_SIZES, slice_sizes); + } + void add_indices_are_sorted(bool indices_are_sorted) { + fbb_.AddElement(StablehloGatherOptions::VT_INDICES_ARE_SORTED, static_cast(indices_are_sorted), 0); + } + explicit StablehloGatherOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloGatherOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> offset_dims = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> collapsed_slice_dims = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> start_index_map = 0, + int64_t index_vector_dim = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> slice_sizes = 0, + bool indices_are_sorted = false) { + StablehloGatherOptionsBuilder builder_(_fbb); + builder_.add_index_vector_dim(index_vector_dim); + builder_.add_slice_sizes(slice_sizes); + builder_.add_start_index_map(start_index_map); + builder_.add_collapsed_slice_dims(collapsed_slice_dims); + builder_.add_offset_dims(offset_dims); + builder_.add_indices_are_sorted(indices_are_sorted); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloGatherOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *offset_dims = nullptr, + const std::vector *collapsed_slice_dims = nullptr, + const std::vector *start_index_map = nullptr, + int64_t index_vector_dim = 0, + const std::vector *slice_sizes = nullptr, + bool indices_are_sorted = false) { + auto offset_dims__ = offset_dims ? _fbb.CreateVector(*offset_dims) : 0; + auto collapsed_slice_dims__ = collapsed_slice_dims ? _fbb.CreateVector(*collapsed_slice_dims) : 0; + auto start_index_map__ = start_index_map ? _fbb.CreateVector(*start_index_map) : 0; + auto slice_sizes__ = slice_sizes ? 
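// ---------------------------------------------------------------------------
// Illustrative sketch (not emitted by flatc): gather attributes mirroring
// stablehlo.gather, passed as int64 dimension vectors through the Direct
// helper. The dimension numbers are assumptions.
//
//   ::flatbuffers::FlatBufferBuilder fbb;
//   std::vector<int64_t> offset_dims = {1}, collapsed = {0}, start_map = {0};
//   std::vector<int64_t> slice_sizes = {1, 8};
//   auto gather = tflite::CreateStablehloGatherOptionsDirect(
//       fbb, &offset_dims, &collapsed, &start_map,
//       /*index_vector_dim=*/1, &slice_sizes, /*indices_are_sorted=*/false);
// ---------------------------------------------------------------------------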
_fbb.CreateVector(*slice_sizes) : 0; + return tflite::CreateStablehloGatherOptions( + _fbb, + offset_dims__, + collapsed_slice_dims__, + start_index_map__, + index_vector_dim, + slice_sizes__, + indices_are_sorted); +} + +::flatbuffers::Offset CreateStablehloGatherOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloGatherOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloTransposeOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloTransposeOptions TableType; + std::vector permutation{}; +}; + +struct StablehloTransposeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloTransposeOptionsT NativeTableType; + typedef StablehloTransposeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PERMUTATION = 4 + }; + const ::flatbuffers::Vector *permutation() const { + return GetPointer *>(VT_PERMUTATION); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_PERMUTATION) && + verifier.VerifyVector(permutation()) && + verifier.EndTable(); + } + StablehloTransposeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloTransposeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloTransposeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloTransposeOptionsBuilder { + typedef StablehloTransposeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_permutation(::flatbuffers::Offset<::flatbuffers::Vector> permutation) { + fbb_.AddOffset(StablehloTransposeOptions::VT_PERMUTATION, permutation); + } + explicit StablehloTransposeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloTransposeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> permutation = 0) { + StablehloTransposeOptionsBuilder builder_(_fbb); + builder_.add_permutation(permutation); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloTransposeOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *permutation = nullptr) { + auto permutation__ = permutation ? 
_fbb.CreateVector(*permutation) : 0; + return tflite::CreateStablehloTransposeOptions( + _fbb, + permutation__); +} + +::flatbuffers::Offset CreateStablehloTransposeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloTransposeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloDotGeneralOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloDotGeneralOptions TableType; + std::vector lhs_batching_dimensions{}; + std::vector rhs_batching_dimensions{}; + std::vector lhs_contracting_dimensions{}; + std::vector rhs_contracting_dimensions{}; + std::vector precision_config{}; +}; + +struct StablehloDotGeneralOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloDotGeneralOptionsT NativeTableType; + typedef StablehloDotGeneralOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_LHS_BATCHING_DIMENSIONS = 4, + VT_RHS_BATCHING_DIMENSIONS = 6, + VT_LHS_CONTRACTING_DIMENSIONS = 8, + VT_RHS_CONTRACTING_DIMENSIONS = 10, + VT_PRECISION_CONFIG = 12 + }; + const ::flatbuffers::Vector *lhs_batching_dimensions() const { + return GetPointer *>(VT_LHS_BATCHING_DIMENSIONS); + } + const ::flatbuffers::Vector *rhs_batching_dimensions() const { + return GetPointer *>(VT_RHS_BATCHING_DIMENSIONS); + } + const ::flatbuffers::Vector *lhs_contracting_dimensions() const { + return GetPointer *>(VT_LHS_CONTRACTING_DIMENSIONS); + } + const ::flatbuffers::Vector *rhs_contracting_dimensions() const { + return GetPointer *>(VT_RHS_CONTRACTING_DIMENSIONS); + } + const ::flatbuffers::Vector *precision_config() const { + return GetPointer *>(VT_PRECISION_CONFIG); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_LHS_BATCHING_DIMENSIONS) && + verifier.VerifyVector(lhs_batching_dimensions()) && + VerifyOffset(verifier, VT_RHS_BATCHING_DIMENSIONS) && + verifier.VerifyVector(rhs_batching_dimensions()) && + VerifyOffset(verifier, VT_LHS_CONTRACTING_DIMENSIONS) && + verifier.VerifyVector(lhs_contracting_dimensions()) && + VerifyOffset(verifier, VT_RHS_CONTRACTING_DIMENSIONS) && + verifier.VerifyVector(rhs_contracting_dimensions()) && + VerifyOffset(verifier, VT_PRECISION_CONFIG) && + verifier.VerifyVector(precision_config()) && + verifier.EndTable(); + } + StablehloDotGeneralOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloDotGeneralOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDotGeneralOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloDotGeneralOptionsBuilder { + typedef StablehloDotGeneralOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_lhs_batching_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> lhs_batching_dimensions) { + fbb_.AddOffset(StablehloDotGeneralOptions::VT_LHS_BATCHING_DIMENSIONS, lhs_batching_dimensions); + } + void add_rhs_batching_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> rhs_batching_dimensions) { + fbb_.AddOffset(StablehloDotGeneralOptions::VT_RHS_BATCHING_DIMENSIONS, rhs_batching_dimensions); + } + void add_lhs_contracting_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> lhs_contracting_dimensions) { + fbb_.AddOffset(StablehloDotGeneralOptions::VT_LHS_CONTRACTING_DIMENSIONS, 
lhs_contracting_dimensions); + } + void add_rhs_contracting_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> rhs_contracting_dimensions) { + fbb_.AddOffset(StablehloDotGeneralOptions::VT_RHS_CONTRACTING_DIMENSIONS, rhs_contracting_dimensions); + } + void add_precision_config(::flatbuffers::Offset<::flatbuffers::Vector> precision_config) { + fbb_.AddOffset(StablehloDotGeneralOptions::VT_PRECISION_CONFIG, precision_config); + } + explicit StablehloDotGeneralOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloDotGeneralOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> lhs_batching_dimensions = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> rhs_batching_dimensions = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> lhs_contracting_dimensions = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> rhs_contracting_dimensions = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> precision_config = 0) { + StablehloDotGeneralOptionsBuilder builder_(_fbb); + builder_.add_precision_config(precision_config); + builder_.add_rhs_contracting_dimensions(rhs_contracting_dimensions); + builder_.add_lhs_contracting_dimensions(lhs_contracting_dimensions); + builder_.add_rhs_batching_dimensions(rhs_batching_dimensions); + builder_.add_lhs_batching_dimensions(lhs_batching_dimensions); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloDotGeneralOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *lhs_batching_dimensions = nullptr, + const std::vector *rhs_batching_dimensions = nullptr, + const std::vector *lhs_contracting_dimensions = nullptr, + const std::vector *rhs_contracting_dimensions = nullptr, + const std::vector *precision_config = nullptr) { + auto lhs_batching_dimensions__ = lhs_batching_dimensions ? _fbb.CreateVector(*lhs_batching_dimensions) : 0; + auto rhs_batching_dimensions__ = rhs_batching_dimensions ? _fbb.CreateVector(*rhs_batching_dimensions) : 0; + auto lhs_contracting_dimensions__ = lhs_contracting_dimensions ? _fbb.CreateVector(*lhs_contracting_dimensions) : 0; + auto rhs_contracting_dimensions__ = rhs_contracting_dimensions ? _fbb.CreateVector(*rhs_contracting_dimensions) : 0; + auto precision_config__ = precision_config ? 
_fbb.CreateVector(*precision_config) : 0; + return tflite::CreateStablehloDotGeneralOptions( + _fbb, + lhs_batching_dimensions__, + rhs_batching_dimensions__, + lhs_contracting_dimensions__, + rhs_contracting_dimensions__, + precision_config__); +} + +::flatbuffers::Offset CreateStablehloDotGeneralOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDotGeneralOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloReduceWindowOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloReduceWindowOptions TableType; + std::vector window_dimensions{}; + std::vector window_strides{}; + std::vector base_dilations{}; + std::vector window_dilations{}; + std::vector padding{}; + int32_t body_subgraph_index = 0; +}; + +struct StablehloReduceWindowOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloReduceWindowOptionsT NativeTableType; + typedef StablehloReduceWindowOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_WINDOW_DIMENSIONS = 4, + VT_WINDOW_STRIDES = 6, + VT_BASE_DILATIONS = 8, + VT_WINDOW_DILATIONS = 10, + VT_PADDING = 12, + VT_BODY_SUBGRAPH_INDEX = 14 + }; + const ::flatbuffers::Vector *window_dimensions() const { + return GetPointer *>(VT_WINDOW_DIMENSIONS); + } + const ::flatbuffers::Vector *window_strides() const { + return GetPointer *>(VT_WINDOW_STRIDES); + } + const ::flatbuffers::Vector *base_dilations() const { + return GetPointer *>(VT_BASE_DILATIONS); + } + const ::flatbuffers::Vector *window_dilations() const { + return GetPointer *>(VT_WINDOW_DILATIONS); + } + const ::flatbuffers::Vector *padding() const { + return GetPointer *>(VT_PADDING); + } + int32_t body_subgraph_index() const { + return GetField(VT_BODY_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_WINDOW_DIMENSIONS) && + verifier.VerifyVector(window_dimensions()) && + VerifyOffset(verifier, VT_WINDOW_STRIDES) && + verifier.VerifyVector(window_strides()) && + VerifyOffset(verifier, VT_BASE_DILATIONS) && + verifier.VerifyVector(base_dilations()) && + VerifyOffset(verifier, VT_WINDOW_DILATIONS) && + verifier.VerifyVector(window_dilations()) && + VerifyOffset(verifier, VT_PADDING) && + verifier.VerifyVector(padding()) && + VerifyField(verifier, VT_BODY_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + StablehloReduceWindowOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloReduceWindowOptionsBuilder { + typedef StablehloReduceWindowOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_window_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> window_dimensions) { + fbb_.AddOffset(StablehloReduceWindowOptions::VT_WINDOW_DIMENSIONS, window_dimensions); + } + void add_window_strides(::flatbuffers::Offset<::flatbuffers::Vector> window_strides) { + fbb_.AddOffset(StablehloReduceWindowOptions::VT_WINDOW_STRIDES, window_strides); + } + void add_base_dilations(::flatbuffers::Offset<::flatbuffers::Vector> base_dilations) { + fbb_.AddOffset(StablehloReduceWindowOptions::VT_BASE_DILATIONS, 
base_dilations); + } + void add_window_dilations(::flatbuffers::Offset<::flatbuffers::Vector> window_dilations) { + fbb_.AddOffset(StablehloReduceWindowOptions::VT_WINDOW_DILATIONS, window_dilations); + } + void add_padding(::flatbuffers::Offset<::flatbuffers::Vector> padding) { + fbb_.AddOffset(StablehloReduceWindowOptions::VT_PADDING, padding); + } + void add_body_subgraph_index(int32_t body_subgraph_index) { + fbb_.AddElement(StablehloReduceWindowOptions::VT_BODY_SUBGRAPH_INDEX, body_subgraph_index, 0); + } + explicit StablehloReduceWindowOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloReduceWindowOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> window_dimensions = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> window_strides = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> base_dilations = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> window_dilations = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> padding = 0, + int32_t body_subgraph_index = 0) { + StablehloReduceWindowOptionsBuilder builder_(_fbb); + builder_.add_body_subgraph_index(body_subgraph_index); + builder_.add_padding(padding); + builder_.add_window_dilations(window_dilations); + builder_.add_base_dilations(base_dilations); + builder_.add_window_strides(window_strides); + builder_.add_window_dimensions(window_dimensions); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloReduceWindowOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *window_dimensions = nullptr, + const std::vector *window_strides = nullptr, + const std::vector *base_dilations = nullptr, + const std::vector *window_dilations = nullptr, + const std::vector *padding = nullptr, + int32_t body_subgraph_index = 0) { + auto window_dimensions__ = window_dimensions ? _fbb.CreateVector(*window_dimensions) : 0; + auto window_strides__ = window_strides ? _fbb.CreateVector(*window_strides) : 0; + auto base_dilations__ = base_dilations ? _fbb.CreateVector(*base_dilations) : 0; + auto window_dilations__ = window_dilations ? _fbb.CreateVector(*window_dilations) : 0; + auto padding__ = padding ? 
_fbb.CreateVector(*padding) : 0; + return tflite::CreateStablehloReduceWindowOptions( + _fbb, + window_dimensions__, + window_strides__, + base_dilations__, + window_dilations__, + padding__, + body_subgraph_index); +} + +::flatbuffers::Offset CreateStablehloReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloWhileOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloWhileOptions TableType; + int32_t cond_subgraph_index = 0; + int32_t body_subgraph_index = 0; +}; + +struct StablehloWhileOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloWhileOptionsT NativeTableType; + typedef StablehloWhileOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COND_SUBGRAPH_INDEX = 4, + VT_BODY_SUBGRAPH_INDEX = 6 + }; + int32_t cond_subgraph_index() const { + return GetField(VT_COND_SUBGRAPH_INDEX, 0); + } + int32_t body_subgraph_index() const { + return GetField(VT_BODY_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COND_SUBGRAPH_INDEX, 4) && + VerifyField(verifier, VT_BODY_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + StablehloWhileOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloWhileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloWhileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloWhileOptionsBuilder { + typedef StablehloWhileOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_cond_subgraph_index(int32_t cond_subgraph_index) { + fbb_.AddElement(StablehloWhileOptions::VT_COND_SUBGRAPH_INDEX, cond_subgraph_index, 0); + } + void add_body_subgraph_index(int32_t body_subgraph_index) { + fbb_.AddElement(StablehloWhileOptions::VT_BODY_SUBGRAPH_INDEX, body_subgraph_index, 0); + } + explicit StablehloWhileOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloWhileOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t cond_subgraph_index = 0, + int32_t body_subgraph_index = 0) { + StablehloWhileOptionsBuilder builder_(_fbb); + builder_.add_body_subgraph_index(body_subgraph_index); + builder_.add_cond_subgraph_index(cond_subgraph_index); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloWhileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloWhileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloSortOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloSortOptions TableType; + int64_t dimension = 0; + bool is_stable = false; + int32_t comparator_subgraph_index = 0; +}; + +struct StablehloSortOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloSortOptionsT NativeTableType; + typedef StablehloSortOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DIMENSION = 4, + VT_IS_STABLE = 6, + 
VT_COMPARATOR_SUBGRAPH_INDEX = 8
+  };
+  int64_t dimension() const {
+    return GetField<int64_t>(VT_DIMENSION, 0);
+  }
+  bool is_stable() const {
+    return GetField<uint8_t>(VT_IS_STABLE, 0) != 0;
+  }
+  int32_t comparator_subgraph_index() const {
+    return GetField<int32_t>(VT_COMPARATOR_SUBGRAPH_INDEX, 0);
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int64_t>(verifier, VT_DIMENSION, 8) &&
+           VerifyField<uint8_t>(verifier, VT_IS_STABLE, 1) &&
+           VerifyField<int32_t>(verifier, VT_COMPARATOR_SUBGRAPH_INDEX, 4) &&
+           verifier.EndTable();
+  }
+  StablehloSortOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(StablehloSortOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<StablehloSortOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSortOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StablehloSortOptionsBuilder {
+  typedef StablehloSortOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_dimension(int64_t dimension) {
+    fbb_.AddElement<int64_t>(StablehloSortOptions::VT_DIMENSION, dimension, 0);
+  }
+  void add_is_stable(bool is_stable) {
+    fbb_.AddElement<uint8_t>(StablehloSortOptions::VT_IS_STABLE, static_cast<uint8_t>(is_stable), 0);
+  }
+  void add_comparator_subgraph_index(int32_t comparator_subgraph_index) {
+    fbb_.AddElement<int32_t>(StablehloSortOptions::VT_COMPARATOR_SUBGRAPH_INDEX, comparator_subgraph_index, 0);
+  }
+  explicit StablehloSortOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<StablehloSortOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<StablehloSortOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<StablehloSortOptions> CreateStablehloSortOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    int64_t dimension = 0,
+    bool is_stable = false,
+    int32_t comparator_subgraph_index = 0) {
+  StablehloSortOptionsBuilder builder_(_fbb);
+  builder_.add_dimension(dimension);
+  builder_.add_comparator_subgraph_index(comparator_subgraph_index);
+  builder_.add_is_stable(is_stable);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<StablehloSortOptions> CreateStablehloSortOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSortOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct StablehloConcatenateOptionsT : public ::flatbuffers::NativeTable {
+  typedef StablehloConcatenateOptions TableType;
+  int64_t dimension = 0;
+};
+
+struct StablehloConcatenateOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef StablehloConcatenateOptionsT NativeTableType;
+  typedef StablehloConcatenateOptionsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_DIMENSION = 4
+  };
+  int64_t dimension() const {
+    return GetField<int64_t>(VT_DIMENSION, 0);
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int64_t>(verifier, VT_DIMENSION, 8) &&
+           verifier.EndTable();
+  }
+  StablehloConcatenateOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(StablehloConcatenateOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<StablehloConcatenateOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConcatenateOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StablehloConcatenateOptionsBuilder {
+  typedef 
StablehloConcatenateOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_dimension(int64_t dimension) { + fbb_.AddElement(StablehloConcatenateOptions::VT_DIMENSION, dimension, 0); + } + explicit StablehloConcatenateOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloConcatenateOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int64_t dimension = 0) { + StablehloConcatenateOptionsBuilder builder_(_fbb); + builder_.add_dimension(dimension); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloConcatenateOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConcatenateOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloBroadcastInDimOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloBroadcastInDimOptions TableType; + std::vector broadcast_dimensions{}; +}; + +struct StablehloBroadcastInDimOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloBroadcastInDimOptionsT NativeTableType; + typedef StablehloBroadcastInDimOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BROADCAST_DIMENSIONS = 4 + }; + const ::flatbuffers::Vector *broadcast_dimensions() const { + return GetPointer *>(VT_BROADCAST_DIMENSIONS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_BROADCAST_DIMENSIONS) && + verifier.VerifyVector(broadcast_dimensions()) && + verifier.EndTable(); + } + StablehloBroadcastInDimOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloBroadcastInDimOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloBroadcastInDimOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloBroadcastInDimOptionsBuilder { + typedef StablehloBroadcastInDimOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_broadcast_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> broadcast_dimensions) { + fbb_.AddOffset(StablehloBroadcastInDimOptions::VT_BROADCAST_DIMENSIONS, broadcast_dimensions); + } + explicit StablehloBroadcastInDimOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloBroadcastInDimOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> broadcast_dimensions = 0) { + StablehloBroadcastInDimOptionsBuilder builder_(_fbb); + builder_.add_broadcast_dimensions(broadcast_dimensions); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloBroadcastInDimOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *broadcast_dimensions = nullptr) { + auto broadcast_dimensions__ = broadcast_dimensions ? 
_fbb.CreateVector(*broadcast_dimensions) : 0; + return tflite::CreateStablehloBroadcastInDimOptions( + _fbb, + broadcast_dimensions__); +} + +::flatbuffers::Offset CreateStablehloBroadcastInDimOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloBroadcastInDimOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloCompareOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloCompareOptions TableType; + tflite::StablehloComparisonDirection comparison_direction = tflite::StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ; + tflite::StablehloComparisonType compare_type = tflite::StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE; +}; + +struct StablehloCompareOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloCompareOptionsT NativeTableType; + typedef StablehloCompareOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COMPARISON_DIRECTION = 4, + VT_COMPARE_TYPE = 6 + }; + tflite::StablehloComparisonDirection comparison_direction() const { + return static_cast(GetField(VT_COMPARISON_DIRECTION, 0)); + } + tflite::StablehloComparisonType compare_type() const { + return static_cast(GetField(VT_COMPARE_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COMPARISON_DIRECTION, 4) && + VerifyField(verifier, VT_COMPARE_TYPE, 4) && + verifier.EndTable(); + } + StablehloCompareOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloCompareOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCompareOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloCompareOptionsBuilder { + typedef StablehloCompareOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_comparison_direction(tflite::StablehloComparisonDirection comparison_direction) { + fbb_.AddElement(StablehloCompareOptions::VT_COMPARISON_DIRECTION, static_cast(comparison_direction), 0); + } + void add_compare_type(tflite::StablehloComparisonType compare_type) { + fbb_.AddElement(StablehloCompareOptions::VT_COMPARE_TYPE, static_cast(compare_type), 0); + } + explicit StablehloCompareOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloCompareOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::StablehloComparisonDirection comparison_direction = tflite::StablehloComparisonDirection_STABLEHLO_COMPARISON_DIRECTION_EQ, + tflite::StablehloComparisonType compare_type = tflite::StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE) { + StablehloCompareOptionsBuilder builder_(_fbb); + builder_.add_compare_type(compare_type); + builder_.add_comparison_direction(comparison_direction); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloCompareOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCompareOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloDynamicSliceOptionsT : public ::flatbuffers::NativeTable { + typedef 
StablehloDynamicSliceOptions TableType;
+  std::vector<int64_t> slice_sizes{};
+};
+
+struct StablehloDynamicSliceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef StablehloDynamicSliceOptionsT NativeTableType;
+  typedef StablehloDynamicSliceOptionsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_SLICE_SIZES = 4
+  };
+  const ::flatbuffers::Vector<int64_t> *slice_sizes() const {
+    return GetPointer<const ::flatbuffers::Vector<int64_t> *>(VT_SLICE_SIZES);
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_SLICE_SIZES) &&
+           verifier.VerifyVector(slice_sizes()) &&
+           verifier.EndTable();
+  }
+  StablehloDynamicSliceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(StablehloDynamicSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<StablehloDynamicSliceOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDynamicSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StablehloDynamicSliceOptionsBuilder {
+  typedef StablehloDynamicSliceOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_slice_sizes(::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> slice_sizes) {
+    fbb_.AddOffset(StablehloDynamicSliceOptions::VT_SLICE_SIZES, slice_sizes);
+  }
+  explicit StablehloDynamicSliceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<StablehloDynamicSliceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<StablehloDynamicSliceOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<StablehloDynamicSliceOptions> CreateStablehloDynamicSliceOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    ::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> slice_sizes = 0) {
+  StablehloDynamicSliceOptionsBuilder builder_(_fbb);
+  builder_.add_slice_sizes(slice_sizes);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<StablehloDynamicSliceOptions> CreateStablehloDynamicSliceOptionsDirect(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int64_t> *slice_sizes = nullptr) {
+  auto slice_sizes__ = slice_sizes ? 
_fbb.CreateVector(*slice_sizes) : 0; + return tflite::CreateStablehloDynamicSliceOptions( + _fbb, + slice_sizes__); +} + +::flatbuffers::Offset CreateStablehloDynamicSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDynamicSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloPadOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloPadOptions TableType; + std::vector edge_padding_low{}; + std::vector edge_padding_high{}; + std::vector interior_padding{}; +}; + +struct StablehloPadOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloPadOptionsT NativeTableType; + typedef StablehloPadOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_EDGE_PADDING_LOW = 4, + VT_EDGE_PADDING_HIGH = 6, + VT_INTERIOR_PADDING = 8 + }; + const ::flatbuffers::Vector *edge_padding_low() const { + return GetPointer *>(VT_EDGE_PADDING_LOW); + } + const ::flatbuffers::Vector *edge_padding_high() const { + return GetPointer *>(VT_EDGE_PADDING_HIGH); + } + const ::flatbuffers::Vector *interior_padding() const { + return GetPointer *>(VT_INTERIOR_PADDING); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_EDGE_PADDING_LOW) && + verifier.VerifyVector(edge_padding_low()) && + VerifyOffset(verifier, VT_EDGE_PADDING_HIGH) && + verifier.VerifyVector(edge_padding_high()) && + VerifyOffset(verifier, VT_INTERIOR_PADDING) && + verifier.VerifyVector(interior_padding()) && + verifier.EndTable(); + } + StablehloPadOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloPadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloPadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloPadOptionsBuilder { + typedef StablehloPadOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_edge_padding_low(::flatbuffers::Offset<::flatbuffers::Vector> edge_padding_low) { + fbb_.AddOffset(StablehloPadOptions::VT_EDGE_PADDING_LOW, edge_padding_low); + } + void add_edge_padding_high(::flatbuffers::Offset<::flatbuffers::Vector> edge_padding_high) { + fbb_.AddOffset(StablehloPadOptions::VT_EDGE_PADDING_HIGH, edge_padding_high); + } + void add_interior_padding(::flatbuffers::Offset<::flatbuffers::Vector> interior_padding) { + fbb_.AddOffset(StablehloPadOptions::VT_INTERIOR_PADDING, interior_padding); + } + explicit StablehloPadOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloPadOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> edge_padding_low = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> edge_padding_high = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> interior_padding = 0) { + StablehloPadOptionsBuilder builder_(_fbb); + builder_.add_interior_padding(interior_padding); + builder_.add_edge_padding_high(edge_padding_high); + builder_.add_edge_padding_low(edge_padding_low); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloPadOptionsDirect( 
+ ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *edge_padding_low = nullptr, + const std::vector *edge_padding_high = nullptr, + const std::vector *interior_padding = nullptr) { + auto edge_padding_low__ = edge_padding_low ? _fbb.CreateVector(*edge_padding_low) : 0; + auto edge_padding_high__ = edge_padding_high ? _fbb.CreateVector(*edge_padding_high) : 0; + auto interior_padding__ = interior_padding ? _fbb.CreateVector(*interior_padding) : 0; + return tflite::CreateStablehloPadOptions( + _fbb, + edge_padding_low__, + edge_padding_high__, + interior_padding__); +} + +::flatbuffers::Offset CreateStablehloPadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloPadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloIotaOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloIotaOptions TableType; + int64_t iota_dimension = 0; +}; + +struct StablehloIotaOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloIotaOptionsT NativeTableType; + typedef StablehloIotaOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_IOTA_DIMENSION = 4 + }; + int64_t iota_dimension() const { + return GetField(VT_IOTA_DIMENSION, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_IOTA_DIMENSION, 8) && + verifier.EndTable(); + } + StablehloIotaOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloIotaOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloIotaOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloIotaOptionsBuilder { + typedef StablehloIotaOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_iota_dimension(int64_t iota_dimension) { + fbb_.AddElement(StablehloIotaOptions::VT_IOTA_DIMENSION, iota_dimension, 0); + } + explicit StablehloIotaOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloIotaOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int64_t iota_dimension = 0) { + StablehloIotaOptionsBuilder builder_(_fbb); + builder_.add_iota_dimension(iota_dimension); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloIotaOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloIotaOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloCustomCallOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloCustomCallOptions TableType; + std::string call_target_name{}; + bool has_side_effect = false; + std::string backend_config{}; + int32_t api_version = 0; + std::vector called_computations{}; + std::vector custom_attributes{}; +}; + +struct StablehloCustomCallOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloCustomCallOptionsT NativeTableType; + typedef StablehloCustomCallOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_CALL_TARGET_NAME = 4, + VT_HAS_SIDE_EFFECT = 6, + VT_BACKEND_CONFIG = 8, + VT_API_VERSION = 10, + 
VT_CALLED_COMPUTATIONS = 12, + VT_CUSTOM_ATTRIBUTES = 14 + }; + const ::flatbuffers::String *call_target_name() const { + return GetPointer(VT_CALL_TARGET_NAME); + } + bool has_side_effect() const { + return GetField(VT_HAS_SIDE_EFFECT, 0) != 0; + } + const ::flatbuffers::String *backend_config() const { + return GetPointer(VT_BACKEND_CONFIG); + } + int32_t api_version() const { + return GetField(VT_API_VERSION, 0); + } + const ::flatbuffers::Vector *called_computations() const { + return GetPointer *>(VT_CALLED_COMPUTATIONS); + } + const ::flatbuffers::Vector *custom_attributes() const { + return GetPointer *>(VT_CUSTOM_ATTRIBUTES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_CALL_TARGET_NAME) && + verifier.VerifyString(call_target_name()) && + VerifyField(verifier, VT_HAS_SIDE_EFFECT, 1) && + VerifyOffset(verifier, VT_BACKEND_CONFIG) && + verifier.VerifyString(backend_config()) && + VerifyField(verifier, VT_API_VERSION, 4) && + VerifyOffset(verifier, VT_CALLED_COMPUTATIONS) && + verifier.VerifyVector(called_computations()) && + VerifyOffset(verifier, VT_CUSTOM_ATTRIBUTES) && + verifier.VerifyVector(custom_attributes()) && + verifier.EndTable(); + } + StablehloCustomCallOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloCustomCallOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCustomCallOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloCustomCallOptionsBuilder { + typedef StablehloCustomCallOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_call_target_name(::flatbuffers::Offset<::flatbuffers::String> call_target_name) { + fbb_.AddOffset(StablehloCustomCallOptions::VT_CALL_TARGET_NAME, call_target_name); + } + void add_has_side_effect(bool has_side_effect) { + fbb_.AddElement(StablehloCustomCallOptions::VT_HAS_SIDE_EFFECT, static_cast(has_side_effect), 0); + } + void add_backend_config(::flatbuffers::Offset<::flatbuffers::String> backend_config) { + fbb_.AddOffset(StablehloCustomCallOptions::VT_BACKEND_CONFIG, backend_config); + } + void add_api_version(int32_t api_version) { + fbb_.AddElement(StablehloCustomCallOptions::VT_API_VERSION, api_version, 0); + } + void add_called_computations(::flatbuffers::Offset<::flatbuffers::Vector> called_computations) { + fbb_.AddOffset(StablehloCustomCallOptions::VT_CALLED_COMPUTATIONS, called_computations); + } + void add_custom_attributes(::flatbuffers::Offset<::flatbuffers::Vector> custom_attributes) { + fbb_.AddOffset(StablehloCustomCallOptions::VT_CUSTOM_ATTRIBUTES, custom_attributes); + } + explicit StablehloCustomCallOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloCustomCallOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> call_target_name = 0, + bool has_side_effect = false, + ::flatbuffers::Offset<::flatbuffers::String> backend_config = 0, + int32_t api_version = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> called_computations = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> 
custom_attributes = 0) { + StablehloCustomCallOptionsBuilder builder_(_fbb); + builder_.add_custom_attributes(custom_attributes); + builder_.add_called_computations(called_computations); + builder_.add_api_version(api_version); + builder_.add_backend_config(backend_config); + builder_.add_call_target_name(call_target_name); + builder_.add_has_side_effect(has_side_effect); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloCustomCallOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *call_target_name = nullptr, + bool has_side_effect = false, + const char *backend_config = nullptr, + int32_t api_version = 0, + const std::vector *called_computations = nullptr, + const std::vector *custom_attributes = nullptr) { + auto call_target_name__ = call_target_name ? _fbb.CreateString(call_target_name) : 0; + auto backend_config__ = backend_config ? _fbb.CreateString(backend_config) : 0; + auto called_computations__ = called_computations ? _fbb.CreateVector(*called_computations) : 0; + auto custom_attributes__ = custom_attributes ? _fbb.CreateVector(*custom_attributes) : 0; + return tflite::CreateStablehloCustomCallOptions( + _fbb, + call_target_name__, + has_side_effect, + backend_config__, + api_version, + called_computations__, + custom_attributes__); +} + +::flatbuffers::Offset CreateStablehloCustomCallOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCustomCallOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloReduceOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloReduceOptions TableType; + std::vector dimensions{}; + int32_t body_subgraph_index = 0; +}; + +struct StablehloReduceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloReduceOptionsT NativeTableType; + typedef StablehloReduceOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DIMENSIONS = 4, + VT_BODY_SUBGRAPH_INDEX = 6 + }; + const ::flatbuffers::Vector *dimensions() const { + return GetPointer *>(VT_DIMENSIONS); + } + int32_t body_subgraph_index() const { + return GetField(VT_BODY_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DIMENSIONS) && + verifier.VerifyVector(dimensions()) && + VerifyField(verifier, VT_BODY_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + StablehloReduceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloReduceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloReduceOptionsBuilder { + typedef StablehloReduceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> dimensions) { + fbb_.AddOffset(StablehloReduceOptions::VT_DIMENSIONS, dimensions); + } + void add_body_subgraph_index(int32_t body_subgraph_index) { + fbb_.AddElement(StablehloReduceOptions::VT_BODY_SUBGRAPH_INDEX, body_subgraph_index, 0); + } + explicit StablehloReduceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = 
::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloReduceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> dimensions = 0, + int32_t body_subgraph_index = 0) { + StablehloReduceOptionsBuilder builder_(_fbb); + builder_.add_body_subgraph_index(body_subgraph_index); + builder_.add_dimensions(dimensions); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloReduceOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *dimensions = nullptr, + int32_t body_subgraph_index = 0) { + auto dimensions__ = dimensions ? _fbb.CreateVector(*dimensions) : 0; + return tflite::CreateStablehloReduceOptions( + _fbb, + dimensions__, + body_subgraph_index); +} + +::flatbuffers::Offset CreateStablehloReduceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloSliceOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloSliceOptions TableType; + std::vector start_indices{}; + std::vector limit_indices{}; + std::vector strides{}; +}; + +struct StablehloSliceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloSliceOptionsT NativeTableType; + typedef StablehloSliceOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_START_INDICES = 4, + VT_LIMIT_INDICES = 6, + VT_STRIDES = 8 + }; + const ::flatbuffers::Vector *start_indices() const { + return GetPointer *>(VT_START_INDICES); + } + const ::flatbuffers::Vector *limit_indices() const { + return GetPointer *>(VT_LIMIT_INDICES); + } + const ::flatbuffers::Vector *strides() const { + return GetPointer *>(VT_STRIDES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_START_INDICES) && + verifier.VerifyVector(start_indices()) && + VerifyOffset(verifier, VT_LIMIT_INDICES) && + verifier.VerifyVector(limit_indices()) && + VerifyOffset(verifier, VT_STRIDES) && + verifier.VerifyVector(strides()) && + verifier.EndTable(); + } + StablehloSliceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloSliceOptionsBuilder { + typedef StablehloSliceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_start_indices(::flatbuffers::Offset<::flatbuffers::Vector> start_indices) { + fbb_.AddOffset(StablehloSliceOptions::VT_START_INDICES, start_indices); + } + void add_limit_indices(::flatbuffers::Offset<::flatbuffers::Vector> limit_indices) { + fbb_.AddOffset(StablehloSliceOptions::VT_LIMIT_INDICES, limit_indices); + } + void add_strides(::flatbuffers::Offset<::flatbuffers::Vector> strides) { + fbb_.AddOffset(StablehloSliceOptions::VT_STRIDES, strides); + } + explicit StablehloSliceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloSliceOptions( + 
::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> start_indices = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> limit_indices = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> strides = 0) { + StablehloSliceOptionsBuilder builder_(_fbb); + builder_.add_strides(strides); + builder_.add_limit_indices(limit_indices); + builder_.add_start_indices(start_indices); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloSliceOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *start_indices = nullptr, + const std::vector *limit_indices = nullptr, + const std::vector *strides = nullptr) { + auto start_indices__ = start_indices ? _fbb.CreateVector(*start_indices) : 0; + auto limit_indices__ = limit_indices ? _fbb.CreateVector(*limit_indices) : 0; + auto strides__ = strides ? _fbb.CreateVector(*strides) : 0; + return tflite::CreateStablehloSliceOptions( + _fbb, + start_indices__, + limit_indices__, + strides__); +} + +::flatbuffers::Offset CreateStablehloSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloConvolutionOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloConvolutionOptions TableType; + std::vector window_strides{}; + std::vector padding{}; + std::vector lhs_dilation{}; + std::vector rhs_dilation{}; + std::vector window_reversal{}; + int64_t input_batch_dimension = 0; + int64_t input_feature_dimension = 0; + std::vector input_spatial_dimensions{}; + int64_t kernel_input_feature_dimension = 0; + int64_t kernel_output_feature_dimension = 0; + std::vector kernel_spatial_dimensions{}; + int64_t output_batch_dimension = 0; + int64_t output_feature_dimension = 0; + std::vector output_spatial_dimensions{}; + int64_t feature_group_count = 0; + int64_t batch_group_count = 0; + std::vector precision_config{}; +}; + +struct StablehloConvolutionOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloConvolutionOptionsT NativeTableType; + typedef StablehloConvolutionOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_WINDOW_STRIDES = 4, + VT_PADDING = 6, + VT_LHS_DILATION = 8, + VT_RHS_DILATION = 10, + VT_WINDOW_REVERSAL = 12, + VT_INPUT_BATCH_DIMENSION = 14, + VT_INPUT_FEATURE_DIMENSION = 16, + VT_INPUT_SPATIAL_DIMENSIONS = 18, + VT_KERNEL_INPUT_FEATURE_DIMENSION = 20, + VT_KERNEL_OUTPUT_FEATURE_DIMENSION = 22, + VT_KERNEL_SPATIAL_DIMENSIONS = 24, + VT_OUTPUT_BATCH_DIMENSION = 26, + VT_OUTPUT_FEATURE_DIMENSION = 28, + VT_OUTPUT_SPATIAL_DIMENSIONS = 30, + VT_FEATURE_GROUP_COUNT = 32, + VT_BATCH_GROUP_COUNT = 34, + VT_PRECISION_CONFIG = 36 + }; + const ::flatbuffers::Vector *window_strides() const { + return GetPointer *>(VT_WINDOW_STRIDES); + } + const ::flatbuffers::Vector *padding() const { + return GetPointer *>(VT_PADDING); + } + const ::flatbuffers::Vector *lhs_dilation() const { + return GetPointer *>(VT_LHS_DILATION); + } + const ::flatbuffers::Vector *rhs_dilation() const { + return GetPointer *>(VT_RHS_DILATION); + } + const ::flatbuffers::Vector *window_reversal() const { + return GetPointer *>(VT_WINDOW_REVERSAL); + } + int64_t input_batch_dimension() const { + return GetField(VT_INPUT_BATCH_DIMENSION, 0); + } + int64_t input_feature_dimension() const { + return GetField(VT_INPUT_FEATURE_DIMENSION, 0); + } + const ::flatbuffers::Vector *input_spatial_dimensions() const { + return GetPointer 
*>(VT_INPUT_SPATIAL_DIMENSIONS); + } + int64_t kernel_input_feature_dimension() const { + return GetField(VT_KERNEL_INPUT_FEATURE_DIMENSION, 0); + } + int64_t kernel_output_feature_dimension() const { + return GetField(VT_KERNEL_OUTPUT_FEATURE_DIMENSION, 0); + } + const ::flatbuffers::Vector *kernel_spatial_dimensions() const { + return GetPointer *>(VT_KERNEL_SPATIAL_DIMENSIONS); + } + int64_t output_batch_dimension() const { + return GetField(VT_OUTPUT_BATCH_DIMENSION, 0); + } + int64_t output_feature_dimension() const { + return GetField(VT_OUTPUT_FEATURE_DIMENSION, 0); + } + const ::flatbuffers::Vector *output_spatial_dimensions() const { + return GetPointer *>(VT_OUTPUT_SPATIAL_DIMENSIONS); + } + int64_t feature_group_count() const { + return GetField(VT_FEATURE_GROUP_COUNT, 0); + } + int64_t batch_group_count() const { + return GetField(VT_BATCH_GROUP_COUNT, 0); + } + const ::flatbuffers::Vector *precision_config() const { + return GetPointer *>(VT_PRECISION_CONFIG); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_WINDOW_STRIDES) && + verifier.VerifyVector(window_strides()) && + VerifyOffset(verifier, VT_PADDING) && + verifier.VerifyVector(padding()) && + VerifyOffset(verifier, VT_LHS_DILATION) && + verifier.VerifyVector(lhs_dilation()) && + VerifyOffset(verifier, VT_RHS_DILATION) && + verifier.VerifyVector(rhs_dilation()) && + VerifyOffset(verifier, VT_WINDOW_REVERSAL) && + verifier.VerifyVector(window_reversal()) && + VerifyField(verifier, VT_INPUT_BATCH_DIMENSION, 8) && + VerifyField(verifier, VT_INPUT_FEATURE_DIMENSION, 8) && + VerifyOffset(verifier, VT_INPUT_SPATIAL_DIMENSIONS) && + verifier.VerifyVector(input_spatial_dimensions()) && + VerifyField(verifier, VT_KERNEL_INPUT_FEATURE_DIMENSION, 8) && + VerifyField(verifier, VT_KERNEL_OUTPUT_FEATURE_DIMENSION, 8) && + VerifyOffset(verifier, VT_KERNEL_SPATIAL_DIMENSIONS) && + verifier.VerifyVector(kernel_spatial_dimensions()) && + VerifyField(verifier, VT_OUTPUT_BATCH_DIMENSION, 8) && + VerifyField(verifier, VT_OUTPUT_FEATURE_DIMENSION, 8) && + VerifyOffset(verifier, VT_OUTPUT_SPATIAL_DIMENSIONS) && + verifier.VerifyVector(output_spatial_dimensions()) && + VerifyField(verifier, VT_FEATURE_GROUP_COUNT, 8) && + VerifyField(verifier, VT_BATCH_GROUP_COUNT, 8) && + VerifyOffset(verifier, VT_PRECISION_CONFIG) && + verifier.VerifyVector(precision_config()) && + verifier.EndTable(); + } + StablehloConvolutionOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloConvolutionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConvolutionOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloConvolutionOptionsBuilder { + typedef StablehloConvolutionOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_window_strides(::flatbuffers::Offset<::flatbuffers::Vector> window_strides) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_WINDOW_STRIDES, window_strides); + } + void add_padding(::flatbuffers::Offset<::flatbuffers::Vector> padding) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_PADDING, padding); + } + void add_lhs_dilation(::flatbuffers::Offset<::flatbuffers::Vector> lhs_dilation) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_LHS_DILATION, lhs_dilation); + } + void 
add_rhs_dilation(::flatbuffers::Offset<::flatbuffers::Vector> rhs_dilation) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_RHS_DILATION, rhs_dilation); + } + void add_window_reversal(::flatbuffers::Offset<::flatbuffers::Vector> window_reversal) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_WINDOW_REVERSAL, window_reversal); + } + void add_input_batch_dimension(int64_t input_batch_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_INPUT_BATCH_DIMENSION, input_batch_dimension, 0); + } + void add_input_feature_dimension(int64_t input_feature_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_INPUT_FEATURE_DIMENSION, input_feature_dimension, 0); + } + void add_input_spatial_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> input_spatial_dimensions) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_INPUT_SPATIAL_DIMENSIONS, input_spatial_dimensions); + } + void add_kernel_input_feature_dimension(int64_t kernel_input_feature_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_KERNEL_INPUT_FEATURE_DIMENSION, kernel_input_feature_dimension, 0); + } + void add_kernel_output_feature_dimension(int64_t kernel_output_feature_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_KERNEL_OUTPUT_FEATURE_DIMENSION, kernel_output_feature_dimension, 0); + } + void add_kernel_spatial_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> kernel_spatial_dimensions) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_KERNEL_SPATIAL_DIMENSIONS, kernel_spatial_dimensions); + } + void add_output_batch_dimension(int64_t output_batch_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_OUTPUT_BATCH_DIMENSION, output_batch_dimension, 0); + } + void add_output_feature_dimension(int64_t output_feature_dimension) { + fbb_.AddElement(StablehloConvolutionOptions::VT_OUTPUT_FEATURE_DIMENSION, output_feature_dimension, 0); + } + void add_output_spatial_dimensions(::flatbuffers::Offset<::flatbuffers::Vector> output_spatial_dimensions) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_OUTPUT_SPATIAL_DIMENSIONS, output_spatial_dimensions); + } + void add_feature_group_count(int64_t feature_group_count) { + fbb_.AddElement(StablehloConvolutionOptions::VT_FEATURE_GROUP_COUNT, feature_group_count, 0); + } + void add_batch_group_count(int64_t batch_group_count) { + fbb_.AddElement(StablehloConvolutionOptions::VT_BATCH_GROUP_COUNT, batch_group_count, 0); + } + void add_precision_config(::flatbuffers::Offset<::flatbuffers::Vector> precision_config) { + fbb_.AddOffset(StablehloConvolutionOptions::VT_PRECISION_CONFIG, precision_config); + } + explicit StablehloConvolutionOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloConvolutionOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> window_strides = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> padding = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> lhs_dilation = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> rhs_dilation = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> window_reversal = 0, + int64_t input_batch_dimension = 0, + int64_t input_feature_dimension = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> input_spatial_dimensions = 0, + int64_t kernel_input_feature_dimension = 0, + int64_t 
kernel_output_feature_dimension = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> kernel_spatial_dimensions = 0, + int64_t output_batch_dimension = 0, + int64_t output_feature_dimension = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> output_spatial_dimensions = 0, + int64_t feature_group_count = 0, + int64_t batch_group_count = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> precision_config = 0) { + StablehloConvolutionOptionsBuilder builder_(_fbb); + builder_.add_batch_group_count(batch_group_count); + builder_.add_feature_group_count(feature_group_count); + builder_.add_output_feature_dimension(output_feature_dimension); + builder_.add_output_batch_dimension(output_batch_dimension); + builder_.add_kernel_output_feature_dimension(kernel_output_feature_dimension); + builder_.add_kernel_input_feature_dimension(kernel_input_feature_dimension); + builder_.add_input_feature_dimension(input_feature_dimension); + builder_.add_input_batch_dimension(input_batch_dimension); + builder_.add_precision_config(precision_config); + builder_.add_output_spatial_dimensions(output_spatial_dimensions); + builder_.add_kernel_spatial_dimensions(kernel_spatial_dimensions); + builder_.add_input_spatial_dimensions(input_spatial_dimensions); + builder_.add_window_reversal(window_reversal); + builder_.add_rhs_dilation(rhs_dilation); + builder_.add_lhs_dilation(lhs_dilation); + builder_.add_padding(padding); + builder_.add_window_strides(window_strides); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloConvolutionOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *window_strides = nullptr, + const std::vector *padding = nullptr, + const std::vector *lhs_dilation = nullptr, + const std::vector *rhs_dilation = nullptr, + const std::vector *window_reversal = nullptr, + int64_t input_batch_dimension = 0, + int64_t input_feature_dimension = 0, + const std::vector *input_spatial_dimensions = nullptr, + int64_t kernel_input_feature_dimension = 0, + int64_t kernel_output_feature_dimension = 0, + const std::vector *kernel_spatial_dimensions = nullptr, + int64_t output_batch_dimension = 0, + int64_t output_feature_dimension = 0, + const std::vector *output_spatial_dimensions = nullptr, + int64_t feature_group_count = 0, + int64_t batch_group_count = 0, + const std::vector *precision_config = nullptr) { + auto window_strides__ = window_strides ? _fbb.CreateVector(*window_strides) : 0; + auto padding__ = padding ? _fbb.CreateVector(*padding) : 0; + auto lhs_dilation__ = lhs_dilation ? _fbb.CreateVector(*lhs_dilation) : 0; + auto rhs_dilation__ = rhs_dilation ? _fbb.CreateVector(*rhs_dilation) : 0; + auto window_reversal__ = window_reversal ? _fbb.CreateVector(*window_reversal) : 0; + auto input_spatial_dimensions__ = input_spatial_dimensions ? _fbb.CreateVector(*input_spatial_dimensions) : 0; + auto kernel_spatial_dimensions__ = kernel_spatial_dimensions ? _fbb.CreateVector(*kernel_spatial_dimensions) : 0; + auto output_spatial_dimensions__ = output_spatial_dimensions ? _fbb.CreateVector(*output_spatial_dimensions) : 0; + auto precision_config__ = precision_config ? 
_fbb.CreateVector(*precision_config) : 0; + return tflite::CreateStablehloConvolutionOptions( + _fbb, + window_strides__, + padding__, + lhs_dilation__, + rhs_dilation__, + window_reversal__, + input_batch_dimension, + input_feature_dimension, + input_spatial_dimensions__, + kernel_input_feature_dimension, + kernel_output_feature_dimension, + kernel_spatial_dimensions__, + output_batch_dimension, + output_feature_dimension, + output_spatial_dimensions__, + feature_group_count, + batch_group_count, + precision_config__); +} + +::flatbuffers::Offset CreateStablehloConvolutionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConvolutionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloScatterOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloScatterOptions TableType; + bool indices_are_sorted = false; + std::vector update_window_dims{}; + std::vector inserted_window_dims{}; + std::vector scatter_dims_to_operand_dims{}; + int64_t index_vector_dim = 0; + bool unique_indices = false; + int32_t update_computation_subgraph_index = 0; +}; + +struct StablehloScatterOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloScatterOptionsT NativeTableType; + typedef StablehloScatterOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_INDICES_ARE_SORTED = 4, + VT_UPDATE_WINDOW_DIMS = 6, + VT_INSERTED_WINDOW_DIMS = 8, + VT_SCATTER_DIMS_TO_OPERAND_DIMS = 10, + VT_INDEX_VECTOR_DIM = 12, + VT_UNIQUE_INDICES = 14, + VT_UPDATE_COMPUTATION_SUBGRAPH_INDEX = 16 + }; + bool indices_are_sorted() const { + return GetField(VT_INDICES_ARE_SORTED, 0) != 0; + } + const ::flatbuffers::Vector *update_window_dims() const { + return GetPointer *>(VT_UPDATE_WINDOW_DIMS); + } + const ::flatbuffers::Vector *inserted_window_dims() const { + return GetPointer *>(VT_INSERTED_WINDOW_DIMS); + } + const ::flatbuffers::Vector *scatter_dims_to_operand_dims() const { + return GetPointer *>(VT_SCATTER_DIMS_TO_OPERAND_DIMS); + } + int64_t index_vector_dim() const { + return GetField(VT_INDEX_VECTOR_DIM, 0); + } + bool unique_indices() const { + return GetField(VT_UNIQUE_INDICES, 0) != 0; + } + int32_t update_computation_subgraph_index() const { + return GetField(VT_UPDATE_COMPUTATION_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_INDICES_ARE_SORTED, 1) && + VerifyOffset(verifier, VT_UPDATE_WINDOW_DIMS) && + verifier.VerifyVector(update_window_dims()) && + VerifyOffset(verifier, VT_INSERTED_WINDOW_DIMS) && + verifier.VerifyVector(inserted_window_dims()) && + VerifyOffset(verifier, VT_SCATTER_DIMS_TO_OPERAND_DIMS) && + verifier.VerifyVector(scatter_dims_to_operand_dims()) && + VerifyField(verifier, VT_INDEX_VECTOR_DIM, 8) && + VerifyField(verifier, VT_UNIQUE_INDICES, 1) && + VerifyField(verifier, VT_UPDATE_COMPUTATION_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + StablehloScatterOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloScatterOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloScatterOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloScatterOptionsBuilder { + typedef StablehloScatterOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + 
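// A minimal usage sketch for the CreateStablehloConvolutionOptionsDirect helper
// above, packing dimension numbers for an NHWC input / HWIO kernel 2-D
// convolution. The int64_t element type of the dimension vectors is an
// assumption (the template arguments are not visible in this view of the
// generated code), as are the include path and the helper-function name.
#include <cstdint>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // generated header added by this change (path assumed)

inline flatbuffers::Offset<tflite::StablehloConvolutionOptions>
BuildExampleStablehloConvOptions(flatbuffers::FlatBufferBuilder &fbb) {
  std::vector<int64_t> window_strides{1, 1};
  std::vector<int64_t> padding{0, 0, 0, 0};   // [lo, hi] per spatial dim, flattened
  std::vector<int64_t> lhs_dilation{1, 1};
  std::vector<int64_t> rhs_dilation{1, 1};
  std::vector<int64_t> input_spatial{1, 2};   // H, W of an NHWC input
  std::vector<int64_t> kernel_spatial{0, 1};  // H, W of an HWIO kernel
  std::vector<int64_t> output_spatial{1, 2};
  return tflite::CreateStablehloConvolutionOptionsDirect(
      fbb,
      &window_strides, &padding, &lhs_dilation, &rhs_dilation,
      /*window_reversal=*/nullptr,
      /*input_batch_dimension=*/0, /*input_feature_dimension=*/3, &input_spatial,
      /*kernel_input_feature_dimension=*/2, /*kernel_output_feature_dimension=*/3,
      &kernel_spatial,
      /*output_batch_dimension=*/0, /*output_feature_dimension=*/3, &output_spatial,
      /*feature_group_count=*/1, /*batch_group_count=*/1,
      /*precision_config=*/nullptr);
}
// The *Direct form copies each std::vector into the buffer via CreateVector and
// then forwards to the offset-based CreateStablehloConvolutionOptions shown above.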
::flatbuffers::uoffset_t start_; + void add_indices_are_sorted(bool indices_are_sorted) { + fbb_.AddElement(StablehloScatterOptions::VT_INDICES_ARE_SORTED, static_cast(indices_are_sorted), 0); + } + void add_update_window_dims(::flatbuffers::Offset<::flatbuffers::Vector> update_window_dims) { + fbb_.AddOffset(StablehloScatterOptions::VT_UPDATE_WINDOW_DIMS, update_window_dims); + } + void add_inserted_window_dims(::flatbuffers::Offset<::flatbuffers::Vector> inserted_window_dims) { + fbb_.AddOffset(StablehloScatterOptions::VT_INSERTED_WINDOW_DIMS, inserted_window_dims); + } + void add_scatter_dims_to_operand_dims(::flatbuffers::Offset<::flatbuffers::Vector> scatter_dims_to_operand_dims) { + fbb_.AddOffset(StablehloScatterOptions::VT_SCATTER_DIMS_TO_OPERAND_DIMS, scatter_dims_to_operand_dims); + } + void add_index_vector_dim(int64_t index_vector_dim) { + fbb_.AddElement(StablehloScatterOptions::VT_INDEX_VECTOR_DIM, index_vector_dim, 0); + } + void add_unique_indices(bool unique_indices) { + fbb_.AddElement(StablehloScatterOptions::VT_UNIQUE_INDICES, static_cast(unique_indices), 0); + } + void add_update_computation_subgraph_index(int32_t update_computation_subgraph_index) { + fbb_.AddElement(StablehloScatterOptions::VT_UPDATE_COMPUTATION_SUBGRAPH_INDEX, update_computation_subgraph_index, 0); + } + explicit StablehloScatterOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloScatterOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool indices_are_sorted = false, + ::flatbuffers::Offset<::flatbuffers::Vector> update_window_dims = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> inserted_window_dims = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> scatter_dims_to_operand_dims = 0, + int64_t index_vector_dim = 0, + bool unique_indices = false, + int32_t update_computation_subgraph_index = 0) { + StablehloScatterOptionsBuilder builder_(_fbb); + builder_.add_index_vector_dim(index_vector_dim); + builder_.add_update_computation_subgraph_index(update_computation_subgraph_index); + builder_.add_scatter_dims_to_operand_dims(scatter_dims_to_operand_dims); + builder_.add_inserted_window_dims(inserted_window_dims); + builder_.add_update_window_dims(update_window_dims); + builder_.add_unique_indices(unique_indices); + builder_.add_indices_are_sorted(indices_are_sorted); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloScatterOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool indices_are_sorted = false, + const std::vector *update_window_dims = nullptr, + const std::vector *inserted_window_dims = nullptr, + const std::vector *scatter_dims_to_operand_dims = nullptr, + int64_t index_vector_dim = 0, + bool unique_indices = false, + int32_t update_computation_subgraph_index = 0) { + auto update_window_dims__ = update_window_dims ? _fbb.CreateVector(*update_window_dims) : 0; + auto inserted_window_dims__ = inserted_window_dims ? _fbb.CreateVector(*inserted_window_dims) : 0; + auto scatter_dims_to_operand_dims__ = scatter_dims_to_operand_dims ? 
_fbb.CreateVector(*scatter_dims_to_operand_dims) : 0; + return tflite::CreateStablehloScatterOptions( + _fbb, + indices_are_sorted, + update_window_dims__, + inserted_window_dims__, + scatter_dims_to_operand_dims__, + index_vector_dim, + unique_indices, + update_computation_subgraph_index); +} + +::flatbuffers::Offset CreateStablehloScatterOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloScatterOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloCaseOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloCaseOptions TableType; + std::vector branch_subgraph_indices{}; +}; + +struct StablehloCaseOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloCaseOptionsT NativeTableType; + typedef StablehloCaseOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BRANCH_SUBGRAPH_INDICES = 4 + }; + const ::flatbuffers::Vector *branch_subgraph_indices() const { + return GetPointer *>(VT_BRANCH_SUBGRAPH_INDICES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_BRANCH_SUBGRAPH_INDICES) && + verifier.VerifyVector(branch_subgraph_indices()) && + verifier.EndTable(); + } + StablehloCaseOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloCaseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCaseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloCaseOptionsBuilder { + typedef StablehloCaseOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_branch_subgraph_indices(::flatbuffers::Offset<::flatbuffers::Vector> branch_subgraph_indices) { + fbb_.AddOffset(StablehloCaseOptions::VT_BRANCH_SUBGRAPH_INDICES, branch_subgraph_indices); + } + explicit StablehloCaseOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloCaseOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> branch_subgraph_indices = 0) { + StablehloCaseOptionsBuilder builder_(_fbb); + builder_.add_branch_subgraph_indices(branch_subgraph_indices); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStablehloCaseOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *branch_subgraph_indices = nullptr) { + auto branch_subgraph_indices__ = branch_subgraph_indices ? 
_fbb.CreateVector(*branch_subgraph_indices) : 0; + return tflite::CreateStablehloCaseOptions( + _fbb, + branch_subgraph_indices__); +} + +::flatbuffers::Offset CreateStablehloCaseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCaseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloRngBitGeneratorOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloRngBitGeneratorOptions TableType; + tflite::RngAlgorithm algorithm = tflite::RngAlgorithm_DEFAULT; +}; + +struct StablehloRngBitGeneratorOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloRngBitGeneratorOptionsT NativeTableType; + typedef StablehloRngBitGeneratorOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ALGORITHM = 4 + }; + tflite::RngAlgorithm algorithm() const { + return static_cast(GetField(VT_ALGORITHM, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ALGORITHM, 1) && + verifier.EndTable(); + } + StablehloRngBitGeneratorOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloRngBitGeneratorOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloRngBitGeneratorOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloRngBitGeneratorOptionsBuilder { + typedef StablehloRngBitGeneratorOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_algorithm(tflite::RngAlgorithm algorithm) { + fbb_.AddElement(StablehloRngBitGeneratorOptions::VT_ALGORITHM, static_cast(algorithm), 0); + } + explicit StablehloRngBitGeneratorOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloRngBitGeneratorOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::RngAlgorithm algorithm = tflite::RngAlgorithm_DEFAULT) { + StablehloRngBitGeneratorOptionsBuilder builder_(_fbb); + builder_.add_algorithm(algorithm); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloRngBitGeneratorOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloRngBitGeneratorOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Conv2DOptionsT : public ::flatbuffers::NativeTable { + typedef Conv2DOptions TableType; + tflite::Padding padding = tflite::Padding_SAME; + int32_t stride_w = 0; + int32_t stride_h = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + int32_t dilation_w_factor = 1; + int32_t dilation_h_factor = 1; + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32; +}; + +struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Conv2DOptionsT NativeTableType; + typedef Conv2DOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PADDING = 4, + VT_STRIDE_W = 6, + VT_STRIDE_H = 8, + VT_FUSED_ACTIVATION_FUNCTION = 10, + VT_DILATION_W_FACTOR = 12, + VT_DILATION_H_FACTOR = 14, + VT_QUANTIZED_BIAS_TYPE = 16 + }; + tflite::Padding padding() const { + return 
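// A short sketch using the StablehloCaseOptions and StablehloRngBitGeneratorOptions
// helpers defined just above. The int32_t element type for branch_subgraph_indices
// is assumed from the schema; the header include path is likewise assumed.
#include <cstdint>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline void BuildExampleStablehloControlFlowOptions(flatbuffers::FlatBufferBuilder &fbb) {
  // stablehlo.case: one subgraph index per branch.
  std::vector<int32_t> branch_subgraphs{1, 2, 3};
  auto case_opts = tflite::CreateStablehloCaseOptionsDirect(fbb, &branch_subgraphs);

  // stablehlo.rng_bit_generator: a single enum field. Passing the schema default
  // (RngAlgorithm_DEFAULT) stores nothing in the table, and the algorithm()
  // accessor still returns the default on read-back.
  auto rng_opts = tflite::CreateStablehloRngBitGeneratorOptions(
      fbb, tflite::RngAlgorithm_DEFAULT);
  (void)case_opts;
  (void)rng_opts;
}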
static_cast(GetField(VT_PADDING, 0)); + } + int32_t stride_w() const { + return GetField(VT_STRIDE_W, 0); + } + int32_t stride_h() const { + return GetField(VT_STRIDE_H, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + int32_t dilation_w_factor() const { + return GetField(VT_DILATION_W_FACTOR, 1); + } + int32_t dilation_h_factor() const { + return GetField(VT_DILATION_H_FACTOR, 1); + } + tflite::TensorType quantized_bias_type() const { + return static_cast(GetField(VT_QUANTIZED_BIAS_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PADDING, 1) && + VerifyField(verifier, VT_STRIDE_W, 4) && + VerifyField(verifier, VT_STRIDE_H, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_DILATION_W_FACTOR, 4) && + VerifyField(verifier, VT_DILATION_H_FACTOR, 4) && + VerifyField(verifier, VT_QUANTIZED_BIAS_TYPE, 1) && + verifier.EndTable(); + } + Conv2DOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Conv2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Conv2DOptionsBuilder { + typedef Conv2DOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_padding(tflite::Padding padding) { + fbb_.AddElement(Conv2DOptions::VT_PADDING, static_cast(padding), 0); + } + void add_stride_w(int32_t stride_w) { + fbb_.AddElement(Conv2DOptions::VT_STRIDE_W, stride_w, 0); + } + void add_stride_h(int32_t stride_h) { + fbb_.AddElement(Conv2DOptions::VT_STRIDE_H, stride_h, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_dilation_w_factor(int32_t dilation_w_factor) { + fbb_.AddElement(Conv2DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 1); + } + void add_dilation_h_factor(int32_t dilation_h_factor) { + fbb_.AddElement(Conv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 1); + } + void add_quantized_bias_type(tflite::TensorType quantized_bias_type) { + fbb_.AddElement(Conv2DOptions::VT_QUANTIZED_BIAS_TYPE, static_cast(quantized_bias_type), 0); + } + explicit Conv2DOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConv2DOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::Padding padding = tflite::Padding_SAME, + int32_t stride_w = 0, + int32_t stride_h = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + int32_t dilation_w_factor = 1, + int32_t dilation_h_factor = 1, + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32) { + Conv2DOptionsBuilder builder_(_fbb); + builder_.add_dilation_h_factor(dilation_h_factor); + builder_.add_dilation_w_factor(dilation_w_factor); + builder_.add_stride_h(stride_h); + builder_.add_stride_w(stride_w); + builder_.add_quantized_bias_type(quantized_bias_type); + 
builder_.add_fused_activation_function(fused_activation_function); + builder_.add_padding(padding); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateConv2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Conv3DOptionsT : public ::flatbuffers::NativeTable { + typedef Conv3DOptions TableType; + tflite::Padding padding = tflite::Padding_SAME; + int32_t stride_d = 0; + int32_t stride_w = 0; + int32_t stride_h = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + int32_t dilation_d_factor = 1; + int32_t dilation_w_factor = 1; + int32_t dilation_h_factor = 1; +}; + +struct Conv3DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Conv3DOptionsT NativeTableType; + typedef Conv3DOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PADDING = 4, + VT_STRIDE_D = 6, + VT_STRIDE_W = 8, + VT_STRIDE_H = 10, + VT_FUSED_ACTIVATION_FUNCTION = 12, + VT_DILATION_D_FACTOR = 14, + VT_DILATION_W_FACTOR = 16, + VT_DILATION_H_FACTOR = 18 + }; + tflite::Padding padding() const { + return static_cast(GetField(VT_PADDING, 0)); + } + int32_t stride_d() const { + return GetField(VT_STRIDE_D, 0); + } + int32_t stride_w() const { + return GetField(VT_STRIDE_W, 0); + } + int32_t stride_h() const { + return GetField(VT_STRIDE_H, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + int32_t dilation_d_factor() const { + return GetField(VT_DILATION_D_FACTOR, 1); + } + int32_t dilation_w_factor() const { + return GetField(VT_DILATION_W_FACTOR, 1); + } + int32_t dilation_h_factor() const { + return GetField(VT_DILATION_H_FACTOR, 1); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PADDING, 1) && + VerifyField(verifier, VT_STRIDE_D, 4) && + VerifyField(verifier, VT_STRIDE_W, 4) && + VerifyField(verifier, VT_STRIDE_H, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_DILATION_D_FACTOR, 4) && + VerifyField(verifier, VT_DILATION_W_FACTOR, 4) && + VerifyField(verifier, VT_DILATION_H_FACTOR, 4) && + verifier.EndTable(); + } + Conv3DOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Conv3DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Conv3DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Conv3DOptionsBuilder { + typedef Conv3DOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_padding(tflite::Padding padding) { + fbb_.AddElement(Conv3DOptions::VT_PADDING, static_cast(padding), 0); + } + void add_stride_d(int32_t stride_d) { + fbb_.AddElement(Conv3DOptions::VT_STRIDE_D, stride_d, 0); + } + void add_stride_w(int32_t stride_w) { + fbb_.AddElement(Conv3DOptions::VT_STRIDE_W, stride_w, 0); + } + void add_stride_h(int32_t stride_h) { + fbb_.AddElement(Conv3DOptions::VT_STRIDE_H, stride_h, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(Conv3DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_dilation_d_factor(int32_t 
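// Usage sketch for CreateConv2DOptions above, showing how scalar defaults behave:
// fields equal to their default (e.g. dilation factors of 1) are not written to
// the buffer, yet the accessors still return the default value. Finishing a
// buffer whose root is the bare options table is for illustration only; in a
// real .tflite model the table sits under Operator.builtin_options.
#include <cassert>
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline void Conv2DOptionsExample() {
  flatbuffers::FlatBufferBuilder fbb;
  auto opts = tflite::CreateConv2DOptions(
      fbb, tflite::Padding_SAME,
      /*stride_w=*/2, /*stride_h=*/2,
      tflite::ActivationFunctionType_RELU,
      /*dilation_w_factor=*/1, /*dilation_h_factor=*/1);
  fbb.Finish(opts);

  auto *conv = flatbuffers::GetRoot<tflite::Conv2DOptions>(fbb.GetBufferPointer());
  assert(conv->stride_w() == 2);
  assert(conv->dilation_w_factor() == 1);  // not stored (equals default), still reported as 1
  assert(conv->quantized_bias_type() == tflite::TensorType_FLOAT32);
}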
dilation_d_factor) { + fbb_.AddElement(Conv3DOptions::VT_DILATION_D_FACTOR, dilation_d_factor, 1); + } + void add_dilation_w_factor(int32_t dilation_w_factor) { + fbb_.AddElement(Conv3DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 1); + } + void add_dilation_h_factor(int32_t dilation_h_factor) { + fbb_.AddElement(Conv3DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 1); + } + explicit Conv3DOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConv3DOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::Padding padding = tflite::Padding_SAME, + int32_t stride_d = 0, + int32_t stride_w = 0, + int32_t stride_h = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + int32_t dilation_d_factor = 1, + int32_t dilation_w_factor = 1, + int32_t dilation_h_factor = 1) { + Conv3DOptionsBuilder builder_(_fbb); + builder_.add_dilation_h_factor(dilation_h_factor); + builder_.add_dilation_w_factor(dilation_w_factor); + builder_.add_dilation_d_factor(dilation_d_factor); + builder_.add_stride_h(stride_h); + builder_.add_stride_w(stride_w); + builder_.add_stride_d(stride_d); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_padding(padding); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateConv3DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Conv3DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Pool2DOptionsT : public ::flatbuffers::NativeTable { + typedef Pool2DOptions TableType; + tflite::Padding padding = tflite::Padding_SAME; + int32_t stride_w = 0; + int32_t stride_h = 0; + int32_t filter_width = 0; + int32_t filter_height = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; +}; + +struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Pool2DOptionsT NativeTableType; + typedef Pool2DOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PADDING = 4, + VT_STRIDE_W = 6, + VT_STRIDE_H = 8, + VT_FILTER_WIDTH = 10, + VT_FILTER_HEIGHT = 12, + VT_FUSED_ACTIVATION_FUNCTION = 14 + }; + tflite::Padding padding() const { + return static_cast(GetField(VT_PADDING, 0)); + } + int32_t stride_w() const { + return GetField(VT_STRIDE_W, 0); + } + int32_t stride_h() const { + return GetField(VT_STRIDE_H, 0); + } + int32_t filter_width() const { + return GetField(VT_FILTER_WIDTH, 0); + } + int32_t filter_height() const { + return GetField(VT_FILTER_HEIGHT, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PADDING, 1) && + VerifyField(verifier, VT_STRIDE_W, 4) && + VerifyField(verifier, VT_STRIDE_H, 4) && + VerifyField(verifier, VT_FILTER_WIDTH, 4) && + VerifyField(verifier, VT_FILTER_HEIGHT, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + verifier.EndTable(); + } + Pool2DOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Pool2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset 
Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Pool2DOptionsBuilder { + typedef Pool2DOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_padding(tflite::Padding padding) { + fbb_.AddElement(Pool2DOptions::VT_PADDING, static_cast(padding), 0); + } + void add_stride_w(int32_t stride_w) { + fbb_.AddElement(Pool2DOptions::VT_STRIDE_W, stride_w, 0); + } + void add_stride_h(int32_t stride_h) { + fbb_.AddElement(Pool2DOptions::VT_STRIDE_H, stride_h, 0); + } + void add_filter_width(int32_t filter_width) { + fbb_.AddElement(Pool2DOptions::VT_FILTER_WIDTH, filter_width, 0); + } + void add_filter_height(int32_t filter_height) { + fbb_.AddElement(Pool2DOptions::VT_FILTER_HEIGHT, filter_height, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + explicit Pool2DOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreatePool2DOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::Padding padding = tflite::Padding_SAME, + int32_t stride_w = 0, + int32_t stride_h = 0, + int32_t filter_width = 0, + int32_t filter_height = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE) { + Pool2DOptionsBuilder builder_(_fbb); + builder_.add_filter_height(filter_height); + builder_.add_filter_width(filter_width); + builder_.add_stride_h(stride_h); + builder_.add_stride_w(stride_w); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_padding(padding); + return builder_.Finish(); +} + +::flatbuffers::Offset CreatePool2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DepthwiseConv2DOptionsT : public ::flatbuffers::NativeTable { + typedef DepthwiseConv2DOptions TableType; + tflite::Padding padding = tflite::Padding_SAME; + int32_t stride_w = 0; + int32_t stride_h = 0; + int32_t depth_multiplier = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + int32_t dilation_w_factor = 1; + int32_t dilation_h_factor = 1; +}; + +struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DepthwiseConv2DOptionsT NativeTableType; + typedef DepthwiseConv2DOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PADDING = 4, + VT_STRIDE_W = 6, + VT_STRIDE_H = 8, + VT_DEPTH_MULTIPLIER = 10, + VT_FUSED_ACTIVATION_FUNCTION = 12, + VT_DILATION_W_FACTOR = 14, + VT_DILATION_H_FACTOR = 16 + }; + tflite::Padding padding() const { + return static_cast(GetField(VT_PADDING, 0)); + } + int32_t stride_w() const { + return GetField(VT_STRIDE_W, 0); + } + int32_t stride_h() const { + return GetField(VT_STRIDE_H, 0); + } + int32_t depth_multiplier() const { + return GetField(VT_DEPTH_MULTIPLIER, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + int32_t dilation_w_factor() const { + return GetField(VT_DILATION_W_FACTOR, 
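// The Create* helpers above are thin wrappers over the *Builder structs; the
// builder can also be used directly, as sketched here for Pool2DOptions. Each
// add_* setter may be called at most once, and no other table, vector, or
// string may be started on the same FlatBufferBuilder between the builder's
// construction (StartTable) and Finish (EndTable).
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline flatbuffers::Offset<tflite::Pool2DOptions>
BuildExampleAveragePoolOptions(flatbuffers::FlatBufferBuilder &fbb) {
  tflite::Pool2DOptionsBuilder builder(fbb);
  builder.add_padding(tflite::Padding_VALID);
  builder.add_stride_w(2);
  builder.add_stride_h(2);
  builder.add_filter_width(2);
  builder.add_filter_height(2);
  // fused_activation_function is left at its default (NONE), so no bytes are
  // emitted for that field.
  return builder.Finish();
}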
1); + } + int32_t dilation_h_factor() const { + return GetField(VT_DILATION_H_FACTOR, 1); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PADDING, 1) && + VerifyField(verifier, VT_STRIDE_W, 4) && + VerifyField(verifier, VT_STRIDE_H, 4) && + VerifyField(verifier, VT_DEPTH_MULTIPLIER, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_DILATION_W_FACTOR, 4) && + VerifyField(verifier, VT_DILATION_H_FACTOR, 4) && + verifier.EndTable(); + } + DepthwiseConv2DOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DepthwiseConv2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DepthwiseConv2DOptionsBuilder { + typedef DepthwiseConv2DOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_padding(tflite::Padding padding) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_PADDING, static_cast(padding), 0); + } + void add_stride_w(int32_t stride_w) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_STRIDE_W, stride_w, 0); + } + void add_stride_h(int32_t stride_h) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_STRIDE_H, stride_h, 0); + } + void add_depth_multiplier(int32_t depth_multiplier) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER, depth_multiplier, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_dilation_w_factor(int32_t dilation_w_factor) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 1); + } + void add_dilation_h_factor(int32_t dilation_h_factor) { + fbb_.AddElement(DepthwiseConv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 1); + } + explicit DepthwiseConv2DOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDepthwiseConv2DOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::Padding padding = tflite::Padding_SAME, + int32_t stride_w = 0, + int32_t stride_h = 0, + int32_t depth_multiplier = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + int32_t dilation_w_factor = 1, + int32_t dilation_h_factor = 1) { + DepthwiseConv2DOptionsBuilder builder_(_fbb); + builder_.add_dilation_h_factor(dilation_h_factor); + builder_.add_dilation_w_factor(dilation_w_factor); + builder_.add_depth_multiplier(depth_multiplier); + builder_.add_stride_h(stride_h); + builder_.add_stride_w(stride_w); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_padding(padding); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDepthwiseConv2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ConcatEmbeddingsOptionsT : public ::flatbuffers::NativeTable { + typedef ConcatEmbeddingsOptions TableType; + int32_t num_channels = 0; + 
std::vector num_columns_per_channel{}; + std::vector embedding_dim_per_channel{}; +}; + +struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ConcatEmbeddingsOptionsT NativeTableType; + typedef ConcatEmbeddingsOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NUM_CHANNELS = 4, + VT_NUM_COLUMNS_PER_CHANNEL = 6, + VT_EMBEDDING_DIM_PER_CHANNEL = 8 + }; + int32_t num_channels() const { + return GetField(VT_NUM_CHANNELS, 0); + } + const ::flatbuffers::Vector *num_columns_per_channel() const { + return GetPointer *>(VT_NUM_COLUMNS_PER_CHANNEL); + } + const ::flatbuffers::Vector *embedding_dim_per_channel() const { + return GetPointer *>(VT_EMBEDDING_DIM_PER_CHANNEL); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NUM_CHANNELS, 4) && + VerifyOffset(verifier, VT_NUM_COLUMNS_PER_CHANNEL) && + verifier.VerifyVector(num_columns_per_channel()) && + VerifyOffset(verifier, VT_EMBEDDING_DIM_PER_CHANNEL) && + verifier.VerifyVector(embedding_dim_per_channel()) && + verifier.EndTable(); + } + ConcatEmbeddingsOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ConcatEmbeddingsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ConcatEmbeddingsOptionsBuilder { + typedef ConcatEmbeddingsOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_num_channels(int32_t num_channels) { + fbb_.AddElement(ConcatEmbeddingsOptions::VT_NUM_CHANNELS, num_channels, 0); + } + void add_num_columns_per_channel(::flatbuffers::Offset<::flatbuffers::Vector> num_columns_per_channel) { + fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL, num_columns_per_channel); + } + void add_embedding_dim_per_channel(::flatbuffers::Offset<::flatbuffers::Vector> embedding_dim_per_channel) { + fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL, embedding_dim_per_channel); + } + explicit ConcatEmbeddingsOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConcatEmbeddingsOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t num_channels = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> num_columns_per_channel = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> embedding_dim_per_channel = 0) { + ConcatEmbeddingsOptionsBuilder builder_(_fbb); + builder_.add_embedding_dim_per_channel(embedding_dim_per_channel); + builder_.add_num_columns_per_channel(num_columns_per_channel); + builder_.add_num_channels(num_channels); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateConcatEmbeddingsOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t num_channels = 0, + const std::vector *num_columns_per_channel = nullptr, + const std::vector *embedding_dim_per_channel = nullptr) { + auto num_columns_per_channel__ = num_columns_per_channel ? _fbb.CreateVector(*num_columns_per_channel) : 0; + auto embedding_dim_per_channel__ = embedding_dim_per_channel ? 
_fbb.CreateVector(*embedding_dim_per_channel) : 0; + return tflite::CreateConcatEmbeddingsOptions( + _fbb, + num_channels, + num_columns_per_channel__, + embedding_dim_per_channel__); +} + +::flatbuffers::Offset CreateConcatEmbeddingsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LSHProjectionOptionsT : public ::flatbuffers::NativeTable { + typedef LSHProjectionOptions TableType; + tflite::LSHProjectionType type = tflite::LSHProjectionType_UNKNOWN; +}; + +struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LSHProjectionOptionsT NativeTableType; + typedef LSHProjectionOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TYPE = 4 + }; + tflite::LSHProjectionType type() const { + return static_cast(GetField(VT_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TYPE, 1) && + verifier.EndTable(); + } + LSHProjectionOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LSHProjectionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LSHProjectionOptionsBuilder { + typedef LSHProjectionOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_type(tflite::LSHProjectionType type) { + fbb_.AddElement(LSHProjectionOptions::VT_TYPE, static_cast(type), 0); + } + explicit LSHProjectionOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLSHProjectionOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::LSHProjectionType type = tflite::LSHProjectionType_UNKNOWN) { + LSHProjectionOptionsBuilder builder_(_fbb); + builder_.add_type(type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLSHProjectionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SVDFOptionsT : public ::flatbuffers::NativeTable { + typedef SVDFOptions TableType; + int32_t rank = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + bool asymmetric_quantize_inputs = false; +}; + +struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SVDFOptionsT NativeTableType; + typedef SVDFOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_RANK = 4, + VT_FUSED_ACTIVATION_FUNCTION = 6, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 8 + }; + int32_t rank() const { + return GetField(VT_RANK, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_RANK, 4) && + VerifyField(verifier, 
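// Sketch for CreateConcatEmbeddingsOptionsDirect above. The int32_t element type
// of the two per-channel vectors is assumed (the vector template arguments are
// not visible in this view); the values are purely illustrative.
#include <cstdint>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline flatbuffers::Offset<tflite::ConcatEmbeddingsOptions>
BuildExampleConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &fbb) {
  std::vector<int32_t> num_columns_per_channel{16, 16};
  std::vector<int32_t> embedding_dim_per_channel{8, 8};
  return tflite::CreateConcatEmbeddingsOptionsDirect(
      fbb, /*num_channels=*/2,
      &num_columns_per_channel, &embedding_dim_per_channel);
}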
VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + SVDFOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SVDFOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SVDFOptionsBuilder { + typedef SVDFOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_rank(int32_t rank) { + fbb_.AddElement(SVDFOptions::VT_RANK, rank, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(SVDFOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit SVDFOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSVDFOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t rank = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { + SVDFOptionsBuilder builder_(_fbb); + builder_.add_rank(rank); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSVDFOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RNNOptionsT : public ::flatbuffers::NativeTable { + typedef RNNOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + bool asymmetric_quantize_inputs = false; +}; + +struct RNNOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RNNOptionsT NativeTableType; + typedef RNNOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 6 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + RNNOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RNNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RNNOptionsBuilder { + typedef RNNOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void 
add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(RNNOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit RNNOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRNNOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { + RNNOptionsBuilder builder_(_fbb); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SequenceRNNOptionsT : public ::flatbuffers::NativeTable { + typedef SequenceRNNOptions TableType; + bool time_major = false; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + bool asymmetric_quantize_inputs = false; +}; + +struct SequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SequenceRNNOptionsT NativeTableType; + typedef SequenceRNNOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TIME_MAJOR = 4, + VT_FUSED_ACTIVATION_FUNCTION = 6, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 8 + }; + bool time_major() const { + return GetField(VT_TIME_MAJOR, 0) != 0; + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TIME_MAJOR, 1) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + SequenceRNNOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SequenceRNNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SequenceRNNOptionsBuilder { + typedef SequenceRNNOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_time_major(bool time_major) { + fbb_.AddElement(SequenceRNNOptions::VT_TIME_MAJOR, static_cast(time_major), 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(SequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(SequenceRNNOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + 
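// Each table above also has an object ("T"-suffixed) form for mutation:
// UnPack() copies a packed table into the NativeTable struct, and the
// Create*(fbb, obj) overload declared above re-packs it. A minimal round trip
// for RNNOptions, assuming the TANH activation enumerator from the schema:
#include <memory>
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline void RNNOptionsRoundTripExample() {
  flatbuffers::FlatBufferBuilder fbb;
  fbb.Finish(tflite::CreateRNNOptions(fbb, tflite::ActivationFunctionType_TANH));
  auto *packed = flatbuffers::GetRoot<tflite::RNNOptions>(fbb.GetBufferPointer());

  std::unique_ptr<tflite::RNNOptionsT> obj(packed->UnPack());  // caller owns the copy
  obj->asymmetric_quantize_inputs = true;                      // edit in object form

  flatbuffers::FlatBufferBuilder repacked;
  repacked.Finish(tflite::CreateRNNOptions(repacked, obj.get()));
}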
explicit SequenceRNNOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSequenceRNNOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool time_major = false, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool asymmetric_quantize_inputs = false) { + SequenceRNNOptionsBuilder builder_(_fbb); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_time_major(time_major); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSequenceRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BidirectionalSequenceRNNOptionsT : public ::flatbuffers::NativeTable { + typedef BidirectionalSequenceRNNOptions TableType; + bool time_major = false; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + bool merge_outputs = false; + bool asymmetric_quantize_inputs = false; +}; + +struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BidirectionalSequenceRNNOptionsT NativeTableType; + typedef BidirectionalSequenceRNNOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TIME_MAJOR = 4, + VT_FUSED_ACTIVATION_FUNCTION = 6, + VT_MERGE_OUTPUTS = 8, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 10 + }; + bool time_major() const { + return GetField(VT_TIME_MAJOR, 0) != 0; + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool merge_outputs() const { + return GetField(VT_MERGE_OUTPUTS, 0) != 0; + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TIME_MAJOR, 1) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_MERGE_OUTPUTS, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + BidirectionalSequenceRNNOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BidirectionalSequenceRNNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceRNNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BidirectionalSequenceRNNOptionsBuilder { + typedef BidirectionalSequenceRNNOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_time_major(bool time_major) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR, static_cast(time_major), 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_merge_outputs(bool merge_outputs) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_MERGE_OUTPUTS, 
static_cast(merge_outputs), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit BidirectionalSequenceRNNOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBidirectionalSequenceRNNOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool time_major = false, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool merge_outputs = false, + bool asymmetric_quantize_inputs = false) { + BidirectionalSequenceRNNOptionsBuilder builder_(_fbb); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_merge_outputs(merge_outputs); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_time_major(time_major); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBidirectionalSequenceRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceRNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct FullyConnectedOptionsT : public ::flatbuffers::NativeTable { + typedef FullyConnectedOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + tflite::FullyConnectedOptionsWeightsFormat weights_format = tflite::FullyConnectedOptionsWeightsFormat_DEFAULT; + bool keep_num_dims = false; + bool asymmetric_quantize_inputs = false; + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32; +}; + +struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FullyConnectedOptionsT NativeTableType; + typedef FullyConnectedOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_WEIGHTS_FORMAT = 6, + VT_KEEP_NUM_DIMS = 8, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 10, + VT_QUANTIZED_BIAS_TYPE = 12 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + tflite::FullyConnectedOptionsWeightsFormat weights_format() const { + return static_cast(GetField(VT_WEIGHTS_FORMAT, 0)); + } + bool keep_num_dims() const { + return GetField(VT_KEEP_NUM_DIMS, 0) != 0; + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + tflite::TensorType quantized_bias_type() const { + return static_cast(GetField(VT_QUANTIZED_BIAS_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_WEIGHTS_FORMAT, 1) && + VerifyField(verifier, VT_KEEP_NUM_DIMS, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + VerifyField(verifier, VT_QUANTIZED_BIAS_TYPE, 1) && + verifier.EndTable(); + } + FullyConnectedOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(FullyConnectedOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const 
::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct FullyConnectedOptionsBuilder { + typedef FullyConnectedOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_weights_format(tflite::FullyConnectedOptionsWeightsFormat weights_format) { + fbb_.AddElement(FullyConnectedOptions::VT_WEIGHTS_FORMAT, static_cast(weights_format), 0); + } + void add_keep_num_dims(bool keep_num_dims) { + fbb_.AddElement(FullyConnectedOptions::VT_KEEP_NUM_DIMS, static_cast(keep_num_dims), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(FullyConnectedOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + void add_quantized_bias_type(tflite::TensorType quantized_bias_type) { + fbb_.AddElement(FullyConnectedOptions::VT_QUANTIZED_BIAS_TYPE, static_cast(quantized_bias_type), 0); + } + explicit FullyConnectedOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateFullyConnectedOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + tflite::FullyConnectedOptionsWeightsFormat weights_format = tflite::FullyConnectedOptionsWeightsFormat_DEFAULT, + bool keep_num_dims = false, + bool asymmetric_quantize_inputs = false, + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32) { + FullyConnectedOptionsBuilder builder_(_fbb); + builder_.add_quantized_bias_type(quantized_bias_type); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_keep_num_dims(keep_num_dims); + builder_.add_weights_format(weights_format); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateFullyConnectedOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SoftmaxOptionsT : public ::flatbuffers::NativeTable { + typedef SoftmaxOptions TableType; + float beta = 0.0f; +}; + +struct SoftmaxOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SoftmaxOptionsT NativeTableType; + typedef SoftmaxOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BETA = 4 + }; + float beta() const { + return GetField(VT_BETA, 0.0f); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_BETA, 4) && + verifier.EndTable(); + } + SoftmaxOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SoftmaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SoftmaxOptionsBuilder { + typedef SoftmaxOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void 
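// Sketch for CreateFullyConnectedOptions above. TensorType_INT32 is used here
// only as an illustrative quantized_bias_type; whether that combination is
// valid for a given kernel is outside the scope of this header.
#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // path assumed

inline flatbuffers::Offset<tflite::FullyConnectedOptions>
BuildExampleFullyConnectedOptions(flatbuffers::FlatBufferBuilder &fbb) {
  return tflite::CreateFullyConnectedOptions(
      fbb,
      tflite::ActivationFunctionType_RELU,
      tflite::FullyConnectedOptionsWeightsFormat_DEFAULT,
      /*keep_num_dims=*/false,
      /*asymmetric_quantize_inputs=*/false,
      /*quantized_bias_type=*/tflite::TensorType_INT32);
}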
add_beta(float beta) { + fbb_.AddElement(SoftmaxOptions::VT_BETA, beta, 0.0f); + } + explicit SoftmaxOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSoftmaxOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + float beta = 0.0f) { + SoftmaxOptionsBuilder builder_(_fbb); + builder_.add_beta(beta); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSoftmaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ConcatenationOptionsT : public ::flatbuffers::NativeTable { + typedef ConcatenationOptions TableType; + int32_t axis = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; +}; + +struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ConcatenationOptionsT NativeTableType; + typedef ConcatenationOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_AXIS = 4, + VT_FUSED_ACTIVATION_FUNCTION = 6 + }; + int32_t axis() const { + return GetField(VT_AXIS, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_AXIS, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + verifier.EndTable(); + } + ConcatenationOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ConcatenationOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ConcatenationOptionsBuilder { + typedef ConcatenationOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_axis(int32_t axis) { + fbb_.AddElement(ConcatenationOptions::VT_AXIS, axis, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + explicit ConcatenationOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateConcatenationOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t axis = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE) { + ConcatenationOptionsBuilder builder_(_fbb); + builder_.add_axis(axis); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateConcatenationOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct AddOptionsT : public ::flatbuffers::NativeTable { + typedef AddOptions TableType; + tflite::ActivationFunctionType 
fused_activation_function = tflite::ActivationFunctionType_NONE; + bool pot_scale_int16 = true; +}; + +struct AddOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef AddOptionsT NativeTableType; + typedef AddOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_POT_SCALE_INT16 = 6 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool pot_scale_int16() const { + return GetField(VT_POT_SCALE_INT16, 1) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_POT_SCALE_INT16, 1) && + verifier.EndTable(); + } + AddOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(AddOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct AddOptionsBuilder { + typedef AddOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(AddOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_pot_scale_int16(bool pot_scale_int16) { + fbb_.AddElement(AddOptions::VT_POT_SCALE_INT16, static_cast(pot_scale_int16), 1); + } + explicit AddOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateAddOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool pot_scale_int16 = true) { + AddOptionsBuilder builder_(_fbb); + builder_.add_pot_scale_int16(pot_scale_int16); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateAddOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MulOptionsT : public ::flatbuffers::NativeTable { + typedef MulOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; +}; + +struct MulOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MulOptionsT NativeTableType; + typedef MulOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + verifier.EndTable(); + } + MulOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MulOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset 
Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MulOptionsBuilder { + typedef MulOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(MulOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + explicit MulOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateMulOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE) { + MulOptionsBuilder builder_(_fbb); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateMulOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct L2NormOptionsT : public ::flatbuffers::NativeTable { + typedef L2NormOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; +}; + +struct L2NormOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef L2NormOptionsT NativeTableType; + typedef L2NormOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + verifier.EndTable(); + } + L2NormOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(L2NormOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct L2NormOptionsBuilder { + typedef L2NormOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + explicit L2NormOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateL2NormOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE) { + L2NormOptionsBuilder builder_(_fbb); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateL2NormOptions(::flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LocalResponseNormalizationOptionsT 
: public ::flatbuffers::NativeTable { + typedef LocalResponseNormalizationOptions TableType; + int32_t radius = 0; + float bias = 0.0f; + float alpha = 0.0f; + float beta = 0.0f; +}; + +struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LocalResponseNormalizationOptionsT NativeTableType; + typedef LocalResponseNormalizationOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_RADIUS = 4, + VT_BIAS = 6, + VT_ALPHA = 8, + VT_BETA = 10 + }; + int32_t radius() const { + return GetField(VT_RADIUS, 0); + } + float bias() const { + return GetField(VT_BIAS, 0.0f); + } + float alpha() const { + return GetField(VT_ALPHA, 0.0f); + } + float beta() const { + return GetField(VT_BETA, 0.0f); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_RADIUS, 4) && + VerifyField(verifier, VT_BIAS, 4) && + VerifyField(verifier, VT_ALPHA, 4) && + VerifyField(verifier, VT_BETA, 4) && + verifier.EndTable(); + } + LocalResponseNormalizationOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LocalResponseNormalizationOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LocalResponseNormalizationOptionsBuilder { + typedef LocalResponseNormalizationOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_radius(int32_t radius) { + fbb_.AddElement(LocalResponseNormalizationOptions::VT_RADIUS, radius, 0); + } + void add_bias(float bias) { + fbb_.AddElement(LocalResponseNormalizationOptions::VT_BIAS, bias, 0.0f); + } + void add_alpha(float alpha) { + fbb_.AddElement(LocalResponseNormalizationOptions::VT_ALPHA, alpha, 0.0f); + } + void add_beta(float beta) { + fbb_.AddElement(LocalResponseNormalizationOptions::VT_BETA, beta, 0.0f); + } + explicit LocalResponseNormalizationOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLocalResponseNormalizationOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t radius = 0, + float bias = 0.0f, + float alpha = 0.0f, + float beta = 0.0f) { + LocalResponseNormalizationOptionsBuilder builder_(_fbb); + builder_.add_beta(beta); + builder_.add_alpha(alpha); + builder_.add_bias(bias); + builder_.add_radius(radius); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLocalResponseNormalizationOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LSTMOptionsT : public ::flatbuffers::NativeTable { + typedef LSTMOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + float cell_clip = 0.0f; + float proj_clip = 0.0f; + tflite::LSTMKernelType kernel_type = tflite::LSTMKernelType_FULL; + bool asymmetric_quantize_inputs = false; +}; + +struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LSTMOptionsT NativeTableType; + typedef LSTMOptionsBuilder Builder; + 
enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_CELL_CLIP = 6, + VT_PROJ_CLIP = 8, + VT_KERNEL_TYPE = 10, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 12 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + float cell_clip() const { + return GetField(VT_CELL_CLIP, 0.0f); + } + float proj_clip() const { + return GetField(VT_PROJ_CLIP, 0.0f); + } + tflite::LSTMKernelType kernel_type() const { + return static_cast(GetField(VT_KERNEL_TYPE, 0)); + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_CELL_CLIP, 4) && + VerifyField(verifier, VT_PROJ_CLIP, 4) && + VerifyField(verifier, VT_KERNEL_TYPE, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + LSTMOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LSTMOptionsBuilder { + typedef LSTMOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_cell_clip(float cell_clip) { + fbb_.AddElement(LSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f); + } + void add_proj_clip(float proj_clip) { + fbb_.AddElement(LSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f); + } + void add_kernel_type(tflite::LSTMKernelType kernel_type) { + fbb_.AddElement(LSTMOptions::VT_KERNEL_TYPE, static_cast(kernel_type), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(LSTMOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit LSTMOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLSTMOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + float cell_clip = 0.0f, + float proj_clip = 0.0f, + tflite::LSTMKernelType kernel_type = tflite::LSTMKernelType_FULL, + bool asymmetric_quantize_inputs = false) { + LSTMOptionsBuilder builder_(_fbb); + builder_.add_proj_clip(proj_clip); + builder_.add_cell_clip(cell_clip); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_kernel_type(kernel_type); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnidirectionalSequenceLSTMOptionsT : public ::flatbuffers::NativeTable { + typedef UnidirectionalSequenceLSTMOptions 
TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + float cell_clip = 0.0f; + float proj_clip = 0.0f; + bool time_major = false; + bool asymmetric_quantize_inputs = false; + bool diagonal_recurrent_tensors = false; +}; + +struct UnidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnidirectionalSequenceLSTMOptionsT NativeTableType; + typedef UnidirectionalSequenceLSTMOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_CELL_CLIP = 6, + VT_PROJ_CLIP = 8, + VT_TIME_MAJOR = 10, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 12, + VT_DIAGONAL_RECURRENT_TENSORS = 14 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + float cell_clip() const { + return GetField(VT_CELL_CLIP, 0.0f); + } + float proj_clip() const { + return GetField(VT_PROJ_CLIP, 0.0f); + } + bool time_major() const { + return GetField(VT_TIME_MAJOR, 0) != 0; + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool diagonal_recurrent_tensors() const { + return GetField(VT_DIAGONAL_RECURRENT_TENSORS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_CELL_CLIP, 4) && + VerifyField(verifier, VT_PROJ_CLIP, 4) && + VerifyField(verifier, VT_TIME_MAJOR, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + VerifyField(verifier, VT_DIAGONAL_RECURRENT_TENSORS, 1) && + verifier.EndTable(); + } + UnidirectionalSequenceLSTMOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnidirectionalSequenceLSTMOptionsBuilder { + typedef UnidirectionalSequenceLSTMOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_cell_clip(float cell_clip) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f); + } + void add_proj_clip(float proj_clip) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f); + } + void add_time_major(bool time_major) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_TIME_MAJOR, static_cast(time_major), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + void add_diagonal_recurrent_tensors(bool diagonal_recurrent_tensors) { + fbb_.AddElement(UnidirectionalSequenceLSTMOptions::VT_DIAGONAL_RECURRENT_TENSORS, static_cast(diagonal_recurrent_tensors), 0); + } + explicit UnidirectionalSequenceLSTMOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + 
::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnidirectionalSequenceLSTMOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + float cell_clip = 0.0f, + float proj_clip = 0.0f, + bool time_major = false, + bool asymmetric_quantize_inputs = false, + bool diagonal_recurrent_tensors = false) { + UnidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); + builder_.add_proj_clip(proj_clip); + builder_.add_cell_clip(cell_clip); + builder_.add_diagonal_recurrent_tensors(diagonal_recurrent_tensors); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_time_major(time_major); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnidirectionalSequenceLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BidirectionalSequenceLSTMOptionsT : public ::flatbuffers::NativeTable { + typedef BidirectionalSequenceLSTMOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + float cell_clip = 0.0f; + float proj_clip = 0.0f; + bool merge_outputs = false; + bool time_major = true; + bool asymmetric_quantize_inputs = false; +}; + +struct BidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BidirectionalSequenceLSTMOptionsT NativeTableType; + typedef BidirectionalSequenceLSTMOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_CELL_CLIP = 6, + VT_PROJ_CLIP = 8, + VT_MERGE_OUTPUTS = 10, + VT_TIME_MAJOR = 12, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 14 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + float cell_clip() const { + return GetField(VT_CELL_CLIP, 0.0f); + } + float proj_clip() const { + return GetField(VT_PROJ_CLIP, 0.0f); + } + bool merge_outputs() const { + return GetField(VT_MERGE_OUTPUTS, 0) != 0; + } + bool time_major() const { + return GetField(VT_TIME_MAJOR, 1) != 0; + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_CELL_CLIP, 4) && + VerifyField(verifier, VT_PROJ_CLIP, 4) && + VerifyField(verifier, VT_MERGE_OUTPUTS, 1) && + VerifyField(verifier, VT_TIME_MAJOR, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + BidirectionalSequenceLSTMOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BidirectionalSequenceLSTMOptionsBuilder { + typedef BidirectionalSequenceLSTMOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t 
start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_cell_clip(float cell_clip) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f); + } + void add_proj_clip(float proj_clip) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f); + } + void add_merge_outputs(bool merge_outputs) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_MERGE_OUTPUTS, static_cast(merge_outputs), 0); + } + void add_time_major(bool time_major) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_TIME_MAJOR, static_cast(time_major), 1); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(BidirectionalSequenceLSTMOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit BidirectionalSequenceLSTMOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBidirectionalSequenceLSTMOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + float cell_clip = 0.0f, + float proj_clip = 0.0f, + bool merge_outputs = false, + bool time_major = true, + bool asymmetric_quantize_inputs = false) { + BidirectionalSequenceLSTMOptionsBuilder builder_(_fbb); + builder_.add_proj_clip(proj_clip); + builder_.add_cell_clip(cell_clip); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_time_major(time_major); + builder_.add_merge_outputs(merge_outputs); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBidirectionalSequenceLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ResizeBilinearOptionsT : public ::flatbuffers::NativeTable { + typedef ResizeBilinearOptions TableType; + bool align_corners = false; + bool half_pixel_centers = false; +}; + +struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ResizeBilinearOptionsT NativeTableType; + typedef ResizeBilinearOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ALIGN_CORNERS = 8, + VT_HALF_PIXEL_CENTERS = 10 + }; + bool align_corners() const { + return GetField(VT_ALIGN_CORNERS, 0) != 0; + } + bool half_pixel_centers() const { + return GetField(VT_HALF_PIXEL_CENTERS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ALIGN_CORNERS, 1) && + VerifyField(verifier, VT_HALF_PIXEL_CENTERS, 1) && + verifier.EndTable(); + } + ResizeBilinearOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ResizeBilinearOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct 
ResizeBilinearOptionsBuilder { + typedef ResizeBilinearOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_align_corners(bool align_corners) { + fbb_.AddElement(ResizeBilinearOptions::VT_ALIGN_CORNERS, static_cast(align_corners), 0); + } + void add_half_pixel_centers(bool half_pixel_centers) { + fbb_.AddElement(ResizeBilinearOptions::VT_HALF_PIXEL_CENTERS, static_cast(half_pixel_centers), 0); + } + explicit ResizeBilinearOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateResizeBilinearOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool align_corners = false, + bool half_pixel_centers = false) { + ResizeBilinearOptionsBuilder builder_(_fbb); + builder_.add_half_pixel_centers(half_pixel_centers); + builder_.add_align_corners(align_corners); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateResizeBilinearOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ResizeNearestNeighborOptionsT : public ::flatbuffers::NativeTable { + typedef ResizeNearestNeighborOptions TableType; + bool align_corners = false; + bool half_pixel_centers = false; +}; + +struct ResizeNearestNeighborOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ResizeNearestNeighborOptionsT NativeTableType; + typedef ResizeNearestNeighborOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ALIGN_CORNERS = 4, + VT_HALF_PIXEL_CENTERS = 6 + }; + bool align_corners() const { + return GetField(VT_ALIGN_CORNERS, 0) != 0; + } + bool half_pixel_centers() const { + return GetField(VT_HALF_PIXEL_CENTERS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ALIGN_CORNERS, 1) && + VerifyField(verifier, VT_HALF_PIXEL_CENTERS, 1) && + verifier.EndTable(); + } + ResizeNearestNeighborOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ResizeNearestNeighborOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ResizeNearestNeighborOptionsBuilder { + typedef ResizeNearestNeighborOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_align_corners(bool align_corners) { + fbb_.AddElement(ResizeNearestNeighborOptions::VT_ALIGN_CORNERS, static_cast(align_corners), 0); + } + void add_half_pixel_centers(bool half_pixel_centers) { + fbb_.AddElement(ResizeNearestNeighborOptions::VT_HALF_PIXEL_CENTERS, static_cast(half_pixel_centers), 0); + } + explicit ResizeNearestNeighborOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateResizeNearestNeighborOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool align_corners = false, + bool half_pixel_centers = false) { + 
ResizeNearestNeighborOptionsBuilder builder_(_fbb);
+  builder_.add_half_pixel_centers(half_pixel_centers);
+  builder_.add_align_corners(align_corners);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<ResizeNearestNeighborOptions> CreateResizeNearestNeighborOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct CallOptionsT : public ::flatbuffers::NativeTable {
+  typedef CallOptions TableType;
+  uint32_t subgraph = 0;
+};
+
+struct CallOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef CallOptionsT NativeTableType;
+  typedef CallOptionsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_SUBGRAPH = 4
+  };
+  uint32_t subgraph() const {
+    return GetField<uint32_t>(VT_SUBGRAPH, 0);
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint32_t>(verifier, VT_SUBGRAPH, 4) &&
+           verifier.EndTable();
+  }
+  CallOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CallOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<CallOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CallOptionsBuilder {
+  typedef CallOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_subgraph(uint32_t subgraph) {
+    fbb_.AddElement<uint32_t>(CallOptions::VT_SUBGRAPH, subgraph, 0);
+  }
+  explicit CallOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<CallOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<CallOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<CallOptions> CreateCallOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    uint32_t subgraph = 0) {
+  CallOptionsBuilder builder_(_fbb);
+  builder_.add_subgraph(subgraph);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<CallOptions> CreateCallOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct PadOptionsT : public ::flatbuffers::NativeTable {
+  typedef PadOptions TableType;
+};
+
+struct PadOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef PadOptionsT NativeTableType;
+  typedef PadOptionsBuilder Builder;
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  PadOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(PadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<PadOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct PadOptionsBuilder {
+  typedef PadOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit PadOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<PadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<PadOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<PadOptions> CreatePadOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb) {
+  PadOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
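A minimal usage sketch (illustrative only — not flatc output and not part of this patch), assuming an existing ::flatbuffers::FlatBufferBuilder, showing how the Create*Options helpers declared in this generated header are typically called:

  ::flatbuffers::FlatBufferBuilder fbb;
  // Field-less tables such as PadOptions take only the builder.
  auto pad_opts = tflite::CreatePadOptions(fbb);
  // Scalar fields are exposed as defaulted parameters.
  auto call_opts = tflite::CreateCallOptions(fbb, /*subgraph=*/1);
  auto softmax_opts = tflite::CreateSoftmaxOptions(fbb, /*beta=*/1.0f);
  // Each helper returns a ::flatbuffers::Offset<T> that is later stored in the
  // enclosing Operator table before the buffer is finished.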
+::flatbuffers::Offset CreatePadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct PadV2OptionsT : public ::flatbuffers::NativeTable { + typedef PadV2Options TableType; +}; + +struct PadV2Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef PadV2OptionsT NativeTableType; + typedef PadV2OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + PadV2OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(PadV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct PadV2OptionsBuilder { + typedef PadV2Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit PadV2OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreatePadV2Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + PadV2OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreatePadV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReshapeOptionsT : public ::flatbuffers::NativeTable { + typedef ReshapeOptions TableType; + std::vector new_shape{}; +}; + +struct ReshapeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReshapeOptionsT NativeTableType; + typedef ReshapeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NEW_SHAPE = 4 + }; + const ::flatbuffers::Vector *new_shape() const { + return GetPointer *>(VT_NEW_SHAPE); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NEW_SHAPE) && + verifier.VerifyVector(new_shape()) && + verifier.EndTable(); + } + ReshapeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ReshapeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReshapeOptionsBuilder { + typedef ReshapeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_new_shape(::flatbuffers::Offset<::flatbuffers::Vector> new_shape) { + fbb_.AddOffset(ReshapeOptions::VT_NEW_SHAPE, new_shape); + } + explicit ReshapeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReshapeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> new_shape = 0) { + ReshapeOptionsBuilder builder_(_fbb); + builder_.add_new_shape(new_shape); + return builder_.Finish(); +} + +inline 
::flatbuffers::Offset CreateReshapeOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *new_shape = nullptr) { + auto new_shape__ = new_shape ? _fbb.CreateVector(*new_shape) : 0; + return tflite::CreateReshapeOptions( + _fbb, + new_shape__); +} + +::flatbuffers::Offset CreateReshapeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SpaceToBatchNDOptionsT : public ::flatbuffers::NativeTable { + typedef SpaceToBatchNDOptions TableType; +}; + +struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SpaceToBatchNDOptionsT NativeTableType; + typedef SpaceToBatchNDOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SpaceToBatchNDOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SpaceToBatchNDOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SpaceToBatchNDOptionsBuilder { + typedef SpaceToBatchNDOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SpaceToBatchNDOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSpaceToBatchNDOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SpaceToBatchNDOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSpaceToBatchNDOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BatchToSpaceNDOptionsT : public ::flatbuffers::NativeTable { + typedef BatchToSpaceNDOptions TableType; +}; + +struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BatchToSpaceNDOptionsT NativeTableType; + typedef BatchToSpaceNDOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BatchToSpaceNDOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BatchToSpaceNDOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BatchToSpaceNDOptionsBuilder { + typedef BatchToSpaceNDOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit BatchToSpaceNDOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBatchToSpaceNDOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + BatchToSpaceNDOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset 
CreateBatchToSpaceNDOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SkipGramOptionsT : public ::flatbuffers::NativeTable { + typedef SkipGramOptions TableType; + int32_t ngram_size = 0; + int32_t max_skip_size = 0; + bool include_all_ngrams = false; +}; + +struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SkipGramOptionsT NativeTableType; + typedef SkipGramOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NGRAM_SIZE = 4, + VT_MAX_SKIP_SIZE = 6, + VT_INCLUDE_ALL_NGRAMS = 8 + }; + int32_t ngram_size() const { + return GetField(VT_NGRAM_SIZE, 0); + } + int32_t max_skip_size() const { + return GetField(VT_MAX_SKIP_SIZE, 0); + } + bool include_all_ngrams() const { + return GetField(VT_INCLUDE_ALL_NGRAMS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NGRAM_SIZE, 4) && + VerifyField(verifier, VT_MAX_SKIP_SIZE, 4) && + VerifyField(verifier, VT_INCLUDE_ALL_NGRAMS, 1) && + verifier.EndTable(); + } + SkipGramOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SkipGramOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SkipGramOptionsBuilder { + typedef SkipGramOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_ngram_size(int32_t ngram_size) { + fbb_.AddElement(SkipGramOptions::VT_NGRAM_SIZE, ngram_size, 0); + } + void add_max_skip_size(int32_t max_skip_size) { + fbb_.AddElement(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size, 0); + } + void add_include_all_ngrams(bool include_all_ngrams) { + fbb_.AddElement(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS, static_cast(include_all_ngrams), 0); + } + explicit SkipGramOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSkipGramOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t ngram_size = 0, + int32_t max_skip_size = 0, + bool include_all_ngrams = false) { + SkipGramOptionsBuilder builder_(_fbb); + builder_.add_max_skip_size(max_skip_size); + builder_.add_ngram_size(ngram_size); + builder_.add_include_all_ngrams(include_all_ngrams); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSkipGramOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SpaceToDepthOptionsT : public ::flatbuffers::NativeTable { + typedef SpaceToDepthOptions TableType; + int32_t block_size = 0; +}; + +struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SpaceToDepthOptionsT NativeTableType; + typedef SpaceToDepthOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BLOCK_SIZE = 4 + }; + int32_t block_size() const { + return GetField(VT_BLOCK_SIZE, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, 
VT_BLOCK_SIZE, 4) &&
+           verifier.EndTable();
+  }
+  SpaceToDepthOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SpaceToDepthOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<SpaceToDepthOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SpaceToDepthOptionsBuilder {
+  typedef SpaceToDepthOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_block_size(int32_t block_size) {
+    fbb_.AddElement<int32_t>(SpaceToDepthOptions::VT_BLOCK_SIZE, block_size, 0);
+  }
+  explicit SpaceToDepthOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<SpaceToDepthOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<SpaceToDepthOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t block_size = 0) {
+  SpaceToDepthOptionsBuilder builder_(_fbb);
+  builder_.add_block_size(block_size);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct DepthToSpaceOptionsT : public ::flatbuffers::NativeTable {
+  typedef DepthToSpaceOptions TableType;
+  int32_t block_size = 0;
+};
+
+struct DepthToSpaceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DepthToSpaceOptionsT NativeTableType;
+  typedef DepthToSpaceOptionsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_BLOCK_SIZE = 4
+  };
+  int32_t block_size() const {
+    return GetField<int32_t>(VT_BLOCK_SIZE, 0);
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE, 4) &&
+           verifier.EndTable();
+  }
+  DepthToSpaceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(DepthToSpaceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<DepthToSpaceOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DepthToSpaceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct DepthToSpaceOptionsBuilder {
+  typedef DepthToSpaceOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_block_size(int32_t block_size) {
+    fbb_.AddElement<int32_t>(DepthToSpaceOptions::VT_BLOCK_SIZE, block_size, 0);
+  }
+  explicit DepthToSpaceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<DepthToSpaceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<DepthToSpaceOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<DepthToSpaceOptions> CreateDepthToSpaceOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t block_size = 0) {
+  DepthToSpaceOptionsBuilder builder_(_fbb);
+  builder_.add_block_size(block_size);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<DepthToSpaceOptions> CreateDepthToSpaceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DepthToSpaceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SubOptionsT : public ::flatbuffers::NativeTable {
+  typedef SubOptions TableType;
+  tflite::ActivationFunctionType
fused_activation_function = tflite::ActivationFunctionType_NONE; + bool pot_scale_int16 = true; +}; + +struct SubOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SubOptionsT NativeTableType; + typedef SubOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4, + VT_POT_SCALE_INT16 = 6 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool pot_scale_int16() const { + return GetField(VT_POT_SCALE_INT16, 1) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_POT_SCALE_INT16, 1) && + verifier.EndTable(); + } + SubOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SubOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SubOptionsBuilder { + typedef SubOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(SubOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_pot_scale_int16(bool pot_scale_int16) { + fbb_.AddElement(SubOptions::VT_POT_SCALE_INT16, static_cast(pot_scale_int16), 1); + } + explicit SubOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSubOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + bool pot_scale_int16 = true) { + SubOptionsBuilder builder_(_fbb); + builder_.add_pot_scale_int16(pot_scale_int16); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSubOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DivOptionsT : public ::flatbuffers::NativeTable { + typedef DivOptions TableType; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; +}; + +struct DivOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DivOptionsT NativeTableType; + typedef DivOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FUSED_ACTIVATION_FUNCTION = 4 + }; + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + verifier.EndTable(); + } + DivOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DivOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset 
Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DivOptionsBuilder { + typedef DivOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_fused_activation_function(tflite::ActivationFunctionType fused_activation_function) { + fbb_.AddElement(DivOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + explicit DivOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDivOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE) { + DivOptionsBuilder builder_(_fbb); + builder_.add_fused_activation_function(fused_activation_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDivOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TopKV2OptionsT : public ::flatbuffers::NativeTable { + typedef TopKV2Options TableType; +}; + +struct TopKV2Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TopKV2OptionsT NativeTableType; + typedef TopKV2OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + TopKV2OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TopKV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TopKV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TopKV2OptionsBuilder { + typedef TopKV2Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit TopKV2OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTopKV2Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + TopKV2OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateTopKV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const TopKV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct EmbeddingLookupSparseOptionsT : public ::flatbuffers::NativeTable { + typedef EmbeddingLookupSparseOptions TableType; + tflite::CombinerType combiner = tflite::CombinerType_SUM; +}; + +struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef EmbeddingLookupSparseOptionsT NativeTableType; + typedef EmbeddingLookupSparseOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COMBINER = 4 + }; + tflite::CombinerType combiner() const { + return static_cast(GetField(VT_COMBINER, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COMBINER, 1) && + verifier.EndTable(); + } + EmbeddingLookupSparseOptionsT *UnPack(const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(EmbeddingLookupSparseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct EmbeddingLookupSparseOptionsBuilder { + typedef EmbeddingLookupSparseOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_combiner(tflite::CombinerType combiner) { + fbb_.AddElement(EmbeddingLookupSparseOptions::VT_COMBINER, static_cast(combiner), 0); + } + explicit EmbeddingLookupSparseOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateEmbeddingLookupSparseOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::CombinerType combiner = tflite::CombinerType_SUM) { + EmbeddingLookupSparseOptionsBuilder builder_(_fbb); + builder_.add_combiner(combiner); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateEmbeddingLookupSparseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct GatherOptionsT : public ::flatbuffers::NativeTable { + typedef GatherOptions TableType; + int32_t axis = 0; + int32_t batch_dims = 0; +}; + +struct GatherOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GatherOptionsT NativeTableType; + typedef GatherOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_AXIS = 4, + VT_BATCH_DIMS = 6 + }; + int32_t axis() const { + return GetField(VT_AXIS, 0); + } + int32_t batch_dims() const { + return GetField(VT_BATCH_DIMS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_AXIS, 4) && + VerifyField(verifier, VT_BATCH_DIMS, 4) && + verifier.EndTable(); + } + GatherOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GatherOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GatherOptionsBuilder { + typedef GatherOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_axis(int32_t axis) { + fbb_.AddElement(GatherOptions::VT_AXIS, axis, 0); + } + void add_batch_dims(int32_t batch_dims) { + fbb_.AddElement(GatherOptions::VT_BATCH_DIMS, batch_dims, 0); + } + explicit GatherOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGatherOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t axis = 0, + int32_t batch_dims = 0) { + GatherOptionsBuilder builder_(_fbb); + builder_.add_batch_dims(batch_dims); + builder_.add_axis(axis); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateGatherOptions(::flatbuffers::FlatBufferBuilder &_fbb, 
const GatherOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TransposeOptionsT : public ::flatbuffers::NativeTable { + typedef TransposeOptions TableType; +}; + +struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TransposeOptionsT NativeTableType; + typedef TransposeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + TransposeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TransposeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TransposeOptionsBuilder { + typedef TransposeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit TransposeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTransposeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + TransposeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateTransposeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ExpOptionsT : public ::flatbuffers::NativeTable { + typedef ExpOptions TableType; +}; + +struct ExpOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExpOptionsT NativeTableType; + typedef ExpOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ExpOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExpOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExpOptionsBuilder { + typedef ExpOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ExpOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExpOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ExpOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateExpOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct CosOptionsT : public ::flatbuffers::NativeTable { + typedef CosOptions TableType; +}; + +struct CosOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef CosOptionsT NativeTableType; + typedef CosOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + CosOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) 
const; + void UnPackTo(CosOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct CosOptionsBuilder { + typedef CosOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit CosOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateCosOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + CosOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateCosOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReducerOptionsT : public ::flatbuffers::NativeTable { + typedef ReducerOptions TableType; + bool keep_dims = false; +}; + +struct ReducerOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReducerOptionsT NativeTableType; + typedef ReducerOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_KEEP_DIMS = 4 + }; + bool keep_dims() const { + return GetField(VT_KEEP_DIMS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_KEEP_DIMS, 1) && + verifier.EndTable(); + } + ReducerOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ReducerOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReducerOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReducerOptionsBuilder { + typedef ReducerOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_keep_dims(bool keep_dims) { + fbb_.AddElement(ReducerOptions::VT_KEEP_DIMS, static_cast(keep_dims), 0); + } + explicit ReducerOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReducerOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool keep_dims = false) { + ReducerOptionsBuilder builder_(_fbb); + builder_.add_keep_dims(keep_dims); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateReducerOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReducerOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SqueezeOptionsT : public ::flatbuffers::NativeTable { + typedef SqueezeOptions TableType; + std::vector squeeze_dims{}; +}; + +struct SqueezeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SqueezeOptionsT NativeTableType; + typedef SqueezeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SQUEEZE_DIMS = 4 + }; + const ::flatbuffers::Vector *squeeze_dims() const { + return GetPointer *>(VT_SQUEEZE_DIMS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, 
VT_SQUEEZE_DIMS) && + verifier.VerifyVector(squeeze_dims()) && + verifier.EndTable(); + } + SqueezeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SqueezeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SqueezeOptionsBuilder { + typedef SqueezeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_squeeze_dims(::flatbuffers::Offset<::flatbuffers::Vector> squeeze_dims) { + fbb_.AddOffset(SqueezeOptions::VT_SQUEEZE_DIMS, squeeze_dims); + } + explicit SqueezeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSqueezeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> squeeze_dims = 0) { + SqueezeOptionsBuilder builder_(_fbb); + builder_.add_squeeze_dims(squeeze_dims); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSqueezeOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *squeeze_dims = nullptr) { + auto squeeze_dims__ = squeeze_dims ? _fbb.CreateVector(*squeeze_dims) : 0; + return tflite::CreateSqueezeOptions( + _fbb, + squeeze_dims__); +} + +::flatbuffers::Offset CreateSqueezeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SplitOptionsT : public ::flatbuffers::NativeTable { + typedef SplitOptions TableType; + int32_t num_splits = 0; +}; + +struct SplitOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SplitOptionsT NativeTableType; + typedef SplitOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NUM_SPLITS = 4 + }; + int32_t num_splits() const { + return GetField(VT_NUM_SPLITS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NUM_SPLITS, 4) && + verifier.EndTable(); + } + SplitOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SplitOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SplitOptionsBuilder { + typedef SplitOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_num_splits(int32_t num_splits) { + fbb_.AddElement(SplitOptions::VT_NUM_SPLITS, num_splits, 0); + } + explicit SplitOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSplitOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t num_splits = 0) { + SplitOptionsBuilder builder_(_fbb); + builder_.add_num_splits(num_splits); + return builder_.Finish(); +} + +::flatbuffers::Offset 
CreateSplitOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SplitVOptionsT : public ::flatbuffers::NativeTable { + typedef SplitVOptions TableType; + int32_t num_splits = 0; +}; + +struct SplitVOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SplitVOptionsT NativeTableType; + typedef SplitVOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NUM_SPLITS = 4 + }; + int32_t num_splits() const { + return GetField(VT_NUM_SPLITS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NUM_SPLITS, 4) && + verifier.EndTable(); + } + SplitVOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SplitVOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SplitVOptionsBuilder { + typedef SplitVOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_num_splits(int32_t num_splits) { + fbb_.AddElement(SplitVOptions::VT_NUM_SPLITS, num_splits, 0); + } + explicit SplitVOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSplitVOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t num_splits = 0) { + SplitVOptionsBuilder builder_(_fbb); + builder_.add_num_splits(num_splits); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSplitVOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StridedSliceOptionsT : public ::flatbuffers::NativeTable { + typedef StridedSliceOptions TableType; + int32_t begin_mask = 0; + int32_t end_mask = 0; + int32_t ellipsis_mask = 0; + int32_t new_axis_mask = 0; + int32_t shrink_axis_mask = 0; + bool offset = false; +}; + +struct StridedSliceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StridedSliceOptionsT NativeTableType; + typedef StridedSliceOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BEGIN_MASK = 4, + VT_END_MASK = 6, + VT_ELLIPSIS_MASK = 8, + VT_NEW_AXIS_MASK = 10, + VT_SHRINK_AXIS_MASK = 12, + VT_OFFSET = 14 + }; + int32_t begin_mask() const { + return GetField(VT_BEGIN_MASK, 0); + } + int32_t end_mask() const { + return GetField(VT_END_MASK, 0); + } + int32_t ellipsis_mask() const { + return GetField(VT_ELLIPSIS_MASK, 0); + } + int32_t new_axis_mask() const { + return GetField(VT_NEW_AXIS_MASK, 0); + } + int32_t shrink_axis_mask() const { + return GetField(VT_SHRINK_AXIS_MASK, 0); + } + bool offset() const { + return GetField(VT_OFFSET, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_BEGIN_MASK, 4) && + VerifyField(verifier, VT_END_MASK, 4) && + VerifyField(verifier, VT_ELLIPSIS_MASK, 4) && + VerifyField(verifier, VT_NEW_AXIS_MASK, 4) && + VerifyField(verifier, VT_SHRINK_AXIS_MASK, 4) && + VerifyField(verifier, VT_OFFSET, 1) && + 
verifier.EndTable(); + } + StridedSliceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StridedSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StridedSliceOptionsBuilder { + typedef StridedSliceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_begin_mask(int32_t begin_mask) { + fbb_.AddElement(StridedSliceOptions::VT_BEGIN_MASK, begin_mask, 0); + } + void add_end_mask(int32_t end_mask) { + fbb_.AddElement(StridedSliceOptions::VT_END_MASK, end_mask, 0); + } + void add_ellipsis_mask(int32_t ellipsis_mask) { + fbb_.AddElement(StridedSliceOptions::VT_ELLIPSIS_MASK, ellipsis_mask, 0); + } + void add_new_axis_mask(int32_t new_axis_mask) { + fbb_.AddElement(StridedSliceOptions::VT_NEW_AXIS_MASK, new_axis_mask, 0); + } + void add_shrink_axis_mask(int32_t shrink_axis_mask) { + fbb_.AddElement(StridedSliceOptions::VT_SHRINK_AXIS_MASK, shrink_axis_mask, 0); + } + void add_offset(bool offset) { + fbb_.AddElement(StridedSliceOptions::VT_OFFSET, static_cast(offset), 0); + } + explicit StridedSliceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStridedSliceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t ellipsis_mask = 0, + int32_t new_axis_mask = 0, + int32_t shrink_axis_mask = 0, + bool offset = false) { + StridedSliceOptionsBuilder builder_(_fbb); + builder_.add_shrink_axis_mask(shrink_axis_mask); + builder_.add_new_axis_mask(new_axis_mask); + builder_.add_ellipsis_mask(ellipsis_mask); + builder_.add_end_mask(end_mask); + builder_.add_begin_mask(begin_mask); + builder_.add_offset(offset); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStridedSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LogSoftmaxOptionsT : public ::flatbuffers::NativeTable { + typedef LogSoftmaxOptions TableType; +}; + +struct LogSoftmaxOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LogSoftmaxOptionsT NativeTableType; + typedef LogSoftmaxOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogSoftmaxOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogSoftmaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogSoftmaxOptionsBuilder { + typedef LogSoftmaxOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LogSoftmaxOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + 
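The declarations in this hunk follow the standard flatc-generated pattern: a `...T` native table, a `FLATBUFFERS_FINAL_CLASS` accessor table with `Verify`, a `...Builder`, and an inline `Create...` helper. As a minimal usage sketch (not part of the patch): the example below builds a `StridedSliceOptions` table with the generated `Create` helper and reads it back through the accessor API. The include path `schema_generated.h` and the `main` wrapper are assumptions for illustration; only FlatBuffers calls that are part of the public library API (`FlatBufferBuilder`, `Finish`, `GetBufferPointer`, `GetRoot`) are used.

```cpp
// Usage sketch, assuming the generated header from this patch is available as
// "schema_generated.h" (path assumed) and flatbuffers headers are on the include path.
#include <iostream>

#include "flatbuffers/flatbuffers.h"
#include "schema_generated.h"  // assumed include path for this generated file

int main() {
  flatbuffers::FlatBufferBuilder fbb;

  // The inline CreateStridedSliceOptions helper wraps StridedSliceOptionsBuilder
  // (declared above) and returns an offset into the buffer being built.
  auto opts = tflite::CreateStridedSliceOptions(
      fbb, /*begin_mask=*/1, /*end_mask=*/2, /*ellipsis_mask=*/0,
      /*new_axis_mask=*/0, /*shrink_axis_mask=*/4, /*offset=*/false);
  fbb.Finish(opts);

  // Read the finished buffer back through the generated accessor table.
  const auto *read =
      flatbuffers::GetRoot<tflite::StridedSliceOptions>(fbb.GetBufferPointer());
  std::cout << read->begin_mask() << " " << read->shrink_axis_mask() << "\n";
  return 0;
}
```

The same pattern applies to every option table in this hunk; the `Pack`/`UnPack` members declared alongside each table provide the equivalent round trip through the `...T` native objects when the object API is preferred over raw accessors.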
+inline ::flatbuffers::Offset CreateLogSoftmaxOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LogSoftmaxOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLogSoftmaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct CastOptionsT : public ::flatbuffers::NativeTable { + typedef CastOptions TableType; + tflite::TensorType in_data_type = tflite::TensorType_FLOAT32; + tflite::TensorType out_data_type = tflite::TensorType_FLOAT32; +}; + +struct CastOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef CastOptionsT NativeTableType; + typedef CastOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_IN_DATA_TYPE = 4, + VT_OUT_DATA_TYPE = 6 + }; + tflite::TensorType in_data_type() const { + return static_cast(GetField(VT_IN_DATA_TYPE, 0)); + } + tflite::TensorType out_data_type() const { + return static_cast(GetField(VT_OUT_DATA_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_IN_DATA_TYPE, 1) && + VerifyField(verifier, VT_OUT_DATA_TYPE, 1) && + verifier.EndTable(); + } + CastOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(CastOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct CastOptionsBuilder { + typedef CastOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_in_data_type(tflite::TensorType in_data_type) { + fbb_.AddElement(CastOptions::VT_IN_DATA_TYPE, static_cast(in_data_type), 0); + } + void add_out_data_type(tflite::TensorType out_data_type) { + fbb_.AddElement(CastOptions::VT_OUT_DATA_TYPE, static_cast(out_data_type), 0); + } + explicit CastOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateCastOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::TensorType in_data_type = tflite::TensorType_FLOAT32, + tflite::TensorType out_data_type = tflite::TensorType_FLOAT32) { + CastOptionsBuilder builder_(_fbb); + builder_.add_out_data_type(out_data_type); + builder_.add_in_data_type(in_data_type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateCastOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DequantizeOptionsT : public ::flatbuffers::NativeTable { + typedef DequantizeOptions TableType; +}; + +struct DequantizeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DequantizeOptionsT NativeTableType; + typedef DequantizeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DequantizeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DequantizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset 
Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DequantizeOptionsBuilder { + typedef DequantizeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit DequantizeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDequantizeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + DequantizeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDequantizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MaximumMinimumOptionsT : public ::flatbuffers::NativeTable { + typedef MaximumMinimumOptions TableType; +}; + +struct MaximumMinimumOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MaximumMinimumOptionsT NativeTableType; + typedef MaximumMinimumOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + MaximumMinimumOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MaximumMinimumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MaximumMinimumOptionsBuilder { + typedef MaximumMinimumOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit MaximumMinimumOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateMaximumMinimumOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + MaximumMinimumOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateMaximumMinimumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TileOptionsT : public ::flatbuffers::NativeTable { + typedef TileOptions TableType; +}; + +struct TileOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TileOptionsT NativeTableType; + typedef TileOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + TileOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TileOptionsBuilder { + typedef TileOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit TileOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + 
::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTileOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + TileOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateTileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ArgMaxOptionsT : public ::flatbuffers::NativeTable { + typedef ArgMaxOptions TableType; + tflite::TensorType output_type = tflite::TensorType_FLOAT32; +}; + +struct ArgMaxOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ArgMaxOptionsT NativeTableType; + typedef ArgMaxOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_OUTPUT_TYPE = 4 + }; + tflite::TensorType output_type() const { + return static_cast(GetField(VT_OUTPUT_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_OUTPUT_TYPE, 1) && + verifier.EndTable(); + } + ArgMaxOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ArgMaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ArgMaxOptionsBuilder { + typedef ArgMaxOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_output_type(tflite::TensorType output_type) { + fbb_.AddElement(ArgMaxOptions::VT_OUTPUT_TYPE, static_cast(output_type), 0); + } + explicit ArgMaxOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateArgMaxOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::TensorType output_type = tflite::TensorType_FLOAT32) { + ArgMaxOptionsBuilder builder_(_fbb); + builder_.add_output_type(output_type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateArgMaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ArgMinOptionsT : public ::flatbuffers::NativeTable { + typedef ArgMinOptions TableType; + tflite::TensorType output_type = tflite::TensorType_FLOAT32; +}; + +struct ArgMinOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ArgMinOptionsT NativeTableType; + typedef ArgMinOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_OUTPUT_TYPE = 4 + }; + tflite::TensorType output_type() const { + return static_cast(GetField(VT_OUTPUT_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_OUTPUT_TYPE, 1) && + verifier.EndTable(); + } + ArgMinOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ArgMinOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMinOptionsT* _o, const ::flatbuffers::rehasher_function_t 
*_rehasher = nullptr); +}; + +struct ArgMinOptionsBuilder { + typedef ArgMinOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_output_type(tflite::TensorType output_type) { + fbb_.AddElement(ArgMinOptions::VT_OUTPUT_TYPE, static_cast(output_type), 0); + } + explicit ArgMinOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateArgMinOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::TensorType output_type = tflite::TensorType_FLOAT32) { + ArgMinOptionsBuilder builder_(_fbb); + builder_.add_output_type(output_type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateArgMinOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMinOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct GreaterOptionsT : public ::flatbuffers::NativeTable { + typedef GreaterOptions TableType; +}; + +struct GreaterOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GreaterOptionsT NativeTableType; + typedef GreaterOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + GreaterOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GreaterOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GreaterOptionsBuilder { + typedef GreaterOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit GreaterOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGreaterOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + GreaterOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateGreaterOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct GreaterEqualOptionsT : public ::flatbuffers::NativeTable { + typedef GreaterEqualOptions TableType; +}; + +struct GreaterEqualOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GreaterEqualOptionsT NativeTableType; + typedef GreaterEqualOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + GreaterEqualOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GreaterEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GreaterEqualOptionsBuilder { + typedef GreaterEqualOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit 
GreaterEqualOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGreaterEqualOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + GreaterEqualOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateGreaterEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LessOptionsT : public ::flatbuffers::NativeTable { + typedef LessOptions TableType; +}; + +struct LessOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LessOptionsT NativeTableType; + typedef LessOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LessOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LessOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LessOptionsBuilder { + typedef LessOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LessOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLessOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LessOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLessOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LessEqualOptionsT : public ::flatbuffers::NativeTable { + typedef LessEqualOptions TableType; +}; + +struct LessEqualOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LessEqualOptionsT NativeTableType; + typedef LessEqualOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LessEqualOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LessEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LessEqualOptionsBuilder { + typedef LessEqualOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LessEqualOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLessEqualOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LessEqualOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLessEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const 
LessEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct NegOptionsT : public ::flatbuffers::NativeTable { + typedef NegOptions TableType; +}; + +struct NegOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef NegOptionsT NativeTableType; + typedef NegOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + NegOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(NegOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NegOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct NegOptionsBuilder { + typedef NegOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit NegOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateNegOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + NegOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateNegOptions(::flatbuffers::FlatBufferBuilder &_fbb, const NegOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SelectOptionsT : public ::flatbuffers::NativeTable { + typedef SelectOptions TableType; +}; + +struct SelectOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SelectOptionsT NativeTableType; + typedef SelectOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SelectOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SelectOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SelectOptionsBuilder { + typedef SelectOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SelectOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSelectOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SelectOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSelectOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SliceOptionsT : public ::flatbuffers::NativeTable { + typedef SliceOptions TableType; +}; + +struct SliceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SliceOptionsT NativeTableType; + typedef SliceOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SliceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SliceOptionsT 
*_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SliceOptionsBuilder { + typedef SliceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SliceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSliceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SliceOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TransposeConvOptionsT : public ::flatbuffers::NativeTable { + typedef TransposeConvOptions TableType; + tflite::Padding padding = tflite::Padding_SAME; + int32_t stride_w = 0; + int32_t stride_h = 0; + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE; + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32; +}; + +struct TransposeConvOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TransposeConvOptionsT NativeTableType; + typedef TransposeConvOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PADDING = 4, + VT_STRIDE_W = 6, + VT_STRIDE_H = 8, + VT_FUSED_ACTIVATION_FUNCTION = 10, + VT_QUANTIZED_BIAS_TYPE = 12 + }; + tflite::Padding padding() const { + return static_cast(GetField(VT_PADDING, 0)); + } + int32_t stride_w() const { + return GetField(VT_STRIDE_W, 0); + } + int32_t stride_h() const { + return GetField(VT_STRIDE_H, 0); + } + tflite::ActivationFunctionType fused_activation_function() const { + return static_cast(GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + tflite::TensorType quantized_bias_type() const { + return static_cast(GetField(VT_QUANTIZED_BIAS_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PADDING, 1) && + VerifyField(verifier, VT_STRIDE_W, 4) && + VerifyField(verifier, VT_STRIDE_H, 4) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) && + VerifyField(verifier, VT_QUANTIZED_BIAS_TYPE, 1) && + verifier.EndTable(); + } + TransposeConvOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TransposeConvOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TransposeConvOptionsBuilder { + typedef TransposeConvOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_padding(tflite::Padding padding) { + fbb_.AddElement(TransposeConvOptions::VT_PADDING, static_cast(padding), 0); + } + void add_stride_w(int32_t stride_w) { + fbb_.AddElement(TransposeConvOptions::VT_STRIDE_W, stride_w, 0); + } + void add_stride_h(int32_t stride_h) { + fbb_.AddElement(TransposeConvOptions::VT_STRIDE_H, stride_h, 0); + } + void add_fused_activation_function(tflite::ActivationFunctionType 
fused_activation_function) { + fbb_.AddElement(TransposeConvOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast(fused_activation_function), 0); + } + void add_quantized_bias_type(tflite::TensorType quantized_bias_type) { + fbb_.AddElement(TransposeConvOptions::VT_QUANTIZED_BIAS_TYPE, static_cast(quantized_bias_type), 0); + } + explicit TransposeConvOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTransposeConvOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::Padding padding = tflite::Padding_SAME, + int32_t stride_w = 0, + int32_t stride_h = 0, + tflite::ActivationFunctionType fused_activation_function = tflite::ActivationFunctionType_NONE, + tflite::TensorType quantized_bias_type = tflite::TensorType_FLOAT32) { + TransposeConvOptionsBuilder builder_(_fbb); + builder_.add_stride_h(stride_h); + builder_.add_stride_w(stride_w); + builder_.add_quantized_bias_type(quantized_bias_type); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_padding(padding); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateTransposeConvOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ExpandDimsOptionsT : public ::flatbuffers::NativeTable { + typedef ExpandDimsOptions TableType; +}; + +struct ExpandDimsOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExpandDimsOptionsT NativeTableType; + typedef ExpandDimsOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ExpandDimsOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExpandDimsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExpandDimsOptionsBuilder { + typedef ExpandDimsOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ExpandDimsOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExpandDimsOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ExpandDimsOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateExpandDimsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SparseToDenseOptionsT : public ::flatbuffers::NativeTable { + typedef SparseToDenseOptions TableType; + bool validate_indices = false; +}; + +struct SparseToDenseOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SparseToDenseOptionsT NativeTableType; + typedef SparseToDenseOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALIDATE_INDICES = 4 + }; + bool validate_indices() const { + return GetField(VT_VALIDATE_INDICES, 0) != 0; + } + bool 
Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_VALIDATE_INDICES, 1) && + verifier.EndTable(); + } + SparseToDenseOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SparseToDenseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SparseToDenseOptionsBuilder { + typedef SparseToDenseOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_validate_indices(bool validate_indices) { + fbb_.AddElement(SparseToDenseOptions::VT_VALIDATE_INDICES, static_cast(validate_indices), 0); + } + explicit SparseToDenseOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSparseToDenseOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool validate_indices = false) { + SparseToDenseOptionsBuilder builder_(_fbb); + builder_.add_validate_indices(validate_indices); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSparseToDenseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct EqualOptionsT : public ::flatbuffers::NativeTable { + typedef EqualOptions TableType; +}; + +struct EqualOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef EqualOptionsT NativeTableType; + typedef EqualOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + EqualOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(EqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct EqualOptionsBuilder { + typedef EqualOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit EqualOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateEqualOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + EqualOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct NotEqualOptionsT : public ::flatbuffers::NativeTable { + typedef NotEqualOptions TableType; +}; + +struct NotEqualOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef NotEqualOptionsT NativeTableType; + typedef NotEqualOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + NotEqualOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = 
nullptr) const; + void UnPackTo(NotEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct NotEqualOptionsBuilder { + typedef NotEqualOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit NotEqualOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateNotEqualOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + NotEqualOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateNotEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ShapeOptionsT : public ::flatbuffers::NativeTable { + typedef ShapeOptions TableType; + tflite::TensorType out_type = tflite::TensorType_FLOAT32; +}; + +struct ShapeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ShapeOptionsT NativeTableType; + typedef ShapeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_OUT_TYPE = 4 + }; + tflite::TensorType out_type() const { + return static_cast(GetField(VT_OUT_TYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_OUT_TYPE, 1) && + verifier.EndTable(); + } + ShapeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ShapeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ShapeOptionsBuilder { + typedef ShapeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_out_type(tflite::TensorType out_type) { + fbb_.AddElement(ShapeOptions::VT_OUT_TYPE, static_cast(out_type), 0); + } + explicit ShapeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateShapeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::TensorType out_type = tflite::TensorType_FLOAT32) { + ShapeOptionsBuilder builder_(_fbb); + builder_.add_out_type(out_type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateShapeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RankOptionsT : public ::flatbuffers::NativeTable { + typedef RankOptions TableType; +}; + +struct RankOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RankOptionsT NativeTableType; + typedef RankOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + RankOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RankOptionsT *_o, 
const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RankOptionsBuilder { + typedef RankOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit RankOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRankOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + RankOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRankOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct PowOptionsT : public ::flatbuffers::NativeTable { + typedef PowOptions TableType; +}; + +struct PowOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef PowOptionsT NativeTableType; + typedef PowOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + PowOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(PowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct PowOptionsBuilder { + typedef PowOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit PowOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreatePowOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + PowOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreatePowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct FakeQuantOptionsT : public ::flatbuffers::NativeTable { + typedef FakeQuantOptions TableType; + float min = 0.0f; + float max = 0.0f; + int32_t num_bits = 0; + bool narrow_range = false; +}; + +struct FakeQuantOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FakeQuantOptionsT NativeTableType; + typedef FakeQuantOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MIN = 4, + VT_MAX = 6, + VT_NUM_BITS = 8, + VT_NARROW_RANGE = 10 + }; + float min() const { + return GetField(VT_MIN, 0.0f); + } + float max() const { + return GetField(VT_MAX, 0.0f); + } + int32_t num_bits() const { + return GetField(VT_NUM_BITS, 0); + } + bool narrow_range() const { + return GetField(VT_NARROW_RANGE, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_MIN, 4) && + VerifyField(verifier, VT_MAX, 4) && + VerifyField(verifier, VT_NUM_BITS, 4) && + VerifyField(verifier, VT_NARROW_RANGE, 1) && + verifier.EndTable(); + } + FakeQuantOptionsT *UnPack(const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(FakeQuantOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FakeQuantOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct FakeQuantOptionsBuilder { + typedef FakeQuantOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_min(float min) { + fbb_.AddElement(FakeQuantOptions::VT_MIN, min, 0.0f); + } + void add_max(float max) { + fbb_.AddElement(FakeQuantOptions::VT_MAX, max, 0.0f); + } + void add_num_bits(int32_t num_bits) { + fbb_.AddElement(FakeQuantOptions::VT_NUM_BITS, num_bits, 0); + } + void add_narrow_range(bool narrow_range) { + fbb_.AddElement(FakeQuantOptions::VT_NARROW_RANGE, static_cast(narrow_range), 0); + } + explicit FakeQuantOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateFakeQuantOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + float min = 0.0f, + float max = 0.0f, + int32_t num_bits = 0, + bool narrow_range = false) { + FakeQuantOptionsBuilder builder_(_fbb); + builder_.add_num_bits(num_bits); + builder_.add_max(max); + builder_.add_min(min); + builder_.add_narrow_range(narrow_range); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateFakeQuantOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FakeQuantOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct PackOptionsT : public ::flatbuffers::NativeTable { + typedef PackOptions TableType; + int32_t values_count = 0; + int32_t axis = 0; +}; + +struct PackOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef PackOptionsT NativeTableType; + typedef PackOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES_COUNT = 4, + VT_AXIS = 6 + }; + int32_t values_count() const { + return GetField(VT_VALUES_COUNT, 0); + } + int32_t axis() const { + return GetField(VT_AXIS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_VALUES_COUNT, 4) && + VerifyField(verifier, VT_AXIS, 4) && + verifier.EndTable(); + } + PackOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(PackOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct PackOptionsBuilder { + typedef PackOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_values_count(int32_t values_count) { + fbb_.AddElement(PackOptions::VT_VALUES_COUNT, values_count, 0); + } + void add_axis(int32_t axis) { + fbb_.AddElement(PackOptions::VT_AXIS, axis, 0); + } + explicit PackOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreatePackOptions( + ::flatbuffers::FlatBufferBuilder 
&_fbb, + int32_t values_count = 0, + int32_t axis = 0) { + PackOptionsBuilder builder_(_fbb); + builder_.add_axis(axis); + builder_.add_values_count(values_count); + return builder_.Finish(); +} + +::flatbuffers::Offset CreatePackOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LogicalOrOptionsT : public ::flatbuffers::NativeTable { + typedef LogicalOrOptions TableType; +}; + +struct LogicalOrOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LogicalOrOptionsT NativeTableType; + typedef LogicalOrOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogicalOrOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogicalOrOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogicalOrOptionsBuilder { + typedef LogicalOrOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LogicalOrOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLogicalOrOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LogicalOrOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLogicalOrOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct OneHotOptionsT : public ::flatbuffers::NativeTable { + typedef OneHotOptions TableType; + int32_t axis = 0; +}; + +struct OneHotOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef OneHotOptionsT NativeTableType; + typedef OneHotOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_AXIS = 4 + }; + int32_t axis() const { + return GetField(VT_AXIS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_AXIS, 4) && + verifier.EndTable(); + } + OneHotOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(OneHotOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct OneHotOptionsBuilder { + typedef OneHotOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_axis(int32_t axis) { + fbb_.AddElement(OneHotOptions::VT_AXIS, axis, 0); + } + explicit OneHotOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateOneHotOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t axis = 0) { + OneHotOptionsBuilder builder_(_fbb); + builder_.add_axis(axis); + 
return builder_.Finish(); +} + +::flatbuffers::Offset CreateOneHotOptions(::flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct AbsOptionsT : public ::flatbuffers::NativeTable { + typedef AbsOptions TableType; +}; + +struct AbsOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef AbsOptionsT NativeTableType; + typedef AbsOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + AbsOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(AbsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct AbsOptionsBuilder { + typedef AbsOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit AbsOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateAbsOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + AbsOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateAbsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct HardSwishOptionsT : public ::flatbuffers::NativeTable { + typedef HardSwishOptions TableType; +}; + +struct HardSwishOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef HardSwishOptionsT NativeTableType; + typedef HardSwishOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + HardSwishOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(HardSwishOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HardSwishOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct HardSwishOptionsBuilder { + typedef HardSwishOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit HardSwishOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateHardSwishOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + HardSwishOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateHardSwishOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HardSwishOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LogicalAndOptionsT : public ::flatbuffers::NativeTable { + typedef LogicalAndOptions TableType; +}; + +struct LogicalAndOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LogicalAndOptionsT NativeTableType; + typedef LogicalAndOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + 
return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogicalAndOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogicalAndOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogicalAndOptionsBuilder { + typedef LogicalAndOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LogicalAndOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLogicalAndOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LogicalAndOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLogicalAndOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LogicalNotOptionsT : public ::flatbuffers::NativeTable { + typedef LogicalNotOptions TableType; +}; + +struct LogicalNotOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef LogicalNotOptionsT NativeTableType; + typedef LogicalNotOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogicalNotOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogicalNotOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogicalNotOptionsBuilder { + typedef LogicalNotOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit LogicalNotOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLogicalNotOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + LogicalNotOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLogicalNotOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnpackOptionsT : public ::flatbuffers::NativeTable { + typedef UnpackOptions TableType; + int32_t num = 0; + int32_t axis = 0; +}; + +struct UnpackOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnpackOptionsT NativeTableType; + typedef UnpackOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NUM = 4, + VT_AXIS = 6 + }; + int32_t num() const { + return GetField(VT_NUM, 0); + } + int32_t axis() const { + return GetField(VT_AXIS, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NUM, 4) && + VerifyField(verifier, VT_AXIS, 4) && + verifier.EndTable(); + } + UnpackOptionsT *UnPack(const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnpackOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnpackOptionsBuilder { + typedef UnpackOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_num(int32_t num) { + fbb_.AddElement(UnpackOptions::VT_NUM, num, 0); + } + void add_axis(int32_t axis) { + fbb_.AddElement(UnpackOptions::VT_AXIS, axis, 0); + } + explicit UnpackOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnpackOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t num = 0, + int32_t axis = 0) { + UnpackOptionsBuilder builder_(_fbb); + builder_.add_axis(axis); + builder_.add_num(num); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnpackOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct FloorDivOptionsT : public ::flatbuffers::NativeTable { + typedef FloorDivOptions TableType; +}; + +struct FloorDivOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FloorDivOptionsT NativeTableType; + typedef FloorDivOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + FloorDivOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(FloorDivOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct FloorDivOptionsBuilder { + typedef FloorDivOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit FloorDivOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateFloorDivOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + FloorDivOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateFloorDivOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SquareOptionsT : public ::flatbuffers::NativeTable { + typedef SquareOptions TableType; +}; + +struct SquareOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SquareOptionsT NativeTableType; + typedef SquareOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SquareOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SquareOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const 
SquareOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SquareOptionsBuilder { + typedef SquareOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SquareOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSquareOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SquareOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSquareOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SquareOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ZerosLikeOptionsT : public ::flatbuffers::NativeTable { + typedef ZerosLikeOptions TableType; +}; + +struct ZerosLikeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ZerosLikeOptionsT NativeTableType; + typedef ZerosLikeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ZerosLikeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ZerosLikeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ZerosLikeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ZerosLikeOptionsBuilder { + typedef ZerosLikeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ZerosLikeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateZerosLikeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ZerosLikeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateZerosLikeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ZerosLikeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct FillOptionsT : public ::flatbuffers::NativeTable { + typedef FillOptions TableType; +}; + +struct FillOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FillOptionsT NativeTableType; + typedef FillOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + FillOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(FillOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct FillOptionsBuilder { + typedef FillOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit FillOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset 
CreateFillOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + FillOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateFillOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct FloorModOptionsT : public ::flatbuffers::NativeTable { + typedef FloorModOptions TableType; +}; + +struct FloorModOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FloorModOptionsT NativeTableType; + typedef FloorModOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + FloorModOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(FloorModOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct FloorModOptionsBuilder { + typedef FloorModOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit FloorModOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateFloorModOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + FloorModOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateFloorModOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RangeOptionsT : public ::flatbuffers::NativeTable { + typedef RangeOptions TableType; +}; + +struct RangeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RangeOptionsT NativeTableType; + typedef RangeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + RangeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RangeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RangeOptionsBuilder { + typedef RangeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit RangeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRangeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + RangeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRangeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LeakyReluOptionsT : public ::flatbuffers::NativeTable { + typedef LeakyReluOptions TableType; + float alpha = 0.0f; +}; + +struct LeakyReluOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef 
LeakyReluOptionsT NativeTableType; + typedef LeakyReluOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ALPHA = 4 + }; + float alpha() const { + return GetField(VT_ALPHA, 0.0f); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ALPHA, 4) && + verifier.EndTable(); + } + LeakyReluOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LeakyReluOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LeakyReluOptionsBuilder { + typedef LeakyReluOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_alpha(float alpha) { + fbb_.AddElement(LeakyReluOptions::VT_ALPHA, alpha, 0.0f); + } + explicit LeakyReluOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateLeakyReluOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + float alpha = 0.0f) { + LeakyReluOptionsBuilder builder_(_fbb); + builder_.add_alpha(alpha); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateLeakyReluOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SquaredDifferenceOptionsT : public ::flatbuffers::NativeTable { + typedef SquaredDifferenceOptions TableType; +}; + +struct SquaredDifferenceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SquaredDifferenceOptionsT NativeTableType; + typedef SquaredDifferenceOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SquaredDifferenceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SquaredDifferenceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SquaredDifferenceOptionsBuilder { + typedef SquaredDifferenceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SquaredDifferenceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSquaredDifferenceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SquaredDifferenceOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSquaredDifferenceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MirrorPadOptionsT : public ::flatbuffers::NativeTable { + typedef MirrorPadOptions TableType; + tflite::MirrorPadMode mode = tflite::MirrorPadMode_REFLECT; +}; + +struct MirrorPadOptions 
FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MirrorPadOptionsT NativeTableType; + typedef MirrorPadOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MODE = 4 + }; + tflite::MirrorPadMode mode() const { + return static_cast(GetField(VT_MODE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_MODE, 1) && + verifier.EndTable(); + } + MirrorPadOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MirrorPadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MirrorPadOptionsBuilder { + typedef MirrorPadOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_mode(tflite::MirrorPadMode mode) { + fbb_.AddElement(MirrorPadOptions::VT_MODE, static_cast(mode), 0); + } + explicit MirrorPadOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateMirrorPadOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::MirrorPadMode mode = tflite::MirrorPadMode_REFLECT) { + MirrorPadOptionsBuilder builder_(_fbb); + builder_.add_mode(mode); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateMirrorPadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UniqueOptionsT : public ::flatbuffers::NativeTable { + typedef UniqueOptions TableType; + tflite::TensorType idx_out_type = tflite::TensorType_INT32; +}; + +struct UniqueOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UniqueOptionsT NativeTableType; + typedef UniqueOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_IDX_OUT_TYPE = 4 + }; + tflite::TensorType idx_out_type() const { + return static_cast(GetField(VT_IDX_OUT_TYPE, 2)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_IDX_OUT_TYPE, 1) && + verifier.EndTable(); + } + UniqueOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UniqueOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UniqueOptionsBuilder { + typedef UniqueOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_idx_out_type(tflite::TensorType idx_out_type) { + fbb_.AddElement(UniqueOptions::VT_IDX_OUT_TYPE, static_cast(idx_out_type), 2); + } + explicit UniqueOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUniqueOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::TensorType 
idx_out_type = tflite::TensorType_INT32) { + UniqueOptionsBuilder builder_(_fbb); + builder_.add_idx_out_type(idx_out_type); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUniqueOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReverseV2OptionsT : public ::flatbuffers::NativeTable { + typedef ReverseV2Options TableType; +}; + +struct ReverseV2Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReverseV2OptionsT NativeTableType; + typedef ReverseV2OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ReverseV2OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ReverseV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReverseV2OptionsBuilder { + typedef ReverseV2Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ReverseV2OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReverseV2Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ReverseV2OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateReverseV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct AddNOptionsT : public ::flatbuffers::NativeTable { + typedef AddNOptions TableType; +}; + +struct AddNOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef AddNOptionsT NativeTableType; + typedef AddNOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + AddNOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(AddNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct AddNOptionsBuilder { + typedef AddNOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit AddNOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateAddNOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + AddNOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateAddNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct GatherNdOptionsT : public ::flatbuffers::NativeTable { + typedef GatherNdOptions TableType; +}; + +struct GatherNdOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef 
GatherNdOptionsT NativeTableType; + typedef GatherNdOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + GatherNdOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GatherNdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GatherNdOptionsBuilder { + typedef GatherNdOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit GatherNdOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGatherNdOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + GatherNdOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateGatherNdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct WhereOptionsT : public ::flatbuffers::NativeTable { + typedef WhereOptions TableType; +}; + +struct WhereOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef WhereOptionsT NativeTableType; + typedef WhereOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + WhereOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(WhereOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct WhereOptionsBuilder { + typedef WhereOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit WhereOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateWhereOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + WhereOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateWhereOptions(::flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReverseSequenceOptionsT : public ::flatbuffers::NativeTable { + typedef ReverseSequenceOptions TableType; + int32_t seq_dim = 0; + int32_t batch_dim = 0; +}; + +struct ReverseSequenceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReverseSequenceOptionsT NativeTableType; + typedef ReverseSequenceOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SEQ_DIM = 4, + VT_BATCH_DIM = 6 + }; + int32_t seq_dim() const { + return GetField(VT_SEQ_DIM, 0); + } + int32_t batch_dim() const { + return GetField(VT_BATCH_DIM, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_SEQ_DIM, 
4) && + VerifyField(verifier, VT_BATCH_DIM, 4) && + verifier.EndTable(); + } + ReverseSequenceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ReverseSequenceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReverseSequenceOptionsBuilder { + typedef ReverseSequenceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_seq_dim(int32_t seq_dim) { + fbb_.AddElement(ReverseSequenceOptions::VT_SEQ_DIM, seq_dim, 0); + } + void add_batch_dim(int32_t batch_dim) { + fbb_.AddElement(ReverseSequenceOptions::VT_BATCH_DIM, batch_dim, 0); + } + explicit ReverseSequenceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReverseSequenceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t seq_dim = 0, + int32_t batch_dim = 0) { + ReverseSequenceOptionsBuilder builder_(_fbb); + builder_.add_batch_dim(batch_dim); + builder_.add_seq_dim(seq_dim); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateReverseSequenceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MatrixDiagOptionsT : public ::flatbuffers::NativeTable { + typedef MatrixDiagOptions TableType; +}; + +struct MatrixDiagOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MatrixDiagOptionsT NativeTableType; + typedef MatrixDiagOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + MatrixDiagOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MatrixDiagOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixDiagOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MatrixDiagOptionsBuilder { + typedef MatrixDiagOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit MatrixDiagOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateMatrixDiagOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + MatrixDiagOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateMatrixDiagOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixDiagOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct QuantizeOptionsT : public ::flatbuffers::NativeTable { + typedef QuantizeOptions TableType; +}; + +struct QuantizeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef QuantizeOptionsT NativeTableType; + typedef QuantizeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) 
&& + verifier.EndTable(); + } + QuantizeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(QuantizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct QuantizeOptionsBuilder { + typedef QuantizeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit QuantizeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateQuantizeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + QuantizeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateQuantizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MatrixSetDiagOptionsT : public ::flatbuffers::NativeTable { + typedef MatrixSetDiagOptions TableType; +}; + +struct MatrixSetDiagOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MatrixSetDiagOptionsT NativeTableType; + typedef MatrixSetDiagOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + MatrixSetDiagOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MatrixSetDiagOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixSetDiagOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MatrixSetDiagOptionsBuilder { + typedef MatrixSetDiagOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit MatrixSetDiagOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateMatrixSetDiagOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + MatrixSetDiagOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateMatrixSetDiagOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixSetDiagOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct IfOptionsT : public ::flatbuffers::NativeTable { + typedef IfOptions TableType; + int32_t then_subgraph_index = 0; + int32_t else_subgraph_index = 0; +}; + +struct IfOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef IfOptionsT NativeTableType; + typedef IfOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_THEN_SUBGRAPH_INDEX = 4, + VT_ELSE_SUBGRAPH_INDEX = 6 + }; + int32_t then_subgraph_index() const { + return GetField(VT_THEN_SUBGRAPH_INDEX, 0); + } + int32_t else_subgraph_index() const { + return GetField(VT_ELSE_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_THEN_SUBGRAPH_INDEX, 4) && + 
VerifyField(verifier, VT_ELSE_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + IfOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(IfOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct IfOptionsBuilder { + typedef IfOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_then_subgraph_index(int32_t then_subgraph_index) { + fbb_.AddElement(IfOptions::VT_THEN_SUBGRAPH_INDEX, then_subgraph_index, 0); + } + void add_else_subgraph_index(int32_t else_subgraph_index) { + fbb_.AddElement(IfOptions::VT_ELSE_SUBGRAPH_INDEX, else_subgraph_index, 0); + } + explicit IfOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateIfOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t then_subgraph_index = 0, + int32_t else_subgraph_index = 0) { + IfOptionsBuilder builder_(_fbb); + builder_.add_else_subgraph_index(else_subgraph_index); + builder_.add_then_subgraph_index(then_subgraph_index); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateIfOptions(::flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct CallOnceOptionsT : public ::flatbuffers::NativeTable { + typedef CallOnceOptions TableType; + int32_t init_subgraph_index = 0; +}; + +struct CallOnceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef CallOnceOptionsT NativeTableType; + typedef CallOnceOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_INIT_SUBGRAPH_INDEX = 4 + }; + int32_t init_subgraph_index() const { + return GetField(VT_INIT_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_INIT_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + CallOnceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(CallOnceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CallOnceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct CallOnceOptionsBuilder { + typedef CallOnceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_init_subgraph_index(int32_t init_subgraph_index) { + fbb_.AddElement(CallOnceOptions::VT_INIT_SUBGRAPH_INDEX, init_subgraph_index, 0); + } + explicit CallOnceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateCallOnceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t init_subgraph_index = 0) { + CallOnceOptionsBuilder builder_(_fbb); + builder_.add_init_subgraph_index(init_subgraph_index); + return builder_.Finish(); +} + +::flatbuffers::Offset 
CreateCallOnceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CallOnceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct WhileOptionsT : public ::flatbuffers::NativeTable { + typedef WhileOptions TableType; + int32_t cond_subgraph_index = 0; + int32_t body_subgraph_index = 0; +}; + +struct WhileOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef WhileOptionsT NativeTableType; + typedef WhileOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COND_SUBGRAPH_INDEX = 4, + VT_BODY_SUBGRAPH_INDEX = 6 + }; + int32_t cond_subgraph_index() const { + return GetField(VT_COND_SUBGRAPH_INDEX, 0); + } + int32_t body_subgraph_index() const { + return GetField(VT_BODY_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COND_SUBGRAPH_INDEX, 4) && + VerifyField(verifier, VT_BODY_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + WhileOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(WhileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct WhileOptionsBuilder { + typedef WhileOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_cond_subgraph_index(int32_t cond_subgraph_index) { + fbb_.AddElement(WhileOptions::VT_COND_SUBGRAPH_INDEX, cond_subgraph_index, 0); + } + void add_body_subgraph_index(int32_t body_subgraph_index) { + fbb_.AddElement(WhileOptions::VT_BODY_SUBGRAPH_INDEX, body_subgraph_index, 0); + } + explicit WhileOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateWhileOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t cond_subgraph_index = 0, + int32_t body_subgraph_index = 0) { + WhileOptionsBuilder builder_(_fbb); + builder_.add_body_subgraph_index(body_subgraph_index); + builder_.add_cond_subgraph_index(cond_subgraph_index); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateWhileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct NonMaxSuppressionV4OptionsT : public ::flatbuffers::NativeTable { + typedef NonMaxSuppressionV4Options TableType; +}; + +struct NonMaxSuppressionV4Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef NonMaxSuppressionV4OptionsT NativeTableType; + typedef NonMaxSuppressionV4OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + NonMaxSuppressionV4OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(NonMaxSuppressionV4OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV4OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct NonMaxSuppressionV4OptionsBuilder { + typedef NonMaxSuppressionV4Options 
Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit NonMaxSuppressionV4OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateNonMaxSuppressionV4Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + NonMaxSuppressionV4OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateNonMaxSuppressionV4Options(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV4OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct NonMaxSuppressionV5OptionsT : public ::flatbuffers::NativeTable { + typedef NonMaxSuppressionV5Options TableType; +}; + +struct NonMaxSuppressionV5Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef NonMaxSuppressionV5OptionsT NativeTableType; + typedef NonMaxSuppressionV5OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + NonMaxSuppressionV5OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(NonMaxSuppressionV5OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV5OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct NonMaxSuppressionV5OptionsBuilder { + typedef NonMaxSuppressionV5Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit NonMaxSuppressionV5OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateNonMaxSuppressionV5Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + NonMaxSuppressionV5OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateNonMaxSuppressionV5Options(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV5OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ScatterNdOptionsT : public ::flatbuffers::NativeTable { + typedef ScatterNdOptions TableType; +}; + +struct ScatterNdOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ScatterNdOptionsT NativeTableType; + typedef ScatterNdOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ScatterNdOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ScatterNdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ScatterNdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ScatterNdOptionsBuilder { + typedef ScatterNdOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ScatterNdOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end 
= fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateScatterNdOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ScatterNdOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateScatterNdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ScatterNdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SelectV2OptionsT : public ::flatbuffers::NativeTable { + typedef SelectV2Options TableType; +}; + +struct SelectV2Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SelectV2OptionsT NativeTableType; + typedef SelectV2OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SelectV2OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SelectV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SelectV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SelectV2OptionsBuilder { + typedef SelectV2Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SelectV2OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSelectV2Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SelectV2OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSelectV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const SelectV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DensifyOptionsT : public ::flatbuffers::NativeTable { + typedef DensifyOptions TableType; +}; + +struct DensifyOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DensifyOptionsT NativeTableType; + typedef DensifyOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DensifyOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DensifyOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DensifyOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DensifyOptionsBuilder { + typedef DensifyOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit DensifyOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDensifyOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + DensifyOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDensifyOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DensifyOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SegmentSumOptionsT : public ::flatbuffers::NativeTable { + 
typedef SegmentSumOptions TableType; +}; + +struct SegmentSumOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SegmentSumOptionsT NativeTableType; + typedef SegmentSumOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SegmentSumOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SegmentSumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SegmentSumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SegmentSumOptionsBuilder { + typedef SegmentSumOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SegmentSumOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSegmentSumOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SegmentSumOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSegmentSumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SegmentSumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BatchMatMulOptionsT : public ::flatbuffers::NativeTable { + typedef BatchMatMulOptions TableType; + bool adj_x = false; + bool adj_y = false; + bool asymmetric_quantize_inputs = false; +}; + +struct BatchMatMulOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BatchMatMulOptionsT NativeTableType; + typedef BatchMatMulOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ADJ_X = 4, + VT_ADJ_Y = 6, + VT_ASYMMETRIC_QUANTIZE_INPUTS = 8 + }; + bool adj_x() const { + return GetField(VT_ADJ_X, 0) != 0; + } + bool adj_y() const { + return GetField(VT_ADJ_Y, 0) != 0; + } + bool asymmetric_quantize_inputs() const { + return GetField(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ADJ_X, 1) && + VerifyField(verifier, VT_ADJ_Y, 1) && + VerifyField(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) && + verifier.EndTable(); + } + BatchMatMulOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BatchMatMulOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BatchMatMulOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BatchMatMulOptionsBuilder { + typedef BatchMatMulOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_adj_x(bool adj_x) { + fbb_.AddElement(BatchMatMulOptions::VT_ADJ_X, static_cast(adj_x), 0); + } + void add_adj_y(bool adj_y) { + fbb_.AddElement(BatchMatMulOptions::VT_ADJ_Y, static_cast(adj_y), 0); + } + void add_asymmetric_quantize_inputs(bool asymmetric_quantize_inputs) { + fbb_.AddElement(BatchMatMulOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast(asymmetric_quantize_inputs), 0); + } + explicit BatchMatMulOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = 
fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBatchMatMulOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool adj_x = false, + bool adj_y = false, + bool asymmetric_quantize_inputs = false) { + BatchMatMulOptionsBuilder builder_(_fbb); + builder_.add_asymmetric_quantize_inputs(asymmetric_quantize_inputs); + builder_.add_adj_y(adj_y); + builder_.add_adj_x(adj_x); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBatchMatMulOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BatchMatMulOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct CumsumOptionsT : public ::flatbuffers::NativeTable { + typedef CumsumOptions TableType; + bool exclusive = false; + bool reverse = false; +}; + +struct CumsumOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef CumsumOptionsT NativeTableType; + typedef CumsumOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_EXCLUSIVE = 4, + VT_REVERSE = 6 + }; + bool exclusive() const { + return GetField(VT_EXCLUSIVE, 0) != 0; + } + bool reverse() const { + return GetField(VT_REVERSE, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_EXCLUSIVE, 1) && + VerifyField(verifier, VT_REVERSE, 1) && + verifier.EndTable(); + } + CumsumOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(CumsumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CumsumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct CumsumOptionsBuilder { + typedef CumsumOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_exclusive(bool exclusive) { + fbb_.AddElement(CumsumOptions::VT_EXCLUSIVE, static_cast(exclusive), 0); + } + void add_reverse(bool reverse) { + fbb_.AddElement(CumsumOptions::VT_REVERSE, static_cast(reverse), 0); + } + explicit CumsumOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateCumsumOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool exclusive = false, + bool reverse = false) { + CumsumOptionsBuilder builder_(_fbb); + builder_.add_reverse(reverse); + builder_.add_exclusive(exclusive); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateCumsumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CumsumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BroadcastToOptionsT : public ::flatbuffers::NativeTable { + typedef BroadcastToOptions TableType; +}; + +struct BroadcastToOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BroadcastToOptionsT NativeTableType; + typedef BroadcastToOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BroadcastToOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BroadcastToOptionsT *_o, const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BroadcastToOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BroadcastToOptionsBuilder { + typedef BroadcastToOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit BroadcastToOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBroadcastToOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + BroadcastToOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBroadcastToOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BroadcastToOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Rfft2dOptionsT : public ::flatbuffers::NativeTable { + typedef Rfft2dOptions TableType; +}; + +struct Rfft2dOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef Rfft2dOptionsT NativeTableType; + typedef Rfft2dOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + Rfft2dOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Rfft2dOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Rfft2dOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Rfft2dOptionsBuilder { + typedef Rfft2dOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit Rfft2dOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRfft2dOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + Rfft2dOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRfft2dOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Rfft2dOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct HashtableOptionsT : public ::flatbuffers::NativeTable { + typedef HashtableOptions TableType; + int32_t table_id = 0; + tflite::TensorType key_dtype = tflite::TensorType_FLOAT32; + tflite::TensorType value_dtype = tflite::TensorType_FLOAT32; +}; + +struct HashtableOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef HashtableOptionsT NativeTableType; + typedef HashtableOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TABLE_ID = 4, + VT_KEY_DTYPE = 6, + VT_VALUE_DTYPE = 8 + }; + int32_t table_id() const { + return GetField(VT_TABLE_ID, 0); + } + tflite::TensorType key_dtype() const { + return static_cast(GetField(VT_KEY_DTYPE, 0)); + } + tflite::TensorType value_dtype() const { + return static_cast(GetField(VT_VALUE_DTYPE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TABLE_ID, 4) && + VerifyField(verifier, VT_KEY_DTYPE, 1) && + VerifyField(verifier, 
VT_VALUE_DTYPE, 1) && + verifier.EndTable(); + } + HashtableOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(HashtableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct HashtableOptionsBuilder { + typedef HashtableOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_table_id(int32_t table_id) { + fbb_.AddElement(HashtableOptions::VT_TABLE_ID, table_id, 0); + } + void add_key_dtype(tflite::TensorType key_dtype) { + fbb_.AddElement(HashtableOptions::VT_KEY_DTYPE, static_cast(key_dtype), 0); + } + void add_value_dtype(tflite::TensorType value_dtype) { + fbb_.AddElement(HashtableOptions::VT_VALUE_DTYPE, static_cast(value_dtype), 0); + } + explicit HashtableOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateHashtableOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t table_id = 0, + tflite::TensorType key_dtype = tflite::TensorType_FLOAT32, + tflite::TensorType value_dtype = tflite::TensorType_FLOAT32) { + HashtableOptionsBuilder builder_(_fbb); + builder_.add_table_id(table_id); + builder_.add_value_dtype(value_dtype); + builder_.add_key_dtype(key_dtype); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateHashtableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct HashtableFindOptionsT : public ::flatbuffers::NativeTable { + typedef HashtableFindOptions TableType; +}; + +struct HashtableFindOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef HashtableFindOptionsT NativeTableType; + typedef HashtableFindOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + HashtableFindOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(HashtableFindOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableFindOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct HashtableFindOptionsBuilder { + typedef HashtableFindOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit HashtableFindOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateHashtableFindOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + HashtableFindOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateHashtableFindOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableFindOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct HashtableImportOptionsT : public ::flatbuffers::NativeTable { + typedef HashtableImportOptions 
TableType; +}; + +struct HashtableImportOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef HashtableImportOptionsT NativeTableType; + typedef HashtableImportOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + HashtableImportOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(HashtableImportOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableImportOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct HashtableImportOptionsBuilder { + typedef HashtableImportOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit HashtableImportOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateHashtableImportOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + HashtableImportOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateHashtableImportOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableImportOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct HashtableSizeOptionsT : public ::flatbuffers::NativeTable { + typedef HashtableSizeOptions TableType; +}; + +struct HashtableSizeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef HashtableSizeOptionsT NativeTableType; + typedef HashtableSizeOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + HashtableSizeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(HashtableSizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableSizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct HashtableSizeOptionsBuilder { + typedef HashtableSizeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit HashtableSizeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateHashtableSizeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + HashtableSizeOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateHashtableSizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableSizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct VarHandleOptionsT : public ::flatbuffers::NativeTable { + typedef VarHandleOptions TableType; + std::string container{}; + std::string shared_name{}; +}; + +struct VarHandleOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef VarHandleOptionsT NativeTableType; + typedef VarHandleOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + 
VT_CONTAINER = 4, + VT_SHARED_NAME = 6 + }; + const ::flatbuffers::String *container() const { + return GetPointer(VT_CONTAINER); + } + const ::flatbuffers::String *shared_name() const { + return GetPointer(VT_SHARED_NAME); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_CONTAINER) && + verifier.VerifyString(container()) && + VerifyOffset(verifier, VT_SHARED_NAME) && + verifier.VerifyString(shared_name()) && + verifier.EndTable(); + } + VarHandleOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(VarHandleOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const VarHandleOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct VarHandleOptionsBuilder { + typedef VarHandleOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_container(::flatbuffers::Offset<::flatbuffers::String> container) { + fbb_.AddOffset(VarHandleOptions::VT_CONTAINER, container); + } + void add_shared_name(::flatbuffers::Offset<::flatbuffers::String> shared_name) { + fbb_.AddOffset(VarHandleOptions::VT_SHARED_NAME, shared_name); + } + explicit VarHandleOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateVarHandleOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> container = 0, + ::flatbuffers::Offset<::flatbuffers::String> shared_name = 0) { + VarHandleOptionsBuilder builder_(_fbb); + builder_.add_shared_name(shared_name); + builder_.add_container(container); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateVarHandleOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *container = nullptr, + const char *shared_name = nullptr) { + auto container__ = container ? _fbb.CreateString(container) : 0; + auto shared_name__ = shared_name ? 
_fbb.CreateString(shared_name) : 0; + return tflite::CreateVarHandleOptions( + _fbb, + container__, + shared_name__); +} + +::flatbuffers::Offset CreateVarHandleOptions(::flatbuffers::FlatBufferBuilder &_fbb, const VarHandleOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReadVariableOptionsT : public ::flatbuffers::NativeTable { + typedef ReadVariableOptions TableType; +}; + +struct ReadVariableOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReadVariableOptionsT NativeTableType; + typedef ReadVariableOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ReadVariableOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ReadVariableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReadVariableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReadVariableOptionsBuilder { + typedef ReadVariableOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ReadVariableOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReadVariableOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ReadVariableOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateReadVariableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReadVariableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct AssignVariableOptionsT : public ::flatbuffers::NativeTable { + typedef AssignVariableOptions TableType; +}; + +struct AssignVariableOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef AssignVariableOptionsT NativeTableType; + typedef AssignVariableOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + AssignVariableOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(AssignVariableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AssignVariableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct AssignVariableOptionsBuilder { + typedef AssignVariableOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit AssignVariableOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateAssignVariableOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + AssignVariableOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateAssignVariableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AssignVariableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RandomOptionsT : public 
::flatbuffers::NativeTable { + typedef RandomOptions TableType; + int64_t seed = 0; + int64_t seed2 = 0; +}; + +struct RandomOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RandomOptionsT NativeTableType; + typedef RandomOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SEED = 4, + VT_SEED2 = 6 + }; + int64_t seed() const { + return GetField(VT_SEED, 0); + } + int64_t seed2() const { + return GetField(VT_SEED2, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_SEED, 8) && + VerifyField(verifier, VT_SEED2, 8) && + verifier.EndTable(); + } + RandomOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RandomOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RandomOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RandomOptionsBuilder { + typedef RandomOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_seed(int64_t seed) { + fbb_.AddElement(RandomOptions::VT_SEED, seed, 0); + } + void add_seed2(int64_t seed2) { + fbb_.AddElement(RandomOptions::VT_SEED2, seed2, 0); + } + explicit RandomOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRandomOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + int64_t seed = 0, + int64_t seed2 = 0) { + RandomOptionsBuilder builder_(_fbb); + builder_.add_seed2(seed2); + builder_.add_seed(seed); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRandomOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RandomOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BucketizeOptionsT : public ::flatbuffers::NativeTable { + typedef BucketizeOptions TableType; + std::vector boundaries{}; +}; + +struct BucketizeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BucketizeOptionsT NativeTableType; + typedef BucketizeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BOUNDARIES = 4 + }; + const ::flatbuffers::Vector *boundaries() const { + return GetPointer *>(VT_BOUNDARIES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_BOUNDARIES) && + verifier.VerifyVector(boundaries()) && + verifier.EndTable(); + } + BucketizeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BucketizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BucketizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BucketizeOptionsBuilder { + typedef BucketizeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_boundaries(::flatbuffers::Offset<::flatbuffers::Vector> boundaries) { + fbb_.AddOffset(BucketizeOptions::VT_BOUNDARIES, boundaries); + } + explicit BucketizeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : 
fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBucketizeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> boundaries = 0) { + BucketizeOptionsBuilder builder_(_fbb); + builder_.add_boundaries(boundaries); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateBucketizeOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *boundaries = nullptr) { + auto boundaries__ = boundaries ? _fbb.CreateVector(*boundaries) : 0; + return tflite::CreateBucketizeOptions( + _fbb, + boundaries__); +} + +::flatbuffers::Offset CreateBucketizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BucketizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct GeluOptionsT : public ::flatbuffers::NativeTable { + typedef GeluOptions TableType; + bool approximate = false; +}; + +struct GeluOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GeluOptionsT NativeTableType; + typedef GeluOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_APPROXIMATE = 4 + }; + bool approximate() const { + return GetField(VT_APPROXIMATE, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_APPROXIMATE, 1) && + verifier.EndTable(); + } + GeluOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GeluOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GeluOptionsBuilder { + typedef GeluOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_approximate(bool approximate) { + fbb_.AddElement(GeluOptions::VT_APPROXIMATE, static_cast(approximate), 0); + } + explicit GeluOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGeluOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool approximate = false) { + GeluOptionsBuilder builder_(_fbb); + builder_.add_approximate(approximate); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateGeluOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DynamicUpdateSliceOptionsT : public ::flatbuffers::NativeTable { + typedef DynamicUpdateSliceOptions TableType; +}; + +struct DynamicUpdateSliceOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DynamicUpdateSliceOptionsT NativeTableType; + typedef DynamicUpdateSliceOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DynamicUpdateSliceOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DynamicUpdateSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset 
Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DynamicUpdateSliceOptionsBuilder { + typedef DynamicUpdateSliceOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit DynamicUpdateSliceOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDynamicUpdateSliceOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + DynamicUpdateSliceOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDynamicUpdateSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnsortedSegmentProdOptionsT : public ::flatbuffers::NativeTable { + typedef UnsortedSegmentProdOptions TableType; +}; + +struct UnsortedSegmentProdOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnsortedSegmentProdOptionsT NativeTableType; + typedef UnsortedSegmentProdOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + UnsortedSegmentProdOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnsortedSegmentProdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnsortedSegmentProdOptionsBuilder { + typedef UnsortedSegmentProdOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit UnsortedSegmentProdOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnsortedSegmentProdOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + UnsortedSegmentProdOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnsortedSegmentProdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnsortedSegmentMaxOptionsT : public ::flatbuffers::NativeTable { + typedef UnsortedSegmentMaxOptions TableType; +}; + +struct UnsortedSegmentMaxOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnsortedSegmentMaxOptionsT NativeTableType; + typedef UnsortedSegmentMaxOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + UnsortedSegmentMaxOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnsortedSegmentMaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct 
UnsortedSegmentMaxOptionsBuilder { + typedef UnsortedSegmentMaxOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit UnsortedSegmentMaxOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnsortedSegmentMaxOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + UnsortedSegmentMaxOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnsortedSegmentMaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnsortedSegmentSumOptionsT : public ::flatbuffers::NativeTable { + typedef UnsortedSegmentSumOptions TableType; +}; + +struct UnsortedSegmentSumOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnsortedSegmentSumOptionsT NativeTableType; + typedef UnsortedSegmentSumOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + UnsortedSegmentSumOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnsortedSegmentSumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentSumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnsortedSegmentSumOptionsBuilder { + typedef UnsortedSegmentSumOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit UnsortedSegmentSumOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnsortedSegmentSumOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + UnsortedSegmentSumOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnsortedSegmentSumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentSumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ATan2OptionsT : public ::flatbuffers::NativeTable { + typedef ATan2Options TableType; +}; + +struct ATan2Options FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ATan2OptionsT NativeTableType; + typedef ATan2OptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + ATan2OptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ATan2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ATan2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ATan2OptionsBuilder { + typedef ATan2Options Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit ATan2OptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const 
auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateATan2Options( + ::flatbuffers::FlatBufferBuilder &_fbb) { + ATan2OptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateATan2Options(::flatbuffers::FlatBufferBuilder &_fbb, const ATan2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnsortedSegmentMinOptionsT : public ::flatbuffers::NativeTable { + typedef UnsortedSegmentMinOptions TableType; +}; + +struct UnsortedSegmentMinOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef UnsortedSegmentMinOptionsT NativeTableType; + typedef UnsortedSegmentMinOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + UnsortedSegmentMinOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnsortedSegmentMinOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMinOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnsortedSegmentMinOptionsBuilder { + typedef UnsortedSegmentMinOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit UnsortedSegmentMinOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateUnsortedSegmentMinOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + UnsortedSegmentMinOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateUnsortedSegmentMinOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMinOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SignOptionsT : public ::flatbuffers::NativeTable { + typedef SignOptions TableType; +}; + +struct SignOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SignOptionsT NativeTableType; + typedef SignOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + SignOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SignOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SignOptionsBuilder { + typedef SignOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit SignOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSignOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + SignOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateSignOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT *_o, const 
::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BitcastOptionsT : public ::flatbuffers::NativeTable { + typedef BitcastOptions TableType; +}; + +struct BitcastOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BitcastOptionsT NativeTableType; + typedef BitcastOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitcastOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitcastOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitcastOptionsBuilder { + typedef BitcastOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit BitcastOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBitcastOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + BitcastOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBitcastOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BitwiseXorOptionsT : public ::flatbuffers::NativeTable { + typedef BitwiseXorOptions TableType; +}; + +struct BitwiseXorOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BitwiseXorOptionsT NativeTableType; + typedef BitwiseXorOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + BitwiseXorOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BitwiseXorOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BitwiseXorOptionsBuilder { + typedef BitwiseXorOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit BitwiseXorOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBitwiseXorOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + BitwiseXorOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateBitwiseXorOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct RightShiftOptionsT : public ::flatbuffers::NativeTable { + typedef RightShiftOptions TableType; +}; + +struct RightShiftOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RightShiftOptionsT NativeTableType; + typedef RightShiftOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + 
RightShiftOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(RightShiftOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct RightShiftOptionsBuilder { + typedef RightShiftOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit RightShiftOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRightShiftOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + RightShiftOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateRightShiftOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct DilateOptionsT : public ::flatbuffers::NativeTable { + typedef DilateOptions TableType; +}; + +struct DilateOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DilateOptionsT NativeTableType; + typedef DilateOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DilateOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DilateOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DilateOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DilateOptionsBuilder { + typedef DilateOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit DilateOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDilateOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + DilateOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateDilateOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DilateOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ReduceWindowOptionsT : public ::flatbuffers::NativeTable { + typedef ReduceWindowOptions TableType; + tflite::ReduceWindowFunction reduce_function = tflite::ReduceWindowFunction_UNSUPPORTED; +}; + +struct ReduceWindowOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ReduceWindowOptionsT NativeTableType; + typedef ReduceWindowOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_REDUCE_FUNCTION = 4 + }; + tflite::ReduceWindowFunction reduce_function() const { + return static_cast(GetField(VT_REDUCE_FUNCTION, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_REDUCE_FUNCTION, 4) && + verifier.EndTable(); + } + ReduceWindowOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void 
UnPackTo(ReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ReduceWindowOptionsBuilder { + typedef ReduceWindowOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_reduce_function(tflite::ReduceWindowFunction reduce_function) { + fbb_.AddElement(ReduceWindowOptions::VT_REDUCE_FUNCTION, static_cast(reduce_function), 0); + } + explicit ReduceWindowOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateReduceWindowOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + tflite::ReduceWindowFunction reduce_function = tflite::ReduceWindowFunction_UNSUPPORTED) { + ReduceWindowOptionsBuilder builder_(_fbb); + builder_.add_reduce_function(reduce_function); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct OperatorCodeT : public ::flatbuffers::NativeTable { + typedef OperatorCode TableType; + int8_t deprecated_builtin_code = 0; + std::string custom_code{}; + int32_t version = 1; + tflite::BuiltinOperator builtin_code = tflite::BuiltinOperator_ADD; +}; + +struct OperatorCode FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef OperatorCodeT NativeTableType; + typedef OperatorCodeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DEPRECATED_BUILTIN_CODE = 4, + VT_CUSTOM_CODE = 6, + VT_VERSION = 8, + VT_BUILTIN_CODE = 10 + }; + int8_t deprecated_builtin_code() const { + return GetField(VT_DEPRECATED_BUILTIN_CODE, 0); + } + const ::flatbuffers::String *custom_code() const { + return GetPointer(VT_CUSTOM_CODE); + } + int32_t version() const { + return GetField(VT_VERSION, 1); + } + tflite::BuiltinOperator builtin_code() const { + return static_cast(GetField(VT_BUILTIN_CODE, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DEPRECATED_BUILTIN_CODE, 1) && + VerifyOffset(verifier, VT_CUSTOM_CODE) && + verifier.VerifyString(custom_code()) && + VerifyField(verifier, VT_VERSION, 4) && + VerifyField(verifier, VT_BUILTIN_CODE, 4) && + verifier.EndTable(); + } + OperatorCodeT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(OperatorCodeT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct OperatorCodeBuilder { + typedef OperatorCode Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_deprecated_builtin_code(int8_t deprecated_builtin_code) { + fbb_.AddElement(OperatorCode::VT_DEPRECATED_BUILTIN_CODE, deprecated_builtin_code, 0); + } + void add_custom_code(::flatbuffers::Offset<::flatbuffers::String> custom_code) { + fbb_.AddOffset(OperatorCode::VT_CUSTOM_CODE, custom_code); + } + void add_version(int32_t version) { + 
fbb_.AddElement(OperatorCode::VT_VERSION, version, 1); + } + void add_builtin_code(tflite::BuiltinOperator builtin_code) { + fbb_.AddElement(OperatorCode::VT_BUILTIN_CODE, static_cast(builtin_code), 0); + } + explicit OperatorCodeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateOperatorCode( + ::flatbuffers::FlatBufferBuilder &_fbb, + int8_t deprecated_builtin_code = 0, + ::flatbuffers::Offset<::flatbuffers::String> custom_code = 0, + int32_t version = 1, + tflite::BuiltinOperator builtin_code = tflite::BuiltinOperator_ADD) { + OperatorCodeBuilder builder_(_fbb); + builder_.add_builtin_code(builtin_code); + builder_.add_version(version); + builder_.add_custom_code(custom_code); + builder_.add_deprecated_builtin_code(deprecated_builtin_code); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateOperatorCodeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + int8_t deprecated_builtin_code = 0, + const char *custom_code = nullptr, + int32_t version = 1, + tflite::BuiltinOperator builtin_code = tflite::BuiltinOperator_ADD) { + auto custom_code__ = custom_code ? _fbb.CreateString(custom_code) : 0; + return tflite::CreateOperatorCode( + _fbb, + deprecated_builtin_code, + custom_code__, + version, + builtin_code); +} + +::flatbuffers::Offset CreateOperatorCode(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StableHLOCompositeOptionsT : public ::flatbuffers::NativeTable { + typedef StableHLOCompositeOptions TableType; + std::string name{}; + int32_t decomposition_subgraph_index = 0; + std::vector composite_attributes{}; + tflite::CustomOptionsFormat composite_attributes_format = tflite::CustomOptionsFormat_FLEXBUFFERS; + int32_t version = 0; +}; + +struct StableHLOCompositeOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StableHLOCompositeOptionsT NativeTableType; + typedef StableHLOCompositeOptionsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_DECOMPOSITION_SUBGRAPH_INDEX = 6, + VT_COMPOSITE_ATTRIBUTES = 8, + VT_COMPOSITE_ATTRIBUTES_FORMAT = 10, + VT_VERSION = 12 + }; + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + int32_t decomposition_subgraph_index() const { + return GetField(VT_DECOMPOSITION_SUBGRAPH_INDEX, 0); + } + const ::flatbuffers::Vector *composite_attributes() const { + return GetPointer *>(VT_COMPOSITE_ATTRIBUTES); + } + tflite::CustomOptionsFormat composite_attributes_format() const { + return static_cast(GetField(VT_COMPOSITE_ATTRIBUTES_FORMAT, 0)); + } + int32_t version() const { + return GetField(VT_VERSION, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField(verifier, VT_DECOMPOSITION_SUBGRAPH_INDEX, 4) && + VerifyOffset(verifier, VT_COMPOSITE_ATTRIBUTES) && + verifier.VerifyVector(composite_attributes()) && + VerifyField(verifier, VT_COMPOSITE_ATTRIBUTES_FORMAT, 1) && + VerifyField(verifier, VT_VERSION, 4) && + verifier.EndTable(); + } + StableHLOCompositeOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StableHLOCompositeOptionsT *_o, const 
::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StableHLOCompositeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StableHLOCompositeOptionsBuilder { + typedef StableHLOCompositeOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(StableHLOCompositeOptions::VT_NAME, name); + } + void add_decomposition_subgraph_index(int32_t decomposition_subgraph_index) { + fbb_.AddElement(StableHLOCompositeOptions::VT_DECOMPOSITION_SUBGRAPH_INDEX, decomposition_subgraph_index, 0); + } + void add_composite_attributes(::flatbuffers::Offset<::flatbuffers::Vector> composite_attributes) { + fbb_.AddOffset(StableHLOCompositeOptions::VT_COMPOSITE_ATTRIBUTES, composite_attributes); + } + void add_composite_attributes_format(tflite::CustomOptionsFormat composite_attributes_format) { + fbb_.AddElement(StableHLOCompositeOptions::VT_COMPOSITE_ATTRIBUTES_FORMAT, static_cast(composite_attributes_format), 0); + } + void add_version(int32_t version) { + fbb_.AddElement(StableHLOCompositeOptions::VT_VERSION, version, 0); + } + explicit StableHLOCompositeOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStableHLOCompositeOptions( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + int32_t decomposition_subgraph_index = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> composite_attributes = 0, + tflite::CustomOptionsFormat composite_attributes_format = tflite::CustomOptionsFormat_FLEXBUFFERS, + int32_t version = 0) { + StableHLOCompositeOptionsBuilder builder_(_fbb); + builder_.add_version(version); + builder_.add_composite_attributes(composite_attributes); + builder_.add_decomposition_subgraph_index(decomposition_subgraph_index); + builder_.add_name(name); + builder_.add_composite_attributes_format(composite_attributes_format); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStableHLOCompositeOptionsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + int32_t decomposition_subgraph_index = 0, + const std::vector *composite_attributes = nullptr, + tflite::CustomOptionsFormat composite_attributes_format = tflite::CustomOptionsFormat_FLEXBUFFERS, + int32_t version = 0) { + auto name__ = name ? _fbb.CreateString(name) : 0; + auto composite_attributes__ = composite_attributes ? 
_fbb.CreateVector(*composite_attributes) : 0; + return tflite::CreateStableHLOCompositeOptions( + _fbb, + name__, + decomposition_subgraph_index, + composite_attributes__, + composite_attributes_format, + version); +} + +::flatbuffers::Offset CreateStableHLOCompositeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StableHLOCompositeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct StablehloShiftLeftOptionsT : public ::flatbuffers::NativeTable { + typedef StablehloShiftLeftOptions TableType; +}; + +struct StablehloShiftLeftOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StablehloShiftLeftOptionsT NativeTableType; + typedef StablehloShiftLeftOptionsBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + StablehloShiftLeftOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(StablehloShiftLeftOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloShiftLeftOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct StablehloShiftLeftOptionsBuilder { + typedef StablehloShiftLeftOptions Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit StablehloShiftLeftOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStablehloShiftLeftOptions( + ::flatbuffers::FlatBufferBuilder &_fbb) { + StablehloShiftLeftOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +::flatbuffers::Offset CreateStablehloShiftLeftOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloShiftLeftOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct OperatorT : public ::flatbuffers::NativeTable { + typedef Operator TableType; + uint32_t opcode_index = 0; + std::vector inputs{}; + std::vector outputs{}; + tflite::BuiltinOptionsUnion builtin_options{}; + std::vector custom_options{}; + tflite::CustomOptionsFormat custom_options_format = tflite::CustomOptionsFormat_FLEXBUFFERS; + std::vector mutating_variable_inputs{}; + std::vector intermediates{}; + uint64_t large_custom_options_offset = 0; + uint64_t large_custom_options_size = 0; + tflite::BuiltinOptions2Union builtin_options_2{}; + int32_t debug_metadata_index = -1; +}; + +struct Operator FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef OperatorT NativeTableType; + typedef OperatorBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_OPCODE_INDEX = 4, + VT_INPUTS = 6, + VT_OUTPUTS = 8, + VT_BUILTIN_OPTIONS_TYPE = 10, + VT_BUILTIN_OPTIONS = 12, + VT_CUSTOM_OPTIONS = 14, + VT_CUSTOM_OPTIONS_FORMAT = 16, + VT_MUTATING_VARIABLE_INPUTS = 18, + VT_INTERMEDIATES = 20, + VT_LARGE_CUSTOM_OPTIONS_OFFSET = 22, + VT_LARGE_CUSTOM_OPTIONS_SIZE = 24, + VT_BUILTIN_OPTIONS_2_TYPE = 26, + VT_BUILTIN_OPTIONS_2 = 28, + VT_DEBUG_METADATA_INDEX = 30 + }; + uint32_t opcode_index() const { + return GetField(VT_OPCODE_INDEX, 0); + } + const ::flatbuffers::Vector *inputs() const { + return GetPointer *>(VT_INPUTS); + } + const ::flatbuffers::Vector *outputs() const { + return GetPointer 
*>(VT_OUTPUTS); + } + tflite::BuiltinOptions builtin_options_type() const { + return static_cast(GetField(VT_BUILTIN_OPTIONS_TYPE, 0)); + } + const void *builtin_options() const { + return GetPointer(VT_BUILTIN_OPTIONS); + } + template const T *builtin_options_as() const; + const tflite::Conv2DOptions *builtin_options_as_Conv2DOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_Conv2DOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DepthwiseConv2DOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ConcatEmbeddingsOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LSHProjectionOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::Pool2DOptions *builtin_options_as_Pool2DOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_Pool2DOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SVDFOptions *builtin_options_as_SVDFOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SVDFOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RNNOptions *builtin_options_as_RNNOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RNNOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::FullyConnectedOptions *builtin_options_as_FullyConnectedOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_FullyConnectedOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SoftmaxOptions *builtin_options_as_SoftmaxOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SoftmaxOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ConcatenationOptions *builtin_options_as_ConcatenationOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ConcatenationOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::AddOptions *builtin_options_as_AddOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_AddOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::L2NormOptions *builtin_options_as_L2NormOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_L2NormOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LocalResponseNormalizationOptions *builtin_options_as_LocalResponseNormalizationOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LocalResponseNormalizationOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LSTMOptions *builtin_options_as_LSTMOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LSTMOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ResizeBilinearOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::CallOptions *builtin_options_as_CallOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_CallOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::ReshapeOptions *builtin_options_as_ReshapeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ReshapeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SkipGramOptions *builtin_options_as_SkipGramOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SkipGramOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SpaceToDepthOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::EmbeddingLookupSparseOptions *builtin_options_as_EmbeddingLookupSparseOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_EmbeddingLookupSparseOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::MulOptions *builtin_options_as_MulOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_MulOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::PadOptions *builtin_options_as_PadOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_PadOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::GatherOptions *builtin_options_as_GatherOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_GatherOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BatchToSpaceNDOptions *builtin_options_as_BatchToSpaceNDOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BatchToSpaceNDOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SpaceToBatchNDOptions *builtin_options_as_SpaceToBatchNDOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SpaceToBatchNDOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::TransposeOptions *builtin_options_as_TransposeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_TransposeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ReducerOptions *builtin_options_as_ReducerOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ReducerOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SubOptions *builtin_options_as_SubOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SubOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::DivOptions *builtin_options_as_DivOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DivOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SqueezeOptions *builtin_options_as_SqueezeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SqueezeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SequenceRNNOptions *builtin_options_as_SequenceRNNOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SequenceRNNOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::StridedSliceOptions *builtin_options_as_StridedSliceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_StridedSliceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ExpOptions *builtin_options_as_ExpOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ExpOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::TopKV2Options *builtin_options_as_TopKV2Options() const { + return builtin_options_type() == tflite::BuiltinOptions_TopKV2Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::SplitOptions *builtin_options_as_SplitOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SplitOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LogSoftmaxOptions *builtin_options_as_LogSoftmaxOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LogSoftmaxOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::CastOptions *builtin_options_as_CastOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_CastOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::DequantizeOptions *builtin_options_as_DequantizeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DequantizeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_MaximumMinimumOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ArgMaxOptions *builtin_options_as_ArgMaxOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ArgMaxOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LessOptions *builtin_options_as_LessOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LessOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::NegOptions *builtin_options_as_NegOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_NegOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::PadV2Options *builtin_options_as_PadV2Options() const { + return builtin_options_type() == tflite::BuiltinOptions_PadV2Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::GreaterOptions *builtin_options_as_GreaterOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_GreaterOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::GreaterEqualOptions *builtin_options_as_GreaterEqualOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_GreaterEqualOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LessEqualOptions *builtin_options_as_LessEqualOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LessEqualOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SelectOptions *builtin_options_as_SelectOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SelectOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SliceOptions *builtin_options_as_SliceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SliceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::TransposeConvOptions *builtin_options_as_TransposeConvOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_TransposeConvOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SparseToDenseOptions *builtin_options_as_SparseToDenseOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SparseToDenseOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::TileOptions *builtin_options_as_TileOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_TileOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ExpandDimsOptions *builtin_options_as_ExpandDimsOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ExpandDimsOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::EqualOptions *builtin_options_as_EqualOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_EqualOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::NotEqualOptions *builtin_options_as_NotEqualOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_NotEqualOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ShapeOptions *builtin_options_as_ShapeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ShapeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::PowOptions *builtin_options_as_PowOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_PowOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ArgMinOptions *builtin_options_as_ArgMinOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ArgMinOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::FakeQuantOptions *builtin_options_as_FakeQuantOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_FakeQuantOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::PackOptions *builtin_options_as_PackOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_PackOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LogicalOrOptions *builtin_options_as_LogicalOrOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LogicalOrOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::OneHotOptions *builtin_options_as_OneHotOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_OneHotOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LogicalAndOptions *builtin_options_as_LogicalAndOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LogicalAndOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LogicalNotOptions *builtin_options_as_LogicalNotOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LogicalNotOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnpackOptions *builtin_options_as_UnpackOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnpackOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::FloorDivOptions *builtin_options_as_FloorDivOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_FloorDivOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SquareOptions *builtin_options_as_SquareOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SquareOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ZerosLikeOptions *builtin_options_as_ZerosLikeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ZerosLikeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::FillOptions *builtin_options_as_FillOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_FillOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::BidirectionalSequenceLSTMOptions *builtin_options_as_BidirectionalSequenceLSTMOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BidirectionalSequenceLSTMOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BidirectionalSequenceRNNOptions *builtin_options_as_BidirectionalSequenceRNNOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BidirectionalSequenceRNNOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnidirectionalSequenceLSTMOptions *builtin_options_as_UnidirectionalSequenceLSTMOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnidirectionalSequenceLSTMOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::FloorModOptions *builtin_options_as_FloorModOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_FloorModOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RangeOptions *builtin_options_as_RangeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RangeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ResizeNearestNeighborOptions *builtin_options_as_ResizeNearestNeighborOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ResizeNearestNeighborOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::LeakyReluOptions *builtin_options_as_LeakyReluOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_LeakyReluOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SquaredDifferenceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::MirrorPadOptions *builtin_options_as_MirrorPadOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_MirrorPadOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::AbsOptions *builtin_options_as_AbsOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_AbsOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SplitVOptions *builtin_options_as_SplitVOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SplitVOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UniqueOptions *builtin_options_as_UniqueOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UniqueOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ReverseV2Options *builtin_options_as_ReverseV2Options() const { + return builtin_options_type() == tflite::BuiltinOptions_ReverseV2Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::AddNOptions *builtin_options_as_AddNOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_AddNOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::GatherNdOptions *builtin_options_as_GatherNdOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_GatherNdOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::CosOptions *builtin_options_as_CosOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_CosOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::WhereOptions *builtin_options_as_WhereOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_WhereOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RankOptions *builtin_options_as_RankOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RankOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ReverseSequenceOptions *builtin_options_as_ReverseSequenceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ReverseSequenceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::MatrixDiagOptions *builtin_options_as_MatrixDiagOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_MatrixDiagOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::QuantizeOptions *builtin_options_as_QuantizeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_QuantizeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::MatrixSetDiagOptions *builtin_options_as_MatrixSetDiagOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_MatrixSetDiagOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::HardSwishOptions *builtin_options_as_HardSwishOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_HardSwishOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::IfOptions *builtin_options_as_IfOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_IfOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::WhileOptions *builtin_options_as_WhileOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_WhileOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::DepthToSpaceOptions *builtin_options_as_DepthToSpaceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DepthToSpaceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::NonMaxSuppressionV4Options *builtin_options_as_NonMaxSuppressionV4Options() const { + return builtin_options_type() == tflite::BuiltinOptions_NonMaxSuppressionV4Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::NonMaxSuppressionV5Options *builtin_options_as_NonMaxSuppressionV5Options() const { + return builtin_options_type() == tflite::BuiltinOptions_NonMaxSuppressionV5Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::ScatterNdOptions *builtin_options_as_ScatterNdOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ScatterNdOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SelectV2Options *builtin_options_as_SelectV2Options() const { + return builtin_options_type() == tflite::BuiltinOptions_SelectV2Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::DensifyOptions *builtin_options_as_DensifyOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DensifyOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::SegmentSumOptions *builtin_options_as_SegmentSumOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SegmentSumOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BatchMatMulOptions *builtin_options_as_BatchMatMulOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BatchMatMulOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::CumsumOptions *builtin_options_as_CumsumOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_CumsumOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::CallOnceOptions *builtin_options_as_CallOnceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_CallOnceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BroadcastToOptions *builtin_options_as_BroadcastToOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BroadcastToOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::Rfft2dOptions *builtin_options_as_Rfft2dOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_Rfft2dOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::Conv3DOptions *builtin_options_as_Conv3DOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_Conv3DOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::HashtableOptions *builtin_options_as_HashtableOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_HashtableOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::HashtableFindOptions *builtin_options_as_HashtableFindOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_HashtableFindOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::HashtableImportOptions *builtin_options_as_HashtableImportOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_HashtableImportOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::HashtableSizeOptions *builtin_options_as_HashtableSizeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_HashtableSizeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::VarHandleOptions *builtin_options_as_VarHandleOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_VarHandleOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ReadVariableOptions *builtin_options_as_ReadVariableOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_ReadVariableOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::AssignVariableOptions *builtin_options_as_AssignVariableOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_AssignVariableOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RandomOptions *builtin_options_as_RandomOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RandomOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BucketizeOptions *builtin_options_as_BucketizeOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BucketizeOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::GeluOptions *builtin_options_as_GeluOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_GeluOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::DynamicUpdateSliceOptions *builtin_options_as_DynamicUpdateSliceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DynamicUpdateSliceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnsortedSegmentProdOptions *builtin_options_as_UnsortedSegmentProdOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnsortedSegmentProdOptions ? 
static_cast(builtin_options()) : nullptr; + } + const tflite::UnsortedSegmentMaxOptions *builtin_options_as_UnsortedSegmentMaxOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnsortedSegmentMaxOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnsortedSegmentMinOptions *builtin_options_as_UnsortedSegmentMinOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnsortedSegmentMinOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnsortedSegmentSumOptions *builtin_options_as_UnsortedSegmentSumOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnsortedSegmentSumOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::ATan2Options *builtin_options_as_ATan2Options() const { + return builtin_options_type() == tflite::BuiltinOptions_ATan2Options ? static_cast(builtin_options()) : nullptr; + } + const tflite::SignOptions *builtin_options_as_SignOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_SignOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BitcastOptions *builtin_options_as_BitcastOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitcastOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::BitwiseXorOptions *builtin_options_as_BitwiseXorOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_BitwiseXorOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::RightShiftOptions *builtin_options_as_RightShiftOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_RightShiftOptions ? static_cast(builtin_options()) : nullptr; + } + const ::flatbuffers::Vector *custom_options() const { + return GetPointer *>(VT_CUSTOM_OPTIONS); + } + tflite::CustomOptionsFormat custom_options_format() const { + return static_cast(GetField(VT_CUSTOM_OPTIONS_FORMAT, 0)); + } + const ::flatbuffers::Vector *mutating_variable_inputs() const { + return GetPointer *>(VT_MUTATING_VARIABLE_INPUTS); + } + const ::flatbuffers::Vector *intermediates() const { + return GetPointer *>(VT_INTERMEDIATES); + } + uint64_t large_custom_options_offset() const { + return GetField(VT_LARGE_CUSTOM_OPTIONS_OFFSET, 0); + } + uint64_t large_custom_options_size() const { + return GetField(VT_LARGE_CUSTOM_OPTIONS_SIZE, 0); + } + tflite::BuiltinOptions2 builtin_options_2_type() const { + return static_cast(GetField(VT_BUILTIN_OPTIONS_2_TYPE, 0)); + } + const void *builtin_options_2() const { + return GetPointer(VT_BUILTIN_OPTIONS_2); + } + template const T *builtin_options_2_as() const; + const tflite::StablehloConcatenateOptions *builtin_options_2_as_StablehloConcatenateOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloConcatenateOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloBroadcastInDimOptions *builtin_options_2_as_StablehloBroadcastInDimOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloBroadcastInDimOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloSliceOptions *builtin_options_2_as_StablehloSliceOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloSliceOptions ? 
static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloConvolutionOptions *builtin_options_2_as_StablehloConvolutionOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloConvolutionOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloCustomCallOptions *builtin_options_2_as_StablehloCustomCallOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloCustomCallOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloReduceOptions *builtin_options_2_as_StablehloReduceOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloReduceOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloScatterOptions *builtin_options_2_as_StablehloScatterOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloScatterOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloCompareOptions *builtin_options_2_as_StablehloCompareOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloCompareOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloDynamicSliceOptions *builtin_options_2_as_StablehloDynamicSliceOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloDynamicSliceOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloPadOptions *builtin_options_2_as_StablehloPadOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloPadOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloIotaOptions *builtin_options_2_as_StablehloIotaOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloIotaOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloDotGeneralOptions *builtin_options_2_as_StablehloDotGeneralOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloDotGeneralOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloReduceWindowOptions *builtin_options_2_as_StablehloReduceWindowOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloReduceWindowOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloSortOptions *builtin_options_2_as_StablehloSortOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloSortOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloWhileOptions *builtin_options_2_as_StablehloWhileOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloWhileOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloGatherOptions *builtin_options_2_as_StablehloGatherOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloGatherOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloTransposeOptions *builtin_options_2_as_StablehloTransposeOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloTransposeOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::DilateOptions *builtin_options_2_as_DilateOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_DilateOptions ? 
static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloRngBitGeneratorOptions *builtin_options_2_as_StablehloRngBitGeneratorOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloRngBitGeneratorOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::ReduceWindowOptions *builtin_options_2_as_ReduceWindowOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_ReduceWindowOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StableHLOCompositeOptions *builtin_options_2_as_StableHLOCompositeOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StableHLOCompositeOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloShiftLeftOptions *builtin_options_2_as_StablehloShiftLeftOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloShiftLeftOptions ? static_cast(builtin_options_2()) : nullptr; + } + const tflite::StablehloCaseOptions *builtin_options_2_as_StablehloCaseOptions() const { + return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloCaseOptions ? static_cast(builtin_options_2()) : nullptr; + } + int32_t debug_metadata_index() const { + return GetField(VT_DEBUG_METADATA_INDEX, -1); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_OPCODE_INDEX, 4) && + VerifyOffset(verifier, VT_INPUTS) && + verifier.VerifyVector(inputs()) && + VerifyOffset(verifier, VT_OUTPUTS) && + verifier.VerifyVector(outputs()) && + VerifyField(verifier, VT_BUILTIN_OPTIONS_TYPE, 1) && + VerifyOffset(verifier, VT_BUILTIN_OPTIONS) && + VerifyBuiltinOptions(verifier, builtin_options(), builtin_options_type()) && + VerifyOffset(verifier, VT_CUSTOM_OPTIONS) && + verifier.VerifyVector(custom_options()) && + VerifyField(verifier, VT_CUSTOM_OPTIONS_FORMAT, 1) && + VerifyOffset(verifier, VT_MUTATING_VARIABLE_INPUTS) && + verifier.VerifyVector(mutating_variable_inputs()) && + VerifyOffset(verifier, VT_INTERMEDIATES) && + verifier.VerifyVector(intermediates()) && + VerifyField(verifier, VT_LARGE_CUSTOM_OPTIONS_OFFSET, 8) && + VerifyField(verifier, VT_LARGE_CUSTOM_OPTIONS_SIZE, 8) && + VerifyField(verifier, VT_BUILTIN_OPTIONS_2_TYPE, 1) && + VerifyOffset(verifier, VT_BUILTIN_OPTIONS_2) && + VerifyBuiltinOptions2(verifier, builtin_options_2(), builtin_options_2_type()) && + VerifyField(verifier, VT_DEBUG_METADATA_INDEX, 4) && + verifier.EndTable(); + } + OperatorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(OperatorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +template<> inline const tflite::Conv2DOptions *Operator::builtin_options_as() const { + return builtin_options_as_Conv2DOptions(); +} + +template<> inline const tflite::DepthwiseConv2DOptions *Operator::builtin_options_as() const { + return builtin_options_as_DepthwiseConv2DOptions(); +} + +template<> inline const tflite::ConcatEmbeddingsOptions *Operator::builtin_options_as() const { + return builtin_options_as_ConcatEmbeddingsOptions(); +} + +template<> inline const tflite::LSHProjectionOptions *Operator::builtin_options_as() const { + return builtin_options_as_LSHProjectionOptions(); +} + +template<> inline const tflite::Pool2DOptions 
*Operator::builtin_options_as() const { + return builtin_options_as_Pool2DOptions(); +} + +template<> inline const tflite::SVDFOptions *Operator::builtin_options_as() const { + return builtin_options_as_SVDFOptions(); +} + +template<> inline const tflite::RNNOptions *Operator::builtin_options_as() const { + return builtin_options_as_RNNOptions(); +} + +template<> inline const tflite::FullyConnectedOptions *Operator::builtin_options_as() const { + return builtin_options_as_FullyConnectedOptions(); +} + +template<> inline const tflite::SoftmaxOptions *Operator::builtin_options_as() const { + return builtin_options_as_SoftmaxOptions(); +} + +template<> inline const tflite::ConcatenationOptions *Operator::builtin_options_as() const { + return builtin_options_as_ConcatenationOptions(); +} + +template<> inline const tflite::AddOptions *Operator::builtin_options_as() const { + return builtin_options_as_AddOptions(); +} + +template<> inline const tflite::L2NormOptions *Operator::builtin_options_as() const { + return builtin_options_as_L2NormOptions(); +} + +template<> inline const tflite::LocalResponseNormalizationOptions *Operator::builtin_options_as() const { + return builtin_options_as_LocalResponseNormalizationOptions(); +} + +template<> inline const tflite::LSTMOptions *Operator::builtin_options_as() const { + return builtin_options_as_LSTMOptions(); +} + +template<> inline const tflite::ResizeBilinearOptions *Operator::builtin_options_as() const { + return builtin_options_as_ResizeBilinearOptions(); +} + +template<> inline const tflite::CallOptions *Operator::builtin_options_as() const { + return builtin_options_as_CallOptions(); +} + +template<> inline const tflite::ReshapeOptions *Operator::builtin_options_as() const { + return builtin_options_as_ReshapeOptions(); +} + +template<> inline const tflite::SkipGramOptions *Operator::builtin_options_as() const { + return builtin_options_as_SkipGramOptions(); +} + +template<> inline const tflite::SpaceToDepthOptions *Operator::builtin_options_as() const { + return builtin_options_as_SpaceToDepthOptions(); +} + +template<> inline const tflite::EmbeddingLookupSparseOptions *Operator::builtin_options_as() const { + return builtin_options_as_EmbeddingLookupSparseOptions(); +} + +template<> inline const tflite::MulOptions *Operator::builtin_options_as() const { + return builtin_options_as_MulOptions(); +} + +template<> inline const tflite::PadOptions *Operator::builtin_options_as() const { + return builtin_options_as_PadOptions(); +} + +template<> inline const tflite::GatherOptions *Operator::builtin_options_as() const { + return builtin_options_as_GatherOptions(); +} + +template<> inline const tflite::BatchToSpaceNDOptions *Operator::builtin_options_as() const { + return builtin_options_as_BatchToSpaceNDOptions(); +} + +template<> inline const tflite::SpaceToBatchNDOptions *Operator::builtin_options_as() const { + return builtin_options_as_SpaceToBatchNDOptions(); +} + +template<> inline const tflite::TransposeOptions *Operator::builtin_options_as() const { + return builtin_options_as_TransposeOptions(); +} + +template<> inline const tflite::ReducerOptions *Operator::builtin_options_as() const { + return builtin_options_as_ReducerOptions(); +} + +template<> inline const tflite::SubOptions *Operator::builtin_options_as() const { + return builtin_options_as_SubOptions(); +} + +template<> inline const tflite::DivOptions *Operator::builtin_options_as() const { + return builtin_options_as_DivOptions(); +} + +template<> inline const tflite::SqueezeOptions 
*Operator::builtin_options_as() const { + return builtin_options_as_SqueezeOptions(); +} + +template<> inline const tflite::SequenceRNNOptions *Operator::builtin_options_as() const { + return builtin_options_as_SequenceRNNOptions(); +} + +template<> inline const tflite::StridedSliceOptions *Operator::builtin_options_as() const { + return builtin_options_as_StridedSliceOptions(); +} + +template<> inline const tflite::ExpOptions *Operator::builtin_options_as() const { + return builtin_options_as_ExpOptions(); +} + +template<> inline const tflite::TopKV2Options *Operator::builtin_options_as() const { + return builtin_options_as_TopKV2Options(); +} + +template<> inline const tflite::SplitOptions *Operator::builtin_options_as() const { + return builtin_options_as_SplitOptions(); +} + +template<> inline const tflite::LogSoftmaxOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogSoftmaxOptions(); +} + +template<> inline const tflite::CastOptions *Operator::builtin_options_as() const { + return builtin_options_as_CastOptions(); +} + +template<> inline const tflite::DequantizeOptions *Operator::builtin_options_as() const { + return builtin_options_as_DequantizeOptions(); +} + +template<> inline const tflite::MaximumMinimumOptions *Operator::builtin_options_as() const { + return builtin_options_as_MaximumMinimumOptions(); +} + +template<> inline const tflite::ArgMaxOptions *Operator::builtin_options_as() const { + return builtin_options_as_ArgMaxOptions(); +} + +template<> inline const tflite::LessOptions *Operator::builtin_options_as() const { + return builtin_options_as_LessOptions(); +} + +template<> inline const tflite::NegOptions *Operator::builtin_options_as() const { + return builtin_options_as_NegOptions(); +} + +template<> inline const tflite::PadV2Options *Operator::builtin_options_as() const { + return builtin_options_as_PadV2Options(); +} + +template<> inline const tflite::GreaterOptions *Operator::builtin_options_as() const { + return builtin_options_as_GreaterOptions(); +} + +template<> inline const tflite::GreaterEqualOptions *Operator::builtin_options_as() const { + return builtin_options_as_GreaterEqualOptions(); +} + +template<> inline const tflite::LessEqualOptions *Operator::builtin_options_as() const { + return builtin_options_as_LessEqualOptions(); +} + +template<> inline const tflite::SelectOptions *Operator::builtin_options_as() const { + return builtin_options_as_SelectOptions(); +} + +template<> inline const tflite::SliceOptions *Operator::builtin_options_as() const { + return builtin_options_as_SliceOptions(); +} + +template<> inline const tflite::TransposeConvOptions *Operator::builtin_options_as() const { + return builtin_options_as_TransposeConvOptions(); +} + +template<> inline const tflite::SparseToDenseOptions *Operator::builtin_options_as() const { + return builtin_options_as_SparseToDenseOptions(); +} + +template<> inline const tflite::TileOptions *Operator::builtin_options_as() const { + return builtin_options_as_TileOptions(); +} + +template<> inline const tflite::ExpandDimsOptions *Operator::builtin_options_as() const { + return builtin_options_as_ExpandDimsOptions(); +} + +template<> inline const tflite::EqualOptions *Operator::builtin_options_as() const { + return builtin_options_as_EqualOptions(); +} + +template<> inline const tflite::NotEqualOptions *Operator::builtin_options_as() const { + return builtin_options_as_NotEqualOptions(); +} + +template<> inline const tflite::ShapeOptions *Operator::builtin_options_as() const { + 
return builtin_options_as_ShapeOptions(); +} + +template<> inline const tflite::PowOptions *Operator::builtin_options_as() const { + return builtin_options_as_PowOptions(); +} + +template<> inline const tflite::ArgMinOptions *Operator::builtin_options_as() const { + return builtin_options_as_ArgMinOptions(); +} + +template<> inline const tflite::FakeQuantOptions *Operator::builtin_options_as() const { + return builtin_options_as_FakeQuantOptions(); +} + +template<> inline const tflite::PackOptions *Operator::builtin_options_as() const { + return builtin_options_as_PackOptions(); +} + +template<> inline const tflite::LogicalOrOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogicalOrOptions(); +} + +template<> inline const tflite::OneHotOptions *Operator::builtin_options_as() const { + return builtin_options_as_OneHotOptions(); +} + +template<> inline const tflite::LogicalAndOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogicalAndOptions(); +} + +template<> inline const tflite::LogicalNotOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogicalNotOptions(); +} + +template<> inline const tflite::UnpackOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnpackOptions(); +} + +template<> inline const tflite::FloorDivOptions *Operator::builtin_options_as() const { + return builtin_options_as_FloorDivOptions(); +} + +template<> inline const tflite::SquareOptions *Operator::builtin_options_as() const { + return builtin_options_as_SquareOptions(); +} + +template<> inline const tflite::ZerosLikeOptions *Operator::builtin_options_as() const { + return builtin_options_as_ZerosLikeOptions(); +} + +template<> inline const tflite::FillOptions *Operator::builtin_options_as() const { + return builtin_options_as_FillOptions(); +} + +template<> inline const tflite::BidirectionalSequenceLSTMOptions *Operator::builtin_options_as() const { + return builtin_options_as_BidirectionalSequenceLSTMOptions(); +} + +template<> inline const tflite::BidirectionalSequenceRNNOptions *Operator::builtin_options_as() const { + return builtin_options_as_BidirectionalSequenceRNNOptions(); +} + +template<> inline const tflite::UnidirectionalSequenceLSTMOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnidirectionalSequenceLSTMOptions(); +} + +template<> inline const tflite::FloorModOptions *Operator::builtin_options_as() const { + return builtin_options_as_FloorModOptions(); +} + +template<> inline const tflite::RangeOptions *Operator::builtin_options_as() const { + return builtin_options_as_RangeOptions(); +} + +template<> inline const tflite::ResizeNearestNeighborOptions *Operator::builtin_options_as() const { + return builtin_options_as_ResizeNearestNeighborOptions(); +} + +template<> inline const tflite::LeakyReluOptions *Operator::builtin_options_as() const { + return builtin_options_as_LeakyReluOptions(); +} + +template<> inline const tflite::SquaredDifferenceOptions *Operator::builtin_options_as() const { + return builtin_options_as_SquaredDifferenceOptions(); +} + +template<> inline const tflite::MirrorPadOptions *Operator::builtin_options_as() const { + return builtin_options_as_MirrorPadOptions(); +} + +template<> inline const tflite::AbsOptions *Operator::builtin_options_as() const { + return builtin_options_as_AbsOptions(); +} + +template<> inline const tflite::SplitVOptions *Operator::builtin_options_as() const { + return builtin_options_as_SplitVOptions(); +} + +template<> inline 
const tflite::UniqueOptions *Operator::builtin_options_as() const { + return builtin_options_as_UniqueOptions(); +} + +template<> inline const tflite::ReverseV2Options *Operator::builtin_options_as() const { + return builtin_options_as_ReverseV2Options(); +} + +template<> inline const tflite::AddNOptions *Operator::builtin_options_as() const { + return builtin_options_as_AddNOptions(); +} + +template<> inline const tflite::GatherNdOptions *Operator::builtin_options_as() const { + return builtin_options_as_GatherNdOptions(); +} + +template<> inline const tflite::CosOptions *Operator::builtin_options_as() const { + return builtin_options_as_CosOptions(); +} + +template<> inline const tflite::WhereOptions *Operator::builtin_options_as() const { + return builtin_options_as_WhereOptions(); +} + +template<> inline const tflite::RankOptions *Operator::builtin_options_as() const { + return builtin_options_as_RankOptions(); +} + +template<> inline const tflite::ReverseSequenceOptions *Operator::builtin_options_as() const { + return builtin_options_as_ReverseSequenceOptions(); +} + +template<> inline const tflite::MatrixDiagOptions *Operator::builtin_options_as() const { + return builtin_options_as_MatrixDiagOptions(); +} + +template<> inline const tflite::QuantizeOptions *Operator::builtin_options_as() const { + return builtin_options_as_QuantizeOptions(); +} + +template<> inline const tflite::MatrixSetDiagOptions *Operator::builtin_options_as() const { + return builtin_options_as_MatrixSetDiagOptions(); +} + +template<> inline const tflite::HardSwishOptions *Operator::builtin_options_as() const { + return builtin_options_as_HardSwishOptions(); +} + +template<> inline const tflite::IfOptions *Operator::builtin_options_as() const { + return builtin_options_as_IfOptions(); +} + +template<> inline const tflite::WhileOptions *Operator::builtin_options_as() const { + return builtin_options_as_WhileOptions(); +} + +template<> inline const tflite::DepthToSpaceOptions *Operator::builtin_options_as() const { + return builtin_options_as_DepthToSpaceOptions(); +} + +template<> inline const tflite::NonMaxSuppressionV4Options *Operator::builtin_options_as() const { + return builtin_options_as_NonMaxSuppressionV4Options(); +} + +template<> inline const tflite::NonMaxSuppressionV5Options *Operator::builtin_options_as() const { + return builtin_options_as_NonMaxSuppressionV5Options(); +} + +template<> inline const tflite::ScatterNdOptions *Operator::builtin_options_as() const { + return builtin_options_as_ScatterNdOptions(); +} + +template<> inline const tflite::SelectV2Options *Operator::builtin_options_as() const { + return builtin_options_as_SelectV2Options(); +} + +template<> inline const tflite::DensifyOptions *Operator::builtin_options_as() const { + return builtin_options_as_DensifyOptions(); +} + +template<> inline const tflite::SegmentSumOptions *Operator::builtin_options_as() const { + return builtin_options_as_SegmentSumOptions(); +} + +template<> inline const tflite::BatchMatMulOptions *Operator::builtin_options_as() const { + return builtin_options_as_BatchMatMulOptions(); +} + +template<> inline const tflite::CumsumOptions *Operator::builtin_options_as() const { + return builtin_options_as_CumsumOptions(); +} + +template<> inline const tflite::CallOnceOptions *Operator::builtin_options_as() const { + return builtin_options_as_CallOnceOptions(); +} + +template<> inline const tflite::BroadcastToOptions *Operator::builtin_options_as() const { + return builtin_options_as_BroadcastToOptions(); +} + 
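The run of builtin_options_as_*() helpers above, together with the builtin_options_as<T>() specializations in this part of the header, is the type-safe way to read the BuiltinOptions union from a serialized model: the stored type tag (builtin_options_type()) is consulted first, and the accessor yields nullptr when the union holds a different option table. A minimal read-side sketch, assuming this generated header is reachable as "schema_generated.h" and that buf points at a valid serialized tflite::Model; the function name and include path are illustrative, not part of this change:

#include <cstdint>
#include <cstdio>

#include "schema_generated.h"  // assumed include path for this generated header

// Print the stride of every Conv2D operator in the first subgraph of an
// already-verified .tflite buffer.
void DumpConv2DStrides(const uint8_t *buf) {
  const tflite::Model *model = tflite::GetModel(buf);
  const tflite::SubGraph *graph = model->subgraphs()->Get(0);
  for (const tflite::Operator *op : *graph->operators()) {
    // builtin_options_as<T>() checks builtin_options_type() internally and
    // returns nullptr on a mismatch, so the downcast is safe to attempt.
    if (const tflite::Conv2DOptions *conv =
            op->builtin_options_as<tflite::Conv2DOptions>()) {
      std::printf("conv2d stride: %dx%d\n", conv->stride_w(), conv->stride_h());
    }
  }
}

The same pattern applies to builtin_options_2_as<T>() for the StableHLO option tables defined earlier in this header.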
+template<> inline const tflite::Rfft2dOptions *Operator::builtin_options_as() const { + return builtin_options_as_Rfft2dOptions(); +} + +template<> inline const tflite::Conv3DOptions *Operator::builtin_options_as() const { + return builtin_options_as_Conv3DOptions(); +} + +template<> inline const tflite::HashtableOptions *Operator::builtin_options_as() const { + return builtin_options_as_HashtableOptions(); +} + +template<> inline const tflite::HashtableFindOptions *Operator::builtin_options_as() const { + return builtin_options_as_HashtableFindOptions(); +} + +template<> inline const tflite::HashtableImportOptions *Operator::builtin_options_as() const { + return builtin_options_as_HashtableImportOptions(); +} + +template<> inline const tflite::HashtableSizeOptions *Operator::builtin_options_as() const { + return builtin_options_as_HashtableSizeOptions(); +} + +template<> inline const tflite::VarHandleOptions *Operator::builtin_options_as() const { + return builtin_options_as_VarHandleOptions(); +} + +template<> inline const tflite::ReadVariableOptions *Operator::builtin_options_as() const { + return builtin_options_as_ReadVariableOptions(); +} + +template<> inline const tflite::AssignVariableOptions *Operator::builtin_options_as() const { + return builtin_options_as_AssignVariableOptions(); +} + +template<> inline const tflite::RandomOptions *Operator::builtin_options_as() const { + return builtin_options_as_RandomOptions(); +} + +template<> inline const tflite::BucketizeOptions *Operator::builtin_options_as() const { + return builtin_options_as_BucketizeOptions(); +} + +template<> inline const tflite::GeluOptions *Operator::builtin_options_as() const { + return builtin_options_as_GeluOptions(); +} + +template<> inline const tflite::DynamicUpdateSliceOptions *Operator::builtin_options_as() const { + return builtin_options_as_DynamicUpdateSliceOptions(); +} + +template<> inline const tflite::UnsortedSegmentProdOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnsortedSegmentProdOptions(); +} + +template<> inline const tflite::UnsortedSegmentMaxOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnsortedSegmentMaxOptions(); +} + +template<> inline const tflite::UnsortedSegmentMinOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnsortedSegmentMinOptions(); +} + +template<> inline const tflite::UnsortedSegmentSumOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnsortedSegmentSumOptions(); +} + +template<> inline const tflite::ATan2Options *Operator::builtin_options_as() const { + return builtin_options_as_ATan2Options(); +} + +template<> inline const tflite::SignOptions *Operator::builtin_options_as() const { + return builtin_options_as_SignOptions(); +} + +template<> inline const tflite::BitcastOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitcastOptions(); +} + +template<> inline const tflite::BitwiseXorOptions *Operator::builtin_options_as() const { + return builtin_options_as_BitwiseXorOptions(); +} + +template<> inline const tflite::RightShiftOptions *Operator::builtin_options_as() const { + return builtin_options_as_RightShiftOptions(); +} + +template<> inline const tflite::StablehloConcatenateOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloConcatenateOptions(); +} + +template<> inline const tflite::StablehloBroadcastInDimOptions *Operator::builtin_options_2_as() const { + return 
builtin_options_2_as_StablehloBroadcastInDimOptions(); +} + +template<> inline const tflite::StablehloSliceOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloSliceOptions(); +} + +template<> inline const tflite::StablehloConvolutionOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloConvolutionOptions(); +} + +template<> inline const tflite::StablehloCustomCallOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloCustomCallOptions(); +} + +template<> inline const tflite::StablehloReduceOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloReduceOptions(); +} + +template<> inline const tflite::StablehloScatterOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloScatterOptions(); +} + +template<> inline const tflite::StablehloCompareOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloCompareOptions(); +} + +template<> inline const tflite::StablehloDynamicSliceOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloDynamicSliceOptions(); +} + +template<> inline const tflite::StablehloPadOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloPadOptions(); +} + +template<> inline const tflite::StablehloIotaOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloIotaOptions(); +} + +template<> inline const tflite::StablehloDotGeneralOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloDotGeneralOptions(); +} + +template<> inline const tflite::StablehloReduceWindowOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloReduceWindowOptions(); +} + +template<> inline const tflite::StablehloSortOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloSortOptions(); +} + +template<> inline const tflite::StablehloWhileOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloWhileOptions(); +} + +template<> inline const tflite::StablehloGatherOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloGatherOptions(); +} + +template<> inline const tflite::StablehloTransposeOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloTransposeOptions(); +} + +template<> inline const tflite::DilateOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_DilateOptions(); +} + +template<> inline const tflite::StablehloRngBitGeneratorOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloRngBitGeneratorOptions(); +} + +template<> inline const tflite::ReduceWindowOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_ReduceWindowOptions(); +} + +template<> inline const tflite::StableHLOCompositeOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StableHLOCompositeOptions(); +} + +template<> inline const tflite::StablehloShiftLeftOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloShiftLeftOptions(); +} + +template<> inline const tflite::StablehloCaseOptions *Operator::builtin_options_2_as() const { + return builtin_options_2_as_StablehloCaseOptions(); +} + +struct OperatorBuilder { + typedef Operator Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + 
::flatbuffers::uoffset_t start_; + void add_opcode_index(uint32_t opcode_index) { + fbb_.AddElement(Operator::VT_OPCODE_INDEX, opcode_index, 0); + } + void add_inputs(::flatbuffers::Offset<::flatbuffers::Vector> inputs) { + fbb_.AddOffset(Operator::VT_INPUTS, inputs); + } + void add_outputs(::flatbuffers::Offset<::flatbuffers::Vector> outputs) { + fbb_.AddOffset(Operator::VT_OUTPUTS, outputs); + } + void add_builtin_options_type(tflite::BuiltinOptions builtin_options_type) { + fbb_.AddElement(Operator::VT_BUILTIN_OPTIONS_TYPE, static_cast(builtin_options_type), 0); + } + void add_builtin_options(::flatbuffers::Offset builtin_options) { + fbb_.AddOffset(Operator::VT_BUILTIN_OPTIONS, builtin_options); + } + void add_custom_options(::flatbuffers::Offset<::flatbuffers::Vector> custom_options) { + fbb_.AddOffset(Operator::VT_CUSTOM_OPTIONS, custom_options); + } + void add_custom_options_format(tflite::CustomOptionsFormat custom_options_format) { + fbb_.AddElement(Operator::VT_CUSTOM_OPTIONS_FORMAT, static_cast(custom_options_format), 0); + } + void add_mutating_variable_inputs(::flatbuffers::Offset<::flatbuffers::Vector> mutating_variable_inputs) { + fbb_.AddOffset(Operator::VT_MUTATING_VARIABLE_INPUTS, mutating_variable_inputs); + } + void add_intermediates(::flatbuffers::Offset<::flatbuffers::Vector> intermediates) { + fbb_.AddOffset(Operator::VT_INTERMEDIATES, intermediates); + } + void add_large_custom_options_offset(uint64_t large_custom_options_offset) { + fbb_.AddElement(Operator::VT_LARGE_CUSTOM_OPTIONS_OFFSET, large_custom_options_offset, 0); + } + void add_large_custom_options_size(uint64_t large_custom_options_size) { + fbb_.AddElement(Operator::VT_LARGE_CUSTOM_OPTIONS_SIZE, large_custom_options_size, 0); + } + void add_builtin_options_2_type(tflite::BuiltinOptions2 builtin_options_2_type) { + fbb_.AddElement(Operator::VT_BUILTIN_OPTIONS_2_TYPE, static_cast(builtin_options_2_type), 0); + } + void add_builtin_options_2(::flatbuffers::Offset builtin_options_2) { + fbb_.AddOffset(Operator::VT_BUILTIN_OPTIONS_2, builtin_options_2); + } + void add_debug_metadata_index(int32_t debug_metadata_index) { + fbb_.AddElement(Operator::VT_DEBUG_METADATA_INDEX, debug_metadata_index, -1); + } + explicit OperatorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateOperator( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t opcode_index = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> inputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> outputs = 0, + tflite::BuiltinOptions builtin_options_type = tflite::BuiltinOptions_NONE, + ::flatbuffers::Offset builtin_options = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> custom_options = 0, + tflite::CustomOptionsFormat custom_options_format = tflite::CustomOptionsFormat_FLEXBUFFERS, + ::flatbuffers::Offset<::flatbuffers::Vector> mutating_variable_inputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> intermediates = 0, + uint64_t large_custom_options_offset = 0, + uint64_t large_custom_options_size = 0, + tflite::BuiltinOptions2 builtin_options_2_type = tflite::BuiltinOptions2_NONE, + ::flatbuffers::Offset builtin_options_2 = 0, + int32_t debug_metadata_index = -1) { + OperatorBuilder builder_(_fbb); + builder_.add_large_custom_options_size(large_custom_options_size); + 
builder_.add_large_custom_options_offset(large_custom_options_offset); + builder_.add_debug_metadata_index(debug_metadata_index); + builder_.add_builtin_options_2(builtin_options_2); + builder_.add_intermediates(intermediates); + builder_.add_mutating_variable_inputs(mutating_variable_inputs); + builder_.add_custom_options(custom_options); + builder_.add_builtin_options(builtin_options); + builder_.add_outputs(outputs); + builder_.add_inputs(inputs); + builder_.add_opcode_index(opcode_index); + builder_.add_builtin_options_2_type(builtin_options_2_type); + builder_.add_custom_options_format(custom_options_format); + builder_.add_builtin_options_type(builtin_options_type); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateOperatorDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t opcode_index = 0, + const std::vector *inputs = nullptr, + const std::vector *outputs = nullptr, + tflite::BuiltinOptions builtin_options_type = tflite::BuiltinOptions_NONE, + ::flatbuffers::Offset builtin_options = 0, + const std::vector *custom_options = nullptr, + tflite::CustomOptionsFormat custom_options_format = tflite::CustomOptionsFormat_FLEXBUFFERS, + const std::vector *mutating_variable_inputs = nullptr, + const std::vector *intermediates = nullptr, + uint64_t large_custom_options_offset = 0, + uint64_t large_custom_options_size = 0, + tflite::BuiltinOptions2 builtin_options_2_type = tflite::BuiltinOptions2_NONE, + ::flatbuffers::Offset builtin_options_2 = 0, + int32_t debug_metadata_index = -1) { + auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; + auto custom_options__ = custom_options ? _fbb.CreateVector(*custom_options) : 0; + auto mutating_variable_inputs__ = mutating_variable_inputs ? _fbb.CreateVector(*mutating_variable_inputs) : 0; + auto intermediates__ = intermediates ? 
_fbb.CreateVector(*intermediates) : 0; + return tflite::CreateOperator( + _fbb, + opcode_index, + inputs__, + outputs__, + builtin_options_type, + builtin_options, + custom_options__, + custom_options_format, + mutating_variable_inputs__, + intermediates__, + large_custom_options_offset, + large_custom_options_size, + builtin_options_2_type, + builtin_options_2, + debug_metadata_index); +} + +::flatbuffers::Offset CreateOperator(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SubGraphT : public ::flatbuffers::NativeTable { + typedef SubGraph TableType; + std::vector> tensors{}; + std::vector inputs{}; + std::vector outputs{}; + std::vector> operators{}; + std::string name{}; + int32_t debug_metadata_index = -1; + SubGraphT() = default; + SubGraphT(const SubGraphT &o); + SubGraphT(SubGraphT&&) FLATBUFFERS_NOEXCEPT = default; + SubGraphT &operator=(SubGraphT o) FLATBUFFERS_NOEXCEPT; +}; + +struct SubGraph FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SubGraphT NativeTableType; + typedef SubGraphBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TENSORS = 4, + VT_INPUTS = 6, + VT_OUTPUTS = 8, + VT_OPERATORS = 10, + VT_NAME = 12, + VT_DEBUG_METADATA_INDEX = 14 + }; + const ::flatbuffers::Vector<::flatbuffers::Offset> *tensors() const { + return GetPointer> *>(VT_TENSORS); + } + const ::flatbuffers::Vector *inputs() const { + return GetPointer *>(VT_INPUTS); + } + const ::flatbuffers::Vector *outputs() const { + return GetPointer *>(VT_OUTPUTS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *operators() const { + return GetPointer> *>(VT_OPERATORS); + } + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + int32_t debug_metadata_index() const { + return GetField(VT_DEBUG_METADATA_INDEX, -1); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TENSORS) && + verifier.VerifyVector(tensors()) && + verifier.VerifyVectorOfTables(tensors()) && + VerifyOffset(verifier, VT_INPUTS) && + verifier.VerifyVector(inputs()) && + VerifyOffset(verifier, VT_OUTPUTS) && + verifier.VerifyVector(outputs()) && + VerifyOffset(verifier, VT_OPERATORS) && + verifier.VerifyVector(operators()) && + verifier.VerifyVectorOfTables(operators()) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField(verifier, VT_DEBUG_METADATA_INDEX, 4) && + verifier.EndTable(); + } + SubGraphT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SubGraphT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SubGraphBuilder { + typedef SubGraph Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_tensors(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> tensors) { + fbb_.AddOffset(SubGraph::VT_TENSORS, tensors); + } + void add_inputs(::flatbuffers::Offset<::flatbuffers::Vector> inputs) { + fbb_.AddOffset(SubGraph::VT_INPUTS, inputs); + } + void add_outputs(::flatbuffers::Offset<::flatbuffers::Vector> outputs) { + fbb_.AddOffset(SubGraph::VT_OUTPUTS, outputs); + } + void add_operators(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> operators) 
{ + fbb_.AddOffset(SubGraph::VT_OPERATORS, operators); + } + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(SubGraph::VT_NAME, name); + } + void add_debug_metadata_index(int32_t debug_metadata_index) { + fbb_.AddElement(SubGraph::VT_DEBUG_METADATA_INDEX, debug_metadata_index, -1); + } + explicit SubGraphBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSubGraph( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> tensors = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> inputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> outputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> operators = 0, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + int32_t debug_metadata_index = -1) { + SubGraphBuilder builder_(_fbb); + builder_.add_debug_metadata_index(debug_metadata_index); + builder_.add_name(name); + builder_.add_operators(operators); + builder_.add_outputs(outputs); + builder_.add_inputs(inputs); + builder_.add_tensors(tensors); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSubGraphDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector<::flatbuffers::Offset> *tensors = nullptr, + const std::vector *inputs = nullptr, + const std::vector *outputs = nullptr, + const std::vector<::flatbuffers::Offset> *operators = nullptr, + const char *name = nullptr, + int32_t debug_metadata_index = -1) { + auto tensors__ = tensors ? _fbb.CreateVector<::flatbuffers::Offset>(*tensors) : 0; + auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; + auto operators__ = operators ? _fbb.CreateVector<::flatbuffers::Offset>(*operators) : 0; + auto name__ = name ? 
_fbb.CreateString(name) : 0; + return tflite::CreateSubGraph( + _fbb, + tensors__, + inputs__, + outputs__, + operators__, + name__, + debug_metadata_index); +} + +::flatbuffers::Offset CreateSubGraph(::flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct BufferT : public ::flatbuffers::NativeTable { + typedef Buffer TableType; + std::vector data{}; + uint64_t offset = 0; + uint64_t size = 0; +}; + +struct Buffer FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef BufferT NativeTableType; + typedef BufferBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4, + VT_OFFSET = 6, + VT_SIZE = 8 + }; + const ::flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + uint64_t offset() const { + return GetField(VT_OFFSET, 0); + } + uint64_t size() const { + return GetField(VT_SIZE, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + VerifyField(verifier, VT_OFFSET, 8) && + VerifyField(verifier, VT_SIZE, 8) && + verifier.EndTable(); + } + BufferT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(BufferT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BufferBuilder { + typedef Buffer Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector> data) { + fbb_.AddOffset(Buffer::VT_DATA, data); + } + void add_offset(uint64_t offset) { + fbb_.AddElement(Buffer::VT_OFFSET, offset, 0); + } + void add_size(uint64_t size) { + fbb_.AddElement(Buffer::VT_SIZE, size, 0); + } + explicit BufferBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateBuffer( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> data = 0, + uint64_t offset = 0, + uint64_t size = 0) { + BufferBuilder builder_(_fbb); + builder_.add_size(size); + builder_.add_offset(offset); + builder_.add_data(data); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateBufferDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr, + uint64_t offset = 0, + uint64_t size = 0) { + if (data) { _fbb.ForceVectorAlignment(data->size(), sizeof(uint8_t), 16); } + auto data__ = data ? 
_fbb.CreateVector<uint8_t>(*data) : 0; + return tflite::CreateBuffer( + _fbb, + data__, + offset, + size); +} + +::flatbuffers::Offset<Buffer> CreateBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct MetadataT : public ::flatbuffers::NativeTable { + typedef Metadata TableType; + std::string name{}; + uint32_t buffer = 0; +}; + +struct Metadata FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef MetadataT NativeTableType; + typedef MetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_BUFFER = 6 + }; + const ::flatbuffers::String *name() const { + return GetPointer<const ::flatbuffers::String *>(VT_NAME); + } + uint32_t buffer() const { + return GetField<uint32_t>(VT_BUFFER, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField<uint32_t>(verifier, VT_BUFFER, 4) && + verifier.EndTable(); + } + MetadataT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(MetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset<Metadata> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct MetadataBuilder { + typedef Metadata Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(Metadata::VT_NAME, name); + } + void add_buffer(uint32_t buffer) { + fbb_.AddElement<uint32_t>(Metadata::VT_BUFFER, buffer, 0); + } + explicit MetadataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset<Metadata> Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset<Metadata>(end); + return o; + } +}; + +inline ::flatbuffers::Offset<Metadata> CreateMetadata( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + uint32_t buffer = 0) { + MetadataBuilder builder_(_fbb); + builder_.add_buffer(buffer); + builder_.add_name(name); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset<Metadata> CreateMetadataDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + uint32_t buffer = 0) { + auto name__ = name ?
_fbb.CreateString(name) : 0; + return tflite::CreateMetadata( + _fbb, + name__, + buffer); +} + +::flatbuffers::Offset<Metadata> CreateMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const MetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct TensorMapT : public ::flatbuffers::NativeTable { + typedef TensorMap TableType; + std::string name{}; + uint32_t tensor_index = 0; +}; + +struct TensorMap FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TensorMapT NativeTableType; + typedef TensorMapBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_TENSOR_INDEX = 6 + }; + const ::flatbuffers::String *name() const { + return GetPointer<const ::flatbuffers::String *>(VT_NAME); + } + uint32_t tensor_index() const { + return GetField<uint32_t>(VT_TENSOR_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField<uint32_t>(verifier, VT_TENSOR_INDEX, 4) && + verifier.EndTable(); + } + TensorMapT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TensorMapT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset<TensorMap> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorMapT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TensorMapBuilder { + typedef TensorMap Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(TensorMap::VT_NAME, name); + } + void add_tensor_index(uint32_t tensor_index) { + fbb_.AddElement<uint32_t>(TensorMap::VT_TENSOR_INDEX, tensor_index, 0); + } + explicit TensorMapBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset<TensorMap> Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset<TensorMap>(end); + return o; + } +}; + +inline ::flatbuffers::Offset<TensorMap> CreateTensorMap( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + uint32_t tensor_index = 0) { + TensorMapBuilder builder_(_fbb); + builder_.add_tensor_index(tensor_index); + builder_.add_name(name); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset<TensorMap> CreateTensorMapDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + uint32_t tensor_index = 0) { + auto name__ = name ?
_fbb.CreateString(name) : 0; + return tflite::CreateTensorMap( + _fbb, + name__, + tensor_index); +} + +::flatbuffers::Offset CreateTensorMap(::flatbuffers::FlatBufferBuilder &_fbb, const TensorMapT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SignatureDefT : public ::flatbuffers::NativeTable { + typedef SignatureDef TableType; + std::vector> inputs{}; + std::vector> outputs{}; + std::string signature_key{}; + uint32_t subgraph_index = 0; + SignatureDefT() = default; + SignatureDefT(const SignatureDefT &o); + SignatureDefT(SignatureDefT&&) FLATBUFFERS_NOEXCEPT = default; + SignatureDefT &operator=(SignatureDefT o) FLATBUFFERS_NOEXCEPT; +}; + +struct SignatureDef FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SignatureDefT NativeTableType; + typedef SignatureDefBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_INPUTS = 4, + VT_OUTPUTS = 6, + VT_SIGNATURE_KEY = 8, + VT_SUBGRAPH_INDEX = 12 + }; + const ::flatbuffers::Vector<::flatbuffers::Offset> *inputs() const { + return GetPointer> *>(VT_INPUTS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *outputs() const { + return GetPointer> *>(VT_OUTPUTS); + } + const ::flatbuffers::String *signature_key() const { + return GetPointer(VT_SIGNATURE_KEY); + } + uint32_t subgraph_index() const { + return GetField(VT_SUBGRAPH_INDEX, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_INPUTS) && + verifier.VerifyVector(inputs()) && + verifier.VerifyVectorOfTables(inputs()) && + VerifyOffset(verifier, VT_OUTPUTS) && + verifier.VerifyVector(outputs()) && + verifier.VerifyVectorOfTables(outputs()) && + VerifyOffset(verifier, VT_SIGNATURE_KEY) && + verifier.VerifyString(signature_key()) && + VerifyField(verifier, VT_SUBGRAPH_INDEX, 4) && + verifier.EndTable(); + } + SignatureDefT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SignatureDefT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SignatureDefT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SignatureDefBuilder { + typedef SignatureDef Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_inputs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> inputs) { + fbb_.AddOffset(SignatureDef::VT_INPUTS, inputs); + } + void add_outputs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> outputs) { + fbb_.AddOffset(SignatureDef::VT_OUTPUTS, outputs); + } + void add_signature_key(::flatbuffers::Offset<::flatbuffers::String> signature_key) { + fbb_.AddOffset(SignatureDef::VT_SIGNATURE_KEY, signature_key); + } + void add_subgraph_index(uint32_t subgraph_index) { + fbb_.AddElement(SignatureDef::VT_SUBGRAPH_INDEX, subgraph_index, 0); + } + explicit SignatureDefBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSignatureDef( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> inputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> outputs = 0, + 
::flatbuffers::Offset<::flatbuffers::String> signature_key = 0, + uint32_t subgraph_index = 0) { + SignatureDefBuilder builder_(_fbb); + builder_.add_subgraph_index(subgraph_index); + builder_.add_signature_key(signature_key); + builder_.add_outputs(outputs); + builder_.add_inputs(inputs); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSignatureDefDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector<::flatbuffers::Offset> *inputs = nullptr, + const std::vector<::flatbuffers::Offset> *outputs = nullptr, + const char *signature_key = nullptr, + uint32_t subgraph_index = 0) { + auto inputs__ = inputs ? _fbb.CreateVector<::flatbuffers::Offset>(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector<::flatbuffers::Offset>(*outputs) : 0; + auto signature_key__ = signature_key ? _fbb.CreateString(signature_key) : 0; + return tflite::CreateSignatureDef( + _fbb, + inputs__, + outputs__, + signature_key__, + subgraph_index); +} + +::flatbuffers::Offset CreateSignatureDef(::flatbuffers::FlatBufferBuilder &_fbb, const SignatureDefT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ModelT : public ::flatbuffers::NativeTable { + typedef Model TableType; + uint32_t version = 0; + std::vector> operator_codes{}; + std::vector> subgraphs{}; + std::string description{}; + std::vector> buffers{}; + std::vector metadata_buffer{}; + std::vector> metadata{}; + std::vector> signature_defs{}; + ModelT() = default; + ModelT(const ModelT &o); + ModelT(ModelT&&) FLATBUFFERS_NOEXCEPT = default; + ModelT &operator=(ModelT o) FLATBUFFERS_NOEXCEPT; +}; + +struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ModelT NativeTableType; + typedef ModelBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VERSION = 4, + VT_OPERATOR_CODES = 6, + VT_SUBGRAPHS = 8, + VT_DESCRIPTION = 10, + VT_BUFFERS = 12, + VT_METADATA_BUFFER = 14, + VT_METADATA = 16, + VT_SIGNATURE_DEFS = 18 + }; + uint32_t version() const { + return GetField(VT_VERSION, 0); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *operator_codes() const { + return GetPointer> *>(VT_OPERATOR_CODES); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *subgraphs() const { + return GetPointer> *>(VT_SUBGRAPHS); + } + const ::flatbuffers::String *description() const { + return GetPointer(VT_DESCRIPTION); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *buffers() const { + return GetPointer> *>(VT_BUFFERS); + } + const ::flatbuffers::Vector *metadata_buffer() const { + return GetPointer *>(VT_METADATA_BUFFER); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *metadata() const { + return GetPointer> *>(VT_METADATA); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *signature_defs() const { + return GetPointer> *>(VT_SIGNATURE_DEFS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_VERSION, 4) && + VerifyOffset(verifier, VT_OPERATOR_CODES) && + verifier.VerifyVector(operator_codes()) && + verifier.VerifyVectorOfTables(operator_codes()) && + VerifyOffset(verifier, VT_SUBGRAPHS) && + verifier.VerifyVector(subgraphs()) && + verifier.VerifyVectorOfTables(subgraphs()) && + VerifyOffset(verifier, VT_DESCRIPTION) && + verifier.VerifyString(description()) && + VerifyOffset(verifier, VT_BUFFERS) && + verifier.VerifyVector(buffers()) && + verifier.VerifyVectorOfTables(buffers()) && + VerifyOffset(verifier, VT_METADATA_BUFFER) && + 
verifier.VerifyVector(metadata_buffer()) && + VerifyOffset(verifier, VT_METADATA) && + verifier.VerifyVector(metadata()) && + verifier.VerifyVectorOfTables(metadata()) && + VerifyOffset(verifier, VT_SIGNATURE_DEFS) && + verifier.VerifyVector(signature_defs()) && + verifier.VerifyVectorOfTables(signature_defs()) && + verifier.EndTable(); + } + ModelT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ModelT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ModelBuilder { + typedef Model Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_version(uint32_t version) { + fbb_.AddElement(Model::VT_VERSION, version, 0); + } + void add_operator_codes(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> operator_codes) { + fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes); + } + void add_subgraphs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> subgraphs) { + fbb_.AddOffset(Model::VT_SUBGRAPHS, subgraphs); + } + void add_description(::flatbuffers::Offset<::flatbuffers::String> description) { + fbb_.AddOffset(Model::VT_DESCRIPTION, description); + } + void add_buffers(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> buffers) { + fbb_.AddOffset(Model::VT_BUFFERS, buffers); + } + void add_metadata_buffer(::flatbuffers::Offset<::flatbuffers::Vector> metadata_buffer) { + fbb_.AddOffset(Model::VT_METADATA_BUFFER, metadata_buffer); + } + void add_metadata(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> metadata) { + fbb_.AddOffset(Model::VT_METADATA, metadata); + } + void add_signature_defs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs) { + fbb_.AddOffset(Model::VT_SIGNATURE_DEFS, signature_defs); + } + explicit ModelBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateModel( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t version = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> operator_codes = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> subgraphs = 0, + ::flatbuffers::Offset<::flatbuffers::String> description = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> buffers = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> metadata_buffer = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> metadata = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs = 0) { + ModelBuilder builder_(_fbb); + builder_.add_signature_defs(signature_defs); + builder_.add_metadata(metadata); + builder_.add_metadata_buffer(metadata_buffer); + builder_.add_buffers(buffers); + builder_.add_description(description); + builder_.add_subgraphs(subgraphs); + builder_.add_operator_codes(operator_codes); + builder_.add_version(version); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateModelDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t version = 0, + const std::vector<::flatbuffers::Offset> *operator_codes = nullptr, + const 
std::vector<::flatbuffers::Offset> *subgraphs = nullptr, + const char *description = nullptr, + const std::vector<::flatbuffers::Offset> *buffers = nullptr, + const std::vector *metadata_buffer = nullptr, + const std::vector<::flatbuffers::Offset> *metadata = nullptr, + const std::vector<::flatbuffers::Offset> *signature_defs = nullptr) { + auto operator_codes__ = operator_codes ? _fbb.CreateVector<::flatbuffers::Offset>(*operator_codes) : 0; + auto subgraphs__ = subgraphs ? _fbb.CreateVector<::flatbuffers::Offset>(*subgraphs) : 0; + auto description__ = description ? _fbb.CreateString(description) : 0; + auto buffers__ = buffers ? _fbb.CreateVector<::flatbuffers::Offset>(*buffers) : 0; + auto metadata_buffer__ = metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0; + auto metadata__ = metadata ? _fbb.CreateVector<::flatbuffers::Offset>(*metadata) : 0; + auto signature_defs__ = signature_defs ? _fbb.CreateVector<::flatbuffers::Offset>(*signature_defs) : 0; + return tflite::CreateModel( + _fbb, + version, + operator_codes__, + subgraphs__, + description__, + buffers__, + metadata_buffer__, + metadata__, + signature_defs__); +} + +::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +inline CustomQuantizationT *CustomQuantization::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CustomQuantizationT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CustomQuantization::UnPackTo(CustomQuantizationT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = custom(); if (_e) { _o->custom.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->custom.begin()); } } +} + +inline ::flatbuffers::Offset CustomQuantization::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCustomQuantization(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCustomQuantization(::flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CustomQuantizationT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->custom.size(), sizeof(uint8_t), 16); + auto _custom = _o->custom.size() ? 
_fbb.CreateVector(_o->custom) : 0; + return tflite::CreateCustomQuantization( + _fbb, + _custom); +} + +inline BlockwiseQuantizationT *BlockwiseQuantization::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BlockwiseQuantizationT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BlockwiseQuantization::UnPackTo(BlockwiseQuantizationT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = scales(); _o->scales = _e; } + { auto _e = zero_points(); _o->zero_points = _e; } + { auto _e = block_size(); _o->block_size = _e; } +} + +inline ::flatbuffers::Offset BlockwiseQuantization::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BlockwiseQuantizationT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBlockwiseQuantization(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBlockwiseQuantization(::flatbuffers::FlatBufferBuilder &_fbb, const BlockwiseQuantizationT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BlockwiseQuantizationT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _scales = _o->scales; + auto _zero_points = _o->zero_points; + auto _block_size = _o->block_size; + return tflite::CreateBlockwiseQuantization( + _fbb, + _scales, + _zero_points, + _block_size); +} + +inline QuantizationParametersT *QuantizationParameters::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new QuantizationParametersT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = min(); if (_e) { _o->min.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->min[_i] = _e->Get(_i); } } else { _o->min.resize(0); } } + { auto _e = max(); if (_e) { _o->max.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->max[_i] = _e->Get(_i); } } else { _o->max.resize(0); } } + { auto _e = scale(); if (_e) { _o->scale.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scale[_i] = _e->Get(_i); } } else { _o->scale.resize(0); } } + { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } else { _o->zero_point.resize(0); } } + { auto _e = details_type(); _o->details.type = _e; } + { auto _e = details(); if (_e) _o->details.value = tflite::QuantizationDetailsUnion::UnPack(_e, details_type(), _resolver); } + { auto _e = quantized_dimension(); _o->quantized_dimension = _e; } +} + +inline ::flatbuffers::Offset QuantizationParameters::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateQuantizationParameters(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateQuantizationParameters(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const QuantizationParametersT* __o; const 
::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _min = _o->min.size() ? _fbb.CreateVector(_o->min) : 0; + auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0; + auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0; + auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0; + auto _details_type = _o->details.type; + auto _details = _o->details.Pack(_fbb); + auto _quantized_dimension = _o->quantized_dimension; + return tflite::CreateQuantizationParameters( + _fbb, + _min, + _max, + _scale, + _zero_point, + _details_type, + _details, + _quantized_dimension); +} + +inline Int32VectorT *Int32Vector::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new Int32VectorT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Int32Vector::UnPackTo(Int32VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } else { _o->values.resize(0); } } +} + +inline ::flatbuffers::Offset Int32Vector::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateInt32Vector(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateInt32Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Int32VectorT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values = _o->values.size() ? _fbb.CreateVector(_o->values) : 0; + return tflite::CreateInt32Vector( + _fbb, + _values); +} + +inline Uint16VectorT *Uint16Vector::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new Uint16VectorT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Uint16Vector::UnPackTo(Uint16VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } else { _o->values.resize(0); } } +} + +inline ::flatbuffers::Offset Uint16Vector::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateUint16Vector(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateUint16Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Uint16VectorT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->values.size(), sizeof(uint16_t), 4); + auto _values = _o->values.size() ? 
_fbb.CreateVector(_o->values) : 0; + return tflite::CreateUint16Vector( + _fbb, + _values); +} + +inline Uint8VectorT *Uint8Vector::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new Uint8VectorT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Uint8Vector::UnPackTo(Uint8VectorT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->values.begin()); } } +} + +inline ::flatbuffers::Offset Uint8Vector::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateUint8Vector(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateUint8Vector(::flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Uint8VectorT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->values.size(), sizeof(uint8_t), 4); + auto _values = _o->values.size() ? _fbb.CreateVector(_o->values) : 0; + return tflite::CreateUint8Vector( + _fbb, + _values); +} + +inline DimensionMetadataT *DimensionMetadata::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DimensionMetadataT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DimensionMetadata::UnPackTo(DimensionMetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = format(); _o->format = _e; } + { auto _e = dense_size(); _o->dense_size = _e; } + { auto _e = array_segments_type(); _o->array_segments.type = _e; } + { auto _e = array_segments(); if (_e) _o->array_segments.value = tflite::SparseIndexVectorUnion::UnPack(_e, array_segments_type(), _resolver); } + { auto _e = array_indices_type(); _o->array_indices.type = _e; } + { auto _e = array_indices(); if (_e) _o->array_indices.value = tflite::SparseIndexVectorUnion::UnPack(_e, array_indices_type(), _resolver); } +} + +inline ::flatbuffers::Offset DimensionMetadata::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDimensionMetadata(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDimensionMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DimensionMetadataT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _format = _o->format; + auto _dense_size = _o->dense_size; + auto _array_segments_type = _o->array_segments.type; + auto _array_segments = _o->array_segments.Pack(_fbb); + auto _array_indices_type = _o->array_indices.type; + auto _array_indices = _o->array_indices.Pack(_fbb); + return tflite::CreateDimensionMetadata( + _fbb, + _format, + _dense_size, + _array_segments_type, + _array_segments, + _array_indices_type, + _array_indices); +} + +inline SparsityParametersT::SparsityParametersT(const SparsityParametersT &o) + : traversal_order(o.traversal_order), + block_map(o.block_map) { + 
dim_metadata.reserve(o.dim_metadata.size()); + for (const auto &dim_metadata_ : o.dim_metadata) { dim_metadata.emplace_back((dim_metadata_) ? new tflite::DimensionMetadataT(*dim_metadata_) : nullptr); } +} + +inline SparsityParametersT &SparsityParametersT::operator=(SparsityParametersT o) FLATBUFFERS_NOEXCEPT { + std::swap(traversal_order, o.traversal_order); + std::swap(block_map, o.block_map); + std::swap(dim_metadata, o.dim_metadata); + return *this; +} + +inline SparsityParametersT *SparsityParameters::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr<SparsityParametersT>(new SparsityParametersT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } else { _o->traversal_order.resize(0); } } + { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } else { _o->block_map.resize(0); } } + { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->dim_metadata[_i]) { _e->Get(_i)->UnPackTo(_o->dim_metadata[_i].get(), _resolver); } else { _o->dim_metadata[_i] = std::unique_ptr<tflite::DimensionMetadataT>(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->dim_metadata.resize(0); } } +} + +inline ::flatbuffers::Offset<SparsityParameters> SparsityParameters::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSparsityParameters(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset<SparsityParameters> CreateSparsityParameters(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SparsityParametersT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _traversal_order = _o->traversal_order.size() ? _fbb.CreateVector(_o->traversal_order) : 0; + auto _block_map = _o->block_map.size() ? _fbb.CreateVector(_o->block_map) : 0; + auto _dim_metadata = _o->dim_metadata.size() ?
_fbb.CreateVector<::flatbuffers::Offset> (_o->dim_metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateDimensionMetadata(*__va->__fbb, __va->__o->dim_metadata[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateSparsityParameters( + _fbb, + _traversal_order, + _block_map, + _dim_metadata); +} + +inline VariantSubTypeT *VariantSubType::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new VariantSubTypeT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void VariantSubType::UnPackTo(VariantSubTypeT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } else { _o->shape.resize(0); } } + { auto _e = type(); _o->type = _e; } + { auto _e = has_rank(); _o->has_rank = _e; } +} + +inline ::flatbuffers::Offset VariantSubType::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const VariantSubTypeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateVariantSubType(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateVariantSubType(::flatbuffers::FlatBufferBuilder &_fbb, const VariantSubTypeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const VariantSubTypeT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0; + auto _type = _o->type; + auto _has_rank = _o->has_rank; + return tflite::CreateVariantSubType( + _fbb, + _shape, + _type, + _has_rank); +} + +inline TensorT::TensorT(const TensorT &o) + : shape(o.shape), + type(o.type), + buffer(o.buffer), + name(o.name), + quantization((o.quantization) ? new tflite::QuantizationParametersT(*o.quantization) : nullptr), + is_variable(o.is_variable), + sparsity((o.sparsity) ? new tflite::SparsityParametersT(*o.sparsity) : nullptr), + shape_signature(o.shape_signature), + has_rank(o.has_rank) { + variant_tensors.reserve(o.variant_tensors.size()); + for (const auto &variant_tensors_ : o.variant_tensors) { variant_tensors.emplace_back((variant_tensors_) ? 
new tflite::VariantSubTypeT(*variant_tensors_) : nullptr); } +} + +inline TensorT &TensorT::operator=(TensorT o) FLATBUFFERS_NOEXCEPT { + std::swap(shape, o.shape); + std::swap(type, o.type); + std::swap(buffer, o.buffer); + std::swap(name, o.name); + std::swap(quantization, o.quantization); + std::swap(is_variable, o.is_variable); + std::swap(sparsity, o.sparsity); + std::swap(shape_signature, o.shape_signature); + std::swap(has_rank, o.has_rank); + std::swap(variant_tensors, o.variant_tensors); + return *this; +} + +inline TensorT *Tensor::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TensorT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Tensor::UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } else { _o->shape.resize(0); } } + { auto _e = type(); _o->type = _e; } + { auto _e = buffer(); _o->buffer = _e; } + { auto _e = name(); if (_e) _o->name = _e->str(); } + { auto _e = quantization(); if (_e) { if(_o->quantization) { _e->UnPackTo(_o->quantization.get(), _resolver); } else { _o->quantization = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->quantization) { _o->quantization.reset(); } } + { auto _e = is_variable(); _o->is_variable = _e; } + { auto _e = sparsity(); if (_e) { if(_o->sparsity) { _e->UnPackTo(_o->sparsity.get(), _resolver); } else { _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->sparsity) { _o->sparsity.reset(); } } + { auto _e = shape_signature(); if (_e) { _o->shape_signature.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape_signature[_i] = _e->Get(_i); } } else { _o->shape_signature.resize(0); } } + { auto _e = has_rank(); _o->has_rank = _e; } + { auto _e = variant_tensors(); if (_e) { _o->variant_tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->variant_tensors[_i]) { _e->Get(_i)->UnPackTo(_o->variant_tensors[_i].get(), _resolver); } else { _o->variant_tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->variant_tensors.resize(0); } } +} + +inline ::flatbuffers::Offset Tensor::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTensor(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TensorT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0; + auto _type = _o->type; + auto _buffer = _o->buffer; + auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); + auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0; + auto _is_variable = _o->is_variable; + auto _sparsity = _o->sparsity ? CreateSparsityParameters(_fbb, _o->sparsity.get(), _rehasher) : 0; + auto _shape_signature = _o->shape_signature.size() ? 
_fbb.CreateVector(_o->shape_signature) : 0; + auto _has_rank = _o->has_rank; + auto _variant_tensors = _o->variant_tensors.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->variant_tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateVariantSubType(*__va->__fbb, __va->__o->variant_tensors[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateTensor( + _fbb, + _shape, + _type, + _buffer, + _name, + _quantization, + _is_variable, + _sparsity, + _shape_signature, + _has_rank, + _variant_tensors); +} + +inline StablehloGatherOptionsT *StablehloGatherOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloGatherOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloGatherOptions::UnPackTo(StablehloGatherOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = offset_dims(); if (_e) { _o->offset_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->offset_dims[_i] = _e->Get(_i); } } else { _o->offset_dims.resize(0); } } + { auto _e = collapsed_slice_dims(); if (_e) { _o->collapsed_slice_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->collapsed_slice_dims[_i] = _e->Get(_i); } } else { _o->collapsed_slice_dims.resize(0); } } + { auto _e = start_index_map(); if (_e) { _o->start_index_map.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->start_index_map[_i] = _e->Get(_i); } } else { _o->start_index_map.resize(0); } } + { auto _e = index_vector_dim(); _o->index_vector_dim = _e; } + { auto _e = slice_sizes(); if (_e) { _o->slice_sizes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->slice_sizes[_i] = _e->Get(_i); } } else { _o->slice_sizes.resize(0); } } + { auto _e = indices_are_sorted(); _o->indices_are_sorted = _e; } +} + +inline ::flatbuffers::Offset StablehloGatherOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloGatherOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloGatherOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloGatherOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloGatherOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloGatherOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _offset_dims = _o->offset_dims.size() ? _fbb.CreateVector(_o->offset_dims) : 0; + auto _collapsed_slice_dims = _o->collapsed_slice_dims.size() ? _fbb.CreateVector(_o->collapsed_slice_dims) : 0; + auto _start_index_map = _o->start_index_map.size() ? _fbb.CreateVector(_o->start_index_map) : 0; + auto _index_vector_dim = _o->index_vector_dim; + auto _slice_sizes = _o->slice_sizes.size() ? 
_fbb.CreateVector(_o->slice_sizes) : 0; + auto _indices_are_sorted = _o->indices_are_sorted; + return tflite::CreateStablehloGatherOptions( + _fbb, + _offset_dims, + _collapsed_slice_dims, + _start_index_map, + _index_vector_dim, + _slice_sizes, + _indices_are_sorted); +} + +inline StablehloTransposeOptionsT *StablehloTransposeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloTransposeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloTransposeOptions::UnPackTo(StablehloTransposeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = permutation(); if (_e) { _o->permutation.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->permutation[_i] = _e->Get(_i); } } else { _o->permutation.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloTransposeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloTransposeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloTransposeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloTransposeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloTransposeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloTransposeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _permutation = _o->permutation.size() ? _fbb.CreateVector(_o->permutation) : 0; + return tflite::CreateStablehloTransposeOptions( + _fbb, + _permutation); +} + +inline StablehloDotGeneralOptionsT *StablehloDotGeneralOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloDotGeneralOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloDotGeneralOptions::UnPackTo(StablehloDotGeneralOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = lhs_batching_dimensions(); if (_e) { _o->lhs_batching_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->lhs_batching_dimensions[_i] = _e->Get(_i); } } else { _o->lhs_batching_dimensions.resize(0); } } + { auto _e = rhs_batching_dimensions(); if (_e) { _o->rhs_batching_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->rhs_batching_dimensions[_i] = _e->Get(_i); } } else { _o->rhs_batching_dimensions.resize(0); } } + { auto _e = lhs_contracting_dimensions(); if (_e) { _o->lhs_contracting_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->lhs_contracting_dimensions[_i] = _e->Get(_i); } } else { _o->lhs_contracting_dimensions.resize(0); } } + { auto _e = rhs_contracting_dimensions(); if (_e) { _o->rhs_contracting_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->rhs_contracting_dimensions[_i] = _e->Get(_i); } } else { _o->rhs_contracting_dimensions.resize(0); } } + { auto _e = precision_config(); if (_e) { _o->precision_config.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->precision_config[_i] = static_cast(_e->Get(_i)); } } else { _o->precision_config.resize(0); } } +} + +inline 
::flatbuffers::Offset StablehloDotGeneralOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDotGeneralOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloDotGeneralOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloDotGeneralOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDotGeneralOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloDotGeneralOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _lhs_batching_dimensions = _o->lhs_batching_dimensions.size() ? _fbb.CreateVector(_o->lhs_batching_dimensions) : 0; + auto _rhs_batching_dimensions = _o->rhs_batching_dimensions.size() ? _fbb.CreateVector(_o->rhs_batching_dimensions) : 0; + auto _lhs_contracting_dimensions = _o->lhs_contracting_dimensions.size() ? _fbb.CreateVector(_o->lhs_contracting_dimensions) : 0; + auto _rhs_contracting_dimensions = _o->rhs_contracting_dimensions.size() ? _fbb.CreateVector(_o->rhs_contracting_dimensions) : 0; + auto _precision_config = _o->precision_config.size() ? _fbb.CreateVectorScalarCast(::flatbuffers::data(_o->precision_config), _o->precision_config.size()) : 0; + return tflite::CreateStablehloDotGeneralOptions( + _fbb, + _lhs_batching_dimensions, + _rhs_batching_dimensions, + _lhs_contracting_dimensions, + _rhs_contracting_dimensions, + _precision_config); +} + +inline StablehloReduceWindowOptionsT *StablehloReduceWindowOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloReduceWindowOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloReduceWindowOptions::UnPackTo(StablehloReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = window_dimensions(); if (_e) { _o->window_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->window_dimensions[_i] = _e->Get(_i); } } else { _o->window_dimensions.resize(0); } } + { auto _e = window_strides(); if (_e) { _o->window_strides.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->window_strides[_i] = _e->Get(_i); } } else { _o->window_strides.resize(0); } } + { auto _e = base_dilations(); if (_e) { _o->base_dilations.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->base_dilations[_i] = _e->Get(_i); } } else { _o->base_dilations.resize(0); } } + { auto _e = window_dilations(); if (_e) { _o->window_dilations.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->window_dilations[_i] = _e->Get(_i); } } else { _o->window_dilations.resize(0); } } + { auto _e = padding(); if (_e) { _o->padding.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->padding[_i] = _e->Get(_i); } } else { _o->padding.resize(0); } } + { auto _e = body_subgraph_index(); _o->body_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset StablehloReduceWindowOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloReduceWindowOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateStablehloReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloReduceWindowOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _window_dimensions = _o->window_dimensions.size() ? _fbb.CreateVector(_o->window_dimensions) : 0; + auto _window_strides = _o->window_strides.size() ? _fbb.CreateVector(_o->window_strides) : 0; + auto _base_dilations = _o->base_dilations.size() ? _fbb.CreateVector(_o->base_dilations) : 0; + auto _window_dilations = _o->window_dilations.size() ? _fbb.CreateVector(_o->window_dilations) : 0; + auto _padding = _o->padding.size() ? _fbb.CreateVector(_o->padding) : 0; + auto _body_subgraph_index = _o->body_subgraph_index; + return tflite::CreateStablehloReduceWindowOptions( + _fbb, + _window_dimensions, + _window_strides, + _base_dilations, + _window_dilations, + _padding, + _body_subgraph_index); +} + +inline StablehloWhileOptionsT *StablehloWhileOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloWhileOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloWhileOptions::UnPackTo(StablehloWhileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = cond_subgraph_index(); _o->cond_subgraph_index = _e; } + { auto _e = body_subgraph_index(); _o->body_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset StablehloWhileOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloWhileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloWhileOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloWhileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloWhileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloWhileOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _cond_subgraph_index = _o->cond_subgraph_index; + auto _body_subgraph_index = _o->body_subgraph_index; + return tflite::CreateStablehloWhileOptions( + _fbb, + _cond_subgraph_index, + _body_subgraph_index); +} + +inline StablehloSortOptionsT *StablehloSortOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloSortOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloSortOptions::UnPackTo(StablehloSortOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = dimension(); _o->dimension = _e; } + { auto _e = is_stable(); _o->is_stable = _e; } + { auto _e = comparator_subgraph_index(); _o->comparator_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset StablehloSortOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSortOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloSortOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloSortOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSortOptionsT *_o, const 
::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloSortOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _dimension = _o->dimension; + auto _is_stable = _o->is_stable; + auto _comparator_subgraph_index = _o->comparator_subgraph_index; + return tflite::CreateStablehloSortOptions( + _fbb, + _dimension, + _is_stable, + _comparator_subgraph_index); +} + +inline StablehloConcatenateOptionsT *StablehloConcatenateOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloConcatenateOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloConcatenateOptions::UnPackTo(StablehloConcatenateOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = dimension(); _o->dimension = _e; } +} + +inline ::flatbuffers::Offset StablehloConcatenateOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConcatenateOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloConcatenateOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloConcatenateOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConcatenateOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloConcatenateOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _dimension = _o->dimension; + return tflite::CreateStablehloConcatenateOptions( + _fbb, + _dimension); +} + +inline StablehloBroadcastInDimOptionsT *StablehloBroadcastInDimOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloBroadcastInDimOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloBroadcastInDimOptions::UnPackTo(StablehloBroadcastInDimOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = broadcast_dimensions(); if (_e) { _o->broadcast_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->broadcast_dimensions[_i] = _e->Get(_i); } } else { _o->broadcast_dimensions.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloBroadcastInDimOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloBroadcastInDimOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloBroadcastInDimOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloBroadcastInDimOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloBroadcastInDimOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloBroadcastInDimOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _broadcast_dimensions = _o->broadcast_dimensions.size() ? 
_fbb.CreateVector(_o->broadcast_dimensions) : 0; + return tflite::CreateStablehloBroadcastInDimOptions( + _fbb, + _broadcast_dimensions); +} + +inline StablehloCompareOptionsT *StablehloCompareOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloCompareOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloCompareOptions::UnPackTo(StablehloCompareOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = comparison_direction(); _o->comparison_direction = _e; } + { auto _e = compare_type(); _o->compare_type = _e; } +} + +inline ::flatbuffers::Offset StablehloCompareOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCompareOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloCompareOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloCompareOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCompareOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloCompareOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _comparison_direction = _o->comparison_direction; + auto _compare_type = _o->compare_type; + return tflite::CreateStablehloCompareOptions( + _fbb, + _comparison_direction, + _compare_type); +} + +inline StablehloDynamicSliceOptionsT *StablehloDynamicSliceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloDynamicSliceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloDynamicSliceOptions::UnPackTo(StablehloDynamicSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = slice_sizes(); if (_e) { _o->slice_sizes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->slice_sizes[_i] = _e->Get(_i); } } else { _o->slice_sizes.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloDynamicSliceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDynamicSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloDynamicSliceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloDynamicSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloDynamicSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloDynamicSliceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _slice_sizes = _o->slice_sizes.size() ? 
_fbb.CreateVector(_o->slice_sizes) : 0; + return tflite::CreateStablehloDynamicSliceOptions( + _fbb, + _slice_sizes); +} + +inline StablehloPadOptionsT *StablehloPadOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloPadOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloPadOptions::UnPackTo(StablehloPadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = edge_padding_low(); if (_e) { _o->edge_padding_low.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->edge_padding_low[_i] = _e->Get(_i); } } else { _o->edge_padding_low.resize(0); } } + { auto _e = edge_padding_high(); if (_e) { _o->edge_padding_high.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->edge_padding_high[_i] = _e->Get(_i); } } else { _o->edge_padding_high.resize(0); } } + { auto _e = interior_padding(); if (_e) { _o->interior_padding.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->interior_padding[_i] = _e->Get(_i); } } else { _o->interior_padding.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloPadOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloPadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloPadOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloPadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloPadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloPadOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _edge_padding_low = _o->edge_padding_low.size() ? _fbb.CreateVector(_o->edge_padding_low) : 0; + auto _edge_padding_high = _o->edge_padding_high.size() ? _fbb.CreateVector(_o->edge_padding_high) : 0; + auto _interior_padding = _o->interior_padding.size() ? 
_fbb.CreateVector(_o->interior_padding) : 0; + return tflite::CreateStablehloPadOptions( + _fbb, + _edge_padding_low, + _edge_padding_high, + _interior_padding); +} + +inline StablehloIotaOptionsT *StablehloIotaOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloIotaOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloIotaOptions::UnPackTo(StablehloIotaOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = iota_dimension(); _o->iota_dimension = _e; } +} + +inline ::flatbuffers::Offset StablehloIotaOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloIotaOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloIotaOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloIotaOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloIotaOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloIotaOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _iota_dimension = _o->iota_dimension; + return tflite::CreateStablehloIotaOptions( + _fbb, + _iota_dimension); +} + +inline StablehloCustomCallOptionsT *StablehloCustomCallOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloCustomCallOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloCustomCallOptions::UnPackTo(StablehloCustomCallOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = call_target_name(); if (_e) _o->call_target_name = _e->str(); } + { auto _e = has_side_effect(); _o->has_side_effect = _e; } + { auto _e = backend_config(); if (_e) _o->backend_config = _e->str(); } + { auto _e = api_version(); _o->api_version = _e; } + { auto _e = called_computations(); if (_e) { _o->called_computations.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->called_computations[_i] = _e->Get(_i); } } else { _o->called_computations.resize(0); } } + { auto _e = custom_attributes(); if (_e) { _o->custom_attributes.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->custom_attributes.begin()); } } +} + +inline ::flatbuffers::Offset StablehloCustomCallOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCustomCallOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloCustomCallOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloCustomCallOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCustomCallOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloCustomCallOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _call_target_name = _o->call_target_name.empty() ? 0 : _fbb.CreateString(_o->call_target_name); + auto _has_side_effect = _o->has_side_effect; + auto _backend_config = _o->backend_config.empty() ? 
0 : _fbb.CreateString(_o->backend_config); + auto _api_version = _o->api_version; + auto _called_computations = _o->called_computations.size() ? _fbb.CreateVector(_o->called_computations) : 0; + auto _custom_attributes = _o->custom_attributes.size() ? _fbb.CreateVector(_o->custom_attributes) : 0; + return tflite::CreateStablehloCustomCallOptions( + _fbb, + _call_target_name, + _has_side_effect, + _backend_config, + _api_version, + _called_computations, + _custom_attributes); +} + +inline StablehloReduceOptionsT *StablehloReduceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloReduceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloReduceOptions::UnPackTo(StablehloReduceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = dimensions(); if (_e) { _o->dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->dimensions[_i] = _e->Get(_i); } } else { _o->dimensions.resize(0); } } + { auto _e = body_subgraph_index(); _o->body_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset StablehloReduceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloReduceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloReduceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloReduceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloReduceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _dimensions = _o->dimensions.size() ? 
_fbb.CreateVector(_o->dimensions) : 0; + auto _body_subgraph_index = _o->body_subgraph_index; + return tflite::CreateStablehloReduceOptions( + _fbb, + _dimensions, + _body_subgraph_index); +} + +inline StablehloSliceOptionsT *StablehloSliceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloSliceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloSliceOptions::UnPackTo(StablehloSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = start_indices(); if (_e) { _o->start_indices.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->start_indices[_i] = _e->Get(_i); } } else { _o->start_indices.resize(0); } } + { auto _e = limit_indices(); if (_e) { _o->limit_indices.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->limit_indices[_i] = _e->Get(_i); } } else { _o->limit_indices.resize(0); } } + { auto _e = strides(); if (_e) { _o->strides.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->strides[_i] = _e->Get(_i); } } else { _o->strides.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloSliceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloSliceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloSliceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _start_indices = _o->start_indices.size() ? _fbb.CreateVector(_o->start_indices) : 0; + auto _limit_indices = _o->limit_indices.size() ? _fbb.CreateVector(_o->limit_indices) : 0; + auto _strides = _o->strides.size() ? 
_fbb.CreateVector(_o->strides) : 0; + return tflite::CreateStablehloSliceOptions( + _fbb, + _start_indices, + _limit_indices, + _strides); +} + +inline StablehloConvolutionOptionsT *StablehloConvolutionOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloConvolutionOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloConvolutionOptions::UnPackTo(StablehloConvolutionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = window_strides(); if (_e) { _o->window_strides.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->window_strides[_i] = _e->Get(_i); } } else { _o->window_strides.resize(0); } } + { auto _e = padding(); if (_e) { _o->padding.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->padding[_i] = _e->Get(_i); } } else { _o->padding.resize(0); } } + { auto _e = lhs_dilation(); if (_e) { _o->lhs_dilation.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->lhs_dilation[_i] = _e->Get(_i); } } else { _o->lhs_dilation.resize(0); } } + { auto _e = rhs_dilation(); if (_e) { _o->rhs_dilation.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->rhs_dilation[_i] = _e->Get(_i); } } else { _o->rhs_dilation.resize(0); } } + { auto _e = window_reversal(); if (_e) { _o->window_reversal.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->window_reversal[_i] = _e->Get(_i) != 0; } } else { _o->window_reversal.resize(0); } } + { auto _e = input_batch_dimension(); _o->input_batch_dimension = _e; } + { auto _e = input_feature_dimension(); _o->input_feature_dimension = _e; } + { auto _e = input_spatial_dimensions(); if (_e) { _o->input_spatial_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->input_spatial_dimensions[_i] = _e->Get(_i); } } else { _o->input_spatial_dimensions.resize(0); } } + { auto _e = kernel_input_feature_dimension(); _o->kernel_input_feature_dimension = _e; } + { auto _e = kernel_output_feature_dimension(); _o->kernel_output_feature_dimension = _e; } + { auto _e = kernel_spatial_dimensions(); if (_e) { _o->kernel_spatial_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->kernel_spatial_dimensions[_i] = _e->Get(_i); } } else { _o->kernel_spatial_dimensions.resize(0); } } + { auto _e = output_batch_dimension(); _o->output_batch_dimension = _e; } + { auto _e = output_feature_dimension(); _o->output_feature_dimension = _e; } + { auto _e = output_spatial_dimensions(); if (_e) { _o->output_spatial_dimensions.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->output_spatial_dimensions[_i] = _e->Get(_i); } } else { _o->output_spatial_dimensions.resize(0); } } + { auto _e = feature_group_count(); _o->feature_group_count = _e; } + { auto _e = batch_group_count(); _o->batch_group_count = _e; } + { auto _e = precision_config(); if (_e) { _o->precision_config.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->precision_config[_i] = static_cast(_e->Get(_i)); } } else { _o->precision_config.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloConvolutionOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConvolutionOptionsT* _o, const ::flatbuffers::rehasher_function_t 
*_rehasher) { + return CreateStablehloConvolutionOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloConvolutionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloConvolutionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloConvolutionOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _window_strides = _o->window_strides.size() ? _fbb.CreateVector(_o->window_strides) : 0; + auto _padding = _o->padding.size() ? _fbb.CreateVector(_o->padding) : 0; + auto _lhs_dilation = _o->lhs_dilation.size() ? _fbb.CreateVector(_o->lhs_dilation) : 0; + auto _rhs_dilation = _o->rhs_dilation.size() ? _fbb.CreateVector(_o->rhs_dilation) : 0; + auto _window_reversal = _o->window_reversal.size() ? _fbb.CreateVector(_o->window_reversal) : 0; + auto _input_batch_dimension = _o->input_batch_dimension; + auto _input_feature_dimension = _o->input_feature_dimension; + auto _input_spatial_dimensions = _o->input_spatial_dimensions.size() ? _fbb.CreateVector(_o->input_spatial_dimensions) : 0; + auto _kernel_input_feature_dimension = _o->kernel_input_feature_dimension; + auto _kernel_output_feature_dimension = _o->kernel_output_feature_dimension; + auto _kernel_spatial_dimensions = _o->kernel_spatial_dimensions.size() ? _fbb.CreateVector(_o->kernel_spatial_dimensions) : 0; + auto _output_batch_dimension = _o->output_batch_dimension; + auto _output_feature_dimension = _o->output_feature_dimension; + auto _output_spatial_dimensions = _o->output_spatial_dimensions.size() ? _fbb.CreateVector(_o->output_spatial_dimensions) : 0; + auto _feature_group_count = _o->feature_group_count; + auto _batch_group_count = _o->batch_group_count; + auto _precision_config = _o->precision_config.size() ? 
_fbb.CreateVectorScalarCast(::flatbuffers::data(_o->precision_config), _o->precision_config.size()) : 0; + return tflite::CreateStablehloConvolutionOptions( + _fbb, + _window_strides, + _padding, + _lhs_dilation, + _rhs_dilation, + _window_reversal, + _input_batch_dimension, + _input_feature_dimension, + _input_spatial_dimensions, + _kernel_input_feature_dimension, + _kernel_output_feature_dimension, + _kernel_spatial_dimensions, + _output_batch_dimension, + _output_feature_dimension, + _output_spatial_dimensions, + _feature_group_count, + _batch_group_count, + _precision_config); +} + +inline StablehloScatterOptionsT *StablehloScatterOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloScatterOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloScatterOptions::UnPackTo(StablehloScatterOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = indices_are_sorted(); _o->indices_are_sorted = _e; } + { auto _e = update_window_dims(); if (_e) { _o->update_window_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->update_window_dims[_i] = _e->Get(_i); } } else { _o->update_window_dims.resize(0); } } + { auto _e = inserted_window_dims(); if (_e) { _o->inserted_window_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inserted_window_dims[_i] = _e->Get(_i); } } else { _o->inserted_window_dims.resize(0); } } + { auto _e = scatter_dims_to_operand_dims(); if (_e) { _o->scatter_dims_to_operand_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scatter_dims_to_operand_dims[_i] = _e->Get(_i); } } else { _o->scatter_dims_to_operand_dims.resize(0); } } + { auto _e = index_vector_dim(); _o->index_vector_dim = _e; } + { auto _e = unique_indices(); _o->unique_indices = _e; } + { auto _e = update_computation_subgraph_index(); _o->update_computation_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset StablehloScatterOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloScatterOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloScatterOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloScatterOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloScatterOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloScatterOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _indices_are_sorted = _o->indices_are_sorted; + auto _update_window_dims = _o->update_window_dims.size() ? _fbb.CreateVector(_o->update_window_dims) : 0; + auto _inserted_window_dims = _o->inserted_window_dims.size() ? _fbb.CreateVector(_o->inserted_window_dims) : 0; + auto _scatter_dims_to_operand_dims = _o->scatter_dims_to_operand_dims.size() ? 
_fbb.CreateVector(_o->scatter_dims_to_operand_dims) : 0; + auto _index_vector_dim = _o->index_vector_dim; + auto _unique_indices = _o->unique_indices; + auto _update_computation_subgraph_index = _o->update_computation_subgraph_index; + return tflite::CreateStablehloScatterOptions( + _fbb, + _indices_are_sorted, + _update_window_dims, + _inserted_window_dims, + _scatter_dims_to_operand_dims, + _index_vector_dim, + _unique_indices, + _update_computation_subgraph_index); +} + +inline StablehloCaseOptionsT *StablehloCaseOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloCaseOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloCaseOptions::UnPackTo(StablehloCaseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = branch_subgraph_indices(); if (_e) { _o->branch_subgraph_indices.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->branch_subgraph_indices[_i] = _e->Get(_i); } } else { _o->branch_subgraph_indices.resize(0); } } +} + +inline ::flatbuffers::Offset StablehloCaseOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCaseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloCaseOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloCaseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloCaseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloCaseOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _branch_subgraph_indices = _o->branch_subgraph_indices.size() ? 
_fbb.CreateVector(_o->branch_subgraph_indices) : 0;
+  return tflite::CreateStablehloCaseOptions(
+      _fbb,
+      _branch_subgraph_indices);
+}
+
+inline StablehloRngBitGeneratorOptionsT *StablehloRngBitGeneratorOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<StablehloRngBitGeneratorOptionsT>(new StablehloRngBitGeneratorOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void StablehloRngBitGeneratorOptions::UnPackTo(StablehloRngBitGeneratorOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = algorithm(); _o->algorithm = _e; }
+}
+
+inline ::flatbuffers::Offset<StablehloRngBitGeneratorOptions> StablehloRngBitGeneratorOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloRngBitGeneratorOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateStablehloRngBitGeneratorOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<StablehloRngBitGeneratorOptions> CreateStablehloRngBitGeneratorOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloRngBitGeneratorOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloRngBitGeneratorOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _algorithm = _o->algorithm;
+  return tflite::CreateStablehloRngBitGeneratorOptions(
+      _fbb,
+      _algorithm);
+}
+
+inline Conv2DOptionsT *Conv2DOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<Conv2DOptionsT>(new Conv2DOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = padding(); _o->padding = _e; }
+  { auto _e = stride_w(); _o->stride_w = _e; }
+  { auto _e = stride_h(); _o->stride_h = _e; }
+  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; }
+  { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; }
+  { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; }
+  { auto _e = quantized_bias_type(); _o->quantized_bias_type = _e; }
+}
+
+inline ::flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConv2DOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Conv2DOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _fused_activation_function = _o->fused_activation_function;
+  auto _dilation_w_factor = _o->dilation_w_factor;
+  auto _dilation_h_factor = _o->dilation_h_factor;
+  auto _quantized_bias_type = _o->quantized_bias_type;
+  return tflite::CreateConv2DOptions(
+      _fbb,
+      _padding,
+      _stride_w,
+      _stride_h,
+      _fused_activation_function,
+      _dilation_w_factor,
+      _dilation_h_factor,
+      _quantized_bias_type);
+}
+
+inline Conv3DOptionsT *Conv3DOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<Conv3DOptionsT>(new 
Conv3DOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Conv3DOptions::UnPackTo(Conv3DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = padding(); _o->padding = _e; } + { auto _e = stride_d(); _o->stride_d = _e; } + { auto _e = stride_w(); _o->stride_w = _e; } + { auto _e = stride_h(); _o->stride_h = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = dilation_d_factor(); _o->dilation_d_factor = _e; } + { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; } + { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; } +} + +inline ::flatbuffers::Offset Conv3DOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Conv3DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateConv3DOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateConv3DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Conv3DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Conv3DOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _padding = _o->padding; + auto _stride_d = _o->stride_d; + auto _stride_w = _o->stride_w; + auto _stride_h = _o->stride_h; + auto _fused_activation_function = _o->fused_activation_function; + auto _dilation_d_factor = _o->dilation_d_factor; + auto _dilation_w_factor = _o->dilation_w_factor; + auto _dilation_h_factor = _o->dilation_h_factor; + return tflite::CreateConv3DOptions( + _fbb, + _padding, + _stride_d, + _stride_w, + _stride_h, + _fused_activation_function, + _dilation_d_factor, + _dilation_w_factor, + _dilation_h_factor); +} + +inline Pool2DOptionsT *Pool2DOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new Pool2DOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Pool2DOptions::UnPackTo(Pool2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = padding(); _o->padding = _e; } + { auto _e = stride_w(); _o->stride_w = _e; } + { auto _e = stride_h(); _o->stride_h = _e; } + { auto _e = filter_width(); _o->filter_width = _e; } + { auto _e = filter_height(); _o->filter_height = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } +} + +inline ::flatbuffers::Offset Pool2DOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreatePool2DOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreatePool2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Pool2DOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _padding = _o->padding; + auto _stride_w = _o->stride_w; + auto _stride_h = _o->stride_h; + auto _filter_width = _o->filter_width; + auto _filter_height = _o->filter_height; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreatePool2DOptions( + _fbb, + _padding, + _stride_w, + _stride_h, + _filter_width, + _filter_height, + 
_fused_activation_function); +} + +inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DepthwiseConv2DOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DepthwiseConv2DOptions::UnPackTo(DepthwiseConv2DOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = padding(); _o->padding = _e; } + { auto _e = stride_w(); _o->stride_w = _e; } + { auto _e = stride_h(); _o->stride_h = _e; } + { auto _e = depth_multiplier(); _o->depth_multiplier = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; } + { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; } +} + +inline ::flatbuffers::Offset DepthwiseConv2DOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDepthwiseConv2DOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDepthwiseConv2DOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DepthwiseConv2DOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _padding = _o->padding; + auto _stride_w = _o->stride_w; + auto _stride_h = _o->stride_h; + auto _depth_multiplier = _o->depth_multiplier; + auto _fused_activation_function = _o->fused_activation_function; + auto _dilation_w_factor = _o->dilation_w_factor; + auto _dilation_h_factor = _o->dilation_h_factor; + return tflite::CreateDepthwiseConv2DOptions( + _fbb, + _padding, + _stride_w, + _stride_h, + _depth_multiplier, + _fused_activation_function, + _dilation_w_factor, + _dilation_h_factor); +} + +inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ConcatEmbeddingsOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ConcatEmbeddingsOptions::UnPackTo(ConcatEmbeddingsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = num_channels(); _o->num_channels = _e; } + { auto _e = num_columns_per_channel(); if (_e) { _o->num_columns_per_channel.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->num_columns_per_channel[_i] = _e->Get(_i); } } else { _o->num_columns_per_channel.resize(0); } } + { auto _e = embedding_dim_per_channel(); if (_e) { _o->embedding_dim_per_channel.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->embedding_dim_per_channel[_i] = _e->Get(_i); } } else { _o->embedding_dim_per_channel.resize(0); } } +} + +inline ::flatbuffers::Offset ConcatEmbeddingsOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateConcatEmbeddingsOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateConcatEmbeddingsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct 
_VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ConcatEmbeddingsOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _num_channels = _o->num_channels; + auto _num_columns_per_channel = _o->num_columns_per_channel.size() ? _fbb.CreateVector(_o->num_columns_per_channel) : 0; + auto _embedding_dim_per_channel = _o->embedding_dim_per_channel.size() ? _fbb.CreateVector(_o->embedding_dim_per_channel) : 0; + return tflite::CreateConcatEmbeddingsOptions( + _fbb, + _num_channels, + _num_columns_per_channel, + _embedding_dim_per_channel); +} + +inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LSHProjectionOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LSHProjectionOptions::UnPackTo(LSHProjectionOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = type(); _o->type = _e; } +} + +inline ::flatbuffers::Offset LSHProjectionOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLSHProjectionOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLSHProjectionOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LSHProjectionOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _type = _o->type; + return tflite::CreateLSHProjectionOptions( + _fbb, + _type); +} + +inline SVDFOptionsT *SVDFOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SVDFOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SVDFOptions::UnPackTo(SVDFOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = rank(); _o->rank = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset SVDFOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSVDFOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSVDFOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SVDFOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _rank = _o->rank; + auto _fused_activation_function = _o->fused_activation_function; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateSVDFOptions( + _fbb, + _rank, + _fused_activation_function, + _asymmetric_quantize_inputs); +} + +inline RNNOptionsT *RNNOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RNNOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RNNOptions::UnPackTo(RNNOptionsT *_o, const 
::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset RNNOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateRNNOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const RNNOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateRNNOptions( + _fbb, + _fused_activation_function, + _asymmetric_quantize_inputs); +} + +inline SequenceRNNOptionsT *SequenceRNNOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SequenceRNNOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SequenceRNNOptions::UnPackTo(SequenceRNNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = time_major(); _o->time_major = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset SequenceRNNOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSequenceRNNOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSequenceRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SequenceRNNOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _time_major = _o->time_major; + auto _fused_activation_function = _o->fused_activation_function; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateSequenceRNNOptions( + _fbb, + _time_major, + _fused_activation_function, + _asymmetric_quantize_inputs); +} + +inline BidirectionalSequenceRNNOptionsT *BidirectionalSequenceRNNOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BidirectionalSequenceRNNOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BidirectionalSequenceRNNOptions::UnPackTo(BidirectionalSequenceRNNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = time_major(); _o->time_major = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = merge_outputs(); _o->merge_outputs = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset BidirectionalSequenceRNNOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceRNNOptionsT* _o, 
const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBidirectionalSequenceRNNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceRNNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceRNNOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _time_major = _o->time_major; + auto _fused_activation_function = _o->fused_activation_function; + auto _merge_outputs = _o->merge_outputs; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateBidirectionalSequenceRNNOptions( + _fbb, + _time_major, + _fused_activation_function, + _merge_outputs, + _asymmetric_quantize_inputs); +} + +inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new FullyConnectedOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void FullyConnectedOptions::UnPackTo(FullyConnectedOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = weights_format(); _o->weights_format = _e; } + { auto _e = keep_num_dims(); _o->keep_num_dims = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } + { auto _e = quantized_bias_type(); _o->quantized_bias_type = _e; } +} + +inline ::flatbuffers::Offset FullyConnectedOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateFullyConnectedOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateFullyConnectedOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const FullyConnectedOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _weights_format = _o->weights_format; + auto _keep_num_dims = _o->keep_num_dims; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + auto _quantized_bias_type = _o->quantized_bias_type; + return tflite::CreateFullyConnectedOptions( + _fbb, + _fused_activation_function, + _weights_format, + _keep_num_dims, + _asymmetric_quantize_inputs, + _quantized_bias_type); +} + +inline SoftmaxOptionsT *SoftmaxOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SoftmaxOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SoftmaxOptions::UnPackTo(SoftmaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = beta(); _o->beta = _e; } +} + +inline ::flatbuffers::Offset SoftmaxOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSoftmaxOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateSoftmaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SoftmaxOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _beta = _o->beta; + return tflite::CreateSoftmaxOptions( + _fbb, + _beta); +} + +inline ConcatenationOptionsT *ConcatenationOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ConcatenationOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ConcatenationOptions::UnPackTo(ConcatenationOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = axis(); _o->axis = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } +} + +inline ::flatbuffers::Offset ConcatenationOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateConcatenationOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateConcatenationOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ConcatenationOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _axis = _o->axis; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateConcatenationOptions( + _fbb, + _axis, + _fused_activation_function); +} + +inline AddOptionsT *AddOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new AddOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void AddOptions::UnPackTo(AddOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = pot_scale_int16(); _o->pot_scale_int16 = _e; } +} + +inline ::flatbuffers::Offset AddOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateAddOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateAddOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const AddOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _pot_scale_int16 = _o->pot_scale_int16; + return tflite::CreateAddOptions( + _fbb, + _fused_activation_function, + _pot_scale_int16); +} + +inline MulOptionsT *MulOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MulOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void MulOptions::UnPackTo(MulOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); 
_o->fused_activation_function = _e; } +} + +inline ::flatbuffers::Offset MulOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMulOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMulOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MulOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateMulOptions( + _fbb, + _fused_activation_function); +} + +inline L2NormOptionsT *L2NormOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new L2NormOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void L2NormOptions::UnPackTo(L2NormOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } +} + +inline ::flatbuffers::Offset L2NormOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateL2NormOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateL2NormOptions(::flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const L2NormOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateL2NormOptions( + _fbb, + _fused_activation_function); +} + +inline LocalResponseNormalizationOptionsT *LocalResponseNormalizationOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LocalResponseNormalizationOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LocalResponseNormalizationOptions::UnPackTo(LocalResponseNormalizationOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = radius(); _o->radius = _e; } + { auto _e = bias(); _o->bias = _e; } + { auto _e = alpha(); _o->alpha = _e; } + { auto _e = beta(); _o->beta = _e; } +} + +inline ::flatbuffers::Offset LocalResponseNormalizationOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLocalResponseNormalizationOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLocalResponseNormalizationOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LocalResponseNormalizationOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _radius = _o->radius; + auto _bias = _o->bias; + auto _alpha = _o->alpha; + auto _beta = _o->beta; + return tflite::CreateLocalResponseNormalizationOptions( + _fbb, 
+ _radius, + _bias, + _alpha, + _beta); +} + +inline LSTMOptionsT *LSTMOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LSTMOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LSTMOptions::UnPackTo(LSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = cell_clip(); _o->cell_clip = _e; } + { auto _e = proj_clip(); _o->proj_clip = _e; } + { auto _e = kernel_type(); _o->kernel_type = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset LSTMOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLSTMOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LSTMOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _cell_clip = _o->cell_clip; + auto _proj_clip = _o->proj_clip; + auto _kernel_type = _o->kernel_type; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateLSTMOptions( + _fbb, + _fused_activation_function, + _cell_clip, + _proj_clip, + _kernel_type, + _asymmetric_quantize_inputs); +} + +inline UnidirectionalSequenceLSTMOptionsT *UnidirectionalSequenceLSTMOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new UnidirectionalSequenceLSTMOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void UnidirectionalSequenceLSTMOptions::UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = cell_clip(); _o->cell_clip = _e; } + { auto _e = proj_clip(); _o->proj_clip = _e; } + { auto _e = time_major(); _o->time_major = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } + { auto _e = diagonal_recurrent_tensors(); _o->diagonal_recurrent_tensors = _e; } +} + +inline ::flatbuffers::Offset UnidirectionalSequenceLSTMOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateUnidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateUnidirectionalSequenceLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnidirectionalSequenceLSTMOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _cell_clip = _o->cell_clip; + auto _proj_clip = _o->proj_clip; + auto _time_major = _o->time_major; + auto _asymmetric_quantize_inputs = 
_o->asymmetric_quantize_inputs; + auto _diagonal_recurrent_tensors = _o->diagonal_recurrent_tensors; + return tflite::CreateUnidirectionalSequenceLSTMOptions( + _fbb, + _fused_activation_function, + _cell_clip, + _proj_clip, + _time_major, + _asymmetric_quantize_inputs, + _diagonal_recurrent_tensors); +} + +inline BidirectionalSequenceLSTMOptionsT *BidirectionalSequenceLSTMOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BidirectionalSequenceLSTMOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BidirectionalSequenceLSTMOptions::UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = cell_clip(); _o->cell_clip = _e; } + { auto _e = proj_clip(); _o->proj_clip = _e; } + { auto _e = merge_outputs(); _o->merge_outputs = _e; } + { auto _e = time_major(); _o->time_major = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset BidirectionalSequenceLSTMOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBidirectionalSequenceLSTMOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceLSTMOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _cell_clip = _o->cell_clip; + auto _proj_clip = _o->proj_clip; + auto _merge_outputs = _o->merge_outputs; + auto _time_major = _o->time_major; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateBidirectionalSequenceLSTMOptions( + _fbb, + _fused_activation_function, + _cell_clip, + _proj_clip, + _merge_outputs, + _time_major, + _asymmetric_quantize_inputs); +} + +inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ResizeBilinearOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ResizeBilinearOptions::UnPackTo(ResizeBilinearOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = align_corners(); _o->align_corners = _e; } + { auto _e = half_pixel_centers(); _o->half_pixel_centers = _e; } +} + +inline ::flatbuffers::Offset ResizeBilinearOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateResizeBilinearOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateResizeBilinearOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ResizeBilinearOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; 
(void)_va; + auto _align_corners = _o->align_corners; + auto _half_pixel_centers = _o->half_pixel_centers; + return tflite::CreateResizeBilinearOptions( + _fbb, + _align_corners, + _half_pixel_centers); +} + +inline ResizeNearestNeighborOptionsT *ResizeNearestNeighborOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ResizeNearestNeighborOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ResizeNearestNeighborOptions::UnPackTo(ResizeNearestNeighborOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = align_corners(); _o->align_corners = _e; } + { auto _e = half_pixel_centers(); _o->half_pixel_centers = _e; } +} + +inline ::flatbuffers::Offset ResizeNearestNeighborOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateResizeNearestNeighborOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateResizeNearestNeighborOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ResizeNearestNeighborOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _align_corners = _o->align_corners; + auto _half_pixel_centers = _o->half_pixel_centers; + return tflite::CreateResizeNearestNeighborOptions( + _fbb, + _align_corners, + _half_pixel_centers); +} + +inline CallOptionsT *CallOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CallOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CallOptions::UnPackTo(CallOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = subgraph(); _o->subgraph = _e; } +} + +inline ::flatbuffers::Offset CallOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCallOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCallOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CallOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _subgraph = _o->subgraph; + return tflite::CreateCallOptions( + _fbb, + _subgraph); +} + +inline PadOptionsT *PadOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new PadOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void PadOptions::UnPackTo(PadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset PadOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreatePadOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreatePadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + 
(void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const PadOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreatePadOptions( + _fbb); +} + +inline PadV2OptionsT *PadV2Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new PadV2OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void PadV2Options::UnPackTo(PadV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset PadV2Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreatePadV2Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreatePadV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const PadV2OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreatePadV2Options( + _fbb); +} + +inline ReshapeOptionsT *ReshapeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ReshapeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ReshapeOptions::UnPackTo(ReshapeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = new_shape(); if (_e) { _o->new_shape.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->new_shape[_i] = _e->Get(_i); } } else { _o->new_shape.resize(0); } } +} + +inline ::flatbuffers::Offset ReshapeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateReshapeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateReshapeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReshapeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _new_shape = _o->new_shape.size() ? 
_fbb.CreateVector(_o->new_shape) : 0; + return tflite::CreateReshapeOptions( + _fbb, + _new_shape); +} + +inline SpaceToBatchNDOptionsT *SpaceToBatchNDOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SpaceToBatchNDOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SpaceToBatchNDOptions::UnPackTo(SpaceToBatchNDOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SpaceToBatchNDOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSpaceToBatchNDOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSpaceToBatchNDOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SpaceToBatchNDOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSpaceToBatchNDOptions( + _fbb); +} + +inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BatchToSpaceNDOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BatchToSpaceNDOptions::UnPackTo(BatchToSpaceNDOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset BatchToSpaceNDOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBatchToSpaceNDOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBatchToSpaceNDOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BatchToSpaceNDOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBatchToSpaceNDOptions( + _fbb); +} + +inline SkipGramOptionsT *SkipGramOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SkipGramOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SkipGramOptions::UnPackTo(SkipGramOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = ngram_size(); _o->ngram_size = _e; } + { auto _e = max_skip_size(); _o->max_skip_size = _e; } + { auto _e = include_all_ngrams(); _o->include_all_ngrams = _e; } +} + +inline ::flatbuffers::Offset SkipGramOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSkipGramOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSkipGramOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SkipGramOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, 
_rehasher}; (void)_va; + auto _ngram_size = _o->ngram_size; + auto _max_skip_size = _o->max_skip_size; + auto _include_all_ngrams = _o->include_all_ngrams; + return tflite::CreateSkipGramOptions( + _fbb, + _ngram_size, + _max_skip_size, + _include_all_ngrams); +} + +inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SpaceToDepthOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SpaceToDepthOptions::UnPackTo(SpaceToDepthOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = block_size(); _o->block_size = _e; } +} + +inline ::flatbuffers::Offset SpaceToDepthOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSpaceToDepthOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSpaceToDepthOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SpaceToDepthOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _block_size = _o->block_size; + return tflite::CreateSpaceToDepthOptions( + _fbb, + _block_size); +} + +inline DepthToSpaceOptionsT *DepthToSpaceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DepthToSpaceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DepthToSpaceOptions::UnPackTo(DepthToSpaceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = block_size(); _o->block_size = _e; } +} + +inline ::flatbuffers::Offset DepthToSpaceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DepthToSpaceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDepthToSpaceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDepthToSpaceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DepthToSpaceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DepthToSpaceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _block_size = _o->block_size; + return tflite::CreateDepthToSpaceOptions( + _fbb, + _block_size); +} + +inline SubOptionsT *SubOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SubOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SubOptions::UnPackTo(SubOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = pot_scale_int16(); _o->pot_scale_int16 = _e; } +} + +inline ::flatbuffers::Offset SubOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSubOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSubOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o, const 
::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SubOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + auto _pot_scale_int16 = _o->pot_scale_int16; + return tflite::CreateSubOptions( + _fbb, + _fused_activation_function, + _pot_scale_int16); +} + +inline DivOptionsT *DivOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DivOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DivOptions::UnPackTo(DivOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } +} + +inline ::flatbuffers::Offset DivOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDivOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDivOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DivOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateDivOptions( + _fbb, + _fused_activation_function); +} + +inline TopKV2OptionsT *TopKV2Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TopKV2OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TopKV2Options::UnPackTo(TopKV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset TopKV2Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TopKV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTopKV2Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTopKV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const TopKV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TopKV2OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateTopKV2Options( + _fbb); +} + +inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new EmbeddingLookupSparseOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void EmbeddingLookupSparseOptions::UnPackTo(EmbeddingLookupSparseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = combiner(); _o->combiner = _e; } +} + +inline ::flatbuffers::Offset EmbeddingLookupSparseOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateEmbeddingLookupSparseOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateEmbeddingLookupSparseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const EmbeddingLookupSparseOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _combiner = _o->combiner; + return tflite::CreateEmbeddingLookupSparseOptions( + _fbb, + _combiner); +} + +inline GatherOptionsT *GatherOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new GatherOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void GatherOptions::UnPackTo(GatherOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = axis(); _o->axis = _e; } + { auto _e = batch_dims(); _o->batch_dims = _e; } +} + +inline ::flatbuffers::Offset GatherOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateGatherOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateGatherOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const GatherOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _axis = _o->axis; + auto _batch_dims = _o->batch_dims; + return tflite::CreateGatherOptions( + _fbb, + _axis, + _batch_dims); +} + +inline TransposeOptionsT *TransposeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TransposeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TransposeOptions::UnPackTo(TransposeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset TransposeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTransposeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTransposeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TransposeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateTransposeOptions( + _fbb); +} + +inline ExpOptionsT *ExpOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExpOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExpOptions::UnPackTo(ExpOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ExpOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExpOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateExpOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o, const ::flatbuffers::rehasher_function_t 
*_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExpOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateExpOptions( + _fbb); +} + +inline CosOptionsT *CosOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CosOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CosOptions::UnPackTo(CosOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset CosOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCosOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCosOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CosOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateCosOptions( + _fbb); +} + +inline ReducerOptionsT *ReducerOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ReducerOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ReducerOptions::UnPackTo(ReducerOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = keep_dims(); _o->keep_dims = _e; } +} + +inline ::flatbuffers::Offset ReducerOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReducerOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateReducerOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateReducerOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReducerOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReducerOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _keep_dims = _o->keep_dims; + return tflite::CreateReducerOptions( + _fbb, + _keep_dims); +} + +inline SqueezeOptionsT *SqueezeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SqueezeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SqueezeOptions::UnPackTo(SqueezeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = squeeze_dims(); if (_e) { _o->squeeze_dims.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->squeeze_dims[_i] = _e->Get(_i); } } else { _o->squeeze_dims.resize(0); } } +} + +inline ::flatbuffers::Offset SqueezeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSqueezeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSqueezeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const 
SqueezeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _squeeze_dims = _o->squeeze_dims.size() ? _fbb.CreateVector(_o->squeeze_dims) : 0; + return tflite::CreateSqueezeOptions( + _fbb, + _squeeze_dims); +} + +inline SplitOptionsT *SplitOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SplitOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SplitOptions::UnPackTo(SplitOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = num_splits(); _o->num_splits = _e; } +} + +inline ::flatbuffers::Offset SplitOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSplitOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSplitOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SplitOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _num_splits = _o->num_splits; + return tflite::CreateSplitOptions( + _fbb, + _num_splits); +} + +inline SplitVOptionsT *SplitVOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SplitVOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SplitVOptions::UnPackTo(SplitVOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = num_splits(); _o->num_splits = _e; } +} + +inline ::flatbuffers::Offset SplitVOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSplitVOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSplitVOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SplitVOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _num_splits = _o->num_splits; + return tflite::CreateSplitVOptions( + _fbb, + _num_splits); +} + +inline StridedSliceOptionsT *StridedSliceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StridedSliceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StridedSliceOptions::UnPackTo(StridedSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = begin_mask(); _o->begin_mask = _e; } + { auto _e = end_mask(); _o->end_mask = _e; } + { auto _e = ellipsis_mask(); _o->ellipsis_mask = _e; } + { auto _e = new_axis_mask(); _o->new_axis_mask = _e; } + { auto _e = shrink_axis_mask(); _o->shrink_axis_mask = _e; } + { auto _e = offset(); _o->offset = _e; } +} + +inline ::flatbuffers::Offset StridedSliceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStridedSliceOptions(_fbb, _o, _rehasher); +} + +inline 
::flatbuffers::Offset CreateStridedSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StridedSliceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _begin_mask = _o->begin_mask; + auto _end_mask = _o->end_mask; + auto _ellipsis_mask = _o->ellipsis_mask; + auto _new_axis_mask = _o->new_axis_mask; + auto _shrink_axis_mask = _o->shrink_axis_mask; + auto _offset = _o->offset; + return tflite::CreateStridedSliceOptions( + _fbb, + _begin_mask, + _end_mask, + _ellipsis_mask, + _new_axis_mask, + _shrink_axis_mask, + _offset); +} + +inline LogSoftmaxOptionsT *LogSoftmaxOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LogSoftmaxOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LogSoftmaxOptions::UnPackTo(LogSoftmaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset LogSoftmaxOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogSoftmaxOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLogSoftmaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LogSoftmaxOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogSoftmaxOptions( + _fbb); +} + +inline CastOptionsT *CastOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CastOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CastOptions::UnPackTo(CastOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = in_data_type(); _o->in_data_type = _e; } + { auto _e = out_data_type(); _o->out_data_type = _e; } +} + +inline ::flatbuffers::Offset CastOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCastOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCastOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CastOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _in_data_type = _o->in_data_type; + auto _out_data_type = _o->out_data_type; + return tflite::CreateCastOptions( + _fbb, + _in_data_type, + _out_data_type); +} + +inline DequantizeOptionsT *DequantizeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DequantizeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DequantizeOptions::UnPackTo(DequantizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset 
DequantizeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDequantizeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDequantizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DequantizeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateDequantizeOptions( + _fbb); +} + +inline MaximumMinimumOptionsT *MaximumMinimumOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MaximumMinimumOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void MaximumMinimumOptions::UnPackTo(MaximumMinimumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset MaximumMinimumOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMaximumMinimumOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMaximumMinimumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MaximumMinimumOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateMaximumMinimumOptions( + _fbb); +} + +inline TileOptionsT *TileOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TileOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TileOptions::UnPackTo(TileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset TileOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTileOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TileOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateTileOptions( + _fbb); +} + +inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ArgMaxOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ArgMaxOptions::UnPackTo(ArgMaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = output_type(); _o->output_type = _e; } +} + +inline ::flatbuffers::Offset ArgMaxOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateArgMaxOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateArgMaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ArgMaxOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _output_type = _o->output_type; + return tflite::CreateArgMaxOptions( + _fbb, + _output_type); +} + +inline ArgMinOptionsT *ArgMinOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ArgMinOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ArgMinOptions::UnPackTo(ArgMinOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = output_type(); _o->output_type = _e; } +} + +inline ::flatbuffers::Offset ArgMinOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMinOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateArgMinOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateArgMinOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ArgMinOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ArgMinOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _output_type = _o->output_type; + return tflite::CreateArgMinOptions( + _fbb, + _output_type); +} + +inline GreaterOptionsT *GreaterOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new GreaterOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void GreaterOptions::UnPackTo(GreaterOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset GreaterOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateGreaterOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateGreaterOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const GreaterOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateGreaterOptions( + _fbb); +} + +inline GreaterEqualOptionsT *GreaterEqualOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new GreaterEqualOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void GreaterEqualOptions::UnPackTo(GreaterEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset GreaterEqualOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateGreaterEqualOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateGreaterEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct 
_VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const GreaterEqualOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateGreaterEqualOptions( + _fbb); +} + +inline LessOptionsT *LessOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LessOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LessOptions::UnPackTo(LessOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset LessOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLessOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLessOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LessOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLessOptions( + _fbb); +} + +inline LessEqualOptionsT *LessEqualOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LessEqualOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LessEqualOptions::UnPackTo(LessEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset LessEqualOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLessEqualOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLessEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LessEqualOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLessEqualOptions( + _fbb); +} + +inline NegOptionsT *NegOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new NegOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void NegOptions::UnPackTo(NegOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset NegOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NegOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateNegOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateNegOptions(::flatbuffers::FlatBufferBuilder &_fbb, const NegOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const NegOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateNegOptions( + _fbb); +} + +inline SelectOptionsT *SelectOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SelectOptionsT()); + UnPackTo(_o.get(), _resolver); + return 
_o.release(); +} + +inline void SelectOptions::UnPackTo(SelectOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SelectOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSelectOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSelectOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SelectOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSelectOptions( + _fbb); +} + +inline SliceOptionsT *SliceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SliceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SliceOptions::UnPackTo(SliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SliceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSliceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SliceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSliceOptions( + _fbb); +} + +inline TransposeConvOptionsT *TransposeConvOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TransposeConvOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TransposeConvOptions::UnPackTo(TransposeConvOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = padding(); _o->padding = _e; } + { auto _e = stride_w(); _o->stride_w = _e; } + { auto _e = stride_h(); _o->stride_h = _e; } + { auto _e = fused_activation_function(); _o->fused_activation_function = _e; } + { auto _e = quantized_bias_type(); _o->quantized_bias_type = _e; } +} + +inline ::flatbuffers::Offset TransposeConvOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTransposeConvOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTransposeConvOptions(::flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TransposeConvOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _padding = _o->padding; + auto _stride_w = _o->stride_w; + auto _stride_h = _o->stride_h; + auto _fused_activation_function = _o->fused_activation_function; + auto _quantized_bias_type = _o->quantized_bias_type; + return tflite::CreateTransposeConvOptions( + _fbb, + _padding, + _stride_w, + _stride_h, + 
_fused_activation_function, + _quantized_bias_type); +} + +inline ExpandDimsOptionsT *ExpandDimsOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExpandDimsOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExpandDimsOptions::UnPackTo(ExpandDimsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ExpandDimsOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExpandDimsOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateExpandDimsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExpandDimsOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateExpandDimsOptions( + _fbb); +} + +inline SparseToDenseOptionsT *SparseToDenseOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SparseToDenseOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SparseToDenseOptions::UnPackTo(SparseToDenseOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = validate_indices(); _o->validate_indices = _e; } +} + +inline ::flatbuffers::Offset SparseToDenseOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSparseToDenseOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSparseToDenseOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SparseToDenseOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _validate_indices = _o->validate_indices; + return tflite::CreateSparseToDenseOptions( + _fbb, + _validate_indices); +} + +inline EqualOptionsT *EqualOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new EqualOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void EqualOptions::UnPackTo(EqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset EqualOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateEqualOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const EqualOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateEqualOptions( + _fbb); +} + +inline NotEqualOptionsT *NotEqualOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + 
auto _o = std::unique_ptr(new NotEqualOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void NotEqualOptions::UnPackTo(NotEqualOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset NotEqualOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateNotEqualOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateNotEqualOptions(::flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const NotEqualOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateNotEqualOptions( + _fbb); +} + +inline ShapeOptionsT *ShapeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ShapeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ShapeOptions::UnPackTo(ShapeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = out_type(); _o->out_type = _e; } +} + +inline ::flatbuffers::Offset ShapeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateShapeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateShapeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ShapeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _out_type = _o->out_type; + return tflite::CreateShapeOptions( + _fbb, + _out_type); +} + +inline RankOptionsT *RankOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RankOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RankOptions::UnPackTo(RankOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset RankOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateRankOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateRankOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const RankOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateRankOptions( + _fbb); +} + +inline PowOptionsT *PowOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new PowOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void PowOptions::UnPackTo(PowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset PowOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const 
PowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreatePowOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreatePowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const PowOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreatePowOptions( + _fbb); +} + +inline FakeQuantOptionsT *FakeQuantOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new FakeQuantOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void FakeQuantOptions::UnPackTo(FakeQuantOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = min(); _o->min = _e; } + { auto _e = max(); _o->max = _e; } + { auto _e = num_bits(); _o->num_bits = _e; } + { auto _e = narrow_range(); _o->narrow_range = _e; } +} + +inline ::flatbuffers::Offset FakeQuantOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FakeQuantOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateFakeQuantOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateFakeQuantOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FakeQuantOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const FakeQuantOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _min = _o->min; + auto _max = _o->max; + auto _num_bits = _o->num_bits; + auto _narrow_range = _o->narrow_range; + return tflite::CreateFakeQuantOptions( + _fbb, + _min, + _max, + _num_bits, + _narrow_range); +} + +inline PackOptionsT *PackOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new PackOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void PackOptions::UnPackTo(PackOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values_count(); _o->values_count = _e; } + { auto _e = axis(); _o->axis = _e; } +} + +inline ::flatbuffers::Offset PackOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreatePackOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreatePackOptions(::flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const PackOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values_count = _o->values_count; + auto _axis = _o->axis; + return tflite::CreatePackOptions( + _fbb, + _values_count, + _axis); +} + +inline LogicalOrOptionsT *LogicalOrOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LogicalOrOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LogicalOrOptions::UnPackTo(LogicalOrOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + 
(void)_resolver; +} + +inline ::flatbuffers::Offset LogicalOrOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogicalOrOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLogicalOrOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LogicalOrOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogicalOrOptions( + _fbb); +} + +inline OneHotOptionsT *OneHotOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new OneHotOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void OneHotOptions::UnPackTo(OneHotOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = axis(); _o->axis = _e; } +} + +inline ::flatbuffers::Offset OneHotOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateOneHotOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateOneHotOptions(::flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const OneHotOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _axis = _o->axis; + return tflite::CreateOneHotOptions( + _fbb, + _axis); +} + +inline AbsOptionsT *AbsOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new AbsOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void AbsOptions::UnPackTo(AbsOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset AbsOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateAbsOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateAbsOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const AbsOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateAbsOptions( + _fbb); +} + +inline HardSwishOptionsT *HardSwishOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new HardSwishOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void HardSwishOptions::UnPackTo(HardSwishOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset HardSwishOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HardSwishOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateHardSwishOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateHardSwishOptions(::flatbuffers::FlatBufferBuilder &_fbb, 
const HardSwishOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const HardSwishOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateHardSwishOptions( + _fbb); +} + +inline LogicalAndOptionsT *LogicalAndOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LogicalAndOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LogicalAndOptions::UnPackTo(LogicalAndOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset LogicalAndOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogicalAndOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLogicalAndOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LogicalAndOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogicalAndOptions( + _fbb); +} + +inline LogicalNotOptionsT *LogicalNotOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LogicalNotOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LogicalNotOptions::UnPackTo(LogicalNotOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset LogicalNotOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogicalNotOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLogicalNotOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LogicalNotOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogicalNotOptions( + _fbb); +} + +inline UnpackOptionsT *UnpackOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new UnpackOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void UnpackOptions::UnPackTo(UnpackOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = num(); _o->num = _e; } + { auto _e = axis(); _o->axis = _e; } +} + +inline ::flatbuffers::Offset UnpackOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateUnpackOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateUnpackOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnpackOptionsT* __o; const ::flatbuffers::rehasher_function_t 
*__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _num = _o->num; + auto _axis = _o->axis; + return tflite::CreateUnpackOptions( + _fbb, + _num, + _axis); +} + +inline FloorDivOptionsT *FloorDivOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new FloorDivOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void FloorDivOptions::UnPackTo(FloorDivOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset FloorDivOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateFloorDivOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateFloorDivOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const FloorDivOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateFloorDivOptions( + _fbb); +} + +inline SquareOptionsT *SquareOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SquareOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SquareOptions::UnPackTo(SquareOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SquareOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SquareOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSquareOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSquareOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SquareOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SquareOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSquareOptions( + _fbb); +} + +inline ZerosLikeOptionsT *ZerosLikeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ZerosLikeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ZerosLikeOptions::UnPackTo(ZerosLikeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ZerosLikeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ZerosLikeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateZerosLikeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateZerosLikeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ZerosLikeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ZerosLikeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateZerosLikeOptions( + _fbb); +} + +inline FillOptionsT *FillOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new FillOptionsT()); + UnPackTo(_o.get(), _resolver); + 
return _o.release(); +} + +inline void FillOptions::UnPackTo(FillOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset FillOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateFillOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateFillOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const FillOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateFillOptions( + _fbb); +} + +inline FloorModOptionsT *FloorModOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new FloorModOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void FloorModOptions::UnPackTo(FloorModOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset FloorModOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateFloorModOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateFloorModOptions(::flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const FloorModOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateFloorModOptions( + _fbb); +} + +inline RangeOptionsT *RangeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RangeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RangeOptions::UnPackTo(RangeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset RangeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateRangeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateRangeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const RangeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateRangeOptions( + _fbb); +} + +inline LeakyReluOptionsT *LeakyReluOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new LeakyReluOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void LeakyReluOptions::UnPackTo(LeakyReluOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = alpha(); _o->alpha = _e; } +} + +inline ::flatbuffers::Offset LeakyReluOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return 
CreateLeakyReluOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateLeakyReluOptions(::flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const LeakyReluOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _alpha = _o->alpha; + return tflite::CreateLeakyReluOptions( + _fbb, + _alpha); +} + +inline SquaredDifferenceOptionsT *SquaredDifferenceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SquaredDifferenceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SquaredDifferenceOptions::UnPackTo(SquaredDifferenceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SquaredDifferenceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSquaredDifferenceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSquaredDifferenceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SquaredDifferenceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSquaredDifferenceOptions( + _fbb); +} + +inline MirrorPadOptionsT *MirrorPadOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MirrorPadOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void MirrorPadOptions::UnPackTo(MirrorPadOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = mode(); _o->mode = _e; } +} + +inline ::flatbuffers::Offset MirrorPadOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMirrorPadOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMirrorPadOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MirrorPadOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _mode = _o->mode; + return tflite::CreateMirrorPadOptions( + _fbb, + _mode); +} + +inline UniqueOptionsT *UniqueOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new UniqueOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void UniqueOptions::UnPackTo(UniqueOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = idx_out_type(); _o->idx_out_type = _e; } +} + +inline ::flatbuffers::Offset UniqueOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateUniqueOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateUniqueOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UniqueOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _idx_out_type = _o->idx_out_type; + return tflite::CreateUniqueOptions( + _fbb, + _idx_out_type); +} + +inline ReverseV2OptionsT *ReverseV2Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ReverseV2OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ReverseV2Options::UnPackTo(ReverseV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ReverseV2Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateReverseV2Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateReverseV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReverseV2OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateReverseV2Options( + _fbb); +} + +inline AddNOptionsT *AddNOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new AddNOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void AddNOptions::UnPackTo(AddNOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset AddNOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateAddNOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateAddNOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const AddNOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateAddNOptions( + _fbb); +} + +inline GatherNdOptionsT *GatherNdOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new GatherNdOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void GatherNdOptions::UnPackTo(GatherNdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset GatherNdOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateGatherNdOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateGatherNdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const GatherNdOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, 
_o, _rehasher}; (void)_va; + return tflite::CreateGatherNdOptions( + _fbb); +} + +inline WhereOptionsT *WhereOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new WhereOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void WhereOptions::UnPackTo(WhereOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset WhereOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateWhereOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateWhereOptions(::flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const WhereOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateWhereOptions( + _fbb); +} + +inline ReverseSequenceOptionsT *ReverseSequenceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ReverseSequenceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ReverseSequenceOptions::UnPackTo(ReverseSequenceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = seq_dim(); _o->seq_dim = _e; } + { auto _e = batch_dim(); _o->batch_dim = _e; } +} + +inline ::flatbuffers::Offset ReverseSequenceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateReverseSequenceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateReverseSequenceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReverseSequenceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _seq_dim = _o->seq_dim; + auto _batch_dim = _o->batch_dim; + return tflite::CreateReverseSequenceOptions( + _fbb, + _seq_dim, + _batch_dim); +} + +inline MatrixDiagOptionsT *MatrixDiagOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MatrixDiagOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void MatrixDiagOptions::UnPackTo(MatrixDiagOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset MatrixDiagOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixDiagOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMatrixDiagOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMatrixDiagOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixDiagOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MatrixDiagOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateMatrixDiagOptions( + _fbb); +} + +inline 
QuantizeOptionsT *QuantizeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new QuantizeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void QuantizeOptions::UnPackTo(QuantizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset QuantizeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateQuantizeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateQuantizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const QuantizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const QuantizeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateQuantizeOptions( + _fbb); +} + +inline MatrixSetDiagOptionsT *MatrixSetDiagOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MatrixSetDiagOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void MatrixSetDiagOptions::UnPackTo(MatrixSetDiagOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset MatrixSetDiagOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixSetDiagOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMatrixSetDiagOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMatrixSetDiagOptions(::flatbuffers::FlatBufferBuilder &_fbb, const MatrixSetDiagOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MatrixSetDiagOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateMatrixSetDiagOptions( + _fbb); +} + +inline IfOptionsT *IfOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new IfOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void IfOptions::UnPackTo(IfOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = then_subgraph_index(); _o->then_subgraph_index = _e; } + { auto _e = else_subgraph_index(); _o->else_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset IfOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateIfOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateIfOptions(::flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const IfOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _then_subgraph_index = _o->then_subgraph_index; + auto _else_subgraph_index = _o->else_subgraph_index; + return tflite::CreateIfOptions( + _fbb, + _then_subgraph_index, + _else_subgraph_index); +} + +inline CallOnceOptionsT *CallOnceOptions::UnPack(const 
::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CallOnceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CallOnceOptions::UnPackTo(CallOnceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = init_subgraph_index(); _o->init_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset CallOnceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CallOnceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCallOnceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCallOnceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CallOnceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CallOnceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _init_subgraph_index = _o->init_subgraph_index; + return tflite::CreateCallOnceOptions( + _fbb, + _init_subgraph_index); +} + +inline WhileOptionsT *WhileOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new WhileOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void WhileOptions::UnPackTo(WhileOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = cond_subgraph_index(); _o->cond_subgraph_index = _e; } + { auto _e = body_subgraph_index(); _o->body_subgraph_index = _e; } +} + +inline ::flatbuffers::Offset WhileOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateWhileOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateWhileOptions(::flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const WhileOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _cond_subgraph_index = _o->cond_subgraph_index; + auto _body_subgraph_index = _o->body_subgraph_index; + return tflite::CreateWhileOptions( + _fbb, + _cond_subgraph_index, + _body_subgraph_index); +} + +inline NonMaxSuppressionV4OptionsT *NonMaxSuppressionV4Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new NonMaxSuppressionV4OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void NonMaxSuppressionV4Options::UnPackTo(NonMaxSuppressionV4OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset NonMaxSuppressionV4Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV4OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateNonMaxSuppressionV4Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateNonMaxSuppressionV4Options(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV4OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const NonMaxSuppressionV4OptionsT* __o; const 
::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateNonMaxSuppressionV4Options( + _fbb); +} + +inline NonMaxSuppressionV5OptionsT *NonMaxSuppressionV5Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new NonMaxSuppressionV5OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void NonMaxSuppressionV5Options::UnPackTo(NonMaxSuppressionV5OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset NonMaxSuppressionV5Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV5OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateNonMaxSuppressionV5Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateNonMaxSuppressionV5Options(::flatbuffers::FlatBufferBuilder &_fbb, const NonMaxSuppressionV5OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const NonMaxSuppressionV5OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateNonMaxSuppressionV5Options( + _fbb); +} + +inline ScatterNdOptionsT *ScatterNdOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ScatterNdOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ScatterNdOptions::UnPackTo(ScatterNdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ScatterNdOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ScatterNdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateScatterNdOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateScatterNdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ScatterNdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ScatterNdOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateScatterNdOptions( + _fbb); +} + +inline SelectV2OptionsT *SelectV2Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SelectV2OptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SelectV2Options::UnPackTo(SelectV2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SelectV2Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SelectV2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSelectV2Options(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSelectV2Options(::flatbuffers::FlatBufferBuilder &_fbb, const SelectV2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SelectV2OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSelectV2Options( + _fbb); +} + +inline DensifyOptionsT 
*DensifyOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DensifyOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DensifyOptions::UnPackTo(DensifyOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset DensifyOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DensifyOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDensifyOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDensifyOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DensifyOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DensifyOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateDensifyOptions( + _fbb); +} + +inline SegmentSumOptionsT *SegmentSumOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SegmentSumOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SegmentSumOptions::UnPackTo(SegmentSumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset SegmentSumOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SegmentSumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSegmentSumOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSegmentSumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SegmentSumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SegmentSumOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateSegmentSumOptions( + _fbb); +} + +inline BatchMatMulOptionsT *BatchMatMulOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BatchMatMulOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BatchMatMulOptions::UnPackTo(BatchMatMulOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = adj_x(); _o->adj_x = _e; } + { auto _e = adj_y(); _o->adj_y = _e; } + { auto _e = asymmetric_quantize_inputs(); _o->asymmetric_quantize_inputs = _e; } +} + +inline ::flatbuffers::Offset BatchMatMulOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BatchMatMulOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBatchMatMulOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBatchMatMulOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BatchMatMulOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BatchMatMulOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _adj_x = _o->adj_x; + auto _adj_y = _o->adj_y; + auto _asymmetric_quantize_inputs = _o->asymmetric_quantize_inputs; + return tflite::CreateBatchMatMulOptions( + _fbb, + _adj_x, + _adj_y, + 
_asymmetric_quantize_inputs); +} + +inline CumsumOptionsT *CumsumOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new CumsumOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void CumsumOptions::UnPackTo(CumsumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = exclusive(); _o->exclusive = _e; } + { auto _e = reverse(); _o->reverse = _e; } +} + +inline ::flatbuffers::Offset CumsumOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const CumsumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateCumsumOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateCumsumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const CumsumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const CumsumOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _exclusive = _o->exclusive; + auto _reverse = _o->reverse; + return tflite::CreateCumsumOptions( + _fbb, + _exclusive, + _reverse); +} + +inline BroadcastToOptionsT *BroadcastToOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BroadcastToOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BroadcastToOptions::UnPackTo(BroadcastToOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset BroadcastToOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BroadcastToOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBroadcastToOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBroadcastToOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BroadcastToOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BroadcastToOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateBroadcastToOptions( + _fbb); +} + +inline Rfft2dOptionsT *Rfft2dOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new Rfft2dOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Rfft2dOptions::UnPackTo(Rfft2dOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset Rfft2dOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const Rfft2dOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateRfft2dOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateRfft2dOptions(::flatbuffers::FlatBufferBuilder &_fbb, const Rfft2dOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const Rfft2dOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateRfft2dOptions( + _fbb); +} + +inline HashtableOptionsT *HashtableOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new 
HashtableOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void HashtableOptions::UnPackTo(HashtableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = table_id(); _o->table_id = _e; } + { auto _e = key_dtype(); _o->key_dtype = _e; } + { auto _e = value_dtype(); _o->value_dtype = _e; } +} + +inline ::flatbuffers::Offset HashtableOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateHashtableOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateHashtableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const HashtableOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _table_id = _o->table_id; + auto _key_dtype = _o->key_dtype; + auto _value_dtype = _o->value_dtype; + return tflite::CreateHashtableOptions( + _fbb, + _table_id, + _key_dtype, + _value_dtype); +} + +inline HashtableFindOptionsT *HashtableFindOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new HashtableFindOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void HashtableFindOptions::UnPackTo(HashtableFindOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset HashtableFindOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableFindOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateHashtableFindOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateHashtableFindOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableFindOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const HashtableFindOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateHashtableFindOptions( + _fbb); +} + +inline HashtableImportOptionsT *HashtableImportOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new HashtableImportOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void HashtableImportOptions::UnPackTo(HashtableImportOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset HashtableImportOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableImportOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateHashtableImportOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateHashtableImportOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableImportOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const HashtableImportOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateHashtableImportOptions( + _fbb); +} + +inline HashtableSizeOptionsT 
*HashtableSizeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new HashtableSizeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void HashtableSizeOptions::UnPackTo(HashtableSizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset HashtableSizeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableSizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateHashtableSizeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateHashtableSizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const HashtableSizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const HashtableSizeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateHashtableSizeOptions( + _fbb); +} + +inline VarHandleOptionsT *VarHandleOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new VarHandleOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void VarHandleOptions::UnPackTo(VarHandleOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = container(); if (_e) _o->container = _e->str(); } + { auto _e = shared_name(); if (_e) _o->shared_name = _e->str(); } +} + +inline ::flatbuffers::Offset VarHandleOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const VarHandleOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateVarHandleOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateVarHandleOptions(::flatbuffers::FlatBufferBuilder &_fbb, const VarHandleOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const VarHandleOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _container = _o->container.empty() ? 0 : _fbb.CreateString(_o->container); + auto _shared_name = _o->shared_name.empty() ? 
0 : _fbb.CreateString(_o->shared_name); + return tflite::CreateVarHandleOptions( + _fbb, + _container, + _shared_name); +} + +inline ReadVariableOptionsT *ReadVariableOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ReadVariableOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ReadVariableOptions::UnPackTo(ReadVariableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset ReadVariableOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReadVariableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateReadVariableOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateReadVariableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReadVariableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReadVariableOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateReadVariableOptions( + _fbb); +} + +inline AssignVariableOptionsT *AssignVariableOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new AssignVariableOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void AssignVariableOptions::UnPackTo(AssignVariableOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset AssignVariableOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const AssignVariableOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateAssignVariableOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateAssignVariableOptions(::flatbuffers::FlatBufferBuilder &_fbb, const AssignVariableOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const AssignVariableOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateAssignVariableOptions( + _fbb); +} + +inline RandomOptionsT *RandomOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new RandomOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void RandomOptions::UnPackTo(RandomOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = seed(); _o->seed = _e; } + { auto _e = seed2(); _o->seed2 = _e; } +} + +inline ::flatbuffers::Offset RandomOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RandomOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateRandomOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateRandomOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RandomOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const RandomOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _seed = _o->seed; + auto _seed2 = _o->seed2; + return tflite::CreateRandomOptions( + _fbb, + 
_seed, + _seed2); +} + +inline BucketizeOptionsT *BucketizeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BucketizeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void BucketizeOptions::UnPackTo(BucketizeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = boundaries(); if (_e) { _o->boundaries.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->boundaries[_i] = _e->Get(_i); } } else { _o->boundaries.resize(0); } } +} + +inline ::flatbuffers::Offset BucketizeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BucketizeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBucketizeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBucketizeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BucketizeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BucketizeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _boundaries = _o->boundaries.size() ? _fbb.CreateVector(_o->boundaries) : 0; + return tflite::CreateBucketizeOptions( + _fbb, + _boundaries); +} + +inline GeluOptionsT *GeluOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new GeluOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void GeluOptions::UnPackTo(GeluOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = approximate(); _o->approximate = _e; } +} + +inline ::flatbuffers::Offset GeluOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateGeluOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateGeluOptions(::flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const GeluOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _approximate = _o->approximate; + return tflite::CreateGeluOptions( + _fbb, + _approximate); +} + +inline DynamicUpdateSliceOptionsT *DynamicUpdateSliceOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new DynamicUpdateSliceOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void DynamicUpdateSliceOptions::UnPackTo(DynamicUpdateSliceOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset DynamicUpdateSliceOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateDynamicUpdateSliceOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateDynamicUpdateSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const 
+
+inline ::flatbuffers::Offset<DynamicUpdateSliceOptions> CreateDynamicUpdateSliceOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DynamicUpdateSliceOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateDynamicUpdateSliceOptions(
+      _fbb);
+}
+
+inline UnsortedSegmentProdOptionsT *UnsortedSegmentProdOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<UnsortedSegmentProdOptionsT>(new UnsortedSegmentProdOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void UnsortedSegmentProdOptions::UnPackTo(UnsortedSegmentProdOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentProdOptions> UnsortedSegmentProdOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnsortedSegmentProdOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentProdOptions> CreateUnsortedSegmentProdOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnsortedSegmentProdOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateUnsortedSegmentProdOptions(
+      _fbb);
+}
+
+inline UnsortedSegmentMaxOptionsT *UnsortedSegmentMaxOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<UnsortedSegmentMaxOptionsT>(new UnsortedSegmentMaxOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void UnsortedSegmentMaxOptions::UnPackTo(UnsortedSegmentMaxOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentMaxOptions> UnsortedSegmentMaxOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMaxOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnsortedSegmentMaxOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentMaxOptions> CreateUnsortedSegmentMaxOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMaxOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnsortedSegmentMaxOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateUnsortedSegmentMaxOptions(
+      _fbb);
+}
+
+inline UnsortedSegmentSumOptionsT *UnsortedSegmentSumOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<UnsortedSegmentSumOptionsT>(new UnsortedSegmentSumOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void UnsortedSegmentSumOptions::UnPackTo(UnsortedSegmentSumOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentSumOptions> UnsortedSegmentSumOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentSumOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnsortedSegmentSumOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentSumOptions> CreateUnsortedSegmentSumOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentSumOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnsortedSegmentSumOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateUnsortedSegmentSumOptions(
+      _fbb);
+}
+
+inline ATan2OptionsT *ATan2Options::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<ATan2OptionsT>(new ATan2OptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void ATan2Options::UnPackTo(ATan2OptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<ATan2Options> ATan2Options::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ATan2OptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateATan2Options(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<ATan2Options> CreateATan2Options(::flatbuffers::FlatBufferBuilder &_fbb, const ATan2OptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ATan2OptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateATan2Options(
+      _fbb);
+}
+
+inline UnsortedSegmentMinOptionsT *UnsortedSegmentMinOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<UnsortedSegmentMinOptionsT>(new UnsortedSegmentMinOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void UnsortedSegmentMinOptions::UnPackTo(UnsortedSegmentMinOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentMinOptions> UnsortedSegmentMinOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMinOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnsortedSegmentMinOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<UnsortedSegmentMinOptions> CreateUnsortedSegmentMinOptions(::flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentMinOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const UnsortedSegmentMinOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateUnsortedSegmentMinOptions(
+      _fbb);
+}
+
+inline SignOptionsT *SignOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<SignOptionsT>(new SignOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void SignOptions::UnPackTo(SignOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<SignOptions> SignOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSignOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<SignOptions> CreateSignOptions(::flatbuffers::FlatBufferBuilder &_fbb, const SignOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SignOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSignOptions(
+      _fbb);
+}
+
+inline BitcastOptionsT *BitcastOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<BitcastOptionsT>(new BitcastOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void BitcastOptions::UnPackTo(BitcastOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<BitcastOptions> BitcastOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBitcastOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<BitcastOptions> CreateBitcastOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BitcastOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BitcastOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateBitcastOptions(
+      _fbb);
+}
+
+inline BitwiseXorOptionsT *BitwiseXorOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<BitwiseXorOptionsT>(new BitwiseXorOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void BitwiseXorOptions::UnPackTo(BitwiseXorOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<BitwiseXorOptions> BitwiseXorOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBitwiseXorOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<BitwiseXorOptions> CreateBitwiseXorOptions(::flatbuffers::FlatBufferBuilder &_fbb, const BitwiseXorOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BitwiseXorOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateBitwiseXorOptions(
+      _fbb);
+}
+
+inline RightShiftOptionsT *RightShiftOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<RightShiftOptionsT>(new RightShiftOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void RightShiftOptions::UnPackTo(RightShiftOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<RightShiftOptions> RightShiftOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRightShiftOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<RightShiftOptions> CreateRightShiftOptions(::flatbuffers::FlatBufferBuilder &_fbb, const RightShiftOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const RightShiftOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateRightShiftOptions(
+      _fbb);
+}
+
+inline DilateOptionsT *DilateOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<DilateOptionsT>(new DilateOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void DilateOptions::UnPackTo(DilateOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline ::flatbuffers::Offset<DilateOptions> DilateOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const DilateOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateDilateOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<DilateOptions> CreateDilateOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DilateOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const DilateOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateDilateOptions(
+      _fbb);
+}
+
+inline ReduceWindowOptionsT *ReduceWindowOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<ReduceWindowOptionsT>(new ReduceWindowOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void ReduceWindowOptions::UnPackTo(ReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = reduce_function(); _o->reduce_function = _e; }
+}
+
+inline ::flatbuffers::Offset<ReduceWindowOptions> ReduceWindowOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReduceWindowOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<ReduceWindowOptions> CreateReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReduceWindowOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _reduce_function = _o->reduce_function;
+  return tflite::CreateReduceWindowOptions(
+      _fbb,
+      _reduce_function);
+}
+
+inline OperatorCodeT *OperatorCode::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<OperatorCodeT>(new OperatorCodeT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void OperatorCode::UnPackTo(OperatorCodeT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = deprecated_builtin_code(); _o->deprecated_builtin_code = _e; }
+  { auto _e = custom_code(); if (_e) _o->custom_code = _e->str(); }
+  { auto _e = version(); _o->version = _e; }
+  { auto _e = builtin_code(); _o->builtin_code = _e; }
+}
+
+inline ::flatbuffers::Offset<OperatorCode> OperatorCode::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateOperatorCode(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<OperatorCode> CreateOperatorCode(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const OperatorCodeT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _deprecated_builtin_code = _o->deprecated_builtin_code;
+  auto _custom_code = _o->custom_code.empty() ?
0 : _fbb.CreateString(_o->custom_code); + auto _version = _o->version; + auto _builtin_code = _o->builtin_code; + return tflite::CreateOperatorCode( + _fbb, + _deprecated_builtin_code, + _custom_code, + _version, + _builtin_code); +} + +inline StableHLOCompositeOptionsT *StableHLOCompositeOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StableHLOCompositeOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StableHLOCompositeOptions::UnPackTo(StableHLOCompositeOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = name(); if (_e) _o->name = _e->str(); } + { auto _e = decomposition_subgraph_index(); _o->decomposition_subgraph_index = _e; } + { auto _e = composite_attributes(); if (_e) { _o->composite_attributes.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->composite_attributes.begin()); } } + { auto _e = composite_attributes_format(); _o->composite_attributes_format = _e; } + { auto _e = version(); _o->version = _e; } +} + +inline ::flatbuffers::Offset StableHLOCompositeOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StableHLOCompositeOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStableHLOCompositeOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStableHLOCompositeOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StableHLOCompositeOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StableHLOCompositeOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); + auto _decomposition_subgraph_index = _o->decomposition_subgraph_index; + auto _composite_attributes = _o->composite_attributes.size() ? 
_fbb.CreateVector(_o->composite_attributes) : 0; + auto _composite_attributes_format = _o->composite_attributes_format; + auto _version = _o->version; + return tflite::CreateStableHLOCompositeOptions( + _fbb, + _name, + _decomposition_subgraph_index, + _composite_attributes, + _composite_attributes_format, + _version); +} + +inline StablehloShiftLeftOptionsT *StablehloShiftLeftOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new StablehloShiftLeftOptionsT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void StablehloShiftLeftOptions::UnPackTo(StablehloShiftLeftOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline ::flatbuffers::Offset StablehloShiftLeftOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloShiftLeftOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateStablehloShiftLeftOptions(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateStablehloShiftLeftOptions(::flatbuffers::FlatBufferBuilder &_fbb, const StablehloShiftLeftOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const StablehloShiftLeftOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateStablehloShiftLeftOptions( + _fbb); +} + +inline OperatorT *Operator::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new OperatorT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Operator::UnPackTo(OperatorT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = opcode_index(); _o->opcode_index = _e; } + { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } else { _o->inputs.resize(0); } } + { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } else { _o->outputs.resize(0); } } + { auto _e = builtin_options_type(); _o->builtin_options.type = _e; } + { auto _e = builtin_options(); if (_e) _o->builtin_options.value = tflite::BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver); } + { auto _e = custom_options(); if (_e) { _o->custom_options.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->custom_options.begin()); } } + { auto _e = custom_options_format(); _o->custom_options_format = _e; } + { auto _e = mutating_variable_inputs(); if (_e) { _o->mutating_variable_inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->mutating_variable_inputs[_i] = _e->Get(_i) != 0; } } else { _o->mutating_variable_inputs.resize(0); } } + { auto _e = intermediates(); if (_e) { _o->intermediates.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->intermediates[_i] = _e->Get(_i); } } else { _o->intermediates.resize(0); } } + { auto _e = large_custom_options_offset(); _o->large_custom_options_offset = _e; } + { auto _e = large_custom_options_size(); _o->large_custom_options_size = _e; } + { auto _e = builtin_options_2_type(); _o->builtin_options_2.type = _e; } + { auto _e = builtin_options_2(); if (_e) _o->builtin_options_2.value = 
tflite::BuiltinOptions2Union::UnPack(_e, builtin_options_2_type(), _resolver); } + { auto _e = debug_metadata_index(); _o->debug_metadata_index = _e; } +} + +inline ::flatbuffers::Offset Operator::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateOperator(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateOperator(::flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const OperatorT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _opcode_index = _o->opcode_index; + auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0; + auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0; + auto _builtin_options_type = _o->builtin_options.type; + auto _builtin_options = _o->builtin_options.Pack(_fbb); + auto _custom_options = _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0; + auto _custom_options_format = _o->custom_options_format; + auto _mutating_variable_inputs = _o->mutating_variable_inputs.size() ? _fbb.CreateVector(_o->mutating_variable_inputs) : 0; + auto _intermediates = _o->intermediates.size() ? _fbb.CreateVector(_o->intermediates) : 0; + auto _large_custom_options_offset = _o->large_custom_options_offset; + auto _large_custom_options_size = _o->large_custom_options_size; + auto _builtin_options_2_type = _o->builtin_options_2.type; + auto _builtin_options_2 = _o->builtin_options_2.Pack(_fbb); + auto _debug_metadata_index = _o->debug_metadata_index; + return tflite::CreateOperator( + _fbb, + _opcode_index, + _inputs, + _outputs, + _builtin_options_type, + _builtin_options, + _custom_options, + _custom_options_format, + _mutating_variable_inputs, + _intermediates, + _large_custom_options_offset, + _large_custom_options_size, + _builtin_options_2_type, + _builtin_options_2, + _debug_metadata_index); +} + +inline SubGraphT::SubGraphT(const SubGraphT &o) + : inputs(o.inputs), + outputs(o.outputs), + name(o.name), + debug_metadata_index(o.debug_metadata_index) { + tensors.reserve(o.tensors.size()); + for (const auto &tensors_ : o.tensors) { tensors.emplace_back((tensors_) ? new tflite::TensorT(*tensors_) : nullptr); } + operators.reserve(o.operators.size()); + for (const auto &operators_ : o.operators) { operators.emplace_back((operators_) ? 
new tflite::OperatorT(*operators_) : nullptr); } +} + +inline SubGraphT &SubGraphT::operator=(SubGraphT o) FLATBUFFERS_NOEXCEPT { + std::swap(tensors, o.tensors); + std::swap(inputs, o.inputs); + std::swap(outputs, o.outputs); + std::swap(operators, o.operators); + std::swap(name, o.name); + std::swap(debug_metadata_index, o.debug_metadata_index); + return *this; +} + +inline SubGraphT *SubGraph::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SubGraphT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SubGraph::UnPackTo(SubGraphT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->tensors[_i]) { _e->Get(_i)->UnPackTo(_o->tensors[_i].get(), _resolver); } else { _o->tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->tensors.resize(0); } } + { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } else { _o->inputs.resize(0); } } + { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } else { _o->outputs.resize(0); } } + { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operators[_i]) { _e->Get(_i)->UnPackTo(_o->operators[_i].get(), _resolver); } else { _o->operators[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operators.resize(0); } } + { auto _e = name(); if (_e) _o->name = _e->str(); } + { auto _e = debug_metadata_index(); _o->debug_metadata_index = _e; } +} + +inline ::flatbuffers::Offset SubGraph::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSubGraph(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSubGraph(::flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SubGraphT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _tensors = _o->tensors.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0; + auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0; + auto _operators = _o->operators.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->operators.size(), [](size_t i, _VectorArgs *__va) { return CreateOperator(*__va->__fbb, __va->__o->operators[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _name = _o->name.empty() ? 
0 : _fbb.CreateString(_o->name); + auto _debug_metadata_index = _o->debug_metadata_index; + return tflite::CreateSubGraph( + _fbb, + _tensors, + _inputs, + _outputs, + _operators, + _name, + _debug_metadata_index); +} + +inline BufferT *Buffer::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new BufferT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Buffer::UnPackTo(BufferT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = data(); if (_e) { _o->data.resize(_e->size()); std::copy(_e->begin(), _e->end(), _o->data.begin()); } } + { auto _e = offset(); _o->offset = _e; } + { auto _e = size(); _o->size = _e; } +} + +inline ::flatbuffers::Offset Buffer::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateBuffer(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const BufferT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->data.size(), sizeof(uint8_t), 16); + auto _data = _o->data.size() ? _fbb.CreateVector(_o->data) : 0; + auto _offset = _o->offset; + auto _size = _o->size; + return tflite::CreateBuffer( + _fbb, + _data, + _offset, + _size); +} + +inline MetadataT *Metadata::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new MetadataT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Metadata::UnPackTo(MetadataT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = name(); if (_e) _o->name = _e->str(); } + { auto _e = buffer(); _o->buffer = _e; } +} + +inline ::flatbuffers::Offset Metadata::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const MetadataT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateMetadata(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateMetadata(::flatbuffers::FlatBufferBuilder &_fbb, const MetadataT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const MetadataT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _name = _o->name.empty() ? 
0 : _fbb.CreateString(_o->name); + auto _buffer = _o->buffer; + return tflite::CreateMetadata( + _fbb, + _name, + _buffer); +} + +inline TensorMapT *TensorMap::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new TensorMapT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void TensorMap::UnPackTo(TensorMapT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = name(); if (_e) _o->name = _e->str(); } + { auto _e = tensor_index(); _o->tensor_index = _e; } +} + +inline ::flatbuffers::Offset TensorMap::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorMapT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateTensorMap(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateTensorMap(::flatbuffers::FlatBufferBuilder &_fbb, const TensorMapT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const TensorMapT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); + auto _tensor_index = _o->tensor_index; + return tflite::CreateTensorMap( + _fbb, + _name, + _tensor_index); +} + +inline SignatureDefT::SignatureDefT(const SignatureDefT &o) + : signature_key(o.signature_key), + subgraph_index(o.subgraph_index) { + inputs.reserve(o.inputs.size()); + for (const auto &inputs_ : o.inputs) { inputs.emplace_back((inputs_) ? new tflite::TensorMapT(*inputs_) : nullptr); } + outputs.reserve(o.outputs.size()); + for (const auto &outputs_ : o.outputs) { outputs.emplace_back((outputs_) ? new tflite::TensorMapT(*outputs_) : nullptr); } +} + +inline SignatureDefT &SignatureDefT::operator=(SignatureDefT o) FLATBUFFERS_NOEXCEPT { + std::swap(inputs, o.inputs); + std::swap(outputs, o.outputs); + std::swap(signature_key, o.signature_key); + std::swap(subgraph_index, o.subgraph_index); + return *this; +} + +inline SignatureDefT *SignatureDef::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new SignatureDefT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void SignatureDef::UnPackTo(SignatureDefT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->inputs[_i]) { _e->Get(_i)->UnPackTo(_o->inputs[_i].get(), _resolver); } else { _o->inputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->inputs.resize(0); } } + { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->outputs[_i]) { _e->Get(_i)->UnPackTo(_o->outputs[_i].get(), _resolver); } else { _o->outputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->outputs.resize(0); } } + { auto _e = signature_key(); if (_e) _o->signature_key = _e->str(); } + { auto _e = subgraph_index(); _o->subgraph_index = _e; } +} + +inline ::flatbuffers::Offset SignatureDef::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SignatureDefT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateSignatureDef(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateSignatureDef(::flatbuffers::FlatBufferBuilder &_fbb, const 
SignatureDefT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const SignatureDefT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _inputs = _o->inputs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->inputs.size(), [](size_t i, _VectorArgs *__va) { return CreateTensorMap(*__va->__fbb, __va->__o->inputs[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _outputs = _o->outputs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->outputs.size(), [](size_t i, _VectorArgs *__va) { return CreateTensorMap(*__va->__fbb, __va->__o->outputs[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _signature_key = _o->signature_key.empty() ? 0 : _fbb.CreateString(_o->signature_key); + auto _subgraph_index = _o->subgraph_index; + return tflite::CreateSignatureDef( + _fbb, + _inputs, + _outputs, + _signature_key, + _subgraph_index); +} + +inline ModelT::ModelT(const ModelT &o) + : version(o.version), + description(o.description), + metadata_buffer(o.metadata_buffer) { + operator_codes.reserve(o.operator_codes.size()); + for (const auto &operator_codes_ : o.operator_codes) { operator_codes.emplace_back((operator_codes_) ? new tflite::OperatorCodeT(*operator_codes_) : nullptr); } + subgraphs.reserve(o.subgraphs.size()); + for (const auto &subgraphs_ : o.subgraphs) { subgraphs.emplace_back((subgraphs_) ? new tflite::SubGraphT(*subgraphs_) : nullptr); } + buffers.reserve(o.buffers.size()); + for (const auto &buffers_ : o.buffers) { buffers.emplace_back((buffers_) ? new tflite::BufferT(*buffers_) : nullptr); } + metadata.reserve(o.metadata.size()); + for (const auto &metadata_ : o.metadata) { metadata.emplace_back((metadata_) ? new tflite::MetadataT(*metadata_) : nullptr); } + signature_defs.reserve(o.signature_defs.size()); + for (const auto &signature_defs_ : o.signature_defs) { signature_defs.emplace_back((signature_defs_) ? 
new tflite::SignatureDefT(*signature_defs_) : nullptr); } +} + +inline ModelT &ModelT::operator=(ModelT o) FLATBUFFERS_NOEXCEPT { + std::swap(version, o.version); + std::swap(operator_codes, o.operator_codes); + std::swap(subgraphs, o.subgraphs); + std::swap(description, o.description); + std::swap(buffers, o.buffers); + std::swap(metadata_buffer, o.metadata_buffer); + std::swap(metadata, o.metadata); + std::swap(signature_defs, o.signature_defs); + return *this; +} + +inline ModelT *Model::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ModelT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void Model::UnPackTo(ModelT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = version(); _o->version = _e; } + { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operator_codes[_i]) { _e->Get(_i)->UnPackTo(_o->operator_codes[_i].get(), _resolver); } else { _o->operator_codes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operator_codes.resize(0); } } + { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->subgraphs[_i]) { _e->Get(_i)->UnPackTo(_o->subgraphs[_i].get(), _resolver); } else { _o->subgraphs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->subgraphs.resize(0); } } + { auto _e = description(); if (_e) _o->description = _e->str(); } + { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->buffers[_i]) { _e->Get(_i)->UnPackTo(_o->buffers[_i].get(), _resolver); } else { _o->buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->buffers.resize(0); } } + { auto _e = metadata_buffer(); if (_e) { _o->metadata_buffer.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->metadata_buffer[_i] = _e->Get(_i); } } else { _o->metadata_buffer.resize(0); } } + { auto _e = metadata(); if (_e) { _o->metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->metadata[_i]) { _e->Get(_i)->UnPackTo(_o->metadata[_i].get(), _resolver); } else { _o->metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->metadata.resize(0); } } + { auto _e = signature_defs(); if (_e) { _o->signature_defs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->signature_defs[_i]) { _e->Get(_i)->UnPackTo(_o->signature_defs[_i].get(), _resolver); } else { _o->signature_defs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->signature_defs.resize(0); } } +} + +inline ::flatbuffers::Offset Model::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateModel(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ModelT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _version = _o->version; + auto _operator_codes = _o->operator_codes.size() ? 
_fbb.CreateVector<::flatbuffers::Offset> (_o->operator_codes.size(), [](size_t i, _VectorArgs *__va) { return CreateOperatorCode(*__va->__fbb, __va->__o->operator_codes[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _subgraphs = _o->subgraphs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->subgraphs.size(), [](size_t i, _VectorArgs *__va) { return CreateSubGraph(*__va->__fbb, __va->__o->subgraphs[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _description = _o->description.empty() ? 0 : _fbb.CreateString(_o->description); + auto _buffers = _o->buffers.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->buffers.size(), [](size_t i, _VectorArgs *__va) { return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _metadata_buffer = _o->metadata_buffer.size() ? _fbb.CreateVector(_o->metadata_buffer) : 0; + auto _metadata = _o->metadata.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateMetadata(*__va->__fbb, __va->__o->metadata[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _signature_defs = _o->signature_defs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->signature_defs.size(), [](size_t i, _VectorArgs *__va) { return CreateSignatureDef(*__va->__fbb, __va->__o->signature_defs[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateModel( + _fbb, + _version, + _operator_codes, + _subgraphs, + _description, + _buffers, + _metadata_buffer, + _metadata, + _signature_defs); +} + +inline bool VerifyQuantizationDetails(::flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type) { + switch (type) { + case QuantizationDetails_NONE: { + return true; + } + case QuantizationDetails_CustomQuantization: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case QuantizationDetails_BlockwiseQuantization: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyQuantizationDetailsVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyQuantizationDetails( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline void *QuantizationDetailsUnion::UnPack(const void *obj, QuantizationDetails type, const ::flatbuffers::resolver_function_t *resolver) { + (void)resolver; + switch (type) { + case QuantizationDetails_CustomQuantization: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case QuantizationDetails_BlockwiseQuantization: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + default: return nullptr; + } +} + +inline ::flatbuffers::Offset QuantizationDetailsUnion::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher) const { + (void)_rehasher; + switch (type) { + case QuantizationDetails_CustomQuantization: { + auto ptr = reinterpret_cast(value); + return CreateCustomQuantization(_fbb, ptr, _rehasher).Union(); + } + case QuantizationDetails_BlockwiseQuantization: { + auto ptr = reinterpret_cast(value); + return CreateBlockwiseQuantization(_fbb, ptr, _rehasher).Union(); + } + default: return 0; + } +} + +inline 
QuantizationDetailsUnion::QuantizationDetailsUnion(const QuantizationDetailsUnion &u) : type(u.type), value(nullptr) { + switch (type) { + case QuantizationDetails_CustomQuantization: { + value = new tflite::CustomQuantizationT(*reinterpret_cast(u.value)); + break; + } + case QuantizationDetails_BlockwiseQuantization: { + value = new tflite::BlockwiseQuantizationT(*reinterpret_cast(u.value)); + break; + } + default: + break; + } +} + +inline void QuantizationDetailsUnion::Reset() { + switch (type) { + case QuantizationDetails_CustomQuantization: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case QuantizationDetails_BlockwiseQuantization: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + default: break; + } + value = nullptr; + type = QuantizationDetails_NONE; +} + +inline bool VerifySparseIndexVector(::flatbuffers::Verifier &verifier, const void *obj, SparseIndexVector type) { + switch (type) { + case SparseIndexVector_NONE: { + return true; + } + case SparseIndexVector_Int32Vector: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case SparseIndexVector_Uint16Vector: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case SparseIndexVector_Uint8Vector: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifySparseIndexVectorVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifySparseIndexVector( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline void *SparseIndexVectorUnion::UnPack(const void *obj, SparseIndexVector type, const ::flatbuffers::resolver_function_t *resolver) { + (void)resolver; + switch (type) { + case SparseIndexVector_Int32Vector: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case SparseIndexVector_Uint16Vector: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case SparseIndexVector_Uint8Vector: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + default: return nullptr; + } +} + +inline ::flatbuffers::Offset SparseIndexVectorUnion::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher) const { + (void)_rehasher; + switch (type) { + case SparseIndexVector_Int32Vector: { + auto ptr = reinterpret_cast(value); + return CreateInt32Vector(_fbb, ptr, _rehasher).Union(); + } + case SparseIndexVector_Uint16Vector: { + auto ptr = reinterpret_cast(value); + return CreateUint16Vector(_fbb, ptr, _rehasher).Union(); + } + case SparseIndexVector_Uint8Vector: { + auto ptr = reinterpret_cast(value); + return CreateUint8Vector(_fbb, ptr, _rehasher).Union(); + } + default: return 0; + } +} + +inline SparseIndexVectorUnion::SparseIndexVectorUnion(const SparseIndexVectorUnion &u) : type(u.type), value(nullptr) { + switch (type) { + case SparseIndexVector_Int32Vector: { + value = new tflite::Int32VectorT(*reinterpret_cast(u.value)); + break; + } + case SparseIndexVector_Uint16Vector: { + value = new tflite::Uint16VectorT(*reinterpret_cast(u.value)); + break; + } + case SparseIndexVector_Uint8Vector: { + value = new tflite::Uint8VectorT(*reinterpret_cast(u.value)); + break; + } + 
default: + break; + } +} + +inline void SparseIndexVectorUnion::Reset() { + switch (type) { + case SparseIndexVector_Int32Vector: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case SparseIndexVector_Uint16Vector: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case SparseIndexVector_Uint8Vector: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + default: break; + } + value = nullptr; + type = SparseIndexVector_NONE; +} + +inline bool VerifyBuiltinOptions(::flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) { + switch (type) { + case BuiltinOptions_NONE: { + return true; + } + case BuiltinOptions_Conv2DOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DepthwiseConv2DOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ConcatEmbeddingsOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LSHProjectionOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_Pool2DOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SVDFOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RNNOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_FullyConnectedOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SoftmaxOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ConcatenationOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_AddOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_L2NormOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LocalResponseNormalizationOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LSTMOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ResizeBilinearOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_CallOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ReshapeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SkipGramOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SpaceToDepthOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_EmbeddingLookupSparseOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_MulOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_PadOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_GatherOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BatchToSpaceNDOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SpaceToBatchNDOptions: { 
+ auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_TransposeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ReducerOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SubOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DivOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SqueezeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SequenceRNNOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_StridedSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ExpOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_TopKV2Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SplitOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LogSoftmaxOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_CastOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DequantizeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_MaximumMinimumOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ArgMaxOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LessOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_NegOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_PadV2Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_GreaterOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_GreaterEqualOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LessEqualOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SelectOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_TransposeConvOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SparseToDenseOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_TileOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ExpandDimsOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_EqualOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_NotEqualOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ShapeOptions: { + auto ptr = reinterpret_cast(obj); + return 
verifier.VerifyTable(ptr); + } + case BuiltinOptions_PowOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ArgMinOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_FakeQuantOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_PackOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LogicalOrOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_OneHotOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnpackOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_FloorDivOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SquareOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ZerosLikeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_FillOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BidirectionalSequenceRNNOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_FloorModOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RangeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ResizeNearestNeighborOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LeakyReluOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SquaredDifferenceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_MirrorPadOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_AbsOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SplitVOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UniqueOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ReverseV2Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_AddNOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_GatherNdOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_CosOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_WhereOptions: { + auto ptr = reinterpret_cast(obj); + return 
verifier.VerifyTable(ptr); + } + case BuiltinOptions_RankOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ReverseSequenceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_MatrixDiagOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_QuantizeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_MatrixSetDiagOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_HardSwishOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DepthToSpaceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_NonMaxSuppressionV4Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_NonMaxSuppressionV5Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ScatterNdOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SelectV2Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DensifyOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SegmentSumOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BatchMatMulOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_CumsumOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_CallOnceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BroadcastToOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_Rfft2dOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_Conv3DOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_HashtableOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_HashtableFindOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_HashtableImportOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_HashtableSizeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_VarHandleOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ReadVariableOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_AssignVariableOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RandomOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BucketizeOptions: { + auto ptr = 
reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_GeluOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnsortedSegmentMaxOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnsortedSegmentMinOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnsortedSegmentSumOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_ATan2Options: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_SignOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyBuiltinOptionsVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyBuiltinOptions( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, const ::flatbuffers::resolver_function_t *resolver) { + (void)resolver; + switch (type) { + case BuiltinOptions_Conv2DOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DepthwiseConv2DOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ConcatEmbeddingsOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LSHProjectionOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_Pool2DOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SVDFOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RNNOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_FullyConnectedOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SoftmaxOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ConcatenationOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_AddOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_L2NormOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LocalResponseNormalizationOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); 
+ } + case BuiltinOptions_LSTMOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ResizeBilinearOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_CallOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ReshapeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SkipGramOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SpaceToDepthOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_EmbeddingLookupSparseOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_MulOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_PadOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_GatherOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BatchToSpaceNDOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SpaceToBatchNDOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_TransposeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ReducerOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SubOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DivOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SqueezeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SequenceRNNOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_StridedSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ExpOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_TopKV2Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SplitOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LogSoftmaxOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_CastOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DequantizeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_MaximumMinimumOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ArgMaxOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LessOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_NegOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_PadV2Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_GreaterOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_GreaterEqualOptions: { + auto ptr 
= reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LessEqualOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SelectOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_TransposeConvOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SparseToDenseOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_TileOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ExpandDimsOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_EqualOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_NotEqualOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ShapeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_PowOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ArgMinOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_FakeQuantOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_PackOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LogicalOrOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_OneHotOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnpackOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_FloorDivOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SquareOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ZerosLikeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_FillOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BidirectionalSequenceRNNOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_FloorModOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RangeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ResizeNearestNeighborOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LeakyReluOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SquaredDifferenceOptions: { + auto 
ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_MirrorPadOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_AbsOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SplitVOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UniqueOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ReverseV2Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_AddNOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_GatherNdOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_CosOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_WhereOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RankOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ReverseSequenceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_MatrixDiagOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_QuantizeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_MatrixSetDiagOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_HardSwishOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DepthToSpaceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_NonMaxSuppressionV4Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_NonMaxSuppressionV5Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ScatterNdOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SelectV2Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DensifyOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SegmentSumOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BatchMatMulOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_CumsumOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_CallOnceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BroadcastToOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_Rfft2dOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_Conv3DOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_HashtableOptions: { + auto ptr = reinterpret_cast(obj); + return 
ptr->UnPack(resolver); + } + case BuiltinOptions_HashtableFindOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_HashtableImportOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_HashtableSizeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_VarHandleOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ReadVariableOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_AssignVariableOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RandomOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BucketizeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_GeluOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnsortedSegmentMaxOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnsortedSegmentMinOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnsortedSegmentSumOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_ATan2Options: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_SignOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + default: return nullptr; + } +} + +inline ::flatbuffers::Offset BuiltinOptionsUnion::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher) const { + (void)_rehasher; + switch (type) { + case BuiltinOptions_Conv2DOptions: { + auto ptr = reinterpret_cast(value); + return CreateConv2DOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DepthwiseConv2DOptions: { + auto ptr = reinterpret_cast(value); + return CreateDepthwiseConv2DOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ConcatEmbeddingsOptions: { + auto ptr = reinterpret_cast(value); + return CreateConcatEmbeddingsOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LSHProjectionOptions: { + auto ptr = reinterpret_cast(value); + return CreateLSHProjectionOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_Pool2DOptions: { + auto ptr = reinterpret_cast(value); + return CreatePool2DOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SVDFOptions: { + auto ptr = reinterpret_cast(value); + return CreateSVDFOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RNNOptions: { + auto ptr = reinterpret_cast(value); + return CreateRNNOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_FullyConnectedOptions: { + 
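Editor's note: BuiltinOptionsUnion::UnPack, completed in the hunk above, dispatches on the union tag and deep-copies the wire table into the matching object-API struct (the `...OptionsT` types). It is rarely called directly; it runs as part of unpacking a whole model. A minimal usage sketch, assuming the usual flatc object API (Model::UnPack, OperatorT, and the generated AsConv2DOptions() accessor); the include name is a placeholder.

```cpp
#include <cstdint>
#include <memory>
#include "tflite_generated.h"  // placeholder name for this generated header

// Unpack a finished buffer into the mutable object API and read one
// operator's Conv2D options, if that is what its union holds.
void InspectFirstOp(const uint8_t *data) {
  std::unique_ptr<tflite::ModelT> model(tflite::GetModel(data)->UnPack());
  const std::unique_ptr<tflite::OperatorT> &op =
      model->subgraphs[0]->operators[0];
  if (op->builtin_options.type == tflite::BuiltinOptions_Conv2DOptions) {
    // AsConv2DOptions() is the generated typed accessor over `value`.
    auto *conv = op->builtin_options.AsConv2DOptions();
    (void)conv->stride_w;  // a field of Conv2DOptionsT
  }
}
```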
auto ptr = reinterpret_cast(value); + return CreateFullyConnectedOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SoftmaxOptions: { + auto ptr = reinterpret_cast(value); + return CreateSoftmaxOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ConcatenationOptions: { + auto ptr = reinterpret_cast(value); + return CreateConcatenationOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_AddOptions: { + auto ptr = reinterpret_cast(value); + return CreateAddOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_L2NormOptions: { + auto ptr = reinterpret_cast(value); + return CreateL2NormOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LocalResponseNormalizationOptions: { + auto ptr = reinterpret_cast(value); + return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LSTMOptions: { + auto ptr = reinterpret_cast(value); + return CreateLSTMOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ResizeBilinearOptions: { + auto ptr = reinterpret_cast(value); + return CreateResizeBilinearOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_CallOptions: { + auto ptr = reinterpret_cast(value); + return CreateCallOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ReshapeOptions: { + auto ptr = reinterpret_cast(value); + return CreateReshapeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SkipGramOptions: { + auto ptr = reinterpret_cast(value); + return CreateSkipGramOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SpaceToDepthOptions: { + auto ptr = reinterpret_cast(value); + return CreateSpaceToDepthOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_EmbeddingLookupSparseOptions: { + auto ptr = reinterpret_cast(value); + return CreateEmbeddingLookupSparseOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_MulOptions: { + auto ptr = reinterpret_cast(value); + return CreateMulOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_PadOptions: { + auto ptr = reinterpret_cast(value); + return CreatePadOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_GatherOptions: { + auto ptr = reinterpret_cast(value); + return CreateGatherOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BatchToSpaceNDOptions: { + auto ptr = reinterpret_cast(value); + return CreateBatchToSpaceNDOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SpaceToBatchNDOptions: { + auto ptr = reinterpret_cast(value); + return CreateSpaceToBatchNDOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_TransposeOptions: { + auto ptr = reinterpret_cast(value); + return CreateTransposeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ReducerOptions: { + auto ptr = reinterpret_cast(value); + return CreateReducerOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SubOptions: { + auto ptr = reinterpret_cast(value); + return CreateSubOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DivOptions: { + auto ptr = reinterpret_cast(value); + return CreateDivOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SqueezeOptions: { + auto ptr = reinterpret_cast(value); + return CreateSqueezeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SequenceRNNOptions: { + auto ptr = reinterpret_cast(value); + return CreateSequenceRNNOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_StridedSliceOptions: { + auto ptr = 
reinterpret_cast(value); + return CreateStridedSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ExpOptions: { + auto ptr = reinterpret_cast(value); + return CreateExpOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_TopKV2Options: { + auto ptr = reinterpret_cast(value); + return CreateTopKV2Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SplitOptions: { + auto ptr = reinterpret_cast(value); + return CreateSplitOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LogSoftmaxOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogSoftmaxOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_CastOptions: { + auto ptr = reinterpret_cast(value); + return CreateCastOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DequantizeOptions: { + auto ptr = reinterpret_cast(value); + return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_MaximumMinimumOptions: { + auto ptr = reinterpret_cast(value); + return CreateMaximumMinimumOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ArgMaxOptions: { + auto ptr = reinterpret_cast(value); + return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LessOptions: { + auto ptr = reinterpret_cast(value); + return CreateLessOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_NegOptions: { + auto ptr = reinterpret_cast(value); + return CreateNegOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_PadV2Options: { + auto ptr = reinterpret_cast(value); + return CreatePadV2Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_GreaterOptions: { + auto ptr = reinterpret_cast(value); + return CreateGreaterOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_GreaterEqualOptions: { + auto ptr = reinterpret_cast(value); + return CreateGreaterEqualOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LessEqualOptions: { + auto ptr = reinterpret_cast(value); + return CreateLessEqualOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SelectOptions: { + auto ptr = reinterpret_cast(value); + return CreateSelectOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_TransposeConvOptions: { + auto ptr = reinterpret_cast(value); + return CreateTransposeConvOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SparseToDenseOptions: { + auto ptr = reinterpret_cast(value); + return CreateSparseToDenseOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_TileOptions: { + auto ptr = reinterpret_cast(value); + return CreateTileOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ExpandDimsOptions: { + auto ptr = reinterpret_cast(value); + return CreateExpandDimsOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_EqualOptions: { + auto ptr = reinterpret_cast(value); + return CreateEqualOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_NotEqualOptions: { + auto ptr = reinterpret_cast(value); + return CreateNotEqualOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ShapeOptions: { + auto ptr = reinterpret_cast(value); + return CreateShapeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_PowOptions: { + auto ptr = reinterpret_cast(value); + return CreatePowOptions(_fbb, ptr, _rehasher).Union(); + } + case 
BuiltinOptions_ArgMinOptions: { + auto ptr = reinterpret_cast(value); + return CreateArgMinOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_FakeQuantOptions: { + auto ptr = reinterpret_cast(value); + return CreateFakeQuantOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_PackOptions: { + auto ptr = reinterpret_cast(value); + return CreatePackOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LogicalOrOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogicalOrOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_OneHotOptions: { + auto ptr = reinterpret_cast(value); + return CreateOneHotOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogicalAndOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogicalNotOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnpackOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnpackOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_FloorDivOptions: { + auto ptr = reinterpret_cast(value); + return CreateFloorDivOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SquareOptions: { + auto ptr = reinterpret_cast(value); + return CreateSquareOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ZerosLikeOptions: { + auto ptr = reinterpret_cast(value); + return CreateZerosLikeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_FillOptions: { + auto ptr = reinterpret_cast(value); + return CreateFillOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(value); + return CreateBidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BidirectionalSequenceRNNOptions: { + auto ptr = reinterpret_cast(value); + return CreateBidirectionalSequenceRNNOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_FloorModOptions: { + auto ptr = reinterpret_cast(value); + return CreateFloorModOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RangeOptions: { + auto ptr = reinterpret_cast(value); + return CreateRangeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ResizeNearestNeighborOptions: { + auto ptr = reinterpret_cast(value); + return CreateResizeNearestNeighborOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LeakyReluOptions: { + auto ptr = reinterpret_cast(value); + return CreateLeakyReluOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SquaredDifferenceOptions: { + auto ptr = reinterpret_cast(value); + return CreateSquaredDifferenceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_MirrorPadOptions: { + auto ptr = reinterpret_cast(value); + return CreateMirrorPadOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_AbsOptions: { + auto ptr = reinterpret_cast(value); + return CreateAbsOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SplitVOptions: { + auto ptr = reinterpret_cast(value); + return CreateSplitVOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UniqueOptions: { + auto ptr = reinterpret_cast(value); + return CreateUniqueOptions(_fbb, 
ptr, _rehasher).Union(); + } + case BuiltinOptions_ReverseV2Options: { + auto ptr = reinterpret_cast(value); + return CreateReverseV2Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_AddNOptions: { + auto ptr = reinterpret_cast(value); + return CreateAddNOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_GatherNdOptions: { + auto ptr = reinterpret_cast(value); + return CreateGatherNdOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_CosOptions: { + auto ptr = reinterpret_cast(value); + return CreateCosOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_WhereOptions: { + auto ptr = reinterpret_cast(value); + return CreateWhereOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RankOptions: { + auto ptr = reinterpret_cast(value); + return CreateRankOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ReverseSequenceOptions: { + auto ptr = reinterpret_cast(value); + return CreateReverseSequenceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_MatrixDiagOptions: { + auto ptr = reinterpret_cast(value); + return CreateMatrixDiagOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_QuantizeOptions: { + auto ptr = reinterpret_cast(value); + return CreateQuantizeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_MatrixSetDiagOptions: { + auto ptr = reinterpret_cast(value); + return CreateMatrixSetDiagOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_HardSwishOptions: { + auto ptr = reinterpret_cast(value); + return CreateHardSwishOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(value); + return CreateIfOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(value); + return CreateWhileOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DepthToSpaceOptions: { + auto ptr = reinterpret_cast(value); + return CreateDepthToSpaceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_NonMaxSuppressionV4Options: { + auto ptr = reinterpret_cast(value); + return CreateNonMaxSuppressionV4Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_NonMaxSuppressionV5Options: { + auto ptr = reinterpret_cast(value); + return CreateNonMaxSuppressionV5Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ScatterNdOptions: { + auto ptr = reinterpret_cast(value); + return CreateScatterNdOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SelectV2Options: { + auto ptr = reinterpret_cast(value); + return CreateSelectV2Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DensifyOptions: { + auto ptr = reinterpret_cast(value); + return CreateDensifyOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SegmentSumOptions: { + auto ptr = reinterpret_cast(value); + return CreateSegmentSumOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BatchMatMulOptions: { + auto ptr = reinterpret_cast(value); + return CreateBatchMatMulOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_CumsumOptions: { + auto ptr = reinterpret_cast(value); + return CreateCumsumOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_CallOnceOptions: { + auto ptr = reinterpret_cast(value); + return CreateCallOnceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BroadcastToOptions: { + auto ptr = reinterpret_cast(value); + return CreateBroadcastToOptions(_fbb, ptr, _rehasher).Union(); + } 
+ case BuiltinOptions_Rfft2dOptions: { + auto ptr = reinterpret_cast(value); + return CreateRfft2dOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_Conv3DOptions: { + auto ptr = reinterpret_cast(value); + return CreateConv3DOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_HashtableOptions: { + auto ptr = reinterpret_cast(value); + return CreateHashtableOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_HashtableFindOptions: { + auto ptr = reinterpret_cast(value); + return CreateHashtableFindOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_HashtableImportOptions: { + auto ptr = reinterpret_cast(value); + return CreateHashtableImportOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_HashtableSizeOptions: { + auto ptr = reinterpret_cast(value); + return CreateHashtableSizeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_VarHandleOptions: { + auto ptr = reinterpret_cast(value); + return CreateVarHandleOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ReadVariableOptions: { + auto ptr = reinterpret_cast(value); + return CreateReadVariableOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_AssignVariableOptions: { + auto ptr = reinterpret_cast(value); + return CreateAssignVariableOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RandomOptions: { + auto ptr = reinterpret_cast(value); + return CreateRandomOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BucketizeOptions: { + auto ptr = reinterpret_cast(value); + return CreateBucketizeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_GeluOptions: { + auto ptr = reinterpret_cast(value); + return CreateGeluOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateDynamicUpdateSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnsortedSegmentProdOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnsortedSegmentMaxOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnsortedSegmentMaxOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnsortedSegmentMinOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnsortedSegmentMinOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnsortedSegmentSumOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnsortedSegmentSumOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_ATan2Options: { + auto ptr = reinterpret_cast(value); + return CreateATan2Options(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_SignOptions: { + auto ptr = reinterpret_cast(value); + return CreateSignOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitcastOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + return CreateBitwiseXorOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + return CreateRightShiftOptions(_fbb, ptr, _rehasher).Union(); + } + default: return 0; + } +} + +inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) : type(u.type), value(nullptr) { + switch (type) { + case BuiltinOptions_Conv2DOptions: { + value = new 
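Editor's note: Pack, which finishes in the hunk above, is the inverse of UnPack: each `...OptionsT` object is re-serialized through its CreateXxxOptions builder and returned as a type-erased union offset. A short round-trip sketch under the same assumptions as before (standard flatc object API, placeholder header name):

```cpp
#include <cstdint>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "tflite_generated.h"  // placeholder name for this generated header

// Serialize a mutated ModelT back into a flatbuffer, preserving the file
// identifier so existing readers keep accepting the buffer.
std::vector<uint8_t> Repack(const tflite::ModelT &model) {
  ::flatbuffers::FlatBufferBuilder fbb;
  fbb.Finish(tflite::Model::Pack(fbb, &model), tflite::ModelIdentifier());
  return {fbb.GetBufferPointer(), fbb.GetBufferPointer() + fbb.GetSize()};
}
```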
tflite::Conv2DOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DepthwiseConv2DOptions: { + value = new tflite::DepthwiseConv2DOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ConcatEmbeddingsOptions: { + value = new tflite::ConcatEmbeddingsOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LSHProjectionOptions: { + value = new tflite::LSHProjectionOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_Pool2DOptions: { + value = new tflite::Pool2DOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SVDFOptions: { + value = new tflite::SVDFOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RNNOptions: { + value = new tflite::RNNOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_FullyConnectedOptions: { + value = new tflite::FullyConnectedOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SoftmaxOptions: { + value = new tflite::SoftmaxOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ConcatenationOptions: { + value = new tflite::ConcatenationOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_AddOptions: { + value = new tflite::AddOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_L2NormOptions: { + value = new tflite::L2NormOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LocalResponseNormalizationOptions: { + value = new tflite::LocalResponseNormalizationOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LSTMOptions: { + value = new tflite::LSTMOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ResizeBilinearOptions: { + value = new tflite::ResizeBilinearOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_CallOptions: { + value = new tflite::CallOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ReshapeOptions: { + value = new tflite::ReshapeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SkipGramOptions: { + value = new tflite::SkipGramOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SpaceToDepthOptions: { + value = new tflite::SpaceToDepthOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_EmbeddingLookupSparseOptions: { + value = new tflite::EmbeddingLookupSparseOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_MulOptions: { + value = new tflite::MulOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_PadOptions: { + value = new tflite::PadOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_GatherOptions: { + value = new tflite::GatherOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BatchToSpaceNDOptions: { + value = new tflite::BatchToSpaceNDOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SpaceToBatchNDOptions: { + value = new tflite::SpaceToBatchNDOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_TransposeOptions: { + value = new tflite::TransposeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ReducerOptions: { + value = new tflite::ReducerOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SubOptions: { + value = new tflite::SubOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DivOptions: { + value = new tflite::DivOptionsT(*reinterpret_cast(u.value)); + break; + } + 
case BuiltinOptions_SqueezeOptions: { + value = new tflite::SqueezeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SequenceRNNOptions: { + value = new tflite::SequenceRNNOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_StridedSliceOptions: { + value = new tflite::StridedSliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ExpOptions: { + value = new tflite::ExpOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_TopKV2Options: { + value = new tflite::TopKV2OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SplitOptions: { + value = new tflite::SplitOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LogSoftmaxOptions: { + value = new tflite::LogSoftmaxOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_CastOptions: { + value = new tflite::CastOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DequantizeOptions: { + value = new tflite::DequantizeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_MaximumMinimumOptions: { + value = new tflite::MaximumMinimumOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ArgMaxOptions: { + value = new tflite::ArgMaxOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LessOptions: { + value = new tflite::LessOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_NegOptions: { + value = new tflite::NegOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_PadV2Options: { + value = new tflite::PadV2OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_GreaterOptions: { + value = new tflite::GreaterOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_GreaterEqualOptions: { + value = new tflite::GreaterEqualOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LessEqualOptions: { + value = new tflite::LessEqualOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SelectOptions: { + value = new tflite::SelectOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SliceOptions: { + value = new tflite::SliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_TransposeConvOptions: { + value = new tflite::TransposeConvOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SparseToDenseOptions: { + value = new tflite::SparseToDenseOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_TileOptions: { + value = new tflite::TileOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ExpandDimsOptions: { + value = new tflite::ExpandDimsOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_EqualOptions: { + value = new tflite::EqualOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_NotEqualOptions: { + value = new tflite::NotEqualOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ShapeOptions: { + value = new tflite::ShapeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_PowOptions: { + value = new tflite::PowOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ArgMinOptions: { + value = new tflite::ArgMinOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_FakeQuantOptions: { + value = new tflite::FakeQuantOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_PackOptions: { + value = new 
tflite::PackOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LogicalOrOptions: { + value = new tflite::LogicalOrOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_OneHotOptions: { + value = new tflite::OneHotOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LogicalAndOptions: { + value = new tflite::LogicalAndOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LogicalNotOptions: { + value = new tflite::LogicalNotOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnpackOptions: { + value = new tflite::UnpackOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_FloorDivOptions: { + value = new tflite::FloorDivOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SquareOptions: { + value = new tflite::SquareOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ZerosLikeOptions: { + value = new tflite::ZerosLikeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_FillOptions: { + value = new tflite::FillOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BidirectionalSequenceLSTMOptions: { + value = new tflite::BidirectionalSequenceLSTMOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BidirectionalSequenceRNNOptions: { + value = new tflite::BidirectionalSequenceRNNOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnidirectionalSequenceLSTMOptions: { + value = new tflite::UnidirectionalSequenceLSTMOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_FloorModOptions: { + value = new tflite::FloorModOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RangeOptions: { + value = new tflite::RangeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ResizeNearestNeighborOptions: { + value = new tflite::ResizeNearestNeighborOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LeakyReluOptions: { + value = new tflite::LeakyReluOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SquaredDifferenceOptions: { + value = new tflite::SquaredDifferenceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_MirrorPadOptions: { + value = new tflite::MirrorPadOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_AbsOptions: { + value = new tflite::AbsOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SplitVOptions: { + value = new tflite::SplitVOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UniqueOptions: { + value = new tflite::UniqueOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ReverseV2Options: { + value = new tflite::ReverseV2OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_AddNOptions: { + value = new tflite::AddNOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_GatherNdOptions: { + value = new tflite::GatherNdOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_CosOptions: { + value = new tflite::CosOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_WhereOptions: { + value = new tflite::WhereOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RankOptions: { + value = new tflite::RankOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ReverseSequenceOptions: { + value = new 
tflite::ReverseSequenceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_MatrixDiagOptions: { + value = new tflite::MatrixDiagOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_QuantizeOptions: { + value = new tflite::QuantizeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_MatrixSetDiagOptions: { + value = new tflite::MatrixSetDiagOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_HardSwishOptions: { + value = new tflite::HardSwishOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_IfOptions: { + value = new tflite::IfOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_WhileOptions: { + value = new tflite::WhileOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DepthToSpaceOptions: { + value = new tflite::DepthToSpaceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_NonMaxSuppressionV4Options: { + value = new tflite::NonMaxSuppressionV4OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_NonMaxSuppressionV5Options: { + value = new tflite::NonMaxSuppressionV5OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ScatterNdOptions: { + value = new tflite::ScatterNdOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SelectV2Options: { + value = new tflite::SelectV2OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DensifyOptions: { + value = new tflite::DensifyOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SegmentSumOptions: { + value = new tflite::SegmentSumOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BatchMatMulOptions: { + value = new tflite::BatchMatMulOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_CumsumOptions: { + value = new tflite::CumsumOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_CallOnceOptions: { + value = new tflite::CallOnceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BroadcastToOptions: { + value = new tflite::BroadcastToOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_Rfft2dOptions: { + value = new tflite::Rfft2dOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_Conv3DOptions: { + value = new tflite::Conv3DOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_HashtableOptions: { + value = new tflite::HashtableOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_HashtableFindOptions: { + value = new tflite::HashtableFindOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_HashtableImportOptions: { + value = new tflite::HashtableImportOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_HashtableSizeOptions: { + value = new tflite::HashtableSizeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_VarHandleOptions: { + value = new tflite::VarHandleOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ReadVariableOptions: { + value = new tflite::ReadVariableOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_AssignVariableOptions: { + value = new tflite::AssignVariableOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RandomOptions: { + value = new tflite::RandomOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BucketizeOptions: { + value = new 
tflite::BucketizeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_GeluOptions: { + value = new tflite::GeluOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_DynamicUpdateSliceOptions: { + value = new tflite::DynamicUpdateSliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + value = new tflite::UnsortedSegmentProdOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnsortedSegmentMaxOptions: { + value = new tflite::UnsortedSegmentMaxOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnsortedSegmentMinOptions: { + value = new tflite::UnsortedSegmentMinOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnsortedSegmentSumOptions: { + value = new tflite::UnsortedSegmentSumOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_ATan2Options: { + value = new tflite::ATan2OptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_SignOptions: { + value = new tflite::SignOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BitcastOptions: { + value = new tflite::BitcastOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_BitwiseXorOptions: { + value = new tflite::BitwiseXorOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_RightShiftOptions: { + value = new tflite::RightShiftOptionsT(*reinterpret_cast(u.value)); + break; + } + default: + break; + } +} + +inline void BuiltinOptionsUnion::Reset() { + switch (type) { + case BuiltinOptions_Conv2DOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DepthwiseConv2DOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ConcatEmbeddingsOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LSHProjectionOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_Pool2DOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SVDFOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RNNOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_FullyConnectedOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SoftmaxOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ConcatenationOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_AddOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_L2NormOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LocalResponseNormalizationOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LSTMOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ResizeBilinearOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_CallOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ReshapeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SkipGramOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case 
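Editor's note: the copy constructor completed above gives BuiltinOptionsUnion value semantics: copying an OperatorT deep-copies whichever `...OptionsT` the union owns, and Reset() (the next function in the hunk) is what destruction and reassignment use to release it. A hedged sketch of populating the union when building an op in the object API; Set() is assumed to be the usual flatc-generated helper that resets the union, records the tag, and takes a heap-owned copy of its argument.

```cpp
#include <memory>
#include "tflite_generated.h"  // placeholder name for this generated header

// Build an ADD operator in the object API. Set() clears any previous value,
// stores BuiltinOptions_AddOptions as the tag, and copies `add` into heap
// storage owned by the union, so copies of `op` deep-copy these options.
std::unique_ptr<tflite::OperatorT> MakeAddOp() {
  auto op = std::make_unique<tflite::OperatorT>();
  tflite::AddOptionsT add;
  add.fused_activation_function = tflite::ActivationFunctionType_RELU;
  op->builtin_options.Set(add);
  return op;
}
```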
BuiltinOptions_SpaceToDepthOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_EmbeddingLookupSparseOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_MulOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_PadOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_GatherOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BatchToSpaceNDOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SpaceToBatchNDOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_TransposeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ReducerOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SubOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DivOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SqueezeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SequenceRNNOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_StridedSliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ExpOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_TopKV2Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SplitOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LogSoftmaxOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_CastOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DequantizeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_MaximumMinimumOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ArgMaxOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LessOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_NegOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_PadV2Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_GreaterOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_GreaterEqualOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LessEqualOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SelectOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_TransposeConvOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SparseToDenseOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_TileOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ExpandDimsOptions: { + auto ptr = 
reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_EqualOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_NotEqualOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ShapeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_PowOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ArgMinOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_FakeQuantOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_PackOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LogicalOrOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_OneHotOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnpackOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_FloorDivOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SquareOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ZerosLikeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_FillOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BidirectionalSequenceRNNOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnidirectionalSequenceLSTMOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_FloorModOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RangeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ResizeNearestNeighborOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LeakyReluOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SquaredDifferenceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_MirrorPadOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_AbsOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SplitVOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UniqueOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ReverseV2Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_AddNOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_GatherNdOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_CosOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_WhereOptions: { + auto ptr = reinterpret_cast(value); 
+ delete ptr; + break; + } + case BuiltinOptions_RankOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ReverseSequenceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_MatrixDiagOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_QuantizeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_MatrixSetDiagOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_HardSwishOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DepthToSpaceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_NonMaxSuppressionV4Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_NonMaxSuppressionV5Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ScatterNdOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SelectV2Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DensifyOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SegmentSumOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BatchMatMulOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_CumsumOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_CallOnceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BroadcastToOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_Rfft2dOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_Conv3DOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_HashtableOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_HashtableFindOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_HashtableImportOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_HashtableSizeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_VarHandleOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ReadVariableOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_AssignVariableOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RandomOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BucketizeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_GeluOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = 
reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnsortedSegmentMaxOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnsortedSegmentMinOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnsortedSegmentSumOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_ATan2Options: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_SignOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BitcastOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_BitwiseXorOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_RightShiftOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + default: break; + } + value = nullptr; + type = BuiltinOptions_NONE; +} + +inline bool VerifyBuiltinOptions2(::flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions2 type) { + switch (type) { + case BuiltinOptions2_NONE: { + return true; + } + case BuiltinOptions2_StablehloConcatenateOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloBroadcastInDimOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloConvolutionOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloCustomCallOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloReduceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloScatterOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloCompareOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloDynamicSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloPadOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloIotaOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloDotGeneralOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloReduceWindowOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloSortOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloWhileOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloGatherOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloTransposeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_DilateOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloRngBitGeneratorOptions: { + auto ptr = reinterpret_cast(obj); + 
return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_ReduceWindowOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StableHLOCompositeOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloShiftLeftOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions2_StablehloCaseOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyBuiltinOptions2Vector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyBuiltinOptions2( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline void *BuiltinOptions2Union::UnPack(const void *obj, BuiltinOptions2 type, const ::flatbuffers::resolver_function_t *resolver) { + (void)resolver; + switch (type) { + case BuiltinOptions2_StablehloConcatenateOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloBroadcastInDimOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloConvolutionOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloCustomCallOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloReduceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloScatterOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloCompareOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloDynamicSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloPadOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloIotaOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloDotGeneralOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloReduceWindowOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloSortOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloWhileOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloGatherOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloTransposeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_DilateOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloRngBitGeneratorOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case 
BuiltinOptions2_ReduceWindowOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StableHLOCompositeOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloShiftLeftOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions2_StablehloCaseOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + default: return nullptr; + } +} + +inline ::flatbuffers::Offset BuiltinOptions2Union::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ::flatbuffers::rehasher_function_t *_rehasher) const { + (void)_rehasher; + switch (type) { + case BuiltinOptions2_StablehloConcatenateOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloConcatenateOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloBroadcastInDimOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloBroadcastInDimOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloSliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloConvolutionOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloConvolutionOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloCustomCallOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloCustomCallOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloReduceOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloReduceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloScatterOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloScatterOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloCompareOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloCompareOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloDynamicSliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloDynamicSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloPadOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloPadOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloIotaOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloIotaOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloDotGeneralOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloDotGeneralOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloReduceWindowOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloReduceWindowOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloSortOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloSortOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloWhileOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloWhileOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloGatherOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloGatherOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloTransposeOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloTransposeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_DilateOptions: { + auto ptr = 
reinterpret_cast(value); + return CreateDilateOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloRngBitGeneratorOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloRngBitGeneratorOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_ReduceWindowOptions: { + auto ptr = reinterpret_cast(value); + return CreateReduceWindowOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StableHLOCompositeOptions: { + auto ptr = reinterpret_cast(value); + return CreateStableHLOCompositeOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloShiftLeftOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloShiftLeftOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions2_StablehloCaseOptions: { + auto ptr = reinterpret_cast(value); + return CreateStablehloCaseOptions(_fbb, ptr, _rehasher).Union(); + } + default: return 0; + } +} + +inline BuiltinOptions2Union::BuiltinOptions2Union(const BuiltinOptions2Union &u) : type(u.type), value(nullptr) { + switch (type) { + case BuiltinOptions2_StablehloConcatenateOptions: { + value = new tflite::StablehloConcatenateOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloBroadcastInDimOptions: { + value = new tflite::StablehloBroadcastInDimOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloSliceOptions: { + value = new tflite::StablehloSliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloConvolutionOptions: { + value = new tflite::StablehloConvolutionOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloCustomCallOptions: { + value = new tflite::StablehloCustomCallOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloReduceOptions: { + value = new tflite::StablehloReduceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloScatterOptions: { + value = new tflite::StablehloScatterOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloCompareOptions: { + value = new tflite::StablehloCompareOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloDynamicSliceOptions: { + value = new tflite::StablehloDynamicSliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloPadOptions: { + value = new tflite::StablehloPadOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloIotaOptions: { + value = new tflite::StablehloIotaOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloDotGeneralOptions: { + value = new tflite::StablehloDotGeneralOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloReduceWindowOptions: { + value = new tflite::StablehloReduceWindowOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloSortOptions: { + value = new tflite::StablehloSortOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloWhileOptions: { + value = new tflite::StablehloWhileOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloGatherOptions: { + value = new tflite::StablehloGatherOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloTransposeOptions: { + value = new tflite::StablehloTransposeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_DilateOptions: { + value = new 
tflite::DilateOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloRngBitGeneratorOptions: { + value = new tflite::StablehloRngBitGeneratorOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_ReduceWindowOptions: { + value = new tflite::ReduceWindowOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StableHLOCompositeOptions: { + value = new tflite::StableHLOCompositeOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloShiftLeftOptions: { + value = new tflite::StablehloShiftLeftOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions2_StablehloCaseOptions: { + value = new tflite::StablehloCaseOptionsT(*reinterpret_cast(u.value)); + break; + } + default: + break; + } +} + +inline void BuiltinOptions2Union::Reset() { + switch (type) { + case BuiltinOptions2_StablehloConcatenateOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloBroadcastInDimOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloSliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloConvolutionOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloCustomCallOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloReduceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloScatterOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloCompareOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloDynamicSliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloPadOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloIotaOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloDotGeneralOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloReduceWindowOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloSortOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloWhileOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloGatherOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloTransposeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_DilateOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloRngBitGeneratorOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_ReduceWindowOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StableHLOCompositeOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloShiftLeftOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions2_StablehloCaseOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + default: break; + } + value = 
nullptr; + type = BuiltinOptions2_NONE; +} + +inline const tflite::Model *GetModel(const void *buf) { + return ::flatbuffers::GetRoot(buf); +} + +inline const tflite::Model *GetSizePrefixedModel(const void *buf) { + return ::flatbuffers::GetSizePrefixedRoot(buf); +} + +inline const char *ModelIdentifier() { + return "TFL3"; +} + +inline bool ModelBufferHasIdentifier(const void *buf) { + return ::flatbuffers::BufferHasIdentifier( + buf, ModelIdentifier()); +} + +inline bool SizePrefixedModelBufferHasIdentifier(const void *buf) { + return ::flatbuffers::BufferHasIdentifier( + buf, ModelIdentifier(), true); +} + +inline bool VerifyModelBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(ModelIdentifier()); +} + +inline bool VerifySizePrefixedModelBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(ModelIdentifier()); +} + +inline const char *ModelExtension() { + return "tflite"; +} + +inline void FinishModelBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { + fbb.Finish(root, ModelIdentifier()); +} + +inline void FinishSizePrefixedModelBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root, ModelIdentifier()); +} + +inline std::unique_ptr UnPackModel( + const void *buf, + const ::flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr(GetModel(buf)->UnPack(res)); +} + +inline std::unique_ptr UnPackSizePrefixedModel( + const void *buf, + const ::flatbuffers::resolver_function_t *res = nullptr) { + return std::unique_ptr(GetSizePrefixedModel(buf)->UnPack(res)); +} + +} // namespace tflite + +#endif // FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_utils.h new file mode 100644 index 00000000..7498aa02 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/schema/schema_utils.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ + +#include "flatbuffers/flatbuffers.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace tflite { + +// The following methods are introduced to resolve op builtin code shortage +// problem. The new builtin operator will be assigned to the extended builtin +// code field in the flatbuffer schema. Those methods helps to hide builtin code +// details. 
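For orientation, resolvers like these are commonly implemented by reconciling the schema's legacy int8 `deprecated_builtin_code` field with the wider int32 `builtin_code` field. The sketch below assumes that two-field layout; it is not the library's actual definition (which normally lives in the accompanying schema_utils.cc, not shipped in this header-only patch).

// Sketch only: assumes OperatorCode exposes builtin_code() (int32 enum) and
// deprecated_builtin_code() (int8) accessors, as in the generated schema above.
#include <algorithm>

inline tflite::BuiltinOperator ResolveBuiltinCodeSketch(
    const tflite::OperatorCode* op_code) {
  // Older writers could only store codes up to 127 in the int8 field; newer
  // writers use the int32 field. Taking the max reads both kinds of model.
  return std::max(
      op_code->builtin_code(),
      static_cast<tflite::BuiltinOperator>(op_code->deprecated_builtin_code()));
}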
+BuiltinOperator GetBuiltinCode(const OperatorCode *op_code); + +BuiltinOperator GetBuiltinCode(const OperatorCodeT *op_code); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h new file mode 100644 index 00000000..4fa1b5e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_SPARSITY_SPARSIFY_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_SPARSITY_SPARSIFY_MODEL_H_ + +#include "absl/status/status.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace mlir { +namespace lite { + +// Sparsify the `input_model` and write the result to a flatbuffer `builder`. +absl::Status SparsifyModel(const tflite::ModelT& input_model, + flatbuffers::FlatBufferBuilder* builder); +} // namespace lite +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_SPARSITY_SPARSIFY_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h new file mode 100644 index 00000000..6f3d2d55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h @@ -0,0 +1,26 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_ + +namespace mlir::odml { + +// Populates the pattern set with all folding patterns. These patterns +// are intended to have precedence over any other patterns added to the set. 
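A minimal caller sketch (assumed MLIR boilerplate, not part of this header) showing how the populated set would typically be driven:

// Hypothetical driver; assumes the standard greedy rewrite entry point.
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

void RunOdmlFolders(mlir::ModuleOp module) {
  mlir::RewritePatternSet patterns(module.getContext());
  mlir::odml::PopulateFolderPatterns(patterns);  // declared just below
  // Callers may append their own patterns to the same set before applying.
  (void)mlir::applyPatternsAndFoldGreedily(module, std::move(patterns));
}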
+void PopulateFolderPatterns(RewritePatternSet &patternSet); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h new file mode 100644 index 00000000..bb0c02cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir::odml { + +std::unique_ptr> CreateSHLOSimplifyPass(); + +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h.inc" + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/check_accepted_ops_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/check_accepted_ops_pass.h new file mode 100644 index 00000000..c6461d81 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/check_accepted_ops_pass.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_CHECK_DIALECTS_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_CHECK_DIALECTS_PASS_H_ + +#include +#include +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Creates a pass which checks if there exists allowed dialect ops only or not. +// Based on the list of dialect and op names, it signals failure or not. +// If some ops are in the `optional_accepted_dialects`, then it warns them. 
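As a hedged usage sketch (the pass-manager plumbing and the dialect name are illustrative, not taken from this patch):

// Hypothetical pipeline snippet.
#include "mlir/Pass/PassManager.h"

void AddAcceptanceCheck(mlir::PassManager& pm) {
  // Ops from dialects in the optional list only produce warnings; any other
  // op outside the accepted set makes the pass signal failure.
  pm.addPass(mlir::odml::createCheckAcceptedOpsPass(
      /*optional_accepted_dialects=*/{"chlo"}));
}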
+std::unique_ptr createCheckAcceptedOpsPass( + const std::vector &optional_accepted_dialects = {}); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_CHECK_DIALECTS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h new file mode 100644 index 00000000..2afa2066 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h @@ -0,0 +1,55 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir { +namespace odml { + +// Given a Composite op that wraps a core.aten.avg_pool2d, returns the padding +// configuration required for the `tfl.pad` if the padding part of the op is +// to be done before average pooling. +DenseIntElementsAttr GetPadOpAttr(Builder& builder, mhlo::CompositeOp op); + +// Given a Composite op that wraps a core.aten.avg_pool2d, and assuming that +// the padding part is extracted into a tfl.pad op prior to a +// tfl.average_pool_2d, this function finds the return type of the needed +// tfl.pad . +ShapedType GetPadOpType(mhlo::CompositeOp op); + +// Given a Composite op that wraps a core.aten.avg_pool2d, finds the padding +// attribute to be passed to the a tfl.average_pool_2d that can fully replace +// this composite (here, padding is done directly by the tfl.average_pool_2d as +// opposed to being extracted into a separate tfl.pad). +StringAttr GetAvgPoolOpPadAttr(Builder& builder, mhlo::CompositeOp op); + +// Get dense attr for a matrix that corrects the over counting of divisors when +// casting an average pool with ceil mode on in terms of average pool with it +// off. 
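To make the division of labor concrete, a hedged sketch of a rewrite consulting these helpers for a composite that wraps core.aten.avg_pool2d; the surrounding pattern plumbing is assumed and no TFL ops are built here:

// Illustration only; all four helpers are declared in this header.
void InspectAvgPoolComposite(mlir::Builder& builder, mlir::mhlo::CompositeOp op) {
  // Path 1: peel padding into an explicit tfl.pad feeding tfl.average_pool_2d.
  mlir::DenseIntElementsAttr pad_cfg = mlir::odml::GetPadOpAttr(builder, op);
  mlir::ShapedType padded_type = mlir::odml::GetPadOpType(op);
  // Path 2: let tfl.average_pool_2d handle padding via its own padding attribute.
  mlir::StringAttr pool_padding = mlir::odml::GetAvgPoolOpPadAttr(builder, op);
  // With ceil mode on, divisor over-counting is compensated by this matrix
  // (declared immediately below).
  mlir::DenseFPElementsAttr correction = mlir::odml::GetCorrectionMatrix(builder, op);
  (void)pad_cfg; (void)padded_type; (void)pool_padding; (void)correction;
}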
+DenseFPElementsAttr GetCorrectionMatrix(Builder& builder, mhlo::CompositeOp op); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h new file mode 100644 index 00000000..0bb758ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h @@ -0,0 +1,27 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_ + +namespace mlir { +namespace odml { + +std::unique_ptr CreateCompositeLoweringPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h new file mode 100644 index 00000000..fbd131bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h @@ -0,0 +1,84 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ + +#include +#include +#include +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir { +namespace odml { + +// Ensure an attribute named attr_name exists and it is of type AttrType. +// If so, sets the `out_attr` pointer to point to the casted attribute. +template +bool EnsureAttribute(const DictionaryAttr& composite_attributes, + const std::string& attr_name, AttrType* out_attr) { + Attribute attr = composite_attributes.get(attr_name); + if (!mlir::isa_and_nonnull(attr)) { + return false; + } + if (AttrType content = mlir::dyn_cast(attr)) { + *out_attr = content; + return true; + } else { + return false; + } +} + +// Changes a DenseIntElementsAttr **containing I64** elements to an I32 Vector. +bool DenseI64AttrToI32Vector(const DenseIntElementsAttr& dense_attr, + std::vector* out_vec); + +// Gets boolean from composite attrs if it exists. +std::optional GetBoolFromCompositeAttr( + const DictionaryAttr& composite_attrs, llvm::StringRef attr_name); + +// Given a DictionaryAttr, checks if it has a DenseIntElementsAttr attribute +// with the name attr_name. If so, extracts its values and stores as a vector +// of int32_t elements. +// Note: This assumes the DenseIntElementsAttr has its values stored as int64_t. +bool GetI32VectorFromDenseI64CompositeAttr( + const DictionaryAttr& composite_attrs, const std::string& attr_name, + std::vector* out_vec); + +// Get a DenseIntElementsAttr of type I64 and convert it to an I32 attribute. +DenseIntElementsAttr DenseI64AttrToI32Attr( + const DenseIntElementsAttr& dense_attr, PatternRewriter& builder); + +// Returns a NHWC shaped type from an NCHW shaped type op. +// For example- Given a Composite op that wraps a core.aten.avg_pool2d, this +// returns the return type of the tfl.average_pool_2d emitted. Note that the +// aten.avg_pool2d works with the NCHW layout while tfl.average_pool_2d assumes +// NHWC. +ShapedType GetNhwcReturnTypeFromNchw(Operation* old_op); + +} // namespace odml + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/drop_savedmodel_semantics.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/drop_savedmodel_semantics.h new file mode 100644 index 00000000..444a3c46 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/drop_savedmodel_semantics.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_DROP_SAVEDMODEL_SEMANTICS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_DROP_SAVEDMODEL_SEMANTICS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +std::unique_ptr CreateDropSavedModelSemanticsPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_DROP_SAVEDMODEL_SEMANTICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.h new file mode 100644 index 00000000..ff91176a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.h @@ -0,0 +1,35 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_HLO_MATCHERS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_HLO_MATCHERS_H_ + +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project + +namespace mlir { +namespace odml { +// The following 5 different forms of mhlo::iota will be matched: +// 1. IotaOp. +// 2. IotaOp + BroadCastInDim. +// 3. IotaOp + Reshape. +// 4. Constant (folded Iota) + BroadCastInDim. +// 5. Constant (folded result). +// Moreover, the dimensions has to match the iota_dimension. +bool MatchIota(DenseIntElementsAttr dimensions, Value iota); +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_HLO_MATCHERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv.h new file mode 100644 index 00000000..0f741d9c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv.h @@ -0,0 +1,59 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +// Prepares mhlo.convolutions and legalizes to the corresponding tfl op. +// +// Note: "tfl-native" layouts are as follows: +// 2D : [b, 0, 1, f]x[o, 0, 1, i]->[b, 0, 1, f] +// 3D : [b, 0, 1, 2, f]x[0, 1, 2, i, o]->[b, 0, 1, 2, f] +// 2D (depthwise) : [b, 0, 1, f]x[i, 0, 1, o]->[b, 0, 1, f] +// +// Matches: mhlo.convolution +// layout: any (will transpose to tfl-native) +// padding: any (will pull into explicit pad_op) +// lhs_dilations: trivial (all 1) +// rhs_dilations: any +// strides: any +// feature_group: see decision tree below +// batch_group: trivial (1) +// reversal: trivial (all False) +// shape: static, rank 4 or 5 +// +// This pattern emits TFL convs based on the following decision tree: +// if lhs_dilations are trivial && kernel_out_features == output_features +// if feature_group == 1: +// if rank == 5: tfl.conv_3D +// if rank == 4: tfl.conv_2D +// else if input_features == feature_group: +// if rank == 4: tfl.depthwise_conv TODO: b/352954597 - Add support. +// else: +// if rank == 4: tfl.conv_2D +// else: +// tfl.transpose_conv TODO: b/352954597 - Add support. +void PopulateLegalizeConvPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +void PopulatePrepareConvPatterns(MLIRContext* ctx, RewritePatternSet& patterns); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.h new file mode 100644 index 00000000..fe9664c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.h @@ -0,0 +1,298 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_UTIL_H_ + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/op_util_common.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +// Helpers for working with mhlo.convolution attrs in the mlir api as +// native cc types. + +namespace mlir::odml { + +class ConvView { + public: + // int for each spatial dim. Default 1. + llvm::ArrayRef Strides() const { return strides_; } + + // 2d array for each spatial dim. Default 0. + llvm::ArrayRef Padding() const { return padding_; } + + int64_t BatchGroupCount() const { return batch_group_count_; } + + int64_t FeatureGroupCount() const { return feature_group_count_; } + + // int for each spatial dim. Default 1. + llvm::ArrayRef InputDilations() const { return input_dilations_; } + + // int for each spatial dim. Default 1. + llvm::ArrayRef KernelDilations() const { return kernel_dilations_; } + + // bool for each spatial dim. Default false. + llvm::ArrayRef WindowReversal() const { return window_reversal_; } + + llvm::ArrayRef InputShape() const { return input_shape_; } + + const Layout& InputLayout() const { return input_layout_; } + + llvm::ArrayRef KernelShape() const { return kernel_shape_; } + + const Layout& KernelLayout() const { return kernel_layout_; } + + llvm::ArrayRef OutputShape() const { return output_shape_; } + + const Layout& OutputLayout() const { return output_layout_; } + + mlir::Type ElementType() const { return element_type_; } + + explicit ConvView(mhlo::ConvolutionOp op); + + private: + llvm::SmallVector strides_; + + llvm::SmallVector padding_; + + llvm::SmallVector input_dilations_; + llvm::SmallVector kernel_dilations_; + + llvm::SmallVector window_reversal_; + + Layout input_layout_; + Layout kernel_layout_; + Layout output_layout_; + + llvm::SmallVector input_shape_; + llvm::SmallVector kernel_shape_; + llvm::SmallVector output_shape_; + + int64_t batch_group_count_; + int64_t feature_group_count_; + + mlir::Type element_type_; +}; + +inline bool HasSupportedRank(const ConvView& data) { + return data.InputLayout().Rank() == 4 || data.InputLayout().Rank() == 5; +} + +inline bool HasSupportedOutFeatureDims(const ConvView& data) { + const int64_t kernel_out_features = + data.KernelLayout().SpecialDim2(data.KernelShape()); + const int64_t out_features = + data.OutputLayout().SpecialDim2(data.OutputShape()); + return kernel_out_features == out_features; +} + +inline bool IsTrivialConv(const ConvView& data) { + return llvm::all_of(data.InputDilations(), [](auto d) { return d == 1; }); +} + +// +// Supported non-trivial conv predicates +//=----- + +bool MatchWithResizeBilinearOp(const ConvView& data, bool& align_corners); + +inline bool MatchWithResizeBilinearOp(const ConvView& data) { + bool align_corners = false; + return MatchWithResizeBilinearOp(data, align_corners); +} + +bool IsTransposeConvPaddingValid(mhlo::ConvolutionOp conv_op, + size_t num_spatial_dims, + const ArrayRef& strides, + const ArrayRef& padding); + 
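To show how the view and the predicates above compose (mirroring the decision tree documented in conv.h), a hypothetical classification helper that is not part of the header:

// Illustration only; the real patterns use the finer-grained predicates that follow.
inline const char* ClassifyConvSketch(mhlo::ConvolutionOp op) {
  const ConvView data(op);
  if (!HasSupportedRank(data)) return "unsupported rank";
  if (!HasSupportedOutFeatureDims(data)) return "kernel/output feature dims disagree";
  if (!IsTrivialConv(data)) return "non-trivial conv (input dilations present)";
  return data.FeatureGroupCount() == 1 ? "standard conv candidate"
                                       : "grouped or depthwise conv candidate";
}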
+bool IsTransposeConvPaddingSame(mhlo::ConvolutionOp conv_op, + size_t num_spatial_dims, + const ArrayRef& strides, + const ArrayRef& padding); + +inline bool IsSupportedNonTrivialConv(const ConvView& data) { + // Only non-trivial 2d convolutions are supported. + const bool valid_rank = data.InputLayout().Rank() == 4; + + // Negative padding is unsupported. + bool has_nagative_padding = llvm::all_of( + data.Padding(), + [](const DimPadding& p) { return p.Hi() < 0 || p.Lo() < 0; }); + + return (valid_rank && !IsTrivialConv(data) && !has_nagative_padding); +} + +inline bool IsSupportedNonTrivialConv(mhlo::ConvolutionOp op) { + const ConvView data(op); + return IsSupportedNonTrivialConv(data); +} + +// +// Standard conv predicates +//=----- + +inline bool HasStandardConvInFeatureDims(const ConvView& data) { + // kernel_in_features * feature_groups = input_features by definition. + const int64_t input_features = + data.InputLayout().SpecialDim2(data.InputShape()); + + const bool trivial_kernel_in_features = + data.FeatureGroupCount() == input_features; + const bool is_grouped_conv = data.FeatureGroupCount() != 1; + + const int64_t rank = data.InputLayout().Rank(); + return !trivial_kernel_in_features && (!is_grouped_conv || rank == 4); +} + +inline bool IsStandardConv(const ConvView& data) { + return HasSupportedRank(data) && IsTrivialConv(data) && + HasStandardConvInFeatureDims(data) && HasSupportedOutFeatureDims(data); +} + +// Does this convolution map to a standard conv_2d or conv_3d +// (not depthwise or tranpose conv)? +inline bool IsStandardConv(mhlo::ConvolutionOp op) { + const ConvView data(op); + return IsStandardConv(data); +} + +// +// Depthwise conv predicates +//=----- + +inline bool IsDepthwiseConv(const ConvView& data) { + const bool valid_rank = data.InputLayout().Rank() == 4; + if (!valid_rank || !HasSupportedOutFeatureDims(data) || + !IsTrivialConv(data)) { + return false; + } + const int64_t in_channel_dim = + data.InputLayout().SpecialDim2(data.InputShape()); + return data.FeatureGroupCount() == in_channel_dim; +} + +// Does this convolution map to depthwise conv? 
+inline bool IsDepthwiseConv(mhlo::ConvolutionOp op) { + const ConvView data(op); + return IsDepthwiseConv(data); +} + +// +// Tfl native layouts +//=----- + +inline int64_t DnumRank(mhlo::ConvDimensionNumbersAttr dnums) { + return dnums.getInputSpatialDimensions().size() + 2; +} + +inline Layout GetTFLNativeInputOrOutputLayout(int64_t rank) { + auto spatials = llvm::to_vector(llvm::seq(1, rank - 1)); + return Layout(0, rank - 1, spatials); +} + +inline Layout GetTFLNativeInputOrOutputLayout( + mhlo::ConvDimensionNumbersAttr dnums) { + return GetTFLNativeInputOrOutputLayout((DnumRank(dnums))); +} + +inline Layout GetTFLNativeStandardConvKernelLayout(int64_t rank) { + if (rank != 5) { + auto spatials = llvm::to_vector(llvm::seq(1, rank - 1)); + return Layout(rank - 1, 0, spatials); + } + auto spatials = llvm::to_vector(llvm::seq(rank - 2)); + return Layout(rank - 2, rank - 1, spatials); +} + +inline Layout GetTFLNativeDepthwiseConvKernelLayout() { + return Layout(0, 3, {1, 2}); +} + +inline Layout GetTFLNativeStandardConvKernelLayout( + mhlo::ConvDimensionNumbersAttr dnums) { + return GetTFLNativeStandardConvKernelLayout(DnumRank(dnums)); +} + +inline bool IsTFLNativeLayout(const ConvView& data) { + const int64_t rank = data.KernelLayout().Rank(); + const auto native_io_layout = GetTFLNativeInputOrOutputLayout(rank); + + std::optional native_kernel_layout = std::nullopt; + if (IsDepthwiseConv(data)) { + native_kernel_layout = GetTFLNativeDepthwiseConvKernelLayout(); + } else if (IsStandardConv(data) || IsSupportedNonTrivialConv(data)) { + native_kernel_layout = GetTFLNativeStandardConvKernelLayout(rank); + } + if (!native_kernel_layout.has_value()) { + return false; + } + + return data.InputLayout() == native_io_layout && + data.KernelLayout() == *native_kernel_layout && + data.OutputLayout() == native_io_layout; +} + +// +// ConvDimensionNumbers utils +//=----- + +inline mhlo::ConvDimensionNumbersAttr CloneDnumsWithInputLayout( + OpBuilder& b, mhlo::ConvDimensionNumbersAttr dnums, const Layout& layout) { + return mhlo::ConvDimensionNumbersAttr::get( + b.getContext(), layout.SpecialDim1(), layout.SpecialDim2(), + layout.Spatials(), dnums.getKernelInputFeatureDimension(), + dnums.getKernelOutputFeatureDimension(), + dnums.getKernelSpatialDimensions(), dnums.getOutputBatchDimension(), + dnums.getOutputFeatureDimension(), dnums.getOutputSpatialDimensions()); +} + +inline mhlo::ConvDimensionNumbersAttr CloneDnumsWithKernelLayout( + OpBuilder& b, mhlo::ConvDimensionNumbersAttr dnums, const Layout& layout) { + return mhlo::ConvDimensionNumbersAttr::get( + b.getContext(), dnums.getInputBatchDimension(), + dnums.getInputFeatureDimension(), dnums.getInputSpatialDimensions(), + layout.SpecialDim1(), layout.SpecialDim2(), layout.Spatials(), + dnums.getOutputBatchDimension(), dnums.getOutputFeatureDimension(), + dnums.getOutputSpatialDimensions()); +} + +inline mhlo::ConvDimensionNumbersAttr CloneDnumsWithOutputLayout( + OpBuilder& b, mhlo::ConvDimensionNumbersAttr dnums, const Layout& layout) { + return mhlo::ConvDimensionNumbersAttr::get( + b.getContext(), dnums.getInputBatchDimension(), + dnums.getInputFeatureDimension(), dnums.getInputSpatialDimensions(), + dnums.getKernelInputFeatureDimension(), + dnums.getKernelOutputFeatureDimension(), + dnums.getKernelSpatialDimensions(), layout.SpecialDim1(), + layout.SpecialDim2(), layout.Spatials()); +} + +// Wraps the lhs of given conv op in an explicit pad op matching the same +// behavior implicit in the paddings attribute. Gets result of new pad op. 
+Value CreatePadOpFromConvPadding(OpBuilder& b, mhlo::ConvolutionOp op); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CONV_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h new file mode 100644 index 00000000..c7c3bdde --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CUSTOM_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CUSTOM_CALL_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace odml { + +void PopulateCustomCallPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +void PopulateCustomCallPreparePatterns(MLIRContext* ctx, + RewritePatternSet& patterns); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CUSTOM_CALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h new file mode 100644 index 00000000..91df1b63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Legalize mhlo.dot_general to tflite.batch_matmul. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_DOT_GENERAL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_DOT_GENERAL_H_ + +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { +// Converts mhlo.dot_general to tfl.BatchMatMul. Reshape and Transpose ops will +// be inserted to convert to well-formed matrix multiply; i.e., mhlo.dot_general +// -> tfl.batch_matmul(mhlo.transpose(mhlo.reshape(operand)), ...). +// Note: +// 1) Reshape/transpose are inserted because tfl.BatchMatMul requires +// size(contracting_dimensions) = 1 and size(output_dim) = 1, whereas +// mhlo.dot_general has no such restriction. +// 2) Inserted mhlo.reshape/transpose will be legalized to tf.reshape/transpose +// in LegalizeHloToTf (then from tf to tfl later). +// 3) If the operands are dynamic shaped tensors, mhlo.DynamicReshapeOp is +// inserted instead of the regular reshape, and additional ops (e.g. Gather, +// Concat ) are inserted for shape inference purposes. +// 4) All the DotOp are converted to DotGeneral during the optimization pass +// (ConvertDotOp). +class LowerDotGeneralOp : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::DotGeneralOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const final; +}; +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_DOT_GENERAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.h new file mode 100644 index 00000000..0c9cf35f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.h @@ -0,0 +1,33 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_FFT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_FFT_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +// Patterns to legalize mhlo.fft to TFL. +void PopulateLegalizeFftPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +// Patterns to prepare mhlo.fft to TFL. 
+void PopulatePrepareFftPatterns(MLIRContext* ctx, RewritePatternSet& patterns); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_FFT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.h new file mode 100644 index 00000000..35a36613 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.h @@ -0,0 +1,31 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GATHER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GATHER_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +// Patterns to legalize mhlo.gather to TFL +// +// Emits: tfl.gather_nd or a combination of tfl.slice, tfl.squeeze, tfl.concat +void PopulateGatherPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GATHER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.h new file mode 100644 index 00000000..6dfc67e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.h @@ -0,0 +1,47 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GELU_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GELU_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir::odml { + +// Matches non-approximate GELU patterns. +// +// -> mul 1/sqrt(2) -> erf -> add 1 -> +// in mul +// ---------> mul 0.5 ---------------> +// +// This pattern assumes all binary ewise ops with one constant argument +// have that constant argument as the second operand. It works by +// identifying `erf` ops and validate the structure around them. +class LowerGELU : public RewritePattern { + public: + explicit LowerGELU(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GELU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/get_dimension_size.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/get_dimension_size.h new file mode 100644 index 00000000..6cd63730 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/get_dimension_size.h @@ -0,0 +1,30 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GET_DIMENSION_SIZE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GET_DIMENSION_SIZE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +void PopulateGetDimensionSizePatterns(MLIRContext* ctx, + RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_GET_DIMENSION_SIZE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.h new file mode 100644 index 00000000..459aabf9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.h @@ -0,0 +1,30 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IF_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IF_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { + +// Patterns to legalize mhlo.if to TFL. +void PopulateIfPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/iota.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/iota.h new file mode 100644 index 00000000..7d4f76bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/iota.h @@ -0,0 +1,29 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IOTA_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IOTA_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +void PopulateIotaPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_IOTA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/op_util_common.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/op_util_common.h new file mode 100644 index 00000000..9b0e19aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/op_util_common.h @@ -0,0 +1,148 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_OP_UTIL_COMMON_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_OP_UTIL_COMMON_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir::odml { + +// Class that encodes the "layout" of a tensor. Layouts, generically +// are some naming of the dimensions of a tensor. In all cases, 2 dimensions +// are "special" (e.g. batch / feature) and the rest are referred to as "spatial +// dims". When the special dims are batch and feature, batch is special dim 1 +// and feature is special dim 2. When special dims are input and output features +// (conv filter), input features is special dim 1 and output features is special +// dim 2. +class Layout { + public: + llvm::ArrayRef Spatials() const { return spatials_; } + + int64_t NumSpatials() const { return spatials_.size(); } + + int64_t Rank() const { return NumSpatials() + 2; } + + Layout(int64_t special_dim1, int64_t special_dim2, ArrayRef spatials) + : special_dim1_(special_dim1), + special_dim2_(special_dim2), + spatials_(spatials) {} + + // TODO: b/351437662 - Consider just using 2 arrays for the case where + // there are more than 2 special dims. + int64_t SpecialDim1() const { return special_dim1_; } + + // Conveniance accesor for getting the dimension size of the first + // special dimension from a shape. 
+ int64_t SpecialDim1(llvm::ArrayRef shape) const { + return shape[special_dim1_]; + } + + int64_t SpecialDim2() const { return special_dim2_; } + + // Convenience accesor for getting the dimension size of the second + // special dimension from a shape. + int64_t SpecialDim2(llvm::ArrayRef shape) const { + return shape[special_dim2_]; + } + + // Conveniance method for equality checking special dims. + bool HasSpecialDims(int64_t special_dim1, int64_t special_dim2) const; + + // Determines if the spatial dimensions are all adjacent and in + // ascending order. + bool AreSpatialsIota() const; + + // Gets a "permutation array" to be used for transposing a tensor + // of "this" layout to the given layout. A permutation array is some + // permutation of [0, 1, i...] for i < rank(layout). Assumes + // "this" and given layout have the same rank. + llvm::SmallVector GetPermForReLayout( + const Layout& to_layout) const; + + // Permutes given shape based on the permutaion implied to take this Layout to + // the given one. + llvm::SmallVector PermuteShape(const Layout& to_layout, + ArrayRef shape) const; + + bool operator==(const Layout& other) const { + return SpecialDim1() == other.SpecialDim1() && + SpecialDim2() == other.SpecialDim2() && + Spatials() == other.Spatials(); + } + + bool operator!=(const Layout& other) const { return !(*this == other); } + + private: + int64_t special_dim1_; + int64_t special_dim2_; + llvm::SmallVector spatials_; +}; + +// Wrapper for the padding attrs along a single dimension. +class DimPadding { + public: + int64_t Hi() const { return hi_; } + + int64_t Lo() const { return lo_; } + + bool Trivial() const { return Hi() == 0 && Lo() == 0; } + + DimPadding(int64_t lo, int64_t hi) : lo_(lo), hi_(hi) {} + + private: + int64_t lo_; + int64_t hi_; +}; + +inline llvm::SmallVector UnrollI64Splat(DenseElementsAttr data) { + if (!data.isSplat()) { + return llvm::SmallVector(data.getValues()); + } + return llvm::SmallVector(data.getType().getNumElements(), + data.getSplatValue()); +} + +// Resolves optional strides or dilations attributes. If not present, +// will return trivial 1's vector. +llvm::SmallVector ResolveStridesOrDilations( + int64_t rank, std::optional opt_attr); + +// Resolves optional paddings attributes. If not present, will return +// trivial [0, 0] paddings on each dim. +llvm::SmallVector ResolvePadding( + int64_t rank, std::optional opt_padding); + +// Does the padding correspond to "SAME" on given dimension configuration. +// Assumes given dimension configuration is well formed. +bool IsSamePaddingOnDim(int64_t in, int64_t dilate, int64_t stride, int64_t k, + const DimPadding& pad); + +template +inline DenseElementsAttr BuildScalarDense(Type e_type, T val) { + auto type = RankedTensorType::get({}, e_type); + return DenseElementsAttr::get(type, val); +} + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_OP_UTIL_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h new file mode 100644 index 00000000..a9c0940b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h @@ -0,0 +1,39 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { + +// Patterns to legalize mhlo.pad to TFL +// +// Prefers tfl.pad over tfl.padv2 when it can be asserted that the pad +// values are zero. +// +// Matches: mhlo.pad +// padding_high/low: all positive or zero +//. interior_padding: all zero +// +// Emits: tfl.pad, tfl.padv2 +void PopulatePadPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad_util.h new file mode 100644 index 00000000..50419039 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad_util.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_UTIL_H_ + +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { + +// Gets elements corresponding to slice starts from negative padding +// values. +DenseIntElementsAttr SliceStartFromNegPadLows(mhlo::PadOp op); + +// Gets elements corresponding to slice ends from negative padding +// values. +DenseIntElementsAttr SliceEndFromNegPadHighs(mhlo::PadOp op); + +// Gets a copy of `data` with negative values replaced with 0. 
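
// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of this patch. pad.h
// above prefers tfl.pad when the edge padding is non-negative, the interior
// padding is zero, and the pad value is provably zero; pad_util.h handles
// negative edge padding by splitting it into a non-negative pad followed by a
// slice (cf. SliceStartFromNegPadLows / SliceEndFromNegPadHighs above). The
// standalone snippet below shows that split for a single dimension;
// `PadThenSlice` and `SplitNegativePad` are hypothetical names.
#include <algorithm>
#include <cstdint>
#include <iostream>

// For one dimension with possibly negative edge padding (and zero interior
// padding), emit
//   1) a non-negative pad:     lo_pad = max(lo, 0), hi_pad = max(hi, 0)
//   2) a slice of the result:  start  = max(-lo, 0), size = in + lo + hi
// so that pad(lo, hi) == slice(pad(lo_pad, hi_pad)).
struct PadThenSlice {
  int64_t lo_pad, hi_pad;  // non-negative padding (tfl.pad / tfl.padv2)
  int64_t start, size;     // slice emitted when any padding was negative
};

PadThenSlice SplitNegativePad(int64_t in, int64_t lo, int64_t hi) {
  PadThenSlice r;
  r.lo_pad = std::max<int64_t>(lo, 0);
  r.hi_pad = std::max<int64_t>(hi, 0);
  r.start = std::max<int64_t>(-lo, 0);
  r.size = in + lo + hi;  // final extent; assumes in + lo + hi >= 0
  return r;
}

int main() {
  // in = 8, lo = -2 (drop two leading elements), hi = 3 (append three pads).
  PadThenSlice r = SplitNegativePad(8, -2, 3);
  std::cout << r.lo_pad << ' ' << r.hi_pad << ' '  // 0 3
            << r.start << ' ' << r.size << '\n';   // 2 9
  return 0;
}
// ---------------------------------------------------------------------------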
+DenseIntElementsAttr ReplaceNegsWithZero(DenseElementsAttr data); + +bool AnyNegativePads(mhlo::PadOp op); + +bool TrivialInterior(mhlo::PadOp op); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h new file mode 100644 index 00000000..3bf03aec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep + +namespace mlir { +namespace odml { + +void PopulateReduceArgMinMaxTFPatterns(MLIRContext* ctx, + RewritePatternSet& patterns); + +void PopulateReducePatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.h new file mode 100644 index 00000000..ccc9c27f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.h @@ -0,0 +1,45 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +// Patterns to legalize mhlo.reduce_window to TFL. +// +// Maps the following representations of AvgPool in MHLO into a tfl.avg_pool +// operation when they cleanly map to 2D or 3D average pool with VALID or SAME +// padding: +// * div(reduce_sum_window(x), constant(sizeof(window))) +// * div(reduce_sum_window(x), reduce_sum_window(constant(1))) +// +// Emits: tfl.average_pool2d +void PopulateLegalizeReduceWindowPatterns(MLIRContext* ctx, + RewritePatternSet& patterns, + ConversionTarget& target); + +// Patterns to prepare mhlo.reduce_window for legalization. +// Transposes reduce_windows to be NHWC. +// +// Emits: tfl.transpose +void PopulatePrepareReduceWindowPatterns(MLIRContext* ctx, + RewritePatternSet& patterns); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window_util.h new file mode 100644 index 00000000..69834345 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window_util.h @@ -0,0 +1,63 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_UTIL_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/op_util_common.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +// Helpers for working with mhlo.reduce_window attrs in the mlir api as +// native cc types. 
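
// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of this patch.
// reduce_window.h above recognizes AvgPool as div(reduce_sum_window(x), c),
// where c is either the constant window size (VALID padding) or
// reduce_sum_window(ones) (SAME padding, so border windows are divided by the
// number of in-bounds elements only). The standalone 1-D snippet below mirrors
// that formulation; `AvgPool1D` is a hypothetical name for the example.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<double> AvgPool1D(const std::vector<double>& x, int64_t window,
                              int64_t stride, int64_t pad_lo, int64_t pad_hi) {
  const int64_t n = static_cast<int64_t>(x.size());
  const int64_t out = (n + pad_lo + pad_hi - window) / stride + 1;
  std::vector<double> result(out);
  for (int64_t o = 0; o < out; ++o) {
    double sum = 0.0;    // reduce_sum_window(x)
    double count = 0.0;  // reduce_sum_window(constant(1)) over the same window
    for (int64_t k = 0; k < window; ++k) {
      const int64_t i = o * stride - pad_lo + k;
      if (i >= 0 && i < n) {
        sum += x[i];
        count += 1.0;
      }
    }
    result[o] = sum / count;
  }
  return result;
}

int main() {
  // Window 3, stride 1, SAME-style padding of 1 on each side of 4 elements.
  for (double v : AvgPool1D({1, 2, 3, 4}, 3, 1, 1, 1)) std::cout << v << ' ';
  std::cout << '\n';  // 1.5 2 3 3.5
  return 0;
}
// ---------------------------------------------------------------------------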
+ +namespace mlir::odml { + +class ReduceWindowView { + public: + explicit ReduceWindowView(mhlo::ReduceWindowOp op); + + llvm::ArrayRef WindowDims() const { return window_dims_; } + int64_t WindowSize() const { return window_size_; } + llvm::ArrayRef WindowStrides() const { return window_strides_; } + llvm::ArrayRef Paddings() const { return paddings_; } + llvm::ArrayRef WindowDilations() const { return window_dilations_; } + llvm::ArrayRef BaseDilations() const { return base_dilations_; } + int64_t Rank() const { return rank_; } + + std::optional GuessLayout() const; + + private: + int64_t rank_; + + llvm::SmallVector window_dims_; + llvm::SmallVector window_strides_; + llvm::SmallVector window_dilations_; + + llvm::SmallVector paddings_; + + llvm::SmallVector base_dilations_; + + int64_t window_size_; +}; + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_REDUCE_WINDOW_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h new file mode 100644 index 00000000..a7363c68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h @@ -0,0 +1,109 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SCATTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SCATTER_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { + +// Convert updates into canonical form as expected by tf.scatter ops. +// +// tf.scatter expects `update_window_dims` to be the trailing dimensions. +// +// To support scatter ops generated by numpy-like slice updates: +// nd_array[:, [i,j]] = [i_values, j_values] +// +// `updates` must be transposed when the update_window_dims are the leading +// dimensions of `updates`. +// +// Other values of `update_window_dims` are left unsupported. +// +// Eg 1. 
An update in canonical form: +// * indices shape(A,B,C) +// * updates shape(A,B,D,E,F) +// Then: +// * D,E,F are the update window dims [2,3,4] +// * C is the index vector dimension +// * A,B iterate over the updates and indices +// +// If `update_window_dims` are not the trailing dimensions then updates must be +// transposed. +// +// Eg 2. An update in non-canonical form: +// * indices shape(a,b,c) +// * updates shape(d,e,f,a,b) +// Then: +// * d,e,f are the update window dims [0,1,2] +// * c is the index vector dimension +// * a,b iterate over the updates and indices +// +// The update needs permuting to be in the form (a,b,d,e,f) so that the update +// window dims are the trailing dimensions. +// +// To canonicalize the updates above, replace the updates with: +// transpose(updates, permutation={3,4,0,1,2}) +// +// Note: NormalizeIndexVector is assumed to have run on the indices already so +// that the index_vector_dim is the trailing dimension in `indices`. +LogicalResult CanonicalizeScatterUpdates( + Operation* scatter_op, llvm::ArrayRef update_window_dims, + const Value& indices, const ShapedType& indices_type, Value& updates, + ShapedType& updates_type, ConversionPatternRewriter& rewriter); + +template +class ConvertScatterOp : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::ScatterOp scatter_op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const final; +}; + +using ConvertScatterAddOp = + ConvertScatterOp; +using ConvertScatterMaxOp = + ConvertScatterOp; +using ConvertScatterMinOp = + ConvertScatterOp; +using ConvertScatterSubOp = + ConvertScatterOp; +using ConvertScatterUpdateOp = + ConvertScatterOp; + +template class ConvertScatterOp; +template class ConvertScatterOp; +template class ConvertScatterOp; +template class ConvertScatterOp; +template class ConvertScatterOp; + +} // end namespace odml +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SCATTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.h new file mode 100644 index 00000000..024cbb4a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.h @@ -0,0 +1,34 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SLICE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SLICE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { + +// Patterns to legalize mhlo.slice to TFL. +void PopulateLegalizeSlicePatterns(MLIRContext* ctx, + RewritePatternSet& patterns, + ConversionTarget& target); + +void PopulatePrepareSlicePatterns(MLIRContext* ctx, + RewritePatternSet& patterns); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SLICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/sort.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/sort.h new file mode 100644 index 00000000..c293bad9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/sort.h @@ -0,0 +1,29 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SORT_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SORT_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir::odml { + +void PopulateSortPatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_SORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h new file mode 100644 index 00000000..c72fce3f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h @@ -0,0 +1,168 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_UTIL_H_ + +#include +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { + +struct PermutationAndShape { + DenseIntElementsAttr permutation; + ShapedType shape; +}; + +// Check that `arr` is an R1 iota with integer element type starting from +// `start` with `size` number of values. +bool IsIotaAttr(ArrayRef arr, int64_t size, int64_t start = 0); + +// Returns a DenseIntElementsAttr for a permutation and the shape after +// applying the permutation to a given shape through a transpose. +PermutationAndShape GetPermutationAndTransposedShape( + llvm::ArrayRef permutation_array, ShapedType input_type, + ConversionPatternRewriter& rewriter); + +// Create a single const integer. +Value BuildIntConstOp(ImplicitLocOpBuilder& builder, + ConversionPatternRewriter& rewriter, int64_t const_value, + Type type); + +// Create a const integer vector tensor (1-dim). +template +Value BuildIntArrayConstOp(ImplicitLocOpBuilder& builder, + ConversionPatternRewriter& rewriter, + ArrayRef const_value, Type type) { + DenseIntElementsAttr const_value_raw; + if (type == rewriter.getI64Type()) { + const_value_raw = rewriter.getI64TensorAttr(const_value); + } else { + // Convert I64 const array to I32. + llvm::SmallVector const_i32_vec; + for (auto element : const_value) { + const_i32_vec.push_back(static_cast(element)); + } + const_value_raw = rewriter.getI32TensorAttr(const_i32_vec); + } + Value result_const = builder.create(const_value_raw); + return result_const; +} + +// Returns the inverse permutation array for a permutation array. +llvm::SmallVector GetInversePermutationArray( + llvm::ArrayRef permutation_array); + +// Returns the DenseIntElementsAttr for an inverse permutation given a +// permutation_array. +DenseIntElementsAttr GetInversePermutation( + llvm::ArrayRef permutation_array, + ConversionPatternRewriter& rewriter); + +// Returns a DenseIntElementsAttr for an inverse permutation and the shape after +// applying the inverse permutation to a given shape through a transpose. +PermutationAndShape GetInversePermutationAndShape( + llvm::ArrayRef permutation_array, ShapedType input_type, + ConversionPatternRewriter& rewriter); + +// Returns true if the op needs reformat. 
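
// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of this patch. util.h
// above declares inverse-permutation helpers, and scatter.h earlier in this
// patch canonicalizes updates by transposing the leading update_window_dims to
// the back (its rank-5 example uses permutation {3,4,0,1,2}). The standalone
// snippet below shows both computations; the function names are hypothetical.
#include <cstdint>
#include <iostream>
#include <vector>

// Inverse of a permutation p: inv[p[i]] = i.
std::vector<int64_t> InversePermutation(const std::vector<int64_t>& p) {
  std::vector<int64_t> inv(p.size());
  for (int64_t i = 0; i < static_cast<int64_t>(p.size()); ++i) inv[p[i]] = i;
  return inv;
}

// Permutation that moves leading window dims [0, num_window) of a rank-`rank`
// updates tensor behind the batch dims, making the window dims trailing.
std::vector<int64_t> CanonicalizeUpdatesPerm(int64_t rank, int64_t num_window) {
  std::vector<int64_t> perm;
  for (int64_t d = num_window; d < rank; ++d) perm.push_back(d);  // batch dims
  for (int64_t d = 0; d < num_window; ++d) perm.push_back(d);     // window dims
  return perm;
}

int main() {
  for (int64_t d : CanonicalizeUpdatesPerm(5, 3)) std::cout << d << ' ';
  std::cout << '\n';  // 3 4 0 1 2
  for (int64_t d : InversePermutation({3, 4, 0, 1, 2})) std::cout << d << ' ';
  std::cout << '\n';  // 2 3 4 0 1
  return 0;
}
// ---------------------------------------------------------------------------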
+bool NeedsReformatTypeAndPermutation(int batch_dim, int feature_dim, + int spatial_dim_start, + int default_batch_dim, + int default_feature_dim, + int default_spatial_dim_start); + +// Gets reformat type and permutation attribute. Call this function only if +// NeedsReformatTypeAndPermutation returns true. If +// NeedsReformatTypeAndPermutation returns false, this function returns the pair +// of input type and no-op permutation. + +std::pair GetReformatTypeAndPermutation( + int batch_dim, int feature_dim, int spatial_dim_start, + int default_batch_dim, int default_feature_dim, + int default_spatial_dim_start, int num_spatial_dims, RankedTensorType type, + ConversionPatternRewriter& rewriter); + +// Insert transpose so the input value is converted to the format specified by +// the default dims +Value InsertTranspose(Value value, int batch_dim, int feature_dim, + ArrayRef spatial_dimensions, + int default_batch_dim, int default_feature_dim, + int default_spatial_dim_start, int num_spatial_dims, + ConversionPatternRewriter& rewriter); + +// If index_vector_dim == indices.rank() then insert the implicit extra +// dimension into indices to normalize everything to index_vector_dim == +// indices.rank() - 1. +LogicalResult NormalizeIndexVector(Operation* parent_op, Value& indices, + ShapedType& indices_type, + int64_t index_vector_dim, + ConversionPatternRewriter& rewriter); + +// Checks if the specified region is a binary reduction function that takes 2 +// inputs, passes it to an instance of the specified reduction op and then +// returns the result. +template +LogicalResult MatchBinaryReduceFunction(mlir::Region& function) { + Block& body = function.front(); + if (body.getNumArguments() != 2) return failure(); + + mhlo::ReturnOp return_op = dyn_cast(body.back()); + if (!return_op) return failure(); + if (return_op.getNumOperands() != 1) return failure(); + + ReductionOp reduce_op = dyn_cast_or_null( + return_op.getOperands().front().getDefiningOp()); + if (!reduce_op) return failure(); + if (reduce_op.getLhs() != body.getArgument(0) || + reduce_op.getRhs() != body.getArgument(1)) + return failure(); + + return success(); +} + +// Check if the specified region is a binary reduction function that takes 2 +// inputs and returns the second input. Functions like this are used by update +// scatter like ops. +template <> +LogicalResult MatchBinaryReduceFunction(mlir::Region& function); + +// Util that casts 'val' to Int32 by adding a tfl cast Op. +Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter); + +// Replaces `region`'s terminator to TFL::Yield. +void ReplaceTerminatorWithYield(Region& region, PatternRewriter& rewriter); +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.h new file mode 100644 index 00000000..3b302215 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.h @@ -0,0 +1,30 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_WHILE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_WHILE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { + +void PopulateWhilePatterns(MLIRContext* ctx, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_WHILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h new file mode 100644 index 00000000..9594769e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h @@ -0,0 +1,51 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { + +namespace func { +class FuncOp; +} +class ModuleOp; +class Operation; +template +class OperationPass; +class Pass; + +namespace odml { + +/// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern +/// list. 
+void PopulateLegalizeTfPatterns(MLIRContext* context, + RewritePatternSet* patterns); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h new file mode 100644 index 00000000..9bcee095 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h @@ -0,0 +1,35 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Adds passes which transform TF_XlaCallModule Op to StableHLO Ops. +// Note that this pass only supports static shape tensors for now. +std::unique_ptr> +CreateLegalizeTFXlaCallModuleToStablehloPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h new file mode 100644 index 00000000..8d57016b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_OP_STAT_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_OP_STAT_PASS_H_ + +#include +#include +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Creates a pass which prints out a detailed report of conversion stats with: +// success or not, % of Ops non-converted, list of non-converted Ops, etc. +std::unique_ptr createPrintOpStatsPass( + std::vector accepted_dialects); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_OP_STAT_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h new file mode 100644 index 00000000..e56b7130 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +std::unique_ptr CreateRenameEntrypointToMainPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.h new file mode 100644 index 00000000..61e076e8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_SMUGGLE_DISALLOWED_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_SMUGGLE_DISALLOWED_OPS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +std::unique_ptr CreateSmuggleDisallowedOpsPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_SMUGGLE_DISALLOWED_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h new file mode 100644 index 00000000..7a02085c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h @@ -0,0 +1,84 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Unfuses MHLO batch norm inference op into arithmetic ops. +std::unique_ptr createUnfuseBatchNormPass(); + +// Constant folds broadcast_in_dim op conditionally. +std::unique_ptr createFoldBroadcastPass(); + +// Fuses MHLO binary element-wise ops and convolution op. +std::unique_ptr createFuseConvolutionPass(); + +// Applies various optimizations on MHLO IR. +std::unique_ptr createOptimizePass(); + +// Finds quantization patterns and compose them to uniform +// quantized types. +std::unique_ptr> +CreateComposeUniformQuantizedTypePass(); + +// Finds stablehlo ops that accept or produce uniform +// quantized typed tensors and converts them to equivalent ops in the TFLite +// dialect. +std::unique_ptr> +CreateUniformQuantizedStableHloToTflPass(); + +// Commutes transposes through specific ops +std::unique_ptr> CreateTransposeCommuteOpsPass(); + +// Legalizes MHLO to TF dialect. +std::unique_ptr> CreateLegalizeHloToTfPass(); + +// Replaces a splat constant tensor with a BroadcastInDim +// op. +std::unique_ptr> CreateUnfoldSplatConstantPass(); + +// Legalizes MHLO to TFLite dialect. +std::unique_ptr> CreateLegalizeHloToTfLitePass(); + +// Lowers stablehlo composite ops to tflite ops. +std::unique_ptr> CreateCompositeLoweringPass(); + +// Legalizes CHLO to tflite dialect. +std::unique_ptr> CreateLegalizeChloToTflPass(); + +// Rewrites MHLO in preparation for tflite legalization. +std::unique_ptr> CreatePrepareHloPass(); + +// Adds the HLO to TF rewrite patterns to the specified pattern list. 
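
// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of this patch. The
// headers above only declare pass factories (CreateRenameEntrypointToMainPass,
// CreatePrepareHloPass, createUnfuseBatchNormPass, createOptimizePass,
// CreateLegalizeHloToTfLitePass, ...); a caller assembles them into a pass
// pipeline. The ordering and the module- vs. function-level nesting below are
// assumptions made for the example, not the converter's real pipeline, and the
// snippet only builds inside the TensorFlow/TFLite tree.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"

namespace mlir::odml {

// Hypothetical pipeline assembly over the factories declared above.
LogicalResult RunExampleMhloToTflPipeline(ModuleOp module, MLIRContext* ctx) {
  PassManager pm(ctx);
  pm.addPass(CreateRenameEntrypointToMainPass());
  pm.addPass(CreatePrepareHloPass());
  // These two are assumed here to be function-level passes.
  pm.addNestedPass<func::FuncOp>(createUnfuseBatchNormPass());
  pm.addNestedPass<func::FuncOp>(createOptimizePass());
  pm.addPass(CreateLegalizeHloToTfLitePass());
  return pm.run(module);
}

}  // namespace mlir::odml
// ---------------------------------------------------------------------------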
+void PopulateLegalizeHloToTfPatterns(RewritePatternSet* patterns, + MLIRContext* context); + +#define GEN_PASS_DECL +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h.inc" + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h new file mode 100644 index 00000000..066bcc00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_UTIL_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" + +namespace mlir { +namespace odml { + +std::vector GetAcceptedStableHLODialects(); + +std::vector GetAcceptedTFLiteDialects(); + +// Can we find the given `dialect_name` in the `accepted_dialects`? +bool IsAcceptedDialect(llvm::StringRef dialect_name, + const std::vector &accepted_dialects); + +// The consolidated logic to verify if each final op is acceptable or not. +// Also see `PrintOpStatsPass` and `CheckAcceptedOpsPass`. +bool IsAcceptedOp(llvm::StringRef dialect_name, llvm::StringRef op_name, + const std::vector &accepted_dialects); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h new file mode 100644 index 00000000..c26a3f36 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Adds passes which transform TF Ops to StableHLO Ops. +void AddLegalizeTFToStablehloPasses(OpPassManager& pm, + bool skip_quantization_ops, + bool skip_resize, + bool skip_partitioned_calls); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.h new file mode 100644 index 00000000..e6e40762 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.h @@ -0,0 +1,34 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TFL_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TFL_STABLEHLO_PASS_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Creates a pass which transforms TFLite to StableHLO Ops. +std::unique_ptr> CreateTflToStablehloPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TFL_STABLEHLO_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h new file mode 100644 index 00000000..abcdd827 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h @@ -0,0 +1,44 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
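Reviewer note: AddLegalizeTFToStablehloPasses above only populates a pass manager; a caller still has to build one and run it over a module. A hedged sketch under that assumption (the flag values and the module-scoped PassManager are illustrative choices, not mandated by the header):

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h"

// Hedged sketch: legalize a TF module to StableHLO in place.
static mlir::LogicalResult LegalizeTfModuleToStablehlo(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  mlir::odml::AddLegalizeTFToStablehloPasses(
      pm,
      /*skip_quantization_ops=*/false,
      /*skip_resize=*/false,
      /*skip_partitioned_calls=*/false);
  return pm.run(module);
}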
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TRANSFORMS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TRANSFORMS_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Adds all the necessary passes to lower a TF module to StableHLO. +// `skip_resize` enables or disables skipping conversion of tf.ResizeBilinear +// and tf.ResizeNearestNeighbor ops. +// `smuggle_disallowed_ops` enables or disables converting disallowed ops +// like tf.ResizeBilinear or tf.ResizeNearestNeighbor to mhlo.custom_call ops. +void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, + bool smuggle_disallowed_ops); + +// This function is a common entry point for all graph optimizations that are +// not specific to any hardware. It legalizes SHLO->MHLO, does MHLO->MHLO +// optimizations by calling `AddMhloOptimizationPasses` internally, and +// legalizes MHLO->SHLO +void AddStablehloOptimizationPasses(OpPassManager& pm); + +// Adds all the backend-agonstic stableHLO optimization passes +void AddMhloOptimizationPasses(OpPassManager& pm, bool add_fold_broadcast_pass); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TRANSFORMS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h new file mode 100644 index 00000000..fc7c2316 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h @@ -0,0 +1,63 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { + +// Builds body for reduce op by using the template binary op as the +// reducer op. +template +void BuildReduceBody(Type element_type, Region* body, OpBuilder* builder) { + OpBuilder::InsertionGuard guard(*builder); + Block* block = builder->createBlock(body); + + // Block arguments are scalars of the given element type. 
+ Type type = RankedTensorType::get(/*shape=*/{}, element_type); + Location loc = body->getLoc(); + block->addArguments({type, type}, SmallVector(2, loc)); + + auto reducer = + builder->create(loc, block->getArgument(0), block->getArgument(1)); + builder->create(loc, reducer.getResult()); +} + +mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, + OpBuilder* builder); + +mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, + OpBuilder* builder); + +// Converts an ArrayAttr to a 1D 64-bit dense elements attribute. +DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr); +DenseIntElementsAttr GetI64ElementsAttr(llvm::ArrayRef values, + Builder* builder); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stateful_error_reporter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stateful_error_reporter.h new file mode 100644 index 00000000..fbb82d3e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/stateful_error_reporter.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STATEFUL_ERROR_REPORTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STATEFUL_ERROR_REPORTER_H_ + +// LINT.IfChange +#include + +#include "tensorflow/compiler/mlir/lite/core/api/error_reporter.h" + +namespace tflite_migration { + +// Similar to tflite::ErrorReporter, except that it allows callers to get the +// last error message. +class StatefulErrorReporter : public tflite::ErrorReporter { + public: + // Returns last error message. Returns empty string if no error is reported. + virtual std::string message() = 0; +}; + +} // namespace tflite_migration +// LINT.ThenChange(//tensorflow/lite/stateful_error_reporter.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STATEFUL_ERROR_REPORTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_passes.h new file mode 100644 index 00000000..3ad5e52b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_passes.h @@ -0,0 +1,94 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
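Reviewer note: StatefulErrorReporter above only adds the message() accessor; a concrete reporter still has to implement the base class's Report hook. A minimal sketch of one possible subclass follows; the Report(const char*, va_list) signature is assumed from the upstream tflite::ErrorReporter interface, and the class name and buffer size are illustrative.

#include <cstdarg>
#include <cstdio>
#include <string>

#include "tensorflow/compiler/mlir/lite/stateful_error_reporter.h"

namespace tflite_migration {

// Hedged sketch: buffers the most recent formatted error message so callers
// can retrieve it through message().
class BufferedErrorReporter : public StatefulErrorReporter {
 public:
  int Report(const char* format, va_list args) override {
    char buffer[1024];
    int size = vsnprintf(buffer, sizeof(buffer), format, args);
    if (size > 0) last_message_ = buffer;  // vsnprintf null-terminates.
    return size;
  }
  std::string message() override { return last_message_; }

 private:
  std::string last_message_;
};

}  // namespace tflite_migration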
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" + +namespace tensorflow { + +// Add the TF to TFLite passes, specified in the pass_config, into a +// pass_manager. The session object will be provided when the TF MLIR is +// imported from saved model version one and utilized for capturing resource +// variables. If the `saved_model_dir` directory path is provided, then the +// `tf_saved_model.asset` ops will be freezed. +void AddTFToTFLConversionPasses(llvm::StringRef saved_model_dir, + const tflite::ConverterFlags& converter_flags, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager* pass_manager); + +// Adds the first portion of StableHLO->TF passes happening before quantization. +// The `pass_manager` that runs on a `mlir::ModuleOp` expects a graph containing +// a `mlir::TF::XlaCallModuleOp` with serialized StableHLO module. The resulting +// `mlir::ModuleOp` after running these passes will be an MHLO module, or a +// StableHLO module if `pass_config.enable_stablehlo_quantizer` is `true`. This +// is because StableHLO Quantizer accepts StableHLO modules. +void AddPreQuantizationStableHloToTfPasses( + mlir::StringRef entry_function_name, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); + +// Adds the second portion of StableHlo->TF passes happening after quantization. +// The input module is expected to be an MHLO module, or a quantized StableHLO +// graph (expressed as `mlir::TF::XlaCallModuleOp`s) if +// `pass_config.enable_stablehlo_quantizer` is `true`. +void AddPostQuantizationStableHloToTfPasses( + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); + +// This is the early part of the conversion in isolation. This enables a caller +// to inject more information in the middle of the conversion before resuming it +// (like freezing variables for example). +void AddPreVariableFreezingTFToTFLConversionPasses( + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager* pass_manager); + +// This is the later part of the conversion in isolation. This enables a caller +// to resume the conversion after injecting more information in the middle of +// it. +void AddPostVariableFreezingTFToTFLConversionPasses( + llvm::StringRef saved_model_dir, + const tflite::ConverterFlags& converter_flags, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager* pass_manager); + +// Adds the passes that freeze variables from global tensors and unfreeze +// mutable global tensors. `pass_config` is used to determine whether to freeze +// variables and `pass_manager` will be populated with the passes to run. +void AddVariableFreezingFromGlobalTensorsPasses( + const tflite::ConverterFlags& converter_flags, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager* pass_manager); + +// Simplified API for TF->TFLite conversion with default flags. +void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager* pass_manager); + +// Add the Quantization passes, specified in the pass_config, into a pass +// manager. 
+void AddQuantizationPasses(const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); + +// Add the DynamicRangeQuantization passes, specified in the pass_config, into a +// pass manager. +void AddDynamicRangeQuantizationPasses(const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h new file mode 100644 index 00000000..e002fd34 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h @@ -0,0 +1,79 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_TRANSLATE_CL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_TRANSLATE_CL_H_ + +// This file contains command-line options aimed to provide the parameters +// required by the TensorFlow Graph(Def) to TF Lite Flatbuffer conversion. It is +// only intended to be included by binaries. + +#include + +#include "llvm/Support/CommandLine.h" + +// The commandline options are defined in LLVM style, so the caller should +// use llvm::InitLLVM to initialize the options. +// +// Please see the implementation file for documentation of details of these +// options. +// TODO(jpienaar): Revise the command line option parsing here. +extern llvm::cl::opt input_file_name; +extern llvm::cl::opt output_file_name; +extern llvm::cl::opt use_splatted_constant; +extern llvm::cl::opt input_mlir; +extern llvm::cl::opt output_mlir; +extern llvm::cl::list custom_opdefs; +extern llvm::cl::opt emit_quant_adaptor_ops; +extern llvm::cl::opt quant_stats_file_name; +extern llvm::cl::opt convert_tf_while_to_tfl_while; +extern llvm::cl::opt select_user_tf_ops; +extern llvm::cl::opt allow_all_select_tf_ops; +extern llvm::cl::opt unfold_batchmatmul; +extern llvm::cl::opt unfold_large_splat_constant; +extern llvm::cl::opt guarantee_all_funcs_one_use; +extern llvm::cl::opt enable_dynamic_update_slice; +extern llvm::cl::opt preserve_assert_op; +extern llvm::cl::opt legalize_custom_tensor_list_ops; +extern llvm::cl::opt reduce_type_precision; + +// Import saved model. +extern llvm::cl::opt import_saved_model_object_graph; +extern llvm::cl::opt import_saved_model_signature_defs; +extern llvm::cl::opt saved_model_tags; +extern llvm::cl::opt saved_model_exported_names; + +// Import HLO. +enum HloImportType { proto, hlotxt, mlir_text }; + +extern llvm::cl::opt import_hlo; +extern llvm::cl::opt hlo_import_type; + +// enable_hlo_to_tf_conversion and disable_hlo_to_tfl_conversion are used to +// control the HLO to TF and HLO to TFLite conversion while debugging an +// input_mlir. 
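Reviewer note: as a usage illustration for the pipeline builders declared above, here is a hedged sketch of the simplified TF-to-TFLite flow. It deliberately takes an already-populated PassConfig so it does not assume anything about PassConfig's constructors; the wrapper function itself is illustrative.

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h"
#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h"

// Hedged sketch: lower a TF module to the TFLite dialect using the simplified
// AddTFToTFLConversionPasses overload declared above.
static mlir::LogicalResult RunTfToTflPipeline(
    mlir::ModuleOp module, const mlir::TFL::PassConfig& pass_config) {
  mlir::PassManager pm(module.getContext());
  tensorflow::AddTFToTFLConversionPasses(pass_config, &pm);
  return pm.run(module);
}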
The default value of enable_hlo_to_tf_conversion is false, and +// the default value of disable_hlo_to_tfl_conversion is true. +extern llvm::cl::opt enable_hlo_to_tf_conversion; +extern llvm::cl::opt disable_hlo_to_tfl_conversion; + +// quantization related flags +extern llvm::cl::opt post_training_quantization; + +// TF to stablehlo pass flags +extern llvm::cl::opt enable_stablehlo_conversion; + +// Whether serialize stablehlo ops or not +extern llvm::cl::opt serialize_stablehlo_ops; +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_TRANSLATE_CL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h new file mode 100644 index 00000000..ec8569a1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -0,0 +1,91 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ + +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Load a TF model from a GraphDef definition or a TF control flow dialect MLIR +// source into a MLIR module. If `input_mlir` is true, load from a MLIR source +// file; otherwise, load from a GraphDef. +// Setting prune_unused_nodes to true, would prune unreachable nodes if +// output_arrays is specified. +absl::StatusOr> LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + bool use_splatted_constant, const std::vector& extra_tf_opdefs, + const GraphImportConfig& specs, absl::string_view debug_info_file, + absl::string_view input_arrays, absl::string_view input_dtypes, + absl::string_view input_shapes, absl::string_view output_arrays, + absl::string_view control_output_arrays, llvm::SourceMgr* source_mgr, + mlir::MLIRContext* context); + +// Load Saved model (either v1 or v2) into MLIR. +// 'saved_model_bundle' will be initialized if V1 model was loaded. 
+absl::StatusOr> ImportSavedModel( + const std::string& input_filename, int saved_model_version, + const std::unordered_set& tags, + absl::Span extra_tf_opdefs, + absl::Span exported_names, const GraphImportConfig& specs, + bool enable_variable_lifting, mlir::MLIRContext* context, + std::unique_ptr* saved_model_bundle); + +// Taking a MLIR module in TF executor dialect and a set of parameters, +// applies a set of passes (configured accordingly to the provided +// `pass_config`) to convert the module to TF Lite dialect and serializes the +// result to a string. Depending on an attribute in the module main function, +// full integer quantization is applied. +// * `quantizated_buffer_type` can be set to INT8 or FLOAT16 to trigger the +// corresponding weight quantization. +// * `export_to_mlir` enables exporting to MLIR text format, otherwise exported +// in flat buffer. If the +// * `session` pointer may provided, it will be used to freeze resource +// variables. If the `saved_model_dir` directory path is provided, then the +// `tf_saved_model.asset` ops will be freezed. +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( + std::unique_ptr&& context, + mlir::OwningOpRef module, + tflite::ConverterFlags& converter_flags, + const mlir::TFL::PassConfig& pass_config, + const std::unordered_set& saved_model_tags, + llvm::StringRef saved_model_dir, std::string* result, + bool serialize_stablehlo_ops, bool export_to_mlir, + const quantization::PyFunctionLibrary* quantization_py_function_lib = + nullptr); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/command_line_flags.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/command_line_flags.h new file mode 100644 index 00000000..41e70c94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/command_line_flags.h @@ -0,0 +1,170 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_COMMAND_LINE_FLAGS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_COMMAND_LINE_FLAGS_H_ + +#include +#include +#include + +// TODO(b/321735756): Remove this file once common library is implemented with +// the originial file. + +// LINT.IfChange + +namespace mlir { +// A simple command-line argument parsing module. +// Dependency free simplified port of core/util/command_line_flags. +// This class is written for benchmarks and uses inefficient string +// concatenation. This was written to avoid dependency on tensorflow/core/util +// which transitively brings in a lot of other dependencies that are not +// necessary for tflite benchmarking code. 
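Reviewer note: a hedged sketch of how ImportSavedModel might be invoked for a v2 SavedModel. The template arguments stripped from the declarations in this diff are assumed here to be OwningOpRef<mlir::ModuleOp>, std::string containers/spans, and SavedModelBundle; the tag "serve", signature name "serving_default", and the wrapper function are illustrative. A real caller would keep the SavedModelBundle alive for later variable freezing rather than letting it go out of scope.

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "absl/status/statusor.h"
#include "absl/types/span.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"
#include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h"

// Hedged sketch: import a SavedModel into an MLIR module for conversion.
static absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> ImportForConversion(
    const std::string& saved_model_path, mlir::MLIRContext* context) {
  std::unordered_set<std::string> tags = {"serve"};
  std::vector<std::string> exported_names = {"serving_default"};
  tensorflow::GraphImportConfig specs;
  std::unique_ptr<tensorflow::SavedModelBundle> bundle;
  return tensorflow::ImportSavedModel(
      saved_model_path, /*saved_model_version=*/2, tags,
      /*extra_tf_opdefs=*/{}, absl::MakeSpan(exported_names), specs,
      /*enable_variable_lifting=*/true, context, &bundle);
}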
+// The recommended way of using it is with local variables and an initializer +// list of Flag objects, for example: +// +// int some_int = 10; +// bool some_switch = false; +// std::string some_name = "something"; +// +// std::vector flag_list = { +// Flag::CreateFlag("some_int", &some_int, "an integer that affects X"), +// Flag::CreateFlag("some_switch", &some_switch, "a bool that affects Y"), +// Flag::CreateFlag("some_name", &some_name, "a string that affects Z") +// }; +// // Get usage message before ParseFlags() to capture default values. +// std::string usage = Flag::Usage(argv[0], flag_list); +// bool parsed_values_ok = Flags::Parse(&argc, argv, flag_list); +// +// tensorflow::port::InitMain(usage.c_str(), &argc, &argv); +// if (argc != 1 || !parsed_values_ok) { +// ...output usage and error message... +// } +// +// The argc and argv values are adjusted by the Parse function so all that +// remains is the program name (at argv[0]) and any unknown arguments fill the +// rest of the array. This means you can check for flags that weren't understood +// by seeing if argv is greater than 1. +// The result indicates if there were any errors parsing the values that were +// passed to the command-line switches. For example, --some_int=foo would return +// false because the argument is expected to be an integer. +// +// NOTE: Unlike gflags-style libraries, this library is intended to be +// used in the `main()` function of your binary. It does not handle +// flag definitions that are scattered around the source code. + +// A description of a single command line flag, holding its name, type, usage +// text, and a pointer to the corresponding variable. +class Flag { + public: + enum FlagType { + kPositional = 0, + kRequired, + kOptional, + }; + + // The order of the positional flags is the same as they are added. + // Positional flags are supposed to be required. + template + static Flag CreateFlag(const char* name, T* val, const char* usage, + FlagType flag_type = kOptional) { + return Flag( + name, [val](const T& v) { *val = v; }, *val, usage, flag_type); + } + +// "flag_T" is same as "default_value_T" for trivial types, like int32, bool +// etc. But when it's a complex type, "default_value_T" is generally a const +// reference "flag_T". 
+#define CONSTRUCTOR_WITH_ARGV_INDEX(flag_T, default_value_T) \ + Flag(const char* name, \ + const std::function& hook, \ + default_value_T default_value, const std::string& usage_text, \ + FlagType flag_type); + +#define CONSTRUCTOR_WITHOUT_ARGV_INDEX(flag_T, default_value_T) \ + Flag(const char* name, const std::function& hook, \ + default_value_T default_value, const std::string& usage_text, \ + FlagType flag_type) \ + : Flag( \ + name, [hook](const flag_T& flag_val, int) { hook(flag_val); }, \ + default_value, usage_text, flag_type) {} + + CONSTRUCTOR_WITH_ARGV_INDEX(int32_t, int32_t) + CONSTRUCTOR_WITHOUT_ARGV_INDEX(int32_t, int32_t) + + CONSTRUCTOR_WITH_ARGV_INDEX(int64_t, int64_t) + CONSTRUCTOR_WITHOUT_ARGV_INDEX(int64_t, int64_t) + + CONSTRUCTOR_WITH_ARGV_INDEX(float, float) + CONSTRUCTOR_WITHOUT_ARGV_INDEX(float, float) + + CONSTRUCTOR_WITH_ARGV_INDEX(bool, bool) + CONSTRUCTOR_WITHOUT_ARGV_INDEX(bool, bool) + + CONSTRUCTOR_WITH_ARGV_INDEX(std::string, const std::string&) + CONSTRUCTOR_WITHOUT_ARGV_INDEX(std::string, const std::string&) + +#undef CONSTRUCTOR_WITH_ARGV_INDEX +#undef CONSTRUCTOR_WITHOUT_ARGV_INDEX + + FlagType GetFlagType() const { return flag_type_; } + + private: + friend class Flags; + + bool Parse(const std::string& arg, int argv_position, + bool* value_parsing_ok) const; + + std::string name_; + enum { + TYPE_INT32, + TYPE_INT64, + TYPE_BOOL, + TYPE_STRING, + TYPE_FLOAT, + } type_; + + std::function + value_hook_; + std::string default_for_display_; + + std::string usage_text_; + FlagType flag_type_; +}; + +class Flags { + public: + // Parse the command line represented by argv[0, ..., (*argc)-1] to find flag + // instances matching flags in flaglist[]. Update the variables associated + // with matching flags, and remove the matching arguments from (*argc, argv). + // Return true iff all recognized flag values were parsed correctly, and the + // first remaining argument is not "--help". + // Note: + // 1. when there are duplicate args in argv for the same flag, the flag value + // and the parse result will be based on the 1st arg. + // 2. when there are duplicate flags in flag_list (i.e. two flags having the + // same name), all of them will be checked against the arg list and the parse + // result will be false if any of the parsing fails. + // See *Duplicate* unit tests in command_line_flags_test.cc for the + // illustration of such behaviors. + static bool Parse(int* argc, const char** argv, + const std::vector& flag_list); +}; +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_COMMAND_LINE_FLAGS_H_ + +// LINT.ThenChange(//tensorflow/lite/tools/command_line_flags.h) diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h new file mode 100644 index 00000000..5401fcdd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h @@ -0,0 +1,157 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_OPERATOR_PROPERTY_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_OPERATOR_PROPERTY_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace tflite { +namespace optimize { +namespace operator_property { + +// The scales of a certain tensor can be derived from the multiplications of all +// the scales. For example, for bias in conv, derived_scale = {{0, 1}, {}, {}} +// and for lstm gate bias, the derived scale is {{}, {0}, {2^-10}} +struct DerivedScale { + // MSVC2015 version 14.0 and below doesn't support struct initialization with + // initializer lists so emulate the behavior using a float initializer list. +#if _MSC_VER <= 1900 + DerivedScale() = default; + // Construct this object with a list of initializer lists. All list elements + // are cast to float values to avoid ambiguous construction of a union-style + // object that could take either std::initializer_list or + // std::initializer_list. + DerivedScale(std::initializer_list> values) { + assert(values.size() == 3); + std::vector> items(values); + for (auto& it : items[0]) { + input_tensors.push_back(static_cast(it)); + } + for (auto& it : items[1]) { + intermediate_tensors.push_back(static_cast(it)); + } + factors.assign(items[2]); + } +#endif // _MSC_VER <= 1900 + + std::vector input_tensors = {}; + std::vector intermediate_tensors = {}; + // This is a list of extra factors that are not associated with any other + // tensor. + std::vector factors = {}; +}; + +struct TensorProperty { + // per_axis also implies symmetric currently. + bool per_axis = false; + // TODO(jianlijianli): remove dimension index and read it from tensor instead. + int per_axis_index = 0; + bool symmetric = false; + + // Constraints. + bool restriction = false; + // scale/zero_point hardcoded. + std::pair restricted_value_int8 = {0.0f, 0}; + std::pair restricted_value_int16 = {0.0f, 0}; + + // Use derived scale. + bool use_derived_scale = false; + // The derived scale. + DerivedScale derived_scale; + + // The number of bits for this tensor. It could be 8, 16, 32 or even not power + // of two. + int number_of_bits = 8; + + // Extend the range to power of two. + bool extend_to_power_of_two = false; + + // State tensor. + bool state_tensor = false; +}; + +struct OperatorProperty { + // Is a quantized operations currently supported. + bool quantizable = true; + // Is a quantized operations currently supported for 16x8 + bool quantizable_int16 = true; + // Op has arbitrary number of inputs, such as concat. + bool arbitrary_inputs = false; + // Op has arbitrary number of outputs, such as slice. + bool arbitrary_outputs = false; + // Input indexes -> input tensor property. + // Must be topologically sorted since there are derived scales. + std::vector> inputs = {}; + // Output indexes -> output tensor property. + std::vector> outputs = {}; + // Bias indexes. + // TODO(jianlijianli): remove this by putting biases into inputs as well since + // we now can model "derived scale". + std::vector biases = {}; + + // Intermediate indexes -> intermediate tensor property. 
+ std::vector> intermediates = {}; + + // Force output to reuse the same scale and zero point of input when the + // certain type support must require the same scale and zero point + // requirement. + std::function restrict_same_input_output_scale = + [](TensorType) { return false; }; + + // Use same min of min and max of max for each group. + // Incompatible with restrict_same_input_output_scale and restricted_value. + // Currently it only supports scale pair of {input_index, output_index}. + std::vector> restrict_scale = {}; + + // Op version. + int version = 1; + + // When we quantize activations into 16 bit and weights into 8 bit, + // we want to quantize all inputs, including constant tensors, + // for the operators like Add, Mul into 16-bit as well. The constant + // inputs are quantized as weights and this variable indicates + // that we want to do quantizations of these tensors as activations. + bool quantize_input_as_activations = false; +}; + +// The op as well as it variants. +struct OpVariant { + BuiltinOperator op_code; + bool use_layer_norm = false; + bool use_projection = false; + bool use_peephole = false; + // An attribute to indicate if quantization is supported for this Op. + // This attribute is equivalent to the "quantizable" attribute in + // "OperatorProperty". It added here since OpVariants peeks inside the Op and + // determines its quantization related properties. + bool is_quantizable = true; +}; + +OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, + int op_index, int number_of_bits = 8); +OperatorProperty GetOperatorProperty(OpVariant op_variant, + int number_of_bits = 8); + +} // namespace operator_property +} // namespace optimize +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_OPERATOR_PROPERTY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h new file mode 100644 index 00000000..104cc638 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h @@ -0,0 +1,119 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
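Reviewer note: a small hedged sketch of how the operator_property API above is typically queried, here to check 16x8 support by asking for properties at 16 activation bits. The helper name is illustrative; the fields used (quantizable, quantizable_int16) are taken directly from the OperatorProperty struct above.

#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"
#include "tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h"

// Hedged sketch: does this op support 16-bit activations with 8-bit weights?
static bool SupportsInt16Activations(const tflite::ModelT* model,
                                     int subgraph_index, int op_index) {
  tflite::optimize::operator_property::OperatorProperty property =
      tflite::optimize::operator_property::GetOperatorProperty(
          model, subgraph_index, op_index, /*number_of_bits=*/16);
  return property.quantizable && property.quantizable_int16;
}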
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_REDUCED_PRECISION_METADATA_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_REDUCED_PRECISION_METADATA_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h" + +namespace tflite { +namespace optimize { +static constexpr char kTfLiteReducedPrecisionKey[] = + "reduced_precision_support"; + +static constexpr char kTfLiteFloat16String[] = "fp16"; +static constexpr char kTfLiteBfloat16String[] = "bf16"; +static constexpr char kTfLiteFloat32String[] = "fp32"; +static constexpr char kTfLiteAccumulationString[] = "acc"; + +enum class ReducedPrecisionSupport : std::uint8_t { + None = 0, + Float16Inference = 0x1, + Bfloat16Inference = 0x2, + Float16Accumulation = 0x4, + Float32Accumulation = 0x8, +}; + +inline ReducedPrecisionSupport operator|(ReducedPrecisionSupport a, + ReducedPrecisionSupport b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline ReducedPrecisionSupport& operator|=(ReducedPrecisionSupport& a, + ReducedPrecisionSupport b) { + return a = static_cast( + static_cast(a) | static_cast(b)); +} + +inline ReducedPrecisionSupport operator&(ReducedPrecisionSupport a, + ReducedPrecisionSupport b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +inline ReducedPrecisionSupport& operator&=(ReducedPrecisionSupport& a, + ReducedPrecisionSupport b) { + return a = static_cast( + static_cast(a) & static_cast(b)); +} + +inline bool SupportsFP16Inference(const ReducedPrecisionSupport& mask) { + return static_cast(mask & ReducedPrecisionSupport::Float16Inference); +} + +inline bool SupportsBfloat16Inference(const ReducedPrecisionSupport& mask) { + return static_cast(mask & ReducedPrecisionSupport::Bfloat16Inference); +} + +inline bool SupportsFP16Accumulation(const ReducedPrecisionSupport& mask) { + return static_cast(mask & ReducedPrecisionSupport::Float16Accumulation); +} + +inline bool SupportsFP32Accumulation(const ReducedPrecisionSupport& mask) { + return static_cast(mask & ReducedPrecisionSupport::Float32Accumulation); +} + +inline bool SupportsReducedPrecisionInference( + const ReducedPrecisionSupport& mask) { + return SupportsFP16Inference(mask) || SupportsBfloat16Inference(mask); +} + +inline bool SupportsEitherFP16OrFP32Accumulation( + const ReducedPrecisionSupport& mask) { + return SupportsFP16Accumulation(mask) != SupportsFP32Accumulation(mask); +} + +// Return the key-value pair for reduced precision support metadata. +// Example: mask = Float16Inference | Bfloat16Inference | Float32Accumulation; +// Returned value would be <"reduced_precision_support", "fp16bf16accfp32">. 
+inline std::pair MetadataForReducedPrecisionSupport( + const ReducedPrecisionSupport& mask) { + TFLITE_DCHECK(SupportsReducedPrecisionInference(mask)); + TFLITE_DCHECK(SupportsEitherFP16OrFP32Accumulation(mask)); + std::string value = ""; + if (SupportsFP16Inference(mask)) { + value += kTfLiteFloat16String; + } + if (SupportsBfloat16Inference(mask)) { + value += kTfLiteBfloat16String; + } + value += kTfLiteAccumulationString; + if (SupportsFP16Accumulation(mask)) { + value += kTfLiteFloat16String; + } else if (SupportsFP32Accumulation(mask)) { + value += kTfLiteFloat32String; + } + return std::make_pair(std::string(kTfLiteReducedPrecisionKey), value); +} + +} // namespace optimize +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_OPTIMIZE_REDUCED_PRECISION_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h new file mode 100644 index 00000000..b3da62ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ + +// This file contains command-line options aimed to provide the parameters +// required by the TensorFlow Graph(Def) to MLIR module conversion. It is only +// intended to be included by binaries. + +#include + +#include "llvm/Support/CommandLine.h" + +// Please see the implementation file for documentation of these options. + +// Import options. +extern llvm::cl::opt input_arrays; +extern llvm::cl::opt input_dtypes; +extern llvm::cl::opt input_shapes; +extern llvm::cl::opt output_arrays; +extern llvm::cl::opt control_output_arrays; +extern llvm::cl::opt inference_type; +extern llvm::cl::opt min_values; +extern llvm::cl::opt max_values; +extern llvm::cl::opt debug_info_file; +extern llvm::cl::opt xla_compile_device_type; +extern llvm::cl::opt prune_unused_nodes; +extern llvm::cl::opt convert_legacy_fed_inputs; +extern llvm::cl::opt graph_as_function; +extern llvm::cl::opt upgrade_legacy; +// TODO(jpienaar): Temporary flag, flip default and remove. +extern llvm::cl::opt enable_shape_inference; +extern llvm::cl::opt unconditionally_use_set_output_shapes; +extern llvm::cl::opt enable_soft_placement; +extern llvm::cl::opt set_original_tf_func_name; + +// Export options. 
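Reviewer note: a short hedged sketch exercising the reduced-precision mask helpers above. Following the function body shown, a mask of fp16 inference plus fp32 accumulation yields the pair <"reduced_precision_support", "fp16accfp32">; the wrapper function is illustrative only.

#include <string>
#include <utility>

#include "tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h"

// Hedged sketch: build the metadata entry for fp16 inference / fp32 accumulation.
static std::pair<std::string, std::string> Fp16InferenceMetadata() {
  tflite::optimize::ReducedPrecisionSupport mask =
      tflite::optimize::ReducedPrecisionSupport::Float16Inference |
      tflite::optimize::ReducedPrecisionSupport::Float32Accumulation;
  // Satisfies both DCHECKs: reduced-precision inference is requested and
  // exactly one of the accumulation modes is set.
  return tflite::optimize::MetadataForReducedPrecisionSupport(mask);
}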
+extern llvm::cl::opt export_entry_func_to_flib; +extern llvm::cl::opt export_original_tf_func_name; + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.h new file mode 100644 index 00000000..5799194f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.h @@ -0,0 +1,96 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_SIGNATURE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_SIGNATURE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/mlir/lite/core/c/tflite_types.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" + +namespace tflite { + +// OpSignature contains operator parameters for version functions. +typedef struct { + TfLiteType type; + std::vector dims; + bool is_const; + bool is_shape_dynamic; +} OpSignatureTensorSpec; + +typedef struct { + BuiltinOperator op; + std::vector inputs; + std::vector outputs; + void* builtin_data; + int version; + const void* custom_initial_data; + std::string custom_name; + union { + struct { + bool is_per_channel_quantized; + bool is_grouped_convolution; + } conv_2d; + struct { + bool is_per_channel_quantized; + } depthwise_conv_2d; + struct { + // TODO(b/156530611): Make this global when more ops support sparse + // computation. + bool sparse_weight; + bool is_per_channel_quantized; + } fully_connected; + struct { + float input1_scale; + float input2_scale; + float output_scale; + bool input_quantized; + } mul; + struct { + int32_t num_dims; + } strided_slice; + struct { + bool input_quantized; + } abs; + struct { + bool is_per_channel_quantized; + } dequantize; + struct { + bool is_per_channel_quantized; + } quantize; + struct { + bool input_quantized; + } add; + struct { + bool is_per_channel_quantized; + } embedding_lookup; + } ext_options; +} OpSignature; + +// Generate OpSignature with the given OperatorCode, Operator and Tensors (from +// SubGraph). The OpSignature will be used by GetBuiltinOperatorVersion() and +// mostly input and output tensor types are enough to figure out op version. +// But some ops (DEPTHWISE_CONV_2D, FULLY_CONNECTED, ...) require to pass their +// options to decide op version. +// +// WARNING: The caller is responsible to free the allocated +// OpSignature.builtin_data memory. 
+OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, + const SubGraph* subgraph, const Model* model); + +} // namespace tflite +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_SIGNATURE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_version.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_version.h new file mode 100644 index 00000000..bd1f5516 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/op_version.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_VERSION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_VERSION_H_ + +#include + +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/tools/versioning/op_signature.h" + +namespace tflite { + +// Returns version of builtin ops by the given signature. +int GetBuiltinOperatorVersion(const OpSignature& op_sig); + +// Update operator's version of the given TFL flatbuffer model. +void UpdateOpVersion(uint8_t* model_buffer_pointer); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_OP_VERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.h new file mode 100644 index 00000000..7d586df5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_RUNTIME_VERSION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_RUNTIME_VERSION_H_ + +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" + +namespace tflite { +// Update minimum runtime version of the given TFL flatbuffer model. +void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer); + +// Find the minimum runtime version of a given op version. 
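Reviewer note: a hedged sketch combining op_signature.h and op_version.h above: build an OpSignature for one operator of a parsed flatbuffer model, ask for its builtin version, and release builtin_data as the warning requires. Freeing with free() is an assumption about how that buffer is allocated; the helper name is illustrative.

#include <cstdlib>

#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h"
#include "tensorflow/compiler/mlir/lite/tools/versioning/op_signature.h"
#include "tensorflow/compiler/mlir/lite/tools/versioning/op_version.h"

// Hedged sketch: compute the builtin operator version for a single op.
static int BuiltinVersionFor(const tflite::OperatorCode* op_code,
                             const tflite::Operator* op,
                             const tflite::SubGraph* subgraph,
                             const tflite::Model* model) {
  tflite::OpSignature op_sig =
      tflite::GetOpSignature(op_code, op, subgraph, model);
  int version = tflite::GetBuiltinOperatorVersion(op_sig);
  if (op_sig.builtin_data) {
    free(op_sig.builtin_data);  // Caller owns this per the header comment.
  }
  return version;
}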
Return an empty +// string the version is not registered. +std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, + int op_version); + +// Returns true if the first version string precedes the second. +// For example, '1.9' should precede '1.14', also '1.14' should precede +// '1.14.1'. If two version string is equal, then false will be returned. +bool CompareRuntimeVersion(const std::string&, const std::string&); + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_VERSIONING_RUNTIME_VERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/canonicalize_boundary_value_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/canonicalize_boundary_value_pass.h new file mode 100644 index 00000000..e9bd67f8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/canonicalize_boundary_value_pass.h @@ -0,0 +1,58 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CANONICALIZE_BOUNDARY_VALUE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CANONICALIZE_BOUNDARY_VALUE_PASS_H_ + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +namespace mlir { +namespace TFL { + +// Pass to canonicalize the IR representations of boundary values. 
+ +class CanonicalizeBoundaryValuePass + : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeBoundaryValuePass) + + CanonicalizeBoundaryValuePass() = default; + CanonicalizeBoundaryValuePass(const CanonicalizeBoundaryValuePass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "CanonicalizeBoundaryValuePass"; } + static llvm::StringRef GetArgument() { + return "tfl-canonicalize-boundary-value"; + } + static llvm::StringRef GetDescription() { + return "Pass to canonicalize the IR representations of boundary values"; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CANONICALIZE_BOUNDARY_VALUE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h new file mode 100644 index 00000000..01f71afe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h @@ -0,0 +1,51 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CONVERTER_PASS_OPTIONS_SETTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CONVERTER_PASS_OPTIONS_SETTER_H_ + +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h" + +namespace mlir { +namespace TFL { + +class OptimizePassOptions; +class VariableFreezingPipelineOptions; +class EmptyPassOptions; + +// PassOptionsSetter to set TFLite Converter Pass/Pipeline Options based on +// ConverterFlags and TFL::PassConfig values. 
+class ConverterPassOptionsSetter : public PassOptionsSetter { + public: + explicit ConverterPassOptionsSetter( + const tflite::ConverterFlags& converter_flags, + const mlir::TFL::PassConfig& pass_config) + : converter_flags_(converter_flags), pass_config_(pass_config) {}; + ~ConverterPassOptionsSetter() override = default; + + void SetOptions(OptimizePassOptions& options) const override; + void SetOptions(VariableFreezingPipelineOptions& options) const override; + void SetOptions(EmptyPassOptions& options) const override; + + private: + tflite::ConverterFlags converter_flags_; + mlir::TFL::PassConfig pass_config_; +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CONVERTER_PASS_OPTIONS_SETTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse_pass.h new file mode 100644 index 00000000..fa39e09c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse_pass.h @@ -0,0 +1,80 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass convert dense tensor to sparse format. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DENSE_TO_SPARSE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DENSE_TO_SPARSE_PASS_H_ + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" + +namespace mlir { +namespace TFL { + +// This pass encodes sparse weights in the model in the proper format, and adds +// Densify() op if necessary. The general algorithm is: +// 1. Get list of operands (weights) of an op that can be sparse. +// 2. Get list of supported block configurations of the op. +// 3. Calculate random sparsity of the weight. +// 3.1. If sparsity level is below the encoding threshold, keep in dense. +// 3.2. If sparsity level is above the encoding threshold, go to 4. +// 4. Try to encode the weight with supported block configurations. If the +// weight was pruned with the same block config, the blocked sparsity level +// should match the random sparsity. +// 4.1. Return the matching block config if found. +// 4.2. If no matching block config is found, encode the weight with random +// sparsity, and add Densify() op to fall back to dense execution. 
+ +class DenseToSparsePass + : public Pass { + public: + DenseToSparsePass() = default; + DenseToSparsePass(const DenseToSparsePass &other) {} + + void runOnOperation() final; + + /// Returns the command-line argument attached to this pass. + static llvm::StringRef GetArgument() { return "tfl-dense-to-sparse"; } + + static llvm::StringRef GetDescription() { + return "Convert dense tensor to sparse format."; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "DenseToSparsePass"; } + + /// Return the dialect that must be loaded in the context before this pass. + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + /// Explicitly declare the TypeID for this class. We declare an explicit + /// private instantiation because Pass classes should only be visible by the + /// current library. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(DenseToSparsePass) +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DENSE_TO_SPARSE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h new file mode 100644 index 00000000..fe8bb7d2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h @@ -0,0 +1,498 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This pass identifies patterns for dilated convolution and replace it with +// a real convolution op. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DILATED_CONV_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DILATED_CONV_H_ + +#include +#include + +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/utils/validators.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { + +// A dilated convolution can be emulated with a regular convolution by chaining +// SpaceToBatch and BatchToSpace ops before and after it: +// +// SpaceToBatchND -> Conv2D -> BatchToSpaceND +// +// This method was common before Conv2D fully supported dilated convolution in +// TensorFlow. This transformation detects this "emulation", and replaces it +// with a true dilated convolution, eliminating the SpaceToBatch and +// BatchtoSpace ops. 
+// +// Detecting this alone would be relatively easy. However, in practice some +// extra ops are used, so we detect the following patterns: +// +// +// SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> BatchToSpaceND -> BiasAdd +// +// SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> Pad -> BatchToSpaceND -> +// BiasAdd +// +// SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> BiasAdd -> BatchToSpaceND +// +// SpaceToBatchND -> Conv2D -> Pad -> BatchToSpaceND -> BiasAdd +// +// SpaceToBatchND -> Conv2D -> BatchToSpaceND -> BiasAdd +// +// +// The Expand/Squeeze combination is used to adapt a 3D array (such as in +// WaveNet) to the 4D arrays that Conv2D requires. Padding and BiasAdd are +// thrown in just for the extra headache. Padding adapts non-conforming input +// sizes, and can be discarded. The bias is necessary, so is kept. +template +class ConvertTFDilatedConvOp : public OpRewritePattern { + private: + using OpRewritePattern::OpRewritePattern; + + // Extract the dilation factor from `block_shape` and pack it in an ArrayAttr. + std::optional ExtractDilationsAttrFromBlockShape( + Value stb_block_shape, Value bts_block_shape, int64_t expand_axis, + PatternRewriter& rewriter) const; + + public: + LogicalResult matchAndRewrite(Conv2dOpTy op, + PatternRewriter& rewriter) const override; +}; + +template +LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( + Conv2dOpTy op, PatternRewriter& rewriter) const { + if (!op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + op, "result for current op has more than 1 use"); + } + // Make sure Conv2D has 'VALID' padding. + if (op->template getAttrOfType("padding").getValue() != "VALID") { + return rewriter.notifyMatchFailure(op, + "Conv2D op doesn't have valid padding"); + } + // Make sure dilations are all ones if set. + const ArrayAttr& dilations = + op->template getAttrOfType("dilations"); + if (dilations && !TFIntListIsAllOnes(dilations)) { + return rewriter.notifyMatchFailure(op, "dilations should be all 1"); + } + + if (!TFL::TFTypeIsFloat32Tensor(op.getInput()) && + !TFL::TFTypeIsBFloat16OrHalfTensor(op.getInput())) { + return rewriter.notifyMatchFailure( + op, "op's input is not float or half or bfloat16"); + } + if (!TFL::TFDataFormatIsNHWC(op)) { + return rewriter.notifyMatchFailure(op, "op's data format isn't NHWC"); + } + + // Allow dynamic width and height dimensions only. + auto result_ty = mlir::cast(op.getResult().getType()); + if (!result_ty.hasRank() || result_ty.getRank() != 4 || + result_ty.isDynamicDim(0) || result_ty.isDynamicDim(3)) { + return rewriter.notifyMatchFailure( + op, "only dynamic width and height dimensions are allowed"); + } + + // Check if the ConvOp's input is defined by `Expand` op, and the output used + // by `Squeeze` op. + Operation* producer_op = op.getOperand(0).getDefiningOp(); + if (!producer_op || producer_op->getNumResults() != 1) { + return rewriter.notifyMatchFailure( + op, "op doesn't have a producer node that has a single result"); + } + if (!producer_op->hasOneUse() || + *(producer_op->getResult(0).user_begin()) != op) { + return rewriter.notifyMatchFailure( + op, "op's input isn't produced by previous operation"); + } + + auto tryGetDirectConsumerOp = + [&rewriter](Operation* current) -> std::pair { + // Check the current operation has a single result. + if (current->getNumResults() != 1) { + return { + rewriter.notifyMatchFailure(current, "op doesn't have single result"), + nullptr}; + } + // Check the current operation has a consumer node. 
+ Operation* consumer_op = + current->getResult(0).getUses().begin()->getOwner(); + if (!consumer_op) { + return { + rewriter.notifyMatchFailure(current, "op doesn't have consumer node"), + nullptr}; + } + // Check the current operation's result is used by its successor node. + if (!current->hasOneUse() || + *(current->getResult(0).user_begin()) != consumer_op) { + return { + rewriter.notifyMatchFailure( + current, "op's result isn't directly consumed by the next op"), + nullptr}; + } + return {LogicalResult::success(), consumer_op}; + }; + + std::pair maybeConsumer = + tryGetDirectConsumerOp(op.getOperation()); + if (failed(maybeConsumer.first)) { + return maybeConsumer.first; + } + Operation* consumer_op = maybeConsumer.second; + + TF::ExpandDimsOp expand_op; + TF::SqueezeOp squeeze_op; + int64_t expand_axis = -1; + // Expand + Squeeze op. + if (llvm::isa(producer_op)) { + if (!llvm::isa(consumer_op)) { + // Expand/Squeeze op must come in pair. + return rewriter.notifyMatchFailure( + op, "ExpandDimsOp and SqueezeOp should come in pair"); + } + expand_op = llvm::cast(producer_op); + squeeze_op = llvm::cast(consumer_op); + if (!expand_op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + expand_op, "result for current op has more than 1 use"); + } + if (!squeeze_op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + squeeze_op, "result for current op has more than 1 use"); + } + // Make sure that the axis in `expand_op` is constant. + if (auto const_op = + llvm::dyn_cast(expand_op.getDim().getDefiningOp())) { + expand_axis = (*mlir::cast(const_op.getValue()) + .getValues() + .begin()) + .getSExtValue(); + // Canonicalize axis. Some TF python functions, such as + // `tf.nn.convolution`, use negative axis. + if (expand_axis < 0) { + // Always expand 3D input to 4D input. + expand_axis += 4; + } + } else { + return rewriter.notifyMatchFailure( + expand_op, "ExpandDimsOp doesn't have a constant axis"); + } + // Make sure that the `squeeze_dims` is equal to `expand_axis`. + auto squeeze_dims = squeeze_op.getSqueezeDims(); + if (squeeze_dims.size() != 1) { + return rewriter.notifyMatchFailure( + squeeze_op, "squeeze dims should have exactly 1 dimension specified"); + } + int64_t squeeze_axis = mlir::cast(squeeze_dims[0]).getInt(); + if (squeeze_axis < 0) { + // Always squeeze 4D input to 3D input. + squeeze_axis += 4; + } + if (squeeze_axis != expand_axis) { + return rewriter.notifyMatchFailure( + op, "squeeze axis and expand axis doesn't match"); + } + + // Update previous/next op pointer. + Operation* tmp = expand_op.getInput().getDefiningOp(); + if (!tmp || tmp->getNumResults() != 1) { + return rewriter.notifyMatchFailure( + producer_op, + "op doesn't have a producer node that has a single result"); + } + if (!tmp->hasOneUse() || *(tmp->getResult(0).user_begin()) != producer_op) { + return rewriter.notifyMatchFailure( + producer_op, "op's input isn't defined by its previous node"); + } + producer_op = tmp; + std::pair maybeConsumer = + tryGetDirectConsumerOp(consumer_op); + if (failed(maybeConsumer.first)) { + return maybeConsumer.first; + } + consumer_op = maybeConsumer.second; + } + + // SpaceToBatchND op. + if (!llvm::isa(producer_op)) { + return rewriter.notifyMatchFailure(producer_op, + "op should be a SpaceToBatchND op"); + } + // TODO(b/149936532): Check `padding` input, currently ignored. 
+ TF::SpaceToBatchNDOp stb_op = llvm::cast(producer_op); + if (!stb_op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + stb_op, "result for current op has more than 1 use"); + } + + // Pad op. + TF::PadOp pad_op; + ElementsAttr pad_attr; + if (llvm::isa(consumer_op)) { + pad_op = llvm::cast(consumer_op); + if (!pad_op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + pad_op, "result for current op has more than 1 use"); + } + std::pair maybeConsumer = + tryGetDirectConsumerOp(consumer_op); + if (failed(maybeConsumer.first)) { + return maybeConsumer.first; + } + consumer_op = maybeConsumer.second; + if (!matchPattern(pad_op.getPaddings(), m_Constant(&pad_attr))) { + // If the padding value isn't constant, we can't determine the padding + // scheme for Conv2D below, in this case just reject the pattern. + return rewriter.notifyMatchFailure( + pad_op, "PadOp's padding value isn't constant"); + } + } + + // BatchToSpaceND + BiasAdd. + TF::BatchToSpaceNDOp bts_op; + TF::BiasAddOp biasadd_op; + bool final_op_is_bts = true; + if (llvm::isa(consumer_op)) { + // Must be BiasAdd + BatchToSpaceND. + biasadd_op = llvm::cast(consumer_op); + if (!biasadd_op.getResult().hasOneUse()) { + return rewriter.notifyMatchFailure( + biasadd_op, "result for current op has more than 1 use"); + } + std::pair maybeConsumer = + tryGetDirectConsumerOp(consumer_op); + if (failed(maybeConsumer.first)) { + return maybeConsumer.first; + } + if (!llvm::isa(maybeConsumer.second)) { + return rewriter.notifyMatchFailure( + consumer_op, "op's next node isn't BatchToSpaceND op"); + } + consumer_op = maybeConsumer.second; + bts_op = llvm::cast(consumer_op); + } else if (llvm::isa(consumer_op)) { + // BatchToSpaceND + (optional) BiasAdd. + bts_op = llvm::cast(consumer_op); + std::pair maybeConsumer = + tryGetDirectConsumerOp(consumer_op); + Operation* tmp = maybeConsumer.second; + if (tmp && llvm::isa(tmp)) { + consumer_op = tmp; + biasadd_op = llvm::cast(consumer_op); + final_op_is_bts = false; + } + } else { + return rewriter.notifyMatchFailure( + consumer_op, "next op is neither BiasAdd nor BatchToSpaceND"); + } + + std::optional dilations_attr = ExtractDilationsAttrFromBlockShape( + stb_op.getBlockShape(), bts_op.getBlockShape(), expand_axis, rewriter); + if (!dilations_attr.has_value()) { + return rewriter.notifyMatchFailure(op, "failed to extract dilation rate"); + } + + if (expand_op) { + if (mlir::dyn_cast(stb_op.getInput().getType()) == + nullptr) { + return rewriter.notifyMatchFailure( + stb_op, "SpaceToBatchND op's input should have RankedTensorType"); + } + } + + // TODO(b/149936532): Check that the input width & height are multiples of + // dilation rate. + // TF python library will rewrite dilated conv to + // "SpaceToBatch->Conv->BatchToSpace" pattern, and the Conv in the middle + // always has 'VALID' padding. The padding tensor in `SpaceToBatch` has two + // parts of contributions, one is to reduce padding of CONV from 'SAME' to + // 'VALID', and another is to make input shape multiples of dilation rate. The + // first part of padding, which is also called `base_padding` will be used + // here to determine if the original padding format is 'SAME' or 'VALID'. + // According to the following formula we will compute the `base_padding` if + // it's a constant. Basically, `paddings` tensor in `SpaceToBatch` and `crops` + // tensor in `BatchToSpace` must satisfy the following: + // paddings[i, 0] = base_paddings[i, 0]. 
+ // 0 <= paddings[i, 1] - base_paddings[i, 1] < block_shape[i] + // (input_shape[i] + paddings[i, 0] + paddings[i, 1]) % block_shape[i] == 0. + // crops[i, 0] = 0. + // crops[i, 1] = paddings[i, 1] - base_paddings[i, 1]. + + // If `paddings` - `crops` != 0, this means that `base_paddings` != 0, which + // tells us the original padding is 'SAME' (with one caveat presented below). + // Here we need to reset the padding back to `SAME` if `base_padding` + // != 0. + // TODO(b/149936532): We might not simply rely on `paddings - crops != 0` to + // determine the original padding format. For example, users can build + // arbitrary valid examples of `STB->Conv->BTS` which doesn't represent a + // dilated conv, hence we shouldn't pattern match here. Instead, we need to + // check values of `paddings` and `crops` to make sure it really stands for + // a dilated conv. + auto stb_paddings = stb_op.getPaddings(); + auto bts_crops = bts_op.getCrops(); + ElementsAttr stb_paddings_attr, bts_crops_attr; + if (!matchPattern(stb_paddings, m_Constant(&stb_paddings_attr)) || + !matchPattern(bts_crops, m_Constant(&bts_crops_attr))) { + return rewriter.notifyMatchFailure( + op, + "either SpaceToBatchND or BatchToSpaceND " + "doesn't have constant padding/crops value"); + } + if (stb_paddings_attr.getType() != bts_crops_attr.getType()) { + return rewriter.notifyMatchFailure( + stb_op, + "SpaceToBatchND op's padding doesn't have same shape/type with " + "BatchToSpaceND op's crops"); + } + int64_t m = stb_paddings_attr.getShapedType().getDimSize(0); + // padding - crop. + for (uint64_t i = 0; i < m; ++i) { + for (uint64_t j = 0; j < 2; ++j) { + // `crops` tensor has shape [M, 2], crops[i] = [crop_start, crop_end] + // specifies the amount to crop from input dimension i + 1. If the input + // of `BatchToSpaceND` has been padded explicitly, then we need to + // take into account the additional padding when determining the padding + // scheme for `Conv2D`. + int64_t addtional_pad = + pad_attr ? pad_attr.getValues()[{i + 1, j}].getSExtValue() : 0; + if (stb_paddings_attr.getValues()[{i, j}].getSExtValue() + + addtional_pad != + bts_crops_attr.getValues()[{i, j}].getSExtValue()) { + op->setAttr("padding", rewriter.getStringAttr("SAME")); + break; + } + } + } + + // Set dilations + op->setAttr("dilations", dilations_attr.value()); + + if (expand_op) { + // If there is `expand_op`, we need to rewire the inputs to bypass the + // `SpaceToBatch`, `BatchToSpace` and `Pad` op. E.g, turning + // 'SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> BatchToSpaceND -> + // BiasAdd' to 'Expand -> Conv2D ->Squeeze -> BiasAdd'. + + // Connect `expand_op` with the input of `stb_op`. + expand_op.setOperand(0, stb_op.getInput()); + // Calculate the shape for expand. + auto input_shape = + mlir::cast(stb_op.getInput().getType()).getShape(); + SmallVector expand_shape(input_shape.begin(), + input_shape.end()); + expand_shape.insert(expand_shape.begin() + expand_axis, 1); + + auto expand_result_type = RankedTensorType::get( + expand_shape, getElementTypeOrSelf(stb_op.getInput())); + expand_op.getResult().setType(expand_result_type); + + // Update the conv op's output shape. 
+ auto bts_output_shape = + mlir::cast(bts_op.getOutput().getType()).getShape(); + SmallVector conv_result_shape(bts_output_shape.begin(), + bts_output_shape.end()); + conv_result_shape.insert(conv_result_shape.begin() + expand_axis, 1); + auto conv_result_type = RankedTensorType::get( + conv_result_shape, getElementTypeOrSelf(stb_op.getInput())); + op.getResult().setType(conv_result_type); + + squeeze_op.getResult().setType(bts_op.getOutput().getType()); + + // Connect `biasadd_op` with the output of `squeeze_op`. + if (biasadd_op) { + biasadd_op.setOperand(0, squeeze_op.getOutput()); + biasadd_op.getOutput().setType(squeeze_op.getOutput().getType()); + } + } else { + if (biasadd_op) biasadd_op.setOperand(0, op.getOutput()); + op.setOperand(0, stb_op.getInput()); + op.getResult().setType(bts_op.getResult().getType()); + } + + if (final_op_is_bts) { + if (bts_op.getInput().getDefiningOp()) { + bts_op.getResult().replaceAllUsesWith(pad_op.getInput()); + } else { + bts_op.getResult().replaceAllUsesWith(bts_op.getInput()); + } + } + + stb_op.getResult().dropAllUses(); + return success(); +} + +template +std::optional +ConvertTFDilatedConvOp::ExtractDilationsAttrFromBlockShape( + Value stb_block_shape, Value bts_block_shape, int64_t expand_axis, + PatternRewriter& rewriter) const { + ElementsAttr stb_bs_attr, bts_bs_attr; + if (!matchPattern(stb_block_shape, m_Constant(&stb_bs_attr)) || + !matchPattern(bts_block_shape, m_Constant(&bts_bs_attr))) { + // Returns failure status if block_shape is not a constant. + return {}; + } + // Check that the block_shape of `stb_op` and `bts_op` are equal. + if (stb_bs_attr.getNumElements() != bts_bs_attr.getNumElements()) return {}; + for (uint64_t i = 0, end = stb_bs_attr.getNumElements(); i < end; ++i) { + if (stb_bs_attr.getValues()[i] != + bts_bs_attr.getValues()[i]) + return {}; + } + + int dilation_h_factor = -1, dilation_w_factor = -1; + // Set dilation factor. + if (stb_bs_attr.getNumElements() >= 2) { + dilation_h_factor = stb_bs_attr.getValues()[0].getSExtValue(); + dilation_w_factor = stb_bs_attr.getValues()[1].getSExtValue(); + } else if (stb_bs_attr.getNumElements() == 1) { + // For 1d conv, `tf.nn.convolution` expands NWC to NHWC format after + // `SpaceToBatchND`. Therefore, `block_shape` of `stb_op` only has one + // dilation factor of W dim, and dilation factor of H dim is set to 1. + if (expand_axis == 1) { + // NWC -> NHWC + dilation_h_factor = 1; + dilation_w_factor = stb_bs_attr.getValues()[0].getSExtValue(); + } else if (expand_axis == 2) { + // NHC -> NHWC + dilation_h_factor = stb_bs_attr.getValues()[0].getSExtValue(); + dilation_w_factor = 1; + } + } + + if (dilation_h_factor == -1 || dilation_w_factor == -1) { + return {}; + } + + return rewriter.getI64ArrayAttr({1, dilation_h_factor, dilation_w_factor, 1}); +} + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_DILATED_CONV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.h new file mode 100644 index 00000000..e4530480 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
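The 'SAME' vs 'VALID' decision in the dilated-conv rewrite above reduces to comparing the SpaceToBatchND paddings (plus any explicit Pad amounts) against the BatchToSpaceND crops: any mismatch means `base_paddings` was non-zero, i.e. the original convolution used 'SAME' padding. A plain-integer sketch of that check, with shapes and names that are illustrative only:

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

namespace padding_sketch {

// stb_paddings, bts_crops, and extra_pad are all [M][2] tables, mirroring the
// `paddings`, `crops`, and explicit Pad amounts discussed above.
inline std::string InferOriginalPadding(
    const std::vector<std::vector<int64_t>>& stb_paddings,
    const std::vector<std::vector<int64_t>>& bts_crops,
    const std::vector<std::vector<int64_t>>& extra_pad) {
  for (std::size_t i = 0; i < stb_paddings.size(); ++i) {
    for (std::size_t j = 0; j < 2; ++j) {
      if (stb_paddings[i][j] + extra_pad[i][j] != bts_crops[i][j]) {
        return "SAME";  // base_paddings != 0, so padding must be restored.
      }
    }
  }
  return "VALID";
}

}  // namespace padding_sketch
```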
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LIFT_TFLITE_FLEX_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LIFT_TFLITE_FLEX_OPS_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// Creates an instance of the lift TFLite Flex ops pass that lifts TFLite Flex +// ops into TF dialect operations. +std::unique_ptr> CreateLiftTfliteFlexOpsPass(); + +void AddLiftTfliteFlexOpsPatterns(MLIRContext *context, + RewritePatternSet &patterns); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LIFT_TFLITE_FLEX_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.h new file mode 100644 index 00000000..85fffcf2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.h @@ -0,0 +1,55 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
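`AddLiftTfliteFlexOpsPatterns` declared above populates a `RewritePatternSet`, which is normally handed to MLIR's greedy rewrite driver inside a pass. A hedged usage sketch follows; the driver function name can vary across MLIR revisions, and the wrapper function here is purely illustrative:

```cpp
#include <utility>

#include "mlir/Dialect/Func/IR/FuncOps.h"                // from @llvm-project
#include "mlir/IR/PatternMatch.h"                        // from @llvm-project
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.h"

// Illustrative only: apply the lift-flex-ops patterns to a single function.
inline void RunLiftFlexOpsPatterns(mlir::func::FuncOp func) {
  mlir::RewritePatternSet patterns(func.getContext());
  mlir::TFL::AddLiftTfliteFlexOpsPatterns(func.getContext(), patterns);
  // Greedy driver; ignore the "did anything converge" result for brevity.
  (void)mlir::applyPatternsAndFoldGreedily(func, std::move(patterns));
}
```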
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LOWER_QUANT_ANNOTATIONS_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LOWER_QUANT_ANNOTATIONS_HELPER_H_ + +#include + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir::TFL { + +LogicalResult FillCompositeParams(stablehlo::CompositeOp op, + SmallVector& scales, + SmallVector& zero_points, + int& num_bits, bool& is_signed); + +LogicalResult GetStorageParams(unsigned num_bits, bool narrow_range, + bool is_signed, MLIRContext* ctx, + Type& storage_type, int64_t& qmin, + int64_t& qmax); + +Type GetPerTensorQuantizedTensorType(Builder& builder, double scale, + int64_t zero_point, Type expressed_type, + int num_bits, Location loc, + bool narrow_range, bool is_signed); + +Type GetPerAxisQuantizedTensorType(Builder& builder, + SmallVector scales, + SmallVector zero_points, + int32_t quantized_dimension, + Type expressed_type, int num_bits, + Location loc, bool narrow_range, + bool is_signed); + +} // namespace mlir::TFL +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_LOWER_QUANT_ANNOTATIONS_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h new file mode 100644 index 00000000..c81548b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h @@ -0,0 +1,68 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BATCH_MATMUL_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BATCH_MATMUL_PASS_H_ + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" + +namespace mlir { +namespace TFL { + +// Optimize FC with BatchMatmul within the TensorFlow Lite dialect. 
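One equivalence typically exploited when folding a (Batch)MatMul into a FullyConnected is that a matmul against a constant rank-2 weight computes the same result as a fully connected layer whose filter is that weight transposed. A small numeric sketch of that fact, independent of the pass's actual pattern set:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

namespace bmm_sketch {

using Matrix = std::vector<std::vector<float>>;

inline Matrix MatMul(const Matrix& a, const Matrix& b) {
  Matrix out(a.size(), std::vector<float>(b[0].size(), 0.0f));
  for (std::size_t i = 0; i < a.size(); ++i)
    for (std::size_t k = 0; k < b.size(); ++k)
      for (std::size_t j = 0; j < b[0].size(); ++j)
        out[i][j] += a[i][k] * b[k][j];
  return out;
}

// FullyConnected-style product: out = x * filter^T.
inline Matrix FullyConnected(const Matrix& x, const Matrix& filter) {
  Matrix out(x.size(), std::vector<float>(filter.size(), 0.0f));
  for (std::size_t i = 0; i < x.size(); ++i)
    for (std::size_t j = 0; j < filter.size(); ++j)
      for (std::size_t k = 0; k < x[0].size(); ++k)
        out[i][j] += x[i][k] * filter[j][k];
  return out;
}

inline void CheckEquivalence() {
  Matrix x = {{1, 2, 3}, {4, 5, 6}};
  Matrix b = {{1, 0}, {0, 1}, {1, 1}};   // constant rank-2 weight
  Matrix b_t = {{1, 0, 1}, {0, 1, 1}};   // its transpose, used as the filter
  assert(MatMul(x, b) == FullyConnected(x, b_t));
}

}  // namespace bmm_sketch
```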
+ +class OptimizeBatchMatmulPass + : public TFL::Pass { + public: + OptimizeBatchMatmulPass() = default; + OptimizeBatchMatmulPass(const OptimizeBatchMatmulPass &other) {} + + void runOnOperation() final; + + /// Returns the command-line argument attached to this pass. + static llvm::StringRef GetArgument() { return "tfl-optimize-batch-matmul"; } + + static llvm::StringRef GetDescription() { + return "Optimize FC with BatchMatmul within the TensorFlow Lite dialect."; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "OptimizeBatchMatmulPass"; } + + /// Return the dialect that must be loaded in the context before this pass. + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + /// Explicitly declare the TypeID for this class. We declare an explicit + /// private instantiation because Pass classes should only be visible by the + /// current library. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeBatchMatmulPass) +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BATCH_MATMUL_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h new file mode 100644 index 00000000..f13048a1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h @@ -0,0 +1,53 @@ + +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" + +namespace mlir { +namespace TFL { + +// Pass to optimize explicit broadcasting-like patterns. 
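Explicit broadcasts are often redundant because TFLite's binary element-wise ops already broadcast implicitly under the usual right-aligned rule; when a broadcast op only produces the shape that implicit broadcasting would yield anyway, it can frequently be folded away. A sketch of that shape rule (illustrative, not this pass's implementation):

```cpp
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

namespace broadcast_sketch {

// Right-aligned broadcast of two shapes; nullopt if they are incompatible.
inline std::optional<std::vector<int64_t>> BroadcastShape(
    std::vector<int64_t> a, std::vector<int64_t> b) {
  if (a.size() < b.size()) a.insert(a.begin(), b.size() - a.size(), 1);
  if (b.size() < a.size()) b.insert(b.begin(), a.size() - b.size(), 1);
  std::vector<int64_t> out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (a[i] == b[i] || b[i] == 1) {
      out[i] = a[i];
    } else if (a[i] == 1) {
      out[i] = b[i];
    } else {
      return std::nullopt;  // incompatible dimensions
    }
  }
  return out;
}

}  // namespace broadcast_sketch
```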
+class OptimizeBroadcastLikePass + : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeBroadcastLikePass) + + OptimizeBroadcastLikePass() = default; + OptimizeBroadcastLikePass(const OptimizeBroadcastLikePass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "OptimizeBroadcastLikePass"; } + static llvm::StringRef GetArgument() { return "tfl-optimize-broadcast-like"; } + static llvm::StringRef GetDescription() { + return "Pass optimizing explicit broadcasting-like patterns."; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass.h new file mode 100644 index 00000000..86e47726 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass.h @@ -0,0 +1,56 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" + +namespace mlir { +namespace TFL { + +// Optimize TFLite operations in functions. +class OptimizePass + : public Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizePass) + + OptimizePass() = default; + OptimizePass(const OptimizePass &) {} + explicit OptimizePass(const mlir::detail::PassOptions &options) + : Pass(options) {} + + /// Returns the command-line argument attached to this pass. + static llvm::StringRef GetArgument() { return "tfl-optimize"; } + + static llvm::StringRef GetDescription() { + return "Optimize within the TensorFlow Lite dialect"; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "OptimizePass"; } + + void runOnOperation() override; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h new file mode 100644 index 00000000..915dc380 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h @@ -0,0 +1,43 @@ +/* Copyright 2024 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_OPTIONS_H_ + +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +//////////////////////////////////////////////////////////////////////////////// +// Pass Options +//////////////////////////////////////////////////////////////////////////////// + +struct OptimizePassOptions : public mlir::detail::PassOptions { + mlir::detail::PassOptions::Option enable_canonicalization{ + *this, "enable-canonicalization", + llvm::cl::desc("Enable canonicalization in the optimize pass"), + llvm::cl::init(true)}; + mlir::detail::PassOptions::Option disable_fuse_mul_and_fc{ + *this, "disable-fuse-mul-and-fc", + llvm::cl::desc("Disable fuse mul and fc in the optimize pass"), + llvm::cl::init(false)}; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_PASS_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass.h new file mode 100644 index 00000000..f2eed518 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass.h @@ -0,0 +1,113 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h" + +// Forward declaration for the visitor interface +// class PassOptionsVisitor; + +namespace mlir { +namespace TFL { + +// Interface for setting options for TFLite Converter Pass/Pipeline Options. 
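The interplay between this interface and the `PassOptionsSetter` declared elsewhere in this diff is a plain overload-based visitor: each pass hands its options object to the setter, and overload resolution picks the matching `SetOptions`. A reduced stand-alone sketch with generic stand-in names (not the converter's real types):

```cpp
namespace visitor_sketch {

struct OptionsA { bool enable_x = false; };
struct OptionsB { int level = 0; };

// Plays the role of PassOptionsSetter: one overload per options type.
struct OptionsSetter {
  void SetOptions(OptionsA& o) const { o.enable_x = true; }
  void SetOptions(OptionsB& o) const { o.level = 2; }
};

// Each pass "accepts" the visitor by exposing its own options object.
struct PassWithA {
  OptionsA options;
  void ApplyOptionsVisitor(const OptionsSetter& v) { v.SetOptions(options); }
};

struct PassWithB {
  OptionsB options;
  void ApplyOptionsVisitor(const OptionsSetter& v) { v.SetOptions(options); }
};

}  // namespace visitor_sketch
```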
+class MutableOptionsPass { + public: + virtual ~MutableOptionsPass() = default; + virtual void ApplyOptionsVisitor(const PassOptionsSetter &visitor) = 0; +}; + +// CRTP Class to ensure that the derived passes implement a Options struct +template +class Pass : public PassWrapper, + mlir::OperationPass>, + public MutableOptionsPass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(Pass); + + Pass() = default; + Pass(const Pass &pass) { + static_cast(this)->GetOptions().copyOptionValuesFrom( + pass.GetOptions()); + } + explicit Pass(const DerivedPassOptions &options) { + static_cast(this)->GetOptions().copyOptionValuesFrom( + options); + } + + explicit Pass(const mlir::detail::PassOptions &options) { + static_cast(this)->GetOptions().copyOptionValuesFrom( + options); + } + + /// Functions to satisfy the mlir::Pass interface + llvm::StringRef getArgument() const override { + return DerivedPass::GetArgument(); + } + + llvm::StringRef getDescription() const override { + return DerivedPass::GetDescription(); + } + + llvm::StringRef getName() const override { return DerivedPass::GetName(); } + + /// Support isa/dyn_cast functionality for the derived pass class. + static bool classof(const ::mlir::Pass *pass) { + return pass->getTypeID() == ::mlir::TypeID::get(); + } + + /// A clone method to create a copy of this pass. + std::unique_ptr<::mlir::Pass> clonePass() const override { + auto pass = + std::make_unique(*static_cast(this)); + pass->GetOptions().copyOptionValuesFrom(GetOptions()); + return std::move(pass); + } + void runOnOperation() override {} + + // ApplyOptionsVisitor method to `accept` the visitor + void ApplyOptionsVisitor(const PassOptionsSetter &visitor) override { + visitor.SetOptions(GetOptions()); + } + + protected: + DerivedPassOptions &GetOptions() { + return static_cast(this)->options_; + } + + const DerivedPassOptions &GetOptions() const { + return static_cast(this)->options_; + } + + private: + DerivedPassOptions options_; +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options.h new file mode 100644 index 00000000..7f5bb198 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options.h @@ -0,0 +1,27 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_H_ + +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace TFL { +struct EmptyPassOptions : public mlir::detail::PassOptions {}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h new file mode 100644 index 00000000..534b1402 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h @@ -0,0 +1,37 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_SETTER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_SETTER_H_ + +namespace mlir { +namespace TFL { + +class OptimizePassOptions; +class VariableFreezingPipelineOptions; +class EmptyPassOptions; + +// Interface for setting options for TFLite Converter Pass/Pipeline Options. +class PassOptionsSetter { + public: + virtual ~PassOptionsSetter() = default; + virtual void SetOptions(OptimizePassOptions& options) const = 0; + virtual void SetOptions(VariableFreezingPipelineOptions& options) const = 0; + virtual void SetOptions(EmptyPassOptions& options) const = 0; +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_OPTIONS_SETTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h new file mode 100644 index 00000000..43f064c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h @@ -0,0 +1,98 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_REGISTRY_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_REGISTRY_UTILS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" +#include "tensorflow/compiler/mlir/lite/transforms/pipeline.h" + +namespace mlir { +namespace TFL { + +//////////////////////////////////////////////////////////////////////////////// +// Pass, Pipeline and Options Creation Utilities +//////////////////////////////////////////////////////////////////////////////// + +template +std::unique_ptr Create() { + if constexpr (std::is_base_of_v, + PassType>) { + return std::make_unique>(); + } else { + return std::make_unique(); + } +} + +template +std::unique_ptr Create() { + if constexpr (std::is_base_of_v, + PassType>) { + return std::make_unique>(); + } else { + return std::make_unique(PassOptionsType()); + } +} + +template +std::unique_ptr Create(const mlir::detail::PassOptions& options) { + return std::make_unique(options); +} + +//////////////////////////////////////////////////////////////////////////////// +// Registration Utilities +//////////////////////////////////////////////////////////////////////////////// + +// Utility to register a pass without options. +template +void Register() { + PassRegistration pass([] { return Create(); }); +} + +// Utility to register a pass with options. +template +void Register() { + auto pass_argument = PassType::GetArgument(); + auto pass_description = PassType::GetDescription(); + + if constexpr (std::is_base_of_v, + PassType>) { + // PassType is derived from PipelinePass, proceed with registration + // of the pipeline. + PassPipelineRegistration( + pass_argument, pass_description, + [](OpPassManager& pm, const PassOptionsType& options) { + auto pipeline = PassType(); + pipeline.AddPasses(); + pipeline.GetPipeline(pm, options); + }); + } else { + PassPipelineRegistration( + pass_argument, pass_description, + [](OpPassManager& pm, const PassOptionsType& options) { + pm.addPass(std::move(Create(options))); + }); + } +} + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASS_REGISTRY_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/passes.h new file mode 100644 index 00000000..4d8eccca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -0,0 +1,356 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
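Typical usage of the `Create<>`/`Register<>` helpers above looks roughly like the sketch below. The pass and options types are the ones declared in this diff, but the exact template instantiations are an assumption based on how the registration block in passes.h uses these utilities:

```cpp
#include "tensorflow/compiler/mlir/lite/transforms/optimize_pass.h"
#include "tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h"
#include "tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h"

// Makes "tfl-optimize" available to command-line pass-pipeline parsing
// (normally done once at tool start-up).
inline void RegisterTflOptimizeOnCommandLine() {
  mlir::TFL::Register<mlir::TFL::OptimizePass,
                      mlir::TFL::OptimizePassOptions>();
}

// Programmatic construction, mirroring CreateOptimizePass() in passes.h.
inline auto MakeTflOptimizePass() {
  return mlir::TFL::Create<mlir::TFL::OptimizePass>();
}
```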
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASSES_H_ + +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/transforms/canonicalize_boundary_value_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h" +#include "tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/tf_legalizations/analyze_variables_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/split_merged_operands_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" + +namespace mlir { +namespace quant { +class QuantDialect; +} +namespace quantfork { +class QuantizationForkDialect; +} +namespace mhlo { +class MhloDialect; +} +namespace TF { +class TensorFlowDialect; +} +namespace TFL { +class TFLDialect; +typedef TFLDialect TensorFlowLiteDialect; +} // namespace TFL +namespace func { +class FuncOp; +} +class ModuleOp; +template +class OperationPass; +class Type; + +namespace TFL { + +//////////////////////////////////////////////////////////////////////////////// +// Forward declarations +//////////////////////////////////////////////////////////////////////////////// + +struct OptimizePassOptions; + +//////////////////////////////////////////////////////////////////////////////// +// Utilities for backward compatibility +//////////////////////////////////////////////////////////////////////////////// + +// Creates an instance of the TensorFlow Lite dialect LegalizeTF pass. +// When the given run_tfl_runtime_verification value is true, it will check each +// TFL builtin op towards the TFL runtime capability and the incompatible TF ops +// will be left in the graph without getting legalized. If `preserve_assert_op` +// is true, the TF::AssertOp will not be removed. +std::unique_ptr> CreateLegalizeTFPass( + bool run_tfl_runtime_verification, bool preserve_assert_op = false); +std::unique_ptr> CreateLegalizeTFPass(); + +// Creates an instance of the TensorFlow Lite dialect Optimize pass. +inline std::unique_ptr CreateOptimizePass() { + return Create(); +} + +// Creates an instance of the Tensorflow Lite batch matmul Optimize pass. +inline std::unique_ptr CreateOptimizeBatchMatmulPass() { + return Create(); +} + +// Creates an instance of the TensorFlow Lite dialect PrepareTF pass. 
+std::unique_ptr> CreatePrepareTFPass( + bool unfold_batch_matmul, bool allow_bf16_and_f16_type_legalization, + bool use_fake_quant_num_bits = false); +std::unique_ptr> CreatePrepareTFPass(); + +// Creates an instance of the TensorFlow Lite dialect LowerStaticTensorList +// pass. +std::unique_ptr> CreateLowerStaticTensorListPass( + bool allow_tensorlist_pass_through, bool default_to_single_batch, + bool enable_dynamic_update_slice); + +std::unique_ptr> CreateLowerStaticTensorListPass(); + +// Creates an instance of the TensorFlow Lite dialect Quantize pass. +// Use quant_specs.ops_blocklist and quant_specs.nodes_blocklist if possible +// as they are now structure variables of QuantizationSpecs. +std::unique_ptr> CreateQuantizePass( + const quant::QuantizationSpecs& quant_specs, + const absl::flat_hash_set& ops_blocklist = {}, + const absl::flat_hash_set& nodes_blocklist = {}); + +std::unique_ptr> CreateDefaultQuantizePass(); + +std::unique_ptr> CreateLowerQuantAnnotationsPass(); + +// Overloading of CreateQuantizePass which takes only necessary flags to reduce +// the binary size. +std::unique_ptr> CreateQuantizePass( + bool verify_numeric = false, bool whole_model_verify = false, + bool legacy_float_scale = false, + const absl::flat_hash_set& ops_blocklist = {}, + const absl::flat_hash_set& nodes_blocklist = {}); + +// Creates an instance of the TensorFlow Lite dialect PrepareQuantize pass. +std::unique_ptr> CreatePrepareQuantizePass( + const quant::QuantizationSpecs& quant_specs); + +std::unique_ptr> CreatePrepareQuantizePass(); + +// Creates an instance of the TensorFlow Lite dialect +// PrepareDynamicRangeQuantize pass. +std::unique_ptr> +CreatePrepareDynamicRangeQuantizePass( + const quant::QuantizationSpecs& quant_specs); + +std::unique_ptr> +CreatePrepareDynamicRangeQuantizePass(); + +// Creates an instance of the TensorFlow Lite dialect PostQuantize pass. +std::unique_ptr> CreatePostQuantizePass(); +std::unique_ptr> CreatePostQuantizePass( + bool emit_quant_adaptor_ops, const quant::CustomOpMap& custom_op_map = {}); + +// Creates an instance of the TensorFlow Lite dialect QuantizeVariables pass. +std::unique_ptr> CreatePrepareQuantizeVariablesPass(); + +// Creates an instance of the TensorFlow Lite pass that decomposes hybrid +// quantization patterns to the same dense operation with tfl dequantization +// and quantization patterns. +std::unique_ptr> +CreateDecomposeHybridQuantizationPass(); + +// Creates an instance of the TensorFlow Lite optimize op order pass. +std::unique_ptr> CreateOptimizeOpOrderPass(); + +// Creates an instance of the TensorFlow Lite dialect TrimFunctions +// pass. +std::unique_ptr> CreateTrimFunctionsPass(); + +std::unique_ptr> CreateTrimFunctionsPass( + const std::vector& trim_funcs_allowlist); + +// Creates an instance of the TensorFlow Lite dialect PrepareCompositeFunctions +// pass. +std::unique_ptr> CreatePrepareCompositeFunctionsPass(); + +// Creates an instance of the TensorFlow Lite dialect SplitMergedOperandsPass. +inline std::unique_ptr CreateSplitMergedOperandsPass() { + return Create(); +} + +// Creates an instance of the TensorFlow Lite dialect OptimizeFunctionalOpsPass. +std::unique_ptr> CreateOptimizeFunctionalOpsPass(); + +std::unique_ptr> CreateModifyIONodesPass( + mlir::Type input_type, mlir::Type output_type); + +std::unique_ptr> CreateModifyIONodesPass(); + +// Creates an instance of the TensorFlow Lite dialect PostQuantizeRemoveQDQ +// pass. 
+std::unique_ptr> CreatePostQuantizeRemoveQDQPass(); + +// Creates an instance of the TensorFlow Lite dialect pass to add default +// quantization parameters. +std::unique_ptr> CreateDefaultQuantParamsPass( + double default_min, double default_max, bool is_signed); + +std::unique_ptr> CreateDefaultQuantParamsPass(); + +// Creates an instance of the IdentifyDilatedConvPass. +std::unique_ptr> CreateIdentifyDilatedConvPass(); + +// Creates function pass to legalize TF While to TFL While. +std::unique_ptr> CreateLegalizeTFWhilePass(); + +// Legalize tflite flex ops to TF ops. +std::unique_ptr> CreateLiftTfliteFlexOpsPass(); + +// Creates an instance of the TensorFlow Lite dialect WhileOp outline pass. +inline std::unique_ptr CreateWhileOutlinePass() { + return Create(); +} + +// Creates an instance of the TensorFlow Lite dialect IfOp outline pass. +std::unique_ptr> CreateIfOutlinePass(); + +// Creates a pass to remove operands of TFL WhileOp without changing outcomes. +std::unique_ptr> CreateReduceWhileOperandsPass(); + +// Verifies runtime constraints. +std::unique_ptr> CreateRuntimeVerifyPass(); + +// Creates raise custom ops pass, which legalize custom ops to TFL::CustomOp +std::unique_ptr> CreateRaiseCustomOpsPass(); +std::unique_ptr> CreateRaiseCustomOpsPass( + const std::vector& target_ops); + +// Creates raise custom ops pass, which legalize custom ops to TFL::CustomOp +std::unique_ptr> CreateLowerCustomOpsPass(); + +// Inserts an TFL::CallOnce op when the tf_saved_model's session initialzer is +// given. +std::unique_ptr> +CreateInsertCallOnceOpFromSessionInitializerPass(); + +// Replace the tfl wrapped random function body with tfl.customOp. +std::unique_ptr> CreateLegalizeJaxRandomPass(); + +// Creates a pass which is responsible for legalizing TensorFlow variables to +// TensorFlow Lite variables. +std::unique_ptr> CreateLegalizeVariablesPass(); + +// Creates a pass which analyze the model whether it is safe to use +// native TFLite variables or not. +inline std::unique_ptr CreateAnalyzeVariablesPass() { + return Create(); +} + +// Creates a pass which is responsible for legalizing TensorFlow static hash +// tables to TensorFlow Lite hash tables. +std::unique_ptr> CreateLegalizeHashTablesPass(); + +// Creates get arithmetic count pass, which will calculate the arithmetic count +// for each ops. +std::unique_ptr> CreateGetArithmeticCountPass(); + +// Creates unfold large constant pass, which will replace large splat constant +// tensors with fill op. +inline std::unique_ptr CreateUnfoldLargeSplatConstantPass() { + return Create(); +} + +// Creates a pass which is responsible for unfreezing mutable global tensors. +inline std::unique_ptr CreateUnfreezeMutableGlobalTensorsPass() { + return Create(); +} + +// Creates a pass that adds control dependencies to keep the relative +// execution order of operations with side effects frozen. +std::unique_ptr> CreatePinOpsWithSideEffectsPass(); + +// Legalize TensorList Ops iff all of them are supported. +inline std::unique_ptr CreateLegalizeTensorListPass() { + return Create(); +} + +// Reduce the type precision of some tensor types if all values within that +// tensor are within the range of the reduced precision. +std::unique_ptr> CreateReduceTypePrecisionPass(); + +// Conservatively pushes transposes through element-wise ops to prepare +// so redundant ones may be grouped and removed. +inline std::unique_ptr CreatePushTransposeThroughEwisePass() { + return Create(); +} + +// Create a pass that canonicalize the boundary values. 
+inline std::unique_ptr CreateCanonicalizeBoundaryValuePass() { + return Create(); +} + +// Creates a pass that brings operations into the same order as graph_info.cc. +std::unique_ptr> +CreatePartitionedTopologicalSortPass(); + +#define GEN_PASS_DECL_DEFAULTQUANTPARAMSPASS +#define GEN_PASS_DECL_LEGALIZETFPASS +#define GEN_PASS_DECL_LOWERSTATICTENSORLISTPASS +#define GEN_PASS_DECL_MODIFYIONODESPASS +#define GEN_PASS_DECL_POSTQUANTIZEPASS +#define GEN_PASS_DECL_PREPARECOMPOSITEFUNCTIONSPASS +#define GEN_PASS_DECL_PREPAREDYNAMICRANGEQUANTIZEPASS +#define GEN_PASS_DECL_PREPAREQUANTIZEPASS +#define GEN_PASS_DECL_PREPARETFPASS +#define GEN_PASS_DECL_QUANTIZEPASS +#define GEN_PASS_DECL_RAISECUSTOMOPSPASS +#define GEN_PASS_DECL_TRIMFUNCTIONSPASS +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/lite/transforms/passes.h.inc" + +// Creates an instance of the TensorFlow Lite dialect LegalizeTF pass. +std::unique_ptr> CreateLegalizeTFPass( + const LegalizeTFPassOptions& options); + +// Creates an instance of the TensorFlow Lite dialect PrepareTF pass. +std::unique_ptr> CreatePrepareTFPass( + const PrepareTFPassOptions& options); + +// Creates an instance of the TensorFlow Lite dialect LowerStaticTensorList +// pass. +std::unique_ptr> CreateLowerStaticTensorListPass( + const LowerStaticTensorListPassOptions& options); + +// Creates raise custom ops pass, which legalize custom ops to TFL::CustomOp +std::unique_ptr> CreateRaiseCustomOpsPass( + const RaiseCustomOpsPassOptions& options); + +// Creates an instance of the TensorFlow Lite dialect pass to add default +// quantization parameters. +std::unique_ptr> CreateDefaultQuantParamsPass( + const DefaultQuantParamsPassOptions& options); + +inline void registerTensorFlowLitePasses() { + registerTensorFlowLiteTdPasses(); + // Register TFLite Converter Passes + Register(); + + // TF Legalization Passes + Register(); + Register(); + Register(); + + // TFL Optimization Passes + Register(); + Register(); + Register(); + Register(); + Register(); + Register(); + + // Other TFLite Passes + Register(); + Register(); +} + +} // namespace TFL + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pipeline.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pipeline.h new file mode 100644 index 00000000..f0420b11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/pipeline.h @@ -0,0 +1,173 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PIPELINE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PIPELINE_H_ + +#include +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h" + +namespace mlir { +namespace TFL { + +/// Pipeline is a base class for pipelines of passes. +/// +/// A pipeline is a collection of passes that are run in a specific order. The +/// pipeline can be configured with options that control which passes are +/// enabled and how they are run. +/// +/// To create a new pipeline, derive from this class and implement the +/// `AddPasses` method. This method should add passes to the pipeline using the +/// `AddPass` method. +/// +/// Example: +/// +/// ```cpp +/// class MyPipeline : public Pipeline { +/// public: +/// void AddPasses() override { +/// AddPass(); +/// AddPass(); +/// } +/// }; +/// ``` +template +class Pipeline { + public: + struct PipelineEntry { + std::unique_ptr pass; + std::function enable_condition; + }; + + Pipeline() = default; + virtual ~Pipeline() = default; + virtual void AddPasses() = 0; + + /// Function to force the derived pipeline to implement the metadata + // method. + llvm::StringRef getArgument() const { return DerivedPipeline::GetArgument(); } + + llvm::StringRef getDescription() const { + return DerivedPipeline::GetDescription(); + } + + llvm::StringRef getName() const { return DerivedPipeline::GetName(); } + + void GetPipeline(mlir::OpPassManager &pm, + const DerivedPipelineOptions &options) { + for (auto &&entry : passes_) { + if (entry.enable_condition(options)) { + pm.addPass(std::move(entry.pass)); + } + } + }; + + protected: + void AddPass( + std::unique_ptr pass, + std::function enable_condition) { + passes_.push_back({std::move(pass), enable_condition}); + } + + template + friend class PipelinePass; + + std::vector GetPasses() { + std::vector passes; + passes.reserve(passes_.size()); + for (auto &&entry : passes_) { + passes.push_back(entry.pass.get()); + } + return passes; + } + + private: + std::vector passes_; +}; + +/// PipelinePass is a wrapper class to run a pipeline of passes as a single +/// pass. This is an implementation detail of the pipelines mechanism in TFL +/// Converter framework. Users should not need to interact with this class +/// directly. 
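Building on the `MyPipeline` example in the doc comment above, a concrete pipeline with a conditionally enabled pass might look like the sketch below. The options struct and its flag are hypothetical, the `Pipeline<Derived, DerivedOptions>` template arguments and the `AddPass` signature are inferred from `PipelineEntry` and `GetPipeline` above:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Pass/PassOptions.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h"
#include "tensorflow/compiler/mlir/lite/transforms/optimize_pass.h"
#include "tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h"
#include "tensorflow/compiler/mlir/lite/transforms/pipeline.h"

// Hypothetical options for the example pipeline.
struct MyPipelineOptions : public mlir::detail::PassOptions {
  Option<bool> enable_batch_matmul{*this, "enable-batch-matmul",
                                   llvm::cl::init(true)};
};

class MyPipeline
    : public mlir::TFL::Pipeline<MyPipeline, MyPipelineOptions> {
 public:
  void AddPasses() override {
    // Always-on pass.
    AddPass(mlir::TFL::Create<mlir::TFL::OptimizePass>(),
            [](const MyPipelineOptions&) { return true; });
    // Pass gated on a pipeline option.
    AddPass(mlir::TFL::Create<mlir::TFL::OptimizeBatchMatmulPass>(),
            [](const MyPipelineOptions& o) { return o.enable_batch_matmul; });
  }
  static llvm::StringRef GetArgument() { return "my-example-pipeline"; }
  static llvm::StringRef GetDescription() { return "Example TFL pipeline."; }
  static llvm::StringRef GetName() { return "MyPipeline"; }
};
```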
+template +class PipelinePass + : public Pass, PipelineOptions> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PipelinePass); + + PipelinePass() { pipeline_->AddPasses(); }; + PipelinePass(const PipelinePass &) {}; + explicit PipelinePass(const PipelineOptions &options) + : Pass, PipelineOptions>( + options) { + pipeline_.AddPasses(); + }; + + std::unique_ptr<::mlir::Pass> clonePass() const override { + auto pass = std::make_unique>(); + pass->GetOptions().copyOptionValuesFrom(this->GetOptions()); + return std::move(pass); + } + + /// Function to satisfy the mlir::Pass interface + static llvm::StringRef GetArgument() { return Pipeline::GetArgument(); } + + static llvm::StringRef GetDescription() { return Pipeline::GetDescription(); } + + static llvm::StringRef GetName() { return Pipeline::GetName(); } + + void runOnOperation() final { + ModuleOp module_op = this->getOperation(); + + // Create a temporary OpPassManager to run the passes. Nesting is set to be + // implicit to allow for the nesting to happen under-the-hood. + OpPassManager pm(ModuleOp::getOperationName(), + OpPassManager::Nesting::Implicit); + pipeline_->GetPipeline(pm, this->GetOptions()); + if (failed(this->runPipeline(pm, module_op))) { + this->signalPassFailure(); + } + }; + + void ApplyOptionsVisitor(const PassOptionsSetter &visitor) final { + visitor.SetOptions(this->GetOptions()); + + for (auto &&pass : pipeline_->GetPasses()) { + if (auto *derived_pass = dynamic_cast(pass)) { + derived_pass->ApplyOptionsVisitor(visitor); + } + } + } + + private: + std::unique_ptr pipeline_ = std::make_unique(); +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PIPELINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h new file mode 100644 index 00000000..824976e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h @@ -0,0 +1,668 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Transform pass for LSTMs. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PREPARE_QUANTIZE_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PREPARE_QUANTIZE_HELPER_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" +#include "tensorflow/core/framework/types.pb.h" + +//===----------------------------------------------------------------------===// +// The prepare-quantize Pass for LSTM. +// +namespace mlir { +namespace TFL { + +constexpr double power_of_two_scale = 32768.0; + +// Same with the ordering of //tensorflow/compiler/mlir/lite/ir/tfl_ops.td +constexpr const char* intermediate_attributes[] = { + "input_to_input_intermediate", "input_to_forget_intermediate", + "input_to_cell_intermediate", "input_to_output_intermediate", + "effective_hidden_scale_intermediate"}; + +// Calculates the minimum power of two that is not less than the value. +double PowerOfTwoBound(double value); + +tensorflow::DataType GetQuantizedInferenceType(bool is_signed, + int activation_number_of_bits); + +// Returns the element type of LSTM's intermediate tensor designated by the +// index. 
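`PowerOfTwoBound` is only declared in this header; its definition lives in the corresponding .cc file, which is not part of this patch. As a rough illustration of the stated contract (the smallest power of two not less than the input), one possible implementation is sketched below. It is a sketch under that assumption, not necessarily the implementation TensorFlow ships, and the handling of non-positive inputs is invented for the example.

#include <cmath>

// Sketch only: smallest power of two that is >= value.
inline double PowerOfTwoBoundSketch(double value) {
  if (value <= 0.0) return 1.0;  // assumption: clamp non-positive input
  int exponent = 0;
  // value == mantissa * 2^exponent, with mantissa in [0.5, 1).
  const double mantissa = std::frexp(value, &exponent);
  // Exact powers of two (mantissa == 0.5) are returned unchanged; otherwise
  // round the exponent up to the next power of two.
  return (mantissa == 0.5) ? value : std::ldexp(1.0, exponent);
}

For example, `PowerOfTwoBoundSketch(900.0) == 1024.0`. Further down in this header, the 16-bit `extend_to_power_of_two` path divides such a bound by 32768 (the `power_of_two_scale` constant above), so a tensor whose largest magnitude is 900 ends up with a scale of 1024 / 32768 = 2^-5.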
+template +inline QuantizedType GetIntermediateElementType(LstmOp op, int tensor_index) { + if (tensor_index < 0 || tensor_index > 4) return nullptr; + TypeAttr attr = op->template getAttrOfType( + intermediate_attributes[tensor_index]); + if (!attr) { + return nullptr; + } + return QuantizedType::getQuantizedElementType(attr.getValue()); +} + +namespace operator_property = ::tflite::optimize::operator_property; +using Q = quantfork::QuantizeCastOp; +using DQ = quantfork::DequantizeCastOp; + +template +LogicalResult GetLstmProperty(LstmOp op, + operator_property::OpVariant* lstm_variant, + operator_property::OperatorProperty* op_property, + int activation_number_of_bits = 8) { + if (llvm::isa(op.getOperation())) { + lstm_variant->op_code = tflite::BuiltinOperator_LSTM; + } else if (llvm::isa(op.getOperation())) { + lstm_variant->op_code = + tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM; + } else { + op.emitError("ConvertLstmStatsToQDQs pass only supports LSTMs."); + return failure(); + } + lstm_variant->use_projection = + !mlir::isa(op.getProjectionWeights().getType()); + lstm_variant->use_peephole = + !mlir::isa(op.getCellToOutputWeights().getType()); + lstm_variant->use_layer_norm = + !mlir::isa(op.getForgetLayerNormCoefficients().getType()); + + *op_property = operator_property::GetOperatorProperty( + *lstm_variant, activation_number_of_bits); + + // TODO(b/176258587) move this to operator_property.cc if this is needed in + // other components, too. + bool use_cifg = mlir::isa(op.getInputToInputWeights().getType()); + if (use_cifg) { + const absl::flat_hash_set cifg_non_inputs = {1, 5, 9, 12, 20}; + const int cifg_non_intermediate = 0; + op_property->inputs.erase( + std::remove_if( + op_property->inputs.begin(), op_property->inputs.end(), + [&](std::pair input) { + return cifg_non_inputs.find(input.first) != cifg_non_inputs.end(); + }), + op_property->inputs.end()); + op_property->intermediates.erase( + std::remove_if(op_property->intermediates.begin(), + op_property->intermediates.end(), + [&](std::pair + intermediate) { + return intermediate.first == cifg_non_intermediate; + }), + op_property->intermediates.end()); + } + return success(); +} + +template +class PrepareLstmOutputScale : public OpRewritePattern { + public: + explicit PrepareLstmOutputScale(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(SourceOp op, + PatternRewriter& rewriter) const override { + operator_property::OpVariant lstm_variant; + operator_property::OperatorProperty lstm_property; + + if (failed(GetLstmProperty(op, &lstm_variant, &lstm_property))) { + return failure(); + } + if (lstm_property.restrict_scale.size() != 1) { + op.emitError() << "The LSTM's operator property expects exactly one " + << "restrict scale requirement. Got " + << lstm_property.restrict_scale.size() + << " restrict scale requirements."; + return failure(); + } + + // Use same scale for input and output specified in restrict_scale. + const std::vector& tensors = lstm_property.restrict_scale[0]; + if (tensors.size() != 2) { + op.emitError( + "Unexpected restricted_scale from operator property." 
+ " Should only have a pair of indices."); + return failure(); + } + return processRestrictScale(op, tensors[0], tensors[1], rewriter); + } + + private: + // For LSTM's recurrent input activation and output, they are quantized with + // the collective range of both tensors, because theoretically the input + // activation value for the very first inference is not reflected in the + // output and the input activation is not captured. + LogicalResult processRestrictScale(SourceOp op, int input_index, + int output_index, + PatternRewriter& rewriter) const { + assert(output_index == 0); + if (!op.getResult().hasOneUse()) { + op.emitError() + << "output " << output_index + << " should have only one use, which should be quant.stats."; + return failure(); + } + + llvm::SmallVector stats_ops = { + llvm::dyn_cast_or_null( + op.getOperand(input_index).getDefiningOp()), + llvm::dyn_cast_or_null( + *op.getResult().getUsers().begin()), + }; + + if (!stats_ops[0] || !stats_ops[1]) { + return failure(); // Already converted to Q-DQ pair. + } + + llvm::SmallVector min_max_values; + + for (auto& stats_op : stats_ops) { + auto values = + mlir::dyn_cast(stats_op.getLayerStats()) + .getValues(); + min_max_values.insert(min_max_values.end(), values.begin(), values.end()); + } + + // min and max values of two stats are already the same. + if (min_max_values[0] == min_max_values[2] && + min_max_values[1] == min_max_values[3]) { + return failure(); + } + + mlir::ElementsAttr layer_stats = mlir::DenseFPElementsAttr::get( + mlir::RankedTensorType::get({2}, rewriter.getF32Type()), + {llvm::minimum(min_max_values[0], min_max_values[2]), + llvm::maximum(min_max_values[1], min_max_values[3])}); + mlir::ElementsAttr axis_stats; + mlir::IntegerAttr axis; + for (auto& stats_op : stats_ops) { + rewriter.setInsertionPointAfter(stats_op); + rewriter.replaceOpWithNewOp( + stats_op, stats_op.getArg(), layer_stats, axis_stats, axis); + } + return success(); + } +}; + +template +class ConvertOpStatsToQDQs : public OpRewritePattern { + public: + explicit ConvertOpStatsToQDQs(MLIRContext* context, + const quant::QuantizationSpecs& quant_specs, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + quant_specs_(quant_specs) {} + + protected: + quant::QuantizationSpecs quant_specs_; + + LogicalResult processInputs( + SourceOp op, const operator_property::OpVariant& op_variant, + const operator_property::OperatorProperty& op_property, + PatternRewriter& rewriter) const { + for (auto& enumerated_inputs : op_property.inputs) { + int index = enumerated_inputs.first; + auto& tensor_property = enumerated_inputs.second; + + Value input = op.getOperand(index); + + if (input.getDefiningOp() == nullptr) continue; + + // TODO(b/172517537): make this work with non-PTQ case. + if (llvm::isa( + input.getDefiningOp())) { + // Tensors with derived scale are biases, and handled in propagation. + if (tensor_property.use_derived_scale) continue; + // For weights, use quantization scale inferred from the values. 
+ if (failed(processConstantOp(op, input.getDefiningOp(), index, + tensor_property, rewriter))) { + return failure(); + } + } else { + if (auto stats_op = llvm::dyn_cast( + input.getDefiningOp())) { + if (failed(replaceStatsOp(op, stats_op, index, tensor_property, + rewriter))) { + return failure(); + } + } else if (!llvm::isa(input.getDefiningOp()) && + !llvm::isa( + input.getDefiningOp())) { + // Continue if StatisticsOp is already converted to Q-DQ pair, or + // stats op is not immediately available to the input because either + // it's connected to ops with same scale requirements or it has + // fixed output range. + // TODO(b/172517537): make this work with non-PTQ case. + return failure(); + } + } + } + return success(); + } + + LogicalResult processConstantOp( + SourceOp op, Operation* const_op, int input_index, + const operator_property::TensorProperty& tensor_property, + PatternRewriter& rewriter) const { + // Non-float tensors are neither weights nor require quantization. + auto type = mlir::dyn_cast(const_op->getResult(0).getType()); + if (!type || !mlir::isa(type.getElementType())) return success(); + + DenseFPElementsAttr attr; + if (!matchPattern(const_op->getResult(0), m_Constant(&attr))) { + const_op->emitError("Not a constant op."); + return failure(); + } + + UniformQuantizedType quant_type = nullptr; + // When the number of bits is 10 (instead of 16), quantize the tensor to + // [-512, 512], instead of [-32767, 32767]. + // For now this behavior is specific for SVDF, where 6 bits are reserved for + // the reduce operation after element-wise multiplication between state and + // time weights. + if (tensor_property.number_of_bits == 10) { + SmallVector mins(1, std::numeric_limits::max()); + SmallVector maxs(1, std::numeric_limits::min()); + // Computes the effective min/max values of the attribute values. + quant::ExtractMinMaxFromAttr(attr, /*dim_size=*/1, /*slice_size=*/1, + /*symmetric=*/true, mins, maxs); + double scale = maxs[0] / -llvm::minIntN(tensor_property.number_of_bits); + quant_type = UniformQuantizedType::getChecked( + const_op->getLoc(), quant::QuantizationFlags::Signed, + rewriter.getIntegerType(16), attr.getType().getElementType(), scale, + /*zeroPoint=*/0, llvm::minIntN(10), -llvm::minIntN(10)); + } else { + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedTypeForWeight( + attr, /*symmetric=*/true, + /*num_bits=*/tensor_property.number_of_bits, + /*is_signed=*/true, + /*narrow_range=*/true, quant_specs_.legacy_float_scale)); + } + if (!quant_type) { + const_op->emitError("Failed to get quantized type"); + return failure(); + } + + // TODO(b/172517537): duplicate the constant when the bias is shared. + Type expressed_type = const_op->getResult(0).getType(); + Type cast_type = quant_type.castFromExpressedType(expressed_type); + rewriter.setInsertionPointAfter(const_op); + auto q = rewriter.create(const_op->getLoc(), cast_type, + const_op->getResult(0)); + auto dq = rewriter.create(const_op->getLoc(), expressed_type, q); + op.setOperand(input_index, dq.getResult()); + return success(); + } + + LogicalResult replaceStatsOp( + SourceOp op, quantfork::StatisticsOp stats_op, int input_index, + const operator_property::TensorProperty& tensor_property, + PatternRewriter& rewriter) const { + if (tensor_property.state_tensor && !stats_op.getResult().hasOneUse()) { + // TODO(b/172517537): check if other tensors should go through this + // check too. 
+ op.emitError() << "Input tensor [" << input_index + << "] is a state tensor, but has more than one use."; + return failure(); + } + auto stats = mlir::dyn_cast(stats_op.getLayerStats()); + if (!stats || stats.getNumElements() != 2) { + stats_op.emitError("Stats should have 2 values."); + return failure(); + } + quant::QuantizedType quant_type; + double min = FloatAttr::getValueAsDouble(stats.getValues()[0]); + double max = FloatAttr::getValueAsDouble(stats.getValues()[1]); + // Make sure the range includes zero. + min = std::min(min, 0.0); + max = std::max(max, 0.0); + Type expressed = getElementTypeOrSelf(stats_op.getType()); + + if (tensor_property.extend_to_power_of_two) { + if (tensor_property.number_of_bits != 16) { + op.emitError( + "extended power of 2 scale is only supported for 16-bit" + " quantization."); + return failure(); + } + + double bound = PowerOfTwoBound(std::max(std::abs(min), std::abs(max))); + // Set flags to 1 for signed type. + quant_type = UniformQuantizedType::getChecked( + op.getLoc(), quant::QuantizationFlags::Signed, + rewriter.getIntegerType(tensor_property.number_of_bits), expressed, + /*scale=*/bound / -llvm::minIntN(tensor_property.number_of_bits), + /*zeroPoint=*/0, llvm::minIntN(tensor_property.number_of_bits), + llvm::maxIntN(tensor_property.number_of_bits)); + } else { + // int16 uses range [-32767, 32767] + if (tensor_property.number_of_bits == 16) { + max = std::max(std::abs(min), std::abs(max)); + min = -max; + quant_type = quantfork::fakeQuantAttrsToType( + op.getLoc(), tensor_property.number_of_bits, min, max, + /*narrowRange=*/true, expressed, + /*isSigned=*/true); + } else { + quant_type = quantfork::fakeQuantAttrsToType( + op.getLoc(), tensor_property.number_of_bits, min, max, + /*narrowRange=*/false, expressed, + /*isSigned=*/true); + } + if (quant_specs_.legacy_float_scale) { + quant_type = quant::DownCastScale(quant_type, min, max, op.getLoc()); + } + } + rewriter.setInsertionPointAfter(stats_op); + Type result_type = quant_type.castFromExpressedType(stats_op.getType()); + auto q = + rewriter.create(stats_op.getLoc(), result_type, stats_op.getArg()); + rewriter.replaceOpWithNewOp(stats_op, stats_op.getType(), q); + return success(); + } +}; + +// Quantize LSTM according to its quantization recipe. +template +class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { + public: + ConvertLstmStatsToQDQs(MLIRContext* context, + const quant::QuantizationSpecs& quant_specs) + : ConvertOpStatsToQDQs(context, quant_specs), + activation_number_of_bits_(quant_specs.GetQuantizationTypeWidth()) {} + LogicalResult matchAndRewrite(SourceOp op, + PatternRewriter& rewriter) const override { + operator_property::OpVariant lstm_variant; + operator_property::OperatorProperty lstm_property; + if (failed(GetLstmProperty(op, &lstm_variant, &lstm_property, + activation_number_of_bits_))) { + return failure(); + } + + if (failed(processIntermediates(op, lstm_variant, lstm_property)) || + failed(ConvertOpStatsToQDQs::processInputs( + op, lstm_variant, lstm_property, rewriter))) { + return failure(); + } + + return success(); + } + + private: + LogicalResult processIntermediates( + SourceOp op, const operator_property::OpVariant& lstm_variant, + const operator_property::OperatorProperty& lstm_property) const { + for (auto& enumerated_intermediates : lstm_property.intermediates) { + int index = enumerated_intermediates.first; + auto& tensor_property = enumerated_intermediates.second; + // intermediate tensors 0, 1, 2, 3 are only used with layer normalization. 
+ if (!lstm_variant.use_layer_norm && index != 4) { + continue; + } + + TypeAttr attr = + op->template getAttrOfType(intermediate_attributes[index]); + auto quant_type = GetIntermediateElementType(op, index); + if (!quant_type) { + // intermediate tensor 4 is optional, unless the LSTM uses projection. + if (index == 4 && !lstm_variant.use_projection) { + return success(); + } + op.emitError() << intermediate_attributes[index] + << " is not quantized."; + return failure(); + } + auto calibrated_type = + mlir::dyn_cast(quant_type); + if (!calibrated_type) { + int num_storage_bits = quant_type.getStorageTypeIntegralWidth(); + if (tensor_property.number_of_bits != num_storage_bits) { + op.emitError() << intermediate_attributes[index] + << " is expected to be quantized with " + << tensor_property.number_of_bits << " bits, but got " + << num_storage_bits << " bits instead."; + return failure(); + } + continue; // skip if it is already quantized. + } + quant::UniformQuantizedType qtype; + if (tensor_property.number_of_bits == 8) { + qtype = quantfork::fakeQuantAttrsToType( + op.getLoc(), tensor_property.number_of_bits, + calibrated_type.getMin(), calibrated_type.getMax(), + /*narrowRange=*/false, calibrated_type.getExpressedType(), + /*isSigned=*/this->quant_specs_.IsSignedInferenceType()); + if (this->quant_specs_.legacy_float_scale) { + qtype = mlir::cast( + quant::DownCastScale(qtype, calibrated_type.getMin(), + calibrated_type.getMax(), op.getLoc())); + } + } else if (tensor_property.number_of_bits == 16) { + double max = std::max(std::abs(calibrated_type.getMin()), + std::abs(calibrated_type.getMax())); + qtype = quantfork::fakeQuantAttrsToType( + op.getLoc(), tensor_property.number_of_bits, -max, max, + /*narrowRange=*/true, calibrated_type.getExpressedType(), + /*isSigned=*/true); + } else { + op.emitError() << "Unsupported quantization bits: " + << tensor_property.number_of_bits; + return failure(); + } + op->setAttr(intermediate_attributes[index], + TypeAttr::get(qtype.castFromExpressedType( + qtype.castToExpressedType(attr.getValue())))); + } + return success(); + } + + int activation_number_of_bits_; +}; + +// Returns a function that returns the quantized type of a bias input. +// The scale of bias is a multiplication of given scale and scales from the +// quantization type of other operands. +inline quant::AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( + double scale) { + return [=](const std::vector& quant_params, + const int adjusted_quant_dim, + const bool legacy_float_scale) -> quant::QuantParams { + if (auto qtype = mlir::dyn_cast_or_null( + quant::GetUniformQuantizedTypeForBias( + quant_params, legacy_float_scale, adjusted_quant_dim))) { + return quant::UniformQuantizedType::get( + qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), + qtype.getScale() * scale, qtype.getZeroPoint(), + qtype.getStorageTypeMin(), qtype.getStorageTypeMax()); + } + return {}; + }; +} + +// Returns quantization spec for LSTMs based on their operator properties. 
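To make the bias-scale rule above concrete, the snippet below walks through the arithmetic with made-up numbers. It only mirrors the math; `GetUniformQuantizedTypeForBiasWithScale` simply rescales whatever `quant::GetUniformQuantizedTypeForBias` derives from the other operands.

// Illustrative arithmetic only; all values below are hypothetical.
inline double ExampleBiasScaleSketch() {
  const double activation_scale = 0.05;  // scale of a quantized input operand
  const double weight_scale = 0.002;     // scale of a quantized weight operand
  // quant::GetUniformQuantizedTypeForBias conventionally derives the bias
  // scale from the product of the operand scales:
  const double base_bias_scale = activation_scale * weight_scale;  // 1.0e-4
  // The callback returned by GetUniformQuantizedTypeForBiasWithScale(extra)
  // multiplies that derived scale by the extra factor, e.g. an intermediate
  // tensor's power-of-two scale:
  const double extra = 1.0 / 32768.0;
  return base_bias_scale * extra;  // roughly 3.05e-9
}

`GetLstmOpQuantSpec`, declared next, uses exactly this mechanism: it multiplies the scales of the intermediate tensors named in `derived_scale` with the listed factors and installs the result as the bias quantization callback.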
+template +std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { + operator_property::OpVariant lstm_variant; + operator_property::OperatorProperty lstm_property; + if (failed(GetLstmProperty(op, &lstm_variant, &lstm_property))) { + return nullptr; + } + + auto spec = std::make_unique(); + + for (const auto& enumerated_inputs : lstm_property.inputs) { + int index = enumerated_inputs.first; + auto& tensor_property = enumerated_inputs.second; + if (tensor_property.use_derived_scale) { + double scale = 1.0; + for (int tensor_index : + tensor_property.derived_scale.intermediate_tensors) { + auto quant_type = GetIntermediateElementType(op, tensor_index); + if (!quant_type || + !mlir::isa(quant_type)) { + op->emitError() << "While processing derived scale, intermediate " + << intermediate_attributes[tensor_index] + << " is not quantized."; + return nullptr; + } + scale *= + mlir::dyn_cast(quant_type).getScale(); + } + for (float factor : tensor_property.derived_scale.factors) { + scale *= factor; + } + spec->biases_params.emplace( + index, + std::make_pair(tensor_property.derived_scale.input_tensors, + GetUniformQuantizedTypeForBiasWithScale(scale))); + } + } + return spec; +} + +class ConvertSvdfStatsToQDQs : public ConvertOpStatsToQDQs { + public: + explicit ConvertSvdfStatsToQDQs( + MLIRContext* context, const quant::QuantizationSpecs& quant_specs_param) + : ConvertOpStatsToQDQs(context, quant_specs_param) {} + LogicalResult matchAndRewrite(TFL::SVDFOp op, + PatternRewriter& rewriter) const override { + operator_property::OpVariant op_variant; + op_variant.op_code = tflite::BuiltinOperator_SVDF; + auto op_property = operator_property::GetOperatorProperty(op_variant); + return ConvertOpStatsToQDQs::processInputs( + op, op_variant, op_property, rewriter); + } +}; + +class PropagateTransposedPerAxisQuantDim + : public OpRewritePattern { + public: + explicit PropagateTransposedPerAxisQuantDim(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TFL::TransposeOp transpose_op, + PatternRewriter& rewriter) const override { + // Check if the quantization is per-axis + auto dq_op = dyn_cast_or_null( + transpose_op.getOperand(0).getDefiningOp()); + if (!dq_op) return failure(); + auto q_op = dyn_cast_or_null( + dq_op.getOperand().getDefiningOp()); + if (!q_op) return failure(); + auto qtype = + mlir::cast(dq_op.getArg().getType()).getElementType(); + auto aqtype = dyn_cast_or_null(qtype); + if (!aqtype) return failure(); + + // Return if the result of TransposeOp is already quantized + if (!transpose_op.getResult().hasOneUse()) return failure(); + auto next_op = *transpose_op.getResult().getUsers().begin(); + if (dyn_cast_or_null(next_op)) return failure(); + + auto input_type = mlir::cast(transpose_op.getInput().getType()); + auto perm_type = mlir::cast(transpose_op.getPerm().getType()); + if (input_type.hasStaticShape() && perm_type.hasStaticShape()) { + if (perm_type.getNumElements() != input_type.getRank()) { + return transpose_op.emitOpError( + "perm tensor elements size is not equal to input tensor rank"); + } + } + + // Get permutation axes of the TransposeOp + DenseIntElementsAttr perm; + if (!matchPattern(transpose_op.getPerm(), m_Constant(&perm))) { + return failure(); + } + + SmallVector axes; + for (const auto& axis_int : perm.getValues()) { + int64_t axis = axis_int.getSExtValue(); + if (axis < 0) { + axis += input_type.getRank(); + } + if (axis < 0 || (input_type.hasRank() && axis >= input_type.getRank())) { + return transpose_op.emitOpError("perm must be in 
[-rank, rank)"); + } + if (std::count(axes.begin(), axes.end(), axis) > 0) { + return transpose_op.emitOpError("perm cannot have duplicated axis"); + } + axes.push_back(axis); + } + + // Find what the quantized dimension has been transposed to + int new_out_quant_dim = -1; + for (int i = 0; i < axes.size(); ++i) { + if (axes[i] == aqtype.getQuantizedDimension()) { + new_out_quant_dim = i; + break; + } + } + if (new_out_quant_dim == -1) { + return transpose_op.emitOpError( + "new quantization dimension not found in perm"); + } + + // Insert a QDQ pair with the new quantized dimension after TransposeOp + auto new_qtype = quant::CreateI8F32UniformQuantizedPerAxisType( + transpose_op.getLoc(), *rewriter.getContext(), aqtype.getScales(), + aqtype.getZeroPoints(), new_out_quant_dim, /*narrow_range=*/true); + auto new_tensor_type = RankedTensorType::getChecked( + transpose_op.getLoc(), transpose_op.getType().getShape(), new_qtype); + rewriter.setInsertionPointAfter(transpose_op); + auto new_q_op = rewriter.create( + transpose_op.getLoc(), new_tensor_type, q_op.getArg()); + auto new_dq_op = rewriter.create( + new_q_op.getLoc(), transpose_op.getResult().getType(), + new_q_op.getResult()); + transpose_op.getResult().replaceAllUsesWith(new_dq_op.getResult()); + new_q_op.setOperand(transpose_op.getResult()); + + return success(); + } +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PREPARE_QUANTIZE_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.h new file mode 100644 index 00000000..41114864 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.h @@ -0,0 +1,65 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PUSH_TRANSPOSE_THROUGH_EWISE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PUSH_TRANSPOSE_THROUGH_EWISE_PASS_H_ + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" + +namespace mlir { +namespace TFL { + +class PushTransposeThroughEwisePass + : public Pass { + public: + PushTransposeThroughEwisePass() = default; + PushTransposeThroughEwisePass(const PushTransposeThroughEwisePass &other) {} + + void runOnOperation() final; + + /// Returns the command-line argument attached to this pass. 
+ static llvm::StringRef GetArgument() { + return "tfl-push-transpose-through-ewise"; + } + + static llvm::StringRef GetDescription() { + return "Push transpose ops through element-wise ops."; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "PushTransposeThroughEwisePass"; } + + /// Return the dialect that must be loaded in the context before this pass. + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + /// Explicitly declare the TypeID for this class. We declare an explicit + /// private instantiation because Pass classes should only be visible by the + /// current library. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PushTransposeThroughEwisePass) +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_PUSH_TRANSPOSE_THROUGH_EWISE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/analyze_variables_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/analyze_variables_pass.h new file mode 100644 index 00000000..8d5914d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/analyze_variables_pass.h @@ -0,0 +1,55 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_ANALYZE_VARIABLES_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_ANALYZE_VARIABLES_PASS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" + +namespace mlir { +namespace TFL { + +// Pass which analyzes the variables in the graph and add an attribute whether +// variables should be legalized to TFLite native ones. +// This pass needs to run post TF->TFL legalization and before variable +// legalization. 
+ +class AnalyzeVariablesPass : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AnalyzeVariablesPass) + + AnalyzeVariablesPass() = default; + AnalyzeVariablesPass(const AnalyzeVariablesPass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "AnalyzeVariablesPass"; } + static llvm::StringRef GetArgument() { return "tfl-analyze-variables-pass"; } + static llvm::StringRef GetDescription() { + return "Pass to analyze variables in the graph"; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_ANALYZE_VARIABLES_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.h new file mode 100644 index 00000000..8eb9f728 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.h @@ -0,0 +1,53 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_LEGALIZE_TENSORLIST_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_LEGALIZE_TENSORLIST_PASS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" + +namespace mlir { +namespace TFL { + +// Pass to Legalize TensorFlow tensorlist ops to TensorFlow Lite custom. 
+ +class LegalizeTensorListPass : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LegalizeTensorListPass) + + LegalizeTensorListPass() = default; + LegalizeTensorListPass(const LegalizeTensorListPass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "LegalizeTensorListPass"; } + static llvm::StringRef GetArgument() { return "tfl-legalize-tensorlist"; } + static llvm::StringRef GetDescription() { + return "Pass to Legalize TensorFlow tensorlist ops to TensorFlow Lite " + "custom."; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_LEGALIZE_TENSORLIST_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.h new file mode 100644 index 00000000..6c114ced --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.h @@ -0,0 +1,66 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_WHILE_LOOP_OUTLINE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_WHILE_LOOP_OUTLINE_PASS_H_ + +#include + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" + +namespace mlir { +namespace TFL { + +// Pass to hoist while op regions into functions. +// This pass outlines the cond/body region of the TFL WhileOp into functions and +// replaces the regions with calls to these outlined functions. +class WhileOutlinePass : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WhileOutlinePass) + + WhileOutlinePass() = default; + WhileOutlinePass(const WhileOutlinePass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "WhileOutlinePass"; } + static llvm::StringRef GetArgument() { return "tfl-while-loop-outline"; } + static llvm::StringRef GetDescription() { + return "Pass to hoist while op regions into functions"; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + // Outlines the regions of the WhileOp's cond and body and insert function + // calls instead, + void OutlineWhile(WhileOp while_op); + + // Get unique name by using the loc to name mapping. 
+ std::string GetName(Operation* op, StringRef suffix); + + tensorflow::OpOrArgLocNameMapper mapper_; +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TF_LEGALIZATIONS_WHILE_LOOP_OUTLINE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/split_merged_operands_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/split_merged_operands_pass.h new file mode 100644 index 00000000..54be99ff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/split_merged_operands_pass.h @@ -0,0 +1,89 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_SPLIT_MERGED_OPERANDS_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_SPLIT_MERGED_OPERANDS_PASS_H_ + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" + +namespace mlir { +namespace TFL { + +// Background info: +// Currently the model taken to MLIRConverter is frozen (all the variables have +// been converted to constants, all the assign ops are gone, etc.). However, +// TFLite has these variable tensors semantics. So the variable mapping from TF +// to TFLite is actually broken here, we sort of hard-code the variable tensors +// based on the actual ops using them, such as unidirectional_sequence_lstm. +// +// MLIRConverter also benefits from lots of typical compiler optimization like +// merging same input values if they're identical. These optimizations are +// desirable but not for those TFLite ops which have variable tensors as inputs. +// Yes, they have identical input values, but those identical values are +// "stateful", their values can change during invocations. +// +// A typical example is unidirectional_sequence_lstm have two variable tensor +// inputs: activation state & cell state. They may have same initial values +// (typical zero-initialized), but their values will be changed. So we cannot +// just merge those values. +// +// This pass is more like short-term workaround since we don't have a good +// variable representation right now. +// +// This pass will duplicate input values for those variable tensor inputs. 
+ +class SplitMergedOperandsPass + : public TFL::Pass { + public: + SplitMergedOperandsPass() = default; + SplitMergedOperandsPass(const SplitMergedOperandsPass &other) {} + + void runOnOperation() final; + + /// Returns the command-line argument attached to this pass. + static llvm::StringRef GetArgument() { return "tfl-split-merged-operands"; } + + static llvm::StringRef GetDescription() { + return "Split merged stateful operands for tfl operations."; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "SplitMergedOperandsPass"; } + + /// Return the dialect that must be loaded in the context before this pass. + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + /// Explicitly declare the TypeID for this class. We declare an explicit + /// private instantiation because Pass classes should only be visible by the + /// current library. + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SplitMergedOperandsPass) +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_SPLIT_MERGED_OPERANDS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h new file mode 100644 index 00000000..18ee20ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h @@ -0,0 +1,54 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_UNFOLD_LARGE_SPLAT_CONSTANTS_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_UNFOLD_LARGE_SPLAT_CONSTANTS_PASS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" + +namespace mlir { +namespace TFL { + +// Pass to unfold large splat constant tensors. +// This Pass will replace large splat constant tensors to `tfl.Fill` op to +// reduce the size of the generated flatbuffer model size. 
+class UnfoldLargeSplatConstantPass + : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnfoldLargeSplatConstantPass) + + UnfoldLargeSplatConstantPass() = default; + UnfoldLargeSplatConstantPass(const UnfoldLargeSplatConstantPass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "UnfoldLargeSplatConstantPass"; } + static llvm::StringRef GetArgument() { return "unfold-large-splat-constant"; } + static llvm::StringRef GetDescription() { + return "Pass to unfold large splat constant tensors."; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_UNFOLD_LARGE_SPLAT_CONSTANTS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.h new file mode 100644 index 00000000..f79c9ccf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.h @@ -0,0 +1,62 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNFREEZE_GLOBAL_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNFREEZE_GLOBAL_CONSTANTS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +namespace mlir { +namespace TFL { + +// This pass "unfreezes" the use of global constant tensor ops found in the +// module and converts them to `tf.VarHandleOp`s. Also, an initialization +// pattern `tf.AssignVariableOp(tf.VarHandleOp, tf.ConstOp)` is inserted to the +// initializer function of type "init_op" for each of the unfrozen constants. 
+ +class UnfreezeMutableGlobalTensorsPass + : public Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnfreezeMutableGlobalTensorsPass) + + UnfreezeMutableGlobalTensorsPass() = default; + UnfreezeMutableGlobalTensorsPass(const UnfreezeMutableGlobalTensorsPass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { + return "UnfreezeMutableGlobalTensorsPass"; + } + static llvm::StringRef GetArgument() { + return "unfreeze-mutable-global-tensors"; + } + static llvm::StringRef GetDescription() { + return "Pass to unfreeze mutable global tensor ops"; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNFREEZE_GLOBAL_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline.h new file mode 100644 index 00000000..cfd5a1c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline.h @@ -0,0 +1,48 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_H_ + +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/pipeline.h" +#include "tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline_options.h" + +namespace mlir { +namespace TFL { + +class VariableFreezingPipeline + : public Pipeline { + public: + void AddPasses() override; + + /// Returns the command-line argument attached to this pass. + static llvm::StringRef GetArgument() { + return "tfl-variable-freezing-pipeline"; + } + + static llvm::StringRef GetDescription() { + return "Variable Freezing Pipeline"; + } + + /// Returns the derived pass name. + static llvm::StringRef GetName() { return "VariableFreezingPipeline"; } +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline_options.h new file mode 100644 index 00000000..d7e9ed8d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline_options.h @@ -0,0 +1,38 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_OPTIONS_H_ + +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +//////////////////////////////////////////////////////////////////////////////// +// Pass Options +//////////////////////////////////////////////////////////////////////////////// + +struct VariableFreezingPipelineOptions : public mlir::detail::PassOptions { + mlir::detail::PassOptions::Option enable_tflite_variables{ + *this, "enable_tflite_variables", + llvm::cl::desc("Enable Mutable Variables in TFLite")}; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_VARIABLE_FREEZING_PIPELINE_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h new file mode 100644 index 00000000..c851d73b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h @@ -0,0 +1,90 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ARITHMETIC_COUNT_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ARITHMETIC_COUNT_UTIL_H_ + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// For add/mul/div/sub and other broadcastable ops. 
+class ArithmeticCountUtilHelper { + public: + static bool GetFirstOutputCount(mlir::Operation* op, int64_t* count) { + auto output = op->getResult(0); + auto output_type = + mlir::dyn_cast_or_null(output.getType()); + if (!output_type || !output_type.hasStaticShape()) return false; + + *count = output_type.getNumElements(); + return true; + } + + static bool GetInputTensorTotalSize(mlir::Operation* op, int64_t* count) { + int64_t total_count = 0; + for (auto input : op->getOperands()) { + auto input_type = + mlir::dyn_cast_or_null(input.getType()); + if (!input_type || !input_type.hasStaticShape()) { + return false; + } + total_count += input_type.getNumElements(); + } + *count = total_count; + return true; + } + + // For conv2d/depthwise_conv/fully_connected ops. + // This algorithm actually comes from TOCO tooling_util.cc + static bool GetArithmeticCountForConvAndFullyconnectedOp(mlir::Operation* op, + int64_t* count) { + auto weight = op->getOperand(1); + auto weight_type = + mlir::dyn_cast_or_null(weight.getType()); + if (weight_type == nullptr || !weight_type.hasStaticShape()) return false; + + auto output = op->getResult(0); + auto output_type = + mlir::dyn_cast_or_null(output.getType()); + if (output_type == nullptr || !output_type.hasStaticShape()) return false; + + int64_t cols = 1; + for (int i = 0; i < output_type.getRank() - 1; ++i) { + cols *= output_type.getDimSize(i); + } + const int64_t cost_per_col = 2 * weight_type.getNumElements(); + + *count = cost_per_col * cols; + + auto bias = op->getOperand(2); + if (bias) { + auto bias_type = + mlir::dyn_cast_or_null(bias.getType()); + if (bias_type && bias_type.hasStaticShape()) { + *count += output_type.getNumElements(); + } + } + + return true; + } +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ARITHMETIC_COUNT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/attribute_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/attribute_utils.h new file mode 100644 index 00000000..565b71c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/attribute_utils.h @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// Returns true if none of the three attributes are empty. +inline bool HasAll3Attrs(Attribute a, Attribute b, Attribute c) { + return a != Attribute() && b != Attribute() && c != Attribute(); +} + +// Returns the single float element from an ElementsAttr. 
Returns empty +// attribute if the number of elements in the attribute is not 1 or the +// element isn't a float attribute. +FloatAttr ExtractSingleElementAsFloat(ElementsAttr attr); + +// Returns the single float element if the input is an ElementsAttr, or return +// itself as a float element. Returns empty attribute if the number of elements +// in the attribute is not 1, the element or itself isn't a float attribute. +FloatAttr GetSingleElementAsFloatOrSelf(Attribute attr); + +// Returns the single integer element from an ElementsAttr. Returns empty +// attribute if the number of elements in the attribute is not 1 or the +// element isn't a integer attribute. +IntegerAttr ExtractSingleElementAsInteger(ElementsAttr attr); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h new file mode 100644 index 00000000..477c5c67 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h @@ -0,0 +1,111 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONST_TENSOR_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONST_TENSOR_UTILS_H_ + +#include + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/meta/type_traits.h" +#include "absl/status/statusor.h" +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" + +namespace mlir { +namespace TFL { + +bool IsQuantized(const tflite::TensorT& tensor); + +absl::StatusOr GetQuantizedType( + const tflite::TensorT& tensor, mlir::Builder builder, + bool is_constant = false, mlir::Type storage_type = {}); + +// Imports float tensor with calibration value into calibrated quantized type. +absl::StatusOr GetCalibratedQuantizedType( + const tflite::TensorT& tensor, mlir::Builder builder); + +absl::StatusOr GetTensorType(const tflite::TensorT& tensor, + mlir::Builder builder, + bool is_constant = false, + bool is_intermediate = false, + bool get_storage = false); + +// Gets a constant splat for the given value of type. Requires value to be of +// type static shaped RankedTensorType. `unique_index` is used to get the unique +// value for the attribute. 
+mlir::ElementsAttr GetSplat(mlir::RankedTensorType type, int unique_index, + mlir::Builder builder); + +absl::StatusOr ConvertIntBuffer( + mlir::RankedTensorType shaped_type, const std::vector& buffer, + bool truncate = false); + +absl::StatusOr ConvertFloatBuffer( + mlir::RankedTensorType shaped_type, const std::vector& buffer); + +tensorflow::TensorProto ConvertTfliteConstTensor( + const tflite::TensorT& tensor, const std::vector& buffer); + +// Get the size of the type in bits. The type can be ComplexType, FloatType, +// IntegerType, QuantizedType, or ShapeType of other supported types. +// +// Sub-byte types, e.g. qu4 and i2, are treated as a full i8. +int64_t GetSizeInBits(mlir::ShapedType shaped_type); +int64_t GetSizeInBits(mlir::Type type); +int64_t GetSizeInBits(mlir::quant::QuantizedType quant_type); + +// Get the size of the type in bytes. +// +// Sub-byte element types, e.g. qu4 and i2, are treated as a full i8. +// e.g. GetSizeInBytes(tensor<4xi2>) == 4, instead of 1. +int64_t GetSizeInBytes(mlir::Type type); + +// Performs an integer divide and checks that the remainder is zero. +// It supports int64 version as well. +template ::value || + std::is_same::value || + std::is_same::value || + std::is_same::value>> +ABSL_ATTRIBUTE_ALWAYS_INLINE Integer ExactIntegerDivide(Integer numerator, + int64_t denominator) { + const Integer ratio = numerator / denominator; + assert((numerator % denominator) == 0); + return ratio; +} + +template ::value, int> = 0> +ABSL_ATTRIBUTE_ALWAYS_INLINE bool IsPowerOfTwo(IntType n) { + static_assert(std::is_integral::value, ""); + return n > 0 && (n & (n - 1)) == 0; +} + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONST_TENSOR_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/constant_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/constant_utils.h new file mode 100644 index 00000000..1340aa0f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/constant_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ + +#include "absl/status/statusor.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "tsl/platform/statusor.h" + +namespace mlir { +namespace TFL { + +// Returns a Constant op with a single value. 
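The size helpers above round sub-byte element types up to a full byte. A minimal standalone sketch of that rule (the names and signatures below are illustrative, not the declarations in this header):

#include <cassert>
#include <cstdint>

// Sub-byte element types (e.g. 4-bit or 2-bit) are charged a full byte per
// element, so a 4-element 2-bit tensor is reported as 4 bytes rather than 1.
int64_t IllustrativeElementSizeInBytes(int64_t element_bit_width) {
  assert(element_bit_width > 0);
  return element_bit_width < 8 ? 1 : element_bit_width / 8;
}

int64_t IllustrativeTensorSizeInBytes(int64_t num_elements,
                                      int64_t element_bit_width) {
  return num_elements * IllustrativeElementSizeInBytes(element_bit_width);
}
// IllustrativeTensorSizeInBytes(/*num_elements=*/4, /*element_bit_width=*/2) == 4.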
+absl::StatusOr CreateConstOpWithSingleValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, int value); + +// Returns a Constant op with a splat vector value. +absl::StatusOr CreateConstOpWithVectorValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, int value); + +} // namespace TFL +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/control_edges.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/control_edges.h new file mode 100644 index 00000000..e5a16ba7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/control_edges.h @@ -0,0 +1,34 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONTROL_EDGES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONTROL_EDGES_H_ + +#include +#include +#include + +namespace tflite { + +// LINT.IfChange + +using ControlEdge = std::pair; +using ControlEdges = std::vector; + +// LINT.ThenChange(//tensorflow/lite/graph_info.h) + +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONTROL_EDGES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/convert_type.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/convert_type.h new file mode 100644 index 00000000..118f9cd4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/convert_type.h @@ -0,0 +1,52 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ + +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +class Builder; +} // namespace mlir + +namespace tflite { +// Convert the MLIR type to the corresponding TFLite tensor. 
+tflite::TensorType ConvertTypeToTensorType(mlir::Type type); + +// Convert the scalar type of a TFlite tensor to the corresponding MLIR type. +mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder); + +// Convert the scalar type of a TFLite tensor to the corresponding +// Tensorflow type +tensorflow::DataType TflTypeToTfType(tflite::TensorType type); + +// Convert the Tensorflow scalar type to the corresponding TFLite type +absl::StatusOr TfTypeToTflType(tensorflow::DataType type); + +// Returns element type from attribute Type 'type_attr'. +mlir::Type GetShapeStrippedType(mlir::TypeAttr type_attr); + +// Returns true if 'val' is not from Quantize op or +// from Quantize Op with same quant type as 'qtype_attr' +bool NotFromQuantOpOrSameQuantType(mlir::Value val, mlir::TypeAttr qtype_attr); + +} // namespace tflite +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONVERT_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h new file mode 100644 index 00000000..146cae1f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h @@ -0,0 +1,177 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with tf.FakeQuant* ops. +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_FAKE_QUANT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_FAKE_QUANT_UTILS_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" + +namespace mlir { +namespace TFL { + +template +struct FetchMinMaxAttrs { + using AttrType = FloatAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + min_value = tf_op.getMinAttr(); + max_value = tf_op.getMaxAttr(); + return true; // Successfully matched and fetched. 
+ } +}; + +template +struct FetchConstantMinMaxInputs { + using AttrType = DenseFPElementsAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + Value min = tf_op.getMin(), max = tf_op.getMax(); + if (!matchPattern(min, m_Constant(&min_value))) { + return false; + } + if (!matchPattern(max, m_Constant(&max_value))) { + return false; + } + return true; // Successfully matched and fetched. + } +}; + +// Inserts a "tfl.quantize" and "tfl.dequantize" op pair (QDQs) after the +// tf.FakeQyantWithMinMax{Vars|VarsPerChannel|Args}Op +// before the op being constant folded. Since the constant +// folding logic will use a "arith.constant" op to replace the +// "tf.FakeQuantWithMinMaxVarsOp", the "tfl.quantize" op is used to preserve +// the quantization parameters as a TypeAttr and "tfl.dequantize" op used to +// convert the output type to the next op. Here are the transformations: +// +// input min cst max cst input min cst max cst +// \ | | \ | | +// \ (tf.Identity) (tf.Identity) => \ (tf.Identity) (tf.Identity) +// \ | | \ | | +// tf.FakeQuantWithMinMaxVars tf.FakeQuantWithMinMaxVars +// | | +// tfl.quantize +// | +// tfl.dequantize +// | +// If the input is a constant, the result pattern will eventually converted to +// +// quant-emulated input +// | +// tfl.quantize +// | +// tfl.dequantize +// | +// +// +// Warns if the (most likely unwanted, currently not quite correctly handled) +// case of back-to-back tf.FakeQuant occurs +// +// tf.FakeQuant* +// | +// tf.FakeQuant* +// +template +class InsertTFLQuantOpsAfterTFFakeQuantOp { + public: + explicit InsertTFLQuantOpsAfterTFFakeQuantOp(bool use_fake_quant_num_bits) + : use_fake_quant_num_bits_(use_fake_quant_num_bits) {} + + FetchMinMax fetch_min_max_; + + using FetchAttrType = typename FetchMinMax::AttrType; + LogicalResult matchAndRewrite(TFFakeQuantOp tf_op, + OpBuilder &rewriter) const { + // We don't want to insert quantize/dequantize if the quantize op exists. + auto res = tf_op.getOutputs(); + if (!res.hasOneUse() || isa(*res.user_begin())) { + return failure(); + } + + // Extract the min/max constant values from the operands. We also consider + // a special case that there are tf.Identity ops between the min/max + // constants and the tf.FakeQuantWithMinMaxVarsOp. + + FetchAttrType min_value, max_value; + if (!fetch_min_max_(tf_op, min_value, max_value)) { + return failure(); + } + + int quant_dim = -1; + if (PerAxis) { + // This is a special case that the quant_dim is the last dimensions. + quant_dim = mlir::cast(res.getType()).getRank() - 1; + } + // Use the min/max from the operands and the num_bits and narrow_range + // attribute to create the quantization parameter for the new quantize op. + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.getNumBits()); + BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.getNarrowRange()); + Type res_type = tf_op.getType(); + TypeAttr qtype = quant::GetQuantizedTypeAttr( + rewriter, res_type, min_value, max_value, quant_dim, num_bits, + narrow_range, /*is_signed=*/false, /*legacy_float_scale=*/false, + use_fake_quant_num_bits_); + if (!qtype) { + return failure(); + } + + // Finally, use the quantization parameter to create the quantize and + // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp + // and its users. 
+ Value value = tf_op.getOutputs(); + auto quantize = rewriter.create( + tf_op.getLoc(), qtype.getValue(), value, qtype); + auto dequantize = rewriter.create( + tf_op.getLoc(), res_type, quantize.getOutput()); + value.replaceAllUsesWith(dequantize); + quantize.getOperation()->replaceUsesOfWith(dequantize, value); + + return success(); + } + + bool use_fake_quant_num_bits_; +}; + +// Removes the wrapper of the tf.FakeQuant* ops and creates the tfl.quantize +// and tfl.dequantize pairs before tf.FakeQuant* being foled. +LogicalResult ConvertFakeQuantOps(func::FuncOp func, MLIRContext *ctx, + bool use_fake_quant_num_bits = false); + +// Returns the names of all the considered tf.FakeQuant* ops. +std::vector AllTfFakeQuantOps(); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_FAKE_QUANT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h new file mode 100644 index 00000000..fa9bd851 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LOW_BIT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LOW_BIT_UTILS_H_ + +#include +#include + +namespace tflite { +// Assumes that `src_tensor` is a buffer where each element is a 4-bit value +// stored in 8-bit. +// Returns a new buffer that is packed densely with 2 4-bit values in a byte. +// The packing format is low-bits-first, i.e. the lower nibble of a byte is +// filled first, followed by the upper nibble. +std::vector PackInt4ValuesDensely(std::vector src_buffer); + +// Assumes `src_buffer` contains 2 4-bit elements packed in 8-bit. +// Returns a vector where each int8 element contains a int4 sign-extended value. +std::vector UnpackDenseInt4IntoInt8( + const std::vector& src_buffer, int64_t num_elements); +} // namespace tflite + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LOW_BIT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/lstm_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/lstm_utils.h new file mode 100644 index 00000000..8d9a5ab1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/lstm_utils.h @@ -0,0 +1,224 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
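As a standalone illustration of the low-bits-first packing documented in low_bit_utils.h above, the sketch below packs and unpacks 4-bit values using plain byte vectors; the function names are illustrative and are not the declarations in that header.

#include <cstdint>
#include <vector>

// Packs one 4-bit value per nibble, low nibble first: element 2*i lands in the
// low nibble of byte i, element 2*i+1 in the high nibble.
std::vector<uint8_t> PackInt4LowBitsFirst(const std::vector<int8_t>& values) {
  std::vector<uint8_t> packed((values.size() + 1) / 2, 0);
  for (size_t i = 0; i < values.size(); ++i) {
    const uint8_t nibble = static_cast<uint8_t>(values[i]) & 0x0F;
    packed[i / 2] |= (i % 2 == 0) ? nibble : static_cast<uint8_t>(nibble << 4);
  }
  return packed;
}

// Reverses the packing above, sign-extending each nibble back to int8.
std::vector<int8_t> UnpackInt4LowBitsFirst(const std::vector<uint8_t>& packed,
                                           int64_t num_elements) {
  std::vector<int8_t> out(num_elements);
  for (int64_t i = 0; i < num_elements; ++i) {
    const uint8_t byte = packed[i / 2];
    const uint8_t nibble = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
    // Shift the nibble into the high bits and arithmetic-shift back down to
    // sign-extend the 4-bit value.
    out[i] = static_cast<int8_t>(static_cast<int8_t>(nibble << 4) >> 4);
  }
  return out;
}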
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" + +namespace mlir { +namespace TFL { + +constexpr char kTFImplements[] = "tf._implements"; +constexpr char kLstmCellSimple[] = "LSTMCellSimple"; +constexpr char kLayerNormalizedLstmCellSimple[] = + "LayerNormalizedLstmCellSimple"; +constexpr char kCoupleInputForgetGates[] = "CoupleInputForgetGates"; + +// A utility class that enables the conversion of the LSTMCellSimple composite +// op into a fused TFL LSTM op. The fused op is contained within a FuncOp +// that also contains other supporting ops needed to construct the operands for +// the fused op. The caller provides the containing FuncOp as input with +// arguments specifying the input, weight, projection and bias. +// The weight, projection, bias and layer norm scale all need to be +// RankedTensorType. +// This class sets the layer norm coefficients to NoneType. +class ConvertLSTMCellSimpleToFusedLSTM { + public: + explicit ConvertLSTMCellSimpleToFusedLSTM(mlir::func::FuncOp fused_func_op) + : fused_func_op_(fused_func_op), + couple_input_forget_gates_(false), + builder_(fused_func_op.getBody()) {} + + // not copyable. + ConvertLSTMCellSimpleToFusedLSTM(const ConvertLSTMCellSimpleToFusedLSTM&) = + delete; + ConvertLSTMCellSimpleToFusedLSTM& operator=( + const ConvertLSTMCellSimpleToFusedLSTM&) = delete; + virtual ~ConvertLSTMCellSimpleToFusedLSTM() = default; + + virtual llvm::StringRef GetCompositeOpName() { return kLstmCellSimple; } + + // Rewrite the func body with constructed fused lstm. + LogicalResult RewriteFunc(); + + int GetNumInputs() { return n_input_; } + + protected: + // verify input func op arguments/attributes and initialize internal state. 
+ virtual LogicalResult InitializeFromFuncAttributes(); + virtual LogicalResult Initialize(); + + void UpdateFuncSignature(); + void GenerateFusedOpOperands(); + + void SetWeightForInputToCellGate(); + void SetWeightForInputToInputGate(); + void SetWeightForInputToForgetGate(); + void SetWeightForInputToOutputGate(); + + void SetWeightForRecurrentToCellGate(); + void SetWeightForRecurrentToInputGate(); + void SetWeightForRecurrentToForgetGate(); + void SetWeightForRecurrentToOutputGate(); + + void SetBiasToCellGate(); + void SetBiasToInputGate(); + void SetBiasToForgetGate(); + void SetBiasToOutputGate(); + + void SetProjection(); + void SetProjectionBias(); + + void SetInputActivationState(); + void SetInputCellState(); + + virtual void SetCellLayerNormCoefficients(); + virtual void SetInputLayerNormCoefficients(); + virtual void SetForgetLayerNormCoefficients(); + virtual void SetOutputLayerNormCoefficients(); + + // specified state + func::FuncOp fused_func_op_; + Value input_; + Value weight_; + Value bias_; + Value projection_; + bool couple_input_forget_gates_; + + // internal state + Value weight_transposed_; + Value projection_transposed_; + RankedTensorType weight_type_; + RankedTensorType projection_type_; + int num_gates_; + int n_cell_; + int n_output_; + int n_input_; + int num_cols_weight_transposed_; + int num_cols_projection_transposed_; + + // input -> cifg + Value input2input_; + Value input2forget_; + Value input2cell_; + Value input2output_; + + // recurrent -> cifg + Value rec2input_; + Value rec2forget_; + Value rec2cell_; + Value rec2output_; + + // bias -> cifg + Value bias2input_; + Value bias2forget_; + Value bias2cell_; + Value bias2output_; + + // projection + Value proj_weight_; + Value proj_bias_; + + // state + Value input_activation_state_; + Value input_cell_state_; + + // layer norm coefficients + Value input_layer_norm_coefficients_; + Value forget_layer_norm_coefficients_; + Value cell_layer_norm_coefficients_; + Value output_layer_norm_coefficients_; + + mlir::TFL::LSTMOp lstm_; + + Value none_; + SmallVector bias_slice_shape_; + SmallVector bias_size_values_; + SmallVector weight_slice_shape_; + SmallVector weight_slice_size_input_values_; + SmallVector weight_slice_size_recurrent_values_; + OpBuilder builder_; +}; + +// A utility class that enables the conversion of the +// LayerNormalizedLSTMCellSimple composite op into a fused TFL LSTM op. The +// fused op is contained within a FuncOp that also contains other supporting ops +// needed to construct the operands for the fused op. The caller provides the +// containing FuncOp as input with arguments specifying the input, weight, +// projection, bias and layer norm scale. The weight, projection, bias and +// layer norm scale all need to be RankedTensorType. +// This class overrides the layer norm coefficient setters from the base class. +class ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM + : public ConvertLSTMCellSimpleToFusedLSTM { + public: + explicit ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM( + mlir::func::FuncOp fused_func_op) + : ConvertLSTMCellSimpleToFusedLSTM(fused_func_op) {} + + // not copyable. 
+ ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM( + const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; + ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM& operator=( + const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; + ~ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM() override = default; + + llvm::StringRef GetCompositeOpName() override { + return kLayerNormalizedLstmCellSimple; + } + + protected: + LogicalResult Initialize() override; + + void SetCellLayerNormCoefficients() override; + void SetInputLayerNormCoefficients() override; + void SetForgetLayerNormCoefficients() override; + void SetOutputLayerNormCoefficients() override; + + private: + // specified state + Value layer_norm_scale_; + + // internal state + RankedTensorType layer_norm_scale_type_; + SmallVector layer_norm_slice_shape_; + SmallVector layer_norm_size_values_; +}; + +LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, + OpBuilder* builder); + +LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, + OpBuilder* builder, bool indy); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/nms_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/nms_utils.h new file mode 100644 index 00000000..e3487ba9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/nms_utils.h @@ -0,0 +1,84 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with NMS ops in TFLite. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ + +#include + +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" + +namespace mlir { +namespace TFL { + +// Abstracts the conversion of the padded NMS composite function. +class ConvertNMSPaddedFunc { + public: + explicit ConvertNMSPaddedFunc(func::FuncOp func) : func_(func) {} + + void RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + func::FuncOp func_; +}; + +// Abstracts the conversion of the SSD post-processing composite function to +// TFLite. 
+class ConvertSSDPostProcessFunc { + public: + explicit ConvertSSDPostProcessFunc(func::FuncOp func, mlir::TF::FuncAttr attr) + : func_(func), attr_(attr) {} + + LogicalResult RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + LogicalResult CreateNMSCustomOptions(func::FuncOp func, DictionaryAttr attrs, + std::string& custom_option_buffer); + + LogicalResult AddIntAttr(func::FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + LogicalResult AddFloatAttr(func::FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + LogicalResult HasIntAttr(func::FuncOp func, DictionaryAttr attrs, + const std::string& attribute); + + LogicalResult HasFloatAttr(func::FuncOp func, DictionaryAttr attrs, + const std::string& attribute); + + func::FuncOp func_; + mlir::TF::FuncAttr attr_; +}; + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.h new file mode 100644 index 00000000..609534f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_PERCEPTION_OPS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_PERCEPTION_OPS_UTILS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" + +namespace mlir { +namespace TFL { + +// Fuse MaxUnpooling2D ops annotated by tf.function to a TFLite custom op. +class ConvertMaxUnpoolingFunc { + public: + explicit ConvertMaxUnpoolingFunc(func::FuncOp func, mlir::TF::FuncAttr attr) + : func_(func), attr_(attr) {} + + LogicalResult RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + LogicalResult CreateCustomOptions(std::string& custom_option_buffer); + + func::FuncOp func_; + mlir::TF::FuncAttr attr_; +}; + +// Fuse DenseImageWarp ops annotated by tf.function to a TFLite custom op. 
+class ConvertDenseImageWarpFunc { + public: + explicit ConvertDenseImageWarpFunc(func::FuncOp func) : func_(func) {} + + LogicalResult RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + func::FuncOp func_; +}; + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_PERCEPTION_OPS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/region_isolation.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/region_isolation.h new file mode 100644 index 00000000..b32b2df2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/region_isolation.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_REGION_ISOLATION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_REGION_ISOLATION_H_ + +#include + +#include "llvm/ADT/SetVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// Isolates op's contained regions. Replaces all references to values defined +// above these (single block) regions with a block argument. The union of all +// values referenced this way is returned. Each region will have an identical +// signature, which is the types of the returned vector in the same order. +// NOTE: Critically, llvm::SetVector iterates deterministically in order of +// insertion. +std::optional> IsolateRegions(Operation* op_with_regions, + OpBuilder& b); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_REGION_ISOLATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/size_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/size_utils.h new file mode 100644 index 00000000..52aa50c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/size_utils.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
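The custom-option builders declared in nms_utils.h and perception_ops_utils.h above (CreateNMSCustomOptions, CreateCustomOptions, AddIntAttr, AddFloatAttr) serialize op attributes into a flexbuffer map whose raw bytes become the TFLite custom op's options buffer. A minimal sketch of that encoding, with illustrative keys and values rather than the exact attribute set those passes emit:

#include <cstdint>
#include <string>
#include <vector>

#include "flatbuffers/flexbuffers.h"  // from @flatbuffers

// Builds a flexbuffer map of scalar attributes and returns the raw bytes that
// would be stored as a custom op's options buffer.
std::string BuildExampleCustomOptions() {
  flexbuffers::Builder fbb;
  const size_t map_start = fbb.StartMap();
  fbb.Int("max_detections", 10);           // illustrative key/value
  fbb.Float("nms_score_threshold", 0.3f);  // illustrative key/value
  fbb.Float("nms_iou_threshold", 0.6f);    // illustrative key/value
  fbb.EndMap(map_start);
  fbb.Finish();
  const std::vector<uint8_t>& bytes = fbb.GetBuffer();
  return std::string(bytes.begin(), bytes.end());
}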
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_SIZE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_SIZE_UTILS_H_ + +#include + +namespace mlir { +namespace TFL { + +// Converts a TF size (64-bit) to TFLite (32-bit) and properly converts TF's +// value for dynamic size (`std::numeric_limits::min()`) to the +// TFLite-specific value. +int32_t ConvertToTfliteSize(int64_t size); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_SIZE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h new file mode 100644 index 00000000..e7e3e721 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// Check if the given op has stateful operands and return their stateful +// operand indices. +bool IsStatefulOp(Operation* op, std::vector* stateful_operand_indices); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/string_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/string_utils.h new file mode 100644 index 00000000..e1ede084 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/string_utils.h @@ -0,0 +1,110 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Util methods to store a an ordered collection of strings in a char buffer. +// The format of the char buffer is: +// [0, 3] 4 bytes: N, num of strings in the collection. +// [(i+1)*4, (i+1)*4+3] 4 bytes: offset of i-th string in little endian, +// for i from 0 to N-1. +// [(N+1)*4, (N+1)*4+3] 4 bytes: length of the whole char buffer. 
+// [offset(i), offset(i+1) - 1] : content of i-th string. +// +// A typical usage: +// SimpleDynamicBuffer buf; +// char* buffer; +// # Add string "AB", string is stored in dynamic buffer. +// buf.AddString("AB", 2); +// # Write content of SimpleDynamicBuffer to buffer in format described above. +// buf.WriteToBuffer(&buffer) + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STRING_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STRING_UTILS_H_ + +#include + +#include +#include +#include + +namespace mlir::TFL { + +constexpr uint64_t kDefaultMaxLength = std::numeric_limits::max(); + +class SimpleDynamicBuffer { + public: + explicit SimpleDynamicBuffer(size_t max_length = kDefaultMaxLength) + : offset_({0}), max_length_(max_length) {} + + // Add string to dynamic buffer by resizing the buffer and copying the data. + bool AddString(const char* str, size_t len); + + // Fill content into a buffer and returns the number of bytes stored. + // The function allocates space for the buffer but does NOT take ownership. + int WriteToBuffer(char** buffer); + + protected: + // Data buffer to store contents of strings, not including headers. + std::vector data_; + // Offset of the starting index of each string in data buffer. + std::vector offset_; + // Max length in number of characters that we permit the total + // buffer containing the concatenation of all added strings to be. + // For historical reasons this is limited to 32bit length. At this files + // inception, sizes were represented using 32bit which forced an implicit cap + // on the size of the buffer. When this was refactored to use size_t (which + // could be 64bit) we enforce that the buffer remains at most 32bit length to + // avoid a change in behavior. + const size_t max_length_; +}; + +// Convenient structure to store string pointer and length. Note that +// methods on SimpleDynamicBuffer enforce that the whole buffer (and by +// extension every contained string) is of max length (2ul << 30) - 1. See +// string_util.cc for more info. +typedef struct { + const char* str; + size_t len; +} StringRef; + +// Return num of strings in a String tensor. +inline int GetStringCount(const void* raw_buffer) { + // The first integers in the raw buffer is the number of strings. + // + // NOTE: The string buffer is accessed here as if it's native endian (instead + // of small endian, as documented in the header). This will protentially break + // when TFLite is ported to big endian platforms. + // TODO(b/165919229): This code will need changing if/when we port to a + // big-endian platform. + return *static_cast(raw_buffer); +} + +// Get String pointer and length of index-th string in tensor. +// NOTE: This will not create a copy of string data. +inline StringRef GetString(const void* raw_buffer, int string_index) { + // NOTE: The string buffer is accessed here as if it's native endian (instead + // of small endian, as documented in the header). This will protentially break + // when TFLite is ported to big endian platforms. + // TODO(b/165919229): This code will need changing if/when we port to a + // big-endian platform. 
+ const int32_t* offset = + static_cast(raw_buffer) + (string_index + 1); + const size_t string_len = (*(offset + 1)) - (*offset); + return StringRef{static_cast(raw_buffer) + (*offset), + string_len}; +} + +} // namespace mlir::TFL + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STRING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/tftext_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/tftext_utils.h new file mode 100644 index 00000000..eafa2d44 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/tftext_utils.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/core/framework/op.h" + +namespace mlir { +namespace TFL { + +// Fuse TF.Text APIs annotated by tf.function to a TFLite custom op. +LogicalResult ConvertTFTextAPI(mlir::func::FuncOp func, llvm::StringRef api, + mlir::TF::FuncAttr attr); + +// Check if TF.Text Tensorflow ops are registered. +bool IsTFTextRegistered(const tensorflow::OpRegistry* op_registery); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/utils.h new file mode 100644 index 00000000..53f6a038 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/utils.h @@ -0,0 +1,408 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
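A standalone writer for the string-tensor layout documented in string_utils.h above makes the offsets concrete: a 4-byte string count, one 4-byte start offset per string, the total buffer length (which doubles as the end offset of the last string), then the concatenated contents. This is only an illustration of the documented layout, not the SimpleDynamicBuffer implementation.

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Serializes strings into the layout read back by GetStringCount/GetString
// above: header[0] = N, header[1..N] = start offsets, header[N+1] = total size.
std::vector<char> SerializeStringsForIllustration(
    const std::vector<std::string>& strings) {
  const int32_t num_strings = static_cast<int32_t>(strings.size());
  const int32_t header_bytes =
      static_cast<int32_t>((num_strings + 2) * sizeof(int32_t));
  int32_t total_bytes = header_bytes;
  for (const auto& s : strings) total_bytes += static_cast<int32_t>(s.size());

  std::vector<char> buffer(total_bytes);
  int32_t* header = reinterpret_cast<int32_t*>(buffer.data());
  header[0] = num_strings;
  int32_t offset = header_bytes;
  for (int32_t i = 0; i < num_strings; ++i) {
    header[i + 1] = offset;
    std::memcpy(buffer.data() + offset, strings[i].data(), strings[i].size());
    offset += static_cast<int32_t>(strings[i].size());
  }
  header[num_strings + 1] = offset;  // Total length, also offset(N).
  return buffer;
}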
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ + +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +using llvm::ArrayRef; +using mlir::Operation; +using mlir::ShapedType; +using mlir::Value; + +// Returns true if the value is the min float value. +inline bool IsNegInfiniteValue(APFloat value) { + if (!value.isNegative()) return false; + return value.isInfinity(); +} + +// Returns true if the value is the max float value. +inline bool IsPosInfiniteValue(APFloat value) { + if (value.isNegative()) return false; + return value.isInfinity(); +} + +// Returns true if all tensor value in `values` has static shape and same shape. +inline bool OpHasSameStaticShapes(Operation* op) { + auto values = op->getOperands(); + int operand_num = 0; + ArrayRef shape; + for (Value value : values) { + auto shaped_type = value.getType().dyn_cast(); + if (!shaped_type || !shaped_type.hasStaticShape()) { + return false; + } + if (operand_num == 0) { + shape = shaped_type.getShape(); + } else { + if (shape != shaped_type.getShape()) { + return false; + } + } + ++operand_num; + } + return true; +} + +// Utility function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +inline DenseElementsAttr RemapPermutation(Value permutation1, + DenseElementsAttr perm2_const) { + SmallVector initial_permutation; + DenseElementsAttr perm1_const; + + SmallVector new_permutation; + if (matchPattern(permutation1, m_Constant(&perm1_const))) { + for (int32_t idx = 0; idx < perm1_const.getNumElements(); ++idx) { + initial_permutation.push_back(idx); + } + for (auto perm : perm2_const.getValues()) { + new_permutation.push_back( + initial_permutation[perm1_const + .getValues()[perm.getSExtValue()] + .getSExtValue()]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(new_permutation.size())}, + mlir::IntegerType::get(permutation1.getContext(), 32)), + llvm::ArrayRef(new_permutation)); +} + +// Utility function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +inline DenseElementsAttr RemapPermutation(Value permutation1, + Value permutation2) { + DenseElementsAttr perm2_const; + (void)matchPattern(permutation2, m_Constant(&perm2_const)); + + return RemapPermutation(permutation1, perm2_const); +} + +// Returns true if the transpose op is trivial. Trivial means that +// the permutation is a cyclic permutation of the original shape with only the +// identity dimensions permuted. 
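The RemapPermutation helpers above compose two transpose permutations into one. Expressed over plain vectors the rule is compact and may be easier to follow than the attribute plumbing; the sketch below is illustrative only.

#include <cstdint>
#include <vector>

// Composing a transpose with permutation `perm1` followed by one with `perm2`
// yields a single transpose whose output axis i reads input axis
// perm1[perm2[i]].
std::vector<int32_t> ComposePermutations(const std::vector<int32_t>& perm1,
                                         const std::vector<int32_t>& perm2) {
  std::vector<int32_t> combined(perm2.size());
  for (size_t i = 0; i < perm2.size(); ++i) {
    combined[i] = perm1[perm2[i]];
  }
  return combined;
}
// Example: perm1 = {0, 2, 1}, perm2 = {2, 1, 0} composes to {1, 2, 0}.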
+inline bool IsTransposeTrivial(llvm::ArrayRef input_shape, + Value perm) { + DenseElementsAttr perm_values_attr; + if (!matchPattern(perm, m_Constant(&perm_values_attr))) return false; + + SmallVector perm_values; + for (const auto& dim : perm_values_attr.getValues()) + perm_values.push_back(dim.getSExtValue()); + + // This should never happen unless the input graph is malformed. + if (input_shape.size() != perm_values.size()) { + return false; + } + + SmallVector old_major_index_ordering; + SmallVector new_major_index_ordering; + for (int i = 0, end = input_shape.size(); i < end; i++) { + if (input_shape[i] != 1) { + old_major_index_ordering.push_back(i); + } + + if (input_shape[perm_values[i]] != 1) { + new_major_index_ordering.push_back(perm_values[i]); + } + } + return (old_major_index_ordering == new_major_index_ordering); +} + +// Returns the permutation that maps the input shape to the output shape. +// This is only valid for trivial reshape ops. +inline DenseElementsAttr GetPermutationFromTrivialReshape( + ShapedType input_type, ShapedType output_type) { + ArrayRef in_shape = input_type.getShape(); + ArrayRef out_shape = output_type.getShape(); + + // Get the indexes of the non-identity dimensions and the identity dimensions + // in the input shape. + SmallVector input_nonidentity_dims_index_array; + SmallVector input_identity_dims_index_array; + + // Since the reshape is trivial, the input and output shapes should have the + // same number of dimensions. And the non-identity dimensions must be in the + // same cyclic order. + for (size_t idx = 0; idx < in_shape.size(); ++idx) { + if (in_shape[idx] != 1) { + input_nonidentity_dims_index_array.push_back(idx); + } else { + input_identity_dims_index_array.push_back(idx); + } + } + + // Get the permutation that maps the input shape to the output shape. + SmallVector permutation; + size_t nonidentity_dims_index_poiter = 0; + size_t identity_dims_index_pointer = 0; + for (auto out_dim : out_shape) { + if (out_dim != 1) { + permutation.push_back( + input_nonidentity_dims_index_array[nonidentity_dims_index_poiter++]); + } else { + permutation.push_back( + input_identity_dims_index_array[identity_dims_index_pointer++]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(permutation.size())}, + mlir::IntegerType::get(input_type.getContext(), 32)), + llvm::ArrayRef(permutation)); +} + +// Returns true if the reshape op is equivalent to a transpose op. +// This is true if the reshape op is a trivial reshape op, meaning no change in +// the order of non-identity dimensions. +inline bool IsReshapeEquivalentToTranspose(ShapedType input_type, + ShapedType output_type) { + std::vector in_shape{input_type.getShape().vec()}; + std::vector out_shape{output_type.getShape().vec()}; + + // If the reshape changes the number of dimensions so it cannot be interpreted + // as a transpose. + if (in_shape.size() != out_shape.size()) { + return false; + } + + in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1), + in_shape.end()); + out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1), + out_shape.end()); + return in_shape == out_shape; +} + +// Checks if all elements in the constant attribute value are 1. +inline bool IsAllOnesConstant(Attribute value) { + auto values = value.cast().getValues(); + return !std::any_of(values.begin(), values.end(), + [](int32_t element_value) { return element_value != 1; }); +} + +// Checks if all elements in the constant attribute value are non-negative. 
+inline bool HasNonNegativeValues(Attribute value) { + auto values = value.cast().getValues(); + return !std::any_of( + values.begin(), values.end(), + [](const APInt& element_value) { return element_value.isNegative(); }); +} + +// Utility function to get the offset between two dense attribute values. +inline TypedAttr GetOffSet(Attribute begin, Attribute end) { + auto begin_values = begin.cast().getValues(); + auto end_values = end.cast().getValues(); + + SmallVector offsets; + if (begin_values.size() == end_values.size()) { + for (size_t i = 0; i < begin_values.size(); ++i) { + offsets.push_back(end_values[i] - begin_values[i]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get({static_cast(offsets.size())}, + mlir::IntegerType::get(begin.getContext(), 32)), + llvm::ArrayRef(offsets)); +} + +// Check if the offset between two dense attribute values is non-negative. +inline bool HasNonNegativeOffset(Attribute begin, Attribute end) { + return HasNonNegativeValues(GetOffSet(begin, end)); +} + +// Return true if the permutation value only swaps the last two dimensions +inline bool AreLastTwoDimsTransposed(Value permutation) { + if (!permutation) return false; + DenseElementsAttr perm_values_attr; + + if (!matchPattern(permutation, m_Constant(&perm_values_attr))) return false; + auto perm_values = perm_values_attr.getValues(); + size_t idx = 0; + for (; idx < perm_values_attr.size() - 2; ++idx) { + if (perm_values[idx].getSExtValue() != idx) return false; + } + + return (perm_values[idx].getSExtValue() == perm_values_attr.size() - 1) && + (perm_values[idx + 1].getSExtValue() == idx); +} + +// Gets the new type after transposing the last 2 dimensions. +inline Type TransposeLastTwoDims(Type type) { + auto shaped_type = type.dyn_cast(); + if (!shaped_type.hasStaticShape() || shaped_type.getRank() < 2) { + return nullptr; + } + int rank = shaped_type.getRank(); + if (rank < 2) { + return nullptr; + } + SmallVector new_shape(shaped_type.getShape().begin(), + shaped_type.getShape().end()); + std::swap(new_shape[rank - 1], new_shape[rank - 2]); + return shaped_type.clone(new_shape); +} + +// Returns a ShapedType for a permutation and the shape of input after +// applying the permutation to the given shape through a transpose. +inline ShapedType GetTransposedType(Value input, + llvm::ArrayRef permutation_array) { + auto input_type = input.getType().cast(); + if (permutation_array.size() != input_type.getRank()) { + return nullptr; + } + llvm::SmallVector transposed_shape(permutation_array.size()); + for (int64_t i = 0; i < permutation_array.size(); ++i) { + transposed_shape[i] = input_type.getDimSize(permutation_array[i]); + } + auto transposed_type = + RankedTensorType::get(transposed_shape, input_type.getElementType()); + return transposed_type; +} + +// Return the resultant shape if the shape of the supplied attribute/value is +// expanded by n leading 1s'. +inline SmallVector GetExpandedShape(Value input_val, int n) { + auto input_shape = mlir::cast(input_val.getType()).getShape(); + SmallVector expanded_shape; + expanded_shape.reserve(input_shape.size() + n); + for (int i = 0; i < n; ++i) { + expanded_shape.push_back(1); + } + expanded_shape.insert(expanded_shape.end(), input_shape.begin(), + input_shape.end()); + return expanded_shape; +} + +// Return the resultant shape as a DenseElementsAttr if the shape of the +// supplied attribute/value is expanded by n leading 1s'. 
+inline DenseElementsAttr GetExpandedShapeAttr(Value input_val, int n) { + auto expanded_shape = GetExpandedShape(input_val, n); + + return mlir::DenseElementsAttr::get( + RankedTensorType::get({static_cast(expanded_shape.size())}, + mlir::IntegerType::get(input_val.getContext(), 32)), + llvm::ArrayRef(expanded_shape)); +} + +// Return the resultant shape type if the shape of the supplied attribute/value +// is expanded by n leading 1s'. +inline ShapedType GetExpandedShapeType(Value input_val, int n) { + auto expanded_shape = GetExpandedShape(input_val, n); + return RankedTensorType::get( + SmallVector{expanded_shape.begin(), expanded_shape.end()}, + mlir::cast(input_val.getType()).getElementType()); +} + +// Returns shape of a ranked tensor. +// Precondition: output_val's is ranked tensor. +// Returns a truncated shape when `truncate` is set to true. +inline DenseElementsAttr GetShape(Value output_val, bool truncate = false) { + auto output_shape = output_val.getType().dyn_cast().getShape(); + + SmallVector shape; + shape.reserve(output_shape.size()); + + bool needs_truncation = true; + for (size_t dim_idx = 0; dim_idx < output_shape.size(); ++dim_idx) { + int64_t dim = output_shape[dim_idx]; + if (truncate && needs_truncation && dim == 1) { + continue; + } else if (needs_truncation && dim != 1) { + needs_truncation = false; + } + shape.push_back(ShapedType::isDynamic(dim) ? -1 + : static_cast(dim)); + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(shape.size())}, + mlir::IntegerType::get(output_val.getContext(), 32)), + llvm::ArrayRef(shape)); +} + +//////////////////////////////////////////////////////////////////////////////// +///////////////// OP BROADCASTING UTILITIES //////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +// Returns whether the resultant type of any broadcastable operation with +// operands `a` and `b` matches `expected_output`. Returns false if `a` is not +// broadcast-compatible with `b`. +inline bool OperandsBroadcastToOutputType(Type a, Type b, + Type expected_output) { + Type output_element_type = + mlir::cast(expected_output).getElementType(); + Type broadcasted_type = + OpTrait::util::getBroadcastedType(a, b, output_element_type); + return broadcasted_type != Type() && broadcasted_type == expected_output; +} + +// Returns int, float or complex DenseElementsAttr with scalar shape with the +// given element type and the integer value. 
+template +DenseElementsAttr GetScalarOfType(Type ty, T raw_value) { + RankedTensorType scalar_ty = RankedTensorType::get({}, ty); + if (auto float_ty = mlir::dyn_cast(ty)) { + FloatAttr attr = FloatAttr::get(float_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto int_ty = mlir::dyn_cast(ty)) { + IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto complex_ty = mlir::dyn_cast(ty)) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } + } + llvm_unreachable("unsupported type"); +} + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/validators.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/validators.h new file mode 100644 index 00000000..be24f40f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/validators.h @@ -0,0 +1,126 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common validators used by TFLite transformation +// passes to validate op attributes or values. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// TODO(jpienaar): Change these to being one of these variants and/or generate +// these predicates. + +// Returns true if the given TensorFlow op does not have a `data_format` +// attribute (then default to "NHWC"), or its `data_format` attribute is "NHWC". +inline bool TFDataFormatIsNHWC(Operation *op) { + auto attr = op->getAttrOfType("data_format"); + return !attr || attr.getValue() == "NHWC"; +} + +// Returns true if the given TensorFlow op does not have a `data_format` +// attribute (then default to "NDHWC"), or its `data_format` attribute is +// "NDHWC". 
+inline bool TFDataFormatIsNDHWC(Operation *op) { + auto attr = op->getAttrOfType("data_format"); + return !attr || attr.getValue() == "NDHWC"; +} + +// Returns true if the given `op` +// * has an attribute with the given `name`, +// * and the attribute is an integer list of the form [1, X, Y, 1], +// and writes X, Y as 32-bit integer attribute to `x`, `y`. +bool TFIntListIs1XY1(Operation *op, StringRef name, IntegerAttr *x, + IntegerAttr *y); + +// Returns true if the attribute is an integer list of the form [1, X, Y, 1]. +bool TFIntListIs1XY1(Attribute attr); + +// Returns true if the attribute is an integer list of the form [1, 1, X, Y]. +bool TFIntListIs11XY(Attribute attr); + +// Returns true if the given `op` +// * has an attribute with the given `name`, +// * and the attribute is an integer list of the form [1, X, Y, Z, 1], +// and writes X, Y as 32-bit integer attribute to `x`, `y`, z. +bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x, + IntegerAttr *y, IntegerAttr *z); + +// Returns true if every element of the attribute is 1. All elements of `attr` +// must be `IntegerAttr`. +bool TFIntListIsAllOnes(Attribute attr); + +// Returns true iff the given value is a float32 tensor. +// is "DT_FLOAT". +inline bool TFTypeIsFloat32Tensor(Value value) { + auto tensorType = mlir::dyn_cast(value.getType()); + if (!tensorType) return false; + return tensorType.getElementType().isF32(); +} + +// Returns true iff the given value is a bf16 tensor. +inline bool TFTypeIsBFloat16Tensor(Value value) { + auto tensorType = mlir::dyn_cast(value.getType()); + if (!tensorType) return false; + return tensorType.getElementType().isBF16(); +} + +// Returns true iff the given value is a f16 tensor. +inline bool TFTypeIsHalfTensor(Value value) { + auto tensorType = mlir::dyn_cast(value.getType()); + if (!tensorType) return false; + return tensorType.getElementType().isF16(); +} + +// Returns true iff the given value is a f16 or bf16 tensor. +inline bool TFTypeIsBFloat16OrHalfTensor(Value value) { + return TFTypeIsBFloat16Tensor(value) || TFTypeIsHalfTensor(value); +} + +// Returns true iff the given TensorFlow op has a `padding` attribute whose +// value is "SAME" or "VALID", and writes the attribute to `padding`. +inline bool TFPaddingIsSameOrValid(Operation *op, StringAttr *padding) { + auto padding_attr = op->getAttrOfType("padding"); + if (padding_attr.getValue() != "SAME" && padding_attr.getValue() != "VALID") + return false; + *padding = padding_attr; + return true; +} + +/// Returns whether the given `a` and `b` have broadcast-compatible +/// types. +bool IsBroadcastableElementsAttrs(mlir::TypedAttr a, mlir::TypedAttr b); +// Returns true if every dimension of the attribute is 1 except the last one. +bool IsDimensionsDegenerateExceptLastOne(mlir::TypedAttr val); +// Returns true if every element is 1 except the last one. +bool IsDimensionsDegenerateExceptLastOne(ArrayRef elements_shape); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/variables_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/variables_utils.h new file mode 100644 index 00000000..570f9afd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/utils/variables_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VARIABLES_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VARIABLES_UTILS_H_ + +#include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace utils { + +// Returns true if 'op' has type that is supported by native TFLite +// variables. +bool IsSupportedVariableType(Operation* op); + +// Returns true if 'type' is supported by native tflite variables. +bool IsSupportedVariableType(ShapedType type); + +} // namespace utils +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VARIABLES_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/version.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/version.h new file mode 100644 index 00000000..321bd395 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/lite/version.h @@ -0,0 +1,25 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_VERSION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_VERSION_H_ + +// LINT.IfChange(tflite_schema_version) +// The version number of the Schema. Ideally all changes will be backward +// compatible. If that ever changes, we must ensure that version is the first +// entry in the new tflite root so that we can see that version is not 1. +#define TFLITE_SCHEMA_VERSION (3) +// LINT.ThenChange(//tensorflow/lite/version.h) + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_VERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h new file mode 100644 index 00000000..1e817d0a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h @@ -0,0 +1,232 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" +#include "absl/log/check.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// -------------------------------------------------------------------------- // +// MLIR passes running on Tensorflow function graphs (Tensorflow V2). +// -------------------------------------------------------------------------- // + +// Disabled - skip execution of the pass. +// Enabled - execute the pass, propagate errors to the caller if any. +// FallbackEnabled - execute the pass and commit all the changes to the MLIR +// module in case of success. Do not commit any changes in case of failures, +// let the rest of the pipeline run. +enum class MlirOptimizationPassState { Disabled, Enabled, FallbackEnabled }; + +// An API for registering MLIR ModulePass with the Tensorflow runtime. These +// passes are running only for function graphs built by Tensorflow V2 and +// instantiated by the process_function_library_runtime (see +// FunctionOptimizationPass for details). +class MlirOptimizationPass { + public: + virtual ~MlirOptimizationPass() = default; + virtual llvm::StringRef name() const = 0; + + // Returns an enum value: + // Enabled if the pass is enabled for the given graph with specified config. + // Disabled if the pass is disabled. + // FallbackEnabled if the pass needs to be executed in fallback mode. + // + // When the pass is FallbackEnabled, the pass is executed and the changes it + // makes to the MLIR module will be committed only if the pass was successful, + // otherwise no changes are committed and the rest of the pipeline is run. + // + // `device_set` can be nullptr if the devices information is not + // available or no device specific filtering is required. + // `function_library` contains function definitions for function calls in + // `graph` not included in the `graph` FunctionLibraryDefinition. 
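// A hypothetical subclass sketch (illustrative only; `MyPass` is not part of
// this header) showing how the pure-virtual methods declared below are
// typically overridden:
//
//   class MyPass : public MlirOptimizationPass {
//    public:
//     llvm::StringRef name() const override { return "my-pass"; }
//     MlirOptimizationPassState GetPassState(
//         const DeviceSet* device_set, const ConfigProto& config_proto,
//         const Graph& graph,
//         const FunctionLibraryDefinition& function_library) const override {
//       // Commit changes only on success; never abort the rest of the
//       // pipeline.
//       return MlirOptimizationPassState::FallbackEnabled;
//     }
//     absl::Status Run(const std::string& function_name,
//                      const ConfigProto& config_proto, mlir::ModuleOp module,
//                      const Graph& graph,
//                      const FunctionLibraryDefinition& function_library)
//         override {
//       return absl::OkStatus();
//     }
//   };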
+ virtual MlirOptimizationPassState GetPassState( + const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library) const = 0; + + virtual absl::Status Run( + const std::string& function_name, const ConfigProto& config_proto, + mlir::ModuleOp module, const Graph& graph, + const FunctionLibraryDefinition& function_library) = 0; +}; + +class MlirOptimizationPassRegistry { + public: + struct PassRegistration { + int priority; + std::unique_ptr pass; + }; + + struct PriorityComparator { + bool operator()(const PassRegistration& x, + const PassRegistration& y) const { + return x.priority < y.priority; + } + }; + + using Passes = std::set; + + // Returns the global registry of MLIR optimization passes. + static MlirOptimizationPassRegistry& Global(); + + // Register optimization `pass` with the given `priority`. + void Add(int priority, std::unique_ptr pass) { + auto inserted = passes_.insert({priority, std::move(pass)}); + CHECK(inserted.second) + << "Pass priority must be unique. " + << "Previously registered pass with the same priority: " + << inserted.first->pass->name().str(); + } + + // Free the memory allocated for all passes. + void ClearPasses() { passes_.clear(); } + + const Passes& passes() const { return passes_; } + + private: + Passes passes_; +}; + +// Function optimization pass that runs all MLIR passes registered in +// MlirOptimizationPassRegistry. +class MlirFunctionOptimizationPass : public FunctionOptimizationPass { + public: + explicit MlirFunctionOptimizationPass( + const MlirOptimizationPassRegistry* registry = + &MlirOptimizationPassRegistry::Global()) + : registry_(registry) {} + + // Executes all of the underlying registered MlirOptimizationPasses. + absl::Status Run( + const std::string& function_name, const DeviceSet& device_set, + const ConfigProto& config_proto, + const FunctionOptimizationPass::FunctionOptions& function_options, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override; + + private: + const MlirOptimizationPassRegistry* registry_; +}; + +// -------------------------------------------------------------------------- // +// MLIR passes running on Tensorflow V1 graphs. +// -------------------------------------------------------------------------- // + +// An API for registering MLIR ModulePass with the Tensorflow runtime. These +// passes are running only for V1 graphs (legacy graphs) executed via Session +// runtime. Graph importer updates legacy graph behavior to V2 constructs (e.g. +// it raises control flow from Switch/Merge nodes to functional control flow +// with If/While operations). +class MlirV1CompatOptimizationPass { + public: + virtual ~MlirV1CompatOptimizationPass() = default; + virtual llvm::StringRef name() const = 0; + + // Returns a MlirOptimizationPassState based on the given graph and + // config. See comments on `MlirOptimizationPassState` enum for more info + // on exact values. + virtual MlirOptimizationPassState GetPassState( + const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library) const = 0; + + virtual absl::Status Run(const GraphOptimizationPassOptions& options, + mlir::ModuleOp module) = 0; +}; + +class MlirV1CompatOptimizationPassRegistry { + public: + // Returns the global registry of MLIR optimization passes. 
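// Illustrative registration sketch (assumes `MyV1Pass` is a hypothetical
// MlirV1CompatOptimizationPass subclass; not part of the original header):
//
//   static mlir_pass_registration::MlirV1CompatOptimizationPassRegistration
//       register_my_v1_pass(std::make_unique<MyV1Pass>());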
+ static MlirV1CompatOptimizationPassRegistry& Global(); + + void Add(std::unique_ptr pass) { + CHECK(pass_ == nullptr) << "Only a single pass can be registered"; + pass_ = std::move(pass); + } + + MlirV1CompatOptimizationPass* pass() const { + return pass_ ? pass_.get() : nullptr; + } + + // Free the memory allocated for the single pass. + // This method is used for testing mostly. + void ClearPass() { pass_.reset(); } + + private: + std::unique_ptr pass_{}; +}; + +class MlirV1CompatGraphOptimizationPass : public GraphOptimizationPass { + public: + explicit MlirV1CompatGraphOptimizationPass( + const MlirV1CompatOptimizationPassRegistry* registry = + &MlirV1CompatOptimizationPassRegistry::Global()) + : registry_(registry) {} + + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + private: + const MlirV1CompatOptimizationPassRegistry* registry_; +}; + +// -------------------------------------------------------------------------- // +// Helper classes for static registration of MLIR (V1 Compat) passes in the +// corresponding registry. +// -------------------------------------------------------------------------- // + +namespace mlir_pass_registration { + +class MlirOptimizationPassRegistration { + public: + explicit MlirOptimizationPassRegistration( + int priority, std::unique_ptr pass) { + MlirOptimizationPassRegistry::Global().Add(priority, std::move(pass)); + } +}; + +class MlirV1CompatOptimizationPassRegistration { + public: + explicit MlirV1CompatOptimizationPassRegistration( + std::unique_ptr pass) { + MlirV1CompatOptimizationPassRegistry::Global().Add(std::move(pass)); + } +}; + +} // namespace mlir_pass_registration + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/op_or_arg_name_mapper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/op_or_arg_name_mapper.h new file mode 100644 index 00000000..f8c596ff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/op_or_arg_name_mapper.h @@ -0,0 +1,105 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_OP_OR_ARG_NAME_MAPPER_H_ +#define TENSORFLOW_COMPILER_MLIR_OP_OR_ARG_NAME_MAPPER_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project + +namespace tensorflow { + +// PointerUnion for operation and value. +// TODO(jpienaar): Rename the files. +using OpOrVal = llvm::PointerUnion; + +// Mapper from operation or value to name. +class OpOrArgNameMapper { + public: + // Returns unique name for the given prefix. 
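// Illustrative usage sketch (assumes `op` is an Operation* with a location;
// not part of the original header):
//
//   OpOrArgLocNameMapper mapper;
//   mapper.InitOpName(op, "my_op");
//   llvm::StringRef mapped = mapper.GetUniqueName(op);      // "my_op"
//   llvm::StringRef fresh = mapper.GetUniqueName("my_op");   // e.g. "my_op1"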
+ llvm::StringRef GetUniqueName(llvm::StringRef prefix, int hash_value = 0); + + // Returns unique name for the operation or value. + llvm::StringRef GetUniqueName(OpOrVal op_or_val, int hash_value = 0); + + // Returns unique name as a string_view for the operation or value. + absl::string_view GetUniqueNameView(OpOrVal op_or_val); + + // Initializes operation or value to map to name. Returns number of + // operations or value already named 'name' which should be 0 else + // GetUniqueName could return the same names for different operations or + // values. + // Note: Its up to the caller to decide the behavior when assigning two + // operations or values to the same name. + int InitOpName(OpOrVal op_or_val, llvm::StringRef name); + + virtual ~OpOrArgNameMapper(); + + protected: + // Returns true if the name is unique. A derived class can override it if the + // class maintains uniqueness in a different scope. + virtual bool IsUnique(llvm::StringRef name); + + // Returns a constant view of the underlying map. + const llvm::DenseMap& GetMap() const { + return op_or_val_to_name_; + } + + // Returns the separator used before uniqueing suffix. + virtual llvm::StringRef GetSuffixSeparator() { return ""; } + + virtual llvm::StringRef GetDashSeparator() { return "_"; } + + private: + // Returns name from the location of the operation or value. + virtual std::string GetName(OpOrVal op_or_val) = 0; + + // Maps string name to count. This map is used to help keep track of unique + // names for operations or values. + llvm::StringMap name_to_count_; + // Maps operation or values to name. Value in map is a view of the string + // name in `name_to_count_`. Names in `name_to_count_` are never removed. + llvm::DenseMap op_or_val_to_name_; +}; + +// OpOrArgNameMapper that returns, for operations or values not initialized +// to a specific name, a name based on the location of the operation or +// value. +class OpOrArgLocNameMapper : public OpOrArgNameMapper { + protected: + std::string GetName(OpOrVal op_or_val) override; +}; + +// OpOrArgNameMapper that returns, for operations or values not initialized +// to a specific name, a short name. +class OpOrArgStripNameMapper : public OpOrArgNameMapper { + private: + std::string GetName(OpOrVal op_or_val) override; + + // Number of ops mapped. + int count_ = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_OP_OR_ARG_NAME_MAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir.h new file mode 100644 index 00000000..99a17ca1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir.h @@ -0,0 +1,114 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Functions for getting information about kernels registered in the binary. +// Migrated from previous SWIG file (mlir.i) authored by aminim@. 
+#ifndef TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/tf_status.h" + +namespace tensorflow { + +// Simple wrapper to support tf.mlir.experimental.convert_graph_def. +// Load a GraphDef (binary or textual proto format), convert to MLIR, and +// (optionally) optimize the module before returning it as a string. +// This is an early experimental API, ideally we should return a wrapper object +// around a Python binding to the MLIR module. +std::string ImportGraphDef(const std::string &proto, + const std::string &pass_pipeline, + bool show_debug_info, TF_Status *status); + +// Simple wrapper to support tf.mlir.experimental.convert_function. +// Load FunctionDef (binary or textual proto format), convert to MLIR, and +// (optionally) optimize the module before returning it as a string. +// This is an early experimental API, ideally we should return a wrapper object +// around a Python binding to the MLIR module. +std::string ImportFunction(const std::string &functiondef_proto, + const std::string &pass_pipeline, + bool show_debug_info, TFE_Context *context, + TF_Status *status); + +// This wrapper passes the graph_def taking names of input nodes, the shapes and +// types of its inputs and the output nodes as parameters to MLIR. +std::string ImportGraphDef(const std::string &proto, + const std::string &pass_pipeline, + bool show_debug_info, absl::string_view(input_names), + absl::string_view(input_data_types), + absl::string_view(input_data_shapes), + absl::string_view(output_names), TF_Status *status); + +// Load a SavedModel and return a textual MLIR string corresponding to it. +// +// Args: +// saved_model_path: File path from which to load the SavedModel. +// exported_names_str: Comma-separated list of names to export. +// Empty means "export all". +// +// Returns: +// A string of textual MLIR representing the raw imported SavedModel. +std::string ExperimentalConvertSavedModelToMlir( + const std::string &saved_model_path, const std::string &exported_names_str, + bool show_debug_info, TF_Status *status); + +// Load a SavedModel V1 and return a textual MLIR string corresponding to it +// without any MLIR graph transformation. +// +// Args: +// saved_model_path: File path from which to load the SavedModel. +// tags: Tags to identify MetaGraphDef that need to be loaded. +// upgrade_legacy: Boolean flag that indicates whether to upgrade legacy +// graphs +// +// Returns: +// A string of textual MLIR representing the raw imported SavedModel. +std::string ExperimentalConvertSavedModelV1ToMlirLite( + const std::string &saved_model_path, const std::string &exported_names_str, + const std::string &tags, bool upgrade_legacy, bool show_debug_info, + TF_Status *status); + +// Load a SavedModel V1 and return a textual MLIR string corresponding to it. +// +// Args: +// saved_model_path: File path from which to load the SavedModel. +// tags: Tags to identify MetaGraphDef that need to be loaded. +// lift_variables: Boolean flag that indicates whether to hoist variables +// after loading the SavedModel. +// +// Returns: +// A string of textual MLIR representing the raw imported SavedModel. 
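// Illustrative call sketch for the SavedModel converter declared above (path
// and flag values are placeholders, not part of the original header):
//
//   TF_Status* status = TF_NewStatus();
//   std::string mlir_text = ExperimentalConvertSavedModelToMlir(
//       "/tmp/saved_model", /*exported_names_str=*/"",
//       /*show_debug_info=*/false, status);
//   TF_DeleteStatus(status);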
+std::string ExperimentalConvertSavedModelV1ToMlir( + const std::string &saved_model_path, const std::string &exported_names_str, + const std::string &tags, bool lift_variables, + bool include_variables_in_initializers, bool upgrade_legacy, + bool show_debug_info, TF_Status *status); + +std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, + const std::string &pass_pipeline, + bool show_debug_info, + TF_Status *status); + +// Writes the input textual MLIR as bytecode to output file. +void ExperimentalWriteBytecode(const std::string &filename, + const std::string &mlir_txt, TF_Status *status); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h new file mode 100644 index 00000000..f9fbed1c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H_ +#define TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H_ + +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/stl.h" // from @pybind11 + +namespace py = pybind11; + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); + +#endif // TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h new file mode 100644 index 00000000..8d805d93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -0,0 +1,263 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ + +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/AffineMap.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +namespace mlir::quant { + +constexpr char kAttrMapAttribute[] = "attr_map"; + +// Name of the string attribute attached to `XlaCallModuleOp`, which is the +// textproto representation of `Method`. +inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method"; + +// Permutation from the NHWC tensor format to NCHW. This is an inverse +// permutation of `kNchwToNhwcPermutation`. +inline constexpr std::array kNhwcToNchwPermutation = {0, 3, 1, 2}; + +// Permutation from the NCHW tensor format to NHWC. This is an inverse +// permutation of `kNchwToNhwcPermutation`. +inline constexpr std::array kNchwToNhwcPermutation = {0, 2, 3, 1}; + +// Permutation from the OIHW (== (output features, input features, height, +// width)) tensor format to HWIO. This is commonly used to transpose convolution +// weights represented as OIHW format to HWIO, which is more desirable for +// certain downstream optimization passes (e.g. XLA). +inline constexpr std::array kOihwToHwioPermutation = {2, 3, 1, 0}; + +// Returns true if the value has static shape. +bool HasStaticShape(Value value); + +// Returns true if the value has static shape at given dims. +bool HasStaticShapeAtDims(Value value, ArrayRef dims); + +// Whether `value` has known rank of `rank`. Returns false when it is not a +// `ShapedType` or its rank is unknown. +inline bool HasRankOf(Value value, const int64_t rank) { + auto shaped_type = mlir::dyn_cast_or_null(value.getType()); + return shaped_type && shaped_type.hasRank() && shaped_type.getRank() == rank; +} + +// Creates a new type that has the shape from the `old_type` and the element +// type from the `element_type`. +Type CloneTypeWithNewElementType(Type old_type, Type element_type); + +// Creates an array with integer/float type. 
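// Illustrative usage sketch (assumes an in-scope OpBuilder `builder` and
// Location `loc`; not part of the original header):
//
//   Value splat_zero = CreateConstValue<float>(
//       builder, loc, /*shape=*/{2, 2}, /*values=*/{0.f, 0.f, 0.f, 0.f});
//   Value axis = Create1DConstValue<int32_t>(builder, loc, {0});
//   Value one = CreateScalarConstValue<int64_t>(builder, loc, 1);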
+template || std::is_same_v), void>> +Value CreateConstValue(OpBuilder& builder, const Location loc, + const SmallVector& shape, + const SmallVector& values) { + if constexpr (std::is_integral_v) { + auto shape_type = + RankedTensorType::get(shape, builder.getIntegerType(sizeof(T) * 8)); + + const auto attr = DenseIntElementsAttr::get(shape_type, values); + return builder.create(loc, attr); + } + + const auto type = RankedTensorType::get(shape, builder.getF32Type()); + const auto value_attr = DenseFPElementsAttr::get(type, values); + return builder.create(loc, value_attr); +} + +// Creates a 1D array with integer/float type. +template +Value Create1DConstValue(OpBuilder& builder, const Location loc, + const SmallVector& values) { + return CreateConstValue(builder, loc, + {static_cast(values.size())}, values); +} + +// Creates a scalar with integer / float type. +template +Value CreateScalarConstValue(OpBuilder& builder, const Location loc, + const T value) { + return CreateConstValue(builder, loc, /*shape=*/{}, {value}); +} + +// Checks if the value is a constant and return its splat value. +template || std::is_same_v), void>> +bool GetSplatValue(Value value, T& splat_value) { + if constexpr (std::is_integral_v) { + DenseIntElementsAttr value_attr; + if (!matchPattern(value, m_Constant(&value_attr)) || + !value_attr.isSplat()) { + return false; + } + splat_value = value_attr.getSplatValue(); + return true; + } + + DenseFPElementsAttr value_attr; + if (!matchPattern(value, m_Constant(&value_attr)) || !value_attr.isSplat()) { + return false; + } + splat_value = value_attr.getSplatValue(); + return true; +} + +// Checks if the value is a constant and its splat value is equal to x. +template +bool IsSplatValueEqual(Value value, const T x) { + T splat_value; + if (!GetSplatValue(value, splat_value)) return false; + + return splat_value == x; +} + +// Checks if two values are constants and their splat values are equal. +template +bool AreSplatValuesEqual(Value x, Value y) { + T splat_x, splat_y; + if (!GetSplatValue(x, splat_x) || !GetSplatValue(y, splat_y)) { + return false; + } + + return splat_x == splat_y; +} + +// Clones an operation with new operands while keeping attributes. +SmallVector CloneOpWithReplacedOperands(OpBuilder& builder, + Operation* op, + ArrayRef new_operands); + +// Tries casting `op` with a concrete op type `T`. If the cast fails or `op` is +// a `nullptr`, returns `failure` and prints a debugging message identifying +// the cast attempt as `name`. +template +FailureOr TryCast(Operation* op, const StringRef name) { + auto cast_op = dyn_cast_or_null(op); + if (cast_op) { + return cast_op; + } else { + DEBUG_WITH_TYPE("mlir-quant-attrs-and-constraints", + llvm::dbgs() << "Failed to match " << name << " (" + << T::getOperationName() << ").\n"); + return failure(); + } +} + +FailureOr CastI64ToI32(int64_t value); + +// Tries to cast an array of int64 to int32. If any of the element in the +// array is not in the range of int32, returns failure(). +FailureOr> CastI64ArrayToI32( + ArrayRef int64_array); + +// Returns the first operation with the given type in the function. +template +OpType FindOperationOfType(func::FuncOp function) { + for (auto op : function.getBody().getOps()) { + return op; + } + return nullptr; +} + +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. 
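// Illustrative usage sketch (assumes `op` is a non-null Operation*; not part
// of the original header):
//
//   if (Operation* user = FindUserOfType<TF::XlaCallModuleOp>(op)) {
//     // `op` feeds a tf.XlaCallModule call.
//   }
//   FailureOr<TF::XlaCallModuleOp> call =
//       TryCast<TF::XlaCallModuleOp>(op, /*name=*/"xla_call_module");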
+template +Operation* FindUserOfType(Operation* op) { + for (Operation* user : op->getUsers()) { + if (isa(user)) { + return user; + } + } + return nullptr; +} + +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. +template +Operation* FindOperandOfType(Operation* op) { + for (Value operand_value : op->getOperands()) { + if (isa(operand_value.getDefiningOp())) { + return operand_value.getDefiningOp(); + } + } + return nullptr; +} + +// Returns the function attribute for the given call op which is lifted for +// quantization. +inline FlatSymbolRefAttr GetFuncAttr(TF::PartitionedCallOp call_op) { + return mlir::dyn_cast(call_op.getFAttr()); +} + +inline FlatSymbolRefAttr GetFuncAttr(TF::XlaCallModuleOp call_op) { + return call_op->getAttrOfType( + TF::kStablehloEntryFunctionAttrName); +} + +// Returns the entry function name for the given tf.XlaCallModule op. Returns +// empty string if such attribute does not exist. +StringRef GetEntryFunctionName(TF::XlaCallModuleOp op); + +// Checks whether the given op contains QuantizationTrait::FullyQuantizable. +inline bool HasQuantizableTrait(Operation* op) { + return op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; +} + +// Returns true if `op` has two operands and one result and only second operand +// is quantized. +bool IsHybridQuantizedOp(Operation* op); + +// Returns whether a given `stablehlo.dot_general` can be legalizable to +// `tfl.fully_connected`. +absl::StatusOr IsDotGeneralFullyConnected( + ::mlir::stablehlo::DotGeneralOp dot_general_op); + +// Returns the quantization dimension for a given `stablehlo.dot_general` op, +// or `std::nullopt` if the given op is not per-channel quantizable. +std::optional GetDotGeneralQuantizationDim( + ::mlir::stablehlo::DotGeneralOp dot_general_op); + +// Checks if a `StringRef` contains 'conv' or 'dot_general'. +bool ContainsConvOrDot(StringRef str); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/func.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/func.h new file mode 100644 index 00000000..ade7bcfc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/func.h @@ -0,0 +1,31 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace mlir::quant { + +// Returns a public `func::FuncOp` in `module_op` whose name matches either +// `main` or `serving_default`. If `func::FuncOps` with both names exist, the +// function with name "main" takes precedence. Returns null if no such a +// function exists. +func::FuncOp FindMainFuncOp(ModuleOp module_op); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h new file mode 100644 index 00000000..9e0e0e63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h @@ -0,0 +1,79 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// +// This file defines support utilities for interoperating with FakeQuant* based +// QAT (Quantized Aware Training) computations, as implemented by TFLite. Note +// that FakeQuant* operators mix multiple concerns specific to how TFLite +// originally implemented quantization. As such, utilities here enforce +// opinions taken by that codebase (vs providing any amount of genericity). +// +// Specifically, it combines the following concerns, each of which would be +// independent variables in a more generic setup: +// - numBits and isSigned imply storage data type (uint8, int8, int16) +// - numBits < 8 is promoted to uint8 or int8 +// - "narrow_range" narrows the lower bound of the storage type's range by +// 1 +// - the specified min/max values are "nudged" so that the result has a zero +// that can be exactly expressed +// - min=max=0 implies scale=0 and zero_point=0 +// +// With the above assumptions applied, every conforming specified FakeQuant op +// can be represented by a UniformQuantizedType. This scheme is not expected to +// be generalized further in the future and should be considered to be a +// legacy set of rules. +// +// As canonically used in TensorFlow graphs, the presence of a FakeQuant node +// is a hint that the specific math represented here has been simulated at +// training time. As such, it is usually not advised to arbitrarily change +// quantization parameters derived from FakeQuant. 
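// Illustrative call sketch (assumes `loc` is a Location and `f32` the f32 Type
// of the enclosing MLIRContext; the parameter values are placeholders, not
// part of the original header):
//
//   quant::UniformQuantizedType qtype = quantfork::fakeQuantAttrsToType(
//       loc, /*numBits=*/8, /*rmin=*/-1.0, /*rmax=*/1.0,
//       /*narrowRange=*/false, /*expressedType=*/f32, /*isSigned=*/true);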
+// +//===----------------------------------------------------------------------===// + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_ + +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace quantfork { + +/// Converts per-layer FakeQuant attributes to the corresponding type. +/// In the event that the parameters cannot be converted, returns a nullptr +/// convertible Type and issues an appropriate error. +/// Note that there are multiple variants of a per-layer FakeQuant op, so +/// this function takes the attributes discretely vs taking a reference to the +/// originating op. +quant::UniformQuantizedType fakeQuantAttrsToType(Location loc, unsigned numBits, + double rmin, double rmax, + bool narrowRange, + Type expressedType, + bool isSigned = false); + +/// Converts per-channel FakeQuant attributes to the corresponding type. +/// In the event that the parameters cannot be converted, returns a nullptr +/// convertible Type and issues an appropriate error. +quant::UniformQuantizedPerAxisType fakeQuantAttrsToType( + Location loc, unsigned numBits, int32_t quantizedDimension, + ArrayRef rmins, ArrayRef rmax, bool narrowRange, + Type expressedType, bool isSigned = false); +} // namespace quantfork +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h new file mode 100644 index 00000000..699b2582 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTOPS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTOPS_H_ + +#include "llvm/Support/MathExtras.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOpsDialect.h.inc" +#define GET_OP_CLASSES + +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTOPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h new file mode 100644 index 00000000..f4dcc8bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h @@ -0,0 +1,247 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ + +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir::quantfork { + +// Performs type conversion from an arbitrary input type to a type +// that is expressed by a QuantizedType. +// +// This handles cases where the inputType is a supported primitive type +// (i.e. f32, bf16, etc) or a vector/tensor type based on a supported +// elemental type. +// +// Since conversion often involves introspecting some attributes of the +// input type in order to determine how to represent it, this is a two step +// process. +struct ExpressedToQuantizedConverter { + // Creates a converter for the given input type. 
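// Illustrative usage sketch (assumes `tensor_type` is a tensor type with an
// f32 element type and `qtype` a quant::QuantizedType; not part of the
// original header):
//
//   auto converter = ExpressedToQuantizedConverter::forInputType(tensor_type);
//   if (converter) {
//     Type quantized = converter.convert(qtype);
//   }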
+ static ExpressedToQuantizedConverter forInputType(Type input_type); + + // Converts the inputType to be based on the given elemental type, + // returning the new type (or nullptr and emit an error on failure). + Type convert(quant::QuantizedType elemental_type) const; + + // Whether the conversion is legal. + explicit operator bool() const { return (bool)expressed_type; } + + // The input type that is being converted from. + // This may be an elemental or composite type. + const Type input_type; + + // Supported, elemental expressed type (i.e. f32). + // Will be nullptr if conversion is not supported. + const Type expressed_type; +}; + +// Reference implementation of converting between real numbers and values +// represented by a UniformQuantizedType. +// Note that this is not expected to be speedy and may be superseded eventually +// by a more optimal implementation. +// Also, the interface assumes that quantization is done per-layer and will +// need to be wider for various per-channel schemes. As such, this is a +// placeholder. +class UniformQuantizedValueConverter { + public: + explicit UniformQuantizedValueConverter( + quant::UniformQuantizedType uniform_type) + : UniformQuantizedValueConverter( + uniform_type.getScale(), + static_cast(uniform_type.getZeroPoint()), + static_cast(uniform_type.getStorageTypeMin()), + static_cast(uniform_type.getStorageTypeMax()), + uniform_type.getStorageTypeIntegralWidth(), + uniform_type.isSigned()) { + assert(isa(uniform_type.getExpressedType())); + assert(uniform_type.getStorageType().isSignlessInteger()); + } + + UniformQuantizedValueConverter(double scale, double zero_point, + double clamp_min, double clamp_max, + uint32_t storage_bit_width, bool is_signed) + : scale_(scale), + zero_point_(zero_point), + clamp_min_(clamp_min), + clamp_max_(clamp_max), + scale_double_(scale), + zero_point_double_(zero_point), + clamp_min_double_(clamp_min), + clamp_max_double_(clamp_max), + storage_bit_width_(storage_bit_width), + is_signed_(is_signed), + round_mode_(APFloat::rmNearestTiesToAway) {} + + UniformQuantizedValueConverter(double scale, double zero_point, + const APFloat& clamp_min, + const APFloat& clamp_max, + uint32_t storage_bit_width, bool is_signed) + : scale_(scale), + zero_point_(zero_point), + clamp_min_(clamp_min), + clamp_max_(clamp_max), + scale_double_(scale), + zero_point_double_(zero_point), + clamp_min_double_(clamp_min.convertToDouble()), + clamp_max_double_(clamp_max.convertToDouble()), + storage_bit_width_(storage_bit_width), + is_signed_(is_signed), + round_mode_(APFloat::rmNearestTiesToAway) {} + + virtual APInt quantizeFloatToInt(APFloat expressed_value) const { + // This function is a performance critical code path in quantization + // since it runs for each single float parameter value. + + // Specialize f32->u8/i8 case to optimize performance. 
+ if (&expressed_value.getSemantics() == &APFloat::IEEEsingle() && + storage_bit_width_ == 8 && + round_mode_ == llvm::APFloatBase::rmNearestTiesToAway) { + return quantizeF32ToInt8(expressed_value); + } + + bool lossy; + expressed_value.convert(scale_.getSemantics(), round_mode_, &lossy); + // fixed_point = clamp(clamp_min, clamp_max, ( + // roundHalfToEven(expressed / scale) + zero_point)) + APFloat scaled = (expressed_value / scale_); + scaled.roundToIntegral(round_mode_); + scaled.add(zero_point_, round_mode_); + APFloat fixed_point = llvm::minimum(scaled, clamp_max_); + fixed_point = llvm::maximum(fixed_point, clamp_min_); + + llvm::APSInt result(storage_bit_width_, !is_signed_); + fixed_point.convertToInteger(result, round_mode_, &lossy); + + return std::move(result); + } + + int64_t quantizeFloatToInt64(APFloat expressed_value) const { + const APInt q_value = quantizeFloatToInt(std::move(expressed_value)); + return is_signed_ ? q_value.getSExtValue() : q_value.getZExtValue(); + } + + virtual ~UniformQuantizedValueConverter() = default; + + private: + // An optimized implementation to quantize f32 to i8/u8 with C++ native + // arithmetic. + virtual APInt quantizeF32ToInt8(const APFloat& expressed_value) const { + assert(&expressed_value.getSemantics() == &APFloat::IEEEsingle()); + assert(storage_bit_width_ == 8); + assert(round_mode_ == llvm::APFloatBase::rmNearestTiesToAway); + + const float real_value = expressed_value.convertToFloat(); + + const double scaled = real_value / scale_double_ + zero_point_double_; + // Round to nearest integer with halfway cases rounded away from zero. + const double scaled_rounded = std::round(scaled); + const double clamped = std::min(std::max(scaled_rounded, clamp_min_double_), + clamp_max_double_); + + uint64_t signless_result; + if (is_signed_) { + int64_t clamped_int = static_cast(clamped); + memcpy(&signless_result, &clamped_int, sizeof(clamped_int)); + } else { + signless_result = static_cast(clamped); + } + return APInt(storage_bit_width_, signless_result, /*isSigned=*/is_signed_); + } + + // Keep both APFloat and double versions of the quantization parameters + // around since they will be used in generic and specialized arithmetic, + // respectively. + const APFloat scale_; + const APFloat zero_point_; + const APFloat clamp_min_; + const APFloat clamp_max_; + + const double scale_double_; + const double zero_point_double_; + const double clamp_min_double_; + const double clamp_max_double_; + + const uint32_t storage_bit_width_; + const bool is_signed_; + const llvm::APFloat::roundingMode round_mode_; +}; + +// An utility class to quantize an attribute by the per-axis quantization +// parameters. The size of the quantization dim in the converted elements +// attribute should match the size of of scales/zero_points vectors in the +// quantization parameters. 
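// Illustrative usage sketch (assumes `per_axis_type` is a
// quant::UniformQuantizedPerAxisType and `weights` a DenseFPElementsAttr; not
// part of the original header):
//
//   UniformQuantizedPerAxisValueConverter converter(per_axis_type);
//   ElementsAttr quantized = converter.convert(weights);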
+class UniformQuantizedPerAxisValueConverter { + public: + explicit UniformQuantizedPerAxisValueConverter( + quant::UniformQuantizedPerAxisType uniform_type) + : scales_(uniform_type.getScales()), + zero_points_(uniform_type.getZeroPoints()), + clamp_min_(static_cast(uniform_type.getStorageTypeMin())), + clamp_max_(static_cast(uniform_type.getStorageTypeMax())), + storage_bit_width_(uniform_type.getStorageTypeIntegralWidth()), + is_signed_(uniform_type.isSigned()), + quantization_dim_(uniform_type.getQuantizedDimension()) { + assert(isa(uniform_type.getExpressedType())); + assert(uniform_type.getStorageType().isSignlessInteger()); + assert(scales_.size() == zero_points_.size()); + } + + // Quantize an Attribute by the quantization parameters. Return nullptr if + // the conversion fails or the input array isn't an ElementsAttr. + ElementsAttr convert(Attribute real_value); + + private: + // Quantize an DenseFPElementsAttr by the quantization parameters. + DenseElementsAttr convert(DenseFPElementsAttr attr); + + // Get a uniform converter for the index-th chunk along the quantizationDim. + // All the elements in this chunk is quantized by the returned converter. + UniformQuantizedValueConverter getPerChunkConverter(int index) const { + return UniformQuantizedValueConverter(scales_[index], zero_points_[index], + clamp_min_, clamp_max_, + storage_bit_width_, is_signed_); + } + + const ArrayRef scales_; + const ArrayRef zero_points_; + const APFloat clamp_min_; + const APFloat clamp_max_; + const uint32_t storage_bit_width_; + const bool is_signed_; + int32_t quantization_dim_; +}; + +} // namespace mlir::quantfork + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h new file mode 100644 index 00000000..22e0307f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -0,0 +1,118 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::quant { + +// This attribute will be set for functions created by this pass. +// Presence of this attribute will mark the function as quantization target. +inline constexpr StringRef kFusedFunctionAttr = "tf_quant.composite_function"; +// The keyword to detect if this is a `NullAttribute`. +inline constexpr StringRef kNullAttributeValue = "N/A"; + +// Prefixes attached to lifted functions. +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kCompositeFuncPrefix = "composite_"; + +// The attribute will be used for TF::XlaCallModuleOp to restore the original +// function name when loading it back. +inline constexpr StringRef kOriginalStablehloEntryFunctionAttrName = + "_original_entry_function"; + +// FunctionCallOpType to be generated as the function call operator when +// function lifting will happen. +enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 }; + +// Checks if an op is inside a lifted function. +// If the given op pointer is a nullptr, returns false. +bool IsInLiftedFunc(Operation* op); + +// Checks if the op is inside a StableHLO op with region. +// If the given op pointer is a nullptr, returns false. +bool IsInStableHloOpRegion(Operation* op); + +// Checks if a given einsum op is supported for XlaDotV2 quantization. +bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr); + +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns +// `absl::InvalidArgumentError` when the attribute doesn't exist. Returns +// `absl::InternalError` when parsing the attribute to `Method` failed. +// `op` must be non-null. +absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod( + absl::Nonnull op); + +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns a default instance of +// `Method` iff the attribute doesn't exist or the attribute contains an invalid +// textproto for `Method`. `op` must be non-null. +::stablehlo::quantization::Method GetQuantizationMethodOrDefault( + absl::Nonnull op); + +// Creates a function to wrap the section between arguments and results. +// The generated function call op type will be decided by the given call_op_type +// argument. Currently, it supports TF::XlaCallModuleOp and +// TF::PartitionedCallOp function call op generations. 
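// Illustrative call sketch (assumes `builder`, `loc`, `inputs`, and `outputs`
// are in scope; the function name is a placeholder built from
// kCompositeFuncPrefix, not part of the original header):
//
//   SmallVector<Value> call_outputs = LiftAsFunctionCall(
//       builder, loc, FunctionCallOpType::TFXlaCallModuleOp,
//       /*func_name=*/"composite_dot_general_fn", inputs, outputs);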
+SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results, + ArrayRef attributes); + +// Same as above but with empty attributes. +SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results); + +// Add the second argument to the first argument, which is expected to be an +// argument list. +// Used to attach bias to einsum argument list. +SmallVector AppendToVector(ArrayRef arguments, Value append); + +// Checks if the `Method` attatched to the given `tf.XlaCallModule` op has +// `WeightOnlyPtq`. +bool HasWeightOnlyPtqMethod(TF::XlaCallModuleOp xla_call_module_op); + +// Checks if an op is a `tf.XlaCallModule` op, contains 'conv' or 'dot_general' +// in its name and has `Method` with `WeightOnlyPtq`. +bool IsWeightOnlyQuantizableOp(const Operation& op); + +// Lists the functions in a ModuleOp sorted by their names. +SmallVector GetSortedFunctions(ModuleOp module_op); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h new file mode 100644 index 00000000..cb9dac20 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h @@ -0,0 +1,252 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines node specs for quantization and the methods to parse +// command line flags to these specs. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace quant { + +// Stores information about how to quantize a user-specified custom operation. +struct CustomOpInfo { + std::vector quantizable_input_indices; + bool is_weight_only = false; + bool no_side_effect = true; +}; + +using CustomOpMap = std::unordered_map; +enum CustomOpUpdateOptions { kInputIndices, kWeightOnly, kNoSideEffect }; +enum class QDQConversionMode { kQDQNone, kQDQStatic, kQDQDynamic }; + +struct QuantizationSpecs { + // Which function this node quant specifications belong to. 
+ std::string target_func = "main"; + + // Whether to trigger quantization passses for post-training quantization. + // If true, the model input doesn't require user specified input ranges. + bool post_training_quantization = false; + + // Whether to allow dynamic range quantization. This is the easiest + // quantization mode which doesn't require QAT or sample inputs. + // This option only targets `DT_HALF` and `DT_QINT8` inference type. + bool weight_quantization = false; + + // Whether to use the MLIR dynamic range quantizer instead of TOCO. + bool enable_mlir_dynamic_range_quantizer = false; + + // Whether to allow weight-only quantization. This scheme quantizes + // weights but will dequantize them back at runtime which is useful for + // memory bound case without kernel support available in lower precisions. + // Used in MLIR dynamic range quantizer. + bool weight_only_quantization = false; + + // The minimum number of elements in a weights array required to apply + // quantization. This is especially useful not to quantize small tensors as + // it is hard to get performance benefits from them with quantization. Used + // in MLIR dynamic range quantizer with int8 weight data type. + int64_t minimum_elements_for_weights = 1024; + + // Whether to calculate scales in float to keep quantized values the same with + // old TOCO quantizer. + bool legacy_float_scale = false; + + // Whether to perform per-tensor quantization. Currently, this option is only + // valid when the quantization parameters need to be created by scanning the + // constant content (post-training quantization or QAT without weight + // FakeQuant). + bool disable_per_channel = false; + + // Whether to disable per-channel weight quantization and enable legacy per + // tensor quantization. The legacy quantization for Dense layers is + // inconsistent with Conv 1x1 which always performs per channel quantization. + bool disable_per_channel_for_dense_layers = false; + + // Whether to use fixed output ranges of the activation ops (tanh, sigmoid, + // etc.) and not infer weight constants. + // If this option is set, quantization emulation ops should be placed after + // the ops in the input graph. This flag should be set to false for + // post-training quantization. + bool disable_infer_tensor_range = false; + + // Whether to use the unfrozen variable quantization in MLIR. Typically, + // variables are frozen for passing passes, but some variables aren't frozen. + // If it is true, QuantizeVariables pass will be added after the + // PrepareQuantizePass. + bool enable_mlir_variable_quantization = false; + + // The node type when the model is exported. Currently this is limited to + // DT_FLOAT, DT_HALF, DT_QINT8, and DT_QUINT8. When DT_HALF is used, the + // `weight_quantization` flag needs to set to true. When DT_QUINT8 is used, + // the `weight_quantization` flag needs to set to false. + tensorflow::DataType inference_type = tensorflow::DT_FLOAT; + + // The input and output data type during inference. This flag is only used + // when `inference_type` is different from DT_FLOAT. This flag can only be set + // to DT_FLOAT or as same as `inference_type`. If this flag is different + // from `inference_type`, adaptor ops are inserted as heading and tailing ops + // in the result model. + tensorflow::DataType inference_input_type = tensorflow::DT_FLOAT; + + // Input node ranges. These ranges are stored as the same order of function + // arguments. 
They are only used when `weight_quantization` is set to false, + // and the model is required to have quantization parameters, either from + // quantization aware training or calibration, for the remaining tensors. + std::vector, std::optional>> + input_ranges; + + // Whether to disable setting the quantization parameters of the input nodes + // using input ranges. + bool disable_set_input_nodes_quantization_params = false; + + // The default ranges can be used when a tensor doesn't have quantization + // parameters and couldn't be quantized. Used only for latency tests. + std::pair, std::optional> default_ranges; + + // A serialized "QuantizationInfo" object to specify value ranges for some of + // the tensors with known names. + std::string serialized_quant_stats = ""; + + // A bitmask to encode support for reduced precision inference in the model. + tflite::optimize::ReducedPrecisionSupport support_mask = + tflite::optimize::ReducedPrecisionSupport::None; + + // Whether to run the passes to propagate the quantization parameters and + // graph rewrites. Returns false if the inference_type is DT_FLOAT or + // `weight_quantization` flag is set. + bool RunPropagationAndRewriteQuantizationPasses() const { + return inference_type != tensorflow::DT_FLOAT && !weight_quantization; + } + + // TODO: b/202075505 - make implicit weight type clearer + // Whether run the passes and graph rewrites for dynamic range quantization. + bool RunAndRewriteDynamicRangeQuantizationPasses() const { + bool dynamic_range_quantize = + (inference_type != tensorflow::DT_FLOAT) && weight_quantization && + !post_training_quantization && !disable_infer_tensor_range && + enable_mlir_dynamic_range_quantizer; + return dynamic_range_quantize; + } + + // Returns whether this inference type represents a signed storage type. + bool IsSignedInferenceType() const { + switch (inference_type) { + case tensorflow::DT_QUINT8: + case tensorflow::DT_QUINT16: + return false; + default: + return true; + } + } + + // Gets the width of this quantization type. Returns 0 if it isn't a + // quantization type. + int64_t GetQuantizationTypeWidth() const { + switch (inference_type) { + case tensorflow::DT_INT8: + case tensorflow::DT_UINT8: + case tensorflow::DT_QINT8: + case tensorflow::DT_QUINT8: + return 8; + case tensorflow::DT_INT16: + case tensorflow::DT_UINT16: + case tensorflow::DT_QINT16: + case tensorflow::DT_QUINT16: + return 16; + case tensorflow::DT_INT32: + case tensorflow::DT_QINT32: + return 32; + default: + return 0; + } + } + + // Whether to add the NumericVerify ops to verify numbers before and after + // quantization. + bool verify_numeric = false; + // Whether to add verification for layer by layer, or on whole model. When + // disabled (per-layer) float and quantized ops will be run from same input + // (output of previous quantized layer). When enabled, float and quantized ops + // will run with respective float and quantized output of previous ops. + bool whole_model_verify = false; + + // Whether to use fake quant attributes to calculate quantization parameters. + bool use_fake_quant_num_bits = false; + + // Names of ops to block from quantization. Used in QuantizePass. + // For dynamic range quantization, ops in blocklist are quantized in weight- + // only manner. + absl::flat_hash_set ops_blocklist; + + // Names of locations to block from quantization. Used in QuantizePass. + absl::flat_hash_set nodes_blocklist; + + // Map from custom op code to custom op quantization information. 
+ // For dynamic range quantization, among the custom ops in the graph those + // specified in this map are subject to quantization. + CustomOpMap custom_map; + + // If other than kQDQNone, the model is a floating point graph with QDQ ops + // to be eliminated and fused into quantized kernels. + QDQConversionMode qdq_conversion_mode = QDQConversionMode::kQDQNone; + + // When set, adheres to the QDQ annotations added by the framework when + // possible rather than quantizing any op that is possible to quantize. + bool strict_qdq_mode = false; +}; + +// Parses the command line flag strings to the CustomOpMap specification. +void ParseCustomOpSpecs(absl::string_view node_names, + const CustomOpUpdateOptions& update_option, + CustomOpMap& custom_op_map); + +// Parses the command line flag strings to the quantization specification for +// input arrays of a graph. The array names are not stored in the spec, and will +// be matched by position. Returns true if failed. +bool ParseInputNodeQuantSpecs(absl::string_view node_names, + absl::string_view min_values, + absl::string_view max_values, + absl::string_view inference_type, + QuantizationSpecs* quant_specs); + +// Gets the quantization specification for input arrays. The array names are not +// stored in the spec, and will be matched by position. The min/max will be +// ignored if the inference_type isn't a quantized type. Returns true if failed. +bool GetInputNodeQuantSpecs(const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, + QuantizationSpecs* quant_specs); + +// Returns a human-readable string of the QDQQuantMode enum class +std::string GetQDQQuantModeString(QDQConversionMode mode); +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h new file mode 100644 index 00000000..43edaab9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h @@ -0,0 +1,387 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" + +namespace mlir { +namespace quant { + +// The state for each op result during the quantization parameters propagation. +struct QuantState { + // Quantization parameters propagated to an op result. + QuantizedType params; + // A flag indicates this state (the params) shouldn't be changed after it is + // initialized. This flag will be set to true if the quantization parameters + // are from the quantization-aware training. + const bool immutable; + + bool IsEmpty() const { return params == nullptr; } +}; + +// The state for rescaling the propagated quantization parameters. This can be +// on the input side to satisfy the constraint of previous operation, or on the +// output side to satisfy the constraint of the next operation. +struct RequantizeState { + // Sometimes, we have to "requantize" the quantization result to satisfy all + // the constraints. The "requantize" can happen either on the input or output + // of the quantization result. + enum RequantizePosition { + NO_REQUANTIZE, + ON_INPUT, + ON_OUTPUT + } pos = NO_REQUANTIZE; + + // Quantization parameters will be used to add the requantize ops. + QuantizedType params; + + // Avoid clobbering all uses of the value, limit to just these ops. + SmallVector> users; +}; + +using RequantizeStates = SmallVector; + +// This is a worklist-driven driver for propagating quantization parameters +// across operations. +// +// The initial quantization parameters are extracted from the quantized type +// between adjacent `quantfork::QuantizeCastOp` and +// `quantfork::DequantizeCastOp`s. All these initial parameters are marked as +// immutable because they are from quantization-aware training. +// +// The algorithm traverses each op and sets the quantization parameters of its +// operands and results, according to its quantization specification, and then +// adds the operands and results to the worklist. If there are any conflicts +// (for example, there are quantization parameters propagated from the previous +// iteration), this process stops if the existing parameters are the immutable, +// or adding `requantize` op to resolve the conflicts. +// +// After the algorithm is converged, pairs of `quantfork::QuantizeCastOp` and +// `quantfork::DequantizeCastOp` are inserted to the right position to +// materialize the propagation and requantize results. 
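Editorial note: the comment above describes a worklist-driven, fixed-point propagation. A minimal sketch of how the three phases declared in the class below compose is given here; the actual `QuantizationDriver::Run` may differ in detail, and the helper name is made up.

```cpp
#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h"

namespace mlir::quant {

// Illustrative composition of the driver's phases, not the real entry point
// (see ApplyQuantizationParamsPropagation further down for that).
inline void RunPropagationSketch(QuantizationDriver& driver) {
  // Seed per-value states from existing quantize/dequantize pairs (QAT) and
  // constants, and fill the initial worklist.
  driver.Initialize();
  // Propagate constraints until no state changes; conflicts with mutable
  // states are resolved by recording pending requantize steps.
  if (driver.PropagateParamsAndReturnIfChanged()) {
    // Materialize the final states as quantize/dequantize pairs in the IR.
    driver.Finalize();
  }
}

}  // namespace mlir::quant
```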
+// +class QuantizationDriver { + public: + // Type alias of int used to access `states_`. + using QuantStateIndex = int; + + // (op, operand index) pair. + using OpWithOperandIndex = std::pair; + + // (op, result index) pair. + using OpWithResultIndex = std::pair; + + explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed, + const int bit_width, + const bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const bool infer_tensor_range, + const bool legacy_float_scale = false, + const bool is_qdq_conversion = false) + : fn_(func_op), + builder_(func_op.getBody()), + is_signed_(is_signed), + bit_width_(bit_width), + disable_per_channel_(disable_per_channel), + op_quant_spec_getter_(op_quant_spec_getter), + op_quant_scale_spec_getter_(op_quant_scale_spec_getter), + infer_tensor_range_(infer_tensor_range), + legacy_float_scale_(legacy_float_scale), + is_qdq_conversion_(is_qdq_conversion) {} + + // The entry point of the quantization parameters propagation. + void Run(); + + // Sets up the states for all the op results in the function. + void Initialize(); + + // Propagates the quantization parameters across all the ops. + bool PropagateParamsAndReturnIfChanged(); + + // Inserts the Quantize and Dequantize ops according to the propagation + // result. + void Finalize(); + + SmallVector GetArgs() { return args_; } + + llvm::DenseMap, int> GetResultStates() { + return result_states_; + } + + DenseMap result_states_; + + // Returns the state of the block argument. + QuantState& GetArgQuantState(BlockArgument arg) { + return states_[arg_states_[arg]]; + } + + // Returns the state of the index-th result of the op. + QuantState& GetResultQuantState(Operation* op, const int index) { + return states_[result_states_[{op, index}]]; + } + + private: + // Duplicates the constant op if it has multiple uses, and replaces + // target_op->operand[operand_index] with the newly created op. This also + // replaces corresponsing quantization states. + arith::ConstantOp DuplicateConstantOpIfNeeded(arith::ConstantOp op, + Operation* target_op, + int operand_index); + + // Adjusts bias scale that is derived from other scales (fc, conv ops) to + // prevent overflow of quantized bias values. This also changes quantization + // state of other inputs when needed. + bool SetBiasParamsWithAdjustments(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType params); + + // Checks preconditions to adjust bias scale. + bool ShouldCheckBiasScale(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType quantized_type, int& input_index, + int& filter_index); + + // Preprocesses the constants by doing the following: + // - Duplicates constants if it is used by multiple ops. For example, if a + // constant is used by multiple ops as a bias, duplicate constants and + // let each op assign its own quantization parameter for bias. + // - Adds all the non-bias constants (weights) to a set for looking up + // later. + // - Adds all per-channel weights to a set for looking up later. + void PreprocessConstantOps(); + + // Sets up all the data structures for quantization propagation. + void SetupAllStates(); + + // Returns Whether the constant is a weight, which shouldn't be shared by + // different ops. + bool IsWeight(Operation* cst) { return llvm::is_contained(weights_, cst); } + + // Returns all the related quantization constraints of the op. 
+ std::unique_ptr GetQuantSpec(Operation* op); + std::unique_ptr GetQuantScaleSpec(Operation* op); + + // Returns whether quantization parameters have been propagated to the results + // of this op. + bool IsQuantized(Operation* op); + + // Adds all the users of index-th result of op to the work list. + void AddUserToList(Operation* op, const int index) { + for (Operation* user : op->getResult(index).getUsers()) { + work_list_.push_back(user); + } + } + + // Adds the defining op of index-th operand of op to the work list. + void AddOperandToList(Operation* op, const int index) { + if (Operation* operand_op = op->getOperand(index).getDefiningOp(); + operand_op != nullptr) { + work_list_.push_back(operand_op); + } + } + + // Returns the quantization params for the bias input from the non-bias + // operands which have their indexes in the `non_biases` vector. The returned + // parameters are calculated by `func`. + QuantizedType GetBiasParams(Operation* op, int bias_index, + ArrayRef non_bias_operand_indices, + AccumulatorScaleFunc func); + + // Sets the quantization parameters of the result to `quantized_type`. If + // any quantization parameters have been propagated, a requantize will + // happen on the input of propagated quantization. Returns `true` if internal + // state has been modified. + bool SetResultParams(Operation* op, int result_index, + QuantizedType quantized_type); + + // Sets the quantization parameters of the operand to `quantized_type`. If any + // quantization parameters have been propagated, a `requantize` will happen on + // the output of propagated quantization. When `override` is set, quantization + // state of the value is replaced instead of adding requantization. Returns + // `true` if internal state has been modified. + bool SetOperandParams(Operation* op, int operand_index, + QuantizedType quantized_type, bool override = false); + + // Sets the quantization parameters of the constant result according to its + // content. + bool SetConstantResultParams(Operation* op); + + // Inserts the Quantize and Dequantize ops after `op`'s `index`-th result. The + // quantized element type for the result is `quantized_type`. + void QuantizeOpResult(Operation* op, int result_index, + QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops after `arg`. The quantized element + // type for `arg` is `quantized_type`. + void QuantizeArg(BlockArgument arg, QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops (i.e. QDQ) after `value`. The + // quantized element type for `value` is `quantized_type`. + void QuantizeValue(Value value, QuantizedType quantized_type, Location loc); + + // Inserts the Quantize ops for requantizing the index-th result of the op. + void RequantizeOpResult(Operation* op, int result_index, + RequantizeStates& states); + + // Inserts the Quantize ops for requantizing a block argument. + void RequantizeArg(BlockArgument arg, RequantizeStates& states); + + // Inserts the Quantize and Dequantize ops to quantize the value and returns + // the Quantize op. + void RequantizeValue(Value value, RequantizeStates& states, Location loc); + + // Returns the quantization parameter satisfies the same scale + // constraints for the op. Returns an empty option if this quantization + // parameter doesn't exist. + QuantizedType GetQuantParamsForSameScaleConstraint(Operation* op); + + // Returns the state of the index-th operand of the op. 
+ QuantState& GetOperandQuantState(Operation* op, const int index) { + return states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th operand of the op. + RequantizeStates& GetOperandRequantizeStates(Operation* op, const int index) { + return rescale_states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th result of the op. + RequantizeStates& GetResultRequantizeStates(Operation* op, const int index) { + return rescale_states_[result_states_[{op, index}]]; + } + + // Returns the states of the arg. + RequantizeStates& GetArgRequantizeStates(BlockArgument arg) { + return rescale_states_[arg_states_[arg]]; + } + + // Sets the state of an argument. If this value is cached, uses the cached + // result without creating new entry in the state vector. Otherwise, allocate + // a new entry in the state vector. + void InitializeArgState(BlockArgument arg, Value arg_value); + + // Sets the state of the index-th operand of the op. If this operand is + // cached, uses the cached result without creating new entry in the state + // vector. Otherwise, allocate a new entry in the state vector. + void InitializeOperandState(Operation* op, int index, Value value); + + // Sets the state of the index-th result of the op. If this result is cached, + // uses the cached result without creating new entry in the state vector. + // Otherwise, allocate a new entry in the state vector. + void InitializeResultState(Operation* op, int index, Value value); + + func::FuncOp fn_; + OpBuilder builder_; + const bool is_signed_; + const int bit_width_; + const bool disable_per_channel_; + + // We should distinguish weights and bias constants. Biases are specified by + // the quantization spec or are the operands of ops with same scale spec. The + // rest are weights. + DenseSet weights_; + + // The weights require narrow_range quantization. This map collects all the + // weight operands defined by the op quant spec. The value of each entry is + // the quantization dimension. If it is positive, per-channel quantization is + // required. + DenseMap optimized_weights_; + + // All the ops needs to propagate the quantization parameters to. + std::vector work_list_; + absl::flat_hash_set quantized_; + + // The vector contains all the quantization parameters propagated from the + // defining operations of the value, or from the quantization aware training. + std::vector states_; + + // The map contains all the quantization parameters which are required to + // satisfy the same operands and results constraint. The keys of this map are + // the values from `operand_states_` and `result_state_`. + absl::flat_hash_map rescale_states_; + + // Maps of indexes to the propagation state vector from the ops operands, + // results and arguments. + DenseMap operand_states_; + DenseMap arg_states_; + DenseMap value_to_state_; + + // This vector is to preserve the arguments order, so the newly inserted + // quantized ops for the arguments are deterministically ordered. + SmallVector args_; + + OpQuantSpecGetter op_quant_spec_getter_; + OpQuantScaleSpecGetter op_quant_scale_spec_getter_; + + // Infer output ranges for activation ops and constants. This is usually + // required for post-training quantization. + const bool infer_tensor_range_; + + // Calculate scales in float instead of double, so that the scales and + // quantized values are exactly the same with the TOCO quantizer. 
+ const bool legacy_float_scale_; + + // If true, the model is a floating point graph with QDQ ops to be eliminated + // and fused into quantized kernels. + const bool is_qdq_conversion_; +}; + +// Propagates quantization parameters across ops in this function and satisfies +// the quantization specification of the ops. This methods assumes the initial +// quantization parameters are stored as adjacent quantize and dequantize ops +// and the propagation results are materialized by inserting pairs of quantize +// and dequantize ops to this function. Set `disable_per_channel` to true to not +// use per channel quantization even the op supports it. +// Setting `infer_tensor_range` to true, to infer quantization parameters from +// the activation ops and weight constants. This is only used for post-training +// quantization. +void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed, + int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool infer_tensor_ranges, + bool legacy_float_scale, + bool is_qdq_conversion); + +void ApplyQuantizationParamsPropagation( + func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, + bool legacy_float_scale, bool is_qdq_conversion); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h new file mode 100644 index 00000000..e93cc4cf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h @@ -0,0 +1,152 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow Lite dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ + +#include +#include +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +using QuantizedType = mlir::quant::QuantizedType; +using UniformQuantizedType = mlir::quant::UniformQuantizedType; + +namespace mlir { +namespace quant { +// Verifies that the op satisfies the same operands and results scales +// constraints. 
Note that this constraint can only be applied on some +// storage types of the op. +LogicalResult VerifySameScales(Operation* op); +} // namespace quant + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_interface.h.inc" + +namespace OpTrait { +namespace quant { + +// The base class that all the quantization related OpTrait implements. +template class TraitType> +struct QuantizationSpecTraitBase : public TraitBase { + static bool IsBias(int index) { return false; } + static bool IsQuantizable() { return true; } +}; + +// This class provides the API for ops that has a fixed output value range. +// This is used as a trait like this: +// +// class SoftmaxOp +// : public Op::Impl> { +// +// TODO(fengliuai): create a better way to express floating point scale in the +// template argument list. +template +class FixedResultUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, FixedResultUniformScale< + BitWidth, ZeroPoint, ScaleMantissa, ScaleExp, + StorageTypeMin, StorageTypeMax, Sign>::Impl> { + public: + QuantizedType GetResultQuantizedType(int index) { + auto op = this->getOperation(); + const auto result_type = + op->getResult(index).getType().template cast(); + if (!result_type.getElementType().template isa()) return {}; + Builder builder(op->getContext()); + const IntegerType storage_type = builder.getIntegerType(BitWidth); + const double scale = static_cast(ScaleMantissa) * + std::pow(10.0, static_cast(ScaleExp)); + return UniformQuantizedType::getChecked( + Sign, storage_type, result_type.getElementType(), scale, ZeroPoint, + StorageTypeMin, StorageTypeMax, builder.getUnknownLoc()); + } + }; +}; + +// This class provides the API for ops that has input as bias. This is used +// as a trait like this: +// +// class Conv2DOp +// : public Op::Impl> +// +// TODO(fengliuai): supports a configurable accumulator bit width. +template +class AccumulatorUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, AccumulatorUniformScale::Impl> { + public: + // Whether the index-th operand is a bias. + static bool IsBias(int index) { return index == Bias; } + + // Returns the indexes of all the non-bias operands. + static std::vector GetAllNonBiasOperands() { + return std::vector({Operands...}); + } + }; +}; + +// The trait to specify the operand index of the coefficient for an affine op +// and also the quantization dimension if per-axis quantization is support. +// If the quantization dimension is -1, per-axis quantization isn't supported. +// +// class Conv2DOp +// : public Op::Impl> +// +template +class AffineOpCoefficient { + public: + template + class Impl + : public TraitBase::Impl> { + public: + static int GetCoefficientOperandIndex() { return OperandIndex; } + static int GetQuantizationDim() { return QuantDim; } + }; +}; + +// This class provides the API for ops that can be quantized. 
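Editorial note: the `FixedResultUniformScale` trait above encodes its fixed scale as `ScaleMantissa * 10^ScaleExp`. A worked example with an assumed instantiation (the angle-bracket contents are stripped in the rendered diff, and the op name is made up): an 8-bit signed op with a fixed [0, 1) output range, such as a softmax-like activation, would use `BitWidth=8, ZeroPoint=-128, ScaleMantissa=390625, ScaleExp=-8, StorageTypeMin=-128, StorageTypeMax=127`, giving a scale of 1/256.

```cpp
// Hypothetical instantiation, for illustration only:
//
//   class MyFixedRangeOp
//       : public Op<MyFixedRangeOp,
//                   OpTrait::quant::FixedResultUniformScale<
//                       /*BitWidth=*/8, /*ZeroPoint=*/-128,
//                       /*ScaleMantissa=*/390625, /*ScaleExp=*/-8,
//                       /*StorageTypeMin=*/-128, /*StorageTypeMax=*/127,
//                       /*Sign=*/true>::Impl> {};
//
// The encoded scale is ScaleMantissa * 10^ScaleExp:
#include <cmath>
#include <cstdio>

int main() {
  const double scale = 390625 * std::pow(10.0, -8);
  std::printf("scale = %.8f\n", scale);  // prints 0.00390625, i.e. 1/256
  return 0;
}
```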
+// This is as a trait like this: +// +// class LessOp : public Op { +// +template +class QuantizableResult + : public QuantizationSpecTraitBase {}; + +} // namespace quant +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h new file mode 100644 index 00000000..94169e3e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -0,0 +1,973 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" +#include 
"tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace quant { + +// A unit attribute can be attached to the quantize/dequantize ops which are +// added by the quantization passes. These ops can be removed erased without +// losing accuracy. +inline constexpr char kVolatileOpAttrName[] = "volatile"; + +// Following attributes are used to mark ops that are not quantizable during +// debug model generation process for whole-model verify mode. If these +// attributes are attached, the upstream float/quantized ops know which ops to +// connect to, and it also prevents these ops from being copied again. +inline constexpr char kDebugModeOpFloatAttrName[] = "debug_float"; +inline constexpr char kDebugModeOpQuantAttrName[] = "debug_quant"; + +// Used to annotate custom ops if they are quantizable. +inline constexpr char kQuantTraitAttrName[] = "_tfl_quant_trait"; +enum QuantizationTrait { FullyQuantizable = 0, NotQuantizable = 1 }; +inline constexpr absl::string_view QuantTraitValues[] = {"fully_quantizable", + "not_quantizable"}; +inline constexpr char kOutputQuantized[] = "_output_quantized"; + +inline constexpr double kNearZeroTolerance = 1.0e-6; + +using QuantParams = QuantizedType; +using QuantSpec = QuantizationSpecs; +using SignedInteger = std::pair; // bitwidth and sign +using QuantParamsForResults = llvm::SmallVector; +using AccumulatorScaleFunc = + std::function&, int, bool)>; +using BiasParamsMap = + absl::flat_hash_map, AccumulatorScaleFunc>>; +// UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width) +using GetFixedOutputRangeFunc = std::function; +// bool RequiredSameOperandsAndResultsScale(bool sign, int $bit_width) +using RequiredSameOperandsAndResultsScaleFunc = std::function; +// bool RequiredSameQuantizedAxes() +using RequiredSameQuantizedAxesFunc = std::function; + +using CustomMap = quant::CustomOpMap; + +// Quantization spec of an op, driving the quantization algorithm. +struct OpQuantSpec { + // Maps the operand index of a bias input to its quantization specifications, + // including the non-bias operand indexes and the method retrieving + // quantization parameters from list of parameters of the non-bias operands. + // This map is empty if the op doesn't have a bias operand. + BiasParamsMap biases_params; + + // Quantization parameters for value restricted outputs. This is the + // "hard-coded" parameters and should be used unconditionally for the + // quantized op. This vector is empty if the op doesn't have value restricted + // outputs. + llvm::DenseMap restricted_output_params; + + // Coefficient operand index and whether supporting per-channel quantization. + // For QAT, this information is carried by the FakeQuant*/Quantize/Dequantize + // ops, but post-training quantization, the quantization parameters need to be + // inferred from the tensor content and op property. A "-1" value indicates + // the operand doesn't support per-channel quantization. + llvm::DenseMap coeff_op_quant_dim; + + // Indices of quantizable operands. Biases are not included in this field, + // the indices of biases can be found in the `biases_params`. + absl::flat_hash_set quantizable_operands; +}; + +// A function signature for getting the particular OpQuantSpec for the provided +// op. +using OpQuantSpecGetter = + std::function(Operation*)>; + +// Quantization scale spec of an op. The information defined in the MLIR +// interfaces FixedOutputRangeInterface and SameOperandsAndResultsScale should +// be checked first if present. 
+// TODO: b/323478683: Consider deprecating this. +struct OpQuantScaleSpec { + // Whether this op has a fixed range requirement (e.g. sigmoid) + bool has_fixed_output_range = false; + // Whether this op should have same operand and result scales (e.g. concat) + bool has_same_scale_requirement = false; + // Whether this op should have same operand and result type (e.g. gather) + bool has_same_operand_and_result_type_requirement = false; + // Returns the fixed output range, when has_fixed_output_range is set. + GetFixedOutputRangeFunc fixed_output_range_func; + // Returns whether same operands and results scales are required. + RequiredSameOperandsAndResultsScaleFunc required_same_scale_func = + [](bool sign, int bit_width) { return true; }; + // Returns whether operands and results must have the same quantized axis. + RequiredSameQuantizedAxesFunc required_same_quantized_axes_func = []() { + return true; + }; +}; + +// A function signature for getting the particular OpQuantScaleSpec for the +// provided op. +using OpQuantScaleSpecGetter = + std::function(Operation*)>; + +// Used in TFL Numeric Verify +struct NumericVerifySpec { + // Whether to enable numeric verification + bool verify_numeric = false; + + // Tolerance level from the quantized value for verification. If the tolerance + // is very small(<0.1), only the stats of the diff is displayed. + float error_tolerance = 5.0f; + + // Whether to verify numerical correctness layer by layer or by whole model + bool whole_model_verify = false; + + // Whether to enable log for failures + bool log_if_failed_flag = false; +}; + +// Used in TFL Quantize Pass +struct QuantPassSpec { + // Variables to control TFL Numeric Verify + NumericVerifySpec numeric_verify_spec; + + // Variables related to quantization + QuantSpec quant_spec; +}; + +// Re-calculates scales again in float instead of simply downcasting existing +// scales. +quant::QuantizedType DownCastScale(quant::QuantizedType type, + const SmallVectorImpl& mins, + const SmallVectorImpl& maxs, + Location loc); + +quant::QuantizedType DownCastScale(quant::QuantizedType type, double min, + double max, Location loc); + +bool IsOpQuantizable(Operation* op); +bool QuantizableOpSupportsFloatOutputType(Operation* op); + +// Specialized version of location to string for flatbuffer exported locations. 
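Editorial note: a minimal `OpQuantScaleSpecGetter` sketch, marking a concat-like op as "same scale" so the propagation driver forces a single set of quantization parameters across its operands and result. The getter's exact `std::function` signature and the op name `"my_dialect.concat"` are assumptions (the template arguments in this header appear stripped in the rendered diff); the field names come from the `OpQuantScaleSpec` struct above.

```cpp
#include <memory>

#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"

namespace mlir::quant {

// Illustrative getter; real passes register something similar and hand it to
// the propagation driver.
std::unique_ptr<OpQuantScaleSpec> GetExampleOpQuantScaleSpec(Operation* op) {
  auto spec = std::make_unique<OpQuantScaleSpec>();
  if (op->getName().getStringRef() == "my_dialect.concat") {
    // Operands and result must share scale/zero point, for any sign/bit width.
    spec->has_same_scale_requirement = true;
    spec->required_same_scale_func = [](bool sign, int bit_width) {
      return true;
    };
  }
  return spec;
}

}  // namespace mlir::quant
```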
+inline std::string GetTensorNameFromLoc(Location loc) { + if (auto name_loc = loc.dyn_cast()) { + return name_loc.getName().str(); + } + return ""; +} + +template +struct ConvertStatsToQDQs : public OpRewritePattern { + ConvertStatsToQDQs(int num_bits, bool narrow_range, bool is_signed, + bool legacy_float_scale, MLIRContext* context) + : OpRewritePattern(context), + num_bits(num_bits), + narrow_range(narrow_range), + is_signed(is_signed), + legacy_float_scale(legacy_float_scale) {} + + LogicalResult matchAndRewrite(quantfork::StatisticsOp op, + PatternRewriter& rewriter) const override { + Type expressed = op.getType().cast().getElementType(); + quant::QuantizedType quant_type; + SmallVector mins, maxs; + + if (op.getAxisStats().has_value()) { + // Per axis quantization (or per channel quantization) + int stats_num = op.getAxisStats()->getNumElements(); + if (stats_num == 0 || stats_num % 2 != 0) return failure(); + auto stats = op.getAxisStats()->dyn_cast(); + if (!stats) return failure(); + + for (auto it = stats.begin(), e = stats.end(); it != e; ++it) { + double rmin = FloatAttr::getValueAsDouble(*it++); + double rmax = FloatAttr::getValueAsDouble(*it); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. + // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer + // supports only symmetric quantization. + rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + mins.push_back(rmin); + maxs.push_back(rmax); + } + quant_type = quantfork::fakeQuantAttrsToType( + op.getLoc(), num_bits, *op.getAxis(), mins, maxs, narrow_range, + expressed, is_signed); + if (legacy_float_scale) { + quant_type = DownCastScale(quant_type, mins, maxs, op->getLoc()); + } + } else if (auto stats = + op.getLayerStats().dyn_cast()) { + // Per tensor quantization + auto statValues = stats.getValues(); + double rmin = FloatAttr::getValueAsDouble(statValues[0]); + double rmax = FloatAttr::getValueAsDouble(statValues[1]); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. + // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer supports + // only symmetric quantization. 
+ rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + quant_type = + quantfork::fakeQuantAttrsToType(op.getLoc(), num_bits, rmin, rmax, + narrow_range, expressed, is_signed); + if (legacy_float_scale) { + quant_type = DownCastScale(quant_type, rmin, rmax, op->getLoc()); + } + } else { + return failure(); + } + + rewriter.setInsertionPointAfter(op.getOperation()); + Type result_type = quant_type.castFromExpressedType(op.getType()); + auto q = + rewriter.create(op.getLoc(), result_type, op.getArg()); + q->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); + + auto dq = rewriter.create(op.getLoc(), op.getType(), q); + op.getResult().replaceAllUsesWith(dq); + q.getOperation()->replaceUsesOfWith(dq, op.getArg()); + op.erase(); + + return success(); + } + + private: + int num_bits; + bool narrow_range; + bool is_signed; + bool legacy_float_scale; + + // Emits an op warning message if the calibrated range is larger than 10.0 and + // the storage type is less than or equal to 8 bits. + void TensorRangeSanityCheck(quantfork::StatisticsOp op, double& min, + double& max) const { + double range = std::fabs(max - min); + if (num_bits <= 8 && range >= 10.0) { + op.emitWarning() + << "Tensor range is too wide to be quantized. Use tf.clip_by_value " + "or tf.relu6 to narrow the tensor range. Range: " + << range << ", bit width: " << num_bits; + } + if (std::abs(max - min) < kNearZeroTolerance) { + op.emitWarning() << "Tensor range (" << min << ", " << max + << ") is too narrow and it might cause overflow. " + "Expanding range symmetrically by " + << kNearZeroTolerance; + min -= kNearZeroTolerance; + max += kNearZeroTolerance; + } + } +}; + +template +bool UsedBy(Operation* op) { + for (Operation* user : op->getUsers()) { + if (llvm::isa_and_nonnull(user)) return true; + } + return false; +} + +template +void CreateVerifier(Operation* quantizing_op, Operation* quantized_op, + PatternRewriter& rewriter, int result_idx, + const QuantPassSpec& quant_params) { + rewriter.setInsertionPointAfter(quantized_op); + FloatAttr tolerance = rewriter.getF32FloatAttr( + quant_params.numeric_verify_spec.error_tolerance); + BoolAttr log = + rewriter.getBoolAttr(quant_params.numeric_verify_spec.log_if_failed_flag); + // Verify the quantized value by sending the result to the verifier. + rewriter.create( + quantizing_op->getLoc(), quantized_op->getResult(result_idx).getType(), + quantized_op->getResult(result_idx), quantizing_op->getResult(result_idx), + tolerance, log); +} + +template <> +inline bool UsedBy(Operation* op) { + return false; +} + +// This specialization is not going to be called, but needed for compilation. +template <> +inline void CreateVerifier(Operation* quantizing_op, + Operation* quantized_op, + PatternRewriter& rewriter, int result_idx, + const QuantPassSpec& quant_params) {} + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. +// +// The concrete pattern, extends from this base pattern, can specify whether it +// allows dynamic range quantized operands and results for the operations in the +// current context. These "DynamicRangeQuantized" operands and results don't +// have quantization parameters propagated to, so will be in float in the +// quantized results. 
The concrete pattern should define the following two +// functions: +// +// bool AllowDynamicRangeQuantizedOperand(Operation *) const +// bool AllowDynamicRangeQuantizedResult(Operation *) const +// +// Full integer quantization disallows "DynamicRangeQuantized" operands or +// results. Dynamic range quantization allows "DynamicRangeQuantized" operands +// and results. +template +class QuantizationPattern : public RewritePattern { + public: + using BaseType = QuantizationPattern; + + explicit QuantizationPattern(MLIRContext* context, + const QuantPassSpec& quant_params) + // Set the score to a large number so it is always preferred. + : RewritePattern(RootOpT::getOperationName(), 300, context), + quant_params_(quant_params) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + llvm::SmallVector quantizing_ops; + + // Collect all the ops to quantize, as the user / producer of the root op. + if constexpr (std::is_same_v) { + if (op->getNumResults() != 1) { + return failure(); + } + auto users = op->getResult(0).getUsers(); + quantizing_ops.append(users.begin(), users.end()); + } else if constexpr (std::is_same_v) { + if (op->getNumOperands() != 1) { + return failure(); + } + Value quantize_operand = op->getOperand(0); + if (QuantizedType::getQuantizedElementType(quantize_operand.getType())) { + // The input of this QuantizeOp has already been quantized, i.e. + // rescale. + return failure(); + } + DenseFPElementsAttr attr; + if (matchPattern(quantize_operand, m_Constant(&attr))) { + // Const-> QuantizeOp pattern will be handled separately. + return failure(); + } + if (Operation* quantizing_op = quantize_operand.getDefiningOp()) { + quantizing_ops.push_back(quantizing_op); + } + } + + tensorflow::DataType inference_type = + quant_params_.quant_spec.inference_type; + bool weight_only_quantization = + quant_params_.quant_spec.weight_only_quantization; + bool enable_verify = quant_params_.numeric_verify_spec.verify_numeric; + bool enable_whole_model_verify = + quant_params_.numeric_verify_spec.whole_model_verify; + absl::flat_hash_set ops_blocklist = + quant_params_.quant_spec.ops_blocklist; + absl::flat_hash_set nodes_blocklist = + quant_params_.quant_spec.nodes_blocklist; + CustomMap custom_map = quant_params_.quant_spec.custom_map; + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* quantizing_op : quantizing_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (llvm::isa(quantizing_op)) { + return failure(); + } + + // If the op is terminator, not quantizable or any ops from the mlir quant + // ops dialect, we shouldn't rewrite. In case of whole-model verify debug + // mode, not-quantizable ops should be duplicated to keep parallel + // float/quant model execution. 
+ if (quantizing_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizable(quantizing_op) && + !static_cast(this)->IsQuantizableCustomOp( + quantizing_op, custom_map)) { + if (!(enable_verify && enable_whole_model_verify)) { + return failure(); + } + if (quantizing_op->hasAttr(kDebugModeOpQuantAttrName) || + quantizing_op->hasAttr(kDebugModeOpFloatAttrName)) { + return failure(); + } + + rewriter.setInsertionPoint(quantizing_op); + Operation* float_op = rewriter.clone(*quantizing_op); + quantizing_op->setAttr(kDebugModeOpQuantAttrName, + rewriter.getUnitAttr()); + float_op->setAttr(kDebugModeOpFloatAttrName, rewriter.getUnitAttr()); + RewireFloatModelBackbone(quantizing_op, float_op); + return success(); + } + + // Blocklist op is checked in advance for non-dynamic range quantization + // case. + if (!quant_params_.quant_spec.weight_quantization && + (ops_blocklist.find(quantizing_op->getName().getStringRef().str()) != + ops_blocklist.end())) { + return failure(); + } + + if (!nodes_blocklist.empty()) { + if (auto name_loc = quantizing_op->getLoc().dyn_cast()) { + std::string sloc = name_loc.getName().str(); + if (!sloc.empty() && + (nodes_blocklist.find(sloc) != nodes_blocklist.end())) { + return failure(); + } + } + } + + // An op with float inputs and outputs are expected when it's used by a + // NumericVerify op. Skip this op. + if (enable_verify && UsedBy(quantizing_op)) { + continue; + } + + bool is_operand_or_result_modified = false; + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(quantizing_op->getNumOperands()); + for (auto operand : quantizing_op->getOperands()) { + Type operand_type = operand.getType(); + if (operand_type.isa()) { + inputs.push_back(operand); + continue; + } + + auto ele_type = operand.getType().cast().getElementType(); + if (static_cast(this) + ->AllowDynamicRangeQuantizedOperand(quantizing_op, + custom_map)) { + auto dq_op = dyn_cast_or_null(operand.getDefiningOp()); + + if (dq_op && inference_type == tensorflow::DT_QINT8 && + !static_cast(this)->IsWeightOnlyOp( + quantizing_op, ops_blocklist, weight_only_quantization, + custom_map)) { + // Dynamic range quantization is applied by having QuantizeOp as an + // input. Only int8 weight is supported for now. + inputs.push_back(dq_op.getOperand()); + is_operand_or_result_modified = true; + } else { + // Otherwise, it's the case where the operand is activations or the + // quantizing_op is non-supported/weight-only. + inputs.push_back(operand); + } + } else { + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + is_operand_or_result_modified = true; + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. 
+ inputs.push_back(operand); + } else { + return failure(); + } + } + } + + Operation* quantized_op; + if (QuantizableOpSupportsFloatOutputType(quantizing_op)) { + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, quantizing_op->getResultTypes(), quantizing_op->getAttrs()); + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region* target_region = new_state.addRegion(); + IRMapping mapping; + indexed_regions.value().cloneInto(target_region, mapping); + } + quantized_op = rewriter.create(new_state); + rewriter.replaceOp(quantizing_op, quantized_op); + } else { + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantizing_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(quantizing_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none + // type results. + if (result_type.isa()) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + result.getType().cast().getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && + llvm::isa(*result.user_begin())) { + auto user = llvm::cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + is_operand_or_result_modified = true; + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (static_cast(this) + ->AllowDynamicRangeQuantizedResult(quantizing_op, + custom_map)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + // For float16 quantization if none of the operand or result is + // modified, replacing the op. See b/335025403. + if (inference_type == tensorflow::DT_HALF && + !is_operand_or_result_modified) { + return failure(); + } + + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, output_types, quantizing_op->getAttrs()); + for (int i = 0; i < quantizing_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + quantized_op = rewriter.create(new_state); + if (quantizing_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + + // To verify the numericals, the original floating-point ops are + // preserved in the graph. The result of these floating-point ops are sent + // to a numeric verifier op as the reference. 
+ if (enable_verify && !std::is_same_v) { + // For constant operands, the floating-point constant is duplicated in + // case it is quantized. + for (int i = 0, e = quantized_op->getNumOperands(); i < e; ++i) { + auto def = quantized_op->getOperand(i).getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(def)) { + DenseFPElementsAttr attr; + if (!matchPattern(q.getOperand(), m_Constant(&attr))) { + continue; + } + auto cst = rewriter.create( + quantized_op->getLoc(), attr); + quantizing_op->setOperand(i, cst.getResult()); + } + } + + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!quantizing_op->getResult(i) + .getType() + .cast() + .getElementType() + .isa()) { + continue; + } + CreateVerifier(quantizing_op, quantized_op, rewriter, i, + quant_params_); + + if (enable_whole_model_verify) { + RewireFloatModelBackbone(quantized_op, quantizing_op); + } + } + } + } + return success(); + } + + private: + // Reconnects float ops in the whole-model verify mode. Works for both + // Quantizable ops and Unquantizable ops + void RewireFloatModelBackbone(Operation* quantized_op, + Operation* float_op) const { + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!float_op->getResult(i) + .getType() + .cast() + .getElementType() + .isF32()) { + continue; + } + // Find the Quantize/Dequantize users of the new op results, and replace + // the usage. Then all the floating-point ops are connected, forming a + // separate float "backbone" model that the quantized model can be + // compared against in parallel. + // N.B. the return op will use this floating-point result. + Value result; + if (!IsOpQuantizable(float_op)) { + // For not quantizable ops, search for dequantize attached to the + // quantized op of the output. + if (Operation* quantize_op = dyn_cast_or_null( + *quantized_op->getResult(i).getUsers().begin())) { + result = quantize_op->getResult(0); + } else { + quantized_op->emitError() + << "Output[" << i + << "] is expected to have only one user [QUANTIZE]"; + return; + } + } else { + result = quantized_op->getResult(i); + } + for (auto user : result.getUsers()) { + // Skip the Requantize op and set the user to the following dequantize + // op. This happens when the quantizer tries to match the scale conflict + // with QuantizeOp - QuantizeOp(requant) - DequantizeOp triples. The + // correct float op should be the user of the last DequantizeOp. + if (llvm::isa(user)) { + user = *user->getResult(0).getUsers().begin(); + } + if (auto dequantize = llvm::dyn_cast(user)) { + // Replace all uses, except not quantizable ops that are being used in + // the float backbone. + dequantize.getResult().replaceUsesWithIf( + float_op->getResult(i), [&](OpOperand& use) { + return !use.getOwner()->hasAttr(kDebugModeOpQuantAttrName); + }); + } + } + } + } + + QuantPassSpec quant_params_; +}; + +// A pattern that removes debug attributes that are annotated to ops during +// the debug model creation. +class RemoveDebugAttrPattern : public RewritePattern { + public: + explicit RemoveDebugAttrPattern(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +// Converts quantized tensor type with signed integer type to quantized tensor +// type with unsigned integer type. 
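Editorial sketch: the signed/unsigned conversions declared and implemented below all reduce to shifting zero points and storage bounds by a constant offset. A minimal, self-contained illustration of that arithmetic (the helper name is hypothetical and not part of the vendored header):

#include <cstdint>

// Offset between the default unsigned and signed storage minimums for a given
// bit width: 0 - (-2^(num_bits - 1)).
constexpr int64_t SignedUnsignedOffset(int num_bits) {
  return int64_t{1} << (num_bits - 1);
}

static_assert(SignedUnsignedOffset(8) == 128);
// Example: a u8 quantized type {scale = 0.5, zero_point = 200,
// storage range [0, 255]} becomes an i8 type {scale = 0.5, zero_point = 72,
// storage range [-128, 127]}; the represented real values are unchanged.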
+Type ConvertSignedQuantizedToUnsigned(Type signed_tensor_type, Location loc); + +// Converts quantize ops with unsigned quantized types to these with signed +// quantized types and preserves the scales. +template +struct ConvertUnsignedToSigned : public OpRewritePattern { + using BaseType = ConvertUnsignedToSigned; + using QType = quant::QuantizedType; + + explicit ConvertUnsignedToSigned(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(QuantizeOpT op, + PatternRewriter& rewriter) const override { + Type output_type = op.getResult().getType(); + auto qtype = QType::getQuantizedElementType(output_type); + if (!qtype || qtype.isSigned()) return failure(); + + int num_bits = qtype.getStorageTypeIntegralWidth(); + if (num_bits == 8) { + // If storage is 8-bit, trained num bits may be less than 8 so check here. + num_bits = + static_cast(std::ceil(std::log2(qtype.getStorageTypeMax()))); + } + // This is a positive value, and will be applied on zero points and fixed + // point ranges. + int64_t offset = + QType::getDefaultMinimumForInteger(/*isSigned=*/false, num_bits) - + QType::getDefaultMinimumForInteger(/*isSigned=*/true, num_bits); + + auto flags = quant::QuantizationFlags::Signed; + QType new_qtype; + if (auto uqtype = qtype.template dyn_cast()) { + new_qtype = quant::UniformQuantizedType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + uqtype.getScale(), uqtype.getZeroPoint() - offset, + uqtype.getStorageTypeMin() - offset, + uqtype.getStorageTypeMax() - offset); + } else if (auto aqtype = qtype.template dyn_cast< + quant::UniformQuantizedPerAxisType>()) { + auto zero_points = aqtype.getZeroPoints(); + llvm::SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); + for (int i = 0, e = new_zero_points.size(); i < e; ++i) { + new_zero_points[i] -= offset; + } + new_qtype = quant::UniformQuantizedPerAxisType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + aqtype.getScales(), new_zero_points, aqtype.getQuantizedDimension(), + aqtype.getStorageTypeMin() - offset, + aqtype.getStorageTypeMax() - offset); + } else { + return failure(); + } + + if (!new_qtype) return failure(); + Type new_output_type = new_qtype.castFromExpressedType( + QType::castToExpressedType(output_type)); + rewriter.replaceOpWithNewOp(op, new_output_type, op.getArg()); + return success(); + } +}; + +// Fold Extra Requantize ops if the preceding ops has free scale requirement. +template +struct FoldTrivalRequantizeOp : public OpRewritePattern { + explicit FoldTrivalRequantizeOp(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(RequantizeOpT op, + PatternRewriter& rewriter) const override { + Value pre_quantized = op->getOperand(0); + auto pre_quantized_type = + quant::QuantizedType::getQuantizedElementType(pre_quantized.getType()); + if (!pre_quantized_type) return failure(); + + Operation* def = pre_quantized.getDefiningOp(); + if (!def) return failure(); + if (llvm::isa(def) || + !def->hasTrait()) { + return failure(); + } + + // This op should not clobber def, if more than one requant of this value. + if (!pre_quantized.hasOneUse()) { + return failure(); + } + + op.emitWarning("Remove trivial `rescale` op. 
Please fix the source graph."); + + llvm::SmallVector new_output_types; + for (auto result : def->getResults()) { + if (result.hasOneUse() && *result.getUsers().begin() == op) { + new_output_types.push_back(op.getResult().getType()); + } else { + new_output_types.push_back(result.getType()); + } + } + + // Remove this rescale op. + rewriter.replaceOp(op, {pre_quantized}); + + // Replace the output scale of the preceding op. + rewriter.setInsertionPointAfter(def); + OperationState new_state(def->getLoc(), def->getName().getStringRef(), + def->getOperands(), new_output_types, + def->getAttrs()); + Operation* new_op = rewriter.create(new_state); + + rewriter.replaceOp(def, new_op->getResults()); + return success(); + } +}; + +// Given a quantized type `input`, magnifying its scales by the factor stored in +// `factor`. If `input` isn't a quantized type or the `factor` doesn't match the +// dimension size of `input` or isn't floating-point, nullptr will be returned. +TypeAttr RescaleQuantizedType(Type input, Attribute factor); + +// Converts the min/max/num_bits/narrow_range information to a +// QuantizedType, and then returns the attribute containing the QuantizedType. +// The `min` and `max` arguments can be FloatAttr or DenseFPElementsAttr and +// returns UniformQuantizedType or UniformQuantizedPerAxisType respectively. +// `narrow_range` is set to true for weights and `is_signed` is set to true +// if it is using signed int symmetric quantization. +// +// Note that this method may broadcast min and max to match the dimension length +// of `input_type`, if the `quant_dim` is valid. On the other hand, the +// symmetry of min and max is not adjusted by this method. The QAT workflow +// should set min/max correctly (and use `narrow_range`=true, `is_signed`=true) +// if symmetric quantization is required. +TypeAttr GetQuantizedTypeAttr(Builder builder, Type input_type, Attribute min, + Attribute max, int quant_dim, + IntegerAttr num_bits, BoolAttr narrow_range, + bool is_signed, bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Casts the `target` type to a quantized type by using the quantization +// parameters from the type in the `source` type attribute. +// Examples: +// f32 -> !quant.uniform +// tensor<4xf32> -> tensor<4x!quant.uniform> +// The result is wrapped by a type attribute. Returns nullptr if the cast +// isn't valid. +// +// `axis` is to specify the quantization dimension in the `target` and only +// used if the element type of `source` is a per-channel quantized type. During +// the casting, the quantization dimension of the result type needs to be set +// this new `axis` value. +TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder, + TypeAttr source, Type target, + int axis); + +// Quantizes the elements in the attribute `real_value` by the quantization +// parameters in `tensor_type`. Returns empty Attribute if the +// `tensor_type` is not a QuantizedType or the quantization fails. +ElementsAttr Quantize(Attribute real_value, Type tensor_type); + +// Quantizes the elements in "legacy mode", where it calls TOCO's methods to +// to quantize values with float scale. +ElementsAttr QuantizeLegacy(Attribute real_value, Type tensor_type); + +// Returns the quantized type for an element attribute. The quantization +// parameters in this type is based on the min and max element of the +// attribute. When the elements in the `attr` are not in floating-point, or +// the value range isn't straddling zero, an empty type is returned. 
The min/max +// are adjusted to be symmetric if `symmetric` flag is set to True. And +// `symmetric` can only be set to true when it is signed and narrow_range. +Type GetUniformQuantizedTypeForWeight(ElementsAttr attr, bool symmetric, + unsigned num_bits, bool is_signed, + bool narrow_range, + bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Returns the per channel quantized type for an element attribute. +// `quant_dim` defines the quantization axis. The channel min/max are adjusted +// to be symmetric if `symmetric` flag is set to True. And `symmetric` can only +// be set to true when it is signed and narrow_range. +Type GetUniformQuantizedPerAxisTypeForWeight( + ElementsAttr attr, int quant_dim, bool symmetric, unsigned num_bits, + bool is_signed, bool narrow_range, bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Returns the quantized type of a bias input, given the quantized types of +// other operands which are multiply-accumulated (the bias is added to the +// accumulated value). +quant::QuantizedType GetUniformQuantizedTypeForBias( + const std::vector& op_types, int adjusted_quant_dim, + bool legacy_float_scale = false); + +// Gets quantization scale specs (e.g. fixed output range, same result and +// operand scales) from the default quantization interfaces. The op should +// outlive returned spec for its interface methods to be properly referenced. +std::unique_ptr GetDefaultQuantScaleSpec(Operation* op); + +// The function might contain more stats ops than required, and it will +// introduce requantize if the calibration stats have conflicts. This method +// tries to remove all the redundant stats ops. +bool RemoveRedundantStatsOps(mlir::func::FuncOp func, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter = + GetDefaultQuantScaleSpec); + +// Given quantization parameters for int8, compute the quantization parameters +// for uint if it is required, and wrap the result in an UniformQuantizedType. +quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point, + int64_t storage_min, + int64_t storage_max); + +quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point); + +// Extracts min and max values from the DenseFPElementsAttr, and stores them +// into `mins` and `maxs`. When mins and maxs are extracted per-channel, +// `dim_size` is number of channels and `slice_size` is the size of slice per +// each channel. When `symmetric` is true, the range is expanded to [-M, M]. +void ExtractMinMaxFromAttr(DenseFPElementsAttr values, int dim_size, + int slice_size, bool symmetric, + SmallVectorImpl& mins, + SmallVectorImpl& maxs); + +// Returns the quantized type for the +// input_type/min/max/storage_type_width/narrow_range. 
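Editorial sketch: the factory functions above (and `GetQuantizedType`, declared below) turn calibrated min/max ranges into quantization parameters. The standard affine formula is shown here with concrete numbers for orientation only; the vendored helpers may round or nudge slightly differently:

#include <cmath>
#include <cstdio>

int main() {
  const double rmin = -1.0, rmax = 3.0;  // calibrated real-valued range
  const int qmin = -128, qmax = 127;     // signed 8-bit storage
  const double scale = (rmax - rmin) / (qmax - qmin);        // 4 / 255 ~= 0.0157
  const long zero_point = std::lround(qmin - rmin / scale);  // -64
  // A real value r is then stored as q = round(r / scale) + zero_point.
  std::printf("scale=%.6f zero_point=%ld\n", scale, zero_point);
  return 0;
}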
+Type GetQuantizedType(Builder builder, Type input_type, ArrayRef min, + ArrayRef max, int quant_dim, + int storage_type_width, bool narrow_range, bool is_signed, + bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/test_base.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/test_base.h new file mode 100644 index 00000000..f33e586c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/test_base.h @@ -0,0 +1,87 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ + +#include + +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/core/platform/test.h" + +namespace mlir::quant { + +using ::testing::Test; + +class QuantizationTestBase : public Test { + protected: + QuantizationTestBase() + : ctx_(stablehlo::CreateMlirContextForQuantization()), + builder_(ctx_.get()) { + ctx_->loadDialect< + arith::ArithDialect, mlir::stablehlo::StablehloDialect, + func::FuncDialect, TF::TensorFlowDialect, TFL::TensorFlowLiteDialect, + tf_saved_model::TensorFlowSavedModelDialect, + tf_executor::TensorFlowExecutorDialect, quant::QuantDialect, + quantfork::QuantizationForkDialect>(); + } + + // Parses `module_op_str` to create a `ModuleOp`. + OwningOpRef ParseModuleOpString( + const absl::string_view module_op_str) { + return parseSourceString(module_op_str, ctx_.get()); + } + + // Convenience function that returns the first operation of type `OpT` from + // the `@main` function in `module_op`. 
Useful when testing with a text + // representation of a `ModuleOp` containing a single function `@main`. + // Returns `failure` iff there is no `@main` or no such operation is found in + // `@main`. + template + FailureOr FindFirstOpFromMainFunc(ModuleOp module_op) { + func::FuncOp main_func_op = FindMainFuncOp(module_op); + if (main_func_op == nullptr) return failure(); + + auto ops = main_func_op.getOps(); + if (ops.empty()) return failure(); + + return *ops.begin(); + } + + std::unique_ptr ctx_; + OpBuilder builder_; +}; + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h new file mode 100644 index 00000000..99815f73 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h @@ -0,0 +1,120 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ + +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace quant { + +// Creates a `UniformQuantizedType` with the given `scale` and `zero_point` +// values. The produced type has f32 as its expressed type and i8 as its +// storage type. The available values use the full range of the storage value, +// i.e. [-128, 127]. Assumes asymmetric quantization, meaning the zero point +// value can be a non-zero value. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. +UniformQuantizedType CreateI8F32UniformQuantizedType(Location loc, + MLIRContext& context, + double scale, + int64_t zero_point, + bool narrow_range = false); + +// Creates a `UniformQuantizedType` with the given `scale` and `zero_point` +// values. The produced type has f32 as its expressed type and i32 as its +// storage type. The available values use the full range of the storage value. +// Assumes asymmetric quantization, meaning the zero point value can be +// a non-zero value. 
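Editorial usage sketch for the i8 factory declared above (the i32 variant follows below). `loc` and `ctx` are assumed to come from the caller, and the scale/zero-point values are illustrative:

UniformQuantizedType MakeExampleI8Type(Location loc, MLIRContext& ctx) {
  // Full-range asymmetric i8 storage for f32 values: storage in [-128, 127].
  // Passing narrow_range = true instead restricts storage to [-127, 127], the
  // symmetric mapping typically used for weights.
  return CreateI8F32UniformQuantizedType(loc, ctx, /*scale=*/0.0157,
                                         /*zero_point=*/-64);
}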
+UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, + MLIRContext& context, + double scale, + int64_t zero_point); + +// Creates a `UniformQuantizedPerAxisType` with the given `scales` and +// `zero_points` values. The produced type has f32 as its expressed type and +// i8 as its storage type. The available values use the full range of the +// storage value, i.e. [-128, 127]. Assumes asymmetric quantization, meaning the +// zero point values can be non-zero values. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. +UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension, + bool narrow_range = false); + +// Creates a `UniformQuantizedPerAxisType` with the given `scales` and +// `zero_points` values. The produced type has f32 as its expressed type and +// i32 as its storage type. The available values use the full range of the +// storage value. Assumes asymmetric quantization, meaning the +// zero point values can be non-zero values. +UniformQuantizedPerAxisType CreateI32F32UniformQuantizedPerAxisType( + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension); + +bool IsStorageTypeI8(QuantizedType quantized_type); + +bool IsStorageTypeI32(QuantizedType quantized_type); + +bool IsExpressedTypeF32(QuantizedType quantized_type); + +// Given a value, extract the `ElementType`. +// `value` should be a non-null `TensorType`. +inline Type GetElementType(const Value value) { + return mlir::cast(value.getType()).getElementType(); +} + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedType(Type type); + +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedPerAxisType(Type type); + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedType(Type type); + +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedPerAxisType(Type type); + +// Determines whether the storage type of a quantized type is supported by +// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. +bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type); + +// Returns true if a type is quantized tensor type. +bool IsQuantizedTensorType(Type type); + +// Returns true if all operands and results are quantized. +bool IsOpFullyQuantized(Operation* op); + +// Returns true iff none among operand and result tensors are quantized. 
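Editorial sketch of how the predicates above compose; this is one plausible implementation written against the declared helpers, not necessarily the one in the vendored sources (`IsOpNotQuantized` itself is declared immediately below):

#include "llvm/ADT/STLExtras.h"

bool IsOpFullyQuantizedSketch(Operation* op) {
  const auto is_quantized = [](Type type) { return IsQuantizedTensorType(type); };
  return llvm::all_of(op->getOperandTypes(), is_quantized) &&
         llvm::all_of(op->getResultTypes(), is_quantized);
}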
+bool IsOpNotQuantized(Operation* op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h new file mode 100644 index 00000000..9e1950af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h @@ -0,0 +1,77 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ + +#include +#include +#include + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace stablehlo::quantization { + +// Calculates the bin width from the range and expected number of bins. The +// bin width is formalized to the form of 2^n. As a consequence, the actual +// number of bins might be smaller than the given `num_bins`. +inline float CalculateBinWidth(const float min_value, const float max_value, + const int32_t num_bins) { + const float raw_bin_width = (max_value - min_value) / num_bins; + return std::pow(2, std::ceil(std::log2(raw_bin_width))); +} + +// Calculates the lower bound of the histogram. The lower bound is in form of +// `N * bin_width`. +inline float CalculateLowerBound(const float min_value, const float bin_width) { + return std::floor(min_value / bin_width) * bin_width; +} + +// Calculates the bin index of the current value. +inline int32_t CalculateBinIndex(const float value, const float lower_bound, + const float bin_width) { + return std::floor((value - lower_bound) / bin_width); +} + +// Same as `CalculateBinIndex` but clamps to avoid out-of-bound. +inline int32_t CalculateBinIndexSafe(const float value, const float lower_bound, + const float bin_width, + const int32_t num_bins) { + const int32_t bin_index = CalculateBinIndex(value, lower_bound, bin_width); + return std::clamp(bin_index, 0, num_bins - 1); +} + +// Checks if the given method is a histogram-based calibration method. +inline bool IsHistogramCalibration( + const CalibrationOptions::CalibrationMethod method) { + return method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE || + method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE || + method == CalibrationOptions:: + CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY || + method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC; +} + +// Gets the number of bins for the given calibration method. 
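Editorial worked example for the histogram helpers above (`GetNumBins` follows below); the numbers are chosen for illustration:

#include <cassert>

void HistogramParameterExample() {
  // Calibrated range [-0.3, 12.5] with 512 requested bins.
  const float bin_width = CalculateBinWidth(-0.3f, 12.5f, 512);
  // Raw width 12.8 / 512 = 0.025 is rounded up to the next power of two,
  // 2^-5 = 0.03125, so fewer than 512 bins are actually needed.
  assert(bin_width == 0.03125f);

  const float lower_bound = CalculateLowerBound(-0.3f, bin_width);
  // floor(-0.3 / 0.03125) * 0.03125 = -10 * 0.03125 = -0.3125.
  assert(lower_bound == -0.3125f);

  // A sample value of 1.0 lands in bin floor((1.0 + 0.3125) / 0.03125) = 42.
  assert(CalculateBinIndexSafe(1.0f, lower_bound, bin_width, 512) == 42);
}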
+inline int32_t GetNumBins(const CalibrationOptions& calib_opts) { + return IsHistogramCalibration(calib_opts.calibration_method()) + ? calib_opts.calibration_parameters().num_bins() + : 0; +} + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h new file mode 100644 index 00000000..03d2dd93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h @@ -0,0 +1,122 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_COMPONENT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_COMPONENT_H_ + +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs post-calibration graph transformation as part of post-training +// static-range quantization. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the statistics collected +// after the calibration step, corresponding to each `TF::CustomAggregatorOp`s +// in the input module op. +// +// TODO: b/320607042 - Add tests for this component on the python layer. +class CalibrationComponent : public Component { + public: + // Name of the post-training quantization post-calibration step. Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_calibration"; + + // `CalibrationComponent` ctor with necessary information required to run + // calibration on a `ModuleOp`. Meta information like `function_aliases`, + // `tags`, `signature_def_map`, and `signature_keys` are required to properly + // save and load the module_op to and from SavedModel. + // `representative_dataset_file_map` contains information about the + // calibration dataset. 
+ CalibrationComponent( + absl::Nonnull ctx, + absl::Nonnull + py_function_lib, + absl::string_view src_saved_model_path, + absl::flat_hash_map function_aliases, + std::unordered_set tags, + absl::flat_hash_map + signature_def_map, + std::vector signature_keys); + + // Runs calibration on `module_op` and returns a calibrated ModuleOp with + // calibrated statistics embedded. + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + // Exports `module_op` to SavedModel at `dst_saved_model_path`. This is used + // to export the pre-calibrated `module_op` to SavedModel so that the + // calibration process can use it to load and run the graph with the + // representative dataset. Returns a failure status if the export fails. + absl::Status ExportToSavedModel(ModuleOp module_op, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data, + absl::string_view dst_saved_model_path); + + // Imports the SavedModel at `calibrated_saved_model_path` to `ModuleOp` after + // running calibration. + absl::StatusOr ImportCalibratedSavedModel( + absl::string_view calibrated_saved_model_path); + + absl::Nonnull ctx_; + + // Contains function implementations from the python layer. Should be injected + // from the python level using pybind11. + absl::Nonnull + py_function_lib_; + + // Path to the pre-calibrated SavedModel. + std::string src_saved_model_path_; + + // Function alias mapping for pre-calibrated SavedModel. Used to preserve + // aliased functions. + absl::flat_hash_map function_aliases_; + + // Tags to identify the MetaGraphDef to load from a SavedModel. + const std::unordered_set tags_; + + const absl::flat_hash_map + signature_def_map_; + + // Signature keys to identify the functions to load & quantize. + const std::vector signature_keys_; +}; + +// Runs passes to prepare the calibration model. +absl::Status RunCalibrationPasses(mlir::ModuleOp module_op, MLIRContext& ctx, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_COMPONENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h new file mode 100644 index 00000000..5302bad4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h @@ -0,0 +1,28 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_
+
+#include <utility>
+
+namespace stablehlo::quantization {
+
+// Represents the (min, max) value pair: the range of values observed after
+// calibration for quantization.
+using MinMaxValue = std::pair<float, float>;
+
+}  // namespace stablehlo::quantization
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h
new file mode 100644
index 00000000..33357630
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h
@@ -0,0 +1,41 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_
+
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+
+namespace stablehlo::quantization {
+
+// Translates a set of `RepresentativeDatasetConfig`s to a signature key ->
+// `RepresentativeDatasetFile` mapping. This is useful when passing
+// `RepresentativeDatasetConfig`s to components that accept the legacy
+// `RepresentativeDatasetFile` mapping.
+// Returns a non-OK status when there is a duplicate signature key among
+// `representative_dataset_configs`.
+absl::StatusOr<absl::flat_hash_map<
+    std::string, tensorflow::quantization::RepresentativeDatasetFile>>
+CreateRepresentativeDatasetFileMap(absl::Span<const RepresentativeDatasetConfig>
+                                       representative_dataset_configs);
+
+}  // namespace stablehlo::quantization
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h
new file mode 100644
index 00000000..41f78be3
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h
@@ -0,0 +1,50 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" + +namespace stablehlo::quantization { + +// Reads the calibration statistics from the given directory. +absl::StatusOr> +ReadStatistics(absl::string_view calibration_data_dir); + +// Adds calibrated min / max values to CustomAggregator nodes in `graph_def`. +// The min and max values will be added to the "min" and "max" attributes, +// respectively. `calibration_options` provides the strategy to retrieve min and +// max values. +absl::Status AddCalibrationStatistics( + mlir::ModuleOp module_op, absl::string_view calibration_data_dir, + const stablehlo::quantization::CalibrationOptions& calibration_options, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); + +// Checks if the model required calibration. +bool IsCalibrationRequired(mlir::ModuleOp module_op); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h new file mode 100644 index 00000000..a1ddb5cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_
+
+#include "absl/status/statusor.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
+namespace mlir::quant::stablehlo {
+
+// Component is a public abstraction for StableHLO Quantizer that represents
+// the most basic unit of action applied to the StableHLO graph. Derived
+// classes should override the `Run` method to implement the action.
+class Component {
+ public:
+  virtual ~Component() = default;
+
+  // Runs the action on the StableHLO graph passed in `module_op`. `config`
+  // should provide the information necessary to configure the action's
+  // behavior.
+  virtual absl::StatusOr<ModuleOp> Run(
+      ModuleOp module_op,
+      const ::stablehlo::quantization::QuantizationConfig& config) = 0;
+};
+
+}  // namespace mlir::quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h
new file mode 100644
index 00000000..f668cacd
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h
@@ -0,0 +1,65 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
+
+#include <optional>
+
+#include "absl/base/attributes.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
+namespace stablehlo::quantization {
+
+// Returns a copy of `user_provided_config` with default values populated for
+// the fields the user did not explicitly specify.
+QuantizationConfig PopulateDefaults(
+    const QuantizationConfig& user_provided_config);
+
+// Returns a copy of `QuantizationConfig` where presets are expanded and
+// transformed into other fields in `QuantizationConfig`.
+//
+// The expansion rules are as follows:
+// * StaticRangePtqPreset
+//   - The preset's `representative_datasets` field will be transferred to
+//     `QuantizationConfig.calibration_options.representative_datasets`, unless
+//     the user explicitly provided representative dataset configs to
+//     `calibration_options`. In that case, the explicit configs take
+//     precedence and the preset's configs are ignored.
+//   - For `QuantizationSpecs`, the expanded `QuantizationSpec`s will be
+//     populated first and user-provided `QuantizationSpec`s, if any, will be
+//     appended. This expresses the fact that user-provided specs take
+//     precedence.
+// * Preset unspecified
+//   - No-op.
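Editorial usage sketch for the configuration entry points above (`ExpandPresets` is declared immediately below). The defaults-then-presets ordering is an assumption for illustration, not a documented contract:

// `user_config` is a caller-provided QuantizationConfig.
QuantizationConfig NormalizeConfigSketch(const QuantizationConfig& user_config) {
  // Fill unset fields with defaults, then expand presets such as
  // StaticRangePtqPreset into explicit calibration options and specs.
  return ExpandPresets(PopulateDefaults(user_config));
}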
+QuantizationConfig ExpandPresets(const QuantizationConfig& config); + +// Returns whether a given QuantizationSpecs has the given quantization method. +bool HasQuantizationMethod(const QuantizationSpecs& specs, + Method::MethodCase method_case); + +// Convenience function for converting the optional `report_file_path` field to +// `std::optional`, where `std::nullopt` represents that the +// field is not explicitly set. The returned value is a reference type +// (`absl::string_view`) so its lifetime is bound to the input `config`. +inline std::optional GetReportFilePath( + const QuantizationConfig& config ABSL_ATTRIBUTE_LIFETIME_BOUND) { + return config.has_report_file_path() + ? std::make_optional(config.report_file_path()) + : std::nullopt; +} + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h new file mode 100644 index 00000000..7d03564a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONTEXT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONTEXT_H_ + +#include + +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir::quant::stablehlo { + +// Creates an MLIRContext with the extensions required for quantization are +// registered. +inline std::unique_ptr CreateMlirContextForQuantization() { + DialectRegistry registry{}; + func::registerAllExtensions(registry); + return std::make_unique(registry); +} + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h new file mode 100644 index 00000000..feae1444 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace stablehlo::quantization { + +// Disables debugging on `DumpTensor` ops. +void DisableDebugging(mlir::ModuleOp module_op); + +// Changes the filename from `unquantized_tensor_data.pb` to +// `quantized_tensor_data.pb`. +void ChangeToQuantizedFilename(mlir::ModuleOp module_op); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h new file mode 100644 index 00000000..5796b18e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ + +#include + +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace stablehlo::quantization { + +// Mutates all `NodeDef`s in `graph_def` by applying `func`. It modifies the +// top-level `NodeDef`s as well as all `NodeDef`s in the function library. +// `func` should accept a `NodeDef` reference. +template >> +void MutateNodeDefs(tensorflow::GraphDef& graph_def, FuncT&& func) { + for (tensorflow::NodeDef& node_def : *graph_def.mutable_node()) { + func(node_def); + } + + for (tensorflow::FunctionDef& function_def : + *graph_def.mutable_library()->mutable_function()) { + for (tensorflow::NodeDef& node_def : *function_def.mutable_node_def()) { + func(node_def); + } + } +} + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h new file mode 100644 index 00000000..39c99436 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h @@ -0,0 +1,73 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tsl/platform/env.h" +#include "tsl/platform/errors.h" + +namespace stablehlo::quantization::io { + +// Generates a unique local tmp file name. This function only generates the name +// (path) and doesn't actually creates the file. +absl::StatusOr GetLocalTmpFileName(tsl::Env* env); + +// Generates a unique local tmp file name. This function only generates the name +// (path) and doesn't actually creates the file. The default environment +// `tsl::Env::Default` is used to generate the name. +absl::StatusOr GetLocalTmpFileName(); + +// Creates a temporary directory on an environment defined by the implementation +// of `tsl::Env` and returns its path. Returns an InternalError status if +// failed. +absl::StatusOr CreateTmpDir(tsl::Env* env); + +// Creates a temporary directory and returns its path. Returns an InternalError +// status if failed. The file system used will be the default environment +// returned by `tsl::Env::Default`. +absl::StatusOr CreateTmpDir(); + +// Convenience function for writing string `data` to file without the need to +// pass `tsl::Env` instance. Internally it uses the default `tsl::Env::Default`. +absl::Status WriteStringToFile(absl::string_view file_path, + absl::string_view data); + +// Convenience function for reading string data from file at `file_path` without +// the need to pass `tsl::Env` instance. Internally it uses the default +// `tsl::Env::Default`. Returns an OK status with string data containing file +// contents. Returns non-ok status upon error, e.g. file doesn't exist. +absl::StatusOr ReadFileToString(absl::string_view file_path); + +// Lists all files and directories under the given directory. +absl::StatusOr> ListDirectory( + absl::string_view directory); + +template +absl::StatusOr ReadBinaryProto(const std::string& binary_file_path) { + MessageT message; + TF_RETURN_IF_ERROR( + tsl::ReadBinaryProto(tsl::Env::Default(), binary_file_path, &message)); + return message; +} + +} // namespace stablehlo::quantization::io + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h new file mode 100644 index 00000000..408152f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h @@ -0,0 +1,75 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PASS_PIPELINE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PASS_PIPELINE_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Adds passes for static-range quantization pre-calibration. Inserts ops +// required to collect tensor statistics. +void AddPreCalibrationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::CalibrationOptions& calibration_options, + const ::stablehlo::quantization::QuantizationSpecs& specs, + const ::stablehlo::quantization::DebuggerConfig& debugger_config); + +// Adds passes for static-range quantization post-calibration. Utilizes tensor +// statistics collected from the calibration step and performs quantization. +void AddPostCalibrationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::PipelineConfig& pipeline_config, + const ::stablehlo::quantization::QuantizationSpecs& specs); + +// Adds passes for weight-only quantization. +void AddWeightOnlyQuantizationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::QuantizationSpecs& quantization_specs, + const ::stablehlo::quantization::PipelineConfig& pipeline_config, + const ::stablehlo::quantization::DebuggerConfig& debugger_config); + +// Deserializes StableHLO functions serialized and embedded in XlaCallModuleOps. +void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm); + +// Legalizes shape/tensor/arith dialect ops to StableHLO for handling dynamic +// shapes, by going through a round-trip to MHLO. +void AddShapeLegalizationPasses(OpPassManager& pm); + +// Serializes the StableHLO module into a tf.XlaCallModuleOp for compatibility +// with passes that expect TF format. This also allows the StableHLO ops to be +// exported as a TF SavedModel. +void AddCallModuleSerializationPasses(OpPassManager& pm); + +// Passes for unpacking quantized ops to int valued StableHLO ops. This is +// useful when uniform quantized types are suboptimal for the hardware. It goes +// through a StableHLO <-> MHLO roundtrip to utilize the MHLOQuantToInt pass. +void AddStablehloQuantToIntPasses(OpPassManager& pm); + +// Processes tensors with NCHW format (== (batch, channel, height, weight)) by +// converting them to NHWC formats along with extra optimizations such as +// constant folding the transpose->convolution pattern. This is useful when +// downstream pipeline (e.g. XLA) is more optimized when accepting NHWC formats. +void AddProcessNchwTensorPasses(OpPassManager& pm); + +// Registers quantization pass pipelines. This is only required when running +// MLIR opt binaries and not required when adding passes programmatically. 
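Editorial sketch of driving one of the pipeline builders above from C++ (`RegisterPassPipelines` is declared just below). `module_op`, `pipeline_config`, and `specs` come from the caller, and an `absl/status/status.h` include is assumed:

absl::Status RunPostCalibrationSketch(
    mlir::ModuleOp module_op,
    const ::stablehlo::quantization::PipelineConfig& pipeline_config,
    const ::stablehlo::quantization::QuantizationSpecs& specs) {
  // PassManager derives from OpPassManager, so it can be handed to the
  // pipeline builders declared in this header.
  mlir::PassManager pm(module_op.getContext());
  AddPostCalibrationPasses(pm, pipeline_config, specs);
  if (mlir::failed(pm.run(module_op))) {
    return absl::InternalError("Post-calibration pipeline failed.");
  }
  return absl::OkStatus();
}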
+void RegisterPassPipelines(); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PASS_PIPELINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h new file mode 100644 index 00000000..35b1082b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h @@ -0,0 +1,44 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" // IWYU pragma: keep; required to include the definition of ArrayRef +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" // IWYU pragma: keep; required to include the definition of SmallVector +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir::quant { + +// Permutes `values` with `permutation`. Returns the permuted values. Sizes of +// `values` and `permutation` must be equal, and the elements of `permutation` +// should be less than `values.size()`. +template , void>> +SmallVector Permute(const ArrayRef values, + const ArrayRef permutation) { + SmallVector permuted_values(/*Size=*/values.size(), /*Value=*/T{}); + for (auto [i, permutation_idx] : llvm::enumerate(permutation)) { + permuted_values[i] = std::move(values[permutation_idx]); + } + return permuted_values; +} + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h new file mode 100644 index 00000000..6e376281 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h @@ -0,0 +1,59 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_POST_CALIBRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_POST_CALIBRATION_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs post-calibration graph transformation as part of post-training +// static-range quantization. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the statistics collected +// after the calibration step, corresponding to each `TF::CustomAggregatorOp`s +// in the input module op. +class PostCalibrationComponent : public Component { + public: + // Name of the post-training quantization post-calibration step. Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_post_calibration"; + + explicit PostCalibrationComponent(absl::Nonnull ctx); + + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + void AddPasses( + OpPassManager& pm, + const ::stablehlo::quantization::QuantizationSpecs& specs, + const ::stablehlo::quantization::PipelineConfig& pipeline_config) const; + + private: + absl::Nonnull ctx_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_POST_CALIBRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h new file mode 100644 index 00000000..bdc61baf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
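A sketch of how the post-calibration step above might be driven directly from C++, assuming the `MLIRContext` already has the required dialects loaded and that `Run` yields the transformed `ModuleOp` on success (the exact `StatusOr` payload is not visible in this diff).

#include "absl/status/statusor.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"

// Runs the post-calibration component on a module that already carries
// statistics collected during the calibration step.
inline absl::StatusOr<mlir::ModuleOp> RunPostCalibration(
    mlir::MLIRContext& ctx, mlir::ModuleOp module_op,
    const ::stablehlo::quantization::QuantizationConfig& config) {
  mlir::quant::stablehlo::PostCalibrationComponent component(&ctx);
  return component.Run(module_op, config);
}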
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRE_CALIBRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRE_CALIBRATION_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs pre-calibration graph transformation as part of post-training +// static-range quantization. + +// The resulting `ModuleOp` contains `TF::CustomAggregatorOp`s for collecting +// quantization statistics, along with `TF::XlaCallModuleOp`s that correspond to +// lifted quantizable functions. +class PreCalibrationComponent : public Component { + public: + // Name of the post-training quantization pre-calibration step. Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_pre_calibration"; + + explicit PreCalibrationComponent(absl::Nonnull ctx); + + absl::StatusOr Run( + ModuleOp, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + absl::Nonnull ctx_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRE_CALIBRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h new file mode 100644 index 00000000..8252dda6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h @@ -0,0 +1,71 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ + +#include + +#include "absl/status/status.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +// A class that manages information about `QuantizableUnit`s post-quantization, +// internally in the form of `QuantizationUnits`. It is used to collect +// quantization summary from a quantized `ModuleOp` and emit it in a human- and +// machine-readable format. +class QuantizationReport { + public: + QuantizationReport() = default; + + // Initializes `QuantizationReport` by collecting `QuantizationResults` from + // `module_op`. 
+ explicit QuantizationReport(ModuleOp module_op); + + // Adds a `QuantizationResult` to the report. + void AddQuantizationResult( + ::stablehlo::quantization::QuantizationResult&& result); + + // Returns `QuantizationResults` that are registered in this report. + const ::stablehlo::quantization::QuantizationResults& GetQuantizationResults() + const { + return quantization_results_; + } + + // Returns a human-readable string representation of this report. + std::string ToString() const; + + // Prints a human-readable report to stdout. + void Print() const; + + // Saves the report to `file_path`. The textproto representation of + // `QuantizationResults` will be written to the file. Returns non-ok status + // when the file write fails. + absl::Status Save(StringRef file_path) const; + + private: + ::stablehlo::quantization::QuantizationResults CollectResultsFromModuleOp( + ModuleOp module_op) const; + + // Quantization results that are registered in this report. A quantization + // result may be added manually by calling `AddQuantizationResult`. + ::stablehlo::quantization::QuantizationResults quantization_results_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h new file mode 100644 index 00000000..357c5b0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h @@ -0,0 +1,142 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functionalities for exporting MLIR ModuleOp to TensorFlow SavedModel. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_EXPORT_H_ + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" + +namespace mlir::quant::stablehlo { + +// Suffix string for the module export step. Used for debugging. 
+constexpr absl::string_view kExportStepSuffix = "_export"; + +// Options when running passes for exporting an MLIR ModuleOp. +struct ExportOptions { + // If set to `true`, it runs `DuplicateShapeDeterminingConstantsPass` before + // lowering to tf_executor dialect. + bool duplicate_shape_determining_constants = true; + + // If set to `true`, unfreezes constants into variables and saves them to a + // checkpoint file. Setting this to `true` is an experimental feature that has + // no stability guarantees. + bool unfreeze_constants = false; + + // Path to the directory where checkpoint files are saved. + std::string checkpoint_dir = ""; + + // Name used to identify the ModuleOp this is exporting. Only used for + // debugging and does not modify the behavior of the export. + std::string debug_name = "stablehlo_quant"; +}; + +// Creates `ExportedModel` from `module_op`. `module_op` goes through post +// process passes before an `ExportModel` is created. +// TODO: b/329206105 - Add unit tests after decomposing post processing passes. +absl::StatusOr CreateExportedModel( + const std::vector& signature_keys, + const std::unordered_set& tags, + const ::stablehlo::quantization::QuantizationConfig& quantization_config, + absl::string_view debug_name_prefix, + const absl::flat_hash_map& function_aliases, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op); + +// Factory function for `ExportedModel`. +[[nodiscard]] tensorflow::quantization::ExportedModel +CreateExportedModelFromGraphDef( + tensorflow::GraphDef&& graph_def, absl::string_view init_node_name, + absl::string_view checkpoint_dir, + std::optional saver_def, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs); + +// Creates a new `SaverDef` instance, which contains information regarding +// checkpoint saving and restoring. This function returns a `SaverDef` instance +// with four fields populated: `version`, `filename_tensor_name`, +// `restore_op_name` and `save_tensor_name`. For valid quantized `graph_def` and +// `control_ret_node_names`, it should be able to retrieve the last three fields +// if there is at lest one variable in the graph. +// +// Returns a `std::nullopt` if there are no variables in the graph and no saving +// & restoring are required. Returns an `InternalError` status for when the +// required fields are only partially provided. +absl::StatusOr> CreateSaverDef( + const std::vector& control_ret_node_names, + const tensorflow::GraphDef& graph_def); + +// Adds passes for transforming the MLIR module op so that it can be exported +// back to GraphDef. Roughly, this consists of: +// 1) Inserting the @main function, which will become the main Graph. +// 2) Duplicating shape-determining constants. +// 3) Converting TF dialect -> tf_executor dialect. +// 4) Adding initializer function's ops into @main function for correct +// resource initialization when loading the exported model. +// +// Duplicating shape-determining constants is required to place constants that +// affect the shape of a tensor to be placed in the TPU graph instead of in the +// CPU graph, when the graph gets converted for TPU inference. This allows these +// constants to be known at XLA compilation time. +void AddExportPasses(mlir::PassManager& pm, + bool duplicate_shape_determining_constants); + +// Converts MLIR ModuleOp to `ExportedModel`. Returns `InternalError` status +// when the conversion fails. +// +// * `checkpoint_dir` is the directory where checkpoints where variable values +// are stored. 
This value will be fed to the "file_prefix" tensor to restore the +// variables. +// * `function_aliases` maps the actual function name to the function alias. +// This associates the quantized functions to the original functions' aliases. +// If there were no function aliases in the input model, this should be empty. +// * `asset_file_defs` include information about the assets, if any, that are +// used directly to initialize resources (like hash tables). If no assets are +// used in the model, this should be empty. +absl::StatusOr +ConvertMlirModuleToExportedModel( + mlir::ModuleOp module_op, absl::string_view checkpoint_dir, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs); + +// Sets up and runs the passes for exporting `module_op`. The behavior of the +// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that +// associate the input arguments of @main and the asset file names. Asset file +// names will be used to feed the corresponding tensors during initialization +// upon model loading. +// TODO: b/329206105 - Add unit tests after decomposing post processing passes. +absl::StatusOr> RunExportPasses( + const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h new file mode 100644 index 00000000..9918b144 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h @@ -0,0 +1,90 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functionalities for importing MLIR ModuleOp from TensorFlow SavedModel. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_IMPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_IMPORT_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +// Represents a pair of `mlir::ModuleOp` and `tensorflow::SavedModelBundle`. 
The +// SavedModelBundle complements the imported ModuleOp by providing access to +// `tensorflow::Session` which may be useful when reading values from resources +// (e.g. `TF::VarHandleOp`s). +using ImportedMlirModuleOp = + std::pair, + std::unique_ptr<::tensorflow::SavedModelBundle>>; + +// Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`. +// +// `tags` identify the `tensorflow::MetaGraphDef` to load from the SavedModel. +// Similarly, `signature_keys` identify the functions (`SignatureDef`s) to load +// within the `MetaGraphDef`. `ctx` is the `MLIRContext`, which should outlive +// the returned `ModuleOp`, thus marked with the lifetime bound attribute. +// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes. +absl::StatusOr SavedModelToMlirModuleOp( + absl::string_view saved_model_path, + const std::unordered_set& tags, + const std::vector& signature_keys, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND); + +// Gets the function aliases from the SavedModel. +absl::StatusOr> +GetFunctionAliases(absl::string_view saved_model_path, + const std::unordered_set& tags); + +// Updates the function aliases. `module_op` may have different +// function names from the original model, so it re-associates the aliases +// with the new function names. Both the input `function_aliases` and the +// returned value are function name -> alias mappings. `function_aliases` is +// the function alias mapping of the original function. The original function's +// name is retrieved by looking at the "tf._original_func_name" string attribute +// attached to a `func::FuncOp`. +void UpdateFunctionAliases( + absl::flat_hash_map& function_aliases, + ModuleOp module_op); + +// Loads a SavedModel to `mlir::ModuleOp` and performs preprocesses including +// shape inference and graph freezing. +// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes. +absl::StatusOr> ImportSavedModel( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const ::stablehlo::quantization::QuantizationConfig& quantization_config, + absl::string_view mlir_dump_file_prefix, + absl::flat_hash_map& function_aliases, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_IMPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h new file mode 100644 index 00000000..69bd9da6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h @@ -0,0 +1,103 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
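A rough sketch of the import flow declared above: fetch the function aliases first, then import the SavedModel with them so the aliases can be re-associated after any function renaming. The paths, tags, and signature keys are placeholders, and `auto` is used because the exact `StatusOr` payloads are not spelled out in this diff.

#include <string>
#include <unordered_set>
#include <vector>

#include "absl/status/status.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"

inline absl::Status ImportForQuantization(mlir::MLIRContext& ctx) {
  const std::string saved_model_path = "/tmp/fp32_saved_model";  // placeholder
  const std::unordered_set<std::string> tags = {"serve"};
  const std::vector<std::string> signature_keys = {"serving_default"};

  auto aliases =
      mlir::quant::stablehlo::GetFunctionAliases(saved_model_path, tags);
  if (!aliases.ok()) return aliases.status();

  ::stablehlo::quantization::QuantizationConfig config;
  auto module = mlir::quant::stablehlo::ImportSavedModel(
      saved_model_path, signature_keys, tags, config,
      /*mlir_dump_file_prefix=*/"quant_import", *aliases, ctx);
  if (!module.ok()) return module.status();

  // `*module` owns the imported ModuleOp for the rest of the pipeline.
  return absl::OkStatus();
}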
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_STATIC_RANGE_PTQ_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_STATIC_RANGE_PTQ_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::quant::stablehlo { + +// Component for static-range post-training quantization (PTQ). +// TODO: b/320607042 - Add tests in python level. +class StaticRangePtqComponent : public Component { + public: + // Name of this component. Used for debugging purposes. + static constexpr absl::string_view kName = "quant_static_range_ptq"; + + // Constructs `StaticRangePtqComponent` by creating three sub-components: + // `PreCalibrationComponent`, `CalibrationComponent`, and + // `PostCalibrationComponent`. These are stored in `sub_components_` in + // sequence. All arguments except `ctx` is used to initialize + // `CalibrationComponent`. For detailed explanation of each argument, see the + // comment of `CalibrationComponent`'s constructor. + StaticRangePtqComponent( + absl::Nonnull ctx, + absl::Nonnull + py_function_library, + absl::string_view src_saved_model_path, + std::vector signature_keys, + std::unordered_set tags, + absl::flat_hash_map + signature_def_map, + absl::flat_hash_map function_aliases); + + // Runs the static-range post-training quantization (PTQ) on `module_op`. + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + // A non-owning `MLIRContext`. This `MLIRContext` should exceed the lifetime + // of `StaticRangePtqComponent`. + absl::Nonnull ctx_; + // This component consists of three sub-components, `PreCalibrationComponent`, + // `CalibrationComponent`, and `PostCalibrationComponent`. + std::array, 3> sub_components_; +}; + +// Runs static-range post-training quantization (PTQ) on a SavedModel at +// `src_saved_model_path` and saves the resulting model to +// `dst_saved_model_path`. +// +// `quantization_config` configures the quantization behavior for the +// static-range PTQ. +// +// `signature_keys` specify the signatures that correspond to functions to be +// quantized. `signature_def_map` connects the signature keys to +// `SignatureDef`s. +// +// Returns a non-OK status when the quantization is not successful. 
+// LINT.IfChange +absl::Status QuantizeStaticRangePtq( + absl::string_view src_saved_model_path, + absl::string_view dst_saved_model_path, + const ::stablehlo::quantization::QuantizationConfig& quantization_config, + const std::vector& signature_keys, + const absl::flat_hash_map& + signature_def_map, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); +// LINT.ThenChange(../python/pywrap_quantization.cc:static_range_ptq) + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_STATIC_RANGE_PTQ_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h new file mode 100644 index 00000000..c2166330 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h @@ -0,0 +1,31 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TYPES_H_ + +#include + +namespace mlir::quant::stablehlo { + +// Introduces aliases for `std::string` to distinguish btw. function name and +// its alias, to prevent confusion when used together in a container. For +// example, it is easy to confuse function name -> alias mapping with alias -> +// function name mapping when both are just represented as `std::string`. +using FunctionAlias = std::string; +using FunctionName = std::string; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h new file mode 100644 index 00000000..bf23e932 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h @@ -0,0 +1,80 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
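A hedged sketch of calling the static-range PTQ entry point declared above. The SavedModel paths and signature key are placeholders, the `SignatureDef` map is assumed to be keyed by signature key, and a real caller receives `py_function_library` from the Python bindings.

#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/status/status.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"

inline absl::Status RunStaticRangePtq(
    const tensorflow::quantization::PyFunctionLibrary& py_function_library,
    const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
        signature_def_map) {
  ::stablehlo::quantization::QuantizationConfig config;
  const std::vector<std::string> signature_keys = {"serving_default"};
  return mlir::quant::stablehlo::QuantizeStaticRangePtq(
      /*src_saved_model_path=*/"/tmp/fp32_model",        // placeholder
      /*dst_saved_model_path=*/"/tmp/quantized_model",   // placeholder
      config, signature_keys, signature_def_map, py_function_library);
}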
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_ + +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs int8 weight-only quantization on dot_general ops. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the weight constants, not +// relying on calibration. +class WeightOnlyPtqComponent : public Component { + public: + // Used for debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_weight_only"; + + explicit WeightOnlyPtqComponent(absl::Nonnull ctx); + + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + absl::Nonnull ctx_; +}; + +// Runs weight-only quantization on a SavedModel at +// `src_saved_model_path` and saves the resulting model to +// `dst_saved_model_path`. +// +// `quantization_config` configures the quantization behavior for the +// weight-only quantization. +// +// `signature_keys` specify the signatures that correspond to functions to be +// quantized. `signature_def_map` connects the signature keys to +// `SignatureDef`s. +// +// Returns a non-OK status when the quantization is not successful. +// LINT.IfChange +absl::Status QuantizeWeightOnlyPtq( + absl::string_view src_saved_model_path, + absl::string_view dst_saved_model_path, + ::stablehlo::quantization::QuantizationConfig quantization_config, + const std::vector& signature_keys, + const absl::flat_hash_map& + signature_def_map, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); +// LINT.ThenChange(../python/pywrap_quantization.cc:weight_only_ptq) + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h new file mode 100644 index 00000000..e690e625 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h @@ -0,0 +1,52 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassInstrumentation.h" // from @llvm-project + +namespace mlir::quant::stablehlo { + +// A `PassInstrumentation` that saves quantization report to file after +// `QuantizeCompositeFunctionsPass` is run. It inspects the `ModuleOp` after +// quantization and analyzes the quantizable units and quantization methods +// used. The report file will be saved at the `file_path`. The report file +// contains textproto of `QuantizationResults`. `file_path`'s base directories +// should exist (this pass instrumentation will not `mkdir` them). +// +// See `QuantizationReport` for further details on the quantization report. +class SaveQuantizationReportInstrumentation : public PassInstrumentation { + public: + // `file_path` is the path to save the report file. The report file is in + // textproto format so a `.txtpb` extension is preferred but it doesn't result + // in error if other extension is used. This instrumentation will not be run + // if `file_path` is a `nullopt`. + explicit SaveQuantizationReportInstrumentation( + std::optional file_path); + + void runAfterPass(Pass* pass, Operation* op) override; + + private: + std::optional file_path_; // Path to file to save the report. +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h new file mode 100644 index 00000000..6c688e82 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Returns StableHLO quantization specs for an op. 
+std::unique_ptr<OpQuantSpec> GetStableHloOpQuantSpec(Operation* op); + +// Returns quantization constraints (ex: fixed output, same scale) given +// a StableHLO op. +std::unique_ptr<OpQuantScaleSpec> GetStableHloQuantConstraints(Operation* op); + +// Checks if an op is quantizable in StableHLO quantizer. Argument op is not +// necessarily a StableHLO op. +bool IsOpQuantizableStableHlo(Operation* op); + +}  // namespace mlir::quant::stablehlo + +#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h new file mode 100644 index 00000000..9d19c6e7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h @@ -0,0 +1,62 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +    http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_BRIDGE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_BRIDGE_PASSES_H_ + +#include <memory> + +#define GEN_PASS_DECL +#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project +#include "mlir/Pass/Pass.h"  // from @llvm-project + +namespace mlir::quant::stablehlo { + +// Creates an instance of the ConvertTFQuantOpsToMHLOPass pass, which will +// convert TF uniform quantized ops to the corresponding quantized MHLO ops. +std::unique_ptr<OperationPass<func::FuncOp>> +CreateConvertTFQuantOpsToMHLOPass(); + +// TODO(b/288094093): Migrate uniform quantization legalization in a separate +// pass. +void PopulateLegalizeTfQuantizationPatterns(MLIRContext *context, +                                            RewritePatternSet *patterns); + +// Creates an instance of the ConvertTFQuantTypes pass, which will convert TF +// qint types to int types and surround TF UniformQuantized ops with qint <-> +// int casts. +std::unique_ptr<OperationPass<func::FuncOp>> CreateConvertTFQuantTypesPass(); + +// Creates an instance of the VerifyQuantLegalization pass, which verifies all +// quant ops and types are lowered. +std::unique_ptr<OperationPass<func::FuncOp>> +CreateVerifyQuantLegalizationPass(); + +// Add all passes for lowering TF quant ops and types to MHLO int. +void AddQuantizationLoweringPasses(mlir::OpPassManager &pm); + +// Creates an instance of OptimizeIntGraphPass, which optimizes the int graph +// lowered from the quantized graph.
+std::unique_ptr> CreateOptimizeIntGraphPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_CONVERTTFQUANTOPSTOMHLO +#define GEN_PASS_DECL_CONVERTTFQUANTTYPES +#define GEN_PASS_DECL_VERIFYQUANTLEGALIZATION +#define GEN_PASS_DECL_OPTIMIZEINTGRAPH +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h.inc" +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_BRIDGE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h new file mode 100644 index 00000000..d13c589c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -0,0 +1,62 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Creates a pass that quantizes weight component of StableHLO graph. +std::unique_ptr> CreateQuantizeWeightPass( + const ::stablehlo::quantization::QuantizationComponentSpec& + quantization_component_spec = {}); + +// Converts a serialized StableHLO module to bfloat16 and output serialized +// module. +absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( + StringRef serialized_stablehlo_module); + +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const ::stablehlo::quantization::QuantizationSpecs& quantization_specs); + +// Creates a pass that inserts CalibrationStatisticsSaverOp. +std::unique_ptr> +CreateInsertCalibrationStatisticsSaverPass( + StringRef calibration_data_dir, + const std::vector& aggregator_ops_to_ignore); + +// Adds generated pass default constructors or options definitions. +#define GEN_PASS_DECL +// Adds generated pass registration functions. 
+#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h new file mode 100644 index 00000000..5e45d6d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h @@ -0,0 +1,258 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir::quant::stablehlo { + +// Checks whether an op is connected with a quantized composite function. If +// not, the same-scale op will not be quantized. This decision is based on the +// current assumption that the performance gain of the same-scale op itself +// could not beat the overhead of the quantize and dequantize routines need to +// be added around that op. When the assumption changes, this policy might +// change as well. 
+bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op); + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. +// +// Quantization method is determined by the `_quantization_method` attributes +// attached to each quantizable units. +// +// Template constraints are imposed as follows: +// +// * `QuantizeOpT` should have only one operand. +// * `DequantizeOpT` should have only one result. +template () && + DequantizeOpT::template hasTrait()>> +class StableHloQuantizationPattern : public OpRewritePattern { + public: + explicit StableHloQuantizationPattern(MLIRContext* context) + // Set the benefit to a large number so that it is always preferred. + : OpRewritePattern(context, /*benefit=*/300) {} + + private: + // Collects all candidate ops for quantization, which are the + // `dequantize_op`'s users. + FailureOr> CollectCandidateOps( + DequantizeOpT dequantize_op) const { + auto users = dequantize_op->getResult(0).getUsers(); + return SmallVector(users.begin(), users.end()); + } + + // Collects all candidate ops for quantization, which is the operand of + // `quantize_op`. If successful, this always returns one element which is the + // operand of `quantize_op`. + FailureOr> CollectCandidateOps( + QuantizeOpT quantize_op) const { + Value operand = quantize_op->getOperand(0); + if (QuantizedType::getQuantizedElementType(operand.getType())) { + // The input of the quantize op has already been quantized, i.e. + // rescale. + return failure(); + } + + Operation* operand_op = operand.getDefiningOp(); + if (operand_op == nullptr) { + // When `QuantizeOpT`'s operand does not have a defining op, it means it + // is a `BlockArgument`. The pattern does not match if there is no op to + // quantize. + return failure(); + } + + if (operand_op->hasTrait()) { + // Const-> QuantizeOp pattern will be handled separately. + return failure(); + } + + return SmallVector{operand_op}; + } + + LogicalResult matchAndRewrite(RootOpT op, + PatternRewriter& rewriter) const override { + // Collect all the candidate ops for quantization. + FailureOr> candidate_ops = CollectCandidateOps(op); + // Safeguard check to ensure that there is at least one quantizable op. + if (failed(candidate_ops) || candidate_ops->empty()) return failure(); + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* candidate_op : *candidate_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (isa(candidate_op)) { + return failure(); + } + + // If the op is terminator, we shouldn't rewrite. + if (candidate_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizableStableHlo(candidate_op)) { + return failure(); + } + + if (GetStableHloQuantConstraints(candidate_op) + ->has_same_scale_requirement && + !IsConnectedWithQuantizedCompsiteFunction(candidate_op)) { + return failure(); + } + + // Ops with regions will be quantized in a separate pattern. + if (isa(candidate_op)) { + return failure(); + } + + const bool weight_only_quantizable = + IsWeightOnlyQuantizableOp(*candidate_op); + + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. 
+ SmallVector inputs; + inputs.reserve(candidate_op->getNumOperands()); + for (auto operand : candidate_op->getOperands()) { + Type operand_type = operand.getType(); + if (mlir::isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + auto ele_type = + mlir::cast(operand.getType()).getElementType(); + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else if (weight_only_quantizable) { + inputs.push_back(operand); + } else { + return failure(); + } + } + + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(candidate_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(candidate_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none type + // results. + if (mlir::isa(result_type)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + mlir::cast(result.getType()).getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && isa(*result.user_begin())) { + auto user = cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (weight_only_quantizable) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + rewriter.setInsertionPointAfter(candidate_op); + OperationState new_state(candidate_op->getLoc(), + candidate_op->getName().getStringRef(), inputs, + output_types, candidate_op->getAttrs()); + for (int i = 0; i < candidate_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + Operation* quantized_op = rewriter.create(new_state); + if (candidate_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(candidate_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + return success(); + } +}; + +// Populates common patterns that are usually compute heavy or memory bound. +void PopulateCommonQuantizationPatterns( + MLIRContext& ctx, RewritePatternSet& patterns, + bool enable_per_channel_quantized_weight); + +// Populates conversion patterns for all quantizable ops, including +// ops that are not compute-heavy and data movement ops. 
+void PopulateAllQuantizablePatterns(MLIRContext& ctx, + RewritePatternSet& patterns); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h new file mode 100644 index 00000000..a8a59d1c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_PASSES_H_ + +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep + +namespace mlir::quant::stablehlo::testing { + +// Identifies predefined `QuantizationSpecs` for +// `TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass`. The pass +// option argument is specified in line comments for each enum value. +enum class TestQuantizationSpecs { + kEmpty, // empty + kDisableAllDotGeneral, // disable-all-dot-general + kStaticRangePtqToAll, // static-range-ptq-to-all + kStaticRangePtqToComputeHeavy, // static-range-ptq-to-compute-heavy +}; + +// Adds generated pass default constructors or options definitions. +#define GEN_PASS_DECL +// Adds generated pass registration functions. +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h.inc" + +} // namespace mlir::quant::stablehlo::testing + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h new file mode 100644 index 00000000..ff724aba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h @@ -0,0 +1,64 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
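A sketch of wiring the pattern-population helpers above into a greedy rewrite, which is a common way to apply a `RewritePatternSet`; whether the actual quantize pass uses the greedy driver is not shown in this diff.

#include <utility>

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h"

inline mlir::LogicalResult ApplyQuantizationPatterns(mlir::ModuleOp module_op) {
  mlir::MLIRContext& ctx = *module_op.getContext();
  mlir::RewritePatternSet patterns(&ctx);
  mlir::quant::stablehlo::PopulateCommonQuantizationPatterns(
      ctx, patterns, /*enable_per_channel_quantized_weight=*/true);
  mlir::quant::stablehlo::PopulateAllQuantizablePatterns(ctx, patterns);
  return mlir::applyPatternsAndFoldGreedily(module_op, std::move(patterns));
}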
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PYTHON_PYWRAP_QUANTIZATION_LIB_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PYTHON_PYWRAP_QUANTIZATION_LIB_H_ + +// Contains mirror functions from StableHLO Quantizer to be exposed to python +// via `pywrap_quantization`. + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace stablehlo::quantization::pywrap { + +// Function used by the pywrap_quantization module to mirror +// `::mlir::quant::stablehlo::QuantizeStaticRangePtq`. +absl::Status PywrapQuantizeStaticRangePtq( + absl::string_view src_saved_model_path, + absl::string_view dst_saved_model_path, const QuantizationConfig& config, + const std::vector& signature_keys, + const absl::flat_hash_map& + signature_def_map, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); + +// Function used by the pywrap_quantization module to mirror +// `::mlir::quant::stablehlo::QuantizeWeightOnlyPtq`. +absl::Status PywrapQuantizeWeightOnlyPtq( + absl::string_view src_saved_model_path, + absl::string_view dst_saved_model_path, const QuantizationConfig& config, + const std::vector& signature_keys, + const absl::flat_hash_map& + signature_def_map, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); + +// Function used by the pywrap_quantization module to mirror +// `::stablehlo::quantization::PopulateDefaults`. +QuantizationConfig PywrapPopulateDefaults( + const QuantizationConfig& user_provided_config); + +// Function used by the pywrap_quantization module to mirror +// `::stablehlo::quantization::ExpandPresets`. +QuantizationConfig PywrapExpandPresets(const QuantizationConfig& config); + +} // namespace stablehlo::quantization::pywrap + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PYTHON_PYWRAP_QUANTIZATION_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h new file mode 100644 index 00000000..d754be94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
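A sketch of the config-preparation flow these pywrap mirrors suggest: fill in defaults, then expand presets into concrete specs. The relative order of the two calls and what each populates are assumptions here; the config is left default-constructed.

#include "tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"

inline ::stablehlo::quantization::QuantizationConfig PrepareConfig(
    const ::stablehlo::quantization::QuantizationConfig& user_config) {
  namespace pywrap = ::stablehlo::quantization::pywrap;
  // Assumed ordering: defaults first, then preset expansion.
  const ::stablehlo::quantization::QuantizationConfig with_defaults =
      pywrap::PywrapPopulateDefaults(user_config);
  return pywrap::PywrapExpandPresets(with_defaults);
}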
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace stablehlo { +namespace quantization { +// Adds passes for quantization of individual quantizable components. +// (i.e. activation, weight, bias) +void AddQuantizationPasses(mlir::PassManager& pass_manager, + const QuantizationOptions& quantization_options); + +} // namespace quantization +} // namespace stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h new file mode 100644 index 00000000..2873b071 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h @@ -0,0 +1,32 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_BFLOAT16_TYPE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_BFLOAT16_TYPE_H_ + +#include "mlir/IR/Types.h" // from @llvm-project + +namespace mlir::quant::stablehlo { + +// Returns true if the type or its element type is a float type with bit_width +// > 16. +bool IsLargeFloatType(Type type); + +// Converts large float type to bfloat16. Otherwise returns original type. +Type ToBfloat16Type(Type type); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_BFLOAT16_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h new file mode 100644 index 00000000..691d4c35 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
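A small sketch exercising the bfloat16 helpers above on builder-created types; the expected results follow directly from the header comments (only float types wider than 16 bits are converted).

#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Types.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h"

inline void Bfloat16TypeExample() {
  mlir::MLIRContext ctx;
  mlir::OpBuilder builder(&ctx);

  const mlir::Type f32 = builder.getF32Type();
  const bool is_large = mlir::quant::stablehlo::IsLargeFloatType(f32);  // true
  const mlir::Type bf16 = mlir::quant::stablehlo::ToBfloat16Type(f32);  // bf16

  const mlir::Type f16 = builder.getF16Type();
  // Already 16 bits wide, so it is returned unchanged.
  const mlir::Type unchanged = mlir::quant::stablehlo::ToBfloat16Type(f16);
  (void)is_large;
  (void)bf16;
  (void)unchanged;
}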
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ + +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +using ::stablehlo::quantization::QuantizationOptions; + +// Returns QuantizationOptions filled with detailed specs when the user +// specifies an optional preset method name. The preset methods are defined in +// quantization_options.proto. This function will only be executed if a user +// gives a preset method, not a custom method. +QuantizationOptions FillPresetQuantizationOptions( + QuantizationOptions quantization_options); + +// Looks up the activation bit width in the custom quantization method and +// writes it to `bit_width`. Returns success if such information exists; +// otherwise, returns failure. +LogicalResult GetActivationBitWidth(QuantizationOptions quantization_options, + int* bit_width); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils.h new file mode 100644 index 00000000..f63e06a3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils.h @@ -0,0 +1,32 @@ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_MATH_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_MATH_UTILS_H_ + +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <cstdint> + +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir::quant::stablehlo { + +// Decomposes a given floating point value `double_multiplier` into a +// normalized, quantized fraction and an integral power of two (`shift`). +LogicalResult QuantizeMultiplier(double double_multiplier, + int32_t& quantized_fraction, int32_t& shift); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_MATH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h new file mode 100644 index 00000000..81dfb576 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
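`QuantizeMultiplier` above follows the usual fixed-point recipe: split a real-valued multiplier into an integer fraction and a power-of-two shift. A self-contained sketch of that decomposition; the 15-bit fraction width and the `DecomposeMultiplier` name are illustrative choices, not necessarily what the TensorFlow implementation uses:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decomposes m into q and shift such that m ~= q * 2^(shift - 15), i.e. q is
// a 15-bit fixed-point fraction of the multiplier.
bool DecomposeMultiplier(double multiplier, int32_t& quantized_fraction,
                         int32_t& shift) {
  if (multiplier < 0.0) return false;  // Only non-negative scales expected.
  int exponent = 0;
  const double fraction = std::frexp(multiplier, &exponent);  // in [0.5, 1)
  quantized_fraction = static_cast<int32_t>(std::round(fraction * (1 << 15)));
  shift = exponent;
  // Rounding may push the fraction up to exactly 2^15; renormalize.
  if (quantized_fraction == (1 << 15)) {
    quantized_fraction /= 2;
    ++shift;
  }
  return true;
}

int main() {
  int32_t q = 0, s = 0;
  DecomposeMultiplier(0.00372, q, s);  // e.g. an output rescale factor
  // Prints roughly: q=31206 shift=-8 approx=0.003720
  std::printf("q=%d shift=%d approx=%f\n", q, s, std::ldexp(q, s - 15));
}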
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_STABLEHLO_TYPE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_STABLEHLO_TYPE_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir::quant::stablehlo { + +// Checks if an op is from StableHLO dialect. +inline bool IsStablehloOp(Operation* op) { + return op->getDialect()->getNamespace() == + mlir::stablehlo::StablehloDialect::getDialectNamespace(); +} + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_STABLEHLO_TYPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.h new file mode 100644 index 00000000..9eab6b00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_TF_TYPE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_TF_TYPE_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir::quant::tensorflow { + +// GetDenseAttrFromTensorProtoAttr returns DenseElementsAttr from tensor proto. +FailureOr GetDenseAttrFromTensorProtoAttr( + llvm::StringRef mangled_tensor_proto, TensorType result_tensor_type); + +// Check if a type is TF qint type. +bool IsTFQintType(Type type); + +// Convert qint type to the corresponding int type. Return original type if it +// is not qint type. +Type GetIntTypeFromTFQint(Type type); + +// Check if an op is TF UniformQuantized op. 
+bool IsTFUniformQuantizedOp(Operation* op); + +} // namespace mlir::quant::tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_TF_TYPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h new file mode 100644 index 00000000..f6a5da84 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h @@ -0,0 +1,52 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_AVERAGE_MIN_MAX_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_AVERAGE_MIN_MAX_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace tensorflow { +namespace calibrator { + +using ::stablehlo::quantization::CalibrationOptions; + +// AverageMinMax calibration calculates the average of min and max values. +// average of min = sum of min values / number of samples +// average of max = sum of max values / number of samples +class CalibrationStatisticsCollectorAverageMinMax + : public CalibrationStatisticsCollectorBase { + public: + explicit CalibrationStatisticsCollectorAverageMinMax() { ClearData(); } + + void ClearData() override; + + void Collect(float min, float max, + absl::Span histogram) override; + + std::optional GetStatistics() const override; + + private: + CalibrationStatistics::AverageMinMaxStatistics average_min_max_statistics_; +}; +} // namespace calibrator +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_AVERAGE_MIN_MAX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h new file mode 100644 index 00000000..9ce6a819 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_BASE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_BASE_H_ + +#include <cstdint> +#include <optional> + +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" + +namespace tensorflow { +namespace calibrator { + +// Abstract base class for CalibrationStatisticsCollector subclasses such as +// CalibrationStatisticsCollectorMinMax. Each subclass collects different +// statistics based on the calibration method. +class CalibrationStatisticsCollectorBase { + public: + // Collects data for calibration. + virtual void Collect(float min, float max, + absl::Span<const int64_t> histogram) = 0; + + virtual void ClearData() = 0; + // Returns the statistics needed for a given calibration method. + virtual std::optional<CalibrationStatistics> GetStatistics() const = 0; + virtual ~CalibrationStatisticsCollectorBase() = default; +}; + +} // namespace calibrator +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h new file mode 100644 index 00000000..84f641a5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h @@ -0,0 +1,66 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
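The collector interface above (`Collect`, `ClearData`, `GetStatistics`) is what each calibration method implements, and the AverageMinMax collector declared earlier simply averages the per-batch min and max values it has seen. A standalone sketch of that bookkeeping, using a plain pair instead of the `CalibrationStatistics` proto so it compiles on its own; the class name and member layout are illustrative only:

#include <cstdint>
#include <cstdio>
#include <optional>
#include <utility>

class AverageMinMaxSketch {
 public:
  void ClearData() { min_sum_ = 0.0f; max_sum_ = 0.0f; num_samples_ = 0; }

  // Mirrors Collect(min, max, histogram); the histogram argument is unused by
  // this calibration method, so it is omitted here.
  void Collect(float min, float max) {
    min_sum_ += min;
    max_sum_ += max;
    ++num_samples_;
  }

  // average of min = sum of min values / number of samples (likewise for max).
  std::optional<std::pair<float, float>> GetStatistics() const {
    if (num_samples_ == 0) return std::nullopt;
    return std::make_pair(min_sum_ / num_samples_, max_sum_ / num_samples_);
  }

 private:
  float min_sum_ = 0.0f;
  float max_sum_ = 0.0f;
  int64_t num_samples_ = 0;
};

int main() {
  AverageMinMaxSketch collector;
  collector.Collect(-1.0f, 4.0f);
  collector.Collect(-3.0f, 2.0f);
  if (const auto stats = collector.GetStatistics()) {
    // Prints: avg_min=-2.0 avg_max=3.0
    std::printf("avg_min=%.1f avg_max=%.1f\n", stats->first, stats->second);
  }
}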
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_HISTOGRAM_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_HISTOGRAM_H_ + +#include +#include +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace tensorflow { +namespace calibrator { + + +class CalibrationStatisticsCollectorHistogram + : public CalibrationStatisticsCollectorBase { + public: + explicit CalibrationStatisticsCollectorHistogram() { ClearData(); } + + void ClearData() override; + + void Collect(float min, float max, + absl::Span histogram) override; + + std::optional GetStatistics() const override; + + private: + // Expands the histogram so the lower_bound and upper_bound can fit in the + // histogram. Returns the indexes associated to those values. + std::pair ExpandHistogramIfNeeded(float lower_bound, + float upper_bound); + + // hist_freq_[i] saves frequency of range [bins[i], bins[i + 1]). + // bins[i] = lower_bound_ + bin_width_ * i + // bins[i + 1] = lower_bound_ + bin_width_ * (i + 1) + std::deque hist_freq_; + + // Width of bin + float bin_width_; + + // The first bin's left value. [left, right) + float lower_bound_; +}; + +} // namespace calibrator +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_HISTOGRAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h new file mode 100644 index 00000000..8ee545e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
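The histogram collector above keeps `hist_freq_[i]` for the half-open range [`bins[i]`, `bins[i + 1]`) with `bins[i] = lower_bound_ + bin_width_ * i`, so locating the bin for a sample is a single floor division. A small worked example with made-up bounds:

#include <cmath>
#include <cstdio>

int main() {
  const float lower_bound = -2.0f;
  const float bin_width = 0.5f;

  const float value = 1.3f;
  // index = floor((value - lower_bound) / bin_width)
  const int index =
      static_cast<int>(std::floor((value - lower_bound) / bin_width));
  const float bin_left = lower_bound + bin_width * index;
  const float bin_right = lower_bound + bin_width * (index + 1);

  // Prints: value 1.30 falls into bin 6: [1.00, 1.50)
  std::printf("value %.2f falls into bin %d: [%.2f, %.2f)\n", value, index,
              bin_left, bin_right);
}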
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_MIN_MAX_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_MIN_MAX_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace tensorflow { +namespace calibrator { + +using ::stablehlo::quantization::CalibrationOptions; + +// MinMax calibration calculates the global min and global max values. +// global min = min of given sample inputs +// global max = max of given sample inputs +class CalibrationStatisticsCollectorMinMax + : public CalibrationStatisticsCollectorBase { + public: + explicit CalibrationStatisticsCollectorMinMax() { ClearData(); } + + void ClearData() override; + + void Collect(float min, float max, + absl::Span histogram) override; + + std::optional GetStatistics() const override; + + private: + CalibrationStatistics::MinMaxStatistics min_max_statistics_; +}; + +} // namespace calibrator +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_MIN_MAX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h new file mode 100644 index 00000000..884ac938 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONST_OP_SIZE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONST_OP_SIZE_H_ + +#include + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace quant { + +// Returns the size in bytes of the underlying data of `const_op`. If the +// underlying type's size cannot be determined, it assumes 4 bytes per element. 
+int64_t GetSizeInBytes(TF::ConstOp const_op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONST_OP_SIZE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h new file mode 100644 index 00000000..d0a4157b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONSTANT_FOLD_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONSTANT_FOLD_H_ + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace quant { + +// Applies constant folding recursively if the operation and all of its operands +// are foldable. Returns the constants generated by constant-folding or the +// original operation's outputs if not folded. +SmallVector ConstantFoldOpIfPossible(Operation* op); + +// This pattern tries to constant-fold the quantizable operands of supported +// TF operations. +struct ConstantFoldQuantizableOperands : public RewritePattern { + public: + explicit ConstantFoldQuantizableOperands(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONSTANT_FOLD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h new file mode 100644 index 00000000..7ff335fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
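`ConstantFoldQuantizableOperands` above is a plain `RewritePattern`, so it can be driven by MLIR's greedy rewriter like any other pattern. A minimal sketch, assuming the TensorFlow/MLIR build environment; the `FoldQuantizableOperands` wrapper is a hypothetical illustration, not an API from these headers:

#include <utility>

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h"

namespace {

// Applies the constant-folding pattern across a module until fixpoint.
void FoldQuantizableOperands(mlir::ModuleOp module_op) {
  mlir::MLIRContext* ctx = module_op.getContext();
  mlir::RewritePatternSet patterns(ctx);
  patterns.add<mlir::quant::ConstantFoldQuantizableOperands>(ctx);
  if (mlir::failed(mlir::applyPatternsAndFoldGreedily(module_op,
                                                      std::move(patterns)))) {
    module_op.emitWarning("constant folding did not converge");
  }
}

}  // namespace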
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONVERT_ASSET_ARGS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONVERT_ASSET_ARGS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::quant { + +// Converts arguments of the @main function that are bound to +// `tf_saved_model::AssetOp`s into regular tensor args. Returns `AsestFileDef`s +// that associates the arg with the asset. +// +// In detail, this function performs the following: +// * Replaces "tf_saved_model.bound_input" attributes to +// "tf_saved_model.index_path", if the bound input is attached to the +// `tf_saved_model::AssetOp`. +// * Strips the "assets/" prefix of the filename when setting it to +// `AssetFileDef`. +FailureOr> ConvertAssetArgs( + ModuleOp module_op); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_CONVERT_ASSET_ARGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h new file mode 100644 index 00000000..32fb6f89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h @@ -0,0 +1,54 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_ + +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir { +namespace quant { + +// QuantizationUnitLoc uses CallSiteLoc as the base class so it can be printed +// with AsmPrinter and used to set the node name in MLIR to GraphDef exporter. +// The callee is named as `node_name@func_name` with child loc named as +// `op_type` while the caller is the quantization unit. +class QuantizationUnitLoc : public CallSiteLoc { + public: + using QuantizationUnit = + tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizationUnitLoc) + + QuantizationUnitLoc(MLIRContext* context, const QuantizationUnit& unit); + + // Checks if the given location is QuantizationUnitLoc. Users could call + // `isa(loc)` to check if the type matches. 
+ static bool classof(Attribute attr); +}; + +// Finds the QuantizationUnit from location info. +std::optional +FindQuantizationUnitFromLoc(Location loc); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h new file mode 100644 index 00000000..06db2acb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h @@ -0,0 +1,77 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_RUN_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_RUN_PASSES_H_ + +#include + +#include "absl/status/status.h" +#include "absl/strings/str_format.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace quantization { + +// Runs MLIR passes with `module_op`. The passes are added by calling +// `add_passes_func`, which is a callable receiving mlir::PassManager& as its +// only argument. `name` identifies the set of passes added by `add_passes_func` +// and is used for debugging. Changing the `name` does not modify the behavior +// of the passes. +// +// It will try to dump intermediate MLIRs if certain conditions are met. See the +// description from `MaybeEnableIrPrinting` for the details about the +// conditions. +// +// Returns a non-OK status when the pass run fails or it fails to create an MLIR +// dump file. +template +absl::Status RunPasses(const absl::string_view name, FuncT add_passes_func, + mlir::MLIRContext& ctx, mlir::ModuleOp module_op) { + mlir::PassManager pm{&ctx}; + add_passes_func(pm); + + mlir::StatusScopedDiagnosticHandler diagnostic_handler{&ctx}; + TF_RETURN_IF_ERROR(MaybeEnableIrPrinting(pm, name)); + + if (failed(pm.run(module_op))) { + return absl::InternalError( + absl::StrFormat("Failed to run pass: %s. %s", name, + diagnostic_handler.ConsumeStatus().message())); + } + + return absl::OkStatus(); +} + +// Runs MLIR passes with `module_op` on a `pass_manager`. +// +// It will try to dump intermediate MLIRs if certain conditions are met. See the +// description from `MaybeEnableIrPrinting` for the details about the +// conditions. +// +// Returns a non-OK status when the pass run fails or it fails to create an MLIR +// dump file. 
+absl::Status RunPassesOnModuleOp( + std::optional mlir_dump_file_name, + mlir::PassManager& pass_manager, mlir::ModuleOp module_op); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_RUN_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h new file mode 100644 index 00000000..124f2a5b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h @@ -0,0 +1,40 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_SAVE_VARIABLES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_SAVE_VARIABLES_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace tensorflow { +namespace quantization { + +// Saves variables in `module_op` to the checkpoint file inside `prefix`. +// It finds variables that are initialized with "tf.AssignVariableOp" inside the +// initializer function with type "restore_op". The "tf.Const"s used to +// initialize the variables are saved. This function does not modify the +// `module_op`. Returns a list of saved names of the saved variables. +absl::StatusOr> SaveVariablesToCheckpoint( + absl::string_view prefix, mlir::ModuleOp module_op); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_SAVE_VARIABLES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h new file mode 100644 index 00000000..38a9c4fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h @@ -0,0 +1,45 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
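The `RunPasses` template shown above accepts any callable that populates the `PassManager`, so a pipeline can be assembled inline with a lambda. A sketch under that assumption (the chosen passes and the `RunCleanupPipeline` wrapper are arbitrary illustrations; `name` only labels the IR dumps):

#include "absl/status/status.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"

namespace {

absl::Status RunCleanupPipeline(mlir::MLIRContext& ctx,
                                mlir::ModuleOp module_op) {
  return tensorflow::quantization::RunPasses(
      /*name=*/"cleanup_pipeline",
      /*add_passes_func=*/
      [](mlir::PassManager& pm) {
        pm.addPass(mlir::createCanonicalizerPass());
        pm.addPass(mlir::createSymbolDCEPass());
      },
      ctx, module_op);
}

}  // namespace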
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_DEBUGGING_MLIR_DUMP_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_DEBUGGING_MLIR_DUMP_H_ + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace tensorflow { +namespace quantization { + +// Enables IR printing for `pm`. When the passes are run, each pass will dump to +// its own file with prefix `file_name_prefix`. +void EnableIrPrinting(mlir::PassManager &pm, + absl::string_view file_name_prefix); + +// If verbosity level >= 1, this will dump intermediate IRs of passes to a file. +// The dumped mlir files with be under a directory determined by +// the TF_QUANT_MLIR_DUMP_PREFIX env variable. The PassManager will dump to a +// new file for each pass. The file name will have the format +// {file_name_prefix}_{pass_number}_{pass_name}_{before|after}.mlir. +// * `file_name_prefix` is from input. +// * `pass_number` increments from 1 for each pass. +// * `pass_name` is the name of the pass. +// * `before|after` indicates whether the dump occurs before or after the pass. +absl::Status MaybeEnableIrPrinting(mlir::PassManager &pm, + absl::string_view file_name_prefix); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_DEBUGGING_MLIR_DUMP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h new file mode 100644 index 00000000..44c60b61 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h @@ -0,0 +1,61 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functions for quantization specifications of TensorFlow ops. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_OP_QUANT_SPEC_H_ + +#include +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir { +namespace quant { + +// Check if the op has data movement trait. Ops with this trait do not perform +// any computations but just move data and has one result operand. +bool IsOpWithDataMovementTrait(Operation* op); + +// Check if the op is quantizable. Currently, the scope of quantizable op is +// limited to compute intense operations and the ops that supports integer +// operands. 
+bool IsOpWithQuantizableTrait(Operation* op); + +// Check if the op's operand accepts int8 type. +bool IsOpWithInt8TypeOperand(Operation* op); + +// Check if the data is in quantizable precision. Currently, a value in f32 or +// bf16 is quantizable. +bool IsValueWithQuantizablePrecision(Value val); + +std::optional +GetWeightComponentSpec( + const tensorflow::quantization::QuantizationOptions& quantization_options); + +// Returns the spec for the given operation that can be used for both of +// dynamic and static range quantization. +std::unique_ptr GetTFOpQuantSpec(Operation* op); + +// Returns quantization scale specs (fixed output, same scale) for a TF op. +std::unique_ptr GetTfQuantScaleSpec(Operation* op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_OP_QUANT_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h new file mode 100644 index 00000000..bc6031ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file provides a list of supported quantization algorithms in the format +// of "applyQuantization". +// After applying the function, a quantize/dequantize functions are created +// where the body of each function contains a specific quantization algorithm. +// The input of the quantize function has one operand of +// IsValueWithQuantizablePrecision and the output is a tensor with supported +// quantized precision (like int8). For dequantize function, it is the other way +// around. 
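The quantize/dequantize function bodies described above (created via `ApplyUniformQuantization`, declared just below) boil down to standard uniform affine quantization: q = round(x / scale) + zero_point, clamped to the target integer range, and x ~= (q - zero_point) * scale on the way back. A self-contained sketch of that arithmetic for int8, with made-up scale and zero point; the real pass derives these from the weight component spec:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t QuantizeToInt8(float value, float scale, int32_t zero_point) {
  const int32_t q =
      static_cast<int32_t>(std::round(value / scale)) + zero_point;
  return static_cast<int8_t>(std::clamp(q, -128, 127));
}

float DequantizeFromInt8(int8_t q, float scale, int32_t zero_point) {
  return (static_cast<int32_t>(q) - zero_point) * scale;
}

int main() {
  const float scale = 0.05f;
  const int32_t zero_point = -3;
  const float x = 1.234f;
  const int8_t q = QuantizeToInt8(x, scale, zero_point);
  // round(1.234 / 0.05) = 25, 25 + (-3) = 22; dequantizes back to 1.250.
  std::printf("q=%d dequantized=%.3f\n", q,
              DequantizeFromInt8(q, scale, zero_point));
}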
+ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_QUANTIZE_OP_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_QUANTIZE_OP_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace quant { + +std::optional ApplyUniformQuantization( + PatternRewriter& rewriter, TF::ConstOp op, + tensorflow::quantization::QuantizationComponentSpec& weight_spec); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_QUANTIZE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.h new file mode 100644 index 00000000..8a062a16 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functions for quantization specifications of Uniform Quantized ops. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_UNIFORM_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_UNIFORM_OP_QUANT_SPEC_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" + +namespace mlir { +namespace quant { + +// Returns the spec for the given operation that can be used for both of +// dynamic and static range quantization. +std::unique_ptr GetUniformOpQuantSpec(Operation* op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_UNIFORM_OP_QUANT_SPEC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h new file mode 100644 index 00000000..6be6f05a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h @@ -0,0 +1,42 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_CONSTANTS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace quant { + +// Name of the save function. The "tf_quant__" prefix is for avoiding conflict +// with existing function's name. +inline constexpr StringRef kTfQuantSaveFuncName = "tf_quant__save"; + +// Name of the TensorFlow Operation to be fetched to save the variables to +// checkpoint. This save op follows the SavedModel's load semantics, so it +// should return the file prefix of the checkpoint as a string tensor. +inline constexpr StringRef kTfQuantSaveOpName = "tf_quant__save_op"; + +// Name the file prefix string tensor. The tensor is used to identify the prefix +// to the checkpoint where the variables are saved / loaded. This may be present +// in a function argument's "tf_saved_model.index_path" attribute to identify +// the file prefix function argument. +inline constexpr StringRef kTfFilePrefix = "__tf_file_prefix"; + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h new file mode 100644 index 00000000..d42ad360 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h @@ -0,0 +1,32 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project + +namespace mlir { +namespace quant { + +// Adds a new input name to the `inputs` field of the `tf.entry_function` +// attribute if the attribute exist in the given function. Otherwise, no +// attribute is modified. 
+void AddEntryFunctionInput(StringRef input_name, func::FuncOp func_op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h new file mode 100644 index 00000000..9a0084ef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h @@ -0,0 +1,250 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_PASSES_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir { +namespace quant { + +// Creates a main function if it doesn't exist in the module. This is a +// workaround to make ConvertMlirToGraphdef work for multi-signatures graphs. +// TODO(b/204265523): Removes this pass after the exporting MLIR to SavedModel +// path is available. +std::unique_ptr> CreateInsertMainFunctionPass(); + +// Converts FakeQuant ops to quant.qcast and quant.dcast (QDQ) pairs. +std::unique_ptr> CreateConvertFakeQuantToQdqPass(); + +// Lifts the quantizable spots as composite functions. +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const tensorflow::quantization::QuantizationOptions& quant_options); + +// Apply graph optimizations such as fusing and constant folding to prepare +// lifting. +std::unique_ptr> CreatePrepareLiftingPass( + tensorflow::quantization::OpSet target_opset); + +// Lifts the dynamic range quantizable spots as composite functions. +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsDRQPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet op_set, int min_num_elements_for_weights); + +// Replaces tf.CustomAggregator ops with quant.Stats ops for finalizing the +// calibration procedure. +std::unique_ptr> +CreateConvertCustomAggregationOpToQuantStatsPass(); + +// Inserts quantized function library. 
+std::unique_ptr> CreateInsertQuantizedFunctionsPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet target_opset); + +// Inserts custom aggregation operators for the calibration procedure. +std::unique_ptr> +CreateInsertCustomAggregationOpsPass( + const ::stablehlo::quantization::CalibrationOptions& calib_opts); + +// Replaces composite functions with quantized composite functions. After this +// pass runs, functions in the given graph will be replaced with their quantized +// versions. By doing so, the quantization will be applied to the given input. +// mlir_dump_file_prefix is an optional field that is used for debugging to save +// mlir dump files. +std::unique_ptr> CreateQuantizeCompositeFunctionsPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet target_opset, + bool enable_per_channel_quantization, int min_num_elements_for_weights, + bool enable_legacy_weight_only = false, + std::optional mlir_dump_file_prefix = + std::nullopt); + +// Converts dequantize-(quantizable) call-quantize pattern to a single call op +// that has quantized input and output types. It is expected for this pass to +// emit illegal IR with unsupported quantized input and output types. The +// pass following immediately after this one will be responsible for legalizing +// input and output types by unwrapping quantization parameters. +std::unique_ptr> CreateQuantizePass(); + +// Overloading of CreateQuantizePass which takes QuantizationSpecs. +std::unique_ptr> CreateQuantizePass( + QuantizationSpecs quant_specs, + tensorflow::quantization::OpSet target_opset); + +// Creates an instance of the PrepareQuantize pass, which will perform similar +// transformations as TFL::PrepareQuantizePass. +std::unique_ptr> CreatePrepareQuantizePass( + const QuantizationSpecs& quant_specs, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method); + +// Creates an instance of the PrepareQuantizeDRQ pass, which will +// perform similar transformations as TFL::PrepareQuantizeDynamicRangePass. +std::unique_ptr> CreatePrepareQuantizeDRQPass( + const QuantizationSpecs& quant_specs, + tensorflow::quantization::OpSet op_set); + +// Creates an instance of the PreprocessOp pass, which will perform op +// preprocessing to allow multi-axis quantization, prior to quantization. +std::unique_ptr> CreatePreprocessOpPass( + tensorflow::quantization::OpSet op_set, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +// Creates an instance of the PostQuantize pass, which will remove unnecessary +// ops from the final quantized graph. +std::unique_ptr> CreatePostQuantizePass(); + +// Applies optimization patterns after quantization. +std::unique_ptr> CreateOptimizePass(); + +// Creates an instance of the ReplaceCastHacksWithTFXLAOpsPass, which will +// replace mixed-type convolution and matmul cast hacks by XLA Conv2DOp and +// MatmulOp. +std::unique_ptr> +CreateReplaceCastHacksWithTFXLAOpsPass(); + +// Creates a pass that moves & merges initializer function's ops into the @main +// function. This pass should be run on a valid tf_executor dialect. The control +// output of the initializer function for non-variable resource initialization +// will be passed on as a dependency to a new `tf.NoOp`, whose control output +// will be merged into the main function's FetchOp. 
The initializer functions +// will be removed. +// +// Running this pass essentially has the effect of inlining the initializer +// functions into the main graph. This is beneficial when we wish to find and +// fetch the node that restores resources, after the ModuleOp has been exported +// as GraphDef. +std::unique_ptr> +CreateMergeInitializerFunctionOpsToMainPass(); + +// Creates a pass that moves & merges the "@tf_quant__save" function to "@main" +// function. A new `IdentityOp` will be created. It will have control dependency +// to the save function and returns the file_prefix argument (typed +// `tensor`). The file_prefix argument, which can be identified +// if the "tf_saved_model.index_path" attribute has "__tf_file_prefix", will be +// reused if it already exist in @main. Otherwise a new file prefix argument +// will be created. @tf_quant__save function will be erased. +// +// Running this pass essentially has the effect of inlining the @tf_quant__save +// into the main graph. This is beneficial when we wish to find and fetch +// the node that saves the variables, after the ModuleOp has been exported as +// GraphDef. +std::unique_ptr> CreateMergeSaveFunctionOpsToMainPass(); + +// Creates a pass that "unfreezes" ConstOps into variables. Each ConstOp's use +// will be replaced by a VarHandleOp -> ReadVariableOp pattern. The newly +// created variables will be initialized in the session initializer function via +// AssignVariableOps. +std::unique_ptr> CreateUnfreezeConstantsPass(); + +// Creates a pass that duplicates constants that affect the shape of a tensor +// after some computation. +std::unique_ptr> +CreateDuplicateShapeDeterminingConstantsPass(); + +// Creates a pass that creates a RestoreV2 op in the initializer function with +// type "restore_op" that initializes variables from the checkpoint. It finds +// tf.AssignVariableOp(tf.VarHandleOp, tf.Const) patterns in the initializer +// function and replaces tf.Consts with the results of RestoreV2. +std::unique_ptr> CreateInsertRestoreOpPass(); + +// Creates a pass that creates a new function that wraps the newly created +// SaveV2 op. The new function's name is "tf_quant__save". The function accepts +// a single string tensor as argument, which specifies the path to the +// checkpoint to which the variable's tensor values are saved. It finds +// `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` pattern in the initializer +// function of type "restore_op" to identify the VarHandleOps that should be +// saved using the SaveV2 op. +std::unique_ptr> CreateInsertSaveOpPass(); + +// Creates a pass that marks functions with the attribute `tf._noinline = true` +// to avoid being inlined by the `InlinerPass`. `noinline_functions` is the name +// of the functions to mark. +std::unique_ptr> CreateMarkFunctionsNoinlinePass( + ArrayRef noinline_functions); + +// Removes `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns from the +// initializer function (type = "restore_op"). +// Note: initializing values (`tf.Const`s) will be removed and this may result +// in an information loss and uninitialized variables eventually. Make sure that +// this effect is desired (e.g. there is a `tf.RestoreV2Op` that restores the +// variables instead). +std::unique_ptr> +CreateRemoveVariableInitializationByConstPass(); + +// Creates a pass that converts Tensorflow Xla ops to non-Xla ops. 
+std::unique_ptr> CreateConvertTfXlaOpToTfOpPass(); + +// Creates a pass that converts TPU models for CPU by removing TPU related ops +// such as TPUPartitionedCall, TPUReplicatedOp, etc. The TF quantizer does not +// work with models specifically designed for TPU, so this pass makes the input +// TPU model compatible with the TF quantizer by rewriting the TPU ops. The +// output model of this pass is expected to be ready for the TF quantizer. +std::unique_ptr> CreateConvertTpuModelToCpuPass(); + +// Creates a pass that casts BFloat16 operations to Float32 operations. This +// pass is a part of the ConvertTpuModelToCpu pass to support BF16 optimized TPU +// model quantization. +std::unique_ptr> CreateCastBf16OpsToF32Pass(); + +// Creates a pass that lifts HashTable ops as function arguments. In the graph +// execution mode, resource ops with the same `shared_name` attribute point to +// the same underlying resource. This is not true in the eager execution mode. +// Lifting resource ops as arguments will help unifying them across functions. +std::unique_ptr> CreateLiftHashTableOpsAsArgsPass(); + +// Creates a pass that merges duplicate resource ops in each function. Two +// resource ops are considered duplicated if they have the same `shared_name`. +std::unique_ptr> +CreateMergeDuplicateResourceOpsPass(); + +// Apply quantization to weights based on the provided schemes. +std::unique_ptr> CreateQuantizeWeightsPass( + const tensorflow::quantization::QuantizationOptions& quant_options); + +// Propagate quantized type through allowed ops. +std::unique_ptr> CreatePropagateQuantizeTypePass(); + +// Create a pass that inserts dump tensor to quantizable layer's output. +std::unique_ptr> CreateAddDumpTensorOpPass( + ::stablehlo::quantization::DebuggerConfig::DebuggerType debugger_type, + std::string log_dir_path); + +// Creates a pass that add QuantizationUnitLoc to quantizable layers. +std::unique_ptr> CreateAddQuantizationUnitLocPass(); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h new file mode 100644 index 00000000..8fe144d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_REMOVE_IDENTITY_OP_PATTERN_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_REMOVE_IDENTITY_OP_PATTERN_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace quant { + +// Copied from tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc. +// By removing identity ops, constant operands with dynamic shapes have static +// shape information which is necessary for correct pattern matching in this +// pass. +struct RemoveIdentity : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::IdentityOp identity, + PatternRewriter &rewriter) const override; +}; + +} // namespace quant +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_REMOVE_IDENTITY_OP_PATTERN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h new file mode 100644 index 00000000..6c8ad1ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h @@ -0,0 +1,52 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_QUANT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_QUANT_OPS_H_ + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h.inc" + +namespace mlir { +namespace quant { + +// Function to register TensorFlow Uniform Quantized ops. +void RegisterOps(); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_QUANT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h new file mode 100644 index 00000000..fbba7247 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h @@ -0,0 +1,114 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow::quantization { + +// Declares pure virtual member functions for a python-side derived class to +// override. This allows calling python implementations from the C++ layer. +// Member functions should be pure not stateful; they should not access or rely +// on member fields. +class PyFunctionLibrary { + public: + virtual ~PyFunctionLibrary() = default; + + // Saves `exported_model` to `dst_saved_model_path` as SavedModel. + // `src_saved_model_path` is the path to the source SavedModel from which the + // exported model is produced. It is used to copy the asset files to + // `dst_saved_model_path`. `tags` will be attached to the saved + // `MetaGraphDef`. `signature_def_map` will be passed to the + // `add_meta_graph_and_variables` function, which is internally used to add a + // `MetaGraphDef` to save to the SavedModel. + // + // Returns `true` if successful. Returns `std::nullopt` otherwise. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. + // LINT.IfChange(save_exported_model) + virtual std::optional SaveExportedModel( + absl::string_view dst_saved_model_path, + const ExportedModel& exported_model, + absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& + signature_def_map) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:save_exported_model, + // py_function_lib.py:save_exported_model, + // ) + + // Runs calibration on a model saved at `saved_model_path`. `exported_model` + // should be the corresponding exported model resulting from the + // pre-calibration step. `signature_keys` is a set of keys that identify a + // SignatureDef to run the calibration on. `tags` is a set of strings that + // identify the `MetaGraphDef`. `calibration_options` provides configurations + // for the calibration behavior. `representative_dataset` is a python object + // of type `RepresentativeDatasetOrMapping`, which is used to run the + // calibration. + // + // Returns `true` if successful. Returns `std::nullopt` otherwise. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. 
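Because the template arguments were stripped in this hunk, the caller sketch below assumes `SaveExportedModel` returns `std::optional<bool>` and takes `std::unordered_set<std::string>` tags plus an `absl::flat_hash_map<std::string, SignatureDef>` signature map. Those element types, the helper name, and the placeholder values are assumptions for illustration only.

```cpp
// Hypothetical caller of the Python-backed SaveExportedModel() hook.
// Element types of the containers are assumed; they are elided in this diff.
#include <string>
#include <unordered_set>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"

namespace tensorflow::quantization {

bool TrySaveExportedModel(const PyFunctionLibrary& py_lib,
                          const ExportedModel& exported_model,
                          const std::string& src_saved_model_path,
                          const std::string& dst_saved_model_path) {
  // Placeholder tag set and empty signature map, for illustration only.
  const std::unordered_set<std::string> tags = {"serve"};
  const absl::flat_hash_map<std::string, SignatureDef> signature_def_map = {};
  // A std::nullopt return signals a failure on the Python side.
  return py_lib
      .SaveExportedModel(dst_saved_model_path, exported_model,
                         src_saved_model_path, tags, signature_def_map)
      .has_value();
}

}  // namespace tensorflow::quantization
```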
+ // LINT.IfChange(run_calibration) + virtual std::optional RunCalibration( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + bool force_graph_mode_calibration, + const absl::flat_hash_map& + representative_dataset_file_map) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:run_calibration, + // py_function_lib.py:run_calibration, + // ) + + // Retrieves min and max value from `calibration_statistics`, based on the + // calibration method specified by `calibration_options`. + // + // Returns `std::nullopt` if unsuccessful. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. + // LINT.IfChange(get_calibration_min_max_value) + virtual std::optional + GetCalibrationMinMaxValue(const tensorflow::calibrator::CalibrationStatistics& + calibration_statistics, + const ::stablehlo::quantization::CalibrationOptions& + calibration_options) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:get_calibration_min_max_value, + // py_function_lib.py:get_calibration_min_max_value, + // ) +}; + +} // namespace tensorflow::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h new file mode 100644 index 00000000..9e36ce52 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h @@ -0,0 +1,77 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_QUANTIZE_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_QUANTIZE_MODEL_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace quantization { + +// Names of the TensorFlow Quantization steps. These names are used primarily +// for debugging. 
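The quantization entry points declared in the hunk that follows return a serialized exported model wrapped in `absl::StatusOr`; the template argument is elided in this diff and assumed below to be `ExportedModel`. A hypothetical caller of the dynamic-range path, with placeholder signature keys and tags:

```cpp
// Hypothetical wrapper around QuantizeDynamicRangePtq(), declared below.
// The StatusOr payload type and the string element types are assumptions.
#include <string>
#include <unordered_set>
#include <vector>

#include "absl/status/statusor.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h"

namespace tensorflow::quantization {

absl::StatusOr<ExportedModel> RunDynamicRangePtq(
    const std::string& saved_model_path,
    const QuantizationOptions& quant_options) {
  const std::vector<std::string> signature_keys = {"serving_default"};
  const std::unordered_set<std::string> tags = {"serve"};
  return QuantizeDynamicRangePtq(saved_model_path, signature_keys, tags,
                                 quant_options);
}

}  // namespace tensorflow::quantization
```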
+inline constexpr absl::string_view kTfQuantPtqPreCalibrationStepName = + "tf_quant_ptq_pre_calibration"; +inline constexpr absl::string_view kTfQuantPtqPostCalibrationStepName = + "tf_quant_ptq_post_calibration"; +inline constexpr absl::string_view kTfQuantQatStepName = "tf_quant_qat"; +inline constexpr absl::string_view kTfQuantPtqDynamicRangeStepName = + "tf_quant_ptq_dynamic_range"; +inline constexpr absl::string_view kTfQuantWeightOnlyStepName = + "tf_quant_weight_only"; + +absl::StatusOr QuantizeQatModel( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const QuantizationOptions& quantization_options); + +// Applies post-training dynamic-range quantization to the model. +absl::StatusOr QuantizeDynamicRangePtq( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const QuantizationOptions& quantization_options); + +// Applies post-training static-range weight-only quantization to the model. +absl::StatusOr QuantizeWeightOnly( + absl::string_view saved_model_path, + const QuantizationOptions& quantization_options); + +// Applies post-training static-range quantization to the model. +absl::StatusOr QuantizeStaticRangePtq( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const QuantizationOptions& quantization_options, + const absl::flat_hash_map& signature_def_map, + const PyFunctionLibrary& py_function_library, + const absl::flat_hash_map& + representative_dataset_file_map_serialized); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_QUANTIZE_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h new file mode 100644 index 00000000..dd5fe761 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h @@ -0,0 +1,158 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TYPE_CASTERS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TYPE_CASTERS_H_ + +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "pybind11/cast.h" // from @pybind11 +#include "pybind11/detail/common.h" // from @pybind11 +#include "pybind11/pytypes.h" // from @pybind11 +#include "pybind11_abseil/absl_casters.h" // from @pybind11_abseil // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace pybind11::detail { +namespace internal { + +// Serializes a protobuf object. Raises python ValueError if serialization +// fails. +inline std::string Serialize(const tsl::protobuf::Message& protobuf_object) { + const std::string serialized = protobuf_object.SerializeAsString(); + + // Empty string means it failed to serialize the protobuf with an error. See + // the docstring for SerializeAsString for details. + if (serialized.empty()) { + // Show the name of the protobuf message type to provide more information + // and easier debugging. + const absl::string_view descriptor_name = + protobuf_object.GetDescriptor() == nullptr + ? absl::string_view("unknown") + : absl::string_view(protobuf_object.GetDescriptor()->full_name()); + throw py::value_error(absl::StrFormat( + "Failed to serialize protobuf object: %s.", descriptor_name)); + } + + return serialized; +} + +// Handles `ProtoT` (c++) <-> `bytes` (python) conversion. The `bytes` +// object in the python layer is a serialization of `ProtoT`. +// +// The caller of c++ interfaces should make sure to pass valid serialized +// `ProtoT` objects as arguments. Failing to do so results in raising a +// `ValueError`. Similarly, the python implementation of a c++ virtual member +// function that return an `ProtoT` should return a valid serialized `ProtoT`. +// +// See https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html +template >> +struct SerializedProtobufCaster { + public: + PYBIND11_TYPE_CASTER(ProtoT, const_name()); + + // Loads an `ProtoT` instance from a python `bytes` object (`src`). + bool load(handle src, const bool convert) { + auto caster = make_caster(); + // Make sure the user passed a valid python string. + if (!caster.load(src, convert)) return false; + + const absl::string_view serialized_proto = + cast_op(std::move(caster)); + + // NOLINTNEXTLINE: Explicit std::string conversion required for OSS. + return value.ParseFromString(std::string(serialized_proto)); + } + + // Constructs a `bytes` object by serializing `src`. + static handle cast(ProtoT&& src, return_value_policy policy, handle parent) { + // release() prevents the reference count from decreasing upon the + // destruction of py::bytes and returns a raw python object handle. + return py::bytes(Serialize(src)).release(); + } + + // Constructs a `bytes` object by serializing `src`. 
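With the serialized-protobuf casters above in scope, a pybind11 binding can take and return protobuf messages directly; on the Python side they appear as `bytes` holding the serialized message. The module and function names below are hypothetical and exist only to show the mechanism.

```cpp
// Hypothetical pybind11 module relying on the SerializedProtobufCaster
// specializations from type_casters.h. Python passes/receives bytes; C++ sees
// a parsed ExportedModel proto.
#include "pybind11/pybind11.h"  // from @pybind11
#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h"

namespace py = pybind11;

PYBIND11_MODULE(example_quant_binding, m) {
  m.def("echo_exported_model",
        [](tensorflow::quantization::ExportedModel exported_model) {
          // The caster already parsed the incoming bytes; returning the proto
          // serializes it back to bytes for the Python caller.
          return exported_model;
        });
}
```

Serialization failures on the return path surface as `ValueError`, per `Serialize()` above.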
+ static handle cast(const ProtoT& src, return_value_policy policy, + handle parent) { + // release() prevents the reference count from decreasing upon the + // destruction of py::bytes and returns a raw python object handle. + return py::bytes(Serialize(src)).release(); + } +}; + +} // namespace internal + +// The following explicit specializations of protobuf `type_caster`s for +// specific protobuf message types are there to have higher priority over those +// defined in `native_proto_caster.h` during the resolution process. This is +// because the type casters in `native_proto_caster.h`, which allow seamlessly +// exchanging protobuf messages across c++-python boundaries, potentially +// without serialization, fail in the open-source environment. +// Explicitly-specialized type casters for serialized protobufs are added on an +// on-demand basis for quantization library. +// TODO: b/308532051 - Make `native_proto_caster.h` work in the open-source +// environment. + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::ExportedModel> {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::QuantizationOptions> {}; + +template <> +struct type_caster<::stablehlo::quantization::CalibrationOptions> + : public internal::SerializedProtobufCaster< + ::stablehlo::quantization::CalibrationOptions> {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::calibrator::CalibrationStatistics> {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + stablehlo::quantization::QuantizationConfig> {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::RepresentativeDatasetFile> {}; + +} // namespace pybind11::detail + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TYPE_CASTERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h new file mode 100644 index 00000000..3086d705 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace tensorflow { +namespace quantization { + +inline constexpr absl::string_view kTfQuantConstantUnfreezingStepName = + "tf_quant_constant_unfreezing"; +inline constexpr absl::string_view kTfQuantInsertRestoreOpStepName = + "tf_quant_insert_restore_op"; + +absl::Status UnfreezeConstantsAndSaveVariables(absl::string_view checkpoint_dir, + mlir::MLIRContext &ctx, + mlir::ModuleOp module_op); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h new file mode 100644 index 00000000..b9c765c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h @@ -0,0 +1,55 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PASSES_H_ + +#include + +#include "absl/strings/string_view.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace tensorflow { +namespace quantization { + +// mlir_dump_file_prefix is an optional field that is used for debugging to save +// mlir dump files. 
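A minimal caller sketch for `UnfreezeConstantsAndSaveVariables()` declared above. The wrapper and the checkpoint path are hypothetical; the module is assumed to be loaded already.

```cpp
// Hypothetical caller: unfreeze constants in `module_op` and persist the
// resulting variables to a caller-chosen checkpoint directory.
#include "absl/status/status.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h"

namespace tensorflow::quantization {

absl::Status UnfreezeForExport(mlir::MLIRContext& ctx,
                               mlir::ModuleOp module_op) {
  // Variables created by unfreezing are saved here so that a later RestoreV2
  // (see kTfQuantInsertRestoreOpStepName) can re-load them.
  constexpr char kCheckpointDir[] = "/tmp/tf_quant_ckpt";  // hypothetical path
  return UnfreezeConstantsAndSaveVariables(kCheckpointDir, ctx, module_op);
}

}  // namespace tensorflow::quantization
```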
+void AddQuantizeQatPasses(mlir::OpPassManager &pm, + const QuantizationOptions &quantization_options, + std::optional + mlir_dump_file_prefix = std::nullopt); + +void AddQuantizePtqDynamicRangePasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +void AddQuantizeWeightOnlyPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +void AddQuantizePtqPreCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options); + +void AddQuantizePtqPostCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h new file mode 100644 index 00000000..47bed2e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h @@ -0,0 +1,86 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PREPROCESS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PREPROCESS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace quantization { + +// Default MLIR dump file prefix for TensorFlow quantization passes. +inline constexpr absl::string_view kDefaultTfQuantMlirDumpFilePrefix = + "tf_quant"; + +// Preprocesses the `module_op` for quantization. The preprocess steps include +// freezing the variables in the graph into constants. `is_inliner_run` +// determines whether the `InlinerPass` should be run after unfreezing. +// +// `mlir_dump_file_prefix` is primarily used for debugging and does not affect +// the preprocessing behavior. Instructions for producing MLIR dump files are in +// the comments of `tensorflow::quantization::MaybeEnableIrPrinting` function. 
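The `AddQuantizePtqPreCalibrationPasses` and `AddQuantizePtqPostCalibrationPasses` declarations above suggest a two-phase static-range PTQ flow with calibration in between. The sketch assembles the two pipelines; the phase split is inferred from the names, and running calibration between them is left to the driver.

```cpp
// Illustrative assembly of the two static-range PTQ pipelines declared above.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h"

namespace tensorflow::quantization {

void BuildPtqPipelines(mlir::PassManager& pre_calibration_pm,
                       mlir::PassManager& post_calibration_pm,
                       const QuantizationOptions& quant_options) {
  // Phase 1: instrument the model so calibration statistics can be collected.
  AddQuantizePtqPreCalibrationPasses(pre_calibration_pm, quant_options);
  // (Calibration runs outside MLIR, between the two phases.)
  // Phase 2: use the collected ranges to materialize quantized ops.
  AddQuantizePtqPostCalibrationPasses(post_calibration_pm, quant_options);
}

}  // namespace tensorflow::quantization
```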
+absl::Status PreprocessAndFreezeGraph( + absl::string_view mlir_dump_file_prefix, bool is_inliner_run, + const absl::flat_hash_set& noinline_functions, + mlir::ModuleOp module_op, mlir::MLIRContext* context, + std::optional session, bool run_tf_to_stablehlo, + bool deserialize_xla_call_module, + llvm::ArrayRef> input_arg_shapes = {}); + +// Overload of `PreprocessAndFreezeGraph` that uses the default MLIR dump file +// prefix. +inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, + mlir::MLIRContext* context, + std::optional session) { + return PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, + /*is_inliner_run=*/true, /*noinline_functions=*/{}, module_op, context, + session, /*run_tf_to_stablehlo=*/false, + /*deserialize_xla_call_module=*/false, /*input_arg_shapes=*/{}); +} + +// Overload of `PreprocessAndFreezeGraph` that uses the default MLIR dump file +// prefix. +inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, + mlir::MLIRContext* context) { + return PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, + /*is_inliner_run=*/true, /*noinline_functions=*/{}, module_op, context, + nullptr, /*run_tf_to_stablehlo=*/false, + /*deserialize_xla_call_module=*/false, /*input_arg_shapes=*/{}); +} + +// TF->StableHLO has limited support for dynamic shapes. +// Some models can only be converted with explicitly provided input argument +// shapes. +void AddTFToStablehloPasses( + mlir::PassManager& pm, + llvm::ArrayRef> input_arg_shapes = {}); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PREPROCESS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h new file mode 100644 index 00000000..702e1950 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h @@ -0,0 +1,160 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TF-Quant transformation +// passes to work with tf.FakeQuant* ops. 
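Since TF-to-StableHLO lowering has limited dynamic-shape support, `AddTFToStablehloPasses()` above accepts explicit input argument shapes. The element type of `input_arg_shapes` is elided in this diff; the sketch assumes one `llvm::ArrayRef<int64_t>` per model input, and the concrete shape is a placeholder.

```cpp
// Illustrative use of AddTFToStablehloPasses() with explicit, fully static
// input shapes (assumed element type: llvm::ArrayRef<int64_t>).
#include <cstdint>

#include "llvm/ADT/SmallVector.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h"

namespace tensorflow::quantization {

void BuildStablehloLowering(mlir::PassManager& pm) {
  // One entry per model input argument, e.g. a single NHWC image tensor.
  const llvm::SmallVector<int64_t> image_shape = {1, 224, 224, 3};
  const llvm::SmallVector<llvm::ArrayRef<int64_t>> input_arg_shapes = {
      image_shape};
  AddTFToStablehloPasses(pm, input_arg_shapes);
}

}  // namespace tensorflow::quantization
```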
Copied and modified from +// //third_party/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_FAKE_QUANT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_FAKE_QUANT_UTILS_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" + +namespace mlir { +namespace quant { + +template +struct FetchMinMaxAttrs { + using AttrType = FloatAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + min_value = tf_op.getMinAttr(); + max_value = tf_op.getMaxAttr(); + return true; // Successfully matched and fetched. + } +}; + +template +struct FetchConstantMinMaxInputs { + using AttrType = DenseFPElementsAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + Value min = tf_op.getMin(), max = tf_op.getMax(); + if (auto min_id = min.getDefiningOp()) { + min = min_id.getInput(); + } + if (auto max_id = max.getDefiningOp()) { + max = max_id.getInput(); + } + + if (!matchPattern(min, m_Constant(&min_value))) { + return false; + } + if (!matchPattern(max, m_Constant(&max_value))) { + return false; + } + return true; // Successfully matched and fetched. + } +}; + +// Inserts a "quant.qcast" and "quant.dcast" op pair (QDQs) in place of the +// tf.FakeQyantWithMinMax{Vars|VarsPerChannel|Args}Op +// before the op being constant folded. Since the constant +// folding logic will use a "arith.constant" op to replace the +// "tf.FakeQuantWithMinMaxVarsOp", the "quant.qcast" op is used to preserve +// the quantization parameters as a TypeAttr and "quant.dcast" op used to +// convert the output type to the next op. Here are the transformations: +// +// input min cst max cst input +// \ | | | +// \ (tf.Identity) (tf.Identity) => quant.qcast +// \ | | | +// tf.FakeQuantWithMinMaxVars quant.dcast +// | | +// +// Warns if the (most likely unwanted, currently not quite correctly handled) +// case of back-to-back tf.FakeQuant occurs +// +// tf.FakeQuant* +// | +// tf.FakeQuant* +// +template +class ConvertFakeQuantOpToQuantOps { + public: + explicit ConvertFakeQuantOpToQuantOps(bool use_fake_quant_num_bits) + : use_fake_quant_num_bits_(use_fake_quant_num_bits) {} + + FetchMinMax fetch_min_max_; + + using FetchAttrType = typename FetchMinMax::AttrType; + LogicalResult matchAndRewrite(TFFakeQuantOp tf_op, + OpBuilder &rewriter) const { + if (tf_op.getNumBits() != 8) { + return failure(); + } + + // Extract the min/max constant values from the operands. We also consider + // a special case that there are tf.Identity ops between the min/max + // constants and the tf.FakeQuantWithMinMaxVarsOp. 
+ FetchAttrType min_value, max_value; + if (!fetch_min_max_(tf_op, min_value, max_value)) { + return failure(); + } + + Value input = tf_op.getInputs(); + int quant_dim = -1; + auto input_type = mlir::cast(input.getType()); + if (PerAxis) { + if (!input_type.hasRank()) { + tf_op.emitError("The input should have known rank for per-channel op."); + return failure(); + } + // This is a special case that the quant_dim is the last dimensions. + quant_dim = input_type.getRank() - 1; + } + // Use the min/max from the operands and the num_bits and narrow_range + // attribute to create the quantization parameter for the new quantize op. + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.getNumBits()); + BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.getNarrowRange()); + Type res_type = tf_op.getType(); + TypeAttr qtype = quant::GetQuantizedTypeAttr( + rewriter, input_type, min_value, max_value, quant_dim, num_bits, + narrow_range, /*is_signed=*/true, /*legacy_float_scale=*/false, + use_fake_quant_num_bits_); + if (!qtype) { + return failure(); + } + + // Finally, use the quantization parameter to create the quantize and + // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp + // and its users. + auto quantize = rewriter.create( + tf_op.getLoc(), qtype.getValue(), input); + auto dequantize = rewriter.create( + tf_op.getLoc(), res_type, quantize.getResult()); + tf_op.getOutputs().replaceAllUsesWith(dequantize); + + return success(); + } + + bool use_fake_quant_num_bits_; +}; + +// Removes the wrapper of the tf.FakeQuant* ops and creates the quant.qcast +// and quant.dcast pairs before tf.FakeQuant* ops are being folded. +LogicalResult ConvertFakeQuantOps(func::FuncOp func, MLIRContext *ctx, + bool use_fake_quant_num_bits); + +} // namespace quant +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_FAKE_QUANT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.h new file mode 100644 index 00000000..2e573e28 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.h @@ -0,0 +1,29 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_QUANTIZE_OP_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_QUANTIZE_OP_UTILS_H_ + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +namespace mlir { +namespace quant { + +UnrankedTensorType CreateUnknownShapeFromElementType(Type tensor_type); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_QUANTIZE_OP_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h new file mode 100644 index 00000000..922729d9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h @@ -0,0 +1,72 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This header file defines common utils used when transforming TF ops to +// Uniform Quantized ops. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ + +#include "llvm/ADT/StringMap.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" + +namespace mlir::quant { + +LogicalResult FillAttributesForUniformQuantizedDotOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedConvolutionOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedAddOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedClipByValueOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformRequantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h new file mode 100644 index 00000000..80212b9a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used when transforming TF ops to XLA +// ops. 
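The `FillAttributesForUniformQuantized*` helpers declared above are intended to be called from rewrite patterns after the corresponding UniformQuantized* op has been created, to derive its op-specific attributes. The elided `llvm::StringMap` value type is assumed to be `mlir::Attribute`, and the wrapper below is hypothetical.

```cpp
// Hypothetical helper showing the calling convention of the attribute-filling
// utilities above. The StringMap value type is an assumption; the preset
// quantization method is forwarded rather than hard-coded here.
#include "llvm/ADT/StringMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h"

namespace mlir::quant {

LogicalResult FillDotAttrsExample(
    PatternRewriter& rewriter, Operation* new_dot_op,
    llvm::StringMap<Attribute>& identifier_to_attr,
    tensorflow::quantization::QuantizationMethod::PresetMethod method) {
  return FillAttributesForUniformQuantizedDotOp(
      rewriter, new_dot_op, identifier_to_attr, method,
      /*enable_per_channel_quantization=*/false);
}

}  // namespace mlir::quant
```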
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_XLA_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_XLA_ATTRIBUTE_UTILS_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project + +namespace mlir::quant { + +// Calculate padding values for XLA ops. +// Padding values for Uniform Quantized ops can be generated with this method as +// well, since it shares the same padding-attribute definition with the XLA ops. +Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, + Value input, Value filter, + int8_t input_zp_value, ArrayAttr strides, + ArrayAttr dilations, + StringAttr conv_padding, + ArrayAttr explicit_paddings, + Value &padding, int num_dims = 4); + +// Given a value of 8-bit type that holds 4-bit data in unpacked format, packs +// it into nibble format along pack_dim. +// If the pack_dim size is odd, adds one element of zero padding and then packs. +Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_XLA_ATTRIBUTE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/register_common_dialects.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/register_common_dialects.h new file mode 100644 index 00000000..d88bcc83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/register_common_dialects.h @@ -0,0 +1,28 @@ +/* Copyright 2023 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_REGISTER_COMMON_DIALECTS_H_ +#define TENSORFLOW_COMPILER_MLIR_REGISTER_COMMON_DIALECTS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project + +namespace mlir { + +// Inserts the common TensorFlow dialects used by offline tools. +void RegisterCommonToolingDialects(mlir::DialectRegistry& registry); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_REGISTER_COMMON_DIALECTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h new file mode 100644 index 00000000..5ba65901 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h @@ -0,0 +1,85 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ + +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TF { +namespace detail { + +// This template defines an aggregate analysis base class, which analyzes a +// module but stores the analysis info per function. +template +class PerFunctionAggregateAnalysis { + public: + using Info = InfoT; + + // Returns the analysis info for the given function. + const Info& GetAnalysisForFunc(func::FuncOp func) const { + auto it = info_map_.find(func); + assert(it != info_map_.end()); + return it->second; + } + + protected: + // Since `InfoT` might be large, DenseMap is used instead of SmallDenseMap to + // avoid stack overflow. + llvm::DenseMap info_map_; +}; + +} // namespace detail + +// Base CRTP class to help write passes that consume a per-function +// aggregate analysis and operate on all non-extern functions (similar to an +// OperationPass, but with no concurrency between functions). The +// derived classes need to provide a runOnFunction() method that accepts the +// function and the analysis information for that function. +template +class PerFunctionAggregateAnalysisConsumerPass + : public PassWrapper< + PerFunctionAggregateAnalysisConsumerPass, + OperationPass> { + public: + static ::mlir::TypeID resolveTypeID() { + static ::mlir::SelfOwningTypeID id; + return id; + } + + private: + void runOnOperation() override { + ModuleOp op = this->getOperation(); + DerivedT& derived = *static_cast(this); + auto& analysis = this->template getAnalysis(); + + for (auto func : op.getOps()) + if (!func.isExternal()) + derived.runOnFunction(func, analysis.GetAnalysisForFunc(func)); + } +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_PER_FUNCTION_AGGREGATE_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h new file mode 100644 index 00000000..c49852c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -0,0 +1,175 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { +namespace detail { +class BacktrackAnalysis; +class BacktrackAnalysisInfo; + +// Resource alias analysis information for a single function. +class ResourceAliasAnalysisInfo { + public: + // Constructs analysis info by analyzing the given function. + ResourceAliasAnalysisInfo(func::FuncOp func, + const BacktrackAnalysis& backtrack_analysis, + SymbolTableCollection& symbol_table_collection); + + ResourceAliasAnalysisInfo(ResourceAliasAnalysisInfo&&) = default; + + // Returns if the analysis fails to resolve a resource-type value. + bool IsUnknownResource(Value resource) const; + + // Returns the set of unique IDs which `resource` could alias. Requires that + // IsUnknownResource(resource) == false. + const llvm::SmallSet& GetResourceUniqueIds(Value resource) const; + + // Returns the set of values that are potentially aliases of `value`. Requires + // `IsUnknownResource(resource) == false`. + llvm::SmallSetVector GetResourceAliases(Value resource) const; + + llvm::SmallSetVector GetValuesForResourceId(int64_t id) const { + auto it = id_to_resource_values_.find(id); + if (it == id_to_resource_values_.end()) { + return {}; // return empty set + } + return it->getSecond(); + } + + // Returns true iff given resource is allocated by op with + // `UniqueResourceAllocation` trait. This can be utilized for while-loop + // parallelization. + bool IsUniqueResourceAllocationId(int64_t resource_id) const { + return unique_resource_allocation_ids_.contains(resource_id); + } + + private: + // Maps resource value to unique ID and vice-versa. Returns true if the + // mapping has changed. + bool AddValueUniqueIDMapping(Value value, int64_t id) { + resource_value_to_ids_[value].insert(id); + return id_to_resource_values_[id].insert(value); + } + + // Returns the set unique Values which map to `id`. + const llvm::SmallSetVector& GetUniqueIdResources(int64_t id) const; + + // Propagates the resource IDs from an input operand to a result. Returns + // true of the mapping has changed. + bool PropagateInputToOutput(const Value& operand, const OpResult& result); + + // Analyzes while loops to compute resource IDs for the loop results. + // `body_info` is the backtrack analysis info for the loop body. + void AnalyzeWhileLoop(Operation* while_op, + const BacktrackAnalysisInfo& body_info); + + // Analyzes tf.Case/tf.If ops to compute resource IDs. 
+ template + void AnalyzeFunctionalCaseOrIfOp(CaseOrIfOp case_or_if_op, + llvm::ArrayRef functions, + const BacktrackAnalysis& backtrack_analysis); + + // Analyzes tf.CaseRegion/tf.IfRegion ops to compute resource IDs. + void AnalyzeRegionCaseOrIfOp(Operation* case_or_if_op, + const BacktrackAnalysis& backtrack_analysis); + + // Maps each resource-type value to a set of unique IDs that it could alias. + llvm::SmallDenseMap, 8> + resource_value_to_ids_; + + // Maps each unique ID to a set of resource-type values that could alias to + // it. This is inverse of `resource_value_to_ids_` map. + llvm::SmallDenseMap, 8> + id_to_resource_values_; + + // Maps MLIR type IDs for resource types to internal resource type IDs. + llvm::SmallDenseMap type_id_to_internal_type_id_; + + // Contains IDs of all resources that are allocated by ops with + // `UniqueResourceAllocation` trait. + llvm::SmallDenseSet unique_resource_allocation_ids_; + + public: + // Resource IDs have the following semantics: + // a) -1 represents an unknown resource (both instance and type unknown) + // b) IDs in range [0,kMaxResourceTypeId] represent resource type IDs; we use + // such IDs when we know the resource type but not the instance + // c) IDs > kMaxResourceTypeId represent resource instance IDs (i.e., we know + // the specific resource instance) + // + // Note: In general, there can be different ops allocating a resource of the + // same type, for one we might assign a resource type ID and for the other + // a resource instance ID. That means, they will be treated as non-aliasing. + // This is correct for all current cases. A problematic case could be if we + // had two ops A and B, A has the `ResourceHandleAllocatorInterface` and B has + // not, and both ops might return a handle to the same resource (depending on + // attributes). In this case, the return value of A would get a different ID + // than the return value of B although both could point to the same resource. + // It seems highly unlikely to encounter such a case but, to be safe, this + // should be revisited for new resource-allocators that might potentially + // break our currently guaranteed correctness. + // For context, we are very conservative here compared to + // `auto_control_deps.py` where it is assumed that allocated resource values + // NEVER alias. We should align our assumptions in the future. + static constexpr int64_t kUnknownResourceId = -1; + static constexpr int64_t kInvalidResourceId = -2; + static constexpr int64_t kMaxResourceTypeId = 9999; +}; + +} // namespace detail + +// An analysis that runs on a module and maps each resource-type value to a +// set of unique IDs representing the possible resources it could alias. +// +// Note that this is not an inter-procedural or inter-regional analysis, i.e., +// each function and region are handled separately and cross-function or cross- +// region aliasing cannot be checked by this analysis. +class ResourceAliasAnalysis : public detail::PerFunctionAggregateAnalysis< + detail::ResourceAliasAnalysisInfo> { + public: + // Constructs analysis by analyzing the given module operation. 
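A sketch of how a module pass might consume the per-function alias information exposed above. The pass itself is hypothetical; the resource-type check is the usual conservative guard before querying the analysis for a value.

```cpp
// Hypothetical module pass querying ResourceAliasAnalysis for each function's
// resource-typed arguments.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"

namespace mlir::TF {

struct ExampleAliasQueryPass
    : public PassWrapper<ExampleAliasQueryPass, OperationPass<ModuleOp>> {
  void runOnOperation() override {
    const auto& alias_analysis = getAnalysis<ResourceAliasAnalysis>();
    getOperation().walk([&](func::FuncOp func) {
      const auto& info = alias_analysis.GetAnalysisForFunc(func);
      for (Value arg : func.getArguments()) {
        // Only resource-typed values are meaningful to the analysis.
        if (!isa<ResourceType>(getElementTypeOrSelf(arg.getType()))) continue;
        // Unknown resources must be treated conservatively by clients.
        if (info.IsUnknownResource(arg)) continue;
        (void)info.GetResourceAliases(arg);
      }
    });
  }
};

}  // namespace mlir::TF
```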
+ explicit ResourceAliasAnalysis(ModuleOp module); +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_ALIAS_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h new file mode 100644 index 00000000..1e68ac41 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h @@ -0,0 +1,85 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_DATAFLOW_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_DATAFLOW_H_ + +#include +#include + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlow/SparseAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlowFramework.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +// Used as a lattice value. +struct ResourceConstructingOps { + explicit ResourceConstructingOps(Operation *op = nullptr); + static ResourceConstructingOps EntryState(MLIRContext *context); + static ResourceConstructingOps EntryState(Value value); + bool operator==(const ResourceConstructingOps &rhs) const { + return ops == rhs.ops; + } + + static ResourceConstructingOps join(const ResourceConstructingOps &lhs, + const ResourceConstructingOps &rhs); + void print(raw_ostream &os) const; + + // The operation(s) which created the resource value. + // IR constructs (i.e., GlobalTensorOp) are not const-correct. 
+ mutable DenseSet ops; +}; + +struct IsComposite { + explicit IsComposite(Operation *op = nullptr); + static IsComposite EntryState(MLIRContext *context); + static IsComposite EntryState(Value value); + bool operator==(const IsComposite &rhs) const { + return is_on_composite_device == rhs.is_on_composite_device; + } + + static IsComposite join(const IsComposite &lhs, const IsComposite &rhs); + void print(raw_ostream &os) const; + + bool is_on_composite_device = false; +}; + +typedef dataflow::Lattice ResourceDataflowState; +typedef dataflow::Lattice IsCompositeDataflowState; + +void LoadResourceDataflowAnalysis(DataFlowSolver &solver); +void LoadIsCompositeDataflowAnalysis(DataFlowSolver &solver); + +} // namespace TF +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_DATAFLOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h new file mode 100644 index 00000000..738d8c1d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h @@ -0,0 +1,79 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_VALUE_TYPED_ANALYZER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_VALUE_TYPED_ANALYZER_H_ + +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { + +class ResourceAnalyzer { + public: + explicit ResourceAnalyzer(ModuleOp module, bool skip_session_init = false); + + bool IsPotentiallyWritten(Value resource) const; + + private: + // Analyze the specified region for resource mutating operations, namely + // TF::AssignVariableOp, if so, set the resource associated as "potentially + // written". + LogicalResult AnalyzeRegion(Region& region); + + // If an op is not one of the handled ones, we assume all resource usages + // within its purview are mutating in nature. + void PropagatePotentiallyWrittenWithinUnhandledOp(Operation* op); + + // Given a Region associated with the callee and operands from the + // corresponding callOp, propagate the potentially written decision to the + // callOp's operands, if the corresponding region's arguments are potentially + // written resources. 
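As an aside, the two Load*DataflowAnalysis entry points above follow the standard MLIR dataflow recipe: load the analysis into a DataFlowSolver, run the solver over the module, then read lattices off individual values. A hedged sketch; the prerequisite dead-code/constant-propagation analyses and the lookupState query are assumptions based on the generic dataflow framework, not statements from this header.

mlir::DataFlowSolver solver;
// The sparse forward framework is normally run together with these analyses.
solver.load<mlir::dataflow::DeadCodeAnalysis>();
solver.load<mlir::dataflow::SparseConstantPropagation>();
mlir::TF::LoadResourceDataflowAnalysis(solver);
if (failed(solver.initializeAndRun(module))) return;

// Read the lattice attached to a resource-typed value.
if (const auto *state =
        solver.lookupState<mlir::TF::ResourceDataflowState>(resource_value))
  state->getValue().print(llvm::errs());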
+ void PropagatePotentiallyWrittenUpFromCallee( + Region& region, Operation::operand_range propagate_to); + + // Marks 'resource' as written. + void SetPotentiallyWritten(Value resource); + + struct ResourceInfo { + bool potentially_written = false; + }; + // Key: Resource Value's + // Value: Information we know about that Value. + // Note that these Value's are in general in different functions. + DenseMap resource_infos_; + // The set of regions we already discovered. + DenseSet discovered_; + // Identifiers about mutable variables. + // All variables are identified by (device, container, shared_name). + DenseSet> + mutable_variables_; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_VALUE_TYPED_ANALYZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h new file mode 100644 index 00000000..feb90de1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -0,0 +1,343 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_SIDE_EFFECT_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_SIDE_EFFECT_ANALYSIS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" + +namespace mlir { +namespace TF { +using ResourceId = int64_t; +inline constexpr ResourceId kUnknownResourceId = + ResourceAliasAnalysis::Info::kUnknownResourceId; +static_assert(kUnknownResourceId < 0, "kUnknownResourceId must be < 0"); + +// Maps group IDs to branch IDs. +using ParallelIdsMap = std::map; +using OpToParallelIdsMap = absl::flat_hash_map; + +namespace detail { + +class OpSideEffectCollector; + +using StackResourceToOps = std::vector< + absl::flat_hash_map>>; + +// Side effect analysis info for a single function. +// +// This class provides an interface for querying control predecessors and +// successors for ops of the given function. This information is computed from +// side effects, using resource alias analysis where possible. 
+// Remarks: +// - Control dependencies model execution order constraints for side-effecting +// ops. For example, two ops writing to the same resource cannot switch their +// order and cannot be executed in parallel. +// - A control dependency (A,B) means that op A has to be executed before op B. +// A is a control predecessor of B, and B is a control successor of A. +// - The control dependencies provided by side effect analysis are guaranteed to +// be sufficient for correct execution but they are not guaranteed to be +// minimal (that means, some control dependencies might not be required for +// correct execution). +class SideEffectAnalysisInfo { + public: + SideEffectAnalysisInfo() = default; + + // Constructs analysis info by analyzing the given function. + SideEffectAnalysisInfo(func::FuncOp func_op, + const OpSideEffectCollector& op_side_effect_collector, + const TF::ResourceAliasAnalysis::Info& alias_analysis, + const OpToParallelIdsMap& op_to_parallel_ids) + : op_side_effect_collector_(op_side_effect_collector), + alias_analysis_(alias_analysis), + op_to_parallel_ids_(op_to_parallel_ids) { + AnalyzeFunction(func_op); + } + + // Constructs analysis info by analyzing the given region. + SideEffectAnalysisInfo(Region* region, + const OpSideEffectCollector& op_side_effect_collector, + const TF::ResourceAliasAnalysis::Info& alias_analysis, + const OpToParallelIdsMap& op_to_parallel_ids) + : op_side_effect_collector_(op_side_effect_collector), + alias_analysis_(alias_analysis), + op_to_parallel_ids_(op_to_parallel_ids) { + AnalyzeRegion(region); + } + + SideEffectAnalysisInfo(SideEffectAnalysisInfo&&) = default; + + // Returns a vector of ops that are direct control predecessors of `op`, + // sorted in program order. If `filter` is provided, only predecessors that + // pass the filter (returning true) will be included. + const llvm::SmallVector& DirectControlPredecessors( + Operation* op) const; + llvm::SmallVector DirectControlPredecessors( + Operation* op, llvm::function_ref filter) const; + + // pass the filter (returning true) will be included. + const llvm::SmallVector& DirectControlSuccessors( + Operation* op) const; + llvm::SmallVector DirectControlSuccessors( + Operation* op, llvm::function_ref filter) const; + + // Returns a vector of ops that are control sinks (i.e. side-effecting ops + // with no control successors). + llvm::ArrayRef ControlSinks() const { + return sorted_control_sinks_; + } + + // Returns a vector with IDs of all resources that might be accessed by `op`. + // This includes both op-based and value-based resources. The bool indicates + // whether a resource is accessed read-only. + const llvm::SmallVector>& GetResourceIds( + Operation* op) const; + + // Returns true iff given resource is allocated by op with + // `UniqueResourceAllocation` trait. This can be utilized for while-loop + // parallelization. + bool IsUniqueResourceAllocationId(ResourceId resource_id) const { + return alias_analysis_.IsUniqueResourceAllocationId(resource_id); + } + + const TF::ResourceAliasAnalysis::Info& GetAliasAnalysis() const { + return alias_analysis_; + } + + private: + // Runs the analysis and populates `sorted_control_predecessors_` and + // `sorted_control_successors_` for `func_op`. Clears `control_predecessors_`. + void AnalyzeFunction(func::FuncOp func_op); + + // Runs the analysis and populates `control_predecessors_` for `region`. + void AnalyzeRegion(Region* region); + + // Runs the analysis and populates `control_predecessors_` for `op`. 
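In a transformation, the info object is typically fetched per function and queried op by op; a short usage sketch follows. The GetAnalysisForFunc accessor comes from the PerFunctionAggregateAnalysis base class, and the bool(Operation*) filter signature is our reading of the stripped template argument, so treat both as assumptions.

const auto &info = side_effect_analysis.GetAnalysisForFunc(func_op);
func_op.walk([&](mlir::Operation *op) {
  // Only keep predecessors that are not ancestors of `op` itself.
  auto preds = info.DirectControlPredecessors(
      op, [&](mlir::Operation *pred) { return !pred->isAncestor(op); });
  for (mlir::Operation *pred : preds) {
    // A pass would materialize an explicit control edge pred -> op here,
    // e.g. as a tf_executor control operand.
  }
});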
+ void AnalyzeOp(Operation* op); + + // Updates `control_predecessors_` for given `resource_id` and `op`. + void AddPredecessorsForAccess(ResourceId resource_id, Operation* op, + bool read_only); + + // Updates resource access for given `resource_id` and `op` in + // `per_resource_access_info_` and `op_to_resource_ids_`. + void UpdateAccess(ResourceId resource_id, Operation* op, bool read_only); + + // Returns true iff the last unknown resource access is already indirectly + // tracked by a previous `resource` access. `read_only` specifies the type of + // access considered. + bool IsUnknownAccessIndirectlyTrackedByResource(ResourceId resource, + bool read_only); + + // Returns a set of resource IDs that have potential dependencies to + // `resource_id` (i.e., there are potential dependencies between the + // resources corresponding to the IDs). + llvm::SmallSet GetDependentIds(ResourceId resource_id, + bool is_fetch_op) const; + + // Returns the parallel ids of the op. + ParallelIdsMap GetParallelIdsMap(Operation* op); + + // Converts from read/write state that relates ops with the same parallel id + // to a set of last accesses for use with other parallel ids. Reads/writes + // between parallel ids are conservatively approximated as writes. + absl::flat_hash_set GetLastWrites(ResourceId resource_id); + + // Sets the read/write state for ops within the same parallel id. + void SetLastWrites(ResourceId resource_id, + absl::flat_hash_set last_writes); + + // Enters a sequence of ops that have the same parallel id. This converts + // stack state to per_resource_access_info_. + void Enter(); + + // Exits a sequence of ops that have the same parallel id. This converts + // per_resource_access_info_ to stack state. + void Exit(); + + // Steps down one parallel nesting level (i.e. increase parallel id size + // by 1). + void Down(); + + // Steps laterally between parallel nesting levels. + void Lateral(); + + // Steps up one parallel nesting level. + void Up(); + + // Transitions nesting levels from `from` to `to`. + void Transition(ParallelIdsMap from, ParallelIdsMap to); + + // Transitions nesting levels from the previous parallel id to `to`. + void TransitionToParallelIdsMap(ParallelIdsMap to); + + // Transitions nesting levels from the previous parallel id to `to`. + void TransitionToOp(Operation* to); + + // Initializes stack state for a function. + void InitFunction(); + + // Uninitializes stack state for a function. + void UninitFunction(); + + // Maps from an op to its control predecessors. + llvm::SmallDenseMap, 8> + control_predecessors_; + // Maps from an op to its control predecessors sorted in program order. + llvm::SmallDenseMap, 8> + sorted_control_predecessors_; + // Maps from an op to its control successors sorted in program order. + llvm::SmallDenseMap, 8> + sorted_control_successors_; + // Side-effecting ops with no control successors in this function. + llvm::SmallVector sorted_control_sinks_; + + // Maps from an op to its resource IDs along with a bool indicating if the + // resource is accessed `read-only`. + llvm::SmallDenseMap>> + op_to_resource_ids_; + llvm::SmallVector> empty_resource_ids_; + + // For predecessor / successor queries on ops we don't track. + llvm::SmallVector empty_operation_set_; + + // Internal per-resource data structure for building the dependencies. + struct PerResourceAccessInfo { + // Last writes to resource before the current op is being analyzed. 
In + // general there can be multiple most recent accesses when ops have + // different parallel ids. + absl::flat_hash_set last_writes; + // Read ops since `last_write` before the current op is being analyzed. + llvm::SmallVector reads_since_last_write; + // Whether a previous access of this resource already tracks the last + // unknown read(s). + bool are_last_unknown_reads_tracked = false; + // Whether a previous write access of this resource already tracks the last + // unknown write. + bool is_last_unknown_write_tracked_by_write = false; + // Whether a previous read or write access of this resource already tracks + // the last unknown write. + bool is_last_unknown_write_tracked = false; + }; + + // Resource access info per resource ID. + llvm::SmallDenseMap + per_resource_access_info_; + + // Hold the last set of reads and writes that + // will be depended on by ops with greater nesting depths. + // For example, the last read/write with parallel_ids `{group0:branch0}` + // lives at stack depth 1 and is depended on by ops with parallel_ids + // of the form `{group0:branch0, ...}`. + // + // We track a set of reads/writes rather than a single read/write because + // multiple parallel ops may be live at any particular point. + StackResourceToOps stack_down_; + + // Hold the last set of reads and writes that will be depended on by + // ops with lesser nesting depths. For example, the last read/writes + // with parallel_ids `{group0:branch0}` and `{group0:branch1}` live at + // stack depth 1 and are depended on by ops with parallel_ids `{}`. + StackResourceToOps stack_up_; + + // Parallel ids of the previously traversed op in the same function. + // The transition from the previous parallel_ids to the current parallel_ids + // determines which stack actions occur. + ParallelIdsMap previous_parallel_ids_; + + const OpSideEffectCollector& op_side_effect_collector_; + const TF::ResourceAliasAnalysis::Info& alias_analysis_; + + // Map op to parallel_ids. If an op is not a key then it has empty parallel + // ids, which corresponds to nesting depth 0. + const OpToParallelIdsMap& op_to_parallel_ids_; +}; + +} // namespace detail + +// An analysis that runs on a function and infers the control predecessors and +// successors for each op, based on side effects on known and unknown resources. +// Side-effecting ops on unknown resources are conservatively treated as +// interfering with all known resource op accesses. It distinguishes accesses +// based on whether they are read-only, and read-only ops do not interfere with +// each other. +// +// If there are nested regions, each region is handled separately, and control +// dependencies are only tracked for ops under the same parent op. +class SideEffectAnalysis : public detail::PerFunctionAggregateAnalysis< + detail::SideEffectAnalysisInfo> { + public: + // Constructs analysis by analyzing the given module operation. Because no + // parallel_ids are given, the program has sequential memory semantics. + explicit SideEffectAnalysis(ModuleOp module_op); + + // Constructs analysis by analyzing the given module operation where + // `op_to_parallel_ids` supplies the group to branch map. This is the map + // that is encoded by op attribute `_parallel_execution_ids`. This map is + // used to code which ops should be executed in parallel and which + // ops should be executed in sequence after ops have been flattened. 
+ // For example, children of + // `tf_device.parallel_execute` will be executed in parallel and + // each replica child of a `tf_device.replicate` will be executed in parallel. + // Otherwise, by default, an op's children will be executed in sequence. + // + // Two ops with the same groups and different branches are considered + // parallel so are not made dependent. For example if `OpA` has parallel_ids + // `{group0:branch0, group1:branch0}` + // and `OpB` has parallel_ids + // `{group0:branch1, graph1:branch0}` + // then `OpA` and `OpB` are executed in parallel because `group0` is common + // with a different branch. + // + // Two ops with the same branches between common groups are executed in + // sequence so are made dependent. For example, if `OpA` has parallel_ids + // `{group0:branch0, group1:branch0}` + // and `OpB` has parallel_ids + // `{group0:branch0, group2:branch0}` + // then `OpA` and `OpB` are executed in sequence because the common groups + // have the same branch. + // + // If an op is not in `op_to_parallel_ids` then it is considered to have the + // empty map from groups to branches. + SideEffectAnalysis(ModuleOp module_op, OpToParallelIdsMap op_to_parallel_ids); + + private: + ResourceAliasAnalysis alias_analysis_; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_SIDE_EFFECT_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/tf_dataflow.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/tf_dataflow.h new file mode 100644 index 00000000..a7d622c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/analysis/tf_dataflow.h @@ -0,0 +1,92 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
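The group/branch rule spelled out above reduces to: two ops may run in parallel iff they share at least one group with different branches; if every common group agrees on the branch, they are ordered. A small helper capturing that reading, assuming ParallelIdsMap maps group names to branch names; the helper is ours, purely illustrative.

// Returns true if ops carrying these parallel ids may execute in parallel,
// per the semantics documented for SideEffectAnalysis.
bool MayExecuteInParallel(const mlir::TF::ParallelIdsMap &a,
                          const mlir::TF::ParallelIdsMap &b) {
  for (const auto &[group, branch] : a) {
    auto it = b.find(group);
    if (it != b.end() && it->second != branch) return true;
  }
  return false;  // No common group disagrees, so the ops are sequenced.
}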
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_TF_DATAFLOW_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_TF_DATAFLOW_H_ + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Analysis/DataFlow/SparseAnalysis.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { + +template +class TensorflowDataflowAnalysis + : public dataflow::SparseForwardDataFlowAnalysis> { + public: + using StateT = dataflow::Lattice; + using dataflow::SparseForwardDataFlowAnalysis< + StateT>::SparseForwardDataFlowAnalysis; + using dataflow::SparseForwardDataFlowAnalysis::getLatticeElement; + ~TensorflowDataflowAnalysis() override = default; + + bool ForwardThroughTFOperation(Operation *op, + ArrayRef operands, + ArrayRef results) { + if (auto cast = dyn_cast(op)) { + this->join(results[0], *operands[0]); + } else if (auto while_op = dyn_cast(op)) { + for (auto ®ion : while_op->getRegions()) { + for (auto [arg, value] : + llvm::zip(region.getArguments(), while_op->getOperands())) { + this->join(getLatticeElement(arg), *getLatticeElement(value)); + } + } + } else if (auto while_op = dyn_cast(op)) { + func::FuncOp cond = SymbolTable::lookupNearestSymbolFrom( + while_op, while_op.getCondAttr()); + func::FuncOp body = SymbolTable::lookupNearestSymbolFrom( + while_op, while_op.getBodyAttr()); + for (auto &arg : while_op->getOpOperands()) { + BlockArgument cond_arg = cond.getArgument(arg.getOperandNumber()); + this->join(getLatticeElement(cond_arg), *getLatticeElement(arg.get())); + BlockArgument body_arg = body.getArgument(arg.getOperandNumber()); + this->join(getLatticeElement(body_arg), *getLatticeElement(arg.get())); + } + } else if (auto graph = dyn_cast(op)) { + for (auto &arg : graph.GetFetch()->getOpOperands()) { + if (arg.getOperandNumber() < graph.getNumResults()) { + auto result = graph.getResult(arg.getOperandNumber()); + this->join(getLatticeElement(result), *getLatticeElement(arg.get())); + } + } + } else if (auto island = dyn_cast(op)) { + for (auto &arg : island.GetYield()->getOpOperands()) { + auto result = island.getResult(arg.getOperandNumber()); + this->join(getLatticeElement(result), *getLatticeElement(arg.get())); + } + } else { + return false; + } + return true; + } + + void setToEntryState(StateT *lattice) override { + this->propagateIfChanged( + lattice, lattice->join(L::EntryState(lattice->getAnchor()))); + } +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_TF_DATAFLOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/dialect_registration.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/dialect_registration.h new file mode 100644 index 00000000..3f8305ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/dialect_registration.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
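A concrete analysis is expected to instantiate the TensorflowDataflowAnalysis template above with its lattice value type and fall back to the framework whenever ForwardThroughTFOperation declines an op. A rough sketch, under the assumption that the MLIR revision in use exposes the LogicalResult-returning visitOperation hook (older revisions return void) and that setAllToEntryStates is the appropriate conservative fallback; the class name is illustrative and need not match the real IsComposite analysis.

class IsCompositeAnalysis
    : public mlir::TF::TensorflowDataflowAnalysis<mlir::TF::IsComposite> {
 public:
  using mlir::TF::TensorflowDataflowAnalysis<
      mlir::TF::IsComposite>::TensorflowDataflowAnalysis;

  mlir::LogicalResult visitOperation(
      mlir::Operation *op, llvm::ArrayRef<const StateT *> operands,
      llvm::ArrayRef<StateT *> results) override {
    if (!ForwardThroughTFOperation(op, operands, results))
      setAllToEntryStates(results);  // Conservative fallback for other ops.
    return mlir::success();
  }
};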
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" // from @llvm-project +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/MLProgram/IR/MLProgram.h" // from @llvm-project +#include "mlir/Dialect/MLProgram/IR/MLProgramAttributes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/core/ir/ops.h" + +namespace mlir { +// Inserts all the TensorFlow dialects in the provided registry. This is +// intended for tools that need to register dialects before parsing .mlir files. +// If include_extensions is set (default), also registers extensions. Otherwise +// it is the responsibility of the caller, typically required when the registry +// is appended to the context in a parallel context, which does not allow for +// extensions to be added. +inline void RegisterAllTensorFlowDialectsImpl(DialectRegistry ®istry, + bool include_extensions = true) { + registry + .insert(); + if (include_extensions) { + mlir::func::registerAllExtensions(registry); + } +} + +// Inserts all the TensorFlow dialects in the provided registry. This is +// intended for tools that need to register dialects before parsing .mlir files. +inline void RegisterAllTensorFlowDialects(DialectRegistry ®istry) { + RegisterAllTensorFlowDialectsImpl(registry, true); +} +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h new file mode 100644 index 00000000..73243e2f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
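Tool-side usage of the registration helper above is just a registry, the call, and a context; a brief sketch (variable names are ours).

mlir::DialectRegistry registry;
mlir::RegisterAllTensorFlowDialects(registry);
mlir::MLIRContext context(registry);
// The context can now load/parse .mlir files that use the tf, tf_executor,
// tf_device, tf_saved_model, and tfg dialects registered above.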
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_HOST_RUNTIME_TFRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_HOST_RUNTIME_TFRT_OPS_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_HOST_RUNTIME_TFRT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h new file mode 100644 index 00000000..64b5d2e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h @@ -0,0 +1,133 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ARITH_OPS_FOLDER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ARITH_OPS_FOLDER_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { + +class Operation; + +namespace TF { + +class AddV2Op; +class SubOp; +class MulOp; +class DivOp; +class RealDivOp; + +// Verifies an reduction op's `input` and reduction `dims`. +LogicalResult VerifyReductionInputAndDims(Value input, Value dims, + Location loc); + +// A type range with description (in singular form) attached to it. +using TypeRangeWithDesc = std::pair; + +LogicalResult VerifyTypeRangesAreCompatible(Operation *op, + TypeRangeWithDesc range0, + TypeRangeWithDesc range1); + +// Fold Arithmetic Op if one of the operands is a constant known to be an +// Identity (e.g. X+0, X*1, etc...). For commutative operations fold if +// known identity value is either lhs or rhs. 
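Before the implementation below, a note on where this folder is wired in: it is normally called from an op's fold hook. A hedged sketch of that call site, assuming the FoldAdaptor-based fold signature generated for current op definitions.

OpFoldResult AddV2Op::fold(FoldAdaptor adaptor) {
  // Folds x + 0 -> x, and 0 + x -> x since AddV2 is commutative.
  return IdentityArithmeticOpFolder<AddV2Op>(*this, adaptor.getOperands());
}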
+template <
+    typename OpT,
+    typename std::enable_if<llvm::is_one_of<
+        OpT, AddV2Op, SubOp, MulOp, DivOp, RealDivOp>::value>::type * = nullptr>
+OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op,
+                                        ArrayRef<Attribute> operands) {
+  auto lhs_type = mlir::cast<ShapedType>(arithmetic_op.getX().getType());
+  auto rhs_type = mlir::cast<ShapedType>(arithmetic_op.getY().getType());
+  auto result_type =
+      mlir::cast<ShapedType>(arithmetic_op.getResult().getType());
+
+  // We can fold an arithmetic operation only if we can prove that we will not
+  // accidentally hide a broadcasting error.
+  auto is_valid_broadcasting = [](ShapedType operand_ty, ShapedType identity_ty,
+                                  ShapedType result_ty) -> bool {
+    // A scalar identity is broadcastable to any operand shape; we only need to
+    // check that the operand has the same shape as the result.
+    bool scalar_identity = identity_ty.hasRank() && identity_ty.getRank() == 0;
+    if (scalar_identity) return operand_ty == result_ty;
+
+    // If the identity is not a scalar, we must verify that the identity shape
+    // is statically known to be broadcastable to the operand shape and that
+    // the operand and result shapes are equal.
+    return operand_ty == result_ty && identity_ty.hasStaticShape() &&
+           result_ty.hasStaticShape() &&
+           OpTrait::util::staticallyKnownBroadcastable(operand_ty.getShape(),
+                                                       identity_ty.getShape());
+  };
+
+  // Check that we have a constant operand on one side (candidate for identity).
+  const bool is_commutative =
+      (std::is_same<OpT, AddV2Op>::value || std::is_same<OpT, MulOp>::value);
+  auto lhs_attr = mlir::dyn_cast_or_null<DenseElementsAttr>(operands[0]);
+  auto rhs_attr = mlir::dyn_cast_or_null<DenseElementsAttr>(operands[1]);
+  if (!rhs_attr && !(is_commutative && lhs_attr)) return {};
+
+  // Mul and Div ops have identity value one while AddV2 and Sub ops have
+  // identity value zero.
+  const int identity =
+      (std::is_same<OpT, MulOp>::value || std::is_same<OpT, DivOp>::value ||
+       std::is_same<OpT, RealDivOp>::value)
+          ? 1
+          : 0;
+
+  Type element_ty = lhs_type.getElementType();
+  Attribute identity_attr;
+  if (auto ty = mlir::dyn_cast<FloatType>(element_ty)) {
+    identity_attr = FloatAttr::get(ty, static_cast<double>(identity));
+  } else if (auto ty = mlir::dyn_cast<IntegerType>(element_ty)) {
+    identity_attr = IntegerAttr::get(ty, static_cast<int64_t>(identity));
+  } else {
+    return {};
+  }
+
+  // Fold: Op(Operand, Identity) -> Operand.
+  if (rhs_attr && is_valid_broadcasting(lhs_type, rhs_type, result_type)) {
+    if (rhs_attr.isSplat() &&
+        rhs_attr.getSplatValue<Attribute>() == identity_attr)
+      return arithmetic_op.getX();
+  }
+
+  // Fold: Op(Identity, Operand) -> Operand for commutative operations.
+  if (lhs_attr && is_commutative &&
+      is_valid_broadcasting(rhs_type, lhs_type, result_type)) {
+    if (lhs_attr.isSplat() &&
+        lhs_attr.getSplatValue<Attribute>() == identity_attr)
+      return arithmetic_op.getY();
+  }
+
+  return {};
+}
+
+} // namespace TF
+} // namespace mlir
+
+#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ARITH_OPS_FOLDER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h
new file mode 100644
index 00000000..d5223870
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h
@@ -0,0 +1,36 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the attributes used in the TensorFlow dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ + +#include "tensorflow/core/ir/types/dialect.h" + +namespace mlir { +namespace TF { + +// This all moved under tensorflow/core/ir/types and these using declaration are +// to help with the transition. +using mlir::tf_type::FuncAttr; // NOLINT +using mlir::tf_type::PlaceholderAttr; // NOLINT +using mlir::tf_type::ShapeAttr; // NOLINT +using mlir::tf_type::TensorProtoAttr; // NOLINT + +} // end namespace TF +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h new file mode 100644 index 00000000..0c7ff33e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the tf_device dialect: it contains operations that model +// TensorFlow's actions to launch computations on accelerator devices. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace mlir { +namespace tf_device { + +// The TensorFlow Device dialect. +// +// This dialect contains operations to describe/launch computations on devices. +// These operations do not map 1-1 to TensorFlow ops and requires a lowering +// pass later to transform them into Compile/Run op pairs, like XlaCompile and +// XlaRun. +class TensorFlowDeviceDialect : public Dialect { + public: + static StringRef getDialectNamespace() { return "tf_device"; } + // Constructing TensorFlowDevice dialect under an non-null MLIRContext. 
+ explicit TensorFlowDeviceDialect(MLIRContext* context); +}; + +} // namespace tf_device +} // namespace mlir + +// Declares the operations for this dialect using the generated header. +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h new file mode 100644 index 00000000..cad01806 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h @@ -0,0 +1,120 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the standard MLIR TensorFlow dialect after control +// dependences are raise to the standard form. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DIALECT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DIALECT_H_ + +#include +#include + +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +class TensorFlowRegistryEffectInterfaceFallback; + +class TensorFlowDialect final : public Dialect { + public: + explicit TensorFlowDialect(MLIRContext *context); + ~TensorFlowDialect() override; + + static StringRef getDialectNamespace() { return "tf"; } + + // Overrides to redirect to tf_type dialect. + Attribute parseAttribute(DialectAsmParser &parser, Type type) const override; + Type parseType(DialectAsmParser &parser) const override; + + // Gradient attribute ("tf.gradient") in the list of NamedAttributes in a + // function references to its gradient function. This attribute in TensorFlow + // Dialect is used to model TF GradientDef. GetGradientAttrName() returns the + // string description of gradient attribute. + static StringRef GetGradientAttrName() { return "tf.gradient"; } + + // This attribute marks if a function is stateful. + // Returns the string description of stateful attribute. + static StringRef GetStatefulAttrName() { return "tf.signature.is_stateful"; } + + // Returns true if the op can be duplicated during transformations. + static bool CanDuplicate(Operation *op); + + // Returns true if the op can have side effects. + static bool CanHaveSideEffects(Operation *op); + + // Registered hook to materialize a constant operation from a given attribute + // value with the desired resultant type. + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; + + typedef std::function AdditionalOpFunction; + + // Register an op registration hook which is invoked during construction. + // + // A hook may use the public addOperations() method to add additional + // operations to the dialect. 
Hooks will only apply to subsequent + // instantations of the Dialect/MLIRContext. + static void RegisterAdditionalOperationHook(TypeID uniqueId, + AdditionalOpFunction fn); + + // Re-define publicly the protected addOperations() method from the Dialect + // class, usually used in a Dialect constructor. This allows hook + // functions to register operations on the TensorFlow dialect using the + // same interface. + template + void addOperations() { + Dialect::addOperations(); + } + + using ConstantFoldHook = LogicalResult (*)(Operation *, ArrayRef, + SmallVectorImpl &); + static void RegisterConstantFoldHook(ConstantFoldHook fn) { + constant_fold_hook_ = std::move(fn); + } + + static LogicalResult constantFold(Operation *op, ArrayRef operands, + SmallVectorImpl &results) { + if (constant_fold_hook_) return constant_fold_hook_(op, operands, results); + return failure(); + } + + static bool HasConstantFoldHook() { return constant_fold_hook_; } + + // Provides a hook for op interface. + void *getRegisteredInterfaceForOp(mlir::TypeID interface, + mlir::OperationName opName) override; + + private: + static ConstantFoldHook constant_fold_hook_; + + // Storage for a custom fallback interface. + TensorFlowRegistryEffectInterfaceFallback *fallback_effect_op_interface_; +}; + +} // namespace TF +} // namespace mlir + +#define TF_DIALECT_REGISTER_ADDITIONAL_OPERATIONS(hookFn) \ + { \ + static bool key; \ + ::mlir::TF::TensorFlowDialect::RegisterAdditionalOperationHook( \ + ::mlir::TypeID::getFromOpaquePointer(&key), hookFn); \ + } + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h new file mode 100644 index 00000000..a3c95bdf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -0,0 +1,71 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the tf_executor dialect: it models the TensorFlow executor +// semantics and can represent arbitrary TensorFlow graphs. As such it follows +// the existing execution model that includes deadness propagation, concurrent +// semantics, and control dependencies. 
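The TF_DIALECT_REGISTER_ADDITIONAL_OPERATIONS macro above manufactures a unique TypeID from a function-local static and forwards the callable, so an extension library can append its generated ops before the dialect is instantiated. A sketch with a hypothetical op name, assuming AdditionalOpFunction is callable with a TensorFlowDialect reference (its exact std::function signature was not preserved above).

// Called once from the extension's initialization path; hooks only affect
// TensorFlowDialect instances created afterwards.
void RegisterMyExtraTfOps() {
  TF_DIALECT_REGISTER_ADDITIONAL_OPERATIONS(
      [](mlir::TF::TensorFlowDialect &dialect) {
        // MyExtraOp stands in for a real generated op class.
        dialect.addOperations<MyExtraOp>();
      })
}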
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_EXECUTOR_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace tf_executor { + +class TensorFlowExecutorDialect : public Dialect { + public: + static StringRef getDialectNamespace() { return "tf_executor"; } + explicit TensorFlowExecutorDialect(MLIRContext *context); + + // Parses a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + // Prints a type registered to this dialect. + void printType(Type type, DialectAsmPrinter &os) const override; +}; + +// The Control type is a token-like value that models control dependencies from +// TensorFlow graphs. +class ControlType : public Type::TypeBase { + public: + using Base::Base; + static constexpr ::mlir::StringLiteral name = "tf_executor.control"; +}; + +class TokenType : public Type::TypeBase { + public: + using Base::Base; + static constexpr ::mlir::StringLiteral name = "tf_executor.token"; +}; + +} // namespace tf_executor +} // namespace mlir + +// Declares the operations for this dialect using the generated header. +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h new file mode 100644 index 00000000..db820889 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h @@ -0,0 +1,166 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ + +#include + +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include "tensorflow/core/framework/resource_mgr.h" + +namespace mlir { +namespace TF { + +//===----------------------------------------------------------------------===// +// TensorFlow Contraction Fusion. +//===----------------------------------------------------------------------===// + +struct ContractionFusion { + explicit ContractionFusion( + StringRef output_kernel, ArrayRef additional_arguments = {}, + ArrayRef additional_attributes = {}) + : output_kernel(output_kernel.str()), + additional_arguments(additional_arguments.begin(), + additional_arguments.end()), + additional_attributes(additional_attributes.begin(), + additional_attributes.end()) {} + + // Name of the output kernel implementing the contraction fusion. + std::string output_kernel; + + // Indices of additional arguments that will be forwarded to the fused + // operation (e.g. forward bias vector if fusing BiasAdd operation). + SmallVector additional_arguments; + + // Add additional attributes to the fused node. + SmallVector additional_attributes; +}; + +//===----------------------------------------------------------------------===// +// TensorFlow Resource Handles. +//===----------------------------------------------------------------------===// + +inline bool IsResourceHandleAnonymous(StringRef name) { + return name == ::tensorflow::ResourceHandle::ANONYMOUS_NAME; +} + +// Helper struct representing an identifier for a resource handle. For resource +// handles created explicitly and shared across resource allocator ops, +// `container`, `name`, and `device` can be set. If an resource handle is tied +// to an instance of an operation (e.g. TensorFlow runtime operation caching), +// `op` can be set instead. +struct ResourceHandle { + ResourceHandle(StringRef container, StringRef name, StringRef device, + Operation* op) + : container(container), name(name), device(device), op(op) {} + + bool operator==(const ResourceHandle& rhs) const { + return container == rhs.container && name == rhs.name && + device == rhs.device && op == rhs.op; + } + + // Make ResourceHandle hashable. + friend ::llvm::hash_code hash_value(const ResourceHandle& resource_handle); + + StringRef container; + StringRef name; + StringRef device; + Operation* op = nullptr; +}; + +// Make ResourceHandle hashable. +inline ::llvm::hash_code hash_value(const ResourceHandle& resource_handle) { + return ::llvm::hash_combine(resource_handle.container, resource_handle.name, + resource_handle.device, resource_handle.op); +} + +// Helper struct holding a resource handle value and unique id associated to the +// resource handle. +struct ResourceHandleValueAndId { + ResourceHandleValueAndId(Value value, int64_t id) : value(value), id(id) {} + + Value value; + int64_t id = -1; +}; + +//===----------------------------------------------------------------------===// +// TF op helper functions for handling resource handles and ids. 
+//===----------------------------------------------------------------------===// + +// Returns device of op if present. If op has no device set, an empty string ref +// is returned instead. +llvm::StringRef GetDeviceOrEmpty(Operation* op); + +// Returns resource handle value and id for resource op based on attributes. If +// a resource handle is anonymous, a new id is always returned. +ResourceHandleValueAndId GetResourceHandleValueAndIdBase( + llvm::StringRef container, llvm::StringRef shared_name, + llvm::StringRef device, Value resource, + llvm::SmallDenseMap& resource_handle_id_map, + int64_t& next_id); + +// Shape functions for ops that are using TF_SameOperandsAndResultTypeResolveRef +// and have at least one operand, result type can be inferred using the first +// operand's type. + +#define INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(Op) \ + LogicalResult Op::inferReturnTypeComponents( \ + MLIRContext* context, std::optional location, \ + ValueShapeRange operands, DictionaryAttr attributes, \ + OpaqueProperties properties, RegionRange regions, \ + SmallVectorImpl& inferredReturnShapes) { \ + return inferReturnTypeComponentsFromOperands( \ + context, location, operands, attributes, properties, regions, \ + inferredReturnShapes); \ + } + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h.inc" +} // namespace TF +} // namespace mlir + +namespace llvm { +template <> +struct DenseMapInfo { + static mlir::TF::ResourceHandle getEmptyKey() { + return {/*container=*/"", /*name=*/"", /*device=*/"", + /*op=*/DenseMapInfo::getEmptyKey()}; + } + + static mlir::TF::ResourceHandle getTombstoneKey() { + return {/*container=*/"", /*name=*/"", /*device=*/"", + /*op=*/DenseMapInfo::getTombstoneKey()}; + } + + static unsigned getHashValue( + const mlir::TF::ResourceHandle& resource_handle) { + return mlir::TF::hash_value(resource_handle); + } + + static bool isEqual(const mlir::TF::ResourceHandle& lhs, + const mlir::TF::ResourceHandle& rhs) { + return lhs == rhs; + } +}; +} // namespace llvm + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h new file mode 100644 index 00000000..30c503aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -0,0 +1,51 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the standard MLIR TensorFlow dialect +// after control dependences are raise to the standard form. 
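The llvm::DenseMapInfo specialization above is what lets ResourceHandle key a DenseMap, which is how GetResourceHandleValueAndIdBase can hand out one stable id per (container, name, device, op) tuple. A minimal sketch of that pattern; the values are illustrative.

llvm::SmallDenseMap<mlir::TF::ResourceHandle, int64_t> handle_to_id;
int64_t next_id = 0;

mlir::TF::ResourceHandle handle(/*container=*/"", /*name=*/"v0",
                                /*device=*/"/CPU:0", /*op=*/nullptr);
auto inserted = handle_to_id.try_emplace(handle, next_id);
if (inserted.second) ++next_id;       // First time this handle is seen.
int64_t id = inserted.first->second;  // Stable id for this handle.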
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h new file mode 100644 index 00000000..2956174f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +// IWYU pragma: private, include "third_party/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { + +class YieldOp; + +} // namespace TF +} // namespace mlir + +// TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose +// purpose is to catch bug on `tensorflow::mutex_lock`. We don't use +// `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and +// `tf.ConsumeMutexLock`) with getter methods named as `mutex_lock()`. Need to +// undefine here to avoid expanding the getter symbol as macro when including +// both mutex.h and this header file. +#undef mutex_lock + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h new file mode 100644 index 00000000..fa171a00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_CANONICALIZATION_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_CANONICALIZATION_HELPER_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" + +namespace mlir { +namespace TF { + +// Eliminate attributes that are not needed, but can get attached to Ops +// during import. +template +struct DropAttributes : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + // Drop the "output_shapes" attribute. + LogicalResult matchAndRewrite(Op op, + PatternRewriter &rewriter) const override { + bool found = !!op->removeAttr("output_shapes"); + return success(found); + } +}; + +// Helper function to create TF op while copying all underscore attributes from +// another TF op. +// TODO(jpienaar): This is a workaround until behavior is established. +template +OpTy CreateTfOp(RewriterBase &b, Operation *op, Args &&...args) { + auto ret = b.create(op->getLoc(), std::forward(args)...); + CopyDeviceAndUnderscoredAttributes(op, ret.getOperation()); + return ret; +} + +// Helper function to replace TF op with another op while copying all underscore +// attributes from the TF op. +// TODO(jpienaar): This is a workaround until behavior is established. +template +OpTy ReplaceTfOpWithNewOp(RewriterBase &b, Operation *op, Args &&...args) { + auto ret = CreateTfOp(b, op, std::forward(args)...); + b.replaceOp(op, ret.getOperation()->getResults()); + return ret; +} + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_CANONICALIZATION_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_device_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_device_helper.h new file mode 100644 index 00000000..4657fb18 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_device_helper.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_DEVICE_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_DEVICE_HELPER_H_ + +namespace mlir { + +class Operation; + +namespace TF { + +class RuntimeDevices; + +// Returns true if at least one GPU device is available at runtime. +bool CanUseGpuDevice(const RuntimeDevices &devices); + +// Returns true if all of the GPUs available at runtime support TensorCores +// (NVIDIA compute capability >= 7.0). +bool CanUseTensorCores(const RuntimeDevices &devices); + +// Returns true if operation does not have explicit device placement that would +// prevent it from running on GPU device. 
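For orientation, a minimal sketch of how the canonicalization helper above is typically wired in; the registration function below is hypothetical, and only `DropAttributes` and the TF op class come from the vendored headers. `CreateTfOp` / `ReplaceTfOpWithNewOp` are used the same way inside `matchAndRewrite` bodies, forwarding whatever builder arguments the new op type expects.

#include "mlir/IR/PatternMatch.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h"

// Hypothetical registration hook: drops the imported-but-unused
// "output_shapes" attribute from tf.If via the generic DropAttributes pattern.
void AddDropOutputShapesPattern(mlir::RewritePatternSet &patterns,
                                mlir::MLIRContext *context) {
  patterns.add<mlir::TF::DropAttributes<mlir::TF::IfOp>>(context);
}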
+bool CanUseGpuDevice(Operation *op); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_DEVICE_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h new file mode 100644 index 00000000..29dae271 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h @@ -0,0 +1,137 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_LAYOUT_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_LAYOUT_HELPER_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" + +namespace mlir { + +class MLIRContext; + +namespace TF { + +SmallVector ReversePermutation(ArrayRef permutation); + +SmallVector GetDataFormatPermutation(StringRef from, StringRef to); + +// Shuffle elements in the `attr` according to the permutation. Optional +// `inner_size` allows to shuffle array attributes created from rank 2 tensors +// on outer dimension only. +ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef permutation, + int inner_size = 1); + +// Shuffle ranked tensor dimensions according to the permutation. +Type ShuffleRankedTensorType(Type type, ArrayRef permutation); + +bool AreCancellablePermutations(DenseIntElementsAttr perm0, + DenseIntElementsAttr perm1); + +// Default implementation of `LayoutSensitiveInterface::UpdateDataFormat` for +// layout sensitive operations that do not have any additional layout dependent +// attributes besides `data_format` string. +template +LogicalResult UpdateDataFormat(StringRef data_format, Op *op) { + auto perm = GetDataFormatPermutation(op->getDataFormat(), data_format); + if (perm.empty()) return failure(); + + // Update data format attribute. + (*op)->setAttr("data_format", StringAttr::get(op->getContext(), data_format)); + + // Update types for all layout sensitive results. + auto layout_sensitive = cast(op->getOperation()); + for (unsigned idx : layout_sensitive.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType(ShuffleRankedTensorType(result.getType(), perm)); + } + + return success(); +} + +// Default implementation for folding operand transpose into the operation. +// See `FoldOperandsTransposeInterface::FoldOperandsPermutation`. 
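A sketch of how these layout defaults are meant to be consumed: a concrete layout-sensitive op usually just forwards its interface hooks to the helpers. `OpTy` is a placeholder for such an op class (one that provides `getDataFormat()` and implements the layout interfaces); the forwarding functions themselves are illustrative, not part of the header.

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h"

// Rewrites the data_format attribute and shuffles layout-dependent result
// types (e.g. NHWC -> NCHW) using the default helper.
template <typename OpTy>
mlir::LogicalResult DelegateUpdateDataFormat(OpTy op,
                                             llvm::StringRef data_format) {
  return mlir::TF::UpdateDataFormat(data_format, &op);
}

// Cancels an adjacent NHWC<->NCHW transpose on the operands by flipping the
// op's own data_format (see FoldOperandsPermutation below).
template <typename OpTy>
mlir::LogicalResult DelegateFoldOperandsPermutation(
    OpTy op, llvm::ArrayRef<int64_t> permutation) {
  return mlir::TF::FoldOperandsPermutation(permutation, &op);
}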
+template +LogicalResult FoldOperandsPermutation( + ArrayRef permutation, Op *op, + ArrayRef> shuffle_attrs = {}) { + MLIRContext *context = + (*op)->template getParentOfType().getContext(); + + // We only support NHWC <-> NCHW permutations. + static constexpr std::array kNchwToNhwc = {0, 2, 3, 1}; + static constexpr std::array kNhwcToNchw = {0, 3, 1, 2}; + + // Operation data format after folding `permutation`. + StringRef target_data_format = [&]() -> StringRef { + if (op->getDataFormat() == "NHWC" && permutation.equals(kNchwToNhwc)) { + return "NCHW"; // cancel NCHW->NHWC operand permutation + } else if (op->getDataFormat() == "NCHW" && + permutation.equals(kNhwcToNchw)) { + return "NHWC"; // cancel NHWC->NCHW operand permutation + } else { + return ""; + } + }(); + if (target_data_format.empty()) return failure(); + + // To fold operand `permutation` into the `op` we need shuffle all layout + // dependent attributes and types with a reverse permutation, and change + // operation data format to `target_data_format`. + // + // Example: + // %1 = SomeOp(...) {data_format = NHWC} + // %2 = Transpose(%1) {permutation = NHWC->NCHW} + // %3 = Op(%2) {data_format = NCHW} + // + // To bypass %2 we have to change data format to shuffle data format from NCHW + // to NHWC, which is the reverse of operand permutation (function argument). + auto reverse_permutation = + GetDataFormatPermutation(op->getDataFormat(), target_data_format); + if (reverse_permutation.empty()) return failure(); + + (*op)->setAttr("data_format", StringAttr::get(context, target_data_format)); + + for (auto pair : shuffle_attrs) { + StringRef attr_name = pair.first; + ArrayAttr attr_value = pair.second; + (*op)->setAttr(attr_name, + ShuffleArrayAttr(attr_value, reverse_permutation)); + } + + auto fold = cast(op->getOperation()); + for (unsigned idx : fold.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType( + ShuffleRankedTensorType(result.getType(), reverse_permutation)); + } + + return success(); +} + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_LAYOUT_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h new file mode 100644 index 00000000..7812cc4c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +// IWYU pragma: private, include "third_party/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h new file mode 100644 index 00000000..e77ea7d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h @@ -0,0 +1,94 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_TENSOR_HELPER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_TENSOR_HELPER_H_ + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { + +class Builder; + +namespace TF { + +// Returns the RankedTensorType for the given operand. 
TensorFlow constant ops +// may have non-static shape because the shape is not propagated during constant +// folding. If the defining op for the given operand is a constant op, this +// routine uses the constant op's attribute to get the actual shape. +RankedTensorType GetRankedTensorTypeForOperand(Value operand); + +// Returns true if the given `value` is of ranked float tensor type with the +// given `rank`. +inline bool IsOfRankedFloatTensorType(RankedTensorType type, int rank) { + return type && type.getRank() == rank && + mlir::isa(type.getElementType()); +} + +// Returns true if the given `value` has the specified rank or has unranked +// type. +inline bool IsOfRankOrUnranked(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() == rank; +} + +// Returns true if the given `value` has at least the specified rank or has +// unranked type. +inline bool HasRankAtLeast(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() >= rank; +} + +// Returns true if the given `value` has at most the specified rank or has +// unranked type. +inline bool HasRankAtMost(Value value, int64_t rank) { + RankedTensorType type = GetRankedTensorTypeForOperand(value); + return !type || type.getRank() <= rank; +} + +inline bool IsUnknownDimOrRank(int64_t dim_or_rank) { + return dim_or_rank == -1; +} + +// Returns dimension index for the given TensorFlow axis that supports negative +// indexing. +inline int64_t GetDimForAxis(int64_t axis, int64_t rank) { + return axis >= 0 ? axis : axis + rank; +} + +// Returns the tf.Equal/tf.NotEqual result type given `x` and `y` and inputs. If +// `incompatible_shape_error` is true, reports error if `x` and `y` has +// incompatible shapes. Otherwise, returns a tensor type with unknown rank. +Type DeduceEqualCmpOpType(Builder *builder, Location loc, Value x, Value y, + BoolAttr incompatible_shape_error); + +Type InferReductionOpType(Value input, Value reduction_indices, + BoolAttr keep_dims); + +// Verifies that the given types are cast compatible. If not, emits appropriate +// error for the given op. If mask_one_dim is set to true, then the types are +// allowed to have one mismatching dimension. Masking one of the dimensions is +// useful for ops like Concat that requires all ranked inputs to have the same +// rank and match dimension sizes for all but one of the dimensions. +LogicalResult VerifyTypesCompatibility(Operation::operand_type_range types, + bool mask_one_dim, Operation *op); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_TENSOR_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h new file mode 100644 index 00000000..8e9f32cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
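These rank helpers are the building blocks of many op verifiers. A small illustrative check follows; the op being verified and its operand requirements are hypothetical, only the helper calls come from the header.

#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h"

// Accepts unranked operands, but requires ranked operands to be at least 2-D.
mlir::LogicalResult VerifyMatMulLikeOperands(mlir::Operation *op) {
  for (mlir::Value operand : op->getOperands())
    if (!mlir::TF::HasRankAtLeast(operand, 2))
      return op->emitOpError("requires operands of rank >= 2");
  return mlir::success();
}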
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" + +#define GET_OP_FWD_DEFINES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h new file mode 100644 index 00000000..208cd7ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h @@ -0,0 +1,123 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SAVED_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SAVED_MODEL_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project + +namespace mlir { +namespace tf_saved_model { + +// The name of the attribute indicating under what name an object is exported. 
+inline constexpr StringRef kTfSavedModelExportedNamesAttr = + "tf_saved_model.exported_names"; + +// The name of the attribute attached to input arguments or results of a +// function to represent the path which one would use to index into a structured +// value to reach a given tensor. +inline constexpr StringRef kTfSavedModelIndexPathAttr = + "tf_saved_model.index_path"; + +// Name of the attribute that inidicates the type of initializer. It should be +// on a function and the function should exist in the initializers attribute of +// the SessionInitializerOp. +inline constexpr StringRef kTfSavedModelInitializerTypeAttr = + "tf_saved_model.initializer_type"; + +// Indicates that the initializer corresponds to the restore op. +inline constexpr StringRef kTfSavedModelInitializerRestoreType = "restore_op"; + +// Indicates that the initializer corresponds to the init op. +inline constexpr StringRef kTfSavedModelInitializerInitType = "init_op"; + +class TensorFlowSavedModelDialect : public Dialect { + public: + explicit TensorFlowSavedModelDialect(MLIRContext *context); + LogicalResult verifyRegionArgAttribute(Operation *op, unsigned region_index, + unsigned arg_index, + NamedAttribute named_attr) override; + LogicalResult verifyRegionResultAttribute(Operation *op, + unsigned region_index, + unsigned result_index, + NamedAttribute named_attr) override; + LogicalResult verifyOperationAttribute(Operation *op, + NamedAttribute named_attr) override; + + static StringRef getDialectNamespace() { return "tf_saved_model"; } +}; + +} // namespace tf_saved_model +} // namespace mlir + +// Declares the operations for this dialect using the generated header. +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h.inc" + +namespace mlir { +namespace tf_saved_model { + +// Returns the list of exported names for `op`. +// An empty list means `op` is not exported. +SmallVector GetExportedNames(Operation *op); + +// Returns true if `op` is exported. +bool IsExported(Operation *op); + +// Returns true if `module` has tf_saved_model linkage semantics. +bool HasTfSavedModelSemantics(ModuleOp module_op); + +// Returns the tf_saved_model.global_tensor op that func's arg_index'th argument +// refers to as a bound input, or null. +Operation *LookupBoundInput(func::FuncOp func, int arg_index, + const SymbolTable &symbol_table); + +template +T LookupBoundInputOfType(func::FuncOp func, int arg_index, + const SymbolTable &symbol_table) { + return llvm::dyn_cast_or_null( + LookupBoundInput(func, arg_index, symbol_table)); +} + +// Gets the type that an exported function arg that is bound to symbol ops such +// as `global_tensor` and `asset` should have. +Type GetBoundInputArgTypeFor(mlir::Operation *op); + +// Returns the session initializer of this module if it exists. Returns null +// otherwise. +SessionInitializerOp GetSessionInitializerOp(ModuleOp module_op); + +// Returns the exported name for the session initializer function. +SmallVector GetSessionInitializerExportedName(ModuleOp module_op); + +// Returns initializer function ops. These functions' symbols are in the +// "initializers" attribute of the session initializer op. +SmallVector GetInitializerFunctions(ModuleOp module_op); + +// Returns the initializer function whose `tf_saved_model.initializer_type` +// attribute matches `initializer_type`. Returns a null op if it doesn't exist. 
+func::FuncOp GetInitializerFunction(ModuleOp module_op, + StringRef initializer_type); + +// Checks if the module restores variables from a Checkpoint. +bool IsRestoreGraph(ModuleOp module); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SAVED_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h new file mode 100644 index 00000000..6384d077 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -0,0 +1,136 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the side effect definition file for TensorFlow. +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SIDE_EFFECTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SIDE_EFFECTS_H_ + +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace mlir { +namespace TF { +namespace ResourceEffects { + +struct Variable : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Variable"; } +}; + +struct Stack : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Stack"; } +}; + +struct TensorArray : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "TensorArray"; } +}; + +struct Summary : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Summary"; } +}; + +struct LookupTable : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "LookupTable"; } +}; + +struct DatasetSeedGenerator + : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetSeedGenerator"; } +}; + +struct DatasetMemoryCache + : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetMemoryCache"; } +}; + +struct DatasetIterator : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetIterator"; } +}; + +// Special resource type to track TPU Embedding specific ops, which must execute +// but do not have side effects with one another or with resource variable ops. +struct TPUEmbedding : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "TPUEmbedding"; } +}; + +// Resource corresponding to GeneratorOp. 
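A sketch of how this saved-model API is commonly driven; the inspection function and its logging are illustrative, while the `tf_saved_model` calls and attribute constants are the ones declared above.

#include "llvm/Support/raw_ostream.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/BuiltinOps.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"

void InspectSavedModel(mlir::ModuleOp module) {
  if (!mlir::tf_saved_model::HasTfSavedModelSemantics(module)) return;
  // List every exported function and the names it is exported under.
  module.walk([](mlir::func::FuncOp func) {
    for (llvm::StringRef name : mlir::tf_saved_model::GetExportedNames(func))
      llvm::outs() << func.getName() << " exported as " << name << "\n";
  });
  // Find the initializer that restores variables from a checkpoint, if any.
  if (auto restore = mlir::tf_saved_model::GetInitializerFunction(
          module, mlir::tf_saved_model::kTfSavedModelInitializerRestoreType))
    llvm::outs() << "restore initializer: " << restore.getName() << "\n";
}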
+struct GeneratorOp : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Generator"; } +}; + +struct Send : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Send"; } +}; + +struct Recv : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Recv"; } +}; + +struct XlaHostCompute + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "XlaHostCompute"; } +}; + +struct RandomGenerator + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "RandomGenerator"; } +}; + +struct TPUExecute : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "TPUExecute"; } +}; + +struct MustExecute : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "MustExecute"; } +}; + +struct CollectiveReduceOrdering + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "CollectiveReduceOrdering"; } +}; + +struct NcclAllReduceOrdering + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "NcclAllReduceOrdering"; } +}; + +struct GlobalIterId : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "GlobalIterId"; } +}; + +struct XlaLaunch : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "XlaLaunch"; } +}; + +struct WriteTrainingPredictions + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "WriteTrainingPredictions"; } +}; + +struct _XlaRun : public ::mlir::SideEffects::Resource::Base<_XlaRun> { + StringRef getName() final { return "_XlaRun"; } +}; + +// Returns true iff resource type with given ID is only self-dependent, i.e., +// there are no dependencies to other resource types (including unknown resource +// type). +inline bool IsOnlySelfDependent(TypeID resource_type_id) { + return resource_type_id == ResourceEffects::Send::getResourceID() || + resource_type_id == ResourceEffects::Recv::getResourceID(); +} + +} // namespace ResourceEffects +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_SIDE_EFFECTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h new file mode 100644 index 00000000..2c6fd05d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the types used in the standard MLIR TensorFlow dialect. 
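For context, the resource types above are consumed through MLIR's side-effect interface. The helper below is an illustrative stand-in for what a variable-writing op reports from `getEffects()`; the function name is hypothetical, the effect/resource pairing mirrors the pattern used by the generated TensorFlow ops.

#include "llvm/ADT/SmallVector.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h"

// Records a write to the shared Variable resource, which orders this op with
// every other op that reads or writes resource variables.
void AppendVariableWriteEffect(
    llvm::SmallVectorImpl<mlir::SideEffects::EffectInstance<
        mlir::MemoryEffects::Effect>> &effects) {
  effects.emplace_back(mlir::MemoryEffects::Write::get(),
                       mlir::TF::ResourceEffects::Variable::get());
}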
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ + +#include + +#include "llvm/ADT/StringMap.h" +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/core/ir/types/dialect.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace mlir { +namespace TF { + +using GpuDeviceMetadata = tf_type::GpuDeviceMetadataAttr; + +// Tensorflow devices available at runtime with corresponding metadata if it is +// available. It's completely valid to have a device without any metadata +// attached to it. +class RuntimeDevices { + using DeviceNameUtils = ::tensorflow::DeviceNameUtils; + using ParsedName = ::tensorflow::DeviceNameUtils::ParsedName; + + public: + // Adds a device with and empty metadata. Device can be of any type. + void AddDevice(const ParsedName& device); + + // Adds a GPU device with GPU specific metadata. + void AddGpuDevice(const ParsedName& device, + const GpuDeviceMetadata& metadata); + + llvm::ArrayRef device_names() const { return device_names_; } + size_t NumDevices() const { return device_names_.size(); } + + // Returns GPU device metadata if it is available, otherwise returns None. + std::optional GetGpuDeviceMetadata( + const ParsedName& device) const; + + private: + llvm::SmallVector device_names_; + // TODO(ezhulenev): Add DenseMapInfo specialization to be able to + // use ParsedName as a key in a DenseMap. + llvm::StringMap gpu_metadata_; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h new file mode 100644 index 00000000..c6abd768 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -0,0 +1,329 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow dialect. 
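A brief sketch of populating `RuntimeDevices` from fully qualified device names; parsing via `DeviceNameUtils::ParseFullName` is the assumed entry point here, and GPU metadata would normally be attached separately through `AddGpuDevice`.

#include <string>

#include "llvm/ADT/ArrayRef.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h"
#include "tensorflow/core/util/device_name_utils.h"

mlir::TF::RuntimeDevices CollectDevices(llvm::ArrayRef<std::string> names) {
  mlir::TF::RuntimeDevices devices;
  for (const std::string &name : names) {
    tensorflow::DeviceNameUtils::ParsedName parsed;
    // Expects names like "/job:worker/replica:0/task:0/device:GPU:0".
    if (tensorflow::DeviceNameUtils::ParseFullName(name, &parsed))
      devices.AddDevice(parsed);
  }
  return devices;
}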
+ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ + +#include + +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace OpTrait { +namespace TF { + +// Verifies if 'ref_type' is a REF type corresponding to 'type'. +static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, + mlir::Type maybe_ref_type) { + if (auto ref_type = + mlir::dyn_cast(maybe_ref_type)) + return success(ref_type.RemoveRef().getTypeID() == type.getTypeID()); + return failure(); +} + +// This class provides verification for ops that are known to have the same +// result types and all operands are either of the same type as result or a REF +// type corresponding to the result type. +// TODO(jpienaar): Update the name and the description. +template +class OperandsSameAsResultsTypeOrRef + : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + LogicalResult shapeMatch = impl::verifySameOperandsAndResultShape(op); + if (failed(shapeMatch)) return shapeMatch; + Type type = op->getResult(0).getType(); + // Verify that the first result type is same as the rest of the results. + // We skip the comparison against itself. + for (auto result_type : llvm::drop_begin(op->getResultTypes(), 1)) { + if (!mlir::tf_type::HasCompatibleElementTypes(type, result_type)) + return op->emitOpError() + << "requires all return types to have compatible element types"; + } + for (auto operand_type : op->getOperandTypes()) { + if (!mlir::tf_type::HasCompatibleElementTypes( + operand_type, type, /*may_ignore_ref_type_lhs=*/true)) + return op->emitError() << "requires all operands and results to have " + "compatible element types"; + } + return success(); + } +}; + +namespace detail { +inline LogicalResult verifySameOperandsAndResultElementTypeResolveRef( + Operation* op) { + Type element_type; + if (op->getNumResults() > 0) { + element_type = mlir::tf_type::GetElementTypeOrSelfResolveRef( + op->getResult(0).getType()); + } else if (op->getNumOperands() > 0) { + element_type = mlir::tf_type::GetElementTypeOrSelfResolveRef( + op->getOperand(0).getType()); + } else { + // Nothing to check. + return success(); + } + // Verify that all result element types are compatible to `element_type`. + for (const auto& result_type : op->getResultTypes()) { + if (mlir::tf_type::GetElementTypeOrSelfResolveRef(result_type) != + element_type) { + return op->emitOpError( + "requires compatible element types for all operands and results"); + } + } + // Verify that all operand element types are compatible to `element_type`. 
+ for (const auto& operand_type : op->getOperandTypes()) { + if (mlir::tf_type::GetElementTypeOrSelfResolveRef(operand_type) != + element_type) { + return op->emitOpError( + "requires compatible element types for all operands and results"); + } + } + return success(); +} + +inline ShapedType MergeType(ShapedType a, ShapedType b) { + if (!a.hasRank()) { + return b; + } + if (!b.hasRank()) { + return a; + } + int64_t rank = a.getRank(); + SmallVector dims; + dims.resize(rank); + for (int i = 0, e = rank; i != e; i++) { + int64_t dim0 = a.getDimSize(i); + int64_t dim1 = b.getDimSize(i); + dims[i] = (dim0 == ShapedType::kDynamic) ? dim1 : dim0; + } + return RankedTensorType::get(dims, a.getElementType()); +} +} // namespace detail + +// Verifies that op has the same operand and result element types (or type +// itself, if scalar) after resolving reference types (i.e., after converting +// reference types to their corresponding TensorFlow or standard types). +template +class SameOperandsAndResultElementTypeResolveRef + : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + return detail::verifySameOperandsAndResultElementTypeResolveRef(op); + } +}; + +// Verifies that op has the same operand and result types after resolving +// reference types (i.e., after converting reference types to their +// corresponding TensorFlow or standard types). +template +class SameOperandsAndResultTypeResolveRef + : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + if (failed(impl::verifySameOperandsAndResultShape(op))) return failure(); + return detail::verifySameOperandsAndResultElementTypeResolveRef(op); + } + + static LogicalResult inferReturnTypeComponentsFromOperands( + MLIRContext*, std::optional location, ValueShapeRange operands, + DictionaryAttr attributes, OpaqueProperties properties, + RegionRange regions, + SmallVectorImpl& inferredReturnShapes) { + if (operands.empty()) + return emitOptionalError( + location, + "Expected non-empty operands for [CompatibleOperandsAndResultType]"); + + auto result_ty = llvm::dyn_cast_or_null(operands[0].getType()); + if (!result_ty) { + return emitOptionalError(location, "Expected shape type for operand 0"); + } + for (auto [index, ty] : + llvm::drop_begin(llvm::enumerate(operands.getTypes()), 1)) { + auto shape_type = llvm::dyn_cast_or_null(ty); + if (!shape_type) { + return emitOptionalError(location, "Expected shape type for operand ", + index); + } + result_ty = detail::MergeType(shape_type, result_ty); + } + inferredReturnShapes.push_back(result_ty); + return success(); + } +}; + +// Layout agnostic operations do not depend on the operands data layout (data +// format), as and example all element wise operations are layout agnostic. +template +class LayoutAgnostic : public TraitBase {}; + +// Trait to indicate operations that cannot be duplicated as they might carry +// certain state around within their implementations. +template +class CannotDuplicate : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + if (isMemoryEffectFree(op)) + return op->emitError( + "operations with no side effects cannot have CannotDuplicate trait"); + return success(); + } +}; + +// Trait to indicate an operation cannot be constant folded. +template +class NoConstantFold : public TraitBase {}; + +// Coefficient-wise binary operation with implicit broadcasting support, for +// example tf.Sub operation. 
+template +class CwiseBinary : public TraitBase {}; + +// Coefficient-wise unary operation, for example tf.Sqrt operation. +template +class CwiseUnary : public TraitBase {}; + +namespace detail { + +inline LogicalResult verifyIsIdempotent(Operation* op) { + // TODO(b/246518997): Add back check for no side effects on operation. + // Currently adding it would cause the shared library build + // to fail since there would be a dependency of IR on SideEffectInterfaces + // which is cyclical. + return success(); +} + +inline OpFoldResult foldIdempotent(Operation* op) { + if (op->getNumOperands() == 1) { + auto* argumentOp = op->getOperand(0).getDefiningOp(); + if (argumentOp && op->getName() == argumentOp->getName()) { + // Replace the outer operation output with the inner operation. + return op->getOperand(0); + } + } else if (op->getOperand(0) == op->getOperand(1)) { + return op->getOperand(0); + } + + return {}; +} + +inline LogicalResult verifyIsInvolution(Operation* op) { + // TODO(b/246518997): Add back check for no side effects on operation. + // Currently adding it would cause the shared library build + // to fail since there would be a dependency of IR on SideEffectInterfaces + // which is cyclical. + return success(); +} + +inline OpFoldResult foldInvolution(Operation* op) { + auto* argumentOp = op->getOperand(0).getDefiningOp(); + if (argumentOp && op->getName() == argumentOp->getName()) { + // Replace the outer involutions output with inner's input. + return argumentOp->getOperand(0); + } + + return {}; +} + +} // namespace detail + +// This class adds property that the operation is idempotent. +// This means a unary to unary operation "f" that satisfies f(f(x)) = f(x), +// or a binary operation "g" that satisfies g(x, x) = x. +template +class IsIdempotent : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + static_assert(ConcreteType::template hasTrait(), + "expected operation to produce one result"); + static_assert(ConcreteType::template hasTrait() || + ConcreteType::template hasTrait::Impl>(), + "expected operation to take one or two operands"); + static_assert( + ConcreteType::template hasTrait(), + "expected operation to preserve type"); + // Idempotent requires the operation to be side effect free as well + // but currently this check is under a FIXME and is not actually done. + return detail::verifyIsIdempotent(op); + } + + static OpFoldResult foldTrait(Operation* op, ArrayRef operands) { + return detail::foldIdempotent(op); + } +}; + +/// This class adds property that the operation is an involution. +/// This means a unary to unary operation "f" that satisfies f(f(x)) = x +template +class IsInvolution : public TraitBase { + public: + static LogicalResult verifyTrait(Operation* op) { + static_assert(ConcreteType::template hasTrait(), + "expected operation to produce one result"); + static_assert(ConcreteType::template hasTrait(), + "expected operation to take one operand"); + static_assert( + ConcreteType::template hasTrait(), + "expected operation to preserve type"); + // TODO(b/246518997): Involution requires the operation to be side effect + // free as well but currently this check is under a FIXME and is not + // actually done. + return detail::verifyIsInvolution(op); + } + + static OpFoldResult foldTrait(Operation* op, ArrayRef operands) { + return detail::foldInvolution(op); + } +}; + +// Indicates that any returned resource is unique. 
+template +class UniqueResourceAllocation + : public TraitBase { + public: + // Implements method required for `ResourceHandleAllocatorInterface`. + llvm::SmallVector + GetResourceHandleValueAndIdList( + llvm::SmallDenseMap& + resource_handle_id_map, + int64_t& next_id) { + llvm::SmallVector resource_vec; + for (Value resource : + mlir::tf_type::filter_resources(this->getOperation()->getResults())) { + resource_vec.push_back({resource, next_id++}); + } + return resource_vec; + } +}; + +} // namespace TF +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TRAITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h new file mode 100644 index 00000000..31233f56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -0,0 +1,55 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the types used in the standard MLIR TensorFlow dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TYPES_H_ + +#include "tensorflow/core/ir/types/dialect.h" + +namespace mlir { +namespace TF { + +// This all moved under tensorflow/core/ir/types and these using declaration are +// to help with the transition. 
+ +using ::mlir::tf_type::AreCastCompatible; // NOLINT +using ::mlir::tf_type::ArraysAreCastCompatible; // NOLINT +using ::mlir::tf_type::BroadcastCompatible; // NOLINT +using ::mlir::tf_type::DropRefType; // NOLINT +using ::mlir::tf_type::filter_resources; // NOLINT +using ::mlir::tf_type::GetCastCompatibleType; // NOLINT +using ::mlir::tf_type::HasCompatibleElementTypes; // NOLINT +using ::mlir::tf_type::IsValidTFTensorType; // NOLINT +using ::mlir::tf_type::OperandShapeIterator; // NOLINT +using ::mlir::tf_type::ResourceType; // NOLINT +using ::mlir::tf_type::ResultShapeIterator; // NOLINT +using ::mlir::tf_type::ResultShapeRange; // NOLINT +using ::mlir::tf_type::StringType; // NOLINT +using ::mlir::tf_type::TensorFlowRefType; // NOLINT +using ::mlir::tf_type::TensorFlowType; // NOLINT +using ::mlir::tf_type::TensorFlowTypeWithSubtype; // NOLINT +using ::mlir::tf_type::VariantType; // NOLINT + +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + using tftype##Type = mlir::tf_type::tftype##Type; +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + + +} // end namespace TF +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h new file mode 100644 index 00000000..8fbf54c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_VERIFIERS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_VERIFIERS_H_ + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Verifies correctness of ops implementing LayoutSensitiveInterface (see +// definition in tf_op_base.td): +// (1) Operation must have valid `data_format` attribute. +// (2) Layout dependent arguments and results indices must be in +// [0, getNumOperands/getNumResults) range. +LogicalResult VerifyLayoutSensitiveInterface(Operation* op); + +// Verifies correctness of ops implementing FoldOperandsTransposeInterface (see +// definition in tf_op_base.td): +// (1) Layout dependent arguments and results indices must be in +// [0, getNumOperands/getNumResults) range. 
+LogicalResult VerifyFoldOperandsTransposeInterface(Operation* op); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_VERIFIERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h new file mode 100644 index 00000000..c8160418 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h @@ -0,0 +1,59 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TPU_EMBEDDING_OPS_REGISTRY_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TPU_EMBEDDING_OPS_REGISTRY_H_ + +#include "llvm/ADT/DenseSet.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// A global ops registry that is used to hold TPU embedding ops. +// +// Example: +// TPUEmbeddingOpsRegistry::Global().Add(); +// for (auto op_type_id : TPUEmbeddingOpsRegistry::Global().GetOpsTypeIds()) +// { +// ... +// } +class TPUEmbeddingOpsRegistry { + public: + // Add the op to the registry. + // + // Adding an op here will allow use old bridge legalization from the MLIR + // bridge with the use of fallback mechanism. Therefore, addition of any op + // here must have a python test with MLIR bridge enabled to verify that the + // fallback works correctly. + template + void Add() { + ops_type_ids_.insert(TypeID::get()); + } + + // Returns the type id of the ops in the TPUEmbeddingOpRegistry. + const llvm::SmallDenseSet& GetOpsTypeIds(); + + // Returns the global registry. + static TPUEmbeddingOpsRegistry& Global(); + + private: + llvm::SmallDenseSet ops_type_ids_{}; +}; +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TPU_EMBEDDING_OPS_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h new file mode 100644 index 00000000..81af0f63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BRIDGE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BRIDGE_H_ + +#include + +#include "absl/base/attributes.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace mlir { +namespace TF { + +inline constexpr char kStandardPipelineBefore[] = "standard_pipeline_before"; +inline constexpr char kStandardPipelineAfter[] = "standard_pipeline_after"; + +// Runs all passes involved in transforming or optimizing an MLIR graph without +// any target specialization. When enable_logging is true, enables +// tensorflow::BridgeLogger. When enable_inliner is true, enables the inliner +// pass. +ABSL_DEPRECATED( + "This is legacy code and is unsupported. Use at your own risk. Use " + "tf2xla/api/v2/* for specific functionality") +absl::Status RunBridgeWithStandardPipeline(ModuleOp module, bool enable_logging, + bool enable_inliner); +} // namespace TF + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BRIDGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.h new file mode 100644 index 00000000..e3c0ee5c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.h @@ -0,0 +1,294 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CLUSTER_OPS_BY_POLICY_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CLUSTER_OPS_BY_POLICY_H_ + +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" + +namespace mlir { +namespace TFDevice { + +// -------------------------------------------------------------------------- // +// ValueConstraint. +// -------------------------------------------------------------------------- // + +// In order to be clustered operation can require its operands to satisfy +// some constraints (e.g. reduction operation can require reduction dimension +// operand to be a constant value). 
+enum class ValueConstraint { + // Operand must have statically known rank. + kRank = 0, + // Operand must have statically known shape (all dimensions are known at + // compile time). + kShape = 1, + // Operand must have statically known value (operand must be defined by a + // constant operation). + kValue = 2, +}; + +// Returns the more restrictive constraint of `a` and `b`: +// +// Value >> Shape >> Rank +// +// If you know the value, you always know the shape and the rank. If you know +// the shape, you always know the rank. +ValueConstraint Merge(ValueConstraint a, ValueConstraint b); + +// Returns success if constraint can be resolved statically based on the value +// type, e.g. `shape` constraint can be resolved if the value is a tensor of +// statically known shape. +LogicalResult IsStaticallyResolved(Value value, ValueConstraint constraint); + +raw_ostream& operator<<(raw_ostream& os, const ValueConstraint& constraint); + +// -------------------------------------------------------------------------- // +// ValuesConstraintSet. +// -------------------------------------------------------------------------- // + +// A set of constraints for values, that either operation results or operands. +class ValuesConstraintSet { + using ConstraintsMap = llvm::SmallDenseMap; + using ConstIterator = typename ConstraintsMap::const_iterator; + + public: + ValuesConstraintSet() = default; + + // Inserts a new constraint for the `value`. If the `value` already has some + // constraint, it will merge it with a new one, and will return a new + // constraint value. Returned pair has a constraint value that was set for + // a value, and a boolean flag that is true if the constraint was updated. + std::pair Insert(Value value, + ValueConstraint constraint); + + // Inserts constraints for multiple values. + void Insert(ValueRange value, ValueConstraint constraint); + + // Walk all the constraints owned by this set. + void Walk(llvm::function_ref walk) const; + + // Returns the constraint of the value if it exists, or None otherwise. + std::optional GetConstraint(Value value) const; + bool HasConstraint(Value value) const; + + // Merges all constrains from the other constraints set into this one. + void MergeAll(const ValuesConstraintSet& other); + + // Remove constraints that can be statically resolved from the type of the + // constrained value (see `IsStaticallyResolved` defined above). + ValuesConstraintSet& Resolve(); + + // Reset all constraints. + ValuesConstraintSet& Reset(); + + // Return the number of constrained values in the set. + size_t Size() const; + + // Returns true if the constraint set is empty. + bool Empty() const; + + ConstIterator begin() const { return constraints_.begin(); } + ConstIterator end() const { return constraints_.end(); } + + private: + llvm::SmallDenseMap constraints_; +}; + +// -------------------------------------------------------------------------- // +// ClusteringPolicy. +// -------------------------------------------------------------------------- // + +// Clustering policy specifies if the operation can be clustered (in practice it +// usually means that operation can be added to a cluster that will be later +// compiled) given the set of constraints on its results, and might propagate or +// create new constraints on the operation operands. +// +// Clustering policy must make a local decision just for a single operation. It +// is the responsibility of a clustering pass to combine all these individual +// operations constraints to form a valid cluster. 
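A small sketch of the constraint-set semantics described above; the function is hypothetical and only illustrates that `Insert` merges toward the more restrictive constraint (value >> shape >> rank).

#include <cassert>

#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.h"

void RequireConstantReductionIndices(
    mlir::Operation *op, mlir::TFDevice::ValuesConstraintSet &operands) {
  // Operand 1 is assumed to hold reduction indices; they must be constant.
  operands.Insert(op->getOperand(1), mlir::TFDevice::ValueConstraint::kValue);
  // A weaker request on the same value does not downgrade the constraint.
  auto result = operands.Insert(op->getOperand(1),
                                mlir::TFDevice::ValueConstraint::kShape);
  assert(result.first == mlir::TFDevice::ValueConstraint::kValue &&
         !result.second);
  (void)result;
}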
+// +// Example: compilation using XLA (MHLO) lowering +// +// %0 = "tf.Transpose"(%input, %perm) +// : (tensor, tensor<2xi32>) -> tensor +// +// XLAs `mhlo.transpose` operation requires permutation to be an attribute +// (compile time value), so it means that if we want to put `tf.Transpose` +// into a cluster that will be compiled with XLA, the `%perm` operand must +// be a known compiled time value, e.g. result of a `tf.Const` operation. +// +class ClusteringPolicy { + public: + virtual ~ClusteringPolicy() = default; + + // Returns success if an operation can be clustered given the constraints on + // the operation results. Updates operands constraits to satisfy all the + // results constraints. + virtual LogicalResult MatchAndUpdateConstraints( + Operation* operation, const ValuesConstraintSet& results, + ValuesConstraintSet& operands) const = 0; +}; + +// Clustering policy for a specific operation type. +template +class OpClusteringPolicy : public ClusteringPolicy { + public: + LogicalResult MatchAndUpdateConstraints( + Operation* operation, const ValuesConstraintSet& results, + ValuesConstraintSet& operands) const final { + if (auto op = dyn_cast(operation)) + return MatchAndUpdateConstraints(op, results, operands); + return failure(); + } + + virtual LogicalResult MatchAndUpdateConstraints( + OpTy op, const ValuesConstraintSet& results, + ValuesConstraintSet& operands) const = 0; +}; + +// -------------------------------------------------------------------------- // +// ClusteringPolicySet. +// -------------------------------------------------------------------------- // + +// A set of clustering policies for different operations. +class ClusteringPolicySet { + public: + using Policies = std::vector>; + + const Policies& policies() const { return policies_; } + + // Add an instance of each of the policy types 'Ts'. Return a reference to + // `this` for chaining insertions. + template + ClusteringPolicySet& Add() { + (void)std::initializer_list{0, (AddImpl(), 0)...}; + return *this; + } + + // ClusteringPolicySet is move only type. + ClusteringPolicySet() = default; + ClusteringPolicySet(const ClusteringPolicySet&) = delete; + ClusteringPolicySet(ClusteringPolicySet&&) = default; + ClusteringPolicySet& operator=(const ClusteringPolicySet&) = delete; + ClusteringPolicySet& operator=(ClusteringPolicySet&&) = default; + + private: + template + void AddImpl(Args&&... args) { + static_assert(std::is_base_of::value, + "T must implement ClusteringPolicy"); + policies_.emplace_back(std::make_unique(std::forward(args)...)); + } + + std::vector> policies_; +}; + +// -------------------------------------------------------------------------- // +// Discovering clusters of operations based on the policy. +// -------------------------------------------------------------------------- // + +// Cluster groups together operations in the single basic block based on the +// given clustering policy set. Clusters can be outlined into nested modules +// later device specific compilation (e.g. for TFRT JIT compiler). +struct Cluster { + llvm::SmallVector operations; + ValuesConstraintSet constraints; +}; + +// Returns clusters of operations in the given `block` based on the provided +// clustering policy. If `filter` is defined, it will be used to filter +// operations that can be considered for clustering based on the policy. +// +// TODO(ezhulenev): Additional filter function is a workaround for customizing +// clustering policies at runtime for experimentation. 
In the long term, +// clustering policy should be enough. +llvm::SmallVector FindClustersInTheBlock( + Block* block, const ClusteringPolicySet& policies, + std::function filter = {}); + +// Creates a `tf_device.cluster` operation from the clustered operations. +tf_device::ClusterOp CreateClusterOp(Cluster& cluster, StringAttr policy = {}); + +// -------------------------------------------------------------------------- // +// Helper functions for value constraints propagations and analysis. +// -------------------------------------------------------------------------- // + +// Propagates initial constraints on the values defined by the `constraints` set +// with operations in the `root` as a starting point, using user provided set of +// clustering policies. +// +// Filter predicate specifies if constraints should be propagated across the +// given operation. Operations in the root set will be also filtered using +// the `filter` predicate. +// +// Optionally resolve constraints that can be statically satisfied by the +// value type, and stop constraints propagation early. +// +// Optionally emits remarks attached to operation that failed to propagate +// results constraints to its operands (for testing purpose). +// +// Returns failure if constraints can't be propagated through some of the +// operations accepted by the filter (there is no clustering policy for an +// operation, or constraints can't be satisfied by the policy), and attaches +// error diagnostics to the operation that prevented constraints propagation. +mlir::LogicalResult PropagateValuesConstraints( + llvm::ArrayRef root, std::function filter, + const ClusteringPolicySet& policies, ValuesConstraintSet& constraints, + bool resolve = false, bool emit_remarks = false); + +// Propagates initial constraints on the values in the `region` to the other +// values in the same region, using user provided set of clustering policies. +mlir::LogicalResult PropagateValuesConstraints( + mlir::Region& region, const ClusteringPolicySet& policies, + ValuesConstraintSet& constraints, bool resolve = false, + bool emit_remarks = false); + +// Emits constraints remarks for all operations that use constrained values. +void EmitValueConstraintsRemarks(const ValuesConstraintSet& constraints); + +// Emits constraints remarks for function inputs that are in the constraints +// set (entry block arguments have constraints). +void EmitInputsConstraintsRemarks(func::FuncOp func, + const ValuesConstraintSet& constraints); + +// Infers constraints for the values in the function body from the function +// results attributes. +// +// Example: +// func @test(...) -> (tensor {tf.constraint = "shape"}) { +// ..... +// %v = "some_operation"() : () -> tensor +// return %v : tensor +// } +LogicalResult InferFunctionBodyValuesConstraints( + func::FuncOp func, ValuesConstraintSet& constraints); + +} // namespace TFDevice +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CLUSTER_OPS_BY_POLICY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h new file mode 100644 index 00000000..e43a1ec4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h @@ -0,0 +1,109 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { +namespace collection_ops_util { + +// This file includes utilities for decomposing collection ops (stack, tensor +// list, tensor array) in TF. We represent such a data structure as a buffer of +// shape [max_element_count, element_shape]. + +// Creates an i32 scalar tf.Const. +Value CreateScalarConst(int32_t value, OpBuilder builder, Location loc); + +// Creates an integer vector tf.Const. +Value GetR1Const(ArrayRef r1, OpBuilder builder, Location loc, + int bitwidth = 32); + +// Returns the type of the size tensor used to track a data structure's element +// count. It is a tensor<1xi32>, and we use R1 instead of a scalar because it is +// easier to concat it with other offsets. +TensorType GetSizeType(OpBuilder builder); + +// Reshapes a scalar value to match the size type tensor. +Value ReshapeScalarToSizeType(OpBuilder builder, Value scalar, Location loc); + +// Creates ops that represent the indices of the slice for an element in the +// buffer. Requires `index` to have tensor<1xi32> type. +Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, + Location loc); + +// Creates ops that slice the element out of a buffer at the given index. +// Requires `index` to have tensor<1xi32> type. +Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc, + bool keep_slice_shape = false); + +// Creates ops that copy the buffer and update an element at the given index. +// Requires `index` to have tensor<1xi32> type. +Value SetElement(Value index, Value buffer, Value element, OpBuilder builder, + Location loc); + +// Creates the buffer for the data structure with given element shape, type and +// maximum size. +LogicalResult CreateInitBufferValue(ArrayRef element_shape, + int64_t max_size, Operation* op, + Type element_dtype, OpBuilder builder, + Value* buffer); + +// Same as above, but uses a Value as max_size and check if it is a constant. +LogicalResult CreateInitBufferValue(ArrayRef element_shape, + Value max_size, Operation* op, + Type element_dtype, OpBuilder builder, + Value* buffer); + +// Tries to infer the element type with full shape based its write accesses. +// `infer_from_user` should check if the provided op is an accessing op that +// could be used to infer the type. 
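The declaration documented by the preceding comment follows right after this note; its callback parameter is named infer_from_op even though the comment says infer_from_user. Given the buffer layout described at the top of this header, [max_element_count, element_shape] plus a tensor<1xi32> size, a stack push decomposes into SetElement at the current size followed by a size increment. A minimal sketch, assuming GetR1Const takes an ArrayRef of 64-bit integers (the element type is elided in this hunk) and that TF::AddV2Op exposes the usual generated builder; PushToBuffer itself is hypothetical.

#include <utility>

#include "mlir/IR/Builders.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h"

namespace util = mlir::TF::collection_ops_util;

// Writes `element` into `buffer` at index `size` (the current element count)
// and returns the updated buffer together with the incremented size tensor.
std::pair<mlir::Value, mlir::Value> PushToBuffer(mlir::Value buffer,
                                                 mlir::Value size,
                                                 mlir::Value element,
                                                 mlir::OpBuilder& builder,
                                                 mlir::Location loc) {
  mlir::Value new_buffer =
      util::SetElement(size, buffer, element, builder, loc);
  mlir::Value one = util::GetR1Const({1}, builder, loc);
  mlir::Value new_size =
      builder.create<mlir::TF::AddV2Op>(loc, size.getType(), size, one);
  return {new_buffer, new_size};
}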
+std::optional GetElementTypeFromAccess( + Value collection, ModuleOp module, + llvm::function_ref(Operation*)> infer_from_op); + +// Creates a ReadVariableOp on a local variable. +Value ReadLocalVariable(Value local_var, OpBuilder builder, Location loc); + +// Creates an AssignVariableOp on a local variable. +TF::AssignVariableOp WriteLocalVariable(Value local_var, Value value, + OpBuilder builder, Location loc); + +// Adds two values, or creates a logical-or if they are boolean type. +Value AccumulateBuffers(Value a, Value b, OpBuilder builder, Location loc); + +// Gathers elements in buffer with the indices. +Value GatherElements(Value indices, Value buffer, OpBuilder builder, + Location loc); + +// Scatters elements into buffer, where each scattered element is accumulated +// with the old value in buffer. +Value ScatterAccumulateElements(Value indices, Value updates, Value buffer, + OpBuilder builder, Location loc); + +} // namespace collection_ops_util +} // namespace TF +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h new file mode 100644 index 00000000..887eea74 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +LogicalResult ConstantFoldFallbackHook( + Operation *inst, ArrayRef operands, + SmallVectorImpl &results); // NOLINT + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h new file mode 100644 index 00000000..636dde98 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_UTILS_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Checks whether the given TF operation can be folded or not. +bool CanBeFolded(Operation* inst); + +// Evaluates the operation with given operand values. +LogicalResult EvaluateOperation(Operation* inst, + llvm::ArrayRef operands, + llvm::SmallVector& results); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.h new file mode 100644 index 00000000..c8f0c84b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.h @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_DECOMPOSE_RESOURCE_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_DECOMPOSE_RESOURCE_OPS_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Populates rewrite patterns that decompose composite resource operations into +// primitive ones like ReadVariableOp, AssignVariableOp and other computations +// to facilitate transformations like resource op lifting. +// NOTE: These patterns do not support `use_locking=true` for a lot of resource +// operations. So decomposition may not be correct outside of backends like XLA, +// which automatically locks all resource variables. 
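The populate function documented above is declared immediately below. Populate-style entry points like this one are normally consumed from inside a pass by handing the patterns to MLIR's greedy rewrite driver. A minimal sketch, assuming the applyPatternsAndFoldGreedily driver available in this MLIR revision; the wrapper function is hypothetical.

#include <utility>

#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.h"

// Decomposes composite resource ops nested under `op` by applying the
// populated patterns until a fixed point is reached.
mlir::LogicalResult DecomposeResourceOpsIn(mlir::Operation* op) {
  mlir::RewritePatternSet patterns(op->getContext());
  mlir::TF::PopulateDecomposeResourceOpsPatterns(op->getContext(), &patterns);
  return mlir::applyPatternsAndFoldGreedily(op, std::move(patterns));
}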
+void PopulateDecomposeResourceOpsPatterns(MLIRContext *context, + RewritePatternSet *patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_DECOMPOSE_RESOURCE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h new file mode 100644 index 00000000..65e05280 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This pass identifies patterns for certain Einsum Ops and replaces them +// with other equivalent TF Ops. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +// TF.Einsum provides fully general tensor contractions. For a few select +// cases, we can convert this op to other TF Ops, which in later passes +// properly convert to TF Lite ops. +struct ConvertTFEinsumOp : public OpRewritePattern { + public: + explicit ConvertTFEinsumOp(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(TF::EinsumOp op, + PatternRewriter& rewriter) const override; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h new file mode 100644 index 00000000..0de93ca4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ + +#include + +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" + +namespace mlir { +namespace TF { + +// Bundle generic MLIR graph optimization passes (some derived from TF Grappler +// graph optimizers) into a single MLIR optimization pass. +class MlirGraphOptimizationPass : public ::tensorflow::MlirOptimizationPass { + public: + llvm::StringRef name() const override { return "graph_optimization"; } + + ::tensorflow::MlirOptimizationPassState GetPassState( + const ::tensorflow::DeviceSet* device_set, + const ::tensorflow::ConfigProto& config_proto, + const tensorflow::Graph& graph, + const tensorflow::FunctionLibraryDefinition& function_library) + const override { + return config_proto.experimental().enable_mlir_graph_optimization() + ? tensorflow::MlirOptimizationPassState::Enabled + : tensorflow::MlirOptimizationPassState::Disabled; + } + + absl::Status Run( + const std::string& function_name, + const ::tensorflow::ConfigProto& config_proto, ModuleOp module, + const ::tensorflow::Graph& graph, + const tensorflow::FunctionLibraryDefinition& function_library) override; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.h new file mode 100644 index 00000000..5fe8ab12 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.h @@ -0,0 +1,35 @@ +/* Copyright 2022 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GROUP_BY_DIALECT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GROUP_BY_DIALECT_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Create a pass that groups ops into functions that only contain one dialect. +std::unique_ptr CreateGroupByDialectPass(); + +// Register this pass in the global registry of MLIR. 
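The registration function documented by the preceding comment is declared immediately below. Returning to the GetPassState gate in graph_optimization_pass.h above: the pass only reports Enabled when the session's ConfigProto sets the experimental flag it reads. A minimal sketch of that configuration; the helper name is hypothetical.

#include "tensorflow/core/protobuf/config.pb.h"

// Builds a ConfigProto with the experimental flag that
// MlirGraphOptimizationPass::GetPassState checks.
tensorflow::ConfigProto MakeConfigWithMlirGraphOptimization() {
  tensorflow::ConfigProto config;
  config.mutable_experimental()->set_enable_mlir_graph_optimization(true);
  return config;
}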
+void RegisterGroupByDialectPass(); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GROUP_BY_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h new file mode 100644 index 00000000..3640f53a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_LOWER_CLUSTER_TO_RUNTIME_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_LOWER_CLUSTER_TO_RUNTIME_OPS_H_ + +#include "absl/base/attributes.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "xla/tsl/framework/device_type.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tfrt_compiler { + +// Given a MLIR module with tf_device.cluster ops, insert specific Runtime ops +// such as TPUExecute or XlaExecute depending on the device type and specific +// host runtime. Also does some optimization. Will return an error if it fails. +// The output Runtime ops depends on both Device Type and Runtime Host. +// +// Input: +// Tensorflow Dialect MLIR with tf_device.cluster ops and virtual devices. +// xla_device_type - The device type that is being targeted. +// Output: +// Tensorflow Dialect MLIR with Runtime specific ops. All tf_device.cluster +// ops are removed. Physical devices are assigned to ops instead of virtual +// devices. +absl::Status RunLowerClusterToRuntimeOpsPassPipeline( + mlir::ModuleOp module, tsl::DeviceType xla_device_type, + llvm::StringRef module_name = llvm::StringRef()); + +// The same API as RunLowerClusterToRuntimeOpsPassPipeline but as an MLIR pass +// pipeline. +void RegisterTPULowerClusterToRuntimeOpsPassPipeline(); +void RegisterNonTPULowerClusterToRuntimeOpsPassPipeline(); + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_LOWER_CLUSTER_TO_RUNTIME_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h new file mode 100644 index 00000000..7012d6a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_RUNTIME_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_RUNTIME_PASSES_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TFTPU { + +// Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime +// ops. +std::unique_ptr> CreateTPURewritePass( + llvm::StringRef module_name = llvm::StringRef()); + +// Creates a pass that adds ops which perform formatting on variables at +// run-time according to compilation result. +std::unique_ptr> +CreateTPUVariableRuntimeReformattingPass(); + +// Creates a pass that merges device variable reads/updates into the surrounded +// TPUExecute node. This allows the execute node to perform in-place variable +// updates. +std::unique_ptr> +CreateTPUMergeVariablesWithExecutePass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_TPUMERGEVARIABLESWITHEXECUTEPASS +#define GEN_PASS_DECL_TPUREWRITEPASS +#define GEN_PASS_DECL_TPUVARIABLERUNTIMEREFORMATTINGPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h.inc" + +} // namespace TFTPU +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_RUNTIME_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h new file mode 100644 index 00000000..b58401eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ + +#include + +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace mlir { +namespace TFTPU { + +// Populates a TPUCompileMetadataProto from attributes of a +// `tf_device::ClusterFuncOp`. If any necessary attributes are missing from the +// op, a failure will be returned. +// TODO(lyandy): Support session handle and guaranteed consts. +LogicalResult SetMetadataProtoFromClusterFuncOp( + tf_device::ClusterFuncOp op, int num_replicas, int num_cores_per_replica, + std::optional&& xla_device_assignment, + tensorflow::tpu::TPUCompileMetadataProto* metadata); +} // namespace TFTPU +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.h new file mode 100644 index 00000000..623e5f4e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_INITIALIZE_VARIABLES_IN_SESSION_INIT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_INITIALIZE_VARIABLES_IN_SESSION_INIT_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Initializes all variables in Session Init function for all variables in +// 'session'. 
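The session-initializer function documented by the preceding comment is declared immediately below. Looking back at tpu_metadata_utils.h above, this is a typical call shape for SetMetadataProtoFromClusterFuncOp, with illustrative single-replica, single-core arguments and no explicit device assignment; the element type of the xla_device_assignment optional is elided in this hunk, but std::nullopt is valid regardless. The wrapper is hypothetical.

#include <optional>

#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h"

// Fills `metadata` from the attributes of a single-replica, single-core
// cluster function; fails if required attributes are missing on the op.
mlir::LogicalResult BuildCompileMetadata(
    mlir::tf_device::ClusterFuncOp cluster_func,
    tensorflow::tpu::TPUCompileMetadataProto* metadata) {
  return mlir::TFTPU::SetMetadataProtoFromClusterFuncOp(
      cluster_func, /*num_replicas=*/1, /*num_cores_per_replica=*/1,
      /*xla_device_assignment=*/std::nullopt, metadata);
}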
+LogicalResult InitializeVariablesInSessionInitializer( + ModuleOp module, tensorflow::Session *session); + +} // namespace tf_saved_model + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_INITIALIZE_VARIABLES_IN_SESSION_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h new file mode 100644 index 00000000..a0a218f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Creates GlobalTensorOp for each variable from function arguments and converts +// them to the corresponding saved model arguments. +LogicalResult LiftVariables(ModuleOp module, ::tensorflow::Session* session, + bool import_variables_as_dense_resources = false); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.h new file mode 100644 index 00000000..e66bf22f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.h @@ -0,0 +1,35 @@ +/* Copyright 2022 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_GLOBALS_TO_ML_PROGRAM_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_GLOBALS_TO_ML_PROGRAM_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tf_saved_model { + +// Create a pass that removes function arguments that map to global tensors. +std::unique_ptr CreateLowerGlobalsToMlProgramPass(); + +// Register this pass in the global registry of MLIR. +void RegisterLowerGlobalsToMlProgramPass(); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_GLOBALS_TO_ML_PROGRAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h new file mode 100644 index 00000000..b8e26302 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h @@ -0,0 +1,52 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Populates TensorFlow lowering patterns to lower some of the TensorFlow +// operations that can be represented using other TensorFlow operations. +// TODO(laurenzo): For some reason, TFLite uses this pass and has exact +// requirements on what it can do. This is fragile and should be fixed (at a +// minimum, names should clearly convey scope). In the mean time, for a real +// compiler, use PopulateTFLoweringBeforeHLOPatterns. +void PopulateLoweringTFPatterns(MLIRContext *context, + RewritePatternSet *patterns); + +// Populates TensorFlow lowering patterns to lower some of the TensorFlow +// operations that can be represented by means of other TensorFlow operations. +// This pattern collection preserves those TensorFlow operations that will later +// be lowered to equivalent operations in CHLO or MHLO. This allows for +// HLO-specific lowerings. +void PopulateTFLoweringBeforeHLOPatterns(MLIRContext *context, + RewritePatternSet *patterns); + +// Populates TensorFlow lowering patterns to lower some of the TensorFlow +// operations that can be represented using other TensorFlow operations. +// Patterns are from ops with some inputs or outputs that are quantized types +// only to ops that allow non-quantized types on all inputs and outputs. 
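The quantized-pattern populate function documented above is declared immediately below. Referring back to lower_globals_to_ml_program.h earlier in this hunk: pass factories such as CreateLowerGlobalsToMlProgramPass are consumed by scheduling them on a pass manager. A minimal single-pass sketch; the wrapper is hypothetical.

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.h"

// Runs the saved-model global-tensor lowering over `module` as a one-pass
// pipeline anchored on the module.
mlir::LogicalResult LowerGlobals(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::tf_saved_model::CreateLowerGlobalsToMlProgramPass());
  return pm.run(module);
}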
+void PopulateLoweringQuantizedPatterns(MLIRContext *context, + RewritePatternSet *patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.h new file mode 100644 index 00000000..4a18c096 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MARK_INITIALIZED_VARIABLES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MARK_INITIALIZED_VARIABLES_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { +// Marks all variables in 'function' whether they are initialized +// in 'session' or not by setting an attribute named 'is_initialized' +// on each variable op with value true/false based on variable is initialized +// in the session or not. +// If 'session' is NULL the function is no-op. +// Returns failure in case fetching variables from session failed, success +// otherwise. +LogicalResult MarkInitializedVariablesInFunction(func::FuncOp function, + tensorflow::Session* session); +// Apply `MarkInitializedVariablesInFunction` to every non-empty function in the +// module. +LogicalResult MarkInitializedVariablesInFunction(ModuleOp module, + tensorflow::Session* session); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MARK_INITIALIZED_VARIABLES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.h new file mode 100644 index 00000000..61cbf4d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MLPROGRAM_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MLPROGRAM_H_ + +#include "llvm/ADT/STLFunctionalExtras.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project + +namespace tensorflow { + +void PopulateLowerToMlProgramAndHloPipeline(mlir::OpPassManager& pm); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_MLPROGRAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/order_by_dialect.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/order_by_dialect.h new file mode 100644 index 00000000..0268a89a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/order_by_dialect.h @@ -0,0 +1,36 @@ +/* Copyright 2022 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ORDER_BY_DIALECT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ORDER_BY_DIALECT_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Create an instance of a pass that reorders ops so ops of the same dialect are +// next to each other. +std::unique_ptr CreateOrderByDialectPass(); + +// Register this pass in the global registry of MLIR. +void RegisterOrderByDialectPass(); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ORDER_BY_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/passes.h new file mode 100644 index 00000000..54dad08e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -0,0 +1,715 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" + +namespace mlir { + +// Creates a pass that breaks up an island with multiple ops into multiple +// islands, each with a single op. +std::unique_ptr> CreateBreakUpIslandsPass(); + +// Creates a pass that converts mlir functions consisting of mlir ops into a +// tf_executor dialect as a single island. +std::unique_ptr> +CreateFunctionalToExecutorDialectConversionPass(); + +// Creates a pass that lifts inner ops of tf_executor.island ops in +// tf_executor.graph into the same block as the tf_executor.graph. +std::unique_ptr> +CreateExecutorDialectToFunctionalConversionPass(); + +namespace TF { +// Creates a pass that canonicalizes legacy compilation and replication +// attributes. +std::unique_ptr> +CreateCanonicalizeCompileAndReplicateAttributesPass(); + +// Creates a pass that drops `shape_invariant` attribute from While/WhileRegion +// ops. +std::unique_ptr> +CreateDropWhileShapeInvariantPass(); + +// Creates a pass that drops `shape_invariant` attribute from While/WhileRegion +// ops within device cluster. +std::unique_ptr> +CreateDropWhileShapeInvariantInDeviceClusterPass(); + +// Creates a pass that moves writes to replicate invariant resource variables +// outside tf_device.replicate op. +std::unique_ptr> +CreateHoistReplicateInvariantResourceWritesPass(); + +// Transforms functional control flow operations in the TensorFlow dialect to +// MLIR Control Flow Graph (CFG) form. +std::unique_ptr> +CreateTFFunctionalControlFlowToCFG(); + +// Transforms functional control flow operations in the TensorFlow dialect to +// their region based counterparts. +std::unique_ptr> +CreateTFFunctionalControlFlowToRegions(); +std::unique_ptr> CreateTFFunctionalControlFlowToRegions( + bool allow_passthrough_args); + +// Transforms region bases control flow operations in the TensorFlow dialect to +// their functional counterparts. +std::unique_ptr> +CreateTFRegionControlFlowToFunctional(); + +// Materialize the MlirPassthroughOp by replacing it with the MLIR module +// attached as an attribute. +std::unique_ptr> +CreateMaterializePassthroughOpPass(); + +// Replicates the TensorList init op by undoing some CSE needed for correct +// shape assignment in shape_inference. +std::unique_ptr> +CreateReplicateTensorListInitOpsPass(); + +// Performs Shape Inference on the TensorFlow dialect using the global registry. +std::unique_ptr> CreateTFShapeInferencePass( + ArrayRef> input_shapes = {}); + +// Performs TF.data optimizations. 
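The tf.data optimization factory documented above is declared immediately below. The factories in this header are meant to be composed into pipelines rather than run in isolation; a hedged sketch of one such composition follows. The pass selection, ordering, and module-versus-function nesting are assumptions, since the unique_ptr template arguments are elided in this hunk, and the helper is hypothetical.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

// Infers shapes at module scope, then converts each function to the
// tf_executor dialect and breaks multi-op islands apart.
void AddExecutorExportPasses(mlir::OpPassManager& pm) {
  pm.addPass(mlir::TF::CreateTFShapeInferencePass());
  pm.addNestedPass<mlir::func::FuncOp>(
      mlir::CreateFunctionalToExecutorDialectConversionPass());
  pm.addNestedPass<mlir::func::FuncOp>(mlir::CreateBreakUpIslandsPass());
}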
+std::unique_ptr> CreateTFDataOptimizationPass(); + +std::unique_ptr> CreateMoveTransposesPass(); +std::unique_ptr> CreateLayoutAssignmentPass(); + +// Guarantee that all FuncOp's have a single use. +std::unique_ptr> CreateGuaranteeAllFuncsOneUsePass(); + +// Optional pass which will unroll BatchMatMul and use only MatMul +std::unique_ptr> CreateUnrollBatchMatMulPassPass(); + +// Optional pass which will map TF BatchMatMul to TF Einsum +std::unique_ptr> CreateBatchMatMulToEinsumPass(); + +// Pass that transform Einsum to other TF Ops for the supported variants. +std::unique_ptr> CreateTransformEinsumPass(); + +// Optimizes Tensorflow graph. +std::unique_ptr> CreateTFOptimizePass(); +void RegisterTFOptimizePassPipeline(); + +// Creates pass to rewrite RecvTPUEmbeddingActivationsOp and +// SendTPUEmbeddingGradients ops to internal variants. +std::unique_ptr> CreateRewriteTPUEmbeddingOpsPass(); + +// Performs specific fusion for GPU targets. +std::unique_ptr> CreateGpuOpFusionPass(); + +// Creates a pass that decomposes to be compiled ReduceDataset ops into a while +// loop that iterates the dataset and calls the reduction function. +std::unique_ptr> CreateDecomposeReduceDatasetPass(); + +// Create a pass that convert ops that copy tensors between devices, e.g. +// tf.Identity. +std::unique_ptr> +CreateTensorDeviceCopyConversionPass(); + +// Returns a pass that folds tf.BroadcastTo nodes with subsequent nodes if they +// have built in broadcasting support. +std::unique_ptr> CreateBroadcastFoldPass(); + +void populateTfControlFlowToScfPatterns(MLIRContext* context, + RewritePatternSet* patterns); +// Create a pass to convert TensorFlow control flow to SCF. +std::unique_ptr> createConvertTfControlFlowToScfPass(); + +struct LayoutOptimizationPipelineOptions + : public PassPipelineOptions { + Option force_data_format{ + *this, "force-data-format", + llvm::cl::desc("Force data format for all layout sensitive ops")}; + Option skip_fold_transpose_in_ops{ + *this, "skip-fold-transpose-in-ops", + llvm::cl::desc("Skip folding transpose operands in Ops which can support " + "different layouts.")}; +}; + +// Layout optimization assigns optimal data layout for layout sensitive +// operations, and cancels all redundant transposes. +void CreateLayoutOptimizationPipeline( + OpPassManager& pm, // NOLINT - MLIR contract is pass by mutable reference. + const LayoutOptimizationPipelineOptions& options); + +struct StandardPipelineOptions + : public PassPipelineOptions { + Option enable_inliner{*this, "enable-inliner", + llvm::cl::desc("Enable inliner."), + llvm::cl::init(false)}; + Option form_clusters{*this, "form-clusters", + llvm::cl::desc("Enable Cluster Formation pass."), + llvm::cl::init(false)}; +}; + +// Propagates the pass manager with the passes involved in transforming or +// optimizing an MLIR graph without any target specialization. +// NOLINTNEXTLINE - MLIR contract is pass by mutable reference. +void CreateTFStandardPipeline(OpPassManager& pm, + const StandardPipelineOptions& options); + +// Propagates device attributes of resources from callers to callees. +std::unique_ptr> CreateResourceDeviceInferencePass(); + +// Creates a pass that promotes resource reads/writes in `functions` to inputs +// and outputs of `functions`, assuming that resource operations have already +// been decomposed and function calls have already been inlined. If `functions` +// is empty, the pass is applied to the main function by default. 
The pass also +// annotates the input arguments for resources with the indices of their +// aliasing output arguments. +std::unique_ptr> CreatePromoteResourcesToArgsPass( + llvm::ArrayRef functions = {}); + +// Creates a pass that promotes tf.VarHandleOp to resource arguments for all +// functions. +std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); + +// Creates a pass that converts readonly reference variables to the +// corresponding resource variables. +std::unique_ptr> +CreateConvertReadonlyReferenceVariablesToResourceVariablesPass(); + +// Creates a simple device assignment pass on TF dialect for CoreRT use case. +std::unique_ptr> CreateSimpleTFDeviceAssignmentPass( + llvm::StringRef default_device = "cpu"); + +// Creates a pass to perform device assignment for TF dialect ops that do not +// have device assignment, by using the device attribute of the function. +std::unique_ptr> +CreateTFDeviceAssignmentByFuncAttrPass(); + +// Performs resource lifting on the function body to hoist resource variable +// accesses outside all control flow statements. +LogicalResult ResourceLiftingForFunctionalControlFlow(func::FuncOp function); + +// Converts stack ops into operations on local variables, which can later be +// removed by resource lifting. Requires known maximum sizes of stacks and +// known element shapes of push ops. +std::unique_ptr> CreateStackOpsDecompositionPass(); + +// Creates a pass to strip the "tf._noinline" attribute from the functions in +// the module. +std::unique_ptr> CreateStripNoinlineAttributePass(); + +// Converts tensor list operations into operations on buffers and sizes. Needs +// static shapes and known max element count. +std::unique_ptr> CreateTensorListOpsDecompositionPass(); + +// Converts tensor array ops into operations on local variables, which can later +// be removed by resource lifting. Requires known sizes and known element shapes +// (either defined in TensorArrayV3 or implied in the first write). +std::unique_ptr> +CreateTensorArrayOpsDecompositionPass(); + +// Create a pass that legalize TFG to TF dialect. +std::unique_ptr CreateLegalizeTFGToTFEPass(); + +// Matches sequence of ops to TensorFlow fused kernels. This pass should not be +// generally used beyond exporting to runtimes that supports these ops. In the +// future these fusions may be codegen'd automatically. +std::unique_ptr> CreateFusedKernelMatcherPass(); + +// Creates function pass to select device index/fold tf.DeviceIndex. +std::unique_ptr> CreateDeviceIndexSelectorPass(); + +// Creates function pass to replace InitializeTableFromTextFileV2Ops with +// LookupTableImportV2Op ops. +std::unique_ptr> CreateInitTextFileToImportPass( + std::string saved_model_dir = ""); + +// Creates function pass to cluster TensorFlow ops by host. The program +// generated by this pass will have one function per host where all operations +// in the same function are placed on the same host. Each result of the per-host +// function will have a "tf.device" attribute which specifies the device +// assignment of the result. +std::unique_ptr> CreateClusterTFOpsByHostPass(); + +// Creates a pass to insert tf_device.send and tf_device.receive ops to make +// sure any argument of any op is on the same host of the op itself. +std::unique_ptr> CreateCrossHostTransferPass(); + +// Creates a pass that adds the device attribute to every tf.Const op based on +// the device attribute of the operations that read its result. 
If the result of +// a tf.Const op is read by operations placed on multiple devices, then the pass +// will replicate the tf.Const op once for each device. +std::unique_ptr> CreateConstantOpDeviceAssignmentPass(); + +// Returns pass that verifies whether all functions in module are of single +// tf_executor.graph and each tf_executor.island in tf_executor.graph only has a +// single op. +std::unique_ptr> CreateVerifySuitableForExportPass(); + +// Returns pass that prepares TPU computation to be legal for export to +// TensorFlow. +std::unique_ptr> +CreatePrepareTpuComputationForTfExportPass(); + +// Rewrites ops that require quantized inputs or outputs to ops that allow +// non-quantized inputs and outputs. +std::unique_ptr> CreateLowerQuantizedPass(); + +// Reorders ops so ops of the same dialect are next to each other. +std::unique_ptr CreateOrderByDialectPass(); + +// Groups ops into functions that only contain one dialect. +std::unique_ptr CreateGroupByDialectPass(); + +// Removes unused parameters from functions & their callers. +std::unique_ptr> CreateRemoveUnusedArgumentsPass(); + +// Removes unused results from WhileRegion ops. +std::unique_ptr> +CreateRemoveUnusedWhileResultsPass(); + +// Hoists loop invariant ops to the outside of the loop. +std::unique_ptr> CreateHoistLoopInvariantPass(); + +// Creates VarHandleOps right next to the operations that use them. +std::unique_ptr> CreateLocalizeVarHandlesPass(); + +// Removes all TF attributes +std::unique_ptr> CreateStripTfAttributesPass(); + +// Converts AnonymousIteratorOps to (named) IteratorOps. +std::unique_ptr> CreateNameAnonymousIteratorsPass(); + +// Creates a pass that breaks up an island with multiple ops into multiple +// islands, each with a single op. This pass intentionally does not propagate +// control dependencies across newly created islands and is handled by +// CreateTFExecutorUpdateControlDependenciesPass. +std::unique_ptr> CreateSplitIntoIslandPerOpPass(); + +// Prints, but otherwise pipes through without changes, the current module. +std::unique_ptr> CreatePrintPass( + raw_ostream* os = nullptr); + +// Moves TPUCompileMlir ops as far to the front as possible. +std::unique_ptr> CreateMoveTpuCompileToFrontPass(); + +// Decomposes OptionalFromValue, OptionalGetValue, OptionalNone, +// and OptionalHasValue +std::unique_ptr> CreateDecomposeOptionalsPass(); + +//===----------------------------------------------------------------------===// +// XlaCallModule +//===----------------------------------------------------------------------===// + +// Creates a pass that deserializes functions in the StableHLO modules from +// `tf.XlaCallModule` to the top-level module. +std::unique_ptr> +CreateXlaCallModuleDeserializationPass(); + +// Creates a pass that serializes StableHLO functions referenced by +// `tf.XlaCallModule` from the top-level module to `tf.XlaCallModule`'s +// `module` attribute. +std::unique_ptr> CreateXlaCallModuleSerializationPass(); + +} // namespace TF + +namespace tf_executor { + +// Creates a pass to chain control outputs of while loop body. +std::unique_ptr> +CreateTFExecutorConvertControlToDataOutputsPass(); +std::unique_ptr> +CreateTFExecutorConvertControlToDataOutputsPass( + bool composite_tpuexecute_side_effects); + +std::unique_ptr> +CreateTFExecutorCheckControlDependenciesPass(); + +// Creates a pass to merge IslandOps from TFExecutor dialect. +std::unique_ptr> +CreateTFExecutorIslandCoarseningPass(); + +// Creates a pass to merge IslandOps for operation marked for execution on TPU. 
+// This is for V1 backward compatibility. +std::unique_ptr> +CreateTFExecutorTPUV1IslandCoarseningPass(); + +// Creates a pass to outline TPU clusters from a single IslandOp into a nested +// module suitable for being processed as if it were a V2 module. +// This is for V1 backward compatibility. +std::unique_ptr> +CreateTFExecutorTPUV1IslandOutliningPass(); + +// Creates a pass to inline calls to the nested TPU module; this reverses the +// effect of the `TFExecutorTPUV1IslandOutlining` pass above. +// This is for V1 backward compatibility. +std::unique_ptr> +CreateTFExecutorTPUV1IslandInliningPass(); + +// Creates a pass to prune dead nodes from tf_executor.graph. +std::unique_ptr> CreateTFExecutorGraphPruningPass( + llvm::ArrayRef ops_to_preserve = {}); + +// Creates a pass to update control dependencies. +std::unique_ptr> +CreateTFExecutorUpdateControlDependenciesPass(); + +} // namespace tf_executor + +namespace TFDevice { +// Creates a pass that forms clusters from instructions that are assigned to the +// same device. +std::unique_ptr> CreateClusterFormationPass(); + +// Sinks `tf.Const` operations into the ClusterOp region that uses them. This is +// performed in order to limit the number of values implicitly captured in this +// region before outlining. +std::unique_ptr> CreateClusterConstantSinkingPass( + llvm::function_ref filter = {}); + +// Creates a pass that outlines regions of tf_device.cluster operations. +std::unique_ptr> CreateClusterOutliningPass(); + +// Creates a pass that outlines regions of tf_device.launch operations. +std::unique_ptr> CreateLaunchOutliningPass(); + +// Creates a pass that converts tf_device::LaunchFuncOp into +// TF::PartitionedCallOp. +std::unique_ptr> CreateConvertLaunchFuncToTFCallPass(); + +// A pass that decomposes composite resource operations into primitive ones like +// ReadVariableOp, AssignVariableOp and other computations to facilitate +// transformations like resource op lifting. +std::unique_ptr> CreateDecomposeResourceOpsPass(); + +// A pass that decomposes composite resource operations in a device cluster +// (tf_device.cluster op) into primitive ones like ReadVariableOp, +// AssignVariableOp and other computations to facilitate transformations like +// resource op lifting. +std::unique_ptr> +CreateDecomposeResourceOpsInClusterPass(); + +// Creates a pass that marks TPU cluster input-output pairs reading and writing +// to the same resource variable as aliases. +std::unique_ptr> CreateMarkInputOutputAliasesPass(); + +// Creates a pass that lifts operations on external resource variables out of +// device computation nested in `tf_device::LaunchOp`, so that resource +// variable load operations all come before the device computation while resource +// variable store operations all come after it. After this pass, the +// device computation no longer interacts with external resource variables. +std::unique_ptr> CreateResourceOpLiftingPass(); + +// Creates a pass that lifts operations from the main function. +std::unique_ptr> +CreateResourceOpLiftingForMainFunctionPass(); + +// Lifts resource operations out of tf_device.launch_func ops nested in `op`. +// Returns a failure if there are remaining resource-type values that +// cannot be lifted. +LogicalResult LiftResourceOps(Operation* op); + +// Creates a pass that hoists invariant operations out of a `tf_device.replicate`. +std::unique_ptr> +CreateReplicateInvariantOpHoistingPass(); + +// Creates a pass that forms replica `tf_executor.island` ops from a single +// `tf_device.replicate` island.
+std::unique_ptr> CreateReplicateToIslandPass( + bool legacy_graph_export = true); + +// Creates a pass that sets the device ordinal attribute of the required op +// using the replica id attribute. +std::unique_ptr> +CreateReplicaIDToDeviceOrdinalPass(); + +// Creates a pass that creates `tf_executor.island` from a single +// `tf_device.parallel_execute` island. +std::unique_ptr> CreateParallelExecuteToIslandsPass( + bool legacy_graph_export = true); + +// Creates a pass that annotates whether a LaunchFuncOp's parameters have the +// same data across replicas. +std::unique_ptr> +CreateAnnotateParameterReplicationPass(); + +// Creates a pass that merges control flow with similar predicates. +std::unique_ptr> CreateMergeControlFlowPass(); + +// Creates a pass that wraps each TensorFlow dialect with `device` attribute +// in a `tf_device.launch` op with the same `device` attribute. +std::unique_ptr> +CreateDeviceAttributeToLaunchPass(); + +// Creates a pass that hoists a `tf_device.launch` body and assigns a `device` +// attribute to each TensorFlow dialect op in the body based on the `device` +// attribute on the `tf_device.launch`. +std::unique_ptr> CreateLaunchToDeviceAttributePass( + bool legacy_graph_export = true); + +// Creates a pass to ensure that the `_xla_outside_compilation` and +// tf_device.launch op no longer exist after Outside Compilation is complete. +std::unique_ptr> +CreateVerifyNoOutsideCompilationMarkersPass(); + +// Create a pass that inlines the StatefulPartitionedCallOp op based in the +// parent region. +std::unique_ptr> CreateXlaInlineDeviceOpsPass(); + +// Creates a pass that rewrites partitioned calls with `_xla_compile_device +// type` with `tf.XlaLaunch` ops. +std::unique_ptr> CreateXlaRewritePass(); + +// Create a pass that validates the input graph to the CPU/GPU bridge. +std::unique_ptr> CreateXlaValidateInputsPass(); +} // namespace TFDevice + +namespace TFTPU { +// Creates a pass that converts unified compilation and replication +// attributes back to legacy attributes. +std::unique_ptr> +CreateConvertToLegacyCompileAndReplicateAttributesPass(); + +// Creates a pass that converts all TPUPartitionedInput to TPUPartitionedInputV2 +std::unique_ptr> +CreateTPUPartitionedOpConversionPass(); + +// Creates a pass that cleans up `_replication_info` attribute on operations +// that are inside a cluster. +std::unique_ptr> +CreateTPUClusterCleanupAttributesPass(); + +// Creates a pass that removes Identity/IdentityN ops from a cluster. +std::unique_ptr> CreateTPUIdentityPruningPass(); + +// Creates a pass that allows TPU program inputs to have layouts determined at +// run time. +std::unique_ptr> CreateTPUDynamicLayoutPass(); + +// Creates a pass that adds `tf.ReadVariableOp` to a TPU cluster for resources +// the cluster only writes to. +std::unique_ptr> CreateTPUResourceReadForWritePass(); + +// Creates a pass that reorders partitiioned resource reads and replicated +// inputs. +std::unique_ptr> +CreateTPUReorderReplicateAndPartitionedInputsPass(); + +// Creates a pass that partitions unpartitioned resource read/write to +// partitioned resource variables. +std::unique_ptr> +CreateTPUResourceReadsWritesPartitioningPass(); + +// Creates a pass that looks for usage of the result of +// TPUCopyWithDynamicShapeOp and annotate these values to be dynamic shape. This +// ensures that the generated tpu program has the correct inputs annotation. 
+std::unique_ptr> +CreateTPUAnnotateDynamicShapeInputsPass(); + +// Creates a pass that moves `tf.AssignVariableOp` into a +// `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the +// only consumer of a `tf_device.parallel_execute` result. +std::unique_ptr> +CreateTPUParallelExecuteSinkResourceWritePass(); + +// Create a pass that extract TPUCopyWithDynamicShapeOp from the host launch op +// and wrap them in device launch op. This allows this op executed on TPU while +// still compiled on host. +std::unique_ptr> +CreateExtractTPUCopyWithDynamicShapeOpPass(); + +// Creates a pass that wraps ReadVariableOp/AssignVariable op that consumes a +// packed tensor to have same device placement as underlying TPU device. +std::unique_ptr> +CreateTPUColocateCompositeResourceOps(); + +// Creates a pass that expands outside compilation cluster at the head/tail of +// TPU computation by adding outside compilation attribute to identity/cast ops +// that are only used for host computation. +std::unique_ptr> +CreateTPUHostComputationExpansionPass(); + +// Creates a pass that updates inputs to TPU embedding layer enqueue ops so that +// correct ops are invoked during training and evaluation. +std::unique_ptr> +CreateTPUUpdateEmbeddingEnqueueOpInputsPass(); + +// Creates a pass that propagates TPU devices to users. +std::unique_ptr> CreateTPUDevicePropagationPass(); + +// Create a pass that colocates each `Split` with its predecessor. +std::unique_ptr> CreateTPUColocateSplitsPass(); + +// Creates a pass that replicates the tf._TPUCompileMlir op on each host that +// needs the compiled program. It helps avoid transferring the compiled binary +// between hosts. +std::unique_ptr> +CreateTPUCompileOpReplicationPass(); + +// Creates a pass that applies space to depth transform +// for the first or frontier convolutions consume host inputs on TPU. +std::unique_ptr> CreateTPUSpaceToDepthPass(); + +// Adjusts the device on TPUCopyWithDynamicShape ops. +std::unique_ptr> +CreateColocateTPUCopyWithDynamicShapePass(); + +} // namespace TFTPU + +// Define the registrations in a detail namespace, just so that we can overload +// the main entry point `registerTensorFlowPasses` to inject +// RegisterTFOptimizePassPipeline. +namespace detail { + +// Direction in which to move transposes in MoveTransposePass. 
+enum MoveTransposeDirection { kBegin, kEnd }; + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_BATCHMATMULTOEINSUMPASS +#define GEN_PASS_DECL_BREAKUPISLANDSPASS +#define GEN_PASS_DECL_BROADCASTFOLDPASS +#define GEN_PASS_DECL_CANONICALIZECOMPILEANDREPLICATEATTRIBUTESPASS +#define GEN_PASS_DECL_CLUSTERCONSTANTSINKINGPASS +#define GEN_PASS_DECL_CLUSTERFORMATIONPASS +#define GEN_PASS_DECL_CLUSTEROUTLININGPASS +#define GEN_PASS_DECL_CLUSTERTFOPSBYHOSTPASS +#define GEN_PASS_DECL_CONSTANTOPDEVICEASSIGNMENTPASS +#define GEN_PASS_DECL_CONVERTLAUNCHFUNCTOTFCALLPASS +#define GEN_PASS_DECL_CONVERTREADONLYREFERENCEVARIABLESTORESOURCEVARIABLESPASS +#define GEN_PASS_DECL_CONVERTTFCONTROLFLOWTOSCFPASS +#define GEN_PASS_DECL_CONVERTTOLEGACYCOMPILEANDREPLICATEATTRIBUTESPASS +#define GEN_PASS_DECL_DECOMPOSEREDUCEDATASETPASS +#define GEN_PASS_DECL_DEVICEINDEXSELECTORPASS +#define GEN_PASS_DECL_DROPWHILESHAPEINVARIANTINDEVICECLUSTERPASS +#define GEN_PASS_DECL_DROPWHILESHAPEINVARIANTPASS +#define GEN_PASS_DECL_EXECUTORCHECKCONTROLDEPENDENCIESPASS +#define GEN_PASS_DECL_EXECUTORCONVERTCONTROLTODATAOUTPUTSPASS +#define GEN_PASS_DECL_EXECUTORDIALECTTOFUNCTIONALPASS +#define GEN_PASS_DECL_EXECUTORGRAPHPRUNINGPASS +#define GEN_PASS_DECL_EXECUTORISLANDCOARSENINGPASS +#define GEN_PASS_DECL_EXECUTORTPUV1ISLANDINLININGPASS +#define GEN_PASS_DECL_EXECUTORUPDATECONTROLDEPENDENCIESPASS +#define GEN_PASS_DECL_FUNCTIONALCONTROLFLOWTOCFGPASS +#define GEN_PASS_DECL_FUNCTIONALCONTROLFLOWTOREGIONSPASS +#define GEN_PASS_DECL_FUNCTIONALTOEXECUTORDIALECTCONVERSIONPASS +#define GEN_PASS_DECL_FUSEDKERNELMATCHERPASS +#define GEN_PASS_DECL_GROUPBYDIALECTPASS +#define GEN_PASS_DECL_GUARANTEEALLFUNCSONEUSEPASS +#define GEN_PASS_DECL_HOISTREPLICATEINVARIANTRESOURCEWRITESPASS +#define GEN_PASS_DECL_INITTEXTFILETOIMPORTPASS +#define GEN_PASS_DECL_LAUNCHOUTLININGPASS +#define GEN_PASS_DECL_LAYOUTASSIGNMENTPASS +#define GEN_PASS_DECL_LEGALIZEHLOTOTFPASS +#define GEN_PASS_DECL_LEGALIZETFGTOTFPASS +#define GEN_PASS_DECL_LOCALIZEVARHANDLESPASS +#define GEN_PASS_DECL_LOWERQUANTIZEDPASS +#define GEN_PASS_DECL_MARKINPUTOUTPUTALIASESPASS +#define GEN_PASS_DECL_MATERIALIZEPASSTHROUGHOP +#define GEN_PASS_DECL_MERGECONTROLFLOWPASS +#define GEN_PASS_DECL_MOVETRANSPOSESPASS +#define GEN_PASS_DECL_ORDERBYDIALECTPASS +#define GEN_PASS_DECL_PARALLELEXECUTETOISLANDSPASS +#define GEN_PASS_DECL_PREPARETPUCOMPUTATIONFORTFEXPORTPASS +#define GEN_PASS_DECL_PROMOTERESOURCESTOARGSPASS +#define GEN_PASS_DECL_PROMOTEVARHANDLESTOARGSPASS +#define GEN_PASS_DECL_REGIONCONTROLFLOWTOFUNCTIONALPASS +#define GEN_PASS_DECL_REMOVEUNUSEDARGUMENTSPASS +#define GEN_PASS_DECL_REMOVEUNUSEDWHILERESULTSPASS +#define GEN_PASS_DECL_REPLICAIDTODEVICEORDINALPASS +#define GEN_PASS_DECL_REPLICATEINVARIANTOPHOISTINGPASS +#define GEN_PASS_DECL_REPLICATETOISLANDPASS +#define GEN_PASS_DECL_RESOURCEDEVICEINFERENCEPASS +#define GEN_PASS_DECL_REWRITETPUEMBEDDINGOPSPASS +#define GEN_PASS_DECL_SIMPLETFDEVICEASSIGNMENTPASS +#define GEN_PASS_DECL_SPLITINTOISLANDPEROPPASS +#define GEN_PASS_DECL_STACKOPSDECOMPOSITIONPASS +#define GEN_PASS_DECL_STRIPNOINLINEATTRIBUTEPASS +#define GEN_PASS_DECL_TFDATAOPTIMIZATIONPASS +#define GEN_PASS_DECL_TFDEVICEASSIGNMENTBYFUNCATTRPASS +#define GEN_PASS_DECL_TPUBRIDGEEXECUTORISLANDOUTLININGPASS +#define GEN_PASS_DECL_TPUCLEANUPCLUSTERATTRIBUTESPASS +#define GEN_PASS_DECL_TPUCLUSTERFORMATIONPASS +#define GEN_PASS_DECL_TPUCOLOCATECOMPOSITERESOURCEOPSPASS +#define GEN_PASS_DECL_TPUDEVICEPROPAGATIONPASS +#define GEN_PASS_DECL_TPUDYNAMICLAYOUTPASS +#define 
GEN_PASS_DECL_TPUHOSTCOMPUTATIONEXPANSIONPASS +#define GEN_PASS_DECL_TPUIDENTITYPRUNINGPASS +#define GEN_PASS_DECL_EXTRACTTPUCOPYWITHDYNAMICSHAPEOPPASS +#define GEN_PASS_DECL_TPUPARALLELEXECUTESINKRESOURCEWRITEPASS +#define GEN_PASS_DECL_TPUREORDERREPLICATEANDPARTITIONEDINPUTSPASS +#define GEN_PASS_DECL_TPURESOURCEREADFORWRITEPASS +#define GEN_PASS_DECL_TPURESOURCEREADSWRITESPARTITIONINGPASS +#define GEN_PASS_DECL_TPUSPACETODEPTHPASS +#define GEN_PASS_DECL_TPUUPDATEEMBEDDINGENQUEUEOPINPUTSPASS +#define GEN_PASS_DECL_TENSORARRAYOPSDECOMPOSITIONPASS +#define GEN_PASS_DECL_TENSORDEVICECOPYCONVERSIONPASS +#define GEN_PASS_DECL_TENSORFLOWOPTIMIZEPASS +#define GEN_PASS_DECL_TENSORFLOWSHAPEINFERENCEPASS +#define GEN_PASS_DECL_TENSORLISTOPSDECOMPOSITIONPASS +#define GEN_PASS_DECL_TENSORFLOWGPUFUSION +#define GEN_PASS_DECL_TPUV1BRIDGEEXECUTORISLANDCOARSENINGPASS +#define GEN_PASS_DECL_TRANSFORMEINSUMPASS +#define GEN_PASS_DECL_UNROLLBATCHMATMULPASS +#define GEN_PASS_DECL_VERIFYSUITABLEFOREXPORTPASS +#define GEN_PASS_DECL_XLACALLMODULEDESERIALIZATIONPASS +#define GEN_PASS_DECL_XLACALLMODULESERIALIZATIONPASS +#define GEN_PASS_DECL_XLACALLMODULECUSTOMCALLTFFUNCTIONRENAMINGPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" +} // namespace detail +using namespace detail; // NOLINT +inline void registerTensorFlowPasses() { + detail::registerTensorFlowPasses(); + TF::RegisterTFOptimizePassPipeline(); +} + +namespace TFDevice { +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_ANNOTATEPARAMETERREPLICATIONPASS +#define GEN_PASS_DECL_DECOMPOSERESOURCEOPSINCLUSTERPASS +#define GEN_PASS_DECL_DECOMPOSERESOURCEOPSPASS +#define GEN_PASS_DECL_DEVICEATTRIBUTETOLAUNCHPASS +#define GEN_PASS_DECL_HOSTLAUNCHTOOUTSIDECOMPILEDPASS +#define GEN_PASS_DECL_LAUNCHTODEVICEATTRIBUTEPASS +#define GEN_PASS_DECL_OUTSIDECOMPILEDTOHOSTLAUNCHPASS +#define GEN_PASS_DECL_RESOURCEOPLIFTINGFORMAINFUNCTIONPASS +#define GEN_PASS_DECL_RESOURCEOPLIFTINGPASS +#define GEN_PASS_DECL_VERIFYNOOUTSIDECOMPILATIONMARKERSPASS +#define GEN_PASS_DECL_XLACLUSTERFORMATIONPASS +#define GEN_PASS_DECL_XLAINLINEDEVICEOPSPASS +#define GEN_PASS_DECL_XLAREWRITEPASS +#define GEN_PASS_DECL_XLAREWRITEV2PASS +#define GEN_PASS_DECL_XLAVALIDATEINPUTSPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" +} // namespace TFDevice + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h new file mode 100644 index 00000000..f526acc1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
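The inline registerTensorFlowPasses() wrapper above is what a standalone opt-style driver would call before parsing its command line. A sketch of such a driver, assuming the standard MlirOptMain entry point from MLIR's tooling library; dialect registration is deliberately left out.

#include "mlir/IR/DialectRegistry.h"
#include "mlir/Tools/mlir-opt/MlirOptMain.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

int main(int argc, char** argv) {
  // Expose the passes declared in this header to the command-line pipeline.
  mlir::registerTensorFlowPasses();
  mlir::DialectRegistry registry;
  // A real tool would insert the TF, tf_executor and tf_device dialects here.
  return mlir::asMainReturnCode(
      mlir::MlirOptMain(argc, argv, "example TF pass driver\n", registry));
}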
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +// Performs IR cleanup and canonicalization in preparation for Resource Op +// Lifting pass. It does several things: +// - Eliminate identity nodes to remove (most) of resource aliasing +// - Canonicalize functional control flow. For functional control flow we +// expect that any resource output of these ops matches the corresponding +// input, and then forward that input to the output. Fails if this is not the +// case. If successful, the following invariants will hold true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. +// - Canonicalize region based control flow. Again, any resource outputs are +// expected to be resolved to be one of the captured resource inputs. Fails +// if this is not the case. If successful, the following invariants will hold +// true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. +namespace mlir { +namespace TF { +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(ModuleOp module); +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(func::FuncOp func); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h new file mode 100644 index 00000000..b8bc0a1d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h @@ -0,0 +1,95 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_REWRITE_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_REWRITE_UTIL_H_ + +#include + +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Returns int, float or complex DenseElementsAttr with scalar shape with the +// given element type and the integer value. 
+template +DenseElementsAttr GetScalarOfType(Type ty, T raw_value) { + RankedTensorType scalar_ty = RankedTensorType::get({}, ty); + if (auto float_ty = mlir::dyn_cast(ty)) { + FloatAttr attr = FloatAttr::get(float_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto int_ty = mlir::dyn_cast(ty)) { + IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto complex_ty = mlir::dyn_cast(ty)) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } + } + llvm_unreachable("unsupported type"); +} + +// Returns true if `value` is compile-time constant and its splat value equals +// to `raw_value`. +template +bool IsConstantValueOf(Value value, T raw_value) { + auto element_type = mlir::cast(value.getType()).getElementType(); + if (mlir::isa(element_type)) { + DenseFPElementsAttr float_attr; + if (matchPattern(value, m_Constant(&float_attr)) && float_attr.isSplat() && + float_attr.getSplatValue().isExactlyValue(raw_value)) + return true; + } else if (mlir::isa(element_type)) { + DenseIntElementsAttr int_attr; + if (matchPattern(value, m_Constant(&int_attr)) && int_attr.isSplat() && + int_attr.getSplatValue() == raw_value) + return true; + } + + return false; +} + +// Returns true if `op` is placed on GPU device, and false if it's on other +// devices or the device is not specified. +bool IsOnGpuDevice(mlir::Operation *op); + +// Wrappers for CopyDeviceAndUnderscoredAttributes +void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::OpResult src, + mlir::OpResult dest); +void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src, + mlir::OpResult dest); +void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src, + mlir::Operation *dest); + +// Wrappers for CopyXlaOutsideCompilationAttributes +void CopyXlaOutsideCompilationAttributesAdaptor(mlir::OpResult src, + mlir::OpResult dest); +void CopyXlaOutsideCompilationAttributesAdaptor(mlir::Operation *src, + mlir::OpResult dest); +void CopyXlaOutsideCompilationAttributesAdaptor(mlir::Operation *src, + mlir::Operation *dest); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_REWRITE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/set_tpu_infeed_layout.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/set_tpu_infeed_layout.h new file mode 100644 index 00000000..8b634b60 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/set_tpu_infeed_layout.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
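The helpers declared in rewrite_util.h above are typically used inside rewrite patterns. A sketch with a hypothetical helper, assuming the elided template parameter of GetScalarOfType is the raw value type and that IsConstantValueOf takes a Value plus that raw value.

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Value.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h"

// Hypothetical helper: decides whether `operand` is a splat zero constant and
// builds a scalar f32 zero attribute that a pattern could materialize.
static bool FoldableAddendZero(mlir::Value operand, mlir::OpBuilder& builder) {
  mlir::DenseElementsAttr zero =
      mlir::TF::GetScalarOfType(builder.getF32Type(), 0.0f);
  (void)zero;  // A pattern would typically wrap this in a tf.Const op.
  return mlir::TF::IsConstantValueOf(operand, 0.0f);
}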
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SET_TPU_INFEED_LAYOUT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SET_TPU_INFEED_LAYOUT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace mlir { + +// Set layouts attribute of tf.InfeedDequeueTuple ops. +bool SetTPUInfeedLayout(ModuleOp mlir_module); + +// Try to determine the right TPU infeed layout. +FailureOr GetTPUInfeedLayout(ArrayRef types, + OpBuilder& rewriter); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SET_TPU_INFEED_LAYOUT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h new file mode 100644 index 00000000..9075754d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h @@ -0,0 +1,85 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SHAPE_INFERENCE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SHAPE_INFERENCE_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project + +namespace mlir { +namespace TF { + +inline constexpr char kMLIRContextSingleThreadVar[] = + "TF_USE_SINGLE_THREAD_MLIR_CONTEXT"; + +// Returns whether type can be further refined. +bool CanBeRefined(Type type); + +// Returns a new arg type based on the shape and element type. If there are +// dynamic bounds attribute to the arg, update the bounds based on the shape +// as well. +Type GetNewArgType(Type old_arg_type, ArrayRef shape, + Type element_type, mlir::MLIRContext* context); + +// Refines all the shapes in a module, skipping the inference for all ops +// whose type is in ops_to_skip. +// Returns a failure() on error, otherwise returns true to indicate that it +// reached convergence, false otherwise. +// If input shapes are provided, first refines the `main` function using +// InferShapeForFunction. +FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations = 10, + ArrayRef ops_to_skip = {}, + ArrayRef> input_shapes = {}); + +// Given a tensorflow NodeShape string, returns a vector of argument shapes +// that can be used with InferShapeForFunction. +// TF NodeShape uses `,` to separate dimensions, and `:` to separate arguments. +// Ex: 1,2:3,4,5:6,? 
--> [[1, 2], [3, 4, 5], [6, ?]] +absl::StatusOr>> ParseArgumentShapes( + absl::string_view input_shapes); + +// Given a list of refined shapes matching the function arguments of func, runs +// shape inference over the function to propagate this updated information, +// skipping the inference for all ops whose type is in ops_to_skip. +// If arg_shapes are empty, then argument shapes will be left unchanged. +// Note: This affects the entire module, and changes are not just scoped to the +// function being inferred. +// Returns a failure() on error, otherwise returns true to indicate that it +// reached convergence, false otherwise. +FailureOr InferShapeForFunction(func::FuncOp func, + ArrayRef> arg_shapes, + int64_t graph_version, + int64_t max_iterations = 10, + ArrayRef ops_to_skip = {}); + +// Create a MLIRContext based on the threading setup in the env var. +std::unique_ptr MakeMLIRContextWithThreading(); + +} // namespace TF + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SHAPE_INFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h new file mode 100644 index 00000000..8944745d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h @@ -0,0 +1,50 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace TFDevice { + +// For architectures that support accelerated embedding lookups, this pass will +// rewrite the graph to use pipelining for better device utilization. +std::unique_ptr> CreateEmbeddingSequencingPass(); + +// This is a strictly sequential and formally correct fallback option for the +// embedding pipelining pass intended for debugging during pipelining +// development. +std::unique_ptr> CreateEmbeddingPipeliningPass(); + +// Passes in the program key to embedding ops, by moving the embedding ops +// after the _TPUCompileMlir op. 
+std::unique_ptr> CreateEmbeddingProgramKeyPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_EMBEDDINGSEQUENCINGPASS +#define GEN_PASS_DECL_EMBEDDINGPIPELININGPASS +#define GEN_PASS_DECL_EMBEDDINGPROGRAMKEYPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h.inc" + +} // namespace TFDevice +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h new file mode 100644 index 00000000..f2a3eeba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h @@ -0,0 +1,79 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TEST_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TEST_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tf_test { + +// Returns test pass for variable freezing. +std::unique_ptr> CreateFreezeVariableTestPass(); + +// Test pass for applying TF->TF lowering patterns. +std::unique_ptr> CreateTestTFLowerTFPass(); + +// Test passes for visitor util. +std::unique_ptr> CreateTestVisitorUtilPass(); +std::unique_ptr> +CreateTestVisitorUtilInterruptPass(); + +// Test operation clustering based on user defined policy. +std::unique_ptr> CreateTestClusteringPolicyPass(); + +// Test pass for analyzing side-effect analysis result. +std::unique_ptr> CreateTestSideEffectAnalysisPass(); + +std::unique_ptr> CreateTestResourceAliasAnalysisPass(); + +std::unique_ptr> CreateInitTextFileToImportTestPass(); +std::unique_ptr> +CreateInitTextFileToImportSavedModelTestPass(); + +// Variable Lifting test passes: only useful for lit testing. +std::unique_ptr> CreateLiftVariablesTestPass(); +std::unique_ptr> +CreateLiftVariablesInvalidSessionTestPass(); + +// Create a test pass for the above with a "fake" session, for lit testing. +std::unique_ptr> +CreateInitializeVariablesInSessionInitializerTestPass(); + +// Create a test pass that emits remarks for each analysis result for resources. +// This pass is only used for lit testing. 
+std::unique_ptr> CreateResourceAnalyzerTestPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_FREEZEVARIABLESTESTPASS +#define GEN_PASS_DECL_INITTEXTFILETOIMPORTSAVEDMODELTESTPASS +#define GEN_PASS_DECL_INITTEXTFILETOIMPORTTESTPASS +#define GEN_PASS_DECL_INITIALIZEVARIABLESINSESSIONINITIALIZERPASS +#define GEN_PASS_DECL_LIFTVARIABLESINVALIDSESSIONTESTPASS +#define GEN_PASS_DECL_LIFTVARIABLESTESTPASS +#define GEN_PASS_DECL_RESOURCEANALYZERTESTPASS +#define GEN_PASS_DECL_TESTCLUSTERINGPOLICYPASS +#define GEN_PASS_DECL_TESTRESOURCEALIASANALYSIS +#define GEN_PASS_DECL_TESTSIDEEFFECTANALYSISPASS +#define GEN_PASS_DECL_TESTTENSORFLOWLOWERTFPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h.inc" + +} // namespace tf_test +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TEST_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h new file mode 100644 index 00000000..b8a176da --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Populates patterns to perform optimizations specific to tf.data operations. +void PopulateTFDataOptimizationPatterns(MLIRContext *context, + RewritePatternSet *patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_DATA_OPTIMIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h new file mode 100644 index 00000000..2b601395 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
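The shape-inference entry points declared in shape_inference.h above can be driven as in this sketch; the helper name is hypothetical, and the elided result type is assumed to be FailureOr<bool>, matching the convergence semantics described in the comments.

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h"

// Hypothetical helper: refines shapes across the whole module and warns if the
// iteration cap was hit before reaching a fixed point.
static mlir::LogicalResult RefineModuleShapes(mlir::ModuleOp module) {
  auto converged = mlir::TF::InferModuleShape(module, /*max_iterations=*/10);
  if (mlir::failed(converged)) return mlir::failure();
  if (!*converged) module.emitWarning("shape inference did not converge");
  return mlir::success();
}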
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_GRAPH_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_GRAPH_OPTIMIZATION_PASS_H_ + +#include +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// Create a module pass that will execute the given TF GraphOptimization passes +// in sequence. +// Pass requires that the module ran on is convertible to TF Graph. +std::unique_ptr> +CreateTensorFlowGraphOptimizationPass( + std::vector tf_passes); + +// Same as above but pass names instead of the passes provided. The registered +// passes are queried, if a TF graph optimization pass is not found in registry +// then the pass fails. +// Pass requires that the module ran on is convertible to TF Graph. +std::unique_ptr> +CreateTensorFlowGraphOptimizationPass( + const std::vector& pass_names); + +// Register the pass for command line testing. +void RegisterGraphOptimizationPasses(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_GRAPH_OPTIMIZATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h new file mode 100644 index 00000000..9c08e2d3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h @@ -0,0 +1,44 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ + +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tf_saved_model { + +// Helper function that sets up a module for an AssetSinkingPass. The sole +// argument of the main function of `module` is prepared to be inlined with +// the value `checkpoint_path`. +// Also adds SessionInitializer op. +absl::Status AddSessionInitializerAndInlineCheckpoint( + ModuleOp module, absl::string_view checkpoint_path); + +// Creates a pass that sinks SavedModel asset filenames to constants. 
+std::unique_ptr> CreateAssetSinkingPass( + llvm::StringRef saved_model_dir); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.h new file mode 100644 index 00000000..7bfb9871 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.h @@ -0,0 +1,47 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_UTILS_H_ + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace tf_saved_model { +// Container to hold all update actions on ops. +// Key: Operation to update. +// Value: optional list of argument indices to delete from this op. +// Note that we use MapVector because we want to iterate on the same order +// of insertion. +LogicalResult EraseObsoleteResourceUses( + llvm::MapVector> + arguments_to_erase); + +// Traces usage of 'var_handle_op' or 'resources' and replaces it's usage with +// constant value 'value'. All op operands updates are captured in +// 'arguments_to_erase'. +LogicalResult ReplaceVarWithConstant( + mlir::Value::use_range uses, ElementsAttr value, + llvm::MapVector>* + arguments_to_erase); +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h new file mode 100644 index 00000000..ad8d20d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
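A sketch of wiring the asset-sinking pass declared above into a pipeline for a model rooted at `saved_model_dir`; the helper is hypothetical and the pass is assumed to be module-level, since its template arguments are elided in this diff.

#include "llvm/ADT/StringRef.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h"

// Hypothetical helper: folds asset file paths under `saved_model_dir` into
// constants so the module no longer references tf_saved_model.asset ops.
static mlir::LogicalResult SinkAssets(mlir::ModuleOp module,
                                      mlir::MLIRContext& context,
                                      llvm::StringRef saved_model_dir) {
  mlir::PassManager pm(&context);
  pm.addPass(mlir::tf_saved_model::CreateAssetSinkingPass(saved_model_dir));
  return pm.run(module);
}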
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_VARIABLES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_VARIABLES_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Freezes readonly variables in the graph. +LogicalResult FreezeVariables(ModuleOp module, tensorflow::Session* session); + +} // namespace tf_saved_model + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_FREEZE_VARIABLES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h new file mode 100644 index 00000000..801eaaeb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Creates a pass that optimizes tf_saved_model.global_tensor ops. +std::unique_ptr> CreateOptimizeGlobalTensorsPass(); + +// Creates a pass that freezes tf_saved_model.global_tensor ops. +std::unique_ptr> CreateFreezeGlobalTensorsPass( + bool allow_mutable_tensors = false); + +// Creates a pass that freezes tf_saved_model.asset ops. +std::unique_ptr> CreateFreezeAssetsPass( + std::string saved_model_dir = ""); + +// Creates as pass that removes variables in the session initializer. +// This job is required with lifting variable passes. Originally, the session +// initializer function does assigning variables. However, the read-only +// variable assignments will be done via lifting variables pass by converting +// the read-only variables to constant ops, instead. This pass removes the +// redundant operations. This pass should be located in front of the pass for +// lifting read-only variables. +std::unique_ptr> +CreateRemoveVariablesInSessionInitializerPass(); + +// Creates a pass that removes duplicate 'tf_saved_model.bound_input' bindings. 
+std::unique_ptr> CreateDedupBoundInputBindingPass(); + +// Create a pass that removes function arguments that map to global tensors. +std::unique_ptr CreateLowerGlobalsToMlProgramPass(); + +// Create a pass that lowers variable read/write ops to ml_program ops. +std::unique_ptr> +CreateLowerVariableOpsToMlProgramPass(); + +// Strips saved_model attributes from a module and its functions. +std::unique_ptr> CreateStripSavedModuleMetadataPass(); + +// Convert the session initializer to a function. +std::unique_ptr> +CreateConvertSessionInitializerToFunctionPass(); + +// Creates forwarding functions for 'exported_names'. +std::unique_ptr> +CreateAddFunctionsForExportedNamesPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_DEDUPBOUNDINPUTBINDINGPASS +#define GEN_PASS_DECL_FREEZEASSETSPASS +#define GEN_PASS_DECL_FREEZEGLOBALTENSORSPASS +#define GEN_PASS_DECL_LOWERGLOBALSTOMLPROGRAMPASS +#define GEN_PASS_DECL_LOWERVARIABLEOPSTOMLPROGRAMPASS +#define GEN_PASS_DECL_OPTIMIZEGLOBALTENSORSPASS +#define GEN_PASS_DECL_REMOVEVARIABLESINSESSIONINITIALIZERPASS +#define GEN_PASS_DECL_STRIPSAVEDMODULEMETADATAPASS +#define GEN_PASS_DECL_ADDFUNCTIONSFOREXPORTEDNAMESPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.h.inc" + +} // namespace tf_saved_model + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h new file mode 100644 index 00000000..39ceab7c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +// Populate patterns to unroll tf.BatchMatMulV2 op into a sequence of TF ops. +// Since TFLite does not support BatchMatMul operation, it unrolls a BatchMatMul +// op into tf.Reshape, tf.Slice, tf.MatMul, tf.Pack, and tf.Reshape ops. 
+void PopulateUnrollTfBatchMatMul(MLIRContext* context, + RewritePatternSet& patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h new file mode 100644 index 00000000..47bc42e0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_EXPORT_TF_DIALECT_OP_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_EXPORT_TF_DIALECT_OP_H_ + +#include + +#include "absl/status/statusor.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Extracts the attributes of a MLIR operation and populates the converted +// attributes in a proto map. +absl::Status GetAttrValuesFromOperation( + mlir::Operation* inst, llvm::StringRef name, + const tensorflow::OpRegistrationData* op_reg_data, + bool ignore_unregistered_attrs, AttrValueMap* attributes); + +// Converts a MLIR operation to TensorFlow NodeDef with given node name. This +// name should be unique to the graph it is being inserted to. If the +// `ignore_unregistered_attrs` argument is set to true, the attributes which are +// not in the op registry will be ignored. If the `ignore_unregistered_attrs` +// argument is not set to true, _output_shapes attribute is added to nodes with +// ShapedType for the leading values with ShapedType in the results of the +// nodes. Set it to true if the returned NodeDef will be executed by the linked +// TF Eager runtime. +absl::StatusOr> ConvertTFDialectOpToNodeDef( + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_EXPORT_TF_DIALECT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/import_model.h new file mode 100644 index 00000000..fe7684ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -0,0 +1,139 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
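The pattern-population function declared in unroll_batch_matmul.h above composes with MLIR's greedy rewrite driver; a sketch with a hypothetical wrapper follows.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h"

// Hypothetical helper: rewrites tf.BatchMatMulV2 ops in `func` into the
// unrolled tf.Reshape/tf.Slice/tf.MatMul/tf.Pack sequence described above.
static mlir::LogicalResult UnrollBatchMatMulIn(mlir::func::FuncOp func) {
  mlir::MLIRContext* context = func.getContext();
  mlir::RewritePatternSet patterns(context);
  mlir::TF::PopulateUnrollTfBatchMatMul(context, patterns);
  return mlir::applyPatternsAndFoldGreedily(func.getOperation(),
                                            std::move(patterns));
}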
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ + +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/log/check.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/cc/saved_model/bundle_v2.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +inline constexpr absl::string_view kImportModelDefaultGraphFuncName = "main"; + +// Given a GraphDef, returns a MLIR module containing the graph, expressed with +// tf_executor dialect. +ABSL_DEPRECATED("Use tensorflow::tf2xla::v2::ConvertGraphToTfExecutor instead.") +absl::StatusOr> ConvertGraphdefToMlir( + const GraphDef& graphdef, const GraphDebugInfo& debug_info, + const GraphImportConfig& specs, mlir::MLIRContext* context); + +// Given a SavedModel, returns a MLIR module containing the functions, expressed +// with tf_executor dialect. +absl::StatusOr> ConvertSavedModelToMlir( + SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, + absl::Span exported_names, MLIRImportOptions options = {}); + +// Given a V1 SavedModel, returns a MLIR module containing the functions, +// expressed with tf_executor dialect. +absl::StatusOr> ConvertSavedModelV1ToMlir( + const SavedModelBundle& saved_model, absl::Span exported_names, + mlir::MLIRContext* context, MLIRImportOptions options = {}); + +// Given a V1 SavedModel, returns a MLIR module containing the functions, +// expressed with tf_executor dialect. It does not require a session to be +// created and it does not perform any graph transformation. If `exported_names` +// is std::nullopt, all signatures will be imported. Otherwise, only names +// in `exported_names` are imported. +// +// Note that the word `Lite` means it is a lighter version compared to +// ConvertSavedModelV1ToMlir(), and is not related to TFLite. +// +// TODO(b/179683149): Rename this class to avoid confusion with TFLite. 
+absl::StatusOr> ConvertSavedModelV1ToMlirLite( + const MetaGraphDef& meta_graph_def, const GraphDebugInfo& debug_info, + std::optional> exported_names, + mlir::MLIRContext* context, MLIRImportOptions options); + +// SavedModelMLIRImportInput is an adapter class for users to inject custom +// graph transformation logic on Tensorflow graphs before importing to MLIR. It +// serves as the source that provides the subgraphs requested by the savedmodel +// MLIR importer, and at the same time it allows the implementation of this +// class to transform the graph before feeding it to the importer. +class SavedModelMLIRImportInput { + public: + SavedModelMLIRImportInput(const MetaGraphDef* meta_graph_def, + const GraphDebugInfo& debug_info) + : meta_graph_def_(meta_graph_def), debug_info_(debug_info) { + DCHECK(meta_graph_def); + } + + virtual ~SavedModelMLIRImportInput(); + + // The original MetaGraphDef of the savedmodel. + const MetaGraphDef& meta_graph_def() const { return *meta_graph_def_; } + + const GraphDebugInfo& debug_info() const { return debug_info_; } + + // GetSubGraph() is expected to return a tensorflow::Graph that contains the + // node set specified in `specs`. The implementation is free to transform the + // graph in the original savedmodel as needed, as long as it produces the same + // results and effects. If the transformation requires some configs in `spec` + // (e.g., control_outputs) to be changed, they should be updated accordingly + // and remain valid for the graph. + // `name` is a unique identifier for this subgraph, so the implementation can + // use it for eg. debugging or caching compilation results. + virtual absl::StatusOr GetSubGraph( + absl::string_view name, GraphImportConfig& specs) = 0; + + private: + const MetaGraphDef* meta_graph_def_ = nullptr; + GraphDebugInfo debug_info_; +}; + +// Given the SavedModelMLIRImportInput for a saved model, returns a MLIR module +// containing the functions, expressed with tf_executor dialect. It does not +// require a session to be created. If `exported_names` is std::nullopt, all +// signatures will be imported. Otherwise, only names in `exported_names` are +// imported. + +// +// Note that the word `Lite` means it is a lighter version compared to +// ConvertSavedModelV1ToMlir(), and is not related to TFLite. +// +// TODO(b/179683149): Rename this class to avoid confusion with TFLite. +absl::StatusOr> ConvertSavedModelV1ToMlirLite( + SavedModelMLIRImportInput& input, + std::optional> exported_names, + mlir::MLIRContext* context, + bool unconditionally_use_set_output_shapes = false); + +// Serialize a MLIR module to a string. +std::string MlirModuleToString(mlir::ModuleOp m, bool show_debug_info = false); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h new file mode 100644 index 00000000..b49ed7bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
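A sketch of the intended call sequence for ConvertSavedModelToMlir declared above. The exported name is a placeholder, SavedModelV2Bundle::Load is assumed to take (export_dir, bundle*), and the elided result type is assumed to be an owning reference to the imported module.

#include <string>
#include <vector>

#include "absl/status/statusor.h"
#include "absl/types/span.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"
#include "tensorflow/cc/saved_model/bundle_v2.h"
#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h"

// Hypothetical helper: loads a TF2 SavedModel from disk and imports the
// requested signatures as an MLIR module in tf_executor form.
absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> ImportSavedModel(
    const std::string& export_dir, mlir::MLIRContext* context) {
  tensorflow::SavedModelV2Bundle bundle;
  const auto load_status =
      tensorflow::SavedModelV2Bundle::Load(export_dir, &bundle);
  if (!load_status.ok()) return load_status;
  // Placeholder signature name; real callers pass the names they need.
  std::vector<std::string> exported_names = {"serving_default"};
  return tensorflow::ConvertSavedModelToMlir(&bundle, context,
                                             absl::MakeSpan(exported_names));
}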
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_IMPORT_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_IMPORT_OPTIONS_H_ + +namespace tensorflow { + +// TODO(jpienaar): This file and class are confusingly named. This seems to be +// a SavedModel only import options file that exposes a subset of the +// GraphImportConfig options, but the naming would make one think it is more +// general. +struct MLIRImportOptions { + // If true, functionalize the input graph before importing it into MLIR. + bool upgrade_legacy = false; + + // Whether to unconditionally use the shape set via _output_shapes on import. + bool unconditionally_use_set_output_shapes = false; + + // Apply default attributes from the op definition to the loaded op. + bool add_default_attributes = true; + + // If set, promote tf.VarHandleOp to resource arguments for all functions. + bool lift_variables = true; + + // Keeps the variables in initializers before lifting variables (when + // `lift_variables == true`) or newly adding variable initialization patterns + // in the initializer functions. One might want to set this to `true` because + // the `RemoveVariablesInSessionInitializerPass` pass, which runs otherwise, + // may unexpectedly also remove the initialization patterns for non-variable + // resources (like hash tables) if they involve variables. Such a case is + // illustrated in the test file + // "../tests/tf_saved_model_remove_vars_in_session_initializer.mlir". + // This defaults to `false` to avoid breaking existing uses. + bool include_variables_in_initializers = false; + + // Load the model without restoring associated variables from disk. Enables + // loading raw programs without checkpoints. + bool allow_uninitialized_variables = false; + + // If true, variables are imported as DenseResourceElementsAttr; else, + // variables are imported as DenseElementsAttr. + bool import_variables_as_dense_resources = false; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_IMPORT_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h new file mode 100644 index 00000000..cf90b7ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -0,0 +1,160 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_FLAGS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_FLAGS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/StringMap.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +struct ArrayInfoBase { + // The node type when the input node is imported. Typically needs to be + // specified when passing arbitrary nodes (some node attributes are removed). + DataType imported_dtype; + + // Node "shape" attribute value. + TensorShapeProto shape; +}; + +struct ArrayInfo : public ArrayInfoBase { + using SubTypeInfo = ArrayInfoBase; + // DT_RESOURCE and DT_VARIANT have subtypes + std::vector subtypes; +}; + +struct GraphImportConfig { + // Returns string representation of config. + std::string str() const; + + using InputArrays = + llvm::MapVector>; + // The name assigned to the function which is the import result of the given + // graph. If empty, a default one will be used. + std::string graph_func_name; + // Maps input node names to node data types and shapes. + InputArrays inputs; + // name:index strings for the data outputs. + std::vector outputs; + // name strings for the control outputs. + std::vector control_outputs; + // Setting prune_unused_nodes to true, would prune unreachable nodes if + // output_arrays is specified. + bool prune_unused_nodes = false; + // If true, inputs of type LegacyFedInput are replaced with Placeholder ops. + // LegacyFedInput ops have two outputs unlike Placeholder which has only one + // output, so if both outputs of the LegacyFedInput ops are used then returns + // an error. + bool convert_legacy_fed_inputs = false; + // If true, the main graph will be treated as a function. + bool graph_as_function = false; + // If true, upgrade legacy features of the graph (for instance, functionalize + // control-flow). + bool upgrade_legacy = false; + // If true, functionalization is restricted to nodes that will be + // XLA-compiled. This is only needed if + // - `upgrade_legacy` is true + // - upgrading legacy features of the graph (which includes functionalization) + // runs before compilation cluster extraction (as for MLIR-based TPU bridge) + // - session runtime is used (session runtime has issues with function names + // rewritten by functionalization). + // Otherwise, this parameter should be set to false. + bool restrict_functionalization_to_compiled_nodes = false; + // If true, enables shape inference on input. + // TODO(jpienaar): This will be removed shortly. + bool enable_shape_inference = true; + // _output_shapes is an unregistered attribute which is used during + // GraphConstructor::ConvertGraph to override shapes. It is unfortunately + // not always set correctly (which is undesirable and should be addressed) + // so make it opt-in to consider it unconditionally also when importing the + // graph. + bool unconditionally_use_set_output_shapes = false; + // If set, use the value as the device type and mark the function graph for + // XLA compilation. 
+ string xla_compile_device_type; + // If true, enables moving ops to different devices or moving unsupported ops + // out of a compilation cluster. + bool enable_soft_placement = false; + // If true, a function attribute, `tf._original_func_name`, will be set in + // functions which contains the corresponding original TF function name. + bool set_original_tf_func_name = false; + + // If true, all functions in the graph will be converted to MLIR regardless of + // whether the functions are referenced by the nodes. This is needed if + // aliases and saved model object graph function matching is needed. + bool convert_all_functions_to_mlir = false; +}; + +struct GraphExportConfig { + // Whether to export the entry function to function library instead of the + // graph. + bool export_entry_func_to_flib = false; + // Whether to export functions using the name set in the attribute + // `tf._original_func_name` if it exists. + bool export_original_tf_func_name = false; +}; + +// Parses the command line flag strings to the specification of nodes in +// the Graph. +absl::Status ParseOutputArrayInfo(absl::string_view array_names, + std::vector* outputs); + +absl::Status ParseOutputArrayInfo(const std::vector& output_names, + std::vector* outputs); + +// Parses the command line flag strings to the specification of nodes in +// the Graph. `data_types` input string can be empty since the flag is optional. +absl::Status ParseInputArrayInfo(absl::string_view array_names, + absl::string_view data_types, + absl::string_view shapes, + GraphImportConfig::InputArrays* inputs); + +absl::Status ParseInputArrayInfo( + const std::vector& node_names, + const std::vector& node_dtypes, + const std::vector>>& node_shapes, + GraphImportConfig::InputArrays* inputs); + +// Parses shapes from the given string into shapes_vector which is a structured +// format. +// NOTE: If shapes_str is empty, shapes_vector will also be empty. +absl::Status ParseNodeShapes( + absl::string_view shapes_str, + std::vector>>& shapes_vector); + +// Parses names from the given string into the names_vector. +// NOTE: If names_str is empty, names_vector will also be empty. +absl::Status ParseNodeNames(absl::string_view names_str, + std::vector& names_vector); + +// Parses data types from the given string into the data_type_vector. +// NOTE: If data_types_str is empty, data_type_vector will also be empty. +absl::Status ParseNodeDataTypes(absl::string_view data_types_str, + std::vector& data_type_vector); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h new file mode 100644 index 00000000..8d404575 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -0,0 +1,140 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/base/macros.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" + +namespace tensorflow { + +using tsl::Status; +using tsl::StatusOr; + +struct GraphdefToMlirOptions { + std::string debug_info_file; + std::string xla_compile_device_type; + bool prune_unused_nodes; + bool convert_legacy_fed_inputs; + bool graph_as_function; + bool upgrade_legacy; + bool enable_shape_inference; + bool unconditionally_use_set_output_shapes; + bool enable_soft_placement; + bool set_original_tf_func_name = false; +}; + +// TODO(antiagainst): Directly manipulating files in library functions is not +// a good idea. We should pass in a string/stream here. + +// Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. +// Creates MLIR entities into the given MLIR `context`. +absl::StatusOr> +GraphdefToMlirTranslateFunction( + llvm::StringRef input, const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); + +ABSL_DEPRECATED( + "Please use the other overload of this function which accepts structured " + "inputs instead of strings") +// Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. +// Creates MLIR entities into the given MLIR `context`. +absl::StatusOr> +GraphdefToMlirTranslateFunction( + llvm::StringRef input, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, absl::string_view control_output_arrays, + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); + +// Similar as the above function, but replaces all constant tensors +// with randomly generated splat values. +absl::StatusOr> +GraphdefToSplattedMlirTranslateFunction( + llvm::StringRef input, const std::vector& input_arrays, + const std::vector& input_dtypes, + const std::vector>& input_shapes, + const std::vector& output_arrays, + const std::vector& control_output_arrays, + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); + +ABSL_DEPRECATED( + "Please use the other overload of this function which accepts structured " + "inputs instead of strings") +// Similar as the above function, but replaces all constant tensors +// with randomly generated splat values. 
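+//
+// For reference, an illustrative call of the structured
+// GraphdefToMlirTranslateFunction overload above (a sketch, not part of the
+// upstream header; node names and shapes are assumptions):
+//
+//   mlir::MLIRContext context;
+//   GraphdefToMlirOptions opts{};
+//   opts.upgrade_legacy = true;
+//   auto module = GraphdefToMlirTranslateFunction(
+//       graphdef_text, /*input_arrays=*/{"input0"},
+//       /*input_dtypes=*/{"DT_FLOAT"},
+//       /*input_shapes=*/{std::vector<int>{1, 224, 224, 3}},
+//       /*output_arrays=*/{"output0"}, /*control_output_arrays=*/{}, opts,
+//       &context);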
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>>
+GraphdefToSplattedMlirTranslateFunction(
+    llvm::StringRef input, absl::string_view input_arrays,
+    absl::string_view input_dtypes, absl::string_view input_shapes,
+    absl::string_view output_arrays, absl::string_view control_output_arrays,
+    const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context);
+
+// Converts a TensorFlow SavedModel stored in the directory with the given
+// `saved_model_dir` into a MLIR module. Creates MLIR entities into the
+// given MLIR `context`.
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>>
+SavedModelObjectGraphToMlirImport(
+    absl::string_view saved_model_dir,
+    const std::unordered_set<std::string>& tags,
+    absl::Span<std::string> exported_names, mlir::MLIRContext* context,
+    bool unconditionally_use_set_output_shapes = false,
+    bool import_variables_as_dense_resources = false);
+
+// Converts a TensorFlow V1 SavedModel stored in the directory with the given
+// `saved_model_dir` into a MLIR module. Creates MLIR entities into the
+// given MLIR `context`.
+// If `saved_model_bundle` is not null, it will be initialized with the model
+// bundle.
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>>
+SavedModelSignatureDefsToMlirImport(
+    absl::string_view saved_model_dir,
+    const std::unordered_set<std::string>& tags,
+    absl::Span<std::string> exported_names, mlir::MLIRContext* context,
+    MLIRImportOptions options,
+    std::unique_ptr<tensorflow::SavedModelBundle>* saved_model_bundle =
+        nullptr);
+
+// Converts a TensorFlow V1 SavedModel stored in the directory with the given
+// `saved_model_dir` into a MLIR module. Creates MLIR entities into the given
+// MLIR `context`. This does not create a session internally, so it is faster
+// and does not perform any graph transformation.
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>>
+SavedModelSignatureDefsToMlirImportLite(
+    absl::string_view saved_model_dir,
+    const std::unordered_set<std::string>& tags,
+    absl::Span<std::string> exported_names, mlir::MLIRContext* context,
+    MLIRImportOptions options);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_TF_MLIR_TRANSLATE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h
new file mode 100644
index 00000000..31baee55
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_
+#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+class GraphDef;
+class MetaGraphDef;
+
+// Generates the shared_name for resource handle ops in the graph and functions
+// if their shared_names are empty. Resource handle ops with empty shared_name
+// may have undesired semantics.
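+//
+// Illustrative use (a sketch, not part of the upstream header; the GraphDef
+// value is an assumption):
+//
+//   GraphDef gdef = ...;  // e.g. read from a frozen graph
+//   absl::Status status =
+//       GenerateResourceSharedNameIfEmpty(gdef, OpRegistry::Global());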
+absl::Status GenerateResourceSharedNameIfEmpty( + GraphDef& gdef, const OpRegistryInterface* default_registry); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h new file mode 100644 index 00000000..6ed684e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h @@ -0,0 +1,206 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ + +#include +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/tf2xla_defs.h" + +namespace mlir { +namespace TF { + +// TODO(b/229028654) Use definitions from tf2xla_defs.h directly. We currently +// don't do this to avoid explicit casts (implicit conversion from +// `absl::string_view` to `llvm::StringRef` is not supported until C++17). + +// Whether soft placement is allowed. If true, the marked node is eligible for +// outside compilation. +inline constexpr llvm::StringRef kAllowSoftPlacementAttr = + "allow_soft_placement"; + +// Marks a node for XLA compilation. The attribute value indicates the +// compilation device type. +inline constexpr llvm::StringRef kCompileDeviceTypeAttr = + "_xla_compile_device_type"; +// The attribute value speicifes the preferred outlined function name in +// ClusterOutliningPass. +inline constexpr llvm::StringRef kClusterOutlinedFunctionNameAttr = + "_cluster_outlined_function_name"; +// Marks a node for replication. The attribute value indicates the replication +// metadata op. +inline constexpr llvm::StringRef kReplicationInfoAttr = "_replication_info"; +// Marks a node for XLA-TPU compilation. The attribute value indicates the +// associated compilation cluster and replication metadata op. +inline constexpr llvm::StringRef kTpuReplicateAttr = "_tpu_replicate"; +// Device types. +inline constexpr llvm::StringRef kTpuDevice = "TPU"; +// _xla_outside_compilation +inline constexpr llvm::StringRef kXlaOutsideCompilationAttr = + "_xla_outside_compilation"; +// device attr +inline constexpr llvm::StringRef kDeviceAttr = "device"; +// Function attribute to signal that a function should be skipped from TPU +// island outlining. The attribute is set in +// `TpuV1BridgeExecutorIslandCoarsening` and removed in the subsequent +// `TPUBridgeExecutorIslandOutlining` pass. 
+inline constexpr llvm::StringRef kSkipIslandOutlining = + "_skip_island_outlining"; +// Function attribute to signal which argument contains bounded dynamic +// dimension. +inline constexpr llvm::StringRef kDynamicArgIndexAttr = "_dynamic_arg_index"; + +// This string attribute encodes parallel execution groups and their associated +// branches. It has the following format: +// `_parallel_execution_ids= group1:branch1,group2:branch2,...` +// For example, if we have IR as follows: +// +// tf_executor.island wraps "tf.OpA" +// tf_executor.island { +// "tf_device.replicate" {n = 2} { +// "tf.OpB" +// "tf_device.parallel_execute"() ({ +// "tf.OpC" +// }, { +// "tf.OpD" +// }) +// } +// +// The above IR will be flattened after `ReplicateToIslandPass` and +// `ParallelExecuteToIslandsPass` as follows: +// +// tf_executor.island wraps "tf.OpA" +// tf_executor.island {_parallel_execution_ids=r0:0} wraps "tf.OpB" +// tf_executor.island {_parallel_execution_ids=r0:0,p0:0} wraps "tf.OpC" +// tf_executor.island {_parallel_execution_ids=r0:0,p0:1} wraps "tf.OpD" +// tf_executor.island {_parallel_execution_ids=r0:1} wraps "tf.OpB" +// tf_executor.island {_parallel_execution_ids=r0:1,p0:0} wraps "tf.OpC" +// tf_executor.island {_parallel_execution_ids=r0:1,p0:1} wraps "tf.OpD" +// +// "tf.OpA" will not have `_parallel_execution_ids` attr, +// means it does not belong to any parallel execution groups. +// First instance of "tf.OpB" after flattening will have +// `_parallel_execution_ids = "r0:0"`, +// which represents the first branch of replicate group 0. +// Second instance of "tf.OpB" after flattening will have +// `_parallel_execution_ids = "r0:1"` +// which represents the second branch of replicate group 0. +// First instance of "tf.OpC" after flattening will have +// `_parallel_execution_ids = "r0:0,p0:0"` +// which represents the first branch of replicate group 0 and +// the first branch of parallel group 0. +// Second instance of "tf.OpC" after flattening will have +// `_parallel_execution_ids = "r0:1,p0:0"` +// which represents the second branch of replicate group 0 and +// the first branch of parallel group 0. +// First instance of "tf.OpD" after flattening will have +// `_parallel_execution_ids = "r0:0,p0:1"` +// which represents the first branch of replicate group 0 and +// the second branch of parallel group 0. +// Second instance of "tf.OpD" after flattening will have +// `_parallel_execution_ids = "r0:1,p0:1"` +// which represents the second branch of replicate group 0 and +// the second branch of parallel group 0. +inline constexpr llvm::StringRef kParallelExecAnnotation = + "_parallel_execution_ids"; + +// Logging + +// Name of component for error logging. This name is fixed and required to +// enable logging. +inline const char kBridgeComponent[] = "TFXLABridge"; +inline const char kMlirPh1BridgeCounterReplicated[] = "replicated"; +inline const char kMlirPh1BridgeCounterNonReplicated[] = "nonreplicated"; +inline const char kMlirPh1BridgeCounterV1[] = "v1"; +inline const char kMlirPh1BridgeCounterV2[] = "v2"; +inline const char kMlirPh1BridgeCounterTpu[] = "tpu"; +inline const char kMlirPh1BridgeCounterNonTpu[] = "cpu/gpu"; +inline const char kXlaOutsideCompilation[] = "_xla_outside_compilation"; + +// Copies attributes that satisfy the given predicate from `from` to `to`. 
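+//
+// Illustrative use of the copy helpers below (a sketch, not part of the
+// upstream header; `src` and `dst` are assumed Operation pointers):
+//
+//   CopyAttributes(src, dst, [](const NamedAttribute &attr) {
+//     return attr.getName().strref().starts_with("_tpu");
+//   });
+//   CopyUnderscoredAttributes(src, dst);
+//   CopyXlaOutsideCompilationAttributes(src, dst);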
+template +void CopyAttributes(Operation *from, Operation *to, Predicate P) { + for (const NamedAttribute &attr : from->getAttrs()) + if (P(attr)) to->setAttr(attr.getName(), attr.getValue()); +} + +// Copies attributes whose name begins with an _ from `from` to `to`. +inline void CopyUnderscoredAttributes(Operation *from, Operation *to) { + CopyAttributes(from, to, [](const NamedAttribute &attr) { + return attr.getName().strref().front() == '_'; + }); +} + +// Copies outside compilation attribute from `from` to `to`. +inline void CopyXlaOutsideCompilationAttributes(Operation *from, + Operation *to) { + CopyAttributes(from, to, [](const NamedAttribute &attr) { + return attr.getName().strref() == kXlaOutsideCompilationAttr; + }); +} + +// Copies attributes that are either `device` or whose name begins with an _ +// from `from` to `to`. +// TODO(b/158769932): This should be a general feature instead post some policy +// discussion. +inline void CopyDeviceAndUnderscoredAttributes(Operation *from, Operation *to) { + auto device = mlir::StringAttr::get(from->getContext(), "device"); + CopyAttributes(from, to, [&device](const NamedAttribute &attr) { + return attr.getName().strref().front() == '_' || attr.getName() == device; + }); +} + +// Forward declare these passthrough ops. +// TODO(jpienaar): Remove these and use trait instead. +class IdentityOp; +class IdentityNOp; + +// Returns if a value corresponds to a constant, returns the matched constant +// as an attribute. +template +bool GetValueAsConstant(Value val, AttrT &attr) { + while (auto result = mlir::dyn_cast(val)) { + Operation *op = result.getOwner(); + if (!isa(op) && !isa(op)) break; + val = op->getOperand(result.getResultNumber()); + } + return matchPattern(val, m_Constant(&attr)); +} + +// Checks if both compilation and replication attributes are present in the +// operation, and if their values are valid. +LogicalResult HasValidCompilationAndReplicationAttributes(Operation &op); + +// Checks if the device attribute is valid. +LogicalResult IsValidDeviceTypeOrEmpty(StringAttr attr); + +using ParallelExecutionIdPairs = + llvm::SmallVector, 8>; +// Parses the parallel execution attribute for `op` and fills `id_pairs` with +// the corresponding (group ID,branch ID) pairs. +// Returns `failure` if the attribute is malformed. +LogicalResult ParseParallelExecutionIds(Operation *op, + ParallelExecutionIdPairs &id_pairs); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h new file mode 100644 index 00000000..84bc1c60 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h @@ -0,0 +1,100 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ + +#include +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace tensorflow { + +// Logger for logging MLIR modules before and after passes in MLIR TPU bridge. +// +// The IR logging can be restricted to a particular set of pass invocations via +// filters that are specified with the `MLIR_BRIDGE_LOG_PASS_FILTER` and +// `MLIR_BRIDGE_LOG_STRING_FILTER` environment variables. +// `MLIR_BRIDGE_LOG_PASS_FILTER` takes a semicolon-separated list of pass class +// names, `MLIR_BRIDGE_LOG_STRING_FILTER` takes a semicolon-separated list of +// strings, and IR is only dumped for a pass invocation if the pass name exactly +// matches any of the provided pass names and if the serialized operation on +// which the pass is invoked contains any of the specified strings as a +// substring. An empty list is interpreted as no restriction. The string filter +// can be handy e.g. if one is only interested in a certain function or when +// checking where a certain attribute gets lost. Note that we use a semicolon +// instead of comma as the separator to allow strings that contain commas (which +// frequently appear in MLIR). The strings can contain any characters (including +// spaces) except semicolons. +// +// Example: Setting the environment variables +// `MLIR_BRIDGE_LOG_PASS_FILTER="LegalizeTF;Canonicalizer"` and +// `MLIR_BRIDGE_LOG_STRING_FILTER="my_string"` will dump IR only for invocations +// of `LegalizeTF` and `Canonicalizer` where the string `my_string` is contained +// in the serialized operation on which the pass is invoked. For verbose log +// level >= 1, `bridge_logger.cc` prints details about pass invocations for +// which the IR dumping was skipped because of a filter. +class BridgeLoggerConfig : public mlir::PassManager::IRPrinterConfig { + public: + explicit BridgeLoggerConfig( + bool print_module_scope = false, bool print_after_only_on_change = true, + mlir::OpPrintingFlags op_printing_flags = mlir::OpPrintingFlags()); + + // A hook that may be overridden by a derived config that checks if the IR + // of 'operation' should be dumped *before* the pass 'pass' has been + // executed. If the IR should be dumped, 'print_callback' should be invoked + // with the stream to dump into. + void printBeforeIfEnabled(mlir::Pass* pass, mlir::Operation* op, + PrintCallbackFn print_callback) override; + + // A hook that may be overridden by a derived config that checks if the IR + // of 'operation' should be dumped *after* the pass 'pass' has been + // executed. If the IR should be dumped, 'print_callback' should be invoked + // with the stream to dump into. + void printAfterIfEnabled(mlir::Pass* pass, mlir::Operation* op, + PrintCallbackFn print_callback) override; + + // Returns `true` iff we should log IR for given `pass` and `op`. + // Note: Visibility of this function is public for use in unit testing. + bool ShouldPrint(mlir::Pass* pass, mlir::Operation* op); + + private: + // Get `filter` encoded by environment variable `env_var`. 
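+  //
+  // Illustrative setup (a sketch, not part of the upstream header; assumes the
+  // environment variables are exported before the PassManager runs and that IR
+  // printing is enabled on a single-threaded context):
+  //
+  //   setenv("MLIR_BRIDGE_LOG_PASS_FILTER", "LegalizeTF;Canonicalizer", 1);
+  //   mlir::MLIRContext context;
+  //   context.disableMultithreading();
+  //   mlir::PassManager pm(&context);
+  //   pm.enableIRPrinting(std::make_unique<BridgeLoggerConfig>());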
+ static std::vector GetFilter(const std::string& env_var); + // Returns `true` iff any of the strings in `filter` matches `str`, either + // exactly or as a substring, depending on `exact_match`. + static bool MatchesFilter(const std::string& str, + const std::vector& filter, + bool exact_match); + // Determines whether only top-level passes should be dumped. + // Returns true unless the environment variable is set to "0" or "false". + static bool ShouldOnlyDumpTopLevelPasses(); + + // Only log pass invocations whose pass name exactly matches any string in + // `pass_filter_` (or when `pass_filter_` is empty). + const std::vector pass_filter_; + // Only log pass invocations where the serialized operation on which the pass + // is invoked contains any of the specified strings as a substring (or when + // `string_filter_` is empty). + const std::vector string_filter_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h new file mode 100644 index 00000000..ddefbd0a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h @@ -0,0 +1,119 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ + +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { + +// Return a list of attribute names that indicates an entry function. +std::vector GetEntryFunctionAttributeNames(); + +// Check if a function is an entry in an MLIR module. +bool IsEntryFunction(func::FuncOp func); + +// Get all the entry functions in an MLIR module. +llvm::SmallVector GetEntryFunctions(ModuleOp module); + +// Get all the functions referenced in a symber user op and save them in +// `callees`. +LogicalResult GetCallees(SymbolUserOpInterface op, SymbolTable &symtab, + llvm::SmallVector &callees); + +// Find the first op with any of the specified types on each path rooted at the +// `root` node in a tree. Additional checks can be applied via `predicate`. The +// results are stored in `ops`. 
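+//
+// Illustrative instantiation (a sketch, not part of the upstream header; the
+// chosen op types, `module`, and `main_func` are assumptions and require the
+// TF dialect headers):
+//
+//   SymbolTable symtab(module);
+//   llvm::SmallVector<SymbolUserOpInterface> calls;
+//   if (failed(GetFirstOpsOfType<TF::StatefulPartitionedCallOp,
+//                                TF::PartitionedCallOp>(
+//           main_func, symtab, /*predicate=*/{}, calls)))
+//     return failure();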
+template +LogicalResult GetFirstOpsOfType( + func::FuncOp root, SymbolTable &symtab, + const std::function &predicate, + llvm::SmallVector &ops) { + std::stack worklist; + worklist.push(root); + while (!worklist.empty()) { + func::FuncOp u = worklist.top(); + worklist.pop(); + auto result = u.walk([&](SymbolUserOpInterface op) { + if (llvm::isa(op) && (!predicate || predicate(op))) { + ops.push_back(op); + return WalkResult::advance(); + } + llvm::SmallVector callees; + if (GetCallees(op, symtab, callees).failed()) { + return WalkResult::interrupt(); + } + for (auto callee : callees) { + worklist.push(callee); + } + return WalkResult::advance(); + }); + if (result.wasInterrupted()) return failure(); + } + return success(); +} + +// Find the nodes with any of the specified types on the tree rooted at `root` +// node. Additional checks can be applied via `predicate`. The search skips +// the current path if a node with the specified types fails the check, and +// continues on the next path. The passing ops are stored in `hits`, while the +// first failing on on each path is stored in `first_misses`. +template +LogicalResult GetOpsOfTypeUntilMiss( + func::FuncOp root, SymbolTable &symtab, + const std::function &predicate, + llvm::SmallVector &hits, + llvm::SmallVector &first_misses) { + std::stack worklist; + worklist.push(root); + while (!worklist.empty()) { + func::FuncOp u = worklist.top(); + worklist.pop(); + auto result = u.walk([&](SymbolUserOpInterface op) { + if (llvm::isa(op)) { + if (!predicate || predicate(op)) { + hits.push_back(op); + } else { + first_misses.push_back(op); + return WalkResult::advance(); + } + } + llvm::SmallVector callees; + if (GetCallees(op, symtab, callees).failed()) { + return WalkResult::interrupt(); + } + for (auto callee : callees) { + worklist.push(callee); + } + return WalkResult::advance(); + }); + if (result.wasInterrupted()) return failure(); + } + return success(); +} + +// Check if a function has one region and one block only. +bool HasSingleBlock(func::FuncOp func); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/cluster_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/cluster_util.h new file mode 100644 index 00000000..c521298e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/cluster_util.h @@ -0,0 +1,67 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CLUSTER_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CLUSTER_UTIL_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" + +namespace mlir::TF { + +// Cluster structure captures all the operations that are assigned to same +// device and can form a legal strict cluster. +// Ops must follow same ordering in their parent block. We rely on this +// assumption to perform analysis. +struct Cluster { + llvm::SetVector ops; + std::string target; +}; + +// Builds the op clusters in the `block`. Ops are filtered by the function +// `get_target` that takes an op and returns the target name. `is_ignored_op` is +// a hook to ignore certain ops that are not included in any clusters. +llvm::StringMap> BuildAllClusters( + Block& block, const TF::SideEffectAnalysis::Info& side_effect_analysis, + std::function get_target, + std::function is_ignored_op); + +// Reorder all users of the given op's results to after the op. +// +// Since launch ops are inserted after the last op in the region, the region is +// guaranteed to dominate all live-in values. On the other hand, it is still +// possible that live-out values don't dominate the region. For example: +// +// ``` +// %0 = "tf.OpA"() +// %1 = "tf.OpB"(%0) +// %2 = "tf.OpC"(%0) +// ``` +// +// Assuming `tf.OpA` and `tf.OpC` are clustered together, the region will be +// inserted right after `tf.OpC`. The live-out `%0`, however, is used by +// `tf.OpB`, which won't dominate the region. This function reorders all users +// of the cluster op to be placed after the cluster op itself so that SSA +// dominance is preserved after cluster op creation. +void ReorderOpResultUses(mlir::Operation* cluster); + +} // namespace mlir::TF + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CLUSTER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h new file mode 100644 index 00000000..10271fcb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_
+#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_
+
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tsl/platform/statusor.h"
+
+namespace tensorflow {
+
+using tsl::StatusOr;
+
+// Converts a non-func AttrValue proto into an MLIR attribute. Func attributes
+// are excluded in this function because the function might be renamed when the
+// function definition is imported.
+absl::StatusOr<mlir::Attribute> ConvertNonFuncAttributeValue(
+    const AttrValue& value, mlir::Builder* builder);
+
+// Converts all kinds of AttrValue proto into an MLIR attribute.
+absl::StatusOr<mlir::Attribute> ConvertAttributeValue(const AttrValue& value,
+                                                      mlir::Builder* builder);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h
new file mode 100644
index 00000000..ba5cd3d8
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h
@@ -0,0 +1,70 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TENSOR_H_
+#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TENSOR_H_
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/protobuf/struct.pb.h"
+
+namespace tensorflow {
+
+using tsl::StatusOr;
+
+// Converts a TensorFlow tensor proto into an MLIR elements attribute.
+absl::StatusOr<mlir::ElementsAttr> ConvertTensorProto(
+    const TensorProto& input_tensor, mlir::Builder* builder,
+    bool convert_to_dense_resource = false);
+
+// Converts a TensorFlow tensor into an MLIR elements attribute.
+absl::StatusOr<mlir::ElementsAttr> ConvertTensor(
+    const Tensor& input_tensor, mlir::Builder* builder,
+    bool convert_to_dense_resource = false);
+
+// Converts a shape from MLIR to a TensorFlow tensor shape proto.
+void ConvertToTensorShapeProto(llvm::ArrayRef<int64_t> shape,
+                               TensorShapeProto* output_shape);
+
+// Converts an MLIR type to a TensorFlow tensor shape.
+PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type);
+
+// Converts an MLIR shaped type to a TensorFlow shape attribute.
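+//
+// Illustrative round trip through the helpers in this file (a sketch, not part
+// of the upstream header; the builder is an assumption):
+//
+//   Tensor tensor(DT_FLOAT, TensorShape({2, 2}));
+//   absl::StatusOr<mlir::ElementsAttr> attr = ConvertTensor(tensor, &builder);
+//   TensorProto proto;
+//   if (attr.ok()) {
+//     absl::Status s = ConvertToTensorProto(*attr, &proto);
+//   }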
+mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type); + +// Converts an MLIR shaped type to a Tensorflow tensor spec proto. +absl::StatusOr ConvertTypeToTensorSpecProto( + const mlir::Type& type); + +// Converts a TensorFlow shape attribute to an MLIR shape attribute. +absl::StatusOr ConvertTensorShapeProto( + const TensorShapeProto& shape, mlir::MLIRContext* context); + +// Converts an MLIR elements attribute to a TensorFlow tensor proto. +absl::Status ConvertToTensorProto(mlir::ElementsAttr attr, + TensorProto* output_tensor); + +// Converts an MLIR elements attribute to a TensorFlow tensor. +absl::Status ConvertToTensor(mlir::ElementsAttr attr, Tensor* output_tensor); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h new file mode 100644 index 00000000..1ce9d054 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TYPE_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TYPE_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +using tsl::StatusOr; + +// Converts the TensorFlow DataType 'dtype' into an MLIR (scalar) type. +absl::Status ConvertDataType(DataType dtype, mlir::Builder builder, + mlir::Type* type); + +// Converts a scalar MLIR type to a TensorFlow Datatype. +absl::Status ConvertScalarTypeToDataType(mlir::Type type, DataType* dtype); + +// Converts an MLIR type to TensorFlow DataType. If 'type' is a scalar type, it +// is converted directly. If it is a shaped type, the element type is converted. +absl::Status ConvertToDataType(mlir::Type type, DataType* dtype); + +// Converts an TensorFlow shape to the one used in MLIR. +void ConvertToMlirShape(const TensorShape& input_shape, + llvm::SmallVectorImpl* shape); + +// Converts an TensorFlow shape proto to the one used in MLIR. +absl::Status ConvertToMlirShape(const TensorShapeProto& input_shape, + llvm::SmallVectorImpl* shape); + +// Given a tensor shape and dtype, get the corresponding MLIR tensor type. 
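+//
+// Illustrative use (a sketch, not part of the upstream header; `context` and
+// `shape_proto` are assumptions):
+//
+//   mlir::Builder builder(&context);
+//   mlir::Type elem_type;
+//   absl::Status s = ConvertDataType(DT_FLOAT, builder, &elem_type);
+//   absl::StatusOr<mlir::Type> tensor_type =
+//       ConvertToMlirTensorType(shape_proto, DT_FLOAT, &builder);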
+absl::StatusOr ConvertToMlirTensorType( + const TensorShapeProto& shape, DataType dtype, mlir::Builder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h new file mode 100644 index 00000000..e45479bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h @@ -0,0 +1,57 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ + +#include +#include + +#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" + +namespace tensorflow { + +class DataDumperLoggerConfig : public ::tensorflow::BridgeLoggerConfig { + public: + explicit DataDumperLoggerConfig( + std::function + get_filename, + const std::string &pass_prefix = "", bool print_module_scope = false, + bool print_after_only_on_change = true, + mlir::OpPrintingFlags op_printing_flags = mlir::OpPrintingFlags()); + + void printBeforeIfEnabled(mlir::Pass *pass, mlir::Operation *op, + PrintCallbackFn print_callback) override; + + void printAfterIfEnabled(mlir::Pass *pass, mlir::Operation *op, + PrintCallbackFn print_callback) override; + + private: + static void DumpMlir(const std::string &filename, + BridgeLoggerConfig::PrintCallbackFn print_callback); + + // The function to dump the target MLIR string to file. + // The parameter that will be sent to the dump_func_ is: + // The pass name (std::string) + std::function + get_filename_; + + // The pass prefix. + std::string pass_prefix_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/device_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/device_util.h new file mode 100644 index 00000000..14e48bf7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/device_util.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +// Collects all devices known to the system by name and adds them as a +// `tf.devices` dictionary attribute with a full device name as a key, and +// device metadata as a value. +// +// Device names added in full parsed device form: +// /job:/replica:/task:/device:: +// +// Supported device metadata types: +// (1) GpuDeviceMetadata: GPU device compute capability. +void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set); + +// Collects devices information from an op `tf.devices` attributes. Returns +// failure if can't parse device metadata from the attribute. +mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, + mlir::TF::RuntimeDevices* devices); + +// Parses a device string and returns its ordinal (id). This will return an +// error if the device string is invalid or has no id. +mlir::LogicalResult GetDeviceOrdinalFromDeviceString(mlir::Location loc, + llvm::StringRef device, + int64_t* device_ordinal); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h new file mode 100644 index 00000000..ae6e0b61 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_GRAPH_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_GRAPH_H_ + +#include +#include + +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +struct MlirDumpConfig; + +// Dumps 'graph_def' to a file, as textual IR. Returns the file name chosen. +// +// Note: This is for debugging use and is not optimized for performance. +absl::Status DumpTextualIRToFile(const MlirDumpConfig& config, + const Graph& graph, + const FunctionLibraryDefinition* flib_def, + WritableFile* file); + +// Config of the textual dump. 
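+//
+// Illustrative configuration of the struct below (a sketch, not part of the
+// upstream header; `graph`, `flib_def`, and `file` are assumptions):
+//
+//   MlirDumpConfig config;
+//   config.emit_location_information(/*pretty_form=*/true)
+//       .elide_large_attributes();
+//   absl::Status s = DumpTextualIRToFile(config, graph, &flib_def, file.get());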
+struct MlirDumpConfig { + enum class Dialect { + // Tensorflow Graph Dialect + kTFG, + }; + + // The limit of element size that gets printed. + MlirDumpConfig& elide_large_attributes(int large_element_limit = 16) { + this->op_printing_flags.elideLargeElementsAttrs(large_element_limit); + return *this; + } + + // Enable printing of debug information. If 'pretty_form' is set to true, + // debug information is printed in a more readable 'pretty' form but this + // pretty form is not parsable (so only for human readability). + MlirDumpConfig& emit_location_information(bool pretty_form = false) { + this->op_printing_flags.enableDebugInfo(/*enable=*/true, pretty_form); + return *this; + } + + MlirDumpConfig& emit_dialect(Dialect dialect) { + this->dialect = dialect; + return *this; + } + + // Op printing flags. + mlir::OpPrintingFlags op_printing_flags = std::nullopt; + + // The target MLIR dialect. + Dialect dialect = Dialect::kTFG; +}; + +// Change DumpGraphToFile to dump MLIR textual IR instead of protobuf. +void UseMlirForGraphDump(const MlirDumpConfig& = {}); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h new file mode 100644 index 00000000..87d53e8b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -0,0 +1,105 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +inline constexpr absl::string_view kCrashReproducerStdErr = "-"; +inline constexpr absl::string_view kCrashReproducerCrashAnalysis = + "crash_analysis"; + +// Creates a file to use for dumping and returns success if a file could be +// created. The opened file is placed in 'os' and the path of the file used is +// placed in 'filepath'. +// +// If the TF_DUMP_GRAPH_PREFIX environment variable is kCrashReproducerStdErr, +// then the LOG(INFO) macro is used instead. +// +// This will create a file name via prefixing `name` with the value of the +// TF_DUMP_GRAPH_PREFIX environment variable if `dirname` is empty and +// suffixing `name` with ".mlir". +absl::Status CreateFileForDumping(llvm::StringRef name, + std::unique_ptr* os, + std::string* filepath, + llvm::StringRef dirname = ""); + +// Dumps MLIR operation to a file and returns the file name used. 
+// +// If the TF_DUMP_GRAPH_PREFIX environment variable is kCrashReproducerStdErr, +// then the MLIR operation will be logged (using the LOG(INFO) macro) instead. +// +// This will create a file name via prefixing `name` with the value of the +// TF_DUMP_GRAPH_PREFIX environment variable if `dirname` is empty and +// suffixing `name` with ".mlir". +// If `pass_manager` is provided, prints a header with the pass pipeline. +std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, + llvm::StringRef dirname = "", + const mlir::PassManager* pass_manager = nullptr); + +// Reads the directory to dump the MLIR module from environment variables. +// Default is reading from TF_DUMP_GRAPH_PREFIX, and if the string is 'sponge' +// read from TEST_UNDECLARED_OUTPUTS_DIR. Returns nullptr if the directory +// cannot be determined and generates a warning message. +std::string GetDumpDirFromEnvVar(); + +// Dumps a raw string to a file and returns the file name used. +// +// This will create a file name via prefixing `name` with the value of the +// TF_DUMP_GRAPH_PREFIX environment variable if `dirname` is empty and +// suffixing `name` with ".mlir". +std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, + llvm::StringRef dirname = ""); + +// Enable the crash reproducer on the provided PassManager to the provided +// directory path. +// If the provided path is empty, it is retrieved from the +// environment variable `MLIR_CRASH_REPRODUCER_DIRECTORY`. +// If the provided path is the string "sponge", the file will be included +// in the sponge "Output Files" by looking up the environment to infer +// the directory path. +// If the provided path is the string kCrashReproducerStdErr, the data is +// dumped into the stderr. +// If the provided path is the string kCrashReproducerCrashAnalysis, the data +// is dumped to the crash analysis system. Note, environment var +// `MLIR_CRASH_REPRODUCER_DIRECTORY` can be used to override +// kCrashReproducerCrashAnalysis settings. +void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path = ""); + +// This applies both the PassManagerCLOptions provided by MLIR along with any +// tensorflow specific options. +// +// Note that this function should be in a more appropriate file, but it is +// unclear what a proper file would be as no other functions would currently be +// in the file also. +void applyTensorflowAndCLOptions(mlir::PassManager& pm, + llvm::StringRef dir_path = ""); + +// Prints the pass pipeline of `pass_manager` to `os`. +void PrintPassPipeline(const mlir::PassManager& pass_manager, + mlir::Operation* op, llvm::raw_ostream& os); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h new file mode 100644 index 00000000..a06d9664 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DYNAMIC_SHAPE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DYNAMIC_SHAPE_UTILS_H_ + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project + +namespace tensorflow { + +llvm::SmallVector ConvertTFShapeToMlir(llvm::ArrayRef shapes); + +llvm::SmallVector ConvertMlirShapeToTF(llvm::ArrayRef shape); + +static constexpr int64_t kTFDynamicSize = -1; +mlir::RankedTensorType GetTypeFromTFTensorShape(llvm::ArrayRef shape, + mlir::Type elementType, + mlir::Attribute encoding = {}); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DYNAMIC_SHAPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/error_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/error_util.h new file mode 100644 index 00000000..bd958c8c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/error_util.h @@ -0,0 +1,63 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ERROR_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ERROR_UTIL_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "xla/mlir/utils/error_util.h" +#include "tensorflow/core/platform/status.h" + +// Error utilities for MLIR when interacting with code using Status returns. +namespace mlir { + +// TensorFlow's Status is used for error reporting back to callers. +using ::tensorflow::Status; + +// TF customized diagnostic handler that collects all the diagnostics reported +// and can produce a Status to return to callers. This is for the case where +// MLIR functions are called from a function that will return a Status: MLIR +// code still uses the default error reporting, and the final return function +// can return the Status constructed from the diagnostics collected. +// todo: [b/253331656]. Note ConsumeStatus() and Combine() are wrappers +// of what is inherited from the BaseScopedDiagnosticHandler to +// support cases where tensorflow::Status is still being used (base class uses +// absl::Status) +class StatusScopedDiagnosticHandler : public BaseScopedDiagnosticHandler { + public: + // Constructs a diagnostic handler in a context. If propagate is true, then + // diagnostics reported are also propagated back to the original diagnostic + // handler. 
If filter_stack is true, a reduced stack will be produced. + + explicit StatusScopedDiagnosticHandler(MLIRContext* context, + bool propagate = false, + bool filter_stack = false); + + ~StatusScopedDiagnosticHandler() = default; + // Returns Status corresponding to the diagnostics reported. This consumes + // the diagnostics reported and returns a Status of type Unknown. It is + // required to consume the error status, if there is one, before destroying + // the object. + Status ConsumeStatus(); + + // Returns the combination of the passed in status and consumed diagnostics. + // This consumes the diagnostics reported and either appends the diagnostics + // to the error message of 'status' (if 'status' is already an error state), + // or returns an Unknown status (if diagnostics reported), otherwise OK. + Status Combine(Status status); +}; +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ERROR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/eval_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/eval_util.h new file mode 100644 index 00000000..e3e14afc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/eval_util.h @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EVAL_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EVAL_UTIL_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/c/eager/c_api.h" + +namespace tensorflow { + +// Attempts to evaluates an MLIR Operation in TensorFlow eager mode with the +// specified operands. The op is always executed on the local host CPU +// irrespective of the device attribute of the given op. If there is a CPU +// kernel registered for the op and is executed successfully, this fills in the +// results vector. If not, results vector is unspecified. +// +mlir::LogicalResult EvaluateOperation( + mlir::Operation* inst, llvm::ArrayRef operands, + TFE_Context* context, llvm::SmallVectorImpl* results); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EVAL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h new file mode 100644 index 00000000..28d5df0c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -0,0 +1,96 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EXPORT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EXPORT_UTILS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/status.h" + +namespace mlir { +class ShapedType; +} // namespace mlir + +namespace tensorflow { + +using tsl::StatusOr; + +// Add custom op prefix for TensorFlow dialects. +absl::Status AddTensorFlowOpPrefix(std::string); + +// Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control +// dialect back into a TensorFlow valid op name. +absl::StatusOr GetTensorFlowOpName(llvm::StringRef); + +// Converts an MLIR operation to TensorFlow NodeDef with given node name. This +// name should be unique to the graph it is being inserted into. +absl::StatusOr> GetOperationNodeDef( + mlir::Operation* inst, llvm::StringRef name); + +// Converts MLIR attributes with values to their tensorflow equivalent. +// "name" and "device" attributes are ignored by default. Use attrs_to_ignore to +// specify any other attributes that should be ignored. +absl::Status ConvertAttributes( + llvm::ArrayRef attrs, + const absl::flat_hash_set& attrs_to_ignore, + bool remove_ref_type, AttrValueMap* values); + +// Fill in the contents of TensorShapeProto for the given shape. +// ShapeContainerT is any type with the following methods: +// bool hasRank() +// ArrayRef getShape() +// This includes mlir::TF::ShapeAttr and mlir::ShapedType. +template +void SetTensorShapeProto(ShapeContainerT shape, TensorShapeProto* proto) { + if (shape.hasRank()) { + for (int64_t dim : shape.getShape()) { + proto->add_dim()->set_size(mlir::ShapedType::isDynamic(dim) ? -1 : dim); + } + } else { + proto->set_unknown_rank(true); + } +} + +// Sets shape attribute with the given name. If the attribute already exists +// with a different value, returns an error. +absl::Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shape, + AttrValueMap* values); + +// Returns true if the given instruction is an mlir::TF::LegacyCallOp or the +// result of such an operation transformed by the +// ExecutorToControlDialectConversion pass. +// +// TODO(b/145706023): When the ExecutorToControlDialectConversion pass runs +// before the exporter, it mutates an mlir::TF::LegacyCallOp instruction to +// an instruction with a different operation name. As such, this routine checks +// both forms of a LegacyCall instruction. We only need to check for +// mlir::TF::LegacyCallOp when the ticket is resolved. 
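
Usage sketch (illustrative, not part of the vendored export_utils.h): the SetTensorShapeProto template shown above accepts any shape container with hasRank()/getShape(), e.g. an mlir::ShapedType. The function name, the MLIRContext plumbing, and the use of ShapedType::kDynamic (the dynamic-size sentinel in recent MLIR) are assumptions about the calling context, not part of the header.

    #include "mlir/IR/Builders.h"
    #include "mlir/IR/BuiltinTypes.h"
    #include "mlir/IR/MLIRContext.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h"
    #include "tensorflow/core/framework/tensor_shape.pb.h"

    // Converts the shape of an MLIR tensor type into a TensorShapeProto.
    tensorflow::TensorShapeProto ShapeToProtoExample(mlir::MLIRContext* context) {
      mlir::Builder builder(context);
      // tensor<?x128xf32>: the leading dimension is dynamic.
      auto type = mlir::RankedTensorType::get(
          {mlir::ShapedType::kDynamic, 128}, builder.getF32Type());
      tensorflow::TensorShapeProto proto;
      // Dynamic dimensions are exported as size -1 by the helper above.
      tensorflow::SetTensorShapeProto(type, &proto);
      return proto;
    }
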
+bool IsLegacyCallInstruction(mlir::Operation* inst); +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EXPORTER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h new file mode 100644 index 00000000..6ded27b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h @@ -0,0 +1,85 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_FAKE_SESSION_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_FAKE_SESSION_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace TF { +namespace test_util { +// FakeSession is for testing only. +class FakeSession : public tensorflow::Session { + public: + FakeSession(); + + absl::Status Create(const tensorflow::GraphDef& graph) override; + absl::Status Extend(const tensorflow::GraphDef& graph) override; + + absl::Status Close() override; + + absl::Status ListDevices( + std::vector* response) override; + + absl::Status LocalDeviceManager( + const tensorflow::DeviceMgr** deviceMgrPtr) override; + + absl::Status Run( + const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector<::tensorflow::Tensor>* outputs) override; + + absl::Status Run( + const tensorflow::RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector<::tensorflow::Tensor>* outputs, + tensorflow::RunMetadata* run_metadata) override; + + absl::Status Run( + const tensorflow::RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector<::tensorflow::Tensor>* outputs, + tensorflow::RunMetadata* run_metadata, + const tensorflow::thread::ThreadPoolOptions& thread_pool_options) + override; + + private: + void InitVariables(); + void BuildDeviceManager(); + void Initialize(); + + std::unique_ptr device_mgr_; + bool initialized_ = false; +}; + +} // namespace test_util +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_FAKE_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h new file mode 100644 index 00000000..8b0aaa37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_IMPORT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_IMPORT_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// Reads text (.pbtext) or binary (.pb) format of a proto message from the given +// buffer. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. +absl::Status LoadProtoFromBuffer(absl::string_view input, + protobuf::Message* proto); +absl::Status LoadProtoFromBuffer(absl::string_view input, + protobuf::MessageLite* proto); + +// Reads text (.pbtext) or binary (.pb) format of a proto message from the given +// file path. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. +absl::Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::Message* proto); +absl::Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::MessageLite* proto); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_IMPORT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/location_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/location_utils.h new file mode 100644 index 00000000..c65cbb3e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/location_utils.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_LOCATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_LOCATION_UTILS_H_ + +#include "mlir/IR/Location.h" // from @llvm-project + +namespace tensorflow { + +mlir::Location GetLocationWithoutOpType(mlir::Location loc); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_IMPORT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h new file mode 100644 index 00000000..a0c14f27 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MANGLING_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MANGLING_UTIL_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace mangling_util { +// The type of a mangled string. +enum class MangledKind { kUnknown, kDataType, kTensorShape, kTensor }; + +// Mangles an attribute name, marking the attribute as a TensorFlow attribute. +string MangleAttributeName(absl::string_view str); + +// Returns true if 'str' was mangled with MangleAttributeName. +bool IsMangledAttributeName(absl::string_view str); + +// Demangles an attribute name that was manged with MangleAttributeName. +// REQUIRES: IsMangledAttributeName returns true. +absl::string_view DemangleAttributeName(absl::string_view str); + +// Returns the type of a mangled string, or kUnknown. +MangledKind GetMangledKind(absl::string_view str); + +// Return a TensorShapeProto mangled as a string. +string MangleShape(const TensorShapeProto& shape); +// Demangle a string mangled with MangleShape. +absl::Status DemangleShape(absl::string_view str, TensorShapeProto* proto); + +// Return a TensorProto mangled as a string. +string MangleTensor(const TensorProto& tensor); +// Demangle a string mangled with MangleTensor. +absl::Status DemangleTensor(absl::string_view str, TensorProto* proto); + +// Return a DataType mangled as a string. +string MangleDataType(const DataType& dtype); +// Demangle a string mangled with MangleDataType. 
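
Usage sketch (illustrative, not part of the vendored mangling_util.h): a round trip through MangleShape/GetMangledKind/DemangleShape, which are declared above. The function name and the concrete shape values are illustrative assumptions.

    #include "absl/status/status.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
    #include "tensorflow/core/framework/tensor_shape.pb.h"

    absl::Status MangleRoundTripExample() {
      namespace mu = tensorflow::mangling_util;

      tensorflow::TensorShapeProto shape;
      shape.add_dim()->set_size(2);
      shape.add_dim()->set_size(128);

      // Mangle the shape into a tagged string, classify it, then recover it.
      const std::string mangled = mu::MangleShape(shape);
      if (mu::GetMangledKind(mangled) != mu::MangledKind::kTensorShape) {
        return absl::InternalError("unexpected mangled kind");
      }
      tensorflow::TensorShapeProto round_tripped;
      return mu::DemangleShape(mangled, &round_tripped);
    }
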
+absl::Status DemangleDataType(absl::string_view str, DataType* proto); + +} // namespace mangling_util +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MANGLING_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h new file mode 100644 index 00000000..0359d38c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MLPROGRAM_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MLPROGRAM_UTIL_H_ + +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace tensorflow { + +void RegisterMlProgramPasses(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_MLPROGRAM_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.h new file mode 100644 index 00000000..1b0e0201 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.h @@ -0,0 +1,41 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARALLEL_EXECUTE_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARALLEL_EXECUTE_UTIL_H_ + +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" + +namespace mlir { +namespace TF { + +// TODO(b/243076653): Once the ParallelExecute is added do not remove it. This +// means BuildSingletonParallelExecuteOp will be used in one location, and +// RemoveSingletonParallelExecuteOp can be removed. + +// Wrap `cluster_func` in a `ParallelExecute` with only one child. This +// can be used to canonicalize IR, so there is always one `ParallelExecute`. 
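
Usage sketch (illustrative, not part of the vendored parallel_execute_util.h): the intended pairing of the two helpers declared just below, wrapping a cluster so downstream passes can assume a tf_device.parallel_execute is always present and unwrapping it afterwards. The function name and the assumption that a ClusterFuncOp and positioned OpBuilder already exist come from the caller, not from the header.

    #include "mlir/IR/Builders.h"
    #include "mlir/Support/LogicalResult.h"
    #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.h"

    mlir::LogicalResult CanonicalizeClusterExample(
        mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder& builder) {
      // Wrap the cluster so later passes can rely on a ParallelExecute wrapper.
      mlir::tf_device::ParallelExecuteOp parallel_execute =
          mlir::TF::BuildParallelExecuteOp(cluster_func, &builder);

      // ... passes that expect tf_device.parallel_execute would run here ...

      // Unwrap the single-child wrapper once it is no longer needed.
      return mlir::TF::RemoveSingletonParallelExecuteOp(parallel_execute,
                                                        &builder);
    }
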
+tf_device::ParallelExecuteOp BuildParallelExecuteOp( + tf_device::ClusterFuncOp cluster_func, OpBuilder* builder); + +// Unwrap `parallel_execute`'s contents if it only has one child. +LogicalResult RemoveSingletonParallelExecuteOp( + tf_device::ParallelExecuteOp parallel_execute, OpBuilder* builder); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARALLEL_EXECUTE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h new file mode 100644 index 00000000..fdeec88c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARSE_TEXT_PROTO_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARSE_TEXT_PROTO_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// Sets output to the given input with `prefix` stripped, or returns an error if +// the prefix doesn't exist. +absl::Status ConsumePrefix(absl::string_view str, absl::string_view prefix, + absl::string_view* output); + +// Strips `prefix_to_strip` from `text_proto`, parses, and returns the parsed +// proto. +absl::Status ParseTextProto(absl::string_view text_proto, + absl::string_view prefix_to_strip, + protobuf::Message* parsed_proto); +inline absl::Status ParseTextProto(absl::string_view /* text_proto */, + absl::string_view /* prefix_to_strip */, + protobuf::MessageLite* /* parsed_proto */) { + return errors::Unavailable("Cannot parse text protos on mobile."); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_PARSE_TEXT_PROTO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h new file mode 100644 index 00000000..fc204413 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Prints a MLIR module `module_op` and returns it as a string. +std::string SerializeMlirModule(mlir::ModuleOp module_op); + +// Parses a MLIR module from `mlir_module_string` into `mlir_module` with +// context `mlir_context`. +absl::Status DeserializeMlirModule( + llvm::StringRef serialized_mlir_module, mlir::MLIRContext* mlir_context, + mlir::OwningOpRef* mlir_module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h new file mode 100644 index 00000000..be2d3786 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SESSION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SESSION_UTILS_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Returns the variable for the provided 'var_handle_op'. +std::string GetVariableName(TF::VarHandleOp var_handle_op); + +// Returns pointer to the variable from 'session' that 'var_handle_op' +// refers to which is in 'device_name' device. If failed to fetch the value null +// will be returned. +// Note, caller is responsible for Unref the variable. +tensorflow::Var* GetVariableFromSession(mlir::TF::VarHandleOp var_handle_op, + llvm::StringRef device_name, + const tensorflow::DeviceMgr* mgr); + +// Returns resource tensors from session for all variables in 'module'. 
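
Usage sketch (illustrative, not part of the vendored session_utils.h): reading one variable's value via GetVariableFromSession, declared above, while honoring the "caller is responsible for Unref" contract. The function name is an assumption, locking considerations are omitted, and the device name and DeviceMgr are assumed to be supplied by the caller.

    #include "llvm/ADT/StringRef.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/session_utils.h"
    #include "tensorflow/core/common_runtime/device_mgr.h"
    #include "tensorflow/core/framework/resource_var.h"
    #include "tensorflow/core/framework/tensor.h"

    tensorflow::Tensor ReadVariableExample(mlir::TF::VarHandleOp var_handle_op,
                                           llvm::StringRef device_name,
                                           const tensorflow::DeviceMgr* mgr) {
      tensorflow::Var* var = mlir::tf_saved_model::GetVariableFromSession(
          var_handle_op, device_name, mgr);
      if (!var) return tensorflow::Tensor();  // Lookup failed; empty tensor.
      tensorflow::Tensor value = *var->tensor();
      var->Unref();  // Required: the helper hands out a referenced Var.
      return value;
    }
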
+absl::StatusOr> GetResourcesFromSession( + llvm::ArrayRef var_handle_ops, + tensorflow::Session* session); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SESSION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h new file mode 100644 index 00000000..28e2c93f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ + +#include + +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/ir/utils/shape_inference_utils.h" + +namespace mlir { + +class Operation; + +namespace TF { + +// Runs TensorFlow shape inference associated to the op type registered in the +// TensorFlow op registry based on the Graph version, operands, and attributes. +// Invoking this shape function will create conversions of parameters to the +// TensorFlow Graph equivalent data structures and back to MLIR equivalent data +// structures. This does not use a natively implemented shape inference in MLIR, +// and instead is temporary until shape functions are reimplemented/migrated to +// being in MLIR instead of the TensorFlow op registry. +LogicalResult InferReturnTypeComponentsForTFOp( + std::optional location, Operation* op, int64_t graph_version, + tfg::OperandAsConstantFn operand_as_constant_fn, + tfg::OpResultAsShapeFn op_result_as_shape_fn, + tfg::ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h new file mode 100644 index 00000000..0c6e1532 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ + +#include + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +std::string GetDeviceAttrAsResourceInstanceStr(Operation* op); + +void MarkResourceAsReadAndWrite( + OpOperand& op_operand, + SmallVectorImpl>& + effect); + +void MarkResourceAsReadOnly( + OpOperand& op_operand, + SmallVectorImpl>& + effect); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h new file mode 100644 index 00000000..ea1ae8c8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ + +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir { +namespace TF { + +// Returns whether the custom call op represents a TF function call. +bool IsTfFuncCustomCall(stablehlo::CustomCallOp op); + +// Returns the `called_func` symbol ref attribute in the `tf.backend_config` +// dictionary attribute. +// +// If the op does not represent a TF function call, returns nullptr. +// Otherwise, if the op does not have `caller_name`, returns failure. 
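
Usage sketch (illustrative, not part of the vendored stablehlo_custom_call.h): guarding a rewrite on whether a stablehlo.custom_call wraps a TF function call, using IsTfFuncCustomCall above and GetTfFuncCustomCallFuncName declared just below. The wrapped type of the FailureOr return is not visible in this vendored copy, so the sketch uses auto; the function name is an assumption.

    #include "mlir/Support/LogicalResult.h"
    #include "stablehlo/dialect/StablehloOps.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h"

    mlir::LogicalResult HandleCustomCallExample(mlir::stablehlo::CustomCallOp op) {
      // Only custom calls that represent a TF function call are of interest.
      if (!mlir::TF::IsTfFuncCustomCall(op)) return mlir::success();

      // Per the comment above, the result carries the `called_func` symbol ref.
      auto called_func = mlir::TF::GetTfFuncCustomCallFuncName(op);
      if (mlir::failed(called_func)) return mlir::failure();

      // *called_func would then be resolved against the module's symbol table.
      return mlir::success();
    }
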
+FailureOr GetTfFuncCustomCallFuncName( + stablehlo::CustomCallOp op); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/string_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/string_util.h new file mode 100644 index 00000000..56410385 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/string_util.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +// Utility functions for dumping operations/attributes as strings and ostream +// bindings. + +namespace tensorflow { +std::string OpAsString(mlir::Operation& op); +std::string AttrAsString(mlir::Attribute& attr); + +// b/281863212 enable automatic without Op/AttrAsString. +// We add logging via a wrapper struct in order to respect ODS and avoid +// multiple symbol definitions if MLIR or someone else decides to add ostream +// definitions for the MLIR symbols. +struct LoggableOperation { + mlir::Operation& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableOperation(mlir::Operation& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableOperation& op); + +struct LoggableAttribute { + mlir::Attribute& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableAttribute(mlir::Attribute& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableAttribute& attr); + +struct LoggableStringRef { + const llvm::StringRef& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableStringRef(const llvm::StringRef& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableStringRef& ref); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/topological_sort.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/topological_sort.h new file mode 100644 index 00000000..1daab855 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/topological_sort.h @@ -0,0 +1,74 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TOPOLOGICAL_SORT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TOPOLOGICAL_SORT_H_ + +#include +#include + +#include "absl/types/span.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// A function that determines which op to emit next in the case of ties. +// The predecessor (which can be null) is the last op we emitted, +// and op is the candidate we're considering. A larger returned integer +// means the op has a higher chance of being emitted first. +typedef int (*PriorityFunction)(Operation *predecessor, Operation *op); + +// A function that returns extra dependencies for each op. These might +// e.g. be known side-effects (or control dependencies) between ops. +// If "incoming" is true, then the list of (extra) predecessors of the +// op should be returned. If "incoming" is false, the list of successors. +// The algorithm assumes that these are consistent which each other. So +// if (and only if) op1 is in extra_dependencies(op2, true), then op2 +// must also be in extra_dependencies(op1, false). +// This function is called multiple times during the topological sort, +// so the implementation should preferably be constant-time. +typedef llvm::function_ref const &( + Operation *, bool incoming)> + ExtraDependenciesFunction; + +// Convenience function if there are no extra dependencies to declare. +// (Unlike nullptr, this also works inside the ternary operator) +extern ExtraDependenciesFunction no_extra_dependencies; + +// Sort a block topologically, so that for all ops, all operands are +// available at the time of execution. This is similar to MLIR's topological +// sort (lib/Transforms/TopologicalSort.cpp) but also takes a priority +// function to determine the next op to emit in the case of ambiguity. This +// makes it possible to group operations by certain attributes. For example, +// the order_by_dialect pass uses this function to group by dialect. +// Only the operations nested directly under the block will be reordered. +// Nested blocks will be left alone. +// Also takes a list of control dependencies (vector of operation pairs, +// from->to) that will be honored when ordering the ops together with the +// data dependencies given through (the ops/results of) the operations +// themselves. 
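
Usage sketch (illustrative, not part of the vendored topological_sort.h): a priority function that groups ops by dialect, in the spirit of the order_by_dialect use the comment above mentions, passed to SortBlockTopologically declared just below. The element type of the returned vector is not visible in this vendored copy, so the sketch uses auto; the function names are assumptions.

    #include "mlir/IR/Block.h"
    #include "mlir/IR/Operation.h"
    #include "tensorflow/compiler/mlir/tensorflow/utils/topological_sort.h"

    // Prefer the candidate op when it shares a dialect with the op emitted
    // last, which keeps ops of the same dialect together.
    static int GroupByDialect(mlir::Operation* predecessor, mlir::Operation* op) {
      return (predecessor && predecessor->getDialect() == op->getDialect()) ? 1
                                                                            : 0;
    }

    void SortExample(mlir::Block& block) {
      // `sorted` lists the block's top-level ops in an order where every op's
      // operands are available before the op itself.
      auto sorted = mlir::TF::SortBlockTopologically(block, GroupByDialect);
      (void)sorted;
    }
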
+std::vector SortBlockTopologically( + Block &block, PriorityFunction priorityFunction, + ExtraDependenciesFunction extraDependencies = no_extra_dependencies); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TOPOLOGICAL_SORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h new file mode 100644 index 00000000..46ead1b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h @@ -0,0 +1,51 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_CLUSTER_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_CLUSTER_UTIL_H_ + +#include +#include +#include + +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" + +namespace mlir { +namespace TFTPU { + +// For each TPU cluster in `module`, walk over all ops inside the cluster +// and reachable in the call graph from the cluster. +// For each op walked, `callback` is applied to the op, the root cluster, and +// the root cluster's host device. `callback` returning WasInterrupted +// indicates failure. +// The host device is null when the tpu_cluster HasModelParallelism: The +// HasModelParallelism case is currently unsupported in combination with +// outside compilation. +mlir::LogicalResult WalkReachableFromTpuCluster( + ModuleOp module, std::function)> + callback); + +// Like above, except TPU clusters are not required to have a host device, and +// no host device is passed to `callback`. +mlir::LogicalResult WalkReachableFromTpuCluster( + ModuleOp module, + std::function callback); + +} // namespace TFTPU +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_CLUSTER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h new file mode 100644 index 00000000..cdbf7396 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -0,0 +1,311 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/tpu/topology.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +using tsl::StatusOr; + +inline constexpr absl::string_view kNumCoresPerReplicaAttr = + "num_cores_per_replica"; +inline constexpr absl::string_view kTopologyAttr = "topology"; +inline constexpr absl::string_view kDeviceAssignmentAttr = "device_assignment"; + +// A TPU device for execution alongside its associated host CPU device. +struct TPUDeviceAndHost { + TPUDeviceAndHost() = default; + TPUDeviceAndHost(llvm::StringRef device, llvm::StringRef host) + : device(device), host(host) {} + + std::string device; + std::string host; +}; + +// TPU devices to be used for execution (e.g. devices for TPUExecute ops) and +// their associated host CPU devices (for outside compilation). They are ordered +// by `num_replicas` followed by `num_cores_per_replica`. +using TPUDevicesAndHosts = + llvm::SmallVector, 8>; + +// TPU compilation device, execution and associated host devices, and optionally +// execution device IDs. Execution device IDs are populated if `topology` and +// `device_assignment` are provided. +struct TPUDeviceAssignment { + TPUDeviceAssignment(llvm::StringRef compilation_device, + TPUDevicesAndHosts&& tpu_devices) + : compilation_device(compilation_device), + tpu_devices(std::move(tpu_devices)) {} + + TPUDeviceAssignment(llvm::StringRef compilation_device, + TPUDevicesAndHosts&& tpu_devices, + xla::DeviceAssignmentProto&& xla_device_assignment) + : compilation_device(compilation_device), + tpu_devices(std::move(tpu_devices)), + xla_device_assignment(std::move(xla_device_assignment)) {} + + std::string compilation_device; + TPUDevicesAndHosts tpu_devices; + std::optional xla_device_assignment; +}; + +// Extracts device coordinates from a device assignment attribute on an op. +absl::StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr); + +// Finds the TPU compilation device and execution devices from `devices` for a +// TPU computation subgraph. Compilation device is determined from looking up +// all TPU_SYSTEM:0 devices and choosing the CPU device associated to the first +// TPU_SYSTEM device sorted lexicographically by replica and task. Execution +// devices are determined by looking up all TPU devices associated with each +// TPU_SYSTEM:0 device found, alongside associated `topology_attr` and +// `device_assignment_attr`. 
If `topology_attr` not an empty string (parsable to +// TopologyProto), `device_assignment_attr` must not be empty also. When +// `topology_attr` and `device_assignment_attr` are not empty, a general device +// assignment based on those two attributes are used. Otherwise when +// `topology_attr` and `device_assignment_attr` are empty, a full mesh device +// assignment is used instead. A failure will be returned if it is not possible +// (e.g. invalid devices or invalid parameters). +// +// +// For example, for `devices`: +// { +// /job:localhost/replica:0/task:0/device:CPU:0, +// /job:worker/replica:0/task:0/device:CPU:0, +// /job:worker/replica:0/task:0/device:TPU_SYSTEM:0, +// /job:worker/replica:0/task:0/device:TPU:0, +// /job:worker/replica:0/task:0/device:TPU:1, +// /job:worker/replica:0/task:0/device:TPU:2, +// /job:worker/replica:0/task:0/device:TPU:3, +// /job:worker/replica:0/task:1/device:CPU:0, +// /job:worker/replica:0/task:1/device:TPU_SYSTEM:0, +// /job:worker/replica:0/task:1/device:TPU:0, +// /job:worker/replica:0/task:1/device:TPU:1, +// /job:worker/replica:0/task:1/device:TPU:2, +// /job:worker/replica:0/task:1/device:TPU:3 +// } +// +// +// With the following parameters (full mesh device assignment): +// `num_replicas` = 8 +// `num_cores_per_replica` = 1 +// `topology_attr` = "" +// `device_assignment_attr` = {} +// +// The `compilation_device` will be: +// /job:worker/replica:0/task:0/device:CPU:0 +// +// `execution_devices` will be: +// { +// { +// /job:worker/replica:0/task:0/device:TPU:0 +// }, +// { +// /job:worker/replica:0/task:0/device:TPU:1 +// }, +// { +// /job:worker/replica:0/task:0/device:TPU:2 +// }, +// { +// /job:worker/replica:0/task:0/device:TPU:3 +// }, +// { +// /job:worker/replica:0/task:1/device:TPU:0 +// }, +// { +// /job:worker/replica:0/task:1/device:TPU:1 +// }, +// { +// /job:worker/replica:0/task:1/device:TPU:2 +// }, +// { +// /job:worker/replica:0/task:1/device:TPU:3 +// } +// } +// +// and `xla_device_assignment` will not be set. 
+// +// +// With the following parameters (general device assignment): +// `num_replicas` = 4 +// `num_cores_per_replica` = 2 +// `topology_attr` (in proto debug string format) = +// { +// mesh_shape: 2 +// mesh_shape: 2 +// mesh_shape: 2 +// num_tasks: 2 +// num_tpu_devices_per_task: 4 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 1 +// device_coordinates: 1 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// } +// `device_assignment` = +// {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1} +// +// The `compilation_device` will be: +// /job:worker/replica:0/task:0/device:CPU:0 +// +// `execution_devices` will be: +// { +// { +// "/job:worker/replica:0/task:0/device:TPU:0", +// "/job:worker/replica:0/task:1/device:TPU:3" +// }, +// { +// "/job:worker/replica:0/task:0/device:TPU:1", +// "/job:worker/replica:0/task:1/device:TPU:2" +// }, +// { +// "/job:worker/replica:0/task:0/device:TPU:3", +// "/job:worker/replica:0/task:1/device:TPU:0" +// }, +// { +// "/job:worker/replica:0/task:0/device:TPU:2", +// "/job:worker/replica:0/task:1/device:TPU:1" +// } +// } +// +// and `xla_device_assignment` will be: +// { +// replica_count: 4 +// computation_count: 2 +// computation_devices { +// replica_device_ids: 0 +// replica_device_ids: 4 +// replica_device_ids: 2 +// replica_device_ids: 6 +// } +// computation_devices { +// replica_device_ids: 1 +// replica_device_ids: 5 +// replica_device_ids: 3 +// replica_device_ids: 7 +// } +// } +absl::StatusOr GetTPUCompilationAndExecutionDevices( + llvm::ArrayRef devices, int num_replicas, + int num_cores_per_replica, llvm::StringRef topology_attr, + llvm::ArrayRef device_assignment_attr); + +// Converts a device assignment attribute to an XLA device assignment proto. +absl::StatusOr GetXlaDeviceAssignmentProto( + llvm::StringRef topology_attr, int num_replicas, int num_cores_per_replica, + llvm::ArrayRef device_assignment_attr); + +// Virtual device name of the passed logical core. The logical core is the index +// of a core within a replica. +std::string GetDeviceAliasForLogicalCore(int core_index); + +// Virtual device name of the host that is associated with the passed logical +// core. The logical core is the index of a core within a replica. +std::string GetDeviceAliasForHostOfLogicalCore(int core_index); + +// Returns true if cluster contains model parallelism based on +// `num_cores_per_replica_attribute`. Otherwise returns false. +bool HasModelParallelism(mlir::tf_device::ClusterOp cluster); + +// Returns true if the devices list contain any TPU devices +bool HasTPUDevice(const mlir::TF::RuntimeDevices& devices); + +// Returns the host device used for outside compilation in generic pipeline. +mlir::LogicalResult GetHostDeviceOutsideCompilationInGenericPipeline( + mlir::TF::RuntimeDevices devices, std::string* host_device); + +// Parses XLA compilation and execution devices from a tf_device.cluster and +// returns the host device for the head and tail computations. 
For TPU device, +// if the computation is replicated, GetDeviceAliasForHostOfLogicalCore(0) is +// returned instead. +mlir::LogicalResult GetHostDeviceOutsideComputation( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + std::string* host_device); + +// Checks if a device string is a TPU device. +bool IsTPUDevice(llvm::StringRef device); + +// Checks if a device string is a TPU replicated core device. +bool IsTPUReplicatedCore(llvm::StringRef device); + +// Checks if `type` is allowed for XLA. String and resources are not XLA types. +// There are other TF types that are not XLA types which will be removed by +// successive passes in TF/XLA bridge phase 2. +bool TypeValidForXLA(const mlir::Type& type); + +// Returns the map from core to the host that is associated with the +// core. If `cluster` is not replicated then the core is a physical core index +// and the host is a physical host name. If `cluster` is replicated then the +// core with index `i` is a logical core (`TPU_REPLICATED_CORE_i`), and the host +// is the associated virtual device name (`TPU_REPLICATED_HOST_i`). +mlir::LogicalResult GetDeviceToHostMap( + mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& core_to_host); + +// Returns the first TPU device, for use in the non-replicated case. The list of +// TPU devices is retrived from `op`'s module ancestor. +mlir::LogicalResult GetNonReplicatedTPU0(mlir::Operation* op, + std::string* tpu0_device); + +// Returns the CPU of the first TPU device, for use in the non-replicated case. +// The list of devices is retrived from `op`'s module ancestor. +mlir::LogicalResult GetNonReplicatedCPU0(mlir::Operation* op, + std::string* cpu0_device); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h new file mode 100644 index 00000000..60beacc8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TRANSLATE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TRANSLATE_UTILS_H_ + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Populates the tf.versions attribute on a module, given a corresponding +// graph VersionDef proto. 
+void PopulateTfVersions(mlir::ModuleOp module, const VersionDef& versions); + +// Extracts TensorFlow GraphDef version information from the given module. +// Returns failure if version attribute is missing or any of the sub attributes +// are invalid. +mlir::LogicalResult ExtractTfVersions(mlir::ModuleOp module, + VersionDef* versions); + +// Returns TensorFlow GraphDef producer version for the given module. Returns an +// error if the version information is missing for the module or is not valid. +absl::StatusOr GetTfGraphProducerVersion(mlir::ModuleOp module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TRANSLATE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verification_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verification_utils.h new file mode 100644 index 00000000..3ec239c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verification_utils.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFICATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFICATION_UTILS_H_ + +#include + +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Returns success when the given shape argument of the Reshape op is valid. +LogicalResult VerifyShapeOfReshapeOp(ArrayRef shape); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFICATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verify_suitable_for_graph_export.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verify_suitable_for_graph_export.h new file mode 100644 index 00000000..31a6e25a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/verify_suitable_for_graph_export.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFY_SUITABLE_FOR_GRAPH_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFY_SUITABLE_FOR_GRAPH_EXPORT_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace tensorflow { + +// Returns whether all functions in module are of single tf_executor.graph and +// each tf_executor.island in tf_executor.graph only has a single op. +mlir::LogicalResult VerifyExportSuitable(mlir::ModuleOp module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VERIFY_SUITABLE_FOR_GRAPH_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/visitor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/visitor.h new file mode 100644 index 00000000..9fd25569 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/visitor.h @@ -0,0 +1,52 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Walks the function by following function call chains and calling the callback +// for each reachable function (including `func`). Each function is visited only +// once even if it's called from multiple places and/or recursively. +// +// The current implementation follows direct calls to `mlir::func::FuncOp` only +// and returns a `mlir::WalkResult::interrupt()` when it encounters a call whose +// callee cannot be resolved to `mlir::func::FuncOp`. +mlir::WalkResult WalkReachableFunctions( + mlir::func::FuncOp func, + llvm::function_ref callback, + mlir::SymbolTableCollection* symbol_table = nullptr); + +// Creates a new MLIR module that contains only the given functions and all +// reachable functions from them. 
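The traversal contract for WalkReachableFunctions (each reachable function visited exactly once, recursion and repeated call sites tolerated) can be sketched without any MLIR machinery; the map-based call graph below stands in for func ops and symbol lookup:

#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

// Standalone sketch of the documented traversal: starting from one function,
// follow direct calls and visit every reachable function exactly once, even
// when a function is called from several places or calls itself.
void WalkReachable(
    const std::string& root,
    const std::map<std::string, std::vector<std::string>>& calls) {
  std::set<std::string> visited;
  std::queue<std::string> pending;
  visited.insert(root);
  pending.push(root);
  while (!pending.empty()) {
    std::string fn = pending.front();
    pending.pop();
    std::cout << "visiting " << fn << "\n";
    auto it = calls.find(fn);
    if (it == calls.end()) continue;
    for (const std::string& callee : it->second) {
      if (visited.insert(callee).second) pending.push(callee);
    }
  }
}

int main() {
  // `main` calls `f` twice and `f` calls itself; each function is still
  // visited exactly once.
  const std::map<std::string, std::vector<std::string>> calls = {
      {"main", {"f", "f", "g"}}, {"f", {"f"}}, {"g", {}}};
  WalkReachable("main", calls);
  return 0;
}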
+mlir::FailureOr> CreatePrunedModule( + mlir::ModuleOp module, llvm::ArrayRef function_names); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h new file mode 100644 index 00000000..264e1b4c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h @@ -0,0 +1,48 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ + +#include "llvm/ADT/StringRef.h" + +namespace mlir { +namespace TF { + +// The main function's name in the serialized stablehlo module embedded in +// XlaCallModule's `module` attribute. +constexpr llvm::StringRef kStablehloMainFunctionName = "main"; + +// After deserializing the stablehlo functions from XlaCallModule, +// this XlaCallModule attribute refers to the deserialized stablehlo main +// function. +constexpr llvm::StringRef kStablehloEntryFunctionAttrName = "_entry_function"; + +// The StableHLO version of the serialized stablehlo module embedded in +// XlaCallModule's `module` attribute, set on deserialization. +constexpr llvm::StringRef kStablehloVersionAttrName = "_stablehlo_version"; + +// Every stablehlo function deserialized from XlaCallModule has this attribute. +constexpr llvm::StringRef kFromXlaCallModuleAttrName = "_from_xla_call_module"; + +// Name of `tf.XlaCallModule`'s dictionary attribute for keeping the +// deserialized stablehlo module's attributes. +constexpr llvm::StringRef kStablehloModuleAttrsAttrName = + "_stablehlo_module_attrs"; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.h new file mode 100644 index 00000000..8ce5403e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.h @@ -0,0 +1,66 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_REWRITE_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_REWRITE_UTIL_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +// Erase rewritten ClusterFuncOp(s). If TPUPartitionedInputV2Op / +// TPUPartitionedOutputV2Op are present, they must be removed along with the +// ClusterFuncOp(s). +mlir::LogicalResult EraseClusterFuncs( + llvm::MutableArrayRef to_be_erased); + +// Move child processes of the ParallelExecute that do not change. These are all +// children except for the child with the ClusterFunc. +// Returns the index of the child with the ClusterFunc. +int MovePreservedParallelExecuteChildren( + int num_cores_per_replica, + llvm::SmallVector& concatenated_output_types, + mlir::OpBuilder* builder, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::tf_device::ParallelExecuteOp old_parallel_execute, + mlir::tf_device::ParallelExecuteOp* new_parallel_execute); + +// Wraps single op in `tf_device.launch` for explicit device assignment. +mlir::tf_device::LaunchOp WrapOpInLaunch(mlir::OpBuilder* builder, + mlir::Location loc, + mlir::Operation* op, + llvm::StringRef device); + +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_REWRITE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h new file mode 100644 index 00000000..8b87b1c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h @@ -0,0 +1,172 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ + +#include + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { + +inline constexpr llvm::StringRef kInputShardingAttr = + "input_sharding_configuration"; +inline constexpr llvm::StringRef kOutputShardingAttr = + "output_sharding_configuration"; + +inline constexpr llvm::StringRef kICIWeightDistributionMlirBridgeMarker = + "_ici_weight_distribution_mlir_bridge_marker"; + +// Parses the sharding string. This sharding string can be binary (serialized) +// or human readable. +mlir::LogicalResult DecodeShardingAttribute(const std::string& shard_str, + xla::OpSharding& sharding, + bool report_error = true); + +// Encodes the sharding in human readable form. +mlir::LogicalResult DecodeShardingAttribute(mlir::Attribute shard_attr, + xla::OpSharding& sharding, + bool report_error = true); + +// Parses the sharding attr. This sharding attr can be binary (serialized) +// or human readable. +void EncodeSharding(mlir::Operation* op, llvm::StringRef shard_str); + +// Parses "input_sharding_configuration" attribute and returns a list where i-th +// element is a list of mlir::Value's which represent inputs for the TPU +// computation corresponding to i-th logical device. If the attribute does not +// exist, the all inputs are placed on logical core 0. +mlir::LogicalResult ExtractInputsForLogicalDevices( + int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::OpBuilder* builder, + llvm::SmallVectorImpl>* input_list); + +// Same as above, except creates tf.XlaSplitND Op for split sharding if +// use_xla_nd_ops is true, otherwise creates tf.Split op. +mlir::LogicalResult ExtractInputsForLogicalDevices( + int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::OpBuilder* builder, bool use_xla_nd_ops, + llvm::SmallVectorImpl>* input_list); + +// Extracts a list of OpSharding that represent output sharding configuration of +// `tf_device.cluster`. +mlir::LogicalResult ParseAndValidateOutputSharding( + int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::SmallVector* output_sharding_list); + +// Retrieves output types for TPUExecute op representing execution for provided +// logical device id. TPUExecute op for different logical device may have +// different outputs depending on the output sharding configuration. 
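A plausible reading of the two accepted sharding encodings is to try the serialized proto first and fall back to the human-readable text form; the strategy below is an assumption for illustration, not the upstream parser, and it assumes the full (non-lite) protobuf runtime is linked in:

#include <string>

#include "google/protobuf/text_format.h"
#include "xla/xla_data.pb.h"

// Sketch of the dual encodings described for DecodeShardingAttribute: accept
// either a binary-serialized xla.OpSharding proto or its text representation.
bool ParseOpSharding(const std::string& shard_str, xla::OpSharding& sharding) {
  if (sharding.ParseFromString(shard_str)) return true;  // binary form
  return google::protobuf::TextFormat::ParseFromString(shard_str,
                                                       &sharding);  // text form
}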
+mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( + int core_id, llvm::ArrayRef output_sharding_config, + mlir::tf_device::ClusterFuncOp cluster_func, + llvm::SmallVectorImpl* output_types, + llvm::SmallVectorImpl* cluster_to_core_index); + +// Same as above, except creates tf.XlaSplitND Op for split sharding if +// use_xla_nd_ops is true, otherwise creates tf.Split op. +mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( + int core_id, llvm::ArrayRef output_sharding_config, + mlir::tf_device::ClusterFuncOp cluster_func, + llvm::SmallVectorImpl* output_types, bool use_xla_nd_ops, + llvm::SmallVectorImpl* cluster_to_core_index); + +// Remaps outputs of `new_parallel_execute` op that represent concurrent +// execution of the `tf_device.cluster_func` at index `cluster_idx` of +// `old_parallel_execute` with its users. +// `num_results_pre_cluster` represent the # of outputs of +// `new_parallel_execute` which are from ops before `tf_device.cluster_func` op. +mlir::LogicalResult RemapOutputsFromLogicalDevices( + const mlir::Location& location, + llvm::ArrayRef output_sharding_config, + llvm::SmallVector, 4> cluster_to_core_index, + int num_results_pre_cluster, + mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, + mlir::tf_device::ParallelExecuteOp new_parallel_execute, + mlir::OpBuilder* builder); + +// Same as above, except creates tf.XlaConcatNd Op for split sharding if +// use_xla_nd_ops is true, otherwise creates tf.Concat op. +mlir::LogicalResult RemapOutputsFromLogicalDevices( + const mlir::Location& location, + llvm::ArrayRef output_sharding_config, + llvm::SmallVector, 4> cluster_to_core_index, + int num_results_pre_cluster, + mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, + mlir::tf_device::ParallelExecuteOp new_parallel_execute, + bool use_xla_nd_ops, mlir::OpBuilder* builder); + +// Determines each logical core argument to metadata argument index mapping, +// based on sharding. The return value is indexed first by logical core then by +// argument index. +llvm::SmallVector, 4> GetMetadataArgumentMapping( + const tpu::TPUCompileMetadataProto& metadata); + +// Gets the proper tensor dimension from XLA OpSharding. +// "replicate_on_last_tile_dim" and "last_tile_dims" should be deducted from the +// real Tensor dimensions when tiled. +// For example: +// f32[8,512](sharding={devices=[1,1,2]0,1 last_tile_dims={REPLICATED}) +// also means a replicated tensor over all devices. +// +// See xla_data.proto for detailed explanations on the fields. +int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding); + +// A sharding with OTHER type may be REPLICATED if: +// 'replicate_on_last_tile_dim' is true OR +// 'last_tile_dims' is not empty +// AND +// other than replicated last tile dims, all other dims are not sharded. +bool IsOtherReplicatedSharding(const xla::OpSharding& xla_sharding); + +// Returns whether the sharding is split sharding. i.e. A sharding with OTHER +// type but not replicated. +bool IsSplitSharding(const xla::OpSharding& sharding); + +// Returns whether the sharding is replicated. It includes sharding with +// REPLICATED type and replicated OTHER type. +bool IsReplicatedSharding(const xla::OpSharding& sharding); + +// Returns whether the shape of inputs and outputs is statically known when +// split sharding is done on inputs or outputs. 
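The dimension accounting for tiled shardings can be made concrete with the fields of xla_data.proto's OpSharding message; the sketch below reproduces the f32[8,512] example above using assumed field accessors, and is not the upstream helper:

#include "xla/xla_data.pb.h"

// For a tiled (OTHER) sharding, the trailing replication dimensions do not
// correspond to real tensor dimensions, so they are subtracted from the
// tile-assignment rank, as described above.
int TiledDataDims(const xla::OpSharding& s) {
  int dims = s.tile_assignment_dimensions_size();
  if (s.replicate_on_last_tile_dim()) dims -= 1;
  dims -= s.last_tile_dims_size();
  return dims;
}

// An OTHER-type sharding is effectively replicated when, apart from the
// replicated trailing tile dimensions, every remaining dimension has a tile
// count of 1 (as in the devices=[1,1,2] last_tile_dims={REPLICATED} example).
bool LooksReplicated(const xla::OpSharding& s) {
  if (s.type() == xla::OpSharding::REPLICATED) return true;
  if (s.type() != xla::OpSharding::OTHER) return false;
  if (!s.replicate_on_last_tile_dim() && s.last_tile_dims_size() == 0)
    return false;
  for (int i = 0; i < TiledDataDims(s); ++i) {
    if (s.tile_assignment_dimensions(i) != 1) return false;
  }
  return true;
}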
+bool AreInputOutputShapesStaticallyKnownForSplitSharding( + llvm::ArrayRef output_sharding_config, + mlir::tf_device::ClusterFuncOp cluster_func); + +// Returns a map of dimension indices and number of splits for tiled sharding. +absl::StatusOr> GetDimensionIndicesAndNumSplitsFromSharding( + const xla::OpSharding& sharding); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h new file mode 100644 index 00000000..efc8be06 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h @@ -0,0 +1,67 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" + +namespace mlir::tensorflow_to_stablehlo::pywrap { + +// Converts a TensorFlow SavedModel to a StableHLO MLIR module and serializes it +// to bytecode. +// +// Args: +// input_path: The path to the SavedModel directory. +// exported_model_signatures: Comma-separated list of exported model +// signatures to convert. tag_names: Comma-separated list of tags for loading +// SavedModel. +// input_arg_shapes_str: A string representation of input argument +// shapes for 'main' entry-point, separating tensors with ':', dimension +// with ',', and using '?' for unknown sizes. For example, +// 'input-arg-shapes=1,2::1,?' expresses argument shapes [1,2], [] and [1,?]. +// +// Returns: +// An absl::StatusOr containing the serialized bytecode of the StableHLO +// module on success, or an error status on failure. +absl::StatusOr PywrapSavedModelToStablehlo( + absl::string_view input_path, + const std::vector& exported_model_signatures, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str); + +// Converts a TensorFlow MLIR module string to a StableHLO MLIR module and +// serializes it to bytecode. +// +// Args: +// module_op_str: TensorFlow MLIR module string. +// input_arg_shapes_str: A string representation of input argument +// shapes for 'main' entry-point, separating tensors with ':', dimension +// with ',', and using '?' for unknown sizes. For example, +// 'input-arg-shapes=1,2::1,?' expresses argument shapes [1,2], [] and [1,?]. +// +// Returns: +// An absl::StatusOr containing the serialized bytecode of the StableHLO +// module on success, or an error status on failure. 
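The `input_arg_shapes_str` grammar is small enough to show end to end; the parser below follows only the format documented here (':' between arguments, ',' between dimensions, '?' for unknown sizes) and is not the one used by these entry points:

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Standalone sketch of the documented format. Unknown sizes are represented
// here as -1, so "1,2::1,?" -> {{1, 2}, {}, {1, -1}}.
std::vector<std::vector<int64_t>> ParseArgShapes(const std::string& spec) {
  std::vector<std::vector<int64_t>> shapes;
  std::string tensor;
  std::stringstream tensors(spec);
  // std::getline drops a trailing empty segment, which is fine for this sketch.
  while (std::getline(tensors, tensor, ':')) {
    std::vector<int64_t> dims;
    std::string dim;
    std::stringstream dims_stream(tensor);
    while (std::getline(dims_stream, dim, ',')) {
      dims.push_back(dim == "?" ? -1 : std::stoll(dim));
    }
    shapes.push_back(dims);  // An empty segment yields a scalar shape [].
  }
  return shapes;
}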
+absl::StatusOr PywrapTfModuleToStablehlo( + absl::string_view module_op_str, absl::string_view input_arg_shapes_str); + +} // namespace mlir::tensorflow_to_stablehlo::pywrap + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/tf_to_stablehlo.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/tf_to_stablehlo.h new file mode 100644 index 00000000..bb0e2a07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tensorflow_to_stablehlo/tf_to_stablehlo.h @@ -0,0 +1,56 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir { + +// Converts a TensorFlow model (either from a SavedModel or an MLIR module) to a +// StableHLO MLIR module. +// +// Args: +// input_path: The path to the input TensorFlow SavedModel or MLIR module. +// context: The MLIR context to use for parsing or creating the MLIR module. +// exported_model_signatures: List of exported model signatures (strings) to +// convert. +// tag_names: List of tag names (strings) used for loading SavedModel. +// Ignored for MLIR input. +// input_arg_shapes_str: A string representation of input argument shapes for +// 'main' entry-point, separating tensors with ':', dimension with ',', and +// using '?' for unknown sizes. For example, 'input-arg-shapes=1,2::1,?' +// expresses argument shapes [1,2], [] and [1,?]. +// is_input_mlir_module: If true, `input_path` is treated as an MLIR +// module instead of a SavedModel. +// +// Returns: +// An absl::StatusOr containing the converted StableHLO MLIR module on +// success, or an absl::Status with an error message on failure. +absl::StatusOr> TfToStablehlo( + absl::string_view input_path, MLIRContext* context, + const std::vector& exported_model_signatures, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str, bool is_input_mlir_module); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h new file mode 100644 index 00000000..b290554d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h @@ -0,0 +1,44 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +// Run all the passes involved in transforming the graph before execution so +// that it is suitable for targeting devices when called via the TF1 Session +// API. +// These transformations take as input a Tensorflow Graph as an MLIR Module +// and transforms the module in place to cluster the given ops for compilation +// that is compatible with the given device_type. The MLIR should be in the TF +// Executor Dialect for graph nodes and edges or TF Functional. It will convert +// to TF Functional internally. Individual Op inside a node should be the +// Tensorflow Dialect. The output MLIR is in the TF Functional Dialect. The +// input MLIR should not have infeed and outfeed ops, which are unsupported via +// this API. Returns OkStatus if passed, otherwise an error. +absl::Status RunSessionTf2xlaClusteringBridge(mlir::ModuleOp module, + bool is_in_fallback_enabled_mode); + +} // namespace v1 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h new file mode 100644 index 00000000..53431dfe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h @@ -0,0 +1,238 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_MLIR_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_MLIR_UTIL_H_ + +#include + +#include "absl/base/attributes.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/hlo/builder/xla_computation.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { + +// Lowers MLIR module to XLA HLO inside an XlaComputation. The input module +// should only contain operations in tf dialect. If the input module contains +// operation in the tf_executor dialect, for example, returns an error. +// Exception to this are tf_executor dialect ops that are optimized away through +// canonicalization. +// +// Operations in tf dialect are lowered to XLA HLO through the following steps: +// . Legalizes control flow operations. +// . Decomposes compound resource operations so that the only remaining +// operations on resource variables are resource reads/writes.. +// . Replaces resource reads/writes with function inputs/outputs and +// eliminates the use of resource variables. +// . Legalizes the operations to XLA HLO operations. +// . Canonicalizes the XLA HLO operations. +// +// device_type: XLA JIT device to use for compilation such as "XLA_CPU_JIT", +// "XLA_GPU_JIT" or "XLA_TPU_JIT". +// use_tuple_args: when this is true, always create a tuple argument for the +// entry computation. +// enable_op_fallback: when this is true, prefer tf2xla fallback kernels over +// MLIR +// native kernels for legalization to HLO. +// return_tuple: when this is true, always create a tuple result for the +// entry computation. +// shape_determination_fns: Contains layout preference fn and shape +// representation fn. The two functions are used to determine argument and +// result shapes. +// custom_legalization_passes: passes to run before the default TF legalization +// passes for backend-specific ops. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::Status ConvertMLIRToXlaComputation( + mlir::ModuleOp module_op, llvm::StringRef device_type, + xla::XlaComputation* xla_computation, bool use_tuple_args, + bool enable_op_fallback, bool return_tuple, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns = {}, + llvm::MutableArrayRef> + custom_legalization_passes = {}, + llvm::StringRef module_name = llvm::StringRef()); + +// Creates a MLIR pipeline that lowers MLIR module to MHLO dialect. The input +// module should only contain operations in tf dialect. For example, if the +// input module contains operation in the tf_executor dialect, the pass raises +// an error unless the tf_executor dialect ops are optimized away by +// canonicalization. +// +// The pipeline is used in ConvertMLIRToXlaComputation. And it generally has the +// following pass structure: +// - TensorFlow passes +// - Legalization passes +// - MHLO passes +// +// device_type: XLA JIT device to use for compilation such as "XLA_CPU_JIT", +// "XLA_GPU_JIT" or "XLA_TPU_JIT". 
+// enable_op_fallback: when this is true, prefer tf2xla fallback kernels over +// MLIR +// native kernels for legalization to HLO. +// custom_legalization_passes: passes to run before the default TF legalization +// passes for backend-specific ops. +// lower_to_xla_hlo: Temporary parameter to be removed in imminent update. If +// true, includes legalization and MHLO lowering passes. +// allow_partial_conversion: when this is true, allow operations that can't be +// legalized. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +void CreateConvertMlirToXlaHloPipeline( + mlir::OpPassManager& pm, llvm::StringRef device_type, + bool enable_op_fallback, + llvm::MutableArrayRef> + custom_legalization_passes, + bool lower_to_xla_hlo = true, bool allow_partial_conversion = false); + +// Helper struct representing argument tensor or resource handle shapes. +struct TensorOrResourceShape { + TensorShape shape; + bool is_resource = false; +}; + +// Refine MLIR types based on new shape information. +ABSL_DEPRECATED("Not meant to be used directly and should be a util.") +absl::Status RefineShapes(llvm::ArrayRef arg_shapes, + mlir::ModuleOp module); + +// Lower TF to MHLO and insert HLO into the XlaBuilder. xla_params are HLO-level +// inputs to module_op that have already been added to the XlaBuilder. returns +// are the returned XlaOps. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::Status BuildHloFromTf(mlir::ModuleOp module_op, xla::XlaBuilder& builder, + llvm::ArrayRef xla_params, + std::vector& returns, + llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, + llvm::MutableArrayRef> + custom_legalization_passes); + +// Apply shape, description, and resource information to inputs and outputs +// in the XlaCompilationResult. This should be called after +// compilation_result->computation was set. +ABSL_DEPRECATED("Not meant to be used directly and should be a util.") +absl::Status PopulateResultIOInfo( + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + bool use_tuple_args, bool use_resource_updates_for_aliases, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + XlaCompilationResult* compilation_result); + +// Runs MLIR Bridge on an MLIR module. +// +// If lower_to_xla_hlo is true then compiles down into XLA HLO, generates all +// accompanying metadata and stores them in CompilationResult. +// +// If enable_op_fallback is set to false, graph is legalized only if the graph +// analysis for the graph is successful. Otherwise, an error is returned. +// +// Running the MLIR Bridge performs many transformations on the input module +// which is modified in place. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::Status CompileMlirToXlaHlo( + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, + bool use_return_tuple, bool use_resource_updates_for_aliases, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes, + llvm::StringRef module_name = llvm::StringRef(), + bool lower_to_xla_hlo = true); + +// Runs MLIR Bridge on a MLIR module. +// +// If lower_to_xla_hlo is true then compiles down into XLA HLO, generates all +// accompanying metadata and stores them in CompilationResult. +// +// If enable_op_fallback is set to false, graph is legalized only if the graph +// analysis for the graph is successful. 
Otherwise, an error is returned. +// +// On success, returns the serialized MLIR module. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::StatusOr CompileMlirToXlaHloAndSerialize( + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, + bool use_return_tuple, bool use_resource_updates_for_aliases, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes, + llvm::StringRef module_name = llvm::StringRef(), + bool lower_to_xla_hlo = true); + +// Runs MLIR Bridge on a serialized MLIR module. +// +// If lower_to_xla_hlo is true then compiles down into XLA HLO, generates all +// accompanying metadata and stores them in CompilationResult. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::StatusOr CompileSerializedMlirToXlaHlo( + llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes = {}, + llvm::StringRef module_name = llvm::StringRef(), + bool lower_to_xla_hlo = true); + +// Compiles a TensorFlow Graph (already converted to MLIR, imported with +// tf_executor dialect still present) into XLA HLO, generates all accompanying +// metadata and stores them in CompilationResult. This will rewrite arguments +// and run the TensorFlow standard pipeline prior to invoking +// `CompileMlirToXlaHlo`. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") +absl::Status CompileGraphToXlaHlo( + mlir::ModuleOp module_op, llvm::ArrayRef args, + llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, + bool use_return_tuple, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes); + +// Compiles a Graph from TF to HLO and adds the resulting HLO to the +// XlaBuilder. This function adds HLO to a larger HLO computation, so +// HLO-level inputs are supplied, and HLO-level outputs are produced. +// xla_params is the HLO-level inputs and returns is the HLO-level outputs. +// If unconditionally_use_output_shapes is true then the unregistered +// attribute _output_shapes is always used to set the output shapes of the ops. +ABSL_DEPRECATED( + "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHlo instead.") +absl::Status BuildHloFromGraph( + const Graph& graph, xla::XlaBuilder& builder, + mlir::MLIRContext& mlir_context, llvm::ArrayRef xla_params, + std::vector& returns, bool unconditionally_use_output_shapes, + llvm::ArrayRef args, llvm::ArrayRef control_rets, + llvm::StringRef device_type, const FunctionLibraryDefinition& flib_def); + +static inline absl::Status CompileToHloGraphAnalysisFailedError() { + return errors::Internal("disabled after graph analysis"); +} + +// Register a convenient pipeline for invoking TF/XLA lowering from the command +// line. 
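A hypothetical call shape for the serialized variant, with the template arguments elided in this header copy filled in as assumptions (tensorflow::TensorShape for arg_shapes, std::string for the returned serialized module); treat it as an illustration of how the pieces fit, not a verified snippet:

#include <string>
#include <vector>

#include "absl/status/statusor.h"
#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/core/framework/tensor_shape.h"

// Hypothetical usage: compile a tiny tf-dialect module for XLA_CPU_JIT and
// return the serialized lowered module. Argument element types and the module
// text are assumptions made for this sketch.
absl::StatusOr<std::string> CompileAddSelf() {
  constexpr char kModule[] = R"mlir(
    module attributes {tf.versions = {producer = 179 : i32}} {
      func.func @main(%arg0: tensor<f32>) -> tensor<f32> {
        %0 = "tf.AddV2"(%arg0, %arg0) : (tensor<f32>, tensor<f32>) -> tensor<f32>
        return %0 : tensor<f32>
      }
    }
  )mlir";
  std::vector<tensorflow::TensorShape> arg_shapes(1);  // one scalar argument
  tensorflow::XlaCompilationResult result;
  return tensorflow::CompileSerializedMlirToXlaHlo(
      kModule, arg_shapes, /*device_type=*/"XLA_CPU_JIT",
      /*use_tuple_args=*/true, /*enable_op_fallback=*/false,
      /*shape_determination_fns=*/{}, &result);
}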
+void RegisterConvertMlirToXlaHloPipelineWithDefaults(); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_MLIR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h new file mode 100644 index 00000000..7007d70b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h @@ -0,0 +1,56 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_TF_GRAPH_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_TF_GRAPH_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/types/variant.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/compile_only_client.h" +#include "xla/pjrt/compile_options.pb.h" +#include "xla/shape.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/tpu/kernels/tpu_compile.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +// Compiles the given Tensorflow graph into xla::HLO. The result is in +// compilation_result. If the input computation is in MLIR, it will be +// converted to a Tensorflow graph. Otherwise, the graph compiler will be run. +absl::Status CompileTensorflowGraphToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_funcs, + const std::vector& arg_shapes, + tsl::DeviceType device_type, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client, + XlaCompiler::CompilationResult* compilation_result); + +} // namespace v1 +} // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_COMPILE_TF_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h new file mode 100644 index 00000000..d41627b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h @@ -0,0 +1,57 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_ + +#include "absl/base/attributes.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +// Given the input Module op that's in the Tensorflow Dialect, convert the MLIR +// module in place to the Tensorflow Executor Dialect. Returns an OK Status if +// success, otherwise failure with an error message. +// The Tensorflow Executor Dialect is required to export an MLIR module to a +// Tensorflow GraphDef. This API will add control dependencies and verify that +// the conversion was successful. This version adds extra control dependencies +// for replication and parallel execution ops, which may slow performance. +// Prefer to use the v2 of this API. +// +// This also converts the Tensorflow Dialect MLIR into the Tensorflow Executor +// dialect that is suitable to be exported to GraphDef. Graph -> MLIR -> Graph +// is not perfectly round trippable, so this API will attempt to make the module +// exportable and verify some properties of the Tensorflow Executor MLIR that +// are required by Graph Export. It will return an error if it cannot. +// +// Input: A MLIR Module in the Tensorflow Dialect with no +// `tf_device.cluster_func` ops. +// Output: A MLIR module in the Tensorflow Executor Dialect. + +ABSL_DEPRECATED( + "Use v2/tf_dialect_to_executor.h::ExportFromTensorflowDialectToExecutor " + "instead.") +absl::Status ExportFromTensorflowDialectToExecutor( + mlir::ModuleOp module, llvm::StringRef module_name = llvm::StringRef()); + +} // namespace v1 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h new file mode 100644 index 00000000..6e9576fd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h @@ -0,0 +1,61 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { + +// Run all the passes involved in transforming the graph before execution so +// that it is suitable for targeting devices when called with the TF 2 Function +// API. Users that need clustering with the Session API should use the v1 Bridge +// API. These transformations take as input a Tensorflow Graph as an MLIR Module +// and transforms the module in place to cluster the given ops for compilation +// that is compatible with the given device_type. The MLIR should be in the TF +// Executor Dialect for graph nodes and edges or be in TF Functional already. +// Individual Op inside a node should be the Tensorflow Functional Dialect. The +// output MLIR is in the TF Functional Dialect. Returns OkStatus if passed, +// otherwise an error. +// +// Inputs: +// module - The MLIR Module that will be clustered. Expected to be in TF +// Executor Dialect or TF Functional Dialect. Will convert to TF Functional. +// is_supported_by_replicated_brige - If the graph targets the replicated +// bridge. Set it to true for replicated/partitioned graphs. e.g. replicated +// and single-core TPU graphs. Set this to false if the graph is not +// replicated, e.g. CPU/GPU graphs. is_in_fallback_enabled_mode - Whether this +// was called with fallback to the non-MLIR Bridge. This is just for logging +// purposes and doesn't affect logic. module_name - What the input module name +// is for debugging help. +// +// Output: Modifies the input module in place with clustered operations. +// status - Whether the transformation to cluster the input MLIR module was +// successful. +absl::Status RunFunctionTf2xlaClusteringBridge( + mlir::ModuleOp module, bool is_supported_by_replicated_brige, + bool is_in_fallback_enabled_mode, + llvm::StringRef module_name = llvm::StringRef()); +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.h new file mode 100644 index 00000000..1af93e6b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.h @@ -0,0 +1,52 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_GRAPH_TO_TF_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_GRAPH_TO_TF_EXECUTOR_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { + +inline constexpr absl::string_view kImportModelDefaultGraphFuncName = "main"; + +// Given a Graph, returns a MLIR module containing the graph, expressed with +// tf_executor dialect. +absl::StatusOr> ConvertGraphToTfExecutor( + const Graph& graph, const GraphDebugInfo& debug_info, + const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, + mlir::MLIRContext* context, + std::unordered_map* tf_name_to_mlir_name = + nullptr, + const ConfigProto& config_proto = {}, + tensorflow::TF2XLABridgeVersion bridge_version = + tensorflow::TF2XLABridgeVersion::kNotBridgeUseCase); + +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_GRAPH_TO_TF_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h new file mode 100644 index 00000000..14a8271d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h @@ -0,0 +1,68 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_LEGALIZE_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_LEGALIZE_TF_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/variant.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/client/compile_only_client.h" +#include "xla/pjrt/compile_options.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/tpu/kernels/tpu_compile.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { + +// Legalizes the given mlir::Module into XLA HLO. If successful, returns the +// compiled XLA HLO. V1 of the tf2xla uses MLIR whereas V0 does not use MLIR. +// +// Inputs: +// computation - The MLIR module op. 
It currently takes in +// tpu::FunctionToHloArgs but this is deprecated. arg_shapes - The shapes of +// the arguments in module_op. device_type - The device type to compile for. +// use_tuple_args - Pack the incoming arg shapes into a single tuple. +// custom_legalization_passes - Extra passes to lower from TF -> MHLO. +// arg_shapes - The shapes of the args. +// arg_core_mapping - Which args go on which cores. +// per_core_arg_shapes - For each core, the shapes for each argument. +// client - The Xla Compilation client. +absl::StatusOr LegalizeMlirToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + llvm::StringRef device_type, + std::vector>& custom_legalization_passes, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client); + +}; // namespace v2 +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_LEGALIZE_TF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/compile_mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/compile_mlir.h new file mode 100644 index 00000000..7394fe37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/compile_mlir.h @@ -0,0 +1,40 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_COMPILE_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_COMPILE_MLIR_H_ + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { +namespace testing { + +// Compiles the given MLIR module to XLA HLO. +absl::StatusOr CompileMlirModule( + const char* mlir_module_str, + ConfigProto::Experimental::MlirBridgeRollout rollout_state, + absl::string_view device_type = "XLA_TPU_JIT"); + +} // namespace testing +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_COMPILE_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/utils.h new file mode 100644 index 00000000..b2c2cf62 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/testing/utils.h @@ -0,0 +1,34 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
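// Illustrative test sketch for the CompileMlirModule helper declared above; it
// is not part of the vendored header. The elided return type is assumed to be
// absl::StatusOr<XlaCompiler::CompilationResult>, as in upstream TensorFlow,
// and the small MLIR module below is only an example input.
#include <gtest/gtest.h>
#include "tensorflow/compiler/mlir/tf2xla/api/v2/testing/compile_mlir.h"

TEST(CompileMlirModuleExample, LegalizesTrivialModule) {
  static constexpr char kMlirModule[] = R"(
    module attributes {tf.versions = {producer = 179 : i32}} {
      func.func @main() -> tensor<1xi32> {
        %0 = "tf.Const"() {value = dense<1000> : tensor<1xi32>} : () -> tensor<1xi32>
        func.return %0 : tensor<1xi32>
      }
    })";
  auto result = tensorflow::tf2xla::v2::testing::CompileMlirModule(
      kMlirModule,
      tensorflow::ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED);
  EXPECT_TRUE(result.ok());
}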
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_UTILS_H_ + +#include + +namespace tensorflow { +namespace tf2xla { +namespace v2 { +namespace testing { + +// Returns the path to the testdata directory. +std::string TestDataPath(); + +} // namespace testing +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TESTING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h new file mode 100644 index 00000000..185cefa5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h @@ -0,0 +1,50 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { + +// Given the input Module op that's in the Tensorflow Dialect, convert the MLIR +// module in place to the Tensorflow Executor Dialect. Returns an OK Status if +// success, otherwise failure with an error message. +// The Tensorflow Executor Dialect is required to export an MLIR module to a +// Tensorflow GraphDef. This API will add control dependencies and verify that +// the conversion was successful. +// +// This also converts the Tensorflow Dialect MLIR into the Tensorflow Executor +// dialect that is suitable to be exported to GraphDef. Graph -> MLIR -> Graph +// is not perfectly round trippable, so this API will attempt to make the module +// exportable and verify some properties of the Tensorflow Executor MLIR that +// are required by Graph Export. It will return an error if it cannot. +// +// Input: A MLIR Module in the Tensorflow Dialect with no +// `tf_device.cluster_func` ops. +// Output: A MLIR module in the Tensorflow Executor Dialect. 
+absl::Status ExportFromTensorflowDialectToExecutor( + mlir::ModuleOp module, llvm::StringRef module_name = llvm::StringRef()); + +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h new file mode 100644 index 00000000..8fd7607a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_EXECUTOR_TO_GRAPH_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_EXECUTOR_TO_GRAPH_H_ + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +namespace tf2xla { +namespace v2 { + +// Converts an MLIR module to TensorFlow graph and FunctionLibraryDefinition. +// The "main" function of the module is stored in the graph and the rest of +// functions are stored in the library. Control ret nodes are stored separately +// in `control_ret_nodes`. +absl::Status ConvertTfExecutorToGraph( + mlir::ModuleOp module, const GraphExportConfig& configs, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + absl::flat_hash_set* control_ret_nodes); + +// Converts an MLIR function and adds it to a FunctionLibraryDefinition. +absl::Status ConvertMlirFunctionToFunctionLibraryDef( + mlir::func::FuncOp func, const GraphExportConfig& configs, + FunctionDef* function_def); + +} // namespace v2 +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_EXECUTOR_TO_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h new file mode 100644 index 00000000..6f8595cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
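// Illustrative sketch combining the two exporters above; not part of the
// vendored headers. The elided template arguments are assumed to be
// std::unique_ptr<Graph>* and absl::flat_hash_set<Node*>*, matching upstream
// TensorFlow.
#include <memory>
#include "absl/container/flat_hash_set.h"
#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h"

absl::Status ExportTfDialectModuleToGraph(
    mlir::ModuleOp module, std::unique_ptr<tensorflow::Graph>* graph,
    tensorflow::FunctionLibraryDefinition* flib_def) {
  namespace v2 = tensorflow::tf2xla::v2;
  // Lower the TF-dialect module to the tf_executor dialect in place.
  absl::Status status = v2::ExportFromTensorflowDialectToExecutor(module);
  if (!status.ok()) return status;
  // Convert the executor-dialect module into a Graph plus function library.
  tensorflow::GraphExportConfig configs;
  absl::flat_hash_set<tensorflow::Node*> control_ret_nodes;
  return v2::ConvertTfExecutorToGraph(module, configs, graph, flib_def,
                                      &control_ret_nodes);
}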
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_ + +#include "absl/base/attributes.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Given the pass manager, add Bridge passes to cluster the replicated input +// graphs. +void AddReplicatedBridgeClusteringPipelinePasses( + mlir::OpPassManager& pm, llvm::StringRef module_name = llvm::StringRef()); + +// Same as above but for non replicated graphs. +void AddNonReplicatedBridgeClusteringPipelinePasses(mlir::OpPassManager& pm); + +}; // namespace internal +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/compilation_timer.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/compilation_timer.h new file mode 100644 index 00000000..2eb46935 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/compilation_timer.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_COMPILATION_TIMER_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_COMPILATION_TIMER_H_ + +#include // NOLINT(build/c++11) + +#include "tensorflow/core/platform/profile_utils/cpu_utils.h" + +// Time the execution of kernels (in CPU cycles). Meant to be used as RAII. 
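// Illustrative sketch of wiring the clustering pipeline declared above into a
// pass manager; not part of the vendored header. `module` and `context` are
// assumed to be an already-loaded TF-dialect module and its MLIRContext.
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h"

mlir::LogicalResult RunReplicatedClustering(mlir::ModuleOp module,
                                            mlir::MLIRContext* context) {
  mlir::PassManager pm(context);
  // PassManager derives from OpPassManager, so it can be passed directly.
  tensorflow::tf2xla::internal::AddReplicatedBridgeClusteringPipelinePasses(pm);
  return pm.run(module);
}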
+struct CompilationTimer { + uint64_t start_cycles = + tensorflow::profile_utils::CpuUtils::GetCurrentClockCycle(); + + uint64_t ElapsedCycles() { + return tensorflow::profile_utils::CpuUtils::GetCurrentClockCycle() - + start_cycles; + } + + int64_t ElapsedCyclesInMilliseconds() { + std::chrono::duration duration = + tensorflow::profile_utils::CpuUtils::ConvertClockCycleToTime( + ElapsedCycles()); + + return std::chrono::duration_cast(duration) + .count(); + } +}; + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_COMPILATION_TIMER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h new file mode 100644 index 00000000..c08a2c39 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h @@ -0,0 +1,64 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_GRAPH_TO_TF_EXECUTOR_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_GRAPH_TO_TF_EXECUTOR_UTIL_H_ + +#include + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// These are used for grouping the recorded stats appropriately. Specifically, +// we're considering different entrypoints to the bridge as having potentially +// interesting differences at least in the domain of accepted graphs so we want +// to separately track graph features based on these unique entrypoints. One key +// example of this distinction is for TFRT which uses the "nominal" TPU bridge +// pipeline, but may potentially allow graphs with v1 control flow. This +// separate grouping will allow us to dig into these differences granularly. +enum class TF2XLABridgeVersion { + kNominal = 0, + kV1Compat, + kTFRTNominal, + kNotBridgeUseCase, +}; + +// Analyzes whether the graph has features not guaranteed to be supported by the +// MLIR-based TF XLA bridge for phase 1. If MLIR bridge phase 1 is not used, +// then MLIR bridge phase 2 will not be used. The optional `function_library` +// can be provided if it contains function definitions not including in the +// `graph` FunctionLibraryDefinition. +// +// Conservatively, during the initial rollout, we are not supporting graphs for +// which any of the following are true: +// +// - Not known to be TF2 +// - Contains one or more reference variables +// - Contains one or more TPUPartitionedCall ops (which is a proxy for +// inference), but the graph is not v1 compat +// - Uses V1 control flow +// - Graph is invalid or otherwise encounters error during traversal +// If `single_core_inference_mode` is true, we skip some of check conditions +// because they are not applicable. 
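// Illustrative usage sketch for CompilationTimer above; not part of the
// vendored header. The elided include and duration template arguments are
// assumed to follow upstream TensorFlow (<chrono>, std::chrono::duration<double>,
// std::chrono::milliseconds).
#include <cstdint>
#include <functional>
#include "tensorflow/compiler/mlir/tf2xla/internal/compilation_timer.h"

int64_t TimeCompileStep(const std::function<void()>& compile_step) {
  CompilationTimer timer;  // starts counting CPU cycles on construction
  compile_step();          // the work being measured
  return timer.ElapsedCyclesInMilliseconds();
}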
+// TODO(b/241702857): remove single_core_inference_mode +bool GraphHasUnsupportedFeaturesInMlirBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library, + std::optional config_proto, TF2XLABridgeVersion bridge_version, + bool single_core_inference_mode); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_GRAPH_TO_TF_EXECUTOR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h new file mode 100644 index 00000000..7d4bf660 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_INFERENCE_INFERENCE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_INFERENCE_INFERENCE_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tf2xla { +namespace internal { + +std::unique_ptr> CreateInferenceMetricsPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_INFERENCEMETRICSPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h.inc" + +} // namespace internal +} // namespace tf2xla +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_INFERENCE_INFERENCE_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h new file mode 100644 index 00000000..fec64c0f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
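// Illustrative call sketch for GraphHasUnsupportedFeaturesInMlirBridge above;
// not part of the vendored header. The elided std::optional parameter is
// assumed to be std::optional<tensorflow::ConfigProto>, as in upstream
// TensorFlow; `graph` is assumed to be an already-built Graph.
#include <optional>
#include "tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h"

bool ShouldSkipMlirBridge(const tensorflow::Graph& graph) {
  return tensorflow::GraphHasUnsupportedFeaturesInMlirBridge(
      graph, /*function_library=*/nullptr, /*config_proto=*/std::nullopt,
      tensorflow::TF2XLABridgeVersion::kNominal,
      /*single_core_inference_mode=*/false);
}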
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_MLIR_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Runs all the MLIR Bridge passes on the given MLIR module. +// If compile_to_xla_hlo is true then those passes include all the Legalization +// to XLA HLO which is returned in the compilation_result. +absl::Status CompileFromMlirToXlaHlo( + bool lower_to_xla_hlo, mlir::ModuleOp mlir_module_op, + const tpu::TPUCompileMetadataProto& metadata, llvm::StringRef device_type, + const XlaShapeLayoutHelpers::ShapeDeterminationFns& shape_determination_fns, + bool use_tuple_args, XlaCompiler::CompilationResult* compilation_result, + std::vector>& custom_legalization_passes, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes); + +}; // namespace internal +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h new file mode 100644 index 00000000..664bd549 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_TO_HLO_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_TO_HLO_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/client/compile_only_client.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Legalize the given MLIR module to XLA HLO using a combination of the MLIR +// Bridge and XlaBuilder +absl::StatusOr LegalizeTfToHlo( + const tpu::MlirToHloArgs& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + llvm::StringRef device_type, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + std::vector>& custom_legalization_passes, + xla::CompileOnlyClient* client, XlaCompilationResult* compilation_result); + +}; // namespace internal +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LEGALIZE_TF_TO_HLO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h new file mode 100644 index 00000000..61c5028a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LOGGING_HOOKS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LOGGING_HOOKS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Setup the input pass manager to enable IR dumping after each pass. +// Note a side effect of this method is that multi threading will be disabled. +void EnablePassIRPrinting(mlir::PassManager& pm, + const std::string& dump_group_name, + llvm::StringRef module_name = llvm::StringRef()); + +}; // namespace internal +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_LOGGING_HOOKS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h new file mode 100644 index 00000000..c0f2a5e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h @@ -0,0 +1,54 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
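// Illustrative sketch for EnablePassIRPrinting above; not part of the vendored
// header. The dump group name is an arbitrary example value.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h"

void ConfigureBridgeDebugDumps(mlir::PassManager& pm) {
  // After this call the pass manager dumps IR after every pass; note that it
  // also disables multi-threading on the pass manager, per the comment above.
  tensorflow::tf2xla::internal::EnablePassIRPrinting(pm, "tf2xla_bridge_debug");
}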
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { + +// Checks if a graph or reachable functions in the library have any +// StatefulPartitionedOps with _XlaMustCompile=true. The function library will +// be skipped if nullptr is provided. +bool IsSupportedByNonReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library); + +// Checks if a graph or reachable functions in the library have any ops with +// _tpu_replicate or _xla_compile_device_type=TPU. The function library will be +// skipped if nullptr is provided. + +bool IsSupportedByReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library); + +// Check if an MLIR module has any ops with _tpu_replicate or +// _xla_compile_device_type=TPU. +bool IsSupportedByReplicatedBridge(mlir::ModuleOp module); + +// Check if an MLIR module contains TPUPartitionedCall op. If so, we define +// such graph as an inference graph. Otherwise, it is non inference graph. +bool HasTPUPartitionedCallOpInModule(mlir::ModuleOp module); + +// Check if a graph contains TPUPartitionedCall op, including its reachable +// functions. The function library is used to store the functions that are +// defined in a TensorFlow program +bool IsInferenceGraph(const Graph& graph, + const FunctionLibraryDefinition* function_library); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h new file mode 100644 index 00000000..f4375dfc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ + +#include +#include +#include +#include + +#include "mlir/Pass/PassInstrumentation.h" // from @llvm-project + +namespace mlir { + +void RegisterPassInstrumentor( + const std::string& name, + std::function()> creator); +std::vector()>> +GetPassInstrumentors(); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/node_order.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/node_order.h new file mode 100644 index 00000000..a6f65006 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/node_order.h @@ -0,0 +1,51 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_NODE_ORDER_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_NODE_ORDER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +struct GroupByDevice { + std::string operator()(const Node* node) const { + return node->requested_device(); + } +}; + +// Performs a topological ordering of nodes. +// This has the property that any child node of a parent node p is emitted +// before p. A grouping function is used to break ties if multiple child nodes +// (of possibly different parents) are ready to be emitted at some point, which +// is when we prefer to stay in the current group. Remaining ties are broken by +// node name. +// The "emit" function is used for outputing the result, and is called once +// for each node. +// This algorithm is O(n * k * log k), with k the largest node degree. +void TopologicalOrdering( + const Graph& g, const std::function& emit, + const std::function& get_grouping_key); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_NODE_ORDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h new file mode 100644 index 00000000..4d91f113 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h @@ -0,0 +1,93 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
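// Illustrative sketch for TopologicalOrdering above; not part of the vendored
// header. The elided callback types are assumed to be
// std::function<void(Node*)> for `emit` and std::function<std::string(Node*)>
// for `get_grouping_key`, matching upstream TensorFlow.
#include <vector>
#include "tensorflow/compiler/mlir/tf2xla/internal/node_order.h"

std::vector<tensorflow::Node*> OrderNodesByDevice(
    const tensorflow::Graph& graph) {
  std::vector<tensorflow::Node*> order;
  tensorflow::TopologicalOrdering(
      graph,
      /*emit=*/[&order](tensorflow::Node* n) { order.push_back(n); },
      /*get_grouping_key=*/tensorflow::GroupByDevice());
  return order;
}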
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Verifies that all MLIR Ops have the expected attributes. +std::unique_ptr> +CreateVerifyClusteringPass(); + +// Creates a pass that forms clusters from operations of the same +// `_replication_info` attribute. +std::unique_ptr> +CreateTPUClusterFormationPass(bool strict_clusters = false); + +// Creates a pass that extracts outside compilation (Host ops inside device +// cluster) at head/tail of Device cluster to run before/after XLA computation. +std::unique_ptr> +CreateExtractHeadTailOutsideCompilationPass(); + +// Creates a pass that extract outside compilation (Host ops inside cevice +// cluster) ops to a separate parallel_execute region to run on CPU. +std::unique_ptr> +CreateExtractOutsideCompilationPass(); + +// Create a pass that encapsulates StatefulPartitionedCallOp within a cluster. +std::unique_ptr> +CreateXlaClusterFormationPass(); + +// Creates a pass that marks unsupported ops in device cluster for outside +// compilation. +std::unique_ptr> +CreateMarkOpsForOutsideCompilationPass(); + +// Creates a pass that hoists reads out of a replicate that are on a variable +// whose value is broacast to all replicas. +std::unique_ptr> +CreateHoistBroadcastReadPass(); + +// Creates a pass that moves broadcasts from TF host ops to XLA code, encoded as +// XlaAllReduces. This enables use of the device network for broadcasts, which +// is faster. +std::unique_ptr> +CreateXlaBroadcastPass(); + +// Creates a pass that identifies XLASharding ops in launch op for TPU +// computation. +std::unique_ptr> +CreateTPUShardingIdentificationPass(); + +// Creates a pass that validates the inputs to a TPU computation. 
+std::unique_ptr> +CreateTPUValidateSessionInputsPass(); + +std::unique_ptr> +CreateTPUValidateInputsPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_MARKOPSFOROUTSIDECOMPILATIONPASS +#define GEN_PASS_DECL_TPUCLUSTERFORMATIONPASS +#define GEN_PASS_DECL_TPUEXTRACTHEADTAILOUTSIDECOMPILATIONPASS +#define GEN_PASS_DECL_TPUEXTRACTOUTSIDECOMPILATIONPASS +#define GEN_PASS_DECL_TPUSHARDINGIDENTIFICATIONPASS +#define GEN_PASS_DECL_TPUVALIDATEINPUTSPASS +#define GEN_PASS_DECL_TPUVALIDATESESSIONINPUTSPASS +#define GEN_PASS_DECL_VERIFYCLUSTERINGPASS +#define GEN_PASS_DECL_XLACLUSTERFORMATIONPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/lowering_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/lowering_passes.h new file mode 100644 index 00000000..0be689c6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/lowering_passes.h @@ -0,0 +1,38 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_LOWERING_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_LOWERING_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Create a pass that just collects metrics about the input MLIR. Does not +// logically transform the program. +std::unique_ptr> +CreateInputLoweringMetricsPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_INPUTLOWERINGMETRICSPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/lowering_passes.h.inc" + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_LOWERING_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h new file mode 100644 index 00000000..4e28930b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h @@ -0,0 +1,35 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Verifies that Executor input is of the expected format. +std::unique_ptr> +CreateVerifyInputDialectToExecutorPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_VERIFYINPUTDIALECTTOEXECUTORPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h.inc" + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils.h new file mode 100644 index 00000000..152b2e02 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils.h @@ -0,0 +1,49 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_TPU_VALIDATE_INPUTS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_TPU_VALIDATE_INPUTS_UTILS_H_ + +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +constexpr char kTpuReplicatedCoreZeroAttr[] = "TPU_REPLICATED_CORE:0"; + +using mlir::ModuleOp; +using mlir::Operation; +using mlir::StringAttr; +using mlir::TypeID; +using mlir::TF::InfeedDequeueTupleOp; +using mlir::TF::kDeviceAttr; +using mlir::tf_executor::GraphOp; + +bool IsPotentialUnsupportedOp(Operation* op); + +bool HasV1ControlFlow(GraphOp graph); + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_TPU_VALIDATE_INPUTS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/test_matchers.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/test_matchers.h new file mode 100644 index 00000000..57c65bbf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/test_matchers.h @@ -0,0 +1,91 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_TEST_MATCHERS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_TEST_MATCHERS_H_ + +#include +#include "absl/status/statusor.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" +#include "tsl/platform/statusor.h" + +template +bool WasGraphAnalysisFailure(const absl::StatusOr& status) { + return (status.status() == + tensorflow::CompileToHloGraphAnalysisFailedError()); +} + +/* The third party version of the Graph Analysis always returns disabled so + * these matchers short circuit on that error. 
*/ +MATCHER(IsOkOrFiltered, + "Status was OK or equal to the Graph Analysis failure") { + bool is_ok = arg.ok(); + auto graph_analysis_failure = WasGraphAnalysisFailure(arg); + return testing::ExplainMatchResult( + testing::IsTrue(), is_ok || graph_analysis_failure, result_listener); +} + +MATCHER_P2(IncrementedOrFiltered, metric, value, + "Metric was incremented by value or Status equal to the Graph " + "Analysis failure") { + auto graph_analysis_failure = WasGraphAnalysisFailure(arg); + if (graph_analysis_failure) { + return testing::ExplainMatchResult(testing::IsTrue(), + graph_analysis_failure, result_listener); + } + return testing::ExplainMatchResult(testing::Eq(metric), value, + result_listener); +} + +MATCHER_P(ComputationProtoContains, regex, + "If not a Graph Analysis failure then matches the computation result " + "with the regex") { + auto graph_analysis_failure = WasGraphAnalysisFailure(arg); + if (graph_analysis_failure) { + return testing::ExplainMatchResult(testing::IsTrue(), + graph_analysis_failure, result_listener); + } + auto proto = arg.value().computation->proto().DebugString(); + return testing::ExplainMatchResult(testing::ContainsRegex(regex), proto, + result_listener); +} + +MATCHER_P(XlaComputationProtoContains, regex, + "If not a Graph Analysis failure then matches the computation result " + "with the regex") { + auto graph_analysis_failure = WasGraphAnalysisFailure(arg); + if (graph_analysis_failure) { + return testing::ExplainMatchResult(testing::IsTrue(), + graph_analysis_failure, result_listener); + } + auto proto = arg.value().proto().DebugString(); + return testing::ExplainMatchResult(testing::ContainsRegex(regex), proto, + result_listener); +} + +MATCHER_P( + HasMlirModuleWith, expected, + "If not a Graph Analysis failure then matches the mlir module result") { + auto graph_analysis_failure = WasGraphAnalysisFailure(arg); + if (graph_analysis_failure) { + return testing::ExplainMatchResult(testing::IsTrue(), + graph_analysis_failure, result_listener); + } + auto actual = arg.value(); + return testing::ExplainMatchResult(testing::ContainsRegex(expected), actual, + result_listener); +} + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_TEST_MATCHERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h new file mode 100644 index 00000000..6dd9851f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h @@ -0,0 +1,33 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
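// Illustrative test sketch for the matchers above; not part of the vendored
// header. `LegalizeForTest()` is a hypothetical helper standing in for a call
// such as tf2xla::v2::testing::CompileMlirModule that yields an
// absl::StatusOr<XlaCompiler::CompilationResult>.
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/compiler/mlir/tf2xla/internal/test_matchers.h"

TEST(LegalizationExample, SucceedsOrIsFilteredByGraphAnalysis) {
  auto result = LegalizeForTest();  // hypothetical helper; see lead-in comment
  // Passes when legalization succeeded or was filtered by graph analysis.
  EXPECT_THAT(result, IsOkOrFiltered());
  // Checks the produced HLO only when the result was not filtered.
  EXPECT_THAT(result, ComputationProtoContains("opcode:.*constant"));
}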
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ + +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Returns true if the op has a valid namespace during clustering & tf dialect +// to executor components of the Bridge. +bool IsInBridgeAcceptableDialects(mlir::Operation* op); + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/test_metadata_config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/test_metadata_config.h new file mode 100644 index 00000000..83d6beb2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/internal/utils/test_metadata_config.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_TEST_METADATA_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_TEST_METADATA_CONFIG_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Fills in arg_shapes and metadata_proto with appropriate values based on the +// input mlir module. +absl::Status ConfigureMetadata(absl::string_view mlir_module_str, + std::vector& arg_shapes, + tpu::TPUCompileMetadataProto& metadata_proto); + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_TEST_METADATA_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h new file mode 100644 index 00000000..7508a8d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h @@ -0,0 +1,79 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_MLIR_BRIDGE_ROLLOUT_POLICY_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_MLIR_BRIDGE_ROLLOUT_POLICY_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +enum class MlirBridgeRolloutPolicy { + // The MLIR bridge is explicitly disabled by the user and must not be run. + kDisabledByUser = 0, + // The MLIR bridge is explicitly enabled by the user and must be run. If the + // MLIR bridge errors, the fallback path should NOT be used. + kEnabledByUser, + // The bridge was not explicitly enabled or disabled by the user. Based on the + // features in the model, the MLIR bridge should not be run. + kDisabledAfterGraphAnalysis, + // The bridge was not explicitly enabled or disabled by the user. Based on the + // features in the model, the MLIR bridge should be run. If the MLIR Bridge + // errors, the fallback path should be used whenever possible. + kEnabledAfterGraphAnalysis, +}; + +// Analyzes the user requested policy as well as the contents of the graph and +// returns true when the MLIR Bridge should be run. +// +// If the user explicitly requests the bridge be enabled or disabled, this +// function will respect the request. If the user does not explicitly request +// enabled or disabled, it will decide whether or not to run the bridge. +// +// The config_proto param is a required input for all TF1 graphs but it is +// redundant for TF2 graphs. +// If getting rollout policy involves graph analysis, `record_stats` is used +// to decide whether to emit metrics on unsupported features of the graph. +MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( + const tensorflow::Graph& graph, + const FunctionLibraryDefinition* function_library, + std::optional config_proto, + bool is_supported_by_replicated_brige, bool is_v1_compat, + bool record_stats); + +static inline MlirBridgeRolloutPolicy GetMlirBridge2ndPhaseRolloutPolicy( + mlir::ModuleOp module) { + return MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis; +} + +// Explicit Interface for when we want to log features vs test the validity of +// the graph for MLIR bridge processing. Note that right now the logging +// which is done in the logic used by GraphHasFeaturesUnsupportedByMlirBridge +// has diverged and logs supported features as well. Parameters are the same +// as for GetMlirBridgeRolloutPolicy with the exception of +// record_stats, which isn't needed because this interface will always record. +void LogGraphFeatures(const Graph& graph, + const FunctionLibraryDefinition* function_library, + std::optional config_proto, + bool is_v1_compat); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_MLIR_BRIDGE_ROLLOUT_POLICY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h new file mode 100644 index 00000000..b94f3370 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
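// Illustrative sketch for GetMlirBridgeRolloutPolicy above; not part of the
// vendored header. The elided std::optional parameter is assumed to be
// std::optional<tensorflow::ConfigProto>; `graph` is assumed to exist.
#include <optional>
#include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h"

bool ShouldRunMlirBridge(const tensorflow::Graph& graph) {
  tensorflow::MlirBridgeRolloutPolicy policy =
      tensorflow::GetMlirBridgeRolloutPolicy(
          graph, /*function_library=*/nullptr, /*config_proto=*/std::nullopt,
          /*is_supported_by_replicated_brige=*/true, /*is_v1_compat=*/false,
          /*record_stats=*/false);
  return policy == tensorflow::MlirBridgeRolloutPolicy::kEnabledByUser ||
         policy ==
             tensorflow::MlirBridgeRolloutPolicy::kEnabledAfterGraphAnalysis;
}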
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZATION_OP_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZATION_OP_CONFIG_H_ + +#include "mlir/Support/TypeID.h" // from @llvm-project + +namespace mlir { +namespace mhlo { + +// Given the type ID, check if it's legalized with MLIR. +bool IsTypeLegalizedWithMlir(const TypeID& type_id); + +// Returns true if the op is considered a dynamic padder op. +bool IsDynamicPadderOp(const TypeID& type_id); + +// Returns True if this op has a Tf2XLA fallback. Currently, this is not the +// inverse of the !IsOpLegalizedWithMlir, but it should be. +bool HasTf2XlaFallback(const TypeID& type_id); + +// Whether this type is allowed to have a TF2XLA fallback. +bool IsOpAllowedTf2xlaFallback(const TypeID& type_id); + +// Whether this type is Preferred to use a TF2XLA fallback kernel when using +// the MLIR bridge. If this is true, then the TF2XLA fallback kernel will be +// used over the MLIR lowering. +bool IsOpAllowedTf2xlaPreferred(const TypeID& type_id); + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZATION_OP_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h new file mode 100644 index 00000000..8c83fb56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h @@ -0,0 +1,63 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZE_TF_WITH_TF2XLA_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZE_TF_WITH_TF2XLA_PASSES_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { + +namespace func { +class FuncOp; +} +class ModuleOp; +class Operation; +template +class OperationPass; +class Pass; + +namespace mhlo { + +/// Converter to be used along with the fallback Tf2Xla patterns below. +class Tf2XlaTypeConverter : public TypeConverter { + public: + Tf2XlaTypeConverter(); +}; + +/// Adds the TF to XLA via TF2XLA rewrite patterns to the pattern list. +/// `prefer_tf2xla` means an op will be included iff it is not in +/// `MlirLegalizedUnderPreferTf2XlaSet`. `!prefer_tf2xla` mean an op will be +/// included if there is no native MLIR legalization for the op. +void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, + RewritePatternSet& patterns, + MLIRContext* ctx, + Tf2XlaTypeConverter& converter, + bool prefer_tf2xla = false); + + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZE_TF_WITH_TF2XLA_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/passes.h new file mode 100644 index 00000000..0b9f5a1e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/passes.h @@ -0,0 +1,120 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_PASSES_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { + +namespace func { +class FuncOp; +} +class ModuleOp; +class Operation; +template +class OperationPass; +class Pass; + +namespace mhlo { + +/// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is +/// false, emits an error if there is any operation that can't be legalized. 
+/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. +/// Note: This is a module pass because when legalizing with TF2XLA fallback, +/// functions are imported into the module. Importing functions into a +/// module is not thread safe. +std::unique_ptr> createLegalizeTFPass( + bool legalize_chlo = true, + std::optional tf2xla_fallback_device_type = std::nullopt, + bool prefer_tf2xla = false); + +/// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern +/// list. +void PopulateLegalizeTfPatterns(MLIRContext* context, + RewritePatternSet* patterns); + +// Populates TF to MHLO legalization for some of the quantization ops. +// +// TODO(hinsu): Remove this once we combine quantized and non quantized op +// legalization in the ODML conversion pipeline. +void PopulateLegalizeTfQuantizationPatterns(MLIRContext* context, + RewritePatternSet* patterns); + +/// Converts the provided Operation as well as all nested operations into HLO +/// dialect using the conversion patterns registered by the HLO dialect. When +/// allow_partial_conversion is false, emits an error if there is any operation +/// that can't be legalized. +/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. +LogicalResult legalizeTF( + Operation* op, bool allow_partial_conversion = false, + bool legalize_chlo = true, + std::optional tf2xla_fallback_device_type = std::nullopt, + bool prefer_tf2xla = false); + +// Legalizes TF/XLA communication ops (TF dialect) to HLO dialect communication +// ops. +std::unique_ptr> CreateLegalizeTFCommunicationPass(); + +// Legalizes TF/XLA collective ops (TF dialect) to HLO dialect collective +// ops. +std::unique_ptr> CreateLegalizeTFCollectivePass(); + +// Verifies that the TF/XLA ops have all been lowered to MHLO. +std::unique_ptr> CreateVerifyTFXLALegalizationPass( + bool legalize_chlo = true); + +// Transforms TFXLA Device specific ops into device independent ops. +std::unique_ptr> +CreateTFXLADeviceSpecificTransformsPass( + std::optional tf2xla_fallback_device_type = std::nullopt); + +// Adjusts XLA layout for Infeed ops. 
+std::unique_ptr> +CreateInfeedsOpsXlaAdjustLayoutPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_INFEEDSOPSXLAADJUSTLAYOUT +#define GEN_PASS_DECL_LEGALIZETF +#define GEN_PASS_DECL_LEGALIZETFCOLLECTIVE +#define GEN_PASS_DECL_LEGALIZETFMODULEPASS +#define GEN_PASS_DECL_LEGALIZETFTYPESPASS +#define GEN_PASS_DECL_TFXLADEVICESPECIFICTRANSFORMS +#define GEN_PASS_DECL_VERIFYTFXLALEGALIZATION +#include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.h.inc" + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_LEGALIZETFCOMMUNICATIONPASS +#include "tensorflow/compiler/mlir/tf2xla/transforms/tf_xla_passes.h.inc" +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.h new file mode 100644 index 00000000..21de5950 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.h @@ -0,0 +1,31 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_SPLIT_INTO_ISLAND_PER_OP_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_SPLIT_INTO_ISLAND_PER_OP_PASS_H_ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" + +namespace mlir { +namespace TF { + +// Converts a single island into multiple islands (one for each op). +void SplitIsland(mlir::tf_executor::IslandOp island_op, + mlir::tf_executor::ControlType control_type); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_SPLIT_INTO_ISLAND_PER_OP_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h new file mode 100644 index 00000000..0ad6e9af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
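A minimal sketch of driving SplitIsland over a whole module; collecting the islands before splitting them (rather than mutating the IR mid-walk) and the helper name SplitAllIslands are assumptions made for illustration:

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
#include "tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.h"

// Splits every tf_executor island in `module` into one island per op.
void SplitAllIslands(mlir::ModuleOp module) {
  auto control_type = mlir::tf_executor::ControlType::get(module.getContext());
  llvm::SmallVector<mlir::tf_executor::IslandOp, 16> islands;
  module.walk([&](mlir::tf_executor::IslandOp island) { islands.push_back(island); });
  for (auto island : islands) mlir::TF::SplitIsland(island, control_type);
}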
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tsl/platform/statusor.h" + +namespace mlir { +namespace mhlo { +namespace test { + +// Given a raw string, return a ModuleOp that can be used with the given +// MLIRContext. +absl::StatusOr> GetMlirModuleFromString( + absl::string_view module_string, MLIRContext* mlir_context); + +} // namespace test +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h new file mode 100644 index 00000000..c5c417e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h @@ -0,0 +1,128 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ + +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/builder/xla_computation.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace mlir { +namespace mhlo { + +class Tf2XlaRewriterTestPeer; + +class Tf2XlaRewriter { + public: + static mlir::LogicalResult RewriteOp(mlir::Operation* op, + mlir::PatternRewriter& rewriter, + const std::string& device_type); + + private: + friend class Tf2XlaRewriterTestPeer; + + Tf2XlaRewriter(mlir::Operation* op, mlir::PatternRewriter& rewriter, + const std::string& device_type); + + ~Tf2XlaRewriter(); + + // Compiles the given Operation with XlaBuilder and imports the generated HLO + // via the HLO -> MHLO importer. 
+ absl::StatusOr CompileWithHloImporter( + tensorflow::OpKernelContext& op_context); + + // Import the given XlaComputation into the parent module. Returns the given + // generated function. + absl::StatusOr ImportXlaComputation( + xla::XlaComputation& computation); + + // Prepares OpKernelContext params common to all the ops. + // Emits an error on failure. + mlir::LogicalResult PrepareParams(); + + // Given the required_consts, it will fill the 3 output vectors with + // their respective data. + // Expressions: Output XLA expressions as required by the compiled kernel. + // Tensors: Vector of tensors that back the TensorValue inputs + // Inputs: Vector of inputs that are backed by tensors. + mlir::LogicalResult PrepareKernelInputs( + const llvm::SmallDenseSet& required_consts, + std::vector& expressions, + std::vector& tensors, + std::vector& inputs); + + mlir::LogicalResult VerifyOpResults(tensorflow::OpKernelContext& op_context); + mlir::LogicalResult GetKernelOutputs(tensorflow::OpKernelContext& op_context, + mhlo::TupleOp tuple_results, + llvm::SmallVector& outputs); + + // Given a translated function with a single return value, unpack the tuple + // results. + mlir::LogicalResult UnpackTupleResults(mhlo::TupleOp tuple_result, + llvm::SmallVector& outputs); + + // Tries to legalize the specified TensorFlow op, if supported. + // + // Emits an error and returns failure if an error is encountered during + // conversion. Note that success return value doesn't mean successful + // legalization. + mlir::LogicalResult LegalizeOp(); + + // Converts the given operand to expression of kind kConstant or kXlaOp. + // Emits a remark and returns expression of kind kInvalid on failure. + tensorflow::XlaExpression GetExprForOperand(mlir::Value operand, + mlir::Operation* op, + int64_t operand_index); + + mlir::Operation* op_; + std::string device_type_; + + mlir::PatternRewriter& rewriter_; + std::unique_ptr name_mapper_; + + tensorflow::XlaContext* context_; // Ref-counted. + + std::unique_ptr device_mgr_; + tensorflow::Device* device_; // Owned by device_mgr_; + std::unique_ptr step_container_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + tensorflow::OpKernelContext::Params params_; + + xla::XlaBuilder xla_builder_; +}; + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/utils.h new file mode 100644 index 00000000..5dba4a4d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/utils.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
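Since Tf2XlaRewriter::RewriteOp is the class's only public entry point, a typical caller is a rewrite pattern; the sketch below wires it into a match-any pattern. The pattern name and the idea of passing the device type through the constructor are illustrative assumptions:

#include <string>
#include <utility>

#include "mlir/IR/PatternMatch.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h"

// Funnels every matched op through the TF2XLA-based rewriter for one device.
struct RewriteWithTf2Xla : public mlir::RewritePattern {
  RewriteWithTf2Xla(mlir::MLIRContext* ctx, std::string device_type)
      : mlir::RewritePattern(mlir::Pattern::MatchAnyOpTypeTag(), /*benefit=*/1, ctx),
        device_type_(std::move(device_type)) {}

  mlir::LogicalResult matchAndRewrite(
      mlir::Operation* op, mlir::PatternRewriter& rewriter) const override {
    return mlir::mhlo::Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_);
  }

  std::string device_type_;
};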
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_UTILS_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace mhlo { + +// Builds body for reduce op by using the template binary op as the +// reducer op. +template +void BuildReduceBody(Type element_type, Region* body, OpBuilder* builder) { + OpBuilder::InsertionGuard guard(*builder); + Block* block = builder->createBlock(body); + + // Block arguments are scalars of the given element type. + Type type = RankedTensorType::get(/*shape=*/{}, element_type); + Location loc = body->getLoc(); + block->addArguments({type, type}, SmallVector(2, loc)); + + auto reducer = + builder->create(loc, block->getArgument(0), block->getArgument(1)); + builder->create(loc, reducer.getResult()); +} + +ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, + OpBuilder* builder); + +ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, OpBuilder* builder); + +// Converts an ArrayAttr to a 1D 64-bit dense elements attribute. +DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr); +DenseIntElementsAttr GetI64ElementsAttr(llvm::ArrayRef values, + Builder* builder); + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h new file mode 100644 index 00000000..1711e039 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_XLA_LEGALIZE_TARGETS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_XLA_LEGALIZE_TARGETS_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace mhlo { + +// Returns a ConversionTarget that includes default legalized MLIR dialects +// for conversion to XLA. +// If legalize_chlo is true, the resulting conversion target cannot have CHLO. 
+mlir::ConversionTarget GetDefaultLegalConversionTargets( + MLIRContext& mlir_context, bool legalize_chlo); + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_XLA_LEGALIZE_TARGETS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h new file mode 100644 index 00000000..73d241b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tfr { + +extern const char* const kTFRLibEnv; + +using tsl::StatusOr; + +// An wrapper for all the objects used to decompose a module (graph mode) and +// node_def (eager mode). Note that this class owns the decomposition library. +class TFRDecomposeContext { + public: + // The entry function to get a decompose context. All the required passes have + // been initialized. + static absl::StatusOr> Get( + mlir::MLIRContext* mlir_ctx); + + // Constructor of the decompose context. To share the decompose library, the + // whole decompose TFR function library is loaded. + explicit TFRDecomposeContext(mlir::ModuleOp tfr_module); + + // Constructs the decompose context from the tfr text module and the mlir + // context. The tfr text module is added to the mlir context. + static std::unique_ptr GetFromText( + StringPiece tfr_raw_text, mlir::MLIRContext* mlir_ctx); + + // Decomposes the op in the NodeDef to a set of primitive ops according to the + // decompose library in the context. Wrap the decomposed result in a + // FunctionDef. + absl::StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name); + + // Runs the decompose passes on the user_module. + absl::Status DecomposeGraph(mlir::ModuleOp user_module); + + // Erases the tfr_module created. + void Destroy(); + + private: + mlir::ModuleOp tfr_module_; + mlir::PassManager pm_; + + GraphExportConfig export_confs_; +}; + +// Decomposes the NodeDef to a set of primitive ops according to the decompose +// library loaded. Wrap the decomposed result in a FunctionDef. 
+absl::StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name); + +// Decomposes the ops in the ModuleOp to a set of primitive ops according to +// decompose library in the context. +absl::Status DecomposeGraph(mlir::ModuleOp user_module); + +} // namespace tfr +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h new file mode 100644 index 00000000..2066d7ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ + +#include "llvm/ADT/StringSet.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FunctionInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +constexpr char kAttrArgumentNameAttr[] = "tfr.name"; +constexpr char kAttrArgumentDefaultAttr[] = "tfr.default"; +constexpr char kAttrArgumentTypeAttr[] = "tfr.type"; + +class TFRDialect : public Dialect { + public: + explicit TFRDialect(MLIRContext *context); + + static StringRef getDialectNamespace() { return "tfr"; } + + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; + + // Parse a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + // Prints a type registered to this dialect. 
+ void printType(Type ty, DialectAsmPrinter &os) const override; +}; + +} // namespace TFR +} // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_types.h new file mode 100644 index 00000000..e0e24f4a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/ir/tfr_types.h @@ -0,0 +1,126 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ + +#include +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/TypeSupport.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +class TFRType : public Type { + public: + using Type::Type; + + static bool classof(Type type); +}; + +namespace detail { + +struct TFRTypeStorage final + : public TypeStorage, + public llvm::TrailingObjects { + using KeyTy = ArrayRef; + + explicit TFRTypeStorage(unsigned num_attrs) : num_attrs(num_attrs) {} + + static TFRTypeStorage* construct(TypeStorageAllocator& allocator, KeyTy key) { + // Allocate a new storage instance. + auto byteSize = TFRTypeStorage::totalSizeToAlloc(key.size()); + auto rawMem = allocator.allocate(byteSize, alignof(TFRTypeStorage)); + auto result = ::new (rawMem) TFRTypeStorage(key.size()); + + // Copy in the string attributes into the trailing storage. 
+ std::uninitialized_copy(key.begin(), key.end(), + result->getTrailingObjects()); + return result; + } + + bool operator==(const KeyTy& attrs) const { return attrs == GetAttrs(); } + + KeyTy GetAttrs() const { + return {getTrailingObjects(), num_attrs}; + } + + unsigned num_attrs; +}; + +template +class TFRTypeImpl : public Type::TypeBase { + public: + using Base = Type::TypeBase; + using TFRBase = TFRTypeImpl; + using Base::Base; + + static Derived get(ArrayRef attrs, MLIRContext* context) { + return Base::get(context, attrs); + } + + static Derived getChecked(ArrayRef attrs, Location loc) { + return Base::getChecked(loc, loc.getContext(), attrs); + } + static Derived getChecked(function_ref emitError, + MLIRContext* context, ArrayRef attrs) { + return Base::getChecked(emitError, context, attrs); + } + + static Derived get(MLIRContext* context) { return get({}, context); } + + // TODO(fengliuai): fix the implementation + static LogicalResult verify(function_ref emitError, + ArrayRef attrs) { + return success(); + } + + ArrayRef getAttrKeys() { return Base::getImpl()->GetAttrs(); } +}; +} // namespace detail + +class TFRTensorType : public detail::TFRTypeImpl { + public: + using TFRBase::TFRBase; + static constexpr StringLiteral name = "tfr.tensor"; + static std::string getTypeName() { return "TFRTensorType"; } +}; + +class TFRTensorListType : public detail::TFRTypeImpl { + public: + using TFRBase::TFRBase; + static constexpr StringLiteral name = "tfr.tensor_list"; + static std::string getTypeName() { return "TFRTensorListType"; } +}; + +class TFRAttrType : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = "tfr.attr"; + static std::string getTypeName() { return "TFRAttrType"; } +}; + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/passes/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/passes/passes.h new file mode 100644 index 00000000..00bf1187 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/passes/passes.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_PASSES_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_PASSES_PASSES_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +// Scans the func op and adds all the canonicalization patterns of the ops +// except the tf ops, inside the function. +void populateCanonicalizationPatterns(func::FuncOp func, + RewritePatternSet &patterns); + +// Decompose ops. 
+std::unique_ptr> CreateDecomposeTFOpsPass( + std::optional tfr_module = std::nullopt); + +// Rewrites quantized operands and results with their storage types. +// This pass should be run at module level after decomposition, if there are +// quantized operands or results. +std::unique_ptr> CreateRewriteQuantizedIOPass(); + +// Raise to TF ops. +std::unique_ptr> CreateRaiseToTFOpsPass( + std::optional tfr_module = std::nullopt, + bool materialize_derived_attrs = false); + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_PASSES_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/utils/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/utils/utils.h new file mode 100644 index 00000000..911015ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfr/utils/utils.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_UTILS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_UTILS_UTILS_H_ + +#include + +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" + +namespace mlir { +namespace TFR { + +// This is a hardcoded rule for mapping a TF op name to the corresponding +// TFR function name. Examples: +// tf.Pack => tf__pack +// tf.ConcatV2 => tf__concat_v2 +// TODO(fengliuai): move to an util file. +std::string GetComposeFuncName(StringRef tf_op_name); + +// This is a hardcoded rule for mapping a TFR function op name to the +// corresponding TF opname. Examples: +// tf__pack -> tf.Pack +// tf__concat_v2 => tf.ConcatV2 +std::string GetTFOpName(StringRef compose_func_name); + +// Validate the attributes of 'src' is either contained in the registered +// attribute sets or in the allowed list. +LogicalResult ValidateAttrs(Operation* src, const StringSet<>& registered); + +// Copies all the allowed attributes in 'src' to 'dst'. The copy failed if the +// 'dst' has the attribute. Return a failure if there are any attributes are not +// allowed and also unregistered. +LogicalResult CopyAllowedUnregisteredAttrs(Operation* src, CallOp dst, + const StringSet<>& registered); + +// Copies all the allowed attributes in 'src' to 'dst'. FlatSymbolRefAttr is +// excluded. +LogicalResult CopyNonSymbolRefAttrs(CallOp src, Operation* dst); + +// Propagates all the attributes in 'src' to the operations between 'begin' and +// 'end'. Operation 'end' is excluded. 
+void PropagateAttrsToOperations(CallOp src, Block::iterator begin, + Block::iterator end); + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_UTILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h new file mode 100644 index 00000000..b27b6aa9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h @@ -0,0 +1,98 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COST_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COST_ANALYSIS_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/op_cost_map.pb.h" + +namespace tensorflow { +namespace tfrt_compiler { + +// Analyze costs for tensorflow operations. +// +// The current heuristic used is quite simple, which is to calculate the total +// size of input tensors. The exception is that ops whose cost is irrelevant to +// input sizes, such as tf.Shape and tf.Reshape, are whitelisted to have cheap +// cost. This cost analysis is expected to be used conservatively (eg. use a low +// threshold to decide whether a cost is cheap or expensive), as it might not be +// accurate in some cases. 
+// +class CostAnalysis { + public: + explicit CostAnalysis( + mlir::func::FuncOp func_op, + const tfrt_stub::CostRecorder* cost_recorder = nullptr) { + cost_recorder_ = cost_recorder; + AnalyzeArguments(func_op); + AnalyzeBlock(&func_op.front()); + } + + int64_t GetCost(mlir::Operation* op) const; + + private: + void AnalyzeArguments(mlir::func::FuncOp func_op); + void AnalyzeBlock(mlir::Block* block); + void EvaluateCost(mlir::Operation* op); + + int64_t max_arg_size_ = 1; + llvm::DenseMap cost_map_; + const tfrt_stub::CostRecorder* cost_recorder_; +}; + +struct CostContext { + int64_t default_unranked_tensor_size; +}; + +using CostFunction = + std::function; + +void RegisterCostFunction(absl::string_view op_name, + CostFunction cost_function); + +template +void RegisterCostFunction(F f) { + RegisterCostFunction( + OpType::getOperationName().str(), + [f = std::move(f)](const CostContext& context, mlir::Operation* op) { + return f(context, llvm::cast(op)); + }); +} + +template +struct CostFunctionRegistration { + explicit CostFunctionRegistration( + std::function cost_function) { + RegisterCostFunction(std::move(cost_function)); + } +}; + +bool HasCostFunctionRegistered(absl::string_view op_name); + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COST_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/tensor_array_side_effect_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/tensor_array_side_effect_analysis.h new file mode 100644 index 00000000..4f8501b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/analysis/tensor_array_side_effect_analysis.h @@ -0,0 +1,52 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_TENSOR_ARRAY_SIDE_EFFECT_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_TENSOR_ARRAY_SIDE_EFFECT_ANALYSIS_H_ + +#include "llvm/ADT/DenseSet.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { +namespace tfrt_compiler { + +// Return true if it is a TensorArrayOp, eg. TensorArrayV3Op. +bool IsTensorArrayOp(mlir::Operation* op); + +// This class provides utilities for analyzing side effects for TensorArray ops +// in the graph. mlir::TF::SideEffectAnalysis currently produces suboptimal +// side-effect analysis for TensorArray ops. On the other hand, control +// dependencies are already sorted out for TensorArray ops in the original TF +// graph. Each TensorArray op will take or produce a `flow` value and they are +// already properly chained in the origninal TF graph. 
+class TensorArraySideEffectAnalysis { + public: + explicit TensorArraySideEffectAnalysis(mlir::ModuleOp module); + + // Return if the function contains only non-side-effecting ops or TensorArray + // ops. + bool HasAtMostTensorArrayEffect(mlir::func::FuncOp func_op) const { + return set_.count(func_op) > 0; + } + + private: + llvm::DenseSet set_; +}; + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_TENSOR_ARRAY_SIDE_EFFECT_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/backend_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/backend_compiler.h new file mode 100644 index 00000000..7167c8ef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/backend_compiler.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ + +#include "absl/status/status.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "tensorflow/core/tfrt/runtime/runtime.h" + +namespace tensorflow { + +class BackendCompiler { + public: + virtual ~BackendCompiler(); + + virtual void GetDependentDialects(mlir::DialectRegistry& registry) const {} + + // Compile the `module` in TF dialect. The result module should be also in TF + // dialect. + virtual absl::Status CompileTensorflow( + tfrt_stub::ModelRuntimeContext& model_context, + mlir::ModuleOp module) const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/constants.h new file mode 100644 index 00000000..ed6e773c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/constants.h @@ -0,0 +1,28 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ + +namespace tensorflow { +namespace tfrt_compiler { + +// Use __ prefix to indicate this is internal attribute. 
+inline constexpr char kOpKeyAttrName[] = "__op_key"; + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/function/function.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/function/function.h new file mode 100644 index 00000000..8d09f8cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/function/function.h @@ -0,0 +1,83 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_FUNCTION_FUNCTION_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_FUNCTION_FUNCTION_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/platform/status.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/core_runtime/tensor_handle.h" // from @tf_runtime + +namespace tfrt { +class CoreRuntime; +} + +namespace mlir { +class ModuleOp; +} + +namespace tensorflow { + +struct TfrtFunctionCompileOptions : public TfrtCompileOptions { + // Currently only SavedModel API inference uses the tpu_fuse_ops option + TfrtFunctionCompileOptions() { + tpu_fuse_ops = false; + // Currently grappler is not correctly applied in the eager execution of TF + // functions, as it may sometimes remove arguments and results. + enable_grappler = false; + } + + // If true, use ServingCoreSelector to pick TPU core. Otherwise, obtain core + // location from assigned device name. + // Currently we don't use core_selector for training use cases. + bool tpu_use_core_selector = false; + + // If true, use BundledTransferToTpuOp to transfer variables and input tensors + // to TPU. + bool tpu_use_bundled_transfer = false; + + // If true, lower an TF op that's placed on TPU device to be executed with + // tfrt_fallback.execute. + // Currently for training use cases we need to lower the op to corert.execute + // to execute with TPU OpHandler, and with TFRT's native implementation. + // TODO(b/188940204): remove this config after we clear up the TPU variable + // implementation. + bool tpu_lower_to_fallback = false; + // If true, transfer the result of TPUExecuteOp from TPU to host. + // Currently for training and Python bulk inference use cases, we don't need + // to proactively transfer the result to host since the consumer op (or + // function) of the result may still be on TPU. + // TODO(b/194081364): remove this option once we unify servo TPU serving + // result transfer behavior. + bool tpu_transfer_result_to_host = false; +}; + +// Compile MLIR generated by tf.function in TF dialect into BEF. 
+absl::Status CompileTFMLIRToBEF(const TfrtFunctionCompileOptions& options, + mlir::ModuleOp module, + tfrt::BefBuffer* bef_buffer); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_FUNCTION_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.h new file mode 100644 index 00000000..ff06f069 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.h @@ -0,0 +1,41 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_GPU_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_GPU_OPS_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project + +using namespace mlir; // NOLINT + +namespace tfrt { +namespace gpu { + +// Dialect for TFRT GPU operations. +class GpuRuntimeDialect : public Dialect { + public: + explicit GpuRuntimeDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "gpurt"; } +}; + +} // namespace gpu +} // namespace tfrt + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/gpu_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_GPU_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h new file mode 100644 index 00000000..644de261 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
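A minimal sketch of the BEF compilation entry point declared above; the wrapper name CompileForTfrt is hypothetical, and the default-constructed options simply carry the eager-function defaults described in TfrtFunctionCompileOptions:

#include "absl/status/status.h"
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tfrt/function/function.h"
#include "tfrt/bef/bef_buffer.h"  // from @tf_runtime

// Compiles a TF-dialect module into a BEF buffer using the function-compile
// defaults (grappler off, TPU op fusion off).
absl::Status CompileForTfrt(mlir::ModuleOp module, tfrt::BefBuffer* bef) {
  tensorflow::TfrtFunctionCompileOptions options;
  return tensorflow::CompileTFMLIRToBEF(options, module, bef);
}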
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project + +namespace mlrt { +namespace compiler { + +class MlrtDialect : public mlir::Dialect { + public: + explicit MlrtDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "mlrt"; } + + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + void printType(mlir::Type type, mlir::DialectAsmPrinter &os) const override; +}; + +// The MLIR type represents a C++ mlrt::Future. +class FutureType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.future"; +}; + +// The MLIR type represents a C++ mlrt::Promise. +class PromiseType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.promise"; +}; + +// The MLIR type represents a C++ mlrt::AsyncHandle. +class AsyncHandleType : public mlir::Type::TypeBase { + public: + using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.async_handle"; +}; + +} // namespace compiler +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h new file mode 100644 index 00000000..e3922c6e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h new file mode 100644 index 00000000..a542373e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h @@ -0,0 +1,63 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tfrt/compiler/opdefs/tfrt_op_interfaces.h" // from @tf_runtime +#include "tfrt/compiler/opdefs/tfrt_traits.h" // from @tf_runtime + +namespace tensorflow { +namespace tf_mlrt { + +class TensorflowMlrtDialect : public mlir::Dialect { + public: + explicit TensorflowMlrtDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "tf_mlrt"; } + + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + void printType(mlir::Type type, mlir::DialectAsmPrinter &os) const override; +}; + +// The MLIR type represents a tensorflow::Tensor. +class TFTensorType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static constexpr mlir::StringLiteral name = "tensorflow.tf_mlrt.tf_tensor"; +}; + +// The MLIR type represents a tensorflow::Device* +class TFDeviceType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static constexpr mlir::StringLiteral name = "tensorflow.tf_mlirt.tf_device"; +}; + +} // namespace tf_mlrt +} // namespace tensorflow + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h new file mode 100644 index 00000000..a428488d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
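A small sketch of how the tf_mlrt types declared above are typically queried; the helper name IsTfMlrtTensor is an assumption for illustration:

#include "mlir/IR/Value.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h"

// True if `value` carries a fallback tensorflow::Tensor in the tf_mlrt
// dialect's type system.
bool IsTfMlrtTensor(mlir::Value value) {
  return mlir::isa<tensorflow::tf_mlrt::TFTensorType>(value.getType());
}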
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace tensorflow { +namespace tf_mlrt_tpu { + +class TensorflowMlrtTpuDialect : public mlir::Dialect { + public: + explicit TensorflowMlrtTpuDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "tf_mlrt_tpu"; } +}; + +} // namespace tf_mlrt_tpu +} // namespace tensorflow + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h new file mode 100644 index 00000000..24fa464f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +using namespace mlir; // NOLINT + +namespace tfrt { +namespace fallback { + +// Dialect for fallback operations. +class FallbackDialect : public Dialect { + public: + explicit FallbackDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "tfrt_fallback"; } + + Type parseType(DialectAsmParser &parser) const override; + void printType(Type type, DialectAsmPrinter &os) const override; +}; + +// The MLIR type represents a tensorflow::Tensor. +class TFTensorType : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = "tfrt.tf_tensor"; +}; + +// The MLIR type represents a tensorflow::Allocator. 
+class TFAllocatorType + : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = "tfrt.tf_allocator"; +}; + +} // namespace fallback +} // namespace tfrt + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h new file mode 100644 index 00000000..eab44d1d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_ASYNC_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_ASYNC_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tfrt/compiler/opdefs/tfrt_op_interfaces.h" // from @tf_runtime +#include "tfrt/compiler/opdefs/tfrt_traits.h" // from @tf_runtime +#include "tfrt/core_runtime/opdefs/traits.h" // from @tf_runtime + +using namespace mlir; // NOLINT + +namespace tfrt { +namespace fallback_async { + +// Dialect for fallback async operations. +class FallbackAsyncDialect : public Dialect { + public: + explicit FallbackAsyncDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "tfrt_fallback_async"; } +}; + +} // namespace fallback_async +} // namespace tfrt + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_ASYNC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h new file mode 100644 index 00000000..0cddb101 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h @@ -0,0 +1,127 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_COMMON_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_COMMON_H_ + +#include <utility> + +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tfrt/basic_kernels/opdefs/types.h" // from @tf_runtime + +namespace tfrt { +namespace fallback_common { + +template <typename OpTy> +mlir::LogicalResult VerifyExecuteOpCommon(OpTy op) { + auto op_attr_array = op.getOpAttrs().getValue(); + for (auto op_attr : op_attr_array) { + auto key_value = mlir::dyn_cast<mlir::ArrayAttr>(op_attr); + if (!key_value || key_value.getValue().size() != 2 || + !mlir::isa<mlir::StringAttr>(key_value.getValue()[0])) + return op.emitOpError() << "each op_attr should be a key-value pair, " + "where the key is a string"; + } + return mlir::success(); +} + +template <typename OpTy> +mlir::LogicalResult VerifyFallbackExecuteOp(OpTy op) { + auto result = VerifyExecuteOpCommon(op); + if (failed(result)) return result; + + // Verify function attributes. + auto op_func_attr_array = op.getOpFuncAttrs().getValue(); + for (auto op_attr : op_func_attr_array) { + auto key_value = mlir::dyn_cast<mlir::ArrayAttr>(op_attr); + if (!key_value || key_value.getValue().size() != 2 || + !mlir::isa<mlir::StringAttr>(key_value.getValue()[0]) || + !mlir::isa<mlir::StringAttr>(key_value.getValue()[1])) + return op.emitOpError() << "each op_func_attr should be a key-value " + "pair, where both the key and the value are " + "strings"; + } + return mlir::success(); +} + +template <typename OpTy> +void PrintExecuteOpFuncAttribute(mlir::OpAsmPrinter &p, OpTy op) { + auto op_func_attrs = op.getOpFuncAttrs(); + if (!op_func_attrs.empty()) { + auto print_key_value = [&](mlir::Attribute attr) { + auto key_value = mlir::cast<mlir::ArrayAttr>(attr).getValue(); + auto key = key_value[0]; + auto value = key_value[1]; + + p << mlir::cast<mlir::StringAttr>(key).getValue(); + p << " = "; + p << value; + }; + + auto op_func_attr_array = op_func_attrs.getValue(); + p << " {"; + llvm::interleaveComma(op_func_attr_array, p, print_key_value); + p << '}'; + } +} + +template <typename OpTy> +void PrintExecuteOpCommon(mlir::OpAsmPrinter &p, OpTy op) { + auto op_attrs = op.getOpAttrs(); + if (!op_attrs.empty()) { + auto print_key_value = [&](mlir::Attribute attr) { + auto key_value = mlir::cast<mlir::ArrayAttr>(attr).getValue(); + auto key = key_value[0]; + auto value = key_value[1]; + + p << mlir::cast<mlir::StringAttr>(key).getValue(); + p << " = "; + p << value; + }; + + auto op_attr_array = op_attrs.getValue(); + p << " {"; + llvm::interleaveComma(op_attr_array, p, print_key_value); + p << '}'; + } +} + +void GetExecuteOpAttrsCommon( + mlir::MLIRContext *context, llvm::ArrayRef<mlir::Attribute> op_attr_array, + llvm::SmallVectorImpl<std::pair<llvm::StringRef, mlir::Attribute>> + *op_attrs); + +struct ParseExecuteOpOptions { + bool has_chain = false; + bool has_key = false; + bool has_device = false; + bool has_func_attr = false; + bool has_cost = false; + bool has_op_name = true; + bool has_symbol_ref = false; +}; + +mlir::ParseResult ParseExecuteOpCommon(mlir::OpAsmParser &parser, + mlir::Builder &builder, + mlir::OperationState &result, + mlir::Type tensor_type, + const ParseExecuteOpOptions &options); +} // namespace fallback_common +} // namespace tfrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.h new
file mode 100644 index 00000000..78e99830 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_SYNC_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_SYNC_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tfrt/core_runtime/opdefs/traits.h" // from @tf_runtime +#include "tfrt/tensor/opdefs/tensor.h" // from @tf_runtime + +using namespace mlir; // NOLINT + +namespace tfrt { +namespace fallback_sync { + +// Dialect for fallback operations. +class FallbackSyncDialect : public Dialect { + public: + explicit FallbackSyncDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "tfrt_fallback_sync"; } +}; + +} // namespace fallback_sync +} // namespace tfrt + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_SYNC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_util.h new file mode 100644 index 00000000..93235ec6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_util.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
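As an aside on the helpers in tfrt_fallback_common.h above: `ParseExecuteOpOptions` toggles which optional segments `ParseExecuteOpCommon` expects when parsing an execute op's custom assembly. The following is a minimal sketch of how a dialect's parser hook could call it; the function name `parseMyExecuteOp` and the choice of the fallback `TFTensorType` as the tensor type are illustrative assumptions, not part of these headers.

// Illustrative sketch only: wires the shared parser helper into a
// hypothetical fallback execute op.
#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h"
#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h"

static mlir::ParseResult parseMyExecuteOp(mlir::OpAsmParser &parser,
                                          mlir::OperationState &result) {
  tfrt::fallback_common::ParseExecuteOpOptions options;
  options.has_key = true;        // expect an integer op key
  options.has_device = true;     // expect a device string
  options.has_func_attr = true;  // expect function-valued attributes

  mlir::Builder builder(result.getContext());
  // Assumed tensor type for the operands/results of the op being parsed.
  mlir::Type tensor_type = builder.getType<tfrt::fallback::TFTensorType>();
  return tfrt::fallback_common::ParseExecuteOpCommon(parser, builder, result,
                                                     tensor_type, options);
}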
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_UTIL_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace tfrt { +namespace fallback_async { + +bool IsArgConsumedByFallback(mlir::func::FuncOp func, int arg_index); + +void ForEachArgConsumedByFallback( + mlir::func::FuncOp func, llvm::function_ref action); + +void ForEachArgConsumedByFallback( + mlir::ModuleOp module, + llvm::function_ref action); + +} // namespace fallback_async +} // namespace tfrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.h new file mode 100644 index 00000000..41c2b818 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.h @@ -0,0 +1,63 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_EXECUTOR_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/bef_executor/bef_file.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tensorflow { + +class RuntimeFallbackExecutor { + public: + explicit RuntimeFallbackExecutor(int64_t num_threads); + + // Prepare() needs to be called once before calling Execute(). It sets up all + // things necessary to execute the given 'mlir_input' with the fallback to + // tensorflow. + void Prepare(llvm::StringRef mlir_input); + + // Execute() can be called several times after the call to Prepare() (e.g. for + // benchmarking). 
+ llvm::SmallVector Execute(llvm::StringRef function_name, + llvm::ArrayRef arguments); + + private: + void RunTfrtInitializer(); + + std::unique_ptr intra_op_; + std::unique_ptr host_context_; + tfrt::ResourceContext resource_context_; + std::unique_ptr exec_ctx_; + tfrt::BefBuffer bef_buffer_; + tfrt::RCReference bef_file_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h new file mode 100644 index 00000000..9d77a1a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the Runtime Fallback dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tfrt/tensor/opdefs/tensor.h" // from @tf_runtime + +namespace mlir { +namespace tfd { + +// Dialect for TFRT delegate operations. +class RuntimeFallbackDialect : public Dialect { + public: + explicit RuntimeFallbackDialect(MLIRContext* context); + static StringRef getDialectNamespace() { return "tfd"; } +}; + +} // namespace tfd +} // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h new file mode 100644 index 00000000..087d50de --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h @@ -0,0 +1,80 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
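For orientation, `RuntimeFallbackExecutor` above is a prepare-once, execute-many wrapper. A sketch of the intended call pattern follows; it assumes the elided element types in `Execute` are `tensorflow::Tensor`, and the function name "main" is a placeholder.

#include <string>
#include <vector>

#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.h"
#include "tensorflow/core/framework/tensor.h"

void RunOnce(const std::string &mlir_module_text,
             const std::vector<tensorflow::Tensor> &inputs) {
  tensorflow::RuntimeFallbackExecutor executor(/*num_threads=*/4);
  // Prepare() compiles the module and must be called once before Execute().
  executor.Prepare(mlir_module_text);
  // Execute() may then be called repeatedly, e.g. inside a benchmark loop.
  auto outputs = executor.Execute("main", inputs);
  (void)outputs;
}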
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/core_runtime/tensor_handle.h" // from @tf_runtime + +namespace tfrt { +class CoreRuntime; +} + +namespace mlir { +class ModuleOp; +} + +namespace tensorflow { + +// TFRTSavedModelSignatureInfo contains the metadata for a signature in the +// savedmodel such as function name, inputs/outputs' names and types. This can +// be used to retrieve these information in a tf_saved_model module. +struct TFRTSavedModelSignatureInfo { + llvm::StringRef func_name; + + // The following are metadata for inputs. + llvm::ArrayRef input_names; + llvm::ArrayRef< + std::pair> + input_specs; + llvm::ArrayRef input_devices; + + // The following are metadata for outputs. + llvm::ArrayRef output_names; + llvm::ArrayRef< + std::pair> + output_specs; + + // The following are metadata for bound_inputs, ie. captures. + llvm::ArrayRef bound_inputs; +}; + +// Apply `map_fn` on every exported function in the module with the +// corresponding signature metadata populated in TFRTSavedModelSignatureInfo for +// the function. +absl::Status MapFunctionSignaturesFromTFSavedModelMLIR( + mlir::ModuleOp module, + llvm::function_ref map_fn); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/tfrt_fallback_registration.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/tfrt_fallback_registration.h new file mode 100644 index 00000000..65f1554e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/tfrt_fallback_registration.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements TFRuntimeFallback tensor conversion function for +// converting to host tensor. + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TFRT_FALLBACK_REGISTRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TFRT_FALLBACK_REGISTRATION_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project + +namespace tensorflow { +namespace tfd { + +// Register conversion functions for TFRuntimeFallbackTensors. 
+void RegisterTfrtFallbackDialect(mlir::DialectRegistry ®istry); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TFRT_FALLBACK_REGISTRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.h new file mode 100644 index 00000000..791cb346 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.h @@ -0,0 +1,42 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_ATTR_LOWERING_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_ATTR_LOWERING_UTILS_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project + +namespace tensorflow { + +// TODO(chky): attributes "_output_shapes" should be removed by any tool that +// generates TF MLIR dialect, as they are not used by CoreRuntime. Remove this +// filtering logic once unused attributes are cleaned up in the upper layer. +bool IsUnusedTfrtAttribute(llvm::StringRef name); + +// Create a single attribute that contains the named attribute lists. It is an +// array of pairs. The key must be a string attribute, and the value can be +// any attribute that is supported by CoreRuntime. +mlir::ArrayAttr CreateTfrtOpAttrs(llvm::ArrayRef attrs, + mlir::Builder& builder); + +bool IsSupportedTfrtNumericDType(mlir::Type type); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_ATTR_LOWERING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h new file mode 100644 index 00000000..be212e44 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h @@ -0,0 +1,175 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
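A small sketch of how the attribute-lowering helpers in attr_lowering_utils.h above are typically combined, assuming the elided `ArrayRef` element type in `CreateTfrtOpAttrs` is `mlir::NamedAttribute`; the wrapper function name is illustrative.

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.h"

// Drops attributes that CoreRuntime does not consume and packs the rest into
// the array-of-pairs form described above.
static mlir::ArrayAttr LowerOpAttributes(mlir::Operation *op,
                                         mlir::Builder &builder) {
  llvm::SmallVector<mlir::NamedAttribute, 4> kept;
  for (mlir::NamedAttribute attr : op->getAttrs()) {
    if (tensorflow::IsUnusedTfrtAttribute(attr.getName().getValue())) continue;
    kept.push_back(attr);
  }
  return tensorflow::CreateTfrtOpAttrs(kept, builder);
}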
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_CORERT_CONVERTER_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_CORERT_CONVERTER_H_ + +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" +#include "tfrt/basic_kernels/opdefs/types.h" // from @tf_runtime +#include "tfrt/core_runtime/opdefs/core_runtime.h" // from @tf_runtime +#include "tfrt/core_runtime/opdefs/types.h" // from @tf_runtime + +namespace tensorflow { + +struct ParseDeviceNameResult { + std::string device_type; + std::string device_name; + std::string op_handler_name; +}; + +// A helper class for converting CoreRT types and attributes. +class CoreRTConverter : public mlir::TypeConverter { + public: + CoreRTConverter( + mlir::MLIRContext *context, + const mlir::TF::SideEffectAnalysis::Info *side_effect_analysis); + // Materialize all derived attributes. Note that this is only needed by + // CoreRT ops and fallback ops. + void MaterializeDerivedAttributes(mlir::Operation *op); + + // Similar to CreateOpAttrs, create a single attribute that contains the + // named attribute lists, which is an array of pairs, with keys and values + // both being string attributes. The values represent function names. + // This method also populates a vector of attribute keys to be removed. + // If `use_mlir_func_name` is true, the function name given by MLIR will be + // used, which could be different from the original function name in the graph + // function library. This is used when the original function has been changed + // by lowering passes, and hence it needs to be exported to function library + // for runtime to use. + mlir::ArrayAttr CreateOpFuncAttrs( + const mlir::SymbolTable &symbol_table, + llvm::ArrayRef attrs, + llvm::SmallVector *func_attr_keys, + bool use_mlir_func_name = false); + + // Parse the device name of `op` to TFRT's device name. For example, "/CPU:0" + // will be parsed as "cpu". Return None if no device is assigned. + std::optional ParseDeviceName( + llvm::StringRef device_name) const; + std::optional ParseDeviceName( + mlir::Operation *op) const; + + // Convert the device name in a TF op to a op_handler value produced by the + // corresponding GetOpHandler in the current block. If there does not exist + // one, insert a GetOpHandler to the beginning of the block and return the + // device value. + mlir::Value ConvertOpHandler(mlir::Operation *op, llvm::StringRef device_name, + mlir::ConversionPatternRewriter *rewriter); + + // Get a DistributedContext value to be used by the given op. The + // DistributedContext value should be shared by all operations in the body + // of the same FuncOp. If there does not exist one, return a null Value. + mlir::Value GetDistributedContext(mlir::Operation *op, + mlir::ConversionPatternRewriter *rewriter); + + // Get a RemoteChainManager value to be used by the given op. 
The + // RemoteChainManager value should be shared by all operations in the body + // of the same FuncOp. If there does not exist one, return a null Value. + mlir::Value GetRemoteChainManager(mlir::Operation *op, + mlir::ConversionPatternRewriter *rewriter); + + // Get a TaskHandle value with the given task name. If the TaskHandle value + // has already been created for the given task name within the same FuncOp, + // return this TaskHandle value. Otherwise, return a null Value. + mlir::Value GetTaskHandle(mlir::Operation *op, StringRef task_name, + mlir::ConversionPatternRewriter *rewriter); + + // Any local operation which uses any result of the `op` should depend on the + // given `chain`. + void RegisterLocalSideEffectChain(mlir::Operation *op, mlir::Value chain) { + local_side_effect_chains_[op] = chain; + } + + // Return a local chain for side effects for `op`. If there are multiple + // chains, a merge_chains kernel will be inserted and the merged chain will be + // returned. + mlir::Value GetLocalSideEffectChain( + mlir::Operation *op, mlir::ConversionPatternRewriter *rewriter); + + mlir::Type op_handler_type() { + return builder_.getType<::tfrt::corert::OpHandlerType>(); + } + + mlir::Type tensor_handle_type() { + return builder_.getType<::tfrt::corert::TensorHandleType>(); + } + + mlir::Type chain_type() { + return builder_.getType<::tfrt::compiler::ChainType>(); + } + + mlir::Builder &builder() { return builder_; } + + private: + // TODO(chky): attributes "_output_shapes" should be removed by any tool that + // generates TF MLIR dialect, as they are not used by CoreRuntime. Remove this + // filtering logic once unused attributes are cleaned up in the upper layer. + bool IsUnusedAttribute(llvm::StringRef name) const { + // NOTE: attributes "f.*" are function attribute related and + // are added during importing graph to MLIR TF Executor dialect. These + // attributes are not actually used by TF ops with function attributes. + // TODO(b/180399811): Re-evaluate the usage of these attributes. + static const char *const kUnusedAttributes[] = { + "_output_shapes", + "result_segment_sizes", + "operand_segment_sizes", + }; + + for (auto attr : kUnusedAttributes) { + if (name == attr) { + return true; + } + } + + return name.contains("f."); + } + + // Returns the converted attribute in TFRT dialect. If the conversion fails, + // returns a null attribute instead. + mlir::Attribute ConvertAttribute(mlir::Attribute attr); + + mlir::TypeAttr ConvertTypeAttribute(mlir::TypeAttr type_attr); + + mlir::Builder builder_; + + const mlir::TF::SideEffectAnalysis::Info &side_effect_analysis_; + + llvm::DenseMap local_side_effect_chains_; + llvm::DenseMap distributed_context_by_func_; + llvm::DenseMap remote_chain_mgr_by_func_; + llvm::DenseMap> + task_handles_by_func_; + llvm::StringMap op_handler_by_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_CORERT_CONVERTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h new file mode 100644 index 00000000..c1c1d42a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h @@ -0,0 +1,96 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
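As a usage note for `CoreRTConverter` above, device resolution goes through `ParseDeviceName`. A short sketch, assuming the elided optional payload is `ParseDeviceNameResult` and taking the CPU fallback value from the comment on `ParseDeviceName` ("/CPU:0" parses to "cpu"); the wrapper name is illustrative.

#include <string>

#include "tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h"

// Returns the TFRT op handler name for the device assigned to `op`,
// defaulting to the CPU handler when no device is assigned.
static std::string GetOpHandlerName(
    const tensorflow::CoreRTConverter &converter, mlir::Operation *op) {
  auto parsed = converter.ParseDeviceName(op);
  if (!parsed) return "cpu";
  return parsed->op_handler_name;
}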
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_FALLBACK_CONVERTER_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_FALLBACK_CONVERTER_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace tensorflow { +namespace tfrt_compiler { + +inline llvm::StringRef GetDefaultCpuDeviceName() { + static constexpr char kCpuDeviceName[] = + "/job:localhost/replica:0/task:0/device:CPU:0"; + return kCpuDeviceName; +} + +class FallbackConverter : public mlir::TypeConverter { + public: + explicit FallbackConverter(mlir::MLIRContext *context); + + // Return the next dense key for fallback ops. The key is simply an array + // index so that in runtime, the fallback ops can be efficiently retrieved. + int64_t GetNextFallbackKey() const { return fallback_ops_.size(); } + + void RegisterFallbackOp(mlir::Operation *op) { fallback_ops_.push_back(op); } + + void ReplaceFallbackOp(int64_t key, mlir::Operation *op) { + fallback_ops_[key] = op; + } + + llvm::ArrayRef GetFallbackOps() const { + return fallback_ops_; + } + + private: + mlir::Builder builder_; + // Using a vector to keep fallback ops in order, and the key for a fallback op + // is its corresponding index here. + llvm::SmallVector fallback_ops_; +}; + +// Convert the `value` that is a !corert.tensorhandle to +// !tfrt_fallback.tf_tensor. If needed, tensor conversion kernels will be added. +// On error it returns nullptr. +mlir::Value ConvertCoreRTTensorHandleToFallbackTensor( + mlir::Location loc, llvm::StringRef device, mlir::Value value, + mlir::ConversionPatternRewriter &rewriter); + +// Convert the `value` that is a !tfrt_fallback.tf_tensor to +// !corert.tensorhandle. If needed, tensor conversion kernels will be added. On +// error it returns nullptr. +mlir::Value ConvertFallbackTensorToCoreRTTensorHandle( + mlir::Location loc, mlir::Value value, + mlir::ConversionPatternRewriter &rewriter); + +// Convert operands that might be !tfrt_fallback.tf_tensor for corert operations +// that take only !corert.tensorhandle. +mlir::LogicalResult ConvertCoreRTOperands( + mlir::Operation *op, mlir::ValueRange operands, + llvm::SmallVectorImpl *new_operands, + mlir::ConversionPatternRewriter &rewriter); + +// Convert operands that might be !corert.tensorhandle for fallback operations +// that take only !tfrt_fallback.tf_tensor. 
+mlir::LogicalResult ConvertFallbackOperands( + mlir::Operation *op, llvm::StringRef device, mlir::ValueRange operands, + llvm::SmallVectorImpl *new_operands, + mlir::ConversionPatternRewriter &rewriter); + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_FALLBACK_CONVERTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h new file mode 100644 index 00000000..801e10bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h @@ -0,0 +1,38 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_GPU_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_GPU_PASSES_H_ + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace tensorflow { + +// Registers dialects used in TFRT GPU lowering. +void RegisterGpuDialects(mlir::DialectRegistry *registry); + +// Adds a target dialect and rewrite patterns for TFRT GPU lowering. +void AddGpuTargetDialectAndPatterns(mlir::MLIRContext *context, + mlir::ConversionTarget *target, + mlir::RewritePatternSet *patterns); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_GPU_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h new file mode 100644 index 00000000..a345d1d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
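To make the key bookkeeping in `FallbackConverter` above concrete, here is a minimal sketch of the register-then-replace flow a lowering pattern would follow; the helper names are illustrative.

#include "tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h"

namespace example {

// Takes the next dense key and records the lowered op under it.
int64_t AssignFallbackKey(
    tensorflow::tfrt_compiler::FallbackConverter &converter,
    mlir::Operation *lowered_op) {
  int64_t key = converter.GetNextFallbackKey();  // == number of registered ops
  converter.RegisterFallbackOp(lowered_op);
  return key;  // typically attached to `lowered_op` as an integer attribute
}

// If a later rewrite replaces the op, keep the key pointing at the new op.
void UpdateFallbackOp(tensorflow::tfrt_compiler::FallbackConverter &converter,
                      int64_t key, mlir::Operation *new_op) {
  converter.ReplaceFallbackOp(key, new_op);
}

}  // namespace example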
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Extracts a module that consists of a public callback function in name of +// `callback_key` and all its reachables. +absl::StatusOr> ExtractCallbackModule( + mlir::ModuleOp module, absl::string_view callback_key); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h new file mode 100644 index 00000000..0dfaa081 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h @@ -0,0 +1,67 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_BACKEND_COMPILER_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_BACKEND_COMPILER_H_ + +#include "absl/status/status.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/backend_compiler.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Implements the custom backend compiler for IFRT based serving in TFRT. +class IfrtBackendCompiler : public tensorflow::BackendCompiler { + public: + struct Options { + // If true, disable running TFRTSetTPUDeviceAttrPass which set the default + // `tf.device` and `device_assignment` attributes. + // This is a server-level option for now. We can consider to make it a + // per-model option in the future. + bool disable_set_default_tpu_device_and_device_assignment_attributes = true; + }; + + explicit IfrtBackendCompiler(TpuCompiler* tpu_compiler = nullptr) + : tpu_compiler_(tpu_compiler) {} + + explicit IfrtBackendCompiler(const Options& ifrt_backend_compile_options, + TpuCompiler* tpu_compiler = nullptr) + : tpu_compiler_(tpu_compiler), + compile_options_(ifrt_backend_compile_options) {} + + void GetDependentDialects(mlir::DialectRegistry& registry) const override { + if (tpu_compiler_) { + tpu_compiler_->RegisterTPUDialects(®istry); + } + } + + // Rewrites the tensorflow graph in MLIR for IFRT serving. The methods + // extracts regions for IFRT execution on accelerator (e.g. TPU). 
+ absl::Status CompileTensorflow( + tensorflow::tfrt_stub::ModelRuntimeContext& model_context, + mlir::ModuleOp module) const override; + + private: + TpuCompiler* tpu_compiler_; // Not owned. + Options compile_options_; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_BACKEND_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h new file mode 100644 index 00000000..3e497182 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h @@ -0,0 +1,40 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_CONSTANTS_H_ + +#include "absl/strings/string_view.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Attribute name of a text TpuCompileMetadataProto. Note that the text proto is +// not backward compatible and shall not be serialized. +inline constexpr absl::string_view kMetadataTextAttrName = + "__tpu_compile_metadata_text"; + +// Name of a variable as loaded IFRT array . +inline constexpr absl::string_view kVariableArrayNameAttr = + "__variable_array_name"; + +// Attribute of a text `VariableDeviceShardingConfigProto`. +inline constexpr absl::string_view kVariableShardingConfigTextAttr = + "__variable_sharding_config_text"; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h new file mode 100644 index 00000000..c64672cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h @@ -0,0 +1,33 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +struct DtypeAndShape { + tensorflow::DataType dtype; + tensorflow::TensorShape shape; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h new file mode 100644 index 00000000..7122f26e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h @@ -0,0 +1,86 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF2HLO_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF2HLO_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_compilation.pb.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/topology.h" +#include "xla/service/hlo.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +struct Tf2HloArg { + mlir::ModuleOp module; + // `input_dtypes_and_shapes` can be mutable during Tf2HLO compilation. + std::vector input_dtypes_and_shapes; + absl::Span variable_arg_indices; + absl::string_view entry_function_name; + // `compile_metadata` can be mutable during Tf2HLO compilation. 
+ tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn; + std::shared_ptr topology; + absl::string_view platform_name; + bool enable_r1_optimization = true; + + absl::StatusOr Fingerprint() const; +}; + +struct Tf2HloResult { + xla::HloModuleProto hlo_module_proto; + tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + tf2xla::HostComputeMetadata host_compute_metadata; + Tf2HLOResultProto ToProto() const; +}; + +absl::Status UpdateCompileMetadata( + tensorflow::tpu::TPUCompileMetadataProto& metadata, + absl::Span inputs); + +absl::StatusOr GetCompileMetadata( + mlir::ModuleOp module, const xla::ifrt::Client& ifrt_client); + +class TfToHloCompiler { + public: + TfToHloCompiler() = default; + virtual ~TfToHloCompiler() = default; + + // Returns a cache key that can be used to identify the result of + // CompileTfToHlo. + virtual absl::StatusOr Key(const Tf2HloArg& arg); + + virtual absl::StatusOr CompileTfToHlo(Tf2HloArg& arg); +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF2HLO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h new file mode 100644 index 00000000..34490c74 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h @@ -0,0 +1,86 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF_IFRT_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF_IFRT_PASSES_H_ + +#include +#include + +#include "absl/status/status.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project + +namespace tensorflow { +namespace ifrt_serving { + +// Create a pass to convert tf_device.cluster_func to tf.ifrt_program_call. +std::unique_ptr> +CreateRewriteClusterToIfrtCallPass(); + +// Creates a pass that sinks variable tensor argument to `tf.IfrtCall` as named +// arrays and lowers `tf.ReadVariableOp` to `tf.IfrtLoadVariableOp`. +std::unique_ptr> +CreateSinkVariableAsNamedArrayPass(); + +// Creates a pass that splits `tf.RestoreV2` ops. +std::unique_ptr> +CreateTfRestoreSplittingPass(); + +// Creates a pass that merges `tf.RestoreV2` ops. +std::unique_ptr> +CreateTfRestoreMergingPass(); + +// Creates a pass that propagates inputs of no-op identity ops to their outputs. +std::unique_ptr> +CreateTfIdentityPropagationPass(); + +// Creates a pass that prunes unused `tf.RestoreV2` ops. 
+std::unique_ptr> +CreateTfRestorePruningPass(); + +// Creates a pass that lower `tf.RestoreVariableOp` to +// `tf.IfrtRestoreVariableOp`. +std::unique_ptr> +CreateLowerToIfrtRestoreVariablePass(); + +// Creates a pass that cleans up device attributes from all ops. +std::unique_ptr> +CreateTfDeviceCleanupPass(); + +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc" // IWYU pragma: keep + +// Register all passes. +void RegisterTfIfrtPasses(); + +// Setup the input pass manager to enable IR dumping after each pass. +// Note a side effect of this method is that multi threading will be disabled. +void EnablePassIRPrinting(mlir::PassManager& pm, + const std::string& dump_group_name, + llvm::StringRef module_name); + +// Convert tf_device.cluster_func to tf.ifrt_program_call. +// The callee function is converted to a ifrt_program. +absl::Status RunClusterToIfrtRuntimeOpsPassPipeline( + mlir::ModuleOp module, llvm::StringRef module_name = llvm::StringRef()); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_TF_IFRT_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h new file mode 100644 index 00000000..6ed9f1e9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h @@ -0,0 +1,32 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Create a pass that assigns an op_key to every fallback OP. The op_key +// provides a uniform key to look up online cost for a specific op. +// This pass is expected to run before parallerization. +std::unique_ptr> CreateAssignOpKeyPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/async_while.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/async_while.h new file mode 100644 index 00000000..684e8dd1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/async_while.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASYNC_WHILE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASYNC_WHILE_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Creates a pass that converts applicable tf.While to tf_mlrt.AsyncWhile. +// tf_mlrt.AsyncWhile dispatch iterations asynchronously, thus allowing +// pipelining between iterations to reduce latency. This is intended for +// tf.While that is not converted from tf.MapFn, but still can benefit from +// asynchronous execution of iterations to reduce latency. +std::unique_ptr> CreateAsyncWhilePass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASYNC_WHILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h new file mode 100644 index 00000000..93dde814 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +class ExecuteOpRegistry { + public: + mlir::LogicalResult RegisterExecuteOp(mlir::Operation* op, uint32_t op_key) { + if (op_key >= execute_ops_.size()) { + execute_ops_.resize(op_key + 1); + } + if (auto* register_op = execute_ops_[op_key]) { + if (register_op->getName() != op->getName() || + register_op->getAttrs() != op->getAttrs()) { + return op->emitError() << "Key " << op_key << " already registered."; + } + return mlir::success(); + } + execute_ops_[op_key] = op; + return mlir::success(); + } + + void ReplaceExecuteOp(int64_t key, mlir::Operation* op) { + execute_ops_[key] = op; + } + + llvm::ArrayRef GetExecuteOps() const { + return execute_ops_; + } + + private: + // Using a vector to keep fallback ops in order, and the key for a fallback op + // is its corresponding index here. + llvm::SmallVector execute_ops_; +}; + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h new file mode 100644 index 00000000..6f772a89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateFuseMlrtOpPass(); + +} +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/ifrt_set_tpu_host_allocator.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/ifrt_set_tpu_host_allocator.h new file mode 100644 index 00000000..ddd8ee0a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/ifrt_set_tpu_host_allocator.h @@ -0,0 +1,34 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
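A short sketch of the `ExecuteOpRegistry` contract above: re-registering a key succeeds only if the op's name and attributes match the op already stored, and `ReplaceExecuteOp` repoints a key after a rewrite. The wrapper names are illustrative.

#include <cstdint>

#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h"

// Records `op` under `op_key`; emits an error on `op` if the key is already
// bound to a different op.
static mlir::LogicalResult RecordExecuteOp(
    tensorflow::mlrt_compiler::ExecuteOpRegistry &registry,
    mlir::Operation *op, uint32_t op_key) {
  return registry.RegisterExecuteOp(op, op_key);
}

// After a rewrite, keep the key pointing at the replacement op.
static void RepointExecuteOp(
    tensorflow::mlrt_compiler::ExecuteOpRegistry &registry, int64_t op_key,
    mlir::Operation *new_op) {
  registry.ReplaceExecuteOp(op_key, new_op);
}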
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IFRT_SET_TPU_HOST_ALLOCATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IFRT_SET_TPU_HOST_ALLOCATOR_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Creates a pass that set tpu input producers to use tpu host allocators. +std::unique_ptr> +CreateIfrtSetTpuHostAllocatorPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IFRT_SET_TPU_HOST_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h new file mode 100644 index 00000000..1258c953 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h @@ -0,0 +1,54 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" + +namespace tensorflow { +namespace mlrt_compiler { + +// Converts an MLIR `module` in TF dialect to MLRT's bytecode format. If +// `module_with_op_keys` is non-null, the intermediate module on which passes +// until (including) AssignOpKeyPass have run will be cloned to it. +// +// This is for initial conversion. 
+absl::StatusOr ConvertTfMlirToBytecode( + const TfrtCompileOptions& options, tfrt_stub::FallbackState& fallback_state, + mlir::ModuleOp module, tfrt_stub::ModelRuntimeContext& model_context, + mlir::OwningOpRef* module_with_op_keys = nullptr, + std::vector* added_xla_function_names = nullptr); + +// Converts an MLIR `module_with_op_keys` in TF dialect to MLRT's bytecode +// format, with op costs from `cost_recorder`. +// +// This is for re-conversion. +absl::StatusOr ConvertTfMlirWithOpKeysToBytecode( + const TfrtCompileOptions& options, + const tfrt_stub::FallbackState& fallback_state, + mlir::ModuleOp module_with_op_keys, + const tfrt_stub::CostRecorder& cost_recorder); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/mlrt_device_constants.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/mlrt_device_constants.h new file mode 100644 index 00000000..3c2c588a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/mlrt_device_constants.h @@ -0,0 +1,27 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_MLRT_DEVICE_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_MLRT_DEVICE_CONSTANTS_H_ + +namespace tensorflow { +namespace mlrt_compiler { + +inline constexpr char kTfMlrtCustomDevice[] = "tf_mlrt.custom_device"; +inline constexpr char kTpuHostDevice[] = "tpu_host_device"; + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_MLRT_DEVICE_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h new file mode 100644 index 00000000..71221276 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
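// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It shows the call shape of ConvertTfMlirToBytecode from
// mlrt/import_model.h above for the initial conversion. The StatusOr payload
// type is assumed to be mlrt::bc::Buffer (consistent with the bytecode
// include); options, fallback state, module, and model context are assumed to
// be built elsewhere.
#include "absl/status/statusor.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h"

static absl::StatusOr<mlrt::bc::Buffer> CompileToMlrtBytecode(
    const tensorflow::TfrtCompileOptions& options,
    tensorflow::tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module,
    tensorflow::tfrt_stub::ModelRuntimeContext& model_context) {
  // The optional op-key module and XLA function-name outputs are omitted.
  return tensorflow::mlrt_compiler::ConvertTfMlirToBytecode(
      options, fallback_state, module, model_context);
}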
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateParallelizationPass( + uint64_t cost_threshold, bool merge_inter_dependent_streams, + const tfrt_stub::CostRecorder* cost_recorder = nullptr); + +std::unique_ptr> +CreateParallelizationPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h new file mode 100644 index 00000000..f9bf621b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ + +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" + +namespace tensorflow { +namespace mlrt_compiler { + +void RegisterMlrtPasses(); + +// Creates a pipeline of passes that lowers MLIR TF dialect to MLRT dialects. +// The op costs from `cost_recorder` (if non-null) are used for Stream Analysis. +void CreateTfToMlrtPipeline( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options, + const tfrt_stub::FallbackState* fallback_state, + const tfrt_stub::CostRecorder* cost_recorder = nullptr); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h new file mode 100644 index 00000000..1423011b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
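// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It wires the TF-to-MLRT pipeline declared in mlrt/passes.h above into
// an OpPassManager; `options`, `fallback_state`, and `cost_recorder` are
// assumed to exist, and pass registration is shown only for completeness.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h"

static void BuildTfToMlrtPipeline(
    mlir::OpPassManager& pm, const tensorflow::TfrtPipelineOptions& options,
    const tensorflow::tfrt_stub::FallbackState* fallback_state,
    const tensorflow::tfrt_stub::CostRecorder* cost_recorder) {
  // Usually done once at startup so the passes are visible to pass tooling.
  tensorflow::mlrt_compiler::RegisterMlrtPasses();
  // Recorded op costs (if any) feed the Stream Analysis used for scheduling.
  tensorflow::mlrt_compiler::CreateTfToMlrtPipeline(pm, options, fallback_state,
                                                    cost_recorder);
}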
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Creates a pass that converts tf.IfrtLoadVariableOp to +// tf_mlrt.TFIfrtLoadVariableOp and inserts tf_mlrt.Await on the returned future +// from tf_mlrt.TFIfrtLoadVariableOp if it is used by CPU ops. +std::unique_ptr> +CreateRewriteIfrtLoadVariablePass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h new file mode 100644 index 00000000..1206f66f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" + +namespace tensorflow { +namespace mlrt_compiler { + +// The conversion pass that is run before 'tf-mlrt-parallelization' passes. The +// parallelization pass changes the graph content, so any rewrite/conversion +// that depends on the graph instead of individual ops should be done before +// parallelization. +std::unique_ptr> +CreateTfToMlrtPreParallelizationConversionPass( + const TfrtPipelineOptions& options); + +// The conversion pass that is run after 'tf-mlrt-parallelization' passes. The +// parallelization pass changes the graph content, so this pass should only +// contain conversion that depends on individual ops. 
+std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions& options); + +std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions& options, + const tfrt_stub::FallbackState* fallback_state); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h new file mode 100644 index 00000000..20592c95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" + +namespace tensorflow { +namespace mlrt_compiler { + +void RegisterTpuDialect(mlir::DialectRegistry& registry); + +void PopulateTpuPreParallelizationConversionPatterns( + mlir::ConversionTarget& target, mlir::RewritePatternSet& patterns, + const TfrtPipelineOptions& options); + +void PopulateTpuConversionPatterns(mlir::ConversionTarget& target, + mlir::RewritePatternSet& patterns, + mlir::TypeConverter& type_converter, + ExecuteOpRegistry& execute_op_registry, + const TfrtPipelineOptions& options); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h new file mode 100644 index 00000000..c47471f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h @@ -0,0 +1,30 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
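// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It spells out the ordering implied by the comments in tf_to_mlrt.h
// above: graph-level conversion before parallelization, per-op conversion
// after. The no-argument CreateParallelizationPass() overload from
// parallelization.h is used here; real pipelines may pass explicit cost
// options instead.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h"

static void AddTfToMlrtConversionPasses(
    mlir::OpPassManager& pm, const tensorflow::TfrtPipelineOptions& options) {
  namespace mc = tensorflow::mlrt_compiler;
  pm.addPass(mc::CreateTfToMlrtPreParallelizationConversionPass(options));
  pm.addPass(mc::CreateParallelizationPass());  // restructures the graph
  pm.addPass(mc::CreateTfToMlrtConversionPass(options));
}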
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ + +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Use fallback by default for anything that does not have a native kernel +// with some exceptions. +bool UseFallback(mlir::Operation *op); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h new file mode 100644 index 00000000..a45c0387 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateWhileToMapFnPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/passes.h new file mode 100644 index 00000000..8dad2c71 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/passes.h @@ -0,0 +1,169 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +class PassManager; +} + +namespace tensorflow { + +namespace tfrt_compiler { + +// Create a pass to insert kernels that copy fallback tensors when they are +// passed to multiple threads, to avoid atomic contention on their refcounts. +std::unique_ptr> +CreateInsertFallbackTensorCopyPass(); + +// Create a pass to reorder tf.Assert ops or tf.If ops that contains only +// tf.Assert ops to the end of the function, to avoid unnecessary control +// dependencies to other ops. +std::unique_ptr> +CreateReorderTfAssertPass(); + +// Create a pass to optimize the side-effect of control flow ops. eg. if both +// branches of a tf.If op contains only non-side-effecting ops, its +// `is_stateless` attribute will be set to true. +std::unique_ptr> +CreateOptimizeTfControlFlowSideEffectPass(); + +// Create a pass to remove tf.If ops' operands that are produced by tf.Const +// ops. +std::unique_ptr> +CreateRemoveTfIfConstArgsPass(); + +// Create a pass to merge non-side-effecting tf.If ops that have the same +// operands. +std::unique_ptr> CreateMergeTfIfOpsPass(); + +// Create a pass to deduplicate the function invoked by tf.BatchFunction with +// the same shared_name. +std::unique_ptr> +CreateDeduplicateFunctionsInovkedByBatchFunctionPass(); + +// Create a pass to lower bound the number of threads in tf.BatchFunction. +struct ReconfigBatchOpPassOptions { + int64_t min_num_batch_threads = 1; + int64_t min_max_enqueued_batches = 1; + std::string batch_padding_policy = ""; + int64_t num_batch_threads = 0; + int64_t max_batch_size = 0; + int64_t batch_timeout_micros = 0; + llvm::ArrayRef allowed_batch_sizes = {}; + int64_t max_enqueued_batches = 0; +}; +std::unique_ptr> CreateReconfigBatchOpPass( + ReconfigBatchOpPassOptions options); + +// Create a pass to fuse the TPU Ops for TFRT. +std::unique_ptr> +CreateFuseTpuCompileAndExecutePass(); + +// Create a pass to optimize TF dialect for TFRT workflow. +std::unique_ptr> +CreateOptimizeTfForTfrtPass(); + +std::unique_ptr> CreateTfrtXlaRewritePass(); + +// Create a pass to deduplicate results of tf.If ops. +std::unique_ptr> +CreateDeduplicateIfResultPass(); + +} // namespace tfrt_compiler + +class CoreRTConverter; + +// Create a pass that sink in the var handle op to the callee function when +// proper. +std::unique_ptr> +CreateSinkInInvariantOpsPass(); + +// Create a pass that rewrites tf_saved_model dialect's ops according to TFRT's +// requirements. 
+std::unique_ptr> +CreateLowerTFSavedModelPass(bool hoist_invariant_ops, + bool fuse_get_resource_ops); + +// Create a pass that converts ref variables to resource variables in a limited +// number of cases. +std::unique_ptr> +CreateConvertReferenceVariableToResourceVariablePass(); + +// Run *ToCoreRTConversionPassRun as free functions. Useful for +// reusing the pass logic in a custom pass with additional conversions. +mlir::LogicalResult TFSavedModelToCoreRTConversionPassRun( + mlir::MLIRContext* context, mlir::func::FuncOp func, + mlir::ConversionTarget* target, mlir::RewritePatternSet* patterns, + CoreRTConverter* corert_converter); + +// Create an operation pass that removes the device attribute from every +// corert.executeop. +std::unique_ptr> +CreateRemoveDeviceAttributePass(); + +// Create an operation pass that inserts corert.transfer op to make sure any +// argument of any op is on the same device of the op itself. +std::unique_ptr> +CreateCrossDeviceTransferPass(); + +// Create a pass that converts MLIR TF dialect to MLIR TFRT dialect. +std::unique_ptr> +CreateTfToTfrtConversionPass(const TfrtPipelineOptions& options); + +// Creates a pipeline of passes that lowers MLIR TF dialect to TFRT dialects. +void CreateTfToTfrtPipeline(mlir::OpPassManager& pm, + const TfrtPipelineOptions& options); + +// Creates a pipeline of passes that lowers MLIR TF dialect from tf.function to +// TFRT dialect. SavedModel related conversions are not included. +absl::Status CreateTfExecutorToTfrtPipeline(mlir::PassManager& pm, + const TfrtPipelineOptions& options); + +// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF +// dialect for CoreRT purposes. +absl::Status CreateTFExecutorToTFPipeline(mlir::PassManager& pm, + const TfrtPipelineOptions& options); + +// TODO(deqiangc): refactor below helpers once mlrt is OSSed. +void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options); +void CreateTFExecutorToTFInvariantOptimizationPipelineHelper( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options); + +absl::Status CreateTFExecutorToTFPreInvariantOptimizationPipeline( + mlir::PassManager& pm, const TfrtPipelineOptions& options); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h new file mode 100644 index 00000000..44929772 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
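// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It fills in a ReconfigBatchOpPassOptions from transforms/passes.h
// above and adds the resulting pass; the concrete values are examples only.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h"

static void AddReconfigBatchOpPass(mlir::OpPassManager& pm) {
  tensorflow::tfrt_compiler::ReconfigBatchOpPassOptions opts;
  opts.min_num_batch_threads = 4;    // lower-bound the batch thread pool
  opts.batch_timeout_micros = 1000;  // flush incomplete batches after 1 ms
  pm.addPass(tensorflow::tfrt_compiler::CreateReconfigBatchOpPass(opts));
}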
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_SET_SHAPE_INVARIANT_IN_WHILE_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_SET_SHAPE_INVARIANT_IN_WHILE_OPS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tfrt_compiler { + +// Create a pass to set shape_invariant attribute for all tf.While ops except +// those are on TPU. +std::unique_ptr> +CreateSetShapeInvariantInWhileOps(); + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_SET_SHAPE_INVARIANT_IN_WHILE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h new file mode 100644 index 00000000..2588d0f8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h @@ -0,0 +1,191 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_PIPELINE_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_PIPELINE_OPTIONS_H_ + +#include +#include + +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" + +namespace tensorflow { + +struct TfrtPipelineOptions + : public mlir::PassPipelineOptions { + Option saved_model_dir{*this, "saved-model-dir", + llvm::cl::desc(""), llvm::cl::init("")}; + Option default_device{ + *this, "default-device", llvm::cl::desc("default device assignment"), + llvm::cl::init("/job:localhost/replica:0/task:0/device:CPU:0")}; + Option enable_optimizer{ + *this, "enable-optimizer", + llvm::cl::desc("run optimization passes on corert dialect"), + llvm::cl::init(false)}; + Option decompose_resource_ops{ + *this, "decompose-resource-ops", + llvm::cl::desc("decompose composite resource ops into ReadVariableOp and " + "non-resource ops. This is currently used in TFRT " + "savedmodel pipeline."), + llvm::cl::init(false)}; + Option force_data_format{ + *this, "force-data-format", + llvm::cl::desc("force data format for all layout sensitive operations")}; + // TODO(tfrt-devs): consider making compiler to figure out whether to fold + // transpose or not instead of exposing the specific option. 
+ Option skip_fold_transpose_in_ops{ + *this, "skip-fold-transpose-in-ops", + llvm::cl::desc("Skip folding transpose operands in Ops which can support " + "different layouts.")}; + Option target_tpurt{*this, "target-tpurt", + llvm::cl::desc("target TPURT dialect if true"), + llvm::cl::init(false)}; + Option tpu_use_core_selector{ + *this, "tpu-use-core-selector", + llvm::cl::desc("If true, use ServingCoreSelector to pick TPU core. " + "Otherwise, use the assigned core. Currently we use " + "core selector for Servo serving use cases."), + llvm::cl::init(true)}; + Option tpu_use_bundled_transfer{ + *this, "tpu-use-bundled-transfer", + llvm::cl::desc("If true, use BundledTransferToTpuOp to transfer " + "variables and input tensors to TPU."), + llvm::cl::init(true)}; + Option tpu_lower_to_fallback{ + *this, "tpu-lower-to-fallback", + llvm::cl::desc("If true, lower an TF op that's placed on TPU device " + "to be executed by tfrt_fallback.execute."), + llvm::cl::init(true)}; + Option tpu_fuse_ops{ + *this, "tpu-fuse-ops", + llvm::cl::desc("If true, use the TPU fused compile_and_execute kernel"), + llvm::cl::init(false)}; + // TODO(b/194081364): remove this option once we unify servo TPU serving + // result transfer behavior. + Option tpu_transfer_result_to_host{ + *this, "tpu-transfer-result-to-host", + llvm::cl::desc("If true, transfer the result of tpurt.execute from TPU " + "to host."), + llvm::cl::init(true)}; + Option use_tpu_host_allocator_for_inputs{ + *this, "use-tpu-host-allocator-for-inputs", + llvm::cl::desc("If true, fallback executeops that produce inputs to tpu " + "program will use tpu host allocator."), + llvm::cl::init(false)}; + Option tpu_allow_unpadded_batch{ + *this, "tpu-allow-unpadded-batch", + llvm::cl::desc("To allow unpadded batch for TPU execution."), + llvm::cl::values( + clEnumValN(TfrtCompileOptions::TpuAllowUnpaddedBatch::kDisabled, + "disabled", "Disable this feature."), + clEnumValN(TfrtCompileOptions::TpuAllowUnpaddedBatch::kAuto, "auto", + "Enable this feature when in-graph batching is detected."), + clEnumValN(TfrtCompileOptions::TpuAllowUnpaddedBatch::kEnforced, + "enforced", "Force to enable this feature.")), + llvm::cl::init(TfrtCompileOptions::TpuAllowUnpaddedBatch::kDisabled)}; + + Option target_gpu{ + *this, "target-gpu", + llvm::cl::desc("If true, target GPU compiler passes."), + llvm::cl::init(false)}; + + // TODO(b/294895431): Remove the flag and default to the fused op. + Option use_gpu_compile_and_execute_op{ + *this, "use-gpu-compile-and-execute-op", + llvm::cl::desc("If true, gpurt.compile_and_execute is used for GPU"), + llvm::cl::init(false)}; + + Option enable_while_parallel_iterations{ + *this, "enable-while-parallel-iterations", + llvm::cl::desc("If true, tf.While op will be parallelized. 
This is " + "currently experimental."), + llvm::cl::init(false)}; + + Option hoist_invariant_ops{ + *this, "hoist-invariant-ops", + llvm::cl::desc("If true, invariant ops in savedmodels will be hoisted " + "out to run during loading."), + llvm::cl::init(false)}; + + Option fuse_get_resource_ops_in_hoisting{ + *this, "fuse-get-resource-ops-in-hoisting", + llvm::cl::desc("If true, get_resource_op will be fused during hoisting"), + llvm::cl::init(true)}; + + Option sink_in_invariant_ops{ + *this, "sink-in-invariant-ops", + llvm::cl::desc("If true, sink the selected invariant ops in to the " + "nested functions to facilitate invariant ops hoisting."), + llvm::cl::init(false)}; + + Option cost_threshold{ + *this, "tfrt-cost-threshold", + llvm::cl::desc( + "The cost threshold to decide whether a sequence of operations is " + "cheap, and then whether it can be executed inline."), + llvm::cl::init(1)}; + + Option min_num_batch_threads{ + *this, "tfrt-min-num-batch-threads", + llvm::cl::desc("The minimum number of batch threads"), llvm::cl::init(1)}; + + Option min_max_enqueued_batches{ + *this, "tfrt-min-max-enqueued-batches", + llvm::cl::desc( + "The minimum of the maximum number of outstanding enqueued batches"), + llvm::cl::init(1)}; + + Option batch_padding_policy{ + *this, "tfrt-batch-padding-policy", + llvm::cl::desc("The policy used when padding (or splitting) batches."), + llvm::cl::init("")}; + + Option num_batch_threads{ + *this, "tfrt-num-batch-threads", + llvm::cl::desc( + "The number of threads for processing batches in parallel"), + llvm::cl::init(0)}; + + Option max_batch_size{ + *this, "tfrt-max-batch-size", + llvm::cl::desc("The maximum allowed batch size"), llvm::cl::init(0)}; + + Option batch_timeout_micros{ + *this, "tfrt-batch-timeout-micros", + llvm::cl::desc("The maximum number of microseconds before outputting an " + "incomplete batch"), + llvm::cl::init(0)}; + + ListOption allowed_batch_sizes{ + *this, "tfrt-allowed-batch-sizes", + llvm::cl::desc("Allowed sizes for padding (or splitting) batches")}; + + Option max_enqueued_batches{ + *this, "tfrt-max-enqueued-batches", + llvm::cl::desc("The maximum number of batches enqueued for processing " + "before requests are failed fast"), + llvm::cl::init(0)}; + + Option merge_inter_dependent_streams{ + *this, "tfrt-merge-inter-dependent-streams", + llvm::cl::desc("If true, streams with inter data depenedencies will be " + "preferred to be merged for inline execution."), + llvm::cl::init(false)}; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_PIPELINE_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h new file mode 100644 index 00000000..3cae00e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
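// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. Because TfrtPipelineOptions above derives from
// mlir::PassPipelineOptions, a registration along these lines (the
// "tfrt-pipeline" name is hypothetical) lets the flags defined above be parsed
// from a textual pipeline such as
//   --pass-pipeline="tfrt-pipeline{target-tpurt=true tfrt-cost-threshold=16}"
#include "mlir/Pass/PassRegistry.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h"

static mlir::PassPipelineRegistration<tensorflow::TfrtPipelineOptions>
    tfrt_pipeline_registration(
        "tfrt-pipeline", "Lowers TF dialect to TFRT dialects (example only)",
        [](mlir::OpPassManager& pm,
           const tensorflow::TfrtPipelineOptions& options) {
          tensorflow::CreateTfToTfrtPipeline(pm, options);
        });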
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TPU_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TPU_PASSES_H_ + +// This file contains stub implementations for Google internal TPU APIs. + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Pass/PassOptions.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" + +namespace tensorflow { + +class CoreRTConverter; + +namespace tfrt_compiler { + +class FallbackConverter; + +} + +struct TfrtTpuCompileOptions + : mlir::PassPipelineOptions { + Option move_resource_gather_to_host{ + *this, "move-resource-gather-to-host", + llvm::cl::desc("Move resource gather ops to host"), + llvm::cl::init(false)}; + Option gather_table_width_threshold_bytes{ + *this, "gather-table-width-threshold-bytes", + llvm::cl::desc( + "The threshold to control whether a TPU resource gather op should be " + "moved to host. A negative values means all are moved."), + llvm::cl::init(-1)}; +}; + +struct TfrtTpuExecuteOpConversionOptions { + bool use_core_selector = false; + bool use_bundled_transfer = false; + bool transfer_result_to_host = false; + bool use_tpu_host_allocator_for_inputs = false; + TfrtCompileOptions::TpuAllowUnpaddedBatch allow_unpadded_batch = + TfrtCompileOptions::TpuAllowUnpaddedBatch::kDisabled; +}; + +// Registers a set of dialects used in TFRT TPU lowering. +inline void RegisterTPUDialects(mlir::DialectRegistry *registry) {} + +// Adds a target dialect and a set of rewrite patterns for TFRT TPU lowering. +inline void AddTPUTargetDialectAndPatterns( + mlir::ConversionTarget *target, mlir::RewritePatternSet *patterns, + mlir::MLIRContext *context, CoreRTConverter *corert_converter, + tfrt_compiler::FallbackConverter *fallback_converter, + const TfrtTpuExecuteOpConversionOptions &tpu_exec_conv_opts, + bool tpu_lower_to_fallback) {} + +// Rewrites specific TF TPU ops to equivalent TF ops in a module. +inline mlir::LogicalResult RunTPUBackwardCompatConversion( + mlir::ModuleOp module, const TfrtTpuCompileOptions &options) { + return mlir::failure(); +} + +// The rewrite rules to support the fallback execution of TPUPartitionedCallOp. +inline mlir::LogicalResult RunTPUPartitionedCallFallbackCompatConversion( + mlir::ModuleOp module) { + return mlir::failure(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TPU_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.h new file mode 100644 index 00000000..99b7c192 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
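// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. The OSS stubs in tpu_passes.h above unconditionally return
// mlir::failure(), so a caller would normally gate them on the TPU target
// option, roughly as below.
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h"

static mlir::LogicalResult MaybeRunTpuBackwardCompat(
    mlir::ModuleOp module, const tensorflow::TfrtTpuCompileOptions& options,
    bool target_tpurt) {
  if (!target_tpurt) return mlir::success();  // nothing to do off-TPU
  return tensorflow::RunTPUBackwardCompatConversion(module, options);
}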
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UPDATE_OP_COST_IN_TFRT_MLIR_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UPDATE_OP_COST_IN_TFRT_MLIR_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" + +namespace tensorflow { +namespace tfrt_compiler { + +// Updates the existing costs for all the fallback ops with the records in +// `cost_recorder`. +void UpdateOpCostInTfrtMlir(mlir::ModuleOp op, + const tfrt_stub::CostRecorder& cost_recorder); + +} // namespace tfrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UPDATE_OP_COST_IN_TFRT_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/utils.h new file mode 100644 index 00000000..0b94fc79 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/transforms/utils.h @@ -0,0 +1,46 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UTILS_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project + +namespace tensorflow { + +// Checks if the given `value` is a resource argument. +bool IsResourceArgument(mlir::Value value); + +// Checks if an operand is the value of a variable. +bool IsResultVariable(const mlir::Value &original_operand, + const mlir::Value &operand); + +// Canonicalize the symbol attr to the original TF function name. +std::optional CanonicalizeTensorflowFunctionName( + const mlir::SymbolTable &symbol_table, absl::string_view mlir_func_name, + bool use_mlir_func_name = false); + +// Returns true if the function is a session initializer in tf_saved_model +// dialect. +bool IsSessionInitializer(mlir::func::FuncOp op); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/import_model.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/import_model.h new file mode 100644 index 00000000..9459f90c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/import_model.h @@ -0,0 +1,73 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
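// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It shows the re-compilation hook from update_op_cost_in_tfrt_mlir.h
// above: costs measured at runtime are written back onto the fallback ops of
// an already-lowered module. `module` and `cost_recorder` are assumed inputs.
#include "tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.h"

static void RefreshRecordedOpCosts(
    mlir::ModuleOp module,
    const tensorflow::tfrt_stub::CostRecorder& cost_recorder) {
  tensorflow::tfrt_compiler::UpdateOpCostInTfrtMlir(module, cost_recorder);
}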
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ + +#include +#include +#include + +#include "absl/functional/function_ref.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/function/function.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime + +namespace tensorflow { + +struct FunctionBody; + +// Converts an MLIR `module` in TF dialect to TFRT's Binary Executable Format. +// If `fallback_state` is not null, the MLIR functions for XLA clusters in +// the form of XlaLaunch will be exported and added to the function library when +// needed. The nested functions will also be exported. If +// `added_xla_function_names` is not null, it will be populated with the names +// of the added XLA functions. +absl::Status ConvertTfMlirToBef( + const TfrtCompileOptions& options, mlir::ModuleOp module, + tfrt::BefBuffer* bef_buffer, tfrt_stub::ModelRuntimeContext& model_context, + tfrt_stub::FallbackState* fallback_state = nullptr, + std::vector* added_xla_function_names = nullptr); + +absl::Status ConvertTfMlirToRuntimeExecutable( + const TfrtCompileOptions& options, mlir::ModuleOp module, + absl::FunctionRef< + absl::Status(mlir::PassManager&, mlir::ModuleOp, + const tensorflow::TfrtPipelineOptions& options)> + emit_executable, + tfrt_stub::ModelRuntimeContext& model_context, + tfrt_stub::FallbackState* fallback_state = nullptr, + std::vector* added_xla_function_names = nullptr); + +std::unique_ptr GetTfrtPipelineOptions( + const TfrtCompileOptions& options); + +// Adds MLIR functions for XLA clusters to the function library. +absl::Status AddXlaFunctions( + tfrt_stub::FallbackState* fallback_state, mlir::ModuleOp mlir_module, + std::vector* added_xla_function_names = nullptr); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h new file mode 100644 index 00000000..95086564 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h @@ -0,0 +1,135 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
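// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It shows the call shape of ConvertTfMlirToBef from
// translate/import_model.h above; the optional fallback state and XLA
// function-name outputs are left at their defaults, and all arguments are
// assumed to be constructed elsewhere.
#include "absl/status/status.h"
#include "tensorflow/compiler/mlir/tfrt/translate/import_model.h"

static absl::Status CompileModuleToBef(
    const tensorflow::TfrtCompileOptions& options, mlir::ModuleOp module,
    tfrt::BefBuffer* bef_buffer,
    tensorflow::tfrt_stub::ModelRuntimeContext& model_context) {
  return tensorflow::ConvertTfMlirToBef(options, module, bef_buffer,
                                        model_context);
}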
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace mlrt { + +class ModuleEmitterContext; + +// Defines a custom attribute encoding registry. Users can register custom +// attribute encoding for their dialects in this registry. If no custom encoder +// is registered for a dialect, the default encoding with a limited support, the +// EncodeSimpleAttribute() below, will be used. +class AttributeEncoderRegistry { + public: + using EncoderFn = std::function( + const ModuleEmitterContext&, mlir::Attribute)>; + + void Register(absl::string_view dialect, EncoderFn encoder) { + encoders_[dialect] = std::move(encoder); + } + + // Returns the encoder for the specified dialect. It can be nullptr if it is + // not registered for this dialect. The returned reference will be invalidated + // if Register() is called. 
+ const EncoderFn* Get(absl::string_view dialect) const { + auto iter = encoders_.find(dialect); + if (iter != encoders_.end()) return &iter->second; + return nullptr; + } + + private: + absl::flat_hash_map encoders_; +}; + +class ModuleEmitterContext { + public: + explicit ModuleEmitterContext( + const AttributeEncoderRegistry* attribute_encoder_registry) + : attribute_encoder_registry_(*attribute_encoder_registry) {} + + void AddKernelName(std::string name) { + AddData(std::move(name), kernels_, kernel_id_map_); + } + + int GetKernelId(llvm::StringRef name) const { + return kernel_id_map_.at(name); + } + + absl::Status AddAttribute(mlir::Operation* op, mlir::Attribute attr); + + int GetAttributeId(mlir::Attribute attr) const { + return attribute_id_map_.lookup(attr); + } + + int AddFunction(mlir::func::FuncOp func); + + int GetFunctionId(absl::string_view name) const { + return function_name_id_map_.at(name); + } + + absl::Span kernels() const { return kernels_; } + absl::Span attributes() const { return attributes_; } + absl::Span functions() const { return functions_; } + + private: + int AddData(std::string data, std::vector& data_vector, + absl::flat_hash_map& data_map) { + auto iter = data_map.find(data); + if (iter != data_map.end()) return iter->second; + + int id = data_vector.size(); + data_map[data] = id; + data_vector.push_back(std::move(data)); + return id; + } + + absl::StatusOr DefaultEncodeAttribute(mlir::Attribute attr); + + const AttributeEncoderRegistry& attribute_encoder_registry_; + + std::vector kernels_; + absl::flat_hash_map kernel_id_map_; + + std::vector attributes_; + llvm::DenseMap attribute_id_map_; + absl::flat_hash_map attribute_data_id_map_; + + std::vector functions_; + absl::flat_hash_map function_name_id_map_; +}; + +// Encodes a few simple attributes. Users can use this function in their custom +// attribute encoder. +std::optional EncodeSimpleAttribute( + const ModuleEmitterContext& module_context, mlir::Attribute attr); + +absl::StatusOr EmitExecutable( + const AttributeEncoderRegistry& attribute_encoder_registry, + mlir::ModuleOp module); + +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h new file mode 100644 index 00000000..6140c711 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h @@ -0,0 +1,119 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
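// Editor's illustrative sketch -- not part of the vendored headers in this
// patch. It registers a custom attribute encoder for a hypothetical
// "my_dialect" and emits bytecode via EmitExecutable from mlir_to_bytecode.h
// above. The encoder signature (absl::StatusOr<std::string>) and the buffer
// payload type are inferred from the surrounding declarations and may differ
// in detail.
#include <string>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h"

static absl::StatusOr<mlrt::bc::Buffer> EmitWithCustomEncoder(
    mlir::ModuleOp module) {
  mlrt::AttributeEncoderRegistry registry;
  registry.Register(
      "my_dialect",
      [](const mlrt::ModuleEmitterContext& ctx,
         mlir::Attribute attr) -> absl::StatusOr<std::string> {
        // Fall back to the default encoding where it is supported.
        if (auto encoded = mlrt::EncodeSimpleAttribute(ctx, attr)) {
          return *encoded;
        }
        return absl::InvalidArgumentError("unsupported my_dialect attribute");
      });
  return mlrt::EmitExecutable(registry, module);
}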
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/tfrt/graph_executor/sync_resource_state.h" +#include "tensorflow/core/tfrt/mlrt/attribute/attribute.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/kernel.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" +#include "tensorflow/core/tfrt/stubs/tfrt_native_lowering_stub.h" +#include "tensorflow/core/tfrt/utils/tensor_util.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/host_allocator.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime +#include "tfrt/support/string_util.h" // from @tf_runtime +#include "tfrt/tensor/dense_host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/dense_tensor_utils.h" // from @tf_runtime + +namespace mlrt { +namespace testing { + +absl::StatusOr EncodeAttribute(const tensorflow::AttrValue& attr); + +absl::Status EncodeAttributes(AttributeTable& attributes, + const tensorflow::AttrValueMap& attr_map); + +absl::StatusOr>> +CreateKernelAndAttrs(int num_inputs, int num_outputs, + mlrt::ExecutionContext& exec_ctx, mlrt::bc::Buffer* buffer, + const tensorflow::AttrValueMap& attrs = {}); + +template +absl::Status TestMlrtKernel( + absl::string_view kernel_name, absl::Span regs, + tfrt::HostContext* host, int num_inputs, int num_outputs, + absl::Span expected_outputs, + mlrt::KernelRegistry* registry, bool approx_equal = false, + const tensorflow::AttrValueMap& attrs = {}) { + mlrt::ExecutionContext execution_context(nullptr); + + mlrt::bc::Buffer buffer; + TF_ASSIGN_OR_RETURN(auto kernel_and_attrs, + CreateKernelAndAttrs(num_inputs, num_outputs, + execution_context, &buffer, attrs)); + + tensorflow::tfrt_stub::SyncResourceState sync_resource_state; + tfrt::AddSyncContext(execution_context, *host, &sync_resource_state); + + auto kernel_fn = registry->Get(kernel_name); + mlrt::KernelFrame::State state(regs, kernel_and_attrs.second, + &execution_context); + mlrt::KernelFrame frame(&state); + frame.set_kernel(kernel_and_attrs.first); + + kernel_fn(frame); + + TF_RETURN_IF_ERROR(execution_context.status()); + + for (int i = 0, j = num_inputs; i < expected_outputs.size(); ++i, ++j) { + const auto& expected_output = expected_outputs[i]; + auto expected_dht = tfrt::ConvertTfTensorToDHT(expected_output); + if (!expected_dht) { + return absl::InternalError(tfrt::StrCat(expected_dht.takeError())); + } + + if (!approx_equal) { + if (!tfrt::TensorEqual(regs[j].Get(), + *expected_dht)) { + return absl::InternalError( + absl::StrCat("wrong result for ", kernel_name)); + } + } else { + if (!tfrt::TensorApproxEqual(regs[j].Get(), + 
*expected_dht)) { + return absl::InternalError( + absl::StrCat("wrong result for ", kernel_name)); + } + } + } + + return absl::OkStatus(); +} + +} // namespace testing +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h new file mode 100644 index 00000000..e75fdc35 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h @@ -0,0 +1,190 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_TFRT_COMPILE_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_TFRT_COMPILE_OPTIONS_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class BackendCompiler; + +enum class TfrtDeviceInfraTarget { + kCpu, // CPU only, no device support. + kTpurt, // Target TPURT dialect and kernels. + kTfFallback, // Target TPU kernels in TF Fallback. + kBridgeFallback, // TPU support but choose kTpurt or kTfFallback depending on + // whether the graph has unsupported feature in Bridge. + kGpu, // Target GPU specific compiler passes and runtime + // initializations. +}; + +std::ostream& operator<<(std::ostream& os, TfrtDeviceInfraTarget device_target); + +struct TfrtCompileOptions { + std::string saved_model_dir; + // TODO(tfrt-devs): Ideally, compiler should make the decision where + // to place the variable. + std::string variable_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + std::string default_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + + // Enable compiler optimization in TFRT dialect. + bool enable_optimizer = true; + + // If true, run grappler passes before compiling. + bool enable_grappler = true; + + // Graph rewrite options that will be applied on GraphDef before converting to + // MLIR. + GraphOptions graph_options; + + // Force data format for all layout sensitive operations, eg. setting it to + // "NHWC" will changes all data format in the graph to "NHWC" by inserting + // or removing related tf.Transpose op. Currently the supported formats are + // "NHWC" and "NCHW". + // + // TODO(tfrt-devs): Ideally compiler should figure out whether the + // data format should be changed, instead of controlled by users. + std::string force_data_format; + + // The target device infrastructure to use. This will trigger target specific + // compiler passes and runtime initialization. + TfrtDeviceInfraTarget device_target = TfrtDeviceInfraTarget::kCpu; + + // The custom compiler for device compilation. Instead of using the enum above + // to choose predefined device target, users can use this `backend_compiler` + // to inject their customized implementation. 
+ BackendCompiler* backend_compiler = nullptr; + + // If true, use the fused TPU compile_and_execute kernel, which performs all + // TPU inference related operations, e.g. core selection, h2d/d2h transfers, + // compile and execute. + bool tpu_fuse_ops = false; + + // If true, resource gather ops in the device graph are moved to host graphs + // in order to saved TPU memory usage. This option is experimental. + bool tpu_move_resource_gather_to_host = false; + + // The threshold in bytes that controls whether a resource gather op on TPU + // should be moved to host. A negative value means there is no threshold. This + // option is experimental. + int64_t tpu_gather_table_width_threshold_bytes = -1; + + // If true, fallback executeops that produce inputs to tpu program will use + // tpu host allocator. This options is experimental. + bool use_tpu_host_allocator_for_inputs = false; + + // To allow unpadded batch for TPU execution. + enum class TpuAllowUnpaddedBatch { + // Disable this feature. + kDisabled, + // Enable this feature when in-graph batching is detected. + kAuto, + // Force to enable this feature. + kEnforced, + }; + TpuAllowUnpaddedBatch tpu_allow_unpadded_batch = + TpuAllowUnpaddedBatch::kDisabled; + + // If true, the compiler will try to hoist invariant ops (e.g., const ops and + // their non-side-effecting consumers) to loading phase, which avoids the + // runtime cost during later running. + // TODO(tfrt-devs): Set the default value to true after testing as it is + // supposed to be turned on by default. + bool hoist_invariant_ops = false; + + // If true, get_resource_op will be fused during hoisting. + bool fuse_get_resource_ops_in_hoisting = true; + + // If true, the compiler will try to sink in the invariant ops (e.g. const + // ops, var handle ops, etc.) to the nested function (e.g. batch function) to + // facilitate invariant ops hoisting. + // TODO(tfrt-devs): Set the default value to true after testing as it is + // supposed to be turned on by default. + bool sink_in_invariant_ops = false; + + // This flag behaves differently for TFRT and MLRT. + // For TFRT, if true, tf.While's iterations will be parallelized on a + // best-effort basis. This is currently experimental. MLRT attempts to convert + // tf.while to tf_mlrt.map_fn regardless of this flag. For tf.While that + // cannot be converted tf_mlrt.map_fn, MLRT try to parallelize tf.while's + // iterations on a best-effort basis. + bool enable_while_parallel_iterations = false; + + // The cost threshold to decide whether a sequence of operations is cheap, and + // then whether it can be executed inline. If the cost is smaller than the + // threshold, it will be considered as cheap operations. Since the cost must + // be positive integers, setting the threshold to 1 makes all operations + // expensive. + uint64_t cost_threshold = 1; + + // The minimum number of batch threads. This number provides a lower bound on + // the number of batch threads on top of what is specified in the model. If + // the number of batch threads is too small (e.g. smaller than the number of + // parallel hardware accelerator available), it can lead to under utilization + // of resources. + int64_t min_num_batch_threads = 1; + + // The minimum of the maximum number of enqueued batches. This number provides + // a lower bound on top of what is specified in the model. If the number of + // max_enqueued_batches is too small, it can lead to under utilization of + // resources. 
+ int64_t min_max_enqueued_batches = 1; + + // The policy used by a BatchScheduler to pad (or split) batches. + std::string batch_padding_policy; + + // Batching parameters to be rewritten in the existing BatchFunction ops. + BatchingOptions batch_options; + + // If true, streams with inter data dependencies will be preferred to be + // merged for inline execution. + bool merge_inter_dependent_streams = true; + + // Whether to enable the DecomposeResourceOpsPass. + bool decompose_resource_ops = true; + + // Whether to compile to sync TFRT dialect. + bool compile_to_sync_tfrt_dialect = false; + + // Whether to use gpurt.compile_and_execute for GPU. + // TODO(b/294895431): Remove the flag and default to the fused op. + bool use_gpu_compile_and_execute_op = false; + + // If true, MLIR module will be serialized to aot_packages. + bool serialize_mlir_module_to_aot_packages = false; + + // Serialized MLIR module file under aot_packages. + std::string aot_mlir_module_file; + + // If true, BEF will be serialized to aot_packages. + bool serialize_bef_to_aot_packages = false; + + // Serialized BEF file under aot_packages. + std::string aot_bef_file; +}; + +std::ostream& operator<<(std::ostream& os, const TfrtCompileOptions& options); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_TFRT_COMPILE_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/export.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/export.h new file mode 100644 index 00000000..84f0e272 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/export.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_EXPORT_H_ + + +#include "absl/functional/any_invocable.h" +#include "absl/status/status.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/framework/function.pb.h" + +namespace tensorflow { + +// Exports every function in `module` into `tensorflow.FunctionDef` and calls +// `callback` for each `tensorflow.FunctionDef`. Modifies `module` in place to +// be suitable for FunctionDef export. +absl::Status ExportFunctionDefs( + mlir::ModuleOp module, + absl::AnyInvocable callback, + bool export_tf_original_func_name = true); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/host_context.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/host_context.h new file mode 100644 index 00000000..7b2e143d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tfrt/utils/host_context.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
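Illustrative sketch only, not part of the vendored header or this patch: populating the TfrtCompileOptions struct declared above. The field values are arbitrary examples rather than recommended settings.

tensorflow::TfrtCompileOptions MakeExampleCpuCompileOptions() {
  tensorflow::TfrtCompileOptions options;
  options.device_target = tensorflow::TfrtDeviceInfraTarget::kCpu;
  options.enable_grappler = true;       // run grappler passes before compiling
  options.hoist_invariant_ops = true;   // opt in; the header defaults to false
  options.cost_threshold = 1024;        // op sequences cheaper than this are inlined
  options.min_num_batch_threads = 4;    // lower bound on batch threads
  return options;
}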
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_HOST_CONTEXT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_HOST_CONTEXT_H_ + +#include +#include + +#include "absl/base/attributes.h" +#include "tfrt/host_context/host_context.h" // from @tf_runtime + +namespace tensorflow { + +// The name of the default host device for running fallback kernels. +ABSL_CONST_INIT extern const char* const kDefaultHostDeviceName; + +std::unique_ptr CreateSingleThreadedHostContext(); +std::unique_ptr CreateMultiThreadedHostContext( + int64_t num_threads); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_HOST_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h new file mode 100644 index 00000000..1241a73d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the TFFramework dialect. 
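Illustrative sketch only, not part of this patch: constructing a host context through the helpers declared in utils/host_context.h above. The thread count is an arbitrary example value.

std::unique_ptr<tfrt::HostContext> MakeExampleHostContext(bool single_threaded) {
  // Single-threaded contexts are convenient for deterministic unit tests; the
  // multi-threaded variant takes an explicit worker-thread count.
  if (single_threaded) return tensorflow::CreateSingleThreadedHostContext();
  return tensorflow::CreateMultiThreadedHostContext(/*num_threads=*/4);
}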
+// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ + +#include "absl/status/status.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeSupport.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_status.h.inc" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +/// OpKernelContextType corresponds to C++ class OpKernelContext defined in +/// tensorflow/core/framework/op_kernel.h +class OpKernelContextType + : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = + "kernel_gen.tf_framework.op_kernel_context"; +}; + +class JITCallableType + : public Type::TypeBase { + public: + using Base::Base; + static constexpr StringLiteral name = "kernel_gen.tf_framework.jit_callable"; +}; + +absl::StatusCode ConvertAttrToEnumValue(ErrorCode error_code); + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h new file mode 100644 index 00000000..8fa1f26d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- kernel_creator.h -----------------------------------------*- C++ -*-===// +// +// This file declares the function to compile a TF kernel function to gpu +// binary (hsaco for AMD, cubin for NVIDIA) or to a gpu binary with host side. 
+// +//===----------------------------------------------------------------------===// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { +namespace kernel_gen { + +// Parses tf_code to create a module. An MLIRContext is taken in case any +// unexpected dialects are needed. +absl::StatusOr> SetupContextAndParseModule( + mlir::MLIRContext& context, llvm::StringRef tf_code); + +// Converts TF code to LLVM with or without GPU support. +absl::StatusOr> GenerateKernelForHloCode( + mlir::MLIRContext& context, llvm::StringRef tf_code, + llvm::ArrayRef architectures, + llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, + bool print_ptx, bool print_llvmir, bool enable_ftz, bool index_64bit, + bool jit_compile, bool jit_i64_indexed_for_large_tensors, + bool apply_cl_options); + +} // namespace kernel_gen +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h new file mode 100644 index 00000000..66c84df4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_FRAMEWORK_C_INTERFACE_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_FRAMEWORK_C_INTERFACE_H_ + +#include +#include + +#include "mlir/ExecutionEngine/RunnerUtils.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_alloc( + void* op_kernel_ctx, size_t num_elements, size_t element_size, + int32_t output_index, int32_t num_candidates, + int32_t* candidate_input_indices); + +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_dealloc( + void* op_kernel_ctx, void* ptr); + +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_report_error( + void* op_kernel_ctx, int32_t error_code, char* msg); + +extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_jit_compile( + void* op_kernel_ctx, char* code, int64_t num_tile_sizes, + int64_t* tile_sizes_ptr, int64_t num_unroll_factors, + int64_t* unroll_factors_ptr, bool enable_ftz, bool index_64bit); + +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_jit_execute( + void* op_kernel_ctx, void* callable, void* result, int64_t num_args, + void* args_ptr); + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_FRAMEWORK_C_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h new file mode 100644 index 00000000..54d8b0dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h @@ -0,0 +1,98 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_GPU_RUNTIME_WRAPPERS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_GPU_RUNTIME_WRAPPERS_H_ + +#include "absl/container/flat_hash_map.h" +#include "mlir/ExecutionEngine/RunnerUtils.h" // from @llvm-project +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/hash.h" +#include "tsl/platform/thread_annotations.h" + +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" +#endif +#if TENSORFLOW_USE_ROCM +#include "rocm/include/hip/hip_runtime.h" +#endif + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +class GPURuntimeCache : public tensorflow::ResourceBase { + public: +#if GOOGLE_CUDA + using GPUModule = CUmodule; + using GPUFunction = CUfunction; +#endif +#if TENSORFLOW_USE_ROCM + using GPUModule = hipModule_t; + using GPUFunction = hipFunction_t; +#endif + + ~GPURuntimeCache() override; + static constexpr const char* kDefaultResourceName = "mlir-gpu-runtime-cache"; + static absl::Status Create(GPURuntimeCache** dst); + std::string DebugString() const override; + + // Assumes that no two modules are loaded from the same memory location over + // the lifetime of this cache. This allows to use the pointer as a key. All + // modules are unloaded on destruction of this cache. + GPUModule LookupOrLoadModule(void* data); + + GPUFunction LookupOrGetFunction(GPUModule module, const char* kernel_name); + + private: + struct FunctionKey { + GPUModule module; + const char* kernel_name; + + friend bool operator==(const FunctionKey& lhs, const FunctionKey& rhs) { + return lhs.module == rhs.module && lhs.kernel_name == rhs.kernel_name; + } + + struct Hash { + size_t operator()(const FunctionKey& key) const { + return tsl::Hash64Combine(tsl::hash()(key.module), + tsl::Hash64(key.kernel_name)); + } + }; + }; + + tensorflow::mutex mu_; + absl::flat_hash_map gpu_module_by_data_ptr_ + TF_GUARDED_BY(mu_); + absl::flat_hash_map + gpu_function_by_module_and_name_ TF_GUARDED_BY(mu_); +}; + +// Implements a C wrapper around the TensorFlow runtime and CUDA (or ROCm) +// library that allows launching a kernel on the current device and stream from +// a binary blob for the module and function name. +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_launch_kernel( + void* ctx, void* module_blob, char* kernel_name, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, void** params); + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_GPU_RUNTIME_WRAPPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h new file mode 100644 index 00000000..15d105ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h @@ -0,0 +1,59 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
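Illustrative sketch only, not part of this patch: the lookup flow of the GPURuntimeCache declared above, assuming a CUDA or ROCm build in which GPUModule and GPUFunction are defined. The module blob and kernel name are hypothetical, and resource reference counting is omitted for brevity.

absl::Status LoadExampleGpuKernel(void* module_blob, const char* kernel_name) {
  using mlir::kernel_gen::tf_framework::GPURuntimeCache;
  GPURuntimeCache* cache = nullptr;
  if (absl::Status s = GPURuntimeCache::Create(&cache); !s.ok()) return s;
  // The blob pointer doubles as the cache key, per the comment above.
  GPURuntimeCache::GPUModule module = cache->LookupOrLoadModule(module_blob);
  GPURuntimeCache::GPUFunction fn =
      cache->LookupOrGetFunction(module, kernel_name);
  (void)fn;  // Normally handed on to a launch call such as
             // _mlir_ciface_tf_launch_kernel.
  return absl::OkStatus();
}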
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_JIT_CACHE_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_JIT_CACHE_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" // from @llvm-project +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/thread_annotations.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +class JITCache : public tensorflow::ResourceBase { + public: + static constexpr const char* kDefaultResourceName = "mlir-jit-cache"; + static absl::Status Create(JITCache** dst); + + std::string DebugString() const override; + ExecutionEngine* LookupOrCompile( + std::string code, + std::function>()> + compile_callback); + size_t Size(); + + private: + tensorflow::mutex mu_; + absl::flat_hash_map> + execution_engine_by_key_ TF_GUARDED_BY(mu_); +}; + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TF_JIT_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h new file mode 100644 index 00000000..45e248ce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -0,0 +1,120 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/GPU/IR/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +#define GEN_PASS_DECL_TFKERNELTOLLVMPASS +#define GEN_PASS_DECL_EMBEDTFFRAMEWORKPASS +#define GEN_PASS_DECL_REWRITETFFRAMEWORKASSERT +#define GEN_PASS_DECL_FUNCTOJITINVOCATIONPASS +#define GEN_PASS_DECL_BUFFERREUSEPASS +#define GEN_PASS_DECL_SHAPETODESCRIPTORSPASS +#define GEN_PASS_DECL_KERNELGENFINALBUFFERIZEPASS +#define GEN_PASS_DECL_GPUKERNELTOBLOBPASS +#define GEN_PASS_DECL_PARALLELLOOPSTOSEQUENTIAL +#define GEN_PASS_DECL_PROPAGATETFABIKNOWLEDGETOKERNELS +#define GEN_PASS_DECL_PROPAGATESHAPEKNOWLEDGETOKERNELS +#define GEN_PASS_DECL_FUSEINNERPARALLELLOOPSPASS +#define GEN_PASS_DECL_COPYCLEANUPPASS + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +// Pass to replace some of the Standard ops with TF Framework ops. +// * adds tf_framework::OpKernelContextType argument to the function +// * std.alloc becomes tf_framework.alloc_raw +// * std.dealloc becomes tf_framework.dealloc_raw +// * std.assert becomes tf_framework.assert +std::unique_ptr> CreateEmbedTFFrameworkPass(); + +// Pass to convert tf_framework.assert operations to calls to +// tf_framework.report_error and create the required control flow to abort the +// function on failed execution. +std::unique_ptr> CreateRewriteTFFrameworkAssert(); + +} // namespace tf_framework + +namespace transforms { + +// Pass to find and annotate candidates for buffer reuse. +std::unique_ptr> CreateBufferReusePass(); + +// Pass to rewrite all functions to JIT invocations through the TF +// framework. +std::unique_ptr> CreateFuncToJITInvocationPass( + llvm::ArrayRef tile_sizes = {}, + llvm::ArrayRef unroll_factors = {}, bool enable_ftz = false, + bool index_64bit = false, bool cpu_codegen = false, + bool jit_i64_indexed_for_large_tensors = false); + +// Pass for applying LLVM legalization patterns. +std::unique_ptr> CreateTFKernelToLLVMPass( + mlir::StringRef blob_annotation = {}); + +// Pass to tranform shape computations in shape dialect to standard and scf +// using memref descriptors. +std::unique_ptr> CreateShapeToDescriptorsPass(); + +// Pass to convert scf::ParallelOp to scf::ForOp. +std::unique_ptr> CreateParallelLoopsToSequential(); + +// Pass to annotate GPU Module with its PTX. +std::unique_ptr> CreateGpuKernelToBlobPass( + mlir::StringRef blob_annotation = {}, + ArrayRef architectures = {}, bool print_ptx = false, + bool print_llvmir = false, bool enable_ftz = false); + +// Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries. +std::unique_ptr> +CreatePropagateTfAbiKnowledgeToKernels(); + +// Pass to propagate shape equalities across kernel boundaries. +std::unique_ptr> +CreatePropagateShapeKnowledgeToKernels(); + +/// Greedily maps loops to GPU hardware dimensions. +std::unique_ptr> CreateMapParallelLoopsPass(); + +/// We need to direct fusion to the inner loops. This cannot be done with +/// a passmanager alone ATM, as nested pass managers require operations to +/// be closed from above. 
+std::unique_ptr> +CreateFuseInnerParallelLoopsPass(); + +// Pass to remove copies which are consumed by a GenericOp. +std::unique_ptr> CreateCopyCleanupPass(); + +std::unique_ptr> CreateKernelgenFinalBufferizePass(); + +} // namespace transforms + +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h new file mode 100644 index 00000000..e85d14d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir { +namespace bufferization { +class BufferizeTypeConverter; +} +class ConversionTarget; +class LLVMTypeConverter; +class MLIRContext; +class RewritePatternSet; +class TypeConverter; + +namespace kernel_gen { +namespace tf_framework { + +/// Collects a set of patterns to convert from the TF Framework dialect to LLVM. +void PopulateTFFrameworkToLLVMConversionPatterns(LLVMTypeConverter *converter, + RewritePatternSet *patterns); + +/// Collects a set of patterns to rewrite functions for use with TF framework +/// and also replace `alloc`, `dealloc` and `assert`. +void PopulateEmbedTFFrameworkPatterns(RewritePatternSet *patterns); +void PopulateEmbedTFFrameworkAssertPattern(RewritePatternSet *patterns); + +} // namespace tf_framework + +namespace transforms { + +/// Collects a set of patterns that bufferize operations from the standard and +/// other dialects. +void populateExtraBufferizeDialects(DialectRegistry ®istry); +void populateExtraBufferizePatterns(ConversionTarget &target, + MLIRContext *context, + TypeConverter *converter, + RewritePatternSet *patterns); + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_REWRITERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.h new file mode 100644 index 00000000..e0b67b73 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.h @@ -0,0 +1,43 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
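Illustrative sketch only, not part of this patch: wiring a few of the kernel_gen pass factories declared above into a PassManager. The ordering and the module/function nesting chosen here are assumptions, not the upstream kernel-generation pipeline.

void AddExampleKernelGenPasses(mlir::PassManager& pm) {
  namespace kg = mlir::kernel_gen;
  pm.addPass(kg::tf_framework::CreateEmbedTFFrameworkPass());
  pm.addNestedPass<mlir::func::FuncOp>(kg::transforms::CreateBufferReusePass());
  pm.addPass(kg::transforms::CreateShapeToDescriptorsPass());
  pm.addPass(kg::transforms::CreateTFKernelToLLVMPass());
}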
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_UTILS_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace transforms { + +// Attempts to find function symbol in the module, adds it if not found. +FlatSymbolRefAttr GetOrInsertLLVMFunction(StringRef func_name, Type func_type, + Operation* op, OpBuilder* b); + +// Attemts to find a global string constant in the module, adds it if not found. +Value CreateOrFindGlobalStringConstant(Location loc, StringRef global_name, + StringRef content, OpBuilder* builder); + +// Generates a global name with the format "base_hash(content)". +std::string GetGlobalName(StringRef base, StringRef content); + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/optimize/quantization_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/optimize/quantization_utils.h new file mode 100644 index 00000000..aa22d546 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tools/optimize/quantization_utils.h @@ -0,0 +1,45 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_ + +#include +#include + +namespace tflite_migration { +namespace optimize { +namespace utils { + +template +std::vector SymmetricBiasQuantize(const float* data, + uint64_t num_elements, + const std::vector& scales); + +std::vector SymmetricQuantizeFloatsToInt16(const float* data, + uint64_t num_elements, + float scaling_factor); + +// Quantize the values given an array of scales. 
+void SymmetricPerChannelQuantizeValues(const float* input, + const std::vector& scales_inv, + const std::vector& dimension, + int32_t channel_dim_index, + std::vector* output_value); + +} // namespace utils +} // namespace optimize +} // namespace tflite_migration + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_passes.h new file mode 100644 index 00000000..53388a99 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_passes.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TF_PASSES_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace tosa { + +struct TOSATFLegalizationPipelineOptions + : public PassPipelineOptions {}; + +// Legalizes TF dialect(s) to Tosa. +void createTFtoTOSALegalizationPipeline( + OpPassManager& pm, const TOSATFLegalizationPipelineOptions& opts); + +void registerTFtoTOSALegalizationPipeline(); + +} // namespace tosa +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TF_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_tfl_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_tfl_passes.h new file mode 100644 index 00000000..93a67f9c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tf_tfl_passes.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TF_TFL_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TF_TFL_PASSES_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace tosa { + +struct TOSATFTFLLegalizationPipelineOptions + : public PassPipelineOptions { + bool dequantize_tfl_softmax = false; +}; + +// Legalizes TF dialect(s) to Tosa. 
+void createTFTFLtoTOSALegalizationPipeline( + OpPassManager& pm, const TOSATFTFLLegalizationPipelineOptions& opts); + +void registerTFTFLtoTOSALegalizationPipeline(); + +} // namespace tosa +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TF_TFL_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tfl_passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tfl_passes.h new file mode 100644 index 00000000..96d3cabf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/tfl_passes.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TFL_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TFL_PASSES_H_ + +#include +#include + +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace tosa { + +struct TOSATFLLegalizationPipelineOptions + : public PassPipelineOptions { + ArrayRef disabled_patterns; + ArrayRef enabled_patterns; + + PassOptions::Option target_compilation_backend{ + *this, "target-compilation-backend", + llvm::cl::desc("Whether targetting compilation backend"), + llvm::cl::init(false)}; + + PassOptions::Option dequantize_tfl_softmax{ + *this, "dequantize-tfl-softmax", + llvm::cl::desc("Dequantize the TFLite softmax"), llvm::cl::init(false)}; + + TOSATFLLegalizationPipelineOptions() { + disabled_patterns = std::nullopt; + enabled_patterns = std::nullopt; + } +}; + +// Legalizes TFL (TensorFlow lite) dialect(s) to Tosa. +void createTFLtoTOSALegalizationPipeline( + OpPassManager& pm, const TOSATFLLegalizationPipelineOptions& opts); + +void registerTFLtoTOSALegalizationPipeline(); + +} // namespace tosa +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TFL_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h new file mode 100644 index 00000000..cfe06340 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h @@ -0,0 +1,313 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
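Illustrative sketch only, not part of this patch: running the TFL-to-TOSA legalization pipeline declared in tfl_passes.h above over a module that is assumed to already hold TFLite dialect ops.

mlir::LogicalResult LegalizeTflToTosaExample(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  mlir::tosa::TOSATFLLegalizationPipelineOptions opts;
  opts.dequantize_tfl_softmax = true;  // example: enable the softmax option
  mlir::tosa::createTFLtoTOSALegalizationPipeline(pm, opts);
  return pm.run(module);
}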
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_COMMON_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_COMMON_H_ + +#include + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +// This file contains legalizations common to mapping both TensorFlow and +// TensorFlow Lite to TOSA. +// +// Conversion functions return None on a failure or result value on success. +// Callers must check and return a LogicalResult failure on nullptr. +// +// For these functions, the framework-specific operands/attributes/defaults +// are already extracted and placed in a common form for lowering. + +namespace mlir { +namespace tosa { + +// Lowers the Pack operator to TOSA. +std::optional convertPackOp(PatternRewriter& rewriter, Operation* op, + Value result_value, + SmallVectorImpl& inputs, + int32_t axis); + +// Lowers the Unpack operator to TOSA. +std::optional> convertUnpackOp(PatternRewriter& rewriter, + Operation* op, + Value input_value, + int32_t axis); + +// Lowers the Select operator to TOSA. +std::optional convertSelectOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value condition_value, + Value x_value, Value y_value); + +// Lowers the ZerosLike operator to TOSA by creating a constant +// of the desired type and shape. +std::optional convertZerosLikeOp(PatternRewriter& rewriter, + Operation* op, Value result, + Value input); + +// Lowers the Mul operator to TOSA. For quantized types, this requires +// inserting rescale operators before and after the operation. +std::optional convertMultiplyOp(PatternRewriter& rewriter, Operation* op, + Value output_val, Value input_lhs_val, + Value input_rhs_val); + +// Lowers the SquaredDifference operator to TOSA. +std::optional convertSquaredDifferenceOp(PatternRewriter& rewriter, + Operation* op, Value result, + Value x, Value y); + +// Lowers the Round operator to TOSA. +std::optional convertRoundOp(PatternRewriter& rewriter, Operation* op, + Value result, Value input); + +// Lowers ConcatV2 to TOSA. +std::optional convertConcatV2Op(PatternRewriter& rewriter, Operation* op, + ShapedType result_type, + SmallVectorImpl& values, + int32_t axis); + +// Lowers SpaceToBatchND to TOSA. +std::optional convertSpaceToBatchNDOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value input_value, + Value block_shape_value, + Value paddings_value); + +// Lowers BatchToSpaceND to TOSA. +std::optional convertBatchToSpaceNDOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value input_value, + Value block_shape_value, + Value crops_value); + +// Lowers ExpandDims to TOSA. +std::optional convertExpandDimsOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value input_value, Value dim_value); + +// Lowers Squeeze to TOSA. +std::optional convertSqueezeOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value input_value, + SmallVectorImpl& squeeze_dims); + +// Lowers ELU to a sequence of TOSA ops. +std::optional convertEluOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value features_value); + +// Lowers Softmax to a sequence of TOSA ops. +std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value logits_value, + double beta); + +// Lowers LogSoftmax to a sequence of TOSA ops. 
+std::optional convertLogSoftmaxOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value logits_value); + +// Lowers SpaceToDepth to a sequence of TOSA ops. Supports NHWC. +std::optional convertSpaceToDepthOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value input_value, + IntegerAttr block_size_attr, + StringAttr data_format); + +// Lowers DepthToSpace to a sequence of TOSA ops. Supports NHWC. +std::optional convertDepthToSpaceOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value input_value, + IntegerAttr block_size_attr, + StringAttr data_format); + +// Lowers Split to a sequence of TOSA ops. +std::optional> convertSplitOp( + PatternRewriter& rewriter, Operation* op, Value result_value, + Value input_value, int32_t num_split, int32_t axis); + +// Lowers SplitV to a sequence of TOSA ops. +std::optional> convertSplitVOp( + PatternRewriter& rewriter, Operation* op, Value result_value, + Value input_value, SmallVectorImpl& size_split, int32_t axis); + +// Lowers StridedSlice to a sequence of TOSA ops. +std::optional convertStridedSliceOp( + PatternRewriter& rewriter, Operation* op, Value result_value, + Value input_value, Value begin_value, Value end_value, Value strides_value, + int32_t begin_mask, int32_t end_mask, int32_t ellipsis_mask, + int32_t new_axis_mask, int32_t shrink_axis_mask); + +// Lowers FloorDiv to a sequence of TOSA operators. +std::optional convertFloorDivOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value lhs_value, + Value rhs_value); + +// Lowers FloorMod to a sequence of TOSA operators. +std::optional convertFloorModOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value lhs_value, + Value rhs_value); + +// Lowers FusedActivation to a sequence of TOSA ops. +std::optional convertFusedActivation(PatternRewriter& rewriter, + Operation* op, Value input_value, + StringAttr fused_activation_fn); + +// Helper function for implementing quantized divide by power-of-two in TOSA +// ops. +std::optional convertRoundingDivideByPOT(PatternRewriter& rewriter, + Operation* op, + Value input_value, + Value rshift_value); + +// Lowers ReduceAll to a sequence of TOSA ops. +std::optional convertReduceAllOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceAny to a sequence of TOSA ops. +std::optional convertReduceAnyOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceMin to a sequence of TOSA ops. +std::optional convertReduceMinOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceMax to a sequence of TOSA ops. +std::optional convertReduceMaxOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceProd to a sequence of TOSA ops. +std::optional convertReduceProdOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceSum to a sequence of TOSA ops. +std::optional convertReduceSumOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elems); + +// Lowers ReduceMean to a sequence of TOSA ops. 
+std::optional convertReduceMeanOp(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input_value, + ElementsAttr axes_elem); + +// Lowers ResizeBilinear and ResizeNearestNeighbor to TOSA resize. +std::optional convertResizeOp(PatternRewriter& rewriter, Operation* op, + RankedTensorType output_type, + Value input_value, StringRef mode, + bool align_corners, + bool half_pixel_centers); + +// Lowers Quantize to a sequence of TOSA quantization ops. +std::optional convertQuantizeOp(PatternRewriter& rewriter, Operation* op, + ShapedType output_type, + Value input_value, double scale, + int64_t zeropoint); + +// Lowers Dequantize to a sequence of TOSA dequantization ops. +std::optional convertDequantizeOp(PatternRewriter& rewriter, + Operation* op, ShapedType output_type, + Value input_value, + ArrayRef scale, + ArrayRef zeropoint, + int64_t dim); + +// Lowers FakeQuant to a sequence of TOSA quantization ops. +std::optional convertFakeQuantOp(PatternRewriter& rewriter, + Operation* op, ShapedType output_type, + Value input_value, double min, + double max, int64_t num_bits, + bool narrow_range); + +// Align to TF_MirrorPadOp::mode and TFL_MirrorPadOp::mode +enum class TFTFLMirrorPaddingType : uint32_t { + REFLECT = 0, + SYMMETRIC = 1, +}; + +std::optional convertMirrorPadCommon(PatternRewriter& rewriter, + Operation* op, + RankedTensorType output_type, + Value input, Value pad, + TFTFLMirrorPaddingType mode); + +// Lowers TensorFlow Conv2D to a sequence of TOSA quantization ops. +std::optional convertTFConv2DCommon( + PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, + Value input, Value filter, Value bias, ArrayAttr strides_attr, + ArrayAttr dilations_attr, ArrayAttr explicit_padding_attr, + StringRef padding_ref, StringRef data_format_ref); + +// Lowers TensorFlow and TensorFlow Lite Conv3D to a sequence of TOSA +// quantization ops. +std::optional convertConv3DCommon(PatternRewriter& rewriter, + Operation* op, ShapedType output_type, + Value input, Value filter, Value bias, + ArrayRef strides, + ArrayRef dilations, + StringRef padding_ref, + StringRef data_format_ref); + +// Preprocess TensorFlow Conv3D attributes prior to calling +// `convertConv3DCommon` +std::optional convertTFConv3DCommon( + PatternRewriter& rewriter, Operation* op, ShapedType output_type, + Value input, Value filter, Value bias, ArrayAttr strides_attr, + ArrayAttr dilations_attr, StringRef padding_ref, StringRef data_format_ref); + +// Lowers Gather operator to a sequence of TOSA ops. +std::optional convertGatherOp(PatternRewriter& rewriter, Operation* op, + Value params_value, Value indices_value, + int32_t batch_dims, int32_t axis, + bool tosaOnly = true); + +// Lowers GatherNd operator to a sequence of TOSA ops. +std::optional convertGatherNdOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value params_value, + Value indices_value); + +// Lowers OneHot operator to a sequence of TOSA ops. +std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, + Value result_value, Value indices_value, + Value on_value, Value off_value, + int32_t depth, int32_t axis); + +// Lowers Sign operator to a sequence of TOSA ops. +std::optional convertSignOp(PatternRewriter& rewriter, Operation* op, + Value input, RankedTensorType output_type); + +// Lowers BroadcastTo operator to a sequence of TOSA ops. 
+std::optional convertBroadcastToOp(PatternRewriter& rewriter, + Operation* op, Value input, + Value shape); + +}; // namespace tosa +}; // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h new file mode 100644 index 00000000..c576504d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h @@ -0,0 +1,259 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/kernels/conv_grad_shape_utils.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace mlir { +namespace tosa { + +LogicalResult getDynamicDims(PatternRewriter& rewriter, Value value, + llvm::SmallVector& dims); + +std::optional buildReshapeWithDynamicDims(PatternRewriter& rewriter, + Operation* op, + Value input_value, + ShapedType output_type, + llvm::ArrayRef dims); + +// Create a TOSA rescale op from TFLite scaling multiplier, scaling shift, zero +// points and rounding mode +Value buildRescale(PatternRewriter& rewriter, Operation* op, + ShapedType output_type, Value input_val, + int32_t scale_multiplier, int32_t scale_shit, + int64_t input_zp, int64_t output_zp, bool double_round, + bool scale32); + +// Create a TOSA rescale op from TFLite scaling, zero points and rounding mode +Value buildRescale(PatternRewriter& rewriter, Operation* op, + ShapedType output_type, Value input_val, double scale, + int64_t input_zp, int64_t output_zp, bool double_round, + bool scale32); + +// Removes the zero point and cast to int32, no need to handle roundings modes +Value removeZeroPointAndCastToInt32(PatternRewriter& rewriter, Operation* op, + 
Value input_val, int64_t input_zp); + +// Creates TOSA rescale op with int32 output +Value buildRescaleToInt32(PatternRewriter& rewriter, Operation* op, + Value input_val, int32_t input_scale_multiplier, + int32_t input_scale_shift, int64_t input_zp); + +// Creates TOSA rescale op with int32 output +Value buildRescaleToInt32(PatternRewriter& rewriter, Operation* op, + Value input_val, double input_scale, + int64_t input_zp); + +// Creates TOSA rescale op with int32 input +Value buildRescaleFromInt32(PatternRewriter& rewriter, Operation* op, + ShapedType output_type, Value input_val, + double output_scale, int64_t output_zp); + +// Creates a TOSA rescale op based on conv2d parameters. +Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, + Value conv_val, ShapedType input_type, + ShapedType weight_type, ShapedType output_type); + +// Create a 8-bit TOSA TABLE constant tensor +Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, + double input_scale, int32_t input_zp, + double output_scale, int32_t output_zp, + std::function func); + +// Create a 16-bit TOSA TABLE constant tensor +Value getTosaConst16bitTable(PatternRewriter& rewriter, Operation* op, + std::function func, double min, + double max); + +// Create a 32-bit TOSA TABLE for Softmax Exp +void getTosaConst32bitSoftmaxExpTable(PatternRewriter& rewriter, Operation* op, + double beta, double input_scale, + Value& first_const, Value& second_const, + Value& third_const, Value& fourth_const); + +// Create 8 bit TOSA TABLE constant tensor for the RSqrt operator +Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op, + float input_scale, int32_t input_zp, + float output_scale, int32_t output_zp); + +// Create a 32-bit float constant operator from a float +Value getTosaConstTensorSingleF32(PatternRewriter& rewriter, Operation* op, + float val); + +// Create a 32-bit integer constant operator from an int +Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op, + int32_t val); + +// Create an expected bitwidth integer constant operator based on the type +// parameter. +Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, + int64_t val); + +// Create a vector from a 32-bit value tensor. Returns vector size on success +// or -1 on error. +LogicalResult getVectorFromValue32(Value val, SmallVectorImpl& vec); + +// Calculates the TOSA padding values based on TF operators padded with +// SAME/VALID. +bool getPaddingValuesFromPadType(tensorflow::Padding tf_pad, + tensorflow::TensorFormat data_format_tf, + uint32_t first_filter_spatial_dim, + ShapedType input_type, ShapedType filter_type, + DenseI64ArrayAttr strides, + DenseI64ArrayAttr dilations, + PatternRewriter& rewriter, + DenseI64ArrayAttr& explicit_pad); + +// Calculates the TOSA padding values for explicit-padded TF operators. +DenseI64ArrayAttr getPaddingValuesFromExplicitPadAttr( + ArrayAttr explicit_pad, tensorflow::TensorFormat data_format_tf, + PatternRewriter& rewriter); + +// Calculates the TOSA padding values for transposeConv2d +bool getTransposeConv2dPaddingValues( + tensorflow::Padding tf_pad, tensorflow::TensorFormat data_format_tf, + uint32_t first_filter_spatial_dim, ShapedType input_type, + ShapedType filter_type, ShapedType output_type, DenseI64ArrayAttr strides, + PatternRewriter& rewriter, DenseI64ArrayAttr& explicit_pad); + +// Templated function to create a constant op for given type and shape. +// T: storage C type. +// Default template creates a constant tensor in T. 
+// To create INT48 TOSA constant, need to pass in llvm::APInt instead. +template +std::optional getConstTensor(PatternRewriter& rewriter, Operation* op, + ArrayRef vec, ArrayRef shape); + +// Check if scale32 mode is used for given output_element_type +bool isScale32(mlir::quant::UniformQuantizedType output_element_type); + +// Applies a set of patterns greedily to the specified function, then applies +// a cleanup to guarantee the function contract and constants are valid. This +// means patterns can performed shape inference while not altering immutable +// types. +LogicalResult ApplyPatternsWithShapeResolution( + func::FuncOp func, const FrozenRewritePatternSet& patterns); + +// Creates a TOSA operation and performs shape inference on the individual +// op. This allows shape inference during the TFLite to TOSA lowering. +template +TosaOp CreateOpAndInfer(ImplicitLocOpBuilder& builder, Type result_ty, + Args&&... args) { + auto op = builder.create(result_ty, args...); + + InferShapedTypeOpInterface shapeInterface = + dyn_cast(op.getOperation()); + if (!shapeInterface) return op; + + SmallVector returnedShapes; + if (shapeInterface + .inferReturnTypeComponents(op.getContext(), builder.getLoc(), + op->getOperands(), op->getAttrDictionary(), + op->getPropertiesStorage(), + op->getRegions(), returnedShapes) + .failed()) + return op; + + // We need to use the element type of the existing result type to generate + // the new result shaped type. This is because rescale can include a cast to + // different bit-width types and does not have a TypeAttr to define the + // target type. + auto result = op->getResult(0); + auto predictedShape = returnedShapes[0]; + auto currentKnowledge = ValueKnowledge::getKnowledgeFromType(result_ty); + + // Compute the knowledge based on the inferred type. + auto inferredKnowledge = ValueKnowledge::getPessimisticValueState(); + inferredKnowledge.dtype = mlir::cast(result_ty).getElementType(); + inferredKnowledge.hasRank = predictedShape.hasRank(); + if (predictedShape.hasRank()) { + for (auto dim : predictedShape.getDims()) { + inferredKnowledge.sizes.push_back(dim); + } + } + + // Compute the new type based on the joined version. + auto newKnowledge = ValueKnowledge::join(currentKnowledge, inferredKnowledge); + Type new_ty = + newKnowledge.hasRank + ? Type{tensorflow::GetTypeFromTFTensorShape( + llvm::ArrayRef(newKnowledge.sizes), newKnowledge.dtype)} + : Type{mlir::UnrankedTensorType::get(newKnowledge.dtype)}; + result.setType(new_ty); + return op; +} + +template +TosaOp CreateOpAndInfer(PatternRewriter& rewriter, Location loc, Type result_ty, + Args&&... args) { + ImplicitLocOpBuilder builder(loc, rewriter); + return CreateOpAndInfer(builder, result_ty, args...); +} + +template +void CreateReplaceOpAndInfer(PatternRewriter& rewriter, Operation* op, + Type result_ty, Args&&... 
args) { + auto result = + CreateOpAndInfer(rewriter, op->getLoc(), result_ty, args...); + rewriter.replaceOp(op, result->getResults()); +} + +void TrimQuantizedIntegerRangeMin(mlir::quant::UniformQuantizedType dtype, + int64_t& val_min); + +void TrimQuantizedIntegerRangeMax(mlir::quant::UniformQuantizedType dtype, + int64_t& val_max); + +void TrimQuantizedIntegerRange(mlir::quant::UniformQuantizedType dtype, + int64_t& val_min, int64_t& val_max); + +inline bool IsTFLDoubleRoundingMode() { +#if TFLITE_SINGLE_ROUNDING + return false; +#else + return true; +#endif // TFLITE_SINGLE_ROUNDING +} + +} // namespace tosa +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/passes.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/passes.h new file mode 100644 index 00000000..de0872b6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -0,0 +1,94 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_PASSES_H_ + +#include +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { + +namespace quant { +class QuantDialect; +} + +namespace quantfork { +class QuantizationForkDialect; +} + +namespace TFL { +class TFLDialect; +} + +namespace tosa { +class TosaDialect; + +void populateLegalizeTFPatterns(MLIRContext* ctx, RewritePatternSet& patterns); +void populateLegalizeTFLPatterns(MLIRContext* ctx, RewritePatternSet& patterns); + +std::unique_ptr> createLegalizeTFPass(); +std::unique_ptr> createFuseBiasTFPass(); + +// `disabledPatterns` is a set of labels used to filter out input patterns with +// a debug label or debug name in this set. +// `enabledPatterns` is a set of labels used to filter out input patterns that +// do not have one of the labels in this set. 
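+// Editor's note: illustrative sketch only, not part of the upstream header.
+// A pipeline might register this pass roughly as follows; the nesting on
+// func::FuncOp and the empty pattern-name lists are assumptions made for the
+// example, with `pm` and `module` supplied by the caller:
+//
+//   mlir::PassManager pm(module.getContext());
+//   pm.addNestedPass<mlir::func::FuncOp>(mlir::tosa::createLegalizeTFLPass(
+//       /*disabled_patterns=*/{}, /*enabled_patterns=*/{}));
+//   if (failed(pm.run(module))) { /* handle legalization failure */ }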
+std::unique_ptr> createLegalizeTFLPass( + ArrayRef disabled_patterns = std::nullopt, + ArrayRef enabled_patterns = std::nullopt); + +std::unique_ptr> createRetainCallOnceFuncsPass(); +std::unique_ptr> createStripModuleMetadataPass(); +std::unique_ptr> createConvertTFLUint8Pass(); +std::unique_ptr> +createConvertFunctionMetadataPass(); +std::unique_ptr> createDequantizeTFLSoftmaxPass(); +std::unique_ptr> createLegalizeTFTFLPass(); +std::unique_ptr> createLowerComplexTypesPass(); +std::unique_ptr> createStripFunctionMetadataPass(); +std::unique_ptr> createStripQuantTypesPass(); +std::unique_ptr> createVerifyFullyConvertedPass(); +std::unique_ptr> createLegalizeTFLStatefulPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_CLASSES +#define GEN_PASS_DECL_TOSALEGALIZETFPASS +#define GEN_PASS_DECL_TOSALEGALIZETFLPASS +#define GEN_PASS_DECL_TOSALEGALIZETFTFLPASS +#define GEN_PASS_DECL_TOSAFUSEBIASTFPASS +#define GEN_PASS_DECL_TOSACONVERTTFLUINT8PASS +#define GEN_PASS_DECL_TOSASTRIPQUANTTYPESPASS +#define GEN_PASS_DECL_TOSALOWERCOMPLEXTYPESPASS +#define GEN_PASS_DECL_TOSADEQUANTIZETFLSOFTMAXPASS +#define GEN_PASS_DECL_RETAINCALLONCEFUNCS +#define GEN_PASS_DECL_STRIPFUNCTIONMETADATA +#define GEN_PASS_DECL_STRIPMODULEMETADATA +#define GEN_PASS_DECL_VERIFYFULLYCONVERTED +#define GEN_PASS_DECL_CONVERTFUNCTIONMETADATA +#define GEN_PASS_DECL_TOSALEGALIZESTATEFULPASS + +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +} // namespace tosa +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/array_container_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/array_container_utils.h new file mode 100644 index 00000000..80fa14e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/array_container_utils.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ + +#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::MutableArrayRef SpanToMutableArrayRef(absl::Span span) { + return llvm::MutableArrayRef(span.data(), span.size()); +} + +template +inline absl::Span ArrayRefToSpan(llvm::ArrayRef ref) { + return absl::Span(ref.data(), ref.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/name_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/name_utils.h new file mode 100644 index 00000000..356b4d25 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/name_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ + +#include + +#include "mlir/IR/Location.h" // from @llvm-project + +namespace mlir { + +// Converts characters in name that are considered illegal in TensorFlow Node +// name to '.'. +void LegalizeNodeName(std::string& name); + +// Returns the TensorFlow node name associated with a location. +std::string GetNameFromLoc(Location loc); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/string_container_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/string_container_utils.h new file mode 100644 index 00000000..fb2fa06c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/mlir/utils/string_container_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" + +namespace mlir { + +inline absl::string_view StringRefToView(llvm::StringRef ref) { + return absl::string_view(ref.data(), ref.size()); +} + +inline llvm::StringRef StringViewToRef(absl::string_view view) { + return llvm::StringRef(view.data(), view.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/datavec.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/datavec.h new file mode 100644 index 00000000..eff32f1f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/datavec.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tensorrt { + +// Input/output data format for OpConverterTest::BuildAndRun(). +struct InputOutputData { + size_t TotalBytes() const { return tensor.TotalBytes(); } + string name; + Tensor tensor; +}; + +using DataVec = std::vector; + +} // namespace tensorrt +} // namespace tensorflow +#endif diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/utils.h new file mode 100644 index 00000000..0bc63ecd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ + +#include +#include + +#include "absl/strings/str_join.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +// Returns the compile time TensorRT library version information +// {Maj, Min, Patch}. 
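+// Editor's note: illustrative sketch only, not part of the upstream header.
+// The two version queries below are commonly compared to detect a mismatch
+// between the TensorRT headers TF-TRT was compiled against and the library
+// loaded at runtime, e.g.:
+//
+//   const auto linked = GetLinkedTensorRTVersion();
+//   const auto loaded = GetLoadedTensorRTVersion();
+//   if (std::get<0>(linked) != std::get<0>(loaded)) {
+//     // warn about a major-version mismatch
+//   }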
+std::tuple GetLinkedTensorRTVersion(); + +// Returns the runtime time TensorRT library version information +// {Maj, Min, Patch}. +std::tuple GetLoadedTensorRTVersion(); +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "third_party/tensorrt/NvInfer.h" + +#define ERROR_LOC __FILE__, ":", __LINE__ + +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + return errors::Internal("TFTRT::", __FUNCTION__, "\n", ERROR_LOC, \ + " failed to add TRT layer, at: ", node); + +#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ + if (ptr == nullptr) { \ + TFTRT_INTERNAL_ERROR_AT_NODE(node); \ + } + +// Use this macro within functions that return a Status or StatusOR to check +// boolean conditions. If the condition fails, it returns an +// errors::Internal message with the file and line number. +#define TRT_ENSURE(x) \ + if (!(x)) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE failure"); \ + } + +// Checks that a Status or StatusOr object does not carry an error message. +// If it does have an error, returns an errors::Internal instance +// containing the error message, along with the file and line number. For +// pointer-containing StatusOr, use the below TRT_ENSURE_PTR_OK macro. +#define TRT_ENSURE_OK(x) \ + if (!x.ok()) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE_OK failure:\n ", \ + x.status().ToString()); \ + } + +// Checks that a StatusOrobject does not carry an error, and that the +// contained T* is non-null. If it does have an error status, returns an +// errors::Internal instance containing the error message, along with the file +// and line number. +#define TRT_ENSURE_PTR_OK(x) \ + TRT_ENSURE_OK(x); \ + if (*x == nullptr) { \ + return errors::Internal(ERROR_LOC, " pointer had null value"); \ + } + +namespace tensorflow { +namespace tensorrt { + +#define IS_TRT_VERSION_GE(major, minor, patch, build) \ + ((NV_TENSORRT_MAJOR > major) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH > patch) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) + +#define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " + +// Initializes the TensorRT plugin registry if this hasn't been done yet. +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger); + +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; +}; + +// Gets the binding index of a tensor in an engine. +// +// The binding index is looked up using the tensor's name and the profile index. +// Profile index should be set to zero, if we do not have optimization profiles. +Status GetTrtBindingIndex(const char* tensor_name, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); + +// Gets the binding index of a tensor in an engine. +// +// Same as above, but uses the network input index to identify the tensor. +Status GetTrtBindingIndex(int network_input_idx, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); +} // namespace tensorrt +} // namespace tensorflow + +namespace nvinfer1 { +// Prints nvinfer1::Dims or any drived type to the given ostream. 
Per GTest +// printing requirements, this must be in the nvinfer1 namespace. +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& v) { + os << "nvinfer1::Dims["; + os << absl::StrJoin(std::vector(v.d, v.d + v.nbDims), ","); + os << "]"; + return os; +} + +// Returns true if any two derived nvinfer1::Dims type structs are equivalent. +inline bool operator==(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + if (rhs.nbDims != lhs.nbDims) { + return false; + } + for (int i = 0; i < lhs.nbDims; i++) { + if (rhs.d[i] != lhs.d[i]) { + return false; + } + } + return true; +} + +// Returns false if any 2 subclasses of nvinfer1::Dims are equivalent. +inline bool operator!=(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + return !(rhs == lhs); +} + +// Prints nvinfer1::INetworkDefinition* information to the given ostream. +inline std::ostream& operator<<(std::ostream& os, + nvinfer1::INetworkDefinition* n) { + os << "nvinfer1::INetworkDefinition{\n"; + std::vector layer_idxs(n->getNbLayers()); + std::iota(layer_idxs.begin(), layer_idxs.end(), 0); + os << absl::StrJoin(layer_idxs, "\n ", + [n](std::string* out, const int layer_idx) { + out->append(n->getLayer(layer_idx)->getName()); + }); + os << "}"; + return os; +} + +// Prints the TensorFormat enum name to the stream. +std::ostream& operator<<(std::ostream& os, + const nvinfer1::TensorFormat& format); + +// Prints the DataType enum name to the stream. +std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& data_type); + +} // namespace nvinfer1 + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h new file mode 100644 index 00000000..0a9ee702 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h @@ -0,0 +1,121 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include +#include +#include + +#include "absl/types/optional.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Implements core algorithm selection logic in a testable manner. The policy +// implemented depends on the given TRT version. We have this class because TRT +// interfaces make it difficult to directly test an IAlgorithmSelector +// implementation. 
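+// Editor's note: illustrative sketch only, not part of the upstream header.
+// For example, a test or caller can probe the selection policy for the
+// compile-time TensorRT version like this:
+//
+//   AlgorithmSelectorImpl impl(AlgorithmSelectorImpl::CompileTimeTRTVersion());
+//   if (impl.IsAlgorithmSelectorRequired()) {
+//     // attach a TftrtAlgorithmSelector to the builder configuration
+//   }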
+class AlgorithmSelectorImpl { + public: + using TRTVersion = std::array; + using ImplementationID = int64_t; + using TacticID = int64_t; + + static constexpr TRTVersion CompileTimeTRTVersion() { + return TRTVersion{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, + NV_TENSORRT_BUILD}; + } + + explicit AlgorithmSelectorImpl( + const TRTVersion& version = CompileTimeTRTVersion()) + : version_(version) {} + + bool IsShuffleLayer(ImplementationID id) const; + + bool IsBannedTactic(TacticID id) const; + + // Returns true if the algorithm implementing the IShuffleLayer is acceptable. + bool AllowShuffleAlgorithm(TacticID tactic, nvinfer1::DataType input_dtype, + nvinfer1::TensorFormat input_format) const; + + bool IsTrtVersionGE(const TRTVersion& version) const; + + // Returns true if we know at compile time that the algorithm selector + // should be required. This is a conservative estimate. + bool IsAlgorithmSelectorRequired() const; + + static std::set GetBannedTRT72TuringTactics(); + + private: + TRTVersion version_; +}; + +// Implements the TRT IAlgorithmSelector interface. The method +// "selectAlgorithms" selects allowable algorithms for each layer, and +// "reportAlgorithms" summarizes the algorithms selected by TensorRT. +class TftrtAlgorithmSelector : public nvinfer1::IAlgorithmSelector { + private: + using TacticID = AlgorithmSelectorImpl::TacticID; + + // An index we should choose for all algorithms. Used for debugging. + std::optional fixed_algorithm_idx_; + + AlgorithmSelectorImpl selector_; + + public: + TftrtAlgorithmSelector(); + + // If the environment variable TF_TRT_FIXED_ALGORITHM_ID is empty, this + // function returns nullopt. Otherwise, it returns the specified number. + static std::optional GetFixedAlgorithmID(); + + // Returns true if the algorithm associated with context is acceptable. + bool AlgorithmPolicy(const nvinfer1::IAlgorithmContext& context, + const nvinfer1::IAlgorithm& alg) const; + + // This function fills the array "selection" with the indices of selected + // algorithm candidates from "algoChoices", each of which is an implementation + // for the kernel described by the given IAlgorithmContext. It should return a + // number in [0, nbChoices] indicating the number of selected indices. If 0 is + // returned, TensorRT will use its default selection mechanism. + int32_t selectAlgorithms(const nvinfer1::IAlgorithmContext& algoContext, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbChoices, + int32_t* selection) noexcept override; + + // Called by TensorRT to report choices it made. + void reportAlgorithms(const nvinfer1::IAlgorithmContext* const* algoContexts, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbAlgorithms) noexcept override; + + bool IsRequired() const { + return selector_.IsAlgorithmSelectorRequired() || + fixed_algorithm_idx_ != std::nullopt; + } +}; + +// Returns an initialized AlgorithmSelector if an algorithm selector is required +// for the current TRT version. Otherwise, returns nullptr. 
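+// Editor's note: illustrative sketch only, not part of the upstream header.
+// Typical use when configuring an engine build, assuming `builder_config` is a
+// nvinfer1::IBuilderConfig* owned by the caller:
+//
+//   auto selector = MaybeCreateAlgorithmSelector();
+//   if (selector != nullptr) {
+//     builder_config->setAlgorithmSelector(selector.get());
+//   }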
+std::unique_ptr MaybeCreateAlgorithmSelector(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h new file mode 100644 index 00000000..0607fb85 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -0,0 +1,70 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// These functions are internal implementation functions for the +// TRTOptimizationPass. + +// Performs segmentation and conversion on the given Grappler item. This method +// contains the core logic of the TRTOptimizationPass. +Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, + grappler::GrapplerItem& grappler_item, + const std::vector& input_output_names, + grappler::Cluster* cluster, GraphDef* output); + +// Helper method for the conversion, expose for testing. +std::pair GetDeviceAndAllocator( + const grappler::Cluster* cluster, const EngineInfo& engine); + +// Helper method that registers `segment_graph` as a function to the function +// library in `graph`. +Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, + Graph* graph, const string& engine_name); + +// Creates and serializes an ICudaEngine. Used only in is_dynamic_op=false, +// a.k.a. static engine mode. 
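+// Editor's note: illustrative sketch only, not part of the upstream header.
+// A hypothetical call site, where `params`, `engine_info`, `max_batch_size`,
+// `input_shapes`, `profile` and `cluster` are all owned by the optimization
+// pass driving the conversion:
+//
+//   string serialized_engine;
+//   TF_RETURN_IF_ERROR(CreateStaticEngine(params, engine_info, max_batch_size,
+//                                         input_shapes, &profile,
+//                                         &serialized_engine, cluster));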
+Status CreateStaticEngine(const TRTOptimizationPass::ConversionParams& params, + const EngineInfo& info, int max_batch_size, + const std::vector& input_shapes, + TrtShapeOptimizationProfile* profile, + string* segment_string, grappler::Cluster* cluster); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h new file mode 100644 index 00000000..9664f1a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -0,0 +1,593 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ + +#include +#include +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/lib/core/status.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +namespace convert { +using ::tsl::StatusOr; + +struct EngineConnection { + // Constructs a non-control edge. + EngineConnection(const string& outside, int out_id, int out_port, + const string& inside, int in_id, int in_port, + bool input_edge, int port) + : outside_node_name(outside), + outside_id(out_id), + outside_port(out_port), + inside_node_name(inside), + inside_id(in_id), + inside_port(in_port), + is_input_edge(input_edge), + port_number(port) {} + + // Constructs a control edge. 
+ EngineConnection(const string& outside, int out_id, const string& inside, + int in_id, bool input_edge) + : outside_node_name(outside), + outside_id(out_id), + outside_port(Graph::kControlSlot), + inside_node_name(inside), + inside_id(in_id), + inside_port(Graph::kControlSlot), + is_input_edge(input_edge), + port_number(Graph::kControlSlot) {} + + bool is_control_edge() const { return port_number == Graph::kControlSlot; } + + const string outside_node_name; + const int outside_id; + const int outside_port; + PartialTensorShape outside_shape; // Only set for input edge. + + const string inside_node_name; + const int inside_id; + const int inside_port; + PartialTensorShape inside_shape; // Only set for output edge. + + DataType connection_type; + const bool is_input_edge; + + // The port number of the TRT node connected with this edge. + const int port_number; +}; + +struct EngineInfo { + EngineInfo() + : engine_type(EngineType::TRTStatic), + max_workspace_size_bytes(0), + max_batch_size(std::nullopt), + maximum_cached_engines(0), + precision_mode(TrtPrecisionMode::FP32), + use_calibration(true), + + allow_build_at_runtime(true), + use_explicit_precision(false) {} + + string engine_name; + string device; + GraphDef segment_graph_def; + + // Non-control input connections inside this vector are sorted in a way such + // that, the segment nodes connecting to them are topological sorted. + // In addition, for non-control connections, there must be no duplicates. + std::vector connections; + + enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; + EngineType engine_type; + int64 max_workspace_size_bytes; + std::optional max_batch_size; + int maximum_cached_engines; + TrtPrecisionMode precision_mode; + bool use_calibration; + bool allow_build_at_runtime; + bool use_explicit_precision; +}; + +// Constructs a graphdef from the segment in the given graph and stores it to +// the engine_info. Adds _Arg nodes for input edges (InputPH_*) and _Retval +// nodes for output edges (OutputPH_*). Maintains the topological order of the +// non-input/output nodes in the graphdef. This function needs to be called +// before TensorRT layers are created because it prepares the original graph +// for TensorRT conversion. +// +// - subgraph_node_names: the node names of the subgraph. +// - subgraph_node_ids: the node ids of the subgraph, must be sorted in +// topological order. +// - engine_info: a data structure that records the information about the +// engine containing the subgraph. +// +// TODO(aaroey): add tests to validate these properties. +Status ConvertSegmentToGraphDef( + const Graph* graph, const grappler::GraphProperties& graph_properties, + const std::vector& subgraph_nodes, EngineInfo* engine_info); + +// Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff +// 'builder' successfully build the engine. If the result is not ok, 'engine' +// will be set to nullptr +// Once returned, 'builder' is not needed any more and can be safely destroyed. +// +// - convert_successfully: indicates whether the conversion to TensorRT network +// is successful. This is different than successfully building the engine: +// building can still fail afterwards. +// Note: When 'cluster' is not null, it contains the graph to be converted. +// We may perform additional optimizations to the graph before converting +// the graph. 
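+// Editor's note: illustrative sketch only, not part of the upstream header.
+// The usual flow is to build the segment GraphDef first and then convert it
+// with ConvertGraphDefToEngine below; `graph`, `graph_properties` and
+// `subgraph_nodes` come from the segmenter:
+//
+//   EngineInfo info;
+//   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(&graph, graph_properties,
+//                                               subgraph_nodes, &info));
+//   // info.segment_graph_def is then handed to ConvertGraphDefToEngine.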
+Status ConvertGraphDefToEngine( + const GraphDef& gdef, OpKernelContext* ctx, TrtPrecisionMode precision_mode, + int max_batch_size, size_t max_workspace_size_bytes, + const std::vector& input_shapes, + nvinfer1::ILogger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtUniquePtrType* engine, bool use_calibration, + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles, absl::string_view engine_name, + bool use_explicit_precision, + tensorflow::grappler::Cluster* cluster = nullptr, + const string& device = ""); + +// Helper class for the segmenter to determine whether an output edge from the +// TRT segment is valid. +class OutputEdgeValidator { + public: + // Return true if the specified edge is eligible to be an output edge of the + // TRT segment. + bool operator()(const Edge* out_edge) const; +}; + +// Class to verify if specific TF node is supported by TRT. +class TrtNodeValidator { + public: + // 'graph_properties' is the GraphProperties of the graph whose nodes will be + // checked by IsTensorRTCandidate() later. It is used to get the shape and + // data type information of a tensor for validation purpose. + TrtNodeValidator(const grappler::GraphProperties& graph_properties, + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); + + // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added + // to TRT subgraph and later converted into TRT engine. + Status IsTensorRTCandidate(const Node* node); + + static const std::set* quantize_ops; + + // Returns validator by op type. If no validator is registered for + // specific op, it means no validation is needed and ValidateNode() will + // return OK. + StatusOr GetValidator(const std::string& op); + + private: + // Convert a Const node to a TRT_TensorOrWeights. + Status ConvertConstToWeights(const NodeDef& const_node_def, + const std::vector& inputs, + TRT_TensorOrWeights* output); + + // Convert a VariableV2 node to a TRT_TensorOrWeights. + Status ConvertVariableToWeights( + const NodeDef& const_node_def, + const std::vector& inputs, + TRT_TensorOrWeights* output); + + // Convert the output tensor at 'output_port' of 'node_def' to a + // TRT_TensorOrWeights which will be later used as an input to other nodes and + // passed to ValidateNode() below. + Status ConvertToTensorOrWeights(const NodeDef& node_def, int output_port, + TRT_TensorOrWeights* tensor_or_weights); + + // Store the weights added during validation. Some validations (e.g. + // validation for Const node) may produce weights. + TrtWeightStore weight_store_; + + // GraphProperties of the graph whose nodes are to be validated by + // IsTensorRTCandidate(). + const grappler::GraphProperties& graph_properties_; + + // Quantization ops are only converted when using quantized precisions. + const TrtPrecisionMode precision_mode_; + + const bool use_calibration_; + + const bool use_implicit_batch_; + + const bool use_explicit_precision_; + + friend class ValidatorTest; + friend class OpConverterTest; +}; + +// Class to convert TF nodes to TRT network. +class Converter { + public: + // Used for Converter::RenameAndMarkOutputTensors() + struct EngineOutputInfo { + // The TRT tensor name which produces the output. + string source_tensor_name; + // The TensorFlow node name which is receiving the output from the TRT + // engine. This should always be the Identity node created in + // ConvertSegmentToGraphDef. 
+ string dest_node_name; + // Output type. TensorRT requires this to be explicitly set for engine + // outputs. + nvinfer1::DataType trt_dtype; + }; + + static StatusOr> Create( + TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision = false, + OpKernelContext* ctx = nullptr); + + ////////////////////////////////////////////////////////////////////////////// + // Methods used by the TRT engine builder to build a TRT network from a TF + // function/subgraph. + + // Convert the node to TRT network. + Status ConvertNode(const NodeDef& node_def); + + // Add input tensor to the TRT network with given 'name', 'dtype', 'dims' and + // 'batch_size'. + Status AddInputTensor(const string& name, nvinfer1::DataType dtype, + const nvinfer1::Dims& dims, int batch_size); + + // Store the ResourceHandle as a TRT_TensorOrWeights object. This can be + // later used as input to other nodes. + Status AddInputResource(const string& name, const ResourceHandle& resource); + + // Mark the tensors with names specified by source_tensor_name as output of + // the TRT network, and set their names in the TRT network as dest_node_name. + Status RenameAndMarkOutputTensors( + const std::vector& output_tensors); + + // Build a TRT engine using the created network. + Status BuildCudaEngine(TrtUniquePtrType* engine, + int max_batch_size, size_t max_workspace_size_bytes, + nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtShapeOptimizationProfile* profiles); + + ////////////////////////////////////////////////////////////////////////////// + // Methods used by op converters to convert individual TF node and add layers + // to the TRT network. + + // Op converters (e.g. ConvertReshape) need to access the TRT network in order + // to add TRT layers. + nvinfer1::INetworkDefinition* network() { return trt_network_.get(); } + + // What precision are we targeting? + TrtPrecisionMode precision_mode() const { return precision_mode_; } + + // Variable converters need the context to read variable values. + OpKernelContext* context() { return ctx_; } + + // Calibration will be or was previously performed on this network? + bool use_calibration() const { return use_calibration_; } + + // Whether implicit batch mode is enabled + bool use_implicit_batch() const { return use_implicit_batch_; } + + // This function should be called when we know the quantization range of a + // tensor from a quantize/dequantize node. + void ProvideQuantizationRange(ITensorProxyPtr* tensor, float min_range, + float max_range); + + // Should be called when full TRT network has been constructed and before + // building the engine. + void MaybeApplyQuantizationRanges(); + + // Below are helper methods for op converters to add different layers to the + // TRT network. + + // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to + // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch + // dimension which should always be 0. If this is for adding a transpose layer + // to support the conversion of 'node_def', callers need to provide a + // non-empty 'sub_op_name' appended to the name of 'node_def' to avoid layer + // name conflicts. 
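+  // Editor's note: illustrative sketch only, not part of the upstream header.
+  // For example, an NHWC-to-NCHW transpose inside an op converter, assuming
+  // `converter` is the active Converter (e.g. params->converter) and the batch
+  // dimension stays at index 0; the "to_NCHW" sub-op name is a placeholder:
+  //
+  //   ITensorProxyPtr transposed = nullptr;
+  //   TF_RETURN_IF_ERROR(converter->TransposeTensor(
+  //       input_tensor, {0, 3, 1, 2}, &transposed, node_def, "to_NCHW"));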
+ Status TransposeTensor(ITensorProxyPtr input_tensor, + const std::vector& order_with_batch_dim, + ITensorProxyPtr* output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name = ""); + + // Reshapes a dynamic shape tensor by removing or adding dimensions of size 1, + // and/or permuting the dimensions. The new shape is derived from the shape of + // the input tensor according to the slices and size_for_added_dims arguments. + // + // If there would be at most one unknown dimension, we could set the new shape + // using IShuffleLayer::setReshapeDimensions, which treats -1 as a special + // value (the same way as TF). In general, we can have more than one unknown + // dimensions, and we have to manipulate the shape tensors during runtime to + // define the new shape. This helper function defines the necessary shape + // inference layers and calls reshape using the calculated new shape. + // + // Example: + // + // Assume that we want to reshape a tensor from shape {A,B,C,D} to {C,D,A,B} + // (no transpose, just change the shape). In dynamic shape mode, the A,B,C,D + // values are not necessarily known at conversion time, they can be all -1. We + // can only define the new shape at runtime, when the actual shape is already + // known. To define the new shape: + // - We use an IShapeLayer to retrieve a shape tensor with the {A,B,C,D} + // values. + // - Create two slices {C,D} and {A,B} of the shape tensor. + // - Concatenate these slices {C,D,A,B}, + // - Set the {C,D,A,B} shape tensor as an input shape tensor for + // IShuffleLayer. + // + // This can be achieved by calling DynamicReshape(input, {{2,4},{0,2}}, + // params). + // + // Before each slice we can insert new dims if the corresponding + // size_for_added_dims element is not negative. The size_for_added_dims array + // can have more than slices.size() elements, in order to insert a dimension + // after the last slice. For example, to add two leading 1 dimensions, and + // three trailing 1 dimensions, call DynamicReshape(input, {{0,nbDims}}, + // {2, 3}). + // + // Parameters: + // input - input tensor + // slices - [start, end) pairs of slices + // params - conversion parameters + // output - reshaped tensor + // size_for_added_dims - size of dimension inserted right before slice[i]. We + // only insert a new dim if size_for_added_dims[i] >= 0. + Status DynamicReshape(ITensorProxyPtr input, + std::vector> slices, + const OpConverterParams* params, + ITensorProxyPtr* output, + std::vector size_for_added_dims = {}, + std::optional op_instance = std::nullopt); + + // Inserts a singleton dimension at axis for a dynamic shape tensor. + Status DynamicExpandDims(ITensorProxyPtr input, const nvinfer1::Dims& dims, + int axis, const OpConverterParams* params, + ITensorProxyPtr* output, + std::optional op_instance = std::nullopt); + + // Helper function to add a squeeze op to the network. + // + // The input_dims argument stores the TRT dimensions of the input tensor, + // where the dimensions to be squeezed are replaced by 0. + Status SqueezeTensor(ITensorProxyPtr input, std::vector* input_dims, + const OpConverterParams* params, ITensorProxyPtr* output, + std::optional op_instance = std::nullopt); + + // Creates an IConstantLayer using 'weights' whose dimensions are specified by + // 'dims', and returns the output ITensor. 
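+  // Editor's note: illustrative sketch only, not part of the upstream header.
+  // E.g. materializing captured weights as a tensor input for a TRT layer that
+  // only accepts tensors; `weights` and `dims` are supplied by the caller:
+  //
+  //   ITensorProxyPtr const_tensor =
+  //       converter->CreateConstantLayer(weights, dims);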
+ ITensorProxyPtr CreateConstantLayer(const TRT_ShapedWeights& weights, + const nvinfer1::Dims& dims); + + // Gets the min and max value in a TRT_ShapedWeights + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const; + + // Constructs a name and passed it to the TensorRT layer to support xprof. + void SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name = "", + std::optional sub_op_instance = std::nullopt, + std::optional origin_node_name = std::nullopt); + + void SetLayerName(nvinfer1::ILayer* layer, absl::string_view main_op_name, + absl::string_view sub_op_name, + std::optional sub_op_instance = std::nullopt); + + std::unordered_map& TensorsMap() { + return trt_tensors_; + } + + bool UseExplicitPrecision() const { return use_explicit_precision_; } + + private: + Converter(TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx); + + Status Init(nvinfer1::ILogger* trt_logger); + + // Verify the provided batch_size is consistent with batch_size_ and update it + // if necessary. + Status MaybeUpdateBatchSize(int batch_size); + + // Add the provided tensor/weights to the map trt_tensors_. + Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input); + + // Get the tensor/weights from trt_tensors_ by 'name'. + Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output); + + // Get the inputs of 'node_def' from trt_tensors_. + Status GetInputs(const NodeDef& node_def, + std::vector* inputs) const; + + // Tensors/weights added during construction of trt_network_. + std::unordered_map trt_tensors_; + + // The TRT builder used to create the network and build the engine. Not owned. + TrtUniquePtrType trt_builder_; + + // The TRT network being built. + TrtUniquePtrType trt_network_; + + // Store the weights added during construction of trt_network_. + TrtWeightStore weight_store_; + + // Store the context. + OpKernelContext* ctx_; + + // During conversion, this table is populated with quantization ranges per + // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT + // quantization ranges. Since TRT only supports symmetric ranges, we will + // store the range as a single float = max(abs(min_range), abs(max_range)). + // Range refers to the floating point values, e.g. min_range = 0.0f, max_range + // = 6.0f for Relu6. + std::unordered_map quantization_ranges_proxy_; + std::unordered_map quantization_ranges_; + + const TrtPrecisionMode precision_mode_; + + const bool use_calibration_; + + // If this is false, all dimensions including the batch dimension are + // set explicitly. + const bool use_implicit_batch_; + + // Batch size of inputs to trt_network_ added by AddInputTensor(). During + // network construction it will update this, use it to verify the batch + // size of all inputs are compatible, and make sure individual TF node is + // acceptable by TRT. + int batch_size_ = -1; + + // Assign a ID to each constant layer we create, so that we can assign a + // unique name to the layer. + int next_constant_layer_id_ = 0; + + // The name of the TRTEngineOp node. + absl::string_view engine_name_; + + // Indicates whether to use explicit precision in TensorRT (Q/DQ support). 
+ bool use_explicit_precision_; + + friend class ConverterTest; + friend class OpConverterTest; +}; + +// Converts a TensorFlow tensor to TRT shaped weights. +Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, + TRT_ShapedWeights* weights); + +// Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' +// (which doesn't contain the batch dimension). +// +// If validation_only is true, it doesn't do the conversion but only do some +// minimum validation for the eligibility of the conversion, and *tensor will +// be set to nullptr. +// If validation_only is false converter must not be nullptr. +Status PrepareTensorForShape( + Converter* converter, const TRT_TensorOrWeights& input, + const DimsAdapter& dims, const bool validation_only, + ITensorProxyPtr* tensor, const NodeDef& node_def, + std::optional op_instance = std::nullopt, + std::optional origin_node_name = std::nullopt); + +// Return OK if the broadcast scheme is supported and compute the shapes after +// broadcasting. check_feasibility can be set to false in cases where dimensions +// do not need to match exactly (as in the case of BatchMatMulV2). +Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r, + const bool check_feasibility, + const bool use_implicit_batch, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims); + +template +using OperationMap = std::unordered_map; + +// Map from Tensorflow operation names to TensorRT unary operations. +using UnaryOperationMapType = OperationMap; +const UnaryOperationMapType* UnaryOperationMap(); + +// Map from Tensorflow boolean operation names to TensorRT unary operations. +const UnaryOperationMapType* UnaryBooleanOperationMap(); + +// Map of all supported ActivationTypes. +using ActivationTypeMapType = OperationMap; +const ActivationTypeMapType* ActivationTypeMap(); + +// Map from Tensorflow binary operation names to TensorRT binary operations +// types. +using BinaryOperationMapType = OperationMap; +const BinaryOperationMapType* BinaryOperationMap(); + +// Map from Tensorflow boolean binary operation names to TensorRT binary +// operations types. +const BinaryOperationMapType* BinaryBooleanOperationMap(); + +template +absl::InlinedVector GetOperationNames(const T& set) { + absl::InlinedVector result; + absl::c_transform(set, std::back_inserter(result), + [](const auto x) { return x.first; }); + return result; +} + +// Adds a matrix multiplication operation to the TensorRT graph. The "params" +// pointer is only used to access the TRT network builder. The inputs and +// parameters for the op are fully specified by input_[a|b] and transpose_[a|b]. 
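+// Editor's note: illustrative sketch only, not part of the upstream header.
+// E.g. a MatMul-style converter forwarding its two inputs; the transpose flags
+// shown here are placeholders chosen for the example:
+//
+//   auto out = ConvertMatMulImpl(params, params->inputs.at(0),
+//                                params->inputs.at(1),
+//                                /*transpose_a=*/false, /*transpose_b=*/true);
+//   TF_RETURN_IF_ERROR(out.status());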
+StatusOr ConvertMatMulImpl(const OpConverterParams* params, + TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, + bool transpose_a, bool transpose_b); + +Status ApplyBroadcast(std::unique_ptr& operand, + const DimsAdapter& broadcasted_dims, + const OpConverterParams* params, + std::optional op_instance); + +std::string convert_range_error_msg(float start, float limit, float delta); +std::string convert_range_expected_msg(const NodeDef& node_def); +std::string bool_weight_error_msg(const NodeDef& node_def); +std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, + const NodeDef& node_def, int idx = 0); +std::string then_else_dtypes_error_msg(nvinfer1::DataType type_then, + nvinfer1::DataType type_else, + const NodeDef& node_def); +std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + const NodeDef& node, + bool then_vs_else = false); +std::string batch_size_error(absl::string_view name, absl::string_view comment); + +inline bool find_name(const string& name, const std::vector names) { + return std::find(names.begin(), names.end(), name) != names.end(); +} + +Status check_type(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, const NodeDef& node_def, + int idx = 0); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h new file mode 100644 index 00000000..2a265cf7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +class LoggerRegistry { + public: + virtual Status Register(const string& name, nvinfer1::ILogger* logger) = 0; + virtual nvinfer1::ILogger* LookUp(const string& name) = 0; + virtual ~LoggerRegistry() {} +}; + +LoggerRegistry* GetLoggerRegistry(); + +class RegisterLogger { + public: + RegisterLogger(const string& name, nvinfer1::ILogger* logger) { + TF_CHECK_OK(GetLoggerRegistry()->Register(name, logger)); + } +}; + +#define REGISTER_TENSORRT_LOGGER(name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(__COUNTER__, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(ctr, name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) \ + static ::tensorflow::tensorrt::RegisterLogger register_trt_logger##ctr \ + TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::tensorrt::RegisterLogger(name, logger) + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter.h new file mode 100644 index 00000000..7ebaaeb1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter.h @@ -0,0 +1,224 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class Converter; + +// Specifies the expected type taken by a TRT_TensorOrWeights input during op +// conversion. +// kResource is only used for resource variable ops. For an operation like +// Add(tensor, ReadVariableOp(...)), the second operand of Add is the result of +// the ReadVariableOp, which is a kWeight. +enum class TrtInputArg { kTensor = 1, kWeight = 2, kBoth = 3, kResource = 4 }; + +// Parameters for each op converter. +struct OpConverterParams { + // Constructor used for validation only. 
+ OpConverterParams(const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store, + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); + + // Constructor used for conversion. + OpConverterParams(Converter* converter, const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store); + + Converter* converter = nullptr; + const NodeDef& node_def; + const std::vector& inputs; + std::vector* outputs; + const bool validation_only; + TrtWeightStore* weight_store; + const TrtPrecisionMode precision_mode; + const bool use_calibration; + const bool use_implicit_batch; + const bool use_explicit_precision; +}; + +// Operation converter function specification. +using OpConverter = std::function; + +struct InputArgSpec { + absl::string_view name; + TrtInputArg allowed_roles; + + static constexpr InputArgSpec Create(absl::string_view n, TrtInputArg role) { + return InputArgSpec{n, role}; + } +}; + +template +std::string convert_not_supported_dtype_msg(const T& allowed_types, + DataType tf_type, + const NodeDef& node) { + string allowed_types_string = + absl::StrJoin(allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + + return absl::StrCat("Data type ", DataTypeString(tf_type), + " is not supported for ", node.op(), ", must be one of [", + allowed_types_string, "]"); +} + +std::string convert_not_supported_implicit(const std::string& pOpName, + const std::string& pNodeName, + const char* pOpType = NULL); + +// A Curiously recurring template pattern (CRTP) template class for operation +// converters. +template +class OpConverterBase { + public: + explicit OpConverterBase(const OpConverterParams* params, + const std::vector& data_types = + {DataType::DT_FLOAT, DataType::DT_HALF}) + : params_(params), + node_def_attrs_(params->node_def), + allowed_dtypes_(data_types) {} + + // Default NodeDef attribute name to inspect in order to determine node data + // type. The Impl class can override this by implementing the same function. + static constexpr const char* NodeDefDataTypeAttributeName() { return "T"; } + + // Validate data type of the given NodeDef against allowed types. + Status ValidateNodeDefDataType() { + // If the attribute name is empty, we should skip this check. + if (absl::string_view(Impl::NodeDefDataTypeAttributeName()).empty()) { + return OkStatus(); + } + + // Get the NodeDef data type. + auto dtype = GetAttrValue(Impl::NodeDefDataTypeAttributeName()); + if (!dtype.ok()) { + return errors::InvalidArgument("Attribute with name ", + Impl::NodeDefDataTypeAttributeName(), + " not found."); + } + + // Check allowed data types.; + if (std::find(allowed_dtypes_.begin(), allowed_dtypes_.end(), *dtype) == + allowed_dtypes_.end()) { + return errors::Unimplemented(convert_not_supported_dtype_msg( + allowed_dtypes_, *dtype, params_->node_def)); + } + return OkStatus(); + } + + static constexpr bool HasFixNumberOfInputs() { return true; } + + // Validates input argument roles and data types. 
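+  // Editor's note: illustrative sketch only, not part of the upstream header.
+  // A derived converter typically pairs this check with an InputSpec() along
+  // these lines (the argument names and container type are assumptions for the
+  // example):
+  //
+  //   static std::vector<InputArgSpec> InputSpec() {
+  //     return {InputArgSpec::Create("input", TrtInputArg::kTensor),
+  //             InputArgSpec::Create("weights", TrtInputArg::kWeight)};
+  //   }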
+ Status ValidateInputs() { + const NodeDef& node_def = params_->node_def; + const auto& inputs = params_->inputs; + if (Impl::HasFixNumberOfInputs()) { + TRT_ENSURE(inputs.size() == Impl::InputSpec().size()); + } else { + TRT_ENSURE(inputs.size() <= Impl::InputSpec().size()); + } + for (int i = 0; i < inputs.size(); i++) { + const InputArgSpec arg_spec = Impl::InputSpec()[i]; + if (arg_spec.allowed_roles == TrtInputArg::kWeight && + inputs.at(i).is_tensor()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a constant, at ", + node_def.name()); + } + if (arg_spec.allowed_roles == TrtInputArg::kTensor && + inputs.at(i).is_weights()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a tensor, at ", + node_def.name()); + } + } + return OkStatus(); + } + + Status operator()() { + // Validate data type and inputs. + TF_RETURN_IF_ERROR(this->ValidateNodeDefDataType()); + TF_RETURN_IF_ERROR(this->ValidateInputs()); + + // Perform op-level validation. + TF_RETURN_IF_ERROR(reinterpret_cast(this)->Validate()); + if (params_->validation_only) { + return OkStatus(); + } + + // Perform conversion. + return reinterpret_cast(this)->Convert(); + } + + protected: + Status NotSupportedInImplicitBatch(const char* pOpType = nullptr) { + if (params_->use_implicit_batch) { + const auto& op = params_->node_def.op(); + const auto& nodeName = params_->node_def.name(); + const auto& error = convert_not_supported_implicit(op, nodeName, pOpType); + return errors::Unimplemented(error); + } + return OkStatus(); + } + + void AddOutput(const TRT_TensorOrWeights& out) { + params_->outputs->push_back(out); + } + + template + StatusOr GetAttrValue(absl::string_view key) const { + T result; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def_attrs_, key, &result)); + return result; + } + + const OpConverterParams* const params_; + const AttrSlice node_def_attrs_; + const std::vector allowed_dtypes_; +}; + +// Constructs and returns a converter function for a given operation converter +// class T. This requires T to be a derived class of StructuredOpConverter. +template +OpConverter MakeConverterFunction() { + return [](const OpConverterParams* params) -> Status { + T converter(params); + return converter(); + }; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h new file mode 100644 index 00000000..8780aa68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ + +#include +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class OpConverterRegistry { + public: + OpConverterRegistry(); + ~OpConverterRegistry() = default; + + InitOnStartupMarker Register(const string& name, const int priority, + OpConverter converter); + + InitOnStartupMarker Register(const std::initializer_list& names, + const int priority, OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + template ::value>::type* = nullptr> + InitOnStartupMarker Register(const T& names, const int priority, + OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + // Clear all registered converters for the given Tensorflow operation name. + void Clear(const std::string& name); + + StatusOr LookUp(const string& name); + + std::vector ListRegisteredOps() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +OpConverterRegistry* GetOpConverterRegistry(); + +class RegisterOpConverter { + public: + RegisterOpConverter(const string& name, const int priority, + OpConverter converter) { + GetOpConverterRegistry()->Register(name, priority, converter); + } +}; + +constexpr int kDefaultConverterPriority = 1; + +} // namespace convert +} // namespace tensorrt + +#define REGISTER_TRT_OP_CONVERTER_IMPL(ctr, func, priority, ...) \ + static ::tensorflow::InitOnStartupMarker const \ + register_trt_op_converter##ctr TF_ATTRIBUTE_UNUSED = \ + TF_INIT_ON_STARTUP_IF(true) \ + << tensorrt::convert::GetOpConverterRegistry()->Register( \ + __VA_ARGS__, priority, func) + +#define REGISTER_TRT_OP_CONVERTER(func, priority, ...) \ + TF_NEW_ID_FOR_INIT(REGISTER_TRT_OP_CONVERTER_IMPL, func, priority, \ + __VA_ARGS__) + +#define REGISTER_DEFAULT_TRT_OP_CONVERTER(func, ...) \ + REGISTER_TRT_OP_CONVERTER( \ + func, tensorrt::convert::kDefaultConverterPriority, __VA_ARGS__) + +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h new file mode 100644 index 00000000..f31af032 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h @@ -0,0 +1,715 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/statusor.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { + +namespace convert { + +// Facilitates the creation of TensorRT layers inside a network. The user +// provides a INetworkDefinition pointer during construction. They can then add +// operations to the network through the provided functions. Each function +// returns a struct which contains the symbolic result of the operation (ITensor +// pointer) as well as a pointer to the last TensorRT ILayer created. Some +// operations may create multiple layers in order to accomplish the desired +// result (e.g. Sign). +class TRTNetworkBuilder { + public: + static StatusOr Create( + nvinfer1::INetworkDefinition* network, TrtWeightStore* weight_store) { + TRT_ENSURE(network); + TRT_ENSURE(weight_store); + return TRTNetworkBuilder(network, weight_store); + } + + private: + TRTNetworkBuilder(nvinfer1::INetworkDefinition* network, + TrtWeightStore* weight_store) + : network_(network), weight_store_(weight_store) {} + + public: + // Adds an Add operation to the network. + StatusOr Add(nvinfer1::ITensor* lhs, + nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUM); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise min(lhs, rhs) operation to the network. The output has + // the same data type as the input. + StatusOr Min(nvinfer1::ITensor* lhs, + nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMIN); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise max(lhs, rhs) operation to the network. The output has + // the same datatype as the input. + StatusOr Max(nvinfer1::ITensor* lhs, + nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMAX); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an absolute value operation to the network. Note that this unary + // operation will do an implicit float conversion. For int32 tensors, use + // "AbsInt". + StatusOr AbsFloat(nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() != nvinfer1::DataType::kFLOAT && + input->getType() != nvinfer1::DataType::kHALF); + nvinfer1::IUnaryLayer* layer = + network_->addUnary(*input, nvinfer1::UnaryOperation::kABS); + TRT_ENSURE(layer); + return layer; + } + + // Performs Abs without implicit float conversion. The input should be of type + // kInt32. For float datatypes, use "Abs". 
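+  // (Sketch of the approach used below: the result is input * SignInt(input),
+  // which keeps the tensor in kINT32 end to end; float tensors should instead
+  // go through the unary kABS path in AbsFloat above.)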
+ StatusOr AbsInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() == nvinfer1::DataType::kINT32); + StatusOr sign = this->SignInt(input); + return this->Mul(input, (*sign)->getOutput(0)); + } + + // Returns elementwise sign(x) for int32 input tensors where sign(x) is + // defined as 1 where x > 0, -1 where x < 0 and 0 where x == 0. + StatusOr SignInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + + // Create constants +1 and -1. + StatusOr one = + this->Constant(1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one); + + StatusOr neg_one = + this->Constant(-1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(neg_one); + + // Turn all negaitve elements into -1, positive and zero elements + // unaffected. + StatusOr max = + this->Max(input, (*neg_one)->getOutput(0)); + TRT_ENSURE_PTR_OK(max); + + // Turn all positive elements into +1, negative and zero elements + // unaffected. + StatusOr min = + this->Min((*max)->getOutput(0), (*one)->getOutput(0)); + TRT_ENSURE_PTR_OK(min); + return min; + } + + // Adds a Sub operation to the network. + StatusOr Sub(nvinfer1::ITensor* lhs, + nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUB); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Greater operation to the network. + StatusOr Greater( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kGREATER); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Equal operation to the network. + StatusOr Equal( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kEQUAL); + TRT_ENSURE(layer); + return layer; + } + + // Adds a FloorDiv operation to the network. + StatusOr FloorDiv( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kFLOOR_DIV); + TRT_ENSURE(layer); + return layer; + } + + // Returns the equivalent of ceil_divide(abs(x)/abs(y))) operation. The inputs + // "lhs" and "rhs" should be int32 tensors. + StatusOr AbsCeilDivInt( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + TRT_ENSURE(lhs->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(rhs->getType() == nvinfer1::DataType::kINT32); + + StatusOr rhs_abs = this->AbsInt(rhs); + TRT_ENSURE_PTR_OK(rhs_abs); + StatusOr lhs_abs = this->AbsInt(lhs); + TRT_ENSURE_PTR_OK(lhs_abs); + StatusOr add1 = + this->Add((*lhs_abs)->getOutput(0), (*rhs_abs)->getOutput(0)); + TRT_ENSURE_PTR_OK(add1); + StatusOr one_const = + this->Constant(1, rhs->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one_const); + StatusOr numerator = + this->Sub((*add1)->getOutput(0), (*one_const)->getOutput(0)); + TRT_ENSURE_PTR_OK(numerator); + return FloorDiv((*numerator)->getOutput(0), (*rhs_abs)->getOutput(0)); + } + + // Adds an elementwise multiplication operation to the network. 
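+  // (This is ElementWiseOperation::kPROD; for example, AbsInt above uses it to
+  // form input * sign(input), and CumulativeProd below chains it pairwise.)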
+ StatusOr Mul(nvinfer1::ITensor* lhs, + nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kPROD); + TRT_ENSURE(layer); + return layer; + } + + // Adds a sequence of elementwise multiplication operations to the network. + // The returned layer's output contains the cumulative elementwise product of + // all tensors in the input. + StatusOr CumulativeProd( + absl::Span inputs) noexcept { + TRT_ENSURE(!absl::c_any_of( + inputs, [](nvinfer1::ITensor* x) { return x == nullptr; })); + nvinfer1::ILayer* out = nullptr; + if (inputs.size() == 1) { + out = network_->addIdentity(*inputs[0]); + TRT_ENSURE(out != nullptr); + return out; + } + nvinfer1::ITensor* last = inputs[0]; + for (int i = 1; i < inputs.size(); i++) { + StatusOr mul = this->Mul(last, inputs[i]); + TRT_ENSURE_PTR_OK(mul); + out = *mul; + last = (*mul)->getOutput(0); + } + return out; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. + StatusOr ConstantShape( + const DimsAdapter& shape_data) noexcept { + TRT_ENSURE(shape_data.NumDims() > 0); + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = shape_data.NumDims(); + StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + absl::c_copy(shape_data, const_weights->GetPointer()); + StatusOr trt_dims = const_weights->Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(*trt_dims, const_weights->GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + return const_layer; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. + StatusOr Constant( + const std::vector& data) noexcept { + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = data.size(); + StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + int32* values = const_weights->GetPointer(); + for (int i = 0; i < data.size(); i++) { + values[i] = static_cast(data[i]); + } + StatusOr trt_dims = const_weights->Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(*trt_dims, const_weights->GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor of shape "shape", + // type "data_type" and filled with value "scalar". 
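+  // For example (illustrative sketch only, `builder` being a previously
+  // created TRTNetworkBuilder):
+  //
+  //   nvinfer1::Dims dims;
+  //   dims.nbDims = 2;
+  //   dims.d[0] = 1;
+  //   dims.d[1] = 4;
+  //   auto ones = builder->Constant(1.0f, dims, nvinfer1::DataType::kFLOAT);
+  //
+  // would add a 1x4 FP32 constant filled with 1.0 and return the resulting
+  // IConstantLayer wrapped in a StatusOr.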
+ template + StatusOr Constant( + const T value, nvinfer1::Dims shape, + nvinfer1::DataType data_type) noexcept { + StatusOr const_weights = + weight_store_->GetTempWeights(data_type, shape); + TRT_ENSURE_OK(const_weights); + TRT_ENSURE(const_weights->SetValues(value).ok()); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(shape, const_weights->GetTrtWeights()); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor with a single value "scalar". + // The tensor has "nb_dims" dimensions and each dimension has only one + // element. The data type of the tensor is determined by the data type of + // "scalar". + template ::value>::type* = nullptr> + StatusOr Constant(const T scalar, + const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims zero_shape; + zero_shape.nbDims = nb_dims; + std::fill_n(zero_shape.d, nb_dims, 1); + return Constant(scalar, zero_shape, data_type); + } + + // Adds a Constant layer from a TRT_ShapedWeights object. + StatusOr WeightsToConstant( + const nvinfer1::Weights& weights, const DimsAdapter& dims) noexcept { + StatusOr vol = dims.Volume(); + TRT_ENSURE_OK(vol); + TRT_ENSURE(*vol == weights.count); + StatusOr trt_dims = dims.AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(*trt_dims, weights); + TRT_ENSURE(const_layer); + return const_layer; + } + + Status get_tensor4TensorOrWeights(const TRT_TensorOrWeights& input, + ITensorProxyPtr* pTensor) { + if (input.is_weights()) { + StatusOr const_layer = WeightsToConstant( + input.weights().GetTrtWeights(), input.GetTrtDims()); + if (!const_layer.status().ok()) return const_layer.status(); + *pTensor = (*const_layer)->getOutput(0); + } else { + *pTensor = input.tensor(); + } + return OkStatus(); + } + + // Creates a nvinfer1::Weights object containing a single scalar. + template ::value>::type* = nullptr> + StatusOr ScalarWeights(const T scalar, + const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims weights_shape; + weights_shape.nbDims = nb_dims; + std::fill_n(weights_shape.d, nb_dims, 1); + StatusOr const_weights = + weight_store_->GetTempWeights(data_type, weights_shape); + TRT_ENSURE_OK(const_weights); + const_weights->GetPointer()[0] = scalar; + return const_weights->GetTrtWeights(); + } + + // Adds a TensorRT Slice operation to the network. + StatusOr Slice( + nvinfer1::ITensor* input, const nvinfer1::Dims& begin, + const nvinfer1::Dims& size, const nvinfer1::Dims& stride) noexcept { + nvinfer1::ISliceLayer* layer = + network_->addSlice(*input, begin, size, stride); + TRT_ENSURE(layer); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. + StatusOr Concat( + absl::Span inputs, const int axis) { + for (nvinfer1::ITensor* input : inputs) { + TRT_ENSURE(input); + } + nvinfer1::IConcatenationLayer* layer = network_->addConcatenation( + inputs.data(), static_cast(inputs.size())); + TRT_ENSURE(layer); + layer->setAxis(axis); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. 
+ StatusOr Concat( + const std::vector& inputs, const int axis) { + return this->Concat(absl::MakeSpan(inputs), axis); + } + + // Adds a TensorRT Shape operation, which determines the runtime shape of the + // input tensor, to the network. + StatusOr Shape(nvinfer1::ITensor* input) { + TRT_ENSURE(input); + nvinfer1::IShapeLayer* layer = network_->addShape(*input); + TRT_ENSURE(layer); + return layer; + } + + // Creates a Gather operation on the shape of the input tensor. The output of + // the gather operation is a 1D shape tensor where output[i] = (!sub_one ? + // input_shape[i] : input_shape[i] -1) if i is in "indices", otherwise zero. + StatusOr GetPartialShapeOf( + nvinfer1::ITensor* input, absl::InlinedVector indices, + bool sub_one = false) { + TRT_ENSURE(input); + TRT_ENSURE(indices.size() <= nvinfer1::Dims::MAX_DIMS); + + // Get the runtime shape of input; + StatusOr shape_layer = this->Shape(input); + TRT_ENSURE_PTR_OK(shape_layer); + nvinfer1::ITensor* runtime_shape = (*shape_layer)->getOutput(0); + + if (sub_one) { + StatusOr ones = this->Constant(1, 1); + TRT_ENSURE_PTR_OK(ones); + StatusOr sub = + this->Sub(runtime_shape, (*ones)->getOutput(0)); + TRT_ENSURE_PTR_OK(sub); + runtime_shape = (*sub)->getOutput(0); + } + + // Create a constant tensor containing the gather indices. + // For any dim not in "indices", we mark it size to gather a zero. + const int input_nb_dims = input->getDimensions().nbDims; + std::vector indices_all(input_nb_dims, input_nb_dims); + for (auto idx : indices) { + TRT_ENSURE(idx < input_nb_dims); + indices_all[idx] = idx; + } + + StatusOr indices_result = + this->Constant(indices_all); + TRT_ENSURE_PTR_OK(indices_result); + nvinfer1::ITensor* gather_indices = (*indices_result)->getOutput(0); + TRT_ENSURE(gather_indices->getDimensions().nbDims == 1); + TRT_ENSURE(gather_indices->getType() == nvinfer1::DataType::kINT32); + + // Append a zero to the shape tensor. + StatusOr zero_result = + this->Constant(std::vector{0}); + TRT_ENSURE_PTR_OK(zero_result); + std::array cat_inputs = { + runtime_shape, (*zero_result)->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat_layer = + network_->addConcatenation(cat_inputs.data(), cat_inputs.size()); + TRT_ENSURE(cat_layer); + nvinfer1::ITensor* gather_input = cat_layer->getOutput(0); + TRT_ENSURE(gather_input); + + // Finally, gather the indices from the input. + nvinfer1::IGatherLayer* gather = + network_->addGather(*gather_input, *gather_indices, 0); + TRT_ENSURE(gather); + return gather; + } + + // Adds a scale layer that uniformly scales the input tensor by the specified + // amount. 
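+  // (Sketch of the semantics: this maps to an IScaleLayer in kUNIFORM mode
+  // whose shift and power weights are left empty, so the output is simply
+  // input * scale.)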
+ StatusOr AddUniformScale(nvinfer1::ITensor* input, + float scale, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + StatusOr weight = this->ScalarWeights(scale, 1); + TRT_ENSURE_OK(weight); + const nvinfer1::Weights empty_weights = + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IScaleLayer* scale_layer = + network_->addScale(*input, nvinfer1::ScaleMode::kUNIFORM, empty_weights, + (*weight), empty_weights); + TRT_ENSURE(scale_layer != nullptr); + scale_layer->setName(name.c_str()); + TRT_ENSURE((*scale_layer).getPower().count == 0); + TRT_ENSURE((*scale_layer).getShift().count == 0); + TRT_ENSURE((*scale_layer).getScale().count == 1); + return scale_layer; + } + + StatusOr AddFill(const TRT_TensorOrWeights& value_input, + const TRT_TensorOrWeights& dims_input, + bool is_value_static, bool is_dims_static, + int nbDims, + const nvinfer1::Dims& trt_dims, + ITensorProxyPtr scalar_tensor = nullptr, + ITensorProxyPtr beta_tensor = nullptr, + const float delta = 0) { + // TensorRT IFillLayer requires a rank 0 scalar. + nvinfer1::Dims scalar_dims; + scalar_dims.nbDims = 0; + if (is_value_static) { + StatusOr const_layer = + WeightsToConstant(value_input.weights().GetTrtWeights(), scalar_dims); + if (!const_layer.status().ok()) return const_layer.status(); + scalar_tensor = (*const_layer)->getOutput(0); + } else { + if (scalar_tensor == nullptr) { + StatusOr shuffler_layer = + Reshape(value_input.tensor()->trt_tensor(), scalar_dims); + if (!shuffler_layer.status().ok()) return shuffler_layer.status(); + scalar_tensor = (*shuffler_layer)->getOutput(0); + } + } + + if (beta_tensor == nullptr) { + nvinfer1::Dims beta_shape{1, {nbDims}}; + StatusOr const_layer = + Constant(delta, beta_shape, value_input.TrtDType()); + TF_RETURN_IF_ERROR(const_layer.status()); + beta_tensor = (*const_layer)->getOutput(0); + } + + nvinfer1::IFillLayer* layer = + network_->addFill(trt_dims, nvinfer1::FillOperation::kLINSPACE); + TRT_ENSURE(layer); + if (!is_dims_static) { + layer->setInput(0, *dims_input.tensor()->trt_tensor()); + } + layer->setInput(1, *scalar_tensor->trt_tensor()); + layer->setInput(2, *beta_tensor->trt_tensor()); + return layer; + } + + // Adds a quantization layer that uniformly scales the input tensor + // by the given multiplicative "scaling_factor", then rounds + // (round-to-nearest-ties-to-even) to the nearest integer and clamps in the + // range of [-128, 127]. + StatusOr Quantize(nvinfer1::ITensor* input, + const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + // Preprocessor usage here is unavoidable because TRT8 API is new. +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + // The TensorRT IQuantizeLayer divides by the scale factor rather than + // multiplies. To be consistent, in this function we expect a multiplicative + // scale factor, so we take the reciprical. 
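+    // Concretely, with a multiplicative factor s the layer is handed 1/s, so
+    // it computes roughly
+    //   q = clamp(round(x / (1/s)), -128, 127) = clamp(round(x * s), -128, 127),
+    // matching the contract described in the comment above.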
+ StatusOr scaling_const = + this->Constant(1.0f / scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + (*scaling_const)->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IQuantizeLayer* quant_layer = + network_->addQuantize(*input, *(*scaling_const)->getOutput(0)); + TRT_ENSURE(quant_layer); + quant_layer->setAxis(1); + return quant_layer; +#else + StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kINT8); + (*result)->setPrecision(nvinfer1::DataType::kFLOAT); + return result; +#endif + } + + // Adds a dequantize layer that casts the input tensor to TensorRT float type + // and scales it uniformly by the given multiplicative "scaling_factor". + StatusOr Dequantize(nvinfer1::ITensor* input, + const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + StatusOr scaling_const = + this->Constant(scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + (*scaling_const)->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IDequantizeLayer* dequant_layer = + network_->addDequantize(*input, *(*scaling_const)->getOutput(0)); + dequant_layer->setAxis(1); + TRT_ENSURE(dequant_layer); + return dequant_layer; +#else + StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kFLOAT); + (*result)->setPrecision(nvinfer1::DataType::kINT8); + return result; +#endif + } + + // Adds TensorRT Q/DQ operations. This is for explicit precision mode. + StatusOr UniformQuantizeDequantizeExplicit( + nvinfer1::ITensor* input, float quantize_scale, float dequantize_scale, + const std::string& name) { + TRT_ENSURE(input); + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(network_->hasExplicitPrecision()); + } + TRT_ENSURE(IS_TRT_VERSION_GE(7, 1, 0, 0)); + + static int count = 0; + TRT_ENSURE(input->getType() == nvinfer1::DataType::kFLOAT); + std::string quant_name = absl::StrCat(input->getName(), "_quant_", count); + + StatusOr quant = + this->Quantize(input, quantize_scale, quant_name); + TRT_ENSURE_PTR_OK(quant); + + std::string dequant_name = + absl::StrCat(input->getName(), "_dequant_", count); + StatusOr dequant = this->Dequantize( + (*quant)->getOutput(0), dequantize_scale, dequant_name); + TRT_ENSURE_PTR_OK(dequant); + + count++; + return dequant; + } + + StatusOr Reshape(nvinfer1::ITensor* input, + const nvinfer1::Dims& new_shape) { + TRT_ENSURE(input); + nvinfer1::IShuffleLayer* layer = network_->addShuffle(*input); + TRT_ENSURE(layer); + layer->setReshapeDimensions(new_shape); + return layer; + } + + StatusOr FindProducerOf(const nvinfer1::ITensor* tensor) { + const char* name = tensor->getName(); + const int num_layers = network_->getNbLayers(); + for (int i = 0; i < num_layers; i++) { + nvinfer1::ILayer* layer = network_->getLayer(i); + const int num_outputs = layer->getNbOutputs(); + for (int j = 0; j < num_outputs; j++) { + nvinfer1::ITensor* t = layer->getOutput(j); + if (std::string(t->getName()) == name) { + return layer; + } + } + } + return errors::NotFound("could not find producing layer of ", name); + } + + StatusOr UniqueParentOf(const nvinfer1::ILayer* layer, + int input_idx = 0) { + return FindProducerOf(layer->getInput(input_idx)); + } + + nvinfer1::INetworkDefinition* Network() { return network_; } + + private: + nvinfer1::INetworkDefinition* network_; + TrtWeightStore* weight_store_; +}; + +class ShuffleBuilder 
{ + private: + explicit ShuffleBuilder(TRTNetworkBuilder* builder, nvinfer1::ITensor* input) + : builder_(builder) { + layer_ = builder->Network()->addShuffle(*input); + } + + public: + static StatusOr Create(TRTNetworkBuilder* builder, + nvinfer1::ITensor* input) { + TRT_ENSURE(builder != nullptr); + TRT_ENSURE(input != nullptr); + return ShuffleBuilder(builder, input); + } + + ShuffleBuilder& SetReshape(const nvinfer1::Dims& dims) { + layer_->setReshapeDimensions(dims); + return *this; + } + + ShuffleBuilder& SetReshape(nvinfer1::ITensor* shape) { + layer_->setInput(1, *shape); + return *this; + } + + ShuffleBuilder& SetFirstTranspose(const nvinfer1::Permutation& perm) { + layer_->setFirstTranspose(perm); + return *this; + } + + ShuffleBuilder& SetSecondTranspose(const nvinfer1::Permutation& perm) { + layer_->setSecondTranspose(perm); + return *this; + } + + StatusOr Output() { + TRT_ENSURE(layer_ != nullptr); + TRT_ENSURE(layer_->getOutput(0) != nullptr); + return layer_->getOutput(0); + } + + private: + TRTNetworkBuilder* builder_; + nvinfer1::IShuffleLayer* layer_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h new file mode 100644 index 00000000..280dc1e7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +constexpr std::array kQuantizationOpNames = { + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", + "FakeQuantWithMinMaxArgs", +}; + +// Operations with supported conversion to Q/DQ ops in TensorRT explicit +// precision mode. +constexpr std::array kExplicitQuantizationOpNames = { + "QuantizeAndDequantizeV2", +}; + +// Contains two scaling factors for quantization and dequantization +// respectively. A shift factor is omitted as TensorRT only supports symmetric +// quantization. +template +struct QuantizationScales { + std::array quantize_scale; + std::array dequantize_scale; +}; + +// In TensorRT 7 and 8, only uniform tensor scaling is supported for +// activations. 
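+// As an illustration (values hypothetical), symmetrically mapping activations
+// in [-6.0f, 6.0f] onto int8 would use quantize_scale = {127.0f / 6.0f} and
+// dequantize_scale = {6.0f / 127.0f}.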
+using UniformQuantizationScales = QuantizationScales; + +// Per-channel scaling is supported for weights in TensorRT version >= 8.0. +template +using PerChannelQuantizationScales = QuantizationScales; + +template +std::ostream& operator<<(std::ostream& os, + const QuantizationScales& scales) { + os << absl::StrFormat("QuantizationScales[quantize={%s},dequantize={%s}]", + absl::StrJoin(scales.quantize_scale, ","), + absl::StrJoin(scales.dequantize_scale, ",")); + return os; +} + +// Returns true if the Tensorflow node is a quantize and dequantize operation. +bool IsQuantizeAndDequantizeOp(const Node*); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/slice_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/slice_ops.h new file mode 100644 index 00000000..4dd281ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/ops/slice_ops.h @@ -0,0 +1,70 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_SLICE_OPS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_SLICE_OPS_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/strided_slice_op.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { +using SliceDims = absl::InlinedVector; + +// Creates a strided slice operation using the given information. This function +// expects that the begin, stride, and end vectors have already been validated. +// This function converts the [begin:stride:end] specification to the TensorRT +// [begin:stride:size] ISliceLayer specification. The following algorithm is +// used to perform this conversion: 1) The given (input_dims, +// [begin:stride:end]) specification is dividied into +// "static dimensions" and "dynamic dimensions". "Dynamic dimensions" +// includes all dimensions of the slice where input_dims[i] == -1. +// 2a) If there are no dynamic dimensions, then the "begin", "stride", and +// "size" variables are passed to the ISLiceLayer creation as build-time +// constants in the form of nvinfer1::Dims objects. +// 2b) If there are any dynamic dimensions, then the "begin", "stride", and +// "size" variables are treated as runtime dynamic shape Tensors in the +// TensorRT graph. In this case, we must calculate "size" at runtime for all +// dynamic dimensions, while static dimensions use the constant values. +// +// Note that when any dynamic indices are present (2b), the "strided_slice_spec" +// must be specified. 
This structure can be obtained through the +// "tensorflow::ValidateStridedSliceOp" function, or it can be constructed +// directly. When the ValidateStridedSliceOp helper function is used, it will +// also return the "begin", "stride", and "end" vectors. When all dimensions are +// static (2a), the "strided_slice_spec" variable is not required. +// +// If the "final_shape" variable is specified, then a reshape operation will be +// added to the graph to achieve this shape. The shape must be fully specified. +// +// "op_instance" is only required if the caller needs to pass this variable +// through to the Converter functions optionally accept it (SetLayerName, +// PrepareTensorForShape). +Status ConvertStridedSliceHelper( + const OpConverterParams* params, const TRT_TensorOrWeights& input, + const PartialTensorShape& input_dims, const SliceDims& begin, + const SliceDims& stride, const SliceDims& end, + std::optional final_shape = std::nullopt, + std::optional op_instance = std::nullopt, + std::optional strided_slice_spec = std::nullopt); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_SLICE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h new file mode 100644 index 00000000..4d43b1d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h @@ -0,0 +1,70 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/statusor.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// A registry for holding serialized TensorRT autotuner timing caches. +// For TensorRT versions < 8.0, the timing cache is not serializable, so these +// operations become no-ops. +class TimingCacheRegistry { + public: + TimingCacheRegistry() = default; + ~TimingCacheRegistry() = default; + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + using TimingCache = nvinfer1::ITimingCache; + using TimingCachePtr = std::unique_ptr; +#else + struct TimingCache {}; + using TimingCachePtr = std::unique_ptr; +#endif + + // Insert or update a registry into the map using the given name. The cache + // will be serialized before being placed into the map. + void Upsert(const string& name, TimingCache* cache); + + // Find a timing cache using the given name. 
The provided BuilderConfig is + // used to deserialize the cache. If no timing cache is found, a new timing + // cache is returned. + StatusOr LookUp(const string& name, + nvinfer1::IBuilderConfig* builder_config); + + private: + using SerializedTimingCache = std::vector; + + mutex mu_; + std::unordered_map map_; +}; + +TimingCacheRegistry* GetTimingCacheRegistry(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h new file mode 100644 index 00000000..e91b3cd8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h @@ -0,0 +1,69 @@ +/* Copyright 20121 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. 
+#endif + +namespace tensorflow { +namespace tensorrt { +namespace convert { +class TRTLayoutOptimizationPass : public grappler::CustomGraphOptimizer { + public: + TRTLayoutOptimizationPass(const string& name = "TRTLayoutOptimizationPass"); + + string name() const override { return name_; }; + + bool UsesFunctionLibrary() const override { return true; } + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* optimized_graph) override; + + /* void PrintDebugInfo(grappler::Cluster* cluster, + const grappler::GrapplerItem& item); + */ + + private: + const string name_; + string trt_logger_name_; + int minimum_segment_size_; + bool is_dynamic_op_; + int max_cached_batches_; + int64_t max_workspace_size_bytes_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h new file mode 100644 index 00000000..abc3bdce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -0,0 +1,87 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. 
+#endif + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class TRTOptimizationPass : public grappler::CustomGraphOptimizer { + public: + struct ConversionParams { + string trt_logger_name = "DefaultLogger"; + size_t max_batch_size = -1; + size_t max_workspace_size_bytes = 1 << 30; + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + int minimum_segment_size = 3; + // Whether to create engine on conversion or execution time + bool is_dynamic_op = false; + // maximum number of cached engines + int max_cached_engines = 1; + bool use_calibration = true; + bool use_implicit_batch = true; + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + bool allow_build_at_runtime = true; + bool use_explicit_precision = false; + }; + + TRTOptimizationPass(const string& name = "TRTOptimizationPass") + : name_(name) {} + + string name() const override { return name_; }; + + bool UsesFunctionLibrary() const override { return true; } + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + const string name_; + + ConversionParams params_; + + std::vector batches_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h new file mode 100644 index 00000000..3f44bb5f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tensorrt { + +// The PrecisionMode controls the precision used in TRT converted parts of the +// model. Setting PrecisionMode other than FP32 enables TensorRT to select +// lower-precision implementations when searching for the fastest kernels. +// +// For regularized models whose input dynamic range is approximately one, this +// typically produces significant speedups with negligible change in accuracy. +// There is additional complexity when working with INT8, see Calibration. +// +// - FP32 +// - FP16 Enable FP16 layer selection, with FP32 fallback. +// - INT8 Enable Int8 layer selection, with FP32 and FP16 fallback. +// +// Note that TensorRT will still choose a higher-precision kernel if it results +// in overall lower runtime, or if no low-precision implementation exists. 
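+//
+// The helpers declared after the enum convert between the enum and its string
+// spelling; assuming the canonical spellings match the enumerator names, a
+// sketch of typical use is:
+//   TrtPrecisionMode mode;
+//   TF_RETURN_IF_ERROR(TrtPrecisionModeFromName("FP16", &mode));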
+enum class TrtPrecisionMode { FP32, FP16, INT8 }; + +Status TrtPrecisionModeToName(const TrtPrecisionMode mode, string* name); + +Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode); + +string DebugString(const TrtPrecisionMode mode); + +// Optimization profile generation strategies. +// - `kRange`: create one profile that works for inputs with dimension values +// in the range of [min_dims, max_dims] where min_dims and max_dims are +// derived from the provided inputs. +// - `kOptimal`: create one profile for each input. The profile only works for +// inputs with the same dimensions as the input it is created for. The GPU +// engine will be run with optimal performance with such inputs. +// - `kRangeOptimal`: create the profiles for both `Range` and `Optimal`. +// - `kImplicitBatchModeCompatible`: create the profiles that will produce the +// same GPU engines as the implicit_batch_mode would produce. +enum class ProfileStrategy { + kRange, + kOptimal, + kRangeOptimal, + kImplicitBatchModeCompatible, +}; + +string ProfileStrategyToName(const ProfileStrategy strategy); +Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/utils.h new file mode 100644 index 00000000..3e2d54f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -0,0 +1,399 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +#define TFTRT_ERROR(func, ...) 
\ + do { \ + return func("TFTRT::", __FUNCTION__, ":", __LINE__, ": ", __VA_ARGS__); \ + } while (0) + +#define TFTRT_CHECK_SHAPE_TENSOR(tensor) \ + if (!IsTrtShapeTensorCompatible(tensor)) { \ + TFTRT_ERROR(errors::InvalidArgument, "Tensor of type ", \ + DebugString(tensor.dtype()), " having shape ", \ + tensor.shape().DebugString(), " is not TRT compatible"); \ + } + +namespace tensorflow { +namespace tensorrt { + +static constexpr char kCastOutputTypeAttrName[] = "DstT"; + +#if !IS_TRT_VERSION_GE(8, 2, 0, 0) +template +struct TrtDestroyer { + void operator()(T* t) { + if (t) t->destroy(); + } +}; +template +using TrtUniquePtrType = std::unique_ptr>; +#else +template +using TrtUniquePtrType = std::unique_ptr; +#endif + +// Define a hash function for vector because it is used as the key +// for the engine cache. +struct VectorTensorShapeHasher { + std::size_t operator()(const std::vector& key) const { + return std::hash()(TensorShapeUtils::ShapeListString(key)); + } +}; + +using absl::StrAppend; +using absl::StrCat; + +// This utility template converts an arithmetic type to a string. This function +// is necessary to allow the following function to behave recursively: +// `string DebugString(const std::vector&)`. +template ::value, CType>::type> +string DebugString(const CType& el) { + string el_str = std::to_string(el); + // Prettify std::to_string which can sometimes returns 1.50000 instead of 1.5. + // In short it removes trailing 0s in a string-formatted number. + el_str.erase(el_str.find_last_not_of('0') + 1, std::string::npos); + return el_str; +} +// This utility template converts nested vectors to a string for debug purposes. +template +string DebugString(const std::vector& vector) { + string tmp_s = ""; + for (const auto el : vector) { + StrAppend(&tmp_s, StrCat(DebugString(el), ", ")); + } + return StrCat("{", tmp_s.substr(0, tmp_s.length() - 2), "}"); +} +string DebugString(const nvinfer1::Dims& dims); +string DebugString(const nvinfer1::DataType trt_dtype); +string DebugString(const DataType tf_type); +string DebugString(const nvinfer1::Permutation& permutation, int len); +string DebugString(const ITensorProxyPtr& tensor); +string DebugString(const nvinfer1::ITensor& tensor); +string DebugString(const std::vector& dimvec); +string DebugString(const std::vector& shapes); +string DebugString(const std::vector& shapes); + +template +string DebugString(const absl::InlinedVector& data) { + return absl::StrCat("[", absl::StrJoin(data, ","), "]"); +} + +inline bool HasStaticShape(const nvinfer1::Dims& dims) { + if (dims.nbDims < 0) return false; + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] < 0) return false; + } + return true; +} + +template +bool HasStaticShape(const T& dims) { + return !absl::c_any_of(dims, [](int i) { return i < 0; }); +} + +// Returns whether a shape is compatible with a TRT shape tensor. +template +inline bool IsTrtShapeTensorCompatible(const TensorShapeType& shape) { + return ( + shape.dims() == 0 || + (shape.dims() == 1 && shape.num_elements() <= nvinfer1::Dims::MAX_DIMS)); +} + +// Returns whether a TF tensor could be interpreted as a TRT shape tensor. +inline bool IsTrtShapeTensorCompatible(const Tensor& tensor) { + return tensor.dtype() == DT_INT32 && + IsTrtShapeTensorCompatible(tensor.shape()); +} + +// Adapts various representations of shape (TF Shape, TRT Dims, plain +// containers) and provides methods for properties (length, volume) and +// conversion between types. 
Note that unlike TF's TensorShape, the underlying +// storage will only contain active dimensions. In the case of scalar shapes, +// `NumDims` is allowed to return 0 or 1, but the `storage_` vector will contain +// 1 element in both cases. In the non-scalar case, `NumDims() == +// storage_.size()`. +class DimsAdapter { + public: + using StorageType = absl::InlinedVector; + + private: + template + using EnableIfNotTensorShapeType = + std::enable_if_t, T>::value>; + + template + using EnableIfInt = std::enable_if_t::value && + std::is_integral::value>; + + public: + //----- Constructors ------ + + // Constructs from an absl::Span. + template + explicit DimsAdapter(absl::Span shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from an absl::Span. + template + explicit DimsAdapter(const std::vector& shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from a TRT dims object. + DimsAdapter(const nvinfer1::Dims& dims) : num_dims_(dims.nbDims) { + absl::c_copy(absl::MakeSpan(dims.d, dims.d + std::max(dims.nbDims, 0)), + std::back_inserter(storage_)); + } + + // Constructs explicitly specifying num_dims and storage data. + DimsAdapter(int32_t num_dims, StorageType data) + : num_dims_(num_dims), storage_(std::forward(data)) {} + + // Constructs from a TensorShape or PartialTensorShape. + template + static StatusOr Create(const TensorShapeBase& shape, + bool ignore_first_dim = false) { + if (shape.dims() > nvinfer1::Dims::MAX_DIMS) + return errors::InvalidArgument("dims of TensorShape exceed MAX_DIMS"); + if (ignore_first_dim && shape.dims() <= 0) + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + if (shape.dims() == -1) { + return DimsAdapter(-1, StorageType{}); + } + if (shape.dims() == 0) { + return DimsAdapter(0, StorageType{1}); + } + auto offt = (ignore_first_dim ? 1 : 0); + return DimsAdapter( + absl::MakeSpan(shape.dim_sizes().begin() + offt, shape.dims() - offt)); + } + + // Constructs from a container. + template > + static StatusOr Create(const InputSequence& shape, + bool ignore_first_dim = false) { + if (ignore_first_dim && shape.size() <= 0) { + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + } + return DimsAdapter( + absl::MakeSpan(shape).subspan(ignore_first_dim ? 1 : 0, shape.size())); + } + + //----- Conversion Utilities ------ + + // Converts to an nvinfers::Dims and assign the result to the object passed + // in via the result pointer. + void TrtDims(nvinfer1::Dims* result) const { + result->nbDims = num_dims_; + absl::c_copy(storage_, static_cast(result->d)); + } + + // Converts to an nvinfer1::Dims and return by value. + nvinfer1::Dims AsTrtDims() const { + nvinfer1::Dims result; + TrtDims(&result); + return result; + } + + // Converts to a TensorShape and assigns the result to the object passed in + // via the shape pointer. + Status TensorShape(TensorShape* shape, + std::optional batch_size = std::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + static_cast(storage_.data()), storage_.size(), shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return OkStatus(); + } + + // Converts to a PartialTensorShape and assigns the result to the object + // passed in via the shape pointer. 
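+  // (As with TensorShape() above, an optional batch_size is inserted as the
+  // leading dimension when provided.)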
+ Status PartialTensorShape( + PartialTensorShape* shape, + std::optional batch_size = std::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + static_cast(storage_.data()), storage_.size(), shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return OkStatus(); + } + + // Copies the dimension values to the vector passed in via the shape pointer. + template > + Status Vector(std::vector* shape) const { + shape->clear(); + absl::c_copy(storage_, std::back_inserter(*shape)); + return OkStatus(); + } + + //----- Property Accessors ------ + + // Returns true if the shape has no dynamic dimensions. + bool IsStatic() const { + return !absl::c_any_of(storage_, [](auto i) { return i < 0; }); + } + + // Returns product of all dimensions. + int64_t Volume() const { + return absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + } + + int32_t NumDims() const { return num_dims_; } + + // Returns true if the shape should be interpreted as a scalar. This follows + // TensorRT conversions: a scalar shape can have NumDims()==1 or NumDims()==0, + // but the underlying storage_ container has a single dimension of size 1. + bool IsScalar() const { + return (num_dims_ == 0 || num_dims_ == 1) && storage_.size() == 1 && + storage_[0] == 1; + } + + // Returns true if the dimension storage is empty. This indicates an empty + // shape in both the scalar and non-scalar case. + bool IsEmpty() const { return storage_.empty(); } + + string DebugString() const { + auto vol = absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + return absl::StrCat("DimsAdapter(num_dims=", num_dims_, ",shape=[", + absl::StrJoin(storage_, ","), "],", "vol=", vol, ")"); + } + + // Returns beginning iterator for the underlying storage. + StorageType::const_iterator begin() const { return storage_.begin(); } + + // Returns ending iterator for the underlying storage. + StorageType::const_iterator end() const { return storage_.end(); } + + // Returns the size of the dimension at `idx`. + StorageType::value_type dim(size_t idx) const { return storage_[idx]; } + + // Returns a references to the dimension at `idx`. + StorageType::value_type& dim(size_t idx) { return storage_[idx]; } + + //----- Non-Const Operators ------ + + DimsAdapter& Append(int32_t dim) { + StatusOr is_scalar = IsScalar(); + if (!is_scalar.ok()) return *this; + num_dims_ = *is_scalar ? 2 : num_dims_ + 1; + storage_.push_back(dim); + return *this; + } + + DimsAdapter& Prepend(std::optional dim) { + if (dim) { + num_dims_ = IsScalar() ? 
2 : num_dims_ + 1; + storage_.insert(storage_.begin(), *dim); + } + return *this; + } + + Status RemoveBatchDimension() { + if (storage_.empty()) + return errors::InvalidArgument( + "attempted to remove batch dim from scalar"); + num_dims_ -= 1; + storage_.erase(storage_.begin()); + return OkStatus(); + } + + //----- Comparison Operators ------ + + bool operator==(const DimsAdapter& rhs) const { + if (rhs.num_dims_ != num_dims_) return false; + for (int i = 0; i < num_dims_; i++) { + if (rhs.storage_[i] != storage_[i]) return false; + } + return true; + } + + bool operator!=(const DimsAdapter& rhs) const { return !(*this == rhs); } + + private: + int32_t num_dims_{0}; + StorageType storage_{}; +}; + +Status GetNetworkInputShapes(const nvinfer1::INetworkDefinition* network, + std::vector* input_shapes); + +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + +// Returns true if an engine built for cached_shapes can also run actual_shapes. +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes); + +// Returns the number of inputs for the engine, which also correspends to the +// number of input tensors for the network. This can differ from the number of +// input bindings, because the number of total input bindings equals the number +// of profiles times the number of engine inputs. +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine); + +// Returns the string representation for the assigned device or the requested +// device of the given node. +absl::string_view GetDeviceName(const Node* node); + +// Returns the ParsedName representation for the assigned device or the +// requested device string of the given node. If the device string is invalid, +// returns std::nullopt. +std::optional GetDeviceParsedName( + const Node* node); + +// If the given two device assignments as compatible, returns the merge of the +// two assignments. Otherwise, returns std::nullopt. +std::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, const DeviceNameUtils::ParsedName& b); +// Similar to the above, except that the second device assignment is represented +// by a string_view. +std::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, absl::string_view b); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/weights.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/weights.h new file mode 100644 index 00000000..20b66e98 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/convert/weights.h @@ -0,0 +1,295 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
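Assuming a TF-TRT build (GOOGLE_CUDA && GOOGLE_TENSORRT) so that nvinfer1 and TensorShape are available, a hedged usage sketch of the DimsAdapter declared above; the helper name is hypothetical and only members declared in this header are called:

#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative only: convert a TF shape (with an explicit batch dimension in
// front) to TRT dims, dropping that batch dimension.
Status ShapeToTrtDimsSketch(const TensorShape& tf_shape, nvinfer1::Dims* out) {
  // Create() validates MAX_DIMS and optionally strips the first dimension.
  auto dims_or = DimsAdapter::Create(tf_shape, /*ignore_first_dim=*/true);
  TF_RETURN_IF_ERROR(dims_or.status());
  const DimsAdapter& dims = *dims_or;
  if (!dims.IsStatic()) {
    return errors::InvalidArgument("expected a static shape, got ",
                                   dims.DebugString());
  }
  *out = dims.AsTrtDims();  // by-value conversion declared above
  return OkStatus();
}

}  // namespace tensorrt
}  // namespace tensorflow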
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight. +class TRT_ShapedWeights { + public: + explicit TRT_ShapedWeights( + nvinfer1::DataType type = nvinfer1::DataType::kFLOAT); + + // Constructs a weights from another weights. + // + // NOTE: this does not copy the underlying buffer but only increase its + // reference count. + TRT_ShapedWeights(const TRT_ShapedWeights& rhs) = default; + + nvinfer1::Weights GetTrtWeights() const; + + const Tensor& GetTensor() const { return tensor_; } + + // Returns a pointer of type const T to the underlying buffer of the tensor. + template + const T* GetPointer() const { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Returns a pointer of type T to the underlying buffer of the tensor. + template + T* GetPointer() { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Fills all the weight values with value. + template + Status SetValues(T value) { + switch (type_) { + case nvinfer1::DataType::kFLOAT: { + float* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + case nvinfer1::DataType::kHALF: { + Eigen::half* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, Eigen::half(value)); + break; + } + case nvinfer1::DataType::kINT32: { + int32* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + default: + return errors::InvalidArgument( + "Unsupported data type ", tensorflow::tensorrt::DebugString(type_)); + } + return OkStatus(); + } + + Status SetShape(DimsAdapter dims); + void SetShapeUnsafe(DimsAdapter dims) { shape_ = std::move(dims); } + + // Returns total number of elements. Returning 0 means either some dim is 0 + // or the number of dims is 0. Note that a TF scalar constant is marked as + // Dims{0, {1}}, and has a count() == 1. + int64_t count() const { return volume_; } + + size_t size_bytes() const; + + string DebugString() const; + + template + absl::Span GetSpan() const { + return absl::Span(tensor_.flat().data(), volume_); + } + + template + std::vector ToVector() const { + auto span = GetSpan(); + return std::vector(span.data(), span.data() + span.size()); + } + + nvinfer1::DataType TrtDType() const { return type_; } + + const DimsAdapter& Shape() const { return shape_; } + DimsAdapter& Shape() { return shape_; } + + private: + // The shape of the weights. Defaults to the empty shape. + DimsAdapter shape_; + + // This creation method is only used by TrtWeightStore, which creates the + // underlying buffer. 
+ static StatusOr CreateWithTensor(nvinfer1::DataType type, + DimsAdapter dims, + Tensor tensor); + + nvinfer1::DataType type_; + + // All weights should be stored inside TrtWeightStore to make sure lifetime of + // all the underlying tensors are available until the engine is built. For + // this reason, tensor_ should never be reassigned to a different value that + // is not already present in the TrtWeightStore. + Tensor tensor_; + // Contains the volume of the weight's shape. + int64_t volume_; + + friend class TrtWeightStore; +}; + +// Container for TRT_ShapedWeights. We need this container because TRT does not +// manage the lifetime of the weights buffer, it only keeps a pointer to it and +// requires that the data referenced by the pointer be available until the +// building of engine is complete. For more information see +// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html +// +// TODO(laigd): consider adding garbage collection to the unused weights. +class TrtWeightStore { + public: + // Gets a TRT_ShapedWeights with 'type' and 'dims'. + StatusOr GetTempWeights(nvinfer1::DataType trt_type, + const DimsAdapter& dims); + + // Gets a TRT_ShapedWeights with the same data type and dimensions as + // 'weights'. + StatusOr GetTempWeights(const TRT_ShapedWeights& weights) { + return GetTempWeights(weights.TrtDType(), weights.Shape()); + } + + private: + // The backend storage of the TRT_ShapedWeights. + std::vector store_; +}; + +// Enumerates the possible types of arguments of a converter. This determines +// what object is contained in TRT_TensorOrWeights, and converters can require +// a specific type for each of their arguments. +enum class TRT_ArgumentType { + TENSOR = 0, + WEIGHTS = 1, + RESOURCE = 2, +}; + +struct OpConverterParams; + +// Represents a TRT-style input to a TF node, it can be either a +// ITensorProxyPtr (representing nvinfer1::ITensor* or SimpleITensor), +// or TRT_ShapedWeights which is compile-time constant. +// +// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument. +class TRT_TensorOrWeights { + public: + TRT_TensorOrWeights() {} + TRT_TensorOrWeights(ITensorProxyPtr); + TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size); + + // Constructs a wrapper for the given ITensor. + // This is used by Converter when building the TRT network, where the ITensor + // is owned by the TRT network being built. See comment for 'trt_tensor_' + // in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1); + + // Creates a SimpleITensor for trt_dtype and trt_dims and takes ownership of + // the object. Constructs a wrapper for the SimpleITensor. This is used by + // TrtNodeValidator to encapsulate the type and shape information for + // validation of graph nodes, and the created ITensor is fake and temporary, + // and should not be used to build any TRT network. See comment for + // 'simple_tensor_' in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims, int batch_size); + + // Constructs a wrapper for the given weights. + explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights); + + // Constructs a wrapper for the given resource handle. 
+ explicit TRT_TensorOrWeights(const ResourceHandle& resource); + + TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs); + + void operator=(const TRT_TensorOrWeights& rhs); + + bool is_tensor() const { + return initialized_ && arg_type_ == TRT_ArgumentType::TENSOR; + } + bool is_weights() const { + return initialized_ && arg_type_ == TRT_ArgumentType::WEIGHTS; + } + bool is_resource() const { + return initialized_ && arg_type_ == TRT_ArgumentType::RESOURCE; + } + + ITensorProxyPtr tensor() const; + + ResourceHandle resource() const; + + ITensorProxyPtr as_tensor(const OpConverterParams* params); + + TRT_ShapedWeights& weights() { + DCHECK(is_weights()); + return weights_; + } + + const TRT_ShapedWeights& weights() const { + DCHECK(is_weights()); + return weights_; + } + + nvinfer1::Dims GetTrtDims() const; + + Status GetTfType(DataType* tf_type) const; + + int batch_size() const { return batch_size_; } + + string DebugString() const; + + nvinfer1::DataType TrtDType() const { + if (arg_type_ == TRT_ArgumentType::RESOURCE) { + VLOG(0) << "Calling TrtDType() with a RESOURCE argument is undefined " + "behavior."; + } + return arg_type_ == TRT_ArgumentType::TENSOR ? tensor_proxy_ptr_->getType() + : weights_.TrtDType(); + } + + private: + void set_batch_size(int batch_size) { batch_size_ = batch_size; } + + // First dimension of the TF tensor (NOT tensor_) that is represented by + // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s + // dimensions (obtained via tensor_->getDimensions()) do not contain the batch + // dimension. For example, when a TF tensor with shape (A,B,C) is represented + // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A. + // + // This requires that all tensors in the subgraph that is converted to a TRT + // engine have the same batch size are represented by the first dimension of + // their shape, and Converter will verify this during conversion. The drawback + // is that currently it cannot convert a graph that doesn't have the batch + // size represented in the shapes or the batch sizes are different. See + // b/118387490 for more details. + // + // If use_implicit_batch is false, batch_size_ is unused and + // tensor_->getDimensions() will contain the entire shape (A,B,C). + // + // tensor_proxy_ptr_ is used when arg_type_ == TENSOR. + ITensorProxyPtr tensor_proxy_ptr_ = nullptr; + int batch_size_ = -1; + + // For DT_RESOURCE arguments (there is no corresponding type in TRT). + // resource_ is used when arg_type_ == RESOURCE. + ResourceHandle resource_; + + // weights_ is used when arg_type_ == WEIGHTS. + TRT_ShapedWeights weights_; + bool initialized_ = false; + TRT_ArgumentType arg_type_ = TRT_ArgumentType::WEIGHTS; + + friend class Converter; +}; +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h new file mode 100644 index 00000000..8976cc6e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h @@ -0,0 +1,94 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
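A hedged sketch of how a converter might branch on the argument kind held by TRT_TensorOrWeights; the function name is illustrative, and only accessors declared above (is_weights()/weights(), is_tensor()/GetTrtDims()/batch_size()) plus the DebugString/StrCat helpers from utils.h are used:

#include "tensorflow/compiler/tf2tensorrt/convert/weights.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {

// Illustrative only: describe one converter input, branching on its kind.
string DescribeInputSketch(const TRT_TensorOrWeights& input) {
  if (input.is_weights()) {
    // Compile-time constant: the weights carry their own shape and count.
    const TRT_ShapedWeights& w = input.weights();
    return StrCat("weights ", w.DebugString(), " with ", w.count(),
                  " elements");
  }
  if (input.is_tensor()) {
    // Runtime tensor: in implicit batch mode the dims exclude the batch dim.
    return StrCat("tensor ", DebugString(input.GetTrtDims()),
                  ", batch_size=", input.batch_size());
  }
  return "resource or uninitialized argument";
}

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow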
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ + +#include + +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +extern const char* kTfTrtPluginVersion; +extern const char* kTfTrtPluginNamespace; + +// A wrapper class for TensorRT plugin. User application should inherit from +// this class to write custom kernels. +class TrtPlugin : public nvinfer1::IPluginV2Ext { + public: + TrtPlugin() { setPluginNamespace(kTfTrtPluginNamespace); } + + TrtPlugin(const void* serialized_data, size_t length) {} + + TrtPlugin(const TrtPlugin& rhs) : namespace_(rhs.namespace_) {} + + int initialize() noexcept override { return 0; } + + void terminate() noexcept override {} + + void destroy() noexcept override { delete this; } + + void setPluginNamespace(const char* plugin_namespace) noexcept override { + namespace_ = plugin_namespace; + } + + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } + + protected: + template + void WriteToBuffer(const T& val, char** buffer) const { + *reinterpret_cast(*buffer) = val; + *buffer += sizeof(T); + } + + template + T ReadFromBuffer(const char** buffer) { + T val = *reinterpret_cast(*buffer); + *buffer += sizeof(T); + return val; + } + + private: + std::string namespace_; +}; + +template +class TrtPluginRegistrar { + public: + TrtPluginRegistrar() { + getPluginRegistry()->registerCreator(creator, kTfTrtPluginNamespace); + } + + private: + T creator; +}; + +#define REGISTER_TFTRT_PLUGIN(name) \ + static ::tensorflow::tensorrt::TrtPluginRegistrar \ + plugin_registrar_##name {} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/segment.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/segment.h new file mode 100644 index 00000000..06a3893d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -0,0 +1,94 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
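The protected WriteToBuffer/ReadFromBuffer helpers above implement a simple append/consume scheme for trivially copyable fields during plugin (de)serialization. A standalone sketch of the same pattern (not part of the header; it uses memcpy rather than reinterpret_cast so the example stays well-defined):

#include <cstring>
#include <iostream>
#include <vector>

// Append a trivially-copyable value to a byte buffer and advance the cursor.
template <typename T>
void WriteToBuffer(const T& val, char** buffer) {
  std::memcpy(*buffer, &val, sizeof(T));
  *buffer += sizeof(T);
}

// Read a trivially-copyable value back and advance the cursor.
template <typename T>
T ReadFromBuffer(const char** buffer) {
  T val;
  std::memcpy(&val, *buffer, sizeof(T));
  *buffer += sizeof(T);
  return val;
}

int main() {
  // Serialize two plugin "attributes" into a flat buffer...
  std::vector<char> storage(sizeof(int) + sizeof(float));
  char* w = storage.data();
  WriteToBuffer(42, &w);
  WriteToBuffer(0.5f, &w);

  // ...and deserialize them in the same order.
  const char* r = storage.data();
  std::cout << ReadFromBuffer<int>(&r) << " " << ReadFromBuffer<float>(&r)
            << "\n";  // 42 0.5
}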
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace segment { + +constexpr char kTftrtOpMaxBatchSizeAttr[] = "_tftrt_op_max_batch_size"; + +struct SegmentOptions { + // This struct holds per graph segmenting parameters. + // Segment must contain at least this many nodes. + int minimum_segment_size = 2; + bool use_implicit_batch = true; + // The maximum batch size used to build the engines in the graph, when + // use_implicit_batch is true. + std::optional maximum_batch_size = std::nullopt; + // When use_implicit_batch is false or when we are building dynamic engines, + // we allow dynamic non-batch dimensions. + bool allow_dynamic_non_batch_dim = false; + // The name of the device to put the segment on. + std::set exclude_node_list; +}; + +struct NodePtrCompare { + bool operator()(const Node* lhs, const Node* rhs) const { + return lhs->name() < rhs->name(); + } +}; + +struct Segment { + Segment() {} + Segment(const ClusterProperty& property, + const std::set& nodes) + : property(property), nodes(nodes) {} + ClusterProperty property; + std::set nodes; +}; + +// Vector of segments, each entry contains a set of node pointers. +using SegmentVector = std::vector; + +// Get the subgraphs of a graph that can be handled by TensorRT. +// +// @param tf_graph Graph of the network. +// @graph_properties is the static graph properties. +// @param candidate_fn A function that returns OK for a Node* if +// that node can be handled by TensorRT. +// @param segments Returns the TensorRT segments/subgraphs. Each entry +// in the vector describes a subgraph by giving a set of the names of +// all the NodeDefs in that subgraph. +// @return the status. +Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, + const std::function& candidate_fn, + const std::function& input_candidate_fn, + const std::function& output_candidate_fn, + const SegmentOptions& options, SegmentVector* segments); + +} // namespace segment +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/union_find.h new file mode 100644 index 00000000..41dd9ff1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -0,0 +1,218 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
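A hedged sketch of populating the SegmentOptions declared above before a SegmentGraph call (the call itself is omitted because it needs a full Graph and grappler::GraphProperties). The node name is a placeholder, and exclude_node_list is assumed to hold node names, since the element type of the std::set was elided in this copy:

#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"

namespace tensorflow {
namespace tensorrt {
namespace segment {

// Illustrative only: options for implicit-batch segmentation.
SegmentOptions MakeOptionsSketch() {
  SegmentOptions options;
  options.minimum_segment_size = 3;          // skip tiny clusters
  options.use_implicit_batch = true;
  options.maximum_batch_size = 8;            // std::optional, only used above
  options.allow_dynamic_non_batch_dim = false;
  options.exclude_node_list.insert("keep_in_tf/node_0");  // placeholder name
  return options;
}

}  // namespace segment
}  // namespace tensorrt
}  // namespace tensorflow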
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ + +#include "absl/types/optional.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/device_name_utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace segment { + +// ClusterBatchSize is a data structure to record the batch size we have seen +// for a cluster during segmentation. +// +// With the help of shape inference, all the dynamic batch sizes are converted +// to a negative integer number. +// If the number is -1, then nothing is known about the dynamic batch size. +// Ideally, we should not put nodes with -1 batch size into the same cluster, +// as they will likely have different batch sizes at runtime. However, we +// currently treat -1 as an equivalent class for simple implementation. We may +// need to revise this if it causes performance issues. +// If the number is strictly less than -1, then it represents a equivalent +// class. It is inferred that all the nodes with the same equivalent class +// (strictly less than -1) shall have the same batch size at runtime. +// +// When constructing clusters for implicit batch mode, we support both +// dynamic batch sizes and static batch sizes. As all the nodes inside the same +// cluster shall have the same batch size at runtime, we restrict nodes inside a +// cluster to either have the same dynamic batch size equivalent class or the +// same static batch size value. +// +// Besides, all the nodes with an annotated max batch size inside the same +// cluster shall have the same annotated max batch size. (It is allowed if +// part or all the nodes inside the cluster doesn't have annotated max batch +// size). Static batch sizes are treated as max batch size annotations. The +// converter max batch size is used for an OP with a dynamic batch size and no +// annotated max batch size. +// +// cluster: a = a1[1,3] + a1[1,3] +// ClusterBatchSize: batch_size_ = 1 +// max_batch_size_ = 1 +// +// cluster: b = b1[-1,3] + b2[-1, 3] +// ClusterBatchSize: batch_size_ = -1 +// max_batch_size_ = null +// +// cluster: c = c1[-2,3] + c2[-2, 3](max_batch_size=100) +// ClusterBatchSize: batch_size_ = -2 +// max_batch_size_ = 100 +// +// When constructing cluster for explicit batch mode, all ClusterBatchSize is +// irrelevant. +// + +class ClusterBatchSize { + public: + ClusterBatchSize(); + + bool operator==(const ClusterBatchSize& other); + bool operator!=(const ClusterBatchSize& other) { return !(*this == other); } + + // Sets the batch size assuming that the object doesn't have a batch size yet: + // A non-negative input representing a static batch size value. + // A negative input representing a dynamic batch size equivalent class. + ClusterBatchSize& SetBatchSize(int batch_size); + bool HasBatchSize() const; + int GetBatchSize() const; + + // Sets the max batch size assuming that the object doesn't have a max batch + // size yet. 
+ ClusterBatchSize& SetMaxBatchSize(int max_batch_size); + std::optional GetOptionalMaxBatchSize() const; + + // Merge `other` into the current ClusterBatchSize if the two are not + // conflicting. Two ClusterBatchSizes are conflicting iff they both have a + // value and their values are different. + bool MergeIfCompatible(const ClusterBatchSize& other); + + // Returns a string for the batch size and the annotated max batch size. + // For the batch size: + // If the object has a static batch size, return a string representing a + // non-negative integer. + // If the object has a dynamic batch size, return a string representing a + // negative integer as an equivalent class. + // If the object doesn't have a batch size yet, return "?". + // For the annotated max batch size: + // If the cluster has annotated max batch size in at least one of the nodes, + // return a string representing the annotated max batch size. Otherwise, + // return "?". + std::string ToString() const; + + private: + ClusterBatchSize& SetBatchSize(const std::optional& batch_size); + ClusterBatchSize& SetMaxBatchSize(const std::optional& batch_size); + + std::optional batch_size_; + std::optional max_batch_size_; +}; + +inline std::ostream& operator<<(std::ostream& os, + const ClusterBatchSize& batch_size) { + return os << batch_size.ToString(); +} + +// Represents the accumulated properties of a cluster during segmentation, +// including information about batch size and device assignment. Clusters shall +// have compatible properties in order to be merged together. +class ClusterProperty { + public: + ClusterProperty() {} + ClusterProperty(const ClusterBatchSize& batch_size, + const DeviceNameUtils::ParsedName& device_name); + + // Returns the batch size of the cluster and compresses the path from this + // object to the root object. + const ClusterBatchSize& BatchSize() const { return batch_size_; } + + // Returns the device name of the cluster and compresses the path from this + // object to the root object. + const DeviceNameUtils::ParsedName& DeviceName() const { return device_name_; } + + Status Merge(const ClusterProperty& other); + + private: + ClusterBatchSize batch_size_; + DeviceNameUtils::ParsedName device_name_; +}; + +// Represents a disjoint set of copyable value with type T and accumulated +// property of the values with type P. Most of the methods in this class are +// side-effecting as they also compress the path from the object to the parent +// of its containing set. +template +class UnionFind { + public: + UnionFind() : size_(1), parent_(nullptr) {} + UnionFind(const T& v, const P& p) + : size_(1), parent_(nullptr), value_(v), property_(p) {} + UnionFind(const T& v, P&& p) + : size_(1), parent_(nullptr), value_(v), property_(p) {} + + // Returns the number of elements in the set and compresses the path from + // this object to the root of the set. + int Size() { return FindRoot()->size_; } + + // Returns the accumulated property of all the elements in the set and + // compresses the path from this object to the root of the set. + const P& Property() { return FindRoot()->property_; } + + // Merges this set with 'other'. This updates the size_ and property_ of the + // set. The size_ and property_ of 'other' becomes inaccessible as only the + // size_ and property_ of the root of the set is accessible. + Status Merge(UnionFind* other); + + // Retrieves the value for the root of the set. + const T& ParentValue() { return FindRoot()->value_; } + + // Returns the value for the object. 
+ const T& Value() const { return value_; } + + private: + // Returns the root object for the set and compresses the path from this + // object to the root object. + UnionFind* FindRoot(); + + int size_; + UnionFind* parent_; + T value_; + P property_; +}; + +template +Status UnionFind::Merge(UnionFind* other) { + UnionFind* a = FindRoot(); + UnionFind* b = other->FindRoot(); + if (a == b) return OkStatus(); + + P merged_property(a->property_); + TF_RETURN_IF_ERROR(merged_property.Merge(b->property_)); + b->parent_ = a; + a->size_ += b->size_; + a->property_ = std::move(merged_property); + return OkStatus(); +} + +template +UnionFind* UnionFind::FindRoot() { + if (!parent_) return this; + // Path compression: update intermediate nodes to point to the root of the + // equivalence class. + parent_ = parent_->FindRoot(); + return parent_; +} + +} // namespace segment +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/trt_convert_api.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/trt_convert_api.h new file mode 100644 index 00000000..bba45add --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/trt_convert_api.h @@ -0,0 +1,129 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ + +#include +#include +#include + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +struct SavedModelBundle; + +namespace tensorrt { + +struct TfTrtConversionParams { + // Corresponds 'workspaceSize' parameter of + // nvinfer1::IBuilderConfig::setMaxWorkspaceSize. +#if IS_TRT_VERSION_GE(8, 4, 0, 0) + // Must use `LLONG_MAX - 512` to avoid overflow during casting. + size_t max_workspace_size_bytes = LLONG_MAX - 512; +#else + size_t max_workspace_size_bytes = 1 << 30; // 1,073,741,824 +#endif + + // Minimum precision used by the TRT Engine. + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + + // The minimum number of nodes required for a subgraph to be replaced by + // TRTEngineOp. Note that many small TRT subgraphs could be detrimental for + // performance, increasing the minimum segment size can help avoid the + // problem. + int minimum_segment_size = 3; + + // Max number of cached TRT engines for dynamic TRT ops (by default we have + // dynamic TRT ops). + int max_cached_engines = 1; + + // Note that calibration is currently not implemented with the C++ converter. 
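UnionFind::Merge and FindRoot above implement union-by-attachment with path compression, gated by ClusterProperty::Merge. The mechanics are easier to see in a property-free standalone sketch (illustrative only, not the template above):

#include <iostream>
#include <numeric>
#include <vector>

// Minimal union-find with path compression, mirroring FindRoot()/Merge().
struct DisjointSets {
  std::vector<int> parent, size;
  explicit DisjointSets(int n) : parent(n), size(n, 1) {
    std::iota(parent.begin(), parent.end(), 0);
  }
  int Find(int x) {
    // Point every node on the path directly at the root.
    if (parent[x] != x) parent[x] = Find(parent[x]);
    return parent[x];
  }
  void Merge(int a, int b) {
    a = Find(a);
    b = Find(b);
    if (a == b) return;
    parent[b] = a;       // b's root now hangs off a's root
    size[a] += size[b];  // only the root's size stays meaningful
  }
};

int main() {
  DisjointSets sets(5);
  sets.Merge(0, 1);
  sets.Merge(1, 2);
  std::cout << sets.size[sets.Find(2)] << "\n";         // 3
  std::cout << (sets.Find(0) == sets.Find(2)) << "\n";  // 1
  std::cout << (sets.Find(3) == sets.Find(4)) << "\n";  // 0
}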
+ // This argument is ignored if precision_mode is not INT8. If set to True, the + // implementation will use the user provided inputs to generate calibration + // data. If set to False, quantization nodes will be expected for every tensor + // in the graph (excluding those which will be fused). If a range is missing, + // an error will occur. Please note that accuracy may be negatively affected + // if there is a mismatch between which tensors TRT quantizes and which + // tensors were trained with fake quantization. + bool use_calibration = true; + + // Whether to enable dynamic shape mode for the TRT engines. It is + // recommended to use_dynamic_shape mode to handle dynamic input shape. + // Enabling dynamic shape mode can also improve the conversion rate of graphs + // with static input shape. + bool use_dynamic_shape = true; + + // In dynamic shape mode we create an engine that can handle various input + // shape ranges. We derive the shape optimization profiles for the TRT engines + // in the graph based on user provided input data and profile_strategy. + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + + // Whether to allow building TRT engines at runtime. If no TensorRT engine can + // be found in cache that can handle the given inputs during runtime, then a + // new TensorRT engine is built at runtime if allow_build_at_runtime=True, + // otherwise native TF is used. We recommend to set this value false and build + // the engine in advance, to avoid runtime overhead. + bool allow_build_at_runtime = true; + + // Record the TRT engine as an attribute of the TRTEngineOp. This is only + // valid when max_cached_engines == 1. Note: the frozen graph together with + // the serialized engines have to be below 2GiB (protobuf size limit). If + // convert_to_static_engine = false, then the converted graph_def only + // contains placeholder TRTEngineOp nodes. + bool convert_to_static_engine = true; +}; + +/** + * Converts the graph with TF-TRT. + * + * Performs TF-TRT conversion and returns the converted GraphDef. If inputs is + * not empty and convert_to_static_engine is requested, we also build the + * engines and convert the engines to static engines. + * + * Arguments: + * - frozen_graph_def input graph, it is assumed to be frozen + * - input_names names of the input tensors + * - output_names names of the output tensors + * - inputs tensors that we will use as input while building the TRT engines + * - conv_params parameters for the TF-TRT conversion + * + * Returns the converted graph_def. + */ +StatusOr ConvertAndBuild( + const GraphDef& frozen_graph_def, const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + const TfTrtConversionParams& conv_params); + +StatusOr ConvertAndBuild( + SavedModelBundle* bundle, + const std::string& signature_key = "serving_default", + const std::vector>& inputs = {}, + const TfTrtConversionParams& conversion_params = TfTrtConversionParams()); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/py_utils.h new file mode 100644 index 00000000..b888dc5d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/py_utils.h @@ -0,0 +1,32 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
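A hedged sketch of driving the frozen-GraphDef overload of ConvertAndBuild declared above. The helper name, tensor names, and parameter choices are placeholders, and the sample inputs are assumed to already match the graph's input signature:

#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative only: convert a frozen graph and build static engines from
// one representative batch of inputs.
StatusOr<GraphDef> ConvertSketch(const GraphDef& frozen_graph_def,
                                 const std::vector<Tensor>& sample_inputs) {
  TfTrtConversionParams params;
  params.precision_mode = TrtPrecisionMode::FP16;
  params.minimum_segment_size = 3;
  params.use_dynamic_shape = true;
  params.profile_strategy = ProfileStrategy::kRange;
  params.convert_to_static_engine = true;  // embeds serialized engines

  // One inner vector per inference call used to build/profile the engines.
  std::vector<std::vector<Tensor>> inputs = {sample_inputs};
  return ConvertAndBuild(frozen_graph_def, /*input_names=*/{"input:0"},
                         /*output_names=*/{"output:0"}, inputs, params);
}

}  // namespace tensorrt
}  // namespace tensorflow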
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ + +#include +#include + +namespace tensorflow { +namespace tensorrt { + +bool IsGoogleTensorRTEnabled(); + +std::vector GetRegisteredOpConverters(); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h new file mode 100644 index 00000000..2812aa06 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h @@ -0,0 +1,73 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ + +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mutex.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +// std::align is not supported, so this function mimic its behavior. +void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space); +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +class TRTBaseAllocator : public nvinfer1::IGpuAllocator { + // Base allocator class so we can have a virtual destructor; + public: + // python wrapper seems to be not happy with an pure virtual destructor; + virtual ~TRTBaseAllocator() = default; +}; + +class TRTDeviceAllocator : public TRTBaseAllocator { + // Allocator implementation wrapping TF device allocators. + public: + TRTDeviceAllocator(Allocator* allocator); + + // TODO(aaroey): base class doesn't have a virtual destructor, work with + // Nvidia to fix it. 
+ virtual ~TRTDeviceAllocator() { + VLOG(1) << "Destroying allocator attached to " << allocator_->Name(); + } + void* allocate(uint64_t size, uint64_t alignment, + uint32_t flags) noexcept override; + void free(void* memory) noexcept override; + + private: + mutex mu_; + Allocator* allocator_; + + // supporting alignment from allocation request requires a map to free; + std::unordered_map mem_map_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h new file mode 100644 index 00000000..b0935afb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +using ::tsl::StatusOr; + +// Creates a TensorRT execution context. +ExecutionContext CreateExecutionContext(nvinfer1::ICudaEngine* cuda_engine); + +// Sets input buffers for TRT from a list of input tensors. The input tensors +// are either defined by ctx or by input_vec. +Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + const int trt_profile_idx, + std::vector& buffers, bool use_implicit_batch, + int num_batch, + const TrtShapeOptimizationProfile& profiles, + OpKernelContext* ctx = nullptr, + const DataVec* input_vec = nullptr); + +// Returns the shape of a binding from TensorRT. +// +// The binding is identified by its binding_index. The batch_size argument is +// ignored if use_implicit_batch==false. The shape is returned in the last +// argument. +Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, + const nvinfer1::IExecutionContext* execution_context, + int binding_index, bool use_implicit_batch, + int batch_size, TensorShape& shape); + +// Defines output buffers for TRT. The buffers are allocated by ctx, if ctx is +// not null. Otherwise it is expected that the outputs DataVec is not null, and +// the Tensors in outputs are already allocated. 
+Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + int trt_profile_idx, std::vector& buffers, + bool use_implicit_batch, int batch_size = 0, + OpKernelContext* ctx = nullptr, + DataVec* outputs = nullptr); + +// Enqueues TensorRT inference job. The batch_size argument is only relevant in +// implicit batch mode. +Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, + std::vector& buffers, cudaStream_t stream, + bool use_implicit_batch, int batch_size = 1); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h new file mode 100644 index 00000000..05b5cefb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// A wrapper for the TensorRT execution context which will destroy the TensorRT +// execution context when the object goes out of scope. +class ExecutionContext : public TrtUniquePtrType { + public: + ExecutionContext(nvinfer1::IExecutionContext* context, bool has_memory) + : TrtUniquePtrType(context), + has_device_memory_(has_memory) {} + static ExecutionContext Create(nvinfer1::ICudaEngine* cuda_engine); + + bool HasDeviceMemory() { return has_device_memory_; } + + private: + bool has_device_memory_; +}; + +}; // namespace tensorrt +}; // namespace tensorflow +#endif +#endif diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h new file mode 100644 index 00000000..1a502c5f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
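A hedged sketch of the per-call sequence the trt_engine_utils.h helpers above support on the explicit-batch/dynamic-shape path: bind inputs, bind outputs, then enqueue. The function name is hypothetical, and the buffer vector is assumed to hold void* binding pointers (the element type was elided in this copy):

#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative only: bind inputs/outputs and enqueue one inference call.
Status RunEngineSketch(nvinfer1::ICudaEngine* engine,
                       nvinfer1::IExecutionContext* context, int profile_idx,
                       const TrtShapeOptimizationProfile& profiles,
                       OpKernelContext* ctx, cudaStream_t stream) {
  std::vector<void*> buffers(engine->getNbBindings(), nullptr);

  TF_RETURN_IF_ERROR(SetTrtEngineInputs(engine, context, profile_idx, buffers,
                                        /*use_implicit_batch=*/false,
                                        /*num_batch=*/0, profiles, ctx));
  TF_RETURN_IF_ERROR(SetTrtEngineOutputs(engine, context, profile_idx, buffers,
                                         /*use_implicit_batch=*/false,
                                         /*batch_size=*/0, ctx));
  return TrtEnqueue(context, buffers, stream,
                    /*use_implicit_batch=*/false);
}

}  // namespace tensorrt
}  // namespace tensorflow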
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +bool isExperimentalFeatureActivated(string feature_name); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h new file mode 100644 index 00000000..2fa22662 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -0,0 +1,102 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/platform/mutex.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +// This class provides a 1 element queue to match TFs push model to +// TRTs pull model for calibration. When TRT implements a means for +// a push calibration This class should be updated accordingly + +// IInt8EntropyCalibrator2 is preferred for TRT 5.1+. +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { + public: + // Construct a calibrator for future calibration. + TRTInt8Calibrator( + const std::unordered_map>& dev_buffers, + int batch_size, string engine_name); + + // Construct a finalized calibrator where we don't need to run calibration any + // more, as the calibration data is provided. + TRTInt8Calibrator(const string& calibration_data); + + ~TRTInt8Calibrator(); + + int getBatchSize() const noexcept override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) noexcept override; + + // Feed calibration data to the calibrator, and return true if the data is + // accepted. Return false if the calibrator has been terminated. + bool setBatch(const std::unordered_map& data, + const cudaStream_t stream); + + // Wait until the last batch is consumed by the calibrator and set done. + void waitAndSetDone(); + + // Notify that calibration is done and future batches provided by setBatch() + // will be ignored. + void setDone(); + + // If not null, calibration is skipped. 
+ const void* readCalibrationCache(std::size_t& length) noexcept override; + + void writeCalibrationCache(const void* ptr, + std::size_t length) noexcept override; + + const string& getCalibrationTableAsString() { return calibration_table_; } + + private: + const int batch_size_; + + // mutex for condition_variable + mutex cond_mtx_; + + // condition variable to implement producer-consumer queue for calibration + condition_variable cond_; + + // Is calibration finished? + bool done_; + + // Map to keep tensorrt input buffers and sizes keyed with buffer names + std::unordered_map> dev_buffers_; + + bool calib_running_; + bool batch_is_set_; + + string engine_name_; + string calibration_table_; +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h new file mode 100644 index 00000000..8002df53 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ + +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// Logger for GIE info/warning/errors +class Logger : public nvinfer1::ILogger { + public: + Logger(string name = "DefaultLogger") : name_(name) {} + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override; + void suppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressAllLoggerMsgs() { suppressedMsg_ = 0; } + static Logger* GetLogger(); + + private: + bool isValidSeverity(nvinfer1::ILogger::Severity severity, + const char* msg = nullptr) noexcept; + const string name_; + unsigned int suppressedMsg_ = 0; +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h new file mode 100644 index 00000000..dbcea12a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -0,0 +1,261 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
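A hedged sketch of the producer side of TRTInt8Calibrator's one-element queue: the op's compute thread pushes device buffers with setBatch() while TRT's builder thread pulls them via getBatch(), and waitAndSetDone() closes the queue after the last batch. The helper name is hypothetical, and the map's value type (void* device pointers) is an assumption since it was elided in this copy:

#include <string>
#include <unordered_map>

#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative only: feed one calibration batch from the TF side.
// `dev_ptrs` maps engine input names to device buffers already holding the
// current batch; TRT consumes them on its own thread via getBatch().
bool FeedCalibrationBatchSketch(
    TRTInt8Calibrator* calibrator,
    const std::unordered_map<std::string, void*>& dev_ptrs,
    cudaStream_t stream, bool last_batch) {
  // Blocks until the previous batch has been consumed; returns false if the
  // calibrator was terminated in the meantime.
  bool accepted = calibrator->setBatch(dev_ptrs, stream);
  if (last_batch) {
    // Let TRT drain the final batch, then mark calibration as done.
    calibrator->waitAndSetDone();
  }
  return accepted;
}

}  // namespace tensorrt
}  // namespace tensorflow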
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/errors.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +template +class LRUCache { + public: + typedef Value value_type; + typedef Key key_type; + typedef HashFunction hasher; + typedef typename std::unordered_map map_type; + typedef typename map_type::iterator iterator; + typedef typename map_type::const_iterator const_iterator; + + LRUCache() : capacity_(0) {} + explicit LRUCache(size_t capacity) : capacity_(capacity) {} + + size_t capacity() const { return capacity_; } + + void reserve(size_t capacity) { + capacity_ = capacity; + DiscardOld(); + } + + size_t size() const { return objects_.size(); } + + size_t count(const key_type& key) const { return objects_.count(key); } + + value_type& at(const key_type& key) { return Touch(key); } + + const_iterator begin() const { return objects_.begin(); } + const_iterator end() const { return objects_.end(); } + + iterator begin() { return objects_.begin(); } + iterator end() { return objects_.end(); } + + template + std::pair emplace(Args&&... args) { + DiscardOld(1); + std::pair result = + objects_.emplace(std::forward(args)...); + key_type key = result.first->first; + if (result.second) { + keys_.push_front(key); + } else { + TouchNoCheck(key); // The key must exist in this case. + } + return result; + } + + private: + std::unordered_map objects_; + std::list keys_; + size_t capacity_; + value_type not_found_value_; + + value_type& Touch(const key_type& key) { + // Check that the key exists, and let it return std::out_of_range error if + // not. + value_type& value = objects_.at(key); + TouchNoCheck(key); + return value; + } + + void TouchNoCheck(const key_type& key) { + auto rank = std::find(keys_.begin(), keys_.end(), key); + if (rank != keys_.begin()) { + keys_.erase(rank); + keys_.push_front(key); + } + } + + // Creates n free positions in cache + void DiscardOld(size_t n = 0) { + DCHECK(capacity_ >= n) << "Insufficient capacity in cache (capacity = " + << capacity_ << ", requested " << n << ")"; + while (objects_.size() > (capacity_ - n)) { + key_type discard_key = keys_.back(); + keys_.pop_back(); + objects_.erase(discard_key); + } + } +}; + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +struct EngineContext { + EngineContext() {} // Creates an empty context. 
+ EngineContext(TrtUniquePtrType&& cuda_engine, + ExecutionContext&& execution_context) + : cuda_engine_(std::move(cuda_engine)) { + execution_contexts.push_back(std::move(execution_context)); + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } + EngineContext(TrtUniquePtrType&& cuda_engine, + std::vector&& execution_contexts) + : cuda_engine_(std::move(cuda_engine)), + execution_contexts(std::move(execution_contexts)) { + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } + + mutex mu; + + nvinfer1::ICudaEngine* GetCudaEngine() { return cuda_engine_.get(); } + + Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx, + bool* has_device_memory) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { + if (idx >= execution_contexts.size()) { + return errors::Internal("Requested engine context with index ", idx, + ", but only ", execution_contexts.size(), + "contexts are present."); + } + *exec_ctx = execution_contexts[idx].get(); + *has_device_memory = execution_contexts[idx].HasDeviceMemory(); + return OkStatus(); + } + + int GetNumContexts() { + mutex_lock lock(mu); + return execution_contexts.size(); + } + + size_t GetDeviceMemorySize() { return device_memory_size_; } + + private: + // Note: declaration has to come before execution_contexts, to ensure proper + // order of destruction. + TrtUniquePtrType cuda_engine_; + + public: + // In explicit batch mode, we maintain a vector of contexts for each engine, + // where each context is created for a specific profile. This is because it is + // either not possible or non-trivial to change the profile of a context for + // the following reasons: + // - To switch profiles (from TRT 7), one must first ensure that all inference + // calls in that context are finished. This would require an additional + // synchronization before we call setOptimizationProfile. To avoid this + // extra sync call, we maintain separate execution context for each profile. + // IExecutionContext object is not thread safe: only one thread should use it + // for inference at a time therefore we need a mutex. More details at + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety + // Additional discussion about execution context management and thread safety + // at https://github.com/tensorflow/tensorflow/issues/36959 + std::vector execution_contexts TF_GUARDED_BY(mu); + + private: + // Until TRT 8.4 ICudaEngine::getDeviceMemorySize() has a non-negligible + // latency. Since its value remains constant, we can cache it. + size_t device_memory_size_; +}; +// Contains the context required to build the calibration data. +class CalibrationContext { + public: + string TerminateCalibration(); + + // Lookup table for temporary staging areas of input tensors for calibration. + std::unordered_map> device_buffers_; + + // Temporary staging areas for calibration inputs. + std::vector device_tensors_; + + std::unique_ptr calibrator_; + TrtUniquePtrType builder_; + TrtUniquePtrType engine_; + // TODO(sami): Use threadpool threads! 
+ std::unique_ptr thr_; + + private: + mutex mu_; + bool terminated_ TF_GUARDED_BY(mu_) = false; + std::string calibration_table_ TF_GUARDED_BY(mu_); +}; + +ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName; + +class TRTEngineCacheResource : public ResourceBase { + public: + // According to the TensorRT API, the logger is considered a singleton by the + // TensorRT library, and multiple instances of IRuntime and/or IBuilder must + // all use the same logger. So here we make it a singleton. + // + // TODO(laigd): use this logger in all places where conversion happens. + static Logger& GetLogger(); + + TRTEngineCacheResource(OpKernelContext* ctx, size_t capacity); + + ~TRTEngineCacheResource() override; + + string DebugString() const override; + + // Returns the EngineContext that is compatible with input_shapes. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const std::vector& input_shapes); + + // Returns the EngineContext that is compatible with profile_id. + // This function should be only called in explicit batch mode where + // cache size is expected to be at most one. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const int profile_id); + + // Keep device allocator for TRT. + std::unique_ptr allocator_; + + // Declare cache after allocator so that it is destroyed before allocator is. + LRUCache, std::unique_ptr, + VectorTensorShapeHasher> + cache_; + + // TODO(hinsu): Use different calibration context for the available shapes and + // attach it to each item of the cache. + std::unique_ptr calib_ctx_; + + // This object maintains all the optimization profiles during profile + // generation and engine build. During runtime the list of profiles is used to + // look up a matching profile for the input data. + TrtShapeOptimizationProfile profiles_; +}; + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +} // namespace tensorrt +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h new file mode 100644 index 00000000..e2d8fdb6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -0,0 +1,351 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
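TRTEngineCacheResource above keys its LRUCache on the vector of input shapes, so one engine context is cached per distinct shape combination. As a simplified, TensorFlow-free illustration of that keying pattern (a flat vector<int64_t> and a trivial hasher stand in for std::vector<TensorShape> and VectorTensorShapeHasher; the LRUCache parameter order is assumed as in the sketch earlier):

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins for the cache key and its hasher.
using ShapeKey = std::vector<int64_t>;

struct ShapeKeyHasher {
  size_t operator()(const ShapeKey& key) const {
    size_t h = 0;
    for (int64_t d : key) h = h * 131 + std::hash<int64_t>()(d);
    return h;
  }
};

// One cached "engine" per distinct input-shape key; a string stands in for
// std::unique_ptr<EngineContext> purely for illustration.
// LRUCache<ShapeKey, std::string, ShapeKeyHasher> cache;
// cache.reserve(4);
// cache.emplace(ShapeKey{1, 224, 224, 3}, "engine built for 1x224x224x3");
// if (cache.count(ShapeKey{1, 224, 224, 3})) { /* reuse the cached engine */ }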
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// Stores optimization profile parameters (min/opt/max of each input shape). +// +// A TensorRT optimization profile describes the possible min/max values of +// each dynamic input shape along with an optimum value. These values are used +// by the TensorRT builder to select the best kernel for the optimum value among +// those kernels that are valid for all input tensors in the [min, max] range. +struct OptimizationProfileConfig { + // Length of vector == 2*num_inputs to engine. min[0:num_inputs-1] are the min + // input dimensions for execution tensors. If engine has shape input tensors, + // then min[num_inputs + i] store the shape value for input i. For inputs that + // are not shape tensors min = opt = max = {0, {}}. + // + // When the OptimizationProfileConfig is created from the network definition + // (AddProfiles), then each elements of the min, opt, max vectors are defined. + // When the OptimizationProfileConfig object is restored during engine + // deserialization (RestoreProfiles), then some inputs can be pruned + // (see TrtShapeOptimizationProfile::is_pruned_input_). In that case min[i] + // is not defined for pruned inputs (same is true for opt and max). + std::vector min; + std::vector opt; + std::vector max; + + string DebugString() const { + using absl::StrCat; + return StrCat("[min: ", tensorflow::tensorrt::DebugString(min), + ", opt: : ", tensorflow::tensorrt::DebugString(opt), + ", max: ", tensorflow::tensorrt::DebugString(max), "]"); + } + + // Sets the min/opt/max dimensions for profile. + // + // The given min/opt/max dimensions should satisfy the condition + // min <= opt <= max. Additionally TRT requires that the min/opt/max values + // are compatible with the network input. Compatibility is defined the + // following way: let dim be the shape of an input binding and min/opt/max the + // corresponding profile dims. TRT requires that dim.d[k] must be -1 if + // (min.d[k] != dim.d[k] || opt.d[k] != dim.d[k] || max.d[k] != dim.d[k]). 
+ // + // Parameters: + // network - TensorRT network, used to enumerate all the input tensors + // profile - on exit the profile information will be set for each input tensor + // input_mask - 1 for TRT inputs, 0 for TF inputs that are not TRT inputs + Status SetDimensions(const nvinfer1::INetworkDefinition* network, + nvinfer1::IOptimizationProfile* profile, + const std::vector& input_mask) const { + int n_inputs_trt = network->getNbInputs(); + int n_inputs_tf = opt.size() / 2; + /// TODO(lsugy): check that the sum of the mask equals n_inputs. + if (input_mask.size() != n_inputs_tf) { + return errors::Internal("Incorrect input mask size: ", input_mask.size()); + } + int n_mask_true = 0; + for (bool mask_val : input_mask) { + if (mask_val) { + n_mask_true++; + } + } + if (n_mask_true != n_inputs_trt) { + return errors::Internal( + "Number of true elements in input_mask (", n_mask_true, + ") doesn't match expected TRT inputs (", n_inputs_trt, ")"); + } + int j = 0; + for (int i = 0; i < n_inputs_tf; i++) { + if (input_mask[i]) { + const ITensorProxyPtr input = network->getInput(j); + const char* name = input->getName(); + if (input->isShapeTensor()) { + int idx = i + n_inputs_tf; + VLOG(2) << "Setting shape values for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[idx]); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, + min[idx].d, min[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, + opt[idx].d, opt[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, + max[idx].d, max[idx].nbDims); + } + VLOG(2) << "Setting input dimensions for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, + min[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, + opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, + max[i]); + + j++; + } + } + return OkStatus(); + } + + // Returns true if profile range completely includes the given shapes. + bool IncludesShapes(const std::vector& shapes, + bool has_shape_tensor, + const std::vector& shape_values, + const std::vector& is_pruned_input, + const std::vector& is_shape_tensor) const { + // min, max, and opt must have the same size which is already verified in + // SetDimensions. + if (min.size() != shapes.size() * 2 || + (has_shape_tensor && min.size() != shape_values.size() * 2)) { + VLOG(2) << "Profile size mismatch min size " << min.size() + << " vs input shapes size " << shapes.size() << " " + << shape_values.size(); + return false; + } + for (int i = 0; i < shapes.size(); i++) { + if (is_pruned_input[i]) { + continue; + } + auto current_shape = shapes[i]; + // min, max, and opt must have the same nbDims, which is already verified + // in SetDimensions. + if (min[i].nbDims != current_shape.dims()) { + return false; + } + // Check if range [min, max] includes current_shape. + for (int dim = 0; dim < current_shape.dims(); dim++) { + if ((min[i].d[dim] > current_shape.dim_size(dim)) || + (max[i].d[dim] < current_shape.dim_size(dim))) { + return false; + } + } + } + // Check shape values. + if (has_shape_tensor) { + int offset = shapes.size(); + for (int i = 0; i < shape_values.size(); i++) { + if (is_pruned_input[i] || !is_shape_tensor[i]) { + continue; + } + auto shape_val = shape_values[i]; + // min, max, and opt must have the same nbDims, which is already + // verified in SetDimensions. 
+ if (min[i + offset].nbDims != shape_val.nbDims) { + return false; + } + // Check if range [min, max] includes shape_val. + for (int dim = 0; dim < shape_val.nbDims; dim++) { + if (min[i + offset].d[dim] > shape_val.d[dim] || + max[i + offset].d[dim] < shape_val.d[dim]) { + return false; + } + } + } + } + return true; + } +}; + +// Manages Optimization profiles during TRT Engine construction. +// +// An optimization profile describes a range of dimensions for each TRT network +// input, and the optimal dimensions that the auto-tuner should use for +// optimization. +// +// This class stores the list of input shapes that were seen during the +// build/profile_generation_mode phase, and using them it creates a set of +// OptimizationProfileConfigs. These configs will be added to IBuilderConfig +// before the engine is created. +class TrtShapeOptimizationProfile { + public: + TrtShapeOptimizationProfile() {} + + // Stores input shape information during profile_generation_mode. + void AddShape(const std::vector& shapes) { + input_shapes_.push_back(shapes); + input_shape_values_.push_back(actual_shape_values_); + VLOG(1) << "Collected shape(s) " << DebugString(shapes) << " for profiles."; + } + + // Stores the input mask. + void SetInputMask(const std::vector& input_mask) { + input_mask_ = input_mask; + } + + // Collects ShapeTensorCompatible tensor values. This is needed both during + // profile_generation_mode and during normal inference calls. + Status CollectShapeValues(OpKernelContext* ctx); + + // Collects ShapeTensorCompatible tensor values, used only for unit tests. + Status CollectShapeValues(const DataVec& input); + + void clear() { profiles_.clear(); } + + // Returns the profile number that should be used to execute the network with + // the given input shapes. Returns -1 if none of cached profiles are + // compatible with the given input shapes. + int GetProfileNumber(const std::vector& shapes); + + // Creates optimization profiles and add them to the builder config. + Status ConfigureBuilder(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + // Creates execution contexts for each optimization profile. + Status CreateExecutionContexts(nvinfer1::ICudaEngine* engine, + std::vector* exec_contexts); + + Status SetInputShapeBinding(int input_index, int binding_index, + nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* exec_context) const; + + // Creates optimization profiles profiles_ for the set of concrete input + // shapes collected in input_shapes_. The input_partial_shapes of the network + // is used to ensure that the created optimization profiles are compatible + // with the network. + void InitProfiles(const std::vector& input_partial_shapes, + ProfileStrategy strategy); + + void InitCalibProfile(const std::vector& shapes); + + // Returns number of created profiles. + int GetNumProfiles() const; + + bool HasShape() const { return !input_shapes_.empty(); } + bool NeedProfiles() const { return need_profiles_; } + + // Restores profiles from the engine (used after deserialization). + Status RestoreProfiles(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + // Whether the network has any shape tensors. + bool HasShapeTensor() const { return has_shape_tensor_; } + + void SetShapeTensorMask(const nvinfer1::INetworkDefinition* network); + + // Whether the optimization profiles describe input that can be handled with + // a static engine (only 1 profile with min=max). 
+ bool IsStaticCompatible() { + return strategy_ == ProfileStrategy::kOptimal && profiles_.size() == 1 +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + && !HasShapeTensor() +#endif + ; + // TODO(tfeher): remove !HasShapeTensor() condition once the + // FixShapeValueProfile workaround is turned off. + } + + private: + // Set of input shape vetors that we collect during profile_generation_mode. + std::vector> input_shapes_; + + // Input shape values that we collect during profile_generation_mode. If the + // tensor is not compatible with a TRT shape tensor then an empty shape is + // stored. + std::vector> input_shape_values_; + + // Shape values present in the current inference call. + std::vector actual_shape_values_; + + // The optimization profiles generated from input_shapes_. + std::vector profiles_; + + // The optimization profile for calibration. + OptimizationProfileConfig calib_profiles_; + + // A TRTEngineOp can have resource inputs. These are treated as constants: + // their value is read during conversion and stored as weights in the TRT + // engine. This means that resource inputs have no corresponding TRT engine + // input, and we do not need to provide profile information for these. The + // input mask helps to identify the TRT inputs, where we need to define + // optimization profiles. + std::vector input_mask_; + + // Whether the network has any shape tensors. Initially we assume that the + // network might have a shape value input. This will be updated when the + // network is created / engine is deserialized. + bool has_shape_tensor_ = true; + + // Whether the network/engine requires optimization profiles. + bool need_profiles_ = false; + + // Whether an input tensor is a shape tensor. + std::vector is_shape_tensor_; + + // Whether a network input was pruned (only in TRT 7). + std::vector is_pruned_input_; + + // Optimization profile generation strategy. + ProfileStrategy strategy_; + + // Adds optimization profiles to the builder config. + Status AddProfiles(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + void SetShapeTensorMask(const nvinfer1::ICudaEngine* engine, int n_inputs); + void SetShapeTensorMask( + const std::vector& input_partial_shapes); + + Status SetPrunedMask(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + void ImplicitBatchModeCompatibleStrategy( + const std::vector>& collected_shapes); + void OptimalStrategy( + const std::vector>& collected_shapes); + Status RangeStrategy( + const std::vector>& collected_shapes); +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h new file mode 100644 index 00000000..5eea183f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h @@ -0,0 +1,458 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
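The per-dimension check inside OptimizationProfileConfig::IncludesShapes above reduces to an interval test: a concrete shape is covered by a profile when every dimension lies in [min, max]. A standalone sketch of that core test, using a simplified Dims stand-in instead of nvinfer1::Dims so it compiles without TensorRT:

#include <vector>

// Simplified stand-in for nvinfer1::Dims, for illustration only.
struct FakeDims {
  int nbDims;
  int d[8];
};

// Returns true if every dimension of `shape` lies in [lo, hi], mirroring the
// inner loop of OptimizationProfileConfig::IncludesShapes.
bool DimsInRange(const FakeDims& lo, const FakeDims& hi,
                 const std::vector<int>& shape) {
  if (lo.nbDims != static_cast<int>(shape.size()) || hi.nbDims != lo.nbDims) {
    return false;
  }
  for (int dim = 0; dim < lo.nbDims; ++dim) {
    if (lo.d[dim] > shape[dim] || hi.d[dim] < shape[dim]) return false;
  }
  return true;
}

// Example: a profile with min {1,224,224} and max {8,224,224} covers batch 1..8:
//   DimsInRange({3, {1, 224, 224}}, {3, {8, 224, 224}}, {4, 224, 224})  -> true
//   DimsInRange({3, {1, 224, 224}}, {3, {8, 224, 224}}, {16, 224, 224}) -> false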
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { + +namespace tensorrt { + +// SimpleITensor implements part of the ITensor interfaces to support the TF-TRT +// validator, as well as some TF-TRT tests. The former use case only utilizes +// the interfaces related to shape and type information. +class SimpleITensor { + public: + SimpleITensor(nvinfer1::DataType trt_dtype, const nvinfer1::Dims& trt_dims) + : trt_dtype_(trt_dtype), trt_dims_(trt_dims) {} + + SimpleITensor() : dynamic_range_min_(0.0f), dynamic_range_max_(0.0f) {} + SimpleITensor(const nvinfer1::Dims& dims) + : trt_dims_(dims), dynamic_range_min_(0.0f), dynamic_range_max_(0.0f) {} + + SimpleITensor(const std::vector& dims) { + trt_dims_.nbDims = dims.size(); + for (int i = 0; i < dims.size(); ++i) { + trt_dims_.d[i] = dims[i]; + } + dynamic_range_min_ = 0.0f; + dynamic_range_max_ = 0.0f; + } + + void setName(const char* name) {} + + const char* getName() const { return ""; } + + void setDimensions(nvinfer1::Dims dimensions) { trt_dims_ = dimensions; } + + nvinfer1::Dims getDimensions() const { return trt_dims_; } + + void setType(nvinfer1::DataType trt_dtype) { trt_dtype_ = trt_dtype; } + + nvinfer1::DataType getType() const { return trt_dtype_; } + + bool isNetworkInput() const { return false; } + + bool isNetworkOutput() const { return false; } + + void setBroadcastAcrossBatch(bool broadcastAcrossBatch) {} + + bool getBroadcastAcrossBatch() const { return false; } + + nvinfer1::TensorLocation getLocation() const { return location_; } + + void setLocation(nvinfer1::TensorLocation location) { location_ = location; } + bool setDynamicRange(float min, float max) { + dynamic_range_max_ = max; + dynamic_range_min_ = min; + return true; + } + + float getDynamicRange() const { + return (std::abs(dynamic_range_min_) + dynamic_range_max_) / 2.f; + } + bool dynamicRangeIsSet() const { return true; } + + void resetDynamicRange() { + dynamic_range_min_ = 0.f; + dynamic_range_max_ = 0.f; + } + float getDynamicRangeMin() const { return dynamic_range_min_; } + + float getDynamicRangeMax() const { return dynamic_range_max_; } + + void setAllowedFormats(nvinfer1::TensorFormats formats) {} + + nvinfer1::TensorFormats getAllowedFormats() const { return 1; } + + bool isShapeTensor() const { return false; } + bool isExecutionTensor() const { return true; } + + private: + nvinfer1::DataType trt_dtype_; + nvinfer1::Dims trt_dims_; + std::string name_; + nvinfer1::TensorLocation location_; + float dynamic_range_min_; + float dynamic_range_max_; +}; + +enum class TensorType : int { kTRT, kSIMPLE }; + +class ITensorProxy { + public: + //! 
ITensor not owned + ITensorProxy(nvinfer1::ITensor* trt_tensor) + : trt_tensor_(trt_tensor), ttype_(TensorType::kTRT) {} + + //! SimpleITensor owned + ITensorProxy(SimpleITensor* simple_itensor) + : simple_tensor_(simple_itensor), ttype_(TensorType::kSIMPLE) {} + + //! SimpleITensor owned + explicit ITensorProxy(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims) + : simple_tensor_(std::unique_ptr( + new SimpleITensor(trt_dtype, trt_dims))), + ttype_(TensorType::kSIMPLE) {} + + //! Variants for testing purposes + ITensorProxy() + : simple_tensor_(std::unique_ptr(new SimpleITensor())), + ttype_(TensorType::kSIMPLE) {} + + explicit ITensorProxy(const nvinfer1::Dims& dims) + : simple_tensor_(std::unique_ptr(new SimpleITensor(dims))), + ttype_(TensorType::kSIMPLE) {} + + explicit ITensorProxy(const std::vector& dims) + : simple_tensor_(std::unique_ptr(new SimpleITensor(dims))), + ttype_(TensorType::kSIMPLE) {} + + bool is_trt_tensor() const { + CHECK(validate()); + return trt_tensor_ != nullptr; + } + + bool is_simple_tensor() const { + CHECK(validate()); + return simple_tensor_ != nullptr; + } + + TensorType ttype() const { return ttype_; } + + nvinfer1::ITensor* trt_tensor() const { + CHECK_NOTNULL(trt_tensor_); + CHECK(ttype_ == TensorType::kTRT); + return trt_tensor_; + } + + SimpleITensor* simple_tensor() const { + CHECK_NOTNULL(simple_tensor_); + CHECK(ttype_ == TensorType::kSIMPLE); + return simple_tensor_.get(); + } + + void setName(const char* name) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setName(name); + case TensorType::kSIMPLE: + return simple_tensor_->setName(name); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + const char* getName() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getName(); + case TensorType::kSIMPLE: + return simple_tensor_->getName(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + void setDimensions(nvinfer1::Dims dimensions) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setDimensions(dimensions); + case TensorType::kSIMPLE: + return simple_tensor_->setDimensions(dimensions); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + nvinfer1::Dims getDimensions() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getDimensions(); + case TensorType::kSIMPLE: + return simple_tensor_->getDimensions(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + void setType(nvinfer1::DataType type) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setType(type); + case TensorType::kSIMPLE: + return simple_tensor_->setType(type); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + nvinfer1::DataType getType() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getType(); + case TensorType::kSIMPLE: + return simple_tensor_->getType(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool isNetworkInput() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->isNetworkInput(); + case TensorType::kSIMPLE: + return simple_tensor_->isNetworkInput(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool isNetworkOutput() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->isNetworkOutput(); + case TensorType::kSIMPLE: + return simple_tensor_->isNetworkOutput(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + void setBroadcastAcrossBatch(bool broadcastAcrossBatch) { + switch (ttype_) { + case TensorType::kTRT: + return 
trt_tensor_->setBroadcastAcrossBatch(broadcastAcrossBatch); + case TensorType::kSIMPLE: + return simple_tensor_->setBroadcastAcrossBatch(broadcastAcrossBatch); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool getBroadcastAcrossBatch() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getBroadcastAcrossBatch(); + case TensorType::kSIMPLE: + return simple_tensor_->getBroadcastAcrossBatch(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + nvinfer1::TensorLocation getLocation() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getLocation(); + case TensorType::kSIMPLE: + return simple_tensor_->getLocation(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + void setLocation(nvinfer1::TensorLocation location) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setLocation(location); + case TensorType::kSIMPLE: + return simple_tensor_->setLocation(location); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool setDynamicRange(float min, float max) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setDynamicRange(min, max); + case TensorType::kSIMPLE: + return simple_tensor_->setDynamicRange(min, max); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool dynamicRangeIsSet() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->dynamicRangeIsSet(); + case TensorType::kSIMPLE: + return simple_tensor_->dynamicRangeIsSet(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + void resetDynamicRange() { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->resetDynamicRange(); + case TensorType::kSIMPLE: + return simple_tensor_->resetDynamicRange(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + float getDynamicRangeMin() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getDynamicRangeMin(); + case TensorType::kSIMPLE: + return simple_tensor_->getDynamicRangeMin(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + float getDynamicRangeMax() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getDynamicRangeMax(); + case TensorType::kSIMPLE: + return simple_tensor_->getDynamicRangeMax(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + float getDynamicRange() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getDynamicRange(); + case TensorType::kSIMPLE: + return simple_tensor_->getDynamicRange(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } +#endif + void setAllowedFormats(nvinfer1::TensorFormats formats) { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->setAllowedFormats(formats); + case TensorType::kSIMPLE: + return simple_tensor_->setAllowedFormats(formats); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + nvinfer1::TensorFormats getAllowedFormats() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->getAllowedFormats(); + case TensorType::kSIMPLE: + return simple_tensor_->getAllowedFormats(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool isShapeTensor() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->isShapeTensor(); + case TensorType::kSIMPLE: + return simple_tensor_->isShapeTensor(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + bool isExecutionTensor() const { + switch (ttype_) { + case TensorType::kTRT: + return trt_tensor_->isExecutionTensor(); + case TensorType::kSIMPLE: + return 
simple_tensor_->isExecutionTensor(); + } + LOG(FATAL) << "Unsupported itensor_ type"; + } + + private: + bool validate() const { + return (trt_tensor_ && !simple_tensor_) || (!trt_tensor_ && simple_tensor_); + } + + // When ITensorProxy represents an ITensor, the ITensor can be either passed + // by the caller via the constructor that takes an ITensor* as parameter, or + // be created as a SimpleITensor. + // + // In the first case, the ITensor pointer is stored in 'tensor_' below, and + // the ITensor itself is not owned by this class. This method is used by + // Converter (e.g. AddInputTensor) and op converters during TRT network + // construction, where the TRT network owns the ITensor. + // + nvinfer1::ITensor* trt_tensor_ = nullptr; // Not owned. + // In the second case, the created SimpleITensor is stored in + // 'simple_itensor_' below and is owned by this class. SimpleITensor is a fake + // implementation of ITensor and is used for testing and by TrtNodeValidator + // to validate the graph nodes. + std::shared_ptr simple_tensor_ = nullptr; + + TensorType ttype_; +}; + +class ITensorProxyPtr { + public: + ITensorProxyPtr(std::nullptr_t) : p_(nullptr) {} + ITensorProxyPtr(ITensorProxy* p) : p_(p) {} + ITensorProxyPtr(nvinfer1::ITensor* p) : p_(new ITensorProxy(p)) {} + ITensorProxyPtr(SimpleITensor* p) : p_(new ITensorProxy(p)) {} + + ITensorProxyPtr() : p_(new ITensorProxy()) {} + ITensorProxyPtr(const nvinfer1::Dims& dims) : p_(new ITensorProxy(dims)) {} + ITensorProxyPtr(const std::vector& dims) : p_(new ITensorProxy(dims)) {} + + std::shared_ptr p_{nullptr}; + ITensorProxy* operator->() { return p_.get(); } + ITensorProxy* operator->() const { return p_.get(); } + ITensorProxy* operator*() { return p_.get(); } + ITensorProxy* operator*() const { return p_.get(); } +}; + +inline bool operator==(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { + if (p1.p_ == nullptr) { + return p2.p_ == nullptr; + } + if (p2.p_ == nullptr) { + return p1.p_ == nullptr; + } + return (p1->ttype() == p2->ttype()) && + ((p1->ttype() == TensorType::kTRT && + p1->trt_tensor() == p2->trt_tensor()) || + (p1->ttype() == TensorType::kSIMPLE && + p1->simple_tensor() == p2->simple_tensor())); +} + +inline bool operator!=(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { + return !(p1 == p2); +} + +struct ITensorProxyHash { + size_t operator()(const ITensorProxyPtr& tensor) const { + return reinterpret_cast(tensor.p_.get()); + } +}; + +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h new file mode 100644 index 00000000..e0b9a036 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h @@ -0,0 +1,183 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
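ITensorProxyPtr above is a small shared-pointer-like wrapper, and its operator== compares the identity of the wrapped tensor rather than the proxy object or its contents. A short usage sketch; the int element type of the dims vector is an assumption (the template arguments are not visible in this diff text), and the snippet relies on the headers added above plus NvInfer.h:

// Assumed element type: int.
std::vector<int> dims = {1, 28, 28};

ITensorProxyPtr a(dims);   // backed by an owned SimpleITensor
ITensorProxyPtr b = a;     // shares the same underlying ITensorProxy
ITensorProxyPtr c(dims);   // a different SimpleITensor with identical dims

bool same = (a == b);      // true: same wrapped SimpleITensor
bool diff = (a == c);      // false: equality is identity of the wrapped tensor,
                           // not structural equality of the dimensions
a->setType(nvinfer1::DataType::kFLOAT);
int nb = a->getDimensions().nbDims;   // 3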
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include +#include +#include + +#include +#include +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" +#include "tensorflow/core/framework/node_def.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { +// Creates a node with the given op, inputs, and attributes. +NodeDef MakeNodeDef(const std::string& name, const std::string& op, + const std::vector& inputs, + const std::map attrs = {}); + +// Creates a constant node with the given name and values arranged in the given +// shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals, + const TensorShape& shape) { + Scope s = Scope::NewRootScope(); + Tensor t = test::AsTensor(vals, shape); + auto const_op = ops::Const(s.WithOpName(name), t); + return const_op.node()->def(); +} + +// Creates a constant node with the given name and values, assuming a 1-D shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals) { + TensorShape shape; + const std::vector shape_dims = {static_cast(vals.size())}; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape)); + return MakeConstNodeDef(name, vals, shape); +} + +// Creates an nvinfer1::Dims struct from the given vector. +nvinfer1::Dims CreateDims(const std::vector& d); + +// A gmock matcher that check that elements of a float vector match to a given +// tolerance. +::testing::Matcher> ArrayFloatNear( + const std::vector& values, float max_abs_error = 1e-5, + bool nan_sensitive = false); + +// nvinfer1::Dims gMock matchers + +// matches nvinfer1::Dims to initializer list or vector of ints +// Example: EXPECT_THAT(my_dims, DimsAreArray({1, 2, 3})) +MATCHER_P(DimsAreArrayHelper, array_value, + absl::StrFormat("%s [%s]", negation ? "are" : "are not", + ::testing::PrintToString(array_value))) { + if (arg.nbDims != array_value.size()) return false; + for (int i = 0; i < arg.nbDims; ++i) { + if (arg.d[i] != array_value[i]) { + return false; + } + } + return true; +} +using DimsAreArray = DimsAreArrayHelperMatcherP>; + +// nvinfer1::INetworkDefinition gMock matchers + +// Checks that layer names are equal to initializer list or vector of strings. +// Example: EXPECT_THAT(my_network, LayerNamesAreArray({"conv1", "conv2"})) +MATCHER_P(LayerNamesAreArrayHelper, array_value, + absl::StrFormat("layer names %s [%s]", negation ? 
"are" : "are not", + ::testing::PrintToString(array_value))) { + if (array_value.size() != arg->getNbLayers()) return false; + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} +using LayerNamesAreArray = + LayerNamesAreArrayHelperMatcherP>; + +// Checks layer names are all non-empty. +MATCHER(LayerNamesNonEmpty, "") { + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} + +// TRT_ShapedWeights gMock matchers. + +// Checks that the weight dimensions are values are equal to the given values. +// Example: EXPECT_THAT(my_weights, +// ShapedWeightsHasDimsAndValues({1, 2},{1.0f, 2.0f})) +MATCHER_P2(ShapedWeightsHasDimsAndValuesHelper, dims_vec, expected_values, "") { + DimsAdapter dims(dims_vec); + if (arg.Shape() != dims) { + return false; + } + if (arg.count() != expected_values.size()) { + return false; + } + using T = typename decltype(expected_values)::value_type; + const T* actual_values = arg.template GetPointer(); + for (int i = 0; i < expected_values.size(); ++i) { + if (expected_values[i] != actual_values[i]) { + return false; + } + } + return true; +} + +template +using ShapedWeightsHasDimsAndValues = + ShapedWeightsHasDimsAndValuesHelperMatcherP2, + std::vector>; + +// std::vector convenience utilities. + +// Creates a new vector by casting all values of the given InCType vector to +// OutCType. +template +std::vector CastVector( + const gtl::ArraySlice& vals) { // non-absl ok + std::vector res(vals.size()); + std::transform(vals.begin(), vals.end(), res.begin(), + [](const InCType in_val) -> OutCType { + return static_cast(in_val); + }); + return res; +} + +// Creates a new vector of the given size and fills it with an increasing +// sequence starting from the given start_value using std::iota. +template +std::vector CreateVectorIota(int size, CType start_value = CType(0)) { + std::vector res(size); + std::iota(res.begin(), res.end(), start_value); + return res; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/const_analysis.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/const_analysis.h new file mode 100644 index 00000000..ea7d9eb8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/const_analysis.h @@ -0,0 +1,51 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_CONST_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_CONST_ANALYSIS_H_ + +#include + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Backwards dataflow analysis that finds nodes in a graph that must be +// compile-time constants for us to be able to lower the graph to XLA. +// +// The indices of the arguments to `graph` that must be constant are returned in +// `compile_time_const_arg_indices`, if `compile_time_const_arg_indices` is not +// null. +// +// The ids of the nodes in `graph` that must be constant are returned in +// `compile_time_const_nodes`, if `compile_time_const_nodes` is not null. +// +// If `edge_filter` is non-null, only propagate const-ness along edges for which +// `edge_filter` returns true. +absl::Status BackwardsConstAnalysis( + const Graph& g, std::vector* compile_time_const_arg_indices, + std::vector* compile_time_const_nodes, + FunctionLibraryRuntime* flib_runtime, + std::function edge_filter_input = nullptr); + +// Given an op kernel and function library runtime, return all the indices of +// inputs that need to be compile time constant. +absl::Status GetCompileTimeConstInputs(const OpKernel* op_kernel, + std::vector* const_input_idxs, + FunctionLibraryRuntime* flib_runtime); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_CONST_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/frontend_attributes_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/frontend_attributes_util.h new file mode 100644 index 00000000..2f8436fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/frontend_attributes_util.h @@ -0,0 +1,36 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ + +#include + +#include "absl/types/optional.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Return the FrontendAttributes stored in the AttrSlice if there are some. +// +// Return an InvalidArgument error if some attributes are present but +// cannot be parsed. +absl::StatusOr> +GetFrontendAttributesFromAttrSlice(const AttrSlice& attrs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FRONTEND_ATTRIBUTES_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_cond.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_cond.h new file mode 100644 index 00000000..e37555b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_cond.h @@ -0,0 +1,291 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ + +#include + +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" +#include "xla/status_macros.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Functionalize all the switch-merge nodes of a loop-free graph into If +// nodes. That is, attempt to transform every remaining switch and merge nodes +// in the graph into If nodes. +// +// If `node_filter` is defined, then only conditions for whose nodes +// `node_filter` returns true are functionalized. +// +// Preconditions: +// a) Same as for `FunctionalizeControlFlow` (see comment there). +// b) While loops must have been functionalized before according to +// `node_filter` (e.g., by calling `FunctionalizeWhileLoop` with the same +// filter before calling this function). +absl::Status FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library, + const NodeFilter& node_filter = {}); + +// Internal functions/classes exposed for testing purposes. +namespace functionalize_cond { + +// All nodes are assumed to be either in no branch, then branch, else branch, +// or both branches (such as merge nodes). +// The code below relies on Else and Then being 0 and 1 (corresponding to the +// switch outputs). Both and Neither are arbitrary. +enum class BranchType { + kElseBranch = 0, + kThenBranch = 1, + kBoth = 2, + kNeither = 3, +}; + +// When we keep track of which switch/merge node's feed into a node, we record +// 1) predicate for non-dead switch node, +// 2) the switch node itself for dead switch node, +// 3) the merge node itself for merge node. +// Case 1) is an optimization. With this optimization, if there are nodes from +// different switch nodes but those switch nodes have the same predicate, the +// nodes will still have same AncestorState, and they will be clustered into a +// single "If". +struct AncestorNode { + enum class AncestorNodeType { + kPred = 0, + kSwitch = 1, + kMerge = 2, + }; + + OutputTensor output_tensor; + AncestorNodeType type; + + // Compare two AncestorNodes by (node id, index, type). + bool operator<(const AncestorNode& other) const; + bool operator==(const AncestorNode& other) const; + + struct Hash { + size_t operator()(const AncestorNode&) const; + }; +}; + +// StateMap is responsible for mapping from each graph Node to +// * a CondState, where each CondState is a map from predicate to branch (i,e., +// what predicates have to hold or not hold). +// * a AncestorState, where each AncestorState is a set of switch/merge nodes +// that are an ancestor of the node in the graph; +// For efficiency, this class interns the CondState (AncestorState), so that +// CondState (AncestorState) equality comparisons are simply pointer +// comparisons. 
+class StateMap { + public: + explicit StateMap(Graph* graph); + + // Compare two OutputTensors by (node id, index). + struct OutputTensorLess { + bool operator()(const OutputTensor& lhs, const OutputTensor& rhs) const; + }; + + // A node in the graph is executed when multiple conditions hold. Keep track + // of the predicates that must hold for a node to execute. + using CondState = std::map; + + // Every unique ID is mapped to a CondState. + using CondId = const CondState*; + + // Keep track of which switch/merge node's feed into a node's values. + using AncestorState = std::set; + + // Every unique ID is mapped to a AncestorState. + using AncestorId = const AncestorState*; + + // Returns the CondId for a given node. + CondId LookupCondId(const Node* node) const; + + // Returns the unique CondId for CondState. + CondId GetCondId(const CondState& state); + + // Resets the CondId for a given node. + void ResetCondId(const Node* node, CondId id); + + // Returns the AncestorId for a given node. + AncestorId LookupAncestorId(const Node* node) const; + + // Returns the unique AncestorId for CondState. + AncestorId GetAncestorId(const AncestorState& state); + + // Resets the AncestorId for a given node. + void ResetAncestorId(const Node* node, AncestorId id); + + // Marks `node` as dead. + void MarkDead(const Node* node); + + // Determine branch execution of CondState. + BranchType FindBranchOf(CondId id, OutputTensor predicate) const; + + // Returns textual representation of node's CondState. + string CondStateToString(const Node* node) const; + string CondStateToString(CondId id) const; + + // Returns textual representation of node's AncestorState. + string AncestorStateToString(const Node* node) const; + + // Returns whether the cond state is the dead state. + bool IsDead(CondId id) const; + + // Returns whether the cond state is the empty state. + bool IsEmpty(CondId id) const; + + private: + // Hash for CondState and AncestorState. + struct Hash { + size_t operator()(const CondState& map) const; + size_t operator()(const AncestorState& map) const; + }; + + // Set to keep track of unique CondStates. + // Pointers to the entries in the unordered set are used as identifiers: + // unordered_set guarantees that the pointers remain the same. + std::unordered_set condstate_set_; + + // Mapping from Node id to CondId. + std::vector node_to_condid_map_; + + // Track the CondId for newly inserted nodes. We use a vector to quickly map + // from Node id in the original graph to the CondId, but there will be nodes + // added to the original graph (such as If nodes) whose CondState needs to be + // tracked too. + std::unordered_map added_node_condid_mapping_; + + // AncestorId variants of the CondId members. + std::unordered_set ancestorstate_set_; + std::vector node_to_ancestorid_map_; + std::unordered_map added_node_ancestorid_mapping_; + + // Identifier of the dead flow state. The empty flow state is represented with + // a nullptr. + CondId dead_id_; +}; + +// FunctionalizeCond groups all the state used by functionalizing conditionals +// of the given graph together. +class FunctionalizeCond { + public: + // See comment for function `FunctionalizeCond`. + static absl::Status Functionalize(Graph* graph, + FunctionLibraryDefinition* library, + const NodeFilter& node_filter); + + // Build identity node with the same name as the merge that will be replaced + // in case the output is fetched/colocated. 
+ absl::Status AddIdentityNode(const Node* replacee, Node* if_node, int port); + + // Add a If node to the graph defined by def that will, amongst other, replace + // replacee in the graph. + absl::StatusOr AddIfNode(const NodeDef& def, const Node* replacee, + const OutputTensor& predicate); + + // Propagates the state of a newly inserted node. + absl::Status PropagateUpdatedState(const Node* replacee); + + // Dump graph with the CondState annotated. + void DumpGraphWithCondState(const string& name); + + // Adds `switch_id` to the list of Switch node ids. + void AddSwitchId(int switch_id); + + private: + FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library, + const NodeFilter& node_filter); + + // Performs the actual cond functionalization. Iterate over groups of merge + // nodes (linked by common predicates & ancestor IDs), from innermost to + // outermost, and extract into If nodes. + absl::Status FunctionalizeInternal(); + + // Returns the forward flow state propagated along edge `e`. + // This may modify state_map_. + StateMap::CondId StateAlongEdge(const Edge* e); + + // Determines the CondState and AncestorState of all the nodes in the given + // vector where the input is expected in reverse topological order. + // This populates the state_map_. + absl::Status DetermineStates(std::vector rev_topo_order); + + // Determine the CondState for a given node using the incoming edges + // to the node. Note: it is expected that this node's CondState is only + // determined once its input's CondState is. + absl::Status DetermineCondState(Node* dst) { + if (IsMerge(dst)) return DetermineCondStateMerge(dst); + return DetermineCondStateNonMerge(dst); + } + + // Helper functions for DetermineCondState. + absl::Status DetermineCondStateNonMerge(Node* dst); + absl::Status DetermineCondStateMerge(Node* dst); + + // Determines the dst node's CondState by joining the src and dst's CondState + // where either the dst node is a merge or not. + // These may modify state_map_. + absl::StatusOr JoinCondStatesMerge(Node* merge, + StateMap::CondId src, + StateMap::CondId dst); + absl::StatusOr JoinCondStatesNonMerge(StateMap::CondId src, + StateMap::CondId dst); + + // Determines which switch/merge nodes are ancestors of this node. + absl::Status DetermineAncestorState(Node* dst); + + // Checks if a merge node is redundant and if so removes it from the graph. + absl::Status RemoveRedundantMerge(Node* node); + + // Checks if a switch node is redundant and if so removes it from the graph. + absl::Status RemoveRedundantSwitch(Node* node); + + // Sorts merge nodes (in reverse topological order) in order of increasing + // nesting depth. + void SortMergeNodes(std::vector* merge_order); + + // Deletes all nodes in/consumers reachable from switch/merge nodes that were + // extracted. + void DeleteReachableAndDeadNodes(const std::vector& merge_order); + + // Member used to unique the CondState to a unique CondId (AncestorState to a + // unique AncestorId) and keep track of CondState/CondId + // (AncestorState/AncestorId) per Node. + StateMap state_map_; + + // Mapping from merge nodes to predicate. + std::unordered_map merge_to_predicate_; + + // Mapping from merge nodes to corresponding If node outputs. + std::unordered_map merge_to_replacement_; + + FunctionLibraryDefinition* library_; + Graph* graph_; + + friend class FunctionalizeCondTest; + + std::vector switch_ids_; + + // Controls which nodes are skipped for functionalization. 
+ NodeFilter node_filter_ = {}; +}; + +} // namespace functionalize_cond + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow.h new file mode 100644 index 00000000..ec728885 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -0,0 +1,76 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_ + +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" +#include "xla/status_macros.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +const char kFunctionalizeControlFlowFailureMessage[] = + "Failed to functionalize Control Flow V1 ops. Consider using Control " + "Flow V2 ops instead. See " + "https://www.tensorflow.org/api_docs/python/tf/" + "compat/v1/enable_control_flow_v2."; + +// Transformation that converts tf.while_loop() loops into functional While +// operators and tf.cond() conditionals into function If operators, suitable for +// XLA compilation. +// +// If `node_filter` is defined, then only loops and conditions for whose +// nodes `node_filter` returns true are functionalized. + +// If `include_functions` is true, then loops and conditions inside of functions +// that are associated with nodes in `graph` (e.g., a function called from a +// node in `graph`) are also functionalized, otherwise they are not. +// This also handles transitive cases, e.g., a function body will be +// functionalized when it is called in another function that is called by some +// node in `graph` (and so on). The node filter also applies here. +// +// Precondition: +// For any node in a loop or condition for which `node_filter` returns true, +// all nodes inside of the same loop or condition must also return true +// (including nodes in other nested loops and conditions inside of that loop or +// condition). +// This means that a "not to be functionalized" loop or condition is not allowed +// inside a "to be functionalized" loop or condition. +// +// The user of this function is responsible for using a node filter that +// satisfies the above conditions. 
+absl::Status FunctionalizeControlFlow(Graph* graph, + FunctionLibraryDefinition* library, + const NodeFilter& node_filter = {}, + bool include_functions = false); + +absl::Status FunctionalizeControlFlowForGraphDef( + GraphDef* graph_def, FunctionLibraryDefinition* library, + const NodeFilter& node_filter = {}, bool include_functions = false); + +// Rewrites the graph by turning V1 control flow structure +// (Switch/Merge/etc.) into V2 control flow structure (If/While), only modifies +// functions that will be executed by XLA. +class FunctionalizeControlFlowForXlaPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h new file mode 100644 index 00000000..970f62da --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h @@ -0,0 +1,111 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_ + +#include "absl/strings/str_join.h" +#include "xla/status_macros.h" +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/graph.h" + +// Utility functions shared between functionalize cond and while +// or used by other graph optimization passes. + +namespace tensorflow { + +using NodeFilter = std::function; + +// Information about a loop argument. +struct WhileLoopArg { + // Every loop argument has an Enter node. + Node* enter; + + // Is the loop argument a loop-invariant value? Taken from the `is_constant` + // attribute on the Enter node. + bool is_loop_invariant; + + // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant + // arguments must have all of the following nodes: + Node* merge = nullptr; + Node* switch_node = nullptr; + Node* next_iteration = nullptr; + Node* exit = nullptr; +}; + +// Information about a loop frame. +struct WhileLoopFrame { + string name; + + // Pointer to the parent frame. The root frame has a pointer to itself. + WhileLoopFrame* parent = nullptr; + int num_children = 0; + + // Arguments to this loop. + std::vector args; + + // The loop condition of the loop. There should be exactly one loop condition + // in every loop. + Node* loop_cond = nullptr; + + // Set of nodes that belong to the loop frame. + std::unordered_set nodes; + + // After `ExtractWhileLoopFrames` this is true if for all control flow nodes + // of this frame `node_filter` returns true, i.e., the frame should be + // functionalized, and false otherwise. 
+  bool should_be_functionalized = true;
+};
+
+// Extracts v1 while loops within a graph and creates a map of
+// <frame name, WhileLoopFrame>.
+// If `node_filter` is defined, then we keep track of frames that should be
+// functionalized according to the filter (see comment for
+// `FunctionalizeControlFlow` for more details about node filters).
+absl::Status ExtractWhileLoopFrames(
+    const std::vector<ControlFlowInfo>& cf_info, const Graph* graph,
+    std::unordered_map<string, WhileLoopFrame>* frames,
+    const NodeFilter& node_filter = {});
+
+// Check that the graph has no cycle containing the given node.
+absl::Status CheckNodeNotInCycle(const Node* node, const int num_nodes);
+
+// Comparison function used for sorting nodes consistently:
+// a) resource variables are last, and
+// b) sort lexicographically by name (for deterministic output).
+struct NodeCmpByNameResourcesLast {
+  bool operator()(const Node* lhs, const Node* rhs) const;
+};
+
+// Returns the Node* created from the NodeDef in the Graph.
+absl::StatusOr<Node*> AddNodeDefToGraph(const NodeDef& node_def, Graph* graph);
+
+// Builds a retval node of given type and index.
+absl::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index);
+
+// Returns a textual representation of the names of the nodes in the input.
+template <typename T>
+string NodesToString(const T& nodes) {
+  return absl::StrCat("{",
+                      absl::StrJoin(nodes, ",",
+                                    [](string* output, const Node* node) {
+                                      absl::StrAppend(output, node->name());
+                                    }),
+                      "}");
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_while.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_while.h
new file mode 100644
index 00000000..e9b361f6
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/functionalize_while.h
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_
+#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_
+
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Transformation that converts tf.while_loop() loops into functional While
+// operators, suitable for XLA compilation. If lookup_library is provided, use
+// it to make the library for control flow self-contained.
+//
+// If `node_filter` is defined, then only loops for whose nodes `node_filter`
+// returns true are functionalized.
+//
+// Preconditions:
+// Same as for `FunctionalizeControlFlow` (see comment there).
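A minimal sketch of pairing a `NodeFilter` with the entry point declared immediately below, assuming `NodeFilter` is a predicate over `const Node*`; the `_my_marker` attribute and helper name are hypothetical.

#include "tensorflow/compiler/tf2xla/functionalize_while.h"

// Hypothetical helper: only functionalize loops whose nodes carry "_my_marker".
absl::Status FunctionalizeMarkedLoops(
    tensorflow::Graph* graph, tensorflow::FunctionLibraryDefinition* library) {
  tensorflow::NodeFilter filter = [](const tensorflow::Node* n) {
    return n->attrs().Find("_my_marker") != nullptr;
  };
  return tensorflow::FunctionalizeWhileLoop(graph, library, filter);
}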
+absl::Status FunctionalizeWhileLoop(Graph* graph, + FunctionLibraryDefinition* library, + const NodeFilter& node_filter = {}); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler.h new file mode 100644 index 00000000..6ab20955 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler.h @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_H_ +#define TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_H_ + +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "xla/client/local_client.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/version.h" + +namespace tensorflow { + +// GraphCompiler compiles the graph in topological order in the current +// thread. It also resolves the nondeterminism in the graph by enforcing a +// total order on all inputs to a node. This abstraction helps us create the +// same XLA computation given two structurally equivalent TensorFlow graphs. +// If a function call is visited during the graph traversal, it is then +// compiled through the xla_context into a computation and a `Call` operation +// is inserted to call into that computation. +// +// Note: GraphCompiler was created to remove our dependency to TF Executor in +// the history. There are still some todos so that we can completely decouple +// from Executor. +// +// TODO(yunxing): Remove usage of XlaCompilationDevice. +// +// TODO(yunxing): Remove the hack that wraps XlaExpression within a tensor now +// that we don't use TF Executor to pass around a tensor. +// +// TODO(yunxing): Make XlaOpkernel not a subclass of OpKernel so that it can +// handle a XlaExpression directly instead of a Tensor. This may require our own +// op registration infrastructure instead of FunctionLibraryRuntime. +class GraphCompiler { + public: + GraphCompiler(XlaCompilationDevice* device, Graph* graph, + FunctionLibraryRuntime* flib, + ScopedStepContainer* step_container) + : device_(device), + graph_(graph), + flib_(flib), + step_container_(step_container) {} + + // Compiles the graph. 
The results are written in xla_context stored in the + // resource_manager of the 'XlaCompilationDevice' that's passed into the + // constructor. + absl::Status Compile(); + + private: + // Partially sets params. This partially set params can be reused + // across multiple nodes visit. + void PartiallySetupParams(OpKernelContext::Params* params); + + // Compiles a functional node and writes result to OpkernelContext. A + // functional node represents a defined computation and should be compiled + // using `compiler_`. + absl::Status CompileFunctionalNode(Node* n, OpKernelContext* op_context); + + XlaCompilationDevice* device_; + Graph* graph_; + FunctionLibraryRuntime* flib_; + ScopedStepContainer* step_container_; + // A buffer to hold tensor inputs to a node, this is reused across the graph + // traversal. + absl::InlinedVector tensor_inputs_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler_util.h new file mode 100644 index 00000000..ebdf07f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/graph_compiler_util.h @@ -0,0 +1,51 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_UTIL_H_ + +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/status_macros.h" +#include "tensorflow/core/framework/graph.pb.h" + +namespace tensorflow { + +// Fills in xla_args from the corresponding _Arg nodes in the graph. +absl::Status CreateXlaArgs(const Graph& graph, + std::vector* xla_args); + +// Populate xla_args for the given XLA config. +void PopulateXlaArgs(const tf2xla::Config& config, + std::vector* xla_args); + +// InitGraph creates a graph based on the graph_def, that may then be converted +// to an xla::XlaComputation via ConvertGraphToXla. +// +// The graph is rewritten with _Arg and _Retval nodes, representing the inputs +// and outputs of the function that will be compiled. Each feed id causes a new +// _Arg node to be created, where we first collect all existing edges pointing +// from the named node's output index, and then rewrite them to point from that +// _Arg node instead. Each fetch id causes a new _Retval node to be created, +// with a new edge pointing from the named node's output index to that _Retval +// node. 
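A hedged sketch combining `InitGraph` (declared just below) with `CreateXlaArgs` from earlier in this header; the wrapper name is invented and the argument vector's element type is assumed to be `XlaCompiler::Argument`.

#include <memory>
#include <vector>

#include "tensorflow/compiler/tf2xla/graph_compiler_util.h"
#include "tensorflow/core/platform/errors.h"

// Hypothetical wrapper: build the rewritten Graph, then derive XLA arguments
// from the _Arg nodes that InitGraph inserted.
absl::Status BuildGraphAndArgs(
    const tensorflow::GraphDef& graph_def,
    const tensorflow::tf2xla::Config& config,
    std::unique_ptr<tensorflow::Graph>* graph,
    std::vector<tensorflow::XlaCompiler::Argument>* xla_args) {
  TF_RETURN_IF_ERROR(tensorflow::InitGraph(graph_def, config, graph));
  return tensorflow::CreateXlaArgs(**graph, xla_args);
}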
+absl::Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, + std::unique_ptr* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_GRAPH_COMPILER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/case_op.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/case_op.h new file mode 100644 index 00000000..a4c01bea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/case_op.h @@ -0,0 +1,78 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// This TensorFlow op provides a functional switch/case primitive. +// +// The outputs of the branches must agree on the number, types, and +// shapes of the Tensors carried around the two bodies. +// +// Computations in branch bodies may read from and write to resource variables. +// Resource variables may be passed as arguments to the branch function's +// bodies. The XlaCompiler converts resource variable arguments +// into parameters to the XLA computation and moves them to the end of the +// parameter list, and by using the `return_updated_values_for_all_variables` +// we ensure that all variables that appear in the input also appear at the +// end of the branch bodies output. This ensures the branch bodies output +// signatures match. +// +// It is the user's responsibility to ensure that each non-variable _Arg matches +// the corresponding _Retval. +class XlaCaseOp : public XlaOpKernel { + public: + explicit XlaCaseOp(OpKernelConstruction* ctx); + + void Compile(XlaOpKernelContext* ctx) override; + + private: + XlaCaseOp(const XlaCaseOp&) = delete; + void operator=(const XlaCaseOp&) = delete; + + // If the branch_index input is a constant: prunes out all but the branch + // corrresponding to that constant branch index, and returns that branch and + // the literal 0 (as the first and second component of the pair). + // + // If the branch_index input is not a constant: returns unpruned_branches_ and + // the branch_index input. + std::pair, xla::XlaOp> GetPrunedBranchesAndIndex( + XlaOpKernelContext* ctx); + + std::vector unpruned_branches_; + DataTypeVector input_types_; + DataTypeVector output_types_; + bool has_token_input_output_; + std::vector token_input_nodes_; + string original_node_name_; + // Whether to propagate compile time consts into the cond branches. + // This is not supported by default now since it may cause HBM memory + // overheads. 
+ bool propagate_compile_time_consts_ = false; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h new file mode 100644 index 00000000..f53f9fd0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h @@ -0,0 +1,94 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_CONV_OP_HELPERS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CONV_OP_HELPERS_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +// This header exposes utilities for translating TensorFlow convolution ops into +// XLA ops. +// +// conv_ops.cc contains lowerings for many of these TF convolution ops (e.g. +// Conv2D, Conv3DBackpropFilterV2), but you might want to use the utilities in +// this header to implement a new and exciting convolution op, for example a +// fused TensorFlow op that contains a convolution and other things. + +namespace tensorflow { + +// We don't support integers for convolutions for GPU, so we list the supported +// types for non-gpu and gpu here. +std::vector GetXlaConvTypesForNonGpu(); +std::vector GetXlaConvTypesForGpu(); + +// ConvOpAttrs contains all of the metadata necessary to specify a TF or XLA +// convolution. +struct ConvOpAttrs { + // Constructs a ConvOpAttrs, reading most of the attributes from `ctx`. + static absl::StatusOr Create(int num_spatial_dims, + bool depthwise, + OpKernelConstruction* ctx); + + bool depthwise; + int num_spatial_dims; + std::vector dilations; + std::vector strides; + Padding padding; + std::vector explicit_paddings; + TensorFormat data_format; +}; + +// Helper for the general Conv Op. +struct ConvNDOpAttrs { + // Constructs a ConvOpAttrs, reading most of the attributes from `ctx`. + static absl::StatusOr Create(OpKernelConstruction* ctx); + + int groups; + int batch_dims; + std::vector dilations; + std::vector strides; + Padding padding; + std::vector explicit_paddings; + TensorFormat data_format; +}; + +// Creates a new XLA forward or backward convolution with the given inputs and +// attributes. 
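To illustrate the intended call pattern, a hedged sketch of a hypothetical fused-convolution kernel that builds `ConvOpAttrs` at construction time and lowers through `MakeXlaForwardConvOp` (declared just below); the class name is invented and the real kernels in conv_ops.cc remain the authoritative reference.

#include "tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"

// Hypothetical kernel sketch, not one of the real conv kernels.
class MyFusedConv2DOp : public tensorflow::XlaOpKernel {
 public:
  explicit MyFusedConv2DOp(tensorflow::OpKernelConstruction* ctx)
      : XlaOpKernel(ctx) {
    // Strides, dilations, padding and data_format are read from node attrs.
    auto attrs = tensorflow::ConvOpAttrs::Create(/*num_spatial_dims=*/2,
                                                 /*depthwise=*/false, ctx);
    OP_REQUIRES_OK(ctx, attrs.status());
    attrs_ = *attrs;
  }

  void Compile(tensorflow::XlaOpKernelContext* ctx) override {
    auto conv = tensorflow::MakeXlaForwardConvOp(type_string(), ctx->Input(0),
                                                 ctx->Input(1), attrs_);
    OP_REQUIRES_OK(ctx, conv.status());
    ctx->SetOutput(0, *conv);  // A real fused kernel would append more HLO here.
  }

 private:
  tensorflow::ConvOpAttrs attrs_;
};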
+absl::StatusOr MakeXlaForwardConvOp(absl::string_view type_string, + xla::XlaOp conv_input, + xla::XlaOp filter, + const ConvOpAttrs& attrs); +absl::StatusOr MakeXlaBackpropInputConvOp( + absl::string_view type_string, const xla::Shape& input_shape, + xla::XlaOp filter, xla::XlaOp out_backprop, const ConvOpAttrs& attrs, + xla::XlaOp* input_sizes = nullptr); +absl::StatusOr MakeXlaBackpropFilterConvOp( + absl::string_view type_string, xla::XlaOp activations, + const xla::Shape& filter_shape, xla::XlaOp gradients, + const ConvOpAttrs& attrs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_CONV_OP_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/cwise_ops.h new file mode 100644 index 00000000..d22e6eb7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -0,0 +1,81 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA-specific base classes for Unary and Binary Ops. + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_CWISE_OPS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CWISE_OPS_H_ + +#include +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/client/client_library.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +// Coefficient-wise binary operations. Each binary Op expects two +// inputs that can be broadcast to the same shape. The base class +// contains pure virtual methods to override: description is a textual +// description of the operation; and Computation adds the +// implementation of the operation to a xla::XlaBuilder. For most +// arithmetic Ops XLA handles the broadcasting automatically given the input +// tensors. +class XlaBinaryOp : public XlaOpKernel { + public: + explicit XlaBinaryOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + const DataType lhs = BaseType(input_type(0)); + const DataType rhs = BaseType(input_type(1)); + OP_REQUIRES(ctx, lhs == rhs, + errors::InvalidArgument("Input types of binary op must match")); + } + ~XlaBinaryOp() override = default; + + // Implement the (tensor,tensor)->tensor lambda that should be + // applied to the inputs. The desired computation should be added to + // 'tc->builder()' and '(lhs,rhs)' are the function's inputs and + // (lhs_shape,rhs_shape) are their respective + // shapes. 'broadcast_helper' contains metadata about the shapes of + // the inputs and the dimensions that need to be broadcast, which + // may be useful for Ops that can't use standard XLA automatic + // broadcasting. 
'extend_dimension' is non-empty if lhs and rhs have + // different ranks, and indicates which dimensions of the + // higher-rank input should be matched when broadcasting the + // lower-rank input. See comment below and the documentation on broadcasting + // in the XLA documentation. + virtual xla::XlaOp Computation( + XlaOpKernelContext* ctx, const xla::XlaOp& lhs, + const absl::Span& lhs_shape, const xla::XlaOp& rhs, + const absl::Span& rhs_shape, const BCast& broadcast_helper, + const std::vector& extend_dimensions) = 0; + + void Compile(XlaOpKernelContext* ctx) override; + + // Helper function that performs the broadcasting described by + // 'broadcast_helper', yielding arguments 'lhs' and 'rhs' that have the same + // shape. + static std::pair Broadcast( + xla::XlaOp lhs, xla::XlaOp rhs, const BCast& broadcast_helper); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_CWISE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/elu_op.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/elu_op.h new file mode 100644 index 00000000..09c88fcb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/elu_op.h @@ -0,0 +1,26 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_ELU_OP_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_ELU_OP_H_ + +#include "xla/hlo/builder/lib/constants.h" +#include "xla/hlo/builder/xla_builder.h" + +namespace xla { +XlaOp Elu(XlaOp x); +XlaOp Selu(XlaOp x); +} // namespace xla + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_ELU_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h new file mode 100644 index 00000000..8a8a6666 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -0,0 +1,52 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper methods for XLA Gather Ops. 
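As context for the declarations that follow, a hedged sketch of calling `XlaGather` (declared below in this header) from a kernel's `Compile`, gathering along axis 0 with scalar indices; the function name is a placeholder.

// Hypothetical free function showing the XlaGather call pattern.
void CompileGatherExample(tensorflow::XlaOpKernelContext* ctx) {
  const tensorflow::TensorShape input_shape = ctx->InputShape(0);
  const tensorflow::TensorShape indices_shape = ctx->InputShape(1);
  xla::XlaOp gather_output;
  OP_REQUIRES_OK(ctx, tensorflow::XlaGather(
                          ctx->Input(0), input_shape, ctx->Input(1),
                          indices_shape, /*axis=*/0, /*indices_are_nd=*/false,
                          ctx->expected_output_dtype(0), ctx->input_type(1),
                          ctx->builder(), &gather_output));
  ctx->SetOutput(0, gather_output);
}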
+ +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_HELPERS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_HELPERS_H_ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/client/client_library.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +// Adds to builder an XLA computation that performs a gather on input (of +// shape input_shape) keyed on indices (of shape indices_shape). +// +// index_type must be must be DT_INT32 or DT_INT64. +// If `indices_are_nd` is true, the last dimension of `indices` are treated as +// a multidimensional index values. Otherwise, `indices` is treated as a tensor +// of scalar indices. +absl::Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, + const xla::XlaOp& indices, + const TensorShape& indices_shape, int64_t axis, + bool indices_are_nd, DataType dtype, DataType index_type, + xla::XlaBuilder* builder, xla::XlaOp* gather_output); + +// The implementation of Gather and ResourceGather through XLA. Uses `input` as +// the input instead of context->input(0) in order to allow ResourceGather to +// handle obtaining the data from the ResourceVariable. +absl::Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, + xla::XlaOp input, + const TensorShape& input_shape, + int batch_dims, + xla::XlaOp* gather_output); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_op.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_op.h new file mode 100644 index 00000000..fc6dd2e0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_op.h @@ -0,0 +1,70 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_ + +#include + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// This TensorFlow op provides a functional conditional primitive. +// +// The outputs of the then/else branches must agree on the number, types, and +// shapes of the Tensors carried around the two bodies. +// +// Computations in then/else bodies may read from and write to resource +// variables. +// Resource variables may be passed as arguments to the then/else function's +// bodies. 
The XlaCompiler converts resource variable arguments +// into parameters to the XLA computation and moves them to the end of the +// parameter list, and by using the `return_updated_values_for_all_variables` +// we ensure that all variables that appear in the input also appear at the +// end of the then/else bodies output. This ensures the then/else bodies output +// signatures match. +// +// It is the user's responsibility to ensure that each non-variable _Arg matches +// the corresponding _Retval. +class XlaIfOp : public XlaOpKernel { + public: + explicit XlaIfOp(OpKernelConstruction* ctx); + + void Compile(XlaOpKernelContext* ctx) override; + + private: + XlaIfOp(const XlaIfOp&) = delete; + void operator=(const XlaIfOp&) = delete; + + NameAttrList then_branch_; + NameAttrList else_branch_; + DataType cond_type_; + DataTypeVector input_types_; + DataTypeVector output_types_; + std::vector output_shapes_; + bool has_token_input_output_; + std::vector token_input_nodes_; + string original_node_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_while_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_while_utils.h new file mode 100644 index 00000000..1800e5a6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/if_while_utils.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_WHILE_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_WHILE_UTILS_H_ + +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/status/status.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +extern const char kPropagateCompileTimeConsts[]; + +// Convert arguments in `args` to constants provided they are compile-time +// constants and they satisfy the condition in `should_resolve_constant`. The +// argument `xla_expression_offset` determines what offset is needed to get the +// input expression from context given the argument index in `args`. +// +// Returns a list of indices which were converted to constants. +absl::InlinedVector ConvertCompileTimeConstArgumentsToConst( + XlaOpKernelContext* ctx, std::vector* args, + int xla_expression_offset, + std::function should_resolve_constant); + +// Find and populate `must_be_const_nodes` and `body` of the function +// corresponding to the kernel with context `ctx` with name `func_name`. 
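A hedged sketch of calling `FindMustBeConstNodes` (declared just below) for a branch function; the helper name is invented and the element type of `must_be_const_nodes` is assumed to be `bool`.

// Hypothetical helper: find which nodes of `branch` must be compile-time consts.
void ExampleFindConstNodes(tensorflow::XlaOpKernelContext* ctx,
                           const tensorflow::NameAttrList& branch) {
  std::vector<bool> must_be_const_nodes;
  const tensorflow::FunctionBody* body = nullptr;
  OP_REQUIRES_OK(ctx, tensorflow::FindMustBeConstNodes(
                          ctx, branch, &must_be_const_nodes, &body));
  // Entries set to true mark nodes of body->graph whose values must be
  // resolved to constants before compilation.
}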
+absl::Status FindMustBeConstNodes(XlaOpKernelContext* ctx, + const NameAttrList& func_name, + std::vector* must_be_const_nodes, + const FunctionBody** body); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_WHILE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/image_resize_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/image_resize_ops.h new file mode 100644 index 00000000..8d0fff23 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/image_resize_ops.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IMAGE_RESIZE_OPS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IMAGE_RESIZE_OPS_H_ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/primitive_util.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class ResizeNearestNeighborOp : public XlaOpKernel { + public: + explicit ResizeNearestNeighborOp(OpKernelConstruction* ctx); + void Compile(XlaOpKernelContext* ctx) override; + + protected: + bool align_corners_ = true; + bool half_pixel_centers_ = true; + bool is_kernel_bilinear_ = false; +}; + +class ResizeBilinearOp : public XlaOpKernel { + public: + explicit ResizeBilinearOp(OpKernelConstruction* ctx); + + void Compile(XlaOpKernelContext* ctx) override; + + protected: + bool align_corners_ = true; + bool half_pixel_centers_ = true; + bool is_kernel_bilinear_ = true; +}; + +class ResizeBilinearGradOp : public XlaOpKernel { + public: + explicit ResizeBilinearGradOp(OpKernelConstruction* ctx); + + void Compile(XlaOpKernelContext* ctx) override; + + protected: + bool align_corners_; + bool half_pixel_centers_ = true; + xla::PrimitiveType output_type_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_IMAGE_RESIZE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/index_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/index_ops.h new file mode 100644 index 00000000..ef2b9e6b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/index_ops.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Declarations of the ArgMax/ArgMin ops using a pure XLA implementation. + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_INDEX_OPS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_INDEX_OPS_H_ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class XlaArgMinMaxOp : public XlaOpKernel { + public: + explicit XlaArgMinMaxOp(OpKernelConstruction* ctx, bool is_min); + void Compile(XlaOpKernelContext* ctx) override; + + private: + const bool is_min_; // Are we computing ArgMin (true) or ArgMax (false)? +}; + +class XlaArgMaxOp : public XlaArgMinMaxOp { + public: + explicit XlaArgMaxOp(OpKernelConstruction* ctx); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_INDEX_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h new file mode 100644 index 00000000..f9c42e03 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h @@ -0,0 +1,70 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_LIGHT_OUTSIDE_COMPILATION_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_LIGHT_OUTSIDE_COMPILATION_H_ + +#include + +#include "absl/status/statusor.h" +#include "tensorflow/compiler/tf2xla/kernels/callback.pb.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Using std::map as the maps are presumed to be tiny, and we want a +// deterministic iteration order. +// +// Dimension -> bound. +using DimensionBoundsMap = std::map; + +// Output -> dimension -> bound. +using OutputDimensionBoundsMap = std::map; + +// Generic kernel for registering TF2XLA kernels which call back into the TF +// runtime to run a given kernel defined by the wrapped node. +// +// Cf. example usages in light_outside_compilation_kernels_for_test.cc. +// +// Currently does not support dynamic shape or resource variables. Currently +// works only on GPU. +class LightOutsideCompilationOp : public XlaOpKernel { + public: + explicit LightOutsideCompilationOp(OpKernelConstruction* context); + void Compile(XlaOpKernelContext* ctx) override; + + // Override to provide statically known bounds on output in case of dynamic + // shapes. 
+ virtual absl::StatusOr DynamicOutputDimensions( + const NodeDef& ndef, XlaOpKernelContext* ctx) const { + return OutputDimensionBoundsMap{}; + } + + private: + absl::Status CompileToCustomCallCallingTfKernel(int graph_def_version, + const NodeDef& node_def, + XlaOpKernelContext* ctx); + static absl::Status CallTfKernel(void* stream_handle, void** buffers, + const char* opaque, int opaque_len); + + NodeDef def_; + int graph_def_version_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_LIGHT_OUTSIDE_COMPILATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/random_ops_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/random_ops_util.h new file mode 100644 index 00000000..11ff4460 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/random_ops_util.h @@ -0,0 +1,96 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_ + +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/hlo/builder/lib/prng.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +inline constexpr int kRandomKeyInputIdx = 1; +inline constexpr int kRandomCounterInputIdx = 2; +inline constexpr int kRandomAlgInputIdx = 3; + +// Returns a tensor containing 'shape' random values uniformly distributed in +// the range [minval, maxval). The raw random bits are generated by the given +// `bit_generator` and converted to the requested data type and range. This +// routine requires 2 32-bit integer seeds and currently only supports 'shape's +// of type F32, S32 and S64. +xla::XlaOp StatelessRngUniform(absl::string_view device_type_string, + xla::XlaOp seeds, const xla::Shape& shape, + xla::XlaOp minval, xla::XlaOp maxval); + +// Converts to bfloat16 if `dtype` equals DT_BFLOAT16, no-op otherwise. +// It masks the last 16 bit. With normal rounding, values near "maxval" would be +// converted to "maxval" which is out of range ["minval", "maxval"). In +// addition, the distribution near the limit is not uniform. +xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype); + +// Combines two signed 32-bit seeds into a single unsigned 64 bit seed. +xla::XlaOp GetU64FromS32Seeds(xla::XlaOp seed0, xla::XlaOp seed1); + +absl::StatusOr GetAlgId(XlaOpKernelContext* ctx, int alg_input_idx); + +xla::RngOutput BitGenerator(xla::RandomAlgorithm const& alg, xla::XlaOp key, + xla::XlaOp counter, const xla::Shape& shape); + +// Gets user specified RNG algorithm. 
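A hedged sketch of the stateless-uniform helper declared earlier in this header; the 128x256 output shape is a placeholder and the device type string is passed in rather than hard-coded.

#include "xla/hlo/builder/xla_builder.h"
#include "xla/shape_util.h"

// Hypothetical helper: uniform F32 values in [0, 1) from an s32[2] seed tensor.
xla::XlaOp UniformFromSeeds(xla::XlaBuilder* b, xla::XlaOp seeds,
                            absl::string_view device_type_string) {
  const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {128, 256});
  xla::XlaOp minval = xla::ConstantR0<float>(b, 0.0f);
  xla::XlaOp maxval = xla::ConstantR0<float>(b, 1.0f);
  return tensorflow::StatelessRngUniform(device_type_string, seeds, shape,
                                         minval, maxval);
}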
+absl::StatusOr AlgorithmFromInput( + XlaOpKernelContext* ctx, int alg_input_idx, + absl::string_view device_type_string); + +xla::XlaOp MaybeSliceCounter(xla::RandomAlgorithm const& alg, + TensorShape const& counter_shape, + xla::XlaOp counter); + +DataType MaybeConvertBF16ToF32(DataType const& dtype); + +// Builds uniform randoms from a stateless RNG with given data type and device +// type, in the given low and high range, where low and high are expressed in +// XLA functions. +absl::StatusOr BuildUniformRandoms( + XlaOpKernelContext* ctx, DataType dtype, string device_type_string, + TensorShape shape, + std::function lo, + std::function hi); + +// Overloads BuildUniformRandoms where low and high range are expressed in XLA +// ops. +absl::StatusOr BuildUniformRandoms(XlaOpKernelContext* ctx, + DataType dtype, + string device_type_string, + xla::Shape xla_shape, + xla::XlaOp lo, xla::XlaOp hi); +} // namespace tensorflow + +namespace xla { + +int GetCounterSize(RandomAlgorithm const& alg); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/reduction_ops.h new file mode 100644 index 00000000..9c222224 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -0,0 +1,78 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA-specific base classes for Reduction Ops. + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ + +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +// Reduction operations. The base class contains pure virtual methods +// to override: description is a textual description of the mapped +// function; InitialValue constructs the base case for the reduction; +// BuildReducer adds the implementation of the reduction lambda to a +// xla::XlaBuilder and BuildFinalizer adds the +// implementation of the finalizer lambda (if there is one) to a +// xla::XlaBuilder. +class XlaReductionOp : public XlaOpKernel { + public: + XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type); + ~XlaReductionOp() override = default; + + // Return the base case for the reduction. + virtual xla::XlaOp InitialValue(xla::XlaBuilder* builder) = 0; + + // Implement the (scalar,scalar)->scalar lambda that should be + // applied to each pair of elements to be reduced. The desired + // computation should be added to 'builder' and + // '(scalar_lhs,scalar_rhs)' are the function's inputs. 
+ virtual void BuildReducer(xla::XlaBuilder* builder, + const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) = 0; + + // Applies a transformation to the output of the reduction. The desired + // computation should be added to 'builder'. Argument 'input' is the original + // input of the reduction; 'reduce_output' is the output of the reduction. + // Returns the transformed reduction output. Defaults to returning + // 'reduce_output' converted to the input type. + virtual xla::XlaOp BuildFinalizer( + xla::XlaBuilder* builder, const xla::XlaOp& input, + const xla::XlaOp& reduce_output, + const std::vector& dimensions_to_reduce); + + void Compile(XlaOpKernelContext* ctx) override; + + private: + // True if the number of dimensions should be maintained. + bool keep_dims_; + + protected: + DataType reduction_type_; + xla::PrimitiveType xla_reduction_type_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/relu_op.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/relu_op.h new file mode 100644 index 00000000..b980df77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/relu_op.h @@ -0,0 +1,26 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_RELU_OP_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_RELU_OP_H_ + +#include "xla/hlo/builder/lib/constants.h" +#include "xla/hlo/builder/xla_builder.h" + +namespace xla { +XlaOp Relu(XlaOp x); +XlaOp Relu6(XlaOp x); +} // namespace xla + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_RELU_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/resampler_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/resampler_ops.h new file mode 100644 index 00000000..7ecc2e93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/resampler_ops.h @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_RESAMPLER_OPS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_RESAMPLER_OPS_H_ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +// XLA op kernel for both contrib and addon flavors of TenforFlow Resampler +class ResamplerOp : public XlaOpKernel { + public: + explicit ResamplerOp(OpKernelConstruction* ctx); + void Compile(XlaOpKernelContext* ctx) override; +}; + +// XLA op kernel for both contrib and addon flavors of TenforFlow Resampler +// gradient. +class ResamplerGradOp : public XlaOpKernel { + public: + explicit ResamplerGradOp(OpKernelConstruction* ctx); + void Compile(XlaOpKernelContext* ctx) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_RESAMPLER_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/rng_converter_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/rng_converter_utils.h new file mode 100644 index 00000000..ec45834d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/rng_converter_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_RNG_CONVERTER_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_RNG_CONVERTER_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/rng_alg.h" + +namespace tensorflow { + +// Given the XLA::RandomAlgorithm, return the Tensorflow equivalent. +Algorithm ToTensorflowAlgorithm(xla::RandomAlgorithm alg); + +// Given the device type, return the default XLA::RandomAlgorithm +xla::RandomAlgorithm DefaultRngAlgForDeviceType( + absl::string_view device_type_string); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_RNG_CONVERTER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/shape_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/shape_util.h new file mode 100644 index 00000000..bfce0919 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/shape_util.h @@ -0,0 +1,37 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Converts a TensorShape to a constant Tensor. +// +// The input TensorShape input_shape is used to populate the elements of +// shape_constant, which is modified in place. +absl::Status TensorShapeToConstant(const TensorShape& input_shape, + Tensor* shape_constant); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h new file mode 100644 index 00000000..e4aeb015 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h @@ -0,0 +1,135 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/shape.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Whether the input expression at `index` corresponds to a TensorList. +bool IsTensorListInput(XlaOpKernelContext* ctx, int index); + +// Whether the TensorList is initialized (has known data type and shape). +absl::Status IsTensorListInitialized(xla::XlaOp list, bool* is_initialized); + +// Whether the TensorList is a nested TensorList. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status IsNestedTensorList(xla::XlaOp list, bool* is_nested_list); + +// Builds a non-nested TensorList from `buffer` and `push_index`. +absl::Status BuildNonNestedTensorList(xla::XlaOp buffer, xla::XlaOp push_index, + xla::XlaOp* output_list); + +// Returns buffer shape for the TensorList. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status GetTensorListBufferShape(xla::XlaOp list, + xla::Shape* buffer_shape); + +// Returns buffer for the TensorList. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status GetTensorListBuffer(xla::XlaOp list, xla::XlaOp* buffer); + +// Returns push index for the TensorList. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. 
+absl::Status GetTensorListPushIndex(xla::XlaOp list, xla::XlaOp* push_index); + +// Returns a new TensorList with given push_index. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index, + xla::XlaOp* result); + +// Returns an uninitialized TensorList. +xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b, + int64_t leading_dimension, + bool leading_size_is_dynamic, + xla::XlaOp leading_dim_size); + +// Returns leading dimension for the TensorList as well as a dynamic op +// representing the dynamic size. Input can be initialized or uninitialized +// TensorList. Non-nested and nested TensorLists are both supported. +absl::Status GetLeadingDimForTensorList(xla::XlaOp list, int64_t* leading_dim, + bool* leading_dim_is_dynamic, + xla::XlaOp* leading_dim_dynamic_size); + +// Returns TensorList shape for the element shape. +// Element shape must be a normal tensor shape. +absl::Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, + int64_t leading_dim, + bool leading_dim_is_dynamic, + xla::Shape* tensor_list_shape); + +// Returns a TensorList filled by zeros with the given shape. +absl::Status CreateZerosTensorListWithShape( + xla::XlaBuilder* b, const xla::Shape& list_shape, + const std::vector>& dynamic_dims, xla::XlaOp* list); + +// If the TensorList is initialized, check that its shape matches element shape; +// If the TensorList is uninitialized, initialize it with the element shape. +// Input can be initialized or uninitialized TensorList. +// "element" can be normal tensor or TensorList. +absl::Status GetInitializedTensorListForElement(xla::XlaOp list, + xla::XlaOp element, + bool element_is_tensor_list, + xla::XlaOp* initialized_list); + +// Executes TensorListPushBack with given TensorList and element. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, + bool element_is_tensor_list, + xla::XlaOp* result); + +// Executes TensorListPopBack with given TensorList. +// Input must be an initialized TensorList. +// Non-nested and nested TensorLists are both supported. +absl::Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, + xla::XlaOp* element_result, + bool* element_is_tensor_list); + +// Executes TensorListSetItem with given TensorList, index and element. +// Input must be an initialized TensorList. +// Only non-nested TensorList is supported. +absl::Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, + xla::XlaOp element, xla::XlaOp* result); + +// Executes TensorListGetItem with given TensorList and index. +// Input must be an initialized TensorList. +// Only non-nested TensorList is supported. +absl::Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, + xla::XlaOp* result); + +// Executes TensorListPushBack with given tensor and push index. +// "tensor" must be a normal tensor. 
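// Usage sketch (editorial addition, not part of the vendored header): a
// typical push flow inside a hypothetical XlaOpKernel::Compile(). Assumes the
// usual kernel includes (xla_op_kernel.h, tensor_list_utils.h) and that
// SetTensorListOutput is available on XlaOpKernelContext; the leading
// dimension and input index are illustrative.
void TensorListPushSketch(tensorflow::XlaOpKernelContext* ctx) {
  xla::XlaOp element = ctx->Input(0);
  // Start from an uninitialized list with a static leading dimension of 8.
  // leading_dim_size is only consulted when the leading size is dynamic
  // (assumption), so a default-constructed XlaOp is passed here.
  xla::XlaOp list = tensorflow::BuildUninitializedTensorList(
      ctx->builder(), /*leading_dimension=*/8,
      /*leading_size_is_dynamic=*/false, /*leading_dim_size=*/xla::XlaOp());
  // Give the list a concrete element shape, then push the element.
  xla::XlaOp initialized, pushed;
  OP_REQUIRES_OK(ctx, tensorflow::GetInitializedTensorListForElement(
                          list, element, /*element_is_tensor_list=*/false,
                          &initialized));
  OP_REQUIRES_OK(ctx, tensorflow::ExecuteTensorListPushBack(
                          initialized, element,
                          /*element_is_tensor_list=*/false, &pushed));
  ctx->SetTensorListOutput(0, pushed);
}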
+absl::Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor, + xla::XlaOp* result); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/while_op.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/while_op.h new file mode 100644 index 00000000..8e9f317a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/while_op.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_ + +#include + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// This TensorFlow op provides a functional iteration primitive. +// +// The inputs and outputs of the loop body must agree on the number, types, and +// shapes of the Tensors carried around the loop body. +// +// Computations in while loops may read from and write to resource variables. +// Resource variables may be passed as arguments to a function's body and +// condition functions. The XlaCompiler converts resource variable arguments +// into parameters to the XLA computation and moves them to the end of the +// parameter list, and by using the `return_updated_values_for_all_variables` +// we ensure that all variables that appear in the input also appear at the +// end of the body's output. This ensures the loop body's input and output +// signatures match. +// +// It is the user's responsibility to ensure that each non-variable _Arg matches +// the corresponding _Retval. +// +// For example, suppose we have a loop body with arguments: +// DT_INT32, DT_RESOURCE (pointing to a DT_BOOL var), DT_FLOAT +// and return values +// DT_INT32, DT_FLOAT +// It is an error for the body to return DT_RESOURCE values. +// +// The body will be lowered into an XLA computation that takes and returns a +// tuple with XLA type (I32, F32, PRED). Note the resource variable appears at +// the end of both the loop body's input and output argument lists. +class XlaWhileOp : public XlaOpKernel { + public: + explicit XlaWhileOp(OpKernelConstruction* ctx); + + void Compile(XlaOpKernelContext* ctx) override; + + private: + NameAttrList cond_name_attr_; + NameAttrList body_name_attr_; + bool has_token_input_output_; + std::vector token_input_nodes_; + string original_node_name_; + // Whether to propagate compile time consts into the loop body. + // This is not supported by default now since it may cause HBM memory + // overheads. 
+ bool propagate_compile_time_consts_ = false; + + XlaWhileOp(const XlaWhileOp&) = delete; + void operator=(const XlaWhileOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h new file mode 100644 index 00000000..3b75ca3b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h @@ -0,0 +1,130 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_ +#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "xla/hlo/builder/xla_computation.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/shape.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { + +bool IsTokenType(mlir::Type type); + +class XlaCallModuleLoader { + public: + static absl::StatusOr> Create( + mlir::MLIRContext* context, int version, mlir::StringRef module_str, + std::vector disabled_checks, + std::vector platforms, int num_invocation_args, + bool main_has_token_input_output); + + int NrInputs() { return main_.getNumArguments(); } + mlir::TypeRange InputTypes() { return main_.getArgumentTypes(); } + + int NrOutputs() { return main_.getNumResults(); } + mlir::TypeRange OutputTypes() { return main_.getResultTypes(); } + + // Sets the platform index argument, if the module is compiled for multiple + // platforms, and then erases the argument. + absl::Status SetPlatformIndex(absl::string_view compilation_platform); + + // Refines the dynamic module arguments based on the static argument shapes. + // This assumes that the module has a "main" function without dimension args, + // but possibly with dynamic shapes. We read the static shapes of the inputs, + // then set them as the types of the function parameters, and run StableHLO + // shape refinement to specialize all dynamic shapes in the StableHLO program + // to static shapes. + // Starting with version 9, the "main" function may accept token arguments. 
+ // + // If the module uses multi-platform lowering, and you called SetPlatformIndex + // then the refinement will also remove the dead platform code. + // + // This method accepts a list of `llvm::ArrayRef` instead of `mlir::Type`. + // This is to prevent callers from accidentally passing `mlir::Type` owned by + // a context that's different from the one passed to `Create`, which could + // cause lifetime issues. + // The input_shapes includes only the non-token and the non-platform-index + // arguments. + absl::Status RefineDynamicShapes(llvm::ArrayRef input_shapes); + + // Validates that the module only contains ops from valid dialects. + absl::Status ValidateDialect(); + + // Validates that the module represents a statically-shaped StableHLO program, + // otherwise all sorts of weirdness might happen in the HLO exporter which is + // much easier to detect here. + absl::Status ValidateStaticShapes(); + + // Lowers the StableHLO module to MHLO in place. + absl::Status LowerModuleToMhlo(); + + // Lowers the MHLO module to XlaComputation and returns it. + // + // REQUIRES: `LowerModuleToMhlo()` is called beforehand. + absl::StatusOr ToXlaComputation(); + + // Returns the deserialized stablehlo module. + mlir::ModuleOp module() & { return *module_; } + mlir::OwningOpRef module() && { return std::move(module_); } + + private: + XlaCallModuleLoader() = default; + + // Initializes the loader with the given serialized module string. + absl::Status LoadModule(mlir::MLIRContext* context, int version, + mlir::StringRef module_str, + std::vector disabled_checks, + std::vector platforms, + int num_invocation_args, + bool main_has_token_input_output); + + // Adds a wrapper for the "main" function to compute the platform index and + // the dimension arguments. + absl::Status AddMainWrapper(); + + mlir::MLIRContext* context_; + int version_; + mlir::OwningOpRef module_; + std::vector platforms_; + bool platform_index_arg_set_ = false; + // The disabled checks at loading time, including those from the + // disabled_checks attribute and the TF_XLA_FLAGS environment variable. + std::vector loading_disabled_checks_; + mlir::func::FuncOp main_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/layout_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/layout_util.h new file mode 100644 index 00000000..dcb19561 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/layout_util.h @@ -0,0 +1,84 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utilities for working with XLA layout and shapes. 
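// Usage sketch (editorial addition, not part of the vendored header): the call
// sequence documented by XlaCallModuleLoader above, with placeholder module
// bytes, version, platform list and argument shapes. Assumes the
// disabled_checks/platforms vectors hold std::string, and that
// TF_ASSIGN_OR_RETURN / TF_RETURN_IF_ERROR from tsl/platform are available.
absl::StatusOr<xla::XlaComputation> LoadAndLowerSketch(
    mlir::MLIRContext* context, mlir::StringRef serialized_module,
    llvm::ArrayRef<xla::Shape> arg_shapes) {
  TF_ASSIGN_OR_RETURN(
      auto loader,
      tensorflow::XlaCallModuleLoader::Create(
          context, /*version=*/9, serialized_module,
          /*disabled_checks=*/{}, /*platforms=*/{"CPU"},
          /*num_invocation_args=*/static_cast<int>(arg_shapes.size()),
          /*main_has_token_input_output=*/false));
  // Only needed when the module was serialized for multiple platforms.
  TF_RETURN_IF_ERROR(loader->SetPlatformIndex("CPU"));
  TF_RETURN_IF_ERROR(loader->RefineDynamicShapes(arg_shapes));
  TF_RETURN_IF_ERROR(loader->ValidateDialect());
  TF_RETURN_IF_ERROR(loader->ValidateStaticShapes());
  TF_RETURN_IF_ERROR(loader->LowerModuleToMhlo());
  return loader->ToXlaComputation();
}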
+ +#ifndef TENSORFLOW_COMPILER_TF2XLA_LAYOUT_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LAYOUT_UTIL_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +class XlaShapeLayoutHelpers { + public: + // The following defines the layout preference of an xla tensor. + // The return value of LayoutPreferenceFn can be used in + // XlaHelper::ShapeRepresentationFn. + typedef std::function)> + LayoutPreferenceFn; + + // A bundle of LayoutPreferenceFn and ShapeRepresentationFn. + struct ShapeDeterminationFns { + // Use no preference function, and identity shape representation function, + // as default value. + ShapeDeterminationFns(); + + ShapeDeterminationFns( + LayoutPreferenceFn layout_preference_fn, + XlaHelpers::ShapeRepresentationFn shape_representation_fn) + : layout_preference_fn(layout_preference_fn), + shape_representation_fn(shape_representation_fn) {} + + LayoutPreferenceFn layout_preference_fn; + XlaHelpers::ShapeRepresentationFn shape_representation_fn; + }; +}; + +// Return a LayoutPreferenceFn that always uses kNoPreference layout. +XlaShapeLayoutHelpers::LayoutPreferenceFn UseNoPreferenceLayoutFn(); + +// Rewrites the layout of xla_shape if there is tiled sharding. +absl::Status RewriteLayoutWithShardedShape( + const std::optional& sharding, bool use_fast_memory, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + xla::Shape* xla_shape); + +// Adds reshapes to fix the layout of an output, if a shape_representation_fn or +// sharding is present. +absl::StatusOr ReshapeWithCorrectRepresentationAndSharding( + xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + std::optional sharding, bool fast_mem); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/broadcast.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/broadcast.h new file mode 100644 index 00000000..48dec32a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/broadcast.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_ + +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Forwards to xla::BroadcastTo. +// TODO(cheshire): Call the underlying function directly. +absl::StatusOr BroadcastTo(xla::XlaOp input, + absl::Span output_dims); + +// Forwards to xla::BroadcastOpsToSame. +absl::Status BroadcastOpsToSame(xla::XlaOp* lhs, xla::XlaOp* rhs); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/data_format.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/data_format.h new file mode 100644 index 00000000..131f5491 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/data_format.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_ + +#include "absl/status/statusor.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +// Reformat from NCHW_VECT_C to NCHW. +// +// Prerequisites: the last dimension of the input must be of size 4. +absl::StatusOr NCHW_VECT_CToNCHW(xla::XlaOp input); + +// Reformat from NCHW to NCHW_VECT_C. +// +// Prerequisites: the vectorized dimension `C` must be a multiple of 4. +absl::StatusOr NCHWToNCHW_VECT_C(xla::XlaOp input); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/random.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/random.h new file mode 100644 index 00000000..3c03633d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/random.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
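// Usage sketch (editorial addition, not part of the vendored header):
// broadcasting two operands to a common shape before an elementwise op; the
// target dimensions in the commented alternative are illustrative.
#include "tensorflow/compiler/tf2xla/lib/broadcast.h"

absl::StatusOr<xla::XlaOp> AddWithBroadcastSketch(xla::XlaOp lhs,
                                                  xla::XlaOp rhs) {
  // In-place broadcast of both operands to their common shape.
  TF_RETURN_IF_ERROR(tensorflow::BroadcastOpsToSame(&lhs, &rhs));
  // Alternatively, broadcast a single operand to an explicit shape:
  //   TF_ASSIGN_OR_RETURN(xla::XlaOp b, tensorflow::BroadcastTo(lhs, {2, 3}));
  return xla::Add(lhs, rhs);
}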
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ + +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Builds an array of values sampled from a truncated normal distribution: +// +// uniform: an array of random numbers in uniform distribution (0, 1). +// mu: the mean of the normal distribution. +// sigma: the standard deviation of the normal distribution. +// a: the lower bound of the generated values. +// b: the upper bound of the generated values. +xla::XlaOp ParameterizedTruncatedNormal(xla::XlaOp uniform, xla::XlaOp mu, + xla::XlaOp sigma, xla::XlaOp a, + xla::XlaOp b); + +// A specialized version of ParameterizedTruncatedNormal, with mu=0, sigma=1, +// a=-2 and b=2. +xla::XlaOp TruncatedNormal(xla::XlaOp uniform); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/scatter.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/scatter.h new file mode 100644 index 00000000..90af6e63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/scatter.h @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_ + +#include + +#include "absl/status/statusor.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/builder/xla_computation.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Builds an XLA computation that performs a scatter operation on `buffer`, +// returning an updated buffer. +// For each i0, i1, ..., sets +// buffer[indices[i0, i1, ...], ...] := updates[i0, i1, ...] +// +// If `indices_are_vectors` is false, then each index in indices is a scalar, +// and the shape of `indices` must be a prefix of the shape of updates. +// Otherwise, `indices_are_vectors`, then indices are multidimensional and the +// minor dimension of `indices` represents a vector of indices. +// +// If `updates` is a scalar, then it will be broadcasted into the expected shape +// of updates. +// +// If any part of the update region is out-of-bounds, the corresponding update +// is discarded. +// +// If a `combiner` is provided, updates are combined with the existing values in +// the buffer using the combiner function. Otherwise, the updates replace the +// existing values. The order of updates is implementation-defined. 
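// Usage sketch (editorial addition, not part of the vendored header): feeding
// uniform samples into TruncatedNormal; the builder, element type and sample
// count are illustrative.
xla::XlaOp TruncatedNormalSketch(xla::XlaBuilder* b) {
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {128});
  // Uniform samples strictly above 0, since the helper expects values in the
  // open interval (0, 1).
  xla::XlaOp uniform = xla::RngUniform(xla::ConstantR0<float>(b, 1e-7f),
                                       xla::ConstantR0<float>(b, 1.0f), shape);
  // Standard truncated normal: mu = 0, sigma = 1, truncated to [-2, 2].
  return tensorflow::TruncatedNormal(uniform);
}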
+absl::StatusOr XlaScatter( + const xla::XlaOp& buffer, const xla::XlaOp& updates, + const xla::XlaOp& indices, bool indices_are_vectors, + bool indices_are_sorted, + const std::function& + combiner, + xla::XlaBuilder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/util.h new file mode 100644 index 00000000..eaf52188 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/lib/util.h @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ + +#include + +#include "absl/types/span.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/builder/xla_computation.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Returns a floating point scalar constant of 'type' with 'value'. +// If 'type' is complex, returns a real value with zero imaginary component. +xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + double value); + +// Makes a 1D tensor [0, ..., x, y] from two tensors x and y with zeros +// prepended until the array is length n_dims. +xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, + absl::Span starts); + +// Returns a integer scalar constant of 'type' with 'value'. +// If 'type' is complex, returns a real value with zero imaginary component. +xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + int64_t value); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/literal_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/literal_util.h new file mode 100644 index 00000000..4463024e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/literal_util.h @@ -0,0 +1,81 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utilities for working with XLA Literals. 
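// Usage sketch (editorial addition, not part of the vendored header): a
// scatter-add over a buffer with scalar indices. The combiner signature
// (XlaOp, XlaOp, XlaBuilder*) -> XlaOp is assumed from the declaration above.
absl::StatusOr<xla::XlaOp> ScatterAddSketch(xla::XlaBuilder* builder,
                                            xla::XlaOp buffer,
                                            xla::XlaOp indices,
                                            xla::XlaOp updates) {
  // Combine each update with the existing buffer value by addition.
  auto add_combiner = [](xla::XlaOp x, xla::XlaOp y, xla::XlaBuilder*) {
    return xla::Add(x, y);
  };
  return tensorflow::XlaScatter(buffer, updates, indices,
                                /*indices_are_vectors=*/false,
                                /*indices_are_sorted=*/false, add_combiner,
                                builder);
}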
+ +#ifndef TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ + +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "xla/literal.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Returns a BorrowingLiteral that utilizes the same underlying buffer owned by +// 'host_tensor'. +absl::Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, + xla::BorrowingLiteral* literal); +// Similar as above, except the literal shape is explicitly provided and used +// instead of obtaining it from the 'host_tensor'. The provided literal shape +// 'xla_shape' must be compatible with the shape of 'host_tensor'. +absl::Status HostTensorToBorrowingLiteral(const xla::Shape& xla_shape, + const Tensor& host_tensor, + xla::BorrowingLiteral* literal); + +// Returns a Literal with the contents of 'host_tensor', backed by its own +// storage (i.e., not reusing 'host_tensor's buffers.) +absl::StatusOr HostTensorToLiteral(const Tensor& host_tensor); + +// Returns a MutableBorrowingLiteral that utilizes the same underlying buffer +// owned by 'host_tensor', but is mutable via the xla::Literal methods. +absl::Status HostTensorToMutableBorrowingLiteral( + Tensor* host_tensor, xla::MutableBorrowingLiteral* literal); +// Similar as above, except the literal shape is explicitly provided and used +// instead of obtaining it from the 'host_tensor'. The provided literal shape +// 'xla_shape' must be compatible with the shape of 'host_tensor'. +absl::Status HostTensorToMutableBorrowingLiteral( + const xla::Shape& xla_shape, Tensor* host_tensor, + xla::MutableBorrowingLiteral* literal); + +// Returns a BorrowingLiteral tuple that utilizes the same underlying buffers +// owned by 'host_tensors'. +absl::Status HostTensorsToBorrowingLiteralTuple( + absl::Span host_tensors, xla::BorrowingLiteral* literal); + +// Copies 'literal' to freshly allocated 'host_tensor', which is allocated of +// type . +// Fails if the literal's primitive type != +// DataTypeToPrimitiveType(target_type). Note that is not +// derivable from the type of , because multiple tensorflow types map +// to the same XLA type (e.g. INT32 and QINT32 both map to INT32 in +// XLA). +absl::Status LiteralToHostTensor(const xla::LiteralSlice& literal, + DataType target_type, Tensor* host_tensor); + +// Copies the contents of 'literal' to a previously allocated tensor +// 'host_tensor'. The tensor and the literal must have the same number of +// elements and the same type. +absl::Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal, + Tensor* host_tensor); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_bridge_pass.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_bridge_pass.h new file mode 100644 index 00000000..eae5fb83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_bridge_pass.h @@ -0,0 +1,75 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
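// Usage sketch (editorial addition, not part of the vendored header):
// round-tripping a host Tensor through an owned XLA literal; the dtype and
// shape are illustrative.
absl::Status LiteralRoundTripSketch() {
  tensorflow::Tensor host(tensorflow::DT_FLOAT,
                          tensorflow::TensorShape({2, 2}));
  host.flat<float>().setZero();
  // Copying conversion: the returned literal owns its storage.
  TF_ASSIGN_OR_RETURN(xla::Literal literal,
                      tensorflow::HostTensorToLiteral(host));
  // Copy back into a freshly allocated tensor of a compatible TF dtype.
  tensorflow::Tensor back;
  return tensorflow::LiteralToHostTensor(literal, tensorflow::DT_FLOAT, &back);
}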
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ + +#include + +#include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// This pass uses MLIR to implement all the conversion steps to target XLA from +// a TensorFlow Function Graph. It is meant to expose a very limited set of +// functionalities during the bring-up of MLIR-based bridge. +class MlirBridgePass : public MlirOptimizationPass { + public: + llvm::StringRef name() const override { return "bridge"; } + + MlirOptimizationPassState GetPassState( + const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library) const override; + + // This should be used as a thin mapper around mlir::ModulePass::runOnModule + // API integrated with the Tensorflow runtime. + absl::Status Run(const std::string& function_name, + const ConfigProto& config_proto, mlir::ModuleOp module, + const Graph& graph, + const FunctionLibraryDefinition& function_library) override; +}; + +// This pass uses MLIR to implement all the conversion steps to target XLA from +// a TensorFlow V1 Graph. It is meant to expose a very limited set of +// functionalities during the bring-up of MLIR-based bridge. +class MlirBridgeV1CompatPass : public MlirV1CompatOptimizationPass { + public: + llvm::StringRef name() const override { return "bridge"; } + + MlirOptimizationPassState GetPassState( + const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library) const override; + + // This should be used as a thin mapper around mlir::ModulePass::runOnModule + // API integrated with the Tensorflow runtime. + absl::Status Run(const GraphOptimizationPassOptions& options, + mlir::ModuleOp module) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h new file mode 100644 index 00000000..6053f5d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_MLIR_XLA_OP_KERNEL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_MLIR_XLA_OP_KERNEL_H_ + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// An XlaOpKernel that's implemented by lowering using MLIR TensorFlow to HLO +// legalization. +class MlirXlaOpKernel : public XlaOpKernel { + public: + explicit MlirXlaOpKernel(OpKernelConstruction* ctx); + + private: + absl::Status ContextToXlaArgs(XlaOpKernelContext* ctx, + std::vector& xla_args); + void Compile(XlaOpKernelContext* ctx) override; + absl::Status ConstructXlaOp(XlaOpKernelContext* ctx); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_MLIR_XLA_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/rearrange_function_argument.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/rearrange_function_argument.h new file mode 100644 index 00000000..1a290017 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/rearrange_function_argument.h @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_H_ +#define TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// For the given graph `g`: +// 1. Rewrite If/While node functions to rearrange arguments and return values, +// so that all resource arguments/return values are placed in the end (as +// required by XlaCompiler), +// 2. Inline StatefulPartitionedCall nodes so we do not need to rearrange +// arguments and return values. +// `get_function_body_fn` is used to instantiate FunctionDef. +// `fld` is used to store rewritten functions. +// `global_fld` is used to potentially supply stack traces for functions when +// they are not found in `fld`. 
+absl::Status RearrangeFunctionArguments( + std::function + get_function_body_fn, + Graph* g, FunctionLibraryDefinition* fld, + const FunctionLibraryDefinition* global_fld = nullptr); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_REARRANGE_FUNCTION_ARGUMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_operation_table.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_operation_table.h new file mode 100644 index 00000000..61c7a56f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_operation_table.h @@ -0,0 +1,71 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ +#define TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/logging.h" + +// Exposes information about the resource operations supported by tf2xla in a +// structured form. + +namespace tensorflow { +enum class XlaResourceOpKind { + kRead, // Only reads from resources. + kWrite, // Only writes to resources. + kReadWrite // Reads from and writes to resources. +}; + +enum class XlaResourceKind { + kVariable, // Operates on resource variables. + kStack, // Operates on stacks. + kTensorArray // Operates on tensor arrays. +}; + +class XlaResourceOpInfo { + public: + explicit XlaResourceOpInfo(XlaResourceOpKind op_kind, + XlaResourceKind resource_kind) + : op_kind_(op_kind), resource_kind_(resource_kind) {} + + XlaResourceOpKind kind() const { return op_kind_; } + XlaResourceKind resource_kind() const { return resource_kind_; } + + static absl::string_view XlaResourceOpKindToString(XlaResourceOpKind op_kind); + + private: + XlaResourceOpKind op_kind_; + XlaResourceKind resource_kind_; +}; + +// Returns a XlaResourceOpInfo describing `op` if it is a resource operation +// supported by tf2xla, otherwise returns null (i.e. if this returns null then +// `op` is either not a resource operation or is unsupported by XLA). +const XlaResourceOpInfo* GetResourceOpInfoForOp(absl::string_view op); + +namespace resource_op_table_internal { +// NB! Implementation detail exposed for unit testing, do not use. +// +// Returns the set of resource operations known by this module. +std::vector GetKnownResourceOps(); +} // namespace resource_op_table_internal + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_util.h new file mode 100644 index 00000000..e4bdb511 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/resource_util.h @@ -0,0 +1,96 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
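// Usage sketch (editorial addition, not part of the vendored header): querying
// the resource operation table for a well-known resource op; the op name is
// illustrative.
void ResourceOpQuerySketch() {
  const tensorflow::XlaResourceOpInfo* info =
      tensorflow::GetResourceOpInfoForOp("AssignVariableOp");
  if (info != nullptr &&
      info->kind() == tensorflow::XlaResourceOpKind::kWrite &&
      info->resource_kind() == tensorflow::XlaResourceKind::kVariable) {
    // AssignVariableOp writes to a resource variable.
  }
}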
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_RESOURCE_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_RESOURCE_UTIL_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/hash/hash.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +class ResourceUsageAnalysis { + public: + // NodeInfo is a triple of function_name:node_name:op to uniquely identity a + // node in graph. ResourceUsageAnalysis uses it to represent resource sources + // and users. + class NodeInfo { + public: + std::optional function_name_; + std::string node_name_; + std::string op_; + + NodeInfo() {} + + NodeInfo(const std::optional& function_name, + std::string node_name, std::string op) + : function_name_(function_name), + node_name_(std::move(node_name)), + op_(std::move(op)) {} + + std::string DebugString() const { + return absl::StrJoin({function_name_.value_or(""), node_name_, op_}, ":"); + } + + bool operator==(const NodeInfo& o) const { + return function_name_ == o.function_name_ && node_name_ == o.node_name_ && + op_ == o.op_; + } + + template + friend H AbslHashValue(H h, const NodeInfo& o) { + return H::combine(std::move(h), o.function_name_, o.node_name_, o.op_); + } + }; + + // This method analyzes a Tensorflow graph and finds all operations that + // create Stack/TensorArray resources and all the operations that consume + // resource created by them. + // + // Note that _Arg nodes that introduce resources are not considered sources. + // Note again that Control Flow v1 nodes + // (Enter/Exit/Switch/Merge/NextIteration) are not supported. Graphs contain + // these nodes cause analysis failures. However Control Flow v2 nodes + // (While/If) will be supported. + // + // TODO(b/135628319): Support analyzing functional while/if as pass-through + // ops. + // + // For example, consider following subgraph: + // + // TensorArrayOp -> Identity -> TensorArrayWriteOp + // + // It should be able to tell that TensorArrayWriteOp actually operates on the + // resource created by TensorArrayOp even though there might be + // non-resource-specific operations like Identity (or other pass-through + // operations). + // + // source_to_path maps the nodes that creates resources to all nodes that + // operate on the corresponding resource, not including sources themselves. It + // is cleared upon calling this method. 
+ static absl::Status Analyze( + const Graph* graph, FunctionLibraryRuntime* lib_runtime, + absl::flat_hash_map>* + source_to_path); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_TF2XLA_RESOURCE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/shape_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/shape_util.h new file mode 100644 index 00000000..018ab191 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/shape_util.h @@ -0,0 +1,87 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utilities for working with XLA shapes. + +#ifndef TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ + +#include + +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Convert an XLA Shape into the equivalent TensorFlow shape. May fail since +// not all XLA shapes can be represented as TensorShapes. +absl::Status XLAShapeToTensorShape(const xla::Shape& shape, + TensorShape* tensor_shape); + +// Convert a TensorShape into the equivalent XLA Shape proto. Unlike Tensorflow, +// XLA shapes include the type. Not all `dtype` values can be represented by +// XLA, so this conversion may fail. +absl::Status TensorShapeToXLAShape(DataType dtype, + const TensorShape& tensor_shape, + xla::Shape* shape); + +absl::StatusOr TensorShapeToXLAShape( + DataType dtype, const TensorShape& tensor_shape); + +// Converts a TensorShape into the equivalent XLA Shape proto, taking an +// xla::PrimitiveType to specify the element type. This never fails. +xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, + const TensorShape& tensor_shape); + +// Convert a PartialTensorShape into the equivalent XLA Shape proto. An shape +// with unknown rank is represented by an r1 with empty dimension. +absl::Status TensorShapeToXLAShape(DataType dtype, + const PartialTensorShape& tensor_shape, + xla::Shape* shape); + +// Convert a PartialTensorShape into the equivalent XLA Shape proto. An shape +// with unknown rank is represented by an r1 with empty dimension. +xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, + const PartialTensorShape& tensor_shape); + +absl::Status TensorShapeToBoundedXLAShape( + DataType dtype, const PartialTensorShape& tensor_shape, + const TensorShape& bound, xla::Shape* shape); + +// Given an XLA shape with layouts, builds a layout vector in the form able to +// be fed to ops like InfeedEnqueue/InfeedEnqueueTuple/XRTAllocateV2/.... +// THe returned vector is a linearized sequence of the minor-to-major values of +// the layouts held within the input shape. +// In case the input shape is a tuple, the minor-to-major values will be in the +// order of the tuple elements within the tuple shape. 
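// Usage sketch (editorial addition, not part of the vendored header):
// converting a TensorFlow shape to an XLA shape and back; the dtype and
// dimensions are illustrative.
absl::Status ShapeRoundTripSketch() {
  tensorflow::TensorShape tf_shape({2, 3, 4});
  TF_ASSIGN_OR_RETURN(
      xla::Shape xla_shape,
      tensorflow::TensorShapeToXLAShape(tensorflow::DT_FLOAT, tf_shape));
  tensorflow::TensorShape back;
  return tensorflow::XLAShapeToTensorShape(xla_shape, &back);
}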
+// If a shape (or a subshape of a tuple shape) has missing layout, a rank long +// sequence of -1 values will be emitted. +absl::StatusOr> GetShapeLayoutVector(const xla::Shape& shape); + +// Given the input shape and a linearized sequence of the minor-to-major values +// of the layouts, create the output shape by rewriting the input shape layouts. +// If a layout is missing (has -1 values) for a matching tuple subshape, the +// layout_func will be called, if not nullptr. +absl::Status GetShapeWithLayout( + const xla::Shape& input_shape, absl::Span minor_to_major, + const std::function& layout_func, + xla::Shape* output_shape); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/sharding_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/sharding_util.h new file mode 100644 index 00000000..473ad1dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/sharding_util.h @@ -0,0 +1,58 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ + +#include + +#include "xla/hlo/builder/sharding_builder.h" +#include "xla/status_macros.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Parses the op sharding from the 'replicated core' device_name . +// Returns an error: +// - if the device name is invalid. +// - the core is parsed and is out of the range [0, num_cores_per_replica). +// +// Otherwise, returns either: +// - explicit_sharding if explicit_sharding.has_value() +// - a non-value if there is no assigned core or +// - a sharding set as per xla::sharding_builder::AssignDevice. +absl::StatusOr> ParseShardingFromDevice( + const string& device_name, int num_cores_per_replica, + std::optional explicit_sharding = std::nullopt, + std::optional metadata = std::nullopt); + +absl::StatusOr> ParseShardingFromDevice( + const Node& node, int num_cores_per_replica, bool add_metadata); + +absl::StatusOr> ParseShardingFromDevice( + const NodeDef& node_def, int num_cores_per_replica, bool add_metadata); + +absl::StatusOr> ParseShardingFromEdgeSource( + const Edge& edge, int num_cores_per_replica, bool add_metadata); + +void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst); + +// Get sharding inforamtion from node. 
+absl::StatusOr> GetShardingFromNodeDef( + const NodeDef& node_def, bool add_metadata); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/side_effect_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/side_effect_util.h new file mode 100644 index 00000000..34f30eb7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/side_effect_util.h @@ -0,0 +1,69 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_SIDE_EFFECT_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_SIDE_EFFECT_UTIL_H_ + +#include + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Side-effecting nodes will have this attribute set. Its value is the list of +// node names which this node has side-effect dependencies on. +// +// Nodes like HostCompute, SendToHost, RecvFromHost always have this attribute, +// because they always have side-effect. +// If and While nodes may or may not have this attribute, depending on whether +// their bodies have side-effecting nodes. +extern const char kXlaTokenInputNodesAttrName[]; + +// This node name is used in kXlaTokenInputNodesAttrName attr to signal that a +// node has side-effect dependency on current graph's token input. +extern const char kXlaTokenArgNodeName[]; + +// This node have XlaRecvAtHost/XlaSendFromHost in its associated functions. +extern const char kXlaHasHostTransferAttrName[]; + +// This attribute is the replica id for an outside compilation node node. +extern const char kXlaReplicaIdAttrName[]; + +// This node is a Placeholder node added for tail outside compilation. +extern const char kXlaIsPlaceholderForTailOcAttrName[]; + +// This attribute is the original node name for this node. +extern const char kXlaOriginalOutsideCompilationNodeName[]; + +// Sets device ordinal attribute for nodes with attribute +// `kXlaHasHostTransferAttrName`. +absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal); + +// Calculates side-effect dependencies for the graph's token output. +// Returns a set of node names representing these dependencies. +std::set CalculateTokenInputsForOutputToken(const Graph& g); + +// Returns whether a graph contains side-effecting nodes. +bool HasSideEffectingNodes(const Graph& g); + +// Parse the mapping from outside_compilation_subgraph name to core number, +// which is specified in an attr as a list of strings +// :. 
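// Usage sketch (editorial addition, not part of the vendored header): parsing
// a sharding assignment from a replicated-core device string, assuming the
// returned optional holds an xla::OpSharding; the device name format and core
// count are illustrative.
absl::Status ShardingFromDeviceSketch() {
  TF_ASSIGN_OR_RETURN(
      std::optional<xla::OpSharding> sharding,
      tensorflow::ParseShardingFromDevice(
          /*device_name=*/"/device:TPU_REPLICATED_CORE:0",
          /*num_cores_per_replica=*/8));
  if (sharding.has_value()) {
    // The node is assigned to replicated core 0.
  }
  return absl::OkStatus();
}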
+absl::Status ParseHostComputeCoreList(absl::Span list_from_attr, + std::map* host_compute_core); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_SIDE_EFFECT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/test_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/test_util.h new file mode 100644 index 00000000..2b2eb4f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/test_util.h @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper functions for tests. + +#ifndef TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/equal_graph_def.h" + +namespace tensorflow { + +// Same as InstantiationResult, but has a GraphDef instead of just nodes. +struct InstantiationResultForTest { + DataTypeVector arg_types; + DataTypeVector ret_types; + GraphDef gdef; +}; + +// Instantiates a function, producing a GraphDef to compare against the +// expected graph. +absl::Status InstantiateFunctionForTest( + const string& name, const FunctionLibraryDefinition& library, + InstantiationResultForTest* result); + +} // namespace tensorflow + +// Variant of TF_EXPECT_GRAPH_EQ that also compares internal attributes for +// equality. +#define TF_EXPECT_GRAPH_EQ_INTERNAL(expected, actual) \ + do { \ + string diff; \ + EqualGraphDefOptions eq_options; \ + eq_options.ignore_internal_attrs = false; \ + EXPECT_TRUE(EqualGraphDef(actual, expected, &diff, eq_options)) \ + << diff << "\nActual: " << SummarizeGraphDef(actual); \ + } while (false) + +#endif // TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla.h new file mode 100644 index 00000000..095ad49a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla.h @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_H_ +#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "xla/client/client.h" +#include "xla/hlo/builder/xla_computation.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Converts a tensorflow::GraphDef into an xla::XlaComputation. The given +// `config` specifies the portion of the graph to convert, via feeds and +// fetches. Each feed is a positional input argument for the generated +// computation, while each fetch is a positional output argument. +// +// The computation is built in the context of the given `client`, which may +// subsequently be used to compile or execute the computation. +absl::Status ConvertGraphDefToXla(GraphDef graph_def, + const tf2xla::Config& config, + xla::Client* client, + xla::XlaComputation* computation); + +// Similar to ConvertGraphDefToXla, but uses MLIR and handle debug information. +// +// debug_info_filename: the file for the debug information proto. +// debug_info_path_begin_marker: if not empty, file pathes in the debug +// information are trimmed from the beginning to the first appearance of the +// marker. +absl::Status ConvertGraphDefToXlaViaMlir( + GraphDef graph_def, const tf2xla::Config& config, + xla::XlaComputation* computation, absl::string_view debug_info_filename, + absl::string_view debug_info_path_begin_marker); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_defs.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_defs.h new file mode 100644 index 00000000..2f81d2dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_defs.h @@ -0,0 +1,65 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_DEFS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_DEFS_H_ + +#include + +#include "absl/strings/string_view.h" + +namespace tensorflow { + +// Marks a node for XLA compilation. The attribute value indicates the +// compilation device type. +inline constexpr absl::string_view kCompileDeviceTypeAttr = + "_xla_compile_device_type"; +// Marks a node for XLA compilation. +inline constexpr absl::string_view kMustCompileAttr = "_XlaMustCompile"; +// Marks a node for replication. The attribute value indicates the replication +// metadata op. +inline constexpr absl::string_view kReplicationInfoAttr = "_replication_info"; +// Marks a node for XLA-TPU compilation. The attribute value indicates the +// associated compilation cluster and replication metadata op. 
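// Usage sketch (editorial addition, not part of the vendored header): building
// a minimal feed/fetch config and converting a GraphDef; the node names are
// illustrative and the client is assumed to come from the XLA client library.
absl::Status ConvertGraphDefSketch(tensorflow::GraphDef graph_def,
                                   xla::Client* client) {
  tensorflow::tf2xla::Config config;
  config.add_feed()->mutable_id()->set_node_name("input");
  config.add_fetch()->mutable_id()->set_node_name("output");
  xla::XlaComputation computation;
  return tensorflow::ConvertGraphDefToXla(std::move(graph_def), config, client,
                                          &computation);
}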
+inline constexpr absl::string_view kTpuReplicateAttr = "_tpu_replicate";
+// Marks a node inside of an XLA compilation cluster to be placed outside of the
+// cluster.
+inline constexpr absl::string_view kXlaOutsideCompilationAttr =
+    "_xla_outside_compilation";
+// Frontend attributes ID.
+inline constexpr absl::string_view kXlaFrontendAttributesAttrName =
+    "_XlaFrontendAttributes";
+// Device types.
+inline constexpr absl::string_view kDeviceAttr = "device";
+inline constexpr absl::string_view kCpuDevice = "CPU";
+inline constexpr absl::string_view kGpuDevice = "GPU";
+inline constexpr absl::string_view kTpuDevice = "TPU";
+inline constexpr absl::string_view kEmptyDevice = "";
+// Device type may be empty in ops such as TF.PartitionedCall.
+inline constexpr std::array kValidDeviceTypes = {
+    kCpuDevice, kGpuDevice, kTpuDevice, kEmptyDevice};
+// Attributes that need to be propagated during rewrites (e.g., in
+// functionalization).
+inline constexpr std::array kAttrsToPropagate = {
+    kCompileDeviceTypeAttr,
+    kReplicationInfoAttr,
+    kXlaFrontendAttributesAttrName,
+    kXlaOutsideCompilationAttr,
+    kTpuReplicateAttr,
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_DEFS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_opset.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_opset.h
new file mode 100644
index 00000000..37fa8f39
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_opset.h
@@ -0,0 +1,30 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_
+
+#include
+#include
+
+#include "absl/status/statusor.h"
+
+namespace tensorflow {
+
+absl::StatusOr> GetRegisteredXlaOpsForDevice(
+    absl::string_view device_name);
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
new file mode 100644
index 00000000..1b45fb4c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+
+namespace tensorflow {
+namespace tf2xla {
+
+// The implementation of a main function for a binary that prints a table of
+// supported tf2xla operators for a given device, along with their type
+// constraints, to stdout.
+//
+// Pass the argc and argv from main, unmodified. Use regen_run to specify the
+// command used to regenerate the table.
+void SupportedOpsMain(int argc, char** argv, const char* regen_run);
+
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_util.h
new file mode 100644
index 00000000..f2ce3944
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -0,0 +1,226 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
+
+#include
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
+#include "xla/status_macros.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// ValidateConfig returns OK iff config is valid.
+absl::Status ValidateConfig(const tf2xla::Config& config);
+
+// Modifies `graph_def` to include placeholders for each fed tensor, and
+// updates references to the fed tensors to refer to the placeholders.
+// The existing nodes referenced by the feeds are not removed or modified
+// (except where their input edges are modified by the replacement of other
+// feeds).
+absl::Status AddPlaceholdersForFeeds(
+    const tf2xla::Config& config, const OpRegistryInterface* op_registry,
+    std::unordered_map* feed_remapping, GraphDef* graph_def);
+
+// Returns in `out` a copy of `in`, pruned to only include fetches from
+// `config`.
+absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
+                               GraphDef* out);
+
+// Returns node:port for the given `id`.
+string TensorIdToString(const tf2xla::TensorId& id);
+
+// Updates the sharding of `n` based on the sharding of its neighbors.
+// If `out_edges` is true, outgoing edges from `n` are considered; else incoming
+// edges are considered.
+absl::Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
+
+// Adds an allowed data type to the AttrConstraint with the given name.
+void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype, + KernelDef* kdef); + +// Returns the next random seed to use for seeding xla rng. +uint32 GetXLARandomSeed(); + +// Indicates how a FunctionDef is associated with a graph node (e.g. the node is +// a function call, or the node has function attrs). +class AssociatedFunctionInfo { + public: + enum AssociatedFunctionType { + kFunctionAttr = 0, + kFunctionCallNode = 1, + kSymbolicGradient = 2, + }; + + // The function is an attr of the node. + static AssociatedFunctionInfo FunctionAttr(const string& func_name, + const AttrValueMap& attrs, + const string& attr_name) { + return AssociatedFunctionInfo(kFunctionAttr, func_name, attrs, attr_name); + } + + // The node is a function call. + static AssociatedFunctionInfo FunctionCall(const string& func_name, + const AttrValueMap& attrs) { + // attr_name will not be used in this case. + return AssociatedFunctionInfo(kFunctionCallNode, func_name, attrs, + /*attr_name=*/""); + } + + // The node is a SymbolicGradient op. + static AssociatedFunctionInfo SymbolicGradient(const string& func_name, + const AttrValueMap& attrs) { + // attr_name will not be used in this case. + return AssociatedFunctionInfo(kSymbolicGradient, func_name, attrs, + /*attr_name=*/""); + } + + AssociatedFunctionType type() const { return type_; } + + const string& func_name() const { return func_name_; } + + const string& attr_name() const { return attr_name_; } + + const AttrValueMap& attrs() const { return attrs_; } + + private: + AssociatedFunctionInfo(AssociatedFunctionType type, const string& func_name, + const AttrValueMap& attrs, const string& attr_name) + : type_(type), + func_name_(func_name), + attrs_(attrs), + attr_name_(attr_name) {} + + // Available for all instances. + AssociatedFunctionType type_; + string func_name_; + AttrValueMap attrs_; + + // Only available if the function is defined in an attr. + string attr_name_; +}; + +// Returns if the NodeDef has associated function. +bool HasAssociatedFunction(const NodeDef& node_def, + const FunctionLibraryDefinition* fld); + +// Gets functions associated with the node. Current cases: +// 1. For function call node, its function name; +// 2. For SymbolicGradient op, returned func_name will be "SymbolicGradient", +// and returned attrs will be this node's attributes; +// 3. For nodes like XlaWhile/XlaIf, all their function attributes. +std::vector GetAssociatedFunctions( + const Node& node, const FunctionLibraryDefinition* fld); + +// Changes associated functions for the node. Current cases: +// 1. For function call node, creates a new node with the new function name and +// remove the old node; +// 2. For SymbolicGradient op, add or replace GradientDef in +// FunctionLibraryDefinition; +// 3. For nodes like XlaWhile/XlaIf, modify their function attributes. +absl::Status RewriteAssociatedFunction( + Graph* graph, Node* node, FunctionLibraryDefinition* fld, + const AssociatedFunctionInfo& associated_function, + const string& rewritten_function_name); + +// Class to act as cache for FunctionLibraryRuntime::Handle objects. +class CachedFunctionHandles { + public: + CachedFunctionHandles(FunctionLibraryRuntime* flr) : flr_(flr) {} + + // Populates `handle` for requested function and attributes. If we have + // instantiated the function with the same attributes before, `handle` will be + // cached handle; otherwise instantiate the function and populate `handle`. 
+ absl::Status GetOrInstantiate(const string& func_name, AttrSlice attrs, + FunctionLibraryRuntime::Handle* handle); + + // Releases all handles in the cache. Returns first non-OK status if any; + // returns OK otherwise. + absl::Status ReleaseAllHandles(); + + ~CachedFunctionHandles() { ReleaseAllHandles().IgnoreError(); } + + private: + FunctionLibraryRuntime* flr_; + std::map handles_; + + CachedFunctionHandles(const CachedFunctionHandles&) = delete; + void operator=(const CachedFunctionHandles&) = delete; +}; + +// Struct for node's output edge info. +struct OutEdgeInfo { + Node* dst; + int src_output, dst_input; +}; + +// Replaces node `n` with a new node whose NodeDef is `node_def`. +absl::StatusOr ReplaceNode(Graph* g, Node* n, const NodeDef& node_def); + +// Helper function that builds an Identity node. +absl::StatusOr BuildIdentityNode(Graph* graph, const string& node_name, + DataType dtype, const Node* input, + std::optional requested_device); + +// For "If"/"While" nodes, if some of their inputs are Const nodes, rewrite +// body functions to use the Const nodes instead of original _Arg nodes. +// +// For example, say we have the following computation: +// shape = constant_op.constant([1]) +// return tf.cond(pred, lambda: tf.ones(shape), lambda: tf.zeros(shape)) +// If we do not rewrite then/else function, they will use _Arg node as shape +// input for tf.ones/tf.zeros. But XLA requires that shape input to be compile +// time constant, so XLA compilation will fail. This rewriting process will +// change the shape input to Const node. +absl::Status PropagateConstIntoFunctionalNodes( + Graph* g, const FunctionLibraryDefinition* lookup_fld, + FunctionLibraryDefinition* fld); + +// Prunes unreachable FunctionDefs from FunctionLibraryDefinition. +absl::Status PruneUnreachableFunctionsFromGraph(const Graph& g, + FunctionLibraryDefinition* fld); + +// Finds the following pattern in the graph: +// 1) EmptyTensorList -> forward While op -> backward While op, +// 2) in forward While op, a Const node is pushed, +// 3) in backward While op, data is popped from the tensor list. +// And rewrites backward While op to use Const node instead of TensorListPopBack +// result. +// TODO(b/128633174) remove the TensorList and related TensorList ops. +absl::Status RewriteTensorListWithConstElement(Graph* g, + FunctionLibraryDefinition* fld); + +inline bool IsConstTraversableOpType(const Node* node) { + return node->type_string() == "Identity" || + node->type_string() == "IdentityN" || node->IsWhileNode(); +} + +// Determines whether a loop body is invariant for the given argument index. +absl::StatusOr IsLoopInvariant( + const FunctionBody* loop_body, int index, + const FunctionLibraryDefinition* lookup_fld); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/type_util.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/type_util.h new file mode 100644 index 00000000..a3027a5f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/type_util.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_TYPE_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_TYPE_UTIL_H_ + +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Converts a Tensorflow DataType to an XLA PrimitiveType. +absl::Status DataTypeToPrimitiveType(DataType data_type, + xla::PrimitiveType* type); + +// Converts an XLA PrimitiveType to a TensorFlow DataType. +// Caution: The mapping from TF types to XLA types is not one-to-one: for +// example, both DT_INT8 and DT_QINT8 map to xla::S8. So the inverse is not a +// uniquely defined function. This is fine if you want a way to encode an XLA +// object as a TensorFlow object (e.g., in XRT); whereas if you started with a +// TensorFlow object in the first place, you most likely should preserve the +// original TensorFlow type, rather than trying to convert an XLA type back into +// a TensorFlow type. +absl::StatusOr EncodePrimitiveTypeAsDataType(xla::PrimitiveType type); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_TYPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_argument.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_argument.h new file mode 100644 index 00000000..9e2eccd2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_argument.h @@ -0,0 +1,136 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// Describes how to derive the value of each _Arg node in the graph/function +// being compiled. There must be one Argument for each _Arg index. +struct XlaArgument { + enum Kind { + // Default value; not a valid kind. + kInvalid, + + // Argument is a compile-time constant. No associated runtime parameter. + kConstant, + + // Argument is a Variable, TensorArray, or Stack resource. Has an + // associated runtime parameter iff `initialized` is true. 
+ kResource, + + // A resource variable with a constant value known at compile time. + kConstantResource, + + // Argument is a run-time parameter. + kParameter, + + // Argument is an XLA token. + kToken, + + // Argument is a TensorList. + kTensorList, + }; + + Kind kind = kInvalid; + + // The type of the argument. If the argument is a resource, this + // is the type of the variable's value, not DT_RESOURCE. + DataType type = DT_INVALID; + + // The shape of the argument. For: + // * a parameter: the shape of the parameter. We allow setting the xla shape + // if known. This helps avoid conversions to and from TensorShape. + // * a constant: ignored; the shape given by constant_value is used + // instead. + // * an uninitialized resource: ignored. We don't yet know the shape of an + // uninitialized resource (otherwise we would have initialized it!) + // * an initialized variable: the shape of the variable's value. + // * an initialized TensorArray or Stack resource: the shape of an entry in + // the TensorArray/Stack. Note this is the size of a single entry, not the + // XLA data structure that represents the complete stack/array. + absl::variant shape; + + // The value of the argument, if it is a compile-time constant. Must be a + // host-memory tensor. + Tensor constant_value; + + // The upper bounds of the value. + std::optional value_bound; + + // Indicates whether each value is dynamic or constant. + std::optional value_dynamism; + + // The name of this argument, used for debugging. + string name; + + // The name of TensorFlow _Arg node, used for debugging. + string node_name; + + // For a kResource, what kind of resource is it? + XlaResource::Kind resource_kind = XlaResource::kInvalid; + + // For a kResource, has this resource been initialized? + bool initialized = false; + + // For a kResource, is this resource on Fast Memory. + bool fast_mem = false; + + // For a TensorArray or Stack resource, what is the array's declared size? + // (Used for lazy initialization.) + int64_t max_array_size = -1; + + // TensorArray resource parameters are passed as (array, gradient array 0, + // ..., gradient array k), where the gradient arrays are in the same order + // as `tensor_array_gradients`. + std::set tensor_array_gradients; + + // Whether this argument will receive the same data across all replicas. + bool is_same_data_across_replicas = false; + + bool operator==(const XlaArgument& other) const; + + // Returns a human-readable summary of the argument. + string HumanString() const; + + // Returns the dimension sizes for either TensorShape or xla::Shape. + std::vector DimensionSizes() const; + absl::InlinedVector DimensionSizesAsInlinedVector() const; + + // Returns the human-readable string for either TensorShape or xla::Shape. + string ShapeHumanString() const; + + // Whether to broadcast this parameter to all replicas before use. + // When true, xla_compiler should input/output alias this arg to prevent + // unnecessary HBM usage. + bool requires_broadcast = false; + std::optional definition_stack_trace; +}; + +// Returns true if any of `args` is an uninitialized resource variable. 
+bool AnyUninitializedResourceArg(absl::Span args); + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_ARGUMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compilation_device.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compilation_device.h new file mode 100644 index 00000000..e3f6571c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compilation_device.h @@ -0,0 +1,69 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ + +#include + +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +// Class is defined in xla_compilation_device.cc, reference +// included here only so the XlaCompilationDevice allocator_ member can be +// declared. +class XlaCompilationAllocator; + +// This is a 'dummy' TensorFlow device that is only used to execute a +// subgraph of XLA compilation Ops to construct a compiled version +// of the subgraph's computation. It has a 'dummy' allocator that +// backs each Tensor with an XlaExpression. The shape of the Tensor +// matches the shape of XlaExpression. +// +// We deliberately don't register a device factory because we *never* +// want placement to put Ops on a compilation device. The device is created +// manually, not using a factory. +// +// XLA compilation is not thread-safe. OpKernels registered on the +// XlaCompilationDevice must not use threads or concurrency. +class XlaCompilationDevice : public LocalDevice { + public: + XlaCompilationDevice(const SessionOptions& options, DeviceType type); + + ~XlaCompilationDevice() override; + + Allocator* GetAllocator(AllocatorAttributes attr) override; + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + + absl::Status Sync() override; + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + private: + std::unique_ptr allocator_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h new file mode 100644 index 00000000..db280e23 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -0,0 +1,470 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_ + +#include +#include +#include + +#include "xla/cpu_function_runtime.h" +#include "xla/executable_run_options.h" +#include "xla/service/cpu/buffer_desc.h" +#include "xla/service/custom_call_status_internal.h" +#include "tensorflow/core/platform/types.h" + +// Forward-declare, rather than include, to reduce code size for users that +// never use this functionality. +namespace xla { +class ProgramShapeProto; +class HloProfilePrinterData; + +namespace cpu { +class CpuExecutable; +} // namespace cpu +} // namespace xla + +namespace tensorflow { + +// Represents a function compiled by XLA, produced via either JIT or AOT. +// +// The Run method invokes the actual computation, with inputs read from arg +// buffers, and outputs written to result buffers. Each Run call may also use a +// set of temporary buffers for the computation. +// +// By default each instance of this class manages its own arg, result and temp +// buffers. The AllocMode constructor parameter may be used to modify the buffer +// allocation strategy. +// +// Under the default allocation strategy, this class is thread-compatible: +// o Calls to non-const methods require exclusive access to the object. +// o Concurrent calls to const methods are OK, if those calls are made while it +// is guaranteed that no thread may call a non-const method. +class XlaCompiledCpuFunction { + public: + // Type of the raw XLA Classic function, produced by either JIT or AOT. + using RawFunction = void (*)(void* result, + const xla::ExecutableRunOptions* run_options, + const void** args, void** temps, + XlaCustomCallStatus*, int64_t* profile_counters); + + // Simple struct to describe a tensor's shape. + // Note: this is a poor man's substitute for xla::ShapeProto, but we cannot + // depend on protobuf's in this library. + // TODO(ecg): extend ShapeInfo to support tuples, if needed. + struct ShapeInfo { + const int32_t* dimensions = nullptr; + int32_t num_dimensions = 0; + }; + + // StaticData represents the state necessary to run an XLA-compiled + // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for + // AOT this is backed by data compiled into the object file. + // + // The contents of StaticData are XLA-internal implementation details and + // should not be relied on by clients (and therefore are private). + class StaticData { + private: + // The raw function to call. + RawFunction raw_function_; + + // Contains information about the buffers used by the XLA computation. + const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; + int32_t num_buffers_ = 0; + + // Result parameter i is described by + // buffer_infos[result_index_table[i]]. + const int32* result_index_table_ = nullptr; + + // There are num_results result parameters. 
+ int64_t num_results_ = 0; + + // Entry parameter i is described by + // buffer_infos[arg_index_table[i]]. + const int32* arg_index_table_ = nullptr; + + // There are num_args entry parameters. + int64_t num_args_ = 0; + + // There are num_variables variables. + int64_t num_variables_ = 0; + + // The 0-based index of the result tuple, in the temp buffers. + size_t result_index_ = 0; + + const ShapeInfo* arg_shape_infos_ = nullptr; + const ShapeInfo* result_shape_infos_ = nullptr; + + // [Optional] Arrays of arg and result names. These are arrays of C-style + // strings, where the array is terminated by nullptr. + const char** arg_names_ = nullptr; + const char** variable_names_ = nullptr; + const char** result_names_ = nullptr; + + // [Optional] Arg and result shapes. + const xla::ProgramShapeProto* program_shape_ = nullptr; + + // [Optional] Profile printer data. Null if profiling is disabled. + const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; + + // [Optional] The number of profile counters expected in the profile counter + // buffer by the generated code and hlo_profile_printer. 0 if profiling is + // disabled. This information is already present in + // hlo_profile_printer_data but xla::HloProfilePrinterData is forward + // declared so we don't have access to that information here. + int64_t profile_counters_size_ = 0; + + // Only XlaCompiledCpuFunction is allowed to read and write the above + // fields. + friend class XlaCompiledCpuFunction; + }; + + // AllocMode controls the buffer allocation mode. + enum class AllocMode { + // Allocate all buffers - args, results, profile and temps. + ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS, + + // Only allocate result, profile and temp buffers. + // Use set_arg_data to set argument buffers before Run is called. + RESULTS_PROFILES_AND_TEMPS_ONLY, + }; + + explicit XlaCompiledCpuFunction( + const StaticData& static_data, + AllocMode alloc_mode = + AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS); + virtual ~XlaCompiledCpuFunction(); + + XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete; + XlaCompiledCpuFunction& operator=(const XlaCompiledCpuFunction&) = delete; + XlaCompiledCpuFunction(XlaCompiledCpuFunction&&) = default; + XlaCompiledCpuFunction& operator=(XlaCompiledCpuFunction&&) = default; + + // Sets the intra-op thread pool used to run individual ops concurrently. + void set_thread_pool(const Eigen::ThreadPoolDevice* pool) { + run_options_.set_intra_op_thread_pool(pool); + } + + // Runs the computation, with inputs read from arg buffers, and outputs + // written to result buffers. Returns true on success and false on failure. + bool Run(); + + // Returns the error message from the previous failed Run call. + // + // TODO(fschneider): For now this always returns an empty string because there + // is no support for error reporting in XLA. Remove this once all callers are + // updated. + string error_msg() const { return {}; } + + // ------------------------------ + // Arg methods for managing input buffers. Buffers are in row-major order. + + // Returns the buffer for the positional argument at the given `index`. + void* arg_data(size_t index) { + return buffer_table_[arg_index_table_[index]]; + } + const void* arg_data(size_t index) const { + return buffer_table_[arg_index_table_[index]]; + } + + int num_results() const { return num_results_; } + + int num_args() const { return num_args_; } + + int num_variables() const { return num_variables_; } + + // Returns the size of entry parameter `idx`. 
+ // + // There is a static version of this method on tfcompile generated subclasses + // of XlaCompiledCpuFunction, but try to prefer this when possible since it + // works both for XlaJitCompiledCpuFunction and AOT compiled subclasses. + int arg_size(int idx) const { + assert(idx < num_args()); + return buffer_infos_[arg_index_table_[idx]].size(); + } + + // Sets the buffer for the positional argument at the given `index` to `data`. + // Must be called before Run to have an effect. May be called under any + // AllocMode; if the AllocMode is RESULTS_AND_TEMPS_ONLY, this method must be + // called for each positional argument, in order to set the argument buffers. + // + // Allocated memory must be aligned to the size specified by + // xla::cpu_function_runtime::MinAlign(). If possible, use the functions in + // tensorflow/compiler/tf2xla/cpu_function_runtime.h to ensure correct + // alignment. + // + // Aliasing of argument and result buffers is not allowed, and results in + // undefined behavior. + void set_arg_data(size_t index, const void* data) { + assert((arg_size(index) < xla::cpu_function_runtime::MinAlign() || + (uintptr_t)data % xla::cpu_function_runtime::MinAlign() == 0) && + "Underaligned pointer!"); + // The const_cast is safe because the generated code does not write to arg + // buffers. + // + // buffer_table_ contains pointers to buffers that _will_ be written to by + // generated code so it would be misleading to make buffer_table_ a `const + // void**`. + buffer_table_[arg_index_table_[index]] = const_cast(data); + } + + // ------------------------------ + // Result methods for managing output buffers. Buffers are in row-major order. + // Must only be called after a successful Run call. Unlike the arg methods, + // there is no set_resultN_data method. The result buffers are managed + // internally, and may change after each call to Run. + + // Returns the underlying array of result buffers, where results()[I] is the + // buffer for the positional result at index I. + void** results() { return static_cast(buffer_table_[result_index_]); } + const void* const* results() const { + return static_cast(buffer_table_[result_index_]); + } + + // Profile counters for this XLA computation. + // + // When Hlo profiling is enabled (`hlo_profiling_enabled()` return true in + // this case) these counters are non-null and are automatically populated by + // `Run`. The counters can then be pretty-printed using + // `hlo_profile_printer()`. + // + // When Hlo profiling is disabled, this accessor returns null. + const int64_t* profile_counters() const { return profile_counters_; } + + // Returns the buffer for the positional result at the given `index`. + void* result_data(size_t index) { return results()[index]; } + const void* result_data(size_t index) const { return results()[index]; } + + // ------------------------------ + // Methods for extracting optional metadata. + + // Returns true iff data is available for the Lookup{Arg,Variable,Result}Index + // methods. E.g. the data might not be compiled into the binary for AOT. + bool HasNameIndices() const { + return arg_names_ != nullptr && variable_names_ != nullptr && + result_names_ != nullptr; + } + + // Returns the 0-based index for the argument with the given `name`. + // Returns -1 if the name wasn't found, or data isn't available. + // + // The index remains constant for every instance of XlaCompiledCpuFunction + // generated from the same static data, and might not be cheap to determine. 
+ // Recommended usage is to capture this in a variable for re-use. + int LookupArgIndex(const string& name) const; + + // Returns the 0-based index for the variable with the given `name`. + // Returns -1 if the name wasn't found, or data isn't available. + // + // The index remains constant for every instance of XlaCompiledCpuFunction + // generated from the same static data, and might not be cheap to determine. + // Recommended usage is to capture this in a variable for re-use. + int LookupVariableIndex(const string& name) const; + + // Returns the 0-based index for the result with the given `name`. + // Returns -1 if the name wasn't found, or data isn't available. + // + // The index remains constant for every instance of XlaCompiledCpuFunction + // generated from the same static data, and might not be cheap to determine. + // Recommended usage is to capture this in a variable for re-use. + int LookupResultIndex(const string& name) const; + + // Returns the name of the argument at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. + const char* GetArgName(int index) const; + + // Returns the name of the variable at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. + const char* GetVariableName(int index) const; + + // Returns the name of the result at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. + const char* GetResultName(int index) const; + + // Returns the shape of the args and results. May return nullptr if the + // program shape isn't available. + const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; } + + bool hlo_profiling_enabled() const { + return hlo_profile_printer_data_ != nullptr; + } + const xla::HloProfilePrinterData& hlo_profile_printer_data() const { + assert(hlo_profiling_enabled()); + return *hlo_profile_printer_data_; + } + + protected: + // --------------------------------------------------------------------------- + // Accessors for reading from and writing to instances of `StaticData`. + // + // Classes generated by tfcompile can call these because the generated classes + // inherit from `XlaCompiledCpuFunction`. `XlaJitCompiledCpuFunction` can + // call these because it is explicitly added as a friend. 
+ + static void set_static_data_raw_function(StaticData* static_data, + RawFunction raw_function) { + static_data->raw_function_ = raw_function; + } + + static void set_static_data_buffer_infos( + StaticData* static_data, + const xla::cpu_function_runtime::BufferInfo* buffer_infos) { + static_data->buffer_infos_ = buffer_infos; + } + + static void set_static_data_num_buffers(StaticData* static_data, + size_t num_buffers) { + static_data->num_buffers_ = num_buffers; + } + + static void set_static_data_result_index_table( + StaticData* static_data, const int32* result_index_table) { + static_data->result_index_table_ = result_index_table; + } + + static void set_static_data_num_results(StaticData* static_data, + int64_t num_results) { + static_data->num_results_ = num_results; + } + + static void set_static_data_arg_index_table(StaticData* static_data, + const int32* arg_index_table) { + static_data->arg_index_table_ = arg_index_table; + } + + static void set_static_data_num_args(StaticData* static_data, + int64_t num_args) { + static_data->num_args_ = num_args; + } + + static void set_static_data_num_variables(StaticData* static_data, + int64_t num_variables) { + static_data->num_variables_ = num_variables; + } + + static void set_static_data_result_index(StaticData* static_data, + size_t result_index) { + static_data->result_index_ = result_index; + } + + static void set_static_data_arg_shape_infos(StaticData* static_data, + const ShapeInfo* shape_infos) { + static_data->arg_shape_infos_ = shape_infos; + } + + static void set_static_data_result_shape_infos(StaticData* static_data, + const ShapeInfo* shape_infos) { + static_data->result_shape_infos_ = shape_infos; + } + + static void set_static_data_arg_names(StaticData* static_data, + const char** arg_names) { + static_data->arg_names_ = arg_names; + } + + static void set_static_data_variable_names(StaticData* static_data, + const char** variable_names) { + static_data->variable_names_ = variable_names; + } + + static void set_static_data_result_names(StaticData* static_data, + const char** result_names) { + static_data->result_names_ = result_names; + } + + static void set_static_data_program_shape( + StaticData* static_data, const xla::ProgramShapeProto* program_shape) { + static_data->program_shape_ = program_shape; + } + + static void set_static_data_hlo_profile_printer_data( + StaticData* static_data, + const xla::HloProfilePrinterData* hlo_profile_printer_data) { + static_data->hlo_profile_printer_data_ = hlo_profile_printer_data; + } + + static const xla::HloProfilePrinterData* + get_static_data_hlo_profile_printer_data(StaticData* static_data) { + return static_data->hlo_profile_printer_data_; + } + + static void set_static_data_profile_counters_size( + StaticData* static_data, int64_t profile_counters_size) { + static_data->profile_counters_size_ = profile_counters_size; + } + + // TODO(ezhulenev): This is a no-op after removing xla runtime, however it is + // still required for building some targets. Figure out why and delete! + static void set_static_data_use_xla_runtime(StaticData* static_data, bool) {} + + private: + const RawFunction raw_function_; + + const size_t result_index_; + + // Array containing pointers to argument and temp buffers (slots corresponding + // to constant and on-stack buffers are null). + void** const buffer_table_; + + // Describes the buffers used by the XLA computation. 
+ const xla::cpu_function_runtime::BufferInfo* const buffer_infos_; + const int32 num_buffers_; + + // Indices of expanded result tuple. + const int32 num_results_; + const int32* const result_index_table_; + + // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] + // for XLA generated code to be able to find it. + const int32* const arg_index_table_; + + // The number of incoming arguments. + const int32 num_args_; + + // The number of incoming variables. + const int32 num_variables_; + + // Shapes of the input arguments. + const ShapeInfo* const arg_shape_infos_; + + // Shapes of the results. + const ShapeInfo* const result_shape_infos_; + + // Backing memory for buffer_table_ and args_, the latter depending on + // AllocMode. + void* alloc_buffer_table_ = nullptr; + + // Backing memory for profiling counters. + int64_t* profile_counters_ = nullptr; + + // Options and context passed to the compiled function. + xla::ExecutableRunOptions run_options_; + + // Optional metadata. + const char** arg_names_ = nullptr; + const char** variable_names_ = nullptr; + const char** result_names_ = nullptr; + const xla::ProgramShapeProto* program_shape_ = nullptr; + const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; + + // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the + // `set_static_data_*` static methods above. + friend class XlaJitCompiledCpuFunction; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiler.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiler.h new file mode 100644 index 00000000..cbb57f38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_compiler.h @@ -0,0 +1,403 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILER_H_
+#define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILER_H_
+
+#include
+
+#include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
+#include "absl/types/variant.h"
+#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
+#include "tensorflow/compiler/tf2xla/layout_util.h"
+#include "tensorflow/compiler/tf2xla/xla_argument.h"
+#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "xla/client/local_client.h"
+#include "xla/hlo/builder/xla_builder.h"
+#include "xla/hlo/builder/xla_computation.h"
+#include "xla/status_macros.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+class XlaContext;
+
+// The XlaCompiler class is responsible for compilation of a self-contained
+// subgraph of a TensorFlow computation using the XLA linear algebra runtime.
+// It does a symbolic execution of the graph starting from specific input
+// shapes, using a JIT device to convert operators into XLA computations.
+//
+// XlaCompiler is typically invoked from an `XlaLaunch` operator once the
+// shapes of all input parameters to the computation are known. This is
+// because the symbolic execution requires known shapes for all operations.
+//
+// XlaCompiler compiles TensorFlow graphs that receive inputs via _Arg nodes
+// and return outputs via _Retval nodes.
+//
+// The XlaCompiler requires one Argument struct for each _Arg index, that
+// describes each argument. Arguments can be compile-time constants
+// (kind kConstant), run-time parameters (kind kParameter), or resources
+// (kind kResource).
+//
+// Only kParameter and initialized kResource arguments become runtime parameters
+// to the generated XLA computation.
+//
+// The run-time outputs of the XLA computation are arranged in the following
+// order:
+//   +------------------+-----------------------------------------+
+//   |  _Retval values  |  Updated values of kResource arguments  |
+//   +------------------+-----------------------------------------+
+// _Retval values are ordered by _Retval index, whereas kResource values are
+// ordered by the original _Arg position of the variable.
+//
+// If a shape representation function is provided as part of
+// XlaCompiler::CompileOptions, kParameter arguments and return values to an
+// entry computation will be reshaped in accordance with the shape function.
+// Arguments and return values to a non-entry computation are not reshaped.
+// Variable resource arguments are passed and returned in reshaped form, even
+// for non-entry computations. This feature allows TensorFlow to keep on-device
+// tensors with a different shape from their representation inside the XLA
+// computation.
+//
+// In computation outputs, updated kResource values are placed at the end.
+// When emitting While loop bodies, we must ensure that the loop body has
+// identical input and output signatures. By passing variable values
+// at the end of the argument list and using the
+// `return_updated_values_for_all_variables` option, we can ensure that the
+// input and output values of resources appear at the same positions.
+//
+// Resources are passed as parameters or returned as resource updates in
+// "packed" form.
+// kStack resources are packed as (array, size of stack) XLA tuples.
+// kTensorArray resources without gradients are packed as the array that
+// backs the TensorArray. If gradients are present (`tensor_array_gradients`),
+// the packed representation is a (array, gradient0, gradient1, ...) tuple,
+// where gradient_k is the value of the k-th gradient in the
+// `tensor_array_gradients` ordered set.
+class XlaCompiler {
+ public:
+  // TODO(b/255826209): Remove this alias. Depending on XlaCompiler just to use
+  // XlaArgument seems weird and can cause circular dependencies.
+  using Argument = ::tensorflow::XlaArgument;
+
+  // Options pertaining to an individual call to CompileGraph() or
+  // CompileFunction().
+  struct CompileOptions {
+    // If `use_tuple_arg` is true, a single tuple parameter will be used for all
+    // arguments; if false, each argument gets its own parameter.
+    bool use_tuple_arg = false;
+
+    // If 'return_updated_values_for_all_resources' is true, then updated
+    // values of all resource arguments will be included in the
+    // 'resource_updates' of the computation, even if the resource was not
+    // modified by the computation. Used when compiling loop bodies to ensure
+    // the input and output signatures match.
+    bool return_updated_values_for_all_resources = false;
+
+    // If 'always_return_tuple' is true, then the output of a computation will
+    // always be a tuple. Otherwise, a single-element output will not be wrapped
+    // in a tuple.
+    bool always_return_tuple = true;
+
+    // True when compiling the entry computation, false for subcomputations
+    // (while, call, etc.)
+    bool is_entry_computation = true;
+
+    // True when we should add XLA input & output to the graph/function.
+    bool add_token_input_output = false;
+
+    // Resource updates are converted into input / output of xla. The two
+    // buffers are aliased with each other if this option is true.
+    bool alias_resource_update = false;
+  };
+
+  using OutputDescription = ::tensorflow::XlaOutputDescription;
+
+  using ResourceUpdate = ::tensorflow::XlaResourceUpdate;
+
+  using CompilationResult = ::tensorflow::XlaCompilationResult;
+
+  struct Options {
+    // Name of the compilation device to use. It must be set by the caller.
+    // The default empty value is invalid.
+    DeviceType device_type = DeviceType("");
+
+    // The device to use during compilation to execute instructions on, for
+    // example for auto-tuning.
+    // Valid values are defined by `xla::Backend::devices_ordinal_supported()`.
+    // -1 indicates the default device should be used.
+    int device_ordinal = -1;
+
+    xla::Client* client = nullptr;
+
+    // Function library in which to find function definitions. Must be non-null.
+    const FunctionLibraryDefinition* flib_def = nullptr;
+
+    // The graph def version to be compiled.
+    int graph_def_version = TF_GRAPH_DEF_VERSION;
+
+    // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
+    // for CPU.
+    bool allow_cpu_custom_calls = false;
+
+    // A ShapeDeterminationFns (i.e., a bundle of LayoutSelectionFn and
+    // ShapeRepresentationFn).
Each bundle describes the XLA representation of + // arguments represented to XLA as the shape given by this shape function. + // Arguments are input activations or weights to an XLA entry computation. + // Variables are reshaped to this shape on write, and reshaped to their + // original shape on read. + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns; + + // If not nullptr, populate_resource_manager is called with the + // compilation device's resource manager when the compilation + // device is created, and can be used to create metadata objects + // that can be accessed by XLA op kernels. + std::function* populate_resource_manager = + nullptr; + + // If not nullptr, this memory allocator can be used by the compiler for + // temporary allocations it might want to make during compilation. + // + // For example, the compiler may want to try out different algorithms and + // choose the fastest one, and it might run those algorithms over buffers + // created using this allocator. + // + // The compiler can function correctly without an explicit allocator given + // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly + // allocate most or all available memory on the device, leaving none for the + // compiler to access, unless it can use TensorFlow's allocator. + // This must be a shared_ptr, as this is passed all the way down to the + // cluster compilation. This allows asynchronous compilation to hold a + // reference until the compilation is finished. + std::shared_ptr device_allocator; + + // Alias input and output buffers for parameters that are passed-through XLA + // modules without being changed. + bool alias_passthrough_params = false; + + // Enable detailed logging of compilation metadata. + bool detailed_logging = true; + }; + + // Argument for compiling a single op. + struct SingleOpCompileArgument { + // Data type of the output tensors. This is used to create _Retval node. + std::vector output_dtypes; + + // The NodeDef representing the op. + NodeDef node_def; + + // This is currently only used to obtain MLIR TPU bridge rollout state. + // Can be removed once full rollout is complete. + ConfigProto config_proto; + + SingleOpCompileArgument() = default; + + explicit SingleOpCompileArgument(const OpKernelContext& ctx); + }; + + explicit XlaCompiler(Options options); + + ~XlaCompiler(); + + // Helper function to populate an XlaCompiler::Argument from XlaResource. + static void PopulateArgumentFromResource(const XlaResource& resource, + Argument* arg); + + absl::Status CompileFunction(const CompileOptions& options, + const NameAttrList& fn_name_attrs, + absl::Span args, + CompilationResult* result); + + absl::Status CompileSingleOp( + const CompileOptions& options, + const SingleOpCompileArgument& single_op_compile_argument, + absl::Span args, CompilationResult* result); + + // Compiles a tensorflow::Graph into an xla::XlaComputation. + // Similar to CompileFunction, but takes a Graph as input rather than a + // function. + absl::Status CompileGraph(const CompileOptions& options, string const& name, + std::unique_ptr graph, + absl::Span args, + CompilationResult* result); + + // Returns the shape of the XLA parameter for an argument 'arg'. + // See the class comment for more details about the argument passing + // convention. + absl::Status XLAShapeForArgument( + const Argument& arg, bool is_entry_computation, + const std::optional& arg_sharding, + xla::Shape* xla_shape) const; + + // Retrieves the channel handle associated with `key`. 
Allocates + // a new channel handle if none exists. + // Channel handles can be used to communicate between different + // computations. Computations that communicate should be compiled with the + // same XlaCompiler. + absl::Status GetChannelHandle(const string& key, xla::ChannelHandle* channel); + + // Retrieves the host-to-device channel handle associated with `key`. + // Allocates a new channel handle if none exists. + absl::Status GetHostToDeviceChannelHandle(const string& key, + xla::ChannelHandle* channel); + + // Retrieves the device-to-host channel handle associated with `key`. + // Allocates a new channel handle if none exists. + absl::Status GetDeviceToHostChannelHandle(const string& key, + xla::ChannelHandle* channel); + + // Sets the shapes and types for the device to host transfer associated with + // 'key'. + absl::Status SetDeviceToHostMetadata(const string& key, + absl::Span types, + absl::Span shapes); + + // Gets the shapes the device to host transfer associated with 'key'. + absl::Status GetDeviceToHostShapes(const string& key, + std::vector* shapes) const; + + // Sets the shapes and types for the host to device transfer associated with + // 'key'. + absl::Status SetHostToDeviceMetadata(const string& key, + absl::Span types, + absl::Span shapes); + + // In order to avoid deadlocks from dependencies in host computations, it can + // be necessary to enforce a partial order on the execution of HostCompute + // Ops. In particular it may be necessary to constrain the SendToHost for one + // HostCompute to run before blocking on the RecvAtHost for another + // HostCompute. The compiler maintains a mapping from 'host_compute_name' to + // handle, where the handle is an 'output' of the HostCompute Op corresponding + // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced + // later can add the handle as an 'input' to enforce the constraints. + // 'host_compute_name' can be any string the client wishes to use to identify + // a given HostCompute Op as long as the names are unique within the + // compilation. + absl::Status GetHostComputeControlDependency(const string& host_compute_name, + xla::XlaOp* handle); + absl::Status SetHostComputeControlDependency(const string& host_compute_name, + xla::XlaOp handle); + + const Options& options() const { return options_; } + xla::Client* client() const { return options_.client; } + FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; } + + void PushNodeTokenMapping(); + absl::Status PopNodeTokenMapping(); + absl::Status SetNodeToken(const string& node_name, xla::XlaOp op); + absl::StatusOr GetNodeToken(const string& node_name); + + // Sets the function body `fbody` to the one registered as `function`. + absl::Status FindFunctionBody(const NameAttrList& function, + const FunctionBody** fbody, + const ConfigProto** config_proto = nullptr); + + private: + absl::Mutex channel_mutex_; + // Returns the optimized graph object in this function body. + std::unique_ptr GetGraph(const FunctionBody* fbody); + + // Builds XLA computations for each of the arguments to the computation. + // `args` are the arguments to the computation. 
+ absl::Status BuildArguments( + const Graph& graph, const std::vector& args, + bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, + const std::map& arg_shardings, + std::vector* arg_expressions, + std::vector* input_to_args, std::vector* input_shapes, + bool is_entry_computation); + + xla::ChannelHandle NewChannel(xla::ChannelHandle::ChannelType type); + + // Graph compiler needs to know how to get an optimized graph from a function + // body. + friend class GraphCompiler; + friend class XlaCompilerTest; + + Options options_; + + // Status set to non-OK in the constructor if initialization fails. + absl::Status initialization_status_; + + // Returns the next step sequence number. + int64_t NextStepId(); + + // Internal sequence number for steps executed on the compilation device. + int64_t next_step_id_; + + XlaCompilationDevice* device_; // Owned by device_mgr_ + StaticDeviceMgr device_mgr_; + + // The next sequence number to assign to a channel. + int64_t next_channel_ ABSL_GUARDED_BY(channel_mutex_) = 1; + + // To avoid copying the client's function library, use a local function + // library and runtime for functions created as part of the functionalize + // control flow transformation. + std::unique_ptr local_flib_def_; + std::unique_ptr pflr_; + std::unique_ptr local_pflr_; + + FunctionLibraryRuntime* local_flib_runtime_; // owned by local_pflr_. + FunctionLibraryRuntime* flib_runtime_; // owned by pflr_. + + struct SignatureHash { + uint64 operator()( + const std::pair>& signature) const; + }; + + std::unordered_map>, + CompilationResult, SignatureHash> + cache_; + + std::unordered_map channels_; + + std::unordered_map host_compute_sends_; + std::unordered_map host_compute_recvs_; + + std::unordered_map host_compute_control_output_; + + // This is used to store mapping. Side-effecting + // ops call SetNodeToken() to record its token output, so later side-effecting + // ops can use GetNodeToken() to get it and use it as token input. + // + // It's a stack because we need a mapping like this for each level of nested + // CompileGraph() call. In CompileGraph(), we will push a new mapping to the + // stack, and pop the mapping before returning. + std::stack> node_token_mapping_stack_; + + XlaCompiler(const XlaCompiler&) = delete; + void operator=(const XlaCompiler&) = delete; +}; + + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_context.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_context.h new file mode 100644 index 00000000..9184fb43 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_context.h @@ -0,0 +1,184 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the contexts used during XLA compilation. 
+ +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_CONTEXT_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_CONTEXT_H_ + +#include + +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/builder/xla_computation.h" +#include "xla/status_macros.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class XlaOpKernelContext; +class XlaCompiler; + +// The XlaContext is the data structure that holds the state of an XLA +// compilation, that is accessible from OpKernelContexts when compiling a +// subgraph of Ops using XLA. +class XlaContext : public ResourceBase { + public: + // Retrieves the XlaContext of the current compilation. + static XlaContext& Get(const OpKernelContext* ctx); + + // Creates a new XlaContext. See the documentation on the class data fields + // for descriptions of the arguments. + XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, + const Graph* graph); + + // Virtual method defined by ResourceBase. + string DebugString() const override; + + XlaCompiler* compiler() const { return compiler_; } + + const AbstractStackTrace* StackTraceForNodeName(const std::string& name) { + const auto& it = stack_traces_.find(name); + if (it != stack_traces_.end()) { + return it->second.get(); + } + return nullptr; + } + + // Returns the XlaBuilder that Ops use for compiling new expressions. + xla::XlaBuilder* builder() { return builder_; } + + const std::vector& args() const { return args_; } + void set_args(std::vector args); + + const std::vector& retvals() { return retvals_; } + + // Sets a return value. + // Since we do not always know in advance how many return values there are, + // grows the return values vector to size index+1 if it is smaller. + void SetRetval(int index, const XlaExpression& expression); + + // Adds 'resource' to the set of resources owned by the context. + XlaResource* AddResource(std::unique_ptr resource); + + const std::vector>& resources() { + return resources_; + } + + // Get an XLA lambda to compute Max. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateMax(const DataType type); + + // Get an XLA lambda to compute Min. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateMin(const DataType type); + + // Get an XLA lambda to compute Add. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateAdd(const DataType type); + + // Get an XLA lambda to compute LogAddExp. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateLogAddExp(const DataType type); + + // Get an XLA lambda to compute Mul. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. 
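  // Illustrative sketch of how these cached computations are commonly
  // consumed (assumptions: `context` is an XlaContext&, `input` is an
  // xla::XlaOp, `builder` is the active xla::XlaBuilder*, and dimension 0 is
  // the one being reduced):
  //
  //   const xla::XlaComputation* add = context.GetOrCreateAdd(DT_FLOAT);
  //   xla::XlaOp sum = xla::Reduce(input, XlaHelpers::Zero(builder, DT_FLOAT),
  //                                *add, /*dimensions_to_reduce=*/{0});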
+ const xla::XlaComputation* GetOrCreateMul(const DataType type); + + // The name of the XlaContext resource during symbolic graph execution. + static const char kXlaContextResourceName[]; + + // Records the collective information from the nested compilation `result`. + absl::Status RecordCollectiveInfoFromNestedCompilationResult( + const XlaCompilationResult& result); + + // Records the collective configurations for all the collectives in the XLA + // cluster and returns the channel_id to be used for the next collective. + absl::StatusOr RecordCollectiveInfo(int group_key, int group_size); + + const std::optional& + GetCollectiveInfo() { + return collective_info_; + } + + private: + XlaCompiler* const compiler_; + + // The XlaBuilder used to construct the subgraph's compiled representation. + xla::XlaBuilder* builder_; + + // Stack traces for the graph used for compilation. + StackTracesMap stack_traces_; + + // Arguments to the Tensorflow graph, indexed by _Arg index. + // Includes both compile-time constant arguments and runtime parameters. + std::vector args_; + + // Return values of the Tensorflow graph, indexed by _Retval index. + std::vector retvals_; + + // Holds ownership of resources. The resources are not ordered. + std::vector> resources_; + + // Information about encountered collective ops. We allow only a + // single configuration per cluster. + std::optional collective_info_; + + // Cache of prebuilt computations indexed by their type. + using ComputationMap = std::map; + + // Finds the value for the given type in out map if it already + // exists or makes a new value with create function and keeps it the + // map. The returned value != nullptr and is owned by the map. + const xla::XlaComputation* LookupOrCreate( + DataType type, ComputationMap* out, + const std::function& create); + + // Cached computation to compute Max of two elements, specialized by type. + ComputationMap max_func_; + + // Cached computation to compute Min of two elements, specialized by type. + ComputationMap min_func_; + + // Cached computation to compute Sum of two elements, specialized by type. + ComputationMap add_func_; + + // Cached computation to compute Mul of two elements, specialized by type. + ComputationMap mul_func_; + + // Cached computation to compute Log(Add(Exp())) of two elements, specialized + // by type. + ComputationMap log_add_exp_func_; + + // Cached computation to compute Sigmoid of an element, specialized by type. + ComputationMap sigmoid_func_; + + XlaContext(const XlaContext&) = delete; + void operator=(const XlaContext&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_expression.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_expression.h new file mode 100644 index 00000000..d410b79a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_expression.h @@ -0,0 +1,173 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "xla/client/client.h" +#include "xla/hlo/builder/value_inference.h" +#include "xla/hlo/builder/xla_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// A XlaExpression represents a symbolic TensorFlow value in a TF->XLA +// compilation. +// An expression is one of: +// * a constant tensor. +// * an xla::XlaOp, representing a symbolic XLA value. +// * a resource, e.g., a variable, represented as an XlaResource pointer. +// * a tensor list, represented by a tuple of tensors and the list length. +// +// Constant tensors are mostly an optimization to avoid passing large constants +// to XLA, but are also sometimes used to represent tensors that have no XLA +// representation, for example, DT_STRING tensors. A canonical use case might be +// an error message string. +// +// Tensor lists are very similar to xla::XlaOp, however they require some +// specific logic around shape management since the tuples are not supported by +// TensorFlow. +class XlaExpression { + public: + enum class Kind { + kInvalid, + kConstant, + kXlaOp, + kResource, + kTensorList, + }; + + XlaExpression(); + XlaExpression(const XlaExpression&) = default; + XlaExpression& operator=(const XlaExpression&) = default; + + // Builds an invalid expression. (Same as the default constructor, but makes + // the intent clearer.) + static XlaExpression Invalid(); + + // Builds a constant XLA expression. + static XlaExpression Constant(Tensor value); + + // Builds a XlaOp expression. Since the mapping from TF data types to XLA + // types is not 1-1, the TF type must also be provided; in general it cannot + // be derived from the XLA type. + static XlaExpression XlaOp(xla::XlaOp value, DataType dtype); + + // Builds a tensor list expression. + static XlaExpression TensorList(xla::XlaOp tensor_list); + + // Builds a resource expression. + static XlaExpression Resource(XlaResource* resource); + + // Builds a resource whose value is known at a compile time. + static XlaExpression ConstantResource(Tensor value, XlaResource* resource); + + Kind kind() const { return kind_; } + + DataType dtype() const { return dtype_; } + + // handle() returns the XlaOp that backs a kXlaOp expression. + const xla::XlaOp& handle() const { return handle_; } + + // Return a constant value associated with this expression. Always set for + // constants, might be set for resources. + std::optional constant_value() const { + if (kind_ == Kind::kResource && resource_->IsOverwritten()) { + // The constant is no longer available if the value was overwritten. + return std::nullopt; + } + return constant_value_; + } + + // Set the bound of the expression. + void set_value_bound(Tensor tensor) { + value_bound_.emplace(std::move(tensor)); + } + + // Return the bound of the expression, if available. + std::optional value_bound() const { return value_bound_; } + + // Set the dynamism of the expression, indicating whether or not each value in + // this expression is dynamic. 
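  // Illustrative sketch of constructing expressions with the factory
  // functions above (the xla::XlaOp `op` and the Tensor `t` are assumed to
  // already exist):
  //
  //   XlaExpression symbolic = XlaExpression::XlaOp(op, DT_FLOAT);
  //   XlaExpression constant = XlaExpression::Constant(t);
  //   CHECK(symbolic.kind() == XlaExpression::Kind::kXlaOp);
  //   CHECK(constant.constant_value().has_value());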
+ void set_value_dynamism(Tensor tensor) { + value_dynamism_.emplace(std::move(tensor)); + } + + // Return the dynamism of the expression, if available. + std::optional value_dynamism() const { return value_dynamism_; } + + XlaResource* resource() const { return resource_; } + + // Returns a human-readable summary of the expression. + string HumanString() const; + + // Returns the value of a kValue or kXlaOp as an xla::XlaOp. Returns + // an erroneous XlaOp if the expression is not a constant or an expression. + xla::XlaOp AsXlaOp(xla::XlaBuilder* builder) const; + + // If a kXlaOp or kValue expression can be resolved to a compile-time + // constant, returns the value as a host-memory Tensor. Returns an empty + // optional if it cannot be resolved. Returns an error if passed a resource + // expression. + absl::StatusOr> ResolveConstant( + xla::Client* client, bool dynamic_dimension_is_minus_one = false, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue) const; + + // ResolveDynamism computes where a value inside this op is dynamic or can be + // inferred at compile time. + absl::StatusOr ResolveDynamism() const; + + // Returns the shape of the tensor. + // The shape of a resource is the shape of a resource handle (i.e., a scalar), + // not the shape of the resource's value. + absl::StatusOr GetShape() const; + absl::StatusOr GetXlaShape() const; + + // Retrieves an XlaExpression that was allocated by a previous Op. + static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor); + + // Assigns an XlaExpression to a tensor on an XLA compilation device. + static void AssignExpressionToTensor(const XlaExpression& value, + Tensor* tensor); + + private: + Kind kind_ = Kind::kInvalid; + + DataType dtype_ = DT_INVALID; + + // The XLA handle of the expression's computation, if kind_ == kXlaOp or + // a tuple expression if kind_ == kTensorList. + xla::XlaOp handle_; + + // The value of the constant, if available. + std::optional constant_value_; + + // The bound of the expression, if available. + std::optional value_bound_; + + // Indicate whether each value inside a tensor is dynamic or not. + std::optional value_dynamism_; + + // The resource, if kind_ == kResource. Not owned. + XlaResource* resource_ = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_helpers.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_helpers.h new file mode 100644 index 00000000..38f01c83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_helpers.h @@ -0,0 +1,214 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines helper routines for the XLA device. 
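// A small sketch of the scalar-literal helpers declared below (illustrative
// only; `builder` is assumed to be a valid xla::XlaBuilder*):
//
//   xla::XlaOp zero = XlaHelpers::Zero(builder, DT_FLOAT);
//   xla::XlaOp forty_two = XlaHelpers::IntegerLiteral(builder, DT_INT32, 42);
//   xla::XlaOp half = XlaHelpers::FloatLiteral(builder, DT_FLOAT, 0.5);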
+ +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ + +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "xla/executable_run_options.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/hlo/translate/mhlo_to_hlo/layout_util.h" +#include "xla/service/computation_placer.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +using XlaLayoutPreference = mlir::XlaLayoutPreference; + +inline std::string GetDeviceToHostChannelName(absl::string_view channel_key, + int index) { + return absl::StrCat(channel_key, "_dtoh_", index); +} +inline std::string GetHostToDeviceChannelName(absl::string_view channel_key, + int index) { + return absl::StrCat(channel_key, "_htod_", index); +} + +// Helper methods for building XLA computations. +class XlaHelpers { + public: + // Returns a handle representing the zero value of a scalar + // element of data_type. + static xla::XlaOp Zero(xla::XlaBuilder* b, DataType data_type); + + // Returns a handle representing the one value of a scalar + // element of data_type. + static xla::XlaOp One(xla::XlaBuilder* b, DataType data_type); + + // Returns a handle representing the given value of an integer scalar + // element of data_type. + // Note that unlike One and Zero, does not work on boolean types. + static xla::XlaOp IntegerLiteral(xla::XlaBuilder* b, DataType data_type, + int64_t value); + + // Returns a handle representing the given value of a floating-point scalar + // element of data_type. + static xla::XlaOp FloatLiteral(xla::XlaBuilder* b, DataType data_type, + double value); + + // Reshapes literal 'input' to have 'shape'. Both the original shape and + // 'shape' must contain the same number of elements. + static absl::Status ReshapeLiteral(const xla::Literal& input, + absl::Span shape, + xla::Literal* output); + + // Converts `indices` into a one-hot representation. `depth` is the size + // of the new axis to add. `axis` is the position at which to add the new + // axis. `indices_shape` is the shape of `indices`. `on_value` and + // `off_value` represent the values to use for the on and off positions, + // respectively. + static absl::Status OneHot(xla::XlaBuilder* builder, int64_t depth, int axis, + DataType index_type, + const TensorShape& indices_shape, + xla::XlaOp indices, xla::XlaOp on_value, + xla::XlaOp off_value, xla::XlaOp* one_hot); + + // Certain DataTypes should use increased precision DataTypes when performing + // reductions. This function remaps a given DataType to a higher precision + // DataType if needed. + static DataType SumAccumulationType(const DataType& dtype); + + // A helper for creating a ConvertElementType xla op given a DataType rather + // than the xla::PrimitiveType. + static xla::XlaOp ConvertElementType(xla::XlaOp operand, + const DataType new_element_type); + + typedef std::function(const TensorShape&, DataType, + bool, XlaLayoutPreference)> + ShapeRepresentationFn; +}; + +// Creates an identity shape representation function. +XlaHelpers::ShapeRepresentationFn IdentityShapeRepresentationFn(); + +struct XlaOutputDescription { + // Type and shape of the output. The shape is the unflattened shape. + // When `type` is DT_RESOURCE, `shape` is the shape of the resource + // variable's value. 
+ DataType type; + TensorShape shape; + + // Constant output value, if known to be constant at JIT compilation time. + // 'Tensor' is in host memory. + bool is_constant = false; + Tensor constant_value; + + // When this output is a resource, i.e. `type == DT_RESOURCE`, this is + // the index of the input that contains the resource. + int input_index; + + // Whether this output is a TensorList. + bool is_tensor_list = false; +}; + +// Describes a variable write side effect of the computation. +struct XlaResourceUpdate { + // Index of the input that contains the variable resource to write to. + int input_index; + + // Type and shape of the tensor to be written back. + // The `shape` field has the same meaning as the Argument::shape field. + DataType type; + TensorShape shape; + + // Was the value of the variable modified by the computation? + // (Always true, unless `return_updated_values_for_all_resources` is true.) + bool modified; + + // If the resource is a TensorArray, the set of gradients read or written. + std::set tensor_array_gradients_accessed; +}; + +struct XlaCompilationResult { + // Vector that maps from the parameters of the XLA computation to their + // original argument positions. To handle compile-time constant inputs, the + // parameters to the XLA computation may be a subset of the original + // arguments. The relative ordering of parameters are maintained. + std::vector input_mapping; + + // Input shapes of the computation. If we are flattening inputs, these are + // the flattened shapes. + std::vector xla_input_shapes; + + // Output shape in XLA format. The output shape is always a tuple. If we + // are flattening outputs, these are the flattened shapes. + xla::Shape xla_output_shape; + + // TensorFlow shapes of outputs, together with the values of any + // constant arguments. Vector indexed by Tensorflow _Retval number, + // containing both constant and non-constant results. + std::vector outputs; + + // TensorFlow shapes and types of sends/recvs from HostCompute Ops to their + // matching RecvAtHost/SendFromHost Ops in the outer graph. + tf2xla::HostComputeMetadata host_compute_metadata; + + // Resources whose values were updated by the computation, ordered + // by return value position (which is the same as the order the resources + // were passed as arguments). Resource updates follow the non-constant + // results in the outputs of XLA computation. + std::vector resource_updates; + + // The XLA computation built from the tensorflow subgraph. + std::shared_ptr computation; + + // Meta-info about encountered collective ops. + struct CollectiveInfo { + int group_key; + int group_size; + int next_id; + + template + friend H AbslHashValue(H h, const CollectiveInfo& info) { + return H::combine(std::move(h), info.group_key, info.group_size, + info.next_id); + } + + friend bool operator==(const CollectiveInfo& lhs, + const CollectiveInfo& rhs) { + return lhs.group_key == rhs.group_key && + lhs.group_size == rhs.group_size && lhs.next_id == rhs.next_id; + } + }; + + // Information of the collectives encountered during the translation. + std::optional collective_info; +}; + +// Resolves the device assignment based on CollectiveInfo. +// CollectiveInfo records collective ops in the cluster. Note that +// this relies on a rendezvous and blocks until all replicas are there. +// +// Takes several extra configuration objects by reference since +// xla::ExecutableRunOptions does not take ownership; these are configured and +// bundled into `run_options` if applicable. 
+absl::Status ResolveDeviceAssignment( + OpKernelContext* ctx, + const XlaCompilationResult::CollectiveInfo& collective_info, + xla::ExecutableRunOptions& run_options, + xla::DeviceAssignment& device_assignment, + xla::gpu::GpuExecutableRunOptions& gpu_options); + +} // end namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h new file mode 100644 index 00000000..c3982bb5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -0,0 +1,100 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_JIT_COMPILED_CPU_FUNCTION_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_JIT_COMPILED_CPU_FUNCTION_H_ + +#include +#include + +#include "absl/log/check.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/client/local_client.h" +#include "xla/cpu_function_runtime.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Represents the result of JIT compilation by XLA down to a function. This +// class holds the state necessary to create XlaCompiledCpuFunction instances, +// which are used to actually invoke the compiled computation. +// +// XlaJitCompiledCpuFunction must outlive the XlaCompiledCpuFunctions that are +// created from it. It holds state shared by all of the functions, including the +// JIT-compiled function itself, along with buffer sizes and other metadata +// necessary for execution. +class XlaJitCompiledCpuFunction { + public: + // Compile a tensorflow::GraphDef into an XlaJitCompiledCpuFunction. The given + // `config` specifies the portion of the graph to compile, via feeds and + // fetches. Each feed is a positional input argument for the compiled + // function, while each fetch is a positional output argument. + static absl::StatusOr> Compile( + const GraphDef& graph_def, const tf2xla::Config& config, + const xla::ExecutableBuildOptions& build_options); + + XlaJitCompiledCpuFunction(const XlaJitCompiledCpuFunction&) = delete; + XlaJitCompiledCpuFunction& operator=(const XlaJitCompiledCpuFunction&) = + delete; + + // Returns static data used to create an XlaCompiledCpuFunction instance, + // which represents the JIT-compiled function. The static data is unchanging + // across each instance. 
+ const XlaCompiledCpuFunction::StaticData& StaticData() const { + return static_data_; + } + + const xla::LocalExecutable& LocalExecutable() const { + CHECK(executable_); // Crash ok + return *executable_; + } + + private: + XlaJitCompiledCpuFunction() {} + + // The executable holds the underlying function. + std::unique_ptr executable_; + + // The static data is backed by the rest of the state in this class. + XlaCompiledCpuFunction::StaticData static_data_; + + // The backing array for buffer infos. + std::vector buffer_infos_; + + // The backing array for the arg index table. + std::vector arg_index_table_; + + // The backing arrays of arg and result names. We hold the actual strings in + // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static + // data to refer to. + std::vector nonempty_arg_names_; + std::vector nonempty_variable_names_; + std::vector nonempty_result_names_; + std::vector arg_names_; + std::vector variable_names_; + std::vector result_names_; + + // The backing data for the program shape. The proto form of program shape is + // used because the program shape is serialized and embedded in the object + // file. + std::unique_ptr program_shape_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_JIT_COMPILED_CPU_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_kernel.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_kernel.h new file mode 100644 index 00000000..b0830d07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -0,0 +1,390 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ + +#include "absl/base/attributes.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" +#include "xla/hlo/builder/value_inference.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/hlo/builder/xla_computation.h" +#include "xla/literal.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class XlaOpKernelContext; + +// Implementations of operators that generate XLA code should usually subclass +// XlaOpKernel and implement the Compile() method. Unlike a regular OpKernel, +// an XlaOpKernel produces and consumes symbolic values during compilation. 
+// +// See the comments in xla_context.h for more details. +class XlaOpKernel : public OpKernel { + public: + explicit XlaOpKernel(OpKernelConstruction* construction); + + // Subclasses should implement Compile(), much as standard OpKernels implement + // Compute(). + virtual void Compile(XlaOpKernelContext* context) = 0; + + private: + void Compute(OpKernelContext* context) final; +}; + +// The context passed to the Compile() method of XlaOpKernel. An +// XlaOpKernelContext is a variant of the standard OpKernel class, tailored for +// implementing operators that perform symbolic execution as part of the XLA +// compiler. The key difference is that XlaOpKernelContext produces and consumes +// data as XLA computations, rather than as standard Tensors. +// +// Under the hood, symbolic execution communicates using special Tensors that +// wrap XlaExpression objects, however this is an implementation detail that +// this class hides. The *only* correct way to allocate a Tensor during +// compilation is using the XlaOpKernelContext methods, since they ensure there +// is a valid XlaExpression backing the tensor. No Op should ever call +// allocate_output or allocate_temp directly on the underlying OpKernelContext. +class XlaOpKernelContext { + public: + explicit XlaOpKernelContext(OpKernelContext* context); + + XlaContext* xla_context() const; + + // Returns the XLA XlaBuilder containing the output of compilation. + xla::XlaBuilder* builder() const; + + xla::ValueInference& value_inference(); + + // Inputs + + // Returns the number of inputs to the operator. + int num_inputs() const { return context_->num_inputs(); } + + // Returns the type of input `index`. + DataType input_type(int index) const; + + // Returns the type of input `name`. + DataType InputType(absl::string_view name); + + // Returns the type of input `index` as an xla::PrimitiveType. If the type + // is not representable as an XLA type, sets an error status and returns + // xla::PRIMITIVE_TYPE_INVALID. + xla::PrimitiveType input_xla_type(int index); + + // Returns the type of input `name` as an xla::PrimitiveType. If the type + // is not representable as an XLA type, sets an error status and returns + // xla::PRIMITIVE_TYPE_INVALID. + xla::PrimitiveType InputXlaType(absl::string_view name); + + // Returns the shape of input at `index` or input the given `name`. Note that + // in case the shape of the input is not static, then the returned shape has + // bounds as the dimension size instead of having unknown dimensions. Use + // InputXlaShape instead that provides shapes with dynamism information. + // + ABSL_DEPRECATED( + "Prefer InputXlaShape which handles dynamic shapes accurately.") + TensorShape InputShape(int index); + ABSL_DEPRECATED( + "Prefer InputXlaShape which handles dynamic shapes accurately.") + TensorShape InputShape(absl::string_view name); + + // Returns input `index` as a XlaOp. Unlike + // OpKernelContext::Input returns a symbolic value rather than a concrete + // Tensor. + xla::XlaOp Input(int index); + // Returns input `name` as a XlaOp. + xla::XlaOp Input(absl::string_view name); + + // Returns the xla input shape for a given index. + absl::StatusOr InputXlaShape(int index); + absl::StatusOr InputXlaShape(absl::string_view name); + + // Returns true if all inputs are the same shape, otherwise sets the + // status to a non-OK value and returns false. 
+ // Usage: if (!context->ValidateInputsAreSameShape(this)) return; + bool ValidateInputsAreSameShape(OpKernel* op) TF_MUST_USE_RESULT; + + // Returns the named list-valued immutable input in "list", as + // defined in the OpDef. If the named output is not list-valued, + // returns a one-element list. + absl::Status InputList(absl::string_view name, + std::vector* handles, + std::vector* shapes); + // Evaluates input and returns their dynamism vector in a vector of + // predicates. + absl::Status ResolveInputDynamismIntoPredVector(int index, + std::vector* out); + absl::Status ResolveInputDynamismIntoPred(int index, bool* out); + absl::Status ResolveInputDynamismIntoPredVector(absl::string_view name, + std::vector* out); + absl::Status ResolveInputDynamismIntoPred(absl::string_view name, bool* out); + + absl::Status ResolveInputDynamism(int index, xla::Literal* dynamism_literal); + absl::Status ResolveInputDynamism(absl::string_view name, + xla::Literal* dynamism_literal); + + absl::Status ResolveInputDynamismReshaped(int index, + absl::Span new_dims, + xla::Literal* dynamism_literal); + // Helper methods for constant inputs. + + // Evaluates input `index` and stores it in `*constant_literal`. If the + // expression cannot be evaluated, e.g., because it depends on unbound + // parameters, returns a non-OK status. This function can also be used to + // infer constant input upper or lower bounds, by changing the `mode` + // parameter. + absl::Status ConstantInput( + int index, xla::Literal* constant_literal, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + absl::Status ConstantInput( + absl::string_view name, xla::Literal* constant_literal, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant scalar int32 or int64 tensor into an int64. + absl::Status ConstantInputAsIntScalar( + int index, int64_t* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + absl::Status ConstantInputAsIntScalar( + absl::string_view name, int64_t* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + absl::StatusOr ConstantInputAsIntScalar( + absl::string_view name, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant scalar float32 or float64 tensor into a float64. + absl::Status ConstantInputAsFloatScalar( + int index, double* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant 1D int32 or int64 tensor into a vector of int64s. + absl::Status ConstantInputAsIntVector( + int index, std::vector* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + absl::Status ConstantInputAsIntVector( + absl::string_view name, std::vector* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Reshapes and converts a constant int32 or int64 tensor into a vector of + // int64s. + absl::Status ConstantInputReshapedToIntVector( + int index, std::vector* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + absl::Status ConstantInputReshapedToIntVector( + absl::string_view name, std::vector* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant int32 or int64 Tensor into an xla int64 Literal. 
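  // Usage sketch for the constant-input helpers above, inside a kernel's
  // Compile() (illustrative; input 1 is assumed to carry a shape vector):
  //
  //   std::vector<int64_t> dims;
  //   OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &dims));
  //   // Or request only an upper bound when the exact value may be dynamic:
  //   OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(
  //                           1, &dims, xla::ValueInferenceMode::kUpperBound));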
+ absl::Status ConstantInputAsInt64Literal( + int index, xla::Literal* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + absl::Status ConstantInputAsInt64Literal( + absl::string_view name, xla::Literal* out, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant 1D int32 or int64 tensor into a TensorShape. + absl::Status ConstantInputAsShape( + int index, TensorShape* shape, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Converts a constant 1D int32 or int64 tensor, or a scalar with value -1 + // into a PartialTensorShape. + absl::Status ConstantInputAsPartialShape(int index, + PartialTensorShape* shape); + + // Returns the named list-valued immutable input in "list", as + // defined in the OpDef. If the named output is not list-valued, + // returns a one-element list. + absl::Status ConstantInputList( + absl::string_view name, std::vector* outputs, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Returns the Tensor representation of the constant input. + absl::StatusOr ConstantInputTensor( + int index, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + // Returns an XlaExpression describing the value of 'index'. + const XlaExpression& InputExpression(int index); + const XlaExpression& InputExpression(absl::string_view name); + + // Outputs + + int num_outputs() const { return context_->num_outputs(); } + DataType expected_output_dtype(int index) const { + return context_->expected_output_dtype(index); + } + + // Returns the type of output `index` as an xla::PrimitiveType. If the type + // is not representable as an XLA type, sets an error status and returns + // xla::PRIMITIVE_TYPE_INVALID. + xla::PrimitiveType output_xla_type(int index); + + // Sets output `index` to the XlaOp `handle`. + // All outputs should be set using SetOutput and SetConstantOutput, not + // via the underlying OpKernelContext. + void SetOutput(int index, const xla::XlaOp& handle); + + // Sets output `index` to compile-time constant `host_tensor`, where + // `host_tensor` is a tensor in host memory. It is preferable to use + // SetConstantOutput where possible. + void SetConstantOutput(int index, const Tensor& host_tensor); + + // Returns an XlaExpression describing the value of 'index'. + void SetOutputExpression(int index, const XlaExpression& expression); + + // Sets output `index` to the Tensor List `handle`. + void SetTensorListOutput(int index, const xla::XlaOp& handle); + + // Status handling. + void SetStatus(const absl::Status& status) { context_->SetStatus(status); } + absl::Status status() { return context_->status(); } + + // Variables + + // Sets `*resource` to the resource associated with input `index`. + absl::Status GetResourceInput(int index, XlaResource** resource); + + // Sets output `index` to be a reference to resource `resource`. + void SetResourceOutput(int index, XlaResource* resource); + + // Sets `*type` and `*shape` to the current type and shape of a variable's + // value. + absl::Status GetVariableTypeAndShape(int index, DataType* type, + TensorShape* shape) const; + + // When dynamic_dimension_is_minus_one is set, querying a dynamic dimension + // returns "-1", this is useful when the underlying ops expect explicit + // dynamic index like reshape. 
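  // Sketch of the variable helpers in this section (illustrative; input 0 is
  // assumed to be an initialized DT_FLOAT resource variable):
  //
  //   TensorShape shape;
  //   xla::XlaOp value;
  //   OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, DT_FLOAT, &shape, &value));
  //   xla::XlaOp updated =
  //       xla::Add(value, XlaHelpers::One(ctx->builder(), DT_FLOAT));
  //   OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, DT_FLOAT, updated));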
+ void set_dynamic_dimension_is_minus_one(bool value) { + dynamic_dimension_is_minus_one_ = value; + } + + bool dynamic_dimension_is_minus_one() const { + return dynamic_dimension_is_minus_one_; + } + + bool is_dynamic_dimension(int64_t dim_size) { return dim_size == -1; } + + // Reads the current value of the resource variable referred to by input + // `index`. If `shape` is not nullptr, sets `*shape` to the shape of the + // variable. Returns an error if the variable has not been initialized, or if + // its type does not match `type`. + absl::Status ReadVariableInput(int index, DataType type, TensorShape* shape, + xla::XlaOp* value); + // Reads the current value of the resource variable referred to by input + // `name`. + absl::Status ReadVariableInput(absl::string_view name, DataType type, + TensorShape* shape, xla::XlaOp* value); + + // Assigns the value `handle` to the variable referenced by input + // `input_index`. The variable must be of `type`. Returns an error if the + // variable has been initialized with a different type or with a + // different shape. + absl::Status AssignVariable(int input_index, DataType type, + xla::XlaOp handle); + // Assigns the value `handle` to the variable referenced by input `name`. + absl::Status AssignVariable(absl::string_view name, DataType type, + xla::XlaOp handle); + + // Helper routines for the OP_REQUIRES macros + void CtxFailure(const absl::Status& s); + void CtxFailureWithWarning(const absl::Status& s); + void CtxFailure(const char* file, int line, const absl::Status& s); + void CtxFailureWithWarning(const char* file, int line, const absl::Status& s); + + // If this kernel invocation is within a function execution, + // call_frame() returns the call frame for the function call. + CallFrameInterface* call_frame() const { return context_->call_frame(); } + + FunctionLibraryRuntime* function_library() const { + return context_->function_library(); + } + + const OpKernel& op_kernel() const { return context_->op_kernel(); } + + // Returns the underlying OpKernelContext. Use rarely. + OpKernelContext* op_kernel_context() const { return context_; } + + // Returns the XlaCompiler that is performing the compilation. Used for, e.g., + // While to compile nested computations. + XlaCompiler* compiler() const; + + // TODO(phawkins): find a better home for these helpers. + + // Gets an XLA lambda to compute Max. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateMax(const DataType type); + + // Gets an XLA lambda to compute Min. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateMin(const DataType type); + + // Gets an XLA lambda to compute Add. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateAdd(const DataType type); + + // Gets an XLA lambda to compute LogAddExp. This is cached in the + // XlaContext since it may be used by multiple Ops. There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateLogAddExp(const DataType type); + + // Gets an XLA lambda to compute Mul. This is cached in the + // XlaContext since it may be used by multiple Ops. 
There is a + // separate specialization of the computation for each DataType. + const xla::XlaComputation* GetOrCreateMul(const DataType type); + + // Returns stack trace encoded as a string at a given module, or an empty + // string if none found. + std::string StackTrace() const; + + private: + // Returns the tensor of input `name`. + const Tensor& GetInputTensorByName(absl::string_view name); + // Evaluates input `index`, reshapes it to `new_shape` if new_shape != + // InputShape(index), and stores it in `*constant_literal`. If the input + // cannot be evaluated, e.g., because it depends on unbound parameters, + // returns a non-Ok status. If InputShape(index).num_elements() != + // new_shape.num_elements(), returns an error status. + absl::Status ConstantInputReshaped( + int index, absl::Span new_dims, + xla::Literal* constant_literal, + xla::ValueInferenceMode mode = xla::ValueInferenceMode::kValue); + + OpKernelContext* const context_; + bool dynamic_dimension_is_minus_one_; + xla::ValueInference value_inference_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_registry.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_registry.h new file mode 100644 index 00000000..11bbbf2b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -0,0 +1,440 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/kernel_def.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tsl/platform/errors.h" + +namespace tensorflow { + +// Names of the XLA compilation devices. These are not user-visible, and are +// used internally by the Tensorflow/XLA bridge to perform symbolic execution of +// a Tensorflow graph. 
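// Sketch of how a TensorFlow device is wired up for XLA compilation with the
// registry declared below (illustrative; the device name "/device:MY_ACCEL"
// is hypothetical):
//
//   XlaOpRegistry::DeviceRegistration registration;
//   registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
//   registration.autoclustering_policy =
//       XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested;
//   XlaOpRegistry::RegisterCompilationDevice("/device:MY_ACCEL", registration);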
+ +extern const char* const DEVICE_CPU_XLA_JIT; // "CPU_XLA_JIT" +extern const char* const DEVICE_GPU_XLA_JIT; // "GPU_XLA_JIT" + +extern const char* const DEVICE_XLA_CPU; +extern const char* const DEVICE_XLA_GPU; + +// Do not include DT_FLOAT8_* as float or numeric types since they are only +// supported in a very limited set of ops. +constexpr std::array kFloatTypes = { + {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}}; +constexpr std::array kFloatAndComplexTypes = { + {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16, DT_COMPLEX64, DT_COMPLEX128}}; +constexpr std::array kNumericTypes = { + {DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_INT8, DT_INT16, DT_INT32, + DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, + DT_BFLOAT16}}; + +constexpr std::array kCpuAllTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_UINT32, DT_UINT64, + DT_INT8, DT_QINT8, DT_INT16, DT_INT32, DT_QINT32, + DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_BFLOAT16, DT_FLOAT8_E5M2, DT_FLOAT8_E4M3FN, + DT_INT4, DT_UINT4}}; + +constexpr std::array kGpuAllTypes = { + {DT_UINT8, DT_QUINT8, DT_UINT16, DT_UINT32, DT_UINT64, + DT_INT8, DT_QINT8, DT_INT16, DT_INT32, DT_QINT32, + DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_BFLOAT16, DT_FLOAT8_E5M2, DT_FLOAT8_E4M3FN, + DT_INT4, DT_UINT4}}; + +// Class that manages registrations of operators and devices for the XLA JIT. +// Not thread-safe. +class XlaOpRegistry { + public: + typedef OpKernel* (*Factory)(OpKernelConstruction*); + + enum class AutoclusteringPolicy { + // Enable autoclustering if the user requests it, e.g., via + // experimental_jit_scope. Does not autocluster if the JIT is enabled + // globally (e.g., via the OptimizerOptions in the TF session + // configuration.) + kIfExplicitlyRequested, + // Enable autoclustering if explicitly requested, or if the JIT is enabled + // globally in the session options, or via TF_XLA_FLAGS=--tf_xla_auto_jit=N. + kIfEnabledGlobally, + // Always try to autocluster ops placed on this device. + kAlways, + }; + + // Describes how to compile operators assigned to a device. + struct DeviceRegistration { + // The name of the an XLA compilation device to use to compile code. + string compilation_device_name; + + // When should we autocluster operators assigned to this device? + AutoclusteringPolicy autoclustering_policy; + + // If we should ignore the resource variable memory model when clustering + // resource variable reads and writes placed on this device. + bool cluster_resource_variable_ops_unsafely = false; + + // If we should auto-cluster Stack operations placed on this device. + bool cluster_stack_ops = false; + + // If we should auto-cluster TensorArray operations placed on this device. + bool cluster_tensor_array_ops = false; + + // If we should auto-cluster stateful RNG operations placed on this device. + // Stateful RNG semantics are not properly supported by XLA so it is not + // necessarily correct to auto-cluster stateful RNG ops in general. + bool cluster_stateful_rng_ops = false; + + // If we should auto-cluster ControlTrigger operations placed on this + // device. ControlTrigger operations are not necessarily safe to cluster + // since they affect deadness (a dead ControlTrigger produces a live + // output). + bool cluster_control_trigger = false; + + // If we should cluster Assert and CheckNumerics by eliding them (XLA does + // not natively support Assert or CheckNumerics). 
+ bool elide_assert_and_checknumerics = false; + + // If we should cluster operations returning DT_VARIANT. + bool cluster_variant_ops = false; + + // Whether ops known to be slow should be auto-clustered. + bool cluster_slow_ops = false; + + // Whether ops known to have numerical accuracy issues should be + // auto-clustered. + bool cluster_inaccurate_ops = false; + }; + + // Registers an XLA backend. `compilation_device_name` is the name of the + // device used for symbolic execution during compilation. `supported_types` + // is the list of non-resource types supported by the device. Each operators + // will be registered for the intersection of the operator's supported types + // and the device's supported types. `backend_op_filter` is a function used + // to exclude or modify operator registrations on the device; it may be + // nullptr, in which case all ops are included. + // `backend_op_filter` should return true if the op should be registered on + // the device; it may optionally modify the KernelDef. + typedef bool (*BackendOpFilter)(KernelDef* kdef); + static void RegisterBackend(const string& compilation_device_name, + absl::Span supported_types, + BackendOpFilter op_filter); + + // Returns the names of the registered backends. + static std::vector BackendNames(); + + // Returns true iff a backend with the given name is registered. + static bool IsBackendRegistered(const string& name); + + // Registers `device_name` for XLA compilation, using information from + // `registration`. + // Does nothing if a registration for `device_name` already exists. + static void RegisterCompilationDevice(const string& device_name, + const DeviceRegistration& registration); + + // Returns whether the device name is for the JIT device used exclusively for + // TF2XLA conversion. + static bool IsCompilationDevice(const string& device_name); + + // Returns the JIT device name associated with 'device_name', setting + // 'jit_device_name', 'requires_jit', and 'enabled_jit_by_default', if they + // are not null. Returns false and leaves the outputs unchanged if no matching + // JIT device is registered. + // '*enable_jit_by_default' is set to true if we should try to JIT using this + // device when the JIT is enabled via the Session OptimizerOptions. + static bool GetCompilationDevice(const string& device_name, + const DeviceRegistration** registration); + + // Registers all JIT kernels on JIT devices, if not already registered. + // Does nothing otherwise. + static void RegisterCompilationKernels(); + + // Returns KernelDefs for compilation ops registered on + // 'compilation_device_name'. Does not include kernels registered as + // CompilationOnly, iff include_compilation_only_kernels=false. + static std::vector DeviceKernels( + const string& compilation_device_name, + bool include_compilation_only_kernels); + + // Returns all operations for which there are XLA kernels on any device. + static std::vector GetAllRegisteredOps(); + + // Returns (via `result`) the indices of inputs to `node_def` that must be + // compile-time constants. Returns an empty vector if the op is not + // registered. + // + // `result` is sorted. 
+ static absl::Status CompileTimeConstantInputs(const NodeDef& node_def, + const OpDef& op_def, + std::vector* result) { + return CompileTimeConstantInputs(node_def, /*op_kernel=*/nullptr, &op_def, + result); + } + + static absl::StatusOr> CompileTimeConstantInputs( + const NodeDef& node_def, const OpDef& op_def) { + std::vector out; + TF_RETURN_IF_ERROR(CompileTimeConstantInputs(node_def, op_def, &out)); + return out; + } + + // Returns (via `result`) the indices of inputs to `op_kernel` that must be + // compile-time constants. + // + // `result` is sorted. + static absl::Status CompileTimeConstantInputs(const OpKernel& op_kernel, + std::vector* result) { + return CompileTimeConstantInputs(op_kernel.def(), /*op_kernel=*/&op_kernel, + /*op_def=*/nullptr, result); + } + + // Return names of arguments for a given op which are supposed to be + // constants. + static const std::unordered_set* + CompileTimeConstantInputArgNames(const string& op); + + // Returns true if `op` is a "metadata" op, one that only looks at the shapes + // of its operands and not their values. + static bool IsMetadataOp(const string& op); + + private: + friend class XlaBackendRegistrar; + friend class XlaOpRegistrar; + friend class XlaOpRegistrationBuilder; + + static XlaOpRegistry& Instance(); + + XlaOpRegistry(); + ~XlaOpRegistry(); + + mutex mutex_; + + // Describes an XLA backend. + struct Backend { + // Which types are supported by this device? + std::set supported_types; + + // The per-backend operator filter function. See the comment on + // RegisterBackend() for details. + BackendOpFilter op_filter; + + // KernelDefs built by RegisterCompilationKernels() for each op supported + // by the device. + std::vector> kernel_defs; + }; + + // Map from compilation device names to a description of the backend. + std::unordered_map backends_ TF_GUARDED_BY(mutex_); + + // Map from Tensorflow device names to the corresponding JIT device metadata. + std::unordered_map compilation_devices_ + TF_GUARDED_BY(mutex_); + + // A description of a Tensorflow operator that can be compiled to XLA. + struct OpRegistration { + string name; + + // Should this operator be registered only on compilation devices, without a + // dummy kernel registered on the corresponding XLA device? + bool compilation_only = false; + + // Should we allow resource types for type attributes? Used by _Arg to + // allow DT_RESOURCE. + bool allow_resource_types = false; + + // Should we allow variant types for type attributes? Used by While to + // allow TensorList which is of type DT_VARIANT. + bool allow_variant_types = false; + + // Should we allow string type for type attributes? Used by PartitionedCall + // to allow DT_STRING. + bool allow_string_type = false; + + // Mapping from attribute name to a list of supported types. + std::unordered_map> type_constraints; + + // An optional allowlist of devices. If there is no allowlist, all devices + // are permitted. + bool has_device_allowlist = false; + std::unordered_set device_allowlist; + + // Names of arguments that must be compile-time constants. + std::unordered_set compile_time_constant_inputs; + + // True if this is a "metadata" op, one that only looks at the shapes of its + // operands and not their values. + bool is_metadata_op = false; + + std::string label; + + // Factory used to build OpKernels that perform symbolic execution. + Factory factory; + }; + + // Returns true if registrations x and y can both be added to the registry. + // This is always the case if they refer to different ops. 
If they refer to + // the same op name, they must: have the same values for compilation_only, + // allow_resource_types and allow_variant_types; use a device_allowlist; and + // their allowlists must not intersect. + static bool IsCompatible(const OpRegistration& x, const OpRegistration& y); + + static absl::Status CompileTimeConstantInputs(const NodeDef& node_def, + const OpKernel* op_kernel, + const OpDef* op_def, + std::vector* result); + + // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP. + // Registrations present under the same key must satisfy IsCompatible above, + // and this is checked during registration. + std::unordered_map>> ops_ + TF_GUARDED_BY(mutex_); + + // Have we already registered the JIT kernels on the JIT devices? + bool jit_kernels_registered_ = false; + + // Holds ownership of OpKernelRegistrars that represent the Tensorflow kernel + // registrations created by RegisterCompilationKernels() and + // RegisterDeviceKernels(). + std::vector> + kernel_registrars_ TF_GUARDED_BY(mutex_); +}; + +// REGISTER_XLA_OP() registers an XLA OpKernel by name, for example: +// REGISTER_XLA_OP(Name("Add"), AddOp); +// where 'AddOp' is the name of a JIT OpKernel class that implements "Add". +// +// We don't use a variadic macro here because we don't expect JIT operators to +// be templated. + +#define REGISTER_XLA_OP(NAME, OP) \ + REGISTER_XLA_OP_UNIQ_HELPER(__COUNTER__, NAME, OP) + +#define REGISTER_XLA_CONV_OP(BUILDER, OP) \ + REGISTER_XLA_OP(BUILDER.TypeConstraint("T", GetXlaConvTypesForNonGpu()), OP) \ + REGISTER_XLA_OP(BUILDER.TypeConstraint("T", GetXlaConvTypesForGpu()) \ + .Device(DEVICE_GPU_XLA_JIT), \ + OP) + +class XlaOpRegistrationBuilder { + public: + // Starts an operator registration chain. + static XlaOpRegistrationBuilder Name(absl::string_view name); + + // Specifies a allowlist of devices on which the operator may run. + XlaOpRegistrationBuilder& Device(absl::string_view devices); + XlaOpRegistrationBuilder& Device(absl::Span devices); + + // Specifies a type constraint for a type variable attribute. Each constraint + // specifies the set of types that the type variable may assume. + XlaOpRegistrationBuilder& TypeConstraint(absl::string_view attr_name, + DataType allowed); + + XlaOpRegistrationBuilder& TypeConstraint(absl::string_view attr_name, + absl::Span allowed); + + // Specifies that a dummy copy of this operator should not be registered on + // XLA_* devices, but may be used during compilation. + XlaOpRegistrationBuilder& CompilationOnly(); + + // Allow DT_RESOURCE types for type parameters. + XlaOpRegistrationBuilder& AllowResourceTypes(); + + // Allow DT_VARIANT types for type parameters. + XlaOpRegistrationBuilder& AllowVariantTypes(); + + // Allow DT_STRING type for type parameters. + XlaOpRegistrationBuilder& AllowStringType(); + + // Mark 'input_name' as an argument whose value must be known at compile-time. + XlaOpRegistrationBuilder& CompileTimeConstantInput( + absl::string_view input_name); + + // Mark this op as a "metadata" op, one that only looks at the shapes of its + // operands and not their values. + XlaOpRegistrationBuilder& IsMetadataOp(); + + // Specifies a particular value for the "_kernel" attr. + XlaOpRegistrationBuilder& Label(std::string label); + + std::unique_ptr Build( + XlaOpRegistry::Factory factory); + + private: + XlaOpRegistrationBuilder(absl::string_view name); + + std::unique_ptr registration_; +}; + +// REGISTER_XLA_BACKEND() registers an XLA backend. 
Example usage: +// REGISTER_XLA_BACKEND(DEVICE_GPU_XLA_JIT, kGpuAllTypes, GpuOpFilter); +#define REGISTER_XLA_BACKEND(NAME, ...) \ + REGISTER_XLA_BACKEND_UNIQ_HELPER(__COUNTER__, NAME, __VA_ARGS__) + +// Implementation details. + +class XlaOpRegistrar { + public: + XlaOpRegistrar(std::unique_ptr registration); +}; + +#define REGISTER_XLA_OP_UNIQ_HELPER(COUNTER, BUILDER, OP) \ + REGISTER_XLA_OP_UNIQ(COUNTER, BUILDER, OP) + +#define REGISTER_XLA_OP_UNIQ(CTR, BUILDER, OP) \ + static ::tensorflow::XlaOpRegistrar xla_op_registrar__body__##CTR##__object( \ + ::tensorflow::XlaOpRegistrationBuilder::BUILDER.Build( \ + [](::tensorflow::OpKernelConstruction* context) \ + -> ::tensorflow::OpKernel* { return new OP(context); })); + +class XlaBackendRegistrar { + public: + XlaBackendRegistrar(absl::string_view name, absl::Span types, + XlaOpRegistry::BackendOpFilter op_filter = nullptr); +}; + +#define REGISTER_XLA_BACKEND_UNIQ_HELPER(COUNTER, NAME, ...) \ + REGISTER_XLA_BACKEND_UNIQ(COUNTER, NAME, __VA_ARGS__) + +#define REGISTER_XLA_BACKEND_UNIQ(CTR, NAME, ...) \ + static ::tensorflow::XlaBackendRegistrar \ + xla_backend_registrar__body__##CTR##__object(NAME, __VA_ARGS__); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_resource.h b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_resource.h new file mode 100644 index 00000000..d4c8f7c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/compiler/tf2xla/xla_resource.h @@ -0,0 +1,197 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_ + +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "xla/hlo/builder/xla_builder.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +namespace tensorflow { + +// Represents a resource, such as a Variable or TensorArray. +class XlaResource { + public: + enum Kind { + kInvalid, + kVariable, + kTensorArray, + kStack, + }; + static absl::string_view KindToString(Kind kind); + + // Creates a new Stack resource. + static std::unique_ptr CreateStack(string name, DataType type, + int64_t max_size); + + // Creates a new TensorArray resource. 
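A minimal sketch of how the registration macros and builder chain from xla_op_registry.h above are typically combined. `FooOp`, the op name "Foo", and its "axis" input are hypothetical, and `XlaOpKernel`/`XlaOpKernelContext` come from the companion xla_op_kernel.h header rather than from this file:

```cpp
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"    // XlaOpKernel, XlaOpKernelContext
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"  // REGISTER_XLA_OP, builder

namespace tensorflow {
namespace {

// Hypothetical kernel that performs symbolic (XLA) execution of op "Foo".
class FooOp : public XlaOpKernel {
 public:
  explicit FooOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    // Emits XLA ops instead of computing values; identity, for illustration.
    ctx->SetOutput(0, ctx->Input(0));
  }
};

// The chain resolves to XlaOpRegistrationBuilder::Name("Foo")... as described
// above; "axis" is marked as a compile-time constant input.
REGISTER_XLA_OP(Name("Foo")
                    .TypeConstraint("T", DT_FLOAT)
                    .CompileTimeConstantInput("axis"),
                FooOp);

}  // namespace
}  // namespace tensorflow
```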
+ static std::unique_ptr CreateTensorArray( + string name, DataType type, TensorShape shape, xla::XlaOp initial_value, + int64_t max_array_size); + + XlaResource(Kind kind, int arg_num, string name, DataType type, + TensorShape shape, xla::XlaOp initial_value, + int64_t max_array_size, + const std::set& tensor_array_gradients, + bool tensor_array_multiple_writes_aggregate, + const std::optional& definition_stack_trace = + std::nullopt); + + XlaResource(const XlaResource&) = delete; + XlaResource(XlaResource&&) = delete; + XlaResource& operator=(const XlaResource&) = delete; + XlaResource& operator=(XlaResource&&) = delete; + + Kind kind() const { return kind_; } + + // If this resource is visible externally to the computation, what was its + // argument number? + // < 0 means "not visible externally". + int arg_num() const { return arg_num_; } + + // A descriptive name for the resource, used in error messages. + const string& name() const { return name_; } + + // Current type and value of the resource. Uninitialized resources are + // represented by a default (zero) handle and type DT_INVALID. + // While the type of a resource is notionally fixed during execution, when + // a resource is first initialized we do not yet know its type, so we keep + // track of its type dynamically. + DataType type() const { return type_; } + + // Shape of the resource. For an uninitialized resource, this is ignored. + // For a Variable, this is the shape of the value. For a TensorArray or Stack + // this is the shape of each entry in the TensorArray/Stack. + const TensorShape& shape() const { return shape_; } + + const xla::XlaOp& value() const { return value_; } + + // Value of the resource at computation entry. Used to detect which + // variables have new values that need to be written back. + const xla::XlaOp& initial_value() const { return initial_value_; } + + // An xla shape that indicates how this resource variable is represented on + // device. + const std::optional& representation_shape() const { + return representation_shape_; + } + + // A variable is initialized if it has a value. + bool initialized() const { return value_.valid(); } + + // Sets the type and shape of the resource. The type and shape of a resource + // must not change once the variable has been initialized. + absl::Status SetTypeAndShape(DataType type, const TensorShape& shape); + + // Sets the current value of the resource. Returns an error if the type is not + // set to a valid value. + absl::Status SetValue(xla::XlaOp value); + + // Sets the current value of the resource to an all-zero value. + absl::Status SetZeroValue(xla::XlaBuilder* builder); + + // Sets the representational shape of the resource on device. + void SetRepresentationShape(const xla::Shape& shape) { + representation_shape_ = absl::make_optional(shape); + } + + // Looks up the gradient for `source`, or creates it if it does not already + // exist. The call target must be an initialized TensorArray resource. A + // TensorArray can have multiple named gradients; see the operator + // documentation for TensorArrayGradV3 for details. + absl::Status GetOrCreateTensorArrayGradient(const string& source, + xla::XlaBuilder* builder, + XlaResource** gradient_out); + + // Packs a resource into a single XLA value `pack`, suitable for use as + // an XlaCompiler::Argument. For non-TensorArrays or TensorArrays without + // gradients, sets `*pack` to `value`. 
+ // For TensorArrays with gradients, packs the value and its gradient values in + // a tuple; the gradients values are packed in order by source name. + absl::Status Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const; + + // Updates the resource with values from `pack`. If `gradient_sources` is + // non-empty, treats `pack` as a tuple that represents a TensorArray and + // its gradients, and unpacks and updates the gradient resources. + // If `reset_initial_values` is true, sets the initial_values as well as the + // values. + // Opposite of Pack(). + absl::Status SetFromPack(const std::set& gradient_sources, + xla::XlaOp pack, xla::XlaBuilder* builder); + + bool IsOverwritten() { return is_overwritten_; } + + // TensorArray and Stack specific fields + // TODO(phawkins): refactor this code to use subclasses, rather than putting + // kind-specific fields in XlaResource. + + // 'max_array_size' stores the expected size of the TensorArray or Stack. + // We need to store this since sometimes TensorArrays must be initialized + // lazily since we do not know the element shape at construction time. + // Used by both TensorArrays and Stacks. + int64_t max_array_size() const { return max_array_size_; } + void set_max_array_size(int64_t size) { max_array_size_ = size; } + + bool tensor_array_multiple_writes_aggregate() const { + return tensor_array_multiple_writes_aggregate_; + } + + // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes + // to an XlaResource containing the gradient TensorArrays. We store a pointer + // here since there should only be one gradient TensorArray per 'source' + // string, irrespective of the number of calls to TensorArrayGrad. The map + // is ordered since values are packed into tuples by Pack() sorted by name + // order. + const std::map>& tensor_array_gradients() + const { + return tensor_array_gradients_; + } + + private: + const Kind kind_; + const int arg_num_; + const string name_; + + DataType type_; + TensorShape shape_; + xla::XlaOp value_; + xla::XlaOp initial_value_; + + // An xla shape that indicates how this resource variable is represented on + // device. + std::optional representation_shape_; + + int64_t max_array_size_ = -1; + bool tensor_array_multiple_writes_aggregate_ = false; + + std::map> tensor_array_gradients_; + bool is_overwritten_ = false; + + std::optional definition_stack_trace_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity.h b/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity.h new file mode 100644 index 00000000..eecd207a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity.h @@ -0,0 +1,186 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_H_ +#define TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "xla/tsl/platform/macros.h" +#include "xla/tsl/platform/types.h" + +namespace tsl { +class CoordinationServiceAgent; +} + +namespace tensorflow { + +namespace activity_watcher { + +using ActivityId = tsl::uint64; +constexpr ActivityId kActivityNotRecorded = 0; +constexpr int kWatcherDisabled = 0; + +enum ActivityCategory { + kCollective = 0, + kRemoteFunction = 1, + kMisc = 2, + kDatasetOp = 3, + kTpuOp = 4, + kRendezvous = 5, +}; + +static tsl::string ToString(ActivityCategory category) { + switch (category) { + case ActivityCategory::kCollective: + return "Collective"; + case ActivityCategory::kRemoteFunction: + return "Remote Function"; + case ActivityCategory::kMisc: + return "Miscellaneous"; + case ActivityCategory::kDatasetOp: + return "Dataset Op"; + case ActivityCategory::kTpuOp: + return "TPU Op"; + case ActivityCategory::kRendezvous: + return "Rendezvous"; + } +} + +// An activity to be recorded. +struct Activity { + using Attributes = absl::flat_hash_map; + // A human readable title of the activity. + tsl::string title; + // The category of the activity. + ActivityCategory category = ActivityCategory::kMisc; + // Key/value pairs that are attached to the activity. + Attributes attributes; + Activity() = default; + Activity(tsl::string title, ActivityCategory category) + : title(std::move(title)), category(category) {} + Activity(tsl::string title, ActivityCategory category, Attributes attributes) + : title(std::move(title)), + category(category), + attributes(std::move(attributes)) {} +}; + +// Enable activity wathcer to send own workers activities to coordination +// service and also fetch all workers' activities. +void MaybeEnableMultiWorkersWatching(tsl::CoordinationServiceAgent* agent); + +namespace tfw_internal { + +#if defined(TF_ENABLE_ACTIVITY_WATCHER) + +// Records an activity start without checking whether the watcher is enabled. +ActivityId RecordActivityStart(std::unique_ptr activity); +// Records an activity end without checking whether the activity_id is valid. +void RecordActivityEnd(ActivityId activity_id); + +TF_EXPORT extern std::atomic g_watcher_level; + +// Returns whether the activitity watcher is enabled. +inline bool WatcherEnabled(int level = 1) { + return g_watcher_level.load(std::memory_order_acquire) >= level; +} + +#endif + +// NOTE: Borrowed from boost C++ libraries because std::is_invocable_r is not +// available in Android NDK. +template +struct is_invocable_r + : std::is_constructible< + std::function, + std::reference_wrapper::type>> {}; + +} // namespace tfw_internal + +template +constexpr bool is_activity_generator = + tfw_internal::is_invocable_r, F>::value; + +// Records an activity explicitly. Useful when the start and end of an activity +// happen in different threads. Generates the Activity only if activity +// watching is enabled, useful for avoiding expensive operations when activity +// watching is disabled. 
+// Example Usage: +// auto aid = ActivityStart([&]() { +// return std::make_unique( +// op_name, category, +// Activity::Attributes{{"key1", value1}, {"key2", value2}}); +// }, /*level=*/2); +// DoSomething(); +// ActivityEnd(aid); +template < + typename ActivityGenerator, + std::enable_if_t, bool> = true> +inline ActivityId ActivityStart(ActivityGenerator&& gen, int level = 1) { +#if defined(TF_ENABLE_ACTIVITY_WATCHER) + if (TF_PREDICT_FALSE(tfw_internal::WatcherEnabled(level))) { + return tfw_internal::RecordActivityStart( + std::forward(gen)()); + } +#endif + return kActivityNotRecorded; +} + +inline void ActivityEnd(ActivityId id) { +#if defined(TF_ENABLE_ACTIVITY_WATCHER) + if (TF_PREDICT_FALSE(id != kActivityNotRecorded)) { + tfw_internal::RecordActivityEnd(id); + } +#endif +} + +// ActivityScope marks a scope as an activity and record it with a global +// ActivityRecorder. +// Example Usage: +// { +// ActivityScope activity_scope([&]() { +// return std::make_unique( +// op_name, ActivityCategory::kMisc, +// Activity::Attributes{{"key1", value1}, {"key2", value2}}); +// }, /*level=*/2); +// DoSomething(); +// } +class ActivityScope { + public: + template < + typename ActivityGenerator, + std::enable_if_t, bool> = true> + explicit ActivityScope(ActivityGenerator&& gen, int level = 1) { + activity_id_ = ActivityStart(std::forward(gen), level); + } + ActivityScope(ActivityScope&& activity) { + activity_id_ = activity.activity_id_; + activity.activity_id_ = kActivityNotRecorded; + } + ~ActivityScope() { ActivityEnd(activity_id_); } + + private: + ActivityId activity_id_; + ActivityScope(const ActivityScope&) = delete; + void operator=(const ActivityScope&) = delete; +}; + +} // namespace activity_watcher +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity_utils.h b/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity_utils.h new file mode 100644 index 00000000..64958cd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/activity_watcher/activity_utils.h @@ -0,0 +1,38 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_UTILS_H_ +#define TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_UTILS_H_ + +#include + +#include "xla/tsl/platform/types.h" +#include "tensorflow/core/activity_watcher/activity.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace activity_watcher { + +// A convenient way to create an activity. Writes OpKernelContext information +// and given attributes to a new activity and returns. 
+std::unique_ptr ActivityFromContext( + OpKernelContext* context, tsl::string name, ActivityCategory category, + Activity::Attributes additional_attributes = Activity::Attributes()); + +} // namespace activity_watcher +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_ACTIVITY_WATCHER_ACTIVITY_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/api_def/excluded_ops.h b/third_party/tflite-hdrs/tensorflow/core/api_def/excluded_ops.h new file mode 100644 index 00000000..409e5d32 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/api_def/excluded_ops.h @@ -0,0 +1,28 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_ +#define TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_ + +#include +#include + +namespace tensorflow { + +// Returns a list of ops excluded from ApiDef. +// TODO(annarev): figure out if we should keep ApiDefs for these ops as well +const std::unordered_set* GetExcludedOps(); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/api_def/update_api_def.h b/third_party/tflite-hdrs/tensorflow/core/api_def/update_api_def.h new file mode 100644 index 00000000..1e285c06 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/api_def/update_api_def.h @@ -0,0 +1,45 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_ +#define TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_ +// Functions for updating ApiDef when new ops are added. + +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Returns ApiDefs text representation in multi-line format +// constructed based on the given op. +string CreateApiDef(const OpDef& op); + +// Removes .Doc call for the given op. +// If unsuccessful, returns original file_contents and prints an error. +// start_location - We search for .Doc call starting at this location +// in file_contents. +string RemoveDoc(const OpDef& op, const string& file_contents, + size_t start_location); + +// Creates api_def_*.pbtxt files for any new ops (i.e. ops that don't have an +// api_def_*.pbtxt file yet). 
+// If op_file_pattern is non-empty, then this method will also +// look for a REGISTER_OP call for the new ops and removes corresponding +// .Doc() calls since the newly generated api_def_*.pbtxt files will +// store the doc strings. +void CreateApiDefs(const OpList& ops, const string& api_def_dir, + const string& op_file_pattern); + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/all_to_all.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/all_to_all.h new file mode 100644 index 00000000..f0fb1651 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/all_to_all.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ALL_TO_ALL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ALL_TO_ALL_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device.h" + +namespace tensorflow { + +// Implementation of collective all-to-all. +class AllToAll : public CollectiveImplementationInterface { + public: + AllToAll(); + + void Run(StatusCallback done) override; + + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override { + return absl::OkStatus(); + } + + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. Also saves the CollectiveContext in this object. + absl::Status InitializeCollectiveContext( + std::shared_ptr col_ctx) override; + + private: + std::shared_ptr col_ctx_; + const CollectiveParams* col_params_; // Not owned + std::vector input_chunks_; + Tensor output_buffer_; + std::vector output_chunks_; + StatusCallback done_; + mutex mu_; + absl::Status status_ TF_GUARDED_BY(mu_); + int counter_ TF_GUARDED_BY(mu_); + + void DispatchSend(int src_rank, int target_rank, const Tensor* tensor, + const StatusCallback& done); + + void DispatchRecv(int src_rank, int target_rank, Tensor* tensor, + const StatusCallback& done); + + // Atomically increments counter_ by one for sending, one for receiving. + // Invokes done when counter_ reaches 2. + // The purpose of checking counter_ is to ensure that done_ is called once. + StatusCallback CheckCounterAndCallDone(); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ALL_TO_ALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/allocator_retry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/allocator_retry.h new file mode 100644 index 00000000..842b82db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/allocator_retry.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ + +#include "xla/tsl/framework/allocator_retry.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::AllocatorRetry; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/arg_ret_placement.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/arg_ret_placement.h new file mode 100644 index 00000000..e0b40182 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/arg_ret_placement.h @@ -0,0 +1,158 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ARG_RET_PLACEMENT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ARG_RET_PLACEMENT_H_ + +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow::full_type { + +// Set the contents of memory_types for args (inputs to functions, "_Arg" ops) +// based on dtype. Raises an error if an int32 arg does not have +// expected full_type information. If an error raised about bad full +// time information causes a breakage, changing `SetMemoryTypeForArgs` to +// `WeakSetMemoryTypeForArgs` is a possible work around. +absl::Status SetMemoryTypeForArgs(const absl::InlinedVector& nodes, + const DataTypeVector& dtypes, + MemoryTypeVector& memory_types); + +// TODO(b/258849883) Delete the `Weak...` versions of these functions once +// everything is working with the version without `Weak`. + +// Set the contents of memory_types for args (inputs to functions, "_Arg" ops) +// based on dtype. Logging of warnings if an int32 arg does not have +// expected full_type information can be enabled. +absl::Status WeakSetMemoryTypeForArgs( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + MemoryTypeVector& memory_types); + +// Set the contents of memory_types for rets (outputs from functions, "_Retval" +// ops) based on dtype. 
Raises an error if an int32 ret does not have +// expected full_type information (i.e. if the source of the input to the ret +// does not have expected full type information). If an error raised about bad +// full time information causes a breakage, changing `SetMemoryTypeForRets` to +// `WeakSetMemoryTypeForRets` is a possible work around. +absl::Status SetMemoryTypeForRets(const absl::InlinedVector& nodes, + const DataTypeVector& dtypes, + MemoryTypeVector& memory_types); + +// Set the contents of memory_types for rets (outputs from functions, "_Retval" +// ops) based on dtype. Logging of warnings if an int32 ret does not have +// expected full_type information (i.e. if the source of the input to the ret +// does not have expected full type information) can be enabled. +absl::Status WeakSetMemoryTypeForRets( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + MemoryTypeVector& memory_types); + +// Set the contents of alloc_attrs for args (inputs to functions, "_Arg" ops) +// based on dtype. Raises an error if an int32 arg does not have +// expected full_type information. If an error raised about bad full +// time information causes a breakage, changing `SetAllocAttrsForArgs` to +// `WeakSetAllocAttrsForArgs` is a possible work around. +absl::Status SetAllocAttrsForArgs( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for args (inputs to functions, "_Arg" ops) +// based on dtype. Logging of warnings if an int32 arg does not have +// expected full_type information can be enabled. +absl::Status WeakSetAllocAttrsForArgs( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for rets (outputs from functions, "_Retval" +// ops) based on dtype. Raises an error if an int32 ret does not have +// expected full_type information (i.e. if the source of the input to the ret +// does not have expected full type information). If an error raised about bad +// full time information causes a breakage, changing `SetAllocAttrsForRets` to +// `WeakSetAllocAttrsForRets` is a possible work around. +absl::Status SetAllocAttrsForRets( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for rets (outputs from functions, "_Retval" +// ops) based on dtype. Logging of warnings if an int32 ret does not have +// expected full_type information (i.e. if the source of the input to the ret +// does not have expected full type information) can be enabled. +absl::Status WeakSetAllocAttrsForRets( + const absl::InlinedVector& nodes, const DataTypeVector& dtypes, + std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for args (inputs to functions, "_Arg" ops) +// for a single device funtion based on dtype. Raises an error if an int32 arg +// does not have expected full_type information. If an error raised about bad +// full time information causes a breakage, changing +// `SingleDeviceSetAllocAttrsForArgs` to `WeakSingleDeviceSetAllocAttrsForArgs` +// is a possible work around. The DataType specified by the "T" attr of input +// nodes is used. +absl::Status SingleDeviceSetAllocAttrsForArgs( + std::vector> arg_nodes, + bool ints_on_device, std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for args (inputs to functions, "_Arg" ops) +// for a single device based on dtype. 
Logging of warnings if an int32 arg does +// not have expected full_type information can be enabled. The DataType +// specified by the "T" attr of input nodes is used. +absl::Status WeakSingleDeviceSetAllocAttrsForArgs( + std::vector> arg_nodes, + bool ints_on_device, std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for rets (outputs from functions, "_Retval" +// ops) for a single device based on dtype. Raises an error if an int32 ret does +// not have expected full_type information (i.e. if the source of the input to +// the ret does not have expected full type information). If an error raised +// about bad full time information causes a breakage, changing +// `SingleDeviceSetAllocAttrsForRets` to `WeakSingleDeviceSetAllocAttrsForRets` +// is a possible work around. The DataType specified by the "T" attr of input +// nodes is used. +absl::Status SingleDeviceSetAllocAttrsForRets( + std::vector> ret_nodes, bool ints_on_device, + std::vector& alloc_attrs); + +// Set the contents of alloc_attrs for rets (outputs from functions, "_Retval" +// ops) for a single device based on dtype. Logging of warnings if an int32 ret +// does not have expected full_type information (i.e. if the source of the input +// to the ret does not have expected full type information) can be enabled. The +// DataType specified by the "T" attr of input nodes is used. +absl::Status WeakSingleDeviceSetAllocAttrsForRets( + std::vector> ret_nodes, bool ints_on_device, + std::vector& alloc_attrs); + +// Given a FullTypeId, return the corresponding MemoryTypes (i.e. return +// HOST_MEMORY for TFT_SHAPE_TENSOR, DEVICE_MEMORY othersize). +MemoryType MemoryTypeFromFullTypeId(FullTypeId id); + +// Check that use_host_memory is true iff FT has type_id TFT_SHAPE_TENSOR +// and logging of a warning if not can be enabled. Returns true if check passes. +// Note the FT is expected to be the full type information for a tensor, not for +// the whole ouput of an op, i.e. it should not have an outer TFT_PRODUCT. +bool LogMemoryTypeMismatch(bool use_host_memory, const FullTypeDef& ft); + +// Check that use_host_memory is true iff FT has type_id TFT_SHAPE_TENSOR +// and raise an error if not. Note the FT is expected to be the full type +// information for a tensor, not for the whole ouput of an op, i.e. it should +// not have an outer TFT_PRODUCT. +absl::Status CheckMemoryType(bool use_host_memory, const FullTypeDef& ft); + +} // namespace tensorflow::full_type + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ARG_RET_PLACEMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/base_collective_executor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/base_collective_executor.h new file mode 100644 index 00000000..0c4689bc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/base_collective_executor.h @@ -0,0 +1,164 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/buf_rendezvous.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" + +namespace tensorflow { +class CollectiveImplementation; +class DeviceMgr; +class Device; + +// Helper interface that aliases regular subfields of a Tensor as separate +// Tensors for in-place update. +class CollectiveAdapter { + public: + virtual ~CollectiveAdapter() {} + + // Move the backing tensor to 'output' with its original storage and + // shape. After this call this CollectiveAdapter object should be + // deleted immediately without calling any of its other methods. + virtual void ConsumeFinalValue(Tensor* output) = 0; + + // const access to entire intermediate value for debugging + virtual const Tensor& Value() const = 0; + + // Returns tensor for chunk i which aliases the backing buffer. + virtual Tensor ChunkAlias(int i) = 0; + + // Returns tensor allocated on the same device but with its own + // separate backing buffer. Will have same type and size as + // chunk i. + virtual Tensor TempChunk(int i) const = 0; + + // Bytes in chunk i + virtual int64_t ChunkBytes(int i) const = 0; + + // Generate a CPU RAM scalar tensor of the same DataType as the + // backing tensor with the given integer value. + virtual Tensor Scalar(int v) const = 0; + + // Generate a scalar tensor of same DataType and on the same device + // as the backing tensor. + virtual Tensor Scalar(Allocator* a, + const AllocationAttributes& attr) const = 0; + + // Debugging string describing buffer location + virtual string TBounds(const Tensor& t) const = 0; + + virtual string DebugString() const = 0; + + // Computes the number of elements per alias chunk tensor. + // + // A CHECK in tensor.cc expects that the memory buffer backing a + // Tensor will be aligned according to EIGEN_MAX_ALIGN_BYTES. To + // ensure that all chunk aliasing Tensors maintain this alignment we + // need to pick a chunk size that preserves it. Note than in extreme + // cases (impractical, but possible with very small tensors) one or + // more tail chunks can end up emptby. + static int64_t AlignedChunkElts(int64_t elt_bytes, int64_t total_elts, + int64_t num_chunks); +}; + +// Create a CollectiveAdaptor wrapping 'output', specialized to its +// data-type and shape. If align_chunks == true then chunk size may +// be larger than output->NumElements() / num_chunks and one or more +// of the suffix chunks may be empty. Chunks will be arranged to start +// and end on alignment boundaries. If align_chunks == false then +// output->NumElements() % num_chunks must be 0 and all chunks will +// have exactly the same size, ignoring alignment issues. +CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks, + Allocator* allocator, + bool align_chunks = true); + +// Default implementation of CollectiveExecutor. Delegates the actual +// work of moving data to a class specialized for the operation type, +// arguments and device+interconnect topology. 
+class BaseCollectiveExecutor : public CollectiveExecutor { + public: + BaseCollectiveExecutor(CollectiveExecutorMgrInterface* cem, + CollectiveRemoteAccess* remote_access, int64_t step_id, + const DeviceMgr* dev_mgr, + std::shared_ptr work_queue) + : CollectiveExecutor(cem), + step_id_(step_id), + dev_mgr_(dev_mgr), + remote_access_(remote_access), + work_queue_(std::move(work_queue)) {} + + ~BaseCollectiveExecutor() override; + + void StartAbort(const absl::Status& s) override TF_LOCKS_EXCLUDED(status_mu_); + + void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams* col_params, + const string& exec_key, StatusCallback done) override; + + void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, + StatusCallback done) override; + + CollectiveRemoteAccess* remote_access() override { + return remote_access_.get(); + } + + void RunClosure(std::function closure) override { + work_queue_->Schedule(std::move(closure)); + } + + // If we need to enforce an ordering on any portion of collective + // implementation, and the ordering is encoded via attribute on the collective + // op, this function will block until all dependencies for this collective + // have completed. + void WaitForDependencies(const CollectiveParams& col_params) override; + // Record that this collective has completed the portion of the implementation + // that needs to be ordered wrt other collectives, to unblock any of its + // dependent ops. + void UnblockDependencies(const CollectiveParams& col_params) override; + + protected: + const int64_t step_id_; + const DeviceMgr* dev_mgr_; // Not owned. + std::unique_ptr remote_access_; + // Ownership of `work_queue_` is shared between `this` and + // `CollectiveExecutorMgr`. + std::shared_ptr work_queue_; + mutex launch_mu_; + condition_variable launch_cv_; + // collective instance key -> number of local devices for which NCCL ops have + // been launched. + std::unordered_map launched_ TF_GUARDED_BY(launch_mu_); + mutex status_mu_; + absl::Status status_ TF_GUARDED_BY(status_mu_); + + private: + absl::Status CreateCollective(const CollectiveParams& col_params, + CollectiveImplementationInterface** col_impl); + // Check if all ops on which this collective depends on have launched. + bool CheckDependencies(const CollectiveParams& col_params) + TF_EXCLUSIVE_LOCKS_REQUIRED(launch_mu_); + // Tries to return the status that is the original error. It returns the + // aborted status if the collective executor is aborted. + absl::Status GetStatus(const absl::Status& s) TF_LOCKS_EXCLUDED(status_mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/bfc_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/bfc_allocator.h new file mode 100644 index 00000000..c8becd4c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/bfc_allocator.h @@ -0,0 +1,45 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "xla/tsl/framework/bfc_allocator.h" +#include "tensorflow/core/common_runtime/allocator_retry.h" +#include "tensorflow/core/common_runtime/shared_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class MemoryDump; // NOLINT +using tsl::BFCAllocator; // NOLINT + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/buf_rendezvous.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/buf_rendezvous.h new file mode 100644 index 00000000..8c2d201e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/buf_rendezvous.h @@ -0,0 +1,134 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +class Device; +class DeviceContext; +class DeviceMgr; +class Tensor; + +// EXPERIMENTAL: RDMA oriented producer/consumer rendezvous on a local +// Tensor value for which DMAHelper::CanUseDMA() is true, i.e. dense +// numeric types. Similar to Rendezvous but never owns a Ref on the +// tensor, instead it uses an explicit callback to the producer when +// the consumer side is finished with the value. This allows the +// producer to perform in-place updates on the source buffer or to take +// other actions that depend on knowing the consumer has passed a certain +// execution point. +class BufRendezvous { + public: + explicit BufRendezvous(uint64 step_id, const DeviceMgr* dev_mgr) + : step_id_(step_id), dev_mgr_(dev_mgr) {} + + virtual ~BufRendezvous(); + + // Inform all waiting parties that this BufRendezvous is defunct because of + // an error Status interrupting the Step. 
+ void StartAbort(const absl::Status& s); + + struct Hook; + // Provided by the consumer to be called when access to the buffer + // is available. If the Status arg is not OK, then hook will not + // be populated. Ownership of Hook passes to consumer with the + // callback. + typedef std::function ConsumerCallback; + // Provided by the producer to be called when the consumer has finished + // reading the buffer and will no longer access it. + typedef std::function ProducerCallback; + + struct Hook { + Device* prod_dev; + DeviceContext* prod_ctx; + const Tensor* prod_value; + AllocatorAttributes prod_attr; + ProducerCallback prod_cb; + ConsumerCallback cons_cb; + CancellationManager* cancellation_manager; + CancellationToken cancellation_token; + explicit Hook(CancellationManager* cancellation_manager, + CancellationToken cancellation_token) + : prod_dev(nullptr), + prod_ctx(nullptr), + prod_value(nullptr), + prod_cb(nullptr), + cons_cb(nullptr), + cancellation_manager(cancellation_manager), + cancellation_token(cancellation_token) {} + string DebugString() const; + }; + + // Called to advertise availability of a Tensor value corresponding + // to key. That value must stay valid until done is called. + // + // If a non-null cancellation manager is provided, this function registers a + // callback to delete the hook and invoke provider/consumer callbacks with + // cancelled error. + void ProvideBuf(const string& key, Device* dev, DeviceContext* dev_ctx, + const Tensor* v, const AllocatorAttributes& attr, + const ProducerCallback& done, + CancellationManager* cancellation_manager); + + // Called to request access to a Tensor value corresponding to key. + // Consumer is provided with a Hook as soon as available. + // + // This function also checks that the current incarnation number of the + // `device` that produced this value matches the `incarnation` expected by the + // consumer, and invokes `done` with `FailedPrecondition` status and + // `nullptr` hook if it does not match. + // + // If a non-null cancellation manager is provided, this function registers a + // callback to delete the hook and invoke provider/consumer callbacks with + // cancelled error. + virtual void ConsumeBuf(const string& key, const string& device, + const uint64 incarnation, + const ConsumerCallback& done, + CancellationManager* cancellation_manager); + + // Cancel the rendezvous entry corresponding to `key`. Triggered by the + // cancellation manager. No-op if the rendezvous was already successful. + void CancelHook(const string& key); + + // Consumer must call this function when it's done reading the Hook provided + // by the ConsumerCallback. This function will invoke the producer callback + // and then delete h. + static void DoneWithHook(Hook* h); + + // Write the current contents of the table to the INFO log. + void LogContents(); + + protected: + const uint64 step_id_; + const DeviceMgr* const dev_mgr_; // Not owned. 
+ mutex mu_; + absl::Status status_ TF_GUARDED_BY(mu_); + typedef absl::flat_hash_map HookTable; + HookTable hook_table_ TF_GUARDED_BY(mu_); + + void PurgeTable(const absl::Status& s, HookTable* table); +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/build_graph_options.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/build_graph_options.h new file mode 100644 index 00000000..f33d43fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/build_graph_options.h @@ -0,0 +1,48 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BUILD_GRAPH_OPTIONS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_BUILD_GRAPH_OPTIONS_H_ + +#include + +#include "tensorflow/core/graph/collective_order.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +struct BuildGraphOptions { + CallableOptions callable_options; + + // If `true`, uses Arg/Retval to implement feeds/fetches; otherwise + // uses Recv/Send to implement feeds/fetches. + // TODO(mrry): Remove this when the distributed runtime supports Arg/Retval. + bool use_function_convention = false; + + static constexpr int64_t kNoCollectiveGraphKey = 0; + int64_t collective_graph_key = kNoCollectiveGraphKey; + + // If not `kNone`, order all CollectiveReduce operations statically and + // deterministically. If `kEdges`, encode dependencies as explicit control + // edges, if `kAttrs` encode as attribute on collective op. + GraphCollectiveOrder collective_order = GraphCollectiveOrder::kNone; + + string DebugString() const; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_BUILD_GRAPH_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_executor_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_executor_mgr.h new file mode 100644 index 00000000..dddaa7ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_executor_mgr.h @@ -0,0 +1,98 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_ + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" + +namespace tensorflow { +class ConfigProto; +class DeviceMgr; + +class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface { + public: + CollectiveExecutorMgr( + const ConfigProto& config, const DeviceMgr* dev_mgr, + std::unique_ptr dev_resolver, + std::unique_ptr param_resolver, + std::unique_ptr nccl_communicator); + + virtual ~CollectiveExecutorMgr(); + + CollectiveExecutor* FindOrCreate(int64_t step_id) override; + + void Cleanup(int64_t step_id) override; + + void CleanupAll() override; + + ParamResolverInterface* GetParamResolver() const override { + return param_resolver_.get(); + } + + DeviceResolverInterface* GetDeviceResolver() const override { + return dev_resolver_.get(); + } + + NcclCommunicatorInterface* GetNcclCommunicator() const override { + return nccl_communicator_.get(); + } + + void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + const StatusCallback& done) override; + + void RefreshStepIdSequenceAsync(int64_t graph_key, + const StatusCallback& done) override; + + int64_t NextStepId(int64_t graph_key) override { + return CollectiveExecutor::kInvalidId; + } + + void RetireStepId(int64_t graph_key, int64_t step_id) override {} + + protected: + // Called by FindOrCreate when table entry does not yet exist. + virtual CollectiveExecutor* Create(int64_t step_id); + + const DeviceMgr* dev_mgr_; + std::unique_ptr dev_resolver_; + std::unique_ptr param_resolver_; + string gpu_ring_order_; + std::unique_ptr nccl_communicator_; + // Unbounded work queue for scheduling potentially-blocking work during + // collective op execution. Ownership is shared between `this` and + // `CollectiveRemoteAccessLocal`. + std::shared_ptr work_queue_; + + private: + mutex exec_mu_; + // Map from step_id to CollectiveExecutor + gtl::FlatMap executor_table_ + TF_GUARDED_BY(exec_mu_); +}; + +// Creates a local CollectiveExecutorMgr with production implementations of each +// components. Cases that need to inject other implementations of these +// components should call CollectiveExecutorMgr constructor directly. This only +// supports a single host. For distributed use case, use +// CreateProdRpcCollectiveExecutorMgr() instead. +std::unique_ptr CreateProdLocalCollectiveExecutorMgr( + const ConfigProto& config, const DeviceMgr* device_mgr, + std::unique_ptr nccl_communicator); + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_param_resolver_local.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_param_resolver_local.h new file mode 100644 index 00000000..88813b0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_param_resolver_local.h @@ -0,0 +1,215 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +class CompleteGroupRequest; +class CompleteGroupResponse; +class CompleteInstanceRequest; +class CompleteInstanceResponse; +class ConfigProto; +class DeviceMgr; + +// Implements ParamResolverInterface for a single-task context. +// It also implements the functionality necessary to serve as the +// group leader for param resolution in a multi-task context. +class CollectiveParamResolverLocal : public ParamResolverInterface { + public: + CollectiveParamResolverLocal(const ConfigProto& config, + const DeviceMgr* dev_mgr, + DeviceResolverInterface* dev_resolver, + NcclCommunicatorInterface* nccl_communicator, + const string& task_name); + + ~CollectiveParamResolverLocal() override {} + + void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + void CompleteGroupAsync(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + void CompleteInstanceAsync(const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + absl::Status LookupGroup(int32_t group_key, CollGroupParams* group) override; + + void StartAbort(const absl::Status& s) override; + + protected: + // For access to InstanceRec and CompleteDefaultRanking. + friend class CollectiveParamResolverLocalTest; + + // Used to complete/verify CollGroup. + struct GroupRec { + mutable mutex mu; + CollGroupParams group TF_GUARDED_BY(mu); + absl::Status status TF_GUARDED_BY(mu); + std::unordered_map incarnations_by_device_name + TF_GUARDED_BY(mu); + std::vector pending_params TF_GUARDED_BY(mu); + std::vector pending_done TF_GUARDED_BY(mu); + }; + + // Finds the GroupRec that corresponds to group_params->group_key. + // Also populates group_params from that group_rec. + // Will wait until GroupRec is fully populated or an error arises before + // calling done. Callback GroupRec* arg is only valid if status is ok. + // Ownership of GroupRec stays with this object and does not pass to the + // callback. + void CompleteGroupLocal(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, StatusCallback done) + TF_LOCKS_EXCLUDED(group_mu_); + + // Finishes the group parameters once all members of the group are there. + void FinishGroup(GroupRec* gr) TF_EXCLUSIVE_LOCKS_REQUIRED(gr->mu); + + // Cancels the group if it's still pending. 
+ void CancelGroup(int32 group_key) TF_LOCKS_EXCLUDED(group_mu_); + + // Lookup and populate parameters from an already initialized group. + absl::Status LookupAndPopulateGroupParams(CollGroupParams* group_params); + + // Used to complete/verify CollInstance. + struct InstanceRec; + + typedef std::function IRConsumer; + struct InstanceRec { + mutex mu; + // Values to be shared by all instances, constant after initialization. + CollectiveParams* shared; + // If an error occurs during initialization this structure stays in the + // table with a non-OK status. Purging the table and restarting needs to be + // done at a higher level. + absl::Status status TF_GUARDED_BY(mu); + + // These fields are used to count the instances that have called + // in and become known while resolving broadcast source identity and + // communicator key. + int source_rank TF_GUARDED_BY(mu); + string communicator_key TF_GUARDED_BY(mu); + int known_count TF_GUARDED_BY(mu); + std::vector known TF_GUARDED_BY(mu); + std::vector known_waiters TF_GUARDED_BY(mu); + + InstanceRec() + : shared(new CollectiveParams()), source_rank(-1), known_count(0) {} + ~InstanceRec() { shared->Unref(); } + }; + + // Find the InstanceRec with the same instance_key as cp. If it doesn't + // already exist, create and initialize from gr and cp. + // created is set to true if a new IRec is created, false otherwise. + // + // Precondition: *gr must be a complete GroupRec, i.e. the value set + // by CompleteGroupLocal. *cp must be populated with all the fields + // required by InitInstanceSharedParams. Ownership of InstanceRec stays + // with this object and does not pass to the callback. + InstanceRec* GetOrCreateInstanceRec(CollectiveParams* cp, bool* created) + TF_LOCKS_EXCLUDED(instance_mu_, group_mu_); + + // Populate *ir with device membership from gr, then initialize to be specific + // to cp->instance_key, i.e. order the devices and tasks. + // + // Preconditions: + // cp is populated with all DeviceLocalities + void InitInstanceSharedParams(const CollectiveParams* cp, InstanceRec* ir); + + // Establishes the final order of gp->device_names and gp->task_names by + // considering localities of all devices. + void CompleteDefaultRanking(CollGroupParams* gp); + + // Finish populating *cp. + // Precondition: *gr has been fully populated by CompleteGroupLocal. + void CompleteInstanceLocal(const string& device, CollectiveParams* cp, + const StatusCallback& done) + TF_LOCKS_EXCLUDED(instance_mu_, group_mu_); + + // Finish populating *cp from fully initialized *ir. + // Precondition: *gr and *ir are fully populated. + void CompleteInstanceFromInitializedIRec(const string& device, + CollectiveParams* cp, + InstanceRec* ir, + const StatusCallback& done) + TF_LOCKS_EXCLUDED(ir->mu); + + // Complete instance params after waiting for group. + // Precondition: *cp has complete group data and default_rank. + void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, const IRConsumer& f) + TF_LOCKS_EXCLUDED(ir->mu); + + // If cp.device_names contains only devices local to this process + // populates *localities, else returns an error. + absl::Status GetLocalDeviceLocalities( + const CollectiveParams& cp, std::vector* localities); + + // Sets cp->instance_default_rank according to location of device in + // current ordering of cp->instance.device_names. + void SetDefaultRank(const string& device, CollectiveParams* cp); + + // Sets cp->instance.type based on collective op type, and attempts to assign + // best implementation. 
+ void AssignCollectiveType(CollectiveParams* cp); + + void StartAbortLocal(const absl::Status& s) + TF_LOCKS_EXCLUDED(status_mu_, group_mu_, instance_mu_); + + const bool nccl_; + const DeviceMgr* dev_mgr_; + DeviceResolverInterface* dev_resolver_; // Not owned. + NcclCommunicatorInterface* nccl_communicator_; // Not owned. + string task_name_; + string gpu_ring_order_; + mutex group_mu_; + gtl::FlatMap> group_table_ + TF_GUARDED_BY(group_mu_); + struct TupleHash { + std::size_t operator()(const std::tuple x) const { + // The hash does not need to be unique and a value of 20 is picked + // arbitrarily as an effort to reduce probability of conflicts. + return (std::get<0>(x) << 20) + std::get<1>(x); + } + }; + mutex instance_mu_; + gtl::FlatMap, + std::unique_ptr, TupleHash>> + instance_table_ TF_GUARDED_BY(instance_mu_); + mutex status_mu_; + absl::Status status_ TF_GUARDED_BY(status_mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_rma_local.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_rma_local.h new file mode 100644 index 00000000..2c51b87a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_rma_local.h @@ -0,0 +1,82 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_ + +#include "tensorflow/core/common_runtime/buf_rendezvous.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/rendezvous.h" + +namespace tensorflow { + +// Basic implementation of PerStepCollectiveRemoteAccess. 
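A hedged sketch of driving the resolver declared above: construct it for a single task and resolve the parameters of one collective instance. `config`, `dev_mgr`, `dev_resolver`, and `dev_attributes` are assumed to exist and are not defined in this header:

tensorflow::CollectiveParamResolverLocal resolver(
    config, dev_mgr, dev_resolver, /*nccl_communicator=*/nullptr,
    /*task_name=*/"/job:localhost/replica:0/task:0");

auto* col_params = new tensorflow::CollectiveParams();  // ref-counted
col_params->group.group_key = 1;
col_params->group.group_size = 2;

tensorflow::CancellationManager cancel_mgr;
resolver.CompleteParamsAsync(
    dev_attributes, col_params, &cancel_mgr,
    [col_params](const absl::Status& s) {
      // On s.ok(), group and instance parameters are fully resolved.
      col_params->Unref();
    });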
+class CollectiveRemoteAccessLocal : public CollectiveRemoteAccess { + public: + CollectiveRemoteAccessLocal(const DeviceMgr* dev_mgr, + DeviceResolverInterface* dev_resolver, + int64_t step_id) + : dev_mgr_(dev_mgr), + dev_resolver_(dev_resolver), + buf_rendezvous_(step_id, dev_mgr), + step_id_(step_id) {} + + ~CollectiveRemoteAccessLocal() override = default; + + void StartAbort(const absl::Status& s) override; + + void RecvFromPeer(const string& peer_device, const string& peer_task, + bool peer_is_local, const string& key, Device* to_device, + DeviceContext* to_device_ctx, + const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, + const DeviceLocality& client_locality, + int dev_to_dev_stream_index, + CancellationManager* cancellation_manager, + const StatusCallback& done) override; + + void PostToPeer(const string& peer_device, const string& peer_task, + const string& key, Device* from_device, + DeviceContext* from_device_ctx, + const AllocatorAttributes& from_alloc_attr, + const Tensor* from_tensor, + const DeviceLocality& client_locality, + CancellationManager* cancellation_manager, + const StatusCallback& done) override; + + void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms, + const StatusCallback& done) override; + + BufRendezvous* buf_rendezvous() override { return &buf_rendezvous_; } + + // Copy utility that always copies bytes from src to dst even if + // they are on the same device, unlike CopyTensor::ViaDMA which will + // just change the dst buffer pointer in that case. + static void MemCpyAsync(DeviceContext* src_dev_ctx, + DeviceContext* dst_dev_ctx, Device* src_dev, + Device* dst_dev, const AllocatorAttributes& src_attr, + const AllocatorAttributes& dst_attr, + const Tensor* src, Tensor* dst, + int dev_to_dev_stream_index, + const StatusCallback& done); + + protected: + const DeviceMgr* dev_mgr_; // not owned + DeviceResolverInterface* dev_resolver_; // not owned + BufRendezvous buf_rendezvous_; + int64_t step_id_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_test_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_test_util.h new file mode 100644 index 00000000..492097c5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_test_util.h @@ -0,0 +1,109 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_TEST_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_TEST_UTIL_H_ + +#include "tensorflow/core/common_runtime/collective_rma_local.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" + +namespace tensorflow { + +// Wraps CollectiveRemoteAccessLocal with the ability to return an +// error status to the N'th action. +class FailTestRMA : public CollectiveRemoteAccessLocal { + public: + FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver, + int64_t step_id); + + // Sets when it should fail. Setting to zero disables the failure. + void set_fail_after(int fail_after) { + mutex_lock l(mu_); + fail_after_ = fail_after; + } + + void RecvFromPeer(const string& peer_device, const string& peer_task, + bool peer_is_local, const string& key, Device* to_device, + DeviceContext* to_device_ctx, + const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, + const DeviceLocality& client_locality, + int dev_to_dev_stream_index, + CancellationManager* cancellation_manager, + const StatusCallback& done) override; + + void PostToPeer(const string& peer_device, const string& peer_task, + const string& key, Device* from_device, + DeviceContext* from_device_ctx, + const AllocatorAttributes& from_alloc_attr, + const Tensor* from_tensor, + const DeviceLocality& client_locality, + CancellationManager* cancellation_manager, + const StatusCallback& done) override; + + private: + bool MaybeFail(const StatusCallback& done); + + mutex mu_; + int fail_after_ TF_GUARDED_BY(mu_); +}; + +struct CollectiveTestEnv { + int num_workers; + int num_devices_per_worker; + DeviceType device_type; + std::unique_ptr param_resolver; + std::unique_ptr col_exec_mgr; + std::shared_ptr work_queue; + std::unique_ptr device_mgr; + std::unique_ptr device_resolver; + std::unique_ptr nccl_communicator; + core::RefCountPtr col_exec; + FailTestRMA* remote_access; + + CollectiveTestEnv() : device_type(DEVICE_DEFAULT) {} +}; + +std::unique_ptr CreateCollectiveTestEnv( + int num_workers, int num_devices_per_worker, DeviceType device_type, + bool use_nccl = false); + +core::RefCountPtr CreateCollectiveParams( + const CollectiveTestEnv& test_env, int rank, const string& collective_name, + CollectiveType collective_type, DataType dtype, const TensorShape& shape, + const std::vector> user_specified_rank_per_worker = {{}}); + +std::vector GenerateEvenSubdivOffsets(int num_devices_per_worker, + int num_subdivs); + +// Runs a collective. input and output should be on the host. 
+absl::Status RunCollective(CollectiveTestEnv* test_env, + CollectiveParams* col_params, Device* device, + Tensor* input, Tensor* output); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_util.h new file mode 100644 index 00000000..79cd5d50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/collective_util.h @@ -0,0 +1,60 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace collective_util { + +absl::Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr, + const string& device_name, + Device** device, + DeviceLocality* device_locality); +string SubdivPermDebugString(const CollectiveParams& col_params); + +// Used for executing a sub-operation, e.g. a merge_op instance, with +// an OpKernelContext based on the one passed into this Op. +class SubContext { + public: + OpKernelContext::Params sub_params_; + absl::InlinedVector sub_inputs_; + absl::InlinedVector sub_input_attr_; + absl::InlinedVector sub_input_dc_; + // Used only for Binary and Unary Ops for which we require + // the calculation to be in-place on the first input. + int forward_from_ = 0; + std::unique_ptr sub_ctx_; + SubContext(OpKernelContext* ctx, OpKernelContext::Params* params, + OpKernel* op, Tensor* output, Tensor* input); + ~SubContext() = default; +}; + +absl::Status ComputeBinOp(OpKernelContext* op_ctx, + OpKernelContext::Params* params, Device* device, + OpKernel* op, Tensor* output, Tensor* input); + +} // namespace collective_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.h new file mode 100644 index 00000000..b1c1eea6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.h @@ -0,0 +1,138 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
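The test utilities above are typically combined as follows. This is a sketch only; the collective name, type enum value, and tensor shapes are illustrative rather than taken from this header:

auto test_env = tensorflow::CreateCollectiveTestEnv(
    /*num_workers=*/1, /*num_devices_per_worker=*/2, tensorflow::DEVICE_CPU);
auto col_params = tensorflow::CreateCollectiveParams(
    *test_env, /*rank=*/0, "RingReduce", tensorflow::REDUCTION_COLLECTIVE,
    tensorflow::DT_FLOAT, tensorflow::TensorShape({8}));

tensorflow::Tensor input(tensorflow::DT_FLOAT, tensorflow::TensorShape({8}));
tensorflow::Tensor output(tensorflow::DT_FLOAT, tensorflow::TensorShape({8}));
tensorflow::Device* device = test_env->device_mgr->ListDevices()[0];

absl::Status status = tensorflow::RunCollective(
    test_env.get(), col_params.get(), device, &input, &output);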
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATE_PREDECESSOR_TREES_PASS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATE_PREDECESSOR_TREES_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +// TODO(b/344910755): Use the marker in Fill op to find the identity op. This +// makes the heuristic more straightforward. +// Colocate a tree of unplaced nodes with its placed Identity node. Identify a +// dangling tree of ops whose Identify nodes are assigned but rest of ops are +// not assigned. Then it should colocate the rest of the ops. +// +// For example, the graph before pass is: +// +// node { +// name: "const0" +// op: "Const" +// } +// node { +// name: "const1" +// op: "Const" +// } +// node { +// name: "fill0" +// op: "Fill" +// input: "const1" +// input: "const0" +// } +// node { +// name: "id0" +// op: "Identity" +// input: "fill0" +// device: "/job:worker/replica:0/task:2/device:CPU:0" +// } +// node { +// name: "id1" +// op: "Identity" +// input: "fill0" +// device: "/job:worker/replica:0/task:2/device:CPU:0" +// } +// +// The graph after pass is: +// +// node { +// name: "const0" +// op: "Const" +// attr { +// key: "_class" +// value { +// list { +// s: "loc:@id0" +// } +// } +// } +// } +// node { +// name: "const1" +// op: "Const" +// attr { +// key: "_class" +// value { +// list { +// s: "loc:@id0" +// } +// } +// } +// } +// node { +// name: "fill0" +// op: "Fill" +// input: "const1" +// input: "const0" +// attr { +// key: "_class" +// value { +// list { +// s: "loc:@id0" +// } +// } +// } +// } +// node { +// name: "id0" +// op: "Identity" +// input: "fill0" +// device: "/job:worker/replica:0/task:2/device:CPU:0" +// attr { +// key: "_class" +// value { +// list { +// s: "loc:@id0" +// } +// } +// } +// } +// node { +// name: "id1" +// op: "Identity" +// input: "fill0" +// device: "/job:worker/replica:0/task:2/device:CPU:0" +// attr { +// key: "_class" +// value { +// list { +// s: "loc:@id0" +// } +// } +// } +// } + +namespace tensorflow { + +// This pass can place each tree of unassigned nodes with its Identity nodes, +// when the Identity nodes are already assigned to a device. Placement is +// instructed here with the colocation class attribute _class. This is a good +// heuristic because it reduces number of cut edges and tends to load balance. +class ColocatePredecessorTreesPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATE_PREDECESSOR_TREES_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocation_graph.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocation_graph.h new file mode 100644 index 00000000..a31a2aad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/colocation_graph.h @@ -0,0 +1,394 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
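Passes like ColocatePredecessorTreesPass above are normally wired into the runtime through the optimization registry from optimization_registry.h. A sketch of that registration; the macro spelling, grouping, and phase value are assumptions to verify against that header:

namespace tensorflow {

// Assumes the REGISTER_OPTIMIZATION(grouping, phase, pass) macro and the
// grouping enum provided by optimization_registry.h; the phase value is
// illustrative.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 50,
                      ColocatePredecessorTreesPass);

}  // namespace tensorflow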
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ + +#include +#include + +#include "absl/strings/str_join.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/inspecting_placer.h" +#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/port.h" + +namespace tensorflow { + +// Represents a node in the disjoint node forest and the +// accumulated constraints on the device used by that node. +class Member { + public: + Member() = default; + + absl::Status SetParentAndSupportedDevices( + const Node& node, const std::vector& types, + const DeviceNameUtils::ParsedName* local_address_spec); + + const DeviceNameUtils::ParsedName& requested_device_name() const { + return requested_device_name_; + } + + absl::Status SetAssignedDeviceName(const string& device_name); + absl::Status SetResourceDeviceName(const Node& node); + absl::Status SetRequestedDeviceName(const Node& node); + + absl::Status FillPossibleDevices(PossibleDevices* possible_device) const; + + // Returns whether `src_root` is assigned to a CompositeDevice and `this` is + // assigned to a physical device. + bool IsEdgeFromCompositeDeviceToPhysicalDevice(const Member& src_root) const; + + absl::Status EnsureCompatibilityAcrossResourceEdge( + const Node& src, const Member& src_root, + const Node& dst, /*dst_root is this*/ + bool log_device_placement); + + const PrioritizedDeviceTypeVector& supported_device_types() const { + return supported_device_types_; + } + + // If `dry_run` is true, just sets `new_root` and `old_root` and does not + // actually modify anything in the `tree`. + static void Merge(std::vector* tree, int x_root, int y_root, + Member** new_root, Member** old_root, bool dry_run); + + // Returns the root node of the disjoint tree to which the node with the + // given id is connected. + // FindRoot should be called only for debugging or after the members have + // been updated with direct root pointers because it does not update + // root pointers and can traverse many links. It exists to have + // a const version of FindAndUpdateRoot + static int FindRoot(const std::vector& tree, int node_id); + static int FindAndUpdateRoot(std::vector* tree, int node_id); + + absl::Status MergeDeviceNames(const Member& other, bool allow_soft_placement); + + // Updates this to contain the intersection of the device types in + // this and "other". If the intersection is empty, returns false and does + // not update this. Else returns true and updates this. 
+ bool MergeSupportedDevices(const Member& other); + + absl::Status AssignDevice(const Node& node); + + // If user does not explicitly request XLA device and non-XLA device is + // supported for this node, use only the non-XLA device. See b/140896502. + void MaybeExcludeXlaDevices(); + + // Limit the possible devices of this (should be a root) to the device + // specifications in `devices`. + absl::Status LimitToPossibleDevices(const PossibleDevices& devices, + bool allow_soft_placement); + + void set_possible_devices(std::vector&& devices) { + possible_devices_ = devices; + } + const std::vector& possible_devices() { return possible_devices_; } + + // Returns a (parsed) device name that is based on requested_device_name() + // but with potentially cleared device type and ID fields. A field is cleared + // if the assigned_device_name does not specify it. If it does, the field + // is not cleared because soft placement cannot violate assigned device names. + DeviceNameUtils::ParsedName GetSoftDeviceName() const; + + // Same as GetSoftDeviceName but device type and device ID fields are not + // cleared if resource device has them set. + DeviceNameUtils::ParsedName GetPreferredSoftDeviceName() const; + + string DebugString() const; + + bool has_assigned_device_name() const { return assigned_device_name_.has_id; } + + private: + // Updates this to contain the intersection of the device types in + // this and `other_devices`. + bool MergeSupportedDevices(const PrioritizedDeviceTypeVector& other_devices); + + // The id of the node that is the parent of this one, or its own + // id if it is a root. parent <= 0 indicates that this member is invalid. + int parent_ = -1; + + // A proxy for the depth of the tree that is used to prefer + // connecting smaller trees to larger trees when merging disjoint + // sets. + int rank_ = 0; + + // Once colocation groups have been formed, the Placer starts actually + // choosing devices. All nodes in a group must be assigned to the same + // device. Once we assigned the first device to some node in this group, + // we set assigned_device_name_index to this device name's index in the + // graph. + // The `*_device_name_` fields will contain the parsed name of this device + // and `possible_devices`, if computed, will contain just this device. + // `assigned_device_name_index` is an optimization to avoid parsing and + // comparing device names. The value of -1 signals that a single device + // has not been chosen yet. + int assigned_device_name_index_ = -1; + + // The merged form of the device requested for this node, with those of all of + // its children. requested_device_name_ is always kept a specialization (i.e. + // DeviceNameUtils::IsSpecification) of assigned_device_name_. When no device + // is requested, this field is set to assigned_device_name_. As a + // specialization of assigned_device_name_, requested_device_name_ represents + // the most specific form of all assigned and requested devices of this node + // and its children, if this node is a root. requested_device_name_ is used + // to finally select devices for nodes. We can override requested devices due + // to resource colocation constraints but not assigned devices (unless soft + // placement is on). + // INVARIANT: requested_device_name_ is always kept a + // DeviceNameUtils::IsSpecification of assigned_device_name_ and + // resource_device_name_. This makes requested_device_name_ the "accumulation + // of all wishes" about the device. 
+ DeviceNameUtils::ParsedName requested_device_name_; + + // The merged form of the device assigned for this node, with + // those of all of its children. + // This field is used to raise errors due to unsatisfiable constraints. + // Can be a partial specification. + DeviceNameUtils::ParsedName assigned_device_name_; + + // The merged form of the requested resource device assigned for this node, + // with those of all of its children. + // This field is used to raise errors due to unsatisfiable constraints. + // Can be a partial specification. + // resource_device_name_ is initialized with user-requested device on nodes + // producing resources, e.g. VarHandleOp. + // For historical reasons, with soft placement enabled, Placer can "move" + // resources (place resource producing ops on a device different from what + // the user explicitly requested) when the colocation group of a resource + // producing op contains ops that are not supported on the user-requested + // resource device. A classic example of this is a sparse optimizer (only + // supported on CPU) used on a GPU variable. In this case, the whole group + // will be assigned to some device supported by all ops in the colocation + // group. This is a surprising and unfortunate behavior because: + // 1. Since soft_placement is on by default, users don't know that their + // variables are created on a different device than what they requested. + // Among other things, this can lead to surprising poor performance. + // 2. Eager runtime cannot "move" resources. The same code can "work" when + // wrapped in tf.function but will fail when run eagerly. + // 3. Extra complexity here to preserve these resource moving capabilities. + DeviceNameUtils::ParsedName resource_device_name_; + + // The intersection of all device types supported by this node, + // and those of all of its children, in priority order + // of the preferred device. + // It is possible that supported_device_types_ has an empty intersection with + // requested/assigned/resource devices. We could have detected such cases + // as soon as they happen and raise an error. Instead, for historical reasons, + // we leave such error detection to the final device picking stage. + PrioritizedDeviceTypeVector supported_device_types_; + + // If this node is a root, stores a list of Devices to which this node + // and all of its children can be assigned. + // `possible_devices` is empty if they have not yet been computed. + std::vector possible_devices_; +}; + +// This class maintains the connected components of a colocation +// constraint graph, and uses this information to assign a satisfying +// device placement to the nodes of the graph. +// +// This implementation uses the Union-Find algorithm to efficiently maintain the +// connected components and incrementally adds edges via +// ColocationGraph::ColocateNodes() invocations. +// +// ColocationGraph does not assign any devices to graph nodes. The +// `log_device_placement` argument is used to log messages when requested +// device is ignored. +class ColocationGraph { + public: + // graph, flib_def, and device_set must not be null and must outlive + // this ColocationGraph. default_local_device can be null. If not, must + // outlive this. 
+ ColocationGraph(const Graph* graph, const FunctionStack& stack, + const FunctionLibraryDefinition* flib_def, + const DeviceSet* device_set, + const Device* default_local_device, bool allow_soft_placement, + bool log_device_placement); + + absl::Status Initialize(); + + const std::vector& members() const { return members_; } + + // Limit the group containing `node` to the device specifications in + // `devices`. + absl::Status LimitToPossibleDevices(const Node& node, + const PossibleDevices& devices); + + // Limits the possible devices of `node`'s colocation group to the device + // to which `node` is assigned. This makes sure that all nodes in this + // colocation group will be assigned to the same device. Without this + // explicit restriction, heuristics can choose a different possible device + // for other nodes in the group. + absl::Status LimitToAssignedDevice(const Node& node); + + // Returns the root node of the disjoint tree to which the node with the + // given id is connected. + // Updates the internal pointers so that future calls will returns faster. + int FindAndUpdateRoot(int node_id) { + return Member::FindAndUpdateRoot(&members_, node_id); + } + + // For the given node, subject to the constraints previously given + // to this ColocationGraph, set its assigned_device_name. Returns OK + // if a satisfying device can be found, otherwise an error. + // + // Note: This method returns a pointer to a field within members_. + // The caller must not use the returned pointer after there is any possibility + // that the members_[i].possible_devices field has been modified. + absl::Status GetDevicesForNode(Node* node, + const std::vector** possible_devices); + + // Returns debugging info for the node referred to by 'node_root'. + string DebugInfo(const int node_root) const; + + string DebugString() const; + + // Returns a list of devices having type in supported_device_types. The + // returned list is sorted by preferred type (higher numeric type is + // preferred). + static std::vector FilterSupportedDevices( + const std::vector& devices, + const PrioritizedDeviceTypeVector& supported_device_types, + const Device* default_local_device); + + private: + // Adds each node of the Graph to this ColocationGraph as a singleton. + // + // NOTE: The implementation assumes that the ids of nodes passed to + // this method are dense and zero-based; the memory used will be linear in + // the largest node ID. + // NOTE: If this method returns an error, *this is left in an undefined + // state. + absl::Status ColocateAllNodes(); + + absl::Status ColocateResourceOrRefEdge(const Node* src, const Node* dst); + + // Adds colocation constraints to data types known not to support copying. + absl::Status ColocateUncopiableTypeEdges( + std::unordered_set* inspection_required); + + // Updates this ColocationGraph by making sure that all nodes + // touching resource and/or ref tensors are colocated. + // As it iterates over the edges, fills the `inspection_required` set with + // the nodes that + // PlacerInspectionRequiredOpChecker::IsPlacerInspectionRequired + // deems as requiring deep inspection by placer. This is an optimization. + // TODO(mdan): Deprecate in favor of ColocateUncopiableTypeEdges. + absl::Status ColocateResourceAndRefEdges( + std::unordered_set* inspection_required); + + // Updates this ColocationGraph by making sure that all nodes having inputs of + // a DT_VARIANT data type with a host-only underlying types (e.g. strings) can + // be placed only on CPU device. 
We do that by reverse-DFS traversal from all + // nodes that take variant inputs to the node that produces that variant. + // TODO(ezhulenev): This function does not yet support "deep op" inspection, + // that we have for DT_RESOURCE edges. + absl::Status AddHostOnlyDataTypesConstraints(); + + absl::Status AddInspectionConstraints( + const std::unordered_set& inspection_required); + + // Applies colocation groups for `node`'s inputs and outputs to this + // ColocationGraph. + // `groups` are the colocation groups to which `nodes`'s inputs and outputs + // belong. + // `node` is a node requiring deep inspection (e.g. a node calling + // a function) + // + // For example, consider a `node` taking two inputs and producing one output + // a b + // | | + // v v + // node + // | + // v + // c + // + // `groups` can tell us that `a` and `c` must be colocated and their device + // must be a GPU. `b` might be in a group by itself without any device + // restrictions. + // + // ApplyIOColocationGroups will have an effect of calling + // ColocateNodes(a, c) and LimitToPossibleDevices(`a`, "GPU"). The colocation + // group of the `node` itself is not directly impacted. + // + absl::Status ApplyIOColocationGroups(const IOColocationGroups& groups, + const Node& node); + + absl::Status ColocateNodeToGroup( + std::unordered_map* + colocation_group_root, + const Node* node, absl::string_view colocation_group); + + // Merge the (possibly disjoint) sets containing nodes "x" and + // "y". Returns OK if the all nodes in the union of these sets can + // be placed on the same device type. + // + // If this method returns an error, *this is unchanged. + absl::Status ColocateNodes(const Node& x, const Node& y); + + // This overload of ColocateNodes() allows a caller to provide the root node + // ids for the two nodes. For large graphs, this noticeably reduces the + // graph load time. + // If this method returns an error, *this is unchanged. + absl::Status ColocateNodes(const Node& x, int x_root, const Node& y, + int y_root); + + void GetSoftDeviceCandidates(const Node& node, const Member& root_member, + int root_id, + std::vector* possible_devices); + + absl::Status InitializeMembers(); + + absl::Status InitializeMemberWithAssignedDevice( + const string& assigned_device_name, const string& node_type, + Member* member); + + absl::Status InitializeMember(const Node& node, Member* member); + + // Returns the root node of the disjoint tree to which the node with the + // given id is connected. + // FindRoot should be called only for debugging or after the members have + // been updated with direct root pointers because it does not update + // root pointers and can traverse many links. 
It exists to have + // a const version of FindAndUpdateRoot + int FindRoot(int node_id) const { + return Member::FindRoot(members_, node_id); + } + + const Graph& graph_; + const FunctionStack stack_; + std::vector members_; + InspectingPlacer inspecting_placer_; + PlacerInspectionRequiredOpChecker inspection_required_checker_; + const DeviceSet& device_set_; + const std::vector device_types_; + const DeviceNameUtils::ParsedName local_address_spec_; + const Device* default_local_device_; + const bool allow_soft_placement_; + const bool log_device_placement_; + + ColocationGraph(const ColocationGraph&) = delete; + void operator=(const ColocationGraph&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/composite_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/composite_device.h new file mode 100644 index 00000000..6e79542a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/composite_device.h @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COMPOSITE_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COMPOSITE_DEVICE_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +extern const char* const kCompositeDeviceType; + +// A virtual device which represents a set of devices. We don't execute any +// op on this virtial device. +class CompositeDevice : public Device { + public: + absl::Status Sync() override { + return errors::Internal( + "Sync() should never been invoked on CompositeDevice."); + } + + Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } + + const std::vector* underlying_devices() const { + return &underlying_devices_; + } + + // Helper for creating a CompositeDevice on the same task as the given host + // CPU. + static std::unique_ptr MakeDevice( + const std::vector& underlying_devices, const int unique_device_id, + const DeviceNameUtils::ParsedName& host_name, absl::Status* status); + + // Helper for creating a CompositeDevice with the given device name. 
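The parent/rank fields of Member above describe a classic disjoint-set forest. The following standalone sketch (not TensorFlow code) shows the find-with-path-compression and union-by-rank scheme that FindAndUpdateRoot and Merge are documented as using:

#include <utility>
#include <vector>

struct DisjointSet {
  std::vector<int> parent;
  std::vector<int> rank;
  explicit DisjointSet(int n) : parent(n), rank(n, 0) {
    for (int i = 0; i < n; ++i) parent[i] = i;
  }
  int Find(int x) {
    // Path compression: point every visited node directly at the root,
    // mirroring what FindAndUpdateRoot does for Member entries.
    if (parent[x] != x) parent[x] = Find(parent[x]);
    return parent[x];
  }
  void Merge(int x, int y) {
    x = Find(x);
    y = Find(y);
    if (x == y) return;
    if (rank[x] < rank[y]) std::swap(x, y);  // union by rank
    parent[y] = x;
    if (rank[x] == rank[y]) ++rank[x];
  }
};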
+ static std::unique_ptr MakeDevice( + const std::vector& underlying_devices, const string& device_name, + absl::Status* status); + + bool IsRemoteCallAllowed() const override { return false; } + + private: + CompositeDevice(const DeviceAttributes& device_attributes, + const std::vector& underlying_devices) + : Device(/*env=*/nullptr, device_attributes), + underlying_devices_(underlying_devices) {} + + const std::vector underlying_devices_; + + CompositeDevice(const CompositeDevice&) = delete; + void operator=(const CompositeDevice&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COMPOSITE_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/constant_folding.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/constant_folding.h new file mode 100644 index 00000000..fd74a554 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/constant_folding.h @@ -0,0 +1,70 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" + +// TODO(skyewm): can this be combined with EvaluateConstantTensor? + +namespace tensorflow { + +// This generator type is used to generate a name for the newly folded node +// based on the node's old name. +using ConstantFoldNameGenerator = + std::function; + +// Options specific to constant folding optimizations. +struct ConstantFoldingOptions { + // If "consider" is not a nullptr, then only constant fold a node "n" if + // consider(n) returns true. + std::function consider = nullptr; + // If shape_map is not a nullptr, it is a map from node n to a + // vector of the (potentially partially-known) shapes of its + // outputs. + const std::unordered_map>* shape_map = + nullptr; // not owned + // The maximum size of each constant created during constant folding + // optimization. + int64_t max_constant_size_in_bytes = 10 * 1024 * 1024; + + // A generator for the name suffix of constant folded nodes. A + // default id generator that monotonically increases is used if nullptr is + // passed. + ConstantFoldNameGenerator generate_new_name = nullptr; +}; + +// Perform constant folding optimization on "graph". +// Looks for nodes in "graph" that can be completely evaluated statically, i.e., +// that are only dependent on constants. Evaluates those nodes on a CPU device +// and replaces those nodes with the result of the evaluation. +// "partition_device", if non-null, is the device where all the graph nodes are +// assumed to execute. +// Sets `was_mutated` to true if and only if "graph" has been mutated. 
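A minimal sketch of building a CompositeDevice over two local CPU devices with the second MakeDevice overload above; the device names are made up:

std::vector<std::string> underlying = {
    "/job:localhost/replica:0/task:0/device:CPU:0",
    "/job:localhost/replica:0/task:0/device:CPU:1"};
absl::Status status;
std::unique_ptr<tensorflow::CompositeDevice> composite =
    tensorflow::CompositeDevice::MakeDevice(
        underlying, "/job:localhost/replica:0/task:0/device:COMPOSITE:0",
        &status);
if (!status.ok()) {
  // Creation failed; `composite` should not be used.
}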
+// The status is only set to a non-OK state if an unexpected error is hit +// running the graph. +absl::Status ConstantFold(const ConstantFoldingOptions& opts, + FunctionLibraryRuntime* function_library, Env* env, + const Device* partition_device, Graph* graph, + bool* was_mutated); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/copy_tensor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/copy_tensor.h new file mode 100644 index 00000000..0f621603 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/copy_tensor.h @@ -0,0 +1,80 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class CopyTensor { + public: + typedef void (*CopyFunction)( + DeviceContext* send_dev_context, DeviceContext* recv_dev_context, + Device* src, Device* dst, const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, const Tensor* input, + Tensor* output, int dev_to_dev_stream_index, StatusCallback done); + + // Copies "input" to "output" between devices accessible to the + // local process via some DMA-like method. "edge_name" is the name + // of the tensor being copied, for debugging purposes. Depending on + // the type of devices and memory in use, the copy may be performed + // synchronously or asynchronously. 'done' will be invoked only + // after the copy is actually complete. + static void ViaDMA(absl::string_view edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, Device* dst, + const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + int dev_to_dev_stream_index, StatusCallback done, + bool sync_dst_compute = true); + + // Object used to call Register() at static-initialization time. + // Note: This should only ever be used as a global-static object; no stack + // or heap instances. + class Registration { + public: + Registration(DeviceType sender_device_type, DeviceType receiver_device_type, + CopyFunction copy_function) { + TF_QCHECK_OK(Register(sender_device_type, receiver_device_type, + copy_function, /*is_pluggable_device=*/false)); + } + }; + + // Register a function for copying between two specific DeviceTypes. 
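A sketch of calling ConstantFold with a restricted `consider` predicate; `flr`, `env`, and `graph` are assumed to exist in the caller and are not defined here:

tensorflow::ConstantFoldingOptions opts;
opts.consider = [](const tensorflow::Node* n) {
  // Fold only shape-introspection nodes in this example.
  return n->type_string() == "Shape" || n->type_string() == "Size" ||
         n->type_string() == "Rank";
};
opts.max_constant_size_in_bytes = 1 << 20;  // cap folded constants at 1 MiB

bool was_mutated = false;
absl::Status s = tensorflow::ConstantFold(
    opts, flr, env, /*partition_device=*/nullptr, graph, &was_mutated);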
+ // Note: This should only be called via the constructor of + // CopyTensor::Registration or from PluggableDevice implementation. + static absl::Status Register(DeviceType sender_device_type, + DeviceType receiver_device_type, + CopyFunction copy_function, + bool is_pluggable_device); +}; + +void CopyDeviceToHost(const Tensor* input, Allocator* cpu_allocator, + Allocator* out_allocator, absl::string_view edge_name, + Device* src, Tensor* output, + DeviceContext* send_dev_context, StatusCallback done); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_constants.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_constants.h new file mode 100644 index 00000000..df01bf53 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_constants.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COST_CONSTANTS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COST_CONSTANTS_H_ + +namespace tensorflow { + +// Types of per-request cost. +inline constexpr char kGpuCostName[] = "gpu"; +inline constexpr char kTpuCostName[] = "tpu"; +inline constexpr char kGcuCostName[] = "gcu"; +inline constexpr char kNoOpCostName[] = "no_op"; + +// Each type of per-request cost could have the following versions. +// +// A server may have costs that cannot be directly attributed to a specific +// query. Each request will be assigned a portion of it, and the cost ends with +// '_with_smear" includes this part. +inline constexpr char kWithSmearSuffix[] = "_with_smear"; +inline constexpr char kNoSmearSuffix[] = "_no_smear"; +inline constexpr char kNonBatchingSuffix[] = "_non_batching"; + +// Full names of per-request cost. 
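A sketch of the static registration pattern described above for a hypothetical device type "MYDEV"; the copy function body is a placeholder rather than a real DMA implementation:

void MyDevToCpuCopy(tensorflow::DeviceContext* send_dev_context,
                    tensorflow::DeviceContext* recv_dev_context,
                    tensorflow::Device* src, tensorflow::Device* dst,
                    const tensorflow::AllocatorAttributes src_alloc_attr,
                    const tensorflow::AllocatorAttributes dst_alloc_attr,
                    const tensorflow::Tensor* input, tensorflow::Tensor* output,
                    int dev_to_dev_stream_index,
                    tensorflow::StatusCallback done) {
  *output = *input;  // placeholder: a real implementation issues a device copy
  done(absl::OkStatus());
}

// Global static object so registration runs at static-initialization time.
static tensorflow::CopyTensor::Registration my_dev_copy_registration(
    tensorflow::DeviceType("MYDEV"),
    tensorflow::DeviceType(tensorflow::DEVICE_CPU), MyDevToCpuCopy);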
+inline constexpr char kTpuWithSmearCostName[] = "tpu_with_smear"; +inline constexpr char kTpuNoSmearCostName[] = "tpu_no_smear"; +inline constexpr char kTpuDecodeWithSmearCostName[] = "tpu_decode_with_smear"; +inline constexpr char kTpuDecodeNoSmearCostName[] = "tpu_decode_no_smear"; +inline constexpr char kTpuPrefillWithSmearCostName[] = "tpu_prefill_with_smear"; +inline constexpr char kTpuPrefillNoSmearCostName[] = "tpu_prefill_no_smear"; +inline constexpr char kTpuNonBatchingCostName[] = "tpu_non_batching"; +inline constexpr char kGpuWithSmearCostName[] = "gpu_with_smear"; +inline constexpr char kGpuNoSmearCostName[] = "gpu_no_smear"; +inline constexpr char kGpuDecodeWithSmearCostName[] = "gpu_decode_with_smear"; +inline constexpr char kGpuDecodeNoSmearCostName[] = "gpu_decode_no_smear"; +inline constexpr char kGpuPrefillWithSmearCostName[] = "gpu_prefill_with_smear"; +inline constexpr char kGpuPrefillNoSmearCostName[] = "gpu_prefill_no_smear"; +inline constexpr char kGpuNonBatchingCostName[] = "gpu_non_batching"; +inline constexpr char kGcuWithSmearCostName[] = "gcu_with_smear"; +inline constexpr char kGcuNoSmearCostName[] = "gcu_no_smear"; +inline constexpr char kGcuNonBatchingCostName[] = "gcu_non_batching"; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COST_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement.h new file mode 100644 index 00000000..3da322e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_H_ + +#include "absl/strings/string_view.h" +#include "absl/time/time.h" + +namespace tensorflow { + +// An interface for cost measurement. +class CostMeasurement { + public: + // Context of the CostMeasurement. + struct Context { + // Whether this CostMeasurement is running within a per-query context (e.g. + // rpc handler) or not (e.g. batching). + bool is_per_query = false; + }; + + explicit CostMeasurement(const Context& context) {} + + virtual ~CostMeasurement() {} + + virtual absl::Duration GetTotalCost() = 0; + + virtual absl::string_view GetCostType() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement_registry.h new file mode 100644 index 00000000..b2f17273 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_measurement_registry.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
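The full names above are simply the base cost type concatenated with one of the suffixes, e.g. (illustrative check, not part of the header):

#include "absl/strings/str_cat.h"

std::string tpu_no_smear =
    absl::StrCat(tensorflow::kTpuCostName, tensorflow::kNoSmearSuffix);
// tpu_no_smear == "tpu_no_smear" == tensorflow::kTpuNoSmearCostName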
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_REGISTRY_H_ + +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/cost_measurement.h" + +namespace tensorflow { + +// CostMeasurementRegistry allows to +// - register a CostMeasurement type to the global map +// - create an instance of registered CostMeasurement. +class CostMeasurementRegistry { + public: + // Creates an instance of registered CostMeasurement by name. If the named + // CostMeasurement is not registered yet, returns nullptr. Any returned + // std::unique_ptr should not be moved. + // TODO(b/185852990): create a non-moveable wrapper class for the returned + // unique_ptr. + static std::unique_ptr CreateByNameOrNull( + const std::string& name, const CostMeasurement::Context& context); + + using Creator = std::function( + const CostMeasurement::Context&)>; + + // Registers a CostMeasurement type to the global map. Registering different + // types of CostMeasurement with the same name is prohibited. + static void RegisterCostMeasurement(absl::string_view name, Creator creator); +}; + +// Registers a CostMeasurement type to the global map. Registering different +// types of CostMeasurement with the same name is prohibited. +class CostMeasurementRegistrar { + public: + explicit CostMeasurementRegistrar(absl::string_view name, + CostMeasurementRegistry::Creator creator) { + CostMeasurementRegistry::RegisterCostMeasurement(name, std::move(creator)); + } +}; + +#define REGISTER_COST_MEASUREMENT(name, MyCostMeasurementClass) \ + namespace { \ + static ::tensorflow::CostMeasurementRegistrar \ + MyCostMeasurementClass##_registrar( \ + (name), [](const CostMeasurement::Context& context) { \ + return std::make_unique(context); \ + }); \ + } // namespace + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COST_MEASUREMENT_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_util.h new file mode 100644 index 00000000..aa1102c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/cost_util.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
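A hedged sketch of defining, registering, and looking up a measurement with the registry above; the class and the "no_op" name are illustrative:

namespace tensorflow {

class NoOpCostMeasurement : public CostMeasurement {
 public:
  using CostMeasurement::CostMeasurement;
  absl::Duration GetTotalCost() override { return absl::ZeroDuration(); }
  absl::string_view GetCostType() const override { return "no_op"; }
};

REGISTER_COST_MEASUREMENT("no_op", NoOpCostMeasurement);

}  // namespace tensorflow

// Elsewhere, e.g. in a request handler:
// auto m = tensorflow::CostMeasurementRegistry::CreateByNameOrNull(
//     "no_op", tensorflow::CostMeasurement::Context{/*is_per_query=*/true});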
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COST_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COST_UTIL_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/cost_measurement.h" +#include "tensorflow/core/common_runtime/request_cost_accessor.h" + +namespace tensorflow { + +// Creates instances of CostMeasurement. The types to create are determined by +// env. +std::vector> CreateCostMeasurements( + const CostMeasurement::Context& context); + +// Creates an instance of RequestCostAccessor. The type to create is determined +// by env. Returns nullptr if the type is not specified in env, or the type of +// CostMeasurement is unregistered.. +std::unique_ptr CreateRequestCostAccessor(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/costmodel_manager.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/costmodel_manager.h new file mode 100644 index 00000000..8ea8a137 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/costmodel_manager.h @@ -0,0 +1,55 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COSTMODEL_MANAGER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COSTMODEL_MANAGER_H_ + +#include + +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/graph/costmodel.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/iterator_range.h" + +namespace tensorflow { + +// Used to manage all the cost models for a session. +class CostModelManager { + public: + ~CostModelManager(); + + typedef std::unordered_map CostModelMap; + typedef CostModelMap::iterator CostModelMapIter; + + void ExportCostModels(CostModelMap* cost_models) { + mutex_lock l(mu_); + *cost_models = cost_models_; + } + + CostModel* FindOrCreateCostModel(const Graph* graph); + + bool RemoveCostModelForGraph(const Graph* graph); + + absl::Status AddToCostGraphDef(const Graph* graph, CostGraphDef* cost_graph); + + private: + mutex mu_; + CostModelMap cost_models_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COSTMODEL_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/debugger_state_interface.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/debugger_state_interface.h new file mode 100644 index 00000000..1b9f190e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/debugger_state_interface.h @@ -0,0 +1,123 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
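A sketch of the intended call pattern for the cost_util helpers above: create whatever measurements the environment enables, run the request, then read the costs back:

tensorflow::CostMeasurement::Context ctx{/*is_per_query=*/true};
std::vector<std::unique_ptr<tensorflow::CostMeasurement>> measurements =
    tensorflow::CreateCostMeasurements(ctx);

// ... handle the request ...

for (const auto& m : measurements) {
  absl::Duration cost = m->GetTotalCost();
  absl::string_view cost_type = m->GetCostType();
  // e.g. record `cost` under `cost_type` via a RequestCostAccessor, if one
  // was created by CreateRequestCostAccessor().
}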
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_ + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/protobuf/debug.pb.h" + +namespace tensorflow { + +// Returns a summary string for the list of debug tensor watches. +const string SummarizeDebugTensorWatches( + const protobuf::RepeatedPtrField& watches); + +// An abstract interface for storing and retrieving debugging information. +class DebuggerStateInterface { + public: + virtual ~DebuggerStateInterface() {} + + // Publish metadata about the debugged Session::Run() call. + // + // Args: + // global_step: A global step count supplied by the caller of + // Session::Run(). + // session_run_index: A chronologically sorted index for calls to the Run() + // method of the Session object. + // executor_step_index: A chronologically sorted index of invocations of the + // executor charged to serve this Session::Run() call. + // input_names: Name of the input Tensors (feed keys). + // output_names: Names of the fetched Tensors. + // target_names: Names of the target nodes. + virtual absl::Status PublishDebugMetadata( + const int64_t global_step, const int64_t session_run_index, + const int64_t executor_step_index, const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes) = 0; +}; + +class DebugGraphDecoratorInterface { + public: + virtual ~DebugGraphDecoratorInterface() {} + + // Insert special-purpose debug nodes to graph and dump the graph for + // record. See the documentation of DebugNodeInserter::InsertNodes() for + // details. + virtual absl::Status DecorateGraph(Graph* graph, Device* device) = 0; + + // Publish Graph to debug URLs. + virtual absl::Status PublishGraph(const Graph& graph, + const string& device_name) = 0; +}; + +typedef std::function( + const DebugOptions& options)> + DebuggerStateFactory; + +// Contains only static methods for registering DebuggerStateFactory. +// We don't expect to create any instances of this class. +// Call DebuggerStateRegistry::RegisterFactory() at initialization time to +// define a global factory that creates instances of DebuggerState, then call +// DebuggerStateRegistry::CreateState() to create a single instance. +class DebuggerStateRegistry { + public: + // Registers a function that creates a concrete DebuggerStateInterface + // implementation based on DebugOptions. + static void RegisterFactory(const DebuggerStateFactory& factory); + + // If RegisterFactory() has been called, creates and supplies a concrete + // DebuggerStateInterface implementation using the registered factory, + // owned by the caller and return an OK Status. Otherwise returns an error + // Status. 
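  //
  // Illustrative registration/creation sketch (not part of the vendored
  // header; `MyDebuggerState` and the factory lambda are assumed names, and
  // the DebuggerStateInterface subclass is defined elsewhere):
  //
  //   DebuggerStateRegistry::RegisterFactory(
  //       [](const DebugOptions& options) {
  //         return std::unique_ptr<DebuggerStateInterface>(
  //             new MyDebuggerState(options));
  //       });
  //
  //   std::unique_ptr<DebuggerStateInterface> state;
  //   absl::Status s =
  //       DebuggerStateRegistry::CreateState(debug_options, &state);
  //   // `s` is an error if RegisterFactory() was never called.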
+ static absl::Status CreateState( + const DebugOptions& debug_options, + std::unique_ptr* state); + + private: + static DebuggerStateFactory* factory_; + + DebuggerStateRegistry(const DebuggerStateRegistry&) = delete; + void operator=(const DebuggerStateRegistry&) = delete; +}; + +typedef std::function( + const DebugOptions& options)> + DebugGraphDecoratorFactory; + +class DebugGraphDecoratorRegistry { + public: + static void RegisterFactory(const DebugGraphDecoratorFactory& factory); + + static absl::Status CreateDecorator( + const DebugOptions& options, + std::unique_ptr* decorator); + + private: + static DebugGraphDecoratorFactory* factory_; + + DebugGraphDecoratorRegistry(const DebugGraphDecoratorRegistry&) = delete; + void operator=(const DebugGraphDecoratorRegistry&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device.h new file mode 100644 index 00000000..83785e33 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device.h @@ -0,0 +1,20 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_ + +#include "tensorflow/core/framework/device.h" + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_event_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_event_mgr.h new file mode 100644 index 00000000..7725a941 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_event_mgr.h @@ -0,0 +1,160 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_EVENT_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_EVENT_MGR_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// TODO(annarev): Check if we can use a more general option representation here +// that could work for other device types as well. +class GPUOptions; + +// The callback provided to EventMgr::ThenExecute must not block or take a long +// time. If it does, performance may be impacted and device memory may be +// exhausted. This macro is for checking that an EventMgr thread is not +// accidentally entering blocking parts of the code, e.g. the RPC subsystem. +// +// Intended use is something like +// +// void RespondToAnRPC(Params* params) { +// WARN_IF_IN_EVENT_MGR_THREAD; +// if (params->status.ok()) { ... +// +namespace device_event_mgr { +// Logs a stack trace if current execution thread belongs to this EventMgr +// object. If f is not nullptr, executes instead of logging the stack trace. +// trace. +void WarnIfInCallback(std::function f); +} // namespace device_event_mgr +#define WARN_IF_IN_EVENT_MGR_THREAD \ + ::tensorflow::device_event_mgr::WarnIfInCallback(nullptr) + +// EventMgr lets you register a callback to be executed when a given +// StreamExecutor stream completes all the work that's thus-far been enqueued on +// the stream. +class EventMgr { + public: + virtual ~EventMgr(); + + // Execute `func` when all pending stream actions have completed. func must + // be brief and non-blocking since it executes in the one thread used for all + // such callbacks and also buffer deletions. + void ThenExecute(se::Stream* stream, std::function func) { + ToFreeVector to_free; + { + mutex_lock l(mu_); + EnqueueCallback(stream, std::move(func)); + PollEvents(stream, &to_free); + } + FreeMemory(to_free); + } + + private: + friend class TEST_EventMgr; + friend class TEST_EventMgrHelper; + friend class EventMgrFactory; + + se::StreamExecutor* const exec_; + const int32 polling_active_delay_usecs_; + mutex mu_; + condition_variable events_pending_ TF_GUARDED_BY(mu_); + + struct InUse { + se::Event* event; + std::function func; + }; + + typedef absl::InlinedVector ToFreeVector; + + EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options); + + void FreeMemory(const ToFreeVector& to_free) { + for (const auto& iu : to_free) { + // The function must be called in another thread. + if (iu.func != nullptr) threadpool_.Schedule(iu.func); + } + } + + // Set up `func` to be called once `stream` completes all its outstanding + // work. + void EnqueueCallback(se::Stream* stream, std::function func) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // This function should be called at roughly the same tempo as QueueTensors() + // to check whether pending events have recorded, and then retire them. + // + // If `stream` is not null, we only poll events for that stream. Otherwise we + // poll events for all streams. 
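  //
  // Caller-side sketch (illustrative, not part of the vendored header): the
  // callbacks retired here were registered through ThenExecute(), typically
  // right after work is enqueued on a stream. `executor`, `gpu_options` and
  // `stream` are assumed to exist in the caller:
  //
  //   EventMgr* em =
  //       EventMgrFactory::Singleton()->GetEventMgr(executor, gpu_options);
  //   em->ThenExecute(stream, []() {
  //     // Must be brief and non-blocking; runs on the EventMgr thread.
  //     VLOG(1) << "stream work completed";
  //   });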
+ void PollEvents(se::Stream* stream, ToFreeVector* to_free) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // An internal polling loop that runs at a low frequency to clear straggler + // Events. + void PollLoop(); + + // Setup/Teardown functions for the polling loop. + void StartPollingLoop(); + void StopPollingLoop(); + + // A stack of unused events + std::vector> free_events_ TF_GUARDED_BY(mu_); + + // Callbacks waiting on their events to complete. + absl::flat_hash_map< + se::Stream*, + std::deque, std::function>>> + callbacks_ TF_GUARDED_BY(mu_); + + bool stop_polling_ TF_GUARDED_BY(mu_); + std::unique_ptr polling_stopped_; + + // The main PollLoop for the event manager runs in this threadpool. + thread::ThreadPool threadpool_; +}; + +// Manages all the EventMgr instances. +class EventMgrFactory { + public: + static EventMgrFactory* Singleton(); + + EventMgr* GetEventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options); + + private: + mutex mu_; + + // Maintain one EventMgr per physical device (StreamExecutor is + // per-physical-device). + absl::flat_hash_map event_mgr_map_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_EVENT_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_host_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_host_allocator.h new file mode 100644 index 00000000..0ed688fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_host_allocator.h @@ -0,0 +1,28 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_HOST_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_HOST_ALLOCATOR_H_ + +#include "xla/stream_executor/integrations/device_host_allocator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using stream_executor::DeviceHostAllocator; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_HOST_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id.h new file mode 100644 index 00000000..d64a83cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id.h @@ -0,0 +1,91 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_H_ + +#include "xla/tsl/framework/device_id.h" +#include "tensorflow/core/lib/gtl/int_type.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// There are three types of device ids: +// - *physical* device id: this is the integer index of a device in the +// physical machine, it can be filtered (for e.g. using environment variable +// CUDA_VISIBLE_DEVICES when using CUDA). Note that this id is not visible to +// Tensorflow, but result after filtering is visible to TF and is called +// platform device id as below. +// For CUDA, see +// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars +// for more details. +// - *platform* device id (also called *visible* device id in +// third_party/tensorflow/core/protobuf/config.proto): this is the id that is +// visible to Tensorflow after filtering (for e.g. by CUDA_VISIBLE_DEVICES). +// For CUDA, this id is generated by the CUDA GPU driver. It starts from 0 +// and is used for CUDA API calls like cuDeviceGet(). +// - TF device id (also called *virtual* device id in +// third_party/tensorflow/core/protobuf/config.proto): this is the id that +// Tensorflow generates and exposes to its users. It is the id in the +// field of the device name "/device:GPU:", and is also the identifier of +// a BaseGPUDevice. Note that the configuration allows us to create multiple +// BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the +// hardware, so the mapping between TF GPU id and platform GPU id is not a 1:1 +// mapping, see the example below. +// +// For example, assuming that in the machine we have GPU device with index 0, 1, +// 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create +// the following mapping between platform GPU id and physical GPU id: +// +// platform GPU id -> physical GPU id +// 0 -> 1 +// 1 -> 2 +// 2 -> 3 +// +// Note that physical GPU id 0 is invisible to TF so there is no mapping entry +// for it. +// +// Assuming we configure the Session to create one BaseGPUDevice per GPU +// hardware, then setting GPUOptions::visible_device_list to "2,0" will create +// the following mapping between TF device id and platform device id: +// +// TF GPU id -> platform GPU ID +// 0 (i.e. /device:GPU:0) -> 2 +// 1 (i.e. /device:GPU:1) -> 0 +// +// Note that platform device id 1 is filtered out by +// GPUOptions::visible_device_list, so it won't be used by the TF process. +// +// On the other hand, if we configure it to create 2 BaseGPUDevice per GPU +// hardware, then setting GPUOptions::visible_device_list to "2,0" will create +// the following mapping between TF device id and platform device id: +// +// TF GPU id -> platform GPU ID +// 0 (i.e. /device:GPU:0) -> 2 +// 1 (i.e. /device:GPU:1) -> 2 +// 2 (i.e. /device:GPU:2) -> 0 +// 3 (i.e. 
/device:GPU:3) -> 0 +// +// We create strong-typed integer classes for both TF device id and platform +// device id to minimize programming errors and improve code readability. Except +// for the StreamExecutor interface (as we don't change its API), whenever we +// need a TF device id (or platform device id) we should use TfDeviceId (or +// PlatformDeviceId) instead of a raw integer. +using tsl::PlatformDeviceId; // NOLINT +using tsl::TfDeviceId; // NOLINT + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id_manager.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id_manager.h new file mode 100644 index 00000000..058e94fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_id_manager.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_MANAGER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_MANAGER_H_ + +#include "xla/tsl/framework/device_id_manager.h" +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +using tsl::DeviceIdManager; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_ID_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_mem_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_mem_allocator.h new file mode 100644 index 00000000..44e516b9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_mem_allocator.h @@ -0,0 +1,28 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_MEM_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_MEM_ALLOCATOR_H_ + +#include "xla/stream_executor/integrations/device_mem_allocator.h" +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using stream_executor::DeviceMemAllocator; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_MEM_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_utils.h new file mode 100644 index 00000000..5447c729 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device/device_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_UTILS_H_ + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace device_utils { + +// Validate device type. Device type must start with a capital letter and +// consist of capital letters and underscores. Reasoning behind this decision: +// * At the minimum we want to disallow '/' and ':' since +// these characters are used in device spec, for e.g. +// /job:foo/replica:12/device:GPU:1. +// * Underscores seem useful, for e.g. XLA_GPU uses underscores. +// * Allowing lowercase might get confusing. For example, say someone +// registers a new type called "Gpu". It might be confusing for users that +// "Gpu" is not the same device type as "GPU". +// Note that lowercase "cpu" and "gpu" are currently supported only for +// legacy reasons: +// https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/python/framework/device_spec.py;l=46;drc=d3a378f9665d8eee827c74cb9ecbee81e4c288dd +absl::Status ValidateDeviceType(absl::string_view type); + +} // namespace device_utils +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_DEVICE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_factory.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_factory.h new file mode 100644 index 00000000..1b5a6626 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_factory.h @@ -0,0 +1,20 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_ + +#include "tensorflow/core/framework/device_factory.h" + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_id_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_id_utils.h new file mode 100644 index 00000000..f0cab86b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_id_utils.h @@ -0,0 +1,42 @@ + +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_ID_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_ID_UTILS_H_ + +#include "xla/stream_executor/platform.h" +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/framework/device_id.h" +#include "xla/tsl/framework/device_id_manager.h" + +namespace tensorflow { + +// Utility method for getting the associated executor given a TfDeviceId. +class DeviceIdUtil { + public: + static absl::StatusOr ExecutorForTfDeviceId( + const tsl::DeviceType& type, stream_executor::Platform* device_manager, + tsl::TfDeviceId tf_device_id) { + tsl::PlatformDeviceId platform_device_id; + TF_RETURN_IF_ERROR(tsl::DeviceIdManager::TfToPlatformDeviceId( + type, tf_device_id, &platform_device_id)); + return device_manager->ExecutorForDevice(platform_device_id.value()); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_ID_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_mgr.h new file mode 100644 index 00000000..3e0abb14 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_mgr.h @@ -0,0 +1,180 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/lib/core/arena.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class DeviceAttributes; + +// Represents a set of devices. +class DeviceMgr { + public: + DeviceMgr() = default; + virtual ~DeviceMgr(); + + // Returns attributes of all devices. + virtual void ListDeviceAttributes( + std::vector* devices) const = 0; + + // Returns raw pointers to the underlying devices. + virtual std::vector ListDevices() const = 0; + + // Returns a string listing all devices. + virtual string DebugString() const = 0; + + // Returns a string of all the device mapping. + virtual string DeviceMappingString() const = 0; + + // Assigns *device with pointer to Device of the given name. + // Accepts either a full device name, or just the replica-local suffix. + virtual absl::Status LookupDevice(absl::string_view name, + Device** device) const = 0; + + // Check if the current device manager contains device with the given + // incarnation ID. Looking up by incarnation IDs because they are randomly + // generated and not intentionally reused (unlike device pointers). + virtual bool ContainsDevice(int64_t device_incarnation) const = 0; + + // Clears given containers of all devices if 'container' is + // non-empty. Otherwise, clears default containers of all devices. + virtual void ClearContainers(absl::Span containers) const = 0; + + virtual int NumDeviceType(const string& type) const = 0; + + virtual int NumDevices() const = 0; + + // Returns an arbitrary CPU device if one is present, otherwise return + // nullptr. + virtual Device* HostCPU() const = 0; + + DeviceMgr(const DeviceMgr&) = delete; + void operator=(const DeviceMgr&) = delete; +}; + + +// Size of stale device buffer for temporary storage of removed devices. +static const size_t kStaleDeviceBufferSize = 8192; + +// Represents a dynamic set of devices +class DynamicDeviceMgr : public DeviceMgr { + public: + // Constructs an empty DynamicDeviceMgr. + DynamicDeviceMgr(); + + // Constructs a DynamicDeviceMgr from a list of devices. + explicit DynamicDeviceMgr(std::vector>&& devices); + explicit DynamicDeviceMgr(std::unique_ptr&& device); + + ~DynamicDeviceMgr() override; + + void ListDeviceAttributes( + std::vector* devices) const override; + std::vector ListDevices() const override; + string DebugString() const override; + string DeviceMappingString() const override; + absl::Status LookupDevice(absl::string_view name, + Device** device) const override; + bool ContainsDevice(int64_t device_incarnation) const override; + void ClearContainers(absl::Span containers) const override; + int NumDeviceType(const string& type) const override; + int NumDevices() const override; + Device* HostCPU() const override; + + // Add devices to device manager. Returns error for repeated device names. + absl::Status AddDevices(std::vector> devices); + + // Remove devices from device manager. + // Returns error for non-existing devices or if the HostCPU() device is in the + // input list. 
If an error is returned, the device list is not modified. + absl::Status RemoveDevices(const std::vector& devices); + + // Remove devices from device manager by their names. Returns error for + // non-existing devices or if the HostCPU() device is given in the input list. + // If an error is returned, the device list is not modified. + absl::Status RemoveDevicesByName(const std::vector& device_names); + + private: + mutable mutex devices_mu_; + + // Using an ordered map to ensure deterministic ordering of devices. + // Not a set, because we need to do find(Device*) and own the devices + // at the same time. + // We still have to override C++'s default pointer ordering. + struct DereferenceDevicePtrLess { + bool operator()(const Device* a, const Device* b) const { + return Device::LessByParsedName(*a, *b); + } + }; + std::map, DereferenceDevicePtrLess> + dynamic_devices_ TF_GUARDED_BY(devices_mu_); + + absl::flat_hash_set device_incarnation_set_ + TF_GUARDED_BY(devices_mu_); + std::unordered_map device_map_ TF_GUARDED_BY(devices_mu_); + + std::unordered_map device_type_counts_ + TF_GUARDED_BY(devices_mu_); + + mutable std::atomic cpu_device_; // memoize `HostCPU` result + + class DeviceCircularBuffer { + public: + DeviceCircularBuffer() : index_(0) { + devices_.resize(kStaleDeviceBufferSize); + } + void add(std::unique_ptr device) { + devices_[index_] = std::move(device); + index_ = (index_ + 1) % kStaleDeviceBufferSize; + } + + private: + int index_; + std::vector> devices_; + }; + + // Buffer to temporarily store the removed devices. Raw device pointers are + // accessible to DeviceSet, and if the function instantiation process directly + // access fields through the device set, the underlying device object must + // still be available to avoid segmentation fault. We keep the devices in this + // buffer only for that purpose. + DeviceCircularBuffer stale_devices_ TF_GUARDED_BY(devices_mu_); + + DynamicDeviceMgr(const DynamicDeviceMgr&) = delete; + void operator=(const DynamicDeviceMgr&) = delete; +}; + +// TODO(b/183966398): Remove StaticDeviceMgr since there's no usage. +using StaticDeviceMgr = DynamicDeviceMgr; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_propagation.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_propagation.h new file mode 100644 index 00000000..20f5f916 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_propagation.h @@ -0,0 +1,49 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_PROPAGATION_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_PROPAGATION_H_ + +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { + +namespace device_propagation { + +typedef std::function DeviceFilter; +typedef std::function NodeFilter; +} // namespace device_propagation + +// Propagates device assignments from a certain types of nodes to their outputs +// to avoid unnecessary D2H or H2D copies. +// If an node satisfies the following conditions, it will be placed on the same +// device as its inputs: +// (1) The node can accept device update (`node_filter` returns true). +// (2) The node itself has no requested or assigned devices. +// (3) The source nodes of this node's input edges, except for edges that are +// "LoopCond->Switch" or "Enter->Merge", are all placed on the same device. +// (4) The device can be propagated (`device_filter` returns true) +void PropagateDevices(const device_propagation::NodeFilter& node_filter, + const device_propagation::DeviceFilter& device_filter, + Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_PROPAGATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_resolver_local.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_resolver_local.h new file mode 100644 index 00000000..814bea88 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_resolver_local.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_ + +#include +#include + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +class DeviceMgr; + +// Implements DeviceResolverInterface in a single-task context. 
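//
// Minimal usage sketch (illustrative; `dev_mgr` is an already-constructed
// DeviceMgr owned by the caller, and the device name is just an example):
//
//   DeviceResolverLocal resolver(dev_mgr);
//   DeviceAttributes attr;
//   absl::Status s = resolver.GetDeviceAttributes(
//       "/job:localhost/replica:0/task:0/device:CPU:0", &attr);
//   if (s.ok()) { /* attr now describes the local CPU device */ }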
+class DeviceResolverLocal : public DeviceResolverInterface { + public: + explicit DeviceResolverLocal(const DeviceMgr* dev_mgr) : dev_mgr_(dev_mgr) {} + + absl::Status GetDeviceAttributes(const string& device, + DeviceAttributes* attributes) override; + + absl::Status GetAllDeviceAttributes( + const string& task, std::vector* attributes) override; + + absl::Status UpdateDeviceAttributes( + const std::vector& attributes) override; + + protected: + const DeviceMgr* dev_mgr_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_set.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_set.h new file mode 100644 index 00000000..16dcd0ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/device_set.h @@ -0,0 +1,139 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +typedef std::vector> PrioritizedDeviceVector; + +// DeviceSet is a container class for managing the various types of +// devices used by a model. +class DeviceSet { + public: + DeviceSet(); + ~DeviceSet(); + + // Does not take ownership of 'device'. + void AddDevice(Device* device) TF_LOCKS_EXCLUDED(devices_mu_); + + // Set the device designated as the "client". This device + // must also be registered via AddDevice(). + void set_client_device(Device* device) { + DCHECK(client_device_ == nullptr); + client_device_ = device; + } + + // Returns a pointer to the device designated as the "client". + Device* client_device() const { return client_device_; } + + // Return the list of devices in this set. + const std::vector& devices() const { return devices_; } + + // Given a DeviceNameUtils::ParsedName (which may have some + // wildcards for different components), fills "*devices" with all + // devices in "*this" that match "spec". + void FindMatchingDevices(const DeviceNameUtils::ParsedName& spec, + std::vector* devices) const; + + // Finds the device with the given "fullname". Returns nullptr if + // not found. + Device* FindDeviceByName(const string& fullname) const; + + // Return the list of unique device types in this set, ordered + // with more preferable devices earlier. + std::vector PrioritizedDeviceTypeList() const; + + // Return the prioritized list of devices in this set. + // Devices are prioritized first by `DeviceTypeOrder`, then by name. 
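  //
  // Consumption sketch (illustrative only; `cpu0` and `gpu0` are Device*
  // owned elsewhere, e.g. by a DeviceMgr; each entry pairs a Device* with
  // its priority):
  //
  //   DeviceSet device_set;
  //   device_set.AddDevice(cpu0);
  //   device_set.AddDevice(gpu0);
  //   const PrioritizedDeviceVector& ranked = device_set.prioritized_devices();
  //   Device* preferred = ranked.empty() ? nullptr : ranked.front().first;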
+ const PrioritizedDeviceVector& prioritized_devices() const + TF_LOCKS_EXCLUDED(devices_mu_); + + // Return the prioritized list of unique device types in this set. + // + // The list will be ordered by decreasing priority. The priorities (the second + // element in the list's `std::pair`) will be initialized + // to the value of `DeviceTypeOrder` for the device types. + const PrioritizedDeviceTypeVector& prioritized_device_types() const + TF_LOCKS_EXCLUDED(devices_mu_); + + // An order to sort by device types according to system-determined + // priority. + // + // Higher result implies higher priority. + static int DeviceTypeOrder(const DeviceType& d); + + // Sorts a PrioritizedDeviceVector according to devices and explicit + // priorities. + // + // After a call to this function, the argument vector will be sorted by + // explicit priority (the second element in the `std::pair`), then by `DeviceTypeOrder` of the device type, then by device + // locality, and lastly by device name. + static void SortPrioritizedDeviceVector(PrioritizedDeviceVector* vector); + + // Sorts a PrioritizedDeviceTypeVector according to types and explicit + // priorities. + // + // After a call to this function, the argument vector will be sorted by + // explicit priority (the second element in the `std::pair`), then by `DeviceTypeOrder` of the device type. + static void SortPrioritizedDeviceTypeVector( + PrioritizedDeviceTypeVector* vector); + + private: + mutable mutex devices_mu_; + + mutable absl::flat_hash_map> + matching_device_cache_; + + // Not owned. + std::vector devices_; + + // Cached prioritized vector, created on-the-fly when + // prioritized_devices() is called. + mutable PrioritizedDeviceVector prioritized_devices_ + TF_GUARDED_BY(devices_mu_); + + // Cached prioritized vector, created on-the-fly when + // prioritized_device_types() is called. + mutable PrioritizedDeviceTypeVector prioritized_device_types_ + TF_GUARDED_BY(devices_mu_); + + // Fullname -> device* for device in devices_. + std::unordered_map device_by_name_; + + // client_device_ points to an element of devices_ that we consider + // to be the client device (in this local process). + Device* client_device_ = nullptr; + + DeviceSet(const DeviceSet&) = delete; + void operator=(const DeviceSet&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/direct_session.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/direct_session.h new file mode 100644 index 00000000..c43827ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/direct_session.h @@ -0,0 +1,449 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/costmodel_manager.h" +#include "tensorflow/core/common_runtime/debugger_state_interface.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/common_runtime/graph_execution_state.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/common_runtime/session_factory.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/session_state.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { + +class CostModel; +class DebugGateway; +class Device; +class DirectSessionFactory; + +class DirectSession : public Session { + public: + typedef std::function CloseCallback; + + // Takes ownership of 'device_mgr'. + // 'factory' is used to unregister the DirectSession with 'factory' when its + // closed. This ensures that Reset requests from the 'factory' don't get sent + // to sessions that are already closed. + DirectSession(const SessionOptions& options, const DeviceMgr* device_mgr, + DirectSessionFactory* factory); + ~DirectSession() override; + + typedef std::vector> NamedTensorList; + typedef std::unordered_map + NameNodeMap; + + absl::Status Create(const GraphDef& graph) override; + absl::Status Create(GraphDef&& graph) override; + absl::Status Extend(const GraphDef& graph) override; + absl::Status Extend(GraphDef&& graph) override; + absl::Status Run(const NamedTensorList& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector* outputs) override; + + // NOTE: Experimental and subject to change. + absl::Status Run(const ::tensorflow::RunOptions& run_options, + const NamedTensorList& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector* outputs, + RunMetadata* run_metadata) override; + + // NOTE: Experimental and subject to change. + absl::Status Run( + const ::tensorflow::RunOptions& run_options, + const NamedTensorList& inputs, const std::vector& output_names, + const std::vector& target_nodes, std::vector* outputs, + RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) override; + + // NOTE: PRunSetup and PRun are added to support partial execution. This + // feature is experimental and subject to change. 
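  //
  // Partial-run sketch (illustrative; `session` is a Session pointer, and the
  // feed/fetch names plus the assumption that "out1:0" depends only on "a:0"
  // are made up for this example):
  //
  //   string handle;
  //   absl::Status s = session->PRunSetup({"a:0", "b:0"},
  //                                       {"out1:0", "out2:0"}, {}, &handle);
  //   std::vector<Tensor> outputs;
  //   // Feed "a" and fetch the part of the graph that only needs "a".
  //   s.Update(session->PRun(handle, {{"a:0", tensor_a}}, {"out1:0"}, &outputs));
  //   // Later, feed "b" and fetch the rest.
  //   s.Update(session->PRun(handle, {{"b:0", tensor_b}}, {"out2:0"}, &outputs));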
+ absl::Status PRunSetup(const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes, + string* handle) override; + absl::Status PRun(const string& handle, const NamedTensorList& inputs, + const std::vector& output_names, + std::vector* outputs) override; + + // Reset clears 'containers' from the device_mgr of the DirectSession. + // If 'containers' is empty, then Reset clears the default container. + absl::Status Reset(const std::vector& containers); + + absl::Status ListDevices(std::vector* response) override; + absl::Status Close() override; + absl::Status LocalDeviceManager(const DeviceMgr** output) override { + *output = device_mgr_.get(); + return absl::OkStatus(); + } + + void ExportCostModels(CostModelManager::CostModelMap* cost_models) { + cost_model_manager_.ExportCostModels(cost_models); + } + + absl::Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) override; + + absl::Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) override; + + absl::Status RunCallable( + CallableHandle handle, const std::vector& feed_tensors, + std::vector* fetch_tensors, RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) override; + + absl::Status ReleaseCallable(CallableHandle handle) override; + + absl::Status Finalize() override; + + const SessionOptions& options() const { return options_; } + + private: + // For access to collective_graph_key_. + friend class DirectSessionCollectiveTest; + + // We create one executor and its dependent library runtime for + // every partition. + struct PerPartitionExecutorsAndLib { + std::unique_ptr graph = nullptr; + Device* device = nullptr; // not owned. + FunctionLibraryRuntime* flib = nullptr; // not owned. + std::unique_ptr executor; + }; + + // An ExecutorsAndKeys is created for a given set of feeds/fetches. + // 'step_count' is the number of times this graph is executed. + // 'graph' is the entire graph being executed. 'name_to_node' + // maps node name to node. We keep 'graph' and 'name_to_node' only in + // the case of partial runs. Each item in 'items' is the executor for + // a partition of the graph bundled with its dependent library runtime. + // 'input_keys' are the rendezvous keys for the feeds and 'output_keys' + // are rendezvous keys for the fetches. + struct ExecutorsAndKeys { + ExecutorsAndKeys() : step_count(0) {} + + std::atomic_int_fast64_t step_count; + std::unique_ptr graph; + NameNodeMap name_to_node; + std::vector items; + std::unordered_map input_name_to_index; + std::unordered_map input_name_to_rendezvous_key; + std::unordered_map output_name_to_index; + std::unordered_map output_name_to_rendezvous_key; + + DataTypeVector input_types; + DataTypeVector output_types; + + CallableOptions callable_options; + + int64_t collective_graph_key = BuildGraphOptions::kNoCollectiveGraphKey; + }; + + // A FunctionInfo object is created for every unique set of feeds/fetches. + // This info could be folded into the ExecutorsAndKeys object but we would + // like to maintain a deletion order in which the OpKernels (owned by the + // executor) should be destroyed first, followed by the resources in the + // device and then followed by the function stuff. + // TODO(rohanj): Consolidate function library definitions so that we can + // instantiate only one ProcFLR and lib_def and make this just a member + // variable and not a vector. 
+ // 'flib_def' is the function library used. + // 'proc_flr' is the collection of FunctionLibraryRuntime objects, one per + // device. + struct FunctionInfo { + std::unique_ptr flib_def; + std::unique_ptr proc_flr; + }; + + // For each live Run() call, the session maintains a RunState. + // 'status' is the current status of the execution. + struct RunState { + mutex mu; + absl::Status status TF_GUARDED_BY(mu); + std::unique_ptr collective_executor; + std::unique_ptr collector; + TensorStore tensor_store; + ScopedStepContainer step_container; + + RunState(int64_t step_id, const std::vector* devices); + }; + + // For each live partial execution, the session maintains a PartialRunState. + // 'executor_done' is "notified" when all executors are done. 'pending_inputs' + // are the set of pending feeds and 'pending_outputs' are the set of pending + // fetches. + struct PartialRunState : public RunState { + Notification executors_done; + std::unordered_map pending_inputs; // true if fed + std::unordered_map pending_outputs; // true if fetched + core::RefCountPtr rendez = nullptr; + + PartialRunState(const std::vector& pending_input_names, + const std::vector& pending_output_names, + int64_t step_id, const std::vector* devices); + + // Returns true if all pending inputs and outputs have been completed. + bool PendingDone() const; + + ~PartialRunState(); + }; + + struct RunStateArgs { + explicit RunStateArgs(const DebugOptions& options) + : debug_options(options) {} + + bool is_partial_run = false; + string handle; + std::unique_ptr graph; + const DebugOptions& debug_options; + int64_t collective_graph_key = BuildGraphOptions::kNoCollectiveGraphKey; + }; + + // Retrieves an already existing set of executors to run 'inputs' and + // 'outputs', or creates and caches them for future use. + absl::Status GetOrCreateExecutors(absl::Span inputs, + absl::Span outputs, + absl::Span target_nodes, + ExecutorsAndKeys** executors_and_keys, + RunStateArgs* run_state_args); + + // Creates a set of executors to run the subgraph defined by + // `callable_options`. + absl::Status CreateExecutors( + const CallableOptions& callable_options, + std::unique_ptr* out_executors_and_keys, + std::unique_ptr* out_func_info, + RunStateArgs* run_state_args); + + // Creates several graphs given the existing graph_def_ and the + // input feeds and fetches, given 'devices'. The graphs share a common + // function library 'flib_def'. + absl::Status CreateGraphs( + const BuildGraphOptions& options, + std::unordered_map>* outputs, + std::unique_ptr* flib_def, + RunStateArgs* run_state_args, DataTypeVector* input_types, + DataTypeVector* output_types, int64_t* collective_graph_key); + + absl::Status RunInternal(int64_t step_id, const RunOptions& run_options, + CallFrameInterface* call_frame, + ExecutorsAndKeys* executors_and_keys, + RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options); + + // Returns whether inter-op execution uses a global pool or the input + // `run_options` requests being run on inter_op_thread_pool = 0 in case + // multiple pools are configured. + bool ShouldUseRunHandlerPool(const RunOptions& run_options) const; + + absl::Status ExtendLocked(GraphDef&& graph) + TF_EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_); + + absl::Status ResourceHandleToInputTensor(const Tensor& resource_tensor, + Tensor* retrieved_tensor); + + // Feeds more inputs to the executors, triggering further execution. 
+ absl::Status SendPRunInputs( + const std::vector>& inputs, + const ExecutorsAndKeys* executors_and_keys, + IntraProcessRendezvous* rendez); + + // Fetches more outputs from the executors. It waits until the output + // tensors are computed. + absl::Status RecvPRunOutputs(const std::vector& output_names, + const ExecutorsAndKeys* executors_and_keys, + PartialRunState* run_state, + std::vector* outputs); + + // Check if the specified fetches can be computed from the feeds + // that we have already provided. + absl::Status CheckFetch(const std::vector>& feeds, + const std::vector& fetches, + const ExecutorsAndKeys* executors_and_keys, + const PartialRunState* run_state); + + // Use the appropriate WaitForNotification function based on whether + // operation_timeout_in_ms is greater than 0. + // + // If the timeout expires, the `cm->StartCancel()` will be called. + absl::Status WaitForNotification(Notification* n, int64_t timeout_in_ms); + void WaitForNotification(Notification* n, RunState* run_state, + CancellationManager* cm, int64_t timeout_in_ms); + + absl::Status CheckNotClosed() { + mutex_lock l(closed_lock_); + if (closed_) return errors::Cancelled("Session has been closed."); + return absl::OkStatus(); + } + + absl::Status CheckGraphCreated(const char* method) { + mutex_lock l(graph_state_lock_); + if (!graph_created_) { + return errors::InvalidArgument( + "Session was not created with a graph before ", method, "!"); + } + return absl::OkStatus(); + } + + absl::Status CreateDebuggerState( + const CallableOptions& options, int64_t global_step, + int64_t session_run_index, int64_t executor_step_index, + std::unique_ptr* debugger_state); + + absl::Status DecorateAndPublishGraphForDebug( + const DebugOptions& debug_options, Graph* graph, Device* device); + + const SessionOptions options_; + + // Device structures. + const std::unique_ptr device_mgr_; + std::vector devices_; // not owned + DeviceSet device_set_; + + // Unique session identifier. + string session_handle_; + mutex graph_state_lock_; + bool graph_created_ TF_GUARDED_BY(graph_state_lock_) = false; + bool finalized_ TF_GUARDED_BY(graph_state_lock_) = false; + + // The thread-pools to use for running ops, with a bool indicating if the pool + // is owned. + std::vector> thread_pools_; + + absl::Status init_error_; // Set to an error if construction failed. + + // If true, blocks until device has finished all queued operations in a step. + bool sync_on_finish_ = true; + + std::vector> functions_ + TF_GUARDED_BY(executor_lock_); + + mutex executor_lock_; // protects executors_ + // Holds mappings from signature to the executors that process + // it. The reason for a level of indirection around mapped_type is + // to guarantee address stability. + // The map value is a shared_ptr since multiple map keys can point to the + // same ExecutorsAndKey object. + std::unordered_map> executors_ + TF_GUARDED_BY(executor_lock_); + + class RunCallableCallFrame; + struct Callable { + std::shared_ptr executors_and_keys; + std::shared_ptr function_info; + ~Callable(); + }; + mutex callables_lock_; + int64_t next_callable_handle_ TF_GUARDED_BY(callables_lock_) = 0; + std::unordered_map callables_ + TF_GUARDED_BY(callables_lock_); + + // Holds mappings from handle to partial run state. + std::unordered_map> partial_runs_ + TF_GUARDED_BY(executor_lock_); + + // This holds all the tensors that are currently alive in the session. 
+ SessionState session_state_; + + DirectSessionFactory* const factory_; // not owned + CancellationManager* cancellation_manager_; + std::unique_ptr collective_executor_mgr_; + + // Map of placed stateful nodes, i.e. nodes for which is_stateful() + // is true, such as "params" and "queue" nodes. Once placed these + // nodes can not be moved to a different device. Maps node names to + // device names. + std::unordered_map stateful_placements_ + TF_GUARDED_BY(graph_state_lock_); + + // Execution_state; used when placing the entire graph. + std::unique_ptr execution_state_ + TF_GUARDED_BY(graph_state_lock_); + + // The function library, before any rewrites or optimizations have been + // performed. In particular, CreateGraphs() may need to modify the function + // library; it copies and modifies the function library. + std::unique_ptr flib_def_; + + // true if the Session has been Closed. + mutex closed_lock_; + bool closed_ TF_GUARDED_BY(closed_lock_) = false; + + // For generating unique names for this session instance. + std::atomic edge_name_counter_ = {0}; + std::atomic handle_name_counter_ = {0}; + + // For generating step ids that are unique among all sessions. + static std::atomic_int_fast64_t step_id_counter_; + + // Global timeout for all blocking operations in this session. + const int64_t operation_timeout_in_ms_ = 0; + + // Manages all the cost models for the graphs executed in this session. + CostModelManager cost_model_manager_; + + // For testing collective graph key generation. + mutex collective_graph_key_lock_; + int64_t collective_graph_key_ TF_GUARDED_BY(collective_graph_key_lock_) = -1; + + // Run in caller's thread if RunOptions.inter_op_thread_pool is negative or + // all of following conditions are met: + // 1. This session doesn't own any thread pool. + // 2. RunOptions.inter_op_thread_pool is unspecified or 0. + // 3. This session has a single executor. + // 4. config.inter_op_parallelism_threads is specified to negative explicitly + // or through environment variable TF_NUM_INTEROP_THREADS. + // 5. RunOptions.experimental.use_run_handler_pool is unspecified or false. + // Otherwise run in global thread pool, session owned thread pool or handler + // pool according to other specifications of RunOptions and ConfigProto. + bool run_in_caller_thread_ = false; + + DirectSession(const DirectSession&) = delete; + void operator=(const DirectSession&) = delete; + + // EXPERIMENTAL: debugger (tfdbg) related + friend class DebugGateway; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/dma_helper.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/dma_helper.h new file mode 100644 index 00000000..4a76cff1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/dma_helper.h @@ -0,0 +1,38 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_ + +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// For TensorFlow internal use only. +class DMAHelper { + public: + static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); } + static const void* base(const Tensor* t) { return t->base(); } + static void* base(Tensor* t) { return t->base(); } + static TensorBuffer* buffer(Tensor* t) { return t->buf_; } + static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; } + static void UnsafeSetShape(Tensor* t, const TensorShape& s) { + t->set_shape(s); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/attr_builder.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/attr_builder.h new file mode 100644 index 00000000..9dc480d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/attr_builder.h @@ -0,0 +1,223 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_ + +// Support for eager execution of TensorFlow kernels. + +#include +#include +#include + +#include "tensorflow/c/eager/abstract_op_attrs.h" +#include "tensorflow/c/tf_attrtype.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { + +// Maps attribute name to an encoding of the type of the attribute value. +// If the type is not a list type, the value is the same as the TF_AttrType type +// of the value. Else, the highest order bit is on, and the rest of the bits +// represent the TF_AttrType type of the values in the list. +typedef std::unordered_map AttrTypeMap; + +// Look up OpDef for `op_name`. +absl::Status OpDefForOp(const string& op_name, const OpDef** op_def); + +// Returns the AttrTypeMap for the TensorFlow operation named op_name. +// If op_name is not registered in global op registry, AttrTypeMapForOp assumes +// the op to be a function and returns the default attributes for a function. +// `is_function` is set to true in this case. +absl::Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out, + bool* is_function); + +// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'. 
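DMAHelper above is the sanctioned path for transports to reach a Tensor's underlying buffer without going through the public Tensor API. A hedged usage sketch (it assumes a TensorFlow build that provides the headers in this diff; the free function name is invented):

    #include "tensorflow/core/common_runtime/dma_helper.h"
    #include "tensorflow/core/framework/tensor.h"

    // Returns the raw byte pointer when the tensor's element type is safe to
    // copy bit-for-bit (CanUseDMA), and nullptr otherwise.
    const void* RawBytesForTransport(const tensorflow::Tensor& t) {
      if (!tensorflow::DMAHelper::CanUseDMA(&t)) return nullptr;
      return tensorflow::DMAHelper::base(&t);
    }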
+absl::Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, + TF_AttrType* out, unsigned char* is_list); + +// KernelAndDevice::Init needs a NodeDef only to pass the attribute map through. +// An AttrBuilder is a convenience class to help with that - providing a smaller +// interface than NodeDefBuilder and avoiding expensive (unnecessary?) sanity +// checks (like number of inputs matching the OpDef - we only care about +// attributes here). +// +// TODO(ashankar): Take a closer look at checks in NodeDefBuilder and see which +// ones make sense to replicate. + +// This is a helper class for creating a NodeDef. Additionally, this class +// allows computing a cache key based on fingerprinting the attributes of this +// NodeDef. +// +// Example usage: +// AttrBuilder a; +// a.NumInputs(2); +// a.Set("T", TF_FLOAT); +// tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0"); +// const NodeDef& n = a.BuildNodeDef(); +// +// Calls to NumInputs or Set between multiple invocations to CacheKey may cause +// different values to be returned by CacheKey. +// +// If NumInputs or Set is called, BuildNodeDef should be called again to update +// the NodeDef. +// +// For performance reasons, the class internally delays the actual construction +// of the NodeDef till BuildNodeDef is called, or Set is called with certain +// uncommon types (see template specializations of Set to see which types +// trigger a NodeDef creation). +// +// Setting attributes via `Set` may cause arena-allocated protocol buffer +// messages to be destructed, which is not thread safe. This means that it is +// currently not safe to set attributes on *different* AttrBuilder objects from +// multiple threads. This does not apply to `CopyAttributes`. +class AttrBuilder : public AbstractOpAttrs { + public: + AttrBuilder() + : AbstractOpAttrs(AbstractOpAttrs::AbstractOpAttrsKind::kEager) {} + + ~AttrBuilder() override = default; + explicit AttrBuilder(const char* op) + : AbstractOpAttrs(AbstractOpAttrs::AbstractOpAttrsKind::kEager) { + Reset(op); + } + + void Reset(const char* op) { + op_name_ = op; + num_inputs_ = 0; + encoded_attrs_.clear(); + node_def_finalized_ = false; + cached_cache_key_ = std::nullopt; + device_for_cached_cache_key_.clear(); + } + + const string& op_name() const { return op_name_; } + void set_op_name(const string& name) { op_name_ = name; } + + // Needed to work around call to ValidateNodeDef in CreateOpKernel. + AttrBuilder& NumInputs(int n); + + template + AttrBuilder& Set(absl::string_view attr_name, T&& value) { + SetAttrValue(value, &attr_tmp_); + AddAttrIfNotPresent(attr_name, attr_tmp_); + node_def_finalized_ = false; + cached_cache_key_ = std::nullopt; + return *this; + } + + size_t NumAttributes() const { return encoded_attrs_.size(); } + + AttrBuilder& Set(absl::string_view attr_name, const AttrValue& value) { + AddAttrIfNotPresent(attr_name, value); + cached_cache_key_ = std::nullopt; + return *this; + } + + // Retrieves the attribute value. + // Note that Get() can involve a linear scan of all attributes with the same + // value type in this Node. This is not an issue, because Get is used rarely + // and nodes have a small number of attributes. + template + absl::Status Get(absl::string_view attr_name, T* value) const { + // Common attributes are stored in AttrVecs. This Get() template + // is specialized for them below. If we end up here, the type must be + // among those that we store in the node_def_. 
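Extending the usage example in the comment above: because the cache key is a fingerprint of the currently set attributes, any later `Set` or `NumInputs` call invalidates a previously computed key. A hedged sketch (assumes a TensorFlow build; the op name, attribute values and device string are illustrative only):

    #include "tensorflow/core/common_runtime/eager/attr_builder.h"

    void CacheKeySketch() {
      tensorflow::AttrBuilder a("MatMul");
      a.NumInputs(2);
      a.Set("T", tensorflow::DT_FLOAT);
      tensorflow::Fprint128 key_before = a.CacheKey("/device:CPU:0");
      a.Set("transpose_a", true);  // mutates the attribute set
      tensorflow::Fprint128 key_after = a.CacheKey("/device:CPU:0");
      // key_before and key_after generally differ; only key_after matches the
      // NodeDef produced by a.BuildNodeDef() from here on.
      (void)key_before;
      (void)key_after;
    }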
+ if (!node_def_finalized_) { + return errors::NotFound("No attr named'", attr_name, + "' found in AttrBuilder for ", op_name_); + } + return GetNodeAttr(AttrSlice(node_def_), attr_name, value); + } + + tensorflow::Fprint128 CacheKey(absl::string_view device); + + // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as + // well as any default attr-value pairs from the associated op_def, if there + // is one. + void FillAttrValueMap(AttrValueMap* m) const; + + // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far except + // when the value matches the default for this attr. + // More precisely, if the global op registry contains an OpDef for this op + // and if an attribute value is the same as the default (according to the + // OpDef), this attr-value pair is not added to `m`. + void FillAttrValueMapWithoutDefaults(AttrValueMap* m) const; + const NodeDef& BuildNodeDef(); + + // Transfers the attributes from `other` to this AttrBuilder. Does not + // overwrite existing attributes. Since it does not require deserializing and + // re-serializing attributes, it is much more efficient than going through an + // AttrValueMap. + void CopyAttributes(const AttrBuilder& other); + + void GetNameAttrList(tensorflow::NameAttrList* name_and_attrs) const override; + + bool GetInt(absl::string_view attr_name, int64_t* result) const override; + bool GetFloat(absl::string_view attr_name, float* result) const override; + bool GetBool(absl::string_view attr_name, bool* result) const override; + bool GetType(absl::string_view attr_name, + tensorflow::DataType* result) const override; + absl::Status GetTypeList( + absl::string_view attr_name, + absl::InlinedVector* type_list) const override; + + private: + tensorflow::Fprint128 BuildCacheKeyForDevice(absl::string_view device) const; + + template + void SetInAttrValueMap(AttrValueMap* m, const string& attr_name, + T&& value) const { + DCHECK(!node_def_finalized_) + << "Calling SetInAttrValueMap after BuildNodeDef."; + // If attribute is set more than once, its first value prevails + m->insert({attr_name, value}); + } + + void AddAttrIfNotPresent(absl::string_view attr_name, const AttrValue& value); + + gtl::FlatMap encoded_attrs_; + mutable AttrValue attr_tmp_; // For encoding + + string op_name_; + int num_inputs_; + NodeDef node_def_; + bool node_def_initialized_; + bool node_def_finalized_; + + std::optional cached_cache_key_; + string device_for_cached_cache_key_; +}; + +template <> +absl::Status AttrBuilder::Get(absl::string_view attr_name, int* value) const; +template <> +absl::Status AttrBuilder::Get(absl::string_view attr_name, float* value) const; +template <> +absl::Status AttrBuilder::Get(absl::string_view attr_name, bool* value) const; +template <> +absl::Status AttrBuilder::Get(absl::string_view attr_name, + tensorflow::DataType* value) const; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context.h new file mode 100644 index 00000000..8440e298 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context.h @@ -0,0 +1,968 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/common_runtime/composite_device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/custom_device.h" +#include "tensorflow/core/common_runtime/eager/custom_device_op_handler.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/kernel_and_device.h" +#include "tensorflow/core/common_runtime/eager/rendezvous_cache.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/log_memory.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tsl/platform/refcount.h" + +// "tensorflow/core/platform/platform.h" must be included first before using +// IS_MOBILE_PLATFORM. +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#endif // !IS_MOBILE_PLATFORM + +namespace tensorflow { + +namespace eager { +// We need this forward declaration because we have circular dependency: +// Context -> RemoteMgr -> TensorHandle -> Context. +// TODO(fishx): Remove this once we remove Context dependency in TensorHandle. +class RemoteMgr; +} // namespace eager + +// Check the value of the environment variable, +// `TF_REMOTE_HANDLE_SKIP_WAIT_FOR_READY` from its cached copy in memory and if +// not cached, reads from the environment variable. 
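The `SkipRemoteHandleWaitReady` helper declared just below reads `TF_REMOTE_HANDLE_SKIP_WAIT_FOR_READY` once and then serves a cached copy. A self-contained plain-C++ illustration of that read-once pattern (the accepted values "1"/"true" are an assumption, not taken from this header):

    #include <cstdlib>
    #include <string>

    bool SkipWaitReadyFromEnvOnce() {
      static const bool cached = [] {
        const char* v = std::getenv("TF_REMOTE_HANDLE_SKIP_WAIT_FOR_READY");
        return v != nullptr &&
               (std::string(v) == "1" || std::string(v) == "true");
      }();
      return cached;  // subsequent calls never touch the environment again
    }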
+bool SkipRemoteHandleWaitReady(); + +class EagerContext : public ImmediateExecutionContext, public core::RefCounted { + public: + static constexpr uint64 kInvalidContextId = 0; + + static uint64 NewContextId() { + uint64 context_id = random::New64(); + while (context_id == kInvalidContextId) { + context_id = random::New64(); + } + return context_id; + } + + EagerContext( + const SessionOptions& opts, + ContextDevicePlacementPolicy default_device_placement_policy, bool async, + /*const*/ DeviceMgr* device_mgr, bool device_mgr_owned, + /*const*/ tsl::core::RefCountPtr rendezvous, + DistributedFunctionLibraryRuntime* cluster_flr = nullptr, + CollectiveExecutorMgrInterface* collective_executor_mgr = nullptr, + bool run_eager_op_as_function = false, bool jit_compile_rewrite = false); + + void Release() override { Unref(); } + + AbstractTensorInterface* CreateInt64Scalar(int64_t value) override; + AbstractTensorInterface* CreateUint64Scalar(uint64 value) override; + AbstractTensorInterface* CreateInt32Scalar(int32_t value) override; + AbstractTensorInterface* CreateFloatScalar(float value) override; + AbstractTensorInterface* CreateDoubleScalar(double value) override; + AbstractTensorInterface* CreateHalfScalar(Eigen::half value) override; + AbstractTensorInterface* CreateStringScalar( + tensorflow::tstring value) override; + AbstractTensorInterface* CreateComplex128Scalar( + tensorflow::complex128 value) override; + AbstractTensorInterface* CreateBoolScalar(bool value) override; + + AbstractTensorInterface* CreateTensor( + DataType dtype, absl::Span dim_sizes) override; + AbstractTensorInterface* CreateTensor(DataType dtype, const int64_t* dims, + int num_dims, void* data, size_t len, + MemoryReleaser memory_releaser, + void* memory_releaser_arg) override; + + ImmediateExecutionTensorHandle* CreateLocalHandle( + AbstractTensorInterface* t) override; + // Create an abstract tensor handle from tensorflow::Tensor. + ImmediateExecutionTensorHandle* CreateLocalHandleFromTFTensor( + tensorflow::Tensor& t, const char* d_name) override; + ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, + absl::Status* status) override; + ImmediateExecutionOperation* CreateOperation() override; + + // This is a virtual helper function to convert TFRT TensorHandle to + // tensorflow::TensorHandle. In current runtime EagerContext, just forward + // the input since the input tensor handle is already a + // tensorflow::TensorHandle. + ImmediateExecutionTensorHandle* TFTensorHandleFromInterface( + ImmediateExecutionTensorHandle* handle) override; + + absl::Status RegisterFunction(AbstractFunction* f) override; + + bool UsesTFRT() override; + + bool RunEagerOpAsFunction() const; + + void SetRunEagerOpAsFunction(bool enable) override; + + bool JitCompileRewrite() const; + + void SetJitCompileRewrite(bool enable) override; + + void ListDevices(std::vector* device_attributes) override; + + absl::Status AddDevices( + std::vector> devices) override; + + thread::ThreadPool* GetThreadPool() { return thread_pool_.get(); } + + // Returns the function library runtime for the given device. + FunctionLibraryRuntime* func_lib(const Device* d) const { + return pflr_->GetFLR(d->name()); + } + + ProcessFunctionLibraryRuntime* pflr() const { return pflr_.get(); } + + std::function)>* runner() { return &runner_; } + + // Specify a executor for this thread. 
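The `Create*Scalar` and `CreateLocalHandle` factories above are the lowest-level way to materialize an eager tensor handle. A hedged sketch (assumes a fully constructed `EagerContext*`; whether the handle takes ownership of the intermediate tensor interface is not specified here):

    tensorflow::ImmediateExecutionTensorHandle* MakeScalarHandle(
        tensorflow::EagerContext* ctx) {
      tensorflow::AbstractTensorInterface* scalar = ctx->CreateFloatScalar(1.0f);
      return ctx->CreateLocalHandle(scalar);
    }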
+ void SetExecutorForThread(EagerExecutor* executor) override; + + std::shared_ptr> prioritized_device_type_list() + const { + mutex_lock l(device_type_list_mu_); + return prioritized_device_type_list_; + } + + // Clear pending nodes in thread executors and kernel caches. + void ClearCachesAndThreadExecutors() override; + // Clear pending nodes in default executor and kernel caches. + void ClearCachesAndDefaultExecutor(); + + // Sets the device placement policy for the current thread. + void SetThreadLocalDevicePlacementPolicy( + ContextDevicePlacementPolicy policy) override; + + // Returns the device placement policy for the current thread. + ContextDevicePlacementPolicy GetDevicePlacementPolicy() const override; + + // Select an appropriate device for an operation. + // + // Given the preferred device for the operation, and the node_def, finds the + // best suitable device for the operation in this context. + // + // The preferred device is specified as a `ParsedName` containing the elements + // (details) that the resulting device should match. If there are no such + // devices, and the context currently allows soft device placement, a suitable + // device not matching `preferred` will be chosen. + // + // The chosen device is stored in the `device` argument. The argument is not + // modified unless this method returns `OkStatus()`. + absl::Status SelectDevice(DeviceNameUtils::ParsedName preferred, + const NodeDef& ndef, Device** out) const; + + // TODO(mdan): Rename to ContainsFunction. + bool FindFunctionByName(const string& name) const; + + absl::Status FindFunctionOpData( + const string& name, const tensorflow::OpRegistrationData** op_data); + + const FunctionDef* FindFunctionDef(const string& name) const override; + core::RefCountPtr FindRecord( + const string& name) const override; + + Device* HostCPU() const { return host_cpu_device_; } + Device* CanonicalDevice(Device* d) const { + return HostCPU() == d ? nullptr : d; + } + const DeviceNameUtils::ParsedName& HostCPUParsedName() const override { + return HostCPU()->parsed_name(); + } + + const string& HostCPUName() const override { return HostCPU()->name(); } + + GraphCollector* GetGraphCollector() { return &graph_collector_; } + + EagerExecutor& Executor() override; + + // Add the given `fdef` to the local FunctionLibraryDefinition. And add an + // entry to the KernelAndDevice cache for it if it's not exist. + absl::Status AddFunctionDef(const FunctionDef& fdef) override; + + absl::Status AddFunctionDefWithStackTraces( + const FunctionDef& fdef, const StackTracesMap& stack_traces) override; + + // `library` contains all FunctionDefs and GradientDefs to expand `fdef`. Add + // it to the local FunctionLibraryDefinition as well, but no need to add it + // to the KernelAndDevice cache since they won't be executed as + // KernelAndDevices. + absl::Status AddFunctionDef(const FunctionDef& fdef, + const FunctionDefLibrary& library, + bool add_to_local_only = false, + const StackTracesMap& stack_traces = {}); + + // `library` contains all FunctionDefs and GradientDefs to expand `fdef`. Add + // it to the local FunctionLibraryDefinition as well, but no need to add it + // to the KernelAndDevice cache since they won't be executed as + // KernelAndDevices. + absl::Status AddFunctionRecord(core::RefCountPtr func_record, + const FunctionDefLibrary& library, + bool add_to_local_only = false); + + // Adds a component function (i.e. containing a subgraph of a multi-process + // function) implemented as `fdef`. 
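`SelectDevice` above only writes its output on success and falls back to soft placement when nothing matches the preference. A hedged caller sketch (assumes a TensorFlow build; the device string and function name are illustrative):

    absl::Status PickDeviceSketch(const tensorflow::EagerContext& ctx,
                                  const tensorflow::NodeDef& ndef,
                                  tensorflow::Device** out) {
      tensorflow::DeviceNameUtils::ParsedName preferred;
      if (!tensorflow::DeviceNameUtils::ParseFullName("/device:CPU:0",
                                                      &preferred)) {
        return tensorflow::errors::InvalidArgument("unparsable device string");
      }
      // `*out` is left untouched unless this returns OkStatus().
      return ctx.SelectDevice(preferred, ndef, out);
    }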
+ // + // REQUIRES: `library` must contain all functions reachable from `fdef`. It + // should not contain `fdef` itself. + absl::Status AddComponentFunction(const FunctionDef& fdef, + const FunctionDefLibrary& library); + + const FunctionDef* GetFunctionDef(const string& function_name); + + std::vector ListFunctionNames() override; + tensorflow::ImmediateExecutionContext::CacheStats GetCacheStats() override; + + absl::Status RemoveFunction(const string& func) override; + absl::Status AddRemoveFunctionNotifier( + const string& func, std::function notifier) override; + + // Wait for pending nodes to be finished in local executors (including context + // default executor and thread executors) and executors on remote workers. + // Return combined status of remote executors. If there are multiple errors, + // the Status code will be the same as the first remote executor that has + // errors, and the error message will be combined from all executors. + absl::Status SyncExecutors(); + + absl::Status AsyncWait() override { return SyncExecutors(); } + + core::RefCountPtr GetCachedKernel(Fprint128 cache_key); + Device* GetCachedDevice(Fprint128 device_cache_key); + + core::RefCountPtr AddKernelToCache( + Fprint128 cache_key, core::RefCountPtr kernel); + void AddDeviceToCache(Fprint128 device_cache_key, Device* device); + + bool LogDevicePlacement() const { return log_device_placement_; } + void SetLogDevicePlacement(bool enable) override { + log_device_placement_ = enable; + } + + bool AllowSoftPlacement() const { return allow_soft_placement_; } + void SetAllowSoftPlacement(bool enable) override { + allow_soft_placement_ = enable; + } + bool LogMemory() const { return log_memory_; } + + // Returns a borrowed pointer to the global rendezvous. The rendezvous may + // become invalid if this Context is destroyed. + Rendezvous* GetRendezvous() const { return rendezvous_.get(); } + + void ResetGlobalRendezvousForFunction() override { + mutex_lock l(global_rendezvous_mu_); + // Remove the global rendezvous instance from the local rendezvous table + // if it uses local rendezvous type, which forces EagerContext to create a + // new local rendezvous instance in the table. + // TODO(b/274683676) Why can't we abort the old rendezvous here? + local_rendezvous_cache_.Remove(-1); + TF_CHECK_OK(CreateRendezvousFactory()(-1, nullptr, + &global_rendezvous_for_functions_)); + } + + // Returns the global_rendezvous_for_functions' underlying LocalRendezvous' + // status. If the underlying Rendezvous is not in the local_rendezvous_cache_ + // returns OK. + absl::Status GetGlobalRendezvousForFunctionLocalRendezvousStatus(); + + // Returns a factory which maps from step_id to rendezvous. + // + // When tensor transfer across functions/eager executions using send/recv ops + // are required, `reuse_rendezvous_for_functions` can be set to true so that + // function executions and eager executions use the same rendezvous instance, + // instead of creating new instance per function calls. + // + // The caller of the returned function owns a reference to the resulting + // Rendezvous. + Rendezvous::Factory RendezvousFactory( + bool reuse_rendezvous_for_functions = false) { + // There is an implicit assumption that the global_rendezvous_for_functions_ + // is always an IntraProcessRendezvous to match the behaviour of the + // EagerContext's rendezvous. + // Ref: tensorflow/c/eager/c_api.cc;l=143;rcl=396387348 + // If a cross process kernel needs a rendezvous a new InterProcessRendezvous + // should be created. 
+ if (reuse_rendezvous_for_functions && rendezvous_creator_ == nullptr && +#if !defined(IS_MOBILE_PLATFORM) + worker_env_ == nullptr && +#endif + remote_device_mgr() == nullptr) { + return Rendezvous::Factory{[this](const int64_t step_id, + const DeviceMgr* device_mgr, + tsl::core::RefCountPtr* r) { + mutex_lock l(global_rendezvous_mu_); + *r = global_rendezvous_for_functions_.GetNewRef(); + return absl::OkStatus(); + }}; + } else { + return CreateRendezvousFactory(); + } + } + + CollectiveExecutorMgrInterface* collective_executor_mgr() { + return collective_executor_mgr_.Get(); + } + std::unique_ptr GetCollectiveExecutorHandle() { + return std::make_unique( + + collective_executor_mgr()->FindOrCreate(0), true /*inherit_ref*/); + } + + void SetCollectiveExecutorMgr(CollectiveExecutorMgrInterface* mgr) { + collective_executor_mgr_.Reset(mgr); + } + tensorflow::DeviceMgr* local_device_mgr() const { + return local_device_manager_.Get(); + } + const tensorflow::DynamicDeviceMgr* remote_device_mgr() const { + return remote_device_manager_.Get(); + } + + tensorflow::DynamicDeviceMgr* GetOwnedRemoteDeviceMgr() { + return remote_device_manager_.GetOwned(); + } + + std::vector ListLocalTfDevices() override { + return local_device_mgr()->ListDevices(); + } + + std::vector ListAllTfDevices() override; + + // TODO(apassos) clean up RunMetadata storage. + mutex* MetadataMu() TF_LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; } + bool ShouldStoreGraphs() TF_LOCKS_EXCLUDED(metadata_mu_); + void SetShouldStoreGraphs(bool value) override; + RunMetadata* RunMetadataProto() TF_EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_) { + return run_metadata_.get(); + } + std::unique_ptr ExportRunMetadata() override + TF_LOCKS_EXCLUDED(metadata_mu_); + + void StartStep() override; + void EndStep() override; + ScopedStepContainer* StepContainer(); + + FunctionLibraryDefinition* FuncLibDef() override { return &func_lib_def_; } + + FunctionLibraryDefinition* GetComponentFunctionFunctionLibraryDefinition( + const string& function_name) { + tf_shared_lock lock(cache_mu_); + auto iter = component_function_libraries_.find(function_name); + if (iter != component_function_libraries_.end()) { + return iter->second.get(); + } + return nullptr; + } + +#if !defined(IS_MOBILE_PLATFORM) + // Assign the EagerClient pointer to `client` based on the given device / task + // name, and increment the refcount of the client. The reference ownership is + // transferred to the caller, and the unref should automatically happen when + // destructing the RefCountPtr object at the caller's side. + // `client` must not be initialized or holding a reference of another object + // before calling this method. + absl::Status GetClient(Device* device, + core::RefCountPtr* client); + absl::Status GetClient(const DeviceNameUtils::ParsedName& device_name, + core::RefCountPtr* client); + absl::Status GetClient(const string& remote_task, + core::RefCountPtr* client); + + uint64 GetContextId() const; + uint64 GetContextViewId() const; + void IncrementContextViewId(); + + absl::Status EnableCollectiveOps(const ServerDef& server_def) override; + + // TODO(nareshmodi): Encapsulate remote state into a separate + // class/struct. + // + // Enables the eager context to communicate with remote devices. When + // initializing with this method, this context will be the primary context, + // which will kill all its remote contexts in shutdown. + // + // - server: A ServerInterface that exports the tensorflow.WorkerService. 
+ // Note that this class expects the server to already have been started. + // - remote_eager_workers: A cache from which we can get "EagerClient"s to + // communicate with remote eager services. + // - remote_device_mgr: A DeviceMgr* which contains all remote devices + // (should contain no local devices). + // - remote_contexts: A vector containing task names. + // TODO(b/184375824): clean up parameter order for better readability. + absl::Status InitializeRemoteMaster( + std::unique_ptr server, WorkerEnv* worker_env, + std::shared_ptr worker_session, + std::unique_ptr remote_eager_workers, + std::unique_ptr remote_device_manager, + const std::vector& remote_contexts, uint64 context_id, + tsl::core::RefCountPtr r, + /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs, + DistributedFunctionLibraryRuntime* cluster_flr, + std::unique_ptr> + remote_mgr); + + // Update an existing master context with a new set of remote workers (i.e., a + // new "view" of cluster membership. Similar to InitializeRemoteMaster but + // this will keep the current context_id and increment a context_view_id, will + // keep the current resource manager so that resources from the previous view + // can still be accessed, and will automatically register existing functions + // if there are newly added hosts. + absl::Status UpdateRemoteMaster( + uint64 context_id, + std::unique_ptr remote_eager_workers, + const std::vector& add_remote_contexts, + const std::vector& remove_remote_contexts); + + // Similar with InitializeRemoteMaster but this context will not kill remote + // contexts in shutdown. + absl::Status InitializeRemoteWorker( + std::unique_ptr remote_eager_workers, + DynamicDeviceMgr* remote_device_mgr, + const std::vector& remote_contexts, uint64 context_id, + uint64 context_view_id, + std::function(const int64_t)> + rendezvous_creator, + DistributedFunctionLibraryRuntime* cluster_flr, + std::unique_ptr> + remote_mgr, + std::function resource_deallocator); + + // Similar with InitializeRemoteWorker but will reuse existing context and + // increment context_view_id. + absl::Status UpdateRemoteWorker( + std::unique_ptr remote_eager_workers, + const std::vector& remote_contexts, uint64 context_id); + + absl::Status StoreCollectiveOpsServer( + std::unique_ptr new_server, DeviceMgr* device_mgr, + CollectiveExecutorMgrInterface* rpc_collective_executor_mgr); + + // For the specified remote worker, preprocess and set its device filters. + absl::Status SetRemoteDeviceFilters( + const string& remote_worker, const std::vector& device_filters); + + // For the specified remote worker, apply the stored device filters to the + // list of device attributes following these rules: + // (1) if the remote worker does not have device filters, all devices are + // visible to the worker; + // (2) if the device is on the remote worker, then it is visible; + // (3) if the device matches at least one device filter, then it is visible. + // The result is saved as a boolean vector of the same length (i.e., + // filtered_device_mask) indicating whether each of the devices is visible to + // the remote worker. + void FilterDevicesForRemoteWorkers( + const string& remote_worker, + const protobuf::RepeatedPtrField& device_attrs, + std::vector* filtered_device_mask); + + // TODO(fishx): Remove the custom deleter once we remove forward declaration. + const std::unique_ptr>& + RemoteMgr() { + return remote_mgr_; + } + + // If true, then tensors should be shipped across processes via the + // EagerService.Enqueue(SendTensorOp). 
If false, _Send/_Recv ops should be + // used instead (which in-turn use WorkerService.RecvTensor RPCs). + bool UseSendTensorRPC() { return use_send_tensor_rpc_; } + + tensorflow::ServerInterface* GetServer() { return server_.get(); } + + // For LLVM style RTTI. + static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kEager; + } + + // Function to support distributed C API. + void SetDistributedManager( + std::unique_ptr distributed) + override { + distributed_manager_ = std::move(distributed); + } + ImmediateExecutionDistributedManager* GetDistributedManager() override { + return distributed_manager_.get(); + } + + // May only be used during multi-client setup so that a RemoteRendezvous + // can be initialized instead of defaulting to the IntraProcessRendezvous. + void SetWorkerEnv(WorkerEnv* worker_env, + std::shared_ptr worker_session); +#endif // IS_MOBILE_PLATFORM + + // Closes remote eager contexts, waits for all RPCs to finish, and + // destroys the EagerClientCache. No RPCs can be made through this context + // after this method has been called. + // This method exists to aid a clean shutdown. It causes all RPCs to finish + // and remote TensorHandles to release their references to this context. + // To avoid deadlocks, this method must not be called on the thread + // processing RPCs because it makes RPCs and waits for their completion. + // + // On mobile, it just cleans the caches. + void WaitForAndCloseRemoteContexts(); + + bool PinSmallOpsToCPU() const { return pin_small_ops_to_cpu_; } + + tensorflow::Env* TFEnv() const { return env_; } + + absl::Status FindDeviceFromName(const char* device_name, + Device** device) const; + + absl::Status FindCompositeDeviceFromName(absl::string_view device_name, + CompositeDevice** device) const; + + bool IsCustomDevice(const string& device_name) override; + + absl::Status RegisterCustomDevice( + const string& name, std::unique_ptr device) override; + + CustomDeviceOpHandler& GetCustomDeviceOpHandler() override { + return custom_device_op_handler_; + }; + + // Find or create a composite device with the given `underlying_devices` and + // `device_name` (if not empty). + absl::Status FindOrCreateCompositeDevice( + const std::vector& underlying_devices, const string& device_name, + CompositeDevice** composite_device); + + bool OnSameTask(const Device* first, const Device* second) const; + // Gets the CPU device on the task of device. + absl::Status CPUDeviceOnTask(const Device* device, Device** cpu_device) const; + + const SessionOptions& session_options() const { return opts_; } + void InitPrioritizedDeviceTypeList(); + + // Re-assign cluster-FLR and re-initialize devices and FLR in process-FLR + void UpdateClusterFLRAndInitDevices( + DistributedFunctionLibraryRuntime* cluster_flr); + + // A constant representing the step id used for the global rendezvous. + // This is used to distibguish whether a user-specified step id should be set. + // Step id value of kGlobalRendezvous is reserved and should not be specified + // by the user. + static const int64_t kGlobalRendezvousId; + + private: + // The class for caching Rendezvous instances per step_id. + // If the Rendezvous object is destroyed for the step, a new one will be + // created on demand. 
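`RendezvousFactory()` earlier in this class hands back a callable that maps a step id to a rendezvous, with the `LocalRendezvousCache` described above backing the purely local case. A hedged sketch of driving it (assumes a TensorFlow build, and assumes the factory object is invocable with the same `(step_id, device_mgr, &r)` shape as the lambdas it wraps in this header):

    absl::Status StepRendezvousSketch(tensorflow::EagerContext* ctx,
                                      int64_t step_id) {
      tensorflow::Rendezvous::Factory factory =
          ctx->RendezvousFactory(/*reuse_rendezvous_for_functions=*/true);
      tsl::core::RefCountPtr<tensorflow::Rendezvous> r;
      absl::Status s = factory(step_id, ctx->local_device_mgr(), &r);
      if (!s.ok()) return s;
      // `r` holds one reference and releases it when it goes out of scope.
      return absl::OkStatus();
    }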
+ class LocalRendezvousCache { + public: + LocalRendezvousCache() + : cache_(new RendezvousCache) {} + + tsl::core::RefCountPtr FindOrCreate( + int64_t step_id, DeviceMgr* device_mgr); + + tsl::core::RefCountPtr Find(int64_t step_id) const { + return cache_->Find(step_id); + } + + std::vector GetActiveStepIds() const { + return cache_->GetActiveStepIds(); + } + + void Remove(int64_t step_id) { cache_->Remove(step_id); } + + private: + tsl::core::RefCountPtr> cache_; + }; + + Rendezvous::Factory CreateRendezvousFactory() { + if (rendezvous_creator_ != nullptr) { + return Rendezvous::Factory{[this](const int64_t step_id, + const DeviceMgr* device_mgr, + tsl::core::RefCountPtr* r) { + VLOG(6) << "Creating rendezvous using the rendezvous_creator_."; + *r = rendezvous_creator_(step_id); + return absl::OkStatus(); + }}; + } + +#if !defined(IS_MOBILE_PLATFORM) + if (worker_env_ != nullptr && worker_env_->rendezvous_mgr != nullptr) { + return Rendezvous::Factory{[this](const int64_t step_id, + const DeviceMgr* device_mgr, + tsl::core::RefCountPtr* r) { + VLOG(6) << "Creating rendezvous using the worker_env's rendezvous_mgr."; + // TODO(hhb): Add a Create method and use it here. + auto remote_r = worker_env_->rendezvous_mgr->Find(step_id); + remote_r->Initialize(worker_session_.get()).IgnoreError(); + *r = std::move(remote_r); + return absl::OkStatus(); + }}; + } +#endif + + if (remote_device_mgr() == nullptr) { + return Rendezvous::Factory{[this](const int64_t step_id, + const DeviceMgr* device_mgr, + tsl::core::RefCountPtr* r) { + VLOG(6) << "Creating rendezvous using local_device_mgr."; + *r = local_rendezvous_cache_.FindOrCreate(step_id, local_device_mgr()); + return absl::OkStatus(); + }}; + } + + return Rendezvous::Factory(); + } + + ~EagerContext() override; + + absl::Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef); + absl::Status MaybeRemoveFunctionRemotely(const string& function_name); + absl::Status RegisterExistingFunctionsOnRemoteWorkers( + const std::vector& remote_workers); + + void ResetPFLR(const DeviceMgr* device_mgr, Env* env, + const ConfigProto* config, int graph_def_version, + const FunctionLibraryDefinition* lib_def, + const OptimizerOptions& optimizer_options, + thread::ThreadPool* thread_pool = nullptr, + DistributedFunctionLibraryRuntime* cluster_flr = nullptr); + + void ResetClusterFLR(DistributedFunctionLibraryRuntime* cluster_flr); + void UpdateGlobalRendezvousDeviceManager(tensorflow::DeviceMgr* device_mgr); + + void ClearResourceContainer(const string& name); + + template + struct OwnedOrUnownedHelper { + public: + OwnedOrUnownedHelper() = default; + explicit OwnedOrUnownedHelper(T* object, const bool owned = false) { + Reset(object, owned); + } + + void Reset(std::unique_ptr object) { + owned_object = std::move(object); + unowned_object_ptr = nullptr; + } + + void Reset(T* object, const bool owned = false) { + if (owned) { + owned_object.reset(object); + unowned_object_ptr = nullptr; + } else { + owned_object.reset(nullptr); + unowned_object_ptr = object; + } + } + + bool Owned() const { return owned_object != nullptr; } + + T* GetOwned() const { return owned_object.get(); } + T* Get() const { + return owned_object ? 
owned_object.get() : unowned_object_ptr; + } + + std::unique_ptr owned_object = nullptr; + T* unowned_object_ptr = nullptr; + }; + + SessionOptions opts_; + const ContextDevicePlacementPolicy default_device_placement_policy_; + + // Note: we cannot use C++11 thread_local here as there is no concept of a + // thread-local-object-local variable in C++11. + mutable mutex policy_map_mu_; + std::unordered_map + device_placement_policy_ TF_GUARDED_BY(policy_map_mu_); + + // This device manager maintains only the local devices on this worker. + OwnedOrUnownedHelper local_device_manager_; + // Maintain copy of all previously created local device managers. + std::vector> old_local_device_managers_; + + // Unowned DynamicDeviceMgr is set on remote worker to allow running + // multi-device function on remote worker. + // This device manager maintains all the devices (including both local and + // remote to this worker) in the cluster. + OwnedOrUnownedHelper remote_device_manager_; + + Device* host_cpu_device_; // Owned by device_manager + mutable mutex device_type_list_mu_; + std::shared_ptr> prioritized_device_type_list_ + TF_GUARDED_BY(device_type_list_mu_); + tsl::core::RefCountPtr rendezvous_; + std::function(const int64_t)> + rendezvous_creator_; + CustomDeviceOpHandler custom_device_op_handler_; + + mutable mutex composite_devices_mu_; + // Maps from the fingerprint of a set of device names to a virtual + // CompositeDevice. + // TODO(b/145922293): Consider taking device names as keys. + absl::flat_hash_map> + composite_devices_ ABSL_GUARDED_BY(composite_devices_mu_); + + FunctionLibraryDefinition func_lib_def_{OpRegistry::Global(), + FunctionDefLibrary()}; + + std::unique_ptr thread_pool_; + + // EagerContext owns the DistributedFunctionLibraryRuntime( + // EagerClusterFunctionLibraryRuntime) if using EagerService for remote + // function execution (lazy_copy_function_remote_inputs_=true). + OwnedOrUnownedHelper cluster_flr_; + // One FunctionLibraryRuntime per device. + // func_libs[i] is the FunctionLibraryRuntime corresponding to + // session->devices[i]. + std::unique_ptr pflr_; + + std::function)> runner_; + + mutex cache_mu_; + mutex device_cache_mu_; + mutex remove_function_notifiers_mu_; + struct RegisteredFunction : public core::RefCounted { + ~RegisteredFunction() override = default; + + std::unique_ptr> cached_kernel_keys; + }; + std::unordered_map, + Fprint128Hasher> + kernel_cache_ TF_GUARDED_BY(cache_mu_); + std::unordered_map registered_functions_ + TF_GUARDED_BY(cache_mu_); + + std::unordered_map> + component_function_libraries_ TF_GUARDED_BY(cache_mu_); + absl::flat_hash_map device_cache_ + TF_GUARDED_BY(device_cache_mu_); + std::unordered_map>> + remove_function_notifiers_ TF_GUARDED_BY(remove_function_notifiers_mu_); + + // Whether we should compute RunMetadata. + std::atomic should_store_graphs_{false}; + mutex metadata_mu_; + std::unique_ptr run_metadata_ TF_GUARDED_BY(metadata_mu_); + GraphCollector graph_collector_; + std::atomic log_device_placement_; + std::atomic allow_soft_placement_; + + // Information related to step containers. + std::atomic num_active_steps_; + std::unique_ptr step_container_ + TF_GUARDED_BY(metadata_mu_); + + EagerExecutor default_executor_; + mutable mutex executor_map_mu_; + // Not owned. + std::unordered_map thread_local_executor_ + TF_GUARDED_BY(executor_map_mu_); + std::unordered_map> + has_cleanup_ TF_GUARDED_BY(executor_map_mu_); + + const bool log_memory_; + + // The table of local rendezvous instances for intra-process communication. 
+ // This make sures only one local rendezvous instance exists per step id. + LocalRendezvousCache local_rendezvous_cache_; + + // Whether to use same rendezvous instance across function/eager executions. + std::atomic reuse_rendezvous_for_functions_{false}; + mutable mutex global_rendezvous_mu_; + + // Keeps alive the global rendezvous object. + core::RefCountPtr global_rendezvous_for_functions_ + TF_GUARDED_BY(global_rendezvous_mu_); + + Env* const env_; + + OwnedOrUnownedHelper collective_executor_mgr_; + +#if !defined(IS_MOBILE_PLATFORM) + std::vector GetRemoteContexts() TF_LOCKS_EXCLUDED(remote_state_mu_); + bool IsRemoteContextsEmpty() TF_LOCKS_EXCLUDED(remote_state_mu_); + void CloseAndClearAllRemoteContexts(); + void CloseRemoteContexts(const std::vector& remote_contexts, + uint64 context_id, uint64 context_view_id); + + // TODO(b/184375824): clean up parameter order for better readability. + absl::Status SetMasterContextState( + std::unique_ptr server, WorkerEnv* worker_env, + std::shared_ptr worker_session, + std::unique_ptr remote_eager_workers, + std::unique_ptr remote_device_manager, + uint64 context_id, uint64 context_view_id, + tsl::core::RefCountPtr r, + /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs, + DistributedFunctionLibraryRuntime* cluster_flr, + std::unique_ptr> + remote_mgr); + + // The server_ is not const since we release it when the context is destroyed. + // Therefore the server_ object is not marked as const (even though it should + // be). + std::unique_ptr server_; + WorkerEnv* worker_env_ = nullptr; + std::shared_ptr worker_session_; + + mutable mutex remote_state_mu_; + + uint64 context_id_ TF_GUARDED_BY(remote_state_mu_); + // The view id of an eager context should be set to 0 when context is created, + // and continuously incremented when context with the same context_id gets + // updated. The view id should be consistent between master and workers. + uint64 context_view_id_ TF_GUARDED_BY(remote_state_mu_); + std::vector remote_contexts_ TF_GUARDED_BY(remote_state_mu_); + std::unique_ptr remote_eager_workers_ + TF_GUARDED_BY(remote_state_mu_); + + int keep_alive_secs_ TF_GUARDED_BY(remote_state_mu_); + std::atomic sleep_for_secs_; + + std::unique_ptr keep_alive_thread_; + mutex keep_alive_thread_shutdown_mu_; + condition_variable keep_alive_thread_cv_; + bool shutting_down_ TF_GUARDED_BY(keep_alive_thread_shutdown_mu_) = false; + + std::unique_ptr> + remote_mgr_; + bool is_master_ TF_GUARDED_BY(remote_state_mu_); + + // Maps from a remote worker to a list of parsed device filters. + std::unordered_map> + cluster_device_filters_ TF_GUARDED_BY(remote_state_mu_); + + // A distributed manager that helps setup, update, and check liveness of + // member tasks in the cluster. + std::unique_ptr distributed_manager_; + +#endif // IS_MOBILE_PLATFORM + + // For a multi device function, the target device of each input is unknown + // until the function is instantiated on the default function device. + // If false, eagerly copy all remote inputs to the default function device; + // if true, lazily copy remote inputs to their target devices to avoid + // redundant copies. + bool lazy_copy_function_remote_inputs_ = false; + bool use_send_tensor_rpc_; + const bool pin_small_ops_to_cpu_; + + // Function that will be invoked in destructor to deallocate resources related + // to this context. 
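The `keep_alive_thread_`, `keep_alive_thread_cv_` and `shutting_down_` members above implement the usual interruptible background loop. A self-contained plain-C++ illustration of that shutdown pattern (not TensorFlow code; the class name and one-second period are invented):

    #include <chrono>
    #include <condition_variable>
    #include <mutex>
    #include <thread>

    class KeepAlive {
     public:
      KeepAlive() : worker_([this] { Loop(); }) {}
      ~KeepAlive() {
        {
          std::lock_guard<std::mutex> l(mu_);
          shutting_down_ = true;
        }
        cv_.notify_one();  // wake the sleeping worker immediately
        worker_.join();
      }

     private:
      void Loop() {
        std::unique_lock<std::mutex> l(mu_);
        while (!shutting_down_) {
          // Send a keep-alive ping here (elided), then sleep until the next
          // period or until the destructor requests shutdown.
          cv_.wait_for(l, std::chrono::seconds(1));
        }
      }

      std::mutex mu_;
      std::condition_variable cv_;
      bool shutting_down_ = false;
      std::thread worker_;  // declared last so the members above exist first
    };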
+ std::function resource_deallocator_ = nullptr; + bool run_eager_op_as_function_; + bool jit_compile_rewrite_; + + // Controls the behavior of + // `EagerContext::RegisterFunction(AbstractFunction*)` in distributed + // settings. + // + // By default, each abstract function will be registered on all workers in + // a cluster. If the environment variable + // `TF_EAGER_REGISTER_ABSTRACT_FUNCTIONS_LOCAL_ONLY=1` is set, each abstract + // function will be registered on the local worker only. + // + // In the common case that all functions are initially dispatched to + // a local device, the `ProcessFunctionLibraryRuntime` + // will ensure that the precise dependencies of that function are shipped to + // the remote device. Since PFLR instantiation often involves optimization, + // passes such as lowering control flow and inlining function calls, this will + // result in (1) sending a substantially smaller set of functions to each + // worker, and (2) the unoptimized functions never being called. + // + // Therefore setting `TF_EAGER_REGISTER_ABSTRACT_FUNCTIONS_LOCAL_ONLY=1` can + // significantly reduce both the startup time and the memory footprint on + // remote workers by avoiding the shipping of unneeded functions. + // + // TODO(b/326251557): Infer automatically when it is necessary to register a + // function or its dependencies on remote hosts; then remove the environment + // variable. + bool register_abstract_functions_local_only_; +}; + +inline EagerContext* ContextFromInterface(ImmediateExecutionContext* context) { + return down_cast(context); +} + +namespace internal { +struct EagerContextDeleter { + void operator()(EagerContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using EagerContextPtr = + std::unique_ptr; + +// Sets the EagerContext owned by the current Python eager Context (see +// TFE_Py_SetEagerContext in python/eager/pywrap_tfe.h). This is always called +// in tandem with TFE_Py_SetEagerContext (but not called by it, because its +// py_context argument is opaque). +// +// Do not use this function in production. It is only intended for testing. +// (see _reset_context in context.py). +// +// Not thread-safe. +void SetCEagerContext(EagerContext* ctx); + +// Returns the EagerContext owned by the current Python eager Context (see +// TFE_Py_SetEagerContext in pywrap_tfe.h). +// +// Not thread-safe. +EagerContext* GetCEagerContext(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context_distributed_manager.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context_distributed_manager.h new file mode 100644 index 00000000..9db43d9e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/context_distributed_manager.h @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_DISTRIBUTED_MANAGER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_DISTRIBUTED_MANAGER_H_ + +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_distributed_manager.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/platform/status.h" + +#if !defined(IS_MOBILE_PLATFORM) +#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" +#include "xla/tsl/distributed_runtime/preemption/preemption_notifier.h" +#endif // !IS_MOBILE_PLATFORM + +namespace tensorflow { +#if !defined(IS_MOBILE_PLATFORM) +class EagerContext; +class ServerDef; + +class EagerContextDistributedManager + : public ImmediateExecutionDistributedManager { + public: + explicit EagerContextDistributedManager(EagerContext* context) + : context_(context) {} + + // When running in a distributed context, `init_timeout_in_ms` requests the + // amount of time to wait for remote workers to respond. + + absl::Status SetOrUpdateServerDef( + const ServerDef& server_def, bool reset_context, int keep_alive_secs, + int64_t init_timeout_in_ms, int retries, + bool clear_existing_contexts = false) override; + + absl::Status InitializeLocalOnlyContext(const ServerDef& server_def, + int keep_alive_secs) override; + + absl::Status EnableCollectiveOps(const ServerDef& server_def) override; + + absl::Status CheckRemoteAlive(const std::string& remote_task_name, + bool* is_alive) override; + + tsl::CoordinationServiceAgent* GetCoordinationServiceAgent() override { + return coordination_service_agent_; + } + void SetCoordinationServiceAgent(tsl::CoordinationServiceAgent* agent) { + coordination_service_agent_ = agent; + } + void SetPreemptionNotifier( + std::unique_ptr notifier) { + preemption_notifier_ = std::move(notifier); + } + + private: + EagerContext* context_; + // Owned by context_->GetServer()->worker_env()->session_mgr. + tsl::CoordinationServiceAgent* coordination_service_agent_ = nullptr; + std::unique_ptr preemption_notifier_; +}; +#endif // !IS_MOBILE_PLATFORM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_DISTRIBUTED_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/copy_to_device_node.h new file mode 100644 index 00000000..37d943b2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/copy_to_device_node.h @@ -0,0 +1,95 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_ + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h" + +namespace tensorflow { + +class CopyToDeviceNode : public EagerNode { + public: + CopyToDeviceNode(TensorHandle* src, TensorHandle* dst, Device* dstd, + const EagerContext& ctx, bool async, bool mirror) + : EagerNode(), + src_(src), + dst_(dst), + dstd_(dstd), + ctx_(ctx), + async_(async), + mirror_(mirror) { + if (async_) { + src_->Ref(); + dst_->Ref(); + } + } + + ~CopyToDeviceNode() override { + if (async_) { + src_->Unref(); + dst_->Unref(); + } + } + + absl::Status Run() override { + tensorflow::Tensor tensor; + tsl::profiler::ScopedMemoryDebugAnnotation op_annotation( + "eager::CopyToDeviceNode", "dynamic", tensor.dtype(), + [&tensor]() { return tensor.shape().DebugString(); }); + TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &tensor)); + if (!async_ && mirror_) { + absl::Status s = dst_->AddLocalMirror(std::move(tensor), dstd_); + // If a mirror was added since we called HasLocalMirror then just return + // and ignore the error. + if (s.ok() || (s.code() == error::Code::ALREADY_EXISTS)) { + return absl::OkStatus(); + } + return s; + } else { + return dst_->SetTensor(std::move(tensor), dstd_); + } + } + + void Abort(absl::Status status) override { dst_->Poison(status, dstd_); } + + string DebugString() const override { + string out = "[CopyToDeviceNode]"; + strings::StrAppend(&out, " src_tensor: ", src_->DebugString()); + strings::StrAppend(&out, ", dst_tensor: ", dst_->DebugString()); + strings::StrAppend(&out, ", dst_device: ", dstd_ ? dstd_->name() : "[]"); + return out; + } + + TensorHandle* dst() { return dst_; } + + private: + TensorHandle* src_; + TensorHandle* dst_; + Device* dstd_; + const EagerContext& ctx_; + bool async_; + bool mirror_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device.h new file mode 100644 index 00000000..2f4f5acc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device.h @@ -0,0 +1,134 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_H_ + +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +class TensorHandle; +class EagerOperation; +class CustomDeviceTensorHandle; + +// Custom devices intercept the execution of operations (the `Execute` method), +// typically implemented with one or more of the custom device's own executions. +class CustomDevice { + public: + virtual ~CustomDevice() = default; + virtual const string& name() = 0; + virtual absl::Status CopyTensorToDevice( + ImmediateExecutionTensorHandle* tensor, + ImmediateExecutionTensorHandle** result) = 0; + + virtual absl::Status CopyTensorFromDevice( + ImmediateExecutionTensorHandle* tensor, const string& target_device_name, + ImmediateExecutionTensorHandle** result) = 0; + + virtual absl::Status Execute(const ImmediateExecutionOperation* op, + ImmediateExecutionTensorHandle** retvals, + int* num_retvals) = 0; + + // Creates a packed TensorHandle from a group of custom device TensorHandles, + // one of which is on this custom device. + virtual absl::Status Pack(absl::Span handles, + ImmediateExecutionTensorHandle** result) = 0; + + // Returns true signifying to pin to the current custom device. + // Returns false to pin to the physical device. + virtual absl::StatusOr ShallPinToThisDevice( + const ImmediateExecutionOperation* op) = 0; +}; + +// Custom devices do many of the same things as physical Devices, but have a +// much more restricted interface. We pass around ambiguous pointers since +// operations may be placed either on custom or physical devices. +using VariantDevice = std::variant; + +// Indicates either HostCPU or an unset physical device. We never set a null +// CustomDevice*. +const VariantDevice kVariantDeviceNull = static_cast(nullptr); + +// A tensor handle produced by a custom device. Generally they can only be +// consumed by executing an operation on the same custom device that produced it +// originally, or by attempting to copy the handle off the custom device. +// +// TODO(allenl): Currently custom devices are tied to the eager C API. They +// should be renamed op handlers and subclass AbstractTensorHandle instead so +// they are eager/graph agnostic. +// +// full_type_ is not set by the constructor (because it is not currently +// needed). If full type information is needed in the future, the constructor +// could use map_dtype_to_child_of_tensor() from core/framework/types.h to set +// it based on dtype. Update test CustomDevice.TestTensorHandle in +// custom_device_test.cc if this changes. +class CustomDeviceTensorHandle : public ImmediateExecutionTensorHandle { + public: + CustomDeviceTensorHandle(ImmediateExecutionContext* context, + CustomDevice* device, tensorflow::DataType dtype) + : ImmediateExecutionTensorHandle(kCustomDevice), + context_(context), + device_(device), + dtype_(dtype) {} + + // TODO(allenl): Should this be a generic method of + // ImmediateExecutionTensorHandle to support TFE_TensorHandleDevicePointer? 
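The pure-virtual `CustomDevice` interface above is what op handlers implement. A hedged skeleton (assumes a TensorFlow build; `LoggingDevice` is an invented name, every override simply reports Unimplemented to keep the sketch short, and where this diff's extraction dropped template arguments, e.g. the Span element type, the types used here are assumptions):

    class LoggingDevice : public tensorflow::CustomDevice {
     public:
      explicit LoggingDevice(tensorflow::string name) : name_(std::move(name)) {}

      const tensorflow::string& name() override { return name_; }

      absl::Status CopyTensorToDevice(
          tensorflow::ImmediateExecutionTensorHandle* tensor,
          tensorflow::ImmediateExecutionTensorHandle** result) override {
        return tensorflow::errors::Unimplemented("copy onto LoggingDevice");
      }

      absl::Status CopyTensorFromDevice(
          tensorflow::ImmediateExecutionTensorHandle* tensor,
          const tensorflow::string& target_device_name,
          tensorflow::ImmediateExecutionTensorHandle** result) override {
        return tensorflow::errors::Unimplemented("copy off LoggingDevice");
      }

      absl::Status Execute(const tensorflow::ImmediateExecutionOperation* op,
                           tensorflow::ImmediateExecutionTensorHandle** retvals,
                           int* num_retvals) override {
        return tensorflow::errors::Unimplemented("execute on LoggingDevice");
      }

      absl::Status Pack(
          absl::Span<tensorflow::ImmediateExecutionTensorHandle*> handles,
          tensorflow::ImmediateExecutionTensorHandle** result) override {
        return tensorflow::errors::Unimplemented("pack on LoggingDevice");
      }

      absl::StatusOr<bool> ShallPinToThisDevice(
          const tensorflow::ImmediateExecutionOperation* op) override {
        return false;  // defer to the physical device
      }

     private:
      tensorflow::string name_;
    };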
+ virtual void* DevicePointer() const = 0; + + tensorflow::DataType DataType() const override { return dtype_; } + tensorflow::FullTypeDef FullType() const override { return full_type_; } + absl::Status Shape(PartialTensorShape* shape) const override; + absl::Status NumElements(int64_t* num_elements) const override; + + const char* DeviceName(absl::Status* status) const override { + return device_->name().c_str(); + } + const char* BackingDeviceName(absl::Status* status) const override { + return device_->name().c_str(); + } + CustomDevice* device() const { return device_; } + const char* DeviceType(absl::Status* status) const override; + int DeviceId(absl::Status* status) const override; + + AbstractTensorInterface* Resolve(absl::Status* status) override; + + // For LLVM style RTTI. + static bool classof(const AbstractTensorHandle* ptr) { + return ptr->getKind() == kCustomDevice; + } + + protected: + const DeviceNameUtils::ParsedName* ParsedName(absl::Status* status) const; + + ImmediateExecutionContext* const context_; + CustomDevice* const device_; + const tensorflow::DataType dtype_; + tensorflow::FullTypeDef full_type_; + + mutable std::optional parsed_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device_op_handler.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device_op_handler.h new file mode 100644 index 00000000..6c38e50d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/custom_device_op_handler.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_OP_HANDLER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_OP_HANDLER_H_ + +#include +#include + +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/eager/custom_device.h" +#include "tensorflow/core/lib/core/status.h" +namespace tensorflow { + +// TODO(tfrt-devs): Figure out a way to unify it with OpHandler in TFRT. +class CustomDeviceOpHandler { + public: + ~CustomDeviceOpHandler() = default; + // Register a new custom device. + absl::Status RegisterCustomDevice(const string& device_name, + std::unique_ptr device); + + // Find the custom device from given name. Return true if it finds one. 
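[Editor's aside] The `classof()` hook above follows the LLVM-style RTTI convention: each handle stores a kind tag at construction, and `classof()` lets a `dyn_cast`-like helper check it without enabling C++ RTTI. A minimal self-contained illustration of that convention (the types, `Kind` values, and `dyn_cast` helper are invented for the sketch; the `CustomDeviceOpHandler` declaration continues below).

```cpp
#include <iostream>

// A tag-based hierarchy in the LLVM-RTTI style used by AbstractTensorHandle
// and CustomDeviceTensorHandle above.
class Handle {
 public:
  enum Kind { kEager, kCustomDevice };
  explicit Handle(Kind k) : kind_(k) {}
  virtual ~Handle() = default;
  Kind getKind() const { return kind_; }

 private:
  const Kind kind_;
};

class CustomHandle : public Handle {
 public:
  CustomHandle() : Handle(kCustomDevice) {}
  // The hook a dyn_cast-style helper consults.
  static bool classof(const Handle* h) { return h->getKind() == kCustomDevice; }
};

// Minimal dyn_cast: returns nullptr when the runtime kind does not match.
template <typename To, typename From>
To* dyn_cast(From* p) {
  return To::classof(p) ? static_cast<To*>(p) : nullptr;
}

int main() {
  CustomHandle c;
  Handle plain(Handle::kEager);
  std::cout << (dyn_cast<CustomHandle>(&c) != nullptr) << "\n";      // 1
  std::cout << (dyn_cast<CustomHandle>(&plain) != nullptr) << "\n";  // 0
}
```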
+ bool FindCustomDeviceFromName(const string& name, + CustomDevice** device) const; + + absl::Status Execute(ImmediateExecutionOperation* op, + ImmediateExecutionTensorHandle** retvals, + int* num_retvals); + + ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionContext* context, + ImmediateExecutionTensorHandle* handle, const char* device_name, + absl::Status* status); + + // Determine whether to place an op on a custom device. This method is + // exposed as public for test only. + absl::Status MaybePinToCustomDevice( + CustomDevice** device, const ImmediateExecutionOperation& op) const; + + void Clear(); + + private: + std::unordered_map> custom_devices_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CUSTOM_DEVICE_OP_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_executor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_executor.h new file mode 100644 index 00000000..cec897b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_executor.h @@ -0,0 +1,291 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/version.h" + +namespace tensorflow { + +class AsyncEagerNode; +class AsyncRemoteExecuteNode; +namespace eager { +class EagerClient; +} + +// A unit of execution for the EagerExecutor class below. Example subclasses +// encapsulate execution of a TFE_Op, or copying a TFE_TensorHandle from one +// device to another. +class EagerNode { + public: + EagerNode() = default; + + virtual ~EagerNode() = default; + + // Prepares the node when adding it into EagerExecutor. If any errors happens, + // EagerExecutor will abort the node immediately. + virtual absl::Status Prepare() { return absl::OkStatus(); } + + // Runs the computation corresponding to this node and blocks till the + // execution is done. + virtual absl::Status Run() = 0; + + // Called when this node will not be run due to some error contained in + // `status`. `status` must not be OK. 
+ // For example, if the node would have computed some tensors in the Run(), + // it should poison the corresponding tensor handles in this method. + virtual void Abort(absl::Status status) = 0; + + // Returns nullptr iff this Eager node is synchronous. + virtual AsyncEagerNode* AsAsync() { return nullptr; } + virtual AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() { return nullptr; } + + virtual string DebugString() const = 0; + + // Indicates whether a node failure should make the executor unusable. + virtual bool Fatal() const { return true; } +}; + +class AsyncEagerNode : public EagerNode { + public: + using EagerNode::EagerNode; // Lift EagerNode constructors. + + // This node will be cleaned up once the done callback is called. + virtual void RunAsync(StatusCallback done) = 0; + + AsyncEagerNode* AsAsync() final { return this; } + + absl::Status Run() final { + return errors::Unimplemented("Don't call AsyncEagerNode::Run()."); + } +}; + +class AsyncRemoteExecuteNode : public AsyncEagerNode { + public: + AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() final { return this; } + + virtual const eager::EagerClient* eager_client() const = 0; + virtual bool needs_remote_inputs() const = 0; + virtual bool allow_multiple_pending_requests() const = 0; + virtual absl::Status SyncExecutors() = 0; +}; + +// A class for handling async execution (see TFE_ContextSetAsync). +// Note that this class is thread-safe. +// TODO(agarwal): TFE_OpAddInput may currently block if it tries to access the +// device of the input handle. Fix that. +// TODO(agarwal): Implement support for control dependencies. +// TODO(agarwal): Support out-of-order execution and dispatching multiple +// EagerNode in parallel. +// TODO(agarwal): Implement optimizations over EagerNode traces. +class EagerExecutor { + public: + explicit EagerExecutor(bool async, bool enable_streaming_enqueue = true, + int in_flight_nodes_limit = 0); + + ~EagerExecutor(); + + // Puts this in a shutdown state. In this state, AddOrExecute() will return an + // error and not add new EagerNodes. After putting this in the shutdown state, + // blocks until all pendings nodes have finished running. + // Returns the status of executing pending nodes. + // If async was not enabled, aborts and destroys all pending nodes. + absl::Status ShutDown(); + + bool Async() const; + + bool StreamingEnqueue() const; + + // Inline execute node if executor is in sync mode. + absl::Status SyncExecute(EagerNode* node); + + // - Async Mode: schedules `node` for execution. + // - Sync Mode: inline execute the 'node' directly. + // If an error occurs (e.g. EagerExecutor has already been shut down), the + // `node` is not added to this executor and its Abort() method is called. + absl::Status AddOrExecute(std::unique_ptr node); + + // Blocks till all currently pending ops are done. + // In particular, if EnableAsync() has not beed called, it will not return + // until that happens (and pendings, at the time of call, nodes finish + // running). If this executor has already been shut down, its final status is + // returned. + absl::Status WaitForAllPendingNodes(); + + // Clears all currently set errors which re-enables async execution. + void ClearError(); + + // Returns Status based on any errors that occurred during async execution. + absl::Status status() const { + if (ok()) return absl::OkStatus(); + + tf_shared_lock l(node_queue_mutex_); + return status_; + } + + bool ok() const TF_NO_THREAD_SAFETY_ANALYSIS { return ok_; } + + // On destruction, runs `callback`. 
Used by the EagerContext for clearing + // thread-local executors. + void AddCleanup(intptr_t key, std::function callback); + // If `key` (e.g. a context) is destroyed before the executor, the associated + // callbacks are no longer safe to run. + void RemoveCleanups(intptr_t key); + + private: + // Possible states for this executor. + // Executor starts in kActive state. When Shutdown() is called, Executor + // is put in the kShuttingDown state. In this state, the executor thread + // continues to run, but no new nodes are accepted. Finally, when all nodes + // are drained, the executor is put in the kShutDown state, which causes the + // thread to exit. + // If this executor is destroyed without calling shutdown first, it + // transitions to kShutDown state immediately which causes the thread to exit + // without running pending nodes. + enum class ExecutorState { + kActive, + kShuttingDown, + kShutDown, + }; + + enum class NodeState { + kPENDING, + kSCHEDULED, + kDONE, + }; + + struct NodeItem : core::RefCounted { + // Unique id generated in EagerExecutor::Add(). If item1.id < item2.id, it + // means item1.node is added before item2.node. + uint64 id; + std::unique_ptr node; + NodeState state; + }; + + const char* StateStringLocked() + TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); + + void NodeDone(const core::RefCountPtr& item, + const absl::Status& status, bool from_queue); + void NotifyWaiters(uint64 id) TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); + + // Starts execution of pending EagerNodes. This function loops till executor + // state_ is set to kShutDown. If any errors are encountered, these are set + // inside `status_`. The loop blocks anytime there are no pending nodes, or if + // `status_` is not ok. + void Run(); + + absl::Status RunItem(core::RefCountPtr item, bool from_queue); + absl::Status MoveToUnfinished(core::RefCountPtr item, + bool from_queue); + + // The impl of WaitForAllPendingNodes + // `lock` is the lock that holds node_queue_mutex_. + absl::Status WaitForAllPendingNodesLocked(mutex_lock* lock) + TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); + + absl::Status WaitImpl(bool wait_all, uint64 node_id); + + std::atomic next_node_id_; + + mutable mutex node_queue_mutex_; + + // Used to signal that some EagerNodes are pending execution. + condition_variable nodes_pending_ TF_GUARDED_BY(node_queue_mutex_); + // Used to signal that some EagerNodes are done. + condition_variable nodes_done_ TF_GUARDED_BY(node_queue_mutex_); + + // Queue of pending NodeItems. Ordered by NodeItem::id. + std::queue> node_queue_ + TF_GUARDED_BY(node_queue_mutex_); + + // Ordered by NodeItem::id. + std::map, std::less> + unfinished_nodes_ TF_GUARDED_BY(node_queue_mutex_); + + // `status_` is set based on any errors raised during execution of a + // EagerNode. It remains set until ClearError is called. + absl::Status status_ TF_GUARDED_BY(node_queue_mutex_); + std::atomic ok_ TF_GUARDED_BY(node_queue_mutex_); + + // Map from id of a EagerNode to condition_variables (not owned by the map). + // These condition_variables are notified and removed when that EagerNode is + // done executing, or if an error is found in execution of any EagerNode. + // The map is ordered by id. + std::multimap> + node_done_notifications_ TF_GUARDED_BY(node_queue_mutex_); + + // thread_exited_notification_ is notified by the `thread_` right before it + // exits. 
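[Editor's aside] A deliberately simplified, single-threaded illustration of the executor behavior documented above: failures latch in `status_`, anything submitted afterwards gets `Abort()`ed instead of run, and `ClearError()` re-enables execution. The real `EagerExecutor` does this across a worker thread in async mode; the types below are toys and Abseil's status library is the only dependency.

```cpp
#include <iostream>
#include <memory>

#include "absl/status/status.h"

// Minimal stand-ins for EagerNode / EagerExecutor.
class ToyNode {
 public:
  virtual ~ToyNode() = default;
  virtual absl::Status Run() = 0;
  virtual void Abort(absl::Status s) {
    std::cout << "aborted: " << s.ToString() << "\n";
  }
};

class ToySyncExecutor {
 public:
  // Simplified AddOrExecute(): inline execution; once an error is latched,
  // later nodes are aborted instead of run.
  absl::Status AddOrExecute(std::unique_ptr<ToyNode> node) {
    if (!status_.ok()) {
      node->Abort(status_);
      return status_;
    }
    status_ = node->Run();
    return status_;
  }

  // Simplified ClearError(): re-enables execution after a failure.
  void ClearError() { status_ = absl::OkStatus(); }
  absl::Status status() const { return status_; }

 private:
  absl::Status status_ = absl::OkStatus();
};

class OkNode : public ToyNode {
 public:
  absl::Status Run() override { std::cout << "ran\n"; return absl::OkStatus(); }
};

class FailingNode : public ToyNode {
 public:
  absl::Status Run() override { return absl::InternalError("boom"); }
};

int main() {
  ToySyncExecutor exec;
  exec.AddOrExecute(std::make_unique<OkNode>());       // ran
  exec.AddOrExecute(std::make_unique<FailingNode>());  // latches the error
  exec.AddOrExecute(std::make_unique<OkNode>());       // aborted: INTERNAL: boom
  exec.ClearError();
  exec.AddOrExecute(std::make_unique<OkNode>());       // ran
}
```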
+ Notification thread_exited_notification_; + + // When state_ is set to kShutDown, it indicates that `thread_` should stop as + // soon as it is done executing the current EagerNode. + ExecutorState state_ TF_GUARDED_BY(node_queue_mutex_) = + ExecutorState::kActive; + + // Thread object that calls the `Run` method in async mode.This thread runs + // until state_ is set to kShuttingDown. It is `nullptr` in sync mode. + const std::unique_ptr thread_; + + // Last device where remote function with remote inputs was executed. + const eager::EagerClient* last_eager_client_; + + const bool enable_async_wait_for_remote_function_; + + // Enable sending remote executions through streaming enqueue. + const bool enable_streaming_enqueue_; + + // Callbacks to run on destruction. + absl::flat_hash_map>> cleanups_; + + // Limit the number of in-flight nodes. When the number of in-flight eager + // async nodes reach this number, enqueuing to the eager async queue is + // blocked. + const int64_t in_flight_nodes_limit_; +}; + +inline bool EagerExecutor::Async() const { return thread_ != nullptr; } + +inline bool EagerExecutor::StreamingEnqueue() const { + return enable_streaming_enqueue_; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h new file mode 100644 index 00000000..bd709847 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h @@ -0,0 +1,110 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OP_REWRITE_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OP_REWRITE_REGISTRY_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/eager/eager_operation.h" + +namespace tensorflow { + +// Eager op rewrites should inherit from this class and +// implement the Run method. +class EagerOpRewrite { + public: + EagerOpRewrite(string name, string file, string line) { + debug_info_.name = name; + debug_info_.file = file; + debug_info_.line = line; + } + + virtual ~EagerOpRewrite() = default; + + // To be implemented by an Eager op rewrite pass. + virtual absl::Status Run( + EagerOperation* orig_op, + std::unique_ptr* out_op) = 0; + + // Holds information about the rewrite registration. + struct DebugInfo { + string name, file, line; + }; + + // Returns information about the registered Eager op rewrite. + DebugInfo GetDebugInfo() const { return debug_info_; } + + private: + DebugInfo debug_info_; +}; + +class EagerOpRewriteRegistry { + public: + // Phases at which the Eager op rewrite pass should run. 
+ enum Phase { + PRE_EXECUTION = 0, // right before executing an eager op + POST_PLACEMENT = 1 // after device placement + }; + + // Add a rewrite pass to the registry. + void Register(Phase phase, int32_t ordinal, + std::unique_ptr pass); + + // Run the rewrite pass registered for a given phase. + absl::Status RunRewrite(Phase phase, EagerOperation* orig_op, + std::unique_ptr* out_op); + + // Returns the global registry of rewrite passes. + static EagerOpRewriteRegistry* Global(); + + private: + static constexpr int32_t kNumPhases = 2; + // Holds all the registered Eager op rewrites and their ordinal numbers. + std::array, int32>>, + kNumPhases> + rewrites_; +}; + +namespace eager_rewrite_registration { + +// This class is used to register a new Eager Op rewrite. +class EagerRewriteRegistration { + public: + EagerRewriteRegistration(EagerOpRewriteRegistry::Phase phase, int32_t ordinal, + std::unique_ptr pass) { + EagerOpRewriteRegistry::Global()->Register(phase, ordinal, std::move(pass)); + } +}; + +} // namespace eager_rewrite_registration + +#define REGISTER_REWRITE(phase, ordinal, rewrite) \ + REGISTER_REWRITE_UNIQ_HELPER(__COUNTER__, __FILE__, __LINE__, phase, \ + ordinal, rewrite) + +#define REGISTER_REWRITE_UNIQ_HELPER(ctr, file, line, phase, ordinal, rewrite) \ + REGISTER_REWRITE_UNIQ(ctr, file, line, phase, ordinal, rewrite) + +#define REGISTER_REWRITE_UNIQ(ctr, file, line, phase, ordinal, rewrite) \ + static ::tensorflow::eager_rewrite_registration::EagerRewriteRegistration \ + register_rewrite_##ctr(phase, ordinal, \ + ::std::unique_ptr<::tensorflow::EagerOpRewrite>( \ + new rewrite(#rewrite, file, #line))) + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OP_REWRITE_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_operation.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_operation.h new file mode 100644 index 00000000..b81b0fc7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/eager_operation.h @@ -0,0 +1,347 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
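[Editor's aside] The three-level `REGISTER_REWRITE` macro chain above exists so that `__COUNTER__` (and `__FILE__`/`__LINE__`) expand before being token-pasted into a unique static-variable name; each pass then registers itself at static-initialization time. A stripped-down, self-contained version of that idiom follows; the registry and macro names are invented for the sketch.

```cpp
#include <iostream>
#include <string>
#include <vector>

// A tiny global registry, standing in for EagerOpRewriteRegistry::Global().
struct ToyRegistry {
  static ToyRegistry* Global() {
    static ToyRegistry* r = new ToyRegistry;
    return r;
  }
  void Register(std::string name) { names.push_back(std::move(name)); }
  std::vector<std::string> names;
};

// Registrar whose constructor runs during static initialization.
struct ToyRegistration {
  explicit ToyRegistration(std::string name) {
    ToyRegistry::Global()->Register(std::move(name));
  }
};

// Two helper levels force __COUNTER__ to expand before token pasting,
// mirroring REGISTER_REWRITE -> REGISTER_REWRITE_UNIQ_HELPER -> ..._UNIQ.
#define TOY_REGISTER(name) TOY_REGISTER_UNIQ_HELPER(__COUNTER__, name)
#define TOY_REGISTER_UNIQ_HELPER(ctr, name) TOY_REGISTER_UNIQ(ctr, name)
#define TOY_REGISTER_UNIQ(ctr, name) \
  static ::ToyRegistration toy_registration_##ctr(name)

TOY_REGISTER("arithmetic_optimizer");
TOY_REGISTER("layout_rewriter");

int main() {
  for (const auto& n : ToyRegistry::Global()->names) std::cout << n << "\n";
}
```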
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ + +#include +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "absl/types/variant.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/kernel_and_device.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +namespace tensorflow { + +class EagerOperation : public ImmediateExecutionOperation { + public: + explicit EagerOperation(tensorflow::EagerContext* ctx) + : ImmediateExecutionOperation(kEager), ctx_(*ctx), is_function_(false) {} + ~EagerOperation() override { + for (ImmediateExecutionTensorHandle* h : inputs_) { + h->Unref(); + } + } + + void Release() override { delete this; } + + void Clear() override; + absl::Status Reset(const char* op, const char* raw_device_name) override { + return Reset(op, raw_device_name, false, nullptr); + } + + const string& Name() const override { return attrs_.op_name(); } + + const string& DeviceName() const override { return device_name_; } + + ImmediateExecutionContext* GetContext() const override { return &ctx_; } + + const DeviceNameUtils::ParsedName& GetDeviceParsedName() const { + return device_parsed_name_; + } + + // Replaces the previous device name with the given one (see + // AbstractOperation::SetDeviceName for more details). + // + // This also resets the internal device pointer, unless the given name refers + // to a known custom device, in which case the internal device pointer is + // updated to that device. + absl::Status SetDeviceName(const char* name) override; + + void SetDevice(VariantDevice device) { + device_ = device; + device_name_ = std::visit( + [](auto* device) { return device == nullptr ? "" : device->name(); }, + device); + DeviceNameUtils::ParseFullName(device_name_, &device_parsed_name_); + // TODO(b/154133594): Due to intricacies of external logic, we can not + // set this do device_name_ as it would be natural, because we need the + // next call to SetDeviceName to reset the device pointer. 
+ last_set_device_name_ = "\177"; // DEL (an invalid value) + } + + absl::Status SetAttrValue(const char* attr_name, const AttrValue& value); + + absl::Status AddInput(AbstractTensorHandle* input) override; + absl::Status AddInputList( + absl::Span inputs) override; + absl::Status SetInput(size_t index, + ImmediateExecutionTensorHandle* input) override; + absl::Span GetInputs() const override; + bool HasCustomDeviceInput() const override { + return custom_device_tensor_handles_count_ > 0; + } + absl::Status Execute(absl::Span retvals, + int* num_retvals) override; + const tensorflow::OpDef* OpDef() const override { return op_def_; }; + + absl::Status SetAttrString(const char* attr_name, const char* data, + size_t length) override; + absl::Status SetAttrInt(const char* attr_name, int64_t value) override; + absl::Status SetAttrFloat(const char* attr_name, float value) override; + absl::Status SetAttrBool(const char* attr_name, bool value) override; + absl::Status SetAttrType(const char* attr_name, DataType value) override; + absl::Status SetAttrShape(const char* attr_name, const int64_t* dims, + int num_dims) override; + absl::Status SetAttrFunction(const char* attr_name, + const AbstractOperation* value) override; + absl::Status SetAttrFunctionName(const char* attr_name, const char* data, + size_t length) override; + absl::Status SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) override; + absl::Status SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) override; + absl::Status SetAttrFloatList(const char* attr_name, const float* values, + int num_values) override; + absl::Status SetAttrIntList(const char* attr_name, const int64_t* values, + int num_values) override; + absl::Status SetAttrTypeList(const char* attr_name, const DataType* values, + int num_values) override; + absl::Status SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) override; + absl::Status SetAttrShapeList(const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values) override; + absl::Status SetAttrFunctionList( + const char* attr_name, + absl::Span values) override; + + absl::Status InputLength(const char* input_name, int* length) override; + absl::Status OutputLength(const char* output_name, int* length) override; + + const AbstractOpAttrs* GetOpAttrs() const override; + void AddAttrs(const AbstractOpAttrs* op_attrs) override; + + void SetStackTrace(ManagedStackTrace stack_trace) override { + stack_trace_ = stack_trace; + } + + std::optional GetStackTrace() override { + return stack_trace_; + } + + absl::Status Reset( + const char* op, const char* device_name, bool remote, + EagerExecutor* executor, + absl::optional eager_func_params = std::nullopt); + + bool is_function() const { return is_function_; } + bool colocation_exempt() const { return colocation_exempt_; } + + tensorflow::EagerContext& EagerContext() const { return ctx_; } + + const FunctionLibraryDefinition* FuncLibDef() const { + if (eager_func_params_.has_value() && + eager_func_params_.value().func_lib_def_override) { + return eager_func_params_.value().func_lib_def_override; + } else { + return ctx_.FuncLibDef(); + } + } + + const FunctionDef* GetFunctionDef() const { + if (is_function_) { + return FuncLibDef()->Find(attrs_.op_name()); + } else { + return nullptr; + } + } + + AttrBuilder* MutableAttrs() { return &attrs_; } + const AttrBuilder& Attrs() const { return attrs_; } + + // TensorHandleInputs and 
MutableTensorHandleInputs first check that all + // inputs are TensorHandles, i.e. that there are no custom device inputs. They + // return a bad status otherwise. + absl::Status TensorHandleInputs( + const absl::InlinedVector** inputs) const; + absl::Status MutableTensorHandleInputs( + absl::InlinedVector** inputs); + + const absl::InlinedVector& Inputs() + const { + return inputs_; + } + + void UpdateInput(int i, TensorHandle* h); + + // This is useful if we want the EagerOperation to point to a different + // function. + void UpdateName(const string& name) { + op_name_ = name.c_str(); + attrs_.set_op_name(name); + } + + // Like TensorHandles, EagerOperations may be placed either on a virtual + // CustomDevice or on a physical Device. + VariantDevice Device() const { return device_; } + + // Indicates whether the op is assigned to a device that is local to the + // current host. + bool IsLocal() const; + + CancellationManager* GetCancellationManager() const { + return cancellation_manager_; + } + void SetCancellationManager( + CancellationManager* cancellation_manager) override { + cancellation_manager_ = cancellation_manager; + } + + // Assign step_id value only if op has valid step id. + // When eager_func_params.has_value() returns true, we can directly overwrite + // its step id according to Op's step id (if not default value). However, when + // eager_func_params.has_value() returns false, we need to first create a new + // EagerFuncParams object for it before assigning step_id; otherwise, + // directly assigning step_id in this case leaves eager_func_params to be + // in a weird state where: + // (1) eager_func_params.has_value() returns false, but + // (2) eager_func_params->step_id.has_value() returns true. + void SetStepId(int64_t step_id) override { + assert(is_function()); + if (step_id != EagerContext::kGlobalRendezvousId) { + if (eager_func_params_.has_value()) { + eager_func_params_->step_id = step_id; + } else { + eager_func_params_ = EagerFunctionParams{ + kInvalidOpId, /*is_component_function=*/false, step_id}; + } + } else { + LOG(WARNING) << "SetStepId() should not receive a gloabl rendezvous id."; + } + } + + EagerExecutor& Executor() { return *executor_; } + + string DebugString() const; + + const absl::optional& eager_func_params() const { + return eager_func_params_; + } + + // Op name recorded for memory debugging purpose. + const char* op_name() const { return op_name_; } + + // For LLVM style RTTI. + static bool classof(const AbstractOperation* ptr) { + return ptr->getKind() == kEager; + } + + private: + void AddTensorHandle(ImmediateExecutionTensorHandle* h); + + const tensorflow::OpDef* GetOpDef(absl::Status* status); + + void ClearInferenceState() { + op_def_ = nullptr; + inference_arg_idx_ = 0; + inference_attrs_.clear_no_resize(); + } + + absl::Status MaybeInferSingleInputAttrs( + ImmediateExecutionTensorHandle* handle); + absl::Status InferInputListAttrs(int num_inputs); + + void InferSingleTypeInputListAttrs(const OpDef::ArgDef& input_def, + DataType dtype, int num_inputs); + void InferMixedTypeInputListAttrs(const OpDef::ArgDef& input_def, + const std::vector& dtypes); + + tensorflow::EagerContext& ctx_; + const char* op_name_ = nullptr; + AttrBuilder attrs_; + const AttrTypeMap* attr_types_; + + // The number of custom device TensorHandle inputs. These inputs need to be + // processed by CustomDeviceOpHandler first. + int custom_device_tensor_handles_count_ = 0; + absl::InlinedVector inputs_; + + // The last device name given to SetDeviceName. 
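[Editor's aside] The `SetStepId()` comment above describes a `std::optional` pitfall: assigning through a disengaged optional's member would leave an object whose `has_value()` is false while a field "inside" it was supposedly set. A small hedged sketch of the guarded-update pattern the comment prescribes; the `ToyFuncParams` struct and function name are illustrative only.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Illustrative stand-in for EagerFunctionParams.
struct ToyFuncParams {
  int64_t op_id = -1;
  bool is_component_function = false;
  std::optional<int64_t> step_id;
};

// Guarded update: modify the existing params if present, otherwise construct
// a fresh object so the optional is engaged before any of its fields is set.
void SetStepId(std::optional<ToyFuncParams>& params, int64_t step_id) {
  if (params.has_value()) {
    params->step_id = step_id;
  } else {
    params = ToyFuncParams{/*op_id=*/-1, /*is_component_function=*/false,
                           step_id};
  }
}

int main() {
  std::optional<ToyFuncParams> params;  // disengaged, like a plain eager op
  SetStepId(params, 42);
  std::cout << params.has_value() << " " << *params->step_id << "\n";  // 1 42
}
```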
+ // This is used to avoid having to re-process the same device in repeated + // calls to SetDeviceName. + string last_set_device_name_; + + // The operation's device name. + // This contains the named passed to SetDeviceName until device_ is set, + // at which point it contains the device_ name. + string device_name_; + + // The parsed device name. + // This will always contain the result of + // DeviceNameUtils::ParseFullName(device_name_). + DeviceNameUtils::ParsedName device_parsed_name_; + + // The operation's device. + // This is set by the execution device placement logic, and should conform + // with the contents of device_name_. Once it is set, the device_name_ is + // updated accordingly. + VariantDevice device_; + + std::optional stack_trace_; + bool is_function_; // Conceptually const, but can't be because of Reset + bool colocation_exempt_; + CancellationManager* cancellation_manager_ = nullptr; // Not owned. + EagerExecutor* executor_; // Not owned. + + std::optional eager_func_params_; + + // Inference information + const tensorflow::OpDef* op_def_; // op definition from protobuf + int inference_arg_idx_; // arg definition index for the next input to be + // added + gtl::FlatSet inference_attrs_; // attributes inferred so far +}; + +inline void EagerOperation::UpdateInput(int i, TensorHandle* h) { + ImmediateExecutionTensorHandle** slot = &inputs_[i]; + ImmediateExecutionTensorHandle* existing = *slot; + if (existing != h) { + h->Ref(); + existing->Unref(); + *slot = h; // Update inputs_[i] to h + } +} + +inline EagerOperation* OperationFromInterface( + ImmediateExecutionOperation* operation) { + return down_cast(operation); +} + +inline const EagerOperation* OperationFromInterface( + const ImmediateExecutionOperation* operation) { + return down_cast(operation); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute.h new file mode 100644 index 00000000..cbd1e0c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute.h @@ -0,0 +1,90 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
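[Editor's aside] `EagerOperation::UpdateInput()` above is the standard manual-refcount slot swap: skip the work when the handle is unchanged, and `Ref()` the incoming handle before `Unref()`-ing the old one so the slot never transiently drops its only reference. A standalone sketch with a toy refcounted type; the names are illustrative.

```cpp
#include <cassert>
#include <iostream>

// Minimal intrusive refcount, standing in for a core::RefCounted handle.
class ToyHandle {
 public:
  void Ref() { ++refs_; }
  void Unref() {
    if (--refs_ == 0) delete this;
  }
  int refs() const { return refs_; }

 private:
  int refs_ = 1;  // Starts owned by its creator.
};

// Mirrors the shape of UpdateInput(): no-op on identical handles, and the new
// handle is Ref()'d before the old one is Unref()'d.
void UpdateSlot(ToyHandle** slot, ToyHandle* h) {
  ToyHandle* existing = *slot;
  if (existing != h) {
    h->Ref();
    existing->Unref();
    *slot = h;
  }
}

int main() {
  ToyHandle* a = new ToyHandle;  // refcount 1 (creator)
  ToyHandle* b = new ToyHandle;  // refcount 1 (creator)
  ToyHandle* slot = a;
  a->Ref();              // the slot holds its own reference to a
  UpdateSlot(&slot, b);  // slot now references b; a drops back to 1
  assert(slot == b && a->refs() == 1 && b->refs() == 2);
  std::cout << "a=" << a->refs() << " b=" << b->refs() << "\n";
  slot->Unref();  // release the slot's reference to b
  a->Unref();     // a deleted
  b->Unref();     // b deleted
}
```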
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_ + +#include "absl/container/inlined_vector.h" +#include "absl/types/span.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/eager/kernel_and_device.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Utility function that executes a fully constructed EagerOperation. +// There are a few possible different combinations of how things can be +// executed: +// - Async (the op context is configured to schedule asynchronously) +// Eager execute should return quickly after scheduling this operation to +// execute. +// - Remote (the op device is on a remote task) +// Eager execute will send an RPC to execute the op on a remote device. +// Note that in the Async + Remote case, EagerExecute should still return +// quickly, but it will schedule the op to be executed remotely. +// +// 'retvals' must point to a pre-allocated array of TensorHandle* and +// '*num_retvals' should be set to the size of this array. It is an error if +// the size of 'retvals' is less than the number of outputs. This call sets +// *num_retvals to the number of outputs. +absl::Status EagerExecute(EagerOperation* op, TensorHandle** retvals, + int* num_retvals); + +// Low-level utility to execute the kernel specified by `kernel` on +// `kernel->device()`, with the inputs op_inputs, in the context 'ctx'. +absl::Status EagerKernelExecute( + EagerContext* ctx, const absl::InlinedVector& op_inputs, + const absl::optional& eager_func_params, + const core::RefCountPtr& kernel, + GraphCollector* graph_collector, CancellationManager* cancellation_manager, + absl::Span retvals, + const absl::optional& stack_trace = {}); + +// Low-level utility to copy a tensor handle from one device to another. If +// successful, result TensorHandle will be populated. If the caller requests for +// the mirror flag, EagerCopyToDevice will attempt to add a mirror to the +// original handle and update *result to point to h. Since this is not +// guaranteed, callers should always use the value in *result. +absl::Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, + EagerExecutor* executor, Device* device, + bool mirror, TensorHandle** result); + +// Utility function that executes a fully constructed EagerOperation +// asynchronously on the local task. This function works differently from +// EagerExecute in several ways: +// - It supports local execution only. +// - It returns after launching the eager operation to run asynchronously. +// Different from EagerExecute with async context that apends the operation +// to the end of the eager executor schedule queue, this call bypasses the +// executor logic and directly launches op execution. Ops running through +// this call does NOT have an ordering and can be executed in parallel. +// - It takes a StatusCallback which will be triggered after execution with the +// execution status. +// +// Does not support custom device. +// +// 'retvals' must point to a pre-allocated array of TensorHandle* and +// '*num_retvals' should be set to the size of this array. 
It is an error if +// the size of 'retvals' is less than the number of outputs. This call sets +// *num_retvals to the number of outputs. +void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals, + int* num_retvals, StatusCallback done); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute_node.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute_node.h new file mode 100644 index 00000000..52bf1ecf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/execute_node.h @@ -0,0 +1,252 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_ + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include +#include +#include +#include +#include +#include +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/kernel_and_device.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" +#endif // IS_MOBILE_PLATFORM + +namespace tensorflow { + +class ExecuteNodeArgs : public EagerKernelArgs { + public: + explicit ExecuteNodeArgs(int count) : EagerKernelArgs(count) {} + + absl::Status Init(EagerContext* ctx, + const absl::InlinedVector& op_inputs, + const core::RefCountPtr& kernel); + + absl::Status GetLocalArg(const FunctionArgIndex& index, + Tensor* val) const override; + + bool HasRemoteOrPackedInputs() const override { + return has_remote_inputs_ || has_packed_inputs_; + }; + +#if !defined(IS_MOBILE_PLATFORM) + absl::Status GetRemoteArg(const FunctionArgIndex& index, + eager::RemoteTensorHandle* val) const override { + return serialize_remote_handle_(index, val); + } +#endif // IS_MOBILE_PLATFORM + + private: +#if !defined(IS_MOBILE_PLATFORM) + // Returns whether `handle` is a remote handle or has a remote mirror on + // `input_device` + bool 
IsRemote(EagerContext* ctx, Device* input_device, TensorHandle* handle); +#endif // IS_MOBILE_PLATFORM + + // Initialize a packed TensorHandle which is the `index`-th argument. + absl::Status InitPackedHandle(int index, EagerContext* ctx, + Device* input_device, + TensorHandle* packed_handle); + + bool has_remote_inputs_ = false; + bool has_packed_inputs_ = false; + // Maps from the index of a packed arg to a list of sub-args. + absl::flat_hash_map> packed_args_; +#if !defined(IS_MOBILE_PLATFORM) + std::function + serialize_remote_handle_; +#endif // IS_MOBILE_PLATFORM +}; + +class ExecuteNode : public EagerNode { + public: + ExecuteNode(EagerContext* ctx, + const absl::InlinedVector& inputs, + const absl::optional& eager_func_params, + const core::RefCountPtr& kernel, + GraphCollector* graph_collector, + CancellationManager* cancellation_manager, + absl::Span retvals, + std::optional stack_trace) + : EagerNode(), + ctx_(ctx), + inputs_(inputs), + eager_func_params_(eager_func_params), + kernel_(kernel), + graph_collector_(graph_collector), + cancellation_manager_(cancellation_manager), + retvals_(retvals), + stack_trace_(stack_trace) {} + + absl::Status Run() override { + int i = 0; + for (TensorHandle* h : inputs_) { + if (h->RefCountIsOne()) { + const Device* d = ctx_->CanonicalDevice(kernel_->InputDevice(i)); + absl::Status s = h->Unprotect(d); + if (!s.ok()) { + VLOG(1) << "Unable to unprotect tensor: " << s; + } + } + ++i; + } + return EagerKernelExecute(ctx_, inputs_, eager_func_params_, kernel_, + graph_collector_, cancellation_manager_, retvals_, + stack_trace_); + } + + void Abort(absl::Status status) override {} + + std::string DebugString() const override { + std::string out = "[ExecuteNode]"; + strings::StrAppend(&out, " kernel: ", kernel_->name()); + return out; + } + + private: + EagerContext* ctx_; + const absl::InlinedVector& inputs_; + const absl::optional& eager_func_params_; + const core::RefCountPtr& kernel_; + GraphCollector* graph_collector_; + CancellationManager* const cancellation_manager_; + absl::Span retvals_; + std::optional stack_trace_; +}; + +class AsyncExecuteNode : public EagerNode { + public: + AsyncExecuteNode(EagerContext* ctx, + const absl::InlinedVector& inputs, + const absl::optional& eager_func_params, + core::RefCountPtr kernel, + GraphCollector* graph_collector, + CancellationManager* cancellation_manager, + absl::Span retvals, + std::optional stack_trace) + : EagerNode(), + ctx_(ctx), + inputs_(inputs), + eager_func_params_(eager_func_params), + kernel_(std::move(kernel)), + graph_collector_(graph_collector), + cancellation_manager_(cancellation_manager), + stack_trace_(stack_trace) { + // Copy the output handles, since the container for them might get + // destroyed. + for (auto handle : retvals) { + handle->Ref(); + retvals_.push_back(handle); + } + + // This is required to ensure that the tensor handles stay alive across + // the execution. 
+ for (auto handle : inputs_) { + handle->Ref(); + } + } + + ~AsyncExecuteNode() override { + for (auto handle : retvals_) { + handle->Unref(); + } + + for (auto handle : inputs_) { + handle->Unref(); + } + } + + absl::Status Run() override { + int i = 0; + for (TensorHandle* h : inputs_) { + if (h->RefCountIsOne()) { + const Device* d = ctx_->CanonicalDevice(kernel_->InputDevice(i)); + absl::Status s = h->Unprotect(d); + if (!s.ok()) { + VLOG(1) << "Unable to unprotect tensor: " << s; + } + } + ++i; + } + absl::Status status = EagerKernelExecute( + ctx_, inputs_, eager_func_params_, kernel_, graph_collector_, + cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_); + if (!status.ok()) { + if (stack_trace_.has_value()) { + errors::SetStackTrace( + status, stack_trace_->ToStackFrames( + {}, {}, /*reverse_traversal=*/false, /*limit=*/-1)); + } + Abort(status); + return status; + } + // If status is ok, EagerKernelExecute would have called SetTensor on + // all the output handles. + return absl::OkStatus(); + } + + void Abort(absl::Status status) override { + int i = 0; + for (auto handle : retvals_) { + handle->Poison(status, ctx_->CanonicalDevice(kernel_->OutputDevice(i))); + ++i; + } + } + + std::string DebugString() const override { + std::string out = "[AsyncExecuteNode]"; + strings::StrAppend(&out, " kernel: ", kernel_->name()); + return out; + } + + private: + EagerContext* ctx_; + absl::InlinedVector inputs_; + const absl::optional eager_func_params_; + core::RefCountPtr kernel_; + GraphCollector* graph_collector_; + CancellationManager* const cancellation_manager_; + std::optional stack_trace_; + absl::InlinedVector retvals_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/kernel_and_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/kernel_and_device.h new file mode 100644 index 00000000..c13e1524 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -0,0 +1,426 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_ + +// Support for eager execution of TensorFlow kernels. 
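[Editor's aside] `AsyncExecuteNode` above pins its inputs and output handles by `Ref()`-ing them in the constructor and `Unref()`-ing them in the destructor, and on `Abort()` it poisons every output so downstream readers observe the failure. A condensed standalone sketch of that ownership-and-poisoning shape; the types are toys and Abseil's status library is the only real dependency.

```cpp
#include <iostream>
#include <vector>

#include "absl/status/status.h"

// Toy handle: refcounted and "poisonable", loosely like TensorHandle.
struct ToyHandle {
  int refs = 1;
  absl::Status status = absl::OkStatus();
  void Ref() { ++refs; }
  void Unref() { --refs; }  // No deletion; good enough for a sketch.
  void Poison(absl::Status s) { status = s; }
};

// Pin inputs/outputs for the node's lifetime; propagate failure into every
// output on Abort().
class ToyAsyncNode {
 public:
  ToyAsyncNode(std::vector<ToyHandle*> inputs, std::vector<ToyHandle*> retvals)
      : inputs_(std::move(inputs)), retvals_(std::move(retvals)) {
    for (ToyHandle* h : inputs_) h->Ref();
    for (ToyHandle* h : retvals_) h->Ref();
  }
  ~ToyAsyncNode() {
    for (ToyHandle* h : inputs_) h->Unref();
    for (ToyHandle* h : retvals_) h->Unref();
  }
  void Abort(absl::Status s) {
    for (ToyHandle* h : retvals_) h->Poison(s);
  }

 private:
  std::vector<ToyHandle*> inputs_;
  std::vector<ToyHandle*> retvals_;
};

int main() {
  ToyHandle in, out;
  {
    ToyAsyncNode node({&in}, {&out});
    node.Abort(absl::CancelledError("executor shut down"));
  }  // The node released its references here.
  std::cout << out.status.ToString() << " refs=" << out.refs << "\n";
}
```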
+ +#include +#include +#include +#include +#include +#include +#include + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "absl/memory/memory.h" +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/util/managed_stack_trace.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" +#endif // IS_MOBILE_PLATFORM + +namespace tensorflow { + +static constexpr const char* const kOutputsOnOpDevice = "_OutputsOnOpDevice"; + +class ProcessFunctionLibraryRuntime; +class FunctionLibraryRuntime; + +const int64_t kInvalidOpId = -1; + +// This struct is used for: +// 1. Setting `op_id` and `step_id`, `is_component_function` for single-client +// remote function scenario, +// 2. Setting `step_id` for multi-client parallel_device scenario. +// 3. Supplying an overriding, private `FunctionLibraryDefinition` for component +// functions. +struct EagerFunctionParams { + int64_t op_id = kInvalidOpId; + bool is_component_function; + std::optional step_id = std::nullopt; + FunctionLibraryDefinition* func_lib_def_override = + nullptr; // Not owned (owned by `EagerContext`). If not null, functions + // called by the function will be looked up in this library. +}; + +class EagerKernelArgs : public FunctionArgsInterface { + public: + EagerKernelArgs() = default; + + explicit EagerKernelArgs(int count) : tensor_args_(count) {} + + explicit EagerKernelArgs(absl::InlinedVector&& tensor_args) + : tensor_args_(std::move(tensor_args)) {} + + ~EagerKernelArgs() override = default; + + bool HasRemoteOrPackedInputs() const override { return false; }; + TensorValue* MutableInput(int i) { return &tensor_args_[i]; } + + absl::Status GetLocalArg(const FunctionArgIndex& index, + Tensor* val) const override; + + std::vector GetLocalTensors() const override; + + const absl::InlinedVector* GetTensorValues() const { + return &tensor_args_; + } + + protected: + absl::InlinedVector tensor_args_; +}; + +typedef std::variant EagerKernelRet; + +// KernelAndDevice encapsulates the logic needed to run a computation eagerly. +// The computation can be a single instantiated kernel (implemented by +// KernelAndDeviceOp below) or a multi-device function (implemented by +// KernelAndDeviceFunc below). +// +// Also see: +// https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h +// and +// https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h +class KernelAndDevice : public core::RefCounted { + public: + // Populates this with a kernel appropriate for 'ndef'. + // + // The provided FunctionLibraryRuntime MUST outlive all calls to + // Run() on the returned KernelAndDevice. 
+ virtual absl::Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) = 0; + + // Non-multi-device functions are run using regular CallOp and look like + // primitive operations from KernelAndDevice perspective. + // `flr` can be nullptr if the operation is not run on any specific device + // (currently can happen only for multi-device functions). + KernelAndDevice( + FunctionLibraryRuntime* flr, + std::function)>* runner, + std::unique_ptr collective_executor, + Device* host_cpu_device) + : device_(flr == nullptr ? nullptr : flr->device()), + host_cpu_device_(host_cpu_device), + flr_(flr), + collective_executor_(std::move(collective_executor)), + runner_(runner) {} + + // Not thread safe. + ~KernelAndDevice() override = default; + + virtual bool IsFunction() { return false; } + + virtual bool IsCrossProcess() { return false; } + + // TODO(ashankar): Handle list-valued inputs. + virtual absl::Status Run( + ScopedStepContainer* step_container, const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + const absl::optional& stack_trace, + tsl::CoordinationServiceAgent* coordination_service_agent) = 0; + + // Execute kernel asynchronously when applicable. Different from `Run` which + // blocks the caller thread and waits for the execution of the op/function, + // `RunAsync` could return before finishing the execution. The `done` callback + // will be triggered once the op/function execution finishes. + // Currently, calling RunAsync on ops might not honor the asynchronicity when + // it is called on an instance with only sync implementation, execute the + // kernel synchronously and then call the callback with the return status + // from sync execution. + virtual void RunAsync( + ScopedStepContainer* step_container, const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + tsl::CoordinationServiceAgent* coordination_service_agent, + StatusCallback done) = 0; + + virtual Device* InputDevice(int i) const = 0; + virtual Device* OutputDevice(int idx) const = 0; + // If idx'th output is a resource, returns the device backing the resource. + // Else, returns nullptr. + virtual Device* OutputResourceDevice(int idx) const = 0; + + // Returns the kernel that will be used to run this. + // Returns nullptr if this will be run using function library runtime. + virtual const OpKernel* kernel() const = 0; + + // Returns the device on which this kernel will run. In the case of + // multi-device functions, this is the default device that is passed to the + // placer but actual computation can happen on a different set of devices. + // Also, outputs can be produced on devices different from what this method + // returns. 
+ Device* device() const { return device_; } + + virtual const DataTypeVector& input_dtypes() const = 0; + virtual const DataTypeVector& output_dtypes() const = 0; + + virtual int num_inputs() const = 0; + virtual int num_outputs() const = 0; + virtual const string& name() const = 0; + + protected: + std::function)>* get_runner() const; + + Device* const device_; // can be null + Device* const host_cpu_device_; // non-null + FunctionLibraryRuntime* const flr_; // can be null + const std::unique_ptr collective_executor_; + + private: + std::function)>* const runner_; // can be null +}; + +// Represents an op kernel and the device it will be run on. +class KernelAndDeviceOp final : public KernelAndDevice { + public: + KernelAndDeviceOp( + tensorflow::Rendezvous* rendezvous, bool log_memory, + FunctionLibraryRuntime* flr, + std::function)>* runner, + std::unique_ptr collective_executor, + Device* host_cpu_device) + : KernelAndDevice(flr, runner, std::move(collective_executor), + host_cpu_device), + rendezvous_(rendezvous), + log_memory_(log_memory) {} + + ~KernelAndDeviceOp() override = default; + + absl::Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) override; + + absl::Status Run( + ScopedStepContainer* step_container, const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + const absl::optional& stack_trace, + tsl::CoordinationServiceAgent* coordination_service_agent) override; + + void RunAsync(ScopedStepContainer* step_container, + const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + tsl::CoordinationServiceAgent* coordination_service_agent, + StatusCallback done) override { + // Trivial async implementation on top of the sync version + done(Run(step_container, inputs, outputs, cancellation_manager, + eager_func_params, {}, coordination_service_agent)); + } + + const OpKernel* kernel() const override { return kernel_.get(); } + + Device* InputDevice(int i) const override; + Device* OutputDevice(int idx) const override; + Device* OutputResourceDevice(int idx) const override; + + const DataTypeVector& input_dtypes() const override { + return kernel_->input_types(); + } + const DataTypeVector& output_dtypes() const override { + return kernel_->output_types(); + } + int num_inputs() const override { return kernel_->num_inputs(); } + int num_outputs() const override { return kernel_->num_outputs(); } + const string& name() const override { return kernel_->name(); } + + private: + std::unique_ptr kernel_; + bool is_distributed_communication_op_; + absl::InlinedVector input_alloc_attrs_; + std::vector input_devices_; + absl::InlinedVector output_alloc_attrs_; + Rendezvous* const rendezvous_; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_; + const bool log_memory_; +}; + +// Represents a multi-device function. Functions can also be run using +// various function-calling kernels including CallOp and PartitionedCallOp. +// In such cases, KernelAndDeviceOp is used. +class KernelAndDeviceFunc : public KernelAndDevice { + public: + // `flr` can be nullptr. + // `pflr` must not be nullptr. + // `host_cpu_device` must not be nullptr. 
+ KernelAndDeviceFunc( + FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr, + std::vector input_devices, + absl::flat_hash_map*> composite_devices, + std::unordered_map + input_resource_dtypes_and_shapes, + std::function)>* runner, + std::unique_ptr collective_executor, + Device* host_cpu_device, const string& name, + const bool outputs_on_op_device, + const bool allow_small_function_optimizations, + const bool allow_control_flow_sync_execution, + const bool shape_inference_on_tfe_dialect_import, + const bool int_args_and_retvals_on_device, + std::optional xla_compile_device_type, + const bool allow_soft_placement, Rendezvous::Factory rendezvous_factory, + std::function get_op_id) + : KernelAndDevice(flr, runner, std::move(collective_executor), + host_cpu_device), + pflr_(pflr), + handle_(kInvalidHandle), + outputs_on_op_device_(outputs_on_op_device), + allow_small_function_optimizations_(allow_small_function_optimizations), + allow_control_flow_sync_execution_(allow_control_flow_sync_execution), + shape_inference_on_tfe_dialect_import_( + shape_inference_on_tfe_dialect_import), + int_args_and_retvals_on_device_(int_args_and_retvals_on_device), + xla_compile_device_type_(xla_compile_device_type), + allow_soft_placement_(allow_soft_placement), + input_devices_(std::move(input_devices)), + composite_devices_(std::move(composite_devices)), + input_resource_dtypes_and_shapes_( + std::move(input_resource_dtypes_and_shapes)), + name_(name), + rendezvous_factory_(std::move(rendezvous_factory)), + get_op_id_(std::move(get_op_id)) {} + + ~KernelAndDeviceFunc() override; + + bool IsFunction() override { return true; }; + + bool IsCrossProcess() override { return is_cross_process_; } + + absl::Status InstantiateFunc( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params); + + absl::Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) override; + + absl::Status Run( + ScopedStepContainer* step_container, const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + const absl::optional& stack_trace, + tsl::CoordinationServiceAgent* coordination_service_agent) override; + + void RunAsync(ScopedStepContainer* step_container, + const EagerKernelArgs& inputs, + std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + tsl::CoordinationServiceAgent* coordination_service_agent, + StatusCallback done) override; + + const OpKernel* kernel() const override { return nullptr; } + + Device* InputDevice(int i) const override; + Device* OutputDevice(int idx) const override; + Device* OutputResourceDevice(int idx) const override; + + const DataTypeVector& input_dtypes() const override { return input_dtypes_; } + const DataTypeVector& output_dtypes() const override { + return output_dtypes_; + } + int num_inputs() const override { return input_dtypes_.size(); } + int num_outputs() const override { return output_dtypes_.size(); } + const string& name() const override { return name_; }; + + private: + std::shared_ptr PrepareForRun( + ScopedStepContainer* step_container, std::vector* outputs, + CancellationManager* cancellation_manager, + const absl::optional& eager_func_params, + const absl::optional& stack_trace, + tsl::CoordinationServiceAgent* coordination_service_agent, + tsl::core::RefCountPtr* rendezvous); 
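[Editor's aside] `KernelAndDeviceOp::RunAsync()` above is the documented "trivial async on top of sync" adapter: it invokes the blocking `Run()` and hands the resulting status to the `done` callback, without introducing any real concurrency. A standalone sketch of that adapter shape, with `std::function` standing in for `StatusCallback` and a toy kernel in place of the real class.

```cpp
#include <functional>
#include <iostream>

#include "absl/status/status.h"

using ToyStatusCallback = std::function<void(absl::Status)>;

class ToyKernel {
 public:
  // Blocking execution.
  absl::Status Run(int x) {
    return x >= 0 ? absl::OkStatus()
                  : absl::InvalidArgumentError("negative input");
  }

  // Callback-style entry point layered on the synchronous Run(): the caller
  // gets the async signature, but execution completes before RunAsync returns.
  void RunAsync(int x, ToyStatusCallback done) { done(Run(x)); }
};

int main() {
  ToyKernel k;
  k.RunAsync(3, [](absl::Status s) { std::cout << s.ToString() << "\n"; });
  k.RunAsync(-1, [](absl::Status s) { std::cout << s.ToString() << "\n"; });
}
```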
+ + ProcessFunctionLibraryRuntime* const pflr_; // non-null + FunctionLibraryRuntime::Handle handle_; + // Indicates whether the function needs to execute cross process. + bool is_cross_process_; + + // If true, function outputs are explicitly assigned to the default device; + // if false, the output devices are inferred by pflr_. + bool outputs_on_op_device_; + + // If True, allow optimizations which should be targeted at a limited + // set of small functions. (For example, running kernels synchronously can + // be faster under some conditions.) + const bool allow_small_function_optimizations_; + + // If True, allows control nodes to run on the single threaded executor. + const bool allow_control_flow_sync_execution_; + + // TODO(b/176491312): Remove this if shape inference on import flag is + // removed. If True, allows mlir roundtrip to run shape inference on import. + const bool shape_inference_on_tfe_dialect_import_; + + const bool int_args_and_retvals_on_device_; + + const absl::optional xla_compile_device_type_; + + const bool allow_soft_placement_; + + // CPU devices are null. Resource handles' devices are actual backing + // devices. + std::vector output_devices_; + // CPU devices are not null. Resource handles' devices are actual backing + // devices. + std::vector input_devices_; + // Maps from a CompositeDevice name to a list of physical device names. + absl::flat_hash_map*> composite_devices_; + std::unordered_map + input_resource_dtypes_and_shapes_; + + DataTypeVector input_dtypes_; + DataTypeVector output_dtypes_; + string name_; + + Rendezvous::Factory rendezvous_factory_; + std::function get_op_id_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/placement_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/placement_utils.h new file mode 100644 index 00000000..fa51f198 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/placement_utils.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ + +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace eager { + +bool IsColocationExempt(absl::string_view op_name); + +bool IsFunction(absl::string_view op_name); + +// TODO(b/154234908): Unify placement logic. + +// Pin the op to cpu if all op inputs are on the CPU, small (<64 elements) and +// integers (int32/int64). This can be disabled by setting the environment +// variable "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false". 
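As a concrete illustration of the opt-out mentioned in the comment above (a sketch only, not part of this header; `setenv` is the standard POSIX call, and "false" would work just as well as "0" per the comment):

#include <cstdlib>

int main() {
  // Disable small-tensor CPU pinning for this process before the eager
  // runtime reads the environment variable (POSIX setenv; sketch only).
  setenv("TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", "0", /*overwrite=*/1);
  // ... create the eager context and run ops afterwards ...
  return 0;
}

The declaration of MaybePinSmallOpsToCpu that this comment documents follows next.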
+absl::Status MaybePinSmallOpsToCpu( + bool* result, absl::string_view op_name, + absl::Span args, + absl::string_view cpu_device_name); + +// If a resource touching input is specified, all resource-touching ops run in +// the device the resource is, regardless of anything else that has been +// specified. This is identical to the graph mode behavior. +absl::Status MaybePinToResourceDevice(Device** device, + const EagerOperation& op); +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/rendezvous_cache.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/rendezvous_cache.h new file mode 100644 index 00000000..e79171f8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/rendezvous_cache.h @@ -0,0 +1,146 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_RENDEZVOUS_CACHE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_RENDEZVOUS_CACHE_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/refcount.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { + +// The class for caching Rendezvous instances per step_id. +// If the Rendezvous object is destroyed for the step, a new one will be +// created on demand. +template +class RendezvousCache : public tsl::core::WeakRefCounted { + public: + RendezvousCache() = default; + ~RendezvousCache() override { + for (auto& p : table_) { + auto rendez = p.second.GetNewRef(); + if (rendez) { + rendez->StartAbort(tsl::errors::Aborted("Shutdown")); + } + } + } + + // Returns a new Reference. + template + tsl::core::RefCountPtr FindOrCreate(int64_t step_id, + RendezvousCreator create_fn) { + tsl::mutex_lock l(table_lock_); + tsl::core::RefCountPtr rendz = nullptr; + auto iter = table_.find(step_id); + if (iter != table_.end()) { + rendz = iter->second.GetNewRef(); + VLOG(5) << "step_id:" << step_id << " " + << "WeakPtr returned:" << rendz.get(); + if (!rendz) { + table_.erase(iter); + } + } + if (!rendz) { // Deleted or not found + rendz = create_fn(); + VLOG(5) << "step_id:" << step_id << " " + << "Rendezvous not found, inserting a new one." << rendz.get(); + auto cleanup_fn = [weak_cache = tsl::core::WeakPtr(this), + step_id]() { + tsl::core::RefCountPtr cache = weak_cache.GetNewRef(); + if (cache != nullptr) { + // If the rendezvous is released, Find() will clean it up from the + // map. + cache->Find(step_id); + } + }; + table_.insert({step_id, tsl::core::WeakPtr{rendz.get(), cleanup_fn}}); + } + return rendz; + } + + // Returns a new Reference. 
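A minimal sketch of how a caller might use FindOrCreate above. `MyRendezvous` and `device_mgr` are hypothetical stand-ins (any type deriving from tsl::core::WeakRefCounted would do); the cache hands back a new strong reference each call and recreates the entry on demand once the previous rendezvous has been released:

// Hypothetical types for illustration only.
tsl::core::RefCountPtr<tensorflow::RendezvousCache<MyRendezvous>> cache(
    new tensorflow::RendezvousCache<MyRendezvous>());

tsl::core::RefCountPtr<MyRendezvous> rendez =
    cache->FindOrCreate(/*step_id=*/42, [&]() {
      // Invoked only if no live rendezvous exists for this step.
      return tsl::core::RefCountPtr<MyRendezvous>(new MyRendezvous(device_mgr));
    });

// While any strong reference is alive, Find(42) and a repeated FindOrCreate(42)
// return the same underlying rendezvous; once all strong references are gone,
// the next FindOrCreate(42) creates a fresh one via the creator callback.

The Find() accessor documented by the comment above is declared next.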
+ tsl::core::RefCountPtr Find(int64_t step_id) { + tsl::mutex_lock l(table_lock_); + auto iter = table_.find(step_id); + if (iter == table_.end()) return nullptr; + tsl::core::RefCountPtr res = iter->second.GetNewRef(); + // Cleans the record if the rendezvous is already destroyed. + if (res == nullptr) { + table_.erase(iter); + } + return res; + } + + // Removes a Rendezvous weak reference from table. + void Remove(int64_t step_id) { + tsl::mutex_lock l(table_lock_); + table_.erase(step_id); + } + + // Removes a Rendezvous weak reference from table, and abort the rendezvous. + void RemoveAndAbort(int64_t step_id) { + tsl::core::RefCountPtr rendez = nullptr; + { + tsl::mutex_lock l(table_lock_); + auto iter = table_.find(step_id); + if (iter != table_.end()) { + rendez = iter->second.GetNewRef(); + table_.erase(iter); + } + } + if (rendez) { + rendez->StartAbort(tsl::errors::Aborted("Cleanup ", step_id)); + } + } + + void RemoveAll() { + tsl::mutex_lock l(table_lock_); + table_.clear(); + } + + // Returns a list of active step ids. This result is only informative + // at time of the call. The returned vector may contain step ids that have + // been invalidated after the call. + std::vector GetActiveStepIds() { + std::vector list; + tsl::mutex_lock l(table_lock_); + list.reserve(table_.size()); + for (const auto& iter : table_) { + list.push_back(iter.first); + } + return list; + } + + size_t Size() const { + tsl::mutex_lock l(table_lock_); + return table_.size(); + } + + private: + mutable tsl::mutex table_lock_; + absl::flat_hash_map> table_ + TF_GUARDED_BY(table_lock_); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_RENDEZVOUS_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/shape_inference.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/shape_inference.h new file mode 100644 index 00000000..be386f97 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/shape_inference.h @@ -0,0 +1,36 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SHAPE_INFERENCE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SHAPE_INFERENCE_H_ + +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace tensorflow { +namespace eager { + +absl::Status RunShapeInference( + const NodeDef& ndef, const FunctionLibraryDefinition& lib_def, + const absl::InlinedVector& inputs, + const absl::InlinedVector& retvals); + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SHAPE_INFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/small_constants_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/small_constants_optimizer.h new file mode 100644 index 00000000..cb70fb99 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/small_constants_optimizer.h @@ -0,0 +1,42 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SMALL_CONSTANTS_OPTIMIZER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SMALL_CONSTANTS_OPTIMIZER_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" + +namespace tensorflow::small_constants_optimizer { + +// Checks whether small constant optimization is enabled for a tf.function. +bool IsSmallConstantOptimizationEnabled(const FunctionDef& fdef); + +// Generates new FunctionDefs with the boolean input tensors folded as +// constants into the FunctionDef. +std::vector FoldInputTensors( + const FunctionDef& fdef, const FunctionLibraryDefinition& flib); + +// Generates the FunctionDef name for the folded function. +std::string FoldedFunctionName(absl::string_view fname, + absl::string_view input_name, bool input_value); + +} // namespace tensorflow::small_constants_optimizer + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SMALL_CONSTANTS_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/summary_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/summary_optimizer.h new file mode 100644 index 00000000..0b337e04 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/summary_optimizer.h @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SUMMARY_OPTIMIZER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SUMMARY_OPTIMIZER_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" + +namespace tensorflow::summary_optimizer { +namespace internal { + +// Normalizes an edge's name to match the names stored in a NodeDef. +std::string NormalizeEdgeName(absl::string_view name); + +} // namespace internal + +// Returns the name of the input_arg and the bool value that determines whether +// or not to disable summaries. If no such arg exists returns an empty string. +std::pair GetDisableSummariesInputArg( + const FunctionDef& fdef); + +// Generates new FunctionDef(s) with the summaries stripped out. +// This function will traverse all the nested functions and generate a version +// of the nested functions with summaries stripped out. +std::vector StripSummaries(const FunctionDef& fdef, + const FunctionLibraryDefinition& flib); + +// Generates a new function name for the stripped function. +std::string StrippedFunctionName(absl::string_view fname); + +} // namespace tensorflow::summary_optimizer + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_SUMMARY_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle.h new file mode 100644 index 00000000..ca60815d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -0,0 +1,419 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/types/variant.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle_data.h" +#include "tensorflow/core/common_runtime/function.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h" +#endif // IS_MOBILE_PLATFORM +#include "tensorflow/core/framework/tensor.h" + +#include "tensorflow/core/lib/core/stringpiece.h" + +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +class EagerContext; + +// Associates a Tensor and a Device, used in the eager runtime. Internal version +// of the TFE_TensorHandle struct and the python EagerTensor class +// (unrelated to python TensorHandle). +class TensorHandle : public ImmediateExecutionTensorHandle { + // TensorHandle for dtype != DT_RESOURCE + TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, + Device* resource_device, EagerContext* ctx); + // TensorHandle for dtype == DT_RESOURCE + TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, + EagerContext* ctx); + TensorHandle(Device* d, Device* op_device, Device* resource_device, + tensorflow::DataType dtype, EagerContext* ctx); + +#if !defined(IS_MOBILE_PLATFORM) + TensorHandle(int64_t op_id, int32_t output_num, const string& remote_task, + tensorflow::DataType dtype, Device* device, EagerContext* ctx, + bool unknown_device); + TensorHandle(int64_t op_id, int32_t output_num, tensorflow::DataType dtype, + Device* device, bool is_ready, EagerContext* ctx); +#endif // IS_MOBILE_PLATFORM + + public: + // TensorHandle with no assigned device + static TensorHandle* CreateLocalHandle(const tensorflow::Tensor& t); + static TensorHandle* CreateLocalHandle(tensorflow::Tensor&& t, Device* d, + Device* op_device, EagerContext* ctx); + static TensorHandle* CreateLocalHandle(tensorflow::Tensor&& t, Device* d, + Device* op_device, + Device* resource_device, + EagerContext* ctx); + static TensorHandle* CreateEmptyLocalHandle(Device* d, Device* op_device, + Device* resource_device, + tensorflow::DataType dtype, + EagerContext* ctx); + + // Create a handle which packs the given handles of the same dtype and shape. + // If handles are on different devices, assign the packed handle to a + // CompositeDevice. + // + // The new tensor handle shares ownership of the given handle: their reference + // count will be increased by one after a call to `CreatePackedHandle`. + // TODO(b/170414377): Use `TensorHandlePtr` instead. 
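A short usage sketch for the local-handle factories declared above (illustrative only; the no-device overload wraps a host tensor, and the accessors used below are declared further down in this class):

// Wrap a host tensor in a ref-counted TensorHandle (sketch).
tensorflow::Tensor t(tensorflow::DT_FLOAT, tensorflow::TensorShape({2, 2}));
tensorflow::TensorHandle* h = tensorflow::TensorHandle::CreateLocalHandle(t);

const tensorflow::Tensor* view = nullptr;
absl::Status s = h->Tensor(&view);  // Borrow the backing tensor.
if (s.ok()) {
  // ... read *view ...
}
h->Unref();  // Handles are reference counted; release when done.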
+ static absl::Status CreatePackedHandle(std::vector&& handles, + tensorflow::DataType dtype, + const tensorflow::TensorShape& shape, + const string& device_name, + EagerContext* ctx, + TensorHandle** packed_handle); + static absl::Status CreatePackedHandle(std::vector&& handles, + EagerContext* ctx, + TensorHandle** packed_handle); + +#if !defined(IS_MOBILE_PLATFORM) + // An unshaped remote handle refers to a tensor on a remote worker. It's not + // ready until the shape is set. It controls the lifetime of the remote + // tensor. + static TensorHandle* CreateUnshapedRemoteHandle(int64_t op_id, + int32_t output_num, + const string& remote_task, + tensorflow::DataType dtype, + Device* d, EagerContext* ctx, + bool unknown_device = false); + // A lazy remote handle refers to a tensor on a remote worker. The lifetime of + // the remote tensor is controlled by the remote worker, but not by the lazy + // remote handle. Lazy handles are normally created on a default function + // device. + static TensorHandle* CreateLazyRemoteHandle(int64_t op_id, int32_t output_num, + tensorflow::DataType dtype, + Device* d, bool is_ready, + EagerContext* ctx); +#endif // IS_MOBILE_PLATFORM + + // Templated struct `AutoReleaser` in + // core/runtime_fallback/runtime/kernel_utils.h needs a Release() method + // defined. + void Release(); + + tensorflow::DataType DataType() const override; + absl::Status Shape(tensorflow::PartialTensorShape* shape) const override; + absl::Status NumDims(int* num_dims) const override; + absl::Status NumElements(int64_t* num_elements) const override; + absl::Status Dim(int dim_index, int64_t* dim) const override; + + const char* DeviceName(absl::Status* status) const override; + const char* BackingDeviceName(absl::Status* status) const override; + const char* DeviceType(absl::Status* status) const override; + int DeviceId(absl::Status* status) const override; + AbstractTensorInterface* Resolve(absl::Status* status) override; + + // Subclasses may return True to instruct the string formatter + // to use SummarizeValue instead of the NumPy formatter. + bool PreferCustomSummarizer() const override { + return dtype == DT_VARIANT || dtype == DT_RESOURCE; + } + + // Return the Tensor from the default device. + absl::Status Tensor(const tensorflow::Tensor** t) const; + // Return the Tensor from the specified device which could be either the + // default device or a local mirror. The device pointer should be nullptr if + // requesting the HostCPU. + absl::Status TensorFromDevice(const Device* d, + const tensorflow::Tensor** t) const; + + // Return the TensorValue from the specified device which could be either the + // default device or a local mirror. The device pointer should be nullptr if + // requesting the HostCPU. + absl::Status TensorValue(const Device* d, tensorflow::TensorValue* t); + + Device* device() const { return device_; } + Device* op_device() const { return op_device_; } + Device* resource_device() const { return resource_device_; } + int64_t resource_remote_device_incarnation() const { + return resource_remote_device_incarnation_; + } + + // If the devices are unknown at creation time, block until the actual devices + // are set (data is ready). + absl::Status WaitUnknownDevice() const; + + Device* DeviceOrHostCPU(const EagerContext& ctx) const; + + absl::Status Shape(tensorflow::TensorShape* shape); + + absl::Status Unprotect(const Device* d); + + // Checks if a mirror tensor exists for the specified device. 
Mirrors are only + // maintained for local devices, like CPUs & GPUs. Note a mirror may be empty, + // as it is still to be set by an async operation. + bool HasLocalMirror(const Device* d) const; + // Add an empty mirror placeholder for the specified device. The expectation + // is this will be populated by a call to SetTensor. + absl::Status AddEmptyLocalMirror(const Device* d); + // Add a local mirror. This will fail if an empty local mirror was previously + // added. For that case, SetTensor should be used instead. + absl::Status AddLocalMirror(tensorflow::Tensor&& tensor, const Device* d); + +#if !defined(IS_MOBILE_PLATFORM) + bool HasRemoteMirror(const Device* d, uint64 context_view_id) const; + bool HasResourceShapeMirror(const Device* d, uint64 context_view_id) const; + + absl::Status AddUnshapedRemoteMirror(const Device* d, int64_t op_id, + int output_num, + const string& remote_task, + EagerContext* ctx); + absl::Status AddResourceShapeMirror(const Device* d, int64_t op_id, + int output_num, EagerContext* ctx); + + // Return the op_id and output num if the handle refers to a remote tensor. + // If wait_until_ready is true, block until the remote tensor is ready on the + // given remote worker. + absl::Status RemoteAddress(const Device* d, bool wait_until_ready, + int64_t* op_id, int32* output_num) const; + + // Called on an async remote tensor once it's shape has been determined. This + // transitions the tensor handle from a non-ready to a ready state by + // replacing the backing data abstraction to allow for the shape to be + // queried. + // creating a TensorHandle (e.g. a remote output of a remote function). + // This method or Poison must be called exactly once for remote tensors that + // were created without a known shape. + absl::Status SetRemoteShape(const TensorShape& shape, const Device* d, + uint64 context_view_id); + // If op_device is not empty, reset the devices of a remote tensor which is + // created without known devices (e.g. function outputs). + absl::Status SetRemoteShapeAndDevice(const TensorShape& shape, + const Device* d, uint64 context_view_id, + string op_device); + + // Poisons either this handle or a remote mirror with error `status`. + // Poisoning means that the handle will become ready and methods trying + // to access the remote shape will return this error `status`. + // Exactly one of SetRemoteShape or PoisonRemote methods must be called on a + // unshaped handle on a remote device. + void PoisonRemote(absl::Status status, const Device* d, + uint64 context_view_id); +#endif + + // Sets the `tensor` for this async non-ready handle making it ready. + // This method or Poison must be called exactly once for non-ready async + // handles to make them ready. + absl::Status SetTensor(tensorflow::Tensor&& tensor, const Device* d); + + // Poisons either this handle or a local mirror with error `status`. + // Poisoning means that the handle will become ready and methods trying + // to access the actual tensor or shape will return this error `status`. + // Exactly one of SetTensor or Poison methods must be called on a non-ready + // tensor for a specific device. + void Poison(absl::Status status, const Device* d); + + // TODO(b/154282629): Consider moving it to EagerContext. + // Copies to the tensor on the given device `d`, or to host iff `d` is null. 
+ absl::Status CopyToDevice(const EagerContext& ctx, tensorflow::Device* d, + tensorflow::Tensor* output) const; + + absl::Status InferenceShape( + shape_inference::InferenceContext* inference_context, + shape_inference::ShapeHandle* shape_handle); + void SetInferenceShape(shape_inference::InferenceContext* inference_context, + const shape_inference::ShapeHandle& shape_handle); + absl::Status CopyInferenceShape(TensorHandle* other); + + // dtype for the handle. It must be the same as t.dtype() once the handle is + // ready. + const tensorflow::DataType dtype; + + enum HandleType { LOCAL = 0, PACKED = 1, REMOTE = 2 }; + + HandleType Type() const; + string TypeString() const; + + void SetResourceHandleDtypeAndShape( + std::vector dtypes_and_shapes); + + // If this TensorHandle is 1) a local tensor, and 2) a resource handle, + // return data types and shapes of the underlying resource. + absl::Status GetResourceHandleDtypesAndShapes( + std::vector* result); + + // Returns the number of packed handles. 0 if the handle type is not PACKED. + int NumPackedHandles() const; + // It's called on a packed TensorHandle. Extract a handle with the given + // index. + absl::Status ExtractPackedHandle(int index, TensorHandle** handle) const; + + // For LLVM style RTTI. + static bool classof(const AbstractTensorHandle* ptr) { + return ptr->getKind() == kEager; + } + + tensorflow::FullTypeDef FullType() const override { return full_type_; } + + void SetFullType(FullTypeDef& full_type) { full_type_ = full_type; } + + private: + friend class PackedTensorHandleTest; + + TensorHandle(std::vector&& handles, Device* device, + tensorflow::DataType dtype, const tensorflow::TensorShape& shape, + EagerContext* ctx); + + ~TensorHandle() override; + + // The TensorHandleData can either represent a local or remote tensor handle. + // Further, it can be in a non-ready state. It would become ready with a call + // to either SetTensor or SetRemoteShape which replaces the underlying data + // with a ready version of the tensor handle data. + bool IsReady() const; + absl::Status WaitReady(const char* caller) const; + + tensorflow::Device* device_; + + // Device in which the op producing this tensor was executed. Equals to + // device_ for constant tensors. + // Can be nullptr if the op producing this tensor was a function executed + // with function library runtime. + tensorflow::Device* op_device_; + + // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device + // backing the resource. Else resource_device_ is nullptr. + tensorflow::Device* resource_device_; + // Incarnation ID of the resource device if it locates on a remote device, or + // 0 if it locates on a local device. + int64_t resource_remote_device_incarnation_; + + // If true, the handle refers to a remote tensor which is created without + // known devices. The actual devices are set by SetRemoteShape. The devices + // should be accessed once the handle is ready. + const bool unknown_device_ = false; + + mutable mutex mu_; + + // Map of local mirrors. This can include both ready and non-ready mirrors. + std::unordered_map + local_mirrors_ TF_GUARDED_BY(mu_); +#if !defined(IS_MOBILE_PLATFORM) + // TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica + // variable is ready, since we could get the shape locally without remote copy + // then. 
+ std::unordered_map resource_shape_mirrors_ + TF_GUARDED_BY(mu_); + std::unordered_map remote_mirrors_ + TF_GUARDED_BY(mu_); +#endif + + // `ctx` is only guaranteed to be set if the handle is not "ready". This is + // typically true when the handle was produced during async execution. + // `ctx` object is not owned and should outlive this handle. + // + // TODO(b/150614042): Reference count EagerContext to ensure that 'device_' of + // a TensorHandle does not outlive the EagerContext from which it came? + EagerContext* const ctx_; + + // If this TensorHandle 1) is a local tensor, and 2) is a resource handle or + // refers to a remote resource handle, we store data types and shapes for + // the underlying resource. + std::vector handle_dtypes_and_shapes_; + + // A handle data which refers to multiple TensorHandles of the same dtype and + // shape. + class PackedTensorHandleData { + public: + // Initialize handle data from list of tensor handles. + // Ownership of the tensor handles is shared between the + // `PackedTensorHandleData` and the caller (the reference count for the + // given handles is incremented). + // TODO(b/170414377): Use `TensorHandlePtr` instead. + PackedTensorHandleData(std::vector&& handles, + const TensorShape& shape); + + ~PackedTensorHandleData(); + + absl::Status Shape(TensorShape* shape) const; + absl::Status NumDims(int* num_dims) const; + absl::Status Dim(int dim_index, int64_t* dim) const; + absl::Status NumElements(int64_t* num_elements) const; + absl::Status Unprotect(); + bool IsReady() const; + absl::Status WaitReady(const char* caller) const; + void Poison(absl::Status status); + string DebugString() const; + + // Number of packed handles. + int NumPackedHandles() const; + // Extract a handle on the given index. + absl::Status ExtractPackedHandle(int index, TensorHandle** handle) const; + + private: + // TODO(b/170414377): Use `TensorHandlePtr` instead. + const std::vector handles_; + const TensorShape shape_; + + mutable mutex mu_; + absl::Status is_poisoned_ TF_GUARDED_BY(mu_); + }; + + // Does not need synchronization because it can be accessed only after + // WaitReady() has returned. At that point, data_ is immutable. +#if !defined(IS_MOBILE_PLATFORM) + std::variant + data_; +#else + absl::variant data_; +#endif + + PartialTensorShape inference_shape_; + + FullTypeDef full_type_; +}; + +// Returns the device backing the resource. Else, returns nullptr. +Device* GetResourceDevice(const ResourceHandle& handle, EagerContext* ctx); + +class TensorHandleInterface : public ImmediateExecutionTensorHandle { + public: +}; + +template +inline TensorHandle* TensorHandleFromInterface(T* handle) { + return down_cast(handle); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle_data.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle_data.h new file mode 100644 index 00000000..ed58e83a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eager/tensor_handle_data.h @@ -0,0 +1,115 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_DATA_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_DATA_H_ + +#include +#include + +#include "absl/types/variant.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Local Tensor Handle: Handle to a Tensor present on the local host. +class LocalTensorHandleData { + public: + LocalTensorHandleData() : ctrl_(absl::in_place_type) {} + explicit LocalTensorHandleData(tensorflow::Tensor&& t) + : tensor_(std::move(t)), + forwarding_protection_tensor_(tensor_), + ctrl_(absl::in_place_type) {} + + // A local tensor handle should be able to satisfy all of these requests. + absl::Status Tensor(const tensorflow::Tensor** t) const; + absl::Status TensorValue(tensorflow::TensorValue* t); + absl::Status Shape(TensorShape* shape) const; + absl::Status NumDims(int* num_dims) const; + absl::Status Dim(int dim_index, int64_t* dim) const; + absl::Status NumElements(int64_t* num_elements) const; + absl::Status Unprotect(); + + bool IsReady() const { + return std::visit([](auto& data) { return data.IsReady(); }, ctrl_); + } + + absl::Status WaitReady(const char* caller) const { + return std::visit([caller](auto& data) { return data.WaitReady(caller); }, + ctrl_); + } + void Poison(absl::Status status) { + return std::visit([status](auto& data) { data.Poison(status); }, ctrl_); + } + absl::Status IsPoisoned() const { + return std::visit([](auto& data) { return data.IsPoisoned(); }, ctrl_); + } + + absl::Status SetTensor(tensorflow::Tensor&& t); + + string DebugString() const; + + private: + tensorflow::Tensor tensor_; + // TensorHandle has its own reference counting which is distinct from the + // backing Tensor. As a result, if the Tensor reference count is 1 while + // executing an op, the TensorBuffer could be reused for the output. We avoid + // this behavior maintaining another reference count with the + // forwarding_protection_tensor_ Tensor. When Unprotect() is called, we + // release this Tensor to allow forwarding. + tensorflow::Tensor forwarding_protection_tensor_; + + // We distinguish between ready and empty tensors with the ctrl_ variant. + // which contains 2 implementations of the waiting logic. The + // NonBlockingControl is a simple no-op class whereas the BlockingControl + // actually uses a mutex. By using a variant we avoid the overhead of + // constructing and destructing the mutex for ready local tensors. 
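The comment above describes why readiness control lives in a variant; a self-contained sketch of the same variant-dispatch pattern (the names here are illustrative, not the NonBlockingControl/BlockingControl classes that follow):

#include <variant>

// Cheap control for tensors that are ready at construction time: no mutex.
struct AlwaysReadyControl {
  bool IsReady() const { return true; }
};

// Control for async tensors: readiness is guarded by real synchronization
// (a plain bool stands in for the mutex-protected flag of the real class).
struct GatedControl {
  bool ready = false;
  bool IsReady() const { return ready; }
};

using Control = std::variant<AlwaysReadyControl, GatedControl>;

inline bool IsReady(const Control& ctrl) {
  // Same shape as the std::visit dispatch in LocalTensorHandleData above.
  return std::visit([](const auto& impl) { return impl.IsReady(); }, ctrl);
}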
+ class NonBlockingControl { + public: + bool IsReady() const { return true; } + absl::Status WaitReady(const char* caller) const { + return absl::OkStatus(); + } + void Poison(absl::Status status) {} + absl::Status IsPoisoned() const { return absl::OkStatus(); } + }; + + class BlockingControl { + public: + bool IsReady() const { + tf_shared_lock l(mu_); + return is_ready_; + } + void SetReady(); + absl::Status WaitReady(const char* caller) const; + void Poison(absl::Status status); + absl::Status IsPoisoned() const { + tf_shared_lock l(mu_); + return is_poisoned_; + } + + private: + mutable mutex mu_; + bool is_ready_ TF_GUARDED_BY(mu_); + absl::Status is_poisoned_ TF_GUARDED_BY(mu_); + }; + + std::variant ctrl_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/entry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/entry.h new file mode 100644 index 00000000..82bf44ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/entry.h @@ -0,0 +1,141 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ENTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ENTRY_H_ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/gtl/manual_constructor.h" + +namespace tensorflow { + +class Tensor; + +// An Entry store a single input value for an individual kernel invocation in +// an executor. +// +// Either a tensor pointer (pass-by-reference) or a tensor (pass-by-value). +struct Entry { + enum class State { + NO_VALUE = 0, // The default state for a newly-created Entry. + HAS_VALUE, // `this->val` is valid. + HAS_CONST_TENSOR, // `this->const_tensor` is valid. + HAS_REF_TENSOR, // `this->ref_tensor` is valid. 
+ }; + + Entry() : state(State::NO_VALUE) {} + Entry(const Entry& other) : state(other.state), alloc_attr(other.alloc_attr) { + switch (state) { + case State::NO_VALUE: + break; + case State::HAS_VALUE: + val.Init(*other.val); + break; + case State::HAS_CONST_TENSOR: + const_tensor = other.const_tensor; + break; + case State::HAS_REF_TENSOR: + ref_tensor = other.ref_tensor; + break; + } + } + + ~Entry() { + if (state == State::HAS_VALUE) val.Destroy(); + } + + Entry& operator=(const Entry& other) { + if (state == State::HAS_VALUE) { + val.Destroy(); + } + state = other.state; + alloc_attr = other.alloc_attr; + switch (state) { + case State::NO_VALUE: + break; + case State::HAS_VALUE: + val.Init(*other.val); + break; + case State::HAS_CONST_TENSOR: + const_tensor = other.const_tensor; + break; + case State::HAS_REF_TENSOR: + ref_tensor = other.ref_tensor; + break; + } + return *this; + } + + Entry& operator=(Entry&& other) { + if (state == State::HAS_VALUE) { + val.Destroy(); + } + state = other.state; + alloc_attr = other.alloc_attr; + switch (state) { + case State::NO_VALUE: + break; + case State::HAS_VALUE: + val.Init(std::move(*other.val)); + break; + case State::HAS_CONST_TENSOR: + const_tensor = other.const_tensor; + break; + case State::HAS_REF_TENSOR: + ref_tensor = other.ref_tensor; + break; + } + return *this; + } + + // Clears the field, and sets this entry to the `NO_VALUE` state. + void ClearVal() { + if (state == State::HAS_VALUE) { + val.Destroy(); + } + state = State::NO_VALUE; + } + + union { + // A tensor value. Valid iff `state_ == HAS_VALUE`. + ManualConstructor val; + + // A pointer to a constant tensor value. Valid iff `state_ == + // HAS_CONST_TENSOR`. + const Tensor* const_tensor; + + // A tensor reference and associated mutex. Valid iff `state_ == + // HAS_REF_TENSOR`. + struct { + Tensor* tensor; + mutex* mu; + } ref_tensor; + }; + + // The current state of this entry, indicating which member of the above + // union is active. + State state; + + // The attributes of the allocator that creates the tensor. + AllocatorAttributes alloc_attr; +}; + +// TODO(b/152925936): Re-evaluate this constant with current usage patterns. +typedef absl::InlinedVector EntryVector; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ENTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/eval_const_tensor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eval_const_tensor.h new file mode 100644 index 00000000..049a3e9f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/eval_const_tensor.h @@ -0,0 +1,63 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_ + +#include +#include + +#include "absl/functional/function_ref.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +class GraphRunner; +class Node; +class OpRegistryInterface; +class ShapeRefiner; +class Tensor; + +// Configuration of the graph runner for constant folding. +struct EvaluateConstantTensorRunner { + // Op registry for temporary graphs. By default, the global registry will + // be used. + const OpRegistryInterface* op_registry = nullptr; + // Version of the graph API to use. + int32_t graph_def_version = 0; + // Graph runner for constant folding. By default, a temporary graph runner + // will be created. + GraphRunner* graph_runner = nullptr; +}; + +// Attempts to evaluate an output of the given node. This will only be possible +// if it doesn't depend on any graph inputs (this function is safe to call +// if this isn't the case though). +// +// When the evaluation is successful, the function returns a tensor, otherwise +// it returns std::nullopt. +absl::StatusOr> EvaluateConstantTensor( + // The tensor to be evaluated. + const Node& node, int node_output, + // Used to fetch inference contexts for nodes in the graph. + const ShapeRefiner& refiner, + // Used to both lookup cached results and request function arguments. + absl::FunctionRef(const Node&, int)> lookup, + // Configuration of the graph runner. If not set, no attempt to fold a + // constant subgraph will be made. + std::optional runner); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor.h new file mode 100644 index 00000000..2a13ff0c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor.h @@ -0,0 +1,265 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_ + +#include + +#include "absl/time/time.h" +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/local_executor_params.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/session_state.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool_interface.h" +#include "tensorflow/core/platform/error_logging.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +namespace tensorflow { + +class StepStatsCollector; + +// Executor runs a graph computation. +// Example: +// Graph* graph = ...; +// ... construct graph ... +// Executor* executor; +// TF_CHECK_OK(NewSimpleExecutor(my_device, graph, &executor)); +// Rendezvous* rendezvous = NewNaiveRendezvous(); +// TF_CHECK_OK(rendezvous->Send("input", some_input_tensor)); +// TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr})); +// TF_CHECK_OK(rendezvous->Recv("output", &output_tensor)); +// ... ... +// +// Multiple threads can call Executor::Run concurrently. +class Executor { + public: + virtual ~Executor() {} + + // RunAsync() executes the graph computation. "done" is run when the + // graph computation completes. If any error happens during the + // computation, "done" is run and the error is passed to "done". + // + // RunAsync() is given a few arguments in Args. The caller must + // ensure objects passed in Args (rendezvous, stats_collector, etc.) + // are alive at least until done is invoked. All pointers to the + // argument objects can be nullptr. + // + // "step_id" is a process-wide unique identifier for the step being + // run. Executors on different devices may receive the same step_id + // in the case that a step runs Ops on more than one device. The + // step_id is used for tracking resource usage of a given step. + // + // RunAsync() uses the given "rendezvous", if not null, as the + // mechanism to communicate inputs and outputs of the underlying + // graph computation. + // + // RunAsync() calls "stats_collector", if not null, to keep track of + // stats. This allows us to collect statistics and traces on demand. + // + // RunAsync() is provided a "call_frame", if the executor is used + // for executing a function, is used to pass arguments and return + // values between the caller and the callee. + // + // RunAsync() uses "cancellation_manager", if not nullptr, to + // register callbacks that should be called if the graph computation + // is canceled. Note that the callbacks merely unblock any + // long-running computation, and a canceled step will terminate by + // returning/calling the DoneCallback as usual. + // + // RunAsync() dispatches closures to "runner". Typically, "runner" + // is backed up by a bounded threadpool. + // + // "start_time_usecs" is a timestamp for the start of RunAsync() + // execution. Used for system-wide latency metrics. + struct Args { + int64_t step_id = 0; + // Used only by tracer/profiler, applicable only when running under + // FunctionRuntimeLibrary, unique per invocation. 
+ std::optional function_trace_id; + RendezvousInterface* rendezvous = nullptr; + StepStatsCollectorInterface* stats_collector = nullptr; + CallFrameInterface* call_frame = nullptr; + CancellationManager* cancellation_manager = nullptr; + const ConfigProto* session_config = nullptr; + SessionState* session_state = nullptr; + // Unique session identifier. Can be empty. + string session_handle; + TensorStore* tensor_store = nullptr; + ScopedStepContainer* step_container = nullptr; + CollectiveExecutor* collective_executor = nullptr; + thread::ThreadPoolInterface* user_intra_op_threadpool = nullptr; + tsl::CoordinationServiceAgent* coordination_service_agent = nullptr; + int64_t start_time_usecs = 0; + // The deadline for the kernel to complete by. Empty if unspecified. + absl::optional deadline; + absl::optional stack_trace = absl::nullopt; + + // If true, calls Sync() on the device. + bool sync_on_finish = false; + + typedef std::function Closure; + typedef std::function Runner; + Runner runner = nullptr; + + // If true, all kernels will be treated as "inexpensive", and hence executed + // on the scheduling thread. + bool run_all_kernels_inline = false; + }; + typedef std::function DoneCallback; + + void RunAsync(const Args& args, DoneCallback done) { + RunAsyncInternal(args, [done = std::move(done)](const absl::Status& s) { + if (!s.ok()) Log("TFExecutor", "Run", s.message()).IgnoreError(); + done(s); + }); + } + + // Synchronous wrapper for RunAsync(). + virtual absl::Status Run(const Args& args) { + absl::Status ret; + Notification n; + RunAsync(args, [&ret, &n](const absl::Status& s) { + ret = s; + n.Notify(); + }); + n.WaitForNotification(); + return ret; + } + + private: + virtual void RunAsyncInternal(const Args& args, DoneCallback done) = 0; +}; + +// Creates an Executor that computes the given "graph". +// +// If successful, returns the constructed executor in "*executor". Otherwise, +// returns an error status. +// +// "params" provides a set of context for the executor. We expect that +// different context would provide different implementations. +absl::Status NewLocalExecutor(const LocalExecutorParams& params, + const Graph& graph, Executor** executor); + +// A class to help run multiple executors in parallel and wait until +// all of them are complete. +// +// ExecutorBarrier deletes itself after the function returned by Get() +// is called. +class ExecutorBarrier { + public: + typedef std::function StatusCallback; + + // Create an ExecutorBarrier for 'num' different executors. + // + // 'r' is the shared Rendezvous object that is used to communicate + // state. If any of the executors experiences an error, the + // rendezvous object will be aborted exactly once. + // + // 'done' is called after the last executor completes, and + // ExecutorBarrier is deleted. + ExecutorBarrier(size_t num, Rendezvous* r, StatusCallback done) + : rendez_(r), done_cb_(done), pending_(num) {} + + ~ExecutorBarrier() {} + + // Returns a closure that Executors must call when they are done + // computing, passing the status of their execution as an argument. 
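A sketch of how the barrier is typically driven (illustrative; `executors`, the shared rendezvous `r`, and `common_args` are assumed to be set up by the caller, and Get() is the accessor declared immediately below):

// Run N executors that share the rendezvous `r`; `done` fires exactly once.
Notification all_done;
absl::Status final_status;

ExecutorBarrier* barrier =
    new ExecutorBarrier(executors.size(), r, [&](const absl::Status& s) {
      final_status = s;  // Summary status across all executors.
      all_done.Notify();
    });

for (Executor* exec : executors) {
  Executor::Args args = common_args;  // Per-executor copy of shared args.
  exec->RunAsync(args, barrier->Get());
}

all_done.WaitForNotification();
// The barrier has deleted itself by now; do not touch it after the callback.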
+ StatusCallback Get() { + return std::bind(&ExecutorBarrier::WhenDone, this, std::placeholders::_1); + } + + private: + Rendezvous* rendez_ = nullptr; + StatusCallback done_cb_ = nullptr; + + mutable mutex mu_; + int pending_ TF_GUARDED_BY(mu_) = 0; + StatusGroup status_group_ TF_GUARDED_BY(mu_); + + void WhenDone(const absl::Status& s) { + Rendezvous* error_rendez = nullptr; + StatusCallback done = nullptr; + absl::Status status; + + { + mutex_lock l(mu_); + + // If we are the first error encountered, trigger an abort of the + // Rendezvous object by this thread only. + if (status_group_.ok() && !s.ok()) { + error_rendez = rendez_; + error_rendez->Ref(); + } + + if (!s.ok() && !StatusGroup::IsDerived(s) && + !status_group_.HasLogMessages()) { + status_group_.AttachLogMessages(); + } + + status_group_.Update(s); + + // If this is the last call to WhenDone, call the final callback + // below. + if (--pending_ == 0) { + CHECK(done_cb_ != nullptr); + std::swap(done, done_cb_); + status = status_group_.as_summary_status(); + } + } + + if (error_rendez != nullptr) { + error_rendez->StartAbort( + errors::Aborted("Stopping remaining executors.")); + error_rendez->Unref(); + } + + if (done != nullptr) { + delete this; + if (!status.ok()) { + VLOG(1) << "ExecutorBarrier finished with bad status: " << status; + } + done(status); + } + } + + ExecutorBarrier(const ExecutorBarrier&) = delete; + void operator=(const ExecutorBarrier&) = delete; +}; + +// A few helpers to facilitate create/delete kernels. + +// Creates a kernel based on "props" on device "device". The kernel can +// access the functions in the "flib". The caller takes ownership of +// returned "*kernel". +absl::Status CreateNonCachedKernel( + Device* device, FunctionLibraryRuntime* flib, + const std::shared_ptr& props, int graph_def_version, + OpKernel** kernel); + +// Deletes "kernel" returned by CreateKernel. +void DeleteNonCachedKernel(OpKernel* kernel); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor_factory.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor_factory.h new file mode 100644 index 00000000..14a8d277 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/executor_factory.h @@ -0,0 +1,50 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_ + +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Executor; +class Graph; +struct LocalExecutorParams; + +class ExecutorFactory { + public: + virtual absl::Status NewExecutor(const LocalExecutorParams& params, + const Graph& graph, + std::unique_ptr* out_executor) = 0; + virtual ~ExecutorFactory() {} + + static void Register(const string& executor_type, ExecutorFactory* factory); + static absl::Status GetFactory(const string& executor_type, + ExecutorFactory** out_factory); +}; + +absl::Status NewExecutor(const string& executor_type, + const LocalExecutorParams& params, const Graph& graph, + std::unique_ptr* out_executor); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function.h new file mode 100644 index 00000000..f86732b2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function.h @@ -0,0 +1,81 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_def_utils.h" +#include "tensorflow/core/common_runtime/function_utils.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "tensorflow/core/common_runtime/inline_function_utils.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// Get default customizable kernel creator if set +const CustomKernelCreator* GetDefaultCustomKernelCreator(); + +// Registers a default customizable kernel creator for a function call. +// +// If c->CanCreateKernel returns false, we still fall back to an executor-based +// interpreter op kernel to execute a function. Else c->CreateKernel() can be +// used to create a kernel that will compile the function with XLA and run the +// resulting program. 
+void RegisterDefaultCustomKernelCreator(CustomKernelCreator* c); + +// Creates a FunctionLibraryRuntime, which instantiates functions +// defined in "lib_def" and executes functions on the "device". +// "device_mgr" must contain the "device". +// +// The returned object does not take ownerships of "device" or +// "lib_def". The caller must ensure "device" and "lib_def" outlives +// the returned object. +// +// The "parent" is a pointer to the ProcessFunctionLibraryRuntime object that +// typically owns the created FunctionLibraryRuntime object. The parent pointer +// is not owned by the FunctionLibraryRuntime object. +core::RefCountPtr NewFunctionLibraryRuntime( + const DeviceMgr* device_mgr, Env* env, const ConfigProto* config, + Device* device, int graph_def_version, + const FunctionLibraryDefinition* lib_def, thread::ThreadPool* thread_pool, + const OptimizerOptions& optimizer_options, + const SessionMetadata* session_metadata, + ProcessFunctionLibraryRuntime* parent); + +// Given a numerical function "f", returns another numerical function +// "g", such that if "f" takes N inputs and produces M outputs, "g" +// takes N + M inputs and produces N outputs. I.e., if +// (y1, y2, ..., y_M) = f(x1, x2, ..., x_N), +// g is a function which is +// (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N, +// dL/dy1, dL/dy2, ..., dL/dy_M), +// where L is a scalar-value function of (...x_i...). +// +// TODO(zhifengc): Asks math expert to say the comment again. +std::unique_ptr SymbolicGradient(const FunctionBody& f); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_body.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_body.h new file mode 100644 index 00000000..959f9803 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_body.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_BODY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_BODY_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/refcount.h" + +namespace tensorflow { + +class FunctionRecord; +class Graph; +class Node; + +// FunctionLibraryRuntime::GetFunctionBody returns a description of an +// instantiated function that is represented as a Graph with arg/ret +// nodes annotated. +struct FunctionBody { + core::RefCountPtr record; + Graph* graph = nullptr; // owned. + DataTypeVector arg_types; + DataTypeVector ret_types; + // arg_nodes[i] contains the i'th function input. In other words, + // GetNodeAttr(arg_nodes[i]->attrs(), "index") == i. + absl::InlinedVector arg_nodes; + // ret_nodes[i] contains the i'th function output. 
In other words, + // GetNodeAttr(ret_nodes[i]->attrs(), "index") == i. + absl::InlinedVector ret_nodes; + absl::InlinedVector control_ret_nodes; + + FunctionBody() {} + FunctionBody(core::RefCountPtr&& record, + DataTypeSlice arg_types, DataTypeSlice ret_types, Graph* g); + ~FunctionBody(); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_BODY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_def_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_def_utils.h new file mode 100644 index 00000000..cd3b021e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_def_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_DEF_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_DEF_UTILS_H_ + +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/refcount.h" + +namespace tensorflow { + +class AttrSlice; +struct FunctionBody; +class FunctionDef; +class FunctionLibraryDefinition; +class FunctionRecord; +class OpDef; + +// Instantiates FunctionDef into a graph. Set *fbody to point to the +// FunctionBody that holds the instantiated FunctionDef. +absl::Status FunctionDefToBodyHelper(core::RefCountPtr&& record, + const AttrSlice& attrs, + const FunctionLibraryDefinition* lib_def, + std::unique_ptr* fbody); + +// Instantiates FunctionDef into a graph. Set *fbody to point to the +// FunctionBody that holds the instantiated FunctionDef. +// +// NOTE(mrry): This implementation incurs a copy of `fdef`. If possible, use +// the overload that takes a `core::RefCountPtr`. +absl::Status FunctionDefToBodyHelper(const FunctionDef& fdef, + const AttrSlice& attrs, + const FunctionLibraryDefinition* lib_def, + std::unique_ptr* fbody); + +// Instantiates FunctionDef into a graph. Set *fbody to point to the +// FunctionBody that holds the instantiated FunctionDef. Use custom function +// signature lookup, in case instantiated function is not in the 'lib_def'. +absl::Status FunctionDefToBodyHelper( + core::RefCountPtr&& record, const AttrSlice& attrs, + const FunctionLibraryDefinition* lib_def, + const std::function& + get_func_sig, + std::unique_ptr* fbody); + +// Removes all stateless nodes that do not contribute to a return +// value from the function body. Unlike `RemoveDeadNodes()`, which is +// triggered by `OptimizerOptions.do_function_inlining`, this pass +// ignores the SINK node, from which (by definition) all nodes are +// reverse reachable, and preserves all nodes that are reachable from +// control output nodes. 
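// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// The pruning pass described above reduces to a reverse-reachability walk: keep every
// node from which some return (or control-output) node can be reached, drop the rest.
// A standalone sketch on a toy edge-list graph; ReachingNodes is an invented name and
// this is not the TensorFlow implementation.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Returns the set of nodes that reach at least one root, walking edges backwards
// (edges are pairs src -> dst).
std::set<int> ReachingNodes(int num_nodes,
                            const std::vector<std::pair<int, int>>& edges,
                            const std::vector<int>& roots) {
  std::vector<std::vector<int>> rev(num_nodes);
  for (const auto& e : edges) rev[e.second].push_back(e.first);
  std::set<int> keep(roots.begin(), roots.end());
  std::vector<int> stack(roots.begin(), roots.end());
  while (!stack.empty()) {
    int n = stack.back();
    stack.pop_back();
    for (int pred : rev[n]) {
      if (keep.insert(pred).second) stack.push_back(pred);
    }
  }
  return keep;
}

int main() {
  // 0 -> 1 -> 3 (a return value), 2 -> 4 (a dead branch).
  auto keep = ReachingNodes(5, {{0, 1}, {1, 3}, {2, 4}}, /*roots=*/{3});
  for (int n = 0; n < 5; ++n)
    std::cout << "node " << n << (keep.count(n) ? " kept" : " pruned") << "\n";
}
// --- end of editor's illustration ---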
+void PruneFunctionBody(const FunctionDef& fdef, Graph* g, + absl::Span additional_root_nodes = {}); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_DEF_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_optimization_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_optimization_registry.h new file mode 100644 index 00000000..ba501d3e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_optimization_registry.h @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_OPTIMIZATION_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_OPTIMIZATION_REGISTRY_H_ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/config.pb.h" + +// Classes to maintain a static registry of Graph based passes to be applied to +// a function graph. + +namespace tensorflow { + +// A pass to be registered with the FunctionOptimizationPassRegistry. This pass +// takes in a DeviceSet (available devices for executing the Graph), ConfigProto +// (session configuration parameters), an optional target device for XLA +// compilation, Graph (computation), +// FunctionLibraryDefinition (mapping between function names and function +// definitions of the Graph), control ret/target node names (names of nodes that +// must execute but their data outputs, if they have any, are irrelevant), and +// whether control ret nodes (via thier name) were updated. Mutations to the +// Graph and other associated arguments are performed inplace by the pass. +class FunctionOptimizationPass { + public: + // Grouped Options for the optimized function. + struct FunctionOptions { + // Specifies the compilation device type(CPU, GPU, etc) + // that should be used for entire function. + std::string xla_compile_device_type = ""; + // Whether soft placement and outside compilation + // are enabled for the function. + bool allow_soft_placement = false; + }; + + virtual ~FunctionOptimizationPass() {} + virtual absl::Status Run(const std::string& function_name, + const DeviceSet& device_set, + const ConfigProto& config_proto, + const FunctionOptions& function_options, + std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) = 0; +}; + +// A global function optimization pass registry that is used to hold one +// FunctionOptimizationPass. Passes registered to this registry will run before +// passes registered in OptimizationPassRegistry. +class FunctionOptimizationPassRegistry { + public: + // Initializes registry with a pass. Only one pass should be set. 
An assertion + // will be triggered if the registry already has a pass set and is being + // initialized with another pass. + void Init(std::unique_ptr pass); + + // Runs a pass if the registry contains one. + absl::Status Run( + const std::string& function_name, const DeviceSet& device_set, + const ConfigProto& config_proto, + const FunctionOptimizationPass::FunctionOptions& function_options, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated); + + // Returns the global registry of function graph passes. + static FunctionOptimizationPassRegistry& Global(); + + private: + std::unique_ptr pass_; +}; + +namespace function_optimization_registration { + +class FunctionOptimizationPassRegistration { + public: + explicit FunctionOptimizationPassRegistration( + std::unique_ptr pass) { + FunctionOptimizationPassRegistry::Global().Init(std::move(pass)); + } +}; + +} // namespace function_optimization_registration + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_OPTIMIZATION_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_testlib.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_testlib.h new file mode 100644 index 00000000..9618c408 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_testlib.h @@ -0,0 +1,54 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_ + +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { +namespace test { +namespace function { + +// {} -> y:DT_STRING (device where this op runs). +FunctionDef FindDevice(); +FunctionDef FindDeviceWithUuid(); + +class BlockingOpState { + public: + void AwaitState(int awaiting_state); + + void MoveToState(int expected_current, int next); + + private: + mutex mu_; + condition_variable cv_; + int state_ = 0; +}; + +extern BlockingOpState* blocking_op_state; + +FunctionDef BlockingOpFn(); + +// Adds a function call to the given scope and returns the output for the node. +// TODO(phawkins): replace with C++ API for calling functions, when that exists. +Output Call(Scope* scope, const string& op_name, const string& fn_name, + absl::Span inputs); + +} // namespace function +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_utils.h new file mode 100644 index 00000000..cfbfe869 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/function_utils.h @@ -0,0 +1,105 @@ +/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_UTILS_H_ + +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class AttrSlice; +class Graph; +class GraphDef; +class NameAttrList; +class Node; +class NodeDef; +class OpDef; + +// Debugging facility. Returns a debug string for a graph +// representing an instantiated function. +string DebugString(const Graph* g); + +// Dump the contents of the "graph" to log files if the logging level is +// sufficiently high. +void DumpGraph(absl::string_view label, const Graph* g); + +// Convert the Graph of a function to a GraphDef. +// +// Handles renaming of nodes to avoid duplicate names which may +// be present after various rewriting operations. +void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty = false); + +// Extracts function name and attributes from `call_def` +// `call_def` can be a native function call (where the op type is the function +// name) or a call through PartitionedCall/StatefulPartitionedCall. +absl::Status NameAndAttrsFromFunctionCall(const NodeDef& call_def, + NameAttrList* function); + +// A few hand-crafted optimization on the instantiated function body +// (a Graph*). + +// Removes nodes that are +// 1. not stateful; and +// 2. not _Arg; and +// 3. not reachable from _Retval. +// +// This function is triggered by function inlining, unlike 'PruneFunctionBody' +// it doesn't preserve nodes that are reachable from control returns. Function +// inlining is responsible for connecting control return nodes with the nodes +// that have input control edges from the inlined function call node. +// +// Assuming that automatic control dependency tracking is correct, absence of +// outgoing control edge from the function call node means that no one needs to +// observe side-effect that might have been generated by the function (see +// documentation in common_runtime/function.cc for details). +// +// Returns true iff any node is removed from "g". +bool RemoveDeadNodes(Graph* g); + +// Find a pattern: +// src -(in)-> node -(out)-> dst, where +// 1) node is an identity node; +// 2) in is the only incoming data edge; +// 3) out is the only outgoing data edge; +// +// Rewrites the above pattern with src->dst and relevant data +// dependencies updated. Repeat the process until no such pattern +// left. +bool RemoveIdentityNodes(Graph* g); + +// Rewrites _ListToArray and _ArrayToList to a set of Identity nodes. +bool RemoveListArrayConverter(Graph* g); + +// Extracts function name and attributes from `call_def` and invokes +// flr->Instantiate(name, attrs, handle). +// `call_def` can be a native function call (where the op type is the function +// name) or a call through PartitionedCall/StatefulPartitionedCall. 
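// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// The RemoveIdentityNodes rewrite described above -- bypass an identity node that has
// exactly one data input and one data output, then repeat until no such pattern
// remains -- can be sketched on a toy edge list. ElideIdentities is an invented name;
// this is a simplification, not the library routine.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

using Edge = std::pair<int, int>;  // src -> dst

// Repeatedly rewrites src -> id -> dst into src -> dst for every node in
// `identity_nodes` that has exactly one incoming and one outgoing edge.
std::vector<Edge> ElideIdentities(std::vector<Edge> edges,
                                  const std::set<int>& identity_nodes) {
  bool changed = true;
  while (changed) {
    changed = false;
    for (int id : identity_nodes) {
      int in = -1, out = -1, in_count = 0, out_count = 0;
      for (const Edge& e : edges) {
        if (e.second == id) { in = e.first; ++in_count; }
        if (e.first == id) { out = e.second; ++out_count; }
      }
      if (in_count == 1 && out_count == 1) {
        std::vector<Edge> next;
        for (const Edge& e : edges)
          if (e.first != id && e.second != id) next.push_back(e);
        next.emplace_back(in, out);
        edges = std::move(next);
        changed = true;
      }
    }
  }
  return edges;
}

int main() {
  // 0 -> 1(identity) -> 2 becomes 0 -> 2.
  for (const Edge& e : ElideIdentities({{0, 1}, {1, 2}}, {1}))
    std::cout << e.first << " -> " << e.second << "\n";
}
// --- end of editor's illustration ---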
+absl::Status InstantiateFunctionCall(const NodeDef& call_def, + FunctionLibraryRuntime* flr, + FunctionLibraryRuntime::Handle* handle); + +// Returns true iff `n` represents a function call. `n` can be a native +// function call (n.type_string() is the function name), +// a PartitionedCall/StatefulPartitionedCall, or a SymbolicGradient (which +// has been deprecated for a while). +bool IsFunctionCall(const FunctionLibraryDefinition& lib_def, const Node& n); +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h new file mode 100644 index 00000000..9d025cc0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -0,0 +1,63 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ + +#include +#include +#include + +#include "xla/tsl/framework/allocator.h" +#include "xla/tsl/framework/bfc_allocator.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { + +// A GPU memory allocator that implements a 'best-fit with coalescing' +// algorithm. +class GPUBFCAllocator : public tsl::BFCAllocator { + public: + // See BFCAllocator::Options. + struct Options { + // Overridden by TF_FORCE_GPU_ALLOW_GROWTH if that envvar is set. + bool allow_growth = false; + + // If nullopt, defaults to TF_ENABLE_GPU_GARBAGE_COLLECTION, or true if that + // envvar is not present. + // + // Note: + // + // - BFCAllocator defaults garbage_collection to false, not true. + // - this is not the same override behavior as TF_FORCE_GPU_ALLOW_GROWTH. + std::optional garbage_collection; + + double fragmentation_fraction = 0; + bool allow_retry_on_failure = true; + }; + + GPUBFCAllocator(std::unique_ptr sub_allocator, + size_t total_memory, const std::string& name, + const Options& opts); + + ~GPUBFCAllocator() override {} + + GPUBFCAllocator(const GPUBFCAllocator&) = delete; + void operator=(const GPUBFCAllocator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h new file mode 100644 index 00000000..ba08f096 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h @@ -0,0 +1,52 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_ + +#include +#include + +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/framework/allocator.h" +#include "xla/tsl/framework/device_id.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { + +// An allocator which directly uses cuMemAlloc and cuMemFree to allocate and +// free memory. +class GPUcudaMallocAllocator : public tsl::Allocator { + public: + explicit GPUcudaMallocAllocator(tsl::PlatformDeviceId platform_device_id); + std::string Name() override { return "gpu_debug"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + bool TracksAllocationSizes() const override; + + tsl::AllocatorMemoryType GetMemoryType() const override { + return tsl::AllocatorMemoryType::kDevice; + } + + private: + se::StreamExecutor* stream_exec_; // Not owned. + + GPUcudaMallocAllocator(const GPUcudaMallocAllocator&) = delete; + void operator=(const GPUcudaMallocAllocator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h new file mode 100644 index 00000000..13f10007 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -0,0 +1,93 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ + +#include +#include +#include +#include + +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/framework/allocator.h" +#include "xla/tsl/framework/device_id.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { + +// An allocator that wraps a GPU allocator and adds debugging +// functionality that verifies that users do not write outside their +// allocated memory. 
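// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// The debug allocator described above brackets every allocation with known guard words
// and verifies them on deallocation to catch out-of-bounds writes. A standalone
// host-memory sketch of that technique over malloc/free; GuardedAlloc/GuardedFree are
// invented names, and unlike the real class the caller passes the size back on free.
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>

constexpr std::uint64_t kGuard = 0xDEADBEEFCAFEF00DULL;
constexpr std::size_t kGuardBytes = sizeof(kGuard);

void* GuardedAlloc(std::size_t n) {
  auto* base = static_cast<unsigned char*>(std::malloc(n + 2 * kGuardBytes));
  std::memcpy(base, &kGuard, kGuardBytes);                    // header guard
  std::memcpy(base + kGuardBytes + n, &kGuard, kGuardBytes);  // footer guard
  return base + kGuardBytes;  // the caller only sees the middle region
}

// Returns false if either guard word was overwritten.
bool GuardedFree(void* ptr, std::size_t n) {
  auto* base = static_cast<unsigned char*>(ptr) - kGuardBytes;
  std::uint64_t header, footer;
  std::memcpy(&header, base, kGuardBytes);
  std::memcpy(&footer, base + kGuardBytes + n, kGuardBytes);
  std::free(base);
  return header == kGuard && footer == kGuard;
}

int main() {
  char* p = static_cast<char*>(GuardedAlloc(8));
  p[8] = 'X';  // one byte past the end: corrupts the footer guard
  std::cout << (GuardedFree(p, 8) ? "clean" : "out-of-bounds write detected")
            << "\n";
}
// --- end of editor's illustration ---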
+class GPUDebugAllocator : public tsl::Allocator { + public: + explicit GPUDebugAllocator(tsl::Allocator* allocator, + tsl::PlatformDeviceId platform_device_id); + ~GPUDebugAllocator() override; + std::string Name() override { return "gpu_debug"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + bool TracksAllocationSizes() const override; + size_t RequestedSize(const void* ptr) const override; + size_t AllocatedSize(const void* ptr) const override; + int64_t AllocationId(const void* ptr) const override; + std::optional GetStats() override; + bool ClearStats() override; + + // For testing. + bool CheckHeader(void* ptr); + bool CheckFooter(void* ptr); + + private: + tsl::Allocator* base_allocator_ = nullptr; // owned + + se::StreamExecutor* stream_exec_; // Not owned. + + GPUDebugAllocator(const GPUDebugAllocator&) = delete; + void operator=(const GPUDebugAllocator&) = delete; +}; + +// An allocator that wraps a GPU allocator and resets the memory on +// allocation and free to 'NaN', helping to identify cases where the +// user forgets to initialize the memory. +class GPUNanResetAllocator : public tsl::Allocator { + public: + explicit GPUNanResetAllocator(tsl::Allocator* allocator, + tsl::PlatformDeviceId platform_device_id); + ~GPUNanResetAllocator() override; + std::string Name() override { return "gpu_nan_reset"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + size_t RequestedSize(const void* ptr) const override; + size_t AllocatedSize(const void* ptr) const override; + std::optional GetStats() override; + bool ClearStats() override; + + tsl::AllocatorMemoryType GetMemoryType() const override { + return base_allocator_->GetMemoryType(); + } + + private: + tsl::Allocator* base_allocator_ = nullptr; // owned + + se::StreamExecutor* stream_exec_; // Not owned. + + GPUNanResetAllocator(const GPUNanResetAllocator&) = delete; + void operator=(const GPUNanResetAllocator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_device.h new file mode 100644 index 00000000..d09cdc2f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -0,0 +1,477 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ + +// TODO(b/282059652): Merge google internal and open-source code path once TF +// dependency issue is resolved. 
+#if (defined(PLATFORM_GOOGLE) && defined(TF_PLATFORM_LINUX_X86_64)) +#define TF_GPU_USE_PJRT +#endif // PLATFORM_GOOGLE && TF_PLATFORM_LINUX_X86_64 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#ifdef TF_GPU_USE_PJRT +#include "tensorflow/compiler/jit/pjrt_device_context.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "xla/pjrt/local_device_state.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#endif // TF_GPU_USE_PJRT +#include "xla/tsl/framework/device_id.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/node_file_writer.h" +#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h" +#include "tensorflow/core/common_runtime/shared_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" + +namespace Eigen { +class StreamInterface; +} + +namespace tensorflow { +class GPUKernelTracker; + +class ConcretePerOpGpuDevice : public PerOpGpuDevice { + public: + ConcretePerOpGpuDevice(); + + void Reinitialize(OpKernelContext* context, void* gpu_stream, + tsl::TfDeviceId tf_device_id, Allocator* base_allocator, + char* scratch); + + void Reinitialize(OpKernelContext* context, void* gpu_stream, + tsl::PlatformDeviceId platform_device_id, + Allocator* base_allocator, char* scratch); + + const Eigen::GpuDevice& device() const override; + + private: + std::unique_ptr<::Eigen::StreamInterface> stream_device_; +}; + +class BaseGPUDevice : public LocalDevice { + public: + BaseGPUDevice(const SessionOptions& options, const std::string& name, + Bytes memory_limit, const DeviceLocality& locality, + tsl::TfDeviceId tf_device_id, + const std::string& physical_device_desc, + Allocator* gpu_allocator, Allocator* cpu_allocator, + bool sync_every_op); + + ~BaseGPUDevice() override; + + struct StreamGroup { + se::Stream* compute = nullptr; +#if TENSORFLOW_USE_ROCM + se::Stream* nccl = nullptr; +#endif + se::Stream* host_to_device = nullptr; + se::Stream* device_to_host = nullptr; + gtl::InlinedVector device_to_device; + int priority = 0; + }; + + // Initialize the device and return the status of initialization. 
+#ifdef TF_GPU_USE_PJRT + Status Init(const SessionOptions& options, + xla::LocalDeviceState* xla_local_device_state); +#else + Status Init(const SessionOptions& options); +#endif // TF_GPU_USE_PJRT + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + + Status Sync() override; + + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + + Status MakeTensorFromProto(const TensorProto& tensor_proto, + AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) override; + + // The caller owns the returned device. + PerOpGpuDevice* MakeGpuDevice() override; + + Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, + DeviceContext* dc, + Allocator* allocator) override; + + // Returns the platform GPU id of this device within the native driver system; + // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system. + int gpu_id() const { + tsl::PlatformDeviceId platform_device_id; + TF_CHECK_OK( + GpuIdManager::TfToPlatformDeviceId(tf_device_id_, &platform_device_id)); + return platform_device_id.value(); + } + + // The executor that provides control for the device; e.g., for CUDA this + // corresponds to the cuda context. + se::StreamExecutor* executor() const { return executor_; } + + Allocator* GetScopedAllocator(AllocatorAttributes attr, + int64_t step_id) override; + + ScopedAllocatorMgr* GetScopedAllocatorMgr() const override { + return scoped_allocator_mgr_.get(); + } + + // The following two functions always return 0 unless one of the + // related experimental config options has been specified. + + // If returned value is > 0 then GPU Memory chunks freed before this count + // are guaranteed not to be in use by any kernel pending on this device. + uint64 SafeAllocFrontier(uint64 old_value) override; + + // Returns the number of kernels that have been queued for execution on + // the compute stream and are not yet known to have completed. + int PendingKernels(); + + int priority() const { return stream_->priority; } + + // Helper method for unit tests to reset the streams. Never use in production. + static void TestOnlyReset(); + + se::Stream* compute_stream() { return stream_->compute; } + + // Given the compute stream for a GPU or virtual GPU, return the TfDeviceId + // for the GPU or vGPU. 
+ static std::optional FindTfDeviceId(se::Stream* compute); + + bool merge_host_to_device_stream() const override { + return stream_merge_options_.merge_host_to_device_stream(); + } + + bool merge_device_to_host_stream() const override { + return stream_merge_options_.merge_device_to_host_stream(); + } + + bool merge_device_to_device_stream() const override { + return stream_merge_options_.merge_device_to_device_stream(); + } + + protected: + Allocator* gpu_allocator_; // not owned + Allocator* cpu_allocator_; // not owned + + se::StreamExecutor* executor_; // not owned + std::unique_ptr scoped_allocator_mgr_; + + private: + friend class GPUDeviceTestHelper; + class StreamGroupFactory; + + core::RefCountPtr pjrt_device_context_; + StreamGroup* stream_; + mutex scratch_init_mutex_; + char* scratch_ = nullptr; + GPUDeviceContext* device_context_; + DeviceBase::AcceleratorDeviceInfo* accelerator_device_info_ = nullptr; + mutex trace_mu_; + tsl::TfDeviceId tf_device_id_; + const bool sync_every_op_ = false; + EventMgr* em_ = nullptr; + std::unique_ptr thread_pool_; + std::unique_ptr kernel_tracker_; + int32 pending_cap_ = 0; + bool timestamped_allocator_ = false; + NodeFileWriter* node_file_writer_ = nullptr; // not owned + const GPUOptions::Experimental::StreamMergeOptions stream_merge_options_; + + // Initialize scratch buffers used by Eigen. + Status InitScratchBuffers(); + + void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device, + int stream_id, Allocator* allocator); + + std::string ComputeOpKernelDebugString(const OpKernel& op_kernel, + const int& stream_id); + + // This method returns an initialization status, in addition to + // calling the "done" StatusCallback, if there is a failure to + // allocate memory or if the tensor "from" is not DMA-copyable. + // If there is no error prior to enqueueing the copy, an OK status + // is returned. + Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs, + const Tensor& from, Tensor* to, + StatusCallback done); + + Tensor CopyGpuTensorToHostDebugOnly(const Tensor& gpu_tensor); + void LogInputs(OpKernel* op_kernel, OpKernelContext* context); + void LogOutputs(OpKernel* op_kernel, OpKernelContext* context); +}; + +// A per-compute-stream utility that keeps track of kernels that have been +// queued for execution but may not yet have terminated and also the queued +// time of the most recently terminated kernel. +class GPUKernelTracker { + public: + // Controls the strategy for inserting tracking events after GPU kernels. + // If max_interval >= 0, then insert an event after this many kernels + // if an event has not been inserted for another reason. + // If max_bytes > 0, then insert an event after kernels allocating this + // many bytes have been queued since the last event. + // If max_pending > 0, then track up to this many events at once. If + // this limit is reached the GPU::Compute() method will delay starting + // additional ops until some event completes. If 0 and one of the other + // fields is non-zero, then a reasonable default will be selected. + struct Params { + int max_interval = 0; + int max_bytes = 0; + int max_pending = 0; + Params(int mi, int mb, int mp) + : max_interval(mi), max_bytes(mb), max_pending(mp) {} + }; + + // If we're going to share a SharedCounter with an allocator, it's owned + // by the allocator because allocators are initialized once per process. + // Devices are per-session. 
+ explicit GPUKernelTracker(const Params& params, Env* env, + se::Stream* compute_stream, + SharedCounter* timing_counter, Allocator* allocator, + EventMgr* event_manager) + : params_(params), + env_(env), + stream_(compute_stream), + timing_counter_(timing_counter), + allocator_(allocator), + em_(event_manager), + pending_kernels_( + params.max_pending > 0 ? std::max(8, 2 * params.max_pending) : 64) { + mem_since_last_ = 0; + if (!timing_counter_) { + // There's not a preexisting counter owned by GPUProcessState, i.e. + // pending_cap > 0 but timestamped_allocator == false. + owned_counter_ = std::make_unique(); + timing_counter_ = owned_counter_.get(); + } + } + + // Determine whether a GPU kernel should have a recording event queued + // immediately afterwards. If so, advance the counter and return the new + // counter value after enqueuing. + uint64 MaybeQueue(OpKernelContext* ctx); + + // Record that a GPU kernel has just been enqueued on the compute stream. + // Inserts the supplied counter value in a new PendingKernel record appended + // to the end of the ring buffer then returns that same count. + // Caller is responsible for ensuring that RecordTerminate() is eventually + // called with the same counter value. + void RecordQueued(uint64 queued_count, int weight) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Takes a count value returned by RecordQueued and finds the corresponding + // PendingKernel record in the ring buffer. Marks the kernel as completed and + // advances the completion frontier accordingly. + void RecordTerminated(uint64 queued_count); + + // Returns the largest timing count such that all kernels queued no + // later than that count are known to have terminated. + inline uint64 LastTerminatedCount(uint64 old_value) { + uint64 new_value = last_terminated_count_.load(std::memory_order_relaxed); + if (new_value == old_value) { + MaybeQueueProgressEvent(); + } + return new_value; + } + + // Returns the number of kernels enqueued that are not yet known to + // have terminated. + int NumPending() { + mutex_lock l(mu_); + return num_pending_; + } + + // Yield current thread until number of pending kernels no longer + // exceeds the cap. + void PauseWhilePendingExceeds(int cap) TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + while (num_pending_ > cap) { + VLOG(1) << "num_pending_=" << num_pending_ << " cap=" << cap; + pending_decreased_.wait(l); + } + } + + private: + friend class GPUKernelTrackerTest; + Params params_; + Env* env_; + se::Stream* stream_; + SharedCounter* timing_counter_; + std::unique_ptr owned_counter_; + Allocator* allocator_ = nullptr; + EventMgr* em_ = nullptr; + std::atomic last_terminated_count_ = {1}; + + void MaybeQueueProgressEvent(); + + // Records when a kernel was queued for execution. Kernel launches are + // identified by a unique count value from a per-GPU device timing counter. + struct PendingKernel { + uint64 queued_count; + int weight; + bool terminated; + PendingKernel(const PendingKernel& pk) = default; + PendingKernel() : queued_count(0), weight(0), terminated(false) {} + }; + mutex mu_; + int32 mem_since_last_ TF_GUARDED_BY(mu_); + int32 ops_since_last_ TF_GUARDED_BY(mu_); + // Ring buffer of PendingKernel records. + std::vector pending_kernels_ TF_GUARDED_BY(mu_); + // Next unused slot in pending_kernels_. + int first_available_ TF_GUARDED_BY(mu_) = 0; + // Last completed PendingKernel such that all prior PendingKernels are + // also completed. 
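// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// The kernel tracker above keeps queued launches in an ordered buffer and advances a
// "completion frontier": the largest count such that everything queued at or before it
// has terminated, even when completions arrive out of order. A standalone sketch of
// that bookkeeping under the invented name CompletionFrontier:
#include <cstdint>
#include <deque>
#include <iostream>

class CompletionFrontier {
 public:
  // Record a launch identified by a strictly increasing count.
  void RecordQueued(std::uint64_t count) { pending_.push_back({count, false}); }

  // Mark one launch as finished, then advance the frontier past any prefix of
  // finished launches.
  void RecordTerminated(std::uint64_t count) {
    for (auto& e : pending_)
      if (e.count == count) e.done = true;
    while (!pending_.empty() && pending_.front().done) {
      frontier_ = pending_.front().count;
      pending_.pop_front();
    }
  }

  std::uint64_t LastTerminatedCount() const { return frontier_; }

 private:
  struct Entry { std::uint64_t count; bool done; };
  std::deque<Entry> pending_;
  std::uint64_t frontier_ = 0;
};

int main() {
  CompletionFrontier t;
  for (std::uint64_t c : {1, 2, 3}) t.RecordQueued(c);
  t.RecordTerminated(2);  // out of order: frontier cannot move yet
  std::cout << t.LastTerminatedCount() << "\n";  // 0
  t.RecordTerminated(1);  // now 1 and 2 are both done
  std::cout << t.LastTerminatedCount() << "\n";  // 2
}
// --- end of editor's illustration ---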
With out-of-order completion there may be a mixture + // of completed and uncompleted entries between last_completed_ and + // first_available_. + int last_completed_ TF_GUARDED_BY(mu_) = -1; + // Sum of weights of the outstanding events marking tracked kernels. + int num_pending_ TF_GUARDED_BY(mu_) = 0; + condition_variable pending_decreased_ TF_GUARDED_BY(mu_); +}; + +class BaseGPUDeviceFactory : public DeviceFactory { + public: + Status ListPhysicalDevices(std::vector* devices) override; + Status CreateDevices(const SessionOptions& options, + const std::string& name_prefix, + std::vector>* devices) override; + Status GetDeviceDetails(int device_index, + std::unordered_map* details) override; + + struct InterconnectMap { + // Name of interconnect technology, if known. + std::string name; + // If possible, strength should approximate Gb/sec bandwidth rate. + // Where architecture-specific subclassing is not done that won't + // always be possible. The minimum expectation is that + // faster links should have a higher value than slower links. + int32 strength; + static const int kSameDeviceStrength; + static const int kStreamExecutorStrength; + std::set> + directed_links; + }; + + protected: + // Populates *maps with interconnect maps for all local direct access + // pathways between GPUs. + virtual Status GetInterconnectMaps( + const std::vector& visible_gpu_order, + se::Platform* gpu_manager, std::vector* maps); + + struct TfDeviceIdHash { + std::size_t operator()(const tsl::TfDeviceId& id) const noexcept { + return std::hash{}(id.value()); + } + }; + typedef std::unordered_map + LocalityMap; + // Populates *localities with the DeviceLocality descriptor for + // every TfDeviceId. + virtual Status GetDeviceLocalities( + int num_tf_gpus, const std::vector& interconnects, + LocalityMap* localities); + + private: + // Creates a BaseGPUDevice associated with 'tf_device_id', and adds it to the + // 'devices' vector. The 'gpu_allocator' is created by the caller and usually + // preallocates a set amount of GPU memory. +#ifdef TF_GPU_USE_PJRT + Status CreateGPUDevice(const SessionOptions& options, + const std::string& name_prefix, + tsl::TfDeviceId tf_device_id, + const DeviceLocality& dev_locality, + xla::LocalDeviceState* xla_local_device_state, + Allocator* gpu_allocator, + std::vector>* devices); +#else + Status CreateGPUDevice(const SessionOptions& options, + const std::string& name_prefix, + tsl::TfDeviceId tf_device_id, + const DeviceLocality& dev_locality, + Allocator* gpu_allocator, + std::vector>* devices); +#endif // TF_GPU_USE_PJRT + + virtual std::unique_ptr CreateGPUDevice( + const SessionOptions& options, const string& name, Bytes memory_limit, + const DeviceLocality& dev_locality, tsl::TfDeviceId tf_device_id, + const string& physical_device_desc, Allocator* gpu_allocator, + Allocator* cpu_allocator) = 0; + + Status EnablePeerAccess( + const std::vector& visible_gpu_order); + + // Returns into 'ids' the list of valid platform GPU ids, in the order that + // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc, + // based upon 'visible_gpu_order' which was generated by parsing + // GPUOptions::visible_device_list which is a comma-separated list of CUDA or + // ROCm GPU ids. + Status GetValidDeviceIds( + const std::vector& visible_gpu_order, + std::vector* ids); + + // Cache the valid device IDs if not already cached. Cached IDs are stored in + // field cached_device_ids_. 
Passes {0, 1, ..., num_devices-1} to + // GetValidDeviceIds, so this should only be used in functions where all + // devices should be treated as visible, like ListPhysicalDevices. + Status CacheDeviceIds(); + + // visible_gpu_initialized_[platform_device_id] is true if visible GPU + // platform_device_id has been initialized by the process. + std::unordered_map visible_gpu_initialized_; + + // Cached device IDs, as returned by GetValidDeviceIds when every physical + // device is visible. Cache should not be used if some devices are not + // visible. + std::vector cached_device_ids_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h new file mode 100644 index 00000000..601119fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h @@ -0,0 +1,23 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// TODO(annarev): remove this file once all includes are updated to +// include device_event_mgr.h instead. + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ + +#include "tensorflow/core/common_runtime/device/device_event_mgr.h" + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id.h new file mode 100644 index 00000000..c2849d2d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_ + +#include "tensorflow/core/common_runtime/device/device_id.h" + +// TODO(sanjoy): Delete the header and forward the references. 
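// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// GetValidDeviceIds above parses a comma-separated visible_device_list of platform
// ordinals, and the resulting order determines which physical GPU backs
// "/device:GPU:0", "/device:GPU:1", and so on. A standalone sketch of just that
// parsing/mapping step; ParseVisibleDeviceList is an invented name.
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Parses e.g. "2,0,1" into {2, 0, 1}; index i of the result is the platform ordinal
// backing TF device "/device:GPU:i".
std::vector<int> ParseVisibleDeviceList(const std::string& list) {
  std::vector<int> order;
  std::stringstream ss(list);
  std::string item;
  while (std::getline(ss, item, ',')) order.push_back(std::stoi(item));
  return order;
}

int main() {
  auto order = ParseVisibleDeviceList("2,0,1");
  for (std::size_t tf_id = 0; tf_id < order.size(); ++tf_id)
    std::cout << "/device:GPU:" << tf_id << " -> platform GPU " << order[tf_id]
              << "\n";
}
// --- end of editor's illustration ---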
+ +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id_manager.h new file mode 100644 index 00000000..aa8553f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_id_manager.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_ + +#include "xla/tsl/framework/device_id.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Class that maintains a map from TfDeviceId to PlatformDeviceId, and manages +// the translation between them. +class GpuIdManager { + public: + // Adds a mapping from tf_device_id to platform_device_id. + static absl::Status InsertTfPlatformDeviceIdPair( + tsl::TfDeviceId tf_device_id, tsl::PlatformDeviceId platform_device_id); + + // Gets the platform_device_id associated with tf_device_id. Returns OK if + // found. + static absl::Status TfToPlatformDeviceId( + tsl::TfDeviceId tf_device_id, tsl::PlatformDeviceId* platform_device_id); + + // Clears the map. Used in unit tests only. + static void TestOnlyReset(); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h new file mode 100644 index 00000000..78c57ca2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_ + +#include + +#include "xla/tsl/framework/allocator.h" + +namespace tensorflow { + +// An allocator for CUDA unified memory. Memory allocated with this allocator +// can be accessed from both host and device. CUDA transparently migrates dirty +// pages, which can be slow. 
Therefore, this allocator is intended for +// convenience in functional tests only. +class GpuManagedAllocator : public tsl::Allocator { + public: + std::string Name() override { return "GpuManagedAllocator"; } + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_process_state.h new file mode 100644 index 00000000..19f8448e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_process_state.h @@ -0,0 +1,187 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_PROCESS_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_PROCESS_STATE_H_ + +// TODO(b/282059652): Merge google internal and open-source code path once TF +// dependency issue is resolved. +#if (defined(PLATFORM_GOOGLE) && defined(TF_PLATFORM_LINUX_X86_64)) +#define TF_GPU_USE_PJRT +#endif // PLATFORM_GOOGLE && TF_PLATFORM_LINUX_X86_64 + +#include +#include +#include +#include +#include + +#include "xla/tsl/framework/device_id.h" +#include "tensorflow/core/common_runtime/process_state.h" +#include "tensorflow/core/common_runtime/shared_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class GPUBFCAllocator; +class PoolAllocator; + +// Singleton that manages per-process state when GPUs are present. +class GPUProcessState { + public: + // If ps == nullptr, returns pointer to the single instance of this class to + // be used within this process. + // + // If ps != nullptrs, accepts a value to be returned by all subsequent calls. + // A non-null ps may ONLY be provided during program static storage + // initialization. Must not be called more than once with a non-null ps. + // + // If a derived class of GPUProcessState is ever used in a process, it must + // always be used in place of this class. In order to ensure that existing + // calls to GPUProcessState::singleton() all resolve to the derived instance + // instead, this function must be called once during startup, supplying the + // derived instance value, prior to any accessor call to this function. + static GPUProcessState* singleton(GPUProcessState* ps = nullptr); + + // Query whether any GPU device has been created so far. + // Disable thread safety analysis since a race is benign here. 
+ bool HasGPUDevice() const TF_NO_THREAD_SAFETY_ANALYSIS { + return gpu_device_enabled_; + } + + // Set the flag to indicate a GPU device has been created. + // Disable thread safety analysis since a race is benign here. + void EnableGPUDevice() TF_NO_THREAD_SAFETY_ANALYSIS { + gpu_device_enabled_ = true; + } + + // Returns the one GPU allocator used for the indexed GPU. + // Note that this is a system GPU index, not (necessarily) a brain + // device index. + // + // 'total_bytes' is the total number of bytes that should be made + // available to the allocator. The first call to this function for + // a given tf_device_id creates the allocator, so only the total_bytes + // used on that first call is used. + // + // "Allocator type" describes the type of algorithm to use for the + // underlying allocator. REQUIRES: Must be a valid type (see + // config.proto for the list of supported strings.). + // + // `options` is read on the very first call to this function in the process. + // After that if you pass in a set of options, they will be ignored. + // + // REQUIRES: tf_device_id must be a valid id for a BaseGPUDevice available in + // the current system environment. Otherwise returns nullptr. + virtual Allocator* GetGPUAllocator( + const GPUOptions& options, tsl::TfDeviceId tf_device_id, + size_t total_bytes, const std::vector& peer_gpu_ids); + + Allocator* GetGPUAllocator(tsl::TfDeviceId tf_device_id) { + return GetGPUAllocator(/*options=*/{}, tf_device_id, /*total_bytes=*/0, + /*peer_gpu_ids=*/{}); + } + + int NumGPUAllocators() { + mutex_lock l(mu_); + return gpu_allocators_.size(); + } + + // `options` is read on the very first call to this function in the process, + // e.g. to set the memory limit on this allocator. After that if you pass in + // a different set of options, they will be ignored. + virtual Allocator* GetGpuHostAllocator(const GPUOptions& options, + int numa_node); + + // Registers a Visitor to be invoked on new chunks of memory allocated by the + // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor + // is provided with a memory pointer, a GPU id, and the size of the area it + // identifies. The pointer is not guaranteed to be valid after the call + // terminates. The intention is for this interface to be used for network + // device memory registration. "bus_id" is platform-specific. On many + // platforms it should be 0. On machines with multiple PCIe buses, it should + // be the index of one of the PCIe buses (maybe the NUMA node at which the + // PCIe is rooted). If the bus_id is invalid, results are undefined. + virtual void AddGPUAllocVisitor(int bus_id, + const SubAllocator::Visitor& visitor); + + // Registers a Visitor to be invoked on new chunks of memory allocated by + // the SubAllocator of the GpuHostAllocator for the given numa_node. + virtual void AddGpuHostAllocVisitor(int numa_node, + const SubAllocator::Visitor& visitor); + + // Registers a Visitor to be invoked on each chunk handed back for freeing to + // the SubAllocator of the GpuHostAllocator for the given numa_node. + virtual void AddGpuHostFreeVisitor(int numa_node, + const SubAllocator::Visitor& visitor); + + // Returns bus_id for the given GPU id. + virtual int BusIdForGPU(tsl::TfDeviceId tf_device_id); + + SharedCounter* GPUAllocatorCounter(tsl::TfDeviceId tf_device_id); + + protected: + // GPUProcessState is a singleton that should not normally be deleted except + // at process shutdown. 
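// --- Editor's illustration (not part of this patch or of the TensorFlow headers) ---
// GetGPUAllocator above is documented as "create on first use, ignore the options and
// byte limit afterwards". A standalone sketch of that lazy, per-device singleton
// pattern; ProcessState and FakeAllocator are invented names and this omits the real
// class's suballocators, visitors, and counters.
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>

struct FakeAllocator {
  explicit FakeAllocator(std::size_t limit) : byte_limit(limit) {}
  std::size_t byte_limit;
};

class ProcessState {
 public:
  static ProcessState& Singleton() {
    static ProcessState instance;  // one per process
    return instance;
  }

  // Creates the allocator for `device_id` on the first call; the byte limit passed on
  // any later call is ignored, mirroring the comment above.
  FakeAllocator* GetAllocator(int device_id, std::size_t total_bytes) {
    std::lock_guard<std::mutex> l(mu_);
    auto& slot = allocators_[device_id];
    if (!slot) slot = std::make_unique<FakeAllocator>(total_bytes);
    return slot.get();
  }

 private:
  std::mutex mu_;
  std::map<int, std::unique_ptr<FakeAllocator>> allocators_;
};

int main() {
  auto* a = ProcessState::Singleton().GetAllocator(0, 1 << 20);
  auto* b = ProcessState::Singleton().GetAllocator(0, 1 << 30);  // limit ignored
  std::cout << (a == b) << " limit=" << a->byte_limit << "\n";   // 1 limit=1048576
}
// --- end of editor's illustration ---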
+ GPUProcessState(); + virtual ~GPUProcessState() {} + friend class GPUDeviceTest; + + // Helper method for unit tests to reset the ProcessState singleton by + // cleaning up everything. Never use in production. + virtual void TestOnlyReset(); + + ProcessState::MDMap* mem_desc_map() { + if (process_state_) return &process_state_->mem_desc_map_; + return nullptr; + } + + static GPUProcessState* instance_; + ProcessState* process_state_; // Not owned. + bool gpu_device_enabled_; + + mutex mu_; + + struct AllocatorParts { + std::unique_ptr allocator; + std::unique_ptr counter; + GPUBFCAllocator* bfc_allocator; + SubAllocator* sub_allocator; // owned by allocator + std::unique_ptr recording_allocator; + +#ifdef TF_GPU_USE_PJRT + // Not owning GPU allocator. The allocator is owned by PJRT. If + // `allocator_not_owned` is set, `allocator` owned by AllocatorParts won't + // be set. + Allocator* allocator_not_owned; +#endif // TF_GPU_USE_PJRT + }; + std::vector gpu_allocators_ TF_GUARDED_BY(mu_); + std::vector> gpu_visitors_ + TF_GUARDED_BY(mu_); + + std::vector gpu_host_allocators_ TF_GUARDED_BY(mu_); + std::vector> gpu_host_alloc_visitors_ + TF_GUARDED_BY(mu_); + std::vector> gpu_host_free_visitors_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_PROCESS_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_scheduling_metrics_storage.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_scheduling_metrics_storage.h new file mode 100644 index 00000000..5e665414 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_scheduling_metrics_storage.h @@ -0,0 +1,47 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SCHEDULING_METRICS_STORAGE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SCHEDULING_METRICS_STORAGE_H_ + +#include +#include +#include + +#include "xla/tsl/framework/real_time_in_memory_metric.h" + +namespace tensorflow { + +// Storage class that holds all the exported in memory metrics exported by GPU +// runtime. +class GpuSchedulingMetricsStorage { + public: + static GpuSchedulingMetricsStorage& GetGlobalStorage(); + + // Gets the metrics for estimated total GPU load. 
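// Illustrative usage sketch: reading the exported load metric through the
// global storage singleton. Only the accessors declared in this class are
// assumed.
//
//   const auto& load_metric =
//       GpuSchedulingMetricsStorage::GetGlobalStorage().TotalGpuLoadNs();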
+ tsl::RealTimeInMemoryMetric& TotalGpuLoadNs() { + return total_gpu_load_ns_; + } + + const tsl::RealTimeInMemoryMetric& TotalGpuLoadNs() const { + return total_gpu_load_ns_; + } + + private: + tsl::RealTimeInMemoryMetric total_gpu_load_ns_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SCHEDULING_METRICS_STORAGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h new file mode 100644 index 00000000..51a342fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h @@ -0,0 +1,94 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SERVING_DEVICE_SELECTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SERVING_DEVICE_SELECTOR_H_ + +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/fixed_array.h" +#include "absl/container/node_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "xla/tsl/framework/serving_device_selector.h" +#include "tensorflow/core/framework/resource_base.h" + +namespace tensorflow { +namespace gpu { +class GpuServingDeviceSelector; +const char kGpuServingDeviceSelectorResourceName[] = + "gpu_serving_device_selector"; + +class GpuServingDeviceSelectorResource : public ResourceBase { + public: + explicit GpuServingDeviceSelectorResource( + int num_devices, std::unique_ptr + device_selector_policy) + : selector_(std::make_unique( + num_devices, std::move(device_selector_policy))) {} + + std::string DebugString() const override { + return "GpuServingDeviceSelectorResource"; + }; + + GpuServingDeviceSelector* selector() const { return selector_.get(); } + + private: + std::unique_ptr selector_; +}; + +class GpuServingDeviceSelector : public tsl::ServingDeviceSelector { + public: + GpuServingDeviceSelector( + int num_devices, + std::unique_ptr device_selector_policy); + + tsl::DeviceReservation ReserveDevice( + absl::string_view program_fingerprint) override; + + // Enqueues the program on the stream of index `index_on_host`. + void Enqueue(int32_t index_on_host, absl::string_view fingerprint) override; + + // Marks the completion of a program on the given stream. + // If `had_error` is true, this function doesn't update program's execution + // time stats to avoid incorrect estimates. + void Completed(int32_t index_on_host, bool had_error) override; + + private: + friend class ServingDeviceSelectorTestHelper; + static void OverwriteNowNsFunctionForTest(int64_t (*now_ns)()); + + void FreeDeviceReservation( + const tsl::DeviceReservation& reservation) override; + + // Only for metrics reporting purposes. 
+ int64_t TotalEstimatedTimeTillIdleNs() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Mutex mu_; + absl::FixedArray device_states_ ABSL_GUARDED_BY(mu_); + std::unique_ptr device_selector_policy_; + int64_t req_id_counter_ ABSL_GUARDED_BY(mu_); + // Map from program fingerprint to execution info. + absl::node_hash_map execution_info_ + ABSL_GUARDED_BY(mu_); + std::optional min_exec_time_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace gpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_SERVING_DEVICE_SELECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_util.h new file mode 100644 index 00000000..0b650ad9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu/gpu_util.h @@ -0,0 +1,111 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { + +class RecvTensorResponse; +class TensorProto; + +class GPUUtil { + public: + // "tensor" is GPU-local. "dev" is the hosting GPU. + // "device_context" should be the context of the GPU "_Send" op + // which provides the Tensor. + // Sets all necessary fields of "proto" by transferring value + // bytes from GPU to CPU RAM. "is_dead" indicates that the + // tensor is dead with an uninit value. + static void SetProtoFromGPU(const Tensor& tensor, Device* dev, + const DeviceContext* device_context, + TensorProto* proto, bool is_dead, + StatusCallback done); + + // Copies the data in 'gpu_tensor' into 'cpu_tensor'. + // 'gpu_tensor''s backing memory must be on 'gpu_device' and + // 'cpu_tensor' must be allocated to be of the same size as + // 'gpu_tensor'. Synchronous: may block. + static void CopyGPUTensorToCPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* gpu_tensor, Tensor* cpu_tensor, + StatusCallback done); + + // Blocks until all operations queued on the stream associated with + // "gpu_device" at the time of the call have completed. Returns any + // error pending on the stream at completion. + static absl::Status Sync(Device* gpu_device); + + // Blocks until all operations queued on all streams associated with the + // corresponding GPU device at the time of call have completed. + // Returns any error pending on the stream at completion. + static absl::Status SyncAll(Device* gpu_device); + + // For debugging purpose, given a "device" and a "tensor" allocated + // on the device, return a string printing each byte in the tensor + // (up to a limit). 
"device" can be either a CPU or a GPU device. + static string MemoryDebugString(const Device* device, Tensor* tensor); + + // Map a Tensor as a DeviceMemory object wrapping the given typed + // buffer. + // + // NOTE: will be removed soon, see StreamExecutorUtil::AsDeviceMemory + // instead. + template + static se::DeviceMemory AsDeviceMemory(const Tensor& t) { + T* ptr = reinterpret_cast(const_cast(DMAHelper::base(&t))); + return se::DeviceMemory(se::DeviceMemoryBase(ptr, t.TotalBytes())); + } + + // Computes a checksum over the contents of "tensor", which is allocated + // on "gpu_device". + static uint64 Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor); + + // Computes a checksum over the contents of "tensor", which is allocated + // in local CPU RAM. + static uint64 Checksum(const Tensor& tensor); + + static void CopyCPUTensorToGPU(const Tensor* cpu_tensor, + const DeviceContext* device_context, + Device* gpu_device, Tensor* gpu_tensor, + StatusCallback done, bool sync_dst_compute); + + static void DeviceToDeviceCopy( + DeviceContext* send_dev_context, DeviceContext* recv_dev_context, + Device* src, Device* dst, AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output, + int dev_to_dev_stream_index, StatusCallback done); + + // Deep-copying of GPU tensor on the same device. + // 'src_gpu_tensor''s and 'dst_gpu_tensor''s backing memory must be on + // 'gpu_device' and 'dst_cpu_tensor' must be allocated to be of the same + // size as 'src_gpu_tensor'. + static void CopyGPUTensorToSameGPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* src_gpu_tensor, + Tensor* dst_gpu_tensor, + StatusCallback done); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu_device_context.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu_device_context.h new file mode 100644 index 00000000..e7486e97 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gpu_device_context.h @@ -0,0 +1,107 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace stream_executor { +class Stream; +} // namespace stream_executor + +namespace tensorflow { + +class GPUDeviceContext : public DeviceContext { + public: + // Does not take ownership of streams. 
+ GPUDeviceContext( + int stream_id, se::Stream* stream, +#if TENSORFLOW_USE_ROCM + se::Stream* nccl_stream, +#endif + se::Stream* host_to_device_stream, se::Stream* device_to_host_stream, + absl::InlinedVector device_to_device_stream, + Allocator* host_memory_allocator) + : stream_id_(stream_id), + stream_(stream), +#if TENSORFLOW_USE_ROCM + nccl_stream_(nccl_stream), +#endif + host_to_device_stream_(host_to_device_stream), + device_to_host_stream_(device_to_host_stream), + device_to_device_stream_(device_to_device_stream), + host_memory_allocator_(host_memory_allocator) { + } + + ~GPUDeviceContext() override {} + + se::Stream* stream() const override { return stream_; } +#if TENSORFLOW_USE_ROCM + se::Stream* nccl_stream() const { return nccl_stream_; } +#endif + se::Stream* host_to_device_stream() const { return host_to_device_stream_; } + se::Stream* device_to_host_stream() const { return device_to_host_stream_; } + se::Stream* device_to_device_stream(int index) const { + return device_to_device_stream_[index % device_to_device_stream_.size()]; + } + int stream_id() const { return stream_id_; } + Allocator* host_memory_allocator() const override { + return host_memory_allocator_; + } + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view edge_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override; + + void MaintainLifetimeOnStream(const Tensor* t, + se::Stream* stream) const override {} + + absl::Status ThenExecute(Device* device, se::Stream* stream, + std::function func) override; + + private: + int stream_id_; + // The default primary stream to use for this context. + // All the memory belongs to this stream. + se::Stream* stream_; +#if TENSORFLOW_USE_ROCM + // The stream to use for nccl operations. + se::Stream* nccl_stream_; +#endif + // The stream to use for copying data from host into GPU. + se::Stream* host_to_device_stream_; + // The stream to use for copying data from GPU to host. + se::Stream* device_to_host_stream_; + // Streams to use for copying data between GPUs. + absl::InlinedVector device_to_device_stream_; + // The allocator to use for allocating pinned host memory. + // Not owned. + Allocator* host_memory_allocator_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/gradients.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gradients.h new file mode 100644 index 00000000..aaa9cad8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/gradients.h @@ -0,0 +1,58 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRADIENTS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRADIENTS_H_ + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +// Represents the output of 'node' at 'index'. +struct NodeOut { + Node* node; + int index; + + // Returns the string name that represents the output of this node. + string name() const; + // Returns the data type of the output of this node. + DataType dtype() const; +}; + +// NOTE: This API is a work in progress and will likely be changing frequently. +// +// Given initial gradient-node outputs 'y_grad_node_outputs' (which compute the +// symbolic partial derivatives of some loss function 'L' w.r.t the node outputs +// 'y_node_outputs'), adds gradient nodes to 'graph' that compute the symbolic +// partial derivatives of 'L' w.r.t the node outputs 'x_node_outputs'. +// +// REQUIRES: Each node in 'x_node_outputs' to be unique, and so to have a single +// output (this restriction will be removed in a subsequent change). + +// TODO(andydavis) Add symbolic gradient support for general graphs (the current +// implementation only supports gradients for functions). In particular, +// the nodes in 'x_nodes' are currently restricted to have one output. + +absl::Status AddSymbolicGradients(absl::Span y_node_outputs, + absl::Span x_node_outputs, + absl::Span y_grad_node_outputs, + std::vector* x_grad_node_outputs, + Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRADIENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_constructor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_constructor.h new file mode 100644 index 00000000..5f97f387 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_constructor.h @@ -0,0 +1,210 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_CONSTRUCTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_CONSTRUCTOR_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +class ShapeRefiner; + +// Construct a Graph *g out of a GraphDef gdef. Returns non-OK on +// error, in which case *g is left in an incomplete state. +// +// *g is expected to be an empty graph (with no more than a source and sink +// nodes) when provided to ConvertGraphDefToGraph. To enhance an existing Graph, +// see ImportGraphDef. +struct GraphConstructorOptions { + GraphConstructorOptions() = default; + + // If true, allows internal ops in the GraphDef. 
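// Illustrative usage sketch, assuming a GraphDef `graph_def` and the usual
// op-registry headers are available:
//
//   GraphConstructorOptions opts;
//   opts.allow_internal_ops = true;
//   Graph graph(OpRegistry::Global());
//   absl::Status s = ConvertGraphDefToGraph(opts, graph_def, &graph);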
+ bool allow_internal_ops = false; + + // If true, the graph def is expected to have fully specified + // devices for all nodes. A node in the resulting graph "g" has the + // device name set accordingly. + // + // TODO(zhifengc): if possible, consider removing this option. + bool expect_device_spec = false; + + // If true, validates that nodes being converted have all expected attrs + // set and no unknown attrs set by calling ValidateNodeDef(). + // Setting validate_nodes without add_default_attributes, will fail if + // the GraphDef does not have all required attributes set. + bool validate_nodes = false; + + // If true, GraphConstructor will add attributes with their default + // value to the Node when they are missing from the NodeDef. + bool add_default_attributes = true; +}; +extern absl::Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, + const GraphDef& gdef, Graph* g); +extern absl::Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, + GraphDef&& gdef, Graph* g); + +// Same as ConvertGraphDefToGraph, but takes just nodes. Used by function +// instantiation. +// TODO(irving): This will turn into std::vector soon. +extern absl::Status ConvertNodeDefsToGraph( + const GraphConstructorOptions& opts, absl::Span nodes, + Graph* g, const GraphDebugInfo* debug_info = nullptr); + +// Options for calling ImportGraphDef(). +struct ImportGraphDefOptions { + ImportGraphDefOptions() + : uniquify_names(false), + uniquify_prefix(false), + skip_mapped_nodes(false), + validate_shape(true), + propagate_device_spec(false) {} + + // Name prefix to use for nodes imported from the GraphDef. For example, if + // prefix="animals" and GraphDef contains a node "bunny" then the node will be + // named "animals/bunny" in *g. Must not be already used as a node name or + // prefix in the graph. + string prefix; + + // If true, imported node names will be modified if their name already exists + // in the graph. If false, conflicting names will be treated as an error. Note + // that this option has no effect if `prefix` is specified, since `prefix` + // will guarantee all node names are unique. + bool uniquify_names; + + // If true, `prefix` will be modified if it already exists as a node name or + // prefix in the graph. If false, a conflicting prefix will be treated as an + // error. This option has no effect if `prefix` isn't specified. + bool uniquify_prefix; + + // Maps tensors in `gdef` to existing tensors in `g`. Inputs in `gdef` + // corresponding to `input_map` keys will be remapped to the nodes in `g` + // corresponding to the values. + // + // Keys should not include `prefix`, i.e., a key ID's name should be the name + // as it originally appears in `gdef`. + // + // If this is non-empty, ImportGraphDef must be called with the shape refiner + // used to create the existing nodes referenced in `input_map`. + // TODO(skyewm): can we remove this requirement? How do we access the original + // shape refiner? + std::map input_map; + + // If true, nodes that will have all output edges removed because of + // overrides in `input_map` will not be imported. + bool skip_mapped_nodes; + + // The names of existing nodes in `g` that the imported graph should have + // control dependencies on. + // + // Note that to avoid creating many redundant control edges, ImportGraphDef() + // won't add control edges to nodes that will inherit the dependencies from + // other nodes in `gdef`. 
+ std::vector control_dependencies; + + // Tensors in `gdef` that will be returned via the ImportGraphDefResults + // output parameter of `ImportGraphDef()`. If this list is non-empty, the + // caller must pass a results object to `ImportGraphDef()`. The + // `return_tensors` field will be populated with the imported nodes in `g`. + // + // Entries should not include `prefix`, i.e., each ID's name should be the + // name as it originally appears in `gdef`. + // + // If this contains a tensor that's also being remapped via `input_map`, the + // corresponding existing tensor in `g` will be returned. + std::vector return_tensors; + + // The names of nodes in `gdef` that will be returned via the + // ImportGraphDefResults output parameter of `ImportGraphDef()`. If this list + // is non-empty, the caller must pass a results object to + // `ImportGraphDef()`. The `return_nodes` field will be populated with the + // imported nodes in `g`. + // + // Entries should not include `prefix`, i.e., each node's name should be the + // name as it originally appears in `gdef`. + // + // Unlike `return_tensors`, `input_map` has no effect on the nodes + // returned. `return_nodes` must be empty if `skip_mapped_nodes` is true. + // TODO(skyewm): make this work with `skip_mapped_nodes` if there's a need. + std::vector return_nodes; + + // If true, checks that all colocation constraints are nodes in the GraphDef. + bool validate_colocation_constraints = true; + + // If false skips shape validation. + bool validate_shape; + + // TODO(ashankar): Enable handling of GraphDefs produced by newer binaries + // with ops that are not defined in the binary calling ImportGraphDef. + // Similar to the producer_op_list argument to import_graph_def in the + // python API. + + // Try to set default execution device for this grapth. + string default_device; + + // If true, propagates a node's assigned device. By default the runtime + // will recompute the assigned device every time. + bool propagate_device_spec; +}; + +// Optional results that may be returned by ImportGraphDef. +struct ImportGraphDefResults { + // The requested tensors associated with + // ImportGraphDefOptions::return_tensors. Note that the index may be different + // than the requested index if the returned tensor has been remapped according + // to `input_map`. + typedef int Index; + std::vector> return_tensors; + + // The requested nodes associated with ImportGraphDefOptions::return_nodes. + std::vector return_nodes; + + // Keys in ImportGraphDefOptions::input_map that don't appear in `gdef` and + // weren't used as an input to any node in `gdef`. These keys are likely due + // to typos, and callers may wish to treat their existence as an error. + std::vector missing_unused_input_map_keys; +}; + +// Adds the graph in GraphDef `gdef` into an existing Graph `*g`. +// +// On error, returns non-OK and leaves `*g` unmodified. +// +// `refiner` can be null. It should be non-null if the caller +// intends to add additional nodes to the graph after the import. This +// allows the caller to validate shapes of those nodes (since +// ShapeRefiner::AddNode must be called in topological order). +// +// `results` must be non-null if `opts.return_tensors` or `opts.result_nodes` is +// non-empty. It can also be set to fetch the unused input map keys. If it's +// non-null, all the vector fields must be empty. +// +// TODO(ashankar): Push this mechanism and get rid of Session::Extend() +// as a means of enhancing an existing Graph. 
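// Illustrative usage sketch: importing a GraphDef `gdef` into an existing
// Graph `graph` under a prefix, with no shape refiner and no requested
// results.
//
//   ImportGraphDefOptions import_opts;
//   import_opts.prefix = "imported";
//   absl::Status s =
//       ImportGraphDef(import_opts, gdef, &graph, /*refiner=*/nullptr);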
+extern absl::Status ImportGraphDef(const ImportGraphDefOptions& opts, + const GraphDef& gdef, Graph* g, + ShapeRefiner* refiner, + ImportGraphDefResults* results = nullptr); + +// Make a copy of "src" into "*dest". +// +// REQUIRES: "*dest" is a freshly allocated graph without any nodes or edges +// other than the implicit Source/Sink nodes. +extern void CopyGraph(const Graph& src, Graph* dest); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_CONSTRUCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_def_builder_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_def_builder_util.h new file mode 100644 index 00000000..8fb53997 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_def_builder_util.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_DEF_BUILDER_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_DEF_BUILDER_UTIL_H_ + +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class Graph; + +// Converts the `GraphDef` being built by `builder` to a `Graph` and +// stores it in `*graph`. +// TODO(josh11b): Make this faster; right now it converts +// Graph->GraphDef->Graph. This cleans up the graph (e.g. adds +// edges from the source and to the sink node, resolves back edges +// by name), and makes sure the resulting graph is valid. +absl::Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, + Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_DEF_BUILDER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_execution_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_execution_state.h new file mode 100644 index 00000000..4f713ae9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_execution_state.h @@ -0,0 +1,243 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_EXECUTION_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_EXECUTION_STATE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/build_graph_options.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/costmodel.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +struct SessionOptions; + +namespace subgraph { +struct RewriteGraphMetadata; +} + +struct GraphExecutionStateOptions { + const DeviceSet* device_set = nullptr; + const SessionOptions* session_options = nullptr; + // Unique session identifier. Can be empty. + string session_handle; + // A map from node name to device name, representing the unchangeable + // placement of stateful nodes. + std::unordered_map stateful_placements; + // Whether to run Placer on the graph. + bool run_placer = true; + + // Whether to enable tf2xla mlir bridge. The default is true and intends to + // work for almost all models. Non default values should only applied to + // selective models. + bool enable_tf2xla_mlir_bridge = true; +}; + +// A ClientGraph is simply a sub-graph of the full graph as induced by +// BuildGraphOptions. +struct ClientGraph { + explicit ClientGraph(std::unique_ptr flib, + DataTypeVector feed_types, DataTypeVector fetch_types, + int64_t collective_graph_key) + : flib_def(std::move(flib)), + graph(flib_def.get()), + feed_types(std::move(feed_types)), + fetch_types(std::move(fetch_types)), + collective_graph_key(collective_graph_key) {} + // Each client-graph gets its own function library since optimization passes + // post rewrite for execution might want to introduce new functions. + std::unique_ptr flib_def; + Graph graph; + DataTypeVector feed_types; + DataTypeVector fetch_types; + int64_t collective_graph_key; +}; + +// GraphExecutionState is responsible for generating an +// executable ClientGraph from the original GraphDef that specifies +// the complete graph and from BuildGraphOptions which specifies +// input/output nodes. +// +// An executable Graph differs from a GraphDef by being Placed, +// meaning that each Node is assigned to a single Device in the +// available set. +// +// When GraphExecutionState is first constructed it instantiates +// a full Graph from the provided GraphDef, and places it, using only +// the static device assignments from the GraphDef. Nodes without are +// currently placed in a very naive way. Since stateful Nodes cannot +// be moved after initial placement, it is important that stateful +// Nodes get sensible initial device assignments in the graph +// definition. +// +// Subsequently, GraphExecutionState generates a SimpleClientGraph on +// demand, which is a sub-graph of the latest placement of the full +// Graph. MasterSession uses such a ClientGraph to execute one or +// more similar client requests. +// +// GraphExecutionState is thread-safe. + +class GraphExecutionState { + public: + virtual ~GraphExecutionState(); + + // Creates a new `GraphExecutionState` for the given + // `graph_def`, which represents the entire graph for a session. 
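// Illustrative usage sketch: building an execution state for a whole GraphDef
// and then extracting a ClientGraph. `device_set`, `session_options`,
// `graph_def` and `build_graph_options` are assumed to be set up by the
// caller.
//
//   GraphExecutionStateOptions state_opts;
//   state_opts.device_set = &device_set;
//   state_opts.session_options = &session_options;
//   std::unique_ptr<GraphExecutionState> state;
//   absl::Status s = GraphExecutionState::MakeForBaseGraph(
//       std::move(graph_def), state_opts, &state);
//   std::unique_ptr<ClientGraph> client_graph;
//   if (s.ok()) s = state->BuildGraph(build_graph_options, &client_graph);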
+ static absl::Status MakeForBaseGraph( + GraphDef&& graph_def, const GraphExecutionStateOptions& options, + std::unique_ptr* out_state); + + // Creates a new `GraphExecutionState` and `SimpleClientGraph` + // for the subgraph of `original_graph_def` defined by + // `subgraph_options`. + static absl::Status MakeForPrunedGraph( + const GraphExecutionState& base_execution_state, + const GraphExecutionStateOptions& options, + const BuildGraphOptions& subgraph_options, + std::unique_ptr* out_state, + std::unique_ptr* out_client_graph); + + // Creates a new GraphExecutionState representing the + // concatenation of this graph, and the graph defined by + // "extension_def". The same name may not be used to define a node + // in both this graph and "extension_def". + // + // If successful, returns OK and the caller takes ownership of "*out". + // Otherwise returns an error and does not modify "*out". + // + // After calling `old_state->Extend()`, `old_state` may no longer be + // used. + // + // NOTE(mrry): This method respects the placement of stateful nodes in + // in *this, but currently does not transfer any other placement + // or cost model information to the new graph. + // + // Note that using this interface requires setting the value of + // config.experimental().disable_optimize_for_static_graph() in the state + // options to `true`, otherwise it will return an error. + absl::Status Extend(const GraphDef& extension_def, + std::unique_ptr* out) const; + + // Builds a ClientGraph (a sub-graph of the full graph as induced by + // the Node set specified in "options"). If successful, returns OK + // and the caller takes the ownership of "*out". Otherwise, returns + // an error. + absl::Status BuildGraph(const BuildGraphOptions& options, + std::unique_ptr* out); + + // Optimize the graph with the node set specified in `options`. + absl::Status OptimizeGraph( + const BuildGraphOptions& options, const Graph& graph, + const FunctionLibraryDefinition* flib_def, + std::unique_ptr* optimized_graph, + std::unique_ptr* optimized_flib); + + // The graph returned by BuildGraph may contain only the pruned + // graph, whereas some clients may want access to the full graph. + const Graph* full_graph() { return graph_; } + + // The original graph. + GraphDef* original_graph_def() { return original_graph_def_.get(); } + + // The original function library of this graph. + const FunctionLibraryDefinition& flib_def() const { return *flib_def_; } + + // Returns the node with the given name, or null if it does not exist. + const Node* get_node_by_name(const string& name) const { + NodeNameToCostIdMap::const_iterator iter = + node_name_to_cost_id_map_.find(name); + if (iter != node_name_to_cost_id_map_.end()) { + return graph_->FindNodeId(iter->second); + } else { + return nullptr; + } + } + + // Returns the map of stateful placements as a map of + // node name to placement string. + std::unordered_map GetStatefulPlacements() const { + return stateful_placements_; + } + + private: + GraphExecutionState(std::unique_ptr&& graph_def, + std::unique_ptr&& flib_def, + const GraphExecutionStateOptions& options); + + absl::Status InitBaseGraph(std::unique_ptr&& graph, + bool enable_tf2xla_mlir_bridge = true); + + // Map of placed stateful nodes, i.e. nodes for which is_stateful() + // is true, such as "params" and "queue" nodes. Once placed these + // nodes can not be moved to a different device. Maps node names to + // device names. + std::unordered_map stateful_placements_; // Immutable after + // ctor. 
+ void SaveStatefulNodes(Graph* graph); + void RestoreStatefulNodes(Graph* graph); + + // Extract the subset of the graph that needs to be run, adding feed/fetch + // ops as needed. + absl::Status PruneGraph(const BuildGraphOptions& options, Graph* graph, + subgraph::RewriteGraphMetadata* out_rewrite_metadata); + + // The GraphExecutionState must store a copy of the original GraphDef if + // either of the following conditions holds: + // + // * `session_options_.config.graph_options().place_pruned_graph()` is true. + // * `session_options_.config.experimental().optimize_for_static_graph()` is + // false. + const std::unique_ptr original_graph_def_; + + const DeviceSet* device_set_; // Not owned + const SessionOptions* session_options_; // Not owned + // Unique session identifier. Can be empty. + string session_handle_; + + // Map from name to Node for the full graph in placed_. + NodeNameToCostIdMap node_name_to_cost_id_map_; + + // 'flib_def_' is initialized from the initial graph def's library, + // and may be updated by a graph optimization pass. + std::unique_ptr flib_def_; + + // `rewrite_metadata_` is only set for GraphExecutionState + // objects created by `MakeForPrunedGraph()`. + std::unique_ptr rewrite_metadata_; + + // The dataflow graph owned by this object. + Graph* graph_; + + // Whether to run Placer. + bool run_placer_; + + GraphExecutionState(const GraphExecutionState&) = delete; + void operator=(const GraphExecutionState&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_EXECUTION_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_optimizer.h new file mode 100644 index 00000000..f8322cfe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_optimizer.h @@ -0,0 +1,100 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class GraphOptimizer { + public: + using NodePredicate = std::function; + + struct Options { + // If not null it maps from nodes in graph to partially-known + // shapes of their outputs, and may be used, e.g., in the constant folding + // pass. The use of shape_map implies that the mapping from node name to the + // vector of partial shapes of its outputs is stable, i.e., no optimization + // pass may replace a node with a different node of the same name that has a + // different number of outputs, or outputs with different known shapes. 
+ // TODO(b/65453533) introduce a unique way to name nodes in a graph. + std::unordered_map>* shape_map = + nullptr; + + // If not null then only nodes for which cse_consider_fn returns true will + // be considered for CSE. + NodePredicate cse_consider_fn = nullptr; + + // If not null then only nodes for which cf_consider_fn returns true will be + // considered for CF. + NodePredicate cf_consider_fn = nullptr; + + // If true, multi-device functions will be inlined if + // opts_.do_function_inlining() is true. + bool inline_multi_device_functions = false; + + // If true, functions in implementation selection group will be inlined if + // opts_.do_function_inlining() is true. + bool inline_impl_selection_group_functions = false; + + // If true all functions will be inlined with a single device function + // body placer strategy. + bool inline_with_single_device_body_placer = false; + + // If true, the _noinline attribute on functions and callers is ignored. + bool ignore_noinline = false; + }; + + explicit GraphOptimizer(const OptimizerOptions& opts); + ~GraphOptimizer(); + + // Applies optimization passes specified in 'opts' to 'graph'. + // Maybe replace *graph with a new graph object. 'device' is device + // on which the 'graph' will execute. It's passed to the optimizers + // so that they can respect constraints if any, that should be + // respected. + void Optimize(FunctionLibraryRuntime* runtime, Env* env, const Device* device, + std::unique_ptr* graph, + const Options& graph_optimizer_options); + + const OptimizerOptions& options() { return opts_; } + + private: + OptimizerOptions opts_; + + GraphOptimizer(const GraphOptimizer&) = delete; + void operator=(const GraphOptimizer&) = delete; +}; + +// Applies graph rewrite optimization such as inlining, dead code +// removal, etc. +// +// **g is a graph constructed based on the runtime library 'lib'. +// OptimizeGraph mutates **g extensively and replaces '*g' with a +// complete copy. Therefore, the caller should not keep any references +// to nodes *g. +void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr* g, + const GraphOptimizer::Options& graph_optimizer_options); +void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr* g); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_runner.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_runner.h new file mode 100644 index 00000000..a40d17b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_runner.h @@ -0,0 +1,74 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_RUNNER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_RUNNER_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; + +class Device; +class Graph; + +// GraphRunner takes a Graph, some inputs to feed, and some outputs +// to fetch and executes the graph required to feed and fetch the +// inputs and outputs. +// +// This class is only meant for internal use where one needs to +// partially evaluate inexpensive nodes in a graph, such as for shape +// inference or for constant folding. Because of its limited, simple +// use-cases, it executes all computation on the given device (CPU by default) +// and is not meant to be particularly lightweight, fast, or efficient. +class GraphRunner { + public: + // REQUIRES: `env` is not nullptr. + GraphRunner(Env* env); + // REQUIRES: 'device' is not nullptr. Not owned. + GraphRunner(Device* device); + ~GraphRunner(); + + // Function semantics for `inputs`, `output_names` and `outputs` + // matches those from Session::Run(). + // + // NOTE: The output tensors share lifetime with the GraphRunner, and could + // be destroyed once the GraphRunner is destroyed. + // + // REQUIRES: `graph`, `env`, and `outputs` are not nullptr. + // `function_library` may be nullptr. + typedef std::vector> NamedTensorList; + absl::Status Run(Graph* graph, FunctionLibraryRuntime* function_library, + const NamedTensorList& inputs, + const std::vector& output_names, + std::vector* outputs); + + private: + std::unique_ptr device_deleter_; + Device* const device_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_view.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_view.h new file mode 100644 index 00000000..d1fe278a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/graph_view.h @@ -0,0 +1,258 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_VIEW_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_VIEW_H_ + +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Device; +class Graph; +class Node; +class OpKernel; +class Tensor; + +// Represents a single data edge in a `NodeItem`. 
+struct EdgeInfo { + // The node ID of the destination in the containing `GraphView`. + int dst_id; + // The index of the output that produces values on this edge. + int output_slot : 31; + // true if this is the last info for output_slot in the EdgeInfo list. + bool is_last : 1; + // The index of the input that consumes values on this edge. + int input_slot; +}; + +// Represents a single control edge in a `NodeItem`. +struct ControlEdgeInfo { + // The node ID of the destination in the containing `GraphView`. + int dst_id; +}; + +// Compact structure representing a graph node and its associated kernel. +// +// Each NodeItem is an element of exactly one GraphView. +struct NodeItem { + // The index of this node's item in its GraphView. + int node_id = -1; + + // Cached attributes of this node for fast lookup. + bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr + bool is_merge : 1; // True iff IsMerge(node) + bool is_enter : 1; // True iff IsEnter(node) + bool is_constant_enter : 1; // True iff IsEnter(node) and + // node->GetAttr("is_constant") == true. + bool is_exit : 1; // True iff IsExit(node) + bool is_control_trigger : 1; // True iff IsControlTrigger(node) + bool is_source : 1; // True iff IsSource(node) + // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node) + bool is_enter_exit_or_next_iter : 1; + bool is_transfer_node : 1; // True iff IsTransferNode(node) + bool is_initialization_op : 1; // True iff IsInitializationOp(node) + bool is_recv_or_switch : 1; // True iff IsRecv(node) || IsSwitch(node) + bool is_next_iteration : 1; // True iff IsNextIteration(node) + bool is_noop : 1; // True iff item->kernel->type_string_view() == "NoOp") + bool + is_any_consumer_merge_or_control_trigger : 1; // True iff the destination + // of any output edge is a + // merge or control trigger + // node. + bool is_any_input_ref_typed : 1; // True iff any IsRefType(dt) for dt in this + // node's input types. + bool is_distributed_communication : 1; // True iff the op is registered to + // use distributed communication. + + // The kernel for this node. + OpKernel* kernel = nullptr; + + // If the kernel is a Const op, this containts points to the constant tensor. + const Tensor* const_tensor = nullptr; + + // Cached values of node->num_inputs() and node->num_outputs(), to + // avoid levels of indirection. + int num_inputs; + int num_outputs; + + // ExecutorImpl::tensors_[input_start] is the 1st positional input + // for this node. + int input_start = 0; + + // Number of output edges, excluding control edges. + int32 num_output_edges; + + // Number of output control edges. + int32 num_output_control_edges; + + // If non-null, contains an array of num_outputs bools, where the ith bool + // is true if and only if the ith output is consumed by another node. + std::unique_ptr outputs_required; + + absl::Span mutable_output_edges() { + return absl::Span(output_edge_base(), num_output_edges); + } + + gtl::ArraySlice output_edges() const { + return gtl::ArraySlice(output_edge_base(), num_output_edges); + } + + gtl::ArraySlice output_control_edges() const { + return gtl::ArraySlice(output_control_edge_base(), + num_output_control_edges); + } + + DataType input_type(int i) const { + DCHECK_LT(i, num_inputs); + return static_cast(input_type_base()[i]); + } + DataType output_type(int i) const { + DCHECK_LT(i, num_outputs); + return static_cast(output_type_base()[i]); + } + + // Return array of per-output allocator attributes. 
+ const AllocatorAttributes* output_attrs() const { return output_attr_base(); } + + // Return array of expected input index from which each output should + // be forwarded: + // kNeverForward (-2) for DO NOT FORWARD (must allocate). + // kNoReservation (-1) for no expected forwarding. + // 0... for forward from that input. + const int* forward_from() const { return forward_from_base(); } + + string DebugString() const; + + private: + friend class GraphView; + + NodeItem() {} + + // Variable length section starts immediately after *this + // (uint8 is enough for DataType). + // EdgeInfo out_edges[num_output_edges]; + // ControlEdgeInfo out_control_edges[num_output_control_edges]; + // AllocatorAttributes output_attr[num_outputs]; + // int forward_from[num_outputs]; + // uint8 input_type[num_inputs]; + // uint8 output_type[num_outputs]; + + // Return pointer to variable length section. + char* var() const { + return const_cast(reinterpret_cast(this) + + sizeof(NodeItem)); + } + + EdgeInfo* output_edge_base() const { + return reinterpret_cast(var()); + } + + ControlEdgeInfo* output_control_edge_base() const { + return reinterpret_cast(var() + sizeof(EdgeInfo) * + num_output_edges); + } + + AllocatorAttributes* output_attr_base() const { + return reinterpret_cast( + var() + sizeof(EdgeInfo) * num_output_edges + + sizeof(ControlEdgeInfo) * num_output_control_edges); + } + int* forward_from_base() const { + return reinterpret_cast(var() + sizeof(EdgeInfo) * num_output_edges + + sizeof(ControlEdgeInfo) * + num_output_control_edges + + sizeof(AllocatorAttributes) * num_outputs); + } + uint8* input_type_base() const { + return reinterpret_cast( + var() + sizeof(EdgeInfo) * num_output_edges + + sizeof(ControlEdgeInfo) * num_output_control_edges + + sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs); + } + uint8* output_type_base() const { + return reinterpret_cast( + var() + sizeof(EdgeInfo) * num_output_edges + + sizeof(ControlEdgeInfo) * num_output_control_edges + + sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs + + sizeof(uint8) * num_inputs); + } + + NodeItem(const NodeItem&) = delete; + void operator=(const NodeItem&) = delete; +}; + +// Immutable view of a Graph organized for efficient execution. +// +// TODO(b/152651962): Add independent unit tests for this class. +class GraphView { + public: + GraphView() : space_(nullptr) {} + ~GraphView(); + + absl::Status Initialize(const Graph* g); + absl::Status SetAllocAttrs(const Graph* g, const Device* device); + void SetScopedAllocatorAttrs(const std::vector& sa_nodes); + + // Returns a mutable pointer to the `NodeItem` with the given `id` if it + // exists in the graph, or `nullptr` if it does not. + NodeItem* node(int32_t id) const { + DCHECK_GE(id, 0); + DCHECK_LT(id, num_nodes_); + uint32 offset = node_offsets_[id]; + return ((offset == kuint32max) + ? nullptr + : reinterpret_cast(space_ + node_offsets_[id])); + } + + // Returns the `NodeItem` with the given `id`. + // + // REQUIRES: `id` must be the ID of a valid node in the graph. 
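// Illustrative usage sketch: iterating over every NodeItem of an initialized
// view. `graph` is assumed to be an already-constructed Graph.
//
//   GraphView gview;
//   if (gview.Initialize(&graph).ok()) {
//     for (int32_t id = 0; id < gview.num_nodes(); ++id) {
//       if (const NodeItem* item = gview.node(id)) {
//         VLOG(2) << "node " << id << ": " << item->num_inputs << " inputs, "
//                 << item->num_outputs << " outputs";
//       }
//     }
//   }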
+ const NodeItem& node_ref(int32_t id) const { + DCHECK_GE(id, 0); + DCHECK_LT(id, num_nodes_); + uint32 offset = node_offsets_[id]; + DCHECK_NE(offset, kuint32max); + return *reinterpret_cast(space_ + node_offsets_[id]); + } + + int32 num_nodes() const { return num_nodes_; } + + private: + char* InitializeNode(char* ptr, const Node* n); + size_t NodeItemBytes(const Node* n); + + int32 num_nodes_ = 0; + uint32* node_offsets_ = nullptr; // array of size "num_nodes_" + // node_offsets_[id] holds the byte offset for node w/ "id" in space_ + + char* space_; // NodeItem objects are allocated here + + GraphView(const GraphView&) = delete; + void operator=(const GraphView&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h new file mode 100644 index 00000000..fd5ee985 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h @@ -0,0 +1,87 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ + +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { + +// Hierarchical tree-algorithm implementation of collective broadcast. +class HierarchicalTreeBroadcaster : public CollectiveImplementationInterface { + public: + HierarchicalTreeBroadcaster(); + ~HierarchicalTreeBroadcaster() override = default; + + // Establishes the subdiv permutations needed for a hierarchical broadcast. + // If all devices are local, establishes a single subdiv comprising all + // devices. If any devices are on a different task, establishes n+1 subdivs + // for n tasks. + // The first subdiv comprises one device per task which gets the tensor on + // each task. Subdiv i+1 corresponds to a task-local tree-broadcast for task + // i. + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override; + + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. Also saves the CollectiveContext in this object. + absl::Status InitializeCollectiveContext( + std::shared_ptr col_ctx) override; + + // Begins async execution of the hierarchical tree broadcast. + // Must be called in a blockable thread. + // TODO(b/80529858): remove the previous warning when we have a dedicated + // collective threadpool. + void Run(StatusCallback done) override; + + // Returns the rank of the device from which this device should receive + // its value, -1 if no value should be received. 
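// Illustrative usage sketch: querying the tree topology for subdivision 0 of
// an already-initialized CollectiveParams `cp`. The target container is
// assumed to be a std::vector<int> of device ranks.
//
//   int recv_rank = HierarchicalTreeBroadcaster::TreeRecvFrom(cp, /*subdiv=*/0);
//   std::vector<int> targets;
//   HierarchicalTreeBroadcaster::TreeSendTo(cp, /*subdiv=*/0, &targets);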
+ static int TreeRecvFrom(const CollectiveParams& cp, int subdiv); + + // Populates targets with the ranks of the devices to which this device + // should forward the value. + static void TreeSendTo(const CollectiveParams& cp, int subdiv, + std::vector* targets); + + private: + // Get the task to which the device at `device_rank` belongs. + int GetDeviceTask(int device_rank, const std::vector& dev_per_task); + + // Sends `src_tensor` asynchronously from this device to device at `dst_rank` + // in `subdiv`. Calls `done` upon completion. + void DispatchSend(int subdiv, int dst_rank, int src_rank, + const Tensor* src_tensor, const StatusCallback& done); + + // Receives a tensor into the memory buffer owned by `dst_tensor` at this + // device from device at `src_rank` in `subdiv`. Calls `done` upon + // completion. + void DispatchRecv(int subdiv, int src_rank, int dst_rank, Tensor* dst_tensor, + const StatusCallback& done); + + // Executes the hierarchical broadcast defined by this op. + void RunTree(); + + std::shared_ptr col_ctx_; + const CollectiveParams* col_params_; // Not owned + StatusCallback done_; + absl::Status status_; + bool is_source_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/immutable_executor_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/immutable_executor_state.h new file mode 100644 index 00000000..6a12bc1f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/immutable_executor_state.h @@ -0,0 +1,163 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_IMMUTABLE_EXECUTOR_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_IMMUTABLE_EXECUTOR_STATE_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/common_runtime/graph_view.h" +#include "tensorflow/core/common_runtime/local_executor_params.h" +#include "tensorflow/core/common_runtime/pending_counts.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Graph; + +// Represents the state of an executor (graph and control flow information) +// that is immutable throughout execution. +// +// TODO(b/152651962): Add independent unit tests for this class. +class ImmutableExecutorState { + public: + struct FrameInfo { + explicit FrameInfo(string name) + : name(std::move(name)), + input_count(0), + total_inputs(0), + pending_counts(nullptr), + nodes(nullptr), + parallel_iterations(-1) {} + + // The name of the frame. + string name; + + // The total number of inputs to a frame. 
+ int input_count; + + // The total number of input tensors of a frame. + // == sum(nodes[*].num_inputs()) where nodes are the nodes in the frame. + int total_inputs; + + // Used to determine the next place to allocate space in the + // pending_counts data structure we'll eventually construct + PendingCounts::Layout pending_counts_layout; + + // Each frame has its own PendingCounts only for the nodes in the frame. + std::unique_ptr pending_counts; + + // The nodes in a frame. Used only for debugging. + std::unique_ptr> nodes; + + // The number of iterations of this frame that can execute concurrently. + int32 parallel_iterations; + }; + + explicit ImmutableExecutorState(const LocalExecutorParams& p) + : params_(p), gview_() {} + ~ImmutableExecutorState(); + + absl::Status Initialize(const Graph& graph); + + // Process all Nodes in the current graph, attempting to infer the + // memory allocation attributes to be used wherever they may allocate + // a tensor buffer. + absl::Status SetAllocAttrs(); + + const LocalExecutorParams& params() const { return params_; } + const GraphView& graph_view() const { return gview_; } + const std::vector& pending_ids() const { + return pending_ids_; + } + const std::vector& root_nodes() const { return root_nodes_; } + + const FrameInfo& get_root_frame_info() const { return *root_frame_info_; } + + const FrameInfo& get_enter_frame_info(const NodeItem& node_item) const { + DCHECK(node_item.is_enter); + return *enter_frame_info_[node_item.node_id]; + } + + bool requires_control_flow_support() const { return requires_control_flow_; } + + // Copies the pending counts for nodes in this graph to the given array. + // + // This method provides a more efficient way of initializing + // `SimplePropagatorState` than individually accessing the pending counts from + // `get_root_frame_info().counts`. + // + // REQUIRES: `!requires_control_flow_support && len(dest) == + // graph_view().num_nodes()`. + void copy_pending_counts(std::atomic* dest) const { + DCHECK(!requires_control_flow_); + memcpy(dest, atomic_pending_counts_.get(), + graph_view().num_nodes() * sizeof(std::atomic)); + std::atomic_thread_fence(std::memory_order_release); + } + + private: + struct ControlFlowInfo { + gtl::FlatSet unique_frame_names; + std::vector frame_names; + }; + + static absl::Status BuildControlFlowInfo(const Graph* graph, + ControlFlowInfo* cf_info); + void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); + + FrameInfo* EnsureFrameInfo(const string& fname); + + // Owned. + LocalExecutorParams params_; + GraphView gview_; + bool requires_control_flow_; + std::vector pending_ids_; + + // Root nodes (with no in edges) that should form the initial ready queue + std::vector root_nodes_; + + // Mapping from frame name to static information about the frame. + // TODO(yuanbyu): We could cache it along with the graph so to avoid + // the overhead of constructing it for each executor instance. + absl::flat_hash_map> + frame_info_; + const FrameInfo* root_frame_info_; // Not owned. + + // If the graph contains any "Enter" or "RefEnter" nodes, this vector maps + // dense node IDs to the corresponding FrameInfo. + std::vector enter_frame_info_; + + // If `requires_control_flow_` is false, this points to an array of initial + // pending counts for the nodes in the graph, indexed by node ID. + std::unique_ptr[]> atomic_pending_counts_; + + // Shallow copies of the constant tensors used in the graph. 
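Editor's aside: the pending counts captured by FrameInfo (and, in the control-flow-free case, by atomic_pending_counts_) drive execution: each node starts with a count equal to its outstanding inputs and becomes ready exactly when that count drops to zero, with root_nodes() seeding the initial ready queue. A simplified sketch of that propagation step follows; it is not the real PendingCounts/PropagatorState machinery, and MarkInputDone is a made-up helper name.

#include <atomic>
#include <cstdint>
#include <vector>

// Record that one input of `node_id` finished and queue the node once its last
// outstanding input completes. `pending[i]` is assumed to have been initialized
// to the number of inputs of node i (cf. copy_pending_counts()).
void MarkInputDone(std::vector<std::atomic<int32_t>>& pending, int node_id,
                   std::vector<int>* ready) {
  // fetch_sub returns the previous value, so 1 means this was the last input.
  if (pending[node_id].fetch_sub(1, std::memory_order_acq_rel) == 1) {
    ready->push_back(node_id);
  }
}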
+ std::vector const_tensors_; + + ImmutableExecutorState(const ImmutableExecutorState&) = delete; + void operator=(const ImmutableExecutorState&) = delete; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_IMMUTABLE_EXECUTOR_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/inline_function_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/inline_function_utils.h new file mode 100644 index 00000000..94c118fe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/inline_function_utils.h @@ -0,0 +1,241 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_INLINE_FUNCTION_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_INLINE_FUNCTION_UTILS_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/lower_function_call_inline_policy.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +static constexpr const char* const kNoInlineAttr = "_noinline"; + +// Optionally override device assignment for nodes added to the graph for +// inlined functions: +// (1) Identity nodes added in place of function input arguments. +// (2) Identity nodes added in place of function return values. +// (3) Special NoOp nodes that enforce side-effects execution order. +// (4) All nodes inside function body specified in FunctionDef. +class InlinedFunctionBodyPlacer { + public: + virtual ~InlinedFunctionBodyPlacer() = default; + + virtual absl::optional InputNodeDevice(int input_index) const = 0; + virtual absl::optional OutputNodeDevice(int output_index) const = 0; + // Returns true if the added input/output identity nodes should be colocated + // with the corresponding input/output from the function body. + virtual bool ColocateInputOutputIdentities() const = 0; + virtual absl::optional ControlNodeDevice() const = 0; + virtual absl::optional BodyNodeDevice(const NodeDef& ndef) const = 0; + + // LINT.IfChange + // Place input nodes on the same device as the corresponding caller input + // node. Do not specify any placement for all other nodes. + static std::unique_ptr DefaultPlacer( + const Graph& graph, const Node& caller); + + // Place all nodes on the same device as caller node. + static std::unique_ptr SingleDevicePlacer( + const Graph& graph, const Node& caller); + + // Place input nodes on the same device as the corresponding caller input + // node. Do not place output node. Place control nodes on the same device as + // caller node. For all function body nodes set job, replica and task + // parts of the device assignment to match function caller node where those + // are unspecified. 
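Editor's aside: the placer interface above is small enough to implement directly. A hypothetical placer that pins every node added for the inlined body onto one fixed device could look like the sketch below; PinToDevicePlacer is not a real TensorFlow class, and the MultiDevicePlacer declared just after this is subtler about jobs, replicas and tasks.

// Sketch of a custom InlinedFunctionBodyPlacer that places everything on `device_`.
class PinToDevicePlacer : public InlinedFunctionBodyPlacer {
 public:
  explicit PinToDevicePlacer(string device) : device_(std::move(device)) {}

  absl::optional<string> InputNodeDevice(int input_index) const override {
    return device_;
  }
  absl::optional<string> OutputNodeDevice(int output_index) const override {
    return device_;
  }
  bool ColocateInputOutputIdentities() const override { return false; }
  absl::optional<string> ControlNodeDevice() const override { return device_; }
  absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const override {
    return device_;
  }

 private:
  string device_;
};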
+ static std::unique_ptr MultiDevicePlacer( + const Graph& graph, const Node& caller); + // LINT.ThenChange(lower_function_call_inline_policy.h) + + using Factory = std::function( + const Graph&, const Node&)>; + + struct Config { + string name; + Factory get; + }; + + static Config Default() { return {"default", DefaultPlacer}; } + static Config SingleDevice() { return {"single_device", SingleDevicePlacer}; } + static Config MultiDevice() { return {"multi_device", MultiDevicePlacer}; } +}; + +struct InlineFunctionBodyOptions { + // All nodes that have incoming control edge *from* the function call node, + // will be forwarded to the "output control node". There are two options for + // choosing which nodes will have a control edge *to* the "output control + // node": + // a) control returns (`control_ret` field in FunctionDef) + // b) data returns (`ret` field in FunctionDef) + enum class OutputControlSource { kDataOutputs, kControlOutputs }; + + // Keep a node in a graph with the same name as the function call node: + // + // a) DoNotKeep: Function call node is fully inlined, and there is no node in + // a graph with the same name. + // + // b) Fetchable: Add an IdentityN node to the graph in place of the inlined + // function call node. It will have a control edge from inlined + // 'output_control_node' and data edges from function output nodes. + // The IdentityN node will be placed on the same device as the caller node. + // + // This is mostly for compatibility with Tensorflow v1 and sessions. + // When we prepare a graph for execution in + // GraphExecutionState::MakeForBaseGraph we don't know what nodes will be + // fetched, so we can't safely remove any of them. When graph executed as a + // function it has 'Retval' nodes for all fetched tensors, and we can + // safely inline function calls. + // + // c) Targetable: Add a NoOp node to the graph in place of the inlined + // function call node. It will have a control edge from inline + // 'output_control_node' and no data edges. NoOp node will be placed on the + // same device as the caller node. This will keep the inlined function call + // node a valid 'session.run' target, and also will keep it a valid control + // output node. + enum class KeepCallerNode { kDoNotKeep, kFetchable, kTargetable }; + + // If 'true' function inlining is completely disabled. This allows to control + // function inlining for different types of function calls (see + // 'ExpandInlineFunctionsOptions' below). + bool disable_inlining = false; + // Ignore '_noinline' function attribute. + bool ignore_noinline = false; + // If 'true' function inlining will inline functions in implementation + // selection group. Normally those functions should not be inlined; they will + // be handled by Grappler. + bool inline_impl_selection_group_functions = false; + // Controls if we want to keep a node with the name as the function call node + // in a graph after function inlining. + KeepCallerNode keep_caller_node = KeepCallerNode::kDoNotKeep; + // For compatibility with Tensorflow v1 by default we will use data outputs. + // Control returns were added to Tensorflow v2 with automatic control + // dependencies tracking in Eager mode. + OutputControlSource output_control_src = OutputControlSource::kDataOutputs; + // Inlined function body placer decides what requested device assignments + // should be added to the nodes added to the graph. See documentation above + // for available strategies. 
+ InlinedFunctionBodyPlacer::Config inlined_function_body_placer = + InlinedFunctionBodyPlacer::Default(); + // If true, frame names in the function body will be + // made unique in the resulting graph (e.g. by prepending a unique prefix). + // NOTE(mrry): Only set this option to false when there is a single function + // call in the graph (e.g. when making a remote function call via + // ClusterFunctionLibraryRuntime). This option is provided because the graph + // partitioner generates frame names that must remain unmodified across all + // partitions of a multi-device function. + bool uniquify_frame_names = true; + + // A human-readable debug string for this options. + string DebugString() const; +}; + +// Returns 'OkStatus()' iff the function '*fbody' can be inlined at 'node' +// based on the type signature of 'node' and 'fbody': +// +// (1) Caller node has the same number of inputs and outputs as the function. +// (2) Caller node inputs and outputs have the same data types as function +// inputs and returns. +// (3) Validation rules defined in InlineFunctionBodyOptions. +// +// If function can't be safely inlined, returns error message with details why +// inlining is not possible or safe. +absl::Status ValidateInlining(const Node* node, const FunctionBody* fbody, + const InlineFunctionBodyOptions& options); + +// Given a "caller" in graph "g", which is a function call of a function +// to "fbody". Replaces the "caller" with fbody->graph and connects +// edges properly. "override_device" specifies whether inlining should replace +// explicitly specified devices inside fbody with the callee's device. +// +// Returns 'OkStatus()' if function was successfully inlined into the graph. +// If function inlining is not possible returns an error with a reason, and +// leaves the graph in unmodified state. +absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, + Graph* g, Node* caller, + const FunctionBody* fbody, + const InlineFunctionBodyOptions& options); + +// There are three types of function calls that could be invoked during +// *Tensorflow graph execution*: +// +// 1) Native function call (node.type_string() is the function name). These +// functions are always executed on a single-device, which is the device of +// the function call node. +// +// 2) Multi-device function calls (PartitionedCall or StatefulPartitionedCall +// ops) can execute on multiple devices and accept DT_RESOURCE inputs that +// belong to different devices. This type of functions was added in +// Tensorflow 2.0 Eager mode, and it has control outputs to represent +// side-effects that must always execute (see `control_ret` in FunctionDef). +// +// 3) SymbolicGradient has been deprecated for a while, but we still keep it and +// use `native` options for inlining for compatibility. +// +// We need to have distinct inlining rules for compatibility with Tensorflow v1. +// +// There are few other places in Tensorflow that could execute functions: +// +// 1) common_runtime/eager/kernel_and_device.{h,cc} - executes "top level" +// functions directly via function library runtime, without going through +// the graph. +// 2) tf.data pipelines - also execute functions directly via function library +// runtime with custom executors. 
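Editor's aside: putting the pieces above together, here is a hedged usage sketch of the inlining entry points. The helper name InlineOneCall is made up, and flib_def, g, caller and fbody are assumed to be supplied by the caller; error handling is abbreviated.

absl::Status InlineOneCall(const FunctionLibraryDefinition& flib_def, Graph* g,
                           Node* caller, const FunctionBody* fbody) {
  InlineFunctionBodyOptions opts;
  // Respect control returns for side effects, keep a fetchable stand-in node,
  // and let the multi-device placer assign devices inside the inlined body.
  opts.output_control_src =
      InlineFunctionBodyOptions::OutputControlSource::kControlOutputs;
  opts.keep_caller_node = InlineFunctionBodyOptions::KeepCallerNode::kFetchable;
  opts.inlined_function_body_placer = InlinedFunctionBodyPlacer::MultiDevice();

  absl::Status s = ValidateInlining(caller, fbody, opts);
  if (!s.ok()) return s;  // not safe to inline; the graph is left untouched
  return InlineFunctionBody(flib_def, g, caller, fbody, opts);
}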
+struct ExpandInlineFunctionsOptions { + ExpandInlineFunctionsOptions() : native_options(), multi_device_options() { + using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource; + multi_device_options.output_control_src = OutputControlSrc::kControlOutputs; + } + + InlineFunctionBodyOptions native_options; + InlineFunctionBodyOptions multi_device_options; +}; + +// WARNING(ezhulenev): PLEASE DO NOT USE THIS FUNCTION. This is a temporary +// workaround that will be enabled only during the function inlining unification +// (b/126811947). Contact ezhulenev@ if you think you need it. +// TODO(ezhulenev): Delete this function. +bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph, + const ExpandInlineFunctionsOptions& options); + +// For each node in "graph", if "lib" indicates that the node is a +// function call, inline the function body. Returns true if at least +// one node is inlined. +// +// This routine goes through "graph" nodes once and applies the +// inlining. The caller may decide to apply the inlining on "graph" +// multiple times by calling ExpandInlineFunctions a few times. +// +// Function calls that can't be safely inlined into the graph (ValidateInlining +// returns error), are ignored. +// +// TODO(ezhulenev): We do not FunctionLibraryRuntime for this. We need just the +// FunctionLibraryDefinition and FunctionDefToBodyHelper to implement this (see +// lower_function_call.cc). +inline bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) { + return ExpandInlineFunctions(lib, graph, ExpandInlineFunctionsOptions()); +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_INLINE_FUNCTION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/input_colocation_exemption_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/input_colocation_exemption_registry.h new file mode 100644 index 00000000..c393fe74 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/input_colocation_exemption_registry.h @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. Al Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_INPUT_COLOCATION_EXEMPTION_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_INPUT_COLOCATION_EXEMPTION_REGISTRY_H_ + +#include + +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// TensorFlow runtime (both eager and graph) will aim to colocate ops with +// their resource inputs so that the ops can access the resource state. In some +// cases, such as tf.data ops, this is not desirable as the ops themselves might +// not have a kernel registered for the device on which the resource is placed +// and instead use a mechanism, such as a multi-device function, to access the +// resource state. 
+// +// This registry can be used to register and list ops that should be exempt from +// the input colocation described above. +// +// Example usage: +// REGISTER_INPUT_COLOCATION_EXEMPTION("MapDataset"); +class InputColocationExemptionRegistry { + public: + // Returns a pointer to a global InputColocationExemptionRegistry object. + static InputColocationExemptionRegistry* Global(); + + // Returns the set of ops exempt from the input colocation constraints. + const gtl::FlatSet& Get() { return ops_; } + + // Registers an op to be excluded from the input colocation constraints. + void Register(const string& op); + + private: + gtl::FlatSet ops_; +}; + +namespace input_colocation_exemption_registration { + +class InputColocationExemptionRegistration { + public: + explicit InputColocationExemptionRegistration(const string& op) { + InputColocationExemptionRegistry::Global()->Register(op); + } +}; + +} // namespace input_colocation_exemption_registration + +#define REGISTER_INPUT_COLOCATION_EXEMPTION(op) \ + REGISTER_INPUT_COLOCATION_EXEMPTION_UNIQ_HELPER(__COUNTER__, op) + +#define REGISTER_INPUT_COLOCATION_EXEMPTION_UNIQ_HELPER(ctr, op) \ + REGISTER_INPUT_COLOCATION_EXEMPTION_UNIQ(ctr, op) + +#define REGISTER_INPUT_COLOCATION_EXEMPTION_UNIQ(ctr, op) \ + static input_colocation_exemption_registration:: \ + InputColocationExemptionRegistration \ + input_colocation_exemption_registration_fn_##ctr(op) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_INPUT_COLOCATION_EXEMPTION_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/inspecting_placer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/inspecting_placer.h new file mode 100644 index 00000000..90df36c5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/inspecting_placer.h @@ -0,0 +1,96 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_ + +#include + +#include "absl/strings/str_join.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/port.h" + +namespace tensorflow { + +// TODO(iga): Convert this struct into a class to ensure invariants between +// device names, i.e. +// DeviceNameUtils::IsSpecification(resource_device_name, +// requested_device_name) +// PossibleDevices does not contain assigned_device_name because we don't +// assign devices to nested functions. +struct PossibleDevices { + // The same as Member::requested_device_name_ in colocation_graph.cc. 
+ DeviceNameUtils::ParsedName requested_device_name; + + // The same as Member::resource_device_name_ in colocation_graph.cc. + DeviceNameUtils::ParsedName resource_device_name; + + // A device type outside of this set will not be supported by some + // internal op. + PrioritizedDeviceTypeVector device_types; +}; + +// A struct for communicating constraints on devices that can +// be chosen for inputs and outputs of an op requiring deep placer inspection. +struct IOColocationGroups { + // input_groups[i] contains the group id that i'th input belongs to. + // List inputs are not supported. + std::vector input_groups; + // output_groups[i] contains the group id that i'th output belongs to. + // List inputs are not supported. + std::vector output_groups; + // group_devices[i] contains possible devices for group with id i. + std::vector group_devices; + + string DebugString() const; +}; + +class InspectingPlacer { + public: + // graph and device_set must not be null and must outlive this + // InspectingPlacer. default_device can be null. If not, must outlive this. + // TODO(iga): Add a "stack trace" to detect recursion and improve log + // messages. Currently, we will enter an infinite loop for recursive + // functions. + InspectingPlacer(const FunctionStack& stack, + const FunctionLibraryDefinition* flib_def, + const DeviceSet* device_set, const Device* default_device, + bool allow_soft_placement, bool log_device_placement); + + // `node` must be + // PlacerInspectionRequiredOpsChecker::IsPlacerInspectionRequired. + absl::Status ComputeIOColocationGroups(const Node& node, + IOColocationGroups* groups); + + private: + const FunctionStack stack_; + const FunctionLibraryDefinition& flib_def_; + const DeviceSet& device_set_; + const Device* default_device_; + const bool allow_soft_placement_; + const bool log_device_placement_; + + InspectingPlacer(const InspectingPlacer&) = delete; + void operator=(const InspectingPlacer&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_INSPECTING_PLACER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/int32_fulltype.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/int32_fulltype.h new file mode 100644 index 00000000..1a55e0bc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/int32_fulltype.h @@ -0,0 +1,65 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_INT32_FULLTYPE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_INT32_FULLTYPE_H_ + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// An optimization (graph rewrite) pass to automatically set TFT_SHAPE_TENSOR +// full type information annotations for all int32 tensors, creating or +// modifying existing full type information as needed. 
This allows placement +// mechanisms using full type information to always place int32 on host. +class Int32FulltypePass { + public: + Int32FulltypePass() = default; + explicit Int32FulltypePass(string debug_location) + : debug_location_(debug_location) {} + + // For each node in this graph that outputs int32 tensors, set full + // type information such that the int32 tensors use TFT_SHAPE_TENSOR + // (or TFT_TENSOR if ints_on_device is true, which is only for single + // device functions including the functions with just one op used for + // eager execution). + // + // This method is not thread-safe. + absl::Status ProcessGraph(Graph* graph, bool ints_on_device); + + // Update full type information for int32 tensors that are in HOST_MEMORY + // to use TFT_SHAPE_TENSOR. The type_id of TENSOR_T is expected to be + // TFT_UNSET, TFT_TENSOR or TFT_SHAPE_TENSOR on input and will be updated + // to TFT_SHAPE_TENSOR on output for int32 tensors if it is not + // TFT_SHAPE_TENSOR already. For tensors that are not int32, if the input full + // type information is TFT_UNSET, it will only be updated if SET_ONLY_INT32 is + // false. Note that TENSOR_T is not the full type information for the outputs + // of a node, so it does have an outer TFT_PRODUCT. NODE and OUTPUT_IDX are + // optional and only used in an error message to say that the tensor is output + // OUTPUT_IDX of node NODE. + absl::Status Int32FullTypeForTensor(DataType dtype, FullTypeDef* tensor_t, + bool set_only_int32, Node* node = nullptr, + int output_idx = 0); + + private: + // Location of where annotations were added for debug messages. + string debug_location_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_INT32_FULLTYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h new file mode 100644 index 00000000..1bcdc001 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass.h @@ -0,0 +1,63 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { +// Adds Identities for each input/output of function-calling ops. 
+// +// For example, the following graph calling a function on inputs `a` and `b` +// and producing output `y` will be rewritted to include identities on all +// edges: +// +// a b +// | | +// v v +// f (PartitionedCallOp) +// | +// v +// y +// +// is transformed to +// +// a b +// | | +// a_f (Identity) a_f (Identity) +// | | +// v v +// f (PartitionedCallOp) +// | +// f_y (Identity) +// | +// v +// y +// +// This pass is currently needed to simplify correctly placing the nodes +// producing inputs for as well as consuming output from function-calling ops. +// +// This pass should also help to implement replacing PartitionedCallOp with +// component function calls (to avoid copying input/output tensors), if we get +// to it. +class IsolatePlacerInspectionRequiredOpsPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ISOLATE_PLACER_INSPECTION_REQUIRED_OPS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/kernel_benchmark_testlib.h new file mode 100644 index 00000000..fcab9a65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/kernel_benchmark_testlib.h @@ -0,0 +1,86 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Device; +class FunctionLibraryRuntime; +class ProcessFunctionLibraryRuntime; +struct SessionOptions; +class DynamicDeviceMgr; + +namespace test { + +class Benchmark { + public: + // "device" must be either "cpu" or "gpu". Takes ownership of "g", + // "init", and one reference on "rendez" (if not null). + // + // old_benchmark_api: If true, the benchmark is running with older API + // * In the old API, the timer needs to be stopped/restarted + // by users. + // * In the new API, the timer starts automatically at the first + // iteration of the loop and stops after the last iteration. + // TODO(vyng) Remove this once we have migrated all code to newer API. 
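Editor's aside: a hedged usage sketch of the benchmark helper documented above, using the newer API where the timer is managed automatically. The BM_IdentityCPU name and the elided graph-building code are placeholders, and the usual TensorFlow test and graph headers are assumed to be available.

static void BM_IdentityCPU(::testing::benchmark::State& state) {
  Graph* g = new Graph(OpRegistry::Global());  // ownership passes to Benchmark
  // ... add a few test nodes to `g` here ...
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
}
BENCHMARK(BM_IdentityCPU);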
+ Benchmark(const string& device, Graph* g, + const SessionOptions* options = nullptr, Graph* init = nullptr, + Rendezvous* rendez = nullptr, const char* executor_type = "", + bool old_benchmark_api = false); + + Benchmark(const string& device, Graph* g, bool old_benchmark_api); + + ~Benchmark(); + + void Run(benchmark::State& state); + + void RunWithRendezvousArgs( + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state); + + private: + thread::ThreadPool* pool_ = nullptr; // Not owned. + Device* device_ = nullptr; // Not owned. + Rendezvous* rendez_ = nullptr; + std::unique_ptr device_mgr_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + FunctionLibraryRuntime* flr_; // Not owned. + std::unique_ptr exec_; + + Benchmark(const Benchmark&) = delete; + void operator=(const Benchmark&) = delete; +}; + +// Returns the rendezvous key associated with the given Send/Recv node. +string GetRendezvousKey(const Node* node); + +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/layout_pass_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/layout_pass_util.h new file mode 100644 index 00000000..909ff86f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/layout_pass_util.h @@ -0,0 +1,82 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LAYOUT_PASS_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LAYOUT_PASS_UTIL_H_ + +#if defined(INTEL_MKL) || defined(AMD_ZENDNN) + +#include +#include +#include + +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +// Temporarily wrapping these helper functions in the zendnn namespace +// to avoid crashing with similar functions in mkl_layout_pass.cc. +// TODO(penporn): Delete the functions in mkl_layout_pass and use the functions +// here after TF 2.12 branch cut. +namespace zendnn { + +// Is OpDef::ArgDef a list type? It could be N * T or list(type). +// Refer to opdef.proto for details of list type. +inline bool ArgIsList(const OpDef::ArgDef &arg); + +// Get length of a list in 'n' if 'arg' is of list type. Refer to +// description of ArgIsList for definition of list type. +inline int GetTensorListLength(const OpDef::ArgDef &arg, const Node *n); + +// Can op represented by node 'n' run on DEVICE_CPU? +// Op can run on CPU with ZenDNN if the runtime assigned device or the +// user requested device contains device CPU, or both are empty. +bool CanOpRunOnCPUDevice(const Node *n); + +// Get nodes that will feed a list of TF tensors to the new +// node that we are constructing. 
+// +// @input inputs - inputs to old node that we are using for constructing +// new inputs, +// @input input_idx - the index in the 'inputs' vector pointing to the +// current input that we have processed so far +// @output input_idx - index will be incremented by the number of nodes +// from 'inputs' that are processed +// @input list_length - The expected length of list of TF tensors +// @output output_nodes - the list of new nodes creating TF tensors +// +// @return None +void GetNodesProducingTFTensorList( + const gtl::InlinedVector, 4> &inputs, int *input_idx, + int list_length, std::vector *output_nodes); + +// Create new inputs by copying old inputs 'inputs' for the rewritten node +// in 'nb' in graph 'g'. Original node is input in 'orig_node'. This is mostly +// used in the context of rewrite for just operator name change in which +// inputs of old operator and new operator are same. +// +// Returns OkStatus() if setting up inputs is successful, otherwise +// returns appropriate status code. +Status CopyInputs( + const Node *old_node, + const gtl::InlinedVector, 4> &old_node_inputs, + NodeBuilder *nb); + +} // namespace zendnn +} // namespace tensorflow + +#endif // INTEL_MKL || AMD_ZENDNN +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LAYOUT_PASS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_device.h new file mode 100644 index 00000000..595d3b88 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_device.h @@ -0,0 +1,58 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +namespace test { +class Benchmark; +} +struct SessionOptions; + +// This class is shared by ThreadPoolDevice and GPUDevice and +// initializes a shared Eigen compute device used by both. This +// should eventually be removed once we refactor ThreadPoolDevice and +// GPUDevice into more 'process-wide' abstractions. 
+class LocalDevice : public Device { + public: + LocalDevice(const SessionOptions& options, + const DeviceAttributes& attributes); + ~LocalDevice() override; + + private: + static bool use_global_threadpool_; + + static void set_use_global_threadpool(bool use_global_threadpool) { + use_global_threadpool_ = use_global_threadpool; + } + + struct EigenThreadPoolInfo; + std::unique_ptr owned_tp_info_; + + friend class test::Benchmark; + + LocalDevice(const LocalDevice&) = delete; + void operator=(const LocalDevice&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_executor_params.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_executor_params.h new file mode 100644 index 00000000..a363f113 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_executor_params.h @@ -0,0 +1,57 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_EXECUTOR_PARAMS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_EXECUTOR_PARAMS_H_ + +#include +#include + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +class Device; +class StepStatsCollector; +class SessionMetadata; +class FunctionLibraryRuntime; +class NodeProperties; +class OpKernel; + +// LocalExecutorParams provides arguments that will be shared by all invocations +// of an executor. We expect that different contexts would provide different +// implementations (e.g. local versus distributed). +struct LocalExecutorParams { + Device* device; + + const SessionMetadata* session_metadata = nullptr; + + // The library runtime support. + FunctionLibraryRuntime* function_library = nullptr; + + // create_kernel returns an instance of op kernel based on NodeDef. + // delete_kernel is called for every kernel used by the executor + // when the executor is deleted. + std::function&, + OpKernel**)> + create_kernel; + std::function delete_kernel; + + // Whether control flow nodes are allowed to be executed synchronously. + bool allow_control_flow_sync_execution = false; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_EXECUTOR_PARAMS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_session_selection.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_session_selection.h new file mode 100644 index 00000000..9e21c8d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/local_session_selection.h @@ -0,0 +1,33 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_SESSION_SELECTION_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_SESSION_SELECTION_H_ + +namespace tensorflow { + +// The TF Session implementations that can be used to run local sessions, i.e. +// when session_target in SessionOptions is empty. +enum class LocalSessionImpl { + kDirectSession, + kTfrtSession, +}; + +void SetDefaultLocalSessionImpl(LocalSessionImpl impl); +LocalSessionImpl GetDefaultLocalSessionImpl(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_SESSION_SELECTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_case_op.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_case_op.h new file mode 100644 index 00000000..65b56e51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_case_op.h @@ -0,0 +1,31 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_CASE_OP_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_CASE_OP_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class Graph; +class Node; + +// Replaces Case node `n` with a lowered form that uses _SwitchN/Merge nodes. +absl::Status RewriteCaseNode(Node* n, Graph* g, bool keep_node_fetchable); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_CASE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_inline_policy.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_inline_policy.h new file mode 100644 index 00000000..6dc48f8e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_inline_policy.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_INLINE_POLICY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_INLINE_POLICY_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// LINT.IfChange +enum class FunctionCallInlinePolicy { + // Place input nodes on the same device as the corresponding caller input + // node. Do not specify any placement for all other nodes. + kDefaultPlacer, + + // Place all nodes on the same device as caller node. + kSingleDevicePlacer, + + // Place input nodes on the same device as the corresponding caller input + // node. Do not place output node. Place control nodes on the same device as + // caller node. For all function body nodes overrides job, replica and task + // parts of the device assignment to match function caller node. + kMultiDevicePlacer +}; +// LINT.ThenChange(inline_function_utils.h,\ +// ../../compiler/mlir/tensorflow/ir/tf_ops.cc) + +struct LowerFunctionalOpsConstants { + static constexpr const char* const kLowerUsingSwitchMergeAttr = + "_lower_using_switch_merge"; + static constexpr const char* const kLowerAsMultiDeviceFunctionAttr = + "_lower_as_multi_device_function"; +}; + +// Inliner policy used in common runtime's lower function call op. + +// Returns the function call inline policy to use for a given call. +FunctionCallInlinePolicy GetFunctionCallInlinePolicy(const Node* n); + +// Overload of GetFunctionCallInlinePolicy that doesn't require an op but only +// the features required. +FunctionCallInlinePolicy GetFunctionCallInlinePolicy( + bool is_partioned_call, bool has_lower_as_multi_device_function_attr); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_INLINE_POLICY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_op.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_op.h new file mode 100644 index 00000000..71d5e807 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_function_call_op.h @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_OP_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_OP_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class FunctionLibraryDefinition; +class Graph; +class Node; + +// Replaces function call node `n` with its function body. Uses +// InlineFunctionBody from `common_runtime/function.{h,cc}`. 
If function +// inlining is not possible or safe (see ValidateInlining), leaves the graph in +// unmodified state and returns OkStatus(); +absl::Status RewriteFunctionCallNode(Node* n, Graph* g, + const FunctionLibraryDefinition& flib_def, + bool keep_caller_fetchable); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTION_CALL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_functional_ops.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_functional_ops.h new file mode 100644 index 00000000..a849550a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_functional_ops.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTIONAL_OPS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTIONAL_OPS_H_ + +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/inline_function_utils.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Rewrite functional ops into low level primitives: +// - If/While ops lowered into low level control flow primitives: Switch, Merge, +// Enter, Exit, NextIteration +// - Function calls inlined into the main graph +// +// IMPORTANT: Although SymbolicGradient is a function call, we currently do not +// lower it, because it has been deprecated for a while. +class LowerFunctionalOpsPass : public GraphOptimizationPass { + public: + LowerFunctionalOpsPass() = default; + + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + static constexpr const char* const kLowerUsingSwitchMergeAttr = + LowerFunctionalOpsConstants::kLowerUsingSwitchMergeAttr; + static constexpr const char* const kLowerAsMultiDeviceFunctionAttr = + LowerFunctionalOpsConstants::kLowerAsMultiDeviceFunctionAttr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_FUNCTIONAL_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_if_op.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_if_op.h new file mode 100644 index 00000000..c125a197 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_if_op.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class Graph; +class Node; + +// Replaces If node `n` with its lowered form that uses Switch and Merge nodes. +absl::Status RewriteIfNode(Node* n, Graph* g, bool keep_node_fetchable); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_while_op.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_while_op.h new file mode 100644 index 00000000..98095dee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/lower_while_op.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_WHILE_OP_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_WHILE_OP_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class Graph; +class Node; +class FunctionLibraryDefinition; + +// Replaces While node `n` with its lowered form that uses Enter, Exit, Switch, +// Merge, NextIteration and LoopCond nodes. +absl::Status RewriteWhileNode(Node* n, Graph* g, + const FunctionLibraryDefinition* flib_def, + bool keep_node_fetchable); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_WHILE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/memory_types.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/memory_types.h new file mode 100644 index 00000000..46a943c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/memory_types.h @@ -0,0 +1,49 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_ + +#include "tensorflow/core/framework/memory_types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Returns an error iff *g running on a single device of 'device_type' +// has memory type mismatch for any edge's source and destination. +absl::Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g); + +// Updates '*g' so that every edge's source and destination has +// compatible memory types by inserting proper HostSend/Recv and +// Send/HostRecv nodes. 'device_type' specifies the type of device on +// which '*g' is going to run on and that device has the name +// 'device_name'. +// +// Returns OK if '*g' is updated properly (ValidateMemoryTypes(g) must +// be OK). Otherwise, returns an error and '*g' may be in an +// invalidate state and the caller should discard it. +absl::Status EnsureMemoryTypes(const DeviceType& device_type, + const string& device_name, Graph* g); + +// Get the memory type for 'index'th output of node 'n' in graph 'g', when +// running on 'device_type'. +absl::Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g, + const Node* n, int index, + MemoryType* memory_type); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_cpu_allocator.h new file mode 100644 index 00000000..54d60fdc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -0,0 +1,331 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// A simple CPU allocator that intercepts malloc/free calls from MKL library +// and redirects them to Tensorflow allocator + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ + +#ifdef INTEL_MKL + +#include + +#include "tensorflow/core/common_runtime/bfc_allocator.h" +#include "tensorflow/core/common_runtime/pool_allocator.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/numa.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/onednn_env_vars.h" +#ifdef _WIN32 +typedef unsigned int uint; +#endif + +namespace tensorflow { + +static bool mkl_small_allocator_collect_stats = false; + +class MklSubAllocator : public BasicCPUAllocator { + public: + MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {} + ~MklSubAllocator() override {} +}; + +// CPU allocator that handles small-size allocations by calling +// suballocator directly. Mostly, it is just a wrapper around a suballocator +// (that calls malloc and free directly) with support for bookkeeping. +class MklSmallSizeAllocator : public Allocator { + public: + MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory, + const string& name) + : sub_allocator_(sub_allocator), name_(name) { + stats_.bytes_limit = total_memory; + } + ~MklSmallSizeAllocator() override {} + + MklSmallSizeAllocator(const MklSmallSizeAllocator&) = delete; + void operator=(const MklSmallSizeAllocator&) = delete; + + inline string Name() override { return name_; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + void* ptr = port::AlignedMalloc(num_bytes, alignment); + if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes); + return ptr; + } + + void DeallocateRaw(void* ptr) override { + if (ptr == nullptr) { + LOG(ERROR) << "tried to deallocate nullptr"; + return; + } + + if (mkl_small_allocator_collect_stats) { + const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr); + DecrementStats(alloc_size); + } + port::AlignedFree(ptr); + } + + absl::optional GetStats() override { + mutex_lock l(mutex_); + return stats_; + } + + bool ClearStats() override { + mutex_lock l(mutex_); + stats_.num_allocs = 0; + stats_.peak_bytes_in_use = 0; + stats_.largest_alloc_size = 0; + stats_.bytes_in_use = 0; + stats_.bytes_limit = 0; + return true; + } + + private: + // Increment statistics for the allocator handling small allocations. + inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) { + mutex_lock l(mutex_); + ++stats_.num_allocs; + stats_.bytes_in_use += alloc_size; + stats_.peak_bytes_in_use = + std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); + stats_.largest_alloc_size = + std::max(alloc_size, static_cast(stats_.largest_alloc_size)); + } + + // Decrement statistics for the allocator handling small allocations. + inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) { + mutex_lock l(mutex_); + stats_.bytes_in_use -= dealloc_size; + } + + SubAllocator* sub_allocator_; // Not owned by this class. + + // Mutex for protecting updates to map of allocations. 
+ mutable mutex mutex_; + + // Allocator name + string name_; + + // Allocator stats for small allocs + AllocatorStats stats_ TF_GUARDED_BY(mutex_); +}; + +/// CPU allocator for MKL that wraps BFC allocator and intercepts +/// and redirects memory allocation calls from MKL. +class MklCPUAllocator : public Allocator { + public: + // Constructor and other standard functions + + /// Environment variable that user can set to upper bound on memory allocation + static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES"; + + /// Default upper limit on allocator size - 64GB + static constexpr size_t kDefaultMaxLimit = 64LL << 30; + + MklCPUAllocator() { TF_CHECK_OK(Initialize()); } + + ~MklCPUAllocator() override { + delete small_size_allocator_; + delete large_size_allocator_; + } + + Status Initialize() { + VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; + + // Set upper bound on memory allocation to physical RAM available on the + // CPU unless explicitly specified by user + uint64 max_mem_bytes = kDefaultMaxLimit; +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + max_mem_bytes = + (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE); +#endif + char* user_mem_bytes = getenv(kMaxLimitStr); + + if (user_mem_bytes != NULL) { + uint64 user_val = 0; + if (!strings::safe_strtou64(user_mem_bytes, &user_val)) { + return errors::InvalidArgument("Invalid memory limit (", user_mem_bytes, + ") specified for MKL allocator through ", + kMaxLimitStr); + } +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + if (user_val > max_mem_bytes) { + LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr + << "=" << user_val + << " greater than available physical memory: " + << max_mem_bytes + << ". This could significantly reduce performance!"; + } +#endif + max_mem_bytes = user_val; + } + + VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes; + + sub_allocator_ = new MklSubAllocator(); + + // SubAllocator is owned by BFCAllocator, so we do not need to deallocate + // it in MklSmallSizeAllocator. + small_size_allocator_ = + new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName); + + BFCAllocator::Options large_allocator_opts; + large_allocator_opts.allow_growth = kAllowGrowth; + large_size_allocator_ = + new BFCAllocator(absl::WrapUnique(sub_allocator_), max_mem_bytes, kName, + large_allocator_opts); + return OkStatus(); + } + + inline string Name() override { return kName; } + inline bool IsSmallSizeAllocation(const void* ptr) const + TF_LOCKS_EXCLUDED(mutex_) { + mutex_lock l(mutex_); + return large_allocations_map_.find(ptr) == large_allocations_map_.end(); + } + // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held + inline void AddLargeAllocMap(void* ptr, size_t num_bytes) + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { + if (ptr != nullptr) { + std::pair map_val(ptr, num_bytes); + large_allocations_map_.insert(map_val); + } + } + inline void RemoveLargeAllocMap(void* ptr) + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { + auto map_iter = large_allocations_map_.find(ptr); + if (map_iter != large_allocations_map_.end()) { + large_allocations_map_.erase(map_iter); + } else { + LOG(ERROR) << "tried to deallocate invalid pointer"; + } + return; + } + + inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { + // If the allocation size is less than threshold, call small allocator, + // otherwise call large-size allocator (BFC). 
We found that BFC allocator + // does not deliver good performance for small allocations when + // inter_op_parallelism_threads is high. + if (UseSystemAlloc() || num_bytes < kSmallAllocationsThreshold) { + return small_size_allocator_->AllocateRaw(alignment, num_bytes); + } else { + mutex_lock l(mutex_); + void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes); + AddLargeAllocMap(ptr, num_bytes); + return ptr; + } + } + inline void DeallocateRaw(void* ptr) override { + // Check if ptr is for "small" allocation. If it is, then call Free + // directly. Otherwise, call BFC to handle free. + if (UseSystemAlloc() || IsSmallSizeAllocation(ptr)) { + small_size_allocator_->DeallocateRaw(ptr); + } else { + mutex_lock l(mutex_); + RemoveLargeAllocMap(ptr); + large_size_allocator_->DeallocateRaw(ptr); + } + } + absl::optional GetStats() override { + auto s_stats = small_size_allocator_->GetStats(); + auto l_stats = large_size_allocator_->GetStats(); + + // Combine statistics from small-size and large-size allocator. + mutex_lock l(mutex_); + stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs; + stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use; + stats_.peak_bytes_in_use = + l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use; + + // Since small-size allocations go to MklSmallSizeAllocator, + // max_alloc_size from large_size_allocator would be the maximum + // size allocated by MklCPUAllocator. + stats_.largest_alloc_size = l_stats->largest_alloc_size; + stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit); + return stats_; + } + + bool ClearStats() override { + bool stats_cleared = small_size_allocator_->ClearStats(); + stats_cleared &= large_size_allocator_->ClearStats(); + return stats_cleared; + } + + private: + // Hooks provided by this allocator for memory allocation routines from MKL + static inline void* MallocHook(size_t size) { + VLOG(3) << "MklCPUAllocator: In MallocHook"; + return cpu_allocator()->AllocateRaw(kAlignment, size); + } + + static inline void FreeHook(void* ptr) { + VLOG(3) << "MklCPUAllocator: In FreeHook"; + cpu_allocator()->DeallocateRaw(ptr); + } + + static inline void* CallocHook(size_t num, size_t size) { + Status s = Status(absl::StatusCode::kUnimplemented, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + return nullptr; // return a value and make static code analyzers happy + } + + static inline void* ReallocHook(void* ptr, size_t size) { + Status s = Status(absl::StatusCode::kUnimplemented, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + return nullptr; // return a value and make static code analyzers happy + } + + // Do we allow growth in BFC Allocator + static const bool kAllowGrowth = true; + + // Name + static constexpr const char* kName = "mklcpu"; + + // The alignment that we need for the allocations + static constexpr const size_t kAlignment = 64; + + Allocator* large_size_allocator_; // owned by this class + MklSmallSizeAllocator* small_size_allocator_; // owned by this class. + + SubAllocator* sub_allocator_; // not owned by this class + mutable mutex mutex_; + AllocatorStats stats_ TF_GUARDED_BY(mutex_); + + // Hash map to keep track of "BFC" allocations + // We do not use BFC allocator for small allocations. + std::unordered_map large_allocations_map_ + TF_GUARDED_BY(mutex_); + + // Size in bytes that defines the upper-bound for "small" allocations. 
+  // Any allocation below this threshold is "small" allocation.
+  static constexpr const size_t kSmallAllocationsThreshold = 262144;
+
+  // Prevent copying and assignment
+  MklCPUAllocator(const MklCPUAllocator&) = delete;
+  void operator=(const MklCPUAllocator&) = delete;
+};
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_layout_pass.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_layout_pass.h
new file mode 100644
index 00000000..6b5c586c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/mkl_layout_pass.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A graph pass that rewrites graph for propagating MKL layout as a tensor
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_LAYOUT_PASS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_LAYOUT_PASS_H_
+
+#ifdef INTEL_MKL
+
+#include <sys/types.h>
+#include <memory>
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// Interface to invoke the pass for unit test
+//
+// Returns true if and only if 'g' is mutated.
+extern bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g);
+}  // namespace tensorflow
+
+#endif
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_LAYOUT_PASS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/example_plugin.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/example_plugin.h
new file mode 100644
index 00000000..baeebef6
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/example_plugin.h
@@ -0,0 +1,49 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_EXAMPLE_PLUGIN_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_EXAMPLE_PLUGIN_H_
+
+#include "tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h"
+#include "tfrt/host_context/host_context.h"  // from @tf_runtime
+
+// This is an example plugin that implements several basic APIs for events.
+// This is for testing only.
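+//
+// Usage sketch (illustrative only; `host` and `status` are assumed to be a
+// tfrt::HostContext* and a TF_Status* owned by the caller). A test can drive
+// this plugin through the TFNPD event API declared in plugin_c_api.h roughly
+// as follows:
+//
+//   const TFNPD_Api* api = GetExamplePluginApi();
+//   TFNPD_DeviceEvent* event =
+//       example_plugin::CreateDeviceEventAndSetAvailable(host);
+//   api->TFNPD_DeviceEventAwait(event, status);  // returns once the event is ready
+//   bool ready = api->TFNPD_DeviceEventIsReady(event);
+//   api->TFNPD_DeviceEventDelete(event);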
+ +#ifdef __cplusplus +extern "C" { +#endif + +struct TFNPD_DeviceEvent { + tfrt::RCReference event; +}; + +// Does not pass ownership of returned TFNPD_Api* to caller. +const TFNPD_Api* GetExamplePluginApi(); + +#ifdef __cplusplus +} +#endif + +namespace example_plugin { + +// A helper method that generates a TFNPD_DeviceEvent, and makes the event +// available (or ready) in two seconds. +TFNPD_DeviceEvent* CreateDeviceEventAndSetAvailable(tfrt::HostContext* host, + bool set_as_error = false); + +} // namespace example_plugin + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_EXAMPLE_PLUGIN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/outside_compilation_params.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/outside_compilation_params.h new file mode 100644 index 00000000..21b17104 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/outside_compilation_params.h @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_OUTSIDE_COMPILATION_PARAMS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_OUTSIDE_COMPILATION_PARAMS_H_ + +#include "xla/stream_executor/tpu/c_api_decl.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct SE_OutsideCompilationParams { + char* device_name; + char* rendezvous_key; + TF_RendezvousThunk* rendezvous; + TpuSerializedProto host_transfers; +}; + +#ifdef __cplusplus +} +#endif + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_OUTSIDE_COMPILATION_PARAMS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h new file mode 100644 index 00000000..e44e5f3f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h @@ -0,0 +1,176 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_C_API_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_C_API_H_ + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "xla/c/c_api_decl.h" +#include "xla/pjrt/c/pjrt_c_api.h" +#include "xla/stream_executor/tpu/c_api_decl.h" + +#define TFNPD_MAJOR 0 +#define TFNPD_MINOR 0 +#define TFNPD_PATCH 1 + +// Experimental C API for TensorFlow Next Pluggable device (TFNPD). + +#ifdef __cplusplus +extern "C" { +#endif + +// ---------------------------- Event ---------------------------------------- +typedef struct TFNPD_DeviceEvent TFNPD_DeviceEvent; + +typedef TFNPD_DeviceEvent* TFNPD_NewDeviceEvent(); + +typedef void TFNPD_DeviceEventAwait(TFNPD_DeviceEvent* event, + TF_Status* status); + +typedef bool TFNPD_DeviceEventIsReady(TFNPD_DeviceEvent* event); + +// Invokes the callback after event becomes ready. +typedef void TFNPD_DeviceEventAndThen(TFNPD_DeviceEvent* event, + void (*callback)(void*), + void* callback_arg); + +typedef void TFNPD_DeviceEventDelete(TFNPD_DeviceEvent* event); + +// -------------------------- Allocator -------------------------------------- +typedef struct TFNPD_DeviceAllocator TFNPD_DeviceAllocator; + +typedef TFNPD_DeviceAllocator* TFNPD_DeviceAllocatorCreate(int device_ordinal); + +typedef void* TFNPD_DeviceAllocateRaw(TFNPD_DeviceAllocator* allocator, + size_t alignment, size_t num_bytes); + +typedef void TFNPD_DeviceDeallocateRaw(TFNPD_DeviceAllocator* allocator, + void* ptr); + +typedef TF_StringView TFNPD_DeviceAllocatorName( + TFNPD_DeviceAllocator* allocator); + +typedef bool TFNPD_DeviceAllocatorAllocatesOpaqueHandle( + TFNPD_DeviceAllocator* allocator); + +typedef void TFNPD_DeviceAllocatorDelete(TFNPD_DeviceAllocator* allocator); + +// ------------------------ Tensor Transfers --------------------------------- +typedef struct TFNPD_DeviceContext TFNPD_DeviceContext; + +// TODO(chuanhao): use an option struct to create context. Plugin can define the +// option so that we support more features in the DeviceContext, e.g. +// shape_determination_fns. +typedef TFNPD_DeviceContext* TFNPD_DeviceContextCreate(int device_ordinal); + +typedef TFNPD_DeviceEvent* TFNPD_DeviceTensorToHostTensor( + TFNPD_DeviceContext* device_context, const TF_Tensor* device_tensor, + TF_Tensor* cpu_tensor, TF_Status* status); + +typedef TFNPD_DeviceEvent* TFNPD_HostTensorToDeviceTensor( + TFNPD_DeviceContext* device_context, const TF_Tensor* cpu_tensor, + TF_Tensor* device_tensor, TF_Status* status); + +typedef TFNPD_DeviceEvent* TFNPD_SameDeviceTensorCopy( + TFNPD_DeviceContext* context); + +typedef PJRT_Buffer* TFNPD_SameDevicePjRtBufferCopy(PJRT_Buffer* src_buffer, + PJRT_Client* c_client, + TF_Status* status); + +typedef void TFNPD_DeviceContextDelete(TFNPD_DeviceContext* context); + +// ------------------------------ TF2XLA ------------------------------------- +// TODO(b/254484247): either separate XLA_Shape to its own file, or use PJRT +// solution when it is ready. 
+typedef void TFNPD_XlaShapeToDeviceShapeRepresentation( + XLA_Shape* serialized_xla_shape, int data_type, bool use_fast_memory, + XLA_LayoutPreference layout_preference, XLA_Shape* serialized_device_shape, + TF_Status* tf_status); + +// ----------------------- Plugin System related ----------------------------- +typedef int32_t TFNPD_GetDeviceCount(TF_Status* status); + +// Initialize any per-device states or resources that are internal to plugin. +typedef void TFNPD_InitPluginInternalDeviceStates(TF_Status* status); + +// --------------------------- C API access ------------------------------------ +#define TFNPD_API_STRUCT_FN(fn_type) fn_type* fn_type + +typedef struct { + size_t struct_size; + void* priv; + + TFNPD_API_STRUCT_FN(TFNPD_NewDeviceEvent); + TFNPD_API_STRUCT_FN(TFNPD_DeviceEventAwait); + TFNPD_API_STRUCT_FN(TFNPD_DeviceEventIsReady); + TFNPD_API_STRUCT_FN(TFNPD_DeviceEventAndThen); + TFNPD_API_STRUCT_FN(TFNPD_DeviceEventDelete); + + TFNPD_API_STRUCT_FN(TFNPD_DeviceAllocatorCreate); + TFNPD_API_STRUCT_FN(TFNPD_DeviceAllocateRaw); + TFNPD_API_STRUCT_FN(TFNPD_DeviceDeallocateRaw); + TFNPD_API_STRUCT_FN(TFNPD_DeviceAllocatorName); + TFNPD_API_STRUCT_FN(TFNPD_DeviceAllocatorAllocatesOpaqueHandle); + TFNPD_API_STRUCT_FN(TFNPD_DeviceAllocatorDelete); + + TFNPD_API_STRUCT_FN(TFNPD_DeviceContextCreate); + TFNPD_API_STRUCT_FN(TFNPD_DeviceContextDelete); + + // TODO(chuanhao): Deprecate the tensor transfer C APIs when PJRT API + // development is ready since we plan to adopt PJRT as Device API. + TFNPD_API_STRUCT_FN(TFNPD_DeviceTensorToHostTensor); + TFNPD_API_STRUCT_FN(TFNPD_HostTensorToDeviceTensor); + TFNPD_API_STRUCT_FN(TFNPD_SameDeviceTensorCopy); + TFNPD_API_STRUCT_FN(TFNPD_SameDevicePjRtBufferCopy); + + TFNPD_API_STRUCT_FN(TFNPD_XlaShapeToDeviceShapeRepresentation); + + TFNPD_API_STRUCT_FN(TFNPD_GetDeviceCount); + TFNPD_API_STRUCT_FN(TFNPD_InitPluginInternalDeviceStates); +} TFNPD_Api; + +const size_t TFNPD_Api_STRUCT_SIZE = + TF_OFFSET_OF_END(TFNPD_Api, TFNPD_InitPluginInternalDeviceStates); + +#undef TFNPD_API_STRUCT_FN + +typedef struct TFNPD_PluginParams { + size_t struct_size; + void* ext; // reserved for future use + + const char* device_type; // output, set by plugin + const char* compilation_device_name; // output, set by plugin + int32_t priority; // output, set by plugin + // Certain devices may set this one to false to avoid using device copy logic + // implemented for legacy PluggableDevice. + bool is_pluggable_device; // output, set by plugin + bool use_pjrt_on_demand_compile; // output, set by plugin +} TFNPD_PluginParams; +const size_t TFNPD_PLUGIN_PARAMS_STRUCT_SIZE = + TF_OFFSET_OF_END(TFNPD_PluginParams, is_pluggable_device); +const TFNPD_Api* TFNPD_InitPlugin(TFNPD_PluginParams* params, + TF_Status* tf_status); + +#if defined(__cplusplus) +} // extern "C" +#endif // defined(__cplusplus) + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_C_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api.h new file mode 100644 index 00000000..507faaf4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api.h @@ -0,0 +1,87 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_Tensor TF_Tensor; +typedef struct TSL_Status TF_Status; + +// Structs for TF_StatusCallback. + +typedef void (*TF_StatusCallback_Function)(void*, TF_Status*); +typedef struct TF_StatusCallback { + void* context; + TF_StatusCallback_Function callback; +} TF_StatusCallback; + +// Structs for CopyCPUTensorToDevice API. +typedef struct TF_DeviceContext_CopyCPUTensorToDevice_Params { + TF_Tensor* cpu_tensor; + // API for `Device` is not available. + // Device* device; + TF_Tensor* device_tensor; // out + TF_StatusCallback* done; + bool sync_dst_compute; +} TF_DeviceContext_CopyCPUTensorToDevice_Params; + +typedef void (*TF_DeviceContext_CopyCPUTensorToDevice_Function)( + void*, TF_DeviceContext_CopyCPUTensorToDevice_Params*); + +// Structs for CopyDeviceTensorToCPU API. +typedef struct TF_DeviceContext_CopyDeviceTensorToCPU_Params { + TF_Tensor* device_tensor; + char* tensor_name; + // API for `Device` is not available. + // Device* device; + uint32_t tensor_name_len; + TF_Tensor* cpu_tensor; // out + TF_StatusCallback* done; +} TF_DeviceContext_CopyDeviceTensorToCPU_Params; + +typedef void (*TF_DeviceContext_CopyDeviceTensorToCPU_Function)( + void*, TF_DeviceContext_CopyDeviceTensorToCPU_Params*); + +// Structs for CopyTensorInSameDevice API. +typedef struct TF_DeviceContext_CopyTensorInSameDevice_Params { + TF_Tensor* input_tensor; + // API for `Device` is not available. + // Device* device; + TF_Tensor* output_tensor; // out + TF_StatusCallback* done; +} TF_DeviceContext_CopyTensorInSameDevice_Params; + +typedef void (*TF_DeviceContext_CopyTensorInSameDevice_Function)( + void*, TF_DeviceContext_CopyTensorInSameDevice_Params*); + +/* DeviceContext */ +typedef struct TF_DeviceContext { + void* device_context; + TF_DeviceContext_CopyCPUTensorToDevice_Function cpu_to_device_func; + TF_DeviceContext_CopyDeviceTensorToCPU_Function device_to_cpu_func; + TF_DeviceContext_CopyTensorInSameDevice_Function same_device_func; +} TF_DeviceContext; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_helper.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_helper.h new file mode 100644 index 00000000..c037f48a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_helper.h @@ -0,0 +1,33 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_HELPER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_HELPER_H_ + +#include + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/rendezvous.h" + +namespace tensorflow { + +TF_DeviceContext* DeviceContext_ToC(DeviceContext* device_context); + +void DeviceContext_Destroy(TF_DeviceContext* c_device_context); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_internal.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_internal.h new file mode 100644 index 00000000..52bf1ead --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api_internal.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_INTERNAL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_INTERNAL_H_ + +#include + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_device_context_c_api.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/rendezvous.h" + +namespace tensorflow { + +DeviceContext* DeviceContext_FromC(TF_DeviceContext* c_device_context); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_DEVICE_CONTEXT_C_API_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h new file mode 100644 index 00000000..706efe42 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h @@ -0,0 +1,102 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" // IWYU pragma: export +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_DeviceContext TF_DeviceContext; + +typedef struct TFDevice_AllocatorAttributes { + uint32_t value; + int32_t scope_id; +} TFDevice_AllocatorAttributes; + +typedef struct TFE_CancellationManager TFE_CancellationManager; + +typedef struct TF_RendezvousArgsStruct { + TF_DeviceContext* device_context; + TFDevice_AllocatorAttributes alloc_attrs; + TFE_CancellationManager* cancellation_manager; +} TF_RendezvousArgsStruct; + +typedef struct TF_RendezvousParsedKey { + char* full_key; + uint32_t full_key_size; +} TF_RendezvousParsedKey; + +typedef struct TF_RendezvousSend_Params { + const TF_RendezvousParsedKey* key; + const TF_RendezvousArgsStruct* args; + TF_Tensor* tensor; + bool is_dead; + + TF_Status* status; // out +} TF_RendezvousSend_Params; + +typedef void (*TF_RendezvousSend_Function)(void*, TF_RendezvousSend_Params*); + +typedef struct TF_RendezvousDoneCallback_Params { + void* context; + const TF_Status* status; + // TODO: Pass args through. 
+ // const TF_RendezvousArgsStruct* sender_args; + // const TF_RendezvousArgsStruct* recver_args; + const TF_Tensor* tensor; + bool is_dead; +} TF_RendezvousDoneCallback_Params; + +typedef void (*TF_RendezvousDoneCallback_Function)( + void*, TF_RendezvousDoneCallback_Params*); + +typedef struct TF_RendezvousDoneCallbackImpl { + void* context; + TF_RendezvousDoneCallback_Function callback; +} TF_RendezvousDoneCallbackImpl; + +typedef struct TF_RendezvousAsyncRecv_Params { + void* context; + const TF_RendezvousParsedKey* key; + const TF_RendezvousArgsStruct* args; + TF_RendezvousDoneCallbackImpl on_done; +} TF_RendezvousAsyncRecv_Params; + +typedef void (*TF_RendezvousAsyncRecv_Function)(void*, + TF_RendezvousAsyncRecv_Params*); + +typedef void (*TF_RendezvousStartAbort_Function)(void* context, + const TF_Status*); + +typedef struct TF_RendezvousThunk { + void* rendezvous; + TF_RendezvousSend_Function send_func; + TF_RendezvousAsyncRecv_Function async_recv_func; + TF_RendezvousStartAbort_Function start_abort_func; +} TF_RendezvousThunk; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_defn.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_defn.h new file mode 100644 index 00000000..9e9cbccd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_defn.h @@ -0,0 +1,32 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_DEFN_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_DEFN_H_ + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" + +struct TF_CancellationManager { + tensorflow::CancellationManager* cancellation_manager; // not owned +}; + +struct TF_TensorWrapper { + tensorflow::Tensor tensor; +}; + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_DEFN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_helper.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_helper.h new file mode 100644 index 00000000..e55b8583 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_helper.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_HELPER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_HELPER_H_ + +#include + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h" +#include "tensorflow/core/framework/rendezvous.h" + +namespace tensorflow { + +std::unique_ptr FromC( + const TF_RendezvousThunk* thunk); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_internal.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_internal.h new file mode 100644 index 00000000..30cc3ec0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_internal.h @@ -0,0 +1,29 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_INTERNAL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_INTERNAL_H_ + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api.h" +#include "tensorflow/core/framework/rendezvous.h" + +namespace tensorflow { + +TF_RendezvousThunk* ToC(tensorflow::RendezvousInterface* rendezvous); +void Destroy(TF_RendezvousThunk* thunk); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_RENDEZVOUS_C_API_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_tensor_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_tensor_utils.h new file mode 100644 index 00000000..f1a35ffc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c/tf_tensor_utils.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_TENSOR_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_TENSOR_UTILS_H_ + +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +void CopyTF_TensorToTensor(const TF_Tensor* src, Tensor* dst); + +TF_Tensor* CopyTensorToTF_Tensor(const Tensor& src); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_TF_TENSOR_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_coordination_service_agent.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_coordination_service_agent.h new file mode 100644 index 00000000..8d9d3268 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_coordination_service_agent.h @@ -0,0 +1,60 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_COORDINATION_SERVICE_AGENT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_COORDINATION_SERVICE_AGENT_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/time/time.h" +#include "tensorflow/c/experimental/next_pluggable_device/c_api.h" +#include "tensorflow/c/kernels_experimental.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +class CPluginCoordinationServiceAgent : public PluginCoordinationServiceAgent { + public: + explicit CPluginCoordinationServiceAgent(void* agent) + : agent_(reinterpret_cast(agent)) {} + + bool IsInitialized() const override { + if (agent_ == nullptr) return false; + return TF_CoordinationServiceIsInitialized(agent_); + } + + absl::Status InsertKeyValue(std::string_view key, + std::string_view value) override; + + absl::StatusOr GetKeyValue(std::string_view key) override; + absl::StatusOr GetKeyValue(std::string_view key, + absl::Duration timeout) override; + absl::StatusOr TryGetKeyValue(std::string_view key) override; + + absl::Status DeleteKeyValue(std::string_view key) override; + + private: + TF_CoordinationServiceAgent* agent_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_COORDINATION_SERVICE_AGENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.h new file mode 100644 index 00000000..fa7206c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.h @@ -0,0 +1,177 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_OP_KERNEL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_OP_KERNEL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/c/kernels.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { + +class CPluginOpKernelConstruction : public PluginOpKernelConstruction { + public: + explicit CPluginOpKernelConstruction(void* ctx) + : ctx_(reinterpret_cast(ctx)) {} + + absl::Status GetBoolAttr(std::string_view attr_name, + bool* value) const override; + absl::Status GetInt32Attr(std::string_view attr_name, + int* value) const override; + absl::Status GetInt32AttrList(std::string_view attr_name, + std::vector* value) const override; + absl::Status GetInt64Attr(std::string_view attr_name, + int64_t* value) const override; + absl::Status GetStringAttr(std::string_view attr_name, + std::string* value) const override; + absl::Status GetFunctionAttr(std::string_view attr_name, + NameAttrList* function) const override; + + void CtxFailure(const absl::Status& status) override; + void CtxFailure(const char* file, int line, + const absl::Status& status) override; + + void* GetContext() const override { return ctx_; } + + private: + TF_OpKernelConstruction* ctx_; // not owned. 
+}; + +class CPluginOpKernelContext : public PluginOpKernelContext { + public: + explicit CPluginOpKernelContext(void* ctx) + : ctx_(reinterpret_cast(ctx)) {} + + std::string_view GetResourceMgrDefaultContainerName() override; + + absl::Status LookupOrCreateResource(std::string_view container_name, + std::string_view plugin_resource_name, + void** result_plugin_resource, + void* (*create_func)(void*), + void* create_func_args, + void (*delete_func)(void*)) override; + + std::unique_ptr + GetPluginCoordinationServiceAgent() const override; + + absl::Status CreatePluginVariable(int index, + PluginVariable** variable) const override; + + absl::Status AllocateTempForPluginVariable(PluginVariable* variable) override; + + int NumInputs() const override { return TF_NumInputs(ctx_); } + + absl::Status GetInput(int index, const Tensor** tensor) const override; + + absl::Status GetInput(const char* name, const Tensor** tensor) const override; + + absl::Status GetInputRange(std::string_view name, + std::pair* range) const override; + + DataType GetInputDataType(int index) const override; + + std::string_view GetOpKernelRequestedInput(int index) const override; + + std::string_view GetOpKernelName() const override; + + uint64_t GetFrameId() const override { return TF_GetFrameId(ctx_); } + + int64_t GetIterId() const override { return TF_GetIterId(ctx_); } + + int64_t GetStepId() const override { return TF_GetStepId(ctx_); } + + int GetDeviceId() const override { return TF_GetDeviceId(ctx_); } + + std::string_view GetDeviceName() const override; + + std::string GetSessionName() const override { + // TODO(haoyuzhang): Implement with ctx_->session_metadata() if needed. + return ""; + } + + absl::Status GetConfigProto(const ConfigProto** config_proto) const override; + + // Note: this function is only meant to clear up `config_proto` created by the + // above `CPluginOpKernelContext::GetConfigProto()`. + void MaybeDeleteConfigProto(const ConfigProto* config_proto) const override { + delete config_proto; + } + + absl::Status GetFunctionLibraryDefinition( + const FunctionLibraryDefinition** flib_def) const override; + + // Note: this function is only meant to clear up `flib_def` created by the + // above `CPluginOpKernelContext::GetFunctionLibraryDefinition()`. + void MaybeDeleteFunctionLibraryDefinition( + const FunctionLibraryDefinition* flib_def) const override { + delete flib_def; + } + + absl::Status GetResourceHandle(int index, + const ResourceHandle** handle) const override; + + // Note: this function is only meant to clear up `handle` created by the above + // `CPluginOpKernelContext::GetResourceHandle()`. + void MaybeDeleteResourceHandle(const ResourceHandle* handle) const override { + delete handle; + } + + int GetGraphDefVersion() const override { + return TF_GetGraphDefVersion(ctx_); + } + + absl::Status AllocateOutput(int index, const TensorShape& shape, + Tensor** out) override; + + absl::Status SetOutput(int index, const Tensor& tensor) override; + + void CtxFailure(const absl::Status& status) override; + void CtxFailure(const char* file, int line, + const absl::Status& status) override; + + void* GetContext() const override { return ctx_; } + + private: + mutable mutex mu_; + + // A cache for tensors obtained from the ctx_. This is needed to extend the + // lifetime of the c++ tensorflow::Tensor created from `TF_TensorToTensor`. + // Use std::deque here to make sure elements in the container are pointer + // stable. 
+ // "insertion and deletion at either end of a deque never invalidates pointers + // or references to the rest of the elements." + mutable std::deque obtained_tensors_ TF_GUARDED_BY(mu_); + TF_OpKernelContext* ctx_; // not owned. +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_variable.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_variable.h new file mode 100644 index 00000000..157c5b45 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_variable.h @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_VARIABLE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_VARIABLE_H_ + +#include "absl/status/status.h" +#include "tensorflow/c/experimental/next_pluggable_device/c_api.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_variable.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +class CPluginOpKernelContext; + +class CPluginVariable : public PluginVariable { + public: + ~CPluginVariable() override; + explicit CPluginVariable(TF_VariableInfo* var_info) : var_info_(var_info) {} + + absl::Status GetTensor(const Tensor** result_tensor) override; + + absl::Status GetMutableTensor(Tensor** result_tensor) override; + + TF_VariableInfo* GetVariableInfo() { return var_info_; } + + friend class CPluginOpKernelContext; + + private: + absl::Status GetTensorInternal(); + + TF_VariableInfo* var_info_; // Owned. Cleared by destructor. + bool tensor_obtained_ = false; + tensorflow::Tensor tensor_; // Tensor obtained from variable. +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_C_PLUGIN_VARIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_coordination_service_agent.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_coordination_service_agent.h new file mode 100644 index 00000000..930efed4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_coordination_service_agent.h @@ -0,0 +1,69 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_COORDINATION_SERVICE_AGENT_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_COORDINATION_SERVICE_AGENT_H_
+
+#include <string>
+#include <string_view>
+
+#include "absl/time/time.h"
+#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/statusor.h"
+
+namespace tensorflow {
+
+class DirectPluginCoordinationServiceAgent
+    : public PluginCoordinationServiceAgent {
+ public:
+  explicit DirectPluginCoordinationServiceAgent(void* agent)
+      : agent_(reinterpret_cast<tsl::CoordinationServiceAgent*>(agent)) {}
+
+  bool IsInitialized() const override {
+    if (agent_ == nullptr) return false;
+    return agent_->IsInitialized();
+  }
+
+  absl::Status InsertKeyValue(std::string_view key,
+                              std::string_view value) override {
+    return agent_->InsertKeyValue(key, value);
+  }
+
+  absl::StatusOr<std::string> GetKeyValue(std::string_view key) override {
+    return agent_->GetKeyValue(key);
+  }
+
+  absl::StatusOr<std::string> GetKeyValue(std::string_view key,
+                                          absl::Duration timeout) override {
+    return agent_->GetKeyValue(key, timeout);
+  }
+
+  absl::StatusOr<std::string> TryGetKeyValue(std::string_view key) override {
+    return agent_->TryGetKeyValue(key);
+  }
+
+  absl::Status DeleteKeyValue(std::string_view key) override {
+    return agent_->DeleteKeyValue(key);
+  }
+
+ private:
+  tsl::CoordinationServiceAgent* agent_;  // Not owned.
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_COORDINATION_SERVICE_AGENT_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_op_kernel.h
new file mode 100644
index 00000000..3df3543b
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_op_kernel.h
@@ -0,0 +1,196 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_OP_KERNEL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_OP_KERNEL_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent_helper.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +class DirectPluginOpKernelConstruction : public PluginOpKernelConstruction { + public: + explicit DirectPluginOpKernelConstruction(void* ctx) + : ctx_(reinterpret_cast(ctx)) {} + + absl::Status GetBoolAttr(std::string_view attr_name, + bool* value) const override; + absl::Status GetInt32Attr(std::string_view attr_name, + int* value) const override; + absl::Status GetInt32AttrList(std::string_view attr_name, + std::vector* value) const override; + absl::Status GetInt64Attr(std::string_view attr_name, + int64_t* value) const override; + absl::Status GetStringAttr(std::string_view attr_name, + std::string* value) const override; + absl::Status GetFunctionAttr(std::string_view attr_name, + NameAttrList* function) const override; + + void CtxFailure(const absl::Status& status) override { + ctx_->CtxFailure(status); + } + + void CtxFailure(const char* file, int line, + const absl::Status& status) override { + ctx_->CtxFailure(file, line, status); + } + + void* GetContext() const override { return ctx_; } + + private: + OpKernelConstruction* ctx_; // not owned. 
+}; + +class DirectPluginOpKernelContext : public PluginOpKernelContext { + public: + explicit DirectPluginOpKernelContext(OpKernelContext* ctx) : ctx_(ctx) {} + + std::string_view GetResourceMgrDefaultContainerName() override; + + absl::Status LookupOrCreateResource(std::string_view container_name, + std::string_view plugin_resource_name, + void** result_plugin_resource, + void* (*create_func)(void*), + void* create_func_args, + void (*delete_func)(void*)) override; + + std::unique_ptr + GetPluginCoordinationServiceAgent() const override { + return CreatePluginCoordinationServiceAgent( + ctx_->coordination_service_agent()); + } + + absl::Status CreatePluginVariable(int index, + PluginVariable** variable) const override; + + absl::Status AllocateTempForPluginVariable(PluginVariable* variable) override; + + int NumInputs() const override { return ctx_->num_inputs(); } + + absl::Status GetInput(int index, const Tensor** tensor) const override; + + absl::Status GetInput(const char* name, const Tensor** tensor) const override; + + absl::Status GetInputRange(std::string_view name, + std::pair* range) const override; + + DataType GetInputDataType(int index) const override { + return ctx_->input_dtype(index); + } + + std::string_view GetOpKernelRequestedInput(int index) const override { + return ctx_->op_kernel().requested_input(index); + } + + std::string_view GetOpKernelName() const override { + return ctx_->op_kernel().name(); + } + + uint64_t GetFrameId() const override { return ctx_->frame_iter().frame_id; } + + int64_t GetIterId() const override { return ctx_->frame_iter().iter_id; } + + int64_t GetStepId() const override { return ctx_->step_id(); } + + int GetDeviceId() const override; + + std::string_view GetDeviceName() const override; + + std::string GetSessionName() const override { + return ctx_->session_metadata() ? ctx_->session_metadata()->name() : ""; + } + + absl::Status GetConfigProto(const ConfigProto** config_proto) const override { + *config_proto = ctx_->function_library()->config_proto(); + return absl::OkStatus(); + } + + void MaybeDeleteConfigProto(const ConfigProto* config_proto) const override { + // We don't need to specifically delete ConfigProto since it is obtained + // from FunctionLibraryRuntime in `ctx_`. + } + + absl::Status GetFunctionLibraryDefinition( + const FunctionLibraryDefinition** flib_def) const override { + *flib_def = ctx_->function_library()->GetFunctionLibraryDefinition(); + return absl::OkStatus(); + } + + void MaybeDeleteFunctionLibraryDefinition( + const FunctionLibraryDefinition* flib_def) const override { + // We don't need to specifically delete FunctionLibraryDefinition since it + // is obtained from FunctionLibraryRuntime in `ctx_`. + } + + absl::Status GetResourceHandle(int index, + const ResourceHandle** handle) const override { + *handle = &HandleFromInput(ctx_, index); + return absl::OkStatus(); + } + + void MaybeDeleteResourceHandle(const ResourceHandle* handle) const override { + // We don't need to specifically delete ResourceHandle since it is obtained + // from `ctx_`. 
+ } + + int GetGraphDefVersion() const override { + return ctx_->function_library()->graph_def_version(); + } + + absl::Status AllocateOutput(int index, const TensorShape& shape, + Tensor** out) override { + return ctx_->allocate_output(index, shape, out); + } + + absl::Status SetOutput(int index, const Tensor& tensor) override { + ctx_->set_output(index, tensor); + return absl::OkStatus(); + } + + void CtxFailure(const absl::Status& status) override { + ctx_->CtxFailure(status); + } + + void CtxFailure(const char* file, int line, + const absl::Status& status) override { + LOG(WARNING) << "Plugin OP_REQUIRES failed at " << file << ": " << line + << ": " << status; + ctx_->CtxFailure(file, line, status); + } + + void* GetContext() const override { return ctx_; } + + private: + OpKernelContext* ctx_; // not owned. +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_variable.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_variable.h new file mode 100644 index 00000000..bbbcfee6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_variable.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_VARIABLE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_VARIABLE_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_variable.h" +#include "tsl/platform/status.h" + +namespace tensorflow { + +class DirectPluginOpKernelContext; + +class DirectPluginVariable : public PluginVariable { + public: + DirectPluginVariable(int index, const std::string& name, Var* var); + absl::Status GetTensor(const Tensor** result_tensor) override { + *result_tensor = var_info_.var()->tensor(); + return absl::OkStatus(); + } + + absl::Status GetMutableTensor(Tensor** result_tensor) override { + *result_tensor = var_info_.var()->tensor(); + return absl::OkStatus(); + } + + VariableInfo* GetVariableInfo() { return &var_info_; } + + friend DirectPluginOpKernelContext; + + private: + VariableInfo var_info_{0, "", nullptr}; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_DIRECT_PLUGIN_VARIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/flags.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/flags.h new file mode 100644 index 00000000..681155e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/flags.h @@ -0,0 +1,23 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_FLAGS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_FLAGS_H_ + +#include "absl/flags/declare.h" + +ABSL_DECLARE_FLAG(bool, next_pluggable_device_use_c_api); + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h new file mode 100644 index 00000000..cb8ecf51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h @@ -0,0 +1,96 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/jit/pjrt_base_device.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_context.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/tfrt/common/async_value_tensor.h" + +namespace tensorflow { + +class NextPluggableDeviceAllocator; + +class NextPluggableDevice : public PjRtBaseDevice { + public: + struct Options { + // The device name's prefix (e.g., "/task:7") + string device_name_prefix; + + // The name of the device (e.g., "GPU") + string device_name; + + // The name of the compilation device (e.g., "XLA_TPU_JIT"); + string compilation_device_name; + + // The TfDeviceId. + int device_ordinal = -1; + + // A vector of ShapeDeterminationFn (i.e., a bundle of LayoutSelectionFn, + // ShapeRepresentationFn). Each bundle describes how the on-host shapes of + // a) argument and return value, for entry computations b) variables, for + // all computations, should be represented in XLA. Parameters/return values + // will be shaped according to the function pair, and reshaped back to/from + // their declared shapes for computations. Must be non-empty. + std::vector + shape_determination_fns; + }; + + NextPluggableDevice(const SessionOptions& session_options, + const Options& options); + + ~NextPluggableDevice() override; + + Allocator* GetAllocator(AllocatorAttributes attr) override; + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + + absl::Status Sync() override; + + void Sync(const DoneCallback& done) override; + + absl::Status TryGetDeviceContext(DeviceContext** out_context) override; + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + int GetDeviceOrdinal() const { return device_ordinal_; } + + private: + int device_ordinal_; + // Need to use RefCountPtr since DeviceContext is a ref counted object. + core::RefCountPtr device_context_; + std::unique_ptr tfnpd_allocator_; + std::unique_ptr pjrt_allocator_; + Allocator* allocator_ = nullptr; // Not owned. + std::unique_ptr accelerator_device_info_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_allocator.h new file mode 100644 index 00000000..15cb583b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_allocator.h @@ -0,0 +1,55 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_ALLOCATOR_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h" +#include "tensorflow/core/framework/allocator.h" + +class TFNPD_DeviceAllocator; + +namespace tensorflow { + +class NextPluggableDeviceAllocator : public Allocator { + public: + explicit NextPluggableDeviceAllocator(int device_ordinal); + + ~NextPluggableDeviceAllocator() override; + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + + void DeallocateRaw(void* ptr) override; + + std::string Name() override { return device_allocator_name_; } + + bool AllocatesOpaqueHandle() const override { + return allocates_opaque_handle_; + } + + private: + const TFNPD_Api* api_; + int device_ordinal_; + std::string device_allocator_name_; + bool allocates_opaque_handle_; + TFNPD_DeviceAllocator* device_allocator_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_api.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_api.h new file mode 100644 index 00000000..026febe2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_api.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_API_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_API_H_ + +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { + +// Global TFNPD_Api* singleton. 
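+// Illustrative initialization flow (a sketch, not part of the upstream header;
+// the plugin-exported symbol name `TFNPD_InitPlugin` and the exact StatusOr
+// payload type are assumptions):
+//
+//   absl::StatusOr<const TFNPD_Api*> api =
+//       InitNextPluggableDevicePlugin(TFNPD_InitPlugin);
+//   if (api.ok()) SetTfnpdApi(*api);  // Subsequent TfnpdApi() calls return it.
+//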
+const TFNPD_Api* TfnpdApi(); +void SetTfnpdApi(const TFNPD_Api* api); + +typedef const TFNPD_Api* (*TFNPDInitPluginFn)(TFNPD_PluginParams*, TF_Status*); +absl::StatusOr InitNextPluggableDevicePlugin( + TFNPDInitPluginFn init_fn); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_API_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_context.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_context.h new file mode 100644 index 00000000..185e5f5e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_context.h @@ -0,0 +1,53 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_CONTEXT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_CONTEXT_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/platform/status.h" + +class TFNPD_DeviceContext; + +namespace tensorflow { + +// Helper class for managing data transfers between host and accelerator +// devices. +class NextPluggableDeviceContext : public DeviceContext { + public: + explicit NextPluggableDeviceContext(int device_ordinal); + + ~NextPluggableDeviceContext() override; + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override; + + private: + const TFNPD_Api* api_; + TFNPD_DeviceContext* context_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h new file mode 100644 index 00000000..5ccfb6dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_FACTORY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_FACTORY_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c/plugin_c_api.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_api.h" +#include "tensorflow/core/framework/device_factory.h" + +namespace tensorflow { + +class NextPluggableDeviceFactory : public DeviceFactory { + public: + explicit NextPluggableDeviceFactory( + const std::string& device_type, + const std::string& compilation_device_name) + : api_(TfnpdApi()), + device_type_(device_type), + compilation_device_name_(compilation_device_name) {} + + absl::Status ListPhysicalDevices(std::vector* devices) override; + + absl::Status CreateDevices( + const SessionOptions& session_options, const std::string& name_prefix, + std::vector>* devices) override; + + const std::string& compilation_device_name() const { + return compilation_device_name_; + } + + private: + const TFNPD_Api* api_; + const std::string device_type_; + const std::string compilation_device_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_NEXT_PLUGGABLE_DEVICE_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h new file mode 100644 index 00000000..4d3a1734 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h @@ -0,0 +1,48 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_H_ + +#include +#include + +#include "absl/time/time.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +class PluginCoordinationServiceAgent { + public: + PluginCoordinationServiceAgent() = default; + virtual ~PluginCoordinationServiceAgent() = default; + + virtual bool IsInitialized() const = 0; + + virtual absl::Status InsertKeyValue(std::string_view key, + std::string_view value) = 0; + + virtual absl::StatusOr GetKeyValue(std::string_view key) = 0; + virtual absl::StatusOr GetKeyValue(std::string_view key, + absl::Duration timeout) = 0; + virtual absl::StatusOr TryGetKeyValue(std::string_view key) = 0; + + virtual absl::Status DeleteKeyValue(std::string_view key) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent_helper.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent_helper.h new file mode 100644 index 00000000..a5adfa50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent_helper.h @@ -0,0 +1,42 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_HELPER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_HELPER_H_ + +#include + +#include "absl/flags/flag.h" +#include "tensorflow/c/kernels.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c_plugin_coordination_service_agent.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_coordination_service_agent.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/flags.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_coordination_service_agent.h" + +namespace tensorflow { + +inline std::unique_ptr<PluginCoordinationServiceAgent> +CreatePluginCoordinationServiceAgent(void* agent) { + if (!absl::GetFlag(FLAGS_next_pluggable_device_use_c_api)) { + return std::make_unique<DirectPluginCoordinationServiceAgent>(agent); + } else { + return std::make_unique<CPluginCoordinationServiceAgent>(agent); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_COORDINATION_SERVICE_AGENT_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h new file mode 100644 index 00000000..b0123999 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h @@ -0,0 +1,174 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class ConfigProto; +class FunctionLibraryDefinition; +class OpInputList; +class PluginCoordinationServiceAgent; +class PluginVariable; +class Tensor; +class TensorShape; + +// A wrapper base class that provides convenience for developers to implement +// plugin OpKernels that suit internal and external requirements, without +// duplicating code. +// +// Internal build: Plugin and TF are built together and statically linked. In +// this case, we can directly cast between `TF_OpKernelContext*` and +// `OpKernelContext*`, and directly call the C++ API. This way we don't need to +// pay the potential performance penalty (e.g. proto +// serialization/deserialization) brought by the C API. + +// External build: Plugin and TF are built separately (potentially on different +// platforms and by different compilers).
Plugin is dynamically loaded by TF. +// In this case, we need to call the C API to ensure binary compatibility. + +// `DirectPluginOpKernel*` and `CPluginOpKernel*` implement `PluginOpKernel*` +// to support the above-mentioned internal and external build cases. OpKernel +// developers can conveniently use the `Wrapper` C++ API to implement `Create` +// and `Compute` functions, and use the helper macro to register the functions +// as a Plugin OpKernel. This method benefits kernel developers in two ways: 1) +// Plugin OpKernel developers don't have to deal with the C API directly. 2) If +// the OpKernels are performance critical and developers want to introduce an +// internal version of the same OpKernels, they don't have to implement them +// again with mostly duplicated code. +class PluginOpKernelConstruction { + public: + PluginOpKernelConstruction() = default; + virtual ~PluginOpKernelConstruction() = default; + + virtual absl::Status GetBoolAttr(std::string_view attr_name, + bool* value) const = 0; + virtual absl::Status GetInt32Attr(std::string_view attr_name, + int* value) const = 0; + virtual absl::Status GetInt32AttrList(std::string_view attr_name, + std::vector<int32_t>* value) const = 0; + virtual absl::Status GetInt64Attr(std::string_view attr_name, + int64_t* value) const = 0; + virtual absl::Status GetStringAttr(std::string_view attr_name, + std::string* value) const = 0; + virtual absl::Status GetFunctionAttr(std::string_view attr_name, + NameAttrList* function) const = 0; + + virtual void CtxFailure(const absl::Status& status) = 0; + virtual void CtxFailure(const char* file, int line, + const absl::Status& status) = 0; + + virtual void* GetContext() const = 0; +}; + +class PluginOpKernelContext { + public: + PluginOpKernelContext() = default; + virtual ~PluginOpKernelContext() = default; + + virtual std::string_view GetResourceMgrDefaultContainerName() = 0; + + virtual absl::Status LookupOrCreateResource( + std::string_view container_name, std::string_view plugin_resource_name, + void** result_plugin_resource, void* (*create_func)(void*), + void* create_func_args, void (*delete_func)(void*)) = 0; + + virtual std::unique_ptr<PluginCoordinationServiceAgent> + GetPluginCoordinationServiceAgent() const = 0; + + // This method allocates a new `PluginVariable`. The caller is responsible + // for managing its lifetime.
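+  // Illustrative call pattern from a plugin kernel's Compute function (a
+  // sketch, not part of the upstream header; the input index and deletion via
+  // `delete` are assumptions):
+  //
+  //   PluginVariable* var = nullptr;
+  //   PLUGIN_OP_REQUIRES_OK(ctx, ctx->CreatePluginVariable(/*index=*/0, &var));
+  //   const Tensor* t = nullptr;
+  //   PLUGIN_OP_REQUIRES_OK(ctx, var->GetTensor(&t));
+  //   delete var;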
+ virtual absl::Status CreatePluginVariable( + int index, PluginVariable** variable) const = 0; + + virtual absl::Status AllocateTempForPluginVariable( + PluginVariable* variable) = 0; + + virtual int NumInputs() const = 0; + + virtual absl::Status GetInput(int index, const Tensor** tensor) const = 0; + + virtual absl::Status GetInput(const char* name, + const Tensor** tensor) const = 0; + + virtual absl::Status GetInputRange(std::string_view name, + std::pair* range) const = 0; + + virtual DataType GetInputDataType(int index) const = 0; + + virtual std::string_view GetOpKernelRequestedInput(int index) const = 0; + + virtual std::string_view GetOpKernelName() const = 0; + + virtual uint64_t GetFrameId() const = 0; + + virtual int64_t GetIterId() const = 0; + + virtual int64_t GetStepId() const = 0; + + virtual int GetDeviceId() const = 0; + + virtual std::string_view GetDeviceName() const = 0; + + virtual std::string GetSessionName() const = 0; + + virtual absl::Status GetConfigProto( + const ConfigProto** config_proto) const = 0; + + virtual void MaybeDeleteConfigProto( + const ConfigProto* config_proto) const = 0; + + virtual absl::Status GetFunctionLibraryDefinition( + const FunctionLibraryDefinition** flib_def) const = 0; + + virtual void MaybeDeleteFunctionLibraryDefinition( + const FunctionLibraryDefinition* flib_def) const = 0; + + virtual absl::Status GetResourceHandle( + int index, const ResourceHandle** handle) const = 0; + + virtual void MaybeDeleteResourceHandle( + const ResourceHandle* handle) const = 0; + + virtual int GetGraphDefVersion() const = 0; + + virtual absl::Status AllocateOutput(int index, const TensorShape& shape, + Tensor** out) = 0; + + virtual absl::Status SetOutput(int index, const Tensor& tensor) = 0; + + virtual void CtxFailure(const absl::Status& status) = 0; + virtual void CtxFailure(const char* file, int line, + const absl::Status& status) = 0; + + virtual void* GetContext() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel_helper.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel_helper.h new file mode 100644 index 00000000..1f51f7c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel_helper.h @@ -0,0 +1,124 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_HELPER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_HELPER_H_ + +#include "absl/flags/flag.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/direct_plugin_op_kernel.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/flags.h" +#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { + +inline PluginOpKernelConstruction* CreatePluginOpKernelConstruction(void* ctx) { + if (!absl::GetFlag(FLAGS_next_pluggable_device_use_c_api)) { + return new DirectPluginOpKernelConstruction(ctx); + } else { + return new CPluginOpKernelConstruction(ctx); + } +} + +inline void DeletePluginOpKernelConstruction( + PluginOpKernelConstruction* wrapper) { + delete wrapper; +} + +inline PluginOpKernelContext* CreatePluginOpKernelContext(void* ctx) { + if (!absl::GetFlag(FLAGS_next_pluggable_device_use_c_api)) { + return new DirectPluginOpKernelContext( + reinterpret_cast(ctx)); + } else { + return new CPluginOpKernelContext(ctx); + } +} + +inline void DeletePluginOpKernelContext(PluginOpKernelContext* wrapper) { + delete wrapper; +} + +#define PLUGIN_OP_REQUIRES_OK(CTX, ...) \ + do { \ + absl::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + (CTX)->CtxFailure(__FILE__, __LINE__, _s); \ + return; \ + } \ + } while (0) + +// A helper to register C OpKernel. CREATE_FN, COMPUTE_FN, and DELETE_FN are +// expected to be defined in the same file where this macro is used. +// +// HOST_MEMORY_ARGS a string containing names of args to be placed on host +// memory. Names are expected to be comma separated. +// +// TODO(chuanhao): simplify the registration macro. reference: +// REGISTER_KERNEL_BUILDER +#define REGISTER_WRAPPED_C_OPKERNEL_HOST_MEM_ARGS( \ + KERNEL_NAME, CREATE_FN, COMPUTE_FN, DELETE_FN, DEVICE, PRIORITY, \ + HOST_MEMORY_ARGS) \ + { \ + typedef void* (*wrapped_create_func)(TF_OpKernelConstruction*); \ + typedef void (*wrapped_compute_func)(void*, TF_OpKernelContext*); \ + \ + TF_StatusPtr status_ptr(TF_NewStatus()); \ + \ + wrapped_create_func create_func = \ + [](TF_OpKernelConstruction* ctx) -> void* { \ + PluginOpKernelConstruction* ctx_wrapper = \ + CreatePluginOpKernelConstruction(ctx); \ + void* kernel = CREATE_FN(ctx_wrapper); \ + delete ctx_wrapper; \ + return kernel; \ + }; \ + \ + wrapped_compute_func compute_func = [](void* kernel, \ + TF_OpKernelContext* ctx) -> void { \ + PluginOpKernelContext* ctx_wrapper = CreatePluginOpKernelContext(ctx); \ + COMPUTE_FN(kernel, ctx_wrapper); \ + delete ctx_wrapper; \ + }; \ + \ + auto* builder = TF_NewKernelBuilder(KERNEL_NAME, DEVICE, create_func, \ + compute_func, &DELETE_FN); \ + \ + /* NOTE: We explicitly set the priority to 1 to overwrite the */ \ + /* StreamExecutor based OpKernel of the same op. 
*/ \ + TF_KernelBuilder_Priority(builder, PRIORITY); \ + \ + std::stringstream s_stream(HOST_MEMORY_ARGS); \ + while (s_stream.good()) { \ + std::string host_mem_arg; \ + std::getline(s_stream, host_mem_arg, ','); \ + if (host_mem_arg.empty()) break; \ + TF_KernelBuilder_HostMemory(builder, host_mem_arg.c_str()); \ + } \ + \ + TF_RegisterKernelBuilder(KERNEL_NAME, builder, status_ptr.get()); \ + CHECK_EQ(TF_OK, TF_GetCode(status_ptr.get())) \ + << "Error while registering " << KERNEL_NAME << " kernel."; \ + } + +#define REGISTER_WRAPPED_C_OPKERNEL(KERNEL_NAME, CREATE_FN, COMPUTE_FN, \ + DELETE_FN, DEVICE, PRIORITY) \ + REGISTER_WRAPPED_C_OPKERNEL_HOST_MEM_ARGS( \ + KERNEL_NAME, CREATE_FN, COMPUTE_FN, DELETE_FN, DEVICE, PRIORITY, "") + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_OP_KERNEL_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h new file mode 100644 index 00000000..c72fe952 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h @@ -0,0 +1,56 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_RESOURCE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_RESOURCE_H_ + +#include +#include + +#include "tensorflow/core/framework/resource_base.h" + +namespace tensorflow { + +// A wrapper class for plugin to create resources to the ResourceMgr managed by +// TensorFlow. The main motivation is to make resources in plugin have the same +// lifetime as TensorFlow ResourceMgr. +// +// Usage: +// Plugin uses a TensorFlow C API `TF_CreatePluginResource()`, +// to register the `PluginResource` to the ResourceMgr managed by TensorFlow. +// `PluginResource` holds a opaque pointer and a deleter function. The deleter +// will be called at `PluginResource`'s destruction. 
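+//
+// Illustrative lifetime (a sketch; `MyPluginState` is a hypothetical plugin
+// type, and the exact C-API call shape is deliberately omitted):
+//   1. The plugin allocates a `MyPluginState` and hands TensorFlow the raw
+//      pointer together with a deleter such as
+//      `[](void* p) { delete static_cast<MyPluginState*>(p); }`.
+//   2. TensorFlow wraps both in a `PluginResource` owned by its ResourceMgr.
+//   3. When the ResourceMgr releases the resource, `~PluginResource()` invokes
+//      the deleter, freeing the `MyPluginState`.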
+class PluginResource : public ResourceBase { + public: + PluginResource(void* plugin_resource, std::string_view plugin_resource_name, + void (*delete_func)(void* plugin_resource)) + : resource_(plugin_resource), + resource_name_(plugin_resource_name), + delete_func_(delete_func) {} + ~PluginResource() override; + + void* GetOpaquePluginResource() { return resource_; } + + std::string DebugString() const override { return resource_name_; } + + private: + void* resource_; + std::string resource_name_; + void (*delete_func_)(void* plugin_resource); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_RESOURCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_variable.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_variable.h new file mode 100644 index 00000000..ab2ec9a2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/plugin_variable.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_VARIABLE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_VARIABLE_H_ + +#include "tsl/platform/status.h" + +namespace tensorflow { + +class Tensor; + +// A helper base class that wraps tensorflow::VariableInfo for the convenience +// of passing it between the plugin and TensorFlow. Similar to +// `PluginOpKernelContext`, the implementations can accommodate the "Internal +// build" and "External build" cases, meaning the plugin is built either +// together with or separately from TensorFlow. In the respective build modes, +// the implementations can either include tensorflow::VariableInfo and use the +// C++ API directly, or include the C structure `TF_VariableInfo` and use the +// corresponding C API. +class PluginVariable { + public: + PluginVariable() = default; + virtual ~PluginVariable() = default; + + // `result_tensor` will point to the tensor held by the variable if the + // returned status is OK. + virtual absl::Status GetTensor(const Tensor** result_tensor) = 0; + + virtual absl::Status GetMutableTensor(Tensor** result_tensor) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_PLUGIN_VARIABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/utils.h new file mode 100644 index 00000000..9739c009 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/next_pluggable_device/utils.h @@ -0,0 +1,28 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_UTILS_H_ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/c/c_api_decl.h" + +namespace tensorflow { + +XLA_LayoutPreference ConvertToCXlaLayoutPreference(XlaLayoutPreference input); +XlaLayoutPreference ConvertFromCXlaLayoutPreference(XLA_LayoutPreference input); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NEXT_PLUGGABLE_DEVICE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/no_op_cost_measurement.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/no_op_cost_measurement.h new file mode 100644 index 00000000..6c2cc659 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/no_op_cost_measurement.h @@ -0,0 +1,39 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NO_OP_COST_MEASUREMENT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NO_OP_COST_MEASUREMENT_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/cost_measurement.h" +#include "tensorflow/core/common_runtime/cost_measurement_registry.h" + +namespace tensorflow { + +// This class does not do the real cost measurement. It will always return zero +// Duration as the total cost. It's created to allow callers to skip collecting +// costs. +class NoOpCostMeasurement : public CostMeasurement { + public: + using CostMeasurement::CostMeasurement; + + // Always returns zero Duration as the total cost. + absl::Duration GetTotalCost() override; + absl::string_view GetCostType() const override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NO_OP_COST_MEASUREMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/node_file_writer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/node_file_writer.h new file mode 100644 index 00000000..4b92453b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/node_file_writer.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NODE_FILE_WRITER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NODE_FILE_WRITER_H_ + +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +// Writes out the NodeDef and the input shapes/dtypes for an executed node to a +// file. This allows the set of executed nodes for a model or test to be +// examined and processed. Currently this is used by an internal tool which +// checks that ops executed by tests are deterministic. +class NodeFileWriter { + public: + // Creates or reuses a NodeFileWriter if environmental variable + // TF_NODE_FILE_WRITER_DIRECTORY is set, which specifies the directory where + // the node file will be created in. Otherwise, returns nullptr. When called + // with the same device_name, the same NodeFileWriter will be returned. + static absl::StatusOr GetNodeFileWriterIfEnabled( + const std::string& device_name, Env* env); + + // Records the execution of a node, if eligible, by writing the node to the + // file. Only writes the node if the exact node with the given input + // shapes/dtypes hasn't already been written. Should be called once every time + // a node is run. + absl::Status RecordNodeExecution(OpKernel* op_kernel, + OpKernelContext* context); + + const std::string& filename() { return filename_; } + + private: + explicit NodeFileWriter(std::string filename) + : filename_{std::move(filename)} {} + + absl::Status Init(Env* env) { + return env->NewWritableFile(filename_, &node_def_file_); + } + + // Writes the NodeDef to a file, if it hasn't already been written yet. + absl::Status MaybeWriteNodeDefToFile(const NodeDef& def); + + const std::string filename_; + mutex mu_; + // Hashes of the NodeDefs already written to the file + absl::flat_hash_set written_hashes_ TF_GUARDED_BY(mu_); + + std::unique_ptr node_def_file_ TF_PT_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NODE_FILE_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/null_request_cost_accessor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/null_request_cost_accessor.h new file mode 100644 index 00000000..daae603f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/null_request_cost_accessor.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_NULL_REQUEST_COST_ACCESSOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_NULL_REQUEST_COST_ACCESSOR_H_ + +#include "tensorflow/core/common_runtime/request_cost_accessor_registry.h" + +namespace tensorflow { + +// NullRequestCostAccessor always returns nullptr as the RequestCost of current +// rpc. It's created to allow callers to skip collecting the request cost. +class NullRequestCostAccessor : public RequestCostAccessor { + public: + // Always returns nullptr as the RequestCost of current rpc. + RequestCost* GetRequestCost() const override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_NULL_REQUEST_COST_ACCESSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimization_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimization_registry.h new file mode 100644 index 00000000..9de93a6b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimization_registry.h @@ -0,0 +1,191 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Classes to maintain a static registry of whole-graph optimization +// passes to be applied by the Session when it initializes a graph. +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZATION_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZATION_REGISTRY_H_ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/composite_device.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/costmodel.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +struct SessionOptions; + +// All the parameters used by an optimization pass are packaged in +// this struct. They should be enough for the optimization pass to use +// as a key into a state dictionary if it wants to keep state across +// calls. +struct GraphOptimizationPassOptions { + // Filled in by DirectSession for PRE_PLACEMENT optimizations. Can be empty. + string session_handle; + const SessionOptions* session_options = nullptr; + const CostModel* cost_model = nullptr; + + FunctionLibraryDefinition* flib_def = nullptr; // Not owned. + // The DeviceSet contains all the devices known to the system and is + // filled in for optimizations run by the session master, i.e., + // PRE_PLACEMENT, POST_PLACEMENT, and POST_REWRITE_FOR_EXEC. It is + // nullptr for POST_PARTITIONING optimizations which are run at the + // workers. + const DeviceSet* device_set = nullptr; // Not owned. + + // Maps from a CompositeDevice name to a list of underlying physical + // devices. + const std::vector* composite_devices = + nullptr; // Not owned. 
+ + // The graph to optimize, for optimization passes that run before + // partitioning. Null for post-partitioning passes. + // An optimization pass may replace *graph with a new graph object. + std::unique_ptr* graph = nullptr; + + // Graphs for each partition, if running post-partitioning. Optimization + // passes may alter the graphs, but must not add or remove partitions. + // Null for pre-partitioning passes. + std::unordered_map>* partition_graphs = + nullptr; + + // Indicator of whether or not the graph was derived from a function. + bool is_function_graph = false; + // Set when is_function_graph is true. The default device where the function + // runs. If nullptr, it runs on the local host. + const Device* default_function_device = nullptr; + // Set when is_function_graph is true. The function where the graph was + // derived. `graph` doesn't contain all the information in the function_def, + // e.g. function attributes. + const FunctionDef* function_def = nullptr; + + // TODO(b/176491312): Remove this if shape inference on import flag is + // removed. If True, allows mlir roundtrip to run shape inference on import. + bool shape_inference_on_tfe_dialect_import = true; + + // A unique filename prefix (using hostname, process ID, thread ID and + // timestamp) for graph dumps. + string debug_filename_prefix; + + // Whether to enable tf2xla mlir bridge in compiling SavedModel. + bool enable_tf2xla_mlir_bridge = true; +}; + +// Optimization passes are implemented by inheriting from +// GraphOptimizationPass. +class GraphOptimizationPass { + public: + virtual ~GraphOptimizationPass() {} + virtual absl::Status Run(const GraphOptimizationPassOptions& options) = 0; + void set_name(const string& name) { name_ = name; } + string name() const { return name_; } + + private: + // The name of the optimization pass, which is the same as the inherited + // class name. + string name_; +}; + +// The key is a 'phase' number. Phases are executed in increasing +// order. Within each phase the order of passes is undefined. +typedef std::map>> + GraphOptimizationPasses; + +// A global OptimizationPassRegistry is used to hold all passes. +class OptimizationPassRegistry { + public: + // Groups of passes are run at different points in initialization. + enum Grouping { + PRE_PLACEMENT, // after cost model assignment, before placement. + POST_PLACEMENT, // after placement. + POST_REWRITE_FOR_EXEC, // after re-write using feed/fetch endpoints. + POST_PARTITIONING, // after partitioning + }; + + // Add an optimization pass to the registry. + void Register(Grouping grouping, int phase, + std::unique_ptr pass); + + const std::map& groups() { + return groups_; + } + + // Run all passes in grouping, ordered by phase, with the same + // options. + absl::Status RunGrouping(Grouping grouping, + const GraphOptimizationPassOptions& options); + + // Returns the global registry of optimization passes. + static OptimizationPassRegistry* Global(); + + // Prints registered optimization passes for debugging. 
+ void LogGrouping(Grouping grouping, int vlog_level); + void LogAllGroupings(int vlog_level); + + private: + std::map groups_; + + const char* GetGroupingName(Grouping grouping) const { + switch (grouping) { + case PRE_PLACEMENT: + return "pre_placement"; + case POST_PLACEMENT: + return "post_placement"; + case POST_REWRITE_FOR_EXEC: + return "post_rewrite_for_exec"; + case POST_PARTITIONING: + return "post_partitioning"; + } + return "unknown"; + } +}; + +namespace optimization_registration { + +class OptimizationPassRegistration { + public: + OptimizationPassRegistration(OptimizationPassRegistry::Grouping grouping, + int phase, + std::unique_ptr pass, + string optimization_pass_name) { + pass->set_name(optimization_pass_name); + OptimizationPassRegistry::Global()->Register(grouping, phase, + std::move(pass)); + } +}; + +} // namespace optimization_registration + +#define REGISTER_OPTIMIZATION(grouping, phase, optimization) \ + REGISTER_OPTIMIZATION_UNIQ_HELPER(__COUNTER__, grouping, phase, optimization) + +#define REGISTER_OPTIMIZATION_UNIQ_HELPER(ctr, grouping, phase, optimization) \ + REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) + +#define REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) \ + static ::tensorflow::optimization_registration::OptimizationPassRegistration \ + register_optimization_##ctr( \ + grouping, phase, \ + ::std::unique_ptr<::tensorflow::GraphOptimizationPass>( \ + new optimization()), \ + #optimization) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZATION_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_cross_host_control_deps.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_cross_host_control_deps.h new file mode 100644 index 00000000..dde9d3e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_cross_host_control_deps.h @@ -0,0 +1,50 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_CROSS_HOST_CONTROL_DEPS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_CROSS_HOST_CONTROL_DEPS_H_ + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Optimize the graph by reducing cross-host control output edges. +// Once we find any nodes in the graph having not less than +// `cross_host_edges_threshold` control output edges in one host, we create +// a `NoOp` node in the destination host to proxy the control edges between the +// oringal node and the destination control output nodes. +absl::Status OptimizeCrossHostControlOutputEdges( + Graph* graph, int cross_host_edges_threshold); + +// Optimize the graph by reducing cross-host data output edges. 
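Usage sketch for the registry above (not part of the vendored header): a pass derives from GraphOptimizationPass, overrides Run(), and registers itself through REGISTER_OPTIMIZATION, which expands to a static OptimizationPassRegistration. The pass name MyNoopPass and the POST_PLACEMENT/phase-0 choice are illustrative assumptions.

    // Sketch only: a do-nothing pass registered at POST_PLACEMENT, phase 0.
    #include "tensorflow/core/common_runtime/optimization_registry.h"

    namespace tensorflow {

    class MyNoopPass : public GraphOptimizationPass {
     public:
      absl::Status Run(const GraphOptimizationPassOptions& options) override {
        // Runs post-placement, so options.graph is expected to be set;
        // this sketch inspects nothing and leaves the graph unchanged.
        return absl::OkStatus();
      }
    };

    // Names the pass "MyNoopPass" and adds it to the global registry at load time.
    REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_PLACEMENT, 0, MyNoopPass);

    }  // namespace tensorflow
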
+// Once we find any nodes in the graph having not less than +// `cross_host_edges_threshold` data output edges in one host, we create +// a `IdentityN` node in the destination host to proxy the data edges between +// the original node and the destination output nodes. +absl::Status OptimizeCrossHostDataOutputEdges(Graph* graph, + int cross_host_edges_threshold); + +// Optimize the graph by reducing cross-host control input edges. +// Once we find any nodes in the graph having not less than +// `cross_host_edges_threshold` control input edges in one host, we create +// a `NoOp` node in the source host to proxy the control edges between the +// source control input nodes and oringal node. +absl::Status OptimizeCrossHostControlInputEdges(Graph* graph, + int cross_host_edges_threshold); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_CROSS_HOST_CONTROL_DEPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_function_graph_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_function_graph_utils.h new file mode 100644 index 00000000..d5cd2159 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimize_function_graph_utils.h @@ -0,0 +1,94 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file contains util functions related to function graph instantiation and +// optimizations. +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_FUNCTION_GRAPH_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_FUNCTION_GRAPH_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/time/time.h" +#include "tensorflow/core/common_runtime/composite_device.h" +#include "tensorflow/core/common_runtime/optimized_function_graph_info.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +// TODO(b/246646753): add more tests. + +// The name of the env variable for the caching location of graph optimization. +// Note: if the caching location retrieved by the env variable is empty it means +// no caching would be performed. +static const char kGraphCachingEnvVariableName[] = "TF_GRAPH_CACHING"; +// The threshold of the graph optimization duration to be cached. +// Note: setting this threshold to 0 means to cache for every function. +constexpr absl::Duration kCachingThresholdDuration = absl::Seconds(3); + +// TODO(iga): Reword +// Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the +// corresponding resource lives. This ensures that the Placer assigns ops that +// access these resources to the appropriate devices. 
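Usage sketch for the three cross-host edge passes declared in this header (not part of the vendored header); the ReduceCrossHostEdges wrapper and the threshold of 8 edges are illustrative assumptions.

    // Sketch only: run all three cross-host edge reductions on a placed graph.
    #include "tensorflow/core/common_runtime/optimize_cross_host_control_deps.h"
    #include "tensorflow/core/platform/errors.h"  // TF_RETURN_IF_ERROR

    absl::Status ReduceCrossHostEdges(tensorflow::Graph* graph) {
      constexpr int kCrossHostEdgesThreshold = 8;  // Illustrative value.
      TF_RETURN_IF_ERROR(tensorflow::OptimizeCrossHostControlOutputEdges(
          graph, kCrossHostEdgesThreshold));
      TF_RETURN_IF_ERROR(tensorflow::OptimizeCrossHostDataOutputEdges(
          graph, kCrossHostEdgesThreshold));
      TF_RETURN_IF_ERROR(tensorflow::OptimizeCrossHostControlInputEdges(
          graph, kCrossHostEdgesThreshold));
      return absl::OkStatus();
    }
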
+absl::Status PinArgsAndRets(const std::vector& input_devices, + const std::vector& output_devices, + const DeviceSet& device_set, + const std::vector& arg_nodes, + const std::vector& ret_nodes, + const FunctionLibraryDefinition* lib_def, + Device* default_device); + +// Outputs graph optimization result after all the graph optimization (up till +// before graph partitioning); returns error if optimization fails. Note that +// the `input_lib_def` will be used only if the lib_def in `options` is nullptr. +absl::StatusOr OptimizeFunctionGraph( + const string& function_name, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def, + const std::vector& composite_devices, Device* cpu_device, + Device* default_device, Env* env, + OptimizedFunctionGraph::OptimizationSource optimization_source); + +// Outputs graph optimization results (as OptimizedFunctionGraphInfo proto), +// either by running the actual graph optimization passes, or by reloading from +// the file cache if existent. If cache loading fails, it goes ahead and runs +// the graph optimization passes. Returns error if running the optimization +// passes fails. +absl::StatusOr +OptimizeFunctionGraphOrReadFromFileCache( + const string& function_name, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def, + const std::vector& composite_devices, Device* cpu_device, + Device* default_device, Env* env, + absl::Duration caching_threshold_duration = kCachingThresholdDuration); + +// Pre-processes, partitions and post-optimizes the input graph; returns +// subgraph result (maps from device name to the subgraph); returns error if any +// optimization or partitioning step fails. +absl::StatusOr< + std::unique_ptr>>> +PreprocessAndPartitionGraph( + const std::string& function_name, + OptimizedFunctionGraphInfo& input_optimized_graph, + const FunctionLibraryRuntime::InstantiateOptions& options, + const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def, + const std::vector& composite_devices, Device* cpu_device, + Env* env); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZE_FUNCTION_GRAPH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimized_function_graph_info.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimized_function_graph_info.h new file mode 100644 index 00000000..c23d7221 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/optimized_function_graph_info.h @@ -0,0 +1,90 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
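The caching constants above imply a simple gate: results are written to the file cache only when TF_GRAPH_CACHING names a non-empty location and the optimization ran for at least the threshold (a zero threshold caches every function). A sketch of that gating logic follows; ShouldCacheOptimizedGraph is a hypothetical helper, not part of this header, and the real decision is internal to OptimizeFunctionGraphOrReadFromFileCache.

    // Sketch only: the caching decision implied by the constants above.
    #include <cstdlib>
    #include <string>

    #include "absl/time/time.h"

    bool ShouldCacheOptimizedGraph(absl::Duration optimization_duration,
                                   absl::Duration caching_threshold) {
      const char* cache_location = std::getenv("TF_GRAPH_CACHING");
      // An unset or empty caching location disables caching entirely.
      if (cache_location == nullptr || std::string(cache_location).empty()) {
        return false;
      }
      // A threshold of zero caches every function; otherwise only slow
      // optimizations (>= threshold) are worth writing to the cache.
      return optimization_duration >= caching_threshold;
    }
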
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZED_FUNCTION_GRAPH_INFO_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZED_FUNCTION_GRAPH_INFO_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/optimized_function_graph.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// Function graph related information after optimizations. This struct can be +// converted to and from +// third_party/tensorflow/core/framework/optimized_function_graph.proto. +struct OptimizedFunctionGraphInfo { + // Function name. + string name; + // Optimized function graph. + std::unique_ptr function_graph; + // Optimized function library. + FunctionLibraryDefinition lib_def; + // Map from original node names to control return names. + std::unordered_map node_name_to_control_ret; + // Return node types of the function. + DataTypeVector ret_types; + // Number of return nodes. + size_t num_return_nodes; + // Time (in microseconds) spent on running the graph optimization passes for + // this function. + uint64_t optimization_duration_usecs; + // Indicates the source environment where the optimization is created. + OptimizedFunctionGraph::OptimizationSource optimization_source; + + ~OptimizedFunctionGraphInfo() = default; + OptimizedFunctionGraphInfo() : lib_def(OpRegistry::Global()) {} + OptimizedFunctionGraphInfo( + const std::string& name, std::unique_ptr&& graph, + FunctionLibraryDefinition&& lib_def, + const std::unordered_map& node_name_to_control_ret, + const DataTypeVector& ret_types, size_t num_return_nodes, + uint64_t optimization_duration_usecs, + OptimizedFunctionGraph::OptimizationSource optimization_source) + : name(name), + function_graph(std::move(graph)), + lib_def(std::move(lib_def)), + node_name_to_control_ret(node_name_to_control_ret), + ret_types(ret_types), + num_return_nodes(num_return_nodes), + optimization_duration_usecs(optimization_duration_usecs), + optimization_source(optimization_source) {} + + OptimizedFunctionGraphInfo(OptimizedFunctionGraphInfo& info) = delete; + OptimizedFunctionGraphInfo& operator=(OptimizedFunctionGraphInfo& info) = + delete; + OptimizedFunctionGraphInfo(OptimizedFunctionGraphInfo&& info) = + default; // NOLINT + OptimizedFunctionGraphInfo& operator=( + OptimizedFunctionGraphInfo&& info) noexcept = default; // NOLINT + + // Converts from the struct to OptimizedFunctionGraph proto. + static OptimizedFunctionGraph ToProto(const OptimizedFunctionGraphInfo& info); + + // Converts from the proto to struct OptimizedFunctionGraphInfo. Returns error + // if the conversion fails. + static absl::StatusOr FromProto( + OptimizedFunctionGraph&& proto); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_OPTIMIZED_FUNCTION_GRAPH_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/partitioning_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/partitioning_utils.h new file mode 100644 index 00000000..6bc9befb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/partitioning_utils.h @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
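Usage sketch of the proto round-trip supported by OptimizedFunctionGraphInfo (not part of the vendored header); the RoundTrip wrapper is a hypothetical caller.

    // Sketch only: convert the struct to its proto form and back.
    #include <utility>

    #include "tensorflow/core/common_runtime/optimized_function_graph_info.h"

    absl::Status RoundTrip(const tensorflow::OptimizedFunctionGraphInfo& info) {
      // To proto, e.g. before writing to the graph-optimization file cache.
      tensorflow::OptimizedFunctionGraph proto =
          tensorflow::OptimizedFunctionGraphInfo::ToProto(info);

      // Back to the in-memory form; FromProto consumes the proto and may fail.
      absl::StatusOr<tensorflow::OptimizedFunctionGraphInfo> restored =
          tensorflow::OptimizedFunctionGraphInfo::FromProto(std::move(proto));
      return restored.status();
    }
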
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Given a `device_set` and a `graph`, partitions the `graph` into +// `subgraphs`. `subgraphs` maps device names to the graph assigned to that +// device. `graph` must have been placed (e.g. by running Placer), +// i.e. all nodes must have an assigned_device set. +// `graph` is non-const because the underlying Partition() function transforms +// the graph to correctly partition distributed control flow. +// `get_tensor_name_attr` computes the "tensor_name" attr value of Send/Recv ops +// inserted during partitioning. Use the default one if not set. It needs to be +// thread safe if it's shared in multple threads. +absl::Status PartitionFunctionGraph( + const DeviceSet& device_set, std::unique_ptr graph, + std::unordered_map>* subgraphs, + std::function get_tensor_name_attr = nullptr); + +// Inserts send/recv ops to `graph` if nodes are assigned to multiple devices. +// Returns the new graph with the added nodes. Moreover, the dependency between +// a send/recv pair is made explicit by adding a control dependency between +// them. +// Note that, the returned graph is intended to be used by TF MLIR importer. +// The dependencies between send/recv pairs ensure the importer will generate TF +// MLIR ops in a valid order. +absl::StatusOr> InsertTransferOps( + const DeviceSet& device_set, std::unique_ptr graph); + +// This function performs bookkeeping to track which `Arg` and `Retval` nodes +// were placed on a particular device / graph. +// +// More specifically, this function +// +// (1) rewrites the indices of the `Arg` and `Retval` nodes in `graph` to be +// consecutive. +// +// These indices might not be consecutive after grappler's pruning +// optimization (e.g. removing redundant Args), or graph partitioning. In +// the latter case, the nodes in `graph` are placed on `device_type`, and +// each such graph partition gets a subset of the arguments and return +// values. The `index` attributes of these _Arg and _Retval nodes reflect +// the indices of these parameters in the original function. To convert +// `subgraph` to a function, we need to replace there original indices with +// 0, 1, 2, ... . +// +// The argument and return value order in `graph` is determined by the +// argument and return value order in the original function. This stability +// is important because it enables us to treat a single-partition function +// as having the same signature as the subgraph. +// +// (2) records the subsets of `Arg` and `Retval` nodes assigned to the +// device in `*_indices`, and +// (3) records which `Arg` and `Retval` nodes live in host memory in +// `*_alloc_attrs`. If these vectors are NULL, do nothing here. 
If +// `ints_on_device` is false, int32 `Arg` and `Retval` nodes are placed on +// host else not. This is needed because in certain special cases e.g. +// when graph is placed on TPU/XLA device or when the `Retval` is an output +// of an iterator, int32 tensors live on device. +absl::Status UpdateArgAndRetvalMetadata( + Graph* graph, std::vector* arg_indices, + std::vector* ret_indices, + std::vector* arg_alloc_attrs, + std::vector* ret_alloc_attrs, bool ints_on_device); + +// Utility for generating function names not present in `flib_def`, using +// given `name` as the base for the name. +class FunctionNameGenerator { + public: + // `flib_def` must outlive this. + FunctionNameGenerator(const FunctionLibraryDefinition* flib_def, + const string& name) + : flib_def_(flib_def), name_(name), counter_(0) {} + + // Returns a function name not present in `flib_def` using `name` as + // the base and appending a numeric suffix. + string GetName(); + + private: + const FunctionLibraryDefinition* flib_def_; + const string name_; + uint32 counter_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pending_counts.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pending_counts.h new file mode 100644 index 00000000..cff837ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pending_counts.h @@ -0,0 +1,573 @@ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_ + +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/port.h" + +namespace tensorflow { + +// PendingCounts is an internal helper class to keep track of pending and +// dead counts for nodes, for use in the ExecutorState module. It +// holds a map from Handles to various counts for that handle. This +// information is needed per frame iteration. The amount of memory +// needed for an iteration is the same across all executions of the +// iteration. The memory amount and handles are precomputed at startup +// using a Layout object. +// +// PendingCounts::Layout layout; +// std::vector h(C); +// for (int id = 0; id < C; id++) { +// h[id] = r.AddHandle(max_pending[id], max_dead[id]); +// } +// +// When we actually want to start an iteration we first create a +// PendingCounts object and then index into it using the precomputed +// handles: + +// PendingCounts counts(layout); +// ... +// counts.decrement_pending(h[id], 1); +class PendingCounts { + public: + // The state machine for a node's execution. + enum NodeState { + // The pending count for the node > 0. 
+ PENDING_NOTREADY, + // The pending count for the node == 0, but the node has not + // started executing. + PENDING_READY, + // The node has started executing. + STARTED, + // The node has finished executing. + COMPLETED + }; + + // An opaque handle indicating where in the PendingCounts data structure + // the appropriate count information can be found. + class Handle; + // Given a node that needs to represent counts no larger than the + // specified "max_pending_count" and "max_dead_count", create a + // handle that can be passed to various PendingCounts routines + // to retrieve the count data for this node. + class Layout { + public: + Handle CreateHandle(size_t max_pending_count, size_t max_dead_count); + + private: + friend class PendingCounts; + int next_offset_ = 0; // Next byte offset to allocate + }; + + // Create a new PendingCounts object that can hold the state of + // all the Handles allocated from "final_allocator". + explicit PendingCounts(Layout layout) + : num_bytes_(layout.next_offset_), bytes_(new char[num_bytes_]()) { + if (num_bytes_ >= sizeof(LargeCounts)) { + CHECK_EQ(uintptr_t(bytes_) % alignof(LargeCounts), 0); + } + } + + // Create a new PendingCounts object with the same layout and counts + // as "other". + explicit PendingCounts(const PendingCounts& other) + : num_bytes_(other.num_bytes_), bytes_(new char[num_bytes_]) { + if (num_bytes_ >= sizeof(LargeCounts)) { + CHECK_EQ(uintptr_t(bytes_) % alignof(LargeCounts), 0); + } + memcpy(bytes_, other.bytes_, other.num_bytes_); + } + + ~PendingCounts() { delete[] bytes_; } + + void set_initial_count(Handle h, size_t pending_count) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + c.pending = pending_count; + c.dead_count = 0; + c.has_started = 0; + c_ptr->store(c, std::memory_order_relaxed); + } else { + DCHECK_LE(pending_count, kMaxCountForPackedCounts); + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + c.pending = pending_count; + c.dead_count = 0; + c.has_started = 0; + c_ptr->store(c, std::memory_order_relaxed); + } + } + + NodeState node_state(Handle h) { + if (h.is_large_) { + return NodeStateForStruct(Large(h)->load(std::memory_order_relaxed)); + } else { + return NodeStateForStruct(Packed(h)->load(std::memory_order_relaxed)); + } + } + void mark_started(Handle h) { + DCHECK_EQ(pending(h), 0); + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + DCHECK_EQ(c.has_started, 0); + c.has_started = 1; + c_ptr->store(c, std::memory_order_relaxed); + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + DCHECK_EQ(c.has_started, 0); + c.has_started = 1; + c_ptr->store(c, std::memory_order_relaxed); + } + } + void mark_completed(Handle h) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + DCHECK_EQ(c.has_started, 1); + c.pending = 1; + c_ptr->store(c, std::memory_order_relaxed); + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + DCHECK_EQ(c.has_started, 1); + c.pending = 1; + c_ptr->store(c, std::memory_order_relaxed); + } + } + int pending(Handle h) { + if (h.is_large_) { + LargeCounts c = Large(h)->load(std::memory_order_relaxed); + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + return c.pending; + } else { + // The pending count encodes the state once the node has + // started, so just return 0. 
+ return 0; + } + } else { + PackedCounts c = Packed(h)->load(std::memory_order_relaxed); + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + return c.pending; + } else { + // The pending count encodes the state once the node has + // started, so just return 0. + return 0; + } + } + } + struct AdjustResult { + int dead_count; + int pending_count; + + AdjustResult(int dead_count, int pending_count) + : dead_count(dead_count), pending_count(pending_count) {} + }; + int decrement_pending(Handle h, int v) { + DCHECK_GE(pending(h), v); + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + c.pending -= v; + c_ptr->store(c, std::memory_order_relaxed); + return c.pending; + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + c.pending -= v; + c_ptr->store(c, std::memory_order_relaxed); + return c.pending; + } + } + + // Mark a merge node as live + // REQUIRES: Node corresponding to "h" is a merge node + void mark_live(Handle h) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + c.pending &= ~static_cast(0x1); + c_ptr->store(c, std::memory_order_relaxed); + } + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + static_assert(7 == kMaxCountForPackedCounts, + "Live flag incorrect for max packed count"); + c.pending &= 0x6; + c_ptr->store(c, std::memory_order_relaxed); + } + } + } + + int dead_count(Handle h) { + int r = h.is_large_ ? Large(h)->load(std::memory_order_relaxed).dead_count + : Packed(h)->load(std::memory_order_relaxed).dead_count; + return r; + } + void increment_dead_count(Handle h) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + c.dead_count++; + c_ptr->store(c, std::memory_order_relaxed); + } + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + DCHECK_LT(c.dead_count, kMaxCountForPackedCounts); + c.dead_count++; + c_ptr->store(c, std::memory_order_relaxed); + } + } + } + + // Mark a merge node as live. Please note that the pending count it returns + // is before the update. + AdjustResult adjust_for_mark_live(Handle h) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto c = c_ptr->load(std::memory_order_relaxed); + auto ret_pending = 0; + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + ret_pending = c.pending; + c.pending &= ~static_cast(0x1); + c_ptr->store(c, std::memory_order_relaxed); + } + return AdjustResult(c.dead_count, ret_pending); + } else { + std::atomic* c_ptr = Packed(h); + auto c = c_ptr->load(std::memory_order_relaxed); + auto ret_pending = 0; + if (PENDING_NOTREADY == NodeStateForStruct(c)) { + static_assert(7 == kMaxCountForPackedCounts, + "Live flag incorrect for max packed count"); + ret_pending = c.pending; + c.pending &= 0x6; + c_ptr->store(c, std::memory_order_relaxed); + } + return AdjustResult(c.dead_count, ret_pending); + } + } + + // The same as the above, but performs the operation atomically. This + // is thread-safe to run concurrently with other threads. 
+ AdjustResult adjust_for_mark_live_atomic(Handle h) { + if (h.is_large_) { + std::atomic* c_ptr = Large(h); + auto old_val = c_ptr->load(std::memory_order_relaxed); + while (true) { + auto new_val = old_val; + auto ret_pending = 0; + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(new_val)) { + ret_pending = old_val.pending; + new_val.pending &= ~static_cast(0x1); + } + AdjustResult ret(old_val.dead_count, ret_pending); + if (TF_PREDICT_TRUE(c_ptr->compare_exchange_weak(old_val, new_val))) + return ret; + } + } else { + std::atomic* c_ptr = Packed(h); + auto old_val = c_ptr->load(std::memory_order_relaxed); + while (true) { + auto new_val = old_val; + auto ret_pending = 0; + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(new_val)) { + static_assert(7 == kMaxCountForPackedCounts, + "Live flag incorrect for max packed count"); + ret_pending = old_val.pending; + new_val.pending &= 0x6; + } + AdjustResult ret(old_val.dead_count, ret_pending); + if (TF_PREDICT_TRUE(c_ptr->compare_exchange_weak(old_val, new_val))) + return ret; + } + } + } + + // A streamlined routine that does several pieces of bookkeeping at + // once. Equivalent to: + // increment_dead_count(h); + // return {dead_count(h) pending(h)}; + AdjustResult adjust_for_increment_dead(Handle h) { + if (h.is_large_) { + return adjust_for_increment_dead_shared(Large(h)); + } else { + return adjust_for_increment_dead_shared(Packed(h)); + } + } + + // The same as the above, but performs the operation atomically. This + // is thread-safe to run concurrently with other threads. + AdjustResult adjust_for_increment_dead_atomic(Handle h) { + if (h.is_large_) { + return adjust_for_increment_dead_shared_atomic(Large(h)); + } else { + return adjust_for_increment_dead_shared_atomic(Packed(h)); + } + } + + // A streamlined routine that does several pieces of bookkeeping at + // once. Equivalent to: + // decrement_pending(h, decrement_pending); + // return {dead_count(h) pending(h)}; + AdjustResult adjust_for_decrement_pending(Handle h, int decrement_pending) { + DCHECK_GE(pending(h), decrement_pending); + if (h.is_large_) { + return adjust_for_decrement_pending_shared(Large(h), decrement_pending); + } else { + return adjust_for_decrement_pending_shared(Packed(h), decrement_pending); + } + } + + // The same as the above, but performs the operation atomically. This + // is thread-safe to run concurrently with other threads. + AdjustResult adjust_for_decrement_pending_atomic(Handle h, + int decrement_pending) { + DCHECK_GE(pending(h), decrement_pending); + if (h.is_large_) { + return adjust_for_decrement_pending_shared_atomic(Large(h), + decrement_pending); + } else { + return adjust_for_decrement_pending_shared_atomic(Packed(h), + decrement_pending); + } + } + + // A streamlined routine that does several pieces of bookkeeping at + // once. Equivalent to: + // if (increment_dead) increment_dead_count(h); + // decrement_pending(h, 1); + // return {dead_count(h), pending(h)}; + AdjustResult adjust_for_activation(Handle h, bool increment_dead) { + DCHECK_GE(pending(h), 1); + if (h.is_large_) { + return adjust_for_activation_shared(Large(h), increment_dead); + } else { + return adjust_for_activation_shared(Packed(h), increment_dead); + } + } + + // The same as the above, but performs the operation atomically. This + // is thread-safe to run concurrently with other threads. 
+ AdjustResult adjust_for_activation_atomic(Handle h, bool increment_dead) { + DCHECK_GE(pending(h), 1); + if (h.is_large_) { + return adjust_for_activation_shared_atomic(Large(h), increment_dead); + } else { + return adjust_for_activation_shared_atomic(Packed(h), increment_dead); + } + } + + class Handle { + public: + Handle() : byte_offset_(0), is_large_(0) {} + + private: + friend class PendingCounts; + int byte_offset_ : 31; // Byte offset of the rep in PendingCounts object + bool is_large_ : 1; // If true, rep is LargeCounts; otherwise PackedCounts + }; + + private: + template + inline AdjustResult adjust_for_increment_dead_shared(std::atomic* c) { + T val = c->load(std::memory_order_relaxed); + auto ret_pending = 0; + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(val)) { + val.dead_count++; + ret_pending = val.pending; + c->store(val, std::memory_order_relaxed); + } + return AdjustResult(val.dead_count, ret_pending); + } + + template + inline AdjustResult adjust_for_increment_dead_shared_atomic( + std::atomic* c) { + T old_val = c->load(std::memory_order_relaxed); + while (true) { + auto new_val = old_val; + auto ret_pending = 0; + // Only do anything if the node hasn't already started executing. + if (PENDING_NOTREADY == NodeStateForStruct(new_val)) { + ret_pending = new_val.pending; + new_val.dead_count++; + } + AdjustResult ret(new_val.dead_count, ret_pending); + if (TF_PREDICT_TRUE(c->compare_exchange_weak(old_val, new_val))) + return ret; + } + } + + template + inline AdjustResult adjust_for_decrement_pending_shared( + std::atomic* c, int decrement_pending) { + T val = c->load(std::memory_order_relaxed); + DCHECK_GE(val.pending, decrement_pending); + val.pending -= decrement_pending; + c->store(val, std::memory_order_relaxed); + return AdjustResult(val.dead_count, val.pending); + } + + template + inline AdjustResult adjust_for_decrement_pending_shared_atomic( + std::atomic* c, int decrement_pending) { + T old_val = c->load(std::memory_order_relaxed); + while (true) { + T new_val = old_val; + DCHECK_GE(new_val.pending, decrement_pending); + new_val.pending -= decrement_pending; + AdjustResult ret(new_val.dead_count, new_val.pending); + if (TF_PREDICT_TRUE(c->compare_exchange_weak(old_val, new_val))) + return ret; + } + } + + template + inline AdjustResult adjust_for_activation_shared(std::atomic* c, + bool increment_dead) { + T val = c->load(std::memory_order_relaxed); + if (increment_dead && PENDING_NOTREADY == NodeStateForStruct(val)) { + val.dead_count++; + } + DCHECK_GE(val.pending, 1); + val.pending--; + c->store(val, std::memory_order_relaxed); + return AdjustResult(val.dead_count, val.pending); + } + + template + inline AdjustResult adjust_for_activation_shared_atomic(std::atomic* c, + bool increment_dead) { + T old_val = c->load(std::memory_order_relaxed); + while (true) { + T new_val = old_val; + if (increment_dead && PENDING_NOTREADY == NodeStateForStruct(new_val)) { + new_val.dead_count++; + } + DCHECK_GE(new_val.pending, 1); + new_val.pending--; + AdjustResult ret(new_val.dead_count, new_val.pending); + if (TF_PREDICT_TRUE(c->compare_exchange_weak(old_val, new_val))) + return ret; + } + } + + // We keep track of the pending count and dead input count for each + // graph node. The representation used here is designed to be cache + // efficient for graphs with large numbers of nodes, where most + // nodes have relatively small maximum pending counts (e.g. 
for one + // LSTM model, 99% of 5000+ nodes had in-degrees of 3 or less). We + // use one byte to hold both the pending and dead count for a node + // where these together can fit in one byte, and we use a hash table + // to handle the rare node ids that need larger counts than this. + // Each frame in this subgraph has its own PendingCounts. + + // We use 3 bits each for dead_count and pending. + static constexpr int kMaxCountForPackedCounts = 7; + + // Most counts are small, so we pack a pending count and a dead + // count into 3 bits each, use 1 bit to indicate that the node has + // started computing. + struct PackedCounts { + uint8 pending : 3; + uint8 dead_count : 3; + uint8 has_started : 1; + }; + + // NOTE: alignas(8) is critical to implement efficient atomic + // on MSVC. + struct alignas(8) LargeCounts { + uint32 pending; + uint32 dead_count : 31; + // NOTE(tlipcon): MSVC won't pack this struct into 8 bytes unless + // all of the member types are uint32. + uint32 has_started : 1; + }; + + template + NodeState NodeStateForStruct(const T& c) const { + if (c.has_started) { + return (c.pending == 0) ? STARTED : COMPLETED; + } else { + return (c.pending == 0) ? PENDING_READY : PENDING_NOTREADY; + } + } + inline std::atomic* Large(Handle h) { + DCHECK(h.is_large_); + DCHECK_LE(h.byte_offset_ + sizeof(std::atomic), num_bytes_); + DCHECK_EQ(h.byte_offset_ % alignof(std::atomic), 0); + return reinterpret_cast*>(bytes_ + h.byte_offset_); + } + inline std::atomic* Packed(Handle h) { + DCHECK(!h.is_large_); + DCHECK_LE(h.byte_offset_ + sizeof(PackedCounts), num_bytes_); + return reinterpret_cast*>(bytes_ + + h.byte_offset_); + } + + const int num_bytes_; // Just for bounds checking in debug mode + char* bytes_; // Array of num_bytes_ bytes + + void operator=(const PendingCounts&) = delete; +}; + +inline PendingCounts::Handle PendingCounts::Layout::CreateHandle( + size_t max_pending_count, size_t max_dead_count) { + Handle result; + if ((max_pending_count > kMaxCountForPackedCounts) || + (max_dead_count > kMaxCountForPackedCounts)) { + constexpr int B = sizeof(std::atomic); + // Round byte offset to proper alignment + static_assert( + sizeof(std::atomic) >= alignof(std::atomic), + "std::atomic must be packed"); + int64_t offset = ((static_cast(next_offset_) + B - 1) / B) * B; + result.byte_offset_ = offset; + result.is_large_ = true; + next_offset_ = result.byte_offset_ + B; + } else { + result.byte_offset_ = next_offset_; + result.is_large_ = false; + static_assert(sizeof(std::atomic) == 1, + "std::atomic should be a single byte"); + next_offset_ += sizeof(std::atomic); + } + return result; +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/permuter.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/permuter.h new file mode 100644 index 00000000..57704dd1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/permuter.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
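Expanding the usage outline in the PendingCounts class comment into a compilable sketch (the comment's r.AddHandle corresponds to the declared Layout::CreateHandle); the node counts below are illustrative, and this is not part of the vendored header.

    // Sketch only: precompute handles with a Layout, then drive per-iteration counts.
    #include <vector>

    #include "tensorflow/core/common_runtime/pending_counts.h"

    void PendingCountsSketch() {
      using tensorflow::PendingCounts;

      constexpr int kNumNodes = 3;
      const int max_pending[kNumNodes] = {1, 2, 9};  // 9 forces a LargeCounts slot.
      const int max_dead[kNumNodes] = {0, 1, 0};

      PendingCounts::Layout layout;
      std::vector<PendingCounts::Handle> h(kNumNodes);
      for (int id = 0; id < kNumNodes; ++id) {
        h[id] = layout.CreateHandle(max_pending[id], max_dead[id]);
      }

      // One PendingCounts per frame iteration, all sharing the same layout.
      PendingCounts counts(layout);
      for (int id = 0; id < kNumNodes; ++id) {
        counts.set_initial_count(h[id], max_pending[id]);
      }
      // When an input of node 0 completes, its pending count drops toward zero.
      counts.decrement_pending(h[0], 1);
    }
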
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PERMUTER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PERMUTER_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +class Device; + +// Implementation of collective permute. +// +// Permute takes +// - a list of devices participating in the collective +// - a permutation as a list of integers. +// - a tensor +// +// The list of devices replaces the need for group_key and group_size. The +// number of inputs only scales with the number of devices within one group. +// +// The integers in the permutation are based on indices of the list of devices. +// E.g. devices = {"GPU:0", "GPU:1"} and permutation = {1,0} means +// - devices[0] sends to devices[permutation[0]] and +// - devices[1] sends to devices[permutation[1]]. +// +// Each device sends exactly one tensor and receives exactly one tensor. +class Permuter : public CollectiveImplementationInterface { + public: + Permuter(); + ~Permuter() override = default; + + void Run(StatusCallback done) override; + + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override { + return absl::OkStatus(); + } + + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. Also saves the CollectiveContext in this object. + absl::Status InitializeCollectiveContext( + std::shared_ptr col_ctx) override; + + private: + std::shared_ptr col_ctx_; + const CollectiveParams* col_params_; // Not owned + StatusCallback done_; + mutex mu_; + absl::Status status_ TF_GUARDED_BY(mu_); + int counter_ TF_GUARDED_BY(mu_); + + void DispatchSend(int src_rank, int target_rank, const Tensor* tensor, + const StatusCallback& done); + + void DispatchRecv(int src_rank, int target_rank, Tensor* tensor, + const StatusCallback& done); + + // Atomically increments counter_ by one for sending, one for receiving. + // Invokes done when counter_ reaches 2. + // The purpose of checking counter_ is to ensure that done_ is called once. + StatusCallback CheckCounterAndCallDone(); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PERMUTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer.h new file mode 100644 index 00000000..d7b89fd3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer.h @@ -0,0 +1,112 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
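A standalone sketch of the permutation convention documented for Permuter (devices[i] sends to devices[permutation[i]]); the device names and ring permutation are illustrative, and this is not part of the vendored header.

    // Sketch only: resolving send targets under the documented permutation rule.
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      const std::vector<std::string> devices = {"GPU:0", "GPU:1", "GPU:2"};
      const std::vector<int> permutation = {1, 2, 0};  // A ring: 0->1, 1->2, 2->0.

      for (int i = 0; i < static_cast<int>(devices.size()); ++i) {
        // Each rank sends exactly one tensor and receives exactly one tensor.
        std::printf("%s sends to %s\n", devices[i].c_str(),
                    devices[permutation[i]].c_str());
      }
      return 0;
    }
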
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_ + +#include + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +// A placement algorithm that assigns the nodes of the given Graph to +// devices the given DeviceSet, respecting the following constraints: +// +// 1. Existing device assignments remain unchanged. +// 2. Requested (partial or complete) device specifications given by device name +// for each node are granted. +// 3. Nodes connected by edges of a reference type are colocated on +// the same device. +// 4. Given nodes "A" and "B", if node "B" has a colocation group +// "@loc:A", nodes "A" and "B" will be colocated on the same device. +// +// The implementation builds a constraint graph with the same set of +// nodes, and edges that represent colocation constraints between +// nodes. Each connected component in the resulting constraint graph +// is then assigned to a set of valid devices. +// +// Run() will finally assign the device to each node given the list of +// possible devices. +// +// TODO(mrry): "Soft" constraints, such as "place node 'x' as close as +// possible to node 'y' while respecting the other constraints"? +// TODO(mrry): Create a common interface for this and the other +// placement algorithms so that they may be injected into the graph +// builder. +class Placer { + public: + // Creates an instance of the Placer algorithm for the given + // Graph "graph" (nodes in which may or may not be assigned) on the + // given DeviceSet "devices". + // "function_name" should be set to the name of the function whose body is + // represented by "graph". If "graph" is not representing a function body, + // "function_name" should be empty. + // + // If non-null, default_local_device is used where possible as a placement for + // nodes which do not have a device specified, ahead of other devices which + // would otherwise be higher priority. default_local_device should be on the + // local host so that its FLR is directly accessible by the current process. + // + // The "graph", "devices", and "default_local_device" pointer arguments are + // borrowed by this Placer, and must outlive it. + Placer(Graph* graph, const string& function_name, + const FunctionLibraryDefinition* flib_def, const DeviceSet* devices, + const Device* default_local_device, bool allow_soft_placement, + bool log_device_placement); + Placer(Graph* graph, const string& function_name, + const FunctionLibraryDefinition* flib_def, const DeviceSet* devices); + Placer(Graph* graph, const string& function_name, + const FunctionLibraryDefinition* flib_def, const DeviceSet* devices, + const Device* default_local_device); + + ~Placer(); + + // Assigns each node in this Placer's graph to a device in its + // set of devices. + // + // This method is not thread-safe. + // Run() may be invoked at most once. + absl::Status Run(); + absl::Status Run(const GraphOptimizationPassOptions& options); + + private: + // Returns true if the device type of 'candidate_device_name' is + // found in 'devices'. 
+ bool CanAssignToDevice(const string& candidate_device_name, + const std::vector& devices) const; + + Graph* const graph_; // Not owned. + const string function_name_; + const FunctionLibraryDefinition* const flib_def_; // Not owned. + const DeviceSet* const devices_; // Not owned. + const Device* default_local_device_; // Not owned. + const bool allow_soft_placement_; + const bool log_device_placement_; + + Placer(const Placer&) = delete; + void operator=(const Placer&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h new file mode 100644 index 00000000..4f8982d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h @@ -0,0 +1,157 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_ + +// Operations calling functions are becoming ubiquitous in TF 2.0. +// Examples include PartitionedCallOp, functional If/While, and Dataset ops. +// Such operations might require deep inspection - looking at the body of the +// called function - to place them and surrounding ops correctly. + +// This file contains some utilities for placer to correctly place such ops +// including: +// - PlacerInspectionRequiredOpChecker: A simple class with a single +// IsPlacerInspectionRequired method. +// - IsolatePlacerInspectionRequiredOps: This function adds Identity ops for +// each input/output of ops requiring placer inspection. It greatly simplifies +// the implementation of placing such ops. + +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// PlacerInspectionRequiredOpChecker allows one to check if Placer needs to +// look deeply into the op to place ops consuming the outputs correctly. +// +// It is a class instead of a standalone method because checking whether +// a function returns a resource takes non-trivial time and we cache the +// results. +class PlacerInspectionRequiredOpChecker { + public: + // Constructs a PlacerInspectionRequiredOpChecker for nodes of `graph`. + // The functions referenced by nodes in `graph` will be looked up in + // `flib_def` + PlacerInspectionRequiredOpChecker(const Graph* graph, + const FunctionLibraryDefinition* flib_def); + + // If `node` is considered a deep op, sets `*is_deep` to true and returns + // OkStatus(). If an error occurs, returns that error, and the value of + // `*is_deep` is undefined. 
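Usage sketch for Placer (not part of the vendored header), using its minimal constructor; graph, flib_def, and devices are assumed to come from the surrounding runtime and must outlive the Placer.

    // Sketch only: place every node of an already-built graph.
    #include "tensorflow/core/common_runtime/placer.h"

    absl::Status PlaceGraph(tensorflow::Graph* graph,
                            const tensorflow::FunctionLibraryDefinition* flib_def,
                            const tensorflow::DeviceSet* devices) {
      // Empty function name: this graph does not represent a function body.
      tensorflow::Placer placer(graph, /*function_name=*/"", flib_def, devices);
      // Run() may be invoked at most once per Placer instance.
      return placer.Run();
    }
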
+ // Currently, an op is considered deep, if it is a calling a function + // returning a resource. This definition is driven by Placer's need to + // look inside the op. + // REQUIRES: `node` is part of `graph` passed into constructor. + absl::Status IsPlacerInspectionRequired(const Node& node, bool* is_deep); + + private: + const Graph& graph_; + const FunctionLibraryDefinition& flib_def_; + // Indexed by the node id. + // If cache_[node_id] is empty, the deepness of the node with id `node_id` has + // not been computed yet. Else, it contains the value already computed. + std::vector> cache_; +}; + +// Extracts `fdef` and `func` from `flib_def` for the function identified +// in "f" attribute of `node`. +absl::Status GetFunctionDefAndAttrs(const FunctionLibraryDefinition& flib_def, + const Node& node, + core::RefCountPtr* fdef, + NameAttrList* func); + +// The "call" stack of functions. +// Useful for better error messages as well as for detecting recursion. +// Stores references to graph nodes. These references must outlive this. +class FunctionStack { + public: + explicit FunctionStack(const string& function_name); + + // `node_in_current_function` must outlive this. + FunctionStack Push(const Node* node_in_current_function, + const string& new_current_function) const; + + // Returns true iff this stack already includes `function_name`. + bool HasFunction(const string& function_name) const; + + const string& current_function_name() const { return current_function_name_; } + + // Format's this suitable for error interpolation that retrieves + // Python files and line numbers. + string FormatForError() const; + + private: + struct Frame { + Frame(const string& function, const Node* node) + : function_name(function), node(node) {} + + string function_name; + const Node* node; + }; + + // The function at the top of the stack. In other words, the function + // that is currently being inspected for placement. + string current_function_name_; + + // The stack of frames that got the placement to the current_function_name_. + // frames_[0].function_name is the top function that Placer was constructed + // with. frames_[0].function_name can be empty if placer was constructed with + // a nameless graph, not a function. frames_[0].node_name is a name of a node + // in frames_[0].function_name that required deep inspection (e.g. a + // PartitionedCallOp). The function that this node invoked is + // frames_[1].function_name, if frames_.size() > 1. Else, the function that + // this node invoked is current_function_name_. 
+ std::vector frames_; +}; + +// Adds Identities for each input and output of function-calling ops in `graph` +// +// For example, the following graph calling a function on inputs `a` and `b` +// and producing output `y` will be rewritten to include identities on all +// edges: +// +// a b +// | | +// v v +// f (PartitionedCallOp) +// | +// v +// y +// +// is transformed to +// +// a b +// | | +// a_f (Identity) b_f (Identity) +// | | +// v v +// f (PartitionedCallOp) +// | +// f_y (Identity) +// | +// v +// y +// +absl::Status IsolatePlacerInspectionRequiredOps( + const FunctionLibraryDefinition& flib_def, Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_INSPECTION_REQUIRED_OPS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h new file mode 100644 index 00000000..bfcbc16d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h @@ -0,0 +1,122 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "xla/stream_executor/stream_executor.h" +#include "tensorflow/core/common_runtime/device/device_event_mgr.h" +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/common_runtime/device/device_id_manager.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/pluggable_device/pluggable_device_context.h" +#include "tensorflow/core/common_runtime/shared_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class PluggableDevice : public LocalDevice { + public: + PluggableDevice(const SessionOptions& options, const std::string& name, + const string& device_type, const string& platform_name, + Bytes memory_limit, const DeviceLocality& locality, + TfDeviceId tf_device_id, + const std::string& physical_device_desc, + Allocator* device_allocator, Allocator* cpu_allocator, + bool sync_every_op); + 
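Usage sketch for the placer-inspection utilities above (not part of the vendored header): isolate function-calling ops first, then query the checker node by node; the CheckDeepOps wrapper is hypothetical.

    // Sketch only: isolating placer-inspection-required ops, then checking nodes.
    #include "tensorflow/core/common_runtime/placer_inspection_required_ops_utils.h"
    #include "tensorflow/core/platform/errors.h"  // TF_RETURN_IF_ERROR

    absl::Status CheckDeepOps(tensorflow::Graph* graph,
                              const tensorflow::FunctionLibraryDefinition& flib_def) {
      // Wrap inputs/outputs of function-calling ops in Identity nodes first.
      TF_RETURN_IF_ERROR(
          tensorflow::IsolatePlacerInspectionRequiredOps(flib_def, graph));

      tensorflow::PlacerInspectionRequiredOpChecker checker(graph, &flib_def);
      for (tensorflow::Node* node : graph->nodes()) {
        bool is_deep = false;
        TF_RETURN_IF_ERROR(checker.IsPlacerInspectionRequired(*node, &is_deep));
        if (is_deep) {
          // Placer will need to look inside the called function for this node.
        }
      }
      return absl::OkStatus();
    }
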
+ ~PluggableDevice() override; + + // Initialize the device and return the status of initialization. + absl::Status Init(const SessionOptions& options); + + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + + absl::Status Sync() override; + + Allocator* GetAllocator(AllocatorAttributes attr) override; + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) override; + + // The executor that provides control for the pluggable device; + se::StreamExecutor* executor() const { return executor_; } + + private: + Allocator* device_allocator_; + Allocator* cpu_allocator_; + + se::StreamExecutor* executor_ = nullptr; + struct StreamGroup { + se::Stream* compute = nullptr; + se::Stream* host_to_device = nullptr; + se::Stream* device_to_host = nullptr; + absl::InlinedVector device_to_device; + }; + + class StreamGroupFactory; + + StreamGroup* stream_; + PluggableDeviceContext* device_context_; + // TODO(penpornk): Investigate renaming `GpuDeviceInfo` to `DeviceInfo`. + DeviceBase::AcceleratorDeviceInfo* pluggable_device_info_ = nullptr; + TfDeviceId tf_device_id_; + const string platform_name_; + const bool sync_every_op_ = false; + EventMgr* em_ = nullptr; + std::unique_ptr thread_pool_; + bool force_gpu_compatible_ = false; + std::string ComputeOpKernelDebugString(const OpKernel& op_kernel, + int stream_id); + + // This method returns an initialization status, in addition to + // calling the "done" StatusCallback, if there is a failure to + // allocate memory or if the tensor "from" is not DMA-copyable. + // If there is no error prior to enqueueing the copy, an OK status + // is returned. + absl::Status MaybeCopyTensorToPluggableDevice( + const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to, + StatusCallback done); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h new file mode 100644 index 00000000..898e3834 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_BFC_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_BFC_ALLOCATOR_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/bfc_allocator.h" +#include "tensorflow/core/common_runtime/device/device_mem_allocator.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// A PluggableDevice memory allocator that implements a 'best-fit with +// coalescing' algorithm +class PluggableDeviceBFCAllocator : public BFCAllocator { + public: + PluggableDeviceBFCAllocator(DeviceMemAllocator* sub_allocator, + size_t total_memory, const string& name, + bool force_memory_growth_requested); + PluggableDeviceBFCAllocator(DeviceMemAllocator* sub_allocator, + size_t total_memory, + const GPUOptions& gpu_options, const string& name, + bool force_memory_growth_requested); + ~PluggableDeviceBFCAllocator() override = default; + + PluggableDeviceBFCAllocator(const PluggableDeviceBFCAllocator&) = delete; + void operator=(const PluggableDeviceBFCAllocator&) = delete; + + private: + static bool GetAllowGrowthValue(const GPUOptions& gpu_options, + bool force_memory_growth_requested); + static bool GetGarbageCollectionValue(); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_BFC_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_context.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_context.h new file mode 100644 index 00000000..596341fd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_context.h @@ -0,0 +1,93 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_CONTEXT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_CONTEXT_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace stream_executor { +class Stream; +} // namespace stream_executor + +namespace tensorflow { + +class PluggableDeviceContext : public DeviceContext { + public: + // Does not take ownership of streams. 
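+  // Construction sketch (the stream pointers are assumed to come from the
+  // owning PluggableDevice's stream group; the stream_id of 0 is an
+  // illustrative choice):
+  //
+  //   auto* ctx = new PluggableDeviceContext(
+  //       /*stream_id=*/0, compute_stream, host_to_device_stream,
+  //       device_to_host_stream, device_to_device_streams);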
+ PluggableDeviceContext( + int stream_id, se::Stream* stream, se::Stream* host_to_device_stream, + se::Stream* device_to_host_stream, + absl::InlinedVector device_to_device_stream) + : stream_id_(stream_id), + stream_(stream), + host_to_device_stream_(host_to_device_stream), + device_to_host_stream_(device_to_host_stream), + device_to_device_stream_(device_to_device_stream) {} + + ~PluggableDeviceContext() override = default; + + se::Stream* stream() const override { return stream_; } + se::Stream* host_to_device_stream() const { return host_to_device_stream_; } + se::Stream* device_to_host_stream() const { return device_to_host_stream_; } + se::Stream* device_to_device_stream(int index) const { + return device_to_device_stream_[index % device_to_device_stream_.size()]; + } + int stream_id() const { return stream_id_; } + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override; + + void MaintainLifetimeOnStream(const Tensor* t, + se::Stream* stream) const override {} + + absl::Status ThenExecute(Device* device, se::Stream* stream, + std::function func) override; + + bool IsPluggableDevice() override; + + private: + int stream_id_; + // The default primary stream to use for this context. + // All the memory belongs to this stream. + se::Stream* stream_; + // The stream to use for copying data from host into PluggableDevice. + se::Stream* host_to_device_stream_; + // The stream to use for copying data from PluggableDevice to host. + se::Stream* device_to_host_stream_; + // Streams to use for copying data between PluggableDevices. + absl::InlinedVector device_to_device_stream_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h new file mode 100644 index 00000000..3f6ab10f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h @@ -0,0 +1,66 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_FACTORY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_FACTORY_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +class PluggableDeviceFactory : public DeviceFactory { + public: + PluggableDeviceFactory(const string& device_type, + const string& platform_name); + absl::Status ListPhysicalDevices(std::vector* devices) override; + absl::Status CreateDevices( + const SessionOptions& options, const std::string& name_prefix, + std::vector>* devices) override; + absl::Status GetDeviceDetails( + int device_index, std::unordered_map* details) override; + + private: + // Populates *device_localities with the DeviceLocality descriptor for + // every TfDeviceId. + absl::Status GetDeviceLocalities( + int num_tf_devices, std::vector* device_localities); + // Create a PluggableDevice associated with 'tf_device_id', allocates + // (strictly) 'memory_limit' bytes of PluggableDevice memory to it, and adds + // it to the 'devices' vector. + absl::Status CreatePluggableDevice( + const SessionOptions& options, const std::string& name_prefix, + TfDeviceId tf_device_id, int64_t memory_limit, + const DeviceLocality& dev_locality, + std::vector>* devices); + + const string device_type_; + const string platform_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h new file mode 100644 index 00000000..b77917d1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_INIT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_INIT_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace stream_executor { +class Platform; +} // namespace stream_executor + +namespace tensorflow { + +// Initializes the PluggableDevice platform and returns OK if the +// PluggableDevice platform could be initialized. +absl::Status ValidatePluggableDeviceMachineManager(const string& platform_name); + +// Returns the PluggableDevice machine manager singleton, creating it and +// initializing the PluggableDevices on the machine if needed the first time it +// is called. Must only be called when there is a valid PluggableDevice +// environment in the process (e.g., ValidatePluggableDeviceMachineManager() +// returns OK). +stream_executor::Platform* PluggableDeviceMachineManager( + const string& platform_name); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.h new file mode 100644 index 00000000..9676a706 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PLUGIN_INIT_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PLUGIN_INIT_H_ + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +absl::Status RegisterPluggableDevicePlugin(void* library_filename); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PLUGIN_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h new file mode 100644 index 00000000..0c396588 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h @@ -0,0 +1,128 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PROCESS_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PROCESS_STATE_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/common_runtime/process_state.h" +#include "tensorflow/core/common_runtime/shared_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class PluggableDeviceBFCAllocator; +class PluggableDeviceSimpleAllocator; +class PoolAllocator; + +// Singleton that manages per-process state when PluggableDevices are present. +class PluggableDeviceProcessState { + public: + // Singleton that manages each platform's per-process state. e.g. allocation + // of shared resource. + static PluggableDeviceProcessState* singleton(const string& device_type, + const string& platform_name); + + // Query whether any PluggableDevice has been created so far. + // Disable thread safety analysis since a race is benign here. + bool HasPluggableDevice() const TF_NO_THREAD_SAFETY_ANALYSIS { + return pluggable_device_enabled_; + } + + // Set the flag to indicate a PluggableDevice has been created. + // Disable thread safety analysis since a race is benign here. + void EnablePluggableDevice() TF_NO_THREAD_SAFETY_ANALYSIS { + pluggable_device_enabled_ = true; + } + + // Returns the one PluggableDevice allocator used for the indexed + // PluggableDevice. Note that this is a system PluggableDevice index. + // + // 'total_bytes' is the total number of bytes that should be made + // available to the allocator. The first call to this function for + // a given tf_device_id creates the allocator, so only the + // total_bytes used on that first call is used. + // + // 'allocator_type' describes the type of algorithm to use for the + // underlying allocator. REQUIRES: Must be a valid type (see + // config.proto for the list of supported strings.). + // + // REQUIRES: tf_device_id must be a valid id for a PluggableDevice + // available in the current system environment. Otherwise returns nullptr. + virtual Allocator* GetPluggableDeviceAllocator(const GPUOptions& options, + TfDeviceId tf_device_id, + size_t total_bytes); + + int NumPluggableDeviceAllocators() { + mutex_lock l(mu_); + return pluggable_device_allocators_.size(); + } + + virtual Allocator* GetPluggableDeviceHostAllocator(int numa_node); + + // Returns bus_id for the given PluggableDevice id. + virtual int BusIdForPluggableDevice(TfDeviceId tf_device_id); + + protected: + // PluggableDeviceProcessState is a singleton that should not normally be + // deleted except at process shutdown. 
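+  // Typical lookup sketch (the device/platform names and the 1 GiB byte
+  // budget are illustrative assumptions):
+  //
+  //   auto* state = PluggableDeviceProcessState::singleton("MY_DEVICE",
+  //                                                        "MY_PLATFORM");
+  //   Allocator* allocator = state->GetPluggableDeviceAllocator(
+  //       GPUOptions(), TfDeviceId(0), /*total_bytes=*/1ULL << 30);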
+ PluggableDeviceProcessState(const string& device_type, + const string& platform_name); + virtual ~PluggableDeviceProcessState() = default; + + ProcessState::MDMap* mem_desc_map() { + if (process_state_) return &process_state_->mem_desc_map_; + return nullptr; + } + + static PluggableDeviceProcessState* instance_; + ProcessState* process_state_; // Not owned. + bool pluggable_device_enabled_; + const string device_type_; + const string platform_name_; + mutex mu_; + + struct AllocatorParts { + std::unique_ptr allocator; + Allocator* device_allocator; + SubAllocator* sub_allocator; // owned by allocator + }; + + std::vector pluggable_device_allocators_ TF_GUARDED_BY(mu_); + std::vector> pluggable_device_visitors_ + TF_GUARDED_BY(mu_); + + std::vector pluggable_device_host_allocators_ + TF_GUARDED_BY(mu_); + std::vector> + pluggable_device_host_alloc_visitors_ TF_GUARDED_BY(mu_); + std::vector> + pluggable_device_host_free_visitors_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_PROCESS_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h new file mode 100644 index 00000000..7cddbfb6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h @@ -0,0 +1,58 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_SIMPLE_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_SIMPLE_ALLOCATOR_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device/device_mem_allocator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class PluggableDeviceSimpleAllocator : public Allocator { + public: + explicit PluggableDeviceSimpleAllocator(DeviceMemAllocator* sub_allocator); + ~PluggableDeviceSimpleAllocator() override = default; + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + + bool TracksAllocationSizes() const override { return false; } + string Name() override { return "Simple allocator"; } + std::optional GetStats() override; + + AllocatorMemoryType GetMemoryType() const override { + return sub_allocator_->GetMemoryType(); + } + + private: + PluggableDeviceSimpleAllocator(const PluggableDeviceSimpleAllocator&) = + delete; + void operator=(const PluggableDeviceSimpleAllocator&) = delete; + std::unique_ptr sub_allocator_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_SIMPLE_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_util.h new file mode 100644 index 00000000..7d5f1e2a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pluggable_device/pluggable_device_util.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_UTIL_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { + +class RecvTensorResponse; +class TensorProto; + +class PluggableDeviceUtil { + public: + // Copies the data in 'device_tensor' into 'cpu_tensor'. + // 'device_tensor''s backing memory must be on 'device' and + // 'cpu_tensor' must be allocated to be of the same size as + // 'device_tensor'. 
Synchronous: may block. + static void CopyPluggableDeviceTensorToCPU( + Device* device, const DeviceContext* device_context, + const Tensor* device_tensor, Tensor* cpu_tensor, StatusCallback done); + // Blocks until all operations queued on the stream associated with + // 'device' at the time of the call have completed. Returns any + // error pending on the stream at completion. + static absl::Status Sync(Device* device); + + // Blocks until all operations queued on all streams associated with the + // corresponding 'device' at the time of call have completed. + // Returns any error pending on the stream at completion. + static absl::Status SyncAll(Device* device); + + static void CopyCPUTensorToPluggableDevice( + const Tensor* cpu_tensor, const DeviceContext* device_context, + Device* device, Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute); + + static void DeviceToDeviceCopy( + DeviceContext* send_dev_context, DeviceContext* recv_dev_context, + Device* src, Device* dst, AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output, + int dev_to_dev_stream_index, StatusCallback done); + + // Deep-copying of PluggableDevice tensor on the same device. + // 'src_device_tensor''s and 'dst_device_tensor''s backing memory must be on + // 'device' and 'dst_cpu_tensor' must be allocated to be of the same + // size as 'src_device_tensor'. + static void CopyPluggableDeviceTensorToSameDevice( + Device* device, const DeviceContext* device_context, + const Tensor* src_device_tensor, Tensor* dst_device_tensor, + StatusCallback done); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PLUGGABLE_DEVICE_PLUGGABLE_DEVICE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/pool_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pool_allocator.h new file mode 100644 index 00000000..6ce3b788 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/pool_allocator.h @@ -0,0 +1,181 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ + +// Simple LRU pool allocators for various flavors of CPU RAM. + +#include +#include +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Interface of an object that rounds up integers. +class RoundUpInterface { + public: + virtual ~RoundUpInterface() {} + virtual size_t RoundUp(size_t num_bytes) = 0; +}; + +// Size-limited pool of memory buffers obtained from a SubAllocator +// instance. Pool eviction policy is LRU. 
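+// Assembly sketch using the BasicCPUAllocator and Pow2Rounder declared below
+// (the pool size limit of 100 and the pool name are illustrative choices):
+//
+//   PoolAllocator pool(/*pool_size_limit=*/100, /*auto_resize=*/true,
+//                      new BasicCPUAllocator(/*numa_node=*/0, {}, {}),
+//                      new Pow2Rounder, "cpu_pool");
+//   void* buf = pool.AllocateRaw(/*alignment=*/64, /*num_bytes=*/1 << 20);
+//   pool.DeallocateRaw(buf);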
+class PoolAllocator : public Allocator { + public: + // "pool_size_limit" is the maximum number of returned, re-usable + // memory buffers to keep in the pool. If pool_size_limit == 0, the + // pool is effectively a thin wrapper around the allocator. + // If "auto_resize" is true, then the pool_size_limit will gradually + // be raised so that deallocations happen very rarely, if at all. + // Transitory start-up objects may deallocate, but the long-term + // working-set should not. Auto-resizing can raise pool_size_limit + // but will never lower it. + // "allocator" is the object that performs the underlying memory + // malloc/free operations. This object takes ownership of allocator. + PoolAllocator(size_t pool_size_limit, bool auto_resize, + SubAllocator* allocator, RoundUpInterface* size_rounder, + string name); + ~PoolAllocator() override; + + string Name() override { return name_; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + + void DeallocateRaw(void* ptr) override; + + // Allocate an unused memory region of size "num_bytes". Fetch from + // the pool if available, otherwise call allocator_. + void* Get(size_t num_bytes); + + // Return a no-longer needed memory region to the pool. It is an error + // to deference "ptr" after this call. If the pool is full, the least + // recently used region will be deallocated. + void Put(void* ptr, size_t num_bytes); + + // Reset the pool to empty. + void Clear(); + + // The following accessors permit monitoring the effectiveness of + // the pool at avoiding repeated malloc/frees on the underlying + // allocator. Read locks are not taken on the theory that value + // consistency with other threads is not important. + + // Number of Get() requests satisfied from pool. + int64_t get_from_pool_count() const TF_NO_THREAD_SAFETY_ANALYSIS { + return get_from_pool_count_; + } + // Number of Put() requests. + int64_t put_count() const TF_NO_THREAD_SAFETY_ANALYSIS { return put_count_; } + // Number of Get() requests requiring a fresh allocation. + int64_t allocated_count() const TF_NO_THREAD_SAFETY_ANALYSIS { + return allocated_count_; + } + // Number of pool evictions. + int64_t evicted_count() const TF_NO_THREAD_SAFETY_ANALYSIS { + return evicted_count_; + } + // Current size limit. + size_t size_limit() const TF_NO_THREAD_SAFETY_ANALYSIS { + return pool_size_limit_; + } + + AllocatorMemoryType GetMemoryType() const override { + return allocator_->GetMemoryType(); + } + + private: + struct PtrRecord { + void* ptr; + size_t num_bytes; + PtrRecord* prev; + PtrRecord* next; + }; + + // Remove "pr" from the double-linked LRU list. + void RemoveFromList(PtrRecord* pr) TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Add "pr" to the head of the double-linked LRU list. + void AddToList(PtrRecord* pr) TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Delete the least recently used record. + void EvictOne() TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + const string name_; + const bool has_size_limit_; + const bool auto_resize_; + size_t pool_size_limit_; + std::unique_ptr allocator_; + std::unique_ptr size_rounder_; + mutex mutex_; + std::multimap pool_ TF_GUARDED_BY(mutex_); + PtrRecord* lru_head_ TF_GUARDED_BY(mutex_) = nullptr; + PtrRecord* lru_tail_ TF_GUARDED_BY(mutex_) = nullptr; + int64_t get_from_pool_count_ TF_GUARDED_BY(mutex_) = 0; + int64_t put_count_ TF_GUARDED_BY(mutex_) = 0; + int64_t allocated_count_ TF_GUARDED_BY(mutex_) = 0; + int64_t evicted_count_ TF_GUARDED_BY(mutex_) = 0; +}; + +// Do-nothing rounder. Passes through sizes unchanged. 
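+// (For contrast, the Pow2Rounder below maps a 600-byte request up to 1024
+// bytes, the next power of two.)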
+class NoopRounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { return num_bytes; } +}; + +// Power of 2 rounder: rounds up to nearest power of 2 size. +class Pow2Rounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { + return 1uLL << Log2Ceiling64(num_bytes); + } +}; + +class BasicCPUAllocator : public SubAllocator { + public: + BasicCPUAllocator(int numa_node, const std::vector& alloc_visitors, + const std::vector& free_visitors) + : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {} + + ~BasicCPUAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes, + size_t* bytes_received) override; + + void Free(void* ptr, size_t num_bytes) override; + + bool SupportsCoalescing() const override { return false; } + + AllocatorMemoryType GetMemoryType() const override { + return AllocatorMemoryType::kHostPageable; + } + + private: + int numa_node_; + + BasicCPUAllocator(const BasicCPUAllocator&) = delete; + void operator=(const BasicCPUAllocator&) = delete; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_function_library_runtime.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_function_library_runtime.h new file mode 100644 index 00000000..0b3b9dc0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -0,0 +1,545 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/composite_device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/stats_publisher_interface.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tsl/platform/thread_annotations.h" + +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" +#endif // !IS_MOBILE_PLATFORM + +namespace tensorflow { + +class FunctionArgsInterface { + public: + virtual ~FunctionArgsInterface() {} + + virtual bool HasRemoteOrPackedInputs() const = 0; + + virtual absl::Status GetLocalArg(const FunctionArgIndex& index, + Tensor* val) const = 0; + + virtual std::vector GetLocalTensors() const = 0; + +#if !defined(IS_MOBILE_PLATFORM) + virtual absl::Status GetRemoteArg(const FunctionArgIndex& index, + eager::RemoteTensorHandle* val) const { + return errors::Unimplemented( + "Serializing a remote argument is not implemented."); + } +#endif // IS_MOBILE_PLATFORM +}; + +// A class that stores all the FunctionLibraryRuntime objects, one per device. +class ProcessFunctionLibraryRuntime { + public: + // Creates FunctionLibraryRuntime objects for each device in the provided + // DeviceMgr. Caller needs to make sure that device_mgr, lib_def and parent + // (if provided) outlive this object. + ProcessFunctionLibraryRuntime( + const DeviceMgr* device_mgr, Env* env, const ConfigProto* config, + int graph_def_version, const FunctionLibraryDefinition* lib_def, + const OptimizerOptions& optimizer_options, + thread::ThreadPool* thread_pool = nullptr, + DistributedFunctionLibraryRuntime* parent = nullptr, + const SessionMetadata* session_metadata = nullptr, + Rendezvous::Factory rendezvous_factory = Rendezvous::Factory(), + StatsPublisherFactory stats_publisher_factory = CreateNoOpStatsPublisher); + + ~ProcessFunctionLibraryRuntime() { + // Deleting the FunctionLibraryRuntime map will delete the function handles + // registered in it, which may call ReleaseHandle in this class again to + // release their sub-function. These circular calls may cause segfault + // since the flr_map_ may have already been deleted. Explicitly releasing + // flr_map_ here and checking flr_map_ in ReleaseHandle to avoid this. + flr_map_.reset(); + } + + // Sends `tensors_to_send` from `source_device` to `target_device` using + // `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the + // Rendezvous. `device_context` should be the DeviceContext of the device + // doing the sending. `alloc_attrs` should either be empty or be the size of + // `tensors_to_send` and indicates how the input tensors are allocated. Method + // takes references on each of the `tensors_to_send`. Method doesn't block. 
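+  // Send-side sketch (the device names, key prefix, and the variables
+  // `tensors_to_send`, `src_incarnation`, `device_context`, `alloc_attrs`,
+  // and `rendezvous` are illustrative assumptions already in scope):
+  //
+  //   TF_RETURN_IF_ERROR(ProcessFunctionLibraryRuntime::SendTensors(
+  //       "/device:CPU:0", "/device:MY_DEVICE:0", "edge_1", src_incarnation,
+  //       tensors_to_send, device_context, alloc_attrs, rendezvous));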
+ static absl::Status SendTensors( + const string& source_device, const string& target_device, + const string& key_prefix, int64_t src_incarnation, + absl::Span tensors_to_send, DeviceContext* device_context, + const std::vector& alloc_attrs, + RendezvousInterface* rendezvous); + + // Receives `received_tensors` from `target_device` (originally sent from + // `source_device`) using `rendezvous`. Uses `key_prefix` to construct the + // keys to be retrieved. `device_context` should be for the device receiving + // the tensors. `alloc_attrs` indicates how to allocate the received + // tensors and should either be empty or `num_tensors` in size. Method doesn't + // block and calls `done` when `num_tensors` are fetched. + static void ReceiveTensorsAsync( + const string& source_device, const string& target_device, + const string& key_prefix, int64_t src_incarnation, int64_t num_tensors, + DeviceContext* device_context, + const std::vector& alloc_attrs, + RendezvousInterface* rendezvous, std::vector* received_tensors, + StatusCallback done); + + static const char kDefaultFLRDevice[]; + // Returns the FunctionLibraryRuntime for the corresponding device_name. + FunctionLibraryRuntime* GetFLR(const string& device_name) const; + + // Returns the return types for the function identified by handle `h`. + absl::Status GetRetTypes(FunctionLibraryRuntime::Handle h, + DataTypeVector* ret_types); + + // Returns the device incarnation for the given device_name. + absl::Status GetDeviceIncarnation(const string& device_name, + int64_t* incarnation) const; + + // For a given canonicalized key signature of the function instantiated + // on device `device_name` and a `local_handle`, creates a handle and returns + // that value. Uses core/common_runtime/framework/function.h::Canonicalize + // to canonicalize the function signature. + FunctionLibraryRuntime::Handle AddHandle( + const string& function_key, const string& device_name, + FunctionLibraryRuntime::LocalHandle local_handle); + + // Returns a handle if found for the given key, else returns kInvalidHandle. + FunctionLibraryRuntime::Handle GetHandle(const string& function_key) const; + + // For the given handle instantiated on device `device_name` returns the local + // index of instantiation of that function. If the function was not + // instantiated on `device_name` or the function is multi-device, + // returns kInvalidLocalHandle. + // + // If `include_multi_device` is true and `handle` is a multi-device function + // with a single component that is placed on `device_name`, then this method + // will return the local handle for that component. + FunctionLibraryRuntime::LocalHandle GetHandleOnDevice( + const string& device_name, FunctionLibraryRuntime::Handle handle, + bool include_multi_device = false) const; + + // Fills `output_devices` with the devices on which the results will + // be produced. If some output is produced on CPU, the corresponding Device* + // is set to nullptr. If some output is DT_RESOURCE, the corresponding Device* + // is set to the device backing the resource. + // REQUIRES: `handle` identifies a multi-device function. + absl::Status GetOutputDevices(FunctionLibraryRuntime::Handle handle, + std::vector* output_devices) const; + + // Instantiates the function. See framework/function.h for more details. + // Allows for function_name to be instantiated on different devices + // as specified in attrs. 
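+  // End-to-end lifecycle sketch (the function name "MyFn", the empty
+  // options, and the in-scope `args`/`rets` variables are illustrative
+  // assumptions):
+  //
+  //   FunctionLibraryRuntime::Handle handle;
+  //   TF_RETURN_IF_ERROR(pflr->Instantiate(
+  //       "MyFn", AttrSlice(), FunctionLibraryRuntime::InstantiateOptions(),
+  //       &handle));
+  //   TF_RETURN_IF_ERROR(pflr->RunSync(FunctionLibraryRuntime::Options(),
+  //                                    handle, args, &rets));
+  //   TF_RETURN_IF_ERROR(pflr->ReleaseHandle(handle));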
+ absl::Status Instantiate( + const string& function_name, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::Handle* handle); + + // Returns whether the function represented by the given handle needs to + // execute cross process. + absl::Status IsCrossProcess(FunctionLibraryRuntime::Handle handle, + bool* is_cross_process) const; + + // Delegates to the local FLR that owns state corresponding to `handle` and + // tells it to release it. If the `handle` isn't needed at all, the local FLR + // might call RemoveHandle on this to get rid of the state owned by the Proc + // FLR. + // For multi-device functions, calls ReleaseHandle on local FLRs for each + // component function that is part of this multi-device function. + // Each local FLR might call RemoveHandle on this. + absl::Status ReleaseHandle(FunctionLibraryRuntime::Handle handle); + + // Runs the function with given `handle`. Function could have been + // instantiated on any device. More details in framework/function.h + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, absl::Span args, + std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) const; + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, CallFrameInterface* frame, + FunctionLibraryRuntime::DoneCallback done) const; + + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, + const FunctionArgsInterface& args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) const; + + absl::Status RunSync(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, + absl::Span args, + std::vector* rets) const; + absl::Status RunSync(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, + CallFrameInterface* frame) const; + + const DeviceMgr* device_mgr() { return device_mgr_; } + + const std::shared_ptr device_set() const { + tf_shared_lock l(mu_); + return device_set_; + } + + // Initialize the set of local and remote devices and corresponding flr for op + // device selection. + void InitializeDeviceAndFlr(); + + const ConfigProto* config() const { return config_ ? &(*config_) : nullptr; } + + const FunctionLibraryDefinition* GetFunctionLibraryDefinition() const { + return lib_def_; + } + + // Add a CompositeDevice to `device_set_` + void AddCompositeDevice(CompositeDevice* d) TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + device_set_->AddDevice(d); + composite_devices_.push_back(d); + } + + protected: + friend class FunctionLibraryRuntimeImpl; + + struct InternalArgs { + std::vector args; +#if !defined(IS_MOBILE_PLATFORM) + // Holds the RemoteTensorHandles referred by args. + std::vector> remote_args; +#endif // IS_MOBILE_PLATFORM + }; + + // Structure detailing the asynchronous assumptions of a component function, + // such as whether it can support synchronous execution and any information + // needed to execute in proper order to resolve inter-subgraph dependencies. 
+ class AsyncAttributes { + public: + enum Summary { kSafeForSync = 0, kSendOnly, kRecvOnly, kAsyncRequired }; + + AsyncAttributes() + : allow_control_flow_sync_execution_(false), summary_(kSafeForSync) {} + explicit AsyncAttributes(const Graph* graph, + bool allow_control_flow_sync_execution) + : allow_control_flow_sync_execution_(allow_control_flow_sync_execution), + summary_(Summarize(graph)) {} + Summary summary() const { return summary_; } + bool allow_control_flow_sync_execution() const { + return allow_control_flow_sync_execution_; + } + + private: + // This data member should be initialized before the summary_. + bool allow_control_flow_sync_execution_; + Summary summary_; + Summary Summarize(const Graph* graph); + }; + + // Structure to keep track of how a component function (a single-device + // piece of a multi-device function) fits into the multi-device function. + struct ComponentFunctionData { + // The handle for the instantiated component function. + FunctionLibraryRuntime::Handle handle; + // The name for the component function. + string name; + // arg_indices.size() is the number of arguments to the component function. + // The i-th argument of the component function comes from the + // `arg_indices[i]`-th argument of the multi-device function. + std::vector arg_indices; + // ret_indices.size() is the number of return values of the component + // function. The i-th return value of the component function goes to the + // `ret_indices[i]`-th return value of the multi-device function. + std::vector ret_indices; + // arg_alloc_attrs[i] are the allocator attributes of the i-th argument to + // the component function. + std::vector arg_alloc_attrs; + // ret_alloc_attrs[i] are the allocator attributes of the i-th return value + // of the component function. + std::vector ret_alloc_attrs; + + AsyncAttributes async_attributes; + }; + + // Data structure holding information for a single instantiated multi-device + // function. + // The fields are filled in during instantiation. Once the object is + // added to mdevice_data_, all fields are constant. + struct MultiDeviceFunctionData { + MultiDeviceFunctionData(const string& function_name, + const string& function_key, int num_outputs, + DataTypeVector ret_types) + : function_name_(function_name), + function_key_(function_key), + instantiation_counter_(1), + num_outputs_(num_outputs), + ret_types_(std::move(ret_types)), + is_cross_process_(false), + has_remote_outputs(false) {} + + const string function_name_; + const string function_key_; + uint64 instantiation_counter_; + // Stored here to resize the output tensor vector when function is run. + const int num_outputs_; + DataTypeVector ret_types_; + + // Indicates whether this function needs to execute cross process. + bool is_cross_process_; + // Indicates whether this function has remote outputs. + bool has_remote_outputs; + + // Indicates if running this function synchronously is both allowed + safe. + bool enable_sync_execution; + + // Maps the device name to the information about the component function + // be run on this device. + std::unordered_map glue_; + }; + + struct CleanUpItem { + string device; + uint64 step_id; + FunctionLibraryRuntime::Handle local_handle; + }; + + // If `handle` represents a multi-device function, returns the multi-device + // data associated with `handle`. Else, nullptr. 
+ MultiDeviceFunctionData* IsMultiDevice( + FunctionLibraryRuntime::Handle handle) const; + + DistributedFunctionLibraryRuntime* const parent_; + + private: + FunctionLibraryRuntime::Handle AddHandleLocked( + const string& function_key, const string& device_name, + FunctionLibraryRuntime::LocalHandle local_handle) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // For a given device_name, returns a DeviceContext for copying + // tensors to/from the device. + absl::Status GetDeviceContext(const string& device_name, + DeviceContext** device_context) const; + + // Looks up the information for the given `handle` and returns the name + // of the device where the function is registered. + string GetDeviceName(FunctionLibraryRuntime::Handle handle) const; + + // Removes handle from the state owned by this object. + absl::Status RemoveHandle(FunctionLibraryRuntime::Handle handle); + + // Clones ProcessFunctionLibraryRuntime and FunctionLibraryDefinition + // (transferring ownership of both to the caller). Note that the + // ProcessFunctionLibraryRuntime borrows a pointer to the + // FunctionLibraryDefinition and so the FunctionLibraryDefinition should + // outlive the ProcessFunctionLibraryRuntime. + // + // The `skip_flib_def` argument controls whether the method should clone the + // FunctionLibraryDefinition (default behavior) or return an empty function + // library. The latter is used by tf.data, which manages + // FunctionLibraryDefinitions for its functions independently (and passes + // these into the FunctionLibraryRuntime through an overlay), to avoid linear + // runtime w.r.t. to number of functions in the current function library. + absl::Status Clone(Env* env, int graph_def_version, + const OptimizerOptions& optimizer_options, + std::unique_ptr* out_lib_def, + std::unique_ptr* out_pflr, + bool skip_flib_def = false) const; + + absl::Status ReleaseMultiDeviceHandle(FunctionLibraryRuntime::Handle handle); + + absl::Status InstantiateMultiDevice( + const string& function_name, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::Handle* handle); + + void InstantiateRemote( + const string& function_name, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::Handle* handle, + FunctionLibraryRuntime::DoneCallback done); + + FunctionLibraryRuntime::Handle AddMultiDeviceHandle( + const std::unique_ptr data, + const string& function_key); + + bool HasMultiDeviceHandle(FunctionLibraryRuntime::Handle handle) const; + + void RunInternal(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, + absl::Span args, + std::vector* rets, + std::vector>* cleanup_items, + FunctionLibraryRuntime::DoneCallback done) const; + + absl::Status CreateRendezvous( + FunctionLibraryRuntime::Options& opts, + tsl::core::RefCountPtr* created_rendezvous) const; + + FunctionLibraryRuntime::DoneCallback ApplyCleanUpToDoneCallback( + std::vector>* items, + FunctionLibraryRuntime::DoneCallback done, + const FunctionLibraryRuntime::Options& opts, + tsl::core::RefCountPtr rendezvous) const; + + void CleanUp(std::vector>* items, + FunctionLibraryRuntime::DoneCallback done) const; + + static absl::Status GetComponentArgs(absl::Span args, + const ComponentFunctionData& comp_data, + InternalArgs* comp_args); + +#if !defined(IS_MOBILE_PLATFORM) + static absl::Status GetComponentArgs(const FunctionArgsInterface& args, + const ComponentFunctionData& comp_data, + InternalArgs* comp_args); +#endif // 
IS_MOBILE_PLATFORM + + std::vector GetOrderedSubgraphs( + const MultiDeviceFunctionData* data) const; + + absl::Status PrepareRunMultiDevice( + const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, + const MultiDeviceFunctionData** data) const; + + absl::Status RunMultiDeviceSync( + const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, std::vector* rets, + std::function + get_component_args) const; + + void RunMultiDeviceAsync( + const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, std::vector* rets, + std::vector>* cleanup_items, + FunctionLibraryRuntime::DoneCallback done, + std::function + get_component_args) const; + + void PublishSubgraphs( + const std::string& function_name, + std::vector>&& function_records); + + // Data structure holding information for a single instantiated remote + // (to be executed on `target_device`) function. + class FunctionData { + public: + FunctionData(const string& target_device, + FunctionLibraryRuntime::LocalHandle local_handle, + const string& function_key) + : target_device_(target_device), + local_handle_(local_handle), + function_key_(function_key) {} + + const string& target_device() { return target_device_; } + const string& function_key() { return function_key_; } + + FunctionLibraryRuntime::LocalHandle local_handle() { + mutex_lock l(mu_); + return local_handle_; + } + + // Initializes the FunctionData object by potentially making an Initialize + // call to the DistributedFunctionLibraryRuntime. + void DistributedInit( + DistributedFunctionLibraryRuntime* parent, const string& function_name, + const FunctionLibraryDefinition& lib_def, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::DoneCallback done); + + bool is_cross_process() { + mutex_lock l(mu_); + return is_cross_process_; + } + + private: + mutex mu_; + + const string target_device_; + FunctionLibraryRuntime::LocalHandle local_handle_ TF_GUARDED_BY(mu_); + const string function_key_; + bool is_cross_process_ TF_GUARDED_BY(mu_) = false; + bool init_started_ TF_GUARDED_BY(mu_) = false; + absl::Status init_result_ TF_GUARDED_BY(mu_); + Notification init_done_; + }; + + mutable mutex mu_; + + Env* const env_; + const std::optional config_; + const DeviceMgr* const device_mgr_; + const FunctionLibraryDefinition* lib_def_; + thread::ThreadPool* default_thread_pool_; + + // Cluster update can reinitialize the device_set_ due to remote device + // changes. At the same time, InstantiateMultiDevice can use the cached + // devices to instantiate multi-worker functions. Function instantiation would + // fail if it spans the changed remote devices. + std::shared_ptr device_set_ TF_GUARDED_BY(mu_); + + // Composite devices owned by a EagerContext. + std::vector composite_devices_ TF_GUARDED_BY(mu_); + + // Holds all the function instantiations. Maps function_keys to handles. + std::unordered_map table_ + TF_GUARDED_BY(mu_); + + // Function data for instantiated remote functions. + std::unordered_map> + function_data_ TF_GUARDED_BY(mu_); + + // Function data for instantiated multi-device functions. 
+ std::unordered_map> + mdevice_data_ TF_GUARDED_BY(mu_); + + std::unique_ptr< + std::unordered_map>> + flr_map_; + int next_handle_ TF_GUARDED_BY(mu_); + const SessionMetadata* const session_metadata_; + const Rendezvous::Factory rendezvous_factory_; + + const OptimizerOptions optimizer_options_; + const int graph_def_version_; + + StatsPublisherFactory stats_publisher_factory_; + // Holds all stats publishers, one for publishing subgraphs of each + // instantiated function. + std::vector> stats_publishers_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_state.h new file mode 100644 index 00000000..dd667cc2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_state.h @@ -0,0 +1,161 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/allocator_registry.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +class PoolAllocator; + +// Singleton that manages per-process state, e.g. allocation of +// shared resources. +class ProcessState : public ProcessStateInterface { + public: + static ProcessState* singleton(); + + // Descriptor for memory allocation attributes, used by optional + // runtime correctness analysis logic. + struct MemDesc { + enum MemLoc { CPU, GPU }; + MemLoc loc; + int dev_index; + bool gpu_registered; + bool nic_registered; + MemDesc() + : loc(CPU), + dev_index(0), + gpu_registered(false), + nic_registered(false) {} + string DebugString(); + }; + + // If NUMA Allocators are desired, call this before calling any + // Allocator accessor. + void EnableNUMA() { numa_enabled_ = true; } + + // Returns what we know about the memory at ptr. + // If we know nothing, it's called CPU 0 with no other attributes. + MemDesc PtrType(const void* ptr); + + // Returns the one CPUAllocator used for the given numa_node. + // Treats numa_node == kNUMANoAffinity as numa_node == 0. + Allocator* GetCPUAllocator(int numa_node) override; + + // Registers alloc visitor for the CPU allocator(s). + // REQUIRES: must be called before GetCPUAllocator. + void AddCPUAllocVisitor(SubAllocator::Visitor v); + + // Registers free visitor for the CPU allocator(s). + // REQUIRES: must be called before GetCPUAllocator. 
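+  // Registration-order sketch: visitors are installed before the first
+  // allocator lookup, per the REQUIRES above (the no-op visitor body is an
+  // illustrative assumption):
+  //
+  //   ProcessState* ps = ProcessState::singleton();
+  //   ps->EnableNUMA();
+  //   ps->AddCPUAllocVisitor([](void* ptr, int index, size_t num_bytes) {});
+  //   Allocator* cpu = ps->GetCPUAllocator(/*numa_node=*/0);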
+ void AddCPUFreeVisitor(SubAllocator::Visitor v); + + typedef std::unordered_map MDMap; + + protected: + ProcessState(); + virtual ~ProcessState() {} + friend class GPUProcessState; + friend class PluggableDeviceProcessState; + + // If these flags need to be runtime configurable consider adding + // them to ConfigProto. + static constexpr bool FLAGS_brain_mem_reg_gpu_dma = true; + static constexpr bool FLAGS_brain_gpu_record_mem_types = false; + + // Helper method for unit tests to reset the ProcessState singleton by + // cleaning up everything. Never use in production. + void TestOnlyReset(); + + static ProcessState* instance_; + bool numa_enabled_; + + mutex mu_; + + // Indexed by numa_node. If we want numa-specific allocators AND a + // non-specific allocator, maybe should index by numa_node+1. + std::vector cpu_allocators_ TF_GUARDED_BY(mu_); + std::vector cpu_alloc_visitors_ TF_GUARDED_BY(mu_); + std::vector cpu_free_visitors_ TF_GUARDED_BY(mu_); + + // A cache of cpu allocators indexed by a numa node. Used as a fast path to + // get CPU allocator by numa node id without locking the mutex. We can't use + // `cpu_allocators_` storage in the lock-free path because concurrent + // operation can deallocate the vector storage. + std::atomic cpu_allocators_cached_; + std::array cpu_allocators_cache_; + + // Optional RecordingAllocators that wrap the corresponding + // Allocators for runtime attribute use analysis. + MDMap mem_desc_map_; + std::vector cpu_al_ TF_GUARDED_BY(mu_); +}; + +namespace internal { +class RecordingAllocator : public Allocator { + public: + RecordingAllocator(ProcessState::MDMap* mm, Allocator* a, + ProcessState::MemDesc md, mutex* mu) + : mm_(mm), a_(a), md_(md), mu_(mu) {} + + string Name() override { return a_->Name(); } + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + void* p = a_->AllocateRaw(alignment, num_bytes); + mutex_lock l(*mu_); + (*mm_)[p] = md_; + return p; + } + void DeallocateRaw(void* p) override { + mutex_lock l(*mu_); + auto iter = mm_->find(p); + mm_->erase(iter); + a_->DeallocateRaw(p); + } + bool TracksAllocationSizes() const override { + return a_->TracksAllocationSizes(); + } + size_t RequestedSize(const void* p) const override { + return a_->RequestedSize(p); + } + size_t AllocatedSize(const void* p) const override { + return a_->AllocatedSize(p); + } + absl::optional GetStats() override { return a_->GetStats(); } + bool ClearStats() override { return a_->ClearStats(); } + + AllocatorMemoryType GetMemoryType() const override { + return a_->GetMemoryType(); + } + + ProcessState::MDMap* mm_; // not owned + Allocator* a_; // not owned + ProcessState::MemDesc md_; + mutex* mu_; +}; +} // namespace internal +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_util.h new file mode 100644 index 00000000..cc2bc439 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/process_util.h @@ -0,0 +1,64 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_UTIL_H_ + +#include + +#include "absl/functional/any_invocable.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/public/session_options.h" + +// TODO(vrv, mrry): Remove this library: its interface circumvents the +// callers' Env and calls Env::Default() directly. + +namespace tensorflow { + +// Returns a process-wide ThreadPool for scheduling compute operations +// using 'options'. Caller does not take ownership over threadpool. +thread::ThreadPool* ComputePool(const SessionOptions& options); + +// Returns the TF_NUM_INTEROP_THREADS environment value, or 0 if not specified. +int32 NumInterOpThreadsFromEnvironment(); + +// Returns the TF_NUM_INTRAOP_THREADS environment value, or 0 if not specified. +int32 NumIntraOpThreadsFromEnvironment(); + +// Returns the number of inter op threads specified in `options` or a default. +// If no value or a negative value is specified in the provided options, then +// the function returns the value defined in the TF_NUM_INTEROP_THREADS +// environment variable. If neither a value is specified in the options or in +// the environment, this function will return a reasonable default value based +// on the number of schedulable CPUs, and any MKL and OpenMP configurations. +int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options); + +// Creates a thread pool with number of inter op threads. +// The number is set if `num_threads` > 0, otherwise it will be configured by +// SessionOptions. +thread::ThreadPool* NewThreadPoolFromSessionOptions( + const SessionOptions& options, int32_t num_threads = 0); + +// Schedule "closure" in the default thread queue. +void SchedClosure(absl::AnyInvocable closure); + +// Schedule "closure" after the given number of microseconds in the +// fixed-size ThreadPool used for non-blocking compute tasks. +void SchedNonBlockingClosureAfter(int64_t micros, + absl::AnyInvocable closure); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/profile_handler.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/profile_handler.h new file mode 100644 index 00000000..71aac10b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/profile_handler.h @@ -0,0 +1,68 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
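A hedged usage sketch for the thread-pool helpers declared in process_util.h above (not part of the header; the assumption that the caller owns the pool returned by NewThreadPoolFromSessionOptions is mine):

```cpp
#include <memory>

#include "tensorflow/core/common_runtime/process_util.h"
#include "tensorflow/core/public/session_options.h"

void RunOnInterOpPool() {
  tensorflow::SessionOptions options;
  // num_threads == 0 defers to options / TF_NUM_INTEROP_THREADS / CPU count.
  std::unique_ptr<tensorflow::thread::ThreadPool> pool(
      tensorflow::NewThreadPoolFromSessionOptions(options, /*num_threads=*/0));
  pool->Schedule([] { /* heavy compute work scheduled on the new pool */ });

  // The process-wide default queue, by contrast, is reached via SchedClosure.
  tensorflow::SchedClosure([] { /* fire-and-forget background work */ });
}
```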
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_ + +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +// A profile handler collects event stats from a running step. +class ProfileHandler { + public: + ProfileHandler() {} + virtual ~ProfileHandler() {} + + // Records that a single Op was executed in the current step. + // + // Implementations of this method must be thread-safe. + // + // Args: + // - device: Device on which the Op was executed. + // - stats: Statistics of node execution timing. + // - is_copy: True if the op was a copy, send or recv. + // - label: Extra content for timeline click text. + // - op_type: String name of the Op. + // - details: Main content for timeline click text. + virtual void RecordOneOp(const string& device, const NodeExecStats& stats, + bool is_copy, absl::string_view label, + absl::string_view op_type, + absl::string_view details) = 0; + + // Records that the current step finished. + // + // Implementations of this method need not be thread-safe. + // + // Args: + // - start_time: The time at which the step started. + // - finish_time: The time at which the step finished. + // - cleanup_time: The time at which cleanup for the step finished. + // - total_runops: The number of ops that ran during this step. + // - final_status: The status that this step finished with. + virtual void StepDone(Microseconds start_time, Microseconds finish_time, + Microseconds cleanup_time, int total_runops, + absl::Status final_status) = 0; + + // Returns true if the caller should collect rpc activity. + virtual bool should_collect_rpcs() = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_debug_utils.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_debug_utils.h new file mode 100644 index 00000000..2e837104 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_debug_utils.h @@ -0,0 +1,38 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_DEBUG_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_DEBUG_UTILS_H_ + +namespace tensorflow { + +struct Entry; +struct NodeItem; +class Tensor; + +// Returns a pointer to the tensor in `input` if one exists, or `nullptr`. +const Tensor* GetTensorValueForDump(const Entry& input); + +// Writes a LOG(WARNING) message describing the state of the given pending node +// in the graph described by `immutable_state`. 
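A minimal sketch of a ProfileHandler implementation for the interface above (illustrative only; LoggingProfileHandler and its logging behaviour are assumptions, not part of the header):

```cpp
#include "tensorflow/core/common_runtime/profile_handler.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

class LoggingProfileHandler : public ProfileHandler {
 public:
  void RecordOneOp(const string& device, const NodeExecStats& stats,
                   bool is_copy, absl::string_view label,
                   absl::string_view op_type,
                   absl::string_view details) override {
    // Must be thread-safe; LOG is internally synchronized.
    LOG(INFO) << device << " ran " << op_type << " (" << label << ")";
  }

  void StepDone(Microseconds start_time, Microseconds finish_time,
                Microseconds cleanup_time, int total_runops,
                absl::Status final_status) override {
    LOG(INFO) << "Step finished: " << total_runops << " ops, status "
              << final_status;
  }

  bool should_collect_rpcs() override { return false; }
};

}  // namespace tensorflow
```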
+void DumpPendingNodeState(const NodeItem& node_item, const Entry* input_vector, + const bool show_nodes_with_no_ready_inputs); + +// Writes a LOG(WARNING) message describing the state of the given active node +// in the graph described by `immutable_state`. +void DumpActiveNodeState(const NodeItem& node_item, const Entry* input_vector); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_DEBUG_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_state.h new file mode 100644 index 00000000..e5f4fd6b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/propagator_state.h @@ -0,0 +1,598 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_STATE_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/entry.h" +#include "tensorflow/core/common_runtime/immutable_executor_state.h" +#include "tensorflow/core/common_runtime/pending_counts.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef absl::InlinedVector AllocatorAttributeVec; + +// Represents the ephemeral "edge state" associated with one invocation of +// `Executor::Run()`. +// +// `PropagatorState` is responsible for propagating values along dataflow +// edges in a TensorFlow graph and determining which nodes are runnable. The +// executor primarily updates `PropagatorState` by calling `PropagateOutputs()` +// after processing a node, and `PropagatorState` dispatches `TaggedNode`s by +// adding them to a `TaggedNodeSeq`. +class PropagatorState { + public: + PropagatorState(const ImmutableExecutorState& immutable_state, + int64_t step_id, bool vlog); + ~PropagatorState(); + + private: + // Forward declaration so that `TaggedNode` can include a `FrameState*` and an + // `IterationState*`. + struct FrameState; + struct IterationState; + + public: + // A `TaggedNode` corresponds to a single invocation of a node's kernel, + // and it is created when the kernel becomes runnable (in a particular + // iteration of a particular frame). 
+ struct TaggedNode { + const NodeItem* node_item; + FrameState* input_frame; + IterationState* input_iter; + bool is_dead; + + TaggedNode() = default; + TaggedNode(const NodeItem* node_item, FrameState* in_frame, + IterationState* in_iter, bool dead) + : node_item(node_item), + input_frame(in_frame), + input_iter(in_iter), + is_dead(dead) {} + + const NodeItem& get_node_item() const { return *node_item; } + + bool get_is_dead() const { return is_dead; } + int64_t get_iter_num() const; + }; + + // A drop-in replacement for std::deque. We typically don't + // have that many nodes in the ready queue, so we just use a vector and + // don't free up memory from the queue as we consume nodes. + class TaggedNodeReadyQueue { + public: + TaggedNodeReadyQueue() : front_index_(0) {} + + void push_back(const TaggedNode& node) { ready_.push_back(node); } + + TaggedNode front() const { + DCHECK_LT(front_index_, ready_.size()); + return ready_[front_index_]; + } + + void pop_front() { + DCHECK_LT(front_index_, ready_.size()); + front_index_++; + if ((front_index_ == ready_.size()) || (front_index_ > kSpillThreshold)) { + if (front_index_ == ready_.size()) { + ready_.clear(); + } else { + // Lots of unused entries at beginning of vector: move everything + // down to start of vector. + ready_.erase(ready_.begin(), ready_.begin() + front_index_); + } + front_index_ = 0; + } + } + bool empty() const { return ready_.empty(); } + int size() const { return ready_.size() - front_index_; } + + private: + // TODO(b/152925936): Re-evaluate these constants with current usage + // patterns. + static constexpr int kSpillThreshold = 16384; + absl::InlinedVector ready_; + int front_index_; + }; + + // TODO(b/152925936): Re-evaluate this constant with current usage patterns. + typedef absl::InlinedVector TaggedNodeSeq; + + private: + // The state of an iteration in a particular frame. + struct IterationState { + explicit IterationState(int64_t iter_num, + const PendingCounts* pending_counts, + int total_input_tensors) + : iter_num(iter_num), + input_tensors(new Entry[total_input_tensors]), + outstanding_ops(0), + outstanding_frame_count(0), + counts(*pending_counts) { // Initialize with copy of *pending_counts + } + + const int64_t + iter_num; // The index of this iteration in the enclosing loop. + + // One copy per iteration. For iteration k, i-th node's j-th input is in + // input_tensors[k][immutable_state_.nodes[i].input_start + j]. An entry is + // either a tensor pointer (pass-by-reference) or a tensor (pass-by-value). + // + // NOTE: No need to protect input_tensors[i] by any locks because it + // is resized once. Each element of tensors_ is written once by the + // source node of an edge and is cleared by the destination of the same + // edge. The latter node is never run concurrently with the former node. + Entry* input_tensors; + + // The number of outstanding ops for each iteration. + std::atomic outstanding_ops; + + // The number of outstanding frames for each iteration. + int outstanding_frame_count; + int pending(PendingCounts::Handle h) { return counts.pending(h); } + int decrement_pending(PendingCounts::Handle h, int v) { + return counts.decrement_pending(h, v); + } + // Mark a merge node as live + // REQUIRES: Node corresponding to "h" is a merge node + void mark_live(PendingCounts::Handle h) { counts.mark_live(h); } + // Mark a node to show that processing has started. + void mark_started(PendingCounts::Handle h) { counts.mark_started(h); } + // Mark a node to show that processing has completed. 
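The TaggedNodeReadyQueue above returns front() by value, so the usual drain pattern copies the node out before calling pop_front(). A small sketch, not executor code; ProcessNode is a hypothetical callback standing in for the per-node work:

```cpp
#include "tensorflow/core/common_runtime/propagator_state.h"

void DrainReadyQueue(
    tensorflow::PropagatorState::TaggedNodeReadyQueue* inline_ready,
    void (*ProcessNode)(const tensorflow::PropagatorState::TaggedNode&)) {
  while (!inline_ready->empty()) {
    // Copy the node out first: pop_front() may compact the underlying vector.
    tensorflow::PropagatorState::TaggedNode tagged_node = inline_ready->front();
    inline_ready->pop_front();
    ProcessNode(tagged_node);  // may push newly-ready nodes back onto the queue
  }
}
```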
+ void mark_completed(PendingCounts::Handle h) { counts.mark_completed(h); } + PendingCounts::NodeState node_state(PendingCounts::Handle h) { + return counts.node_state(h); + } + + int dead_count(PendingCounts::Handle h) { return counts.dead_count(h); } + void increment_dead_count(PendingCounts::Handle h) { + counts.increment_dead_count(h); + } + // REQUIRES: Node corresponding to "h" is a merge node + PendingCounts::AdjustResult adjust_for_mark_live(PendingCounts::Handle h) { + return counts.adjust_for_mark_live(h); + } + // REQUIRES: Node corresponding to "h" is a merge node + PendingCounts::AdjustResult adjust_for_mark_live_atomic( + PendingCounts::Handle h) { + return counts.adjust_for_mark_live_atomic(h); + } + PendingCounts::AdjustResult adjust_for_decrement_pending( + PendingCounts::Handle h, int decrement_pending) { + return counts.adjust_for_decrement_pending(h, decrement_pending); + } + PendingCounts::AdjustResult adjust_for_decrement_pending_atomic( + PendingCounts::Handle h, int decrement_pending) { + return counts.adjust_for_decrement_pending_atomic(h, decrement_pending); + } + PendingCounts::AdjustResult adjust_for_increment_dead( + PendingCounts::Handle h) { + return counts.adjust_for_increment_dead(h); + } + PendingCounts::AdjustResult adjust_for_increment_dead_atomic( + PendingCounts::Handle h) { + return counts.adjust_for_increment_dead_atomic(h); + } + PendingCounts::AdjustResult adjust_for_activation(PendingCounts::Handle h, + bool increment_dead) { + return counts.adjust_for_activation(h, increment_dead); + } + PendingCounts::AdjustResult adjust_for_activation_atomic( + PendingCounts::Handle h, bool increment_dead) { + return counts.adjust_for_activation_atomic(h, increment_dead); + } + + ~IterationState() { delete[] input_tensors; } + + private: + PendingCounts counts; + }; + + struct FrameState { + explicit FrameState(const ImmutableExecutorState& immutable_state, + int parallel_iters) + : immutable_state(immutable_state), + max_parallel_iterations(parallel_iters), + num_outstanding_iterations(1), + iterations(parallel_iters + 1), + iterations_raw(iterations.data()) {} + + // A new frame is created for each loop. Execution starts at iteration 0. + // When a value at iteration 0 passes through a NextIteration node, + // iteration 1 is created and starts running. Note that iteration 0 may + // still be running so multiple iterations may run in parallel. The + // frame maintains the state of iterations in several data structures + // such as pending_count and input_tensors. When iteration 0 completes, + // we garbage collect the state of iteration 0. + // + // A frame instance is considered "done" and can be garbage collected + // if all its inputs have entered and all its iterations are "done". + // + // A frame manages the live iterations of an iterative computation. + // Iteration i is considered "done" when there are no outstanding ops, + // frames at iteration i are done, all recvs for this iteration are + // completed, and iteration i-1 is done. For iteration 0, we instead + // wait for there to be no more pending inputs of the frame. + // + // Frames and iterations are garbage collected once they are done. + // The state we need to keep around is highly dependent on the + // parallelism enabled by the scheduler. We may want to have the + // scheduler dynamically control the outstanding number of live + // parallel frames and iterations. To reduce the state space, the + // scheduler might want to schedule ops in inner frames first and + // lower iterations first. 
+ // + // This frame state is mostly initialized lazily on demand so we + // don't introduce unnecessary overhead. + + // The immutable state of the executor the frame is in. + const ImmutableExecutorState& immutable_state; + + // The name of this frame, which is the concatenation of its parent + // frame name, the iteration of the parent frame when this frame was + // created, and the value of the attr 'frame_name'. + string frame_name; + + // The unique id for this frame. Generated by fingerprinting + // frame_name. + uint64 frame_id; + + // The iteration state of its parent frame when this frame is created. + // nullptr if there is no parent frame. The frame_name/parent_iter pair + // uniquely identifies this FrameState. + IterationState* parent_iter = nullptr; + + // The FrameState of its parent frame. + FrameState* parent_frame = nullptr; + + // The maximum allowed number of parallel iterations. + const int max_parallel_iterations; + + // The number of inputs this frame is still waiting. + int num_pending_inputs = 0; + + // The highest iteration number we have reached so far in this frame. + int64_t iteration_count TF_GUARDED_BY(mu) = 0; + + // The number of outstanding iterations. + int num_outstanding_iterations TF_GUARDED_BY(mu) = 1; + + private: + // The active iteration states of this frame. + absl::InlinedVector iterations; + IterationState** const iterations_raw TF_GUARDED_BY(mu); + IterationState* iterations_first TF_GUARDED_BY(mu); + + public: + // The NextIteration nodes to enter a new iteration. If the number of + // outstanding iterations reaches the limit, we will defer the start of + // the next iteration until the number of outstanding iterations falls + // below the limit. + std::vector> next_iter_roots + TF_GUARDED_BY(mu); + + // The values of the loop invariants for this loop. They are added into + // this list as they "enter" the frame. When a loop invariant enters, + // we make it available to all active iterations. When the frame starts + // a new iteration, we make all the current loop invariants available + // to the new iteration. + std::vector> inv_values + TF_GUARDED_BY(iter_mu); + + // The list of dead exit node items for the current highest iteration. We + // will only "execute" the dead exits of the final iteration. + std::vector dead_exits TF_GUARDED_BY(iter_mu); + + // Static information specific to this frame. + PendingCounts* pending_counts = nullptr; + int total_input_tensors = 0; + std::vector* nodes = nullptr; + + // Lock ordering: ExecutorState.mu_ < mu < iter_mu; + // during structured traversal: parent_frame->mu < mu. + mutex mu; + + // This mutex lock should only be held when entering next iteration. + mutex iter_mu; + + void InitializeFrameInfo(const ImmutableExecutorState::FrameInfo& finfo); + + inline IterationState* GetIteration(int64_t iter) + TF_SHARED_LOCKS_REQUIRED(mu) { + if (TF_PREDICT_TRUE(iter == 0)) { + return iterations_first; + } else { + size_t index = iter % (max_parallel_iterations + 1); + return iterations_raw[index]; + } + } + + void SetIteration(int64_t iter, IterationState* state); + + // Adjust the outstanding op count by 'delta' and clean up the iterations in + // the frame if no more ops are oustanding. Return true iff the execution of + // the frame is done. + // + // Avoids acquiring the lock in the common case that the frame is not done. 
+ bool AdjustOutstandingOps(IterationState* iter_state, int delta, + TaggedNodeSeq* ready); + + bool AdjustOutstandingOpsLocked(IterationState* iter_state, int delta, + TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + bool AdjustOutstandingOpsFastPath(IterationState* iter_state, int delta) + TF_SHARED_LOCKS_REQUIRED(mu); + + // Convenience methods for the above 'Adjust' calls where delta takes the + // common value of -1. + bool DecrementOutstandingOps(IterationState* iter_state, + TaggedNodeSeq* ready); + + bool DecrementOutstandingOpsLocked(IterationState* iter_state, + TaggedNodeSeq* ready); + + // Returns true if the computation in the frame is completed. + bool IsFrameDone(); + + // Returns true if the iteration of the frame is completed. + bool IsIterationDone(IterationState* iter_state) + TF_SHARED_LOCKS_REQUIRED(mu); + + // Increments the iteration id. If this is a new iteration, initialize it. + // + // Returns a pointer to the new iteration. + IterationState* IncrementIteration(TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // Activate all the deferred NextIteration nodes in a new iteration. + void ActivateNexts(IterationState* iter_state, TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // Activate all the current loop invariants in a new iteration. + void ActivateLoopInvs(IterationState* iter_state, TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // Add a new loop invariant and make it available to all active + // iterations. + void AddLoopInv(const NodeItem* item, const Entry& entry, + TaggedNodeSeq* ready) TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // Activate the successors of a node. Contents of *outputs are left in an + // indeterminate state after returning from this method. + // + // In the case that 'item' is a simple node (no merge/control outputs) this + // will acquire a shared lock and can run concurrently with other + // invocations. + // + // Return true if the frame is done after activation. + bool ActivateNodesAndAdjustOutstanding( + const NodeItem* item, const bool is_dead, IterationState* iter_state, + EntryVector* outputs, TaggedNodeSeq* ready, int decrement_activation); + + // Same as the above, but requires 'mu' already held in exclusive mode. + int ActivateNodesLocked(const NodeItem* item, const bool is_dead, + IterationState* iter_state, EntryVector* outputs, + TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // Cleanup iterations of this frame starting from the given iteration. + bool CleanupIterations(IterationState* iter_state, TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + void DumpIterationState(PropagatorState* parent) { + mutex_lock l(mu); + for (IterationState* iteration : iterations) { + if (iteration) { + LOG(WARNING) << " Iteration:"; + parent->DumpIterationState(this, iteration); + } + } + } + + ~FrameState() { + for (size_t i = 0; i < iterations.size(); ++i) { + delete iterations[i]; + iterations[i] = nullptr; + } + } + + private: + // REQUIRES: `!item->is_any_consumer_merge_or_control_trigger`. + // This variant does not use atomic operations to modify the pending counts + // and thus must hold the exclusive lock. + int ActivateNodesFastPathLocked(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + // REQUIRES: `!item->is_any_consumer_merge_or_control_trigger`. + // This variant uses atomic operations to modify the pending counts. 
+ int ActivateNodesFastPathShared(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, TaggedNodeSeq* ready) + TF_SHARED_LOCKS_REQUIRED(mu); + + int ActivateNodesSlowPathLocked(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, TaggedNodeSeq* ready) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu); + + int ActivateNodesSlowPathShared(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, TaggedNodeSeq* ready) + TF_SHARED_LOCKS_REQUIRED(mu); + + // Implementation templates. Not for public use. + template + int ActivateNodesFastPathInternal(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, + TaggedNodeSeq* ready); + template + int ActivateNodesSlowPathInternal(const NodeItem* item, bool is_dead, + IterationState* iter_state, + EntryVector* outputs, + TaggedNodeSeq* ready); + }; + + public: + // Creates and adds a `TaggedNode` for each node in `roots` to `*ready`. + void ActivateRoots(gtl::ArraySlice roots, + TaggedNodeSeq* ready); + + // After processing the outputs, propagates the outputs to their dsts. + // Contents of *outputs are left in an indeterminate state after + // returning from this method. + void PropagateOutputs(const TaggedNode& tagged_node, EntryVector* outputs, + TaggedNodeSeq* ready); + + // Returns an array of `Entry` objects corresponding to the inputs of + // `tagged_node`. + // + // NOTE: Thread safety analysis is disabled on this method, because the + // underlying `IterationState` and its array of `input_tensors` retain the + // same address while the iteration is live. + Entry* GetInputTensors(const TaggedNode& tagged_node) const + TF_NO_THREAD_SAFETY_ANALYSIS { + return tagged_node.input_iter->input_tensors + + tagged_node.node_item->input_start; + } + + FrameAndIter GetFrameAndIter(const TaggedNode& tagged_node) const { + return {tagged_node.input_frame->frame_id, + tagged_node.input_iter->iter_num}; + } + + // Provide debugging output of the state of the executor. + void DumpState(); + + // For debugging/logging only. + void MaybeMarkStarted(const TaggedNode& tagged_node) { + // TODO(misard) Replace with a finer-grain enabling flag once we add better + // optional debugging support. + if (TF_PREDICT_FALSE(vlog_) && VLOG_IS_ON(1)) { + mutex_lock l(tagged_node.input_frame->mu); + tagged_node.input_iter->mark_started( + immutable_state_.pending_ids()[tagged_node.node_item->node_id]); + } + } + + void MaybeMarkCompleted(const TaggedNode& tagged_node) { + // TODO(misard) Replace with a finer-grain enabling flag once we add better + // optional debugging support. + if (TF_PREDICT_FALSE(vlog_) && VLOG_IS_ON(1)) { + mutex_lock l(tagged_node.input_frame->mu); + tagged_node.input_iter->mark_completed( + immutable_state_.pending_ids()[tagged_node.node_item->node_id]); + } + } + + private: + // Find an existing or create a new child frame in the frame 'frame' at + // iteration 'iter'. + void FindOrCreateChildFrame(FrameState* frame, IterationState* iter_state, + const NodeItem& node_item, FrameState** child); + + // Delete a frame. Called when the frame is done. + void DeleteFrame(FrameState* frame, TaggedNodeSeq* ready); + + // Cleanup frames and iterations starting from frame/iter. Called when + // a child frame is done. + void CleanupFramesIterations(FrameState* frame, IterationState* iter_state, + TaggedNodeSeq* ready); + + // Provide debugging output about an outstanding iteration in the executor. 
+ void DumpIterationState(const FrameState* frame, IterationState* iteration); + + const ImmutableExecutorState& immutable_state_; + const int64_t step_id_; + const bool vlog_; + + mutex mu_; + + // The root frame in which the execution of this step is started. + FrameState* root_frame_; + + // Mapping from frame ID to outstanding frames. A new frame is created + // at some iteration of an active frame. So the unique key for the new + // child frame is a hash composed of the ID of the parent frame, the iteration + // number at which the parent frame is creating the new frame, and the + // name of the new frame from nodedef. + absl::flat_hash_map outstanding_frames_ + TF_GUARDED_BY(mu_); + + PropagatorState(const PropagatorState&) = delete; + void operator=(const PropagatorState&) = delete; +}; + +inline int64_t PropagatorState::TaggedNode::get_iter_num() const { + return input_iter->iter_num; +} + +// `OrderedPropagatorState` replaces `PropagatorState`s `TaggedNodeReadyQueue` +// with a priority queue. This ensures that the order in which we dequeue +// `TaggedNode&`s is stable with respect to ASLR. +// +// This is not always needed, as in a multithreaded environment, executions are +// expected to happen nondeterministically, but this nondeteminism can be a +// problem: For example, In usecases that are running close to the RAM limit of +// a device, reordering ops can cause an increase in memory fragmenenation, +// causing an OOM. +// This codepath is enabled using TF_DETERMINISTIC_ORDER=1 in executor.cc +class OrderedPropagatorState : public PropagatorState { + using PropagatorState::PropagatorState; + + public: + class TaggedNodeReadyQueue : PropagatorState::TaggedNodeReadyQueue { + public: + TaggedNodeReadyQueue() : readyp_(compare) {} + void push_back(const TaggedNode& node) { readyp_.push(node); } + TaggedNode front() const { return readyp_.top(); } + void pop_front() { readyp_.pop(); } + bool empty() const { return readyp_.empty(); } + int size() const { return readyp_.size(); } + + private: + static bool compare(TaggedNode const& lhs, TaggedNode const& rhs) { + std::tuple lhs_prio{lhs.node_item->node_id, + lhs.input_frame->frame_id, + lhs.input_iter->iter_num}; + std::tuple rhs_prio{rhs.node_item->node_id, + rhs.input_frame->frame_id, + rhs.input_iter->iter_num}; + return lhs_prio < rhs_prio; + } + + std::priority_queue, decltype(&compare)> + readyp_; + }; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROPAGATOR_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/quantize_training.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/quantize_training.h new file mode 100644 index 00000000..de3ed6b4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/quantize_training.h @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_QUANTIZE_TRAINING_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_QUANTIZE_TRAINING_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Rewrites graph for quantized training. +// Rewrites the forward pass to include the precision loss with quantization so +// the model can learn to deal with such loss and achieve better accuracy when +// it is quantized later for inference. +// Note that the num_bits should be in [1, 63] and 'g' must be not null. +// quant_op_type specifies which quantization op should be used. +// Current ops supported: +// - QuantizeAndDequantizeV2. +// - FakeQuantWithMinMaxVars. +// +// On success, returns OK. +// +// On failure, returns the error status. Possible errors include: +// - num_bits out of range. +// - g is null. +// - More than 1 unknown ops encountered. +absl::Status DoQuantizeTraining(int32_t num_bits, const string& quant_op_type, + Graph* g); + +// Converts the input serialized GraphDef and returns a rewritten serialized +// GraphDef for quantized training. +absl::Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph, + int32_t num_bits, + const string& quant_op_type, + string* result_graph); + +// Converts the input GraphDef and returns a rewritten GraphDef for quantized +// training. +absl::Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef, + int32_t num_bits, + const string& quant_op_type, + GraphDef* result_graphdef); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_QUANTIZE_TRAINING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/renamed_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/renamed_device.h new file mode 100644 index 00000000..e4b4b8ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/renamed_device.h @@ -0,0 +1,173 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/lib/core/threadpool_interface.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +// Wraps a device with a new name, delegating work to the wrapped device. +// +// This class is used to wrap local devices when using clusterspec propagation +// where the name of a particular device may change in the context of a given +// session. 
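Returning to quantize_training.h above: a hedged example of calling the GraphDef-level entry point (the wrapper name and parameter choices are illustrative, not prescribed by the header):

```cpp
#include "tensorflow/core/common_runtime/quantize_training.h"
#include "tensorflow/core/framework/graph.pb.h"

absl::Status QuantizeForTraining(const tensorflow::GraphDef& in,
                                 tensorflow::GraphDef* out) {
  // num_bits must be in [1, 63]; FakeQuantWithMinMaxVars is one of the two
  // supported quantization op types listed in the header comment.
  return tensorflow::DoQuantizeTrainingOnGraphDef(
      in, /*num_bits=*/8, "FakeQuantWithMinMaxVars", out);
}
```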
+class RenamedDevice : public Device { + public: + static std::unique_ptr NewRenamedDevice( + const string& new_base, Device* underlying, bool owns_underlying, + bool isolate_session_state, + thread::ThreadPoolInterface* underlying_threadpool = nullptr); + + ~RenamedDevice() override; + + const DeviceBase* UnderlyingDevice() const override { + return underlying_device_->UnderlyingDevice(); + } + DeviceBase* UnderlyingDevice() override { + return underlying_device_->UnderlyingDevice(); + } + + const CpuWorkerThreads* tensorflow_cpu_worker_threads() const override { + if (underlying_threadpool_) { + return Device::tensorflow_cpu_worker_threads(); + } + return underlying_device_->tensorflow_cpu_worker_threads(); + } + + const DeviceBase::AcceleratorDeviceInfo* tensorflow_accelerator_device_info() + const override { + return underlying_device_->tensorflow_accelerator_device_info(); + } + + Allocator* GetAllocator(AllocatorAttributes attr) override { + return underlying_device_->GetAllocator(attr); + } + + Allocator* GetScopedAllocator(AllocatorAttributes attr, + int64_t step_id) override { + return underlying_device_->GetScopedAllocator(attr, step_id); + } + + ScopedAllocatorMgr* GetScopedAllocatorMgr() const override { + return underlying_device_->GetScopedAllocatorMgr(); + } + + const Eigen::ThreadPoolDevice* eigen_cpu_device() override { + // Use the underlying threadpool only if the underlying device supports + // eigen_cpu_device. + if (underlying_threadpool_ && underlying_device_->has_eigen_cpu_device()) { + return Device::eigen_cpu_device(); + } + return underlying_device_->eigen_cpu_device(); + } + + thread::ThreadPool* tensorflow_device_thread_pool() override { + // Use the underlying threadpool instead of tensorflow_device_thread_pool + // of the underlying device only if tensorflow_device_thread_pool is defined + // for the underlying device. 
+ if (underlying_threadpool_ && + underlying_device_->tensorflow_device_thread_pool() != nullptr) { + return Device::tensorflow_device_thread_pool(); + } + return underlying_device_->tensorflow_device_thread_pool(); + } + + bool has_eigen_cpu_device() const override { + return underlying_device_->has_eigen_cpu_device(); + } + + + PerOpGpuDevice* MakeGpuDevice() override { + return underlying_device_->MakeGpuDevice(); + } + + absl::Status ReinitializeGpuDevice(OpKernelContext* context, + PerOpGpuDevice* device, DeviceContext* dc, + Allocator* allocator) override { + return underlying_device_->ReinitializeGpuDevice(context, device, dc, + allocator); + } + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override { + return underlying_device_->MakeTensorFromProto(tensor_proto, alloc_attrs, + tensor); + } + + void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) override { + underlying_device_->CopyTensorInSameDevice(input_tensor, output_tensor, + device_context, std::move(done)); + } + + // Below are virtual methods defined on Device + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override { + underlying_device_->Compute(op_kernel, context); + } + + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override { + underlying_device_->ComputeAsync(op_kernel, context, std::move(done)); + } + + absl::Status Sync() override { return underlying_device_->Sync(); } + + absl::Status MaybeRewriteGraph(std::unique_ptr* graph) override { + return underlying_device_->MaybeRewriteGraph(graph); + } + + absl::Status TryGetDeviceContext(DeviceContext** out_context) override { + return underlying_device_->TryGetDeviceContext(out_context); + } + + // Returns the resource manager associated w/ this device. + ResourceMgr* resource_manager() override { + if (isolate_session_state_) { + return Device::resource_manager(); + } else { + return underlying_device_->resource_manager(); + } + } + + bool IsLocal() const override { return underlying_device_->IsLocal(); } + + bool IsRemoteCallAllowed() const override { + return underlying_device_->IsRemoteCallAllowed(); + } + + private: + RenamedDevice(Device* underlying, const DeviceAttributes& attributes, + bool owns_underlying, bool isolate_session_state, + thread::ThreadPoolInterface* underlying_threadpool); + Device* const underlying_device_; + const bool owns_underlying_device_; + const bool isolate_session_state_; + + std::unique_ptr underlying_threadpool_; + // eigen_worker_threads_ is stored here so that we can pass the pointer + // of eigen_worker_threads_.workers to the parent class. + DeviceBase::CpuWorkerThreads eigen_worker_threads_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_mgr.h new file mode 100644 index 00000000..23c07b3d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_mgr.h @@ -0,0 +1,106 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
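An illustrative sketch of the RenamedDevice::NewRenamedDevice factory declared above (the wrapper function, the device pointer, and the session-name string are placeholders):

```cpp
#include <memory>

#include "tensorflow/core/common_runtime/renamed_device.h"

// Wraps `underlying` under a session-specific name. The wrapper delegates all
// work to `underlying`, which must outlive it because owns_underlying=false.
std::unique_ptr<tensorflow::Device> WrapForSession(
    tensorflow::Device* underlying) {
  return tensorflow::RenamedDevice::NewRenamedDevice(
      "/job:worker/replica:0/task:0", underlying,
      /*owns_underlying=*/false, /*isolate_session_state=*/true);
}
```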
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/local_rendezvous.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// The IntraProcessRendezvous classes are implementations of a Rendezvous that +// expects all producers and consumers to be devices immediately accessible +// within the process. That is, it will never be necessary to perform an RPC to +// communicate with either. +// +// Buffering of Tensor values is delegated to a `LocalRendezvous`. An +// IntraProcessRendezvous. just adds functionality to coordinate multiple +// process-local devices. + +// Reference-counted implementation that may be shared between multiple threads. +class RefCountedIntraProcessRendezvous : public Rendezvous { + public: + explicit RefCountedIntraProcessRendezvous(const DeviceMgr* device_mgr); + + // Implementation of RendezvousInterface methods. + // NOTE: The methods may clear the Item list and destroy 'this' if there are + // no other references to the RefCountedIntraProcessRendezvous object. + // If the caller intend to keep a longer life time then it shall keep its own + // reference to the RefCountedIntraProcessRendezvous. + absl::Status Send(const ParsedKey& key, const Rendezvous::Args& args, + const Tensor& val, const bool is_dead) override; + void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, + DoneCallback done) override; + void StartAbort(const absl::Status& status) override; + + // Returns the member LocalRendezvous' status. + absl::Status GetLocalRendezvousStatus(); + + inline void UpdateDeviceManager(DeviceMgr* device_mgr) { + device_mgr_ = device_mgr; + } + + private: + const DeviceMgr* device_mgr_; // Not owned. + LocalRendezvous local_; + + ~RefCountedIntraProcessRendezvous() override; + + RefCountedIntraProcessRendezvous(const RefCountedIntraProcessRendezvous&) = + delete; + void operator=(const RefCountedIntraProcessRendezvous&) = delete; +}; + +// RefCountedIntraProcessRendezvous is aliased to IntraProcessRendezvous for +// backwards compatibility with existing users. +using IntraProcessRendezvous = RefCountedIntraProcessRendezvous; + +// Non-reference-counted implementation that may be stack-allocated for +// performance. +// +// Prefer to use PrivateIntraProcessRendezvous in new code. +class PrivateIntraProcessRendezvous : public RendezvousInterface { + public: + explicit PrivateIntraProcessRendezvous(const DeviceMgr* device_mgr); + ~PrivateIntraProcessRendezvous() override; + + // Implementation of RendezvousInterface methods. 
+ absl::Status Send(const ParsedKey& key, const Rendezvous::Args& args, + const Tensor& val, const bool is_dead) override; + void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, + DoneCallback done) override; + void StartAbort(const absl::Status& status) override; + + private: + const DeviceMgr* device_mgr_; + LocalRendezvous local_; + + PrivateIntraProcessRendezvous(const PrivateIntraProcessRendezvous&) = delete; + void operator=(const PrivateIntraProcessRendezvous&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_util.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_util.h new file mode 100644 index 00000000..8ed1dd7a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/rendezvous_util.h @@ -0,0 +1,54 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_ + +#include + +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +typedef std::map NamedTensors; +typedef std::function StatusCallback; + +// Uses `rendezvous` to send tensors in `tensors_to_send`. `device_context` +// should be the DeviceContext associated with the source of the tensors. +// `alloc_attrs` contains information about how the `tensors_to_send` are +// allocated. `alloc_attrs` should either be {} or should match the length of +// `keys`. +absl::Status SendTensorsToRendezvous( + RendezvousInterface* rendezvous, DeviceContext* device_context, + const std::vector& alloc_attrs, + const std::vector& keys, absl::Span tensors_to_send); + +// Uses `rendezvous` to obtain tensors. `device_context` should be the +// DeviceContext associated with the receiving device. `alloc_attrs` contains +// information as how to store the received tensors. Should be {} or match the +// length of `keys`. +void RecvOutputsFromRendezvousAsync( + RendezvousInterface* rendezvous, DeviceContext* device_context, + const std::vector& alloc_attrs, + const std::vector& keys, std::vector* received_tensors, + StatusCallback done); + +absl::Status RecvOutputsFromRendezvous(RendezvousInterface* rendezvous, + NamedTensors* out, + const Rendezvous::Args& args); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_constants_pass.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_constants_pass.h new file mode 100644 index 00000000..b215d301 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_constants_pass.h @@ -0,0 +1,50 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
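A rough usage sketch for the intra-process rendezvous above (illustrative only: device names, the incarnation value, and the DeviceMgr pointer are placeholders, and Rendezvous::CreateKey/ParseKey are assumed from the base rendezvous API rather than declared in this header):

```cpp
#include "tensorflow/core/common_runtime/rendezvous_mgr.h"

void SendRecvLocally(const tensorflow::DeviceMgr* device_mgr,
                     const tensorflow::Tensor& value) {
  tensorflow::PrivateIntraProcessRendezvous rendez(device_mgr);

  const std::string key_str = tensorflow::Rendezvous::CreateKey(
      "/job:localhost/replica:0/task:0/device:CPU:0", /*src_incarnation=*/1,
      "/job:localhost/replica:0/task:0/device:CPU:0", "edge_0",
      tensorflow::FrameAndIter(0, 0));
  tensorflow::Rendezvous::ParsedKey key;
  TF_CHECK_OK(tensorflow::Rendezvous::ParseKey(key_str, &key));

  tensorflow::Rendezvous::Args args;  // default device context / alloc attrs
  TF_CHECK_OK(rendez.Send(key, args, value, /*is_dead=*/false));
  rendez.RecvAsync(key, args,
                   [](const absl::Status& s,
                      const tensorflow::Rendezvous::Args& /*send_args*/,
                      const tensorflow::Rendezvous::Args& /*recv_args*/,
                      const tensorflow::Tensor& /*t*/, bool /*is_dead*/) {
                     TF_CHECK_OK(s);
                   });
}
```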
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +// Small constants are replicated to the hosts of their successors. This pass +// only applies when there are multiple successors. +// +// For example, the graph: +// C -> {Op0, Op1, Op2, Op3} +// C's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0 +// Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0 +// Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1 +// Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0 +// Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1 +// is rewritten to: +// C0 -> {Op0, Op1} +// C1 -> {Op2, Op3} +// C0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0 +// C1's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:CPU:0 +// Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0 +// Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1 +// Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0 +// Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1 + +namespace tensorflow { + +class ReplicateConstantsPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_per_replica_nodes.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_per_replica_nodes.h new file mode 100644 index 00000000..4be95ea3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/replicate_per_replica_nodes.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
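As a hedged illustration of how a pass like ReplicateConstantsPass above gets wired in (the grouping and the phase number here are arbitrary examples, not the values TensorFlow actually registers it with):

```cpp
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/common_runtime/replicate_constants_pass.h"

namespace tensorflow {
// Registers the pass with the global optimization-pass registry; phase 3 is an
// illustrative placeholder.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 3,
                      ReplicateConstantsPass);
}  // namespace tensorflow
```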
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_PER_REPLICA_NODES_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_PER_REPLICA_NODES_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// `composite_device` maps from a virtual device to a set of devices. +// In a function graph, for each node assigned to a composite device +// (representing N devices), replace it with N replicated nodes (one per +// device). +// REQUIREMENTS: +// 1) Each node has been assigned to a device (including composite device). +// 2) Each cluster of nodes assigned to a composite device should include at +// least one "_Arg" node. +// composite device. +// 3) Clusters assigned to different composite devices should have no data +// dependency. +// TODO(b/145922293): Register it as a POST_REWRITE_FOR_EXEC pass. +absl::Status ReplicatePerReplicaNodesInFunctionGraph( + const absl::flat_hash_map*>& + composite_devices, + Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_PER_REPLICA_NODES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost.h new file mode 100644 index 00000000..3cb40ec8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_H_ + +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" + +namespace tensorflow { + +// RequestCost collects the costs and metrics for processing an rpc request. +class RequestCost { + public: + // Records costs. The inputs should be pairs of cost type and cost. + // It's thread-safe, and can be called from different threads. + void RecordCost( + const std::vector>& costs); + + // Scales all types of costs for processing an rpc request. + // It's thread-safe. It's expected to be called at the end of processing an + // rpc request, when all the costs have been collected. + void ScaleCosts(int scale_factor); + + // Gets all types of costs for processing an rpc request. + // It's thread-safe. It's expected to be called at the end of processing an + // rpc request, when all the costs have been collected. + absl::flat_hash_map GetCosts() const; + + // Records metrics. The inputs should be pairs of metric name and value. + // It's thread-safe, and can be called from different threads. 
Unlike + // RecordCosts where costs are summed up if recorded with the same key, + // metrics are replaced. + void RecordMetrics( + const std::vector>& metrics); + + // Gets all types of metrics for processing an rpc request. + // It's thread-safe. It's expected to be called at the end of processing an + // rpc request, when all the metrics have been collected. + absl::flat_hash_map GetMetrics() const; + + // Metrics of each batch that processes this rpc request. + struct BatchMetrics { + // Size of the batch. + int64_t processed_size = 0; + // In this batch, input size from this rpc request. + int64_t input_size = 0; + // In this batch, the padding amount. + int64_t padding_size = 0; + // Costs for processing this batch. + absl::flat_hash_map batch_costs; + }; + + // Records the metrics of a batch. + // It's thread-safe, and can be called from different threads. It may be + // called multiple times if a request is processed by more than one batches. + void RecordBatchMetrics(const BatchMetrics& batch_metrics); + + // Scales costs of all the batches that process this rpc request. + // It's thread-safe. It's expected to be called at the end of processing an + // rpc request, when all batch processing has completed. + void ScaleBatchCosts(int scale_factor); + + // Get metrics of all the batches that process this rpc request. + // It's thread-safe. It's expected to be called at the end of processing an + // rpc request, when all batch processing has completed. + std::vector GetBatchMetrics() const; + + private: + mutable absl::Mutex mutex_; + + // Query costs. Map from cost type to cost. + absl::flat_hash_map cost_map_ + ABSL_GUARDED_BY(mutex_); + // Query metrics. Map from metric name to value. + absl::flat_hash_map metric_map_ ABSL_GUARDED_BY(mutex_); + + // Metrics of batches that process this rpc request. + std::vector batch_metrics_ ABSL_GUARDED_BY(mutex_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor.h new file mode 100644 index 00000000..ba64da4b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/time/time.h" +#include "tensorflow/core/common_runtime/request_cost.h" + +namespace tensorflow { + +// An interface for accessing the RequestCost associated with the current rpc +// request. 
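A short sketch of recording per-request costs with the RequestCost API above (the cost and metric names are arbitrary, and the batch_costs value type is assumed to be an absl::Duration keyed by cost name):

```cpp
#include "absl/time/time.h"
#include "tensorflow/core/common_runtime/request_cost.h"

void RecordExampleCosts(tensorflow::RequestCost* request_cost) {
  // Costs with the same key are summed across calls.
  request_cost->RecordCost({{"tpu_compute", absl::Milliseconds(7)},
                            {"host_compute", absl::Milliseconds(2)}});
  // Metrics with the same key are replaced, not summed.
  request_cost->RecordMetrics({{"batch_size", 32.0}});

  tensorflow::RequestCost::BatchMetrics batch;
  batch.processed_size = 64;
  batch.input_size = 32;
  batch.padding_size = 0;
  batch.batch_costs["tpu_compute"] = absl::Milliseconds(7);
  request_cost->RecordBatchMetrics(batch);
}
```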
+class RequestCostAccessor { + public: + virtual ~RequestCostAccessor() {} + virtual RequestCost* GetRequestCost() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor_registry.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor_registry.h new file mode 100644 index 00000000..6e91678f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/request_cost_accessor_registry.h @@ -0,0 +1,71 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_REGISTRY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_REGISTRY_H_ + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/request_cost_accessor.h" + +namespace tensorflow { + +// TODO(b/185852990): Create a template Registry that allows registering +// different types (e.g RequestCostAccessor, CostMeasurement). +// +// RequestCostAccessorRegistry allows to +// - register a RequestCostAccessor type to the global map +// - create an instance of registered RequestCostAccessor. +class RequestCostAccessorRegistry { + public: + // Creates an instance of registered RequestCostAccessor by name. If the named + // RequestCostAccessor is not registered yet, returns nullptr. + static std::unique_ptr CreateByNameOrNull( + absl::string_view name); + + using Creator = std::function()>; + + // Registers a RequestCostAccessor type to the global map. Registering + // different types of RequestCostAccessor with the same name is prohibited. + static void RegisterRequestCostAccessor(absl::string_view name, + Creator creator); +}; + +// Registers a RequestCostAccessor type to the global map. Registering different +// types of RequestCostAccessor with the same name is prohibited. 
+class RequestCostAccessorRegistrar { + public: + explicit RequestCostAccessorRegistrar( + absl::string_view name, RequestCostAccessorRegistry::Creator creator) { + RequestCostAccessorRegistry::RegisterRequestCostAccessor( + name, std::move(creator)); + } +}; + +#define REGISTER_REQUEST_COST_ACCESSOR(name, MyRequestCostAccessorClass) \ + namespace { \ + static ::tensorflow::RequestCostAccessorRegistrar \ + MyRequestCostAccessorClass##_registrar((name), [] { \ + return std::make_unique(); \ + }); \ + } // namespace + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_REQUEST_COST_ACCESSOR_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_alg.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_alg.h new file mode 100644 index 00000000..df907258 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_alg.h @@ -0,0 +1,121 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +class Device; + +// Basic ring-algorithm implementation to be further specialized +// for specific collective functions. +class RingAlg : public CollectiveImplementationInterface { + public: + explicit RingAlg(CollectiveType type, const string& name); + ~RingAlg() override {} + + // Establishes the requested number of subdivision permutations based on the + // ring order implicit in the device order. + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override; + + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. Also saves the CollectiveContext in this object. + absl::Status InitializeCollectiveContext( + std::shared_ptr col_ctx) override; + + protected: + // Called when a bad status is received that implies we should terminate + // execution and return a bad status. + void StartAbort(const absl::Status& s); + void Finish(bool ok); + + // Current status of a RingField + enum RingFieldAction { + RF_INIT = 0, // Just initialized for a pass + RF_RECV, // Recv pending + RF_REDUCE, // Reduce pending + RF_FINALIZE, // FinalOp pending + RF_SEND_READY, // Ready to send + RF_SEND, // Send pending + RF_DONE, // No more work + }; + + // Tracks progress of actions on a single subfield of the entire tensor. 
+ struct RingField { + int16 chunk_idx; // major division index + int16 subdiv_idx; // minor division index + int16 sc_idx; // subchunk index + int16 rank; // rank within subdiv permutation + int16 recv_dev_idx; // dev from which value should be recv'd + RingFieldAction action; + bool second_pass; + bool recv_is_remote = false; + bool send_is_remote = false; + bool do_send = false; // is the value sent in this pass? + bool do_recv = false; // is the value recv'd in this pass? + bool is_final = false; // is the last field in the pass for this rank + Tensor chunk; // alias to field values + Tensor tmp_chunk; + absl::Status status; + string DebugString() const; + }; + virtual void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx, + int field_idx); + void AdvanceToSecondPass(RingField* rf); + void DispatchSend(RingField* rf, const StatusCallback& done); + void DispatchRecv(RingField* rf, const StatusCallback& done); + + // For constructing log messages for debugging. + string FieldState(); + string TensorDebugString(const Tensor& tensor); + + // Producer/Consumer Queue of RingField structs. + class PCQueue { + public: + void Enqueue(RingField* rf); + RingField* Dequeue(); + + private: + mutex pcq_mu_; + condition_variable cv_; + int waiter_count_ TF_GUARDED_BY(pcq_mu_) = 0; + std::deque deque_ TF_GUARDED_BY(pcq_mu_); + }; + + const CollectiveType type_; + const string name_; + std::shared_ptr col_ctx_; + const CollectiveParams* col_params_; // Not owned + StatusCallback done_; + int group_size_; + int num_subdivs_; + Tensor group_size_tensor_; + Notification group_size_tensor_ready_; + std::unique_ptr ca_; + mutex status_mu_; + absl::Status status_ TF_GUARDED_BY(status_mu_); + std::vector rfv_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_gatherer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_gatherer.h new file mode 100644 index 00000000..ac894a38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_gatherer.h @@ -0,0 +1,52 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/common_runtime/ring_alg.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +class Device; + +// Ring-algorithm implementation of collective all-gather. +class RingGatherer : public RingAlg { + public: + RingGatherer() : RingAlg(GATHER_COLLECTIVE, "Gather") {} + ~RingGatherer() override {} + + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override; + + // Begins async execution of the ring gather algorithm. 
+ // Must be called in a blockable thread. + // TODO(b/80529858): remove the previous warning when we have a dedicated + // collective threadpool. + void Run(StatusCallback done) override; + + private: + bool RunAsyncParts(); + + friend class RingGathererTest; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_reducer.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_reducer.h new file mode 100644 index 00000000..77317235 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/ring_reducer.h @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/base_collective_executor.h" +#include "tensorflow/core/common_runtime/ring_alg.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +class Device; + +// Ring-algorithm implementation of collective all-reduce. +class RingReducer : public RingAlg { + public: + RingReducer() : RingAlg(REDUCTION_COLLECTIVE, "Reduce") {} + ~RingReducer() override; + + // Begins async execution of the ring reduce algorithm. + // Must be called in a blockable thread. + // TODO(b/80529858): remove the previous warning when we have a dedicated + // collective threadpool. + void Run(StatusCallback done) override; + + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override; + + protected: + void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx, + int field_idx) override; + + private: + void ContinueAfterInputCopy(); + bool RunAsyncParts(); + + Tensor group_size_tensor_; + Notification group_size_tensor_ready_; + + friend class RingReducerTest; + friend class RingReducerInitParamsTest; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator.h new file mode 100644 index 00000000..5b22deb2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator.h @@ -0,0 +1,127 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +class ScopedAllocatorContainer; +class ScopedAllocatorInstance; + +// Manages a single backing tensor and a collection of aliases. +class ScopedAllocator { + public: + static constexpr int32_t kInvalidId = 0; + static constexpr size_t kMaxAlignment = 64; + + // A subrange of the TensorBuffer associated with this object that + // will be the backing memory for one aliased tensor. + struct Field { + int32 scope_id; + size_t offset; + size_t bytes_requested; + size_t bytes_allocated; + }; + // Field index that refers to backing tensor, not any aliased field. + static constexpr int32_t kBackingIndex = -1; + + // backing_tensor is expected to be newly allocated by a ScopedAllocatorOp + // instance. It must be large enough to back all of the specified + // (offset, byte) ranges of the fields. + ScopedAllocator(const Tensor& backing_tensor, int32_t scope_id, + const std::string& name, const absl::Span fields, + int32_t expected_call_count, + ScopedAllocatorContainer* container); + + // Automatically deletes when last use expires, or when + // ScopedAllocatorContainer decides to delete. + ~ScopedAllocator() TF_LOCKS_EXCLUDED(mu_); + + // For debugging: returns true iff p is a pointer that could have + // been returned by AllocateRaw. + bool VerifyPointer(const void* p); + bool VerifyTensor(const Tensor* t); + + const Tensor& tensor() const { return backing_tensor_; } + + const std::string& name() const { return name_; } + + private: + friend class ScopedAllocatorInstance; + // Only ScopedAllocatorInstances can call AllocateRaw and DeallocateRaw on a + // ScopedAllocator + void* AllocateRaw(int32_t field_index, size_t num_bytes) + TF_LOCKS_EXCLUDED(mu_); + void DeallocateRaw(void* p) TF_LOCKS_EXCLUDED(mu_); + Tensor backing_tensor_; + TensorBuffer* tbuf_; + int32 id_; + std::string name_; + ScopedAllocatorContainer* container_; + std::vector fields_; + mutex mu_; + int32 expected_call_count_ TF_GUARDED_BY(mu_); + int32 live_alloc_count_ TF_GUARDED_BY(mu_); +}; + +// An Allocator that will return a pointer into the backing buffer of +// a previously allocated tensor, allowing creation of an alias +// tensor. There is a one-to-one mapping between the fields of a +// ScopedAllocator and ScopedAllocatorInstances. There is also a one-to-one +// mapping between scope_ids and ScopedAllocatorInstances. It should be +// discarded immediately after a single use. +class ScopedAllocatorInstance : public Allocator { + public: + explicit ScopedAllocatorInstance(ScopedAllocator* sa, int32_t field_index); + + private: + ~ScopedAllocatorInstance() override { + VLOG(1) << "~ScopedAllocatorInstance " << this; + } + + public: + // When a ScopedAllocatorContainer "Drops" a scope_id, it calls DropFromTable + // on the underlying ScopedAllocatorInstance. If this instance has already + // deallocated the tensor slice, we can safely delete this. 
+ void DropFromTable() TF_LOCKS_EXCLUDED(mu_); + void* AllocateRaw(size_t alignment, size_t num_bytes) + TF_LOCKS_EXCLUDED(mu_) override; + void* AllocateRaw(size_t alignment, size_t num_bytes, + const AllocationAttributes& allocator_attr) override { + return AllocateRaw(alignment, num_bytes); + } + void DeallocateRaw(void* p) TF_LOCKS_EXCLUDED(mu_) override; + bool TracksAllocationSizes() const override { return false; } + size_t RequestedSize(const void* ptr) const override { return 0; } + size_t AllocatedSize(const void* ptr) const override { return 0; } + int64_t AllocationId(const void* ptr) const override { return 0; } + size_t AllocatedSizeSlow(const void* ptr) const override { return 0; } + std::string Name() override; + + private: + mutex mu_; + ScopedAllocator* scoped_allocator_; + int32 field_index_; + bool allocated_ TF_GUARDED_BY(mu_); + bool deallocated_ TF_GUARDED_BY(mu_); + bool in_table_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator_mgr.h new file mode 100644 index 00000000..dbbf7c32 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/scoped_allocator_mgr.h @@ -0,0 +1,111 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/scoped_allocator.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +class ScopedAllocatorMgr; + +// At most one of these exists per pair. +// A Ref is held by every ScopedAllocator and also by the ScopedAllocatorMgr. +class ScopedAllocatorContainer : public core::RefCounted { + public: + // Establishes a reachable ScopedAllocator. + absl::Status AddScopedAllocator( + const Tensor& backing_tensor, int32_t scope_id, + const std::string& scope_name, + const absl::Span& fields, + int32_t expected_call_count); + + ScopedAllocatorInstance* GetInstance(int32_t scope_id); + ScopedAllocator* GetAllocator(int32_t scope_id); + + // Retire the scope_id. 
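// An illustrative sketch, not from the upstream header, of the per-step
// container flow. It assumes `container` came from
// ScopedAllocatorMgr::GetContainer(step_id), `backing` is a freshly allocated
// backing tensor, and `fields` is a non-empty alias layout as produced by
// ScopedAllocatorMgr::PopulateFields; the scope id and name are hypothetical.
#include "absl/types/span.h"
#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
#include "tensorflow/core/platform/errors.h"

absl::Status SetUpAliases(
    tensorflow::ScopedAllocatorContainer* container,
    const tensorflow::Tensor& backing,
    absl::Span<const tensorflow::ScopedAllocator::Field> fields) {
  constexpr int32_t kScopeId = 100;  // id of the backing ScopedAllocator
  TF_RETURN_IF_ERROR(container->AddScopedAllocator(
      backing, kScopeId, "example_scope", fields,
      /*expected_call_count=*/static_cast<int32_t>(fields.size())));
  // Each field's scope_id names a single-use allocator aliasing a slice of
  // `backing`; it would normally be handed to a Tensor constructor.
  tensorflow::ScopedAllocatorInstance* instance =
      container->GetInstance(fields[0].scope_id);
  (void)instance;
  return absl::OkStatus();
}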
+ void Drop(int32_t scope_id, ScopedAllocator* sa); + + protected: + friend class ScopedAllocatorMgr; + ScopedAllocatorContainer(const ScopedAllocatorMgr* mgr, int64_t step_id) + : mgr_(mgr), step_id_(step_id) {} + ~ScopedAllocatorContainer(); + + private: + const ScopedAllocatorMgr* mgr_; + int64_t step_id_; + mutex mu_; + struct SAField { + int32 field_index; + union { + ScopedAllocator* scoped_allocator; + ScopedAllocatorInstance* instance; + }; + SAField(int32_t fi, ScopedAllocatorInstance* sai) + : field_index(fi), instance(sai) {} + SAField(int32_t fi, ScopedAllocator* sa) + : field_index(fi), scoped_allocator(sa) {} + SAField() + : field_index(ScopedAllocator::kBackingIndex), + scoped_allocator(nullptr) {} + }; + std::unordered_map allocators_ TF_GUARDED_BY(mu_); +}; + +// At most one of these exists per device. +class ScopedAllocatorMgr { + public: + explicit ScopedAllocatorMgr(const std::string& device_name) + : device_name_(device_name) {} + ~ScopedAllocatorMgr(); + + ScopedAllocatorContainer* GetContainer(int64_t step_id); + + // Establishes a reachable ScopedAllocator. + absl::Status AddScopedAllocator( + const Tensor& backing_tensor, int64_t step_id, int32_t scope_id, + const std::string& scope_name, + const absl::Span& fields, + int32_t expected_call_count); + + void Cleanup(int64_t step_id); + + // Populate the bytes and offset members of Field. Instance allocaters get + // consecutive scope_id values following that of the base ScopedAllocator. + // Returns the total number of bytes required to be allocated in the + // backing tensor, for convenience. (The same value can be obtained + // by summing offset and bytes in the last field.) + static size_t PopulateFields(int32_t scope_id, + const absl::Span& shapes, + const DataType dtype, + std::vector* fields); + + const std::string& device_name() const { return device_name_; } + + private: + std::string device_name_; + mutex mu_; + std::unordered_map per_step_map_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/session_factory.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/session_factory.h new file mode 100644 index 00000000..ffadb29a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/session_factory.h @@ -0,0 +1,76 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_ + +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Session; +struct SessionOptions; + +class SessionFactory { + public: + // Creates a new session and stores it in *out_session, or fails with an error + // status if the Session could not be created. Caller takes ownership of + // *out_session if this returns OkStatus(). + virtual absl::Status NewSession(const SessionOptions& options, + Session** out_session) = 0; + + virtual bool AcceptsOptions(const SessionOptions& options) = 0; + + // Abort and close all existing sessions, disconnecting their resources from + // future sessions. + // + // Reset() allows misbehaving or slow sessions to be aborted and closed, and + // causes their resources eventually to be released. Reset() does not wait + // for the computations in old sessions to cease; it merely starts the + // process of tearing them down. However, if a new session is started after + // a Reset(), the new session is isolated from changes that old sessions + // (started prior to the Reset()) may continue to make to resources, provided + // all those resources are in containers listed in "containers". + // + // Old sessions may continue to have side-effects on resources not in + // containers listed in "containers", and thus may affect future + // sessions' results in ways that are hard to predict. Thus, if well-defined + // behavior is desired, is it recommended that all containers be listed in + // "containers". + // + // If the "containers" vector is empty, the default container is assumed. + // If the "containers" vector is non-empty, the default container should be + // listed explicitly. + // + // Sessions that support resource containers should override this function. + virtual absl::Status Reset(const SessionOptions& options, + const std::vector& containers) { + return errors::Unimplemented("Reset()"); + } + + virtual ~SessionFactory() {} + static void Register(const string& runtime_type, SessionFactory* factory); + static absl::Status GetFactory(const SessionOptions& options, + SessionFactory** out_factory); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/shape_refiner.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/shape_refiner.h new file mode 100644 index 00000000..580dafb0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/shape_refiner.h @@ -0,0 +1,293 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/common_runtime/graph_runner.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace grappler { +class GraphProperties; +} + +// ShapeRefiner performs shape inference for TensorFlow Graphs. It is +// responsible for instantiating InferenceContext objects for each +// Node in the Graph, and providing/storing the 'input_tensor' Tensors +// used by Shape Inference functions, when available at graph +// construction time. +class ShapeRefiner { + public: + ShapeRefiner(int graph_def_version, const OpRegistryInterface* ops); + + // Same as ShapeRefiner(versions.producer(), ops) + ShapeRefiner(const VersionDef& versions, const OpRegistryInterface* ops); + + ~ShapeRefiner(); + + // Performs validation of 'node' and runs 'node's shape function, + // storing its shape outputs. + // + // All inputs of 'node' must be added to ShapeRefiner prior to + // adding 'node'. + // + // Returns an error if: + // - the shape function for 'node' was not registered. + // - 'node' was added before its inputs. + // - The shape inference function returns an error. + absl::Status AddNode(const Node* node); + + // Sets 'node's 'output_port' output to have shape 'shape'. + // + // Returns an error if 'node' was not previously added to this + // object, if 'output_port' is invalid, or if 'shape' is + // not compatible with the existing shape of the output. + absl::Status SetShape(const Node* node, int output_port, + shape_inference::ShapeHandle shape); + + // Update the input shapes of node in case the shapes of the fan-ins of 'node' + // have themselves been modified (For example, in case of incremental shape + // refinement). If 'relax' is true, a new shape with the broadest set of + // information will be set as the new input (see InferenceContext::RelaxInput + // for full details and examples). Sets refined to true if any shapes have + // changed (in their string representations). Note that shapes may have been + // updated to newer versions (but with identical string representations) even + // if <*refined> is set to false. + absl::Status UpdateNode(const Node* node, bool relax, bool* refined); + + // Returns the InferenceContext for 'node', if present. + shape_inference::InferenceContext* GetContext(const Node* node) const { + auto it = node_to_context_.find(node); + if (it == node_to_context_.end()) { + return nullptr; + } + return it->second.get(); + } + + // Getters and setters for graph_def_version_. + int32 graph_def_version() const { return graph_def_version_; } + void set_graph_def_version(int32_t version) { graph_def_version_ = version; } + + void set_require_shape_inference_fns(bool require_shape_inference_fns) { + require_shape_inference_fns_ = require_shape_inference_fns; + } + void set_disable_constant_propagation(bool disable) { + disable_constant_propagation_ = disable; + } + + // Set function library to enable function shape inference. + // Without function library, function inference always yields unknown shapes. 
+ // With this enabled, shape inference can take more time since it descends + // into all function calls. It doesn't do inference once for each function + // definition, but once for each function call. + // The function library must outlive the shape refiner. + void set_function_library_for_shape_inference( + const tensorflow::FunctionLibraryDefinition* lib) { + function_library_ = lib; + } + + bool function_shape_inference_supported() const { + return function_library_ != nullptr; + } + + private: + friend class ShapeRefinerTest; + friend class ::tensorflow::grappler::GraphProperties; + + // Returns true if the ranks and all dimensions of and are either + // equal in value or both unknown. + static bool SameDefinedShape(shape_inference::InferenceContext* c, + shape_inference::ShapeHandle s0, + shape_inference::ShapeHandle s1); + + // Returns true if the shapes and types stored in <*existing> are identical in + // value to the shapes and types in <*updated>. + static bool IsUpdatedShapesOrTypes( + shape_inference::InferenceContext* c, + const std::vector& existing, + const std::vector& updated); + + // Performs shape inference for the given function_def within the + // given outer_context. Internally it instantiates the function as a graph + // and runs shape inference recursively on it with the input shapes provided + // by the outer_context. + // + // Returns an error if: + // - number of inputs/outputs on outer_context doesn't match the function_def + // + // On success: + // - outer_context will contain output shapes inferred from input shapes + absl::Status InferShapesForFunction( + const FunctionDef* function_def, AttrSlice attributes, + shape_inference::InferenceContext* outer_context); + + // Performs shape inference for a node inside a function. + // + // 'outer_context' is the 'InferenceContext' for the function's call op. + absl::Status InferShapesForFunctionSubNode( + const Node* node, shape_inference::InferenceContext* outer_context); + + // Performs validation of 'node' and runs 'node's shape function, + // storing its shape outputs. + // + // All inputs of 'node' must be added to ShapeRefiner prior to + // adding 'node'. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. + // + // Returns an error if: + // - the shape function for 'node' was not registered. + // - 'node' was added before its inputs. + // - The shape inference function returns an error. + absl::Status AddNodeInternal( + const Node* node, shape_inference::InferenceContext* outer_context); + + // Attempts to evaluate the 'dst_idx'-th input to 'node'. If the input edge + // value can be evaluated, 'evaluated' is set to true and the value returned + // in 'result'. Otherwise 'evaluated' is set to false. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. 
+ absl::Status EvaluateConstantTensorForEdge( + const Node* node, int dst_idx, bool* evaluated, Tensor* result, + shape_inference::InferenceContext* outer_context); + + // Wrapper around EvaluateConstantTensorForEdge for scalar int32/int64 input + // tensors. The caller is responsible for checking that the specified edge is + // scalar and int32 or int64. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. + absl::Status EvaluateConstantIntScalarEdge( + const Node* node, int dst_idx, bool* evaluated, int64_t* result, + shape_inference::InferenceContext* outer_context); + + // This function tries to materialize as much information about the 'node''s + // dst_idx input as a statically computable shape, and the result may be + // partially known, depending on what is statically inferable. + // + // This is called when node.input[dst_idx] is a tensor that is used to define + // the shape of some other tensor (e.g., the second argument to Reshape is a + // tensor, where each element of the shape tensor is a dimension of + // the target tensor). It returns in a shape for that input. + // + // Unlike simply resolving node.input[dst_idx] to a constant and then + // converting that to a shape, this function can return a partial shape. This + // is useful for cases where the shape tensor is only partially defined, such + // as with calls for: reshape(x, shape(y)) where shape(y) is partially + // defined. + // + // The implementation has op implementations for ops commonly called on shape + // tensors, and the implementations are specialized to shape tensors (namely, + // the output is a vector). + // + // is used when creating new DimensionHandle and ShapeHandle + // objects. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. + absl::Status ConstantPartialShape( + shape_inference::InferenceContext* target_context, const Node* node, + int dst_idx, shape_inference::ShapeHandle* result, + shape_inference::InferenceContext* outer_context); + + // Implementation of ConstantPartialShape for StridedSlice nodes. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. + absl::Status PartialStridedSliceShape( + Node* slice_node, shape_inference::InferenceContext* ctx, + shape_inference::ShapeHandle* result, + shape_inference::InferenceContext* outer_context); + + // Runs the shape function registered for the node's op type. + // + // Optionally, if 'node' is in a nested function, the 'InferenceContext' for + // the call op of the function can be passed as 'outer_context' (pass nullptr + // otherwise). This gets used to perform constant propagation across Arg nodes + // by requesting the constant of value of the incoming tensor from the + // 'outer_context'. 
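// An illustrative sketch, not from the upstream header, of typical public-API
// use while building a graph: every node is added after its inputs, then its
// inferred output shapes can be inspected. `order` is assumed to be a
// topologically sorted node list; TF_GRAPH_DEF_VERSION and OpRegistry::Global()
// are the usual constructor arguments.
#include <vector>
#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/version.h"

absl::Status InferGraphShapes(const std::vector<tensorflow::Node*>& order) {
  tensorflow::ShapeRefiner refiner(TF_GRAPH_DEF_VERSION,
                                   tensorflow::OpRegistry::Global());
  for (tensorflow::Node* node : order) {
    TF_RETURN_IF_ERROR(refiner.AddNode(node));
    tensorflow::shape_inference::InferenceContext* ctx = refiner.GetContext(node);
    if (ctx != nullptr && ctx->num_outputs() > 0) {
      VLOG(2) << node->name() << " output 0: " << ctx->DebugString(ctx->output(0));
    }
  }
  return absl::OkStatus();
}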
+ absl::Status RunShapeFn( + const Node* node, const OpRegistrationData* op_reg_data, + shape_inference::InferenceContext* context, + shape_inference::InferenceContext* outer_context = nullptr); + + int32 graph_def_version_; + const OpRegistryInterface* const ops_registry_; + + // The lifetime of the tensors are bound to the runner, so it should be the + // deleted after the tensors. + GraphRunner graph_runner_; + + // Stores a map from a node to its InferenceContext. + absl::flat_hash_map, + hash> + node_to_context_; + + // Holds a cache from tensor id (node id:node output) to the tensor that + // is evaluable as a constant expression. This reduces repeated execution + // of the entire constant subgraph as a graph is being built up. This could + // be changed to some kind of size-based LRU cache to avoid consuming too much + // memory, if that eventually becomes a concern. + // + // Only tensors less than 1KiB are currently stored in the cache. + static constexpr int64_t kMaxTensorSize = 1024; + absl::flat_hash_map, Tensor> const_tensor_map_; + + bool require_shape_inference_fns_ = true; + bool disable_constant_propagation_ = false; + + // Function library is optional, but has to be set to enable function + // shape inference. + const tensorflow::FunctionLibraryDefinition* function_library_ = nullptr; + + // Cache the graph corresponding to each function definition for which shapes + // are refined. + absl::flat_hash_map> functions_; + + ShapeRefiner(const ShapeRefiner&) = delete; + void operator=(const ShapeRefiner&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/shared_counter.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/shared_counter.h new file mode 100644 index 00000000..d40f24f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/shared_counter.h @@ -0,0 +1,26 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_ + +#include + +#include "xla/tsl/framework/shared_counter.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::SharedCounter; // NOLINT +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/simple_propagator_state.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/simple_propagator_state.h new file mode 100644 index 00000000..9f465ef1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/simple_propagator_state.h @@ -0,0 +1,190 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLE_PROPAGATOR_STATE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLE_PROPAGATOR_STATE_H_ + +#include + +#include "tensorflow/core/common_runtime/entry.h" +#include "tensorflow/core/common_runtime/immutable_executor_state.h" +#include "tensorflow/core/common_runtime/pending_counts.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Represents the ephemeral "edge state" associated with one invocation of +// `Executor::Run()`. +// +// NOTE: `SimplePropagatorState` does not support "v1-style" control flow, +// including "dead tensors", "Switch" and "Merge" nodes, and cycles in the +// graph. Use `PropagatorState` for graphs with those features. +// `SimplePropagatorState` *does* support "v2-style" or "functional" control +// flow. +// +// `SimplePropagatorState` is responsible for propagating values along dataflow +// edges in a TensorFlow graph and determining which nodes are runnable. The +// executor primarily updates `SimplePropagatorState` by calling +// `PropagateOutputs()` after processing a node, and `SimplePropagatorState` +// dispatches `TaggedNode`s by adding them to a `TaggedNodeSeq`. +class SimplePropagatorState { + public: + SimplePropagatorState(const ImmutableExecutorState& immutable_state, + int64_t step_id, bool vlog); + ~SimplePropagatorState(); + + // A `TaggedNode` corresponds to a single invocation of a node's kernel, + // and it is created when the kernel becomes runnable. + struct TaggedNode { + const NodeItem* node_item; + + explicit TaggedNode(const NodeItem* node_item) : node_item(node_item) {} + + const NodeItem& get_node_item() const { return *node_item; } + + bool get_is_dead() const { return false; } + int64_t get_iter_num() const { return 0; } + }; + + // A drop-in replacement for std::deque. We typically don't + // have that many nodes in the ready queue, so we just use a vector and + // don't free up memory from the queue as we consume nodes. + // TODO(mrry): Extract this and share it with the version in + // `PropagatorState`. The correct constants might be different, since + // sizeof(TaggedNode) is smaller in this version. + class TaggedNodeReadyQueue { + public: + TaggedNodeReadyQueue() : front_index_(0) {} + + void push_back(const TaggedNode& node) { ready_.push_back(node); } + TaggedNode front() const { + DCHECK_LT(front_index_, ready_.size()); + return ready_[front_index_]; + } + void pop_front() { + DCHECK_LT(front_index_, ready_.size()); + front_index_++; + if ((front_index_ == ready_.size()) || (front_index_ > kSpillThreshold)) { + if (front_index_ == ready_.size()) { + ready_.clear(); + } else { + // Lots of unused entries at beginning of vector: move everything + // down to start of vector. 
+ ready_.erase(ready_.begin(), ready_.begin() + front_index_); + } + front_index_ = 0; + } + } + bool empty() const { return ready_.empty(); } + int size() const { return ready_.size() - front_index_; } + + private: + // TODO(b/152925936): Re-evaluate these constants with current usage + // patterns. + static constexpr int kSpillThreshold = 16384; + absl::InlinedVector ready_; + int front_index_; + }; + + // TODO(b/152925936): Re-evaluate this constant with current usage patterns. + typedef absl::InlinedVector TaggedNodeSeq; + + // Creates and adds a `TaggedNode` for each node in `roots` to `*ready`. + void ActivateRoots(gtl::ArraySlice roots, + TaggedNodeSeq* ready); + + // After processing the outputs, propagates the outputs to their dsts. + // Contents of *outputs are left in an indeterminate state after + // returning from this method. + void PropagateOutputs(const TaggedNode& tagged_node, EntryVector* outputs, + TaggedNodeSeq* ready); + + // Returns an array of `Entry` objects corresponding to the inputs of + // `tagged_node`. + Entry* GetInputTensors(const TaggedNode& tagged_node) { +#if defined(THREAD_SANITIZER) || defined(DEBUG) + // NOTE: This read of `pending_[...]` works around a limitation in TSAN. + // To avoid false positive data race reports, we need to perform an atomic + // object access that will establish the happens-before relation between + // the write to input_tensors_ in `PropagateOutputs()` and the read in + // `PrepareInputs()`. + CHECK_EQ(pending_[tagged_node.node_item->node_id], 0); +#endif // defined(THREAD_SANITIZER) || defined(DEBUG) + return input_tensors_.data() + tagged_node.node_item->input_start; + } + + FrameAndIter GetFrameAndIter(const TaggedNode& tagged_node) const { + return {0, 0}; + } + + // Provide debugging output of the state of the executor. + void DumpState(); + + // For debugging/logging only. + void MaybeMarkStarted(const TaggedNode& tagged_node) { + // TODO(misard) Replace with a finer-grain enabling flag once we add better + // optional debugging support. + if (TF_PREDICT_FALSE(vlog_) && VLOG_IS_ON(1)) { + mutex_lock l(mu_); + (*active_)[tagged_node.node_item->node_id] = true; + } + } + void MaybeMarkCompleted(const TaggedNode& tagged_node) { + // TODO(misard) Replace with a finer-grain enabling flag once we add better + // optional debugging support. + if (TF_PREDICT_FALSE(vlog_) && VLOG_IS_ON(1)) { + mutex_lock l(mu_); + (*active_)[tagged_node.node_item->node_id] = false; + } + } + + private: + SimplePropagatorState(const ImmutableExecutorState& immutable_state_, + int64_t step_id, + const ImmutableExecutorState::FrameInfo& finfo, + bool vlog); + + const ImmutableExecutorState& immutable_state_; + const int64_t step_id_; + const bool vlog_; + + // The i-th node's j-th input is stored at + // `input_tensors[impl_->nodes[i].input_start + j]`. + // + // NOTE: No need to protect input_tensors[i] by any locks because it + // is resized once. Each element of input_tensors is written once by the + // source node of an edge and is cleared by the destination of the same + // edge. The destination node always runs after the source node, so there + // is never concurrent access to the same entry. + std::vector input_tensors_; + + std::unique_ptr[]> pending_; + + // If `vlog_` is true, this stores a bit vector of active nodes, indexed by + // node ID. 
+ mutex mu_; + std::unique_ptr> active_ TF_GUARDED_BY(mu_); + + const std::vector* const nodes_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLE_PROPAGATOR_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.h new file mode 100644 index 00000000..553e298f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.h @@ -0,0 +1,109 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLIFY_ICI_DUMMY_VARIABLES_PASS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLIFY_ICI_DUMMY_VARIABLES_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/platform/status.h" + +// Create new dummy zero variables to TPUExecute Op for ICI +// weight distribution, which is a critical feature in TF2/Min. The new dummy +// zero variables will be put on the same task as the TPUExecute Op. The old +// dummy zero variables will be removed afterwards. +// +// For example, in the following graph, the inputs to TPUExecute Op are on +// task:0, after the pass, the dummy zero variables will be put on task:2. +// which is the same as the TPUExecute. 
+// +// The graph before pass is: +// +// node {name: "const0", op: "Const"} +// node {name: "const1", op: "Const"} +// node {name: "fill0", op: "Fill", input: "const1", input: "const0"} +// node {name: "Identity0", op: "Identity", input: "fill0", +// device: "/job:tpu_host_worker/replica:0/task:0/device:CPU:0" +// attr { +// key: "_ici_weight_distribution_mlir_bridge_marker", value {b: true} +// } +// } +// node {name: "const2", op: "Const"} +// node {name: "const3", op: "Const"} +// node {name: "fill1", op: "Fill", input: "const2", input: "const3"} +// node {name: "identity1", op: "Identity", input: "fill1" +// device: "/job:tpu_host_worker/replica:0/task:0/device:CPU:0" +// attr { +// key: "_ici_weight_distribution_mlir_bridge_marker", value {b: true} +// } +// } +// node {name: "const4", op: "Const"} +// node {name: "split0", op: "Split", input: "const4", input: "identity1" +// attr { +// key: "_ici_weight_distribution_mlir_bridge_marker" +// value {b: true} +// } +// } +// node {name: "TPUExecute0", op: "TPUExecute" +// input: "identity0", input: "split0:1" +// device: "/job:worker/replica:0/task:2/device:TPU:0" +// attr { +// key: "_parallel_execution_ids" +// value {s: "r0:1,p0:2"} +// } +// } +// +// The graph after pass is: +// +// node {name: "const0_dummy", op: "Const", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "const1_dummy", op: "Const", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "fill0_dummy", op: "Fill", +// input: "const1_dummy", input: "const0_dummy", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "const2_dummy", op: "Const", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "const3_dummy", op: "Const", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "fill1_dummy", op: "Fill", +// input: "const2_dummy", input: "const3_dummy", +// device: "/job:tpu_host_worker/replica:0/task:2/device:CPU:0" +// } +// node {name: "TPUExecute0", op: "TPUExecute" +// input: "fill0_dummy", input: "fill1_dummy" +// device: "/job:worker/replica:0/task:2/device:TPU:0" +// attr { +// key: "_parallel_execution_ids" +// value {s: "r0:1,p0:2"} +// } +// } + +namespace tensorflow { + +// This pass will simplify the dummy variables for ICI weight distribution. +// The dummy variables will be put on the same task as the TPUExecute Op. +class SimplifyIciDummyVariablesPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SIMPLIFY_ICI_DUMMY_VARIABLES_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_cpu_device.h new file mode 100644 index 00000000..3498e4aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_cpu_device.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_ + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; + +class Device; + +// Returns a simple single-threaded CPU device. This can be used to run +// inexpensive computations. In particular, using this avoids initializing the +// global thread pools in LocalDevice. +// +// The returned pointer is owned by the caller. +Device* NewSingleThreadedCpuDevice(Env* env); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_executor.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_executor.h new file mode 100644 index 00000000..55749ed6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/single_threaded_executor.h @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_EXECUTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_EXECUTOR_H_ + +#include "tensorflow/core/common_runtime/executor.h" + +namespace tensorflow { + +// Creates a new `Executor` for executing `graph` synchronously on the caller +// thread. +// +// NOTE(mrry): The returned executor is optimized to impose low overhead on +// graphs that perform a small amount of work (e.g. <15us of work per graph on +// present architectures). It eschews concurrency, because issuing work to +// multiple threads can dominate the cost of executing small ops synchronously, +// and because contention in the executor data structures can reduce throughput +// (in terms of ops executed per unit time). +// +// However, the current implementation has the following limitations: +// +// 1. Reference-typed tensors are not supported and will not be supported in +// future. +// 2. Graphs with control flow (containing "Switch" and "Merge" nodes) are not +// currently supported. The current plan is to extend support to "functional" +// control flow after the TensorFlow APIs transition to building graphs in +// that form (e.g. `tf.cond_v2()`). +// 3. Partitioned graphs (containing "_Recv" nodes) are not currently supported. 
+// The present implementation executes kernels one at a time in topological +// order, and cannot currently distinguish between disconnected subgraphs +// that are logically connected by subgraphs on a different device. +// 4. Memory logging is not currently supported. +// 5. Allocation forwarding is not currently supported. +// 6. Non-default device contexts are not currently supported. In effect, this +// limits the executor to CPU devices. +// 7. Ops that rely on `OpKernelContext::slice_reader_cache()` being non-null +// are not currently supported. +// +// The single-threaded executor is primarily suitable for executing simple +// TensorFlow functions, such as one might find in a `tf.data` pipeline. +absl::Status NewSingleThreadedExecutor(const LocalExecutorParams& params, + const Graph& graph, Executor** executor); + +// Returns OkStatus() for ops which are compatible with synchronous execution, +// and otherwise returns an error message appropriate for propagation if needed. +// If `allow_control_flow_sync_execution` is set to `true` control +// nodes are marked as safe for execution on the SingleThreadedExecutor. +absl::Status ValidateOpIsSafeForSyncExecution( + const Node& n, bool allow_control_flow_sync_execution); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/stats_publisher_interface.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/stats_publisher_interface.h new file mode 100644 index 00000000..450683e6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/stats_publisher_interface.h @@ -0,0 +1,85 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/build_graph_options.h" +#include "tensorflow/core/common_runtime/profile_handler.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class StatsPublisherInterface; + +typedef std::function( + const std::string&, const BuildGraphOptions&, const SessionOptions&)> + StatsPublisherFactory; + +// StatsPublisherInterface describes objects that publish information exported +// by Sessions. +// NOTE: This interface is experimental and subject to change. +// Implementations must be thread-safe. +class StatsPublisherInterface { + public: + // PublishStatsProto publishes step_stats. + // When PublishStatsProto is called multiple times, only the step_stats + // corresponding to the latest call will be published. 
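// An illustrative sketch, not from the upstream header, of building and owning
// the executor returned by NewSingleThreadedExecutor(). The LocalExecutorParams
// wiring (device, function library runtime, kernel create/delete callbacks) is
// assumed to be prepared by the caller.
#include <memory>
#include "tensorflow/core/common_runtime/single_threaded_executor.h"
#include "tensorflow/core/platform/errors.h"

absl::Status BuildSingleThreadedExecutor(
    const tensorflow::LocalExecutorParams& params,
    const tensorflow::Graph& graph,
    std::unique_ptr<tensorflow::Executor>* out_executor) {
  tensorflow::Executor* executor = nullptr;
  TF_RETURN_IF_ERROR(
      tensorflow::NewSingleThreadedExecutor(params, graph, &executor));
  out_executor->reset(executor);  // the caller owns the returned executor
  return absl::OkStatus();
}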
+ virtual void PublishStatsProto(const StepStats& step_stats) = 0; + + // PublishGraphProto publishes the graph_defs corresponding to each partition + // in the session. + // When PublishGraphProto is called multiple times, only the graph_defs + // corresponding to the latest call will be published. + virtual void PublishGraphProto( + const std::vector& graph_defs) = 0; + virtual void PublishGraphProto(std::vector graph_defs) = 0; + virtual void PublishGraphProto( + std::vector>&& function_records) = 0; + + // Returns a profile handler for the given step based on the execution_count + // and RunOptions. + // + // This method may return a null pointer, if no handler was created. + virtual std::unique_ptr GetProfileHandler( + uint64 step, int64_t execution_count, const RunOptions& ropts) = 0; + + virtual ~StatsPublisherInterface() {} + + static void RegisterStatsPublisher(StatsPublisherFactory factory_fn); + + static StatsPublisherFactory GetStatsPublisherFactory(); + + private: + static StatsPublisherFactory** GetStatsPublisherFactoryPtr() { + static StatsPublisherFactory* stats_publisher_factory = nullptr; + return &stats_publisher_factory; + } +}; + +std::unique_ptr CreateNoOpStatsPublisher( + const string& session, const BuildGraphOptions& bopts, + const SessionOptions& sopts); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/step_stats_collector.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/step_stats_collector.h new file mode 100644 index 00000000..277630cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/step_stats_collector.h @@ -0,0 +1,208 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/framework/tracking_allocator.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class AllocatorMemoryUsed; +class CostModelManager; +class Graph; +class NodeDef; +class NodeExecStats; +class OpKernelContext; +class StepStats; +class StepStatsCollector; +class Tensor; + +// Statistics collection interface for individual node execution. +// +// See `NodeExecStatsWrapper` for a concrete implementation of this interface +// that interfaces with the `Session` layer. +class NodeExecStatsInterface { + public: + virtual ~NodeExecStatsInterface() {} + + // Called when the statistics collection for the node has finished. 
Once this + // method is called, the caller should not make assumptions about the validity + // of this object. + virtual void Done(const string& device) = 0; + + // Called immediately after this node starts being processed by the executor. + virtual void RecordExecutorStarted() = 0; + + // Called immediately before this node's `Compute()` or `ComputeAsync()` + // method is called. + virtual void RecordComputeStarted() = 0; + + // Called immediately after this node's `Compute()` method returned (or, for + // asynchronous operations, the callback passed to its `ComputeAsync()` method + // was called). + virtual void RecordComputeEnded() = 0; + + // Called immediately after this executor finishes processing this node. + virtual void RecordExecutorEnded() = 0; + + // Returns `true` if this object should track memory allocations. + virtual bool TrackAllocations() const = 0; + + // Records information about the memory allocated during the execution of this + // node. + // + // Takes ownership of any `TrackingAllocator` objects stored in `ctx`. + virtual void SetMemory(OpKernelContext* ctx) = 0; + + // Records information about the tensor produced by this node at the given + // output slot. + virtual void SetOutput(int slot, const Tensor* tensor) = 0; + + // Records the absolute time in nanoseconds at which this node became + // runnable (i.e. was scheduled for execution). + virtual void SetScheduled(int64_t nanos) = 0; +}; + +// Wraps NodeExecStats and adds allocation to it. +class NodeExecStatsWrapper : public NodeExecStatsInterface { + public: + // Does not take ownership of `node` or `step_stats_collector`. + NodeExecStatsWrapper(const NodeDef* node, + StepStatsCollector* step_stats_collector); + + // Takes ownership of 'stats' but not `node` or `step_stats_collector`. + NodeExecStatsWrapper(std::unique_ptr stats, + const NodeDef* node, + StepStatsCollector* step_stats_collector); + + // Destructor calls Finalize() to release the TrackingAllocators. + ~NodeExecStatsWrapper() override { Finalize(); } + + void Done(const string& device) override; + void RecordExecutorStarted() override; + void RecordComputeStarted() override; + void RecordComputeEnded() override; + void RecordExecutorEnded() override; + bool TrackAllocations() const override { return true; } + void SetMemory(OpKernelContext* ctx) override; + void SetOutput(int slot, const Tensor* tensor) override; + void SetScheduled(int64_t nanos) override; + + private: + friend class StepStatsCollector; + + NodeExecStats* stats() { return stats_.get(); } + + // Populates stats_ and releases TrackingAllocator. + void Finalize(); + + // Does not take ownership of the `allocator`. + // Takes ownership of `tracking_allocator`. + void AddAllocation(Allocator* allocator, + TrackingAllocator* tracking_allocator); + + absl::InlinedVector, 2UL> + allocations_; + std::unique_ptr stats_; + const NodeDef* const node_; // Not owned. + StepStatsCollector* const step_stats_collector_; // Not owned. +}; + +// Statistics collection interface for step execution. +// +// See `StepStatsCollector` for a concrete implementation of this interface +// that interfaces with the `Session` layer. +class StepStatsCollectorInterface { + public: + virtual ~StepStatsCollectorInterface() {} + + // Creates an instance of `NodeExecStatsInterface` that should be used for + // collecting statistics about individual node execution. 
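// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows the call sequence an executor is expected to
// drive on a NodeExecStatsInterface obtained from CreateNodeExecStats();
// RunNodeWithStats, the commented-out RunKernel call, and the device name
// are hypothetical.
void RunNodeWithStats(tensorflow::StepStatsCollectorInterface* collector,
                      const tensorflow::NodeDef* node) {
  tensorflow::NodeExecStatsInterface* stats =
      collector ? collector->CreateNodeExecStats(node) : nullptr;
  if (stats) stats->RecordExecutorStarted();
  if (stats) stats->RecordComputeStarted();
  // RunKernel(node);  // hypothetical: the kernel's Compute() runs here.
  if (stats) stats->RecordComputeEnded();
  if (stats) stats->RecordExecutorEnded();
  // Done() hands the stats back to the collector; the object must not be
  // touched afterwards.
  if (stats) stats->Done("/job:localhost/replica:0/task:0/device:CPU:0");
}
// ---------------------------------------------------------------------------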
+ virtual NodeExecStatsInterface* CreateNodeExecStats(const NodeDef* node) = 0; + + // Generates a string reporting the currently used memory based + // on ResourceExhausted OOM `err` message. + // `err` message needs to contain device name and allocator name, e.g.: + // "ResourceExhaustedError: OOM when allocating tensor ... + // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc" + virtual string ReportAllocsOnResourceExhausted(absl::string_view err) = 0; +}; + +// StepStatsCollector manages the collection of a StepStats object. +// The StepStats object holds multiple DeviceStats. +// Each DeviceStats object holds multiple NodeExecStats. +class StepStatsCollector : public StepStatsCollectorInterface { + public: + // Does not take ownership of `step_stats`. + explicit StepStatsCollector(StepStats* step_stats); + + // BuildCostModel builds or updates a CostModel managed by cost_model_manager, + // using the currently collected DeviceStats associated with the devices in + // device_map. + void BuildCostModel( + CostModelManager* cost_model_manager, + const std::unordered_map& device_map); + + // Saves node statistics to the DeviceStats object associated with device. + // Should be called before Finalize. + void Save(const string& device, NodeExecStats* node_stats_pb); + void Save(const string& device, NodeExecStatsWrapper* node_stats); + + // Saves thread name. + void SaveThreadName(const string& device, const uint32 thread_id, + const string& thread_name); + + NodeExecStatsInterface* CreateNodeExecStats(const NodeDef* node) override; + string ReportAllocsOnResourceExhausted(absl::string_view err) override; + + // The following 2 Finalize methods populate the StepStats passed + // from the constructor. Calling it more than once won't have any effect. + // User shouldn't call Save() methods after Finalize. + void Finalize(); + // swaps the content of StepStats* from constructor with 'ss'. + void FinalizeAndSwap(StepStats* step_stats); + + private: + // TODO(suharshs): Make this configurable if its not possible to find a value + // that works for all cases. + static constexpr uint64 kMaxCollectedNodes = 1 << 20; + + typedef std::vector> NodeStatsVector; + typedef std::unordered_map ThreadNamesMap; + + void FinalizeInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + mutex mu_; + bool finalized_ TF_GUARDED_BY(mu_); + std::unordered_map dev_stats_ TF_GUARDED_BY(mu_); + std::unordered_map thread_names_ TF_GUARDED_BY(mu_); + StepStats* step_stats_ TF_GUARDED_BY(mu_); + uint64 collected_nodes_ TF_GUARDED_BY(mu_) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/test_collective_executor_mgr.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/test_collective_executor_mgr.h new file mode 100644 index 00000000..0d0b190a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/test_collective_executor_mgr.h @@ -0,0 +1,153 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_ + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { + +// Mock objects that can't actually execute a Collective, but satisfy +// general infrastructure expectations within tests that don't require +// full functionality. + +class TestCollectiveExecutor : public CollectiveExecutor { + public: + explicit TestCollectiveExecutor(CollectiveExecutorMgrInterface* cem, + CollectiveRemoteAccess* rma = nullptr) + : CollectiveExecutor(cem), rma_(rma) {} + + void RunClosure(std::function fn) override { fn(); } + + CollectiveRemoteAccess* remote_access() override { return rma_; } + + private: + CollectiveRemoteAccess* rma_; +}; + +class TestParamResolver : public ParamResolverInterface { + void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, + const StatusCallback& done) override { + done(errors::Internal("Unimplemented")); + } + + void CompleteGroupAsync(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + const StatusCallback& done) override { + done(errors::Internal("Unimplemented")); + } + + void CompleteInstanceAsync(const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + CancellationManager* cancel_mgr, + const StatusCallback& done) override { + done(errors::Internal("Unimplemented")); + } + + absl::Status LookupGroup(int32_t group_key, CollGroupParams* group) override { + return errors::Internal("Unimplemented"); + } + + void StartAbort(const absl::Status& s) override {} +}; + +class TestCollectiveExecutorMgr : public CollectiveExecutorMgrInterface { + public: + explicit TestCollectiveExecutorMgr(ParamResolverInterface* param_resolver, + CollectiveRemoteAccess* rma) + : param_resolver_(param_resolver), rma_(rma) {} + + TestCollectiveExecutorMgr() : param_resolver_(nullptr), rma_(nullptr) {} + + ~TestCollectiveExecutorMgr() override { + for (auto& iter : table_) { + iter.second->Unref(); + } + } + + CollectiveExecutor* FindOrCreate(int64_t step_id) override { + mutex_lock l(mu_); + CollectiveExecutor* ce = nullptr; + auto iter = table_.find(step_id); + if (iter != table_.end()) { + ce = iter->second; + } else { + ce = new TestCollectiveExecutor(this, rma_); + table_[step_id] = ce; + } + ce->Ref(); + return ce; + } + + void Cleanup(int64_t step_id) override { + mutex_lock l(mu_); + auto iter = table_.find(step_id); + if (iter != table_.end()) { + iter->second->Unref(); + table_.erase(iter); + } + } + + void CleanupAll() override { + mutex_lock l(mu_); + for (auto& iter : table_) { + iter.second->Unref(); + } + table_.clear(); + } + + ParamResolverInterface* GetParamResolver() const override { + return param_resolver_; + } + + DeviceResolverInterface* GetDeviceResolver() const override { + LOG(FATAL); + return nullptr; + } + + NcclCommunicatorInterface* GetNcclCommunicator() const override { + return nullptr; + } + + void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + const StatusCallback& done) override { + done(errors::Internal("unimplemented")); + } + + void 
RefreshStepIdSequenceAsync(int64_t graph_key, + const StatusCallback& done) override { + done(errors::Internal("unimplemented")); + } + + int64_t NextStepId(int64_t graph_key) override { + return CollectiveExecutor::kInvalidId; + } + + void RetireStepId(int64_t graph_key, int64_t step_id) override {} + + protected: + mutex mu_; + gtl::FlatMap table_ TF_GUARDED_BY(mu_); + ParamResolverInterface* param_resolver_; + CollectiveRemoteAccess* rma_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/threadpool_device.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/threadpool_device.h new file mode 100644 index 00000000..08175ccb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/threadpool_device.h @@ -0,0 +1,63 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/node_file_writer.h" + +namespace tensorflow { + +// CPU device implementation. 
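// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows how a test might drive the
// TestCollectiveExecutorMgr declared above: FindOrCreate() hands back a
// Ref()'d executor, so the caller owes one Unref(), and Cleanup() drops the
// manager's own reference. ExerciseTestCollectiveExecutorMgr is hypothetical.
void ExerciseTestCollectiveExecutorMgr() {
  tensorflow::TestCollectiveExecutorMgr mgr;  // no param resolver / remote access
  tensorflow::CollectiveExecutor* ce = mgr.FindOrCreate(/*step_id=*/1);
  ce->RunClosure([] { /* TestCollectiveExecutor runs the closure inline */ });
  ce->Unref();                 // balance the Ref() taken by FindOrCreate()
  mgr.Cleanup(/*step_id=*/1);  // releases the manager's reference
}
// ---------------------------------------------------------------------------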
+class ThreadPoolDevice : public LocalDevice { + public: + ThreadPoolDevice(const SessionOptions& options, const string& name, + Bytes memory_limit, const DeviceLocality& locality, + Allocator* allocator); + ~ThreadPoolDevice() override; + + Allocator* GetAllocator(AllocatorAttributes attr) override; + Allocator* GetScopedAllocator(AllocatorAttributes attr, + int64_t step_id) override; + ScopedAllocatorMgr* GetScopedAllocatorMgr() const override { + return scoped_allocator_mgr_.get(); + } + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) override; + + absl::Status Sync() override { return absl::OkStatus(); } + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; + + private: + void LogInputs(OpKernel* op_kernel, OpKernelContext* context); + void LogOutputs(OpKernel* op_kernel, OpKernelContext* context); + + Allocator* allocator_; // Not owned + std::unique_ptr scoped_allocator_mgr_; + NodeFileWriter* node_file_writer_ = nullptr; // not owned +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/common_runtime/type_inference.h b/third_party/tflite-hdrs/tensorflow/core/common_runtime/type_inference.h new file mode 100644 index 00000000..fdbf6e27 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/common_runtime/type_inference.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TYPE_INFERENCE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_TYPE_INFERENCE_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// Run a very basic type inference on the graph. It simply propagates type +// information along edges, until reaching stability. +// +// The pass is designed to run as a graph diffusion process, refining type +// information until it reaches a fixed point. However, the current +// implementation is a simplification that only ensures that: +// 1. each node is visited at least once +// 2. a successful update of a node's type ID prevents future visits +// 3. each node is visited at most a fixed number of times +// +// If needed, we can drop rule #3 and change rule #2 to consider an update to +// be any deep type change (rather than just the type ID). +// +// The state of the diffusion process is the NodeDef.experimental_full_type +// field, while the diffusion function is the node's corresponding +// OpRegistrationData.fwd_type_fn function. +// +// TODO(mdan): Use a regular union-based algorithm instead? 
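// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. A pass of this kind is normally wired into the runtime
// through the optimization registry; the grouping, phase number, and pass
// name below are illustrative assumptions, not the values TensorFlow
// actually registers its type inference passes with.
namespace tensorflow {
class MyTypePropagationPass : public GraphOptimizationPass {
 public:
  absl::Status Run(const GraphOptimizationPassOptions& options) override {
    // Walk options.graph here, refining NodeDef.experimental_full_type until
    // a fixed point (or the visit limit) is reached.
    return absl::OkStatus();
  }
};
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, /*phase=*/99,
                      MyTypePropagationPass);
}  // namespace tensorflow
// ---------------------------------------------------------------------------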
+class TypeInferencePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +// A version of TypeInferencePass that prints a warning on error, instead +// of returning error status. This is done because there are a few graphs +// currently in the wild which don't actually type check. +// TODO(mdan): Turn this into an error, once all offenders are clean. +class WeakTypeInferencePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_TYPE_INFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/config/flag_defs.h b/third_party/tflite-hdrs/tensorflow/core/config/flag_defs.h new file mode 100644 index 00000000..d6bc4d95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/config/flag_defs.h @@ -0,0 +1,80 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_CONFIG_FLAG_DEFS_H_ +#define TENSORFLOW_CORE_CONFIG_FLAG_DEFS_H_ + +#include "tensorflow/core/config/flags.h" + +namespace tensorflow { +namespace flags { + +class Flags { + public: + // Test only flags. See flags_test.cc for example usage. + TF_DECLARE_FLAG(test_only_experiment_1, true, "Test only experiment 1."); + TF_DECLARE_FLAG(test_only_experiment_2, false, "Test only experiment 2."); + + // Declare flags below here. + // LINT.IfChange + TF_DECLARE_FLAG(enable_nested_function_shape_inference, false, + "Allow ops such as tf.cond to invoke the ShapeRefiner on " + "their nested functions."); + TF_DECLARE_FLAG(enable_quantized_dtypes_training, false, + "Set quantized dtypes, like tf.qint8, to be trainable."); + TF_DECLARE_FLAG(graph_building_optimization, false, + "Optimize graph building for faster tf.function tracing."); + TF_DECLARE_FLAG( + op_building_optimization, true, + "Optimize tf.Operation building for faster tf.function tracing."); + TF_DECLARE_FLAG(saved_model_fingerprinting, true, + "Add fingerprint to SavedModels."); + TF_DECLARE_FLAG( + tf_shape_default_int64, false, + "The default output of tf.shape (i.e. when out_type is not specified) is " + "int64 when this flag is true and int32 otherwise. Setting this to true " + "is an unsupported, experimental setting that causes known breakages."); + TF_DECLARE_FLAG(more_stack_traces, false, + "Enable experimental code that preserves and propagates " + "graph node stack traces in C++."); + TF_DECLARE_FLAG(publish_function_graphs, true, + "Enables the publication of partitioned function graphs " + "via StatsPublisherInterface. 
Disabling this flag can " + "reduce memory consumption."); + TF_DECLARE_FLAG(enable_aggressive_constant_replication, true, + "Replicate constants across CPU devices and even for local " + "CPUs within the same task if available.") + TF_DECLARE_FLAG(enable_colocation_key_propagation_in_while_op_lowering, false, + "If true, colocation key attributes for the ops will be " + "propagated during while op lowering to switch/merge ops.") + TF_DECLARE_FLAG(enable_tf2min_ici_weight, false, + "If true, ici weight optimization will be used in tf2/min.") + // TODO(b/341325107): Make this behavior the default and remove the flag. + TF_DECLARE_FLAG(enable_function_pruning_before_inlining, false, + "If true, functions will be pruned before inlining.") + TF_DECLARE_FLAG(enable_skip_encapsulation_for_non_tpu_graphs, false, + "If true, TF2XLA encapsulation will be skipped for non-TPU " + "graphs.") + TF_DECLARE_FLAG(enable_graph_debug_info_caching_for_stack_frames, true, + "If true, graph debug info will cache the stack frames.") + // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc) +}; + +Flags& Global(); + +} // namespace flags +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_CONFIG_FLAG_DEFS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/config/flags.h b/third_party/tflite-hdrs/tensorflow/core/config/flags.h new file mode 100644 index 00000000..c882cd39 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/config/flags.h @@ -0,0 +1,47 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_CONFIG_FLAGS_H_ +#define TENSORFLOW_CORE_CONFIG_FLAGS_H_ + +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace config { + +// Container class for a single feature flag. +// Note: this class is not thread safe. +class Flag { + public: + explicit Flag(absl::string_view flag_name, bool default_value); + bool value() { return value_; } + void reset(bool value) { value_ = value; } + + private: + bool value_; +}; + +// Macro to declare new flags. Declare all flags in core/config/flag_defs.h +// These flags can be overridden by setting the associated environment variable +// TF_FLAG_* flag to true or false. E.g. setting TF_FLAG_MY_FLAG=false will +// override the default value for a flag named `my_flag` to false. +#define TF_DECLARE_FLAG(flag_name, default_value, doc) \ + ::tensorflow::config::Flag flag_name = \ + ::tensorflow::config::Flag("TF_FLAG_" #flag_name, default_value); + +} // namespace config +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_CONFIG_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/captured_function.h b/third_party/tflite-hdrs/tensorflow/core/data/captured_function.h new file mode 100644 index 00000000..553f09b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/captured_function.h @@ -0,0 +1,340 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_CAPTURED_FUNCTION_H_ +#define TENSORFLOW_CORE_DATA_CAPTURED_FUNCTION_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class Device; +class OpKernelContext; +class ResourceMgr; + +namespace data { + +class CapturedFunction; +class InstantiatedCapturedFunction; + +// Creates an iterator for a dataset which is created by applying the given +// function to the given input element. +absl::Status MakeIteratorFromInputElement( + IteratorContext* ctx, const DatasetBaseIterator* parent, + const std::vector& input_element, int64_t thread_index, + const InstantiatedCapturedFunction& inst_captured_func, + absl::string_view prefix, std::unique_ptr* out_iterator); + +// Creates an iterator for a dataset which is created by applying the given +// function to the given input element. Pass non-null `node` to record +// processing time for modeling Iterator's GetNext() resource usage. +absl::Status MakeIteratorFromInputElement( + IteratorContext* ctx, const DatasetBaseIterator* parent, + const std::vector& input_element, int64_t thread_index, + const InstantiatedCapturedFunction& inst_captured_func, + absl::string_view prefix, std::unique_ptr* out_iterator, + const std::shared_ptr& node); + +struct ShortCircuitInfo { + std::vector indices; + std::vector can_move; +}; + +// Metadata shared across all captures of the same function. +class FunctionMetadata { + public: + struct Params { + bool use_inter_op_parallelism = true; + bool use_default_device = true; + }; + + // Creates a new instance of the `FunctionMetadata` class, fetching function + // from a context argument. + static absl::Status Create(tensorflow::OpKernelConstruction* ctx, + const string& func_name, Params params, + std::shared_ptr* out_metadata); + + // Creates a new instance of the `FunctionMetadata` class, using the provided + // function. + static absl::Status Create(tensorflow::OpKernelConstruction* ctx, + NameAttrList&& func, Params params, + std::shared_ptr* out_metadata); + + // Returns the named list of function arguments. + const NameAttrList& func() const { return func_; } + + // Returns a borrowed pointer to the function library that contains the + // transitive closure of definitions used by the function. 
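// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows the usual way a dataset kernel builds the shared
// FunctionMetadata from an op attr in its constructor; BuildFunctionMetadata
// and the attr name "f" are hypothetical.
absl::Status BuildFunctionMetadata(
    tensorflow::OpKernelConstruction* ctx,
    std::shared_ptr<tensorflow::data::FunctionMetadata>* out_metadata) {
  tensorflow::data::FunctionMetadata::Params params;
  params.use_inter_op_parallelism = true;  // default, shown explicitly
  // "f" names the NameAttrList attr that holds the user-defined function.
  return tensorflow::data::FunctionMetadata::Create(ctx, /*func_name=*/"f",
                                                    params, out_metadata);
}
// ---------------------------------------------------------------------------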
+ const FunctionLibraryDefinition* lib_def() const { return lib_def_.get(); } + + // Returns short-circuit information. + const ShortCircuitInfo& short_circuit_info() const { + return short_circuit_info_; + } + + // Indicates whether a default device should be used for executing function + // ops. + bool use_default_device() const { return use_default_device_; } + + // Indicates whether to use inter-op parallelism for execution of the + // function. + bool use_inter_op_parallelism() const { return use_inter_op_parallelism_; } + + // Indicates whether the function should a multi-device function backend. + bool use_multi_device_function() const { return use_multi_device_function_; } + + private: + FunctionMetadata(NameAttrList&& func, Params params) + : func_(std::move(func)), + use_default_device_(params.use_default_device), + use_inter_op_parallelism_(params.use_inter_op_parallelism) {} + + NameAttrList func_; + std::unique_ptr lib_def_ = nullptr; + ShortCircuitInfo short_circuit_info_; + bool use_default_device_ = true; + bool use_inter_op_parallelism_ = true; + bool use_multi_device_function_ = true; +}; + +// Constructs and stores the parameters for the CapturedFunction Instantiate +// function. +struct InstantiateCapturedFunctionParams { + explicit InstantiateCapturedFunctionParams(IteratorContext* ctx) { + flr = ctx->flr(); + function_handle_cache = ctx->function_handle_cache(); + runner = ctx->runner(); + } + + explicit InstantiateCapturedFunctionParams(OpKernelContext* ctx) { + flr = ctx->function_library(); + function_handle_cache = nullptr; + runner = ctx->runner(); + } + + FunctionLibraryRuntime* flr; + FunctionHandleCache* function_handle_cache; + std::function)>* runner; +}; + +// A `CapturedFunction` encapsulates a TensorFlow function, plus any "captured" +// arguments that it closed over in the user program. +class CapturedFunction { + public: + // Creates a new instance using a list of named attributes, fetching captured + // inputs from a context argument. + static absl::Status Create(OpKernelContext* ctx, + std::shared_ptr metadata, + const string& argument_name, + std::unique_ptr* out_function); + + // Creates a new instance using a list of named attributes, using provided + // captured inputs. + static absl::Status Create(OpKernelContext* ctx, + std::shared_ptr metadata, + std::vector&& captured_inputs, + std::unique_ptr* out_function); + + // Adds the definition of this captured function into the given graph, + // returning its captured inputs and types through the respective output + // arguments. + absl::Status AddToGraph(SerializationContext* ctx, + DatasetBase::DatasetGraphDefBuilder* b, + std::vector* other_arguments, + DataTypeVector* other_arguments_types) const; + + // Instantiates this function for use in the given context, providing an + // InstantiatedCapturedFunction that can be used to execute functions. + absl::Status Instantiate(IteratorContext* ctx, + std::unique_ptr* + instantiated_captured_function); + + absl::Status Instantiate(InstantiateCapturedFunctionParams params, + std::unique_ptr* + instantiated_captured_function); + + // Determines whether the captured function is stateful. + absl::Status CheckExternalState() const; + + // Returns the additional captured inputs that will be passed to the function. + const std::vector& captured_inputs() const { + return captured_inputs_; + } + + // Returns the named list of function arguments. 
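// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It sketches the capture-then-instantiate flow described
// above; CaptureAndInstantiate and the input name "other_arguments" are
// hypothetical, and the usual TF_RETURN_IF_ERROR macro is assumed.
absl::Status CaptureAndInstantiate(
    tensorflow::OpKernelContext* ctx,
    std::shared_ptr<tensorflow::data::FunctionMetadata> metadata,
    tensorflow::data::IteratorContext* iter_ctx) {
  std::unique_ptr<tensorflow::data::CapturedFunction> captured;
  TF_RETURN_IF_ERROR(tensorflow::data::CapturedFunction::Create(
      ctx, std::move(metadata), /*argument_name=*/"other_arguments",
      &captured));
  std::unique_ptr<tensorflow::data::InstantiatedCapturedFunction> instantiated;
  TF_RETURN_IF_ERROR(captured->Instantiate(iter_ctx, &instantiated));
  // The InstantiatedCapturedFunction only borrows the CapturedFunction, so a
  // real iterator keeps both alive as members for as long as it runs them.
  return absl::OkStatus();
}
// ---------------------------------------------------------------------------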
+ const NameAttrList& func() const { return metadata_->func(); } + + // Returns the transitive set of function definition required to instantiate + // this function. + const FunctionLibraryDefinition* lib_def() const { + return metadata_->lib_def(); + } + + // If every function output corresponds to one of its inputs, the method + // returns the mapping from output indices to input indices. Otherwise, it + // returns an empty list. + const ShortCircuitInfo& short_circuit_info() const { + return metadata_->short_circuit_info(); + } + + // Indicates whether the function should use inter op parallelism. + bool use_inter_op_parallelism() const { + return metadata_->use_inter_op_parallelism(); + } + + private: + CapturedFunction(std::shared_ptr metadata, + std::vector captured_inputs); + + absl::Status IsMultiDevice(FunctionLibraryRuntime* flr, + bool* is_multi_device) const; + + const std::shared_ptr metadata_; + const std::vector captured_inputs_; + + CapturedFunction(const CapturedFunction&) = delete; + void operator=(const CapturedFunction&) = delete; +}; + +// `InstantiatedCapturedFunction` encapsulates all the runtime support needed +// to execute a tensorflow function. +// +// While `CapturedFunction` encapsulates constant attributes of the function, +// such as its name and captured arguments, `InstantiatedCapturedFunction` +// encapsulates runtime aspects, such as `FunctionLibraryRuntime` and function +// handle. +// +// The `Iterator` related classes use `InstantiatedCapturedFunction` to execute +// functions outside of the normal `OpKernel::Compute()` context. +class InstantiatedCapturedFunction { + public: + // Runs the instantiated captured function. This method takes ownership of + // the tensors in `args`, in order to be able to deallocate them as early as + // possible. Use `RunWithBorrowedArgs()` if the caller needs to retain + // ownership of the `args`. + absl::Status Run(IteratorContext* ctx, std::vector&& args, + std::vector* rets) const; + + // Runs the instantiated captured function. This method takes ownership of + // the tensors in `args`, in order to be able to deallocate them as early as + // possible. Use `RunWithBorrowedArgs()` if the caller needs to retain + // ownership of the `args`. Pass non-null `node` to record processing time + // for modeling Iterator's GetNext() resource usage. When non-null node is + // provided, the pre-requisite is that the calling thread has previously + // called `DatasetBaseIterator::RecordStart(). + absl::Status Run(IteratorContext* ctx, std::vector&& args, + std::vector* rets, + const std::shared_ptr& node) const; + + // Synchronously runs the captured function on the given `args`, and stores + // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when + // possible. + absl::Status RunWithBorrowedArgs(IteratorContext* ctx, + const std::vector& args, + std::vector* rets) const; + + // Synchronously runs the captured function on the given `args`, and stores + // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when + // possible. Pass non-null `node` to record processing time for modeling + // Iterator's GetNext() resource usage. When non-null node is provided, the + // pre-requisite is that the calling thread has previously called + // `DatasetBaseIterator::RecordStart(). 
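// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It contrasts the two ownership contracts documented above:
// Run() consumes the argument tensors, RunWithBorrowedArgs() leaves them with
// the caller. CallBoth is hypothetical; TF_RETURN_IF_ERROR is assumed.
absl::Status CallBoth(
    const tensorflow::data::InstantiatedCapturedFunction& fn,
    tensorflow::data::IteratorContext* ctx,
    std::vector<tensorflow::Tensor> args) {
  std::vector<tensorflow::Tensor> rets;
  // Borrowing: `args` remains valid and reusable after the call.
  TF_RETURN_IF_ERROR(fn.RunWithBorrowedArgs(ctx, args, &rets));
  rets.clear();
  // Consuming: `args` is moved from and must not be used again.
  TF_RETURN_IF_ERROR(fn.Run(ctx, std::move(args), &rets));
  return absl::OkStatus();
}
// ---------------------------------------------------------------------------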
+ absl::Status RunWithBorrowedArgs( + IteratorContext* ctx, const std::vector& args, + std::vector* rets, + const std::shared_ptr& node) const; + + // Synchronously runs the captured function on the given `args`, and stores + // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when + // possible. This can be useful for calling a captured function in cases where + // an `IteratorContext*` is not available (such as a destructor). + // + // TODO(b/144278100): Avoid running functions without IteratorContext. + absl::Status RunInstantiated(const std::vector& args, + std::vector* rets); + + // Asynchronously runs the captured function on the given `args`, stores the + // results in `*rets`, and calls the given `done` callback when the function + // returns. This method takes ownership of the tensors in `args`, in order to + // be able to deallocate them as early as possible. Pass non-null `node` to + // record processing time for modeling Iterator's GetNext() resource usage. + // When non-null node is provided, the pre-requisite is that the calling + // thread has previously called `DatasetBaseIterator::RecordStart(). + void RunAsync(IteratorContext* ctx, std::vector&& args, + std::vector* rets, + FunctionLibraryRuntime::DoneCallback done, + const std::shared_ptr& node) const { + RunAsync(*(ctx->runner()), ctx->cancellation_manager(), + ctx->collective_executor(), std::move(args), rets, done, node); + } + + // A version of `RunAsync` that does not take an `IteratorContext` but a + // runner, a cancellation manager, and a collective executor. + void RunAsync(std::function)> runner, + CancellationManager* parent_cancellation_manager, + CollectiveExecutor* collective_executor, + std::vector&& args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done, + const std::shared_ptr& node) const; + + std::string func_name() const { return captured_func_->func().name(); } + + private: + friend class CapturedFunction; + + InstantiatedCapturedFunction( + FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, + DataTypeVector ret_types, + std::function)> runner, + CapturedFunction* captured_func, bool is_multi_device); + + // Determines whether a rendezvous object should be created when running the + // instantiated function. + bool ShouldCreateRendezvous() const; + + FunctionLibraryRuntime* const lib_; // Not owned. + const FunctionLibraryRuntime::Handle f_handle_; + const DataTypeVector ret_types_; + // Note: We capture the runner at function instantiation time to be able to + // run the function without `IteratorContext` via `RunInstantiated`. + std::function)> captured_runner_; + CapturedFunction* const captured_func_; // Not owned. + const bool is_multi_device_; + + InstantiatedCapturedFunction(const InstantiatedCapturedFunction&) = delete; + void operator=(const InstantiatedCapturedFunction&) = delete; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_CAPTURED_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/compression_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/compression_utils.h new file mode 100644 index 00000000..8b4d5179 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/compression_utils.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_COMPRESSION_UTILS_H_ +#define TENSORFLOW_CORE_DATA_COMPRESSION_UTILS_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/dataset.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace data { + +// Compresses the components of `element` into the `CompressedElement` proto. +// +// In addition to writing the actual compressed bytes, `Compress` fills +// out the per-component metadata for the `CompressedElement`. +// +// Returns an error if the uncompressed size of the element exceeds 4GB. +absl::Status CompressElement(const std::vector& element, + CompressedElement* out); + +// Uncompresses a `CompressedElement` into a vector of tensor components. +absl::Status UncompressElement(const CompressedElement& compressed, + std::vector* out); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_COMPRESSION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/dataset_test_base.h b/third_party/tflite-hdrs/tensorflow/core/data/dataset_test_base.h new file mode 100644 index 00000000..0ef63825 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/dataset_test_base.h @@ -0,0 +1,1128 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_DATASET_TEST_BASE_H_ +#define TENSORFLOW_CORE_DATA_DATASET_TEST_BASE_H_ + +#include + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace data { + +typedef std::vector< + std::pair> + AttributeVector; + +constexpr int kDefaultCPUNum = 2; +constexpr int kDefaultThreadNum = 2; + +// Creates a tensor with the specified dtype, shape, and value. +template +static Tensor CreateTensor(const TensorShape& input_shape, + gtl::ArraySlice input_data) { + Tensor tensor(DataTypeToEnum::value, input_shape); + test::FillValues(&tensor, input_data); + return tensor; +} + +// Creates a tensor with the specified dtype and shape, with values 0, 1, 2, ... +template +static Tensor CreateTensor(const TensorShape& input_shape) { + Tensor tensor(DataTypeToEnum::value, input_shape); + test::FillIota(&tensor, 0); + return tensor; +} + +// Creates a vector of tensors with the specified dtype, shape, and values. +template +std::vector CreateTensors( + const TensorShape& shape, const std::vector>& values) { + std::vector result; + result.reserve(values.size()); + for (auto& value : values) { + result.emplace_back(CreateTensor(shape, value)); + } + return result; +} + +enum class CompressionType { ZLIB = 0, GZIP = 1, RAW = 2, UNCOMPRESSED = 3 }; + +// Returns a string representation for the given compression type. +string ToString(CompressionType compression_type); + +// Gets the specified zlib compression options according to the compression +// type. Note that `CompressionType::UNCOMPRESSED` is not supported because +// `ZlibCompressionOptions` does not have an option. 
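// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows what the CreateTensor/CreateTensors helpers above
// are typically used for when building expected outputs in dataset tests;
// MakeExpectedOutputs is hypothetical.
std::vector<tensorflow::Tensor> MakeExpectedOutputs() {
  using tensorflow::data::CreateTensor;
  using tensorflow::data::CreateTensors;
  // Three scalar string tensors, one per expected dataset element.
  std::vector<tensorflow::Tensor> expected =
      CreateTensors<tensorflow::tstring>(tensorflow::TensorShape({}),
                                         {{"a"}, {"b"}, {"c"}});
  // One 2x2 int64 tensor holding the values 0..3.
  expected.push_back(
      CreateTensor<int64_t>(tensorflow::TensorShape({2, 2}), {0, 1, 2, 3}));
  return expected;
}
// ---------------------------------------------------------------------------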
+io::ZlibCompressionOptions GetZlibCompressionOptions( + CompressionType compression_type); + +// Used to specify parameters when writing data into files with compression. +// `input_buffer_size` and `output_buffer_size` specify the input and output +// buffer size when ZLIB and GZIP compression is used. +struct CompressionParams { + CompressionType compression_type = CompressionType::UNCOMPRESSED; + int32 input_buffer_size = 0; + int32 output_buffer_size = 0; +}; + +// Writes the input data into the file without compression. +absl::Status WriteDataToFile(const string& filename, const char* data); + +// Writes the input data into the file with the specified compression. +absl::Status WriteDataToFile(const string& filename, const char* data, + const CompressionParams& params); + +// Writes the input data into the TFRecord file with the specified compression. +absl::Status WriteDataToTFRecordFile( + const string& filename, const std::vector& records, + const CompressionParams& params); + +// Provides the parameters for running the dataset op. +class DatasetParams { + public: + DatasetParams(DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name); + + virtual ~DatasetParams() = default; + + // Returns the inputs (except the input datasets) as a tensor vector. + virtual std::vector GetInputTensors() const = 0; + + // Returns the dataset input names as a string vector. + virtual absl::Status GetInputNames( + std::vector* input_names) const = 0; + + // Returns the dataset attributes as a vector. + virtual absl::Status GetAttributes(AttributeVector* attributes) const = 0; + + // Checks if the tensor is a dataset variant tensor. + static bool IsDatasetTensor(const Tensor& tensor); + + string node_name() const { return node_name_; } + + DataTypeVector output_dtypes() const { return output_dtypes_; } + + std::vector output_shapes() const { + return output_shapes_; + } + + string iterator_prefix() const { return iterator_prefix_; } + + const std::vector>& input_dataset_params() + const { + return input_dataset_params_; + } + + // Returns the functions that will be used when running the dataset op. + virtual std::vector func_lib() const { return {}; } + + // Returns the dataset type for the op represented by these parameters. This + // type usually needs to match the constant called `kDatasetType` defined in + // the dataset kernel. + virtual string dataset_type() const = 0; + + // Returns the dataset op name. By default, it returns the Op::kDatasetType + // concatenated with "Dataset". For ops that do not have "Dataset" suffix, + // this method can be overriden to return a different name. + virtual string op_name() const { + name_utils::OpNameParams params; + params.op_version = op_version(); + return name_utils::OpName(dataset_type(), params); + } + + virtual int op_version() const { return op_version_; } + + protected: + std::vector> input_dataset_params_; + DataTypeVector output_dtypes_; + std::vector output_shapes_; + string node_name_; + string iterator_prefix_ = "Iterator"; + int op_version_ = 1; +}; + +// `RangeDatasetParams` is a common dataset parameter type that are used in +// testing. 
+class RangeDatasetParams : public DatasetParams { + public: + RangeDatasetParams(int64_t start, int64_t stop, int64_t step, + DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name); + + RangeDatasetParams(int64_t start, int64_t stop, int64_t step); + + RangeDatasetParams(int64_t start, int64_t stop, int64_t step, + DataTypeVector output_dtypes); + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + private: + int64_t start_; + int64_t stop_; + int64_t step_; +}; + +// `BatchDatasetParams` is a common dataset parameter type that are used in +// testing. +class BatchDatasetParams : public DatasetParams { + public: + template + BatchDatasetParams(T input_dataset_params, int64_t batch_size, + bool drop_remainder, bool parallel_copy, + DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name) + : DatasetParams(std::move(output_dtypes), std::move(output_shapes), + std::move(node_name)), + batch_size_(batch_size), + drop_remainder_(drop_remainder), + parallel_copy_(parallel_copy) { + input_dataset_params_.push_back(std::make_unique(input_dataset_params)); + op_version_ = 2; + iterator_prefix_ = + name_utils::IteratorPrefix(input_dataset_params.dataset_type(), + input_dataset_params.iterator_prefix()); + } + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + private: + int64_t batch_size_; + bool drop_remainder_; + bool parallel_copy_; +}; + +// `MapDatasetParams` is a common dataset parameter type that are used in +// testing. +class MapDatasetParams : public DatasetParams { + public: + template + MapDatasetParams(T input_dataset_params, std::vector other_arguments, + FunctionDefHelper::AttrValueWrapper func, + std::vector func_lib, + DataTypeVector type_arguments, DataTypeVector output_dtypes, + std::vector output_shapes, + bool use_inter_op_parallelism, bool preserve_cardinality, + string node_name) + : DatasetParams(std::move(output_dtypes), std::move(output_shapes), + std::move(node_name)), + other_arguments_(std::move(other_arguments)), + func_(std::move(func)), + func_lib_(std::move(func_lib)), + type_arguments_(std::move(type_arguments)), + use_inter_op_parallelism_(use_inter_op_parallelism), + preserve_cardinality_(preserve_cardinality) { + input_dataset_params_.push_back(std::make_unique(input_dataset_params)); + iterator_prefix_ = + name_utils::IteratorPrefix(input_dataset_params.dataset_type(), + input_dataset_params.iterator_prefix()); + } + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + std::vector func_lib() const override; + + private: + std::vector other_arguments_; + FunctionDefHelper::AttrValueWrapper func_; + std::vector func_lib_; + DataTypeVector type_arguments_; + bool use_inter_op_parallelism_; + bool preserve_cardinality_; +}; + +// `TensorSliceDatasetParams` is a common dataset parameter type that are used +// in testing. 
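// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows how the parameter helpers above compose, feeding
// a Range dataset into a Batch dataset the way the dataset op tests do;
// MakeBatchOfRangeParams and the node name are hypothetical.
tensorflow::data::BatchDatasetParams MakeBatchOfRangeParams() {
  return tensorflow::data::BatchDatasetParams(
      tensorflow::data::RangeDatasetParams(/*start=*/0, /*stop=*/10,
                                           /*step=*/1),
      /*batch_size=*/3, /*drop_remainder=*/true, /*parallel_copy=*/false,
      /*output_dtypes=*/{tensorflow::DT_INT64},
      /*output_shapes=*/{tensorflow::PartialTensorShape({3})},
      /*node_name=*/"batch_dataset");
}
// ---------------------------------------------------------------------------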
+class TensorSliceDatasetParams : public DatasetParams { + public: + TensorSliceDatasetParams(std::vector components, string node_name, + bool is_files = false); + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + int64_t num_slices() const { return components_[0].dim_size(0); } + + size_t num_tensors_per_slice() const { return components_.size(); } + + private: + DataTypeVector TensorSliceDtypes(const std::vector& input_components); + + std::vector TensorSliceShapes( + const std::vector& input_components); + + public: + std::vector components_; + bool is_files_; +}; + +// `TakeDatasetParams` is a common dataset parameter type that are used in +// testing. +class TakeDatasetParams : public DatasetParams { + public: + template + TakeDatasetParams(T input_dataset_params, int count, + DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name) + : DatasetParams(std::move(output_dtypes), std::move(output_shapes), + std::move(node_name)), + count_(count) { + input_dataset_params_.push_back(std::make_unique(input_dataset_params)); + iterator_prefix_ = + name_utils::IteratorPrefix(input_dataset_params.dataset_type(), + input_dataset_params.iterator_prefix()); + } + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + private: + int64_t count_; +}; + +// `ConcatenateDatasetParams` is a common dataset parameter type that are used +// in testing. +class ConcatenateDatasetParams : public DatasetParams { + public: + template + ConcatenateDatasetParams(T input_dataset_params_0, P input_dataset_params_1, + DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name) + : DatasetParams(std::move(output_dtypes), std::move(output_shapes), + std::move(node_name)) { + input_dataset_params_.push_back( + std::make_unique(input_dataset_params_0)); + input_dataset_params_.push_back( + std::make_unique(input_dataset_params_1)); + iterator_prefix_ = + name_utils::IteratorPrefix(input_dataset_params_0.dataset_type(), + input_dataset_params_0.iterator_prefix()); + } + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; +}; + +// `OptionsDatasetParams` is a common dataset parameter type that is used in +// testing. 
+class OptionsDatasetParams : public DatasetParams { + public: + template + OptionsDatasetParams(T input_dataset_params, const string& serialized_options, + DataTypeVector output_dtypes, + std::vector output_shapes, + string node_name) + : DatasetParams(std::move(output_dtypes), std::move(output_shapes), + std::move(node_name)), + serialized_options_(serialized_options) { + input_dataset_params_.push_back(std::make_unique(input_dataset_params)); + } + + std::vector GetInputTensors() const override; + + absl::Status GetInputNames(std::vector* input_names) const override; + + absl::Status GetAttributes(AttributeVector* attr_vector) const override; + + string dataset_type() const override; + + private: + string serialized_options_; +}; + +template +struct GetNextTestCase { + GetNextTestCase(T dataset_params, std::vector expected_outputs, + bool compare_order = true) + : dataset_params(std::move(dataset_params)), + expected_outputs(std::move(expected_outputs)), + compare_order(compare_order) {} + + T dataset_params; + std::vector expected_outputs; + bool compare_order; +}; + +template +struct SkipTestCase { + SkipTestCase(T dataset_params, int num_to_skip, int expected_num_skipped, + bool get_next = false, std::vector expected_outputs = {}, + bool compare_order = true) + : dataset_params(std::move(dataset_params)), + num_to_skip(num_to_skip), + expected_num_skipped(expected_num_skipped), + get_next(get_next), + expected_outputs(std::move(expected_outputs)), + compare_order(compare_order) {} + + T dataset_params; + int num_to_skip; + int expected_num_skipped; + bool get_next; + std::vector expected_outputs; + bool compare_order; +}; + +template +struct DatasetNodeNameTestCase { + T dataset_params; + string expected_node_name; +}; + +template +struct DatasetTypeStringTestCase { + T dataset_params; + string expected_dataset_type_string; +}; + +template +struct DatasetOutputDtypesTestCase { + T dataset_params; + DataTypeVector expected_output_dtypes; +}; + +template +struct DatasetOutputShapesTestCase { + T dataset_params; + std::vector expected_output_shapes; +}; + +template +struct CardinalityTestCase { + T dataset_params; + int64_t expected_cardinality; +}; + +template +struct DatasetSaveTestCase { + T dataset_params; +}; + +template +struct IteratorOutputDtypesTestCase { + T dataset_params; + DataTypeVector expected_output_dtypes; +}; + +template +struct IteratorOutputShapesTestCase { + T dataset_params; + std::vector expected_output_shapes; +}; + +template +struct IteratorPrefixTestCase { + T dataset_params; + string expected_iterator_prefix; +}; + +template +struct IteratorSaveAndRestoreTestCase { + IteratorSaveAndRestoreTestCase(T dataset_params, std::vector breakpoints, + std::vector expected_outputs, + bool compare_order = true) + : dataset_params(std::move(dataset_params)), + breakpoints(std::move(breakpoints)), + expected_outputs(std::move(expected_outputs)), + compare_order(compare_order) {} + + T dataset_params; + std::vector breakpoints; + std::vector expected_outputs; + bool compare_order; +}; + +// Class composing a dataset with its dependencies. +class TestDataset { + public: + // TestDataset expects that the caller has Ref'd the wrapped dataset. When + // TestDataset is destroyed, it will Unref the dataset. 
+ TestDataset(std::unique_ptr kernel_, + std::unique_ptr ctx_params, + std::unique_ptr ctx, + std::vector> input_tensors, + DatasetBase* dataset) + : kernel_(std::move(kernel_)), + ctx_params_(std::move(ctx_params)), + ctx_(std::move(ctx)), + input_tensors_(std::move(input_tensors)), + dataset_(dataset), + scoped_unref_(dataset) {} + + DatasetBase* dataset() const { return dataset_; } + + OpKernelContext* op_kernel_context() const { return ctx_.get(); } + + protected: + std::unique_ptr kernel_; + std::unique_ptr ctx_params_; + std::unique_ptr ctx_; + // The input tensors that this dataset depends on. They must outlive the + // dataset. + std::vector> input_tensors_; + DatasetBase* dataset_; + core::ScopedUnref scoped_unref_; +}; + +// Class composing a dataset iterator with its dependencies. +class TestIterator { + public: + TestIterator(std::unique_ptr ctx, + std::unique_ptr iterator) + : iterator_(std::move(iterator)), ctx_(std::move(ctx)) {} + + IteratorBase* iterator() const { return iterator_.get(); } + + IteratorContext* ctx() const { return ctx_.get(); } + + absl::Status GetNext(std::vector* out_tensors, + bool* end_of_sequence) { + return iterator_->GetNext(ctx(), out_tensors, end_of_sequence); + } + + protected: + std::unique_ptr iterator_; + std::unique_ptr ctx_; +}; + +// Helpful functions to test Dataset op kernels. +class DatasetOpsTestBase : public ::testing::Test { + public: + DatasetOpsTestBase(); + + // Initializes the runtime and creates a dataset and iterator. + absl::Status Initialize(const DatasetParams& dataset_params); + + // Initializes the parts of the runtime needed to run dataset ops. + absl::Status InitializeRuntime(const DatasetParams& dataset_params); + + // Creates a dataset. + absl::Status MakeDataset(const DatasetParams& dataset_params, + std::unique_ptr* dataset); + + // Creates an iterator for the given dataset, using the specified split + // providers. + absl::Status MakeIterator( + const DatasetParams& dataset_params, const TestDataset& dataset, + std::vector> split_providers, + std::unique_ptr* iterator); + // Creates an iterator for the given dataset. + absl::Status MakeIterator(const DatasetParams& dataset_params, + const TestDataset& dataset, + std::unique_ptr* iterator); + + // Runs the dataset operation according to the predefined dataset params and + // produces outputs. Different from `MakeDataset()` which returns a Dataset + // object, `RunDatasetOp()` executes the dataset kernel based on the input + // DatasetParams and returns the produced outputs as a tensor vector. It can + // be used to run some dataset operations that do not have an internal + // customized `Dataset` class (e.g. `ReduceDatasetOp`). + absl::Status RunDatasetOp(const DatasetParams& dataset_params, + std::vector* outputs); + + // The method validates whether the two tensors have the same shape, dtype, + // and value. + static absl::Status ExpectEqual(const Tensor& a, const Tensor& b); + + // The method validates whether the two tensor vectors have the same tensors. + // If `compare_order` is false, the method will only evaluate whether the two + // vectors have the same elements regardless of order. + static absl::Status ExpectEqual(std::vector produced_tensors, + std::vector expected_tensors, + bool compare_order); + + // Checks `IteratorBase::GetNext()`. + absl::Status CheckIteratorGetNext(const std::vector& expected_outputs, + bool compare_order); + + // Checks `IteratorBase::GetNext()`. 
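// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; not part of the vendored header
// or of this diff. It shows the typical shape of a test built on
// DatasetOpsTestBase: initialize from params, then compare the iterator's
// output against expected tensors. The fixture name is hypothetical, and
// gtest plus the usual TF_ASSERT_OK macro are assumed.
class RangeLikeDatasetOpTest : public tensorflow::data::DatasetOpsTestBase {};

TEST_F(RangeLikeDatasetOpTest, GetNext) {
  tensorflow::data::RangeDatasetParams params(/*start=*/0, /*stop=*/4,
                                              /*step=*/1);
  TF_ASSERT_OK(Initialize(params));
  TF_ASSERT_OK(CheckIteratorGetNext(
      tensorflow::data::CreateTensors<int64_t>(tensorflow::TensorShape({}),
                                               {{0}, {1}, {2}, {3}}),
      /*compare_order=*/true));
}
// ---------------------------------------------------------------------------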
+ absl::Status CheckIteratorGetNext(TestIterator* iterator, + const std::vector& expected_outputs, + bool compare_order); + + // Checks `IteratorBase::GetNext()`. + absl::Status CheckIteratorGetNext(IteratorBase* iterator, + IteratorContext* ctx, + const std::vector& expected_outputs, + bool compare_order); + + // Checks `IteratorBase::Skip()` + absl::Status CheckIteratorSkip(int num_to_skip, int expected_num_skipped, + bool get_next, + const std::vector& expected_outputs, + bool compare_order); + + // Checks that iterating through the dataset using a split provider produces + // the expected outputs. + absl::Status CheckSplitProviderFullIteration( + const DatasetParams& params, const std::vector& expected_outputs); + + // Checks that iterating through the dataset using a sharded split provider + // with the given `num_shards` and `shard_index` produces the expected + // outputs. + absl::Status CheckSplitProviderShardedIteration( + const DatasetParams& params, int64_t num_shards, int64_t shard_index, + const std::vector& expected_outputs); + + // Checks `DatasetBase::node_name()`. + absl::Status CheckDatasetNodeName(const string& expected_dataset_node_name); + + // Checks `DatasetBase::type_string()`. + absl::Status CheckDatasetTypeString(const string& expected_type_str); + + // Checks `DatasetBase::output_dtypes()`. + absl::Status CheckDatasetOutputDtypes( + const DataTypeVector& expected_output_dtypes); + + // Checks `DatasetBase::output_shapes()`. + absl::Status CheckDatasetOutputShapes( + const std::vector& expected_output_shapes); + + // Checks `DatasetBase::Cardinality()`. + absl::Status CheckDatasetCardinality(int expected_cardinality); + + // Checks `DatasetBase::options()`. + absl::Status CheckDatasetOptions(const Options& expected_options); + + // Checks `IteratorBase::output_dtypes()`. + absl::Status CheckIteratorOutputDtypes( + const DataTypeVector& expected_output_dtypes); + + // Checks `IteratorBase::output_shapes()`. + absl::Status CheckIteratorOutputShapes( + const std::vector& expected_output_shapes); + + // Checks `IteratorBase::prefix()`. + absl::Status CheckIteratorPrefix(const string& expected_iterator_prefix); + + absl::Status CheckIteratorSaveAndRestore( + DatasetBase* dataset, IteratorContext* iterator_ctx, + const std::string& iterator_prefix, + const std::vector& expected_outputs, + const std::vector& breakpoints, bool compare_order); + + absl::Status CheckIteratorSaveAndRestore( + const std::string& iterator_prefix, + const std::vector& expected_outputs, + const std::vector& breakpoints, bool compare_order); + + // A class for testing variant tensors. + class TestVariant { + public: + TestVariant() = default; + explicit TestVariant(const std::vector& tensors) + : tensors_(tensors) {} + + bool operator!=(const TestVariant& rhs) const { + return !ExpectEqual(tensors_, rhs.tensors_, /*compare_order=*/true).ok(); + } + + constexpr static const char kTypeName[] = "tensorflow::data::TestVariant"; + + string TypeName() const { return kTypeName; } + + // Encodes the contents of this object into `data`. This function signature + // is required for objects to be stored in `tensorflow::Variant`s. See the + // docs for `tensorflow::Variant` for more information and see + // `tensorflow::Variant::Encode` for how this is used. + void Encode(VariantTensorData* data) const { + data->set_type_name(TypeName()); + for (const auto& tensor : tensors_) { + data->add_tensor(tensor); + } + } + + // Decodes `data` and updates the contents of this object. 
This function + // signature is required for objects to be stored in `tensorflow::Variant`s. + // See the docs for `tensorflow::Variant` for more information and see + // `tensorflow::Variant::Decode` for how this is used. + bool Decode(VariantTensorData data) { + tensors_ = data.tensors(); + return true; + } + + string DebugString() const { + string result = "TestVariant(["; + for (const auto& tensor : tensors_) { + if (&tensor != &tensors_[0]) result += ", "; + result += tensor.DebugString(); + } + result += "])"; + return result; + } + + private: + std::vector tensors_; + }; + + // Returns a scalar variant tensor containing a `TestVariant` object + // containing `tensors`. + static Tensor CreateTestVariantTensor(const std::vector& tensors) { + Tensor tensor{DT_VARIANT, TensorShape({})}; + TestVariant test_variant{tensors}; + tensor.scalar()() = test_variant; + return tensor; + } + + protected: + // Make destructor protected so that DatasetOpsTestBase objects cannot + // be instantiated directly. Only subclasses can be instantiated. + ~DatasetOpsTestBase() override; + + // Creates a thread pool for parallel tasks. + absl::Status InitThreadPool(int thread_num); + + // Initializes the runtime for computing the dataset operation and registers + // the input function definitions. `InitThreadPool()' needs to be called + // before this method if we want to run the tasks in parallel. + absl::Status InitFunctionLibraryRuntime(const std::vector& flib, + int cpu_num); + + // Creates a new op kernel based on the node definition. + absl::Status CreateOpKernel(const NodeDef& node_def, + std::unique_ptr* op_kernel); + + // Creates a new op kernel context. + absl::Status CreateDatasetContext( + OpKernel* dateset_kernel, absl::InlinedVector* inputs, + std::unique_ptr* dataset_context_params, + std::unique_ptr* dataset_context); + + // Creates a new dataset. + absl::Status CreateDataset(OpKernel* kernel, OpKernelContext* context, + DatasetBase** dataset); + + // Restores the state of the input iterator. It resets the iterator before + // restoring it to make sure the input iterator does not hold any + // resources or tasks. Otherwise, restoring an existing iterator may cause + // the timeout issue or duplicated elements. + absl::Status RestoreIterator(IteratorContext* ctx, + IteratorStateReader* reader, + const string& output_prefix, + const DatasetBase& dataset, + std::unique_ptr* iterator); + + // Fetches the dataset from the operation context. + absl::Status GetDatasetFromContext(OpKernelContext* context, int output_index, + DatasetBase** dataset); + + // Runs an operation producing outputs. + absl::Status RunOpKernel(OpKernel* op_kernel, OpKernelContext* context); + + // Executes a function producing outputs. + absl::Status RunFunction(const FunctionDef& fdef, test::function::Attrs attrs, + const std::vector& args, + const GraphConstructorOptions& graph_options, + std::vector rets); + + // Checks that the size of `inputs` matches the requirement of the op kernel. + absl::Status CheckOpKernelInput( + const OpKernel& kernel, + const absl::InlinedVector& inputs); + + // Creates a new context for running the dataset operation. + absl::Status CreateOpKernelContext( + OpKernel* kernel, absl::InlinedVector* inputs, + std::unique_ptr* context); + + // Creates a new context for running the dataset operation. 
+ absl::Status CreateOpKernelContext( + OpKernel* kernel, absl::InlinedVector* inputs, + std::unique_ptr* params, + std::unique_ptr* context); + + // Creates a new iterator context for iterating the dataset. + absl::Status CreateIteratorContext( + OpKernelContext* op_context, + std::unique_ptr* iterator_context); + + // Creates a new iterator context for iterating the dataset. + // Creates a new serialization context for serializing the dataset and + // iterator. + absl::Status CreateSerializationContext( + std::unique_ptr* context); + + // Creates the dataset op kernel. + absl::Status MakeGetOptionsOpKernel(const DatasetParams& dataset_params, + std::unique_ptr* op_kernel); + + private: + // Runs the dataset operation according to the predefined dataset params and + // the produced outputs will be stored in `dataset_ctx`. + absl::Status RunDatasetOp( + const DatasetParams& dataset_params, + std::unique_ptr* dataset_kernel, + std::unique_ptr* dataset_ctx_params, + std::vector>* created_tensors, + std::unique_ptr* dataset_ctx); + + absl::Status MakeDataset( + const DatasetParams& dataset_params, + std::unique_ptr* dataset_kernel, + std::unique_ptr* dataset_ctx_params, + std::unique_ptr* dataset_ctx, + std::vector>* created_tensors, + DatasetBase** dataset); + + // Creates the dataset op kernel. + absl::Status MakeDatasetOpKernel(const DatasetParams& dataset_params, + std::unique_ptr* dataset_kernel); + + // Creates a dataset tensor according to the input dataset params. + absl::Status MakeDatasetTensor( + const DatasetParams& dataset_params, + std::vector>* created_tensors, + std::unique_ptr* dataset); + + // Adds an empty tensor with the specified dtype and shape to the input + // vector. + absl::Status AddDatasetInput(absl::InlinedVector* inputs, + DataTypeVector input_types, DataType dtype, + const TensorShape& shape); + + protected: + std::unique_ptr device_; + DeviceType device_type_; + int cpu_num_; + int thread_num_; + Allocator* allocator_; // Owned by `AllocatorFactoryRegistry`. + std::vector allocator_attrs_; + std::unique_ptr step_container_; + + // Device manager is used by function handle cache and needs to outlive it. + std::unique_ptr device_mgr_; + std::unique_ptr pflr_; + FunctionLibraryRuntime* flr_; // Owned by `pflr_`. + std::unique_ptr function_handle_cache_; + std::function)> runner_; + std::unique_ptr lib_def_; + std::unique_ptr resource_mgr_; + std::unique_ptr + slice_reader_cache_; + std::unique_ptr thread_pool_; + std::vector> tensors_; // Owns tensors. + mutex lock_for_refs_; // Used as the Mutex for inputs added as refs. + std::unique_ptr cancellation_manager_; + + // Indicates if the below fields have been initialized. 
+ bool initialized_ = false; + std::unique_ptr dataset_kernel_; + std::unique_ptr params_; + std::unique_ptr dataset_ctx_; + DatasetBase* dataset_ = nullptr; + std::unique_ptr iterator_ctx_; + std::unique_ptr iterator_; +}; + +#define ITERATOR_GET_NEXT_TEST_P(dataset_op_test_class, dataset_params_class, \ + test_cases) \ + class ParameterizedGetNextTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + GetNextTestCase> {}; \ + \ + TEST_P(ParameterizedGetNextTest, GetNext) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK( \ + CheckIteratorGetNext(test_case.expected_outputs, \ + /*compare_order=*/test_case.compare_order)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedGetNextTest, \ + ::testing::ValuesIn( \ + std::vector>(test_cases))); + +#define ITERATOR_SKIP_TEST_P(dataset_op_test_class, dataset_params_class, \ + test_cases) \ + class ParameterizedSkipTest : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + SkipTestCase> {}; \ + \ + TEST_P(ParameterizedSkipTest, Skip) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckIteratorSkip( \ + test_case.num_to_skip, test_case.expected_num_skipped, \ + test_case.get_next, test_case.expected_outputs, \ + /*compare_order=*/test_case.compare_order)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedSkipTest, \ + ::testing::ValuesIn( \ + std::vector>(test_cases))); + +#define DATASET_NODE_NAME_TEST_P(dataset_op_test_class, dataset_params_class, \ + test_cases) \ + class ParameterizedDatasetNodeNameTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + DatasetNodeNameTestCase> {}; \ + \ + TEST_P(ParameterizedDatasetNodeNameTest, DatasetNodeName) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckDatasetNodeName(test_case.expected_node_name)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedDatasetNodeNameTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define DATASET_TYPE_STRING_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + class ParameterizedDatasetTypeStringTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + DatasetTypeStringTestCase> {}; \ + \ + TEST_P(ParameterizedDatasetTypeStringTest, DatasetTypeString) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK( \ + CheckDatasetTypeString(test_case.expected_dataset_type_string)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedDatasetTypeStringTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define DATASET_OUTPUT_DTYPES_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + \ + class ParameterizedDatasetOutputDtypesTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + DatasetOutputDtypesTestCase> {}; \ + \ + TEST_P(ParameterizedDatasetOutputDtypesTest, DatasetOutputDtypes) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckDatasetOutputDtypes(test_case.expected_output_dtypes)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedDatasetOutputDtypesTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + 
test_cases))); + +#define DATASET_OUTPUT_SHAPES_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + \ + class ParameterizedDatasetOutputShapesTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + DatasetOutputShapesTestCase> {}; \ + \ + TEST_P(ParameterizedDatasetOutputShapesTest, DatasetOutputShapes) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckDatasetOutputShapes(test_case.expected_output_shapes)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedDatasetOutputShapesTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define DATASET_CARDINALITY_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + \ + class ParameterizedCardinalityTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + CardinalityTestCase> {}; \ + \ + TEST_P(ParameterizedCardinalityTest, Cardinality) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckDatasetCardinality(test_case.expected_cardinality)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedCardinalityTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define ITERATOR_OUTPUT_DTYPES_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + class ParameterizedIteratorOutputDtypesTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + IteratorOutputDtypesTestCase> {}; \ + \ + TEST_P(ParameterizedIteratorOutputDtypesTest, IteratorOutputDtypes) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckDatasetOutputDtypes(test_case.expected_output_dtypes)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedIteratorOutputDtypesTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define ITERATOR_OUTPUT_SHAPES_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + class ParameterizedIteratorOutputShapesTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + IteratorOutputShapesTestCase> {}; \ + \ + TEST_P(ParameterizedIteratorOutputShapesTest, IteratorOutputShapes) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckIteratorOutputShapes(test_case.expected_output_shapes)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedIteratorOutputShapesTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define ITERATOR_PREFIX_TEST_P(dataset_op_test_class, dataset_params_class, \ + test_cases) \ + class ParameterizedIteratorPrefixTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< \ + IteratorPrefixTestCase> {}; \ + \ + TEST_P(ParameterizedIteratorPrefixTest, IteratorPrefix) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckIteratorPrefix(test_case.expected_iterator_prefix)); \ + } \ + \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedIteratorPrefixTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +#define ITERATOR_SAVE_AND_RESTORE_TEST_P(dataset_op_test_class, \ + dataset_params_class, test_cases) \ + class ParameterizedIteratorSaveAndRestoreTest \ + : public dataset_op_test_class, \ + public ::testing::WithParamInterface< 
\ + IteratorSaveAndRestoreTestCase> {}; \ + TEST_P(ParameterizedIteratorSaveAndRestoreTest, IteratorSaveAndRestore) { \ + auto test_case = GetParam(); \ + TF_ASSERT_OK(Initialize(test_case.dataset_params)); \ + TF_ASSERT_OK(CheckIteratorSaveAndRestore( \ + test_case.dataset_params.iterator_prefix(), \ + test_case.expected_outputs, test_case.breakpoints, \ + test_case.compare_order)); \ + } \ + INSTANTIATE_TEST_SUITE_P( \ + dataset_op_test_class, ParameterizedIteratorSaveAndRestoreTest, \ + ::testing::ValuesIn( \ + std::vector>( \ + test_cases))); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_DATASET_TEST_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/dataset_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/dataset_utils.h new file mode 100644 index 00000000..929af873 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/dataset_utils.h @@ -0,0 +1,429 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_DATASET_UTILS_H_ +#define TENSORFLOW_CORE_DATA_DATASET_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace data { + +// Constant used for indicating that the argument of tf.data.Dataset.shard +// should be supplied by the auto-sharding rewrite. +constexpr int kShardHint = -1; + +// Creates a resource handle with a unique name for the given resource where +// the resource is managed by the Resource Manager. +template +absl::Status CreateWeakHandle(OpKernelContext* ctx, T* resource, + const string& container_name, + ResourceHandle* handle) { + static std::atomic resource_id_counter(0); + string unique_name = + strings::StrCat(container_name, resource_id_counter.fetch_add(1)); + ResourceMgr* mgr = ctx->resource_manager(); + TF_RETURN_IF_ERROR(mgr->Create(container_name, unique_name, resource)); + + *handle = MakeResourceHandle(container_name, unique_name, *ctx->device(), + TypeIndex::Make()); + return absl::OkStatus(); +} + +// Creates a ref-counting resource handle for the given resource, where the +// resource is owned by the handle. 
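+//
+// A call sketch (hedged; `MyResource` is a hypothetical ResourceBase subclass
+// and `resource` a pointer the handle should take ownership of):
+//
+//   ResourceHandle handle;
+//   TF_RETURN_IF_ERROR(CreateHandle(ctx, resource, &handle));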
+template +absl::Status CreateHandle(OpKernelContext* ctx, T* resource, + ResourceHandle* handle) { + ResourceMgr* mgr = ctx->resource_manager(); + *handle = + ResourceHandle::MakeRefCountingHandle(resource, ctx->device()->name()); + TF_RETURN_IF_ERROR( + mgr->CreateUnowned(handle->container(), handle->name(), resource)); + return absl::OkStatus(); +} + +// TODO(b/198162355): Merge this class with ResourceOpKernel. +template +class AnonymousResourceOp : public OpKernel { + public: + // Creates an AnonymousResourceOp. + // ref_counting: Determines if the Op returns a ref-counting ResourceHandle. + // ResourceHandle. See go/tf-resource-handle-ref-count. + // return_deleter: Determines if the Op outputs a deleter tensor in addition + // to the resource handle tensor. + // If the resource handle is ref-counting, a no-op deleter is returned. + explicit AnonymousResourceOp(OpKernelConstruction* context, bool ref_counting, + bool return_deleter) + : OpKernel(context), + ref_counting_(ref_counting), + return_deleter_(return_deleter) {} + + void Compute(OpKernelContext* ctx) override { + FunctionLibraryRuntime* lib; + std::unique_ptr flib_def(nullptr); + std::unique_ptr pflr(nullptr); + OP_REQUIRES_OK( + ctx, ctx->function_library()->Clone(&flib_def, &pflr, &lib, true)); + T* resource; + OP_REQUIRES_OK(ctx, CreateResource(ctx, std::move(flib_def), + std::move(pflr), lib, &resource)); + + ResourceHandle handle; + if (ref_counting_) { + OP_REQUIRES_OK(ctx, CreateHandle(ctx, resource, &handle)); + } else { + OP_REQUIRES_OK(ctx, CreateWeakHandle(ctx, resource, name(), &handle)); + } + Tensor* handle_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle_t)); + handle_t->scalar()() = handle; + + if (return_deleter_) { + Tensor* deleter_t; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({}), &deleter_t, attr)); + // TODO(feyu): Consider returning an OptionalVariant. + if (!ref_counting_) { + // A deleter output that deletes the resource when destroyed. + deleter_t->scalar()() = + ResourceDeleter(handle, ctx->resource_manager()); + } + } + } + + protected: + virtual string name() = 0; + + virtual absl::Status CreateResource( + OpKernelContext* ctx, std::unique_ptr flib_def, + std::unique_ptr pflr, + FunctionLibraryRuntime* lib, T** resource) = 0; + + private: + const bool ref_counting_; + const bool return_deleter_; +}; + +// Returns OkStatus() if `expected` and `received` types match, +// errors::InvalidArgument otherwise. +absl::Status VerifyTypesMatch(const DataTypeVector& expected, + const DataTypeVector& received); + +absl::Status VerifyTypesMatch(const DataTypeVector& expected, + const std::vector& received); + +// Returns OkStatus() if `expected` and `received` shapes are compatible, +// errors::InvalidArgument otherwise. +absl::Status VerifyShapesCompatible( + const std::vector& expected, + const std::vector& received); + +absl::Status VerifyShapesCompatible( + const std::vector& expected, + const std::vector& received); + +// Dataset op level determinism policy. +class DeterminismPolicy { + public: + enum class Type : int { + // The op must produce elements deterministically. + kDeterministic, + // The op may relax determinism to improve performance. + kNondeterministic, + // The determinism policy is not specified at the op level. In this case we + // use the experimental_deterministic dataset option to determine the + // determinism policy. 
+ kDefault, + }; + static constexpr const char* const kDeterministic = "true"; + static constexpr const char* const kNondeterministic = "false"; + static constexpr const char* const kDefault = "default"; + + DeterminismPolicy() : determinism_(Type::kDefault) {} + explicit DeterminismPolicy(Type determinism) : determinism_(determinism) {} + // Creates a DeterminismPolicy with Type kDeterministic or + // kNondeterministic, depending on the values of `is_deterministic`. + explicit DeterminismPolicy(bool is_deterministic); + + static absl::Status FromString(const std::string& s, DeterminismPolicy* out); + + // Returns the string representing the determinism policy. This will be one of + // the string constants defined above. + std::string String() const; + + /// Convenience methods for checking the DeterminismPolicy::Type. + bool IsDeterministic() const { return determinism_ == Type::kDeterministic; } + bool IsNondeterministic() const { + return determinism_ == Type::kNondeterministic; + } + bool IsDefault() const { return determinism_ == Type::kDefault; } + + private: + Type determinism_; +}; + +// Resolves non-deterministic seeds if necessary, returning either the original +// seeds or the resolved seeds. +// +// By TensorFlow convention, if both seeds are 0, they should be replaced with +// non-deterministically chosen seeds. +std::pair MaybeOverrideSeeds( + std::pair seeds); + +// Adds the functions in `to_add` to `base`. If a function with a matching +// signature already exists in `base`, replaces it with the function from +// `to_add`. +absl::Status AddToFunctionLibrary(FunctionLibraryDefinition* base, + const FunctionLibraryDefinition& to_add); +absl::Status AddToFunctionLibrary(FunctionLibraryDefinition* base, + const FunctionDefLibrary& to_add); + +// Determines whether the given function is stateful. +absl::Status IsFunctionStateful(const FunctionLibraryDefinition& library, + const FunctionDef& function_def); + +// Determines whether the given node is stateful. +absl::Status IsNodeStateful(const FunctionLibraryDefinition& library, + const NodeDef& node); + +// Creates a runner that runs functions with limited parallelism. +std::function)> RunnerWithMaxParallelism( + std::function)> runner, int max_parallelism); + +// Op for creating a typed dummy resource. +// +// This op is used to provide a resource "placeholder" for ops such as +// `CacheDatasetV2` or `ShuffleDatasetV2` that expects a resource input. +// Originally, the lifetime of the resources passed into these ops was managed +// externally. After the implementation changed to manage the lifetime of the +// resources (including creation) by the ops themselves, the resource input is +// only needed to pass a resource handle through graph rewrites. When they are +// invoked from user code, the implementation passes in a dummy resource. +template +class DummyResourceOp : public OpKernel { + public: + explicit DummyResourceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &tensor)); + tensor->scalar()() = MakeResourceHandle( + ctx, /*container=*/"", /*name=*/"dummy_resource"); + } +}; + +// Given an op prefix and an op to match, returns whether the op to match +// is a match for any version of the op prefix. 
For example, +// MatchesAnyVersion("BatchDataset", "BatchDataset") == true +// MatchesAnyVersion("BatchDataset", "BatchDatasetV2") == true +// MatchesAnyVersion("BatchDataset", "BatchDatasetV3") == true +// MatchesAnyVersion("PaddedBatchDataset", "BatchDataset") == false +bool MatchesAnyVersion(absl::string_view op_prefix, + absl::string_view op_to_match); + +// Returns the index-th slice of a given tensor. If the index-th slice of +// the tensor is not aligned, returns a deep copy of the tensor. +Tensor MaybeCopySubSlice(const Tensor& tensor, int64 index); + +// Removes device placements from the ops of all functions in `library`. +void StripDevicePlacement(FunctionDefLibrary* library); + +// Copies partial of the batch output. +absl::Status CopyPartialBatch(int64_t num_elements, const Tensor& value, + Tensor* output); + +// Reads a batch when restoring the iterator. +absl::Status ReadBatch(IteratorContext* ctx, IteratorStateReader* reader, + int64_t batch_size, const string& iterator_prefix, + const string& batch_prefix, std::vector* batch); + +// Writes a batch when saving the iterator. +absl::Status WriteBatch(int64_t batch_size, int64_t num_elements, + const string& iterator_prefix, + const string& batch_prefix, IteratorStateWriter* writer, + std::vector* batch); + +// Reads a status when restoring the iterator. +absl::Status ReadStatus(const string& iterator_prefix, const string& prefix, + IteratorStateReader* reader, absl::Status* status); + +// Writes a status when saving the iterator. +absl::Status WriteStatus(const string& iterator_prefix, const string& prefix, + const absl::Status& status, + IteratorStateWriter* writer); + +// Processes a batch to output. In the case a partial batch is encountered, copy +// only partial of the batch. +absl::Status ProcessBatch(int64_t batch_size, int64_t num_elements, + bool drop_remainder, const absl::Status& status, + IteratorContext* ctx, std::vector* output, + bool* end_of_sequence, std::vector* batch); + +// Copies the input elements to a batch. +// +// The `batch_elements` argument contains the individual elements to copy into a +// batch. The `parallel_copy` argument indicates whether to parallelize the +// copy. +// The `out_tensors` argument will be used to store the resulting batch (one for +// each component of the input). +absl::Status CopyBatch(AnyContext ctx, + std::vector>&& batch_elements, + bool parallel_copy, std::vector* out_tensors); + +// Computes the set of experiments to apply based on the job name, task id, +// rollout percentage of registered experiments, and the +// TF_DATA_EXPERIMENT_OPT_IN and TF_DATA_EXPERIMENT_OPT_OUT environment +// variables. +absl::flat_hash_set GetExperiments(); +absl::flat_hash_set GetExperiments( + const std::string& job_name, int64_t task_id, + std::function hash_func); + +// Logs and records the experiments that will be applied. +void LogAndRecordExperiments(const absl::flat_hash_set& experiments); + +// Computes the set of enabled, disabled, and default optimizations based on the +// given options. An optimization must be a graph optimizer name that has been +// registered with Grappler. +void GetOptimizations(const Options& options, + absl::flat_hash_set* optimizations_enabled, + absl::flat_hash_set* optimizations_disabled, + absl::flat_hash_set* optimizations_default); + +// Creates graph rewrite configs based on the given options. The configs will +// only be used if their corresponding optimizers registered with Grappler are +// enabled. 
+// A config is a string with the following format: +// :: +absl::flat_hash_set CreateGraphRewriteConfigs(const Options& options); + +// Determines whether max intra-op parallelism should be configured. +bool ShouldConfigureMaxIntraOpParallelism(const Options& options); + +// Determines whether private threadpool should be used. +bool ShouldUsePrivateThreadPool(const Options& options); + +// Determines whether autotuning should be used. +bool ShouldUseAutotuning(const Options& options); + +// Determines whether optimizations should be applied. +bool ShouldApplyOptimizations( + const Options& options, + const absl::flat_hash_set& optimizations_enabled, + const absl::flat_hash_set& optimizations_default); + +// Returns the default CPU budget. +inline int GetCpuBudget() { + static bool in_experiment = GetExperiments().contains("tune_cpu_budget"); + return (in_experiment ? 1.2 : 1.0) * port::NumSchedulableCPUs(); +} + +// Returns the initial value for parallelism parameter before the first Autotune +// optimization. +int64 GetAutotuneDefaultParallelism(IteratorContext* ctx); + +// Creates an iterator context appropriate for a nested dataset's iterator. A +// nested dataset is a dataset created within another dataset, e.g. by the +// function passed to `interleave` or `flat_map`. +IteratorContext MakeNestedIteratorContext(IteratorContext* ctx); + +// A `DatasetExperimentRegistry::JobSelector` that randomly selects +// `rollout_pct` percent of all jobs. `name_hash` is a hash of the experiment +// and job names. +template +bool RandomJobSamplePercentage(uint64_t name_hash) { + return name_hash % 100 < rollout_pct; +} + +// A `DatasetExperimentRegistry::TaskSelector` that selects all tasks. +bool AllTasks(int64_t unused_task_id, bool unused_evens); + +// A `DatasetExperimentRegistry::TaskSelector` that selects the tasks for half +// of all hosts. Typically, one or two consecutive tasks run on a single host. +// If `evens` is `true`, selects tasks 0,1,4,5,8,9,..., otherwise selects tasks +// 2,3,6,7,10,11,... +bool IndependentHostTasks(int64_t task_id, bool evens); + +// Registry of tf.data experiments. +class DatasetExperimentRegistry { + public: + using JobSelector = std::function; + using TaskSelector = std::function; + + struct ExperimentSelector { + JobSelector job_selector; + TaskSelector task_selector; + }; + + // Registers the experiment. + static void Register(const string& experiment, JobSelector job_selector, + TaskSelector task_selector); + + // Returns all registered experiments. + static absl::flat_hash_map Experiments(); +}; + +// Helper class to register a dataset experiment. +class DatasetExperimentRegistrar { + public: + explicit DatasetExperimentRegistrar( + const string& experiment, + DatasetExperimentRegistry::JobSelector job_selector, + DatasetExperimentRegistry::TaskSelector task_selector) { + DatasetExperimentRegistry::Register(experiment, job_selector, + task_selector); + } +}; + +// Macro that can be used to register a dataset experiment. 
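+//
+// For example (sketch; the experiment name is illustrative, the selectors are
+// the helpers declared above):
+//
+//   REGISTER_DATASET_EXPERIMENT("my_experiment",
+//                               RandomJobSamplePercentage<50>,
+//                               IndependentHostTasks);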
+#define REGISTER_DATASET_EXPERIMENT(experiment, job_selector, task_selector) \ + REGISTER_DATASET_OP_NAME_UNIQ_HELPER(__COUNTER__, experiment, job_selector, \ + task_selector) + +#define REGISTER_DATASET_OP_NAME_UNIQ_HELPER(ctr, experiment, job_selector, \ + task_selector) \ + REGISTER_DATASET_OP_NAME_UNIQ(ctr, experiment, job_selector, task_selector) + +#define REGISTER_DATASET_OP_NAME_UNIQ(ctr, experiment, job_selector, \ + task_selector) \ + static ::tensorflow::data::DatasetExperimentRegistrar \ + registrar__body__##ctr##__object(experiment, job_selector, \ + task_selector) + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_DATASET_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/finalization_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/finalization_utils.h new file mode 100644 index 00000000..07e1d75b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/finalization_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_FINALIZATION_UTILS_H_ +#define TENSORFLOW_CORE_DATA_FINALIZATION_UTILS_H_ + +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +// Returns the finalized version of the dataset. The returned DatasetBase is +// unowned and lives for as long as this dataset. +absl::StatusOr GetFinalizedDataset(OpKernelContext* ctx, + const DatasetBase* dataset); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_FINALIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/flat_map_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/flat_map_utils.h new file mode 100644 index 00000000..658f6855 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/flat_map_utils.h @@ -0,0 +1,112 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_FLAT_MAP_UTILS_H_ +#define TENSORFLOW_CORE_DATA_FLAT_MAP_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tsl/platform/refcount.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace data { + +// Utility class for computing the cardinality of a flat map dataset. +class FlatMapRandomAccessHandler { + public: + // Initializes the counter. This will save necessary information from `ctx`. + // `input_dataset` is the input dataset passed to `flat_map` (not the flat_map + // dataset). `captured_map_func` is the captured map function. + FlatMapRandomAccessHandler(OpKernelContext* ctx, + const DatasetBase* input_dataset, + CapturedFunction& captured_map_func); + virtual ~FlatMapRandomAccessHandler(); + FlatMapRandomAccessHandler(const FlatMapRandomAccessHandler&) = delete; + FlatMapRandomAccessHandler& operator=(const FlatMapRandomAccessHandler&) = + delete; + + // Returns the dataset cardinality. + absl::StatusOr Cardinality(); + + // Returns the cumulative cardinality at the index-th dataset. + absl::StatusOr CumulativeCardinality(size_t index); + + // Given the flattened element position `element_position`, returns the index + // of the dataset to which the element belongs. + absl::StatusOr GetDatasetIndex(size_t element_position); + + // Creates the dataset iterators. + absl::StatusOr>> MakeInputIterators( + IteratorContext* ctx, const DatasetBaseIterator* parent, + const std::string& prefix); + + private: + // Computes the cumulative cardinalities. + absl::StatusOr> ComputeCardinalities(); + + // Creates the input datasets. Each dataset is the result of applying the map + // function to one element from the input iterator. + absl::StatusOr> MakeInputDatasets() const; + absl::StatusOr MakeInputDataset( + std::vector input_tensors, + const InstantiatedCapturedFunction& map_func) const; + + const DatasetBase* input_dataset_; + CapturedFunction& captured_map_func_; + + // The iterator context which bundles together the necessary runtime support + // to create and get elements from the input dataset. + std::unique_ptr ctx_; + FunctionLibraryRuntime* flr_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + std::unique_ptr interop_threadpool_; + std::unique_ptr function_handle_cache_; + std::function)> runner_; + ResourceMgr resource_mgr_; + CancellationManager cancellation_manager_; + UnboundedThreadPool unbounded_thread_pool_; + + // Input datasets generated by running the map function. Each dataset is the + // result of applying the map function to one element from the input iterator. + std::deque input_datasets_; + + // Cumulative cardinalities. Before `ComputeCardinalities` is called, this is + // an empty vector. After `ComputeCardinalities` is called, the last element + // is the dataset cardinality. 
+ absl::StatusOr> cumulative_cardinalities_ = + std::vector{}; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_FLAT_MAP_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/global_shuffle_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/global_shuffle_utils.h new file mode 100644 index 00000000..66e2ff1a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/global_shuffle_utils.h @@ -0,0 +1,100 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_GLOBAL_SHUFFLE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_GLOBAL_SHUFFLE_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/status/status.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace data { + +// Builds and selects the `IteratorContext` to use based on whether the dataset +// is globally shuffled. +// +// Example usage in `Iterator::GetNextInternal`: +// +// ``` +// IteratorContextWithIndexMapper ctx_with_index_mapper(ctx, this); +// TF_RETURN_IF_ERROR(input_impl_->GetNext( +// ctx_with_index_mapper.Get(), out_tensors, end_of_sequence)); +// ctx_with_index_mapper.MergeCheckpoint(); +// ``` +// +// The iterator should also implement `GetIndexMapper` if it needs to customize +// the index mapping behavior. +class IteratorContextWithIndexMapper { + public: + // Caller keeps ownership of both pointers. + explicit IteratorContextWithIndexMapper(IteratorContext* ctx, + const IteratorBase* iterator); + virtual ~IteratorContextWithIndexMapper() = default; + IteratorContextWithIndexMapper(const IteratorContextWithIndexMapper&) = + delete; + IteratorContextWithIndexMapper& operator=( + const IteratorContextWithIndexMapper&) = delete; + + IteratorContext* Get(); + void MergeCheckpoint(); + + private: + IteratorContext* ctx_; + std::optional ctx_with_index_mapper_; +}; + +// For source datasets that support random access, this class adapts the dataset +// random access API to support globally shuffled iterators. +class GlobalShuffleIterator { + public: + // The dataset is expected to support random access by implementing the + // absl::Status Get(int64_t index, std::vector* out_tensors) const. + explicit GlobalShuffleIterator(const DatasetBase* dataset) + : dataset_(dataset) {} + + // Returns the next shuffled element. + // REQUIRES: ctx->index_mapper() != nullptr. + absl::Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence); + + absl::Status Save(const std::string& parent_iterator_prefix, + SerializationContext* ctx, IteratorStateWriter* writer); + + // Restores the element count. + // REQUIRES: ctx->restored_element_count() != nullopt. 
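+  //
+  // Sketch of how a source iterator might delegate in its RestoreInternal
+  // (the member name `global_shuffle_iterator_` is assumed):
+  //
+  //   if (ctx->restored_element_count().has_value()) {
+  //     return global_shuffle_iterator_.Restore(prefix(), ctx, reader);
+  //   }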
+  absl::Status Restore(const std::string& parent_iterator_prefix,
+                       IteratorContext* ctx, IteratorStateReader* reader);
+
+ private:
+  const DatasetBase* const dataset_;
+
+  mutable absl::Mutex mu_;
+
+  // Count of elements produced by this iterator when it runs in the random
+  // access mode.
+  int64_t element_count_ ABSL_GUARDED_BY(mu_) = 0;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DATA_GLOBAL_SHUFFLE_UTILS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/data/hash_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/hash_utils.h
new file mode 100644
index 00000000..2effd416
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/data/hash_utils.h
@@ -0,0 +1,64 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DATA_HASH_UTILS_H_
+#define TENSORFLOW_CORE_DATA_HASH_UTILS_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace data {
+
+// Returns a stable hash of the subgraph rooted at the given node.
+//
+// NOTE: There is currently no guarantee that the hash of a subgraph will stay
+// the same between TensorFlow builds.
+absl::Status HashNode(const GraphDef& graph, const NodeDef& node, uint64* hash);
+absl::Status HashNode(const GraphDef& graph, const NodeDef& node,
+                      const FunctionLibraryDefinition& flib_def, uint64* hash);
+
+// Returns a stable hash of the given tensor.
+//
+// NOTE: There is currently no guarantee that the hash of a subgraph will stay
+// the same between TensorFlow builds.
+absl::Status HashTensor(const Tensor& tensor, uint64* hash);
+
+// Returns a stable hash of the given graph.
+//
+// NOTE: There is currently no guarantee that the hash of a subgraph will stay
+// the same between TensorFlow builds.
+absl::Status HashGraph(const GraphDef& graph, uint64* hash);
+
+// Determines whether the given graphs are equal, following the same logic used
+// for HashGraph. Returns OK if the graphs can be determined to be equal,
+// otherwise returns an error message explaining why the graphs couldn't be
+// determined to be equal.
+absl::Status CheckGraphsEqual(const GraphDef& a, const GraphDef& b);
+
+// Determines whether the subgraphs rooted at the given nodes are equal
+// following the same logic used for HashGraph. Returns OK if the graphs can be
+// determined to be equal, otherwise returns an error message explaining why the
+// graphs couldn't be determined to be equal.
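+//
+// For example (sketch; the graphs and node pointers are hypothetical):
+//
+//   TF_RETURN_IF_ERROR(
+//       CheckSubgraphsEqual(graph_a, &node_a, graph_b, &node_b));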
+absl::Status CheckSubgraphsEqual(const GraphDef& a, const NodeDef* node_a, + const GraphDef& b, const NodeDef* node_b); +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_HASH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/metric_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/metric_utils.h new file mode 100644 index 00000000..7d67cb92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/metric_utils.h @@ -0,0 +1,87 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_METRIC_UTILS_H_ +#define TENSORFLOW_CORE_DATA_METRIC_UTILS_H_ + +#include +#include +#include + +#include "absl/time/time.h" +#include "tensorflow/core/data/tfdataz_metrics.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// Exports the metrics for `GetNext` calls by tf.data iterators. When the user +// calls `RecordStart` and `RecordStop`, it will export a latency sample. It +// also exports throughput, tf.data iterator life time, etc. This class is +// thread-safe. Example usage: +// +// ``` +// IteratorMetricsCollector metrics_collector(DEVICE_CPU, env); +// absl::Time start_time = metrics_collector.RecordStart(); +// auto status = iterator_->GetNext(IteratorContext(std::move(params)), +// out_tensors, end_of_sequence); +// metrics_collector.RecordStop(start_time, *out_tensors); +// ``` +class IteratorMetricsCollector { + public: + // Constructs a `IteratorMetricsCollector`. `device_type` is one of the + // devices defined in `types.h` (DEVICE_CPU, DEVICE_GPU, DEVICE_TPU, etc). + // We only collect metrics for CPU devices. This is a heuristic to avoid + // collecting metrics for device-side iterators created by the multi-device + // iterator mechanism. + IteratorMetricsCollector(const std::string& device_type, const Env& env); + + // Starts the timer for the next `GetNext` call. Returns the start time. + absl::Time RecordStart(); + + // Records metrics for the most recent `GetNext` call, including the latency, + // bytes fetched, iterator life time, etc. `start_time` is the start time + // returned by `RecordStart`. `output` is the output of the `GetNext` call. + void RecordStop(absl::Time start_time, const std::vector& output); + + private: + // We only collect metrics for CPU devices. + bool ShouldCollectMetrics() const; + + // One of the devices defined in `types.h` + // (DEVICE_CPU, DEVICE_GPU, DEVICE_TPU, etc). + const std::string device_type_; + const Env& env_; + + mutex mu_; + + // Records the number of currently active `GetNext` calls. 
+ uint64_t num_active_calls_ TF_GUARDED_BY(mu_) = 0; + + // Records the start time (in microseconds) of the first `RecordStart()` call + // that followed the last period of inactivity. + uint64_t first_start_time_us_ TF_GUARDED_BY(mu_) = 0; + + // Records the end time (in microseconds) of the most recent `RecordStop()` + // call. + uint64_t end_time_us_ TF_GUARDED_BY(mu_) = 0; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_METRIC_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/name_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/name_utils.h new file mode 100644 index 00000000..72e870a1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/name_utils.h @@ -0,0 +1,109 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_NAME_UTILS_H_ +#define TENSORFLOW_CORE_DATA_NAME_UTILS_H_ + +#include + +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace data { +namespace name_utils { + +extern const char kDelimiter[]; +extern const char kDefaultDatasetDebugStringPrefix[]; + +struct OpNameParams { + int op_version = 1; +}; + +struct DatasetDebugStringParams { + template + void set_args(T... input_args) { + args = {static_cast(input_args).data()...}; + } + + int op_version = 1; + string dataset_prefix = ""; + std::vector args; +}; + +struct IteratorPrefixParams { + int op_version = 1; + string dataset_prefix = ""; +}; + +// Merge the given args in the format of "(arg1, arg2, ..., argn)". +// +// e.g. ArgsToString({"1", "2", "3"}) -> "(1, 2, 3)"; ArgsToString({}) -> "". +string ArgsToString(const std::vector& args); + +// Returns the dataset op name. +// +// e.g. OpName("Map") -> "MapDataset". +string OpName(const string& dataset_type); + +// Returns the dataset op names. +// +// e.g. OpName(ConcatenateDatasetOp::kDatasetType, OpNameParams()) +// -> "ConcatenateDataset" +// +// OpNameParams params; +// params.op_version = 2; +// OpName(ParallelInterleaveDatasetOp::kDatasetType, params) +// -> "ParallelInterleaveDatasetV2" +string OpName(const string& dataset_type, const OpNameParams& params); + +// Returns a human-readable debug string for this dataset in the format of +// "FooDatasetOp(arg1, arg2, ...)::Dataset". +// +// e.g. DatasetDebugString("Map") -> "MapDatasetOp::Dataset"; +string DatasetDebugString(const string& dataset_type); + +// Returns a human-readable debug string for this dataset in the format of +// "FooDatasetOp(arg1, arg2, ...)::Dataset". +// +// e.g. 
+// DatasetDebugStringParams range_params; +// range_params.set_args(0, 10, 3); +// DatasetDebugString(RangeDatasetOp::kDatasetType, range_params) +// -> "RangeDatasetOp(0, 10, 3)::Dataset"); +string DatasetDebugString(const string& dataset_type, + const DatasetDebugStringParams& params); + +// Returns a string that identifies the sequence of iterators leading up to +// the iterator of this dataset. +// +// e.g. IteratorPrefix("Map", "Iterator::Range") -> "Iterator::Range::Map". +string IteratorPrefix(const string& dataset_type, const string& prefix); + +// Returns a string that identifies the sequence of iterators leading up to +// the iterator of this dataset. +// +// e.g. +// IteratorPrefixParams params; +// params.op_version = 2; +// IteratorPrefix(BatchDatasetOp::KDatasetType, "Iterator::Range", params) -> +// "Iterator::Range::BatchV2". +string IteratorPrefix(const string& dataset_type, const string& prefix, + const IteratorPrefixParams& params); + +} // namespace name_utils +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_NAME_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/rewrite_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/rewrite_utils.h new file mode 100644 index 00000000..addd6f20 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/rewrite_utils.h @@ -0,0 +1,93 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_REWRITE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_REWRITE_UTILS_H_ + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/platform/platform.h" + +// On mobile we do not provide this functionality because not all of its +// dependencies are available there. +#if !defined(IS_MOBILE_PLATFORM) + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace data { + +RewriterConfig CreateRewriterConfig( + const absl::flat_hash_set& optimizations, + const absl::flat_hash_set& optimizations_configs); + +// Rewrites the input dataset using the given config. The rewritten_input +// stored in the core::RefCountPtr* output parameter is owned. 
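+//
+// A call sketch (hedged; `input` and the optimization name are placeholders,
+// `CreateRewriterConfig` is declared above):
+//
+//   core::RefCountPtr<DatasetBase> rewritten;
+//   TF_RETURN_IF_ERROR(RewriteDataset(
+//       ctx, input,
+//       [] { return CreateRewriterConfig({"noop_elimination"}, {}); },
+//       /*record_fingerprint=*/false, &rewritten));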
+absl::Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input, + std::function config_factory, + bool record_fingerprint, + core::RefCountPtr* rewritten_input); + +// Creates a grappler item for `graph_def`, which is required for graph +// optimization. +// `dataset_node` is the name of the node corresponding to the dataset. +// If `add_fake_sinks` is true, it adds fake sink node to graph and functions to +// allow rewriting the actual sink nodes. +// If `apply_optimizations` is true, general grappler optimizations at level +// `tensorflow::OptimizerOptions::L1` are applied to the graph. +// TODO(b/118820916): When MetaOptimizer adds provisions for function retvals to +// be optimizable, we will no longer need to add fake nodes. +std::unique_ptr GetGrapplerItem( + GraphDef* graph_def, std::string* dataset_node, bool add_fake_sinks, + bool apply_optimizations = true); + +// Returns the name of the node corresponding to the dataset. It is indicated by +// the symbolic `_Retval` node. +absl::StatusOr GetDatasetNode(const GraphDef& graph_def); + +// Like `GetDatasetNode` above, but returns the entire node object. +absl::StatusOr GetDatasetNodeDef(const GraphDef& graph_def); + +// Determines which optimizations should be applied. +// +// The result will contain any optimizations that are explicitly enabled, any +// default optimization that are not explicitly disabled, and any experiment +// that corresponds to an optimization as long as the optimization is not +// explicitly disabled. +absl::flat_hash_set SelectOptimizations( + const absl::flat_hash_set& experiments, + const absl::flat_hash_set& optimizations_enabled, + const absl::flat_hash_set& optimizations_disabled, + const absl::flat_hash_set& optimizations_default); + +} // namespace data +} // namespace tensorflow +#endif // !IS_MOBILE_PLATFORM + +#endif // TENSORFLOW_CORE_DATA_REWRITE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/root_dataset.h b/third_party/tflite-hdrs/tensorflow/core/data/root_dataset.h new file mode 100644 index 00000000..e5b8f8db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/root_dataset.h @@ -0,0 +1,108 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_ROOT_DATASET_H_ +#define TENSORFLOW_CORE_DATA_ROOT_DATASET_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/model.pb.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/refcount.h" + +namespace tensorflow { +namespace data { + +// Dataset transformation responsible for internal tf.data logic such as +// autotuning, applying threading configuration. 
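+//
+// Callers normally reach this through `FinalizeDataset` (declared at the
+// bottom of this header) rather than constructing RootDataset directly; a
+// minimal sketch, with ownership handling elided:
+//
+//   DatasetBase* finalized = nullptr;
+//   TF_RETURN_IF_ERROR(FinalizeDataset(ctx, input, &finalized));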
+class RootDataset : public DatasetBase { + public: + struct Params { + bool autotune = true; + model::AutotuneAlgorithm autotune_algorithm; + std::function autotune_cpu_budget_func; + double ram_budget_share; + int64_t autotune_ram_budget_from_options; + int64_t max_intra_op_parallelism = 1; + int64_t private_threadpool_size = 0; + + int64_t ComputeInitialAutotuneRamBudget() const { + if (autotune_ram_budget_from_options > 0) { + return autotune_ram_budget_from_options; + } else { + return ram_budget_share * port::AvailableRam(); + } + } + }; + + static absl::Status FromOptions(const DatasetBase* input, + DatasetBase** output); + static absl::Status FromOptions(core::RefCountPtr input, + DatasetBase** output); + + ~RootDataset() override; + + const DataTypeVector& output_dtypes() const override; + const std::vector& output_shapes() const override; + + int64_t CardinalityInternal(CardinalityOptions options) const override; + absl::Status Get(OpKernelContext* ctx, int64 index, + std::vector* out_tensors) const override; + absl::Status CheckExternalState() const override; + string DebugString() const override; + absl::Status InputDatasets( + std::vector* inputs) const override; + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override; + absl::Status RandomIndexingCompatible() const override { + return random_indexing_compatible_; + } + + protected: + absl::Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override; + + private: + class Iterator; + + RootDataset(const DatasetBase* input, const Params& params); + + RootDataset(core::RefCountPtr input, const Params& params); + + const DatasetBase* input_; + core::RefCountPtr owned_input_; + const Params params_; + TraceMeMetadata traceme_metadata_; + absl::Status random_indexing_compatible_; +}; + +// Finalizes the `input` dataset, which is expected to be called before the +// dataset is about to be iterated. This can for instance apply static graph +// optimizations or inject internal tf.data transformations responsible for +// autotuning or threading configuration. The caller must ensure that the +// input dataset to be finalized outlives the output. +absl::Status FinalizeDataset(OpKernelContext* ctx, const DatasetBase* input, + DatasetBase** output); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_ROOT_DATASET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/serialization_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/serialization_utils.h new file mode 100644 index 00000000..e59ac959 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/serialization_utils.h @@ -0,0 +1,244 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERIALIZATION_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERIALIZATION_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/dataset.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/core/status.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace data { + +inline constexpr absl::string_view kRetvalOp = "_Retval"; + +// Reads dataset elements from the checkpoint reader using the given key prefix. +absl::Status ReadElementsFromCheckpoint( + IteratorContext* ctx, IteratorStateReader* reader, + absl::string_view key_prefix, std::vector>* elements); + +// Writes dataset elements to the checkpoint writer using the given key prefix. +// The elements can be read back by passing the same key prefix to +// ReadElementsFromCheckpoint. Only one list of elements can be written under +// the same key_prefix. +absl::Status WriteElementsToCheckpoint( + IteratorStateWriter* writer, absl::string_view key_prefix, + const std::vector>& elements); + +// Updates the dataset elements in the checkpoint for given `checkpoint_indices` +// using the given key prefix, assuming that vector of elements have +// checkpointed these before. The elements can be read back by passing the same +// key prefix to ReadElementsFromCheckpoint. +absl::Status UpdateCheckpointElements( + IteratorStateWriter* writer, absl::string_view key_prefix, + const std::vector>& elements, + const absl::flat_hash_set& checkpoint_indices); + +// Helper class for reading data from a vector of VariantTensorData objects. +class VariantTensorDataReader : public IteratorStateReader { + public: + explicit VariantTensorDataReader( + const std::vector& data); + + bool Contains(absl::string_view key) const override; + bool Contains(absl::string_view name, absl::string_view key) const override; + + absl::Status ReadScalar(absl::string_view key, int64_t* val) const override; + absl::Status ReadScalar(absl::string_view name, absl::string_view key, + int64_t* val) const override; + absl::Status ReadScalar(absl::string_view key, tstring* val) const override; + absl::Status ReadScalar(absl::string_view name, absl::string_view key, + tstring* val) const override; + absl::Status ReadTensor(absl::string_view key, Tensor* val) const override; + absl::Status ReadTensor(FunctionLibraryRuntime* flr, absl::string_view key, + Tensor* val) const override; + absl::Status ReadTensor(absl::string_view name, absl::string_view key, + Tensor* val) const override; + absl::Status ReadTensor(FunctionLibraryRuntime* flr, absl::string_view name, + absl::string_view key, Tensor* val) const override; + + private: + template + absl::Status ReadScalarInternal(absl::string_view name, absl::string_view key, + T* val) const; + absl::Status ReadTensorInternal(FunctionLibraryRuntime* flr, + absl::string_view name, absl::string_view key, + Tensor* val) const; + absl::Status ReadDatasetInternal(FunctionLibraryRuntime* flr, + absl::string_view name, + absl::string_view key, Tensor* val) const; + // Produces all key/value pairs stored in this reader. 
Useful for debugging. + std::map ReadAllTensors(); + + // For access to ReadAllTensors() + friend absl::StatusOr> + CheckpointStats(const std::string& checkpoint_bytes); + + std::map> map_; + std::map data_; // Not owned. +}; + +// Helper class used to build a list of VariantTensorData objects, one for each +// iterator which is determined from the key supplied from the Write* calls. +// Sample usage: +// VariantTensorDataWriter writer; +// writer.WriteScalar(full_name("buffer_size"), buffer_.size()); +// writer.WriteScalar(full_name("num_threads"), threadpool_.size()); +// .... +// std::vector> variants; +// writer.ReleaseData(&variants); +// Now the VariantTensorData objects can be used to serialize. +class VariantTensorDataWriter : public IteratorStateWriter { + public: + absl::Status WriteScalar(absl::string_view key, int64_t val) override; + absl::Status WriteScalar(absl::string_view name, absl::string_view key, + int64_t val) override; + + absl::Status WriteScalar(absl::string_view key, const tstring& val) override; + absl::Status WriteScalar(absl::string_view name, absl::string_view key, + const tstring& val) override; + + absl::Status WriteTensor(absl::string_view key, const Tensor& val) override; + absl::Status WriteTensor(absl::string_view name, absl::string_view key, + const Tensor& val) override; + + // Releases the built VariantTensorData's to `variants`. Clears out all + // class state. + void ReleaseData(std::vector>* variants); + + // Obtains a read-only version of the VariantTensorData's built. + void GetData(std::vector* variants); + + private: + void MaybeFlush(); + void Reset(); + + template + absl::Status WriteScalarInternal(absl::string_view name, + absl::string_view key, const T& val); + absl::Status WriteTensorInternal(absl::string_view name, + absl::string_view key, const Tensor& val); + absl::Status WriteDatasetInternal(absl::string_view name, + absl::string_view key, + const DatasetBase* dataset); + + bool is_flushed_ = false; + std::map> data_; + std::map> keys_; +}; + +// Wrapper for encoding/decoding the iterator state stored in a Variant tensor. +// The `GetData()` method returns an VariantTensorData object which contains all +// the state needed to restore a single iterator. +// +// Usage example: +// +// Encoding: +// +// Tensor t(DT_VARIANT, TensorShape({})); +// t->scalar()() = IteratorStateVariant(); +// +// Encode() sets the type_name of the VariantTensorData object to +// IteratorStateVariant::TypeName(). +// +// Decoding: +// +// Variant v = ; +// DecodeUnaryVariant(&v); +// IteratorStateVariant* wrapper = v.get(); +// IteratorStateReader reader({wrapper->GetData()}); +// iterator_resource->Restore(ctx, &reader); +// +// The type_name of the VariantTensorData object to be decoded must match +// IteratorStateVariant::TypeName(). +class IteratorStateVariant { + public: + IteratorStateVariant() = default; + IteratorStateVariant(const IteratorStateVariant& other); + IteratorStateVariant& operator=(IteratorStateVariant&& other) = default; + IteratorStateVariant& operator=(const IteratorStateVariant& other) = delete; + + static std::string TypeName(); + + // Initializes `this` from a VariantTensorData object. + absl::Status InitializeFromVariantData( + std::unique_ptr data); + + // Returns a borrowed pointer to the underlying VariantTensorData. + const VariantTensorData* GetData() const { return data_.get(); } + + // Encodes this `IteratorStateVariant` into `*data`. 
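
A hedged round-trip sketch of the save/restore flow described above, assuming a TensorFlow build; the "Iterator::Range" name and "buffer_size" key are hypothetical, and the element type handed to GetData() is assumed to be const VariantTensorData*:

absl::Status SaveAndRestoreSketch() {
  tensorflow::data::VariantTensorDataWriter writer;
  // Hypothetical iterator name and key.
  TF_RETURN_IF_ERROR(
      writer.WriteScalar("Iterator::Range", "buffer_size", int64_t{16}));

  // Borrow a read-only view of the serialized state (one entry per iterator).
  std::vector<const tensorflow::VariantTensorData*> state;  // assumed element type
  writer.GetData(&state);

  tensorflow::data::VariantTensorDataReader reader(state);
  int64_t buffer_size = 0;
  TF_RETURN_IF_ERROR(
      reader.ReadScalar("Iterator::Range", "buffer_size", &buffer_size));
  return absl::OkStatus();
}
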
Data will be compressed + // and stored as a scalar `CompressedElement` tensor, or left uncompressed if + // compression fails. + void Encode(VariantTensorData* data) const; + + // Decodes from `data`. If `data` contains a single scalar `CompressedElement` + // tensor, it is assumed to be compressed by `Encode`, and will be + // uncompressed as part of `Decode`. + bool Decode(VariantTensorData data); + + std::string DebugString() const; + + private: + // Returns the compressed element in `data`. If `data` does not contain a + // compressed element, returns nullptr. + static const CompressedElement* GetCompressedElement( + const VariantTensorData& data); + + std::unique_ptr data_; +}; + +// Returns a GraphDef representation of the given dataset. +absl::Status AsGraphDef(const DatasetBase* dataset, + SerializationContext&& serialization_ctx, + GraphDef* graph_def); + +// Returns a GraphDef representation of the given dataset suitable for +// optimization rewrites. It sets serialization parameters to export a minimum +// graph with additional information for optimization (i.e. ignoring external +// state, not serializing data tensors, not failing if there are datasets which +// do not have AsGraphDef implemented). Sets the `dataset_node` parameter to the +// dataset's node name in the resulting GraphDef. +absl::Status AsGraphDefForRewrite( + OpKernelContext* ctx, const DatasetBase* input, + std::vector>* input_list, GraphDef* result, + string* dataset_node); + +// Analyzes the bytes of a tf.data iterator checkpoint to identify all of the +// keys in the checkpoint along with their sizes in bytes. +absl::StatusOr> CheckpointStats( + const std::string& checkpoint_bytes); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SERIALIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/auto_scaler.h b/third_party/tflite-hdrs/tensorflow/core/data/service/auto_scaler.h new file mode 100644 index 00000000..edd09863 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/auto_scaler.h @@ -0,0 +1,180 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_AUTO_SCALER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_AUTO_SCALER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/time/time.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/status.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// Estimates the optimal number of tf.data service workers for an Iteration +// based on the current workload. +// Note: It is assumed that all reported times correspond to the same Iteration. +// +// Glossary: +// * Consumer: A client that consumes elements from tf.data service. +// * Worker: A tf.data service worker. 
+// * Processing time (PT): The estimated time it takes a worker to process and +// produce an element. +// * Target processing time (TPT): From the perspective of a consumer, +// it is the maximum time a tf.data input pipeline can take to produce an +// element such that the downstream processor wait time is 0. In other words, +// this is the ideal time the tf.data pipeline should take to produce an element +// so that training doesn't slow down due to waiting for elements. This means +// that we want processing time <= target processing time, so that when an +// element is requested, the pipeline has processed it already. +// * Worker throughput (WT): It is the multiplicative inverse of processing time +// (1 / PT). This refers to the number of elements produced by a worker per +// second. +// * Consumption rate (CR): It is the multiplicative inverse of target +// processing time (1 / TPT). This refers to the number of elements requested by +// a consumer per second. +// +// **AutoScaler overview** +// +// 1. It keeps track of the most recent worker throughputs reported by each +// worker in the data service cluster, as well as the most recent consumption +// rates reported by each consumer. WTs and CRs are derived from reporting PTs +// and TPTs, respectively. +// 2. Having this information, it estimates the optimal number of workers N as +// follows: +// N = (Sum of CRs reported by all consumers) / +// (Average of WTs reported by all workers) +// +// AutoScaler is thread-safe. +class AutoScaler { + public: + AutoScaler() = default; + // Returns the estimated optimal number of workers according to the current + // observed workload. If there are no previously reported processing and + // target processing times, returns nullopt. + std::optional GetOptimalNumberOfWorkers() const + TF_LOCKS_EXCLUDED(mu_); + // Reports the latest observed processing time from the worker with + // `worker_address`. Returns an error if `processing_time` is ZeroDuration or + // negative. + absl::Status ReportProcessingTime(const std::string& worker_address, + absl::Duration processing_time) + TF_LOCKS_EXCLUDED(mu_); + // Reports the latest observed target processing time from the consumer + // identified by `consumer_id`. Returns an error if `target_processing_time` + // is ZeroDuration or negative. + absl::Status ReportTargetProcessingTime(int64_t consumer_id, + absl::Duration target_processing_time) + TF_LOCKS_EXCLUDED(mu_); + // Unregisters the worker with `worker_address`, removing its reported + // processing time from consideration of the current workload estimation. + // Returns an error if the specified worker does not exist. + absl::Status RemoveWorker(const std::string& worker_address) + TF_LOCKS_EXCLUDED(mu_); + // Unregisters the consumer identified by `consumer_id`, removing its reported + // target processing time from consideration of the current workload + // estimation. Returns an error if the specified consumer does not exist. + absl::Status RemoveConsumer(int64_t consumer_id) TF_LOCKS_EXCLUDED(mu_); + + private: + mutable tsl::mutex mu_; + // Map from worker address to worker throughput. + absl::flat_hash_map worker_throughputs_ + TF_GUARDED_BY(mu_); + // Map from consumer id to consumption rate. + absl::flat_hash_map consumption_rates_ TF_GUARDED_BY(mu_); +}; + +// Exports a metric (/tensorflow/data/service/optimal_number_of_workers) with +// the estimated optimal number of tf.data service workers, according to +// the observed cluster workload. 
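
The estimate described above reduces to simple arithmetic. The standalone sketch below applies the documented formula to concrete numbers; it is an illustration, not the TensorFlow implementation:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Illustration only: the documented estimate
//   N = (sum of consumption rates) / (average worker throughput),
// with throughput = 1 / processing time and rate = 1 / target processing time.
double EstimateOptimalWorkers(
    const std::map<std::string, double>& processing_time_sec,       // per worker
    const std::map<int64_t, double>& target_processing_time_sec) {  // per consumer
  if (processing_time_sec.empty() || target_processing_time_sec.empty()) return 0;
  double throughput_sum = 0;
  for (const auto& kv : processing_time_sec) throughput_sum += 1.0 / kv.second;
  const double average_throughput = throughput_sum / processing_time_sec.size();
  double consumption_rate_sum = 0;
  for (const auto& kv : target_processing_time_sec) {
    consumption_rate_sum += 1.0 / kv.second;
  }
  return consumption_rate_sum / average_throughput;
}

int main() {
  // Two workers at 0.2 s/element (5 elements/s each, average 5/s); three
  // consumers each requesting an element every 0.1 s (10/s each, 30/s total).
  std::cout << EstimateOptimalWorkers({{"w0", 0.2}, {"w1", 0.2}},
                                      {{0, 0.1}, {1, 0.1}, {2, 0.1}})
            << "\n";  // Prints 6: roughly six such workers keep consumers fed.
  return 0;
}
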
+// +// It estimates the number of workers as the maximum of the estimated optimal +// number of workers for all Iterations running in the tf.data service cluster. +// +// MultipleIterationsAutoScaler is thread-safe. +class MultipleIterationsAutoScaler { + public: + MultipleIterationsAutoScaler() = default; + // Unregisters iteration with `iteration_id`, removing its reported + // times from consideration of the current workload estimation. + // Returns an error if the specified iteration does not exist. + absl::Status UnregisterIteration(int64_t iteration_id) TF_LOCKS_EXCLUDED(mu_); + // Updates the metric value with the current estimated optimal number of + // workers. The estimate is limited to min(4 * `current_number_of_workers`, + // `current_number_of_workers` + 500). Returns an error if there are no + // previously reported processing and target processing times for at least one + // iteration, or `current_number_of_workers` is not positive. + absl::Status UpdateOptimalNumberOfWorkersMetric( + int64_t current_number_of_workers) TF_LOCKS_EXCLUDED(mu_); + // Returns the estimated optimal number of workers according to the current + // observed workload. If there are no previously reported processing and + // target processing times for at least one iteration, returns nullopt. + std::optional GetOptimalNumberOfWorkers() const + TF_LOCKS_EXCLUDED(mu_); + // Reports the latest observed processing time from the worker with + // `worker_address` for iteration with `iteration_id`. Returns an error if + // `processing_time` is ZeroDuration or negative. + absl::Status ReportProcessingTime(int64_t iteration_id, + const std::string& worker_address, + absl::Duration processing_time) + TF_LOCKS_EXCLUDED(mu_); + // Reports the latest observed target processing time from the consumer + // identified by `consumer_id` for iteration with `iteration_id`. Returns an + // error if `target_processing_time` is ZeroDuration or negative. + absl::Status ReportTargetProcessingTime(int64_t iteration_id, + int64_t consumer_id, + absl::Duration target_processing_time) + TF_LOCKS_EXCLUDED(mu_); + // Unregisters the worker with `worker_address` for iteration with + // `iteration_id`, removing its reported processing time from consideration of + // the current workload estimation. Returns an error if there are no + // previously reported processing times for iteration with `iteration_id` and + // the specified worker. + absl::Status RemoveWorker(int64_t iteration_id, + const std::string& worker_address) + TF_LOCKS_EXCLUDED(mu_); + // Unregisters the consumer identified by `consumer_id` for iteration with + // `iteration_id`, removing its reported target processing time from + // consideration of the current workload estimation. Returns an error if there + // are no previously reported processing times for iteration with + // `iteration_id` and the specified consumer. + absl::Status RemoveConsumer(int64_t iteration_id, int64_t consumer_id) + TF_LOCKS_EXCLUDED(mu_); + + private: + // Registers iteration with `iteration_id` if it does not exist already, + // allowing its future reported times to be considered for the current + // workload estimation. + void EnsureIterationIsRegistered(int64_t iteration_id) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + mutable tsl::mutex mu_; + // Map from iteration id to AutoScaler. 
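
The cap on the reported estimate, min(4 * current, current + 500), is easy to misread, so here is a small standalone illustration (not the TensorFlow implementation):

#include <algorithm>
#include <cstdint>

// Illustration only: the metric never reports more than
// min(4 * current_number_of_workers, current_number_of_workers + 500).
int64_t CapWorkerEstimateSketch(int64_t estimate, int64_t current_workers) {
  const int64_t limit = std::min(4 * current_workers, current_workers + 500);
  return std::min(estimate, limit);
}
// e.g. CapWorkerEstimateSketch(1000, /*current_workers=*/100) == 400
//      CapWorkerEstimateSketch(1000, /*current_workers=*/200) == 700
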
+ absl::flat_hash_map> auto_scalers_ + TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_AUTO_SCALER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/byte_size.h b/third_party/tflite-hdrs/tensorflow/core/data/service/byte_size.h new file mode 100644 index 00000000..84d16533 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/byte_size.h @@ -0,0 +1,198 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_BYTE_SIZE_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_BYTE_SIZE_H_ + +#include +#include +#include + +namespace tensorflow { +namespace data { + +// A `ByteSize` represents data space usage measured in bytes. It is constructed +// using Bytes, KB, MB, GB, or TB. Supports common arithmetic operations. Uses +// `size_t` in its internal representation. Thus, it only supports non-negative +// sizes, and the maximum byte size is std::numeric_limits::max(). +// +// Usage example: +// +// constexpr ByteSize kAllocatedMemoryLimit = ByteSize::MB(64); +// +// Tensor data = ... +// ByteSize tensor_size = ByteSize::Bytes(data.AllocatedBytes()); +// if (tensor_size > 0.95 * kAllocatedMemoryLimit) { +// LOG(WARNING) << "Tensor memory usage is " << tensor_size << ". This is " +// << "close to the limit " << kAllocatedMemoryLimit << "."; +// } +class ByteSize final { + public: + // The default is 0 bytes. + constexpr ByteSize() = default; + constexpr ByteSize(const ByteSize&) = default; + ByteSize& operator=(const ByteSize&) = default; + + // Constructs byte sizes of bytes, KB, MB, GB, and TB. + constexpr static ByteSize Bytes(size_t n); + + // In this and following templates, `T` should be a numeric type, + // e.g.: size_t, double, etc. + template + constexpr static ByteSize KB(T n); + + template + constexpr static ByteSize MB(T n); + + template + constexpr static ByteSize GB(T n); + + template + constexpr static ByteSize TB(T n); + + // Compound assignment operators. + ByteSize& operator+=(ByteSize rhs); + + // Does not support negative bytes. If *this < rhs, returns 0 bytes. + ByteSize& operator-=(ByteSize rhs); + + template + ByteSize& operator*=(T rhs); + + template + ByteSize& operator/=(T rhs); + + // Converts the measurement into the specified unit. + size_t ToUnsignedBytes() const; + double ToDoubleBytes() const; + double ToDoubleKB() const; + double ToDoubleMB() const; + double ToDoubleGB() const; + double ToDoubleTB() const; + + // Returns a human-readable string of the byte size. For example, "5KB", + // "1GB", etc. 
+ std::string DebugString() const; + + private: + constexpr explicit ByteSize(double bytes) : bytes_(bytes) {} + + size_t bytes_ = 0; +}; + +constexpr ByteSize ByteSize::Bytes(size_t n) { return ByteSize(n); }; + +template +constexpr ByteSize ByteSize::KB(T n) { + return ByteSize::Bytes(n * (size_t{1} << 10)); +} + +template +constexpr ByteSize ByteSize::MB(T n) { + return ByteSize::Bytes(n * (size_t{1} << 20)); +} + +template +constexpr ByteSize ByteSize::GB(T n) { + return ByteSize::Bytes(n * (size_t{1} << 30)); +} + +template +constexpr ByteSize ByteSize::TB(T n) { + return ByteSize::Bytes(n * (size_t{1} << 40)); +} + +// Compound assignments. +inline ByteSize& ByteSize::operator+=(ByteSize rhs) { + bytes_ += rhs.ToUnsignedBytes(); + return *this; +} + +inline ByteSize& ByteSize::operator-=(ByteSize rhs) { + if (bytes_ < rhs.ToUnsignedBytes()) { + bytes_ = 0; + return *this; + } + bytes_ -= rhs.ToUnsignedBytes(); + return *this; +} + +template +inline ByteSize& ByteSize::operator*=(T rhs) { + bytes_ *= rhs; + return *this; +} + +template +inline ByteSize& ByteSize::operator/=(T rhs) { + bytes_ /= rhs; + return *this; +} + +// Binary arithmetic operators. +inline ByteSize operator+(ByteSize lhs, ByteSize rhs) { + return lhs += rhs; +} + +inline ByteSize operator-(ByteSize lhs, ByteSize rhs) { + return lhs -= rhs; +} + +template +inline ByteSize operator*(ByteSize lhs, T rhs) { return lhs *= rhs; } + +template +inline ByteSize operator*(T lhs, ByteSize rhs) { return rhs *= lhs; } + +template +inline ByteSize operator/(ByteSize lhs, T rhs) { return lhs /= rhs; } + +inline double operator/(ByteSize lhs, ByteSize rhs) { + return lhs.ToDoubleBytes() / rhs.ToDoubleBytes(); +} + +// Comparison operators. +inline bool operator<(ByteSize lhs, ByteSize rhs) { + return lhs.ToUnsignedBytes() < rhs.ToUnsignedBytes(); +} + +inline bool operator>(ByteSize lhs, ByteSize rhs) { + return rhs < lhs; +} + +inline bool operator>=(ByteSize lhs, ByteSize rhs) { + return !(lhs < rhs); +} + +inline bool operator<=(ByteSize lhs, ByteSize rhs) { + return !(rhs < lhs); +} + +inline bool operator==(ByteSize lhs, ByteSize rhs) { + return lhs.ToUnsignedBytes() == rhs.ToUnsignedBytes(); +} + +inline bool operator!=(ByteSize lhs, ByteSize rhs) { + return !(lhs == rhs); +} + +// Output operator, which supports logging with LOG(*). +inline std::ostream& operator<<(std::ostream& os, ByteSize byte_size) { + return os << byte_size.DebugString(); +} + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_BYTE_SIZE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/client/common.h b/third_party/tflite-hdrs/tensorflow/core/data/service/client/common.h new file mode 100644 index 00000000..58c0f0a2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/client/common.h @@ -0,0 +1,50 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
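
A short usage sketch that exercises the ByteSize operators defined above, assuming the byte_size.h header is available on the include path:

#include <iostream>

#include "tensorflow/core/data/service/byte_size.h"  // assumed include path

using tensorflow::data::ByteSize;

int main() {
  constexpr ByteSize kBudget = ByteSize::GB(1);
  const ByteSize used = ByteSize::MB(512) + ByteSize::KB(256);

  // operator/(ByteSize, ByteSize) yields a dimensionless double ratio.
  std::cout << used << " is " << (used / kBudget) * 100 << "% of " << kBudget
            << "\n";

  // Subtraction saturates at zero rather than going negative.
  const ByteSize clamped = ByteSize::MB(1) - ByteSize::GB(1);
  std::cout << clamped.ToUnsignedBytes() << "\n";  // Prints 0.
  return 0;
}
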
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CLIENT_COMMON_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CLIENT_COMMON_H_ + +#include +#include +#include + +#include "absl/time/time.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// tf.data service parameters. +struct DataServiceParams final { + std::string dataset_id; + ProcessingModeDef processing_mode; + std::string address; + std::string protocol; + std::string data_transfer_protocol; + std::string job_name; + int64_t repetition = 0; + std::optional num_consumers; + std::optional consumer_index; + int64_t max_outstanding_requests = 0; + absl::Duration task_refresh_interval; + TargetWorkers target_workers = TargetWorkers::TARGET_WORKERS_UNSPECIFIED; + DataServiceMetadata metadata; + std::optional cross_trainer_cache_options; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CLIENT_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/client/data_service_client.h b/third_party/tflite-hdrs/tensorflow/core/data/service/client/data_service_client.h new file mode 100644 index 00000000..7c211d55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/client/data_service_client.h @@ -0,0 +1,274 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CLIENT_DATA_SERVICE_CLIENT_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CLIENT_DATA_SERVICE_CLIENT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/data/service/client/common.h" +#include "tensorflow/core/data/service/common.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/data/service/dispatcher_client.h" +#include "tensorflow/core/data/service/worker_client.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// Interface for interacting with the tf.data service iterator context. 
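
A hedged configuration sketch for the DataServiceParams struct above, assuming a TensorFlow build; the dataset id, dispatcher address, and job name are hypothetical, and TARGET_WORKERS_AUTO is assumed from the TargetWorkers proto enum:

tensorflow::data::DataServiceParams MakeParamsSketch() {
  tensorflow::data::DataServiceParams params;
  params.dataset_id = "dataset_1";            // hypothetical registered dataset id
  params.address = "localhost:5000";          // hypothetical dispatcher address
  params.protocol = "grpc";
  params.data_transfer_protocol = "grpc";
  params.job_name = "shared_job";             // hypothetical shared job name
  params.max_outstanding_requests = 16;
  params.task_refresh_interval = absl::Seconds(1);
  // Assumed enum value; the struct default is TARGET_WORKERS_UNSPECIFIED.
  params.target_workers = tensorflow::data::TargetWorkers::TARGET_WORKERS_AUTO;
  return params;
}
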
+class DataServiceContext { + public: + virtual ~DataServiceContext() = default; + virtual std::unique_ptr StartThread(const string& name, + std::function fn) = 0; + virtual void RecordBufferEnqueue(const std::vector& element) = 0; + virtual void RecordBufferDequeue(const std::vector& element) = 0; + // Returns the time in nanoseconds a tf.data input pipeline can take to + // produce an element such that the downstream processor wait time is 0. + // Returns 0 if there are not sufficient recorded iterator gap times to + // produce a good estimate, or the tf.data Model instance is null. + virtual double GetTargetProcessingTimeNsec() const = 0; + // Updates the `max_outstanding_requests` with + // `requested_outstanding_requests`. + // Returns the new max outstanding requests which may be different from the + // requested one depending on available ram. + virtual int64_t UpdateMaxOutstandingRequests( + int64_t max_outstanding_requests, + int64_t requested_outstanding_requests) = 0; +}; + +using DataServiceContextFactory = + std::function()>; + +// API for reading data from tf.data service. +// +// The client works by reading from tf.data workers in parallel and interleaving +// the dataset elements. It periodically queries the dispatcher to decide which +// workers to read from (in case workers are added or removed). The data reading +// is non-deterministic. This class is thread-safe. +class DataServiceClient { + public: + explicit DataServiceClient(const DataServiceParams& params); + virtual ~DataServiceClient(); + DataServiceClient(const DataServiceClient&) = delete; + DataServiceClient& operator=(const DataServiceClient&) = delete; + + // Initializes the client. + absl::Status Initialize( + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info, + Allocator* allocator); + + // Reads the next element from tf.data workers. Blocks if the next element is + // not ready. + virtual absl::StatusOr GetNext( + DataServiceContextFactory context_factory); + + // Cancels the client. + void Cancel(); + + TraceMeMetadata GetTraceMeMetadata() const; + + private: + struct Task { + Task(const TaskInfo& info, std::unique_ptr worker) + : info(info), worker(std::move(worker)) {} + + const TaskInfo info; + // Client for fetching task elements from the tf.data service worker. + std::unique_ptr worker; + // The next round to read from the task. + int64_t round = 0; + // Whether the task has been removed. The task will eventually be + // deleted from `tasks_` on the next dispatcher heartbeat. + bool removed = false; + bool skipped_previous_round = false; + // Indicates whether a worker thread is currently processing the task. + bool in_use TF_GUARDED_BY(&DataServiceClient::mu_) = false; + // Indicates whether the worker has returned end_of_sequence for the task. + bool end_of_sequence TF_GUARDED_BY(&DataServiceClient::mu_) = false; + // Number of retries. The more it is retried, the longer it should wait + // before the next retry. + int64_t num_retries = 0; + }; + + struct Result { + Result() = default; + Result(Result&&) = default; + Result& operator=(Result&&) = default; + Result(const Result&) = delete; + Result& operator=(const Result&) = delete; + + // Whether the result has been computed yet. GetNext needs to block + // until the next result is ready. + bool ready TF_GUARDED_BY(&DataServiceClient::mu_) = false; + std::vector element TF_GUARDED_BY(&DataServiceClient::mu_); + // The element's index within the tf.data worker it came from. Used for + // debugging. 
+ int64_t element_index TF_GUARDED_BY(&DataServiceClient::mu_) = -1; + // The id of the task that generated the result. + int64_t task_id TF_GUARDED_BY(&DataServiceClient::mu_) = -1; + bool end_of_sequence TF_GUARDED_BY(&DataServiceClient::mu_) = false; + bool skip TF_GUARDED_BY(&DataServiceClient::mu_) = false; + }; + + void EnsureThreadsStarted(); + void CancelThreads(); + // Returns whether the client has finished and should return. + bool Finished() const; + // Returns whether the job has more data. + bool ShouldWaitForNext() const; + void DeleteLocalWorkerTasks(); + bool ShouldDeleteLocalTask(const TaskInfo& task) const; + // Periodically refresh the task list. + // Maintain one thread fetching elements for each task. + // TODO(aaudibert): Instead of polling, have dispatcher send updates when + // the list of tasks changes. + void TaskThreadManager(); + void TryBlockRound(int64_t round) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + void UpdateIterationFinished(bool iteration_finished); + absl::Status AddTask(const TaskInfo& task_info); + absl::StatusOr> CreateWorkerClient( + const TaskInfo& task_info); + absl::StatusOr> CreateWorkerClient( + const std::string& protocol, const TaskInfo& task_info); + absl::StatusOr> + CreateGrpcWorkerClient(const TaskInfo& task_info); + absl::StatusOr> + CreateAlternativeWorkerClientMaybeWithGrpcFallback( + const DataTransferServerInfo& transfer_server, const TaskInfo& task_info); + void Heartbeat(); + void UpdateTasks(const ClientHeartbeatResponse& resp); + bool ShouldReadFromTask(const TaskInfo& task) const; + void RecordTFMetrics(const ClientHeartbeatResponse& resp); + void UpdateBufferSize(); + void UpdateWorkerThreads(); + void RunWorkerThread(std::function done); + // Reports whether we can request another element without violating + // `max_outstanding_requests_`. + bool ShouldProcessTask(); + // Searches for a task to process, visiting tasks in-order and giving every + // task a chance to proceed. + std::shared_ptr GetTaskToProcess(); + void AdvanceTaskIndex(); + absl::Status TryGetElement(const Task& task, bool allow_skip, + GetElementResult& result); + void ProcessGetElementResponse(bool enqueue_result, + GetElementResult& get_element_result, + std::shared_ptr result, Task& task); + absl::Status GetElementTraced(Task* task, int64_t deadline_micros, + bool enqueue_result, bool allow_skip, + std::shared_ptr result); + absl::Status MaybeRemoveTask(Task& task, int64_t deadline_micros, + Result& result); + absl::Status GetElement(Task* task, int64_t deadline_micros, + bool enqueue_result, bool allow_skip, + std::shared_ptr result); + bool ResultReady() const; + std::shared_ptr PopNextResult(); + bool IsCoordinatedRead() const; + std::string DebugString() const; + + const DataServiceParams params_; + + mutable mutex mu_; + condition_variable get_next_cv_ TF_GUARDED_BY(mu_); + condition_variable worker_thread_cv_ TF_GUARDED_BY(mu_); + condition_variable manager_thread_cv_ TF_GUARDED_BY(mu_); + + bool cancelled_ TF_GUARDED_BY(mu_) = false; + + // Number of outstanding requests. + int64_t outstanding_requests_ TF_GUARDED_BY(mu_) = 0; + + // max_outstanding_requests controls how many elements may be held in memory + // at the same time. This count includes both in-progress requests for + // elements as well as completed requests which haven't yet been produced. + int64_t max_outstanding_requests_ TF_GUARDED_BY(mu_); + + // The number of threads in `worker_threads_` which are still running. 
+ int64_t num_running_worker_threads_ TF_GUARDED_BY(mu_) = 0; + + // The index of the next task in `tasks_` to read from. + int64_t next_task_index_ TF_GUARDED_BY(mu_) = 0; + + // The number tasks in the `tasks_` list that have reached end_of_sequence. + int64_t finished_tasks_ TF_GUARDED_BY(mu_) = 0; + + // List of tasks to read from. + std::vector> tasks_ TF_GUARDED_BY(mu_); + + // The current round robin round we are engaged in. A round involves reading + // from each task once. + int64_t current_round_ TF_GUARDED_BY(mu_) = 0; + + // Maximum round robin round to read up to before blocking, not inclusive. + // INVARIANT: current_round_ <= round_robin_round_limit_. + // If current_round_ == round_robin_round_limit_, + // next_task_index_ must be 0. + std::optional round_robin_round_limit_ TF_GUARDED_BY(mu_); + + // A status to be returned from the next call to `GetNext`. This is set by + // asynchronous threads when they encounter errors. + absl::Status status_ TF_GUARDED_BY(mu_) = absl::OkStatus(); + // A queue of results for `GetElement` requests to read from. When doing + // strict round robin reads, the queue will contain placeholder results with + // their `Result::ready` field false until their data has been retrieved + // from a worker. When not doing round-robin reads, results are only added + // to the queue after they are ready, to avoid head-of-line blocking. + std::queue> results_ TF_GUARDED_BY(mu_); + + bool initialized_ = false; + std::unique_ptr ctx_ TF_GUARDED_BY(mu_); + + // Set once in Initialize(). + int64_t job_id_; + int64_t iteration_client_id_; + std::unique_ptr dispatcher_; + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info_; + Allocator* allocator_; + + int64_t get_next_index_ TF_GUARDED_BY(mu_) = 0; + + bool iteration_finished_ TF_GUARDED_BY(mu_) = false; + bool should_finish_iteration_ TF_GUARDED_BY(mu_) = true; + + // The set of worker UIDs that we have already recorded metrics for. + absl::flat_hash_set worker_uids_ TF_GUARDED_BY(mu_); + + std::vector> worker_threads_ TF_GUARDED_BY(mu_); + std::unique_ptr task_thread_manager_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CLIENT_DATA_SERVICE_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/client/utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/client/utils.h new file mode 100644 index 00000000..2d2a0b77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/client/utils.h @@ -0,0 +1,58 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CLIENT_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CLIENT_UTILS_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// Gets the `DataServiceMetadata` for `dataset_id`. +absl::StatusOr GetDataServiceMetadata( + const std::string& dataset_id, const std::string& address, + const std::string& protocol); + +// Gets the `DisableCompressAtRuntimeResponse.compression_disabled_at_runtime` +// for the given dataset. +absl::StatusOr CompressionDisabledAtRuntime( + const std::string& dataset_id, const std::string& address, + const std::string& protocol, bool disable_compression_at_runtime); + +// Gets the `DataServiceConfig` for the data service running at `address`. +absl::StatusOr GetDataServiceConfig( + const std::string& address, const std::string& protocol); + +// Gets the compression from `metadata`. If `metadata` specifies no valid +// compression, returns an internal error. +absl::StatusOr GetValidatedCompression( + const std::string& dataset_id, const DataServiceMetadata& metadata); + +// Estimates the cardinality of a data service dataset. +int64_t EstimateCardinality(const ProcessingModeDef& processing_mode, + const DataServiceMetadata& metadata, + bool is_coordinated_read); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CLIENT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/client/validate_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/client/validate_utils.h new file mode 100644 index 00000000..07645004 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/client/validate_utils.h @@ -0,0 +1,32 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CLIENT_VALIDATE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CLIENT_VALIDATE_UTILS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/data/service/client/common.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace data { + +// Validates data service dataset parameters. +absl::Status ValidateDataServiceParams( + const DataServiceParams& data_service_params); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CLIENT_VALIDATE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/common.h b/third_party/tflite-hdrs/tensorflow/core/data/service/common.h new file mode 100644 index 00000000..e9760e56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/common.h @@ -0,0 +1,120 @@ +/* Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_COMMON_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_COMMON_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// Increment this when making backwards-incompatible changes to communication +// between tf.data clients and servers. +constexpr int kDataServiceVersion = 9; + +// If the user starts a colocated tf.data worker on each TF host, the worker +// will be applied a "COLOCATED" tag. This is used to avoid reading from tf.data +// workers on other TF hosts when the host runs a local tf.data service worker. +constexpr absl::string_view kColocatedWorkerTag = "COLOCATED"; + +// Container to hold the result of a `GetNext` call. +struct GetNextResult final { + explicit GetNextResult() = default; + GetNextResult(const GetNextResult&) = delete; + GetNextResult& operator=(const GetNextResult&) = delete; + GetNextResult(GetNextResult&&) = default; + GetNextResult& operator=(GetNextResult&&) = delete; + + static GetNextResult EndOfSequence() { + GetNextResult result; + result.end_of_sequence = true; + return result; + } + + std::vector tensors; + bool end_of_sequence = false; +}; + +// Returns true if `processing_mode` specifies no sharding policy. +bool IsNoShard(const ProcessingModeDef& processing_mode); + +// Returns true if `processing_mode` is dynamic sharding. +bool IsDynamicShard(const ProcessingModeDef& processing_mode); + +// Returns true if `processing_mode` is static sharding. +bool IsStaticShard(const ProcessingModeDef& processing_mode); + +// Returns an internal error if `processing_mode` is invalid. +absl::Status ValidateProcessingMode(const ProcessingModeDef& processing_mode); + +// Converts tf.data service `sharding_policy` to `AutoShardPolicy`. Returns an +// internal error if `sharding_policy` is not supported. +absl::StatusOr ToAutoShardPolicy( + ProcessingModeDef::ShardingPolicy sharding_policy); + +// Parses a string representing a `TargetWorkers` (case-insensitive). +// Returns InvalidArgument if the string is not recognized. +absl::StatusOr ParseTargetWorkers(absl::string_view s); + +// Converts a `TargetWorkers` enum to string. +std::string TargetWorkersToString(TargetWorkers target_workers); + +// Parses a string representing a `DeploymentMode` (case-insensitive). +// Returns InvalidArgument if the string is not recognized. +absl::StatusOr ParseDeploymentMode(absl::string_view s); + +// Returns true if `status` is a retriable error that indicates preemption. 
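
Two hedged sketches using the helpers above, assuming a TensorFlow build; "auto" as an accepted spelling for ParseTargetWorkers is an assumption based on the case-insensitive parsing described in its comment:

// End-of-sequence results carry no tensors; consumers check the flag before
// touching `tensors`.
bool IsDoneSketch() {
  tensorflow::data::GetNextResult result =
      tensorflow::data::GetNextResult::EndOfSequence();
  return result.end_of_sequence;  // true
}

// Round-trips a target-workers spec through parse and to-string.
absl::StatusOr<std::string> DescribeTargetWorkersSketch(absl::string_view spec) {
  // e.g. spec = "auto" (assumed spelling; parsing is case-insensitive).
  TF_ASSIGN_OR_RETURN(tensorflow::data::TargetWorkers target,
                      tensorflow::data::ParseTargetWorkers(spec));
  return tensorflow::data::TargetWorkersToString(target);
}
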
+bool IsPreemptedError(const absl::Status& status); + +// Base class for data service clients. Data service clients are +// threadsafe. +class DataServiceClientBase { + public: + DataServiceClientBase(const std::string& address, const std::string& protocol) + : address_(address), protocol_(protocol) {} + + virtual ~DataServiceClientBase() = default; + // Not copyable or movable. + DataServiceClientBase(const DataServiceClientBase&) = delete; + DataServiceClientBase& operator=(const DataServiceClientBase&) = delete; + + // Initializes the client. Calling `Initialize()` is not required since the + // first RPC will perform any necessary initialization. However, it can be + // useful to call `Initialize()` proactively so that any errors that happen + // during initialization can be surfaced earlier. + virtual absl::Status Initialize() { return EnsureInitialized(); } + + protected: + // Initializes the client if it isn't already initialized. + virtual absl::Status EnsureInitialized() = 0; + + const std::string address_; + const std::string protocol_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/credentials_factory.h b/third_party/tflite-hdrs/tensorflow/core/data/service/credentials_factory.h new file mode 100644 index 00000000..d6a3bff5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/credentials_factory.h @@ -0,0 +1,77 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CREDENTIALS_FACTORY_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CREDENTIALS_FACTORY_H_ + +#include +#include + +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace data { + +// Credential factory implementations should be threadsafe since all callers +// to `GetCredentials` will get the same instance of `CredentialsFactory`. +class CredentialsFactory { + public: + virtual ~CredentialsFactory() = default; + + // Returns a protocol name for the credentials factory. This is the string to + // look up with `GetCredentials` to find the registered credentials factory. + virtual std::string Protocol() = 0; + + // Stores server credentials to `*out`. + virtual absl::Status CreateServerCredentials( + std::shared_ptr<::grpc::ServerCredentials>* out) = 0; + + // Stores client credentials to `*out`. + virtual absl::Status CreateClientCredentials( + std::shared_ptr<::grpc::ChannelCredentials>* out) = 0; + + // Registers a credentials factory. + static void Register(CredentialsFactory* factory); + + // Creates server credentials using the credentials factory registered as + // `protocol`, and stores them to `*out`. 
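
A hedged sketch of a custom factory built against the interface above, assuming gRPC's insecure credential helpers are available; the "grpc+insecure-demo" protocol name is hypothetical:

class InsecureDemoCredentialsFactory
    : public tensorflow::data::CredentialsFactory {
 public:
  // Hypothetical protocol name clients would use to select this factory.
  std::string Protocol() override { return "grpc+insecure-demo"; }

  absl::Status CreateServerCredentials(
      std::shared_ptr<::grpc::ServerCredentials>* out) override {
    *out = ::grpc::InsecureServerCredentials();
    return absl::OkStatus();
  }

  absl::Status CreateClientCredentials(
      std::shared_ptr<::grpc::ChannelCredentials>* out) override {
    *out = ::grpc::InsecureChannelCredentials();
    return absl::OkStatus();
  }
};

A common registration pattern is a static initializer whose factory outlives all lookups, e.g. tensorflow::data::CredentialsFactory::Register(new InsecureDemoCredentialsFactory());
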
+ static absl::Status CreateServerCredentials( + absl::string_view protocol, + std::shared_ptr<::grpc::ServerCredentials>* out); + + // Creates client credentials using the credentials factory registered as + // `protocol`, and stores them to `*out`. + static absl::Status CreateClientCredentials( + absl::string_view protocol, + std::shared_ptr<::grpc::ChannelCredentials>* out); + + // Returns whether a factory has been registered under the given protocol + // name. + static bool Exists(absl::string_view protocol); + + private: + // Gets the credentials factory registered via `Register` for the specified + // protocol, and stores it to `*out`. + static absl::Status Get(const absl::string_view protocol, + CredentialsFactory** out); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CREDENTIALS_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/cross_trainer_cache.h b/third_party/tflite-hdrs/tensorflow/core/data/service/cross_trainer_cache.h new file mode 100644 index 00000000..3ef48fe4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/cross_trainer_cache.h @@ -0,0 +1,355 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_CROSS_TRAINER_CACHE_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_CROSS_TRAINER_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/data/service/byte_size.h" +#include "tensorflow/core/framework/metrics.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// Sliding-window cache shared across concurrent trainers. Readers call `Get` to +// read elements they haven't read. After a trainer reads an element, it remains +// in the cache and the data is shared with other trainers. This is useful for +// datasets involving expensive computation, and multiple models use the same +// data for training. For example, for hyperparameter tuning. +// +// The cache progresses when a trainer that has consumed all elements in the +// cache requests additional data. It has a bounded size. Elements are garbage +// collected when the cache becomes full. Consequently, trainers read from a +// sliding window through the dataset and may not read the full dataset. +// +// The `CrossTrainerCache` class is thread-safe. +// +// Example usage: +// +// // `InfiniteRange` returns 1, 2, 3, ... in the `GetNext` calls. 
+// class InfiniteRange : public CachableSequence { +// public: +// StatusOr GetNext() override { +// return next_++; +// } +// +// size_t GetElementSizeBytes(const int64_t& element) const override { +// return sizeof(element); +// } +// +// private: +// int64_t next_ = 1; +// }; +// +// CrossTrainerCache cache( +// /*max_cache_size_bytes=*/10 * (size_t{1} << 30), // 10GB +// std::make_unique()); +// +// std::shared_ptr next; +// TF_ASSIGN_OR_RETURN(next, cache.Get("Trainer 1")); // Returns 1 +// TF_ASSIGN_OR_RETURN(next, cache.Get("Trainer 2")); // Returns 1 +// TF_ASSIGN_OR_RETURN(next, cache.Get("Trainer 1")); // Returns 2 +// TF_ASSIGN_OR_RETURN(next, cache.Get("Trainer 2")); // Returns 2 + +// To use the cache, the user needs to define a `CachableSequence` to generate +// an infinite sequence of data. It should implement a `GetNext` method to +// produce elements, and a `GetElementSizeBytes` method to estimate the element +// size in bytes. +template +class CachableSequence { + public: + virtual ~CachableSequence() = default; + + // Returns the next element to be cached. + virtual StatusOr GetNext() = 0; + + // Returns the estimated size of the element in bytes. + virtual size_t GetElementSizeBytes(const ElementType&) const = 0; +}; + +// Sliding-window cache shared across concurrent trainers. +template +class CrossTrainerCache { + public: + // Creates a `CrossTrainerCache` with `max_cache_size_bytes` of memory budget. + // The cache should be able to hold at least one element, i.e.: + // REQUIRES: `max_cache_size_bytes >= max(GetElementSizeBytes(*))` + explicit CrossTrainerCache( + size_t max_cache_size_bytes, + std::unique_ptr> cachable_sequence); + virtual ~CrossTrainerCache() = default; + CrossTrainerCache(const CrossTrainerCache&) = delete; + CrossTrainerCache& operator=(const CrossTrainerCache&) = delete; + + // Gets the next element for a trainer. A `trainer_id` identifies the trainer + // reading from the cache. A trainer reads the next element it hasn't read + // before. After a trainer reads data, the data is cached and reused by other + // trainers. + StatusOr> Get( + const std::string& trainer_id); + + // Cancels the cache with `status` and notifies the readers. After cancelling, + // all `Get` calls will return `status`. + // REQUIRES: !status.ok() + void Cancel(absl::Status status); + + // Returns true if the cache has been cancelled. + bool IsCancelled() const; + + private: + struct CacheQueryResult { + std::shared_ptr element; + bool cache_hit; + }; + + // Returns the next element and metrics about this query. + StatusOr GetCacheQueryResult(const std::string& trainer_id); + + // Returns true if element is ready for `trainer_id`. An element is ready if + // other trainers have read the data and the data remains in the cache. If the + // data is not ready, one of the trainers need to extend the cache. + bool IsElementReady(const std::string& trainer_id); + + // Returns the absolute element index relative to the dataset (not relative to + // the cached elements). + size_t GetElementIndex(const std::string& trainer_id); + + // Returns the next element for `trainer_id`. + StatusOr> GetElement( + const std::string& trainer_id); + + // Reads a new element and writes it into the cache. + absl::Status ExtendCache(); + + // Frees old elements to keep the cache size below `max_cache_size_bytes_`. + // `new_element_size_bytes` is the size of the new element being inserted. + void FreeSpace(size_t new_element_size_bytes); + + // Records the cache hit rate and cache size. 
+ void RecordMetrics(const CacheQueryResult& result); + + // Maximum cache size in bytes. + const size_t max_cache_size_bytes_; + + // The element sequence over which the sliding window cache operates. + std::unique_ptr> cachable_sequence_; + + mutable mutex mu_; + mutable condition_variable cv_; + + // If `status_` is non-OK, the cache is cancelled, and all method calls will + // return this status. + absl::Status status_ TF_GUARDED_BY(mu_) = absl::OkStatus(); + + // `cache_` stores the cached elements. + std::deque> cache_ TF_GUARDED_BY(mu_); + size_t cache_size_bytes_ TF_GUARDED_BY(mu_) = 0; + size_t cache_start_index_ TF_GUARDED_BY(mu_) = 0; + + // True if one thread is extending the cache. + bool extending_cache_ TF_GUARDED_BY(mu_) = false; + + // Maps trainer IDs to element indices. The indices are absolute indices + // within the dataset. The actual index to use with `cache_` would be + // `trainer_to_element_index_map_[trainer_id] - cache_start_index_`. + absl::flat_hash_map trainer_to_element_index_map_ + TF_GUARDED_BY(mu_); +}; + +template +CrossTrainerCache::CrossTrainerCache( + size_t max_cache_size_bytes, + std::unique_ptr> cachable_sequence) + : max_cache_size_bytes_(max_cache_size_bytes), + cachable_sequence_(std::move(cachable_sequence)) { + DCHECK_GT(max_cache_size_bytes, 0) + << "CrossTrainerCache size must be greater than 0."; + VLOG(2) << "Initialized tf.data service cross-trainer cache with " + << ByteSize::Bytes(max_cache_size_bytes) << " of memory."; +} + +template +StatusOr> +CrossTrainerCache::Get(const std::string& trainer_id) + TF_LOCKS_EXCLUDED(mu_) { + if (trainer_id.empty()) { + return errors::InvalidArgument( + "tf.data service cross-trainer cache requires a non-empty trainer ID."); + } + + TF_ASSIGN_OR_RETURN(CacheQueryResult result, GetCacheQueryResult(trainer_id)); + RecordMetrics(result); + return result.element; +} + +template +StatusOr::CacheQueryResult> +CrossTrainerCache::GetCacheQueryResult( + const std::string& trainer_id) { + bool should_extend_cache = false; + while (true) { + { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(status_); + if (IsElementReady(trainer_id)) { + TF_ASSIGN_OR_RETURN(std::shared_ptr element, + GetElement(trainer_id)); + return CacheQueryResult{element, + /*is_cache_hit=*/!should_extend_cache}; + } + + // Extends the cache or waits for another thread to extend the cache. When + // concurrent trainers wait for the next element, only one of them should + // extend the cache. + if (extending_cache_) { + should_extend_cache = false; + cv_.wait(l); + } else { + should_extend_cache = true; + extending_cache_ = true; + } + } + + if (should_extend_cache) { + absl::Status s = ExtendCache(); + mutex_lock l(mu_); + extending_cache_ = false; + cv_.notify_all(); + TF_RETURN_IF_ERROR(s); + } + } +} + +template +bool CrossTrainerCache::IsElementReady( + const std::string& trainer_id) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return GetElementIndex(trainer_id) < cache_start_index_ + cache_.size(); +} + +template +StatusOr> +CrossTrainerCache::GetElement(const std::string& trainer_id) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + size_t element_index = GetElementIndex(trainer_id); + if (element_index >= std::numeric_limits::max()) { + return errors::Internal( + "tf.data service caching element index exceeds integer limit. 
Got ", + element_index); + } + + std::shared_ptr result = + cache_[element_index - cache_start_index_]; + trainer_to_element_index_map_[trainer_id] = element_index + 1; + return result; +} + +template +size_t CrossTrainerCache::GetElementIndex( + const std::string& trainer_id) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + size_t element_index = trainer_to_element_index_map_[trainer_id]; + if (element_index < cache_start_index_) { + element_index = cache_start_index_; + } + return element_index; +} + +template +absl::Status CrossTrainerCache::ExtendCache() + TF_LOCKS_EXCLUDED(mu_) { + TF_ASSIGN_OR_RETURN(ElementType element, cachable_sequence_->GetNext()); + size_t new_element_size_bytes = + cachable_sequence_->GetElementSizeBytes(element); + if (new_element_size_bytes > max_cache_size_bytes_) { + return errors::InvalidArgument( + "tf.data service element size is larger than cache size in bytes. Got ", + "element size: ", new_element_size_bytes, + " and cache size: ", max_cache_size_bytes_); + } + + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(status_); + FreeSpace(new_element_size_bytes); + cache_.push_back(std::make_shared(std::move(element))); + cache_size_bytes_ += new_element_size_bytes; + return absl::OkStatus(); +} + +template +void CrossTrainerCache::FreeSpace(size_t new_element_size_bytes) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + size_t num_elements_discarded = 0; + while (!cache_.empty() && + cache_size_bytes_ + new_element_size_bytes > max_cache_size_bytes_) { + size_t free_bytes = + cachable_sequence_->GetElementSizeBytes(*cache_.front()); + cache_.pop_front(); + cache_size_bytes_ -= free_bytes; + ++cache_start_index_; + ++num_elements_discarded; + } + + VLOG(3) << "Freed " << num_elements_discarded << " element(s) from " + << "tf.data service cross-trainer cache. Memory usage: " + << ByteSize::Bytes(cache_size_bytes_) << "."; +} + +template +void CrossTrainerCache::Cancel(absl::Status status) + TF_LOCKS_EXCLUDED(mu_) { + DCHECK(!status.ok()) + << "Cancelling CrossTrainerCache requires a non-OK status. Got " + << status; + VLOG(2) << "Cancel tf.data service cross-trainer cache with status " + << status; + mutex_lock l(mu_); + status_ = std::move(status); + cv_.notify_all(); +} + +template +bool CrossTrainerCache::IsCancelled() const + TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + return !status_.ok(); +} + +template +void CrossTrainerCache::RecordMetrics( + const CacheQueryResult& result) { + metrics::RecordTFDataServiceCrossTrainerCacheQuery(result.cache_hit); + size_t cache_size_bytes = 0; + { + mutex_lock l(mu_); + cache_size_bytes = cache_size_bytes_; + } + metrics::RecordTFDataServiceCrossTrainerCacheSizeBytes(cache_size_bytes); +} + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_CROSS_CLIENT_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/data_transfer.h b/third_party/tflite-hdrs/tensorflow/core/data/service/data_transfer.h new file mode 100644 index 00000000..23c8247d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/data_transfer.h @@ -0,0 +1,152 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_DATA_TRANSFER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_DATA_TRANSFER_H_ + +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/dataset.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// The result of a GetElement request. Exactly one of the following will be +// true: (1) `components` is nonempty (2) `end_of_sequence` is true (3) `skip` +// is true. +struct GetElementResult { + GetElementResult() = default; + GetElementResult(const GetElementResult&) = delete; + GetElementResult& operator=(const GetElementResult&) = delete; + GetElementResult(GetElementResult&&) = default; + GetElementResult& operator=(GetElementResult&&) = default; + + // Creates a copy of this result. This is used to create multiple copies of + // the same cached value. + GetElementResult Copy() const; + + // Estimated memory used by this object, measured in bytes. + size_t EstimatedMemoryUsageBytes() const; + + // A dataset element produced by a GetElement request. + std::vector components; + // The element's index within the task it came from. + int64_t element_index = 0; + // If true, indicates that there is no more data to read. + bool end_of_sequence = false; + // If true, indicates that there is still data, but the caller should skip + // reading from the worker. This is used for load balancing when doing round + // robin reads. + bool skip = false; +}; + +// Client for communicating with the tf.data service transfer server. +class DataTransferClient { + public: + struct Config { + absl::string_view protocol; + std::string address; + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info; + Allocator* allocator; + }; + using ClientFactoryT = + std::function*)>; + virtual ~DataTransferClient() = default; + + // Fetches the next element. + virtual absl::Status GetElement(const GetElementRequest& req, + GetElementResult& result) = 0; + + // Makes a best effort to cancel all outstanding calls in progress for the + // client, and causes further calls to return Cancelled status. + virtual void TryCancel() = 0; + + // Registers a DataTransferClient factory under `name`. + static void Register(std::string name, ClientFactoryT factory); + + // Builds a DataTransferClient from the factory registered under `name`. + static absl::Status Build(std::string name, Config config, + std::unique_ptr* out); + + // Returns a string describing properties of the client relevant for checking + // compatibility with a server for a given protocol. 
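+  // For example, a caller might check a client against a server roughly as
+  // follows (illustrative sketch, not part of the upstream comment; `server`
+  // and `client` are placeholder pointers to a DataTransferServer and a
+  // DataTransferClient):
+  //
+  //   TF_ASSIGN_OR_RETURN(std::string info, server->GetCompatibilityInfo());
+  //   TF_RETURN_IF_ERROR(client->CheckCompatibility(info));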
+ virtual absl::StatusOr GetCompatibilityInfo() const { + return std::string(); + } + + // Returns an error if the client is incompatible with a server which has the + // properties described in `server_compatibility_info`. + virtual absl::Status CheckCompatibility( + const std::string& server_compatibility_info) const { + return absl::OkStatus(); + } + + protected: + Env* const env_ = Env::Default(); +}; + +// Server for communicating with the tf.data service transfer client. +class DataTransferServer { + public: + using GetElementT = + std::function; + using ServerFactoryT = std::function*)>; + virtual ~DataTransferServer() = default; + + // Starts DataTransferServer, it should be available for requests afterwards. + virtual absl::Status Start(const experimental::WorkerConfig& config) = 0; + + // Return the port that this server is listening on. + virtual int Port() const = 0; + + // Register a DataTransferServer factory under `name`. + static void Register(std::string name, ServerFactoryT factory); + + // Builds a DataTransferServer from the factory registered with `name`. + static absl::Status Build(std::string name, GetElementT get_element, + std::shared_ptr* out); + + // Returns a string describing properties of the server relevant for checking + // compatibility with a client for a given protocol. + virtual absl::StatusOr GetCompatibilityInfo() const { + return std::string(); + } + + // If `true`, data service clients should fall back to gRPC for this server if + // they fail to create a data transfer client for it. + virtual bool FallBackToGrpcAtClientCreationTime() const { return true; } + + // If `true`, data service clients should fall back to gRPC for this server if + // it nonretryably fails to transfer an element. + virtual bool FallBackToGrpcAtGetElementTime() const { return true; } +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_DATA_TRANSFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/dataset_store.h b/third_party/tflite-hdrs/tensorflow/core/data/service/dataset_store.h new file mode 100644 index 00000000..f79120bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/dataset_store.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_DATASET_STORE_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_DATASET_STORE_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/data/service/dispatcher_state.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +namespace data { + +// An interface for storing and getting dataset definitions. 
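+// For example, a caller might persist and later re-load a dataset definition
+// roughly as follows (illustrative sketch; the directory, the key, and the
+// `dataset_def` value are placeholders):
+//
+//   FileSystemDatasetStore store("/tmp/tf_data_datasets");
+//   TF_RETURN_IF_ERROR(store.Put("dataset_1000", dataset_def));
+//
+//   std::shared_ptr<const DatasetDef> restored;
+//   TF_RETURN_IF_ERROR(store.Get("dataset_1000", restored));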
+class DatasetStore { + public: + virtual ~DatasetStore() = default; + + // Stores the given dataset under the given key. Overwrites a dataset if it + // already exists. + virtual absl::Status Put(const std::string& key, + const DatasetDef& dataset) = 0; + // Gets the dataset for the given key, storing the dataset in `dataset_def`. + virtual absl::Status Get(const std::string& key, + std::shared_ptr& dataset_def) = 0; +}; + +// Dataset store which reads and writes datasets within a directory. +// The dataset with key `key` is stored at the path "datasets_dir/key". +class FileSystemDatasetStore : public DatasetStore { + public: + explicit FileSystemDatasetStore(const std::string& datasets_dir); + FileSystemDatasetStore(const FileSystemDatasetStore&) = delete; + FileSystemDatasetStore& operator=(const FileSystemDatasetStore&) = delete; + + absl::Status Put(const std::string& key, const DatasetDef& dataset) override; + absl::Status Get(const std::string& key, + std::shared_ptr& dataset_def) override; + + private: + const std::string datasets_dir_; +}; + +// DatasetStore which stores all datasets in memory. This is useful when the +// dispatcher doesn't have a work directory configured. +class MemoryDatasetStore : public DatasetStore { + public: + MemoryDatasetStore() = default; + MemoryDatasetStore(const MemoryDatasetStore&) = delete; + MemoryDatasetStore& operator=(const MemoryDatasetStore&) = delete; + + absl::Status Put(const std::string& key, const DatasetDef& dataset) override; + absl::Status Get(const std::string& key, + std::shared_ptr& dataset_def) override; + + private: + // Mapping from key to dataset definition. + absl::flat_hash_map> datasets_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_DATASET_STORE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_client.h b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_client.h new file mode 100644 index 00000000..253d8ec0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_client.h @@ -0,0 +1,153 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_CLIENT_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_CLIENT_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/data/service/common.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/dispatcher.grpc.pb.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/data_service.pb.h" +#include "tensorflow/core/protobuf/service_config.pb.h" +#include "tensorflow/core/protobuf/snapshot.pb.h" + +namespace tensorflow { +namespace data { + +// Client for communicating with the tf.data service dispatcher. +class DataServiceDispatcherClient : public DataServiceClientBase { + public: + DataServiceDispatcherClient(const std::string& address, + const std::string& protocol) + : DataServiceClientBase(address, protocol) {} + + absl::Status Initialize() override; + + // Sends a heartbeat to the dispatcher. If the worker wasn't already + // registered with the dispatcher, this will register the worker. The + // dispatcher will report which new tasks the worker should run, and which + // tasks it should delete. + absl::StatusOr WorkerHeartbeat( + const WorkerHeartbeatRequest& request); + + // Updates the dispatcher with information about the worker's state. + absl::Status WorkerUpdate(const std::string& worker_address, + std::vector& task_progress); + + // Gets a dataset definition for the given dataset id, and stores the + // definition in `dataset_def`. + absl::Status GetDatasetDef(const std::string& dataset_id, + DatasetDef& dataset_def); + + // Gets the next split for the specified iteration id, repetition, and split + // provider index. + absl::Status GetSplit(int64_t iteration_id, int64_t repetition, + int64_t split_provider_index, Tensor& split, + bool& end_of_splits); + + // Gets the next split for the specified source of a stream of the snapshot in + // `base_path`. If `end_of_splits` returns true, then there are no more splits + // to be processed for the specified stream source. + virtual absl::Status GetSnapshotSplit( + const std::string& worker_address, const std::string& base_path, + int64_t stream_index, int64_t source_index, int64_t repetition_index, + Tensor& split, int64_t& local_split_index, bool& end_of_splits); + + // Initiates the process of materializing `dataset`'s output to `path`. + absl::Status Snapshot( + const DatasetDef& dataset, const std::string& path, + const experimental::DistributedSnapshotMetadata& metadata); + + // Registers a dataset with the tf.data service, and stores the generated + // dataset id in `dataset_id`. + absl::Status RegisterDataset( + const DatasetDef& dataset, const DataServiceMetadata& metadata, + const std::optional& requested_dataset_id, + std::string& dataset_id); + + // If `job_name` is set, looks up a job matching `job_name`. + // If `job_name` is absent or no matching job is found, creates a + // new job. The resulting job id is stored in `job_id`. 
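+  //
+  // A training client typically drives the dispatcher roughly in this order
+  // (illustrative sketch; the address is a placeholder, and `dataset_def`,
+  // `metadata`, `processing_mode`, and `target_workers` are assumed to be
+  // defined elsewhere):
+  //
+  //   DataServiceDispatcherClient client("localhost:5000", "grpc");
+  //   std::string dataset_id;
+  //   TF_RETURN_IF_ERROR(client.RegisterDataset(
+  //       dataset_def, metadata, /*requested_dataset_id=*/std::nullopt,
+  //       dataset_id));
+  //
+  //   int64_t job_id = 0;
+  //   TF_RETURN_IF_ERROR(client.GetOrCreateJob(
+  //       dataset_id, processing_mode, /*job_name=*/std::nullopt,
+  //       /*num_consumers=*/std::nullopt, /*use_cross_trainer_cache=*/false,
+  //       target_workers, job_id));
+  //
+  //   int64_t iteration_client_id = 0;
+  //   TF_RETURN_IF_ERROR(client.GetOrCreateIteration(
+  //       job_id, /*repetition=*/0, iteration_client_id));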
+ absl::Status GetOrCreateJob(const std::string& dataset_id, + const ProcessingModeDef& processing_mode, + const std::optional& job_name, + std::optional num_consumers, + bool use_cross_trainer_cache, + TargetWorkers target_workers, int64_t& job_id); + + // Looks up an iteration of a job, creating an iteration if one doesn't + // already exist. The returned `iteration_client_id` can be used to query + // information about the iteration. The client should call + // `ReleaseIterationClient` when finished with the iteration, so that + // resources can be reclaimed. + absl::Status GetOrCreateIteration(int64_t job_id, int64_t repetition, + int64_t& iteration_client_id); + + // Releases a iteration client id, indicating that the id will no longer be + // used to read from the iteration. + absl::Status ReleaseIterationClient(int64_t iteration_client_id); + + // Attempts to remove a task. The task is removed if all consumers try to + // remove the task in the same round. + absl::Status MaybeRemoveTask(int64_t task_id, int64_t consumer_index, + int64_t round, bool& removed); + + // Heartbeats to the dispatcher, getting back the tasks that should be + // running, and whether the iteration is finished. + absl::Status ClientHeartbeat(ClientHeartbeatRequest& req, + ClientHeartbeatResponse& resp); + + // Queries the dispatcher for its registered workers. The worker info will be + // stored in `workers`. + absl::Status GetWorkers(std::vector& workers); + + // Returns data service metadata for the registered dataset. + absl::Status GetDataServiceMetadata(const std::string& dataset_id, + DataServiceMetadata& metadata); + + // Returns data service config of the data service cluster. + absl::Status GetDataServiceConfig(DataServiceConfig& config); + + // Returns information about the decision to disable compression at runtime + // for a given dataset. + absl::Status DisableCompressionAtRuntime( + const std::string& dataset_id, bool disable_compression_at_runtime, + DisableCompressionAtRuntimeResponse& response); + + protected: + absl::Status EnsureInitialized() override; + + private: + mutex mu_; + // Initialization is guarded by `mu_`, but using the stub does not require + // holding `mu_` + std::unique_ptr stub_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_impl.h b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_impl.h new file mode 100644 index 00000000..6fa299dc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_impl.h @@ -0,0 +1,412 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_IMPL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_IMPL_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/time/time.h" +#include "tensorflow/core/data/service/auto_scaler.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/dataset_store.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/data/service/dispatcher_state.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/data/service/snapshot/snapshot_manager.h" +#include "tensorflow/core/data/service/task_remover.h" +#include "tensorflow/core/data/service/worker.grpc.pb.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/data_service.pb.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// A service which coordinates a pool of workers to serve dataset elements over +// RPC. +// +// Glossary: +// * Dataset: A definition of how to generate a potentially large collection of +// elements. +// * Iteration: A coordinated phase of reading from the tf.data service. An +// iteration produces some amount of data, and (potentially multiple) +// consumers consume the data from the iteration until there is no data left. +// Each iteration has a ProcessingModeDef which determines what data it +// produces. +// * Task: An iteration is broken into multiple tasks, which each represent +// iterating over all of or part of the dataset. Workers process tasks. +// * Consumer: A process reading from the tf.data service. +// +// **Adding workers** +// +// tf.data service supports adding workers mid-iteration. When a new worker +// connects to the dispatcher, the dispatcher creates a new task for the worker, +// one task for each outstanding iteration. Consumers periodically heartbeat to +// the dispatcher to learn about new tasks. +// +// For non-round-robin-reads, there is no coordination among consumers. Each +// consumer will start reading from the new task as soon as it learns about the +// task from its heartbeat. Round robin reads, on the other hand, require +// consumers to read from the same task at each step. This requires coordination +// to ensure that all consumers start reading from the new task in the same +// round. +// +// The protocol for adding round robin tasks works as follows: +// +// - The dispatcher keeps track of which round each round-robin iteration is on. +// This +// information is reported by consumers in their heartbeats. +// - When a new worker joins and there is an outstanding round-robin iteration, +// we create a new task for the iteration and assign it to the worker. +// However, we don't yet report the task in consumer heartbeats. +// We call the task a "pending task" and add it to its iteration's "pending +// tasks" queue. +// - When we create a pending task, we choose a "target round" to try adding +// the task to. 
The target round is chosen by adding a "target round delta" to +// the latest reported round for the iteration. +// - When a consumer heartbeats for an iteration and there is a pending task for +// that iteration, the dispatcher sends a heartbeat response telling the +// consumer to block before reading from the target round. +// - When a consumer receives a heartbeat response telling it to block +// (before reading) a round, the consumer try to block the round. If the +// consumer has already started the round, it will too late to block the +// round. +// - When consumers heartbeat, they tell the dispatcher their current round and +// whether they have blocked themselves from reading past a certain round. If +// a consumer reports a current round exceeding the target round, the target +// round has failed and needs to be increased. We choose a new target round by +// doubling the previous target round delta. If the consumer reports that it +// has blocked before the target round, we record that the consumer is ready +// to add the new task. Once all consumers are ready to add the new task, we +// remove the task from the pending tasks list and begin reporting the task to +// consumers. We set the "starting_round" field of the task to indicate the +// target round where all consumers should start reading from the task. +// - If a new worker joins while there are already pending tasks, a pending +// task for the new worker is created and queued behind the existing tasks. +// The new task won't be considered until all previous pending tasks have been +// successfully added. +// +// An example of executing this protocol with two consumers could go as follows: +// 1. Consumers read up to round 50 and heartbeat that they are on round 50. +// 2. A new worker joins. Dispatcher chooses round 51 as the target round. +// 3. Consumer 1 heartbeats that its current round is 50. Dispatcher tells it to +// block round 51. +// 4. Consumer 2 heartbeats that its current round is 51. Dispatcher realizes +// that it is too late to block round 51 and chooses round 53 as the new +// target round. Dispatcher tells consumer 2 to block round 53. +// 5. Consumer 1 heartbeats that its current round is 50 and that it has blocked +// round 51. Dispatcher tells it to block round 53 instead. Dispatcher +// records that consumer 1 is ready to add a task in round 53. +// 6. Consumer 2 heartbeats that its current round is 52 and it has blocked +// round 53. Dispatcher realizes that all consumers are blocked on round 53 +// or earlier and promotes the task from pending to regular. Dispatcher sends +// consumer 2 a task list containing the new task, and tells consumer 2 that +// it no longer needs to block. +// 7. Consumer 1 heartbeats. Dispatcher sends consumer 1 the task list +// containing the new task, and tells it that it no longer needs to block. +// +class DataServiceDispatcherImpl { + public: + explicit DataServiceDispatcherImpl( + const experimental::DispatcherConfig& config); + + ~DataServiceDispatcherImpl(); + + // Starts the dispatcher. If there is a journal, this will read from the + // journal to restore the dispatcher's state. + absl::Status Start(); + + // Stops the dispatcher. After stopping, RPCs should return without blocking. + void Stop(); + + // Returns the number of active iterations. + size_t NumActiveIterations() TF_LOCKS_EXCLUDED(mu_); + + // See dispatcher.proto for API documentation. + + /// Worker-facing API. 
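+  //
+  // For example, an in-process caller (such as a test) might exercise this
+  // API roughly as follows (illustrative sketch; the proto field name
+  // `worker_address` is assumed from dispatcher.proto, and the address is a
+  // placeholder):
+  //
+  //   experimental::DispatcherConfig config;
+  //   DataServiceDispatcherImpl dispatcher(config);
+  //   TF_RETURN_IF_ERROR(dispatcher.Start());
+  //
+  //   WorkerHeartbeatRequest request;
+  //   request.set_worker_address("localhost:6000");
+  //   WorkerHeartbeatResponse response;
+  //   TF_RETURN_IF_ERROR(dispatcher.WorkerHeartbeat(&request, &response));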
+ absl::Status WorkerHeartbeat(const WorkerHeartbeatRequest* request, + WorkerHeartbeatResponse* response); + absl::Status WorkerUpdate(const WorkerUpdateRequest* request, + WorkerUpdateResponse* response); + absl::Status GetDatasetDef(const GetDatasetDefRequest* request, + GetDatasetDefResponse* response); + absl::Status GetSplit(const GetSplitRequest* request, + GetSplitResponse* response); + + /// Client-facing API. + absl::Status GetVersion(const GetVersionRequest* request, + GetVersionResponse* response); + absl::Status GetOrRegisterDataset(const GetOrRegisterDatasetRequest* request, + GetOrRegisterDatasetResponse* response); + absl::Status GetDataServiceMetadata( + const GetDataServiceMetadataRequest* request, + GetDataServiceMetadataResponse* response); + absl::Status GetDataServiceConfig(const GetDataServiceConfigRequest* request, + GetDataServiceConfigResponse* response); + absl::Status GetOrCreateJob(const GetOrCreateJobRequest* request, + GetOrCreateJobResponse* response); + absl::Status GetOrCreateIteration(const GetOrCreateIterationRequest* request, + GetOrCreateIterationResponse* response); + absl::Status ReleaseIterationClient( + const ReleaseIterationClientRequest* request, + ReleaseIterationClientResponse* response); + absl::Status MaybeRemoveTask(const MaybeRemoveTaskRequest* request, + MaybeRemoveTaskResponse* response); + absl::Status ClientHeartbeat(const ClientHeartbeatRequest* request, + ClientHeartbeatResponse* response); + absl::Status GetWorkers(const GetWorkersRequest* request, + GetWorkersResponse* response); + absl::Status Snapshot(const SnapshotRequest* request, + SnapshotResponse* response); + absl::Status GetSnapshotSplit(const GetSnapshotSplitRequest* request, + GetSnapshotSplitResponse* response); + absl::Status GetSnapshotStreams(const GetSnapshotStreamsRequest* request, + GetSnapshotStreamsResponse* response); + absl::Status DisableCompressionAtRuntime( + const DisableCompressionAtRuntimeRequest* request, + DisableCompressionAtRuntimeResponse* response); + + // Exports the dispatcher state for debugging. + DispatcherStateExport ExportState() const; + + private: + // A thread which periodically checks for iterations to clean up, clients to + // release, workers to consider missing, and snapshot streams to reassign. + void MaintenanceThread(); + + // Restores split providers from the state in `iteration` and stores them in + // `restored`. + absl::Status RestoreSplitProviders( + const DispatcherState::Iteration& iteration, + std::vector>& restored) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Makes split providers for the specified `dataset_id`, and stores them in + // `split_providers`. + absl::Status MakeSplitProviders( + const std::string& dataset_id, + std::vector>& split_providers) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Registers a dataset, storing the new dataset's id in `dataset_id`. + absl::Status RegisterDataset(const DatasetDef& dataset, + const DataServiceMetadata& metadata, + const std::string& requested_dataset_id, + std::string& dataset_id) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Finds the dataset ID with the requested dataset ID. + // Returns nullptr if no such dataset exists. + absl::StatusOr> FindDataset( + const GetOrRegisterDatasetRequest& request); + // Gets a worker's stub from `worker_stubs_`, or if none exists, creates a + // stub and stores it in `worker_stubs_`. A borrowed pointer to the stub is + // stored in `out_stub`. 
+ absl::Status GetOrCreateWorkerStub(const std::string& worker_address, + WorkerService::Stub*& out_stub) + TF_LOCKS_EXCLUDED(mu_); + // Creates a job and stores it in `job`. + absl::Status CreateJob(const std::string& job_name, + const GetOrCreateJobRequest& request, + std::shared_ptr& job) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Creates an iteration and stores it in `iteration`. This method updates the + // dispatcher state with the new iteration, but does not assign tasks to + // workers. + absl::Status CreateIteration( + const GetOrCreateIterationRequest& request, + std::shared_ptr& iteration) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Creates tasks for the specified worker, one task for every unfinished + // iteration. + absl::Status CreateTasksForWorker(const std::string& worker_address); + // Finds tasks that should be deleted from a worker, updating the heartbeat + // response. + absl::Status FindTasksToDelete( + const absl::flat_hash_set& current_tasks, + const std::vector>& + assigned_tasks, + WorkerHeartbeatResponse* response); + // Finds new tasks that should be assigned to a worker and adds them to + // the heartbeat response. + absl::Status FindNewTasks( + const std::string& worker_address, + const absl::flat_hash_set& current_tasks, + std::vector>& assigned_tasks, + WorkerHeartbeatResponse* response); + // Reports the processing time of each active task to `auto_scaler_`. + void ReportProcessingTimesFromActiveTasks( + const std::vector& active_tasks, + const std::string& worker_address) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Acquires an iteration client id to read from the given iteration and sets + // `iteration_client_id`. + absl::Status AcquireIterationClientId( + const std::shared_ptr& iteration, + int64_t& iteration_client_id) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Creates one task for each worker, for the given iteration. The created + // tasks are stored in `tasks`. This method only updates dispatcher metadata + // with the new tasks, but doesn't assign the tasks to the workers. + absl::Status CreateTasksForIteration( + std::shared_ptr iteration, + std::vector>& tasks) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Creates a new task for an iteration. The created task may be either + // pending or active. + absl::Status CreateTask( + std::shared_ptr iteration, + const std::string& worker_address, + std::shared_ptr& task) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Creates a pending task for a round robin iteration. All consumers need to + // agree on which round to add the task in before the pending task can be + // promoted to a regular task. + absl::Status CreatePendingTask( + std::shared_ptr iteration, + const std::string& worker_address) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Creates a new active task for an iteration, storing the created task in + // `task`. + absl::Status CreateActiveTask( + std::shared_ptr iteration, + const std::string& worker_address, + std::shared_ptr& task); + // Assigns the list of tasks to the workers indicated by their + // `worker_address` fields. + absl::Status AssignTasks( + std::vector> tasks) + TF_LOCKS_EXCLUDED(mu_); + // Assigns a task to the worker indicated by its `worker_address` field. + absl::Status AssignTask(std::shared_ptr task) + TF_LOCKS_EXCLUDED(mu_); + // Validates that an existing job matches a given request. + // Returns an error status describing any difference. 
+ absl::Status ValidateMatchingJob( + std::shared_ptr job, + const GetOrCreateJobRequest& request) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Fills out a TaskDef with information about a task. + absl::Status PopulateTaskDef( + std::shared_ptr task, + TaskDef* task_def) const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Checks that the dispatcher has started, returning UNAVAILABLE if it hasn't. + absl::Status CheckStarted() TF_LOCKS_EXCLUDED(mu_); + // Restores ongoing tf.data snapshots. + absl::Status RestoreSnapshots(); + // Records that a split was produced by a call to `GetSplit`. + absl::Status RecordSplitProduced(int64_t iteration_id, int64_t repetition, + int64_t split_provider_index, bool finished) + TF_LOCKS_EXCLUDED(mu_); + // Applies a state update, updating both the journal and the in-memory state. + absl::Status Apply(const Update& update) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Applies a state update, but doesn't update the journal. Only meant to be + // used when recovering state when the dispatcher starts. + absl::Status ApplyWithoutJournaling(const Update& update) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Removes the client with `client_id` from `auto_scaler_` + void RemoveClientFromAutoScaler(int64_t client_id) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Releases iteration clients that haven't heartbeated recently. + absl::Status ReleaseMissingClients() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Removes the worker with `worker_address` from `auto_scaler_`, which is + // potentially associated with multiple iterations. + void RemoveWorkerFromAutoScaler(const std::string& worker_address) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Checks for workers that haven't heartbeated recently and alerts the + // snapshot managers. + void DetectMissingWorkers() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Scans for old iterations and marks them as finished. + absl::Status GcOldIterations() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Returns true if an iteration should be garbage collected. + bool ShouldGcIteration(const DispatcherState::Iteration& iteration, + int64_t now_us) const; + // Gets a `DatasetDef` from `dataset_store_` for the given dataset id, and + // stores it in `dataset_def`. + absl::Status GetDatasetDef(const std::string& dataset_id, + std::shared_ptr& dataset_def) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Gets a `DatasetDef` from `dataset_store_` for the given dataset, and + // stores it in `dataset_def`. + absl::Status GetDatasetDef(const DispatcherState::Dataset& dataset, + std::shared_ptr& dataset_def) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + const experimental::DispatcherConfig config_; + Env* env_; + + mutable mutex mu_; + // Uses a separate mutex for `GetSplit` requests. `GetSplit` may be blocking. + // Locking `mu_` in `GetSplit` could block all other RPCs. + mutable mutex get_split_mu_; + bool started_ TF_GUARDED_BY(mu_) = false; + bool cancelled_ TF_GUARDED_BY(mu_) = false; + + // Cached worker stubs for communicating with workers. + absl::flat_hash_map> + worker_stubs_ TF_GUARDED_BY(mu_); + // Store of dataset definitions. + std::unique_ptr dataset_store_ TF_GUARDED_BY(mu_); + // Mapping from iteration id to the split providers for the iteration. + absl::flat_hash_map>> + split_providers_ TF_GUARDED_BY(mu_); + // Mapping from round robin iteration id to the round the iteration is + // currently on. This is based on the data provided by client heartbeats, + // and may be stale. 
+ absl::flat_hash_map round_robin_rounds_ TF_GUARDED_BY(mu_); + // Map from task id to a TaskRemover which determines when to remove the task. + absl::flat_hash_map> + remove_task_requests_ TF_GUARDED_BY(mu_); + // Map from client id to the time of the client's last heartbeat. + absl::flat_hash_map latest_client_heartbeats_time_ + TF_GUARDED_BY(mu_); + // Map from worker address to the time of the worker's last heartbeat. + absl::flat_hash_map latest_worker_heartbeats_time_ + TF_GUARDED_BY(mu_); + + // A manager for each snapshot resumed or started during the lifetime of this + // dispatcher instance. Note that these are *not* garbage collected; managers + // for completed snapshots will remain here for the lifetime of the dispatcher + // instance. They will even be recovered if the dispatcher is restarted. + absl::flat_hash_map> snapshots_ + TF_GUARDED_BY(mu_); + // A single stream assignment manager shared by all managers in `snapshots_`. + SnapshotAssignmentManager snapshot_assignment_manager_; + + std::optional> journal_writer_ + TF_GUARDED_BY(mu_); + DispatcherState state_ TF_GUARDED_BY(mu_); + // Condition variable for waking up the gc thread. + condition_variable maintenance_thread_cv_; + std::unique_ptr maintenance_thread_; + MultipleIterationsAutoScaler auto_scaler_; + + DataServiceDispatcherImpl(const DataServiceDispatcherImpl&) = delete; + void operator=(const DataServiceDispatcherImpl&) = delete; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_state.h b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_state.h new file mode 100644 index 00000000..054c3203 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/dispatcher_state.h @@ -0,0 +1,381 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_STATE_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_STATE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/service/common.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/graph_rewriters.h" +#include "tensorflow/core/data/service/journal.h" +#include "tensorflow/core/data/service/journal.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/data_service.pb.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// A class encapsulating the journaled state of the dispatcher. All state +// modifications must be done via `Apply`. This helps to ensure that +// replaying the journal will allow us to restore the exact same state. 
+// +// The following usage pattern will keep the journal in sync with the state of +// the dispatcher: +// { +// mutex_lock l(mu_); +// Update update = ... // create an update +// dispatcher_state.Apply(update); +// journal_writer.write(Update); +// // Unlock mu_ +// } +// +// The division of functionality between DispatcherImpl and DispatcherState is +// as follows: +// - DispatcherImpl is responsible for handling RPC requests, reading from +// DispatcherState, and deciding what updates to apply to DispatcherState. +// DispatcherImpl handles all synchronization. +// - DispatcherState is responsible for making the state changes requested by +// DispatcherImpl and for providing DispatcherImpl with read-only access to +// the state. +// +// DispatcherState is thread-compatible but not thread-safe. +class DispatcherState { + public: + DispatcherState(); + explicit DispatcherState( + const experimental::DispatcherConfig& dispatcher_config); + DispatcherState(const DispatcherState&) = delete; + DispatcherState& operator=(const DispatcherState&) = delete; + + // Applies the given update to the dispatcher's state. + absl::Status Apply(const Update& update); + + // A dataset registered with the dispatcher. + struct Dataset { + explicit Dataset(const std::string& dataset_id, + const DataServiceMetadata& metadata) + : dataset_id(dataset_id), metadata(metadata) {} + + const std::string dataset_id; + const DataServiceMetadata metadata; + }; + + // A worker registered with the dispatcher. + struct Worker { + explicit Worker(const RegisterWorkerUpdate& register_worker) + : address(register_worker.worker_address()), + transfer_servers({register_worker.transfer_servers().begin(), + register_worker.transfer_servers().end()}), + tags(register_worker.worker_tags().begin(), + register_worker.worker_tags().end()), + uid(register_worker.worker_uid()) {} + + const std::string address; + const std::vector transfer_servers; + const std::vector tags; + const int64_t uid; + }; + + // A key for identifying an iteration. The key contains a job name, + // as well as a repetition number describing which repetition of the job + // we are on. + struct IterationKey { + explicit IterationKey(absl::string_view name, int64_t repetition) + : name(name), repetition(repetition) {} + + friend bool operator==(const IterationKey& lhs, const IterationKey& rhs) { + return lhs.name == rhs.name && lhs.repetition == rhs.repetition; + } + + template + friend H AbslHashValue(H h, const IterationKey& k) { + return H::combine(std::move(h), k.name, k.repetition); + } + + std::string DebugString() const { + return absl::StrCat(name, "/", repetition); + } + + const std::string name; + const int64_t repetition; + }; + + struct DistributedEpochState { + explicit DistributedEpochState(int64_t num_split_providers) + : repetitions(num_split_providers), indices(num_split_providers) {} + + // The current repetition for each split provider. + std::vector repetitions; + // Number of splits produced so far by each split provider. + std::vector indices; + }; + + struct Task; + + struct PendingTask { + explicit PendingTask(std::shared_ptr task, int64_t target_round) + : task(std::move(task)), target_round(target_round) {} + + std::shared_ptr task; + // The target round where we want to insert the task. + int64_t target_round; + // Which consumers have responded that they have successfully blocked + // before the target round. + absl::flat_hash_set ready_consumers; + // How many times we have failed to add the task. 
+ int64_t failures = 0; + }; + + struct Job { + explicit Job(int64_t id, const std::string& dataset_id, + const ProcessingModeDef& processing_mode, std::string job_name, + std::optional num_consumers, + bool use_cross_trainer_cache, TargetWorkers target_workers) + : id(id), + dataset_id(dataset_id), + processing_mode(processing_mode), + job_name(job_name), + num_consumers(num_consumers), + use_cross_trainer_cache(use_cross_trainer_cache), + target_workers(target_workers) {} + + const int64_t id; + const std::string dataset_id; + const ProcessingModeDef processing_mode; + const std::string job_name; + const std::optional num_consumers; + const bool use_cross_trainer_cache; + const TargetWorkers target_workers; + }; + + // An iteration for processing a dataset. + struct Iteration { + explicit Iteration(int64_t iteration_id, IterationKey iteration_key, + int64_t num_split_providers, std::shared_ptr job) + : iteration_id(iteration_id), iteration_key(iteration_key), job(job) { + if (IsDynamicShard(job->processing_mode)) { + distributed_epoch_state = DistributedEpochState(num_split_providers); + } + } + + bool IsRoundRobin() const { return job->num_consumers.has_value(); } + + std::string DebugString() const { + return absl::StrCat(iteration_key.name, "_", iteration_key.repetition); + } + + const int64_t iteration_id; + const IterationKey iteration_key; + const std::shared_ptr job; + std::optional distributed_epoch_state; + std::queue pending_tasks; + int64_t num_clients = 0; + int64_t last_client_released_micros = -1; + bool finished = false; + // Indicates whether the iteration was garbage collected. + bool garbage_collected = false; + }; + + struct Task { + template + explicit Task(const T& create_task_update, + const std::shared_ptr& iteration) + : task_id(create_task_update.task_id()), + iteration(iteration), + worker_address(create_task_update.worker_address()), + transfer_servers(create_task_update.transfer_servers().begin(), + create_task_update.transfer_servers().end()), + worker_tags(create_task_update.worker_tags().begin(), + create_task_update.worker_tags().end()), + worker_uid(create_task_update.worker_uid()) {} + + const int64_t task_id; + const std::shared_ptr iteration; + const std::string worker_address; + const std::vector transfer_servers; + const std::vector worker_tags; + const int64_t worker_uid; + int64_t starting_round = 0; + bool finished = false; + bool removed = false; + }; + + using TasksById = absl::flat_hash_map>; + + // Returns the next available dataset ID. + std::string NextAvailableDatasetId() const; + + // Gets a dataset by id. Returns NOT_FOUND if there is no such dataset. + absl::Status DatasetFromId(const std::string& id, + std::shared_ptr& dataset) const; + + // Gets a worker by address. Returns NOT_FOUND if there is no such worker. + absl::Status WorkerFromAddress(const std::string& address, + std::shared_ptr& worker) const; + // Lists all workers registered with the dispatcher. + std::vector> ListWorkers() const; + + // Returns the next available job id. + int64_t NextAvailableJobId() const; + // Gets a job by id. Returns NOT_FOUND if there is no such job. + absl::Status JobFromId(int64_t job_id, std::shared_ptr& job) const; + // Gets a job by name. Returns NOT_FOUND if there is no such job. + absl::Status JobByName(const std::string& job_name, + std::shared_ptr& job) const; + + // Returns the next available iteration id. + int64_t NextAvailableIterationId() const; + // Returns a list of all iterations. 
+ std::vector> ListIterations() const; + // Gets an iteration by id. Returns NOT_FOUND if there is no such iteration. + absl::Status IterationFromId( + int64_t id, std::shared_ptr& iteration) const; + // Gets an iteration by key. Returns NOT_FOUND if there is no such iteration. + absl::Status IterationByKey( + IterationKey key, std::shared_ptr& iteration) const; + + // Returns the iteration associated with the given iteration client id. + // Returns NOT_FOUND if the iteration_client_id is unknown or has been + // released. + absl::Status IterationForIterationClientId( + int64_t iteration_client_id, std::shared_ptr& iteration); + // Returns a list of all active client ids. + std::vector ListActiveClientIds(); + // Returns the next available iteration client id. + int64_t NextAvailableIterationClientId() const; + + // Returns the next available task id. + int64_t NextAvailableTaskId() const; + // Gets a task by id. Returns NOT_FOUND if there is no such task. + absl::Status TaskFromId(int64_t id, std::shared_ptr& task) const; + // Stores a list of all tasks for the given iteration to `tasks`. Returns + // NOT_FOUND if there is no such iteration. + absl::Status TasksForIteration( + int64_t iteration_id, + std::vector>& tasks) const; + // Stores a list of all tasks for the given worker to `tasks`. Returns + // NOT_FOUND if there is no such worker. + absl::Status TasksForWorker( + const absl::string_view worker_address, + std::vector>& tasks) const; + + // If the dispatcher config explicitly specifies a list of workers, validates + // `worker_address` is in the list. + absl::Status ValidateWorker(absl::string_view worker_address) const; + + // If the dispatcher config specifies worker addresses, `GetWorkerIndex` + // returns the worker index according to the list. This is useful for + // deterministically sharding a dataset among a fixed set of workers. + absl::StatusOr GetWorkerIndex( + absl::string_view worker_address) const; + + // Returns the paths of all snapshots initiated during the lifetime of this + // journal. + const absl::flat_hash_set& ListSnapshotPaths() const { + return snapshot_paths_; + } + + // Returns a bool describing whether or not compression was disabled at + // runtime for the given dataset, if such a decision has been made. + std::optional CompressionDisabledAtRuntime( + const std::string& dataset_id) const; + + // Returns the current number of registered workers. 
+ int64_t GetNumberOfRegisteredWorkers() const { return workers_.size(); } + + private: + void RegisterDataset(const RegisterDatasetUpdate& register_dataset); + void RegisterWorker(const RegisterWorkerUpdate& register_worker); + void CreateJob(const CreateJobUpdate& create_job); + void CreateIteration(const CreateIterationUpdate& create_iteration); + void ProduceSplit(const ProduceSplitUpdate& produce_split); + void AcquireIterationClient( + const AcquireIterationClientUpdate& acquire_iteration_client); + void ReleaseIterationClient( + const ReleaseIterationClientUpdate& release_iteration_client); + void GarbageCollectIteration( + const GarbageCollectIterationUpdate& garbage_collect_iteration); + void RemoveTask(const RemoveTaskUpdate& remove_task); + void CreatePendingTask(const CreatePendingTaskUpdate& create_pending_task); + void ClientHeartbeat(const ClientHeartbeatUpdate& client_heartbeat); + void CreateTask(const CreateTaskUpdate& create_task); + void FinishTask(const FinishTaskUpdate& finish_task); + void Snapshot(const SnapshotUpdate& snapshot); + void CompressionDisabledAtRuntime(const CompressionDisabledAtRuntimeUpdate& + compression_disabled_at_runtime); + + // Updates the next available dataset ID. + void UpdateNextAvailableDatasetId(); + + int64_t next_available_dataset_id_ = 1000; + // Registered datasets, keyed by dataset ids. + absl::flat_hash_map> datasets_by_id_; + + // Registered workers, keyed by address. + absl::flat_hash_map> workers_; + + // Assigns an index to each worker according to worker addresses list + // specified in the dispatcher config. + WorkerIndexResolver worker_index_resolver_; + + int64_t next_available_job_id_ = 5000; + // Jobs, keyed by job ids. + absl::flat_hash_map> jobs_by_id_; + // Jobs, keyed by job names. + absl::flat_hash_map> jobs_by_name_; + + int64_t next_available_iteration_id_ = 2000; + // Iterations, keyed by iteration ids. + absl::flat_hash_map> iterations_; + // Iterations, keyed by their iteration keys. + absl::flat_hash_map> + iterations_by_key_; + + int64_t next_available_iteration_client_id_ = 3000; + // Mapping from client ids to the iterations they are associated with. + absl::flat_hash_map> + iterations_for_client_ids_; + + int64_t next_available_task_id_ = 4000; + // Tasks, keyed by task ids. + TasksById tasks_; + // List of tasks associated with each iteration. + absl::flat_hash_map>> + tasks_by_iteration_; + // Tasks, keyed by worker addresses. The values are a map from task id to + // task. + absl::flat_hash_map tasks_by_worker_; + // Paths for all snapshots initiated during the lifetime of this journal. + absl::flat_hash_set snapshot_paths_; + // A mapping of dataset id to a boolean describing whether or not compression + // was disabled at runtime for that dataset. + absl::flat_hash_map compression_disabled_at_runtime_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_DISPATCHER_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/graph_rewriters.h b/third_party/tflite-hdrs/tensorflow/core/data/service/graph_rewriters.h new file mode 100644 index 00000000..e1244fd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/graph_rewriters.h @@ -0,0 +1,108 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_GRAPH_REWRITERS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_GRAPH_REWRITERS_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace data { + +// Rewrites the dataset graph by removing the compression map. +class RemoveCompressionMapRewriter { + public: + // Returns `graph_def` with the compression map removed. + absl::StatusOr ApplyRemoveCompressionMapRewrite( + const GraphDef& graph_def); + + private: + tensorflow::RewriterConfig::CustomGraphOptimizer GetRewriteConfig() const; +}; + +// Rewrites the dataset graph by applying an auto-shard policy. +class AutoShardRewriter { + public: + // Creates an `AutoShardRewriter` according to `task_def`. Returns an error if + // the sharding policy is not a valid auto-shard policy. + static absl::StatusOr Create(const TaskDef& task_def); + + // Applies auto-sharding to `graph_def`. If auto-shard policy is OFF, returns + // the same graph as `graph_def`. Otherwise, returns the re-written graph. + absl::StatusOr ApplyAutoShardRewrite(const GraphDef& graph_def); + + private: + AutoShardRewriter(AutoShardPolicy auto_shard_policy, int64_t num_workers, + int64_t worker_index); + + // Creates a rewrite config based on the auto-shard policy. + tensorflow::RewriterConfig::CustomGraphOptimizer GetRewriteConfig() const; + + const AutoShardPolicy auto_shard_policy_; + const int64_t num_workers_; + const int64_t worker_index_; +}; + +// Maps a worker to its index, given a list of workers. For example, suppose +// `worker_addresses` contains +// /worker/task/0:worker, /worker/task/1:worker, /worker/task/2:worker, +// then +// /worker/task/0:worker maps to index 0, +// /worker/task/1:worker maps to index 1, +// /worker/task/2:worker maps to index 2. +// This is useful for deterministically sharding a dataset among a fixed set of +// tf.data service workers. +class WorkerIndexResolver { + public: + // Constructs a `WorkerIndexResolver` to generate worker indexes according to + // the specified worker addresses. The worker addresses can be "host" or + // "host:port", where "port" is a number, named port, or "%port%" to be + // replaced with the actual port. + template + explicit WorkerIndexResolver(const T& worker_addresses) + : worker_addresses_(worker_addresses.cbegin(), worker_addresses.cend()) {} + + // Validates `worker_address`. Returns an error if the `worker_addresses` list + // is non-empty and `worker_address` is not specified in the worker addresses + // list (with optional port replacement). 
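+  //
+  // For example (illustrative sketch; the addresses are placeholders):
+  //
+  //   std::vector<std::string> configured = {"host0:%port%", "host1:%port%"};
+  //   WorkerIndexResolver resolver(configured);
+  //
+  //   TF_RETURN_IF_ERROR(resolver.ValidateWorker("host1:5000"));
+  //   resolver.AddWorker("host1:5000");
+  //   TF_ASSIGN_OR_RETURN(int64_t index,
+  //                       resolver.GetWorkerIndex("host1:5000"));
+  //   // `index` is 1, matching the worker's position in `configured`.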
+ absl::Status ValidateWorker(absl::string_view worker_address) const; + + // Processes a worker at address `worker_address`. Its index can be retrieved + // by calling `GetWorkerIndex`. + void AddWorker(absl::string_view worker_address); + + // Returns the worker index for the worker at `worker_address`. Returns a + // NotFound error if the worker is not registered. + absl::StatusOr GetWorkerIndex( + absl::string_view worker_address) const; + + private: + std::vector worker_addresses_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_GRAPH_REWRITERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_dispatcher_impl.h b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_dispatcher_impl.h new file mode 100644 index 00000000..50d5e2c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_dispatcher_impl.h @@ -0,0 +1,78 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_GRPC_DISPATCHER_IMPL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_GRPC_DISPATCHER_IMPL_H_ + +#include "grpcpp/server_builder.h" +#include "tensorflow/core/data/service/dispatcher.grpc.pb.h" +#include "tensorflow/core/data/service/dispatcher_impl.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// This class is a wrapper that handles communication for gRPC. +class GrpcDispatcherImpl : public DispatcherService::Service { + public: + // Constructs a GrpcDispatcherImpl with the given config, and registers it + // with `server_builder`. 
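A small sketch of the WorkerIndexResolver declared above in graph_rewriters.h, assuming the elided return type of `GetWorkerIndex` is `absl::StatusOr<int64_t>` as the comments suggest. The worker addresses and port value are purely illustrative.

    #include <cstdint>
    #include <string>
    #include <vector>

    #include "tensorflow/core/data/service/graph_rewriters.h"
    #include "tensorflow/core/platform/errors.h"
    #include "tensorflow/core/platform/statusor.h"

    namespace tensorflow {
    namespace data {

    absl::StatusOr<int64_t> ResolveExampleWorkerIndex() {
      // Addresses as they might appear in the dispatcher config; "%port%" is
      // replaced once the actual port is known.
      const std::vector<std::string> configured_workers = {
          "/worker/task/0:%port%", "/worker/task/1:%port%"};
      WorkerIndexResolver resolver(configured_workers);

      const std::string worker_address = "/worker/task/1:20000";
      TF_RETURN_IF_ERROR(resolver.ValidateWorker(worker_address));
      resolver.AddWorker(worker_address);
      // Expected to yield index 1, the position in the configured list.
      return resolver.GetWorkerIndex(worker_address);
    }

    }  // namespace data
    }  // namespace tensorflow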
+ explicit GrpcDispatcherImpl(const experimental::DispatcherConfig& config, + ::grpc::ServerBuilder& server_builder); + ~GrpcDispatcherImpl() override { Stop(); } + + absl::Status Start(); + void Stop(); + + size_t NumActiveIterations(); + + DispatcherStateExport ExportState() const; + +#define HANDLER(method) \ + ::grpc::Status method(::grpc::ServerContext* context, \ + const method##Request* request, \ + method##Response* response) override; + HANDLER(WorkerHeartbeat); + HANDLER(WorkerUpdate); + HANDLER(GetDatasetDef); + HANDLER(GetSplit); + HANDLER(GetVersion); + HANDLER(GetOrRegisterDataset); + HANDLER(ReleaseIterationClient); + HANDLER(MaybeRemoveTask); + HANDLER(GetOrCreateJob); + HANDLER(GetOrCreateIteration); + HANDLER(ClientHeartbeat); + HANDLER(GetWorkers); + HANDLER(GetDataServiceMetadata); + HANDLER(GetDataServiceConfig); + HANDLER(Snapshot); + HANDLER(GetSnapshotSplit); + HANDLER(GetSnapshotStreams); + HANDLER(DisableCompressionAtRuntime); +#undef HANDLER + + private: + DataServiceDispatcherImpl impl_; + + GrpcDispatcherImpl(const GrpcDispatcherImpl&) = delete; + void operator=(const GrpcDispatcherImpl&) = delete; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_GRPC_DISPATCHER_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_util.h b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_util.h new file mode 100644 index 00000000..8fff6312 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_util.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_GRPC_UTIL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_GRPC_UTIL_H_ + +#include +#include + +#include "grpcpp/grpcpp.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace data { +namespace grpc_util { + +// Wraps a grpc::Status in a tensorflow::Status with the given message. +absl::Status WrapError(const std::string& message, + const ::grpc::Status& status); + +// Retries the given function if the function produces UNAVAILABLE, ABORTED, or +// CANCELLED status codes. We retry these codes because they can all indicate +// preemption of a server. The retries continue until the deadline is exceeded +// or the `should_retry` callback returns false. `description` may be used to +// log that retries are happening. It should contain a description of the action +// being retried, e.g. "register dataset" The retry loop uses exponential +// backoff between retries. `deadline_micros` is interpreted as microseconds +// since the epoch. +absl::Status Retry(const std::function& f, + const std::function& should_retry, + const std::string& description, int64_t deadline_micros); + +// Same as `Retry` above, but with a `should_retry` callback that always returns +// `true`. 
+absl::Status Retry(const std::function& f, + const std::string& description, int64_t deadline_micros); + +} // namespace grpc_util +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_GRPC_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_worker_impl.h b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_worker_impl.h new file mode 100644 index 00000000..4513c0ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/grpc_worker_impl.h @@ -0,0 +1,81 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_GRPC_WORKER_IMPL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_GRPC_WORKER_IMPL_H_ + +#include +#include +#include +#include + +#include "grpcpp/server_builder.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/data/service/worker.grpc.pb.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/data/service/worker_impl.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// This class is a wrapper that handles communication for gRPC. +class GrpcWorkerImpl : public WorkerService::Service { + public: + // Constructs a GrpcWorkerImpl with the given config, and registers it with + // `server_builder`. + explicit GrpcWorkerImpl(const experimental::WorkerConfig& config, + ::grpc::ServerBuilder& server_builder); + ~GrpcWorkerImpl() override { Stop(); } + + absl::Status Start( + const std::string& worker_address, + const std::vector& transfer_servers); + void Stop(); + + std::function + get_element_getter() { + return [this](const GetElementRequest* request, GetElementResult* result) { + return impl_->GetElementResult(request, result); + }; + } + + WorkerStateExport ExportState() const; + +#define HANDLER(method) \ + ::grpc::Status method(::grpc::ServerContext* context, \ + const method##Request* request, \ + method##Response* response) override; + HANDLER(ProcessTask); + HANDLER(GetElement); + HANDLER(GetWorkerTasks); + HANDLER(GetSnapshotTaskProgresses); +#undef HANDLER + + private: + std::string worker_address_; + // A std::shared_ptr allows clients to access local servers and directly call + // the servers' methods to avoid RPC calls and data copy. 
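A sketch of the simpler `grpc_util::Retry` overload from grpc_util.h above, assuming the elided callable type is `std::function<absl::Status()>` as the surrounding comments imply. The one-minute deadline and the description string are illustrative.

    #include <functional>

    #include "tensorflow/core/data/service/grpc_util.h"
    #include "tensorflow/core/platform/env.h"
    #include "tensorflow/core/platform/status.h"

    namespace tensorflow {
    namespace data {

    // Retries `rpc_call` with exponential backoff for up to 60 seconds.
    absl::Status CallWithRetry(const std::function<absl::Status()>& rpc_call) {
      const int64_t deadline_micros =
          Env::Default()->NowMicros() + 60LL * 1000 * 1000;  // now + 60s
      return grpc_util::Retry(rpc_call, /*description=*/"example rpc call",
                              deadline_micros);
    }

    }  // namespace data
    }  // namespace tensorflow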
+ std::shared_ptr impl_; + + GrpcWorkerImpl(const GrpcWorkerImpl&) = delete; + void operator=(const GrpcWorkerImpl&) = delete; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_GRPC_WORKER_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/journal.h b/third_party/tflite-hdrs/tensorflow/core/data/service/journal.h new file mode 100644 index 00000000..0c15856b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/journal.h @@ -0,0 +1,118 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_JOURNAL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_JOURNAL_H_ + +#include +#include + +#include "tensorflow/core/data/service/journal.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +namespace data { + +// Returns the location of the journal file within the journal directory. +std::string DataServiceJournalFile(const std::string& journal_dir, + int64_t sequence_number); + +// Interface for writing to a journal. +class JournalWriter { + public: + virtual ~JournalWriter() = default; + // Writes and syncs an update to the journal. + virtual absl::Status Write(const Update& update) = 0; + // Initializes the writer if it is not yet initialized. + virtual absl::Status EnsureInitialized() = 0; +}; + +// FileJournalWriter is not thread-safe, requiring external synchronization when +// used by multiple threads. +// +// FileJournalWriter writes journal files to a configured journal directory. The +// directory is laid out in the following format: +// +// journal_dir/ +// journal_0 +// journal_1 +// ... +// +// When the writer is created, it lists the directory to find the next available +// journal file name. For example, if the journal directory contains +// "journal_0", "journal_1", and "journal_2", the writer will write to +// "journal_3". The writer will flush updates as they are written, so that they +// can be stored durably in case of machine failure. +class FileJournalWriter : public JournalWriter { + public: + // Creates a journal writer to write to the given journal directory. + // If there is already journal data there, the journal writer will append to + // the existing journal. + explicit FileJournalWriter(Env* env, const std::string& journal_dir); + FileJournalWriter(const FileJournalWriter&) = delete; + FileJournalWriter& operator=(const FileJournalWriter&) = delete; + + absl::Status Write(const Update& update) override; + absl::Status EnsureInitialized() override; + + private: + Env* env_; + const std::string journal_dir_; + std::unique_ptr file_; + std::unique_ptr writer_; +}; + +// Interface for reading from a journal. 
+class JournalReader { + public: + virtual ~JournalReader() = default; + // Reads the next update from the journal. Sets `end_of_journal=true` if + // there are no more updates left in the journal. + virtual absl::Status Read(Update& update, bool& end_of_journal) = 0; +}; + +// JournalReader is not thread-safe, requiring external synchronization when +// used by multiple threads. +// +// The journal reader reads through all journal files in the configured journal +// directory, in order of their sequence numbers. See FileJournalWriter above. +class FileJournalReader : public JournalReader { + public: + explicit FileJournalReader(Env* env, absl::string_view journal_dir); + FileJournalReader(const FileJournalReader&) = delete; + FileJournalReader& operator=(const FileJournalReader&) = delete; + + absl::Status Read(Update& update, bool& end_of_journal) override; + + private: + // Initializes the reader if it is not yet initialized. + absl::Status EnsureInitialized(); + // Updates the `FileJournalReader` to read from a new file. + absl::Status UpdateFile(const std::string& filename); + + Env* env_; + const std::string journal_dir_; + // Sequence number of current journal file. + int64_t sequence_number_ = 0; + std::unique_ptr file_; + std::unique_ptr reader_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_JOURNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/py_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/py_utils.h new file mode 100644 index 00000000..b0ea8928 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/py_utils.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_PY_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_PY_UTILS_H_ + +#include + +// Utilities called from the Python API through pybind. We define this file +// separately from other utils to keep the transitive closure of dependencies +// minimal, avoiding linking conflicts. +namespace tensorflow { +namespace data { + +// Returns the default protocol to use for tf.data service control flow. +std::string DefaultProtocol(); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_PY_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/server_lib.h b/third_party/tflite-hdrs/tensorflow/core/data/service/server_lib.h new file mode 100644 index 00000000..56a8f8d9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/server_lib.h @@ -0,0 +1,189 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
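A round-trip sketch for the FileJournalWriter and FileJournalReader declared in journal.h above. The `Update` proto is left unpopulated here; a real caller would set one of its update fields before writing.

    #include <string>

    #include "tensorflow/core/data/service/journal.h"
    #include "tensorflow/core/data/service/journal.pb.h"
    #include "tensorflow/core/platform/env.h"
    #include "tensorflow/core/platform/errors.h"

    namespace tensorflow {
    namespace data {

    // Appends one update to the journal in `journal_dir`, then replays the
    // whole journal from the beginning.
    absl::Status RoundTripJournal(Env* env, const std::string& journal_dir,
                                  const Update& update) {
      FileJournalWriter writer(env, journal_dir);
      TF_RETURN_IF_ERROR(writer.Write(update));

      FileJournalReader reader(env, journal_dir);
      while (true) {
        Update replayed;
        bool end_of_journal = false;
        TF_RETURN_IF_ERROR(reader.Read(replayed, end_of_journal));
        if (end_of_journal) break;
        // Apply `replayed` to the in-memory state here.
      }
      return absl::OkStatus();
    }

    }  // namespace data
    }  // namespace tensorflow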
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SERVER_LIB_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SERVER_LIB_H_ + +#include +#include +#include + +#include "grpcpp/server.h" +#include "grpcpp/server_builder.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/data_transfer.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/profiler/rpc/profiler_service_impl.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// Forward declared because transitively depending on .grpc.pb.h files causes +// issues in the pywrap build. +class GrpcDispatcherImpl; +class GrpcWorkerImpl; + +// A grpc server for the tf.data service. +class GrpcDataServerBase { + public: + // Constructs a tf.data server with the specified port. If the port is 0, the + // server will find an available port in `Start()`. The chosen port can be + // found by calling `BoundPort()`. + GrpcDataServerBase( + int requested_port, const std::string& protocol, + const std::string& server_type, + std::vector> options = {}); + virtual ~GrpcDataServerBase() = default; + + // Starts the server running asynchronously. + absl::Status Start(); + + // Stops the server. This will block until all outstanding requests complete. + void Stop(); + + // Blocks until the server stops. + void Join(); + + // Returns the port bound by the server. Only valid after calling Start(). + int BoundPort(); + + // Exports the server state to improve debuggability. + virtual ServerStateExport ExportState() const = 0; + + protected: + virtual void AddDataServiceToBuilder(::grpc::ServerBuilder& builder) = 0; + void AddProfilerServiceToBuilder(::grpc::ServerBuilder& builder); + // Starts the service. This will be called after building the service, so + // bound_port() will return the actual bound port. + virtual absl::Status StartServiceInternal() = 0; + virtual void StopServiceInternal() {} + + int bound_port() { return bound_port_; } + + const int requested_port_; + const std::string protocol_; + const std::string server_type_; + + private: + int bound_port_; + bool started_ = false; + bool stopped_ = false; + + std::unique_ptr<::grpc::Server> server_; + // TensorFlow profiler service implementation. + std::unique_ptr profiler_service_ = nullptr; + std::vector> server_options_; +}; + +// A wrapper for `SnapshotStreamInfo` for use with pybind. +struct SnapshotStreamInfoWrapper { + SnapshotStreamInfoWrapper() = default; + explicit SnapshotStreamInfoWrapper(const SnapshotStreamInfo& info) + : index(info.index()), state(info.state()) {} + int64_t index; + int64_t state; +}; + +class DispatchGrpcDataServer : public GrpcDataServerBase { + public: + explicit DispatchGrpcDataServer( + const experimental::DispatcherConfig& config, + std::vector> options = {}); + ~DispatchGrpcDataServer() override; + + // Returns the number of workers registered with the dispatcher. 
+ absl::Status NumWorkers(int* num_workers); + // Returns the number of active (non-finished) iterations running on the + // dispatcher. + size_t NumActiveIterations(); + // Returns information about all the streams for the snapshot at `path`. + absl::Status SnapshotStreams(const std::string& path, + std::vector* streams); + + ServerStateExport ExportState() const override; + + protected: + void AddDataServiceToBuilder(::grpc::ServerBuilder& builder) override; + absl::Status StartServiceInternal() override; + void StopServiceInternal() override; + + private: + const experimental::DispatcherConfig config_; + // Owned. We use a raw pointer because GrpcDispatcherImpl is forward-declared. + GrpcDispatcherImpl* service_; +}; + +// A wrapper for `SnapshotTaskProgress` for use with pybind. +struct SnapshotTaskProgressWrapper { + SnapshotTaskProgressWrapper() = default; + explicit SnapshotTaskProgressWrapper(const SnapshotTaskProgress& progress) + : snapshot_task_base_path(progress.snapshot_task().base_path()), + snapshot_task_stream_index(progress.snapshot_task().stream_index()), + completed(progress.completed()) {} + std::string snapshot_task_base_path; + int64_t snapshot_task_stream_index; + bool completed; +}; + +class WorkerGrpcDataServer : public GrpcDataServerBase { + public: + explicit WorkerGrpcDataServer( + const experimental::WorkerConfig& config, + std::vector> options = {}); + ~WorkerGrpcDataServer() override; + + // Returns the number of tasks currently being executed by the worker. + absl::Status NumTasks(int* num_tasks); + + // Returns the progresses of the snapshot tasks currently being executed by + // the worker. + absl::Status SnapshotTaskProgresses( + std::vector* snapshot_task_progresses); + + ServerStateExport ExportState() const override; + + protected: + void AddDataServiceToBuilder(::grpc::ServerBuilder& builder) override; + absl::Status StartServiceInternal() override; + void StopServiceInternal() override; + + private: + // If an alternative data transfer protocol is configured, tries to start a + // transfer server for it, adding an entry to `transfer_servers` if + // successful. + void MaybeStartAlternativeDataTransferServer( + std::vector& transfer_servers); + + const experimental::WorkerConfig config_; + // Owned. We use a raw pointer because GrpcWorkerImpl is forward-declared. + GrpcWorkerImpl* service_; + std::shared_ptr transfer_server_; +}; + +// Creates a dispatch tf.data server and stores it in `out_server`. +absl::Status NewDispatchServer( + const experimental::DispatcherConfig& config, + std::unique_ptr& out_server); + +// Creates a worker tf.data server and stores it in `out_server`. +absl::Status NewWorkerServer(const experimental::WorkerConfig& config, + std::unique_ptr& out_server); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SERVER_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/file_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/file_utils.h new file mode 100644 index 00000000..2a6ca60a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/file_utils.h @@ -0,0 +1,74 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
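A lifecycle sketch for the server factories declared at the end of server_lib.h above. The element type of the `out_server` smart pointer is elided in the rendered header; `std::unique_ptr<DispatchGrpcDataServer>` is assumed from the accompanying comment.

    #include <memory>

    #include "tensorflow/core/data/service/server_lib.h"
    #include "tensorflow/core/platform/errors.h"
    #include "tensorflow/core/platform/logging.h"
    #include "tensorflow/core/protobuf/service_config.pb.h"

    namespace tensorflow {
    namespace data {

    // Brings up an in-process dispatcher and blocks until it shuts down.
    absl::Status RunDispatcher(const experimental::DispatcherConfig& config) {
      std::unique_ptr<DispatchGrpcDataServer> dispatcher;
      TF_RETURN_IF_ERROR(NewDispatchServer(config, dispatcher));
      TF_RETURN_IF_ERROR(dispatcher->Start());
      // BoundPort() is only valid after Start(); useful when port 0 was
      // requested and the server picked a free port.
      LOG(INFO) << "Dispatcher bound to port " << dispatcher->BoundPort();
      dispatcher->Join();
      return absl::OkStatus();
    }

    }  // namespace data
    }  // namespace tensorflow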
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_FILE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_FILE_UTILS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/env.h" +#include "tsl/platform/protobuf.h" + +namespace tensorflow { +namespace data { + +// Atomically writes `str` to `filename`. Overwrites existing contents if the +// file already exists. +absl::Status AtomicallyWriteStringToFile(absl::string_view filename, + absl::string_view str, tsl::Env* env); + +// Atomically writes the binary representation of `proto` to `filename`. +// Overwrites existing contents if the file already exists. +absl::Status AtomicallyWriteBinaryProto(absl::string_view filename, + const tsl::protobuf::Message& proto, + tsl::Env* env); + +// Atomically writes the text representation of `proto` to `filename`. +// Overwrites existing contents if the file already exists. +absl::Status AtomicallyWriteTextProto(absl::string_view filename, + const tsl::protobuf::Message& proto, + tsl::Env* env); + +// Atomically writes `tensor` to `filename` in TFRecord format. Overwrites +// existing contents if the file already exists. +absl::Status AtomicallyWriteTFRecords(absl::string_view filename, + const std::vector& tensors, + absl::string_view compression, + tsl::Env* env); + +// Returns the relative paths of the children of `directory`, ignoring temporary +// files. Returns an empty vector if the directory does not have any children. +absl::StatusOr> GetChildren( + absl::string_view directory, tsl::Env* env); + +// Returns true if `filename` is a temporary file and should be ignored in +// normal data processing. +bool IsTemporaryFile(absl::string_view filename); + +// Returns the total number of chunks for a distributed snapshot: +// - If the snapshot is finished, returns the number of committed chunks. +// - If the snapshot is unfinished or has failed, returns kUnknownCardinality. +int64_t SnapshotChunksCardinality(absl::string_view snapshot_path, + tsl::Env* env); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_FILE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/parallel_tfrecord_writer.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/parallel_tfrecord_writer.h new file mode 100644 index 00000000..db6cd182 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/parallel_tfrecord_writer.h @@ -0,0 +1,144 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
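A sketch of the atomic-write helpers from snapshot/file_utils.h above. The "DONE" marker name and the directory layout are illustrative only; `GetChildren` is assumed to return `absl::StatusOr<std::vector<std::string>>`, matching its comment.

    #include <string>
    #include <vector>

    #include "tensorflow/core/data/service/snapshot/file_utils.h"
    #include "tensorflow/core/platform/errors.h"
    #include "tensorflow/core/platform/statusor.h"
    #include "tsl/platform/env.h"

    namespace tensorflow {
    namespace data {

    // Atomically writes an empty marker file, then lists the directory.
    absl::Status WriteMarkerAndList(const std::string& directory,
                                    tsl::Env* env) {
      const std::string marker = directory + "/DONE";
      TF_RETURN_IF_ERROR(AtomicallyWriteStringToFile(marker, /*str=*/"", env));
      TF_ASSIGN_OR_RETURN(std::vector<std::string> children,
                          GetChildren(directory, env));
      // `children` holds relative paths; temporary files are skipped.
      return absl::OkStatus();
    }

    }  // namespace data
    }  // namespace tensorflow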
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PARALLEL_TFRECORD_WRITER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PARALLEL_TFRECORD_WRITER_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/data/service/byte_size.h" +#include "tensorflow/core/data/snapshot_utils.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/env.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace data { + +// Uses multiple threads to write TFRecords in parallel. Users add data without +// waiting for the file writes, and it writes one shard of file per thread. +// Returns the file names when writes are finished. This class is thread-safe. +// +// Usage example: +// +// ParallelTFRecordWriter writer( +// "/path/to/file", tsl::io::compression::kSnappy, Env::Default()); +// +// std::vector record; +// bool end_of_sequence = false; +// TF_RETURN_IF_ERROR(iterator.GetNext(record, end_of_sequence)); +// while (!end_of_sequence) { +// TF_RETURN_IF_ERROR(writer.Write(record)); +// TF_RETURN_IF_ERROR(iterator.GetNext(record, end_of_sequence)); +// } +// TF_ASSIGN_OR_RETURN(ParallelTFRecordWriter::FileToStatsMap file_stats, +// writer.Finalize()); +class ParallelTFRecordWriter { + public: + explicit ParallelTFRecordWriter(const std::string& file_prefix, + const std::string& compression, tsl::Env* env, + ByteSize max_file_size = ByteSize::GB(6), + int64_t num_write_threads = 2, + int64_t buffer_size = 1); + virtual ~ParallelTFRecordWriter(); + ParallelTFRecordWriter(const ParallelTFRecordWriter&) = delete; + ParallelTFRecordWriter& operator=(const ParallelTFRecordWriter&) = delete; + + // Writes `record`. If there is sufficient buffer space, it returns without + // waiting for the record to be written to the file. If the buffer is full, + // blocks until there is enough space to buffer the record. + absl::Status Write(std::vector record); + + // File stats: number of records in a file and the estimated size of the file. + struct FileStats { + int64_t num_records = 0; + ByteSize estimated_size; + }; + using FileToStatsMap = absl::flat_hash_map; + + // Flushes the writer and finalizes the files. Returns a map from absolute + // paths to the file stats. After the writer is finalized, `Write` will return + // `FailedPreconditionErrors`. The caller should make sure all `Write` calls + // have finished before calling `Finalize`. Will block until the writer is + // finalized or an error occurs. + absl::StatusOr Finalize(); + + private: + // Run by a thread to write buffered records to sharded files. + void WriteFiles(); + + // Whether there are more records to be written. + bool HasNext() const; + + // Writes a new file. + absl::Status WriteFile(); + + // Whether the file can hold more records without exceeding `max_file_size_`. 
+ bool ShouldWriteFile(const std::string& filename) const; + + // Writes one record to file. + absl::Status WriteRecord(const std::string& filename, + snapshot_util::TFRecordWriter& writer); + + // Gets the next record from the buffer to write. Returns `std::nullopt` if + // there are no more records to write. + absl::StatusOr>> GetNextRecord( + const std::string& filename); + + // Deletes the file if it's empty. + absl::Status DeleteEmptyFile(const std::string& filename); + + // Generates a unique file name in the requested directory. + absl::StatusOr GetUniqueFile() const; + + // Updates the status of the writer and notifies waiters. + void UpdateStatus(absl::Status status); + + tsl::Env* const env_; + const std::string file_prefix_; + const std::string compression_; + const ByteSize max_file_size_; + const int64_t buffer_size_; + + mutable absl::Mutex mu_; + mutable absl::CondVar ready_to_push_; + mutable absl::CondVar ready_to_pop_; + + bool finalized_ ABSL_GUARDED_BY(mu_) = false; + absl::Status status_ ABSL_GUARDED_BY(mu_); + + // A map from absolute paths to the number of records in the files. + FileToStatsMap file_stats_ ABSL_GUARDED_BY(mu_); + + // Buffer to hold the records to be written. The size should be bounded by + // `buffer_size_`. + std::deque> buffer_ ABSL_GUARDED_BY(mu_); + + std::unique_ptr thread_pool_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PARALLEL_TFRECORD_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/path_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/path_utils.h new file mode 100644 index 00000000..63c88556 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/path_utils.h @@ -0,0 +1,134 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PATH_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PATH_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" + +namespace tensorflow { +namespace data { + +// Returns the directory path for the assigned streams of a snapshot. +std::string StreamsDirectory(absl::string_view snapshot_path); + +// Returns the directory path for a worker writing one stream of the snapshot. +std::string StreamDirectory(absl::string_view snapshot_path, + int64_t stream_index); + +// Returns the directory path for the assigned splits for a worker writing one +// stream of a snapshot. +std::string SplitsDirectory(absl::string_view snapshot_path, + int64_t stream_index); + +// Returns the directory path for the assigned splits for one source, for a +// worker writing one stream of a snapshot. 
+std::string SourceDirectory(absl::string_view snapshot_path, + int64_t stream_index, int64_t source_index); + +// Returns the directory path for one repetition of a split provider. +std::string RepetitionDirectory(absl::string_view snapshot_path, + int64_t stream_index, int64_t source_index, + int64_t repetition_index); + +// Returns the file path for an assigned split for a worker writing one stream +// of a snapshot. +std::string SplitPath(absl::string_view snapshot_path, int64_t stream_index, + int64_t source_index, int64_t repetition_index, + int64_t local_index, int64_t global_index); + +// Returns the index of the stream. The expected format of +// `stream_directory_name` is: +// stream_ +absl::StatusOr ParseStreamDirectoryName( + absl::string_view stream_directory_name); + +// Returns the index of the source. The expected format of +// `source_directory_name` is: +// source_ +absl::StatusOr ParseSourceDirectoryName( + absl::string_view source_directory_name); + +// Returns the index of the repetition. The expected format of +// `repetition_directory_name` is: +// repetition_ +absl::StatusOr ParseRepetitionDirectoryName( + absl::string_view repetition_directory_name); + +// Returns a pair of {local_split_index, global_split_index} of the split. The +// expected format of `split_filename` is: +// split__ +absl::StatusOr> ParseSplitFilename( + absl::string_view split_filename); + +// Returns a pair of {checkpoint_index, checkpoint_num_elements} of the +// checkpoint. The expected format of `checkpoint_filename` is: +// checkpoint__ +absl::StatusOr> ParseCheckpointFilename( + absl::string_view checkpoint_filename); + +// Returns a tuple of {stream_index, stream_chunk_index, chunk_num_elements} of +// the chunk. The expected format of `chunk_filename` is: +// chunk___ +absl::StatusOr> ParseChunkFilename( + absl::string_view chunk_filename); + +// Returns the path of the DONE file of a snapshot stream. +std::string StreamDoneFilePath(absl::string_view snapshot_path, + int64_t stream_index); + +// Returns the path of the owner_worker file of a snapshot stream. +std::string StreamWorkerFilePath(absl::string_view snapshot_path, + int64_t stream_index); + +// Returns the path of the owner_worker file of a snapshot stream. +std::string StreamWorkerFilePath(absl::string_view stream_path); + +// Returns the path of the DONE file of a snapshot. +std::string SnapshotDoneFilePath(absl::string_view snapshot_path); + +// Returns the path of the ERROR file of a snapshot. +std::string SnapshotErrorFilePath(absl::string_view snapshot_path); + +// Returns the path of the serialized metadata for a snapshot. +std::string SnapshotMetadataFilePath(absl::string_view snapshot_path); + +// Returns the path of the serialized graph of the dataset for a snapshot. +std::string DatasetDefFilePath(absl::string_view snapshot_path); + +// Returns the path of the serialized element spec of the dataset for a +// snapshot. +std::string DatasetSpecFilePath(absl::string_view snapshot_path); + +// Returns the directory path for snapshot checkpoints. +std::string CheckpointsDirectory(absl::string_view snapshot_path, + int64_t stream_index); + +// Returns the directory path for committed chunks. +std::string CommittedChunksDirectory(absl::string_view snapshot_path); + +// Returns the directory path for uncommitted chunks. 
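A compose-and-parse sketch for the snapshot path helpers in path_utils.h above. It assumes `ParseStreamDirectoryName` returns `absl::StatusOr<int64_t>` (the element type is elided in the rendered header) and that `tsl::io::Basename` from tsl/platform/path.h is available for extracting the directory name.

    #include <cstdint>
    #include <string>

    #include "tensorflow/core/data/service/snapshot/path_utils.h"
    #include "tensorflow/core/platform/statusor.h"
    #include "tsl/platform/path.h"

    namespace tensorflow {
    namespace data {

    // Builds the directory for stream 0 and parses the index back out of its
    // "stream_<index>" name; expected to return 0.
    absl::StatusOr<int64_t> StreamIndexRoundTrip(
        const std::string& snapshot_path) {
      const std::string stream_dir =
          StreamDirectory(snapshot_path, /*stream_index=*/0);
      return ParseStreamDirectoryName(tsl::io::Basename(stream_dir));
    }

    }  // namespace data
    }  // namespace tensorflow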
+std::string UncommittedChunksDirectory(absl::string_view snapshot_path, + int64_t stream_index); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PATH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/prefetched_split_provider.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/prefetched_split_provider.h new file mode 100644 index 00000000..2ec9472c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/prefetched_split_provider.h @@ -0,0 +1,158 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PREFETCHED_SPLIT_PROVIDER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PREFETCHED_SPLIT_PROVIDER_H_ + +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/btree_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/env.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace data { + +// Uses multiple threads to prefetch splits and write them to temporary files. +// Used to speed up tf.data snapshot manager where splits should be persisted +// before returning to the users. This class is thread-safe. +// +// Usage example: +// +// std::unique_ptr split_provider = ... +// PrefetchedSplitProvider prefetched_split_provider( +// std::move(split_provider), "/tmp/directory", Env::Default()); +// TF_ASSIGN_OR_RETURN(std::optional split, +// prefetched_split_provider.GetSplit(SplitPath(...))); +// if (split.has_value) { +// return *split; +// } +class PrefetchedSplitProvider { + public: + // Creates a prefetched split provider by prefetching given `split_provider`. + // `directory` is where to write temporary splits. The splits will be moved to + // a target file when returned to the client (see the comment for `GetSplit`). + // `num_write_threads` is the number of threads to prefetch and write splits. + // `buffer_size_per_thread` is the size of the buffer holding the prefetched + // but unread splits. For every prefetched split, we keep: (1) an in-memory + // Tensor in the buffer, and (2) an on-disk file representing the same split. + explicit PrefetchedSplitProvider( + std::unique_ptr split_provider, + const std::string& directory, tsl::Env* env, + size_t num_write_threads = 20, size_t buffer_size_per_thread = 5); + virtual ~PrefetchedSplitProvider(); + PrefetchedSplitProvider(const PrefetchedSplitProvider&) = delete; + PrefetchedSplitProvider& operator=(const PrefetchedSplitProvider&) = delete; + + // Writes the split to `target_split_path` and returns the split. 
Returns + // `std::nullopt` if no more splits are available. If there are more available + // splits but not currently ready for reading, blocks until they are ready. + absl::StatusOr> GetNext(const std::string& split_path); + + // Resets the split provider. + absl::Status Reset(); + + // Cancels the split provider. After cancelling, concurrent `GetNext` calls + // will return a Cancelled error. + void Cancel(); + + private: + // Prefetched split and its split index. + struct SplitAndIndex { + Tensor split; + size_t index = 0; + + // Returns the absolute path of the prefetched split. + std::string SplitPath(const std::string& directory) const { + return tsl::io::JoinPath(directory, + absl::StrCat("split_", index, ".tfrecord")); + } + + friend bool operator<(const SplitAndIndex& lhs, const SplitAndIndex& rhs) { + return lhs.index < rhs.index; + } + }; + + // Initializes directories for writing. This cleans up all existing files in + // `directory_`. + absl::Status InitDirs(); + + // Runs the prefetch threads. + std::unique_ptr RunPrefetchThreads(); + + // The prefetching threads run this method to prefetch the splits. + void PrefetchLoop(); + + // Whether the prefetching thread should try to fetch more splits. + bool ShouldPrefetchSplit() const; + + // If there is enough buffer space, prefetches one split and writes it to a + // temporary file. If the buffer is full, blocks until there is buffer space. + absl::StatusOr PrefetchSplit(); + + // Gets the next split from the split provider. + absl::StatusOr> GetSplitFromProvider(); + + // Updates the status and notifies waiters. + void UpdateStatus(absl::Status status); + + tsl::Env* const env_; + const std::string directory_; + const size_t num_write_threads_; + const size_t buffer_size_; + + mutable absl::Mutex mu_; + mutable absl::CondVar ready_to_push_; + mutable absl::CondVar ready_to_pop_; + + std::unique_ptr split_provider_; + + absl::Status status_ ABSL_GUARDED_BY(mu_); + + // Whether the split provider is being reset. + bool reset_ ABSL_GUARDED_BY(mu_) = false; + + // The indices ensure the splits are returned in order. When prefetching a + // split, associates each split with the `split_index_to_write_`. The buffer + // is sorted by the split index. When reading, waits for the split with index + // `split_index_to_read_`. + size_t split_index_to_read_ ABSL_GUARDED_BY(mu_) = 0; + size_t split_index_to_write_ ABSL_GUARDED_BY(mu_) = 0; + + // Number of finished threads. If `finished_threads_ >= num_write_threads_`, + // then all the splits have been pushed to the buffer. Otherwise, the split + // provider has not produced all the splits, or some thread is still writing + // splits to the files. + size_t finished_threads_ ABSL_GUARDED_BY(mu_) = 0; + + // Buffer to hold the splits. The size should be bounded by `buffer_size_`. + absl::btree_set buffer_ ABSL_GUARDED_BY(mu_); + + std::unique_ptr thread_pool_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_PREFETCHED_SPLIT_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h new file mode 100644 index 00000000..fefc4998 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h @@ -0,0 +1,124 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/btree_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/env.h" + +namespace tensorflow { +namespace data { + +// Provides the next chunk to read. Blocks until the next chunk is unavailable, +// or all the chunks have been read. This class is thread-safe. +class SnapshotChunkProvider : public SplitProvider { + public: + SnapshotChunkProvider(absl::string_view snapshot_path, tsl::Env* env); + ~SnapshotChunkProvider() override = default; + SnapshotChunkProvider(const SnapshotChunkProvider&) = delete; + SnapshotChunkProvider& operator=(const SnapshotChunkProvider&) = delete; + + // Returns the absolute file path of next snapshot chunk to read. If there is + // no available chunk, blocks until the next chunk is unavailable, or all the + // chunks are read. Sets `end_of_splits` to true if all chunks have been read. + absl::Status GetNext(Tensor* split, bool* end_of_splits) override; + + absl::Status Reset() override; + + // Supports checkpointing. + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override; + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override; + + // If the snapshot is finished, returns the number of committed chunks. + // If the snapshot is unfinished or has failed, returns kUnknownCardinality. + int64_t Cardinality() const override; + + // Cancels the provider. After cancelling, if the snapshot is unfinished, + // in-flight `GetNext` calls will return Cancelled status. + void Cancel() override; + + private: + // State of the snapshot. + struct SnapshotState { + SnapshotState() = default; + explicit SnapshotState(bool snapshot_is_done) + : snapshot_is_done(snapshot_is_done) {} + explicit SnapshotState(absl::Status status) : status(std::move(status)) {} + + // True if the snapshot is done without errors. + bool snapshot_is_done = false; + + // Non-OK status if writing the snapshot fails. + absl::Status status = absl::OkStatus(); + }; + + // Used to sort chunks by chunk indexes so that chunks are read evenly across + // streams and chunks of early repetitions are read first. + struct ChunkOrder { + bool operator()(const std::string& chunk1, const std::string& chunk2) const; + }; + using OrderedChunkSet = absl::btree_set; + + // String conversions to support `Save` and `Restore`. 
+ static std::string SetToString(const OrderedChunkSet& s); + static OrderedChunkSet SetFromString(absl::string_view s); + + // Updates the snapshot state and available chunks. + absl::Status UpdateSnapshot(); + + // Reads the DONE or ERROR file and returns a SnapshotState indicating whether + // the snapshot is complete. + absl::StatusOr GetSnapshotState(); + + // Reads the available chunks from disk and returns a vector of chunk file + // names. + absl::StatusOr> GetAvailableChunks(); + + const std::string snapshot_path_; + tsl::Env* const env_; + + mutable absl::Mutex mu_; + + // The set of read chunks. + OrderedChunkSet chunks_read_ ABSL_GUARDED_BY(mu_); + + // The set of unread chunks. Uses an ordered set to make sure repeated reads + // produce data in a deterministic order. + OrderedChunkSet chunks_unread_ ABSL_GUARDED_BY(mu_); + + // State of the snapshot. + SnapshotState snapshot_state_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_manager.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_manager.h new file mode 100644 index 00000000..dd3a76d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_manager.h @@ -0,0 +1,378 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_MANAGER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_MANAGER_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/btree_map.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/strings/substitute.h" +#include "absl/time/time.h" +#include "xla/tsl/protobuf/status.pb.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/data/service/snapshot/prefetched_split_provider.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/protobuf/snapshot.pb.h" +#include "tsl/platform/env.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// A helper shared among `SnapshotManager`s to limit workers' stream assignments +// across ongoing snapshots. This class is thread-safe. +class SnapshotAssignmentManager { + public: + explicit SnapshotAssignmentManager(int64_t worker_max_concurrent_snapshots) + : worker_max_concurrent_snapshots_(worker_max_concurrent_snapshots) {} + + // Tries to record the event of a worker being assigned a stream. Returns + // `false` if the worker has too many assignments. 
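A sketch of reading a finished snapshot with the SnapshotChunkProvider declared above in snapshot_chunk_provider.h. Per its comments, each returned split tensor carries the absolute path of a committed chunk file, and `GetNext` blocks while the snapshot is still being written.

    #include <cstdint>

    #include "absl/strings/string_view.h"
    #include "tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h"
    #include "tensorflow/core/framework/tensor.h"
    #include "tensorflow/core/platform/errors.h"
    #include "tensorflow/core/platform/statusor.h"
    #include "tsl/platform/env.h"

    namespace tensorflow {
    namespace data {

    // Drains the provider and counts the chunks it yields.
    absl::StatusOr<int64_t> CountChunks(absl::string_view snapshot_path,
                                        tsl::Env* env) {
      SnapshotChunkProvider provider(snapshot_path, env);
      int64_t num_chunks = 0;
      while (true) {
        Tensor split;
        bool end_of_splits = false;
        TF_RETURN_IF_ERROR(provider.GetNext(&split, &end_of_splits));
        if (end_of_splits) break;
        ++num_chunks;  // `split` holds the absolute path of a chunk file.
      }
      return num_chunks;
    }

    }  // namespace data
    }  // namespace tensorflow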
Returns an error if the + // worker is already known to have been assigned this stream. + absl::StatusOr TryAddAssignment(absl::string_view snapshot_path, + absl::string_view worker_address, + int64_t stream_index); + + // Records the event of a worker stopping work on a stream. + void RemoveAssignment(absl::string_view snapshot_path, + absl::string_view worker_address, int64_t stream_index); + + // Adds a new snapshot. + void AddSnapshot(absl::string_view snapshot_path); + + // Load balances snapshots by the number of assigned streams. Given a worker, + // returns snapshots in the following order: + // - Snapshots already assigned to this worker. + // - Snapshots with the fewest assignments. + std::vector LoadBalanceSnapshots( + absl::string_view worker_address); + + // Returns the maximum concurrent snapshots processed by each worker. + int64_t worker_max_concurrent_snapshots() const { + return worker_max_concurrent_snapshots_; + } + + private: + struct Assignment { + std::string snapshot_path; + int64_t stream_index; + + template + friend H AbslHashValue(H h, const Assignment& a) { + return H::combine(std::move(h), a.snapshot_path, a.stream_index); + } + + friend bool operator==(const Assignment& lhs, const Assignment& rhs) { + return lhs.snapshot_path == rhs.snapshot_path && + lhs.stream_index == rhs.stream_index; + } + + std::string DebugString() const { + return absl::Substitute( + "Assignment { snapshot_path: $0, stream_index: $1 }", snapshot_path, + stream_index); + } + }; + + // A mapping of worker address to ongoing assignments. + absl::flat_hash_map> assignments_ + TF_GUARDED_BY(mu_); + + // A mapping from snapshot to the number of assigned workers. + absl::flat_hash_map snapshot_assignment_counts_ + TF_GUARDED_BY(mu_); + + // The maximum number of snapshots that a worker can concurrently process at a + // given point in time. This is a tradeoff between worker resource usage and + // snapshot wall time. A value of 0 indicates that the decision should be left + // up to the runtime. + const int64_t worker_max_concurrent_snapshots_; + + mutable tsl::mutex mu_; +}; + +// A helper used by `DataServiceDispatcherImpl` to manage a call to `Snapshot`. +// +// Two mirrored states are maintained: +// - An in-memory state (objects in the `SnapshotManager` instance). +// - An on-disk state (files in the `SnapshotManager::path_`). +// +// The on-disk state has this structure: +// - snapshot_path +// - DONE +// - ERROR +// - snapshot.metadata +// - dataset_def.proto +// - dataset_spec.pb +// - chunks +// - chunk___ +// - streams +// - stream_0 +// - DONE +// - ERROR +// - splits +// - source_0 +// - split__ +// - uncommitted_chunks +// - chunk_ +// - checkpoints +// - checkpoint__ +// +class SnapshotManager { + public: + // Initiates a new snapshot process, creating a fresh in-memory state and + // writing an on-disk state to `path`. Returns an error if `path` already + // exists in the filesystem. + static absl::StatusOr> Start( + const SnapshotRequest& request, + SnapshotAssignmentManager& assignment_manager, Env* env); + // Resumes an existing snapshot process, reading from the on-disk state in + // `path` to derive an in-memory state. Returns an error if `path` is in a bad + // state. + static absl::StatusOr> Resume( + absl::string_view path, SnapshotAssignmentManager& assignment_manager, + Env* env); + + // Handles the work pertaining to this snapshot process for the respective + // `DispatcherService` API calls: + // - `WorkerHeartbeat`: Returns a stream assignment for the worker. 
+ // - `GetSnapshotSplit`: Returns a split assignment for the worker. + // - `GetSnapshotStreams`: Returns information about all streams. + absl::Status WorkerHeartbeat(const WorkerHeartbeatRequest& request, + WorkerHeartbeatResponse& response); + absl::Status GetSnapshotSplit(const GetSnapshotSplitRequest& request, + GetSnapshotSplitResponse& response); + absl::Status GetSnapshotStreams(GetSnapshotStreamsResponse& response); + + // Cancels the SnapshotManager and finishes in-progress threads. + void Cancel(); + + private: + SnapshotManager(absl::string_view path, + SnapshotAssignmentManager& assignment_manager, Env* env) + : path_(path), + env_(env), + last_progress_log_time_(absl::FromUnixMicros(env->NowMicros())), + assignment_manager_(assignment_manager) {} + + // Helpers for `Start` above. These update the on-disk state. + absl::Status Start(const SnapshotRequest& request); + absl::Status WriteOnDiskSkeleton(); + absl::Status WriteOnDiskMetadata(const SnapshotRequest& request); + + // Helpers for `Resume` above. These update the in-memory state. + absl::Status Resume(); + absl::Status ReadOnDiskMetadata(); + absl::Status ReadOnDiskStreams(); + + // Helpers for `WorkerHeartbeat` above. These may update the in-memory and + // on-disk states. + // Gets or creates a new stream. Returns the stream index and a bool value + // indicating whether a new stream has been created. Returns `std::nullopt` + // if there are no more streams to write or there is an error. + absl::StatusOr>> + MaybeGetOrCreateStreamAssignment( + absl::string_view worker_address, + const SnapshotTaskProgress* snapshot_progress); + absl::Status HandleStreamCompletion(int64_t stream_index, + absl::string_view worker_address); + void ReassignPreviouslyAssignedStream(int64_t stream_index, + absl::string_view worker_address); + std::optional MaybeAssignOrphanStream( + absl::string_view worker_address); + absl::StatusOr> MaybeCreateAndAssignNewStream( + absl::string_view worker_address); + absl::Status HandleStreamError(absl::string_view worker_address, + const StatusProto& status_proto); + + mutable tsl::mutex mu_; + // Uses a separate mutex for `GetSnapshotSplit` RPCs. `GetSnapshotSplit` uses + // file IO and may be slow, which may slow down `WorkerHeartbeat` RPCs if they + // share one mutex. + mutable tsl::mutex get_split_mu_; + + // The filepath of the on-disk state. + const std::string path_; + // A tensorflow environment interface used to write to and read from `path_`. + tsl::Env* const env_; + // Distributed snapshot metadata. + experimental::DistributedSnapshotMetadata metadata_ TF_GUARDED_BY(mu_); + // The last time progress was logged. + absl::Time last_progress_log_time_ TF_GUARDED_BY(mu_); + + // The addresses of all workers considered to be dead based on heartbeat + // timeout. + absl::flat_hash_set dead_workers_ TF_GUARDED_BY(mu_); + + struct Stream { + explicit Stream(int64_t num_sources) + : num_assigned_splits_per_source(num_sources) {} + + enum class State { + // The stream is not finished and the worker is heartbeating. + kActive, + // The stream is finished. + kDone, + }; + + // A counter of assigned splits for each source. 
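A sketch of the SnapshotAssignmentManager declared at the top of snapshot_manager.h above, assuming `TryAddAssignment` returns `absl::StatusOr<bool>` as its comment describes. The snapshot path, worker address, and concurrency limit are illustrative.

    #include "tensorflow/core/data/service/snapshot/snapshot_manager.h"
    #include "tensorflow/core/platform/statusor.h"

    namespace tensorflow {
    namespace data {

    // Registers a snapshot and tries to hand its first stream to a worker.
    // A `false` result means the worker already holds its maximum number of
    // concurrent snapshot streams.
    absl::StatusOr<bool> TryAssignFirstStream() {
      SnapshotAssignmentManager assignments(
          /*worker_max_concurrent_snapshots=*/3);
      assignments.AddSnapshot("/tmp/snapshots/example");
      return assignments.TryAddAssignment("/tmp/snapshots/example",
                                          "/worker/task/0:20000",
                                          /*stream_index=*/0);
    }

    }  // namespace data
    }  // namespace tensorflow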
+ std::vector num_assigned_splits_per_source; + + int64_t num_assigned_splits() const { + return absl::c_accumulate(num_assigned_splits_per_source, 0); + } + + State state = State::kActive; + }; + + struct Source { + Source(std::unique_ptr split_provider, + int64_t repetition_index, int64_t cardinality) + : split_provider(std::move(split_provider)), + repetition_index(repetition_index), + cardinality(cardinality) {} + + // A split provider for each input source of the dataset being snapshotted. + std::unique_ptr split_provider; + // The number of times the split provider has repeated. + int64_t repetition_index = 0; + // The number of splits in `split_provider`. + const int64_t cardinality; + }; + + // Helper class to restore a stream. Multiple stream restorers are safe to run + // in parallel. After it reads the on-disk stream, the client is responsible + // to apply the data to actually restore its internal states. + class StreamRestorer { + public: + explicit StreamRestorer(tsl::Env* env, absl::string_view path, + int64_t stream_index, int64_t num_sources, + SnapshotAssignmentManager& assignment_manager) + : env_(env), + path_(path), + stream_index_(stream_index), + num_sources_(num_sources), + assignment_manager_(assignment_manager) {} + + // Reads snapshot stream from the files and collects data for restoration. + absl::Status ReadOnDiskStream(); + + // Accessors for collected data. Should be called *after* `ReadOnDiskStream` + // is called. + const std::optional& GetStream() const { return restored_stream_; } + int64_t StreamIndex() const { return stream_index_; } + const std::string& WorkerAddress() const { return worker_address_; } + const absl::flat_hash_set& GlobalSplitIndices() const { + return global_split_indices_; + } + + private: + absl::StatusOr OwnerWorkerAddress() const; + absl::Status ReadOnDiskSource(int64_t source_index); + absl::Status ReadOnDiskSplit(int64_t source_index, + const std::vector& split_files, + const std::string& split_file); + absl::Status SkipSplit(SplitProvider& split_provider); + + tsl::Env* const env_; + const std::string path_; + const int64_t stream_index_; + const int64_t num_sources_; + SnapshotAssignmentManager& assignment_manager_; + + std::string worker_address_; + std::optional restored_stream_; + absl::flat_hash_set global_split_indices_; + }; + + // Applies the data collected by `stream_restorer` to actually restore the + // snapshot manager. + absl::Status RestoreFrom( + const StreamRestorer& stream_restorer, + const std::vector& stream_directories, + std::vector>& split_providers, + std::vector& repetition_indices, + absl::flat_hash_set& global_split_indices); + + // Gets the snapshot stream. + Stream& GetStream(int64_t stream_index); + // Initializes the stream directory. + absl::Status InitStreamDirectory( + int64_t stream_index, const std::string& worker_address, + const std::vector& repetitions_per_source); + + std::vector sources_ TF_GUARDED_BY(mu_); + // Creates sources for the specified dataset. + absl::StatusOr> CreateSources( + const DatasetDef& dataset_def) const; + // Returns the total number of splits. + absl::StatusOr GetSplitsCardinality(); + // Resets a source when it runs out of splits, to support repetitions. + absl::Status ResetSource(Source& source, int64_t source_index); + int64_t num_sources() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return sources_.size(); + } + + // All streams for this snapshot. + absl::btree_map streams_ TF_GUARDED_BY(mu_); + // A counter of completed streams for this snapshot. 
+ int64_t num_completed_streams_ TF_GUARDED_BY(mu_) = 0; + + // A mapping of worker to assigned stream index for this snapshot. + absl::flat_hash_map assignments_ TF_GUARDED_BY(mu_); + // A mapping of worker to assigned streams for all snapshots. + SnapshotAssignmentManager& assignment_manager_ TF_GUARDED_BY(mu_); + + // A counter of assigned splits for this snapshot. + int64_t num_assigned_splits_ TF_GUARDED_BY(mu_) = 0; + // The number of splits in a single repetition of the data in `sources_`. + int64_t num_total_splits_ TF_GUARDED_BY(mu_) = 0; + + enum class Mode { + // No streams are done. + kActive, + // At least one source is fully processed, but not all streams are done. + kWindingDown, + // All streams are done. + kDone, + // If any stream fails, the snapshot is in an error state. `status_` will + // contain the error status. + kError, + }; + + // If not `kActive`, at least one source has finished processing and no new + // streams are created or assigned. + Mode mode_ TF_GUARDED_BY(mu_) = Mode::kActive; + + // If `mode_` is in an error state, `status_` will contain the error status. + absl::Status status_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_split_provider.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_split_provider.h new file mode 100644 index 00000000..b5ca603e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_split_provider.h @@ -0,0 +1,106 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_SPLIT_PROVIDER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_SPLIT_PROVIDER_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/btree_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/time/time.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/data/service/dispatcher_client.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// Split provider that supports writing distributed snapshots. 
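+//
+// For example, a worker-side sketch (the snapshot task, addresses, and timeout
+// below are illustrative placeholders, and the dispatcher-client construction
+// is an assumption; the split-provider arguments follow the constructor
+// declared below):
+//
+//   auto dispatcher = std::make_unique<DataServiceDispatcherClient>(
+//       /*address=*/"localhost:5000", /*protocol=*/"grpc");
+//   SnapshotSplitProvider split_provider(
+//       /*worker_address=*/"localhost:5001", snapshot_task,
+//       /*source_index=*/0, /*timeout=*/absl::Minutes(1),
+//       std::move(dispatcher), Env::Default());
+//   Tensor split;
+//   bool end_of_splits = false;
+//   absl::Status status = split_provider.GetNext(&split, &end_of_splits);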
+class SnapshotSplitProvider : public SplitProvider { + public: + SnapshotSplitProvider(const std::string& worker_address, + const SnapshotTaskDef& snapshot_task, + int64_t source_index, absl::Duration timeout, + std::unique_ptr dispatcher, + Env* env); + + absl::Status GetNext(Tensor* split, bool* end_of_splits) override; + absl::Status Reset() override; + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override; + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override; + + private: + const std::string worker_address_; + const SnapshotTaskDef snapshot_task_; + const int64_t source_index_; + const absl::Duration timeout_; + Env* const env_; + + // Gets the next split from file or dispatcher and validates it. + absl::Status GetAndValidateSplit(Tensor* split, bool* end_of_splits); + + // Gets the next split by reading from the splits directory. + absl::Status GetSplitFromFile(const std::string& split_file, Tensor* split, + bool* end_of_splits); + + // Gets the next split by sending an RPC to the dispatcher. Returns the local + // split index from the dispatcher. + absl::StatusOr GetSplitFromDispatcher(Tensor* split, + bool* end_of_splits); + + // Reads from the split directory and returns a map of split index to absolute + // file path of the split, starting at `start_index`. + absl::StatusOr> GetSplitsFiles( + int64_t start_index) const; + + // Verifies `split_files` contains consecutive splits starting at + // `start_index`. + absl::Status ValidateSplitFiles( + const absl::btree_map& split_files, + int64_t start_index) const; + + // Verifies `split_files` contains consecutive splits starting at + // `start_index` and ending at `end_index`. + absl::Status ValidateSplitFiles( + const absl::btree_map& split_files, + int64_t start_index, int64_t end_index, bool end_of_splits) const; + + mutable mutex mu_; + std::unique_ptr dispatcher_ TF_GUARDED_BY(mu_); + + // The next split to read. + int64_t next_split_index_ TF_GUARDED_BY(mu_) = 0; + + // Number of times the dataset has repeated. + int64_t repetition_index_ TF_GUARDED_BY(mu_) = 0; + + // Maps the local split index to the absolute split file path. + absl::btree_map split_to_file_map_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_SPLIT_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_stream_writer.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_stream_writer.h new file mode 100644 index 00000000..09d72d86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/snapshot_stream_writer.h @@ -0,0 +1,245 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_STREAM_WRITER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_STREAM_WRITER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/substitute.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "tensorflow/core/data/service/byte_size.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/snapshot/parallel_tfrecord_writer.h" +#include "tensorflow/core/data/service/snapshot/path_utils.h" +#include "tensorflow/core/data/service/task_runner.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/data/snapshot_utils.h" +#include "tensorflow/core/protobuf/service_config.pb.h" +#include "tsl/platform/env.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +constexpr ByteSize kDefaultMaxChunkSize = ByteSize::GB(6); +constexpr absl::Duration kDefaultCheckpointInterval = absl::Minutes(30); + +struct SnapshotWriterParams { + // The directory path of the snapshot. See the comment on SnapshotStreamWriter + // for how the directory is structured. + std::string snapshot_path; + + // The index of the snapshot stream. A stream is one shard of the snapshot + // processed by a worker. + int64_t stream_index = 0; + + // Compression method as defined in tsl/lib/io/compression.h. + std::string compression; + + // The Tensorflow environment. + Env* env = nullptr; + + // The maximum number of bytes in each chunk. + ByteSize max_chunk_size = kDefaultMaxChunkSize; + + // How often should checkpoints be written at the steady state. We write + // checkpoints (and committing chunks) more frequently at the startup time to + // avoid starving training jobs during startup. + absl::Duration checkpoint_interval = kDefaultCheckpointInterval; + + // If true, keep temporary files (e.g., checkpoints) after completing the + // snapshot. Used only for unit testing. + bool test_only_keep_temp_files = false; + + std::string StreamDirectory() const { + return tensorflow::data::StreamDirectory(snapshot_path, stream_index); + } + + std::string CommittedChunksDirectory() const { + return tensorflow::data::CommittedChunksDirectory(snapshot_path); + } + + std::string UncommittedChunksDirectory() const { + return tensorflow::data::UncommittedChunksDirectory(snapshot_path, + stream_index); + } + + std::string CheckpointsDirectory() const { + return tensorflow::data::CheckpointsDirectory(snapshot_path, stream_index); + } + + std::string DebugString() const { + return absl::Substitute( + "SnapshotWriterParams { base_path: $0, stream: $1, compression: $2 }", + snapshot_path, stream_index, compression); + } +}; + +// Responsible for writing one snapshot stream, which is organized as following: +// +// - snapshot +// - DONE +// - ERROR +// - snapshot.metadata +// - dataset_def.proto +// - chunks +// - chunk___ +// - streams +// - stream_0 +// - DONE +// - ERROR +// - splits +// - split__ +// - uncommitted chunks +// - chunk_ +// - checkpoints +// - checkpoint__ +// +// This class is thread-safe. +class SnapshotStreamWriter { + public: + // Creates a SnapshotStreamWriter. Once created, it will start writing the + // snapshot stream. Users can call `Wait` to wait for it to finish. 
+ explicit SnapshotStreamWriter(const SnapshotWriterParams& params, + std::unique_ptr iterator); + virtual ~SnapshotStreamWriter() = default; + SnapshotStreamWriter(const SnapshotStreamWriter&) = delete; + SnapshotStreamWriter& operator=(const SnapshotStreamWriter&) = delete; + + // Returns true if the snapshot stream has completed. A snapshot stream is + // completed if the dataset has reached the end of sequence and a DONE file is + // written. Returns an error if the snapshot has failed. This does not block + // the caller. + absl::StatusOr Completed() const; + + // Waits for the writer to finish writing the snapshot stream and returns the + // final status. + absl::StatusOr Wait(); + + // Cancels the writer. If cancelled, `Wait` will return a Cancelled error. + void Cancel(); + + private: + // Writes the snapshot and any debugging log when necessary. + void WriteSnapshotAndLog(); + + // Writes the snapshot. Returns an error if writing fails or the task has been + // cancelled. + absl::Status WriteSnapshot(); + + // Returns true if the stream is already completed and there is no additional + // work to perform. + bool StreamAlreadyCompleted() const; + + // Creates directories to store uncommitted chunks and checkpoints. + absl::Status InitializeDirectories(); + + // Returns true until the snapshot stream writer is finished, which may be due + // to reaching the end of its iterator, encountering an error, or being + // cancelled. + bool ShouldWriteChunks() const; + + // Writes the chunk files. + absl::Status WriteChunks(); + + // Returns true if it should write more records to the current chunks. Returns + // false if it should checkpoint and commit the current chunks, there are no + // more records to write, or there is an error. + bool ShouldWriteRecord() const; + + // Writes the next record to the current chunks. + absl::Status WriteRecord(ParallelTFRecordWriter& writer); + + // Commits the chunks since the last commit. + absl::Status Commit(const ParallelTFRecordWriter::FileToStatsMap& file_stats); + + // Writes a DONE file when the stream is finished. Writes an ERROR file if it + // failed. + absl::Status FinalizeStream(absl::Status status); + absl::Status WriteDoneFile(); + absl::Status WriteErrorFile(const absl::Status& status); + + // Saves an iterator checkpoint. + absl::Status Save(const ParallelTFRecordWriter::FileToStatsMap& file_stats); + + // After committing a checkpoint, deletes the previous checkpoints. + absl::Status DeleteOutdatedCheckpoints(int64_t checkpoint_index); + + // Deletes all checkpoints. + absl::Status DeleteCheckpoints(); + + // Restores from the last checkpoint. + absl::Status Restore(); + + // Returns the filename of the most recent checkpoint. + absl::StatusOr LastCheckpointName() const; + + // Synchronizes the checkpoint with the committed chunks. This is called when + // the worker restores the snapshot in case the worker fails after writing the + // checkpoint but before committing a chunk file. If no checkpoint has been + // written, `checkpoint_index` is nullopt. + absl::Status SyncCheckpointWithChunks(std::optional checkpoint_index, + int64_t checkpoint_num_elements); + + // Index of the last committed chunk. + absl::StatusOr LastCommittedChunkIndex(); + + // Returns the path of the checkpoint for `chunk_index` with + // `chunk_num_elements`. + std::string CheckpointPath(int64_t chunk_index, + int64_t chunk_num_elements) const; + + // Returns the path of the checkpoint for `checkpoint_name`. 
+ std::string CheckpointPath(const std::string& checkpoint_name) const; + + const SnapshotWriterParams params_; + + // The dataset iterator that produces the dataset elements. + std::unique_ptr iterator_; + + // Index of the next chunk to write. + int64_t chunk_index_ = 0; + // Timestamp when the last chunks are committed. + absl::Time last_commit_time_ = absl::Now(); + + // True if the dataset is exhausted. + bool end_of_sequence_ = false; + + mutable mutex mu_; + + // Whether the writer is completed: + // - If the snapshot is successful, this is true. + // - If any error happens during the snapshot write, it is the error status. + // - If the snapshot has not finished, this is false. + absl::StatusOr completed_ TF_GUARDED_BY(mu_) = false; + + std::unique_ptr snapshot_thread_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_STREAM_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/test_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/test_utils.h new file mode 100644 index 00000000..efa31121 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/test_utils.h @@ -0,0 +1,125 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_TEST_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_TEST_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/time/time.h" +#include "tensorflow/core/data/service/byte_size.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/snapshot/file_utils.h" +#include "tensorflow/core/data/service/snapshot/path_utils.h" +#include "tensorflow/core/data/service/task_runner.h" +#include "tensorflow/core/data/snapshot_utils.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tsl/platform/env.h" +#include "tsl/platform/path.h" + +namespace tensorflow { +namespace data { +namespace testing { + +// Reads the records from a distributed tf.data snapshot written at `base_path`. 
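+//
+// For example, in a test (the snapshot path is an illustrative placeholder and
+// an uncompressed snapshot is assumed):
+//
+//   absl::StatusOr<std::vector<int64_t>> records =
+//       ReadSnapshot<int64_t>("/tmp/test_snapshot", /*compression=*/"");
+//   if (records.ok()) {
+//     // `*records` holds the int64 elements read from all committed chunks.
+//   }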
+template +absl::StatusOr> ReadSnapshot(const std::string& base_path, + const std::string& compression) { + std::vector result; + std::string chunks_directory = CommittedChunksDirectory(base_path); + TF_ASSIGN_OR_RETURN(std::vector chunk_files, + GetChildren(chunks_directory, Env::Default())); + for (const std::string& chunk_file : chunk_files) { + std::string chunk_file_path = + tsl::io::JoinPath(chunks_directory, chunk_file); + snapshot_util::TFRecordReader tfrecord_reader(chunk_file_path, compression, + DataTypeVector{DT_INT64}); + TF_RETURN_IF_ERROR(tfrecord_reader.Initialize(Env::Default())); + + while (true) { + std::vector tensors; + absl::Status status = tfrecord_reader.ReadTensors(&tensors); + if (absl::IsOutOfRange(status)) { + break; + } + TF_RETURN_IF_ERROR(status); + result.push_back(tensors[0].unaligned_flat().data()[0]); + } + } + return result; +} + +// Writes a partial snapshot to test checkpointing and recovering. It can be +// used to write the specified committed chunks, uncommitted chunks, and +// checkpoints. +class PartialSnapshotWriter { + public: + static absl::StatusOr Create( + const DatasetDef& dataset, const std::string& snapshot_path, + int64_t stream_index, const std::string& compression, + ByteSize max_chunk_size = ByteSize::Bytes(1), + absl::Duration checkpoint_interval = absl::Microseconds(1)); + virtual ~PartialSnapshotWriter() = default; + PartialSnapshotWriter(const PartialSnapshotWriter&) = delete; + PartialSnapshotWriter& operator=(const PartialSnapshotWriter&) = delete; + PartialSnapshotWriter(PartialSnapshotWriter&&) = default; + PartialSnapshotWriter& operator=(PartialSnapshotWriter&&) = delete; + + // Writes the specified chunks. + absl::Status WriteCommittedChunks( + const absl::flat_hash_set& committed_chunk_indexes) const; + + // Writes the specified uncommitted chunks. + absl::Status WriteUncommittedChunks( + const absl::flat_hash_set& uncommitted_chunk_indexes) const; + + // Writes the specified checkpoints. + absl::Status WriteCheckpoints( + const absl::flat_hash_set& checkpoint_indexes) const; + + private: + PartialSnapshotWriter(const DatasetDef& dataset, + const std::string& snapshot_path, int64_t stream_index, + const std::string& compression, ByteSize max_chunk_size, + absl::Duration checkpoint_interval); + + absl::Status Initialize(); + + const DatasetDef dataset_; + const std::string snapshot_path_; + const int64_t stream_index_; + const std::string compression_; + const ByteSize max_chunk_size_; + const absl::Duration checkpoint_interval_; + + std::string tmp_snapshot_path_; +}; + +// Creates a test iterator for the input dataset. The iterator will generate all +// elements of the dataset. +absl::StatusOr> TestIterator( + const DatasetDef& dataset_def); + +} // namespace testing +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/utils.h new file mode 100644 index 00000000..1ea4d80b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/snapshot/utils.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_UTILS_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/service/byte_size.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/status.h" + +namespace tensorflow { +namespace data { + +// Estimates the size of the Tensors when serialized as TensorProtos. +ByteSize EstimatedSize(const std::vector& tensors); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/split_provider.h b/third_party/tflite-hdrs/tensorflow/core/data/service/split_provider.h new file mode 100644 index 00000000..c426fe1a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/split_provider.h @@ -0,0 +1,74 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SPLIT_PROVIDER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SPLIT_PROVIDER_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/dispatcher_client.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// SplitProvider which reads splits from a tf.data service dispatcher over RPC. 
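+//
+// For example (the dispatcher address, iteration ID, and timeout below are
+// illustrative placeholders):
+//
+//   DataServiceSplitProvider split_provider(
+//       /*address=*/"localhost:5000", /*protocol=*/"grpc",
+//       /*iteration_id=*/42, /*split_provider_index=*/0,
+//       /*timeout_ms=*/60 * 1000);
+//   Tensor split;
+//   bool end_of_splits = false;
+//   while (split_provider.GetNext(&split, &end_of_splits).ok() &&
+//          !end_of_splits) {
+//     // Process `split`.
+//   }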
+class DataServiceSplitProvider : public SplitProvider { + public: + DataServiceSplitProvider(const std::string& address, + const std::string& protocol, int64_t iteration_id, + int64_t split_provider_index, int64_t timeout_ms) + : address_(address), + protocol_(protocol), + iteration_id_(iteration_id), + split_provider_index_(split_provider_index), + timeout_ms_(timeout_ms) {} + + absl::Status GetNext(Tensor* split, bool* end_of_splits) override; + absl::Status Reset() override; + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override; + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override; + + private: + const std::string address_; + const std::string protocol_; + const int64_t iteration_id_; + const int64_t split_provider_index_; + const int64_t timeout_ms_; + + mutex mu_; + int64_t repetition_ TF_GUARDED_BY(mu_) = 0; + std::unique_ptr dispatcher_ TF_GUARDED_BY(mu_); +}; + +// Makes split providers for `dataset_def` and stores them in `split_providers`. +absl::Status CreateSplitProviders( + const DatasetDef& dataset_def, + std::vector>& split_providers); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_SPLIT_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/task_remover.h b/third_party/tflite-hdrs/tensorflow/core/data/service/task_remover.h new file mode 100644 index 00000000..1daf6306 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/task_remover.h @@ -0,0 +1,54 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_TASK_REMOVER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_TASK_REMOVER_H_ + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace data { + +// A `TaskRemover` maintains state about a single task and decides whether the +// task should be removed. +class TaskRemover { + public: + explicit TaskRemover(int64_t num_consumers); + + // Attempts to remove the task. The task is removed when all consumers + // concurrently reach a barrier in this method. + // Returns true if the task is successfully removed. + // Returns false if either: + // - There is a timeout waiting for other consumers to request task removal. + // This timeout is hardcoded into the implementation. + // - Another consumer requests removal at a different round. + bool RequestRemoval(int64_t consumer_index, int64_t round); + + private: + const int64_t num_consumers_; + mutex mu_; + condition_variable cv_; + // The round we are considering removing the task in. + int64_t round_ TF_GUARDED_BY(mu_); + bool removed_ TF_GUARDED_BY(mu_) = false; + // Consumers currently blocked in RequestRemoval. 
+ absl::flat_hash_set consumers_waiting_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_TASK_REMOVER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/task_runner.h b/third_party/tflite-hdrs/tensorflow/core/data/service/task_runner.h new file mode 100644 index 00000000..79d698f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/task_runner.h @@ -0,0 +1,307 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_TASK_RUNNER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_TASK_RUNNER_H_ + +#include +#include +#include + +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/cross_trainer_cache.h" +#include "tensorflow/core/data/service/data_transfer.h" +#include "tensorflow/core/data/service/thread_safe_buffer.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/data/standalone.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// Iterator over a task's elements. +class TaskIterator { + public: + virtual ~TaskIterator() = default; + // If the iterator is not yet exhausted, `GetNext` stores the next element in + // `element` and sets `end_of_sequence` to `false`. Otherwise, sets + // `end_of_sequence to `true`. + virtual absl::Status GetNext(std::vector& element, + bool& end_of_sequence) = 0; + // Reports the cardinality of the dataset that created this iterator. + virtual int64_t Cardinality() const = 0; + + // Saves a checkpoint of the iterator. Returns Tensors that can be called with + // `Restore()`. + virtual absl::StatusOr> Save() { + return errors::Unimplemented( + "Serializing a tf.data service task iterator is unsupported."); + } + + // Restores the iterator from a checkpoint. `saved_iterator` is the serialized + // iterator saved by calling `Save()`. + virtual absl::Status Restore(const std::vector& saved_iterator) { + return errors::Unimplemented( + "Restoring from a tf.data service task iterator is unsupported."); + } + + // Returns the dataset model for performance analysis. + virtual std::shared_ptr model() const { return nullptr; } +}; + +// Implementation of TaskIterator wrapping a standalone iterator. +class StandaloneTaskIterator : public TaskIterator { + public: + // `dataset` should be the dataset that created `iterator`. + // StandaloneTaskIterator takes ownership of the dataset to ensures it + // lives as long as `iterator`. 
+ StandaloneTaskIterator(std::unique_ptr dataset, + std::unique_ptr iterator); + absl::Status GetNext(std::vector& element, + bool& end_of_sequence) override; + int64_t Cardinality() const override; + absl::StatusOr> Save() override; + absl::Status Restore(const std::vector& saved_iterator) override; + std::shared_ptr model() const override; + + private: + std::unique_ptr dataset_; + std::unique_ptr iterator_; +}; + +// Interface for providing elements to task consumers. +class TaskRunner { + public: + // Creates a `TaskRunner` and stores it in `out`. + static absl::Status Create(const experimental::WorkerConfig& worker_config, + const TaskDef& task_def, + std::unique_ptr iterator, + std::unique_ptr& out); + virtual ~TaskRunner() = default; + // Gets the next element for the given request. + virtual absl::Status GetNext(const GetElementRequest& req, + GetElementResult& result) = 0; + // Cancels in-progress `GetNext` requests. + virtual void Cancel() = 0; + // Returns the dataset model for performance analysis. + virtual std::shared_ptr model() const = 0; +}; + +// A task runner which provides elements on a first-come first-served basis. +// It does not consider which consumer is making the request. +class FirstComeFirstServedTaskRunner : public TaskRunner { + public: + explicit FirstComeFirstServedTaskRunner( + std::unique_ptr iterator); + ~FirstComeFirstServedTaskRunner() override; + + // Gets the next element. It may block if the element is not ready yet. + absl::Status GetNext(const GetElementRequest& req, + GetElementResult& result) override; + absl::Status GetNext(GetElementResult& result); + + void Cancel() override; + + std::shared_ptr model() const override; + + private: + // Function to continually prefetch the next element. Returns an error if the + // task has been cancelled. + absl::Status PrefetchFn(); + + // Runs `PrefetchFn` on a dedicated thread. + void RunPrefetchThread(); + + // Gets the next element from the input iterator. + absl::StatusOr GetNextFromInputIterator() + TF_LOCKS_EXCLUDED(mu_); + + const std::shared_ptr model_; + mutex mu_; + std::unique_ptr iterator_ TF_GUARDED_BY(mu_); + int64_t element_index_ TF_GUARDED_BY(mu_) = 0; + + ThreadSafeBuffer buffer_; + std::unique_ptr prefetch_thread_; + + FirstComeFirstServedTaskRunner(const FirstComeFirstServedTaskRunner&) = + delete; + void operator=(const FirstComeFirstServedTaskRunner&) = delete; +}; + +// A task runner which prefetches elements on a first-come first-served basis +// and caches elements in a sliding-window `CrossTrainerCache`. The cache has a +// bounded size and progresses when a trainer that has consumed all elements in +// the cache. Trainers read from a sliding window of the dataset and may not +// read the full dataset. +class CachingTaskRunner : public TaskRunner { + public: + explicit CachingTaskRunner(std::unique_ptr iterator, + size_t max_cache_size_bytes); + ~CachingTaskRunner() override; + + // Gets the next element from the cross-trainer cache, blocking if the data is + // not ready. + // REQUIRES: !req.trainer_id().empty() + absl::Status GetNext(const GetElementRequest& req, + GetElementResult& result) override; + + // Cancel the task runner. After cancelling, all the `GetNext` calls will + // return a Cancelled status. + void Cancel() override; + + // Returns the dataset model for performance analysis. + std::shared_ptr model() const override; + + private: + // The `GetElementResultSequence` generates a sequence of elements from the + // `FirstComeFirstServedTaskRunner`. 
It is used for the `CrossTrainerCache` to + // generate cached elements. + class GetElementResultSequence : public CachableSequence { + public: + explicit GetElementResultSequence( + FirstComeFirstServedTaskRunner& fcfs_task_runner); + absl::StatusOr GetNext() override; + size_t GetElementSizeBytes(const GetElementResult& element) const override; + + private: + FirstComeFirstServedTaskRunner& fcfs_task_runner_; + }; + + FirstComeFirstServedTaskRunner fcfs_task_runner_; + CrossTrainerCache cache_; + + CachingTaskRunner(const CachingTaskRunner&) = delete; + void operator=(const CachingTaskRunner&) = delete; +}; + +// An element produced by a task. +struct Element { + explicit Element(std::vector&& components, int64_t index) + : components(components), index(index) {} + // The components of the element. + std::vector components; + // The element's index within the task, e.g. 0 for the first element produced + // by the task, 1 for the second element, etc. + int64_t index; +}; + +// Thread for prefetching a round worth of elements. +class PrefetchThread { + public: + explicit PrefetchThread(std::unique_ptr iterator, + int64_t round_size); + ~PrefetchThread(); + // Runs the prefetch thread. It runs until an error is encountered or the + // destructor is called. + void Run(); + // Fills `out` with a round of data. Waits for up to `wait_us` microseconds + // before giving up and returning with `out` empty. A negative `wait_us` + // signals to wait indefinitely. + absl::Status FillBuffer(int64_t wait_us, + std::vector>& out); + // Returns the status for any failures encountered by the prefetch thread. + absl::Status GetStatus(); + // Returns the dataset model for performance analysis. + std::shared_ptr model() const; + + private: + const std::unique_ptr iterator_; + const int64_t round_size_; + mutex mu_; + int64_t index_ TF_GUARDED_BY(mu_) = 0; + // Buffered results for the next round. + std::vector> buffer_ TF_GUARDED_BY(mu_); + // The status if the prefetch thread fails. + absl::Status status_ TF_GUARDED_BY(mu_) = absl::OkStatus(); + // Condition variable notified when elements are added to or removed from + // `buffer_`, or when `status_` is changed. + condition_variable cv_; + bool cancelled_ TF_GUARDED_BY(mu_) = false; + // Thread which constantly tries to fill `buffer_` up with + // `num_consumers` elements. + std::unique_ptr thread_; +}; + +// A task runner which enforces round-robin order for consuming a task's +// elements. `RoundRobinTaskRunner` provides elements in a series of "rounds". +// In each successive round, the runner waits to receive requests from all +// consumers. These requests are blocked until all requests arrive. Once all +// requests arrive, the runner hands out elements to consumers in order of their +// consumer indices. +// +// Consumers are expected to successively request consecutive element indices, +// starting at 0. The same element can be requested multiple times by the same +// consumer, as long as the consumer hasn't yet requested the next element (at +// the start of each round we discard elements from the previous round). +// +// If the worker restarts mid-round, a situation arises where some consumers +// are requesting element index `n` while others are requesting element index +// `n + 1`. To remedy this, the first round after restart may be a partial +// round, where we only serve elements to consumers requesting data for element +// index `n`, blocking other consumers until the second round. 
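+//
+// For example, a sketch of one consumer driving the protocol above (it assumes
+// `GetElementRequest` exposes `consumer_index` and `round_index` setters; the
+// iterator and worker address are placeholders):
+//
+//   RoundRobinTaskRunner runner(std::move(iterator), /*num_consumers=*/2,
+//                               /*worker_address=*/"localhost:5001");
+//   GetElementRequest req;
+//   req.set_consumer_index(0);
+//   req.set_round_index(0);
+//   GetElementResult result;
+//   // Blocks until every consumer has requested an element for round 0.
+//   absl::Status status = runner.GetNext(req, result);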
+class RoundRobinTaskRunner : public TaskRunner { + public: + RoundRobinTaskRunner(std::unique_ptr iterator, + int64_t num_consumers, string worker_address); + + absl::Status GetNext(const GetElementRequest& req, + GetElementResult& result) override; + void Cancel() override; + std::shared_ptr model() const override; + + private: + // Prepares a full round of data. `wait_us` indicates how long to wait before + // skipping if a full round of data is not yet ready. + absl::Status PrepareFullRound(int64_t wait_us) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Prepares a partial round to get consumers back in sync. + absl::Status PreparePartialRound() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + absl::Status ValidateRequest(const GetElementRequest& req); + // Prepares data for the next round, blocking until the round is ready to + // start. + absl::Status PrepareRound(const GetElementRequest& req); + const int64_t num_consumers_; + const string worker_address_; + mutex mu_; + bool cancelled_ TF_GUARDED_BY(mu_) = false; + // Condition variable notified whenever we start a new round of round-robin. + condition_variable new_round_cv_; + // Outstanding requests, indexed by round number and then consumer index. + absl::flat_hash_map> + requests_ TF_GUARDED_BY(mu_); + // Index of the first round we plan to serve. At startup, this is the minimum + // of all requested element indices. + int64_t first_round_ TF_GUARDED_BY(mu_) = kint64max; + int64_t current_round_ TF_GUARDED_BY(mu_) = -1; + bool round_skipped_ TF_GUARDED_BY(mu_) = false; + // Buffered results for the current round. + std::vector> buffer_ TF_GUARDED_BY(mu_); + // Thread which constantly tries to prepare `num_consumers` elements for the + // next round. + PrefetchThread prefetch_thread_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_TASK_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/test_cluster.h b/third_party/tflite-hdrs/tensorflow/core/data/service/test_cluster.h new file mode 100644 index 00000000..b1d242fe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/test_cluster.h @@ -0,0 +1,288 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SERVICE_TEST_CLUSTER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_TEST_CLUSTER_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/types/optional.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/data_transfer.h" +#include "tensorflow/core/data/service/dispatcher.pb.h" +#include "tensorflow/core/data/service/dispatcher_client.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/data/service/server_lib.h" +#include "tensorflow/core/data/service/test_util.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/data/service/worker_client.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// Helper class for unit testing a tf.data service cluster. +class TestCluster { + public: + struct Config { + public: + int num_workers = 3; + int64_t client_timeout_ms = 0; + int64_t worker_heartbeat_interval_ms = 0; + int64_t job_gc_check_interval_ms = 0; + int64_t job_gc_timeout_ms = 0; + int64_t worker_max_concurrent_snapshots = 0; + std::string work_dir; + }; + + // Creates a new test cluster with a dispatcher and `num_workers` workers. + explicit TestCluster( + int num_workers, + std::optional data_transfer_protocol = std::nullopt); + explicit TestCluster(const Config& config); + virtual ~TestCluster(); + + // Initializes the test cluster. This must be called before interacting with + // the cluster. Initialize should be called only once. + absl::Status Initialize(); + // Adds a new worker to the cluster. + absl::Status AddWorker( + std::optional port = std::nullopt, + std::optional data_transfer_protocol = std::nullopt); + // Returns the number of workers in this cluster. + size_t NumWorkers() const { return workers_.size(); } + // Returns the port number of a worker. + int WorkerBoundPort(size_t worker_index) const { + return workers_[worker_index]->BoundPort(); + } + // Returns the number of active iterations. + absl::StatusOr NumActiveIterations() const { + return dispatcher_->NumActiveIterations(); + } + // Returns the dispatcher address in the form "hostname:port". + std::string DispatcherAddress() const; + // Returns the address of the worker at the specified index, in the form + // "hostname:port". The index must be non-negative and less than the number of + // workers in the cluster. + std::string WorkerAddress(int index) const; + + // Stops one worker. + void StopWorker(size_t index); + // Stops all workers. + void StopWorkers(); + + // Returns the server state exports. + ServerStateExport ExportDispatcherState() const; + ServerStateExport ExportWorkerState(size_t index) const; + + private: + bool initialized_ = false; + int num_workers_; + std::optional data_transfer_protocol_; + Config config_; + std::unique_ptr dispatcher_; + std::string dispatcher_address_; + std::vector> workers_; + std::vector worker_addresses_; +}; + +// A test utility to provide a `DatasetDef` to a `TestCluster` and generate data +// from each worker for verification. 
For example: +// +// TestCluster cluster(/*num_workers=*/2); +// TF_ASSERT_OK(cluster.Initialize()); +// DatasetClient dataset_reader(cluster); +// +// EXPECT_THAT( +// dataset_reader.Read(RangeDataset(4), ProcessingModeDef::DATA, +// TARGET_WORKERS_LOCAL), +// IsOkAndHolds(UnorderedElementsAre( +// Pair(cluster.WorkerAddress(0), ElementsAre(0, 2)), +// Pair(cluster.WorkerAddress(1), ElementsAre(1, 3))))); +template +class DatasetClient { + public: + // Creates a dataset client. It will process datasets in `cluster`. + explicit DatasetClient(const TestCluster& cluster); + + // Registers the dataset and returns the dataset ID. + absl::StatusOr RegisterDataset(const DatasetDef& dataset); + + // Maps a worker address to the data it produces when calling `Read`. + using WorkerResultMap = absl::flat_hash_map>; + + // Processes `dataset` and retrieves the data from workers. Returns the data + // produced by each worker, keyed by the worker address. + StatusOr Read( + const DatasetDef& dataset, + ProcessingModeDef::ShardingPolicy sharding_policy, + TargetWorkers target_workers); + // Creates an iteration and returns the iteration client ID. + absl::StatusOr CreateIteration(const DatasetDef& dataset); + // Gets the tasks for iteration `iteration_client_id`. The iteration has one + // task processed by every worker. + absl::StatusOr> GetTasks(int64_t iteration_client_id); + + private: + // Creates an iteration and returns the iteration client ID. + absl::StatusOr CreateIteration( + const std::string& dataset_id, + ProcessingModeDef::ShardingPolicy sharding_policy, + TargetWorkers target_workers); + // Reads values from `tasks`, one task at a time, until all tasks have + // finished. + StatusOr ReadFromTasks(const std::vector& tasks); + // Reads the next element from the specified task. 
+ absl::StatusOr ReadFromTask(const TaskInfo& task_info); + + const TestCluster& cluster_; + std::unique_ptr dispatcher_client_; + absl::flat_hash_map> + worker_clients_; +}; + +template +DatasetClient::DatasetClient(const TestCluster& cluster) + : cluster_(cluster) { + dispatcher_client_ = std::make_unique( + cluster_.DispatcherAddress(), "grpc"); + + for (size_t i = 0; i < cluster.NumWorkers(); ++i) { + worker_clients_[cluster_.WorkerAddress(i)] = + std::make_unique( + cluster_.WorkerAddress(i), /*protocol=*/"grpc", + /*transfer_protocol=*/"grpc", + /*fall_back_to_grpc_at_get_element_time=*/true, + /*accelerator_device_info=*/nullptr, /*allocator=*/nullptr); + } +} + +template +StatusOr::WorkerResultMap> DatasetClient::Read( + const DatasetDef& dataset, + ProcessingModeDef::ShardingPolicy sharding_policy, + TargetWorkers target_workers) { + TF_ASSIGN_OR_RETURN(const std::string dataset_id, RegisterDataset(dataset)); + TF_ASSIGN_OR_RETURN( + const int64_t iteration_client_id, + CreateIteration(dataset_id, sharding_policy, target_workers)); + TF_ASSIGN_OR_RETURN(const std::vector tasks, + GetTasks(iteration_client_id)); + return ReadFromTasks(tasks); +} + +template +absl::StatusOr DatasetClient::RegisterDataset( + const DatasetDef& dataset) { + std::string dataset_id; + TF_RETURN_IF_ERROR(dispatcher_client_->RegisterDataset( + dataset, DataServiceMetadata(), /*requested_dataset_id=*/std::nullopt, + dataset_id)); + return dataset_id; +} + +template +absl::StatusOr DatasetClient::CreateIteration( + const std::string& dataset_id, + ProcessingModeDef::ShardingPolicy sharding_policy, + TargetWorkers target_workers) { + ProcessingModeDef processing_mode_def; + processing_mode_def.set_sharding_policy(sharding_policy); + int64_t job_id; + TF_RETURN_IF_ERROR(dispatcher_client_->GetOrCreateJob( + dataset_id, processing_mode_def, /*job_name=*/std::nullopt, + /*num_consumers=*/std::nullopt, /*use_cross_trainer_cache=*/false, + target_workers, job_id)); + int64_t iteration_client_id; + TF_RETURN_IF_ERROR(dispatcher_client_->GetOrCreateIteration( + job_id, /*repetition=*/0, iteration_client_id)); + return iteration_client_id; +} + +template +absl::StatusOr DatasetClient::CreateIteration( + const DatasetDef& dataset) { + TF_ASSIGN_OR_RETURN(const std::string dataset_id, RegisterDataset(dataset)); + return CreateIteration(dataset_id, ProcessingModeDef::OFF, + TARGET_WORKERS_ANY); +} + +template +absl::StatusOr> DatasetClient::GetTasks( + const int64_t iteration_client_id) { + ClientHeartbeatRequest request; + ClientHeartbeatResponse response; + request.set_iteration_client_id(iteration_client_id); + TF_RETURN_IF_ERROR(dispatcher_client_->ClientHeartbeat(request, response)); + if (response.task_info().empty()) { + return errors::NotFound("No task found for iteration ", iteration_client_id, + "."); + } + return std::vector(response.task_info().begin(), + response.task_info().end()); +} + +template +StatusOr::WorkerResultMap> +DatasetClient::ReadFromTasks(const std::vector& tasks) { + WorkerResultMap result; + bool all_workers_finished = false; + while (!all_workers_finished) { + all_workers_finished = true; + for (const TaskInfo& task : tasks) { + absl::StatusOr element_result = ReadFromTask(task); + // A task may be cancelled when it has finished but other workers are + // still producing data. 
+ if (absl::IsCancelled(element_result.status())) { + continue; + } + TF_RETURN_IF_ERROR(element_result.status()); + if (element_result->end_of_sequence) { + continue; + } + all_workers_finished = false; + result[task.worker_address()].push_back( + element_result->components[0].unaligned_flat().data()[0]); + } + } + return result; +} + +template +absl::StatusOr DatasetClient::ReadFromTask( + const TaskInfo& task_info) { + GetElementRequest request; + GetElementResult element_result; + request.set_task_id(task_info.task_id()); + TF_RETURN_IF_ERROR(worker_clients_[task_info.worker_address()]->GetElement( + request, element_result)); + return element_result; +} + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_TEST_CLUSTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/test_util.h b/third_party/tflite-hdrs/tensorflow/core/data/service/test_util.h new file mode 100644 index 00000000..2180675b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/test_util.h @@ -0,0 +1,109 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_TEST_UTIL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_TEST_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/snapshot.pb.h" + +namespace tensorflow { +namespace data { +namespace testing { + +// Creates a local tempfile and returns the path. +std::string LocalTempFilename(); + +// Creates a dataset graph for testing. `dataset_name` is one of the filenames +// defined in `testdata` (without `.pbtxt`). `args` specifies arguments passed +// to the dataset. These args appear as `$0`, `$1`, etc, in the dataset +// definition and will be replaced with the specified args. +absl::StatusOr GetTestDataset( + absl::string_view dataset_name, const std::vector& args = {}); + +// Returns a test dataset representing +// tf.data.Dataset.range(range). Useful for testing dataset graph execution. +DatasetDef RangeDataset(int64_t range); + +// Returns a test dataset representing +// tf.data.Dataset.range(range).map(lambda x: x*x). +DatasetDef RangeSquareDataset(int64_t range); + +// Returns a test dataset representing +// tf.data.Dataset.range(range).shard(SHARD_HINT, SHARD_HINT). +DatasetDef RangeDatasetWithShardHint(int64_t range); + +// Returns a test dataset representing +// tf.data.Dataset.range(100000000).repeat(). +DatasetDef InfiniteDataset(); + +// Returns a distributed snapshot metadata for a dummy dataset. 
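+//
+// For example, a test can pair the dummy metadata with one of the range
+// datasets above (how the request is ultimately wired to the dispatcher is not
+// shown here):
+//
+//   DatasetDef dataset = RangeDataset(10);
+//   experimental::DistributedSnapshotMetadata metadata =
+//       CreateDummyDistributedSnapshotMetadata();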
+experimental::DistributedSnapshotMetadata +CreateDummyDistributedSnapshotMetadata(); + +// Returns a test dataset representing +// tf.data.Dataset.from_tensor_slices(["filenames"]).interleave( +// lambda filepath: tf.data.TextLineDataset(filepath), +// cycle_length=10) +absl::StatusOr InterleaveTextlineDataset( + const std::vector& filenames, + const std::vector& contents); + +// Repeatedly calls `f()`, blocking until `f()` returns `false`. +// +// Returns an error if `f()` returns an error. +absl::Status WaitWhile(std::function()> f); + +// TODO(b/229726259): Make EqualsProto available in Googletest +// (Public feature request: https://github.com/google/googletest/issues/1761). +class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tensorflow::protobuf::Message& expected) + : expected_(expected.ShortDebugString()) {} + + template + bool MatchAndExplain(const Message& p, + ::testing::MatchResultListener*) const { + return p.ShortDebugString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tensorflow::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} +} // namespace testing +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/thread_safe_buffer.h b/third_party/tflite-hdrs/tensorflow/core/data/service/thread_safe_buffer.h new file mode 100644 index 00000000..570fb5ce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/thread_safe_buffer.h @@ -0,0 +1,122 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_THREAD_SAFE_BUFFER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_THREAD_SAFE_BUFFER_H_ + +#include +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { +namespace data { + +// A thread-safe bounded buffer with cancellation support. +template +class ThreadSafeBuffer final { + public: + // Creates a buffer with the specified `buffer_size`. + // REQUIRES: buffer_size > 0 + explicit ThreadSafeBuffer(size_t buffer_size); + + // Gets the next element. Blocks if the buffer is empty. Returns an error if + // a non-OK status was pushed or the buffer has been cancelled. + StatusOr Pop(); + + // Writes the next element. Blocks if the buffer is full. Returns an error if + // the buffer has been cancelled. + absl::Status Push(StatusOr value); + + // Cancels the buffer with `status` and notifies waiting threads. 
After + // cancelling, all `Push` and `Pop` calls will return `status`. + // REQUIRES: !status.ok() + void Cancel(absl::Status status); + + // Returns whether the buffer is empty. + bool Empty() const; + + private: + const size_t buffer_size_; + + mutable mutex mu_; + condition_variable ready_to_pop_; + condition_variable ready_to_push_; + std::deque> results_ TF_GUARDED_BY(mu_); + absl::Status status_ TF_GUARDED_BY(mu_) = absl::OkStatus(); + + ThreadSafeBuffer(const ThreadSafeBuffer&) = delete; + void operator=(const ThreadSafeBuffer&) = delete; +}; + +template +ThreadSafeBuffer::ThreadSafeBuffer(size_t buffer_size) + : buffer_size_(buffer_size) { + DCHECK_GT(buffer_size, 0) + << "ThreadSafeBuffer must have a positive buffer size. Got " + << buffer_size << "."; +} + +template +bool ThreadSafeBuffer::Empty() const { + tf_shared_lock l(mu_); + return results_.empty(); +} + +template +StatusOr ThreadSafeBuffer::Pop() { + mutex_lock l(mu_); + while (status_.ok() && results_.empty()) { + ready_to_pop_.wait(l); + } + if (!status_.ok()) { + return status_; + } + StatusOr result = std::move(results_.front()); + results_.pop_front(); + ready_to_push_.notify_one(); + return result; +} + +template +absl::Status ThreadSafeBuffer::Push(StatusOr value) { + mutex_lock l(mu_); + while (status_.ok() && results_.size() >= buffer_size_) { + ready_to_push_.wait(l); + } + if (!status_.ok()) { + return status_; + } + results_.push_back(std::move(value)); + ready_to_pop_.notify_one(); + return absl::OkStatus(); +} + +template +void ThreadSafeBuffer::Cancel(absl::Status status) { + DCHECK(!status.ok()) + << "Cancelling ThreadSafeBuffer requires a non-OK status. Got " << status; + mutex_lock l(mu_); + status_ = std::move(status); + ready_to_push_.notify_all(); + ready_to_pop_.notify_all(); +} + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_THREAD_SAFE_BUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/url.h b/third_party/tflite-hdrs/tensorflow/core/data/service/url.h new file mode 100644 index 00000000..84afa162 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/url.h @@ -0,0 +1,52 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_URL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_URL_H_ + +#include + +#include "absl/strings/string_view.h" + +namespace tensorflow { +namespace data { + +// Parses URLs of form host[:port] and provides methods to retrieve its +// components. The port can be a number, named port, or dynamic port +// (i.e.: %port_name%). 
For example: +// +// URL url("/worker/task/0:worker"); +// url.has_protocol() == false; +// url.host() == "/worker/task/0"; +// url.has_port() == true; +// url.port() == "worker"; +class URL { + public: + explicit URL(absl::string_view url); + + absl::string_view host() const { return host_; } + bool has_port() const { return !port_.empty(); } + absl::string_view port() const { return port_; } + + private: + void Parse(absl::string_view url); + + std::string host_; + std::string port_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_URL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/utils.h new file mode 100644 index 00000000..482d306e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/utils.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_UTILS_H_ + +#include + +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/platform/env.h" + +// Utilities shared between the dispatcher and worker servers. +namespace tensorflow { +namespace data { + +// Writes a dataset definition to the specified path. If the file already +// exists, it will be overwritten. +absl::Status WriteDatasetDef(const std::string& path, + const DatasetDef& dataset_def); + +// Reads a dataset definition from specified path, and stores it in +// `dataset_def`. Returns NOT_FOUND if the path cannot be found. +absl::Status ReadDatasetDef(const std::string& path, DatasetDef& dataset_def); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/validate_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/service/validate_utils.h new file mode 100644 index 00000000..c4278023 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/validate_utils.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_VALIDATE_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_VALIDATE_UTILS_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// Verifies the datasets with the same ID have the same metadata. If the +// metadata differs, returns an invalid argument error. +absl::Status ValidateMatchingDataset(const std::string& dataset_id, + const DataServiceMetadata& metadata1, + const DataServiceMetadata& metadata2); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_VALIDATE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/worker_client.h b/third_party/tflite-hdrs/tensorflow/core/data/service/worker_client.h new file mode 100644 index 00000000..64ac446b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/worker_client.h @@ -0,0 +1,106 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_WORKER_CLIENT_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_WORKER_CLIENT_H_ + +#include +#include + +#include "tensorflow/core/data/service/common.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/data_transfer.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace data { + +constexpr const char kLocalTransferProtocol[] = "local"; +constexpr const char kGrpcTransferProtocol[] = "grpc"; + +// Client for communicating with the tf.data service worker. +class DataServiceWorkerClient : public DataServiceClientBase { + public: + DataServiceWorkerClient( + const std::string& address, const std::string& protocol, + const std::string& transfer_protocol, + bool fall_back_to_grpc_at_get_element_time, + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info, + Allocator* allocator) + : DataServiceClientBase(address, protocol), + transfer_protocol_(transfer_protocol), + fall_back_to_grpc_at_get_element_time_( + fall_back_to_grpc_at_get_element_time), + accelerator_device_info_(accelerator_device_info), + allocator_(allocator) {} + + // Fetches an element from the worker. + absl::Status GetElement(const GetElementRequest& req, + GetElementResult& result); + + // Makes a best effort to cancel all outstanding calls in progress for the + // client, and causes further calls to return Cancelled status. + void TryCancel(); + + // Returns an error if the client is incompatible with a server which has the + // properties described in `compatibility_info`. 
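// A hedged sketch of gating element reads on CheckCompatibility(), using only
// methods declared in this class; the helper name and the idea that the
// compatibility string comes from the dispatcher are assumptions for
// illustration.
#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/data/service/worker_client.h"

absl::Status ReadIfCompatible(
    tensorflow::data::DataServiceWorkerClient& client,
    const std::string& server_compatibility_info,
    const tensorflow::data::GetElementRequest& req,
    tensorflow::data::GetElementResult& result) {
  // Refuse to read from a server whose compatibility info does not match.
  absl::Status compat = client.CheckCompatibility(server_compatibility_info);
  if (!compat.ok()) return compat;
  // Fetch one element over the configured data transfer protocol.
  return client.GetElement(req, result);
}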
+ absl::Status CheckCompatibility( + const std::string& server_compatibility_info) const { + return client_->CheckCompatibility(server_compatibility_info); + } + + // If `true`, data service clients should fall back to gRPC for this worker + // client if it nonretryably fails to transfer an element using an alternative + // data transfer protocol. + bool FallBackToGrpcAtGetElementTime() const { + return fall_back_to_grpc_at_get_element_time_; + } + + // Returns the data transfer protocol, preferring to use the local transfer + // protocol if a local tf.data worker exists. + std::string GetDataTransferProtocol() const; + + protected: + absl::Status EnsureInitialized() override; + + private: + std::string transfer_protocol_; + bool fall_back_to_grpc_at_get_element_time_; + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info_; + Allocator* allocator_; + + mutex mu_; + // Initialization is guarded by `mu_`, but using the stub does not require + // holding `mu_` + std::unique_ptr client_; +}; + +// Creates and initializes a new tf.data service worker client to read +// from the data transfer server specified in `info`. +absl::StatusOr> +CreateDataServiceWorkerClient( + const std::string& dispatcher_protocol, const DataTransferServerInfo& info, + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info, + Allocator* allocator); + +// If true, clients should use local protocol for data transfer (disregarding +// any other user-specified or runtime-defaulted protocol). +bool ForceLocalProtocol(const std::string& worker_address); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_WORKER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/service/worker_impl.h b/third_party/tflite-hdrs/tensorflow/core/data/service/worker_impl.h new file mode 100644 index 00000000..c256c88c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/service/worker_impl.h @@ -0,0 +1,251 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_WORKER_IMPL_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_WORKER_IMPL_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/hash/hash.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/data_transfer.h" +#include "tensorflow/core/data/service/dispatcher_client.h" +#include "tensorflow/core/data/service/export.pb.h" +#include "tensorflow/core/data/service/snapshot/snapshot_stream_writer.h" +#include "tensorflow/core/data/service/task_runner.h" +#include "tensorflow/core/data/service/worker.pb.h" +#include "tensorflow/core/data/standalone.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/service_config.pb.h" + +namespace tensorflow { +namespace data { + +// A TensorFlow DataService serves dataset elements over RPC. +class DataServiceWorkerImpl { + public: + explicit DataServiceWorkerImpl(const experimental::WorkerConfig& config); + ~DataServiceWorkerImpl(); + + // Starts the worker. The worker needs to know its own address so that it can + // register with the dispatcher. This is set in `Start` instead of in the + // constructor because the worker may be binding to port `0`, in which case + // the address isn't known until the worker has started and decided which port + // to bind to. + absl::Status Start( + const std::string& worker_address, + const std::vector& transfer_servers); + // Stops the worker, attempting a clean shutdown by rejecting new requests + // and waiting for outstanding requests to complete. + void Stop(); + + // Serves a GetElement request, storing the result in `*result`. See + // worker.proto for GetElement API documentation. + absl::Status GetElementResult(const GetElementRequest* request, + GetElementResult* result); + + // Deletes the local task and iterator. Only called by local clients to delete + // unused task iterators assuming the task is not read by remote clients. This + // method is not visible to gRPC clients. + void DeleteLocalTask(const TaskInfo& task_info); + + // See worker.proto for API documentation. + + /// Dispatcher-facing API. + absl::Status ProcessTask(const ProcessTaskRequest* request, + ProcessTaskResponse* response); + + /// Client-facing API. + absl::Status GetElement(const GetElementRequest* request, + GetElementResponse* response); + absl::Status GetWorkerTasks(const GetWorkerTasksRequest* request, + GetWorkerTasksResponse* response); + absl::Status GetSnapshotTaskProgresses( + const GetSnapshotTaskProgressesRequest* request, + GetSnapshotTaskProgressesResponse* response); + + // Exports the worker state for debugging. + WorkerStateExport ExportState() const; + + private: + struct Task { + explicit Task(TaskDef task_def) : task_def(std::move(task_def)) {} + + TaskDef task_def; + mutex mu; + bool initialized TF_GUARDED_BY(mu) = false; + int64_t outstanding_requests TF_GUARDED_BY(&DataServiceWorkerImpl::mu_) = 0; + std::unique_ptr task_runner; + }; + + struct SnapshotTask { + // Base directory of the snapshot. 
+ std::string base_path; + + // Index of the snapshot stream written by this worker. + int64_t stream_index = 0; + + // This is required to use it as a `flat_hash_map` key. + template + friend H AbslHashValue(H h, const SnapshotTask& task) { + return H::combine(std::move(h), task.base_path, task.stream_index); + } + + friend bool operator==(const SnapshotTask& task1, + const SnapshotTask& task2) { + return task1.base_path == task2.base_path && + task1.stream_index == task2.stream_index; + } + }; + + // Validates the worker config. + absl::Status ValidateWorkerConfig() const; + // Creates and initializes a dispatcher client. + absl::StatusOr> + CreateDispatcherClient() const TF_LOCKS_EXCLUDED(mu_); + // Sends task status to the dispatcher and checks for dispatcher commands. + absl::Status SendTaskUpdates() TF_LOCKS_EXCLUDED(mu_); + // Creates an iterator to process a task. + absl::Status ProcessTaskInternal(const TaskDef& task) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + absl::Status EnsureTaskInitialized(Task& task); + // Stops a task, cancelling the task's outstanding requests and waiting for + // them to finish. + void StopTask(Task& task) TF_LOCKS_EXCLUDED(mu_); + // A thread for notifying the dispatcher when tasks complete. + void TaskCompletionThread() TF_LOCKS_EXCLUDED(mu_); + // A thread for doing periodic heartbeats to the dispatcher. + void HeartbeatThread() TF_LOCKS_EXCLUDED(mu_); + // Performs a heartbeat to the dispatcher. + absl::Status Heartbeat(); + // Check with the dispatcher to see whether or not to disable compression. + absl::StatusOr DisableCompressionAtRuntime( + const std::string& dataset_id) const; + // Returns the active tasks of this worker. + std::vector GetActiveTasks() const TF_LOCKS_EXCLUDED(mu_); + // Returns the task IDs of `active_tasks`. + std::vector GetTaskIds( + const std::vector& active_tasks) const; + // Builds a heartbeat request. + WorkerHeartbeatRequest BuildWorkerHeartbeatRequest() const + TF_LOCKS_EXCLUDED(mu_); + // Updates the tasks according to the heartbeat response. + void UpdateTasks(const WorkerHeartbeatResponse& response) + TF_LOCKS_EXCLUDED(mu_); + // Updates the distributed snapshot tasks according to the heartbeat response. + absl::Status UpdateSnapshotWriters(const WorkerHeartbeatResponse& response) + TF_LOCKS_EXCLUDED(mu_); + // Creates an dataset iterator for snapshot writers. + absl::StatusOr> + MakeSnapshotTaskIterator(const SnapshotTaskDef& snapshot_task, + const DatasetDef& dataset_def) const; + // Gets the snapshot task progress from the snapshot writers. + std::vector GetSnapshotTaskProgress() const; + // Gets the DatasetDef for `task_def`. + absl::StatusOr GetDatasetDef(const TaskDef& task_def) const; + // Creates a dataset from `dataset_def`. + absl::StatusOr> MakeDataset( + const DatasetDef& dataset_def, const TaskDef& task_def) const; + // Creates an iterator for `dataset`. + absl::StatusOr> MakeDatasetIterator( + standalone::Dataset& dataset, const TaskDef& task_def) const; + + const experimental::WorkerConfig config_; + // Worker Borg job UID for telemetry. -1 if not supported. + const int64_t worker_uid_; + + // The worker's own address. + std::string worker_address_; + // The data transfer servers available to worker clients. + std::vector transfer_servers_; + std::unique_ptr dispatcher_; + + mutable mutex mu_; + condition_variable cv_; + // Information about tasks, keyed by task ids. The tasks are updated based on + // the heartbeat responses from the dispatcher. 
+ absl::flat_hash_map> tasks_ TF_GUARDED_BY(mu_); + // Ids of tasks that have finished. + absl::flat_hash_set finished_tasks_ TF_GUARDED_BY(mu_); + // Completed tasks which haven't yet been communicated to the dispatcher. + absl::flat_hash_set pending_completed_tasks_ TF_GUARDED_BY(mu_); + // Tasks deleted by the local client. If the client tries to read from them + // again, the worker will return a non-retriable FailedPrecondition error. + absl::flat_hash_set deleted_tasks_ TF_GUARDED_BY(mu_); + bool cancelled_ TF_GUARDED_BY(mu_) = false; + // Whether the worker has registered with the dispatcher yet. + bool registered_ TF_GUARDED_BY(mu_) = false; + condition_variable task_completion_cv_ TF_GUARDED_BY(mu_); + condition_variable heartbeat_cv_ TF_GUARDED_BY(mu_); + CancellationManager cancellation_manager_; + + absl::flat_hash_map, + absl::Hash> + snapshot_writers_ TF_GUARDED_BY(mu_); + + // A thread for notifying the dispatcher when tasks complete. + std::unique_ptr task_completion_thread_; + // A thread for performing regular heartbeats to the dispatcher. + std::unique_ptr heartbeat_thread_; + + DataServiceWorkerImpl(const DataServiceWorkerImpl&) = delete; + void operator=(const DataServiceWorkerImpl&) = delete; +}; + +// Local in-process workers shared among clients and servers. If clients and +// workers colocate in the same process, clients can read from local workers to +// reduce RPC calls and data copy. +class LocalWorkers { + public: + // Adds a `worker` at `worker_address`. If a worker already exists at the + // address, it will be updated to the new `worker`. + // REQUIRES: worker != nullptr. + static void Add(absl::string_view worker_address, + std::shared_ptr worker); + + // Gets a local worker at `worker_address`. Returns nullptr if a worker is not + // found. + static std::shared_ptr Get( + absl::string_view worker_address); + + // Returns if there are any local workers in the process. + static bool Empty(); + + // Removes a worker at `worker_address`. It is no-op if a worker is not found + // at the address. + static void Remove(absl::string_view worker_address); + + private: + using AddressToWorkerMap = + absl::flat_hash_map>; + static mutex mu_; + static AddressToWorkerMap* local_workers_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SERVICE_WORKER_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/snapshot_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/snapshot_utils.h new file mode 100644 index 00000000..f083cbe4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/snapshot_utils.h @@ -0,0 +1,459 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DATA_SNAPSHOT_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SNAPSHOT_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/io/compression.h" +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/snapshot.pb.h" + +namespace tensorflow { + +class GraphDef; + +namespace data { + +namespace experimental { + +class SnapshotMetadataRecord; +class SnapshotTensorMetadata; + +} // namespace experimental + +namespace snapshot_util { + +constexpr char kMetadataFilename[] = "snapshot.metadata"; + +constexpr char kModeAuto[] = "auto"; +constexpr char kModeWrite[] = "write"; +constexpr char kModeRead[] = "read"; +constexpr char kModePassthrough[] = "passthrough"; +constexpr char kShardDirectorySuffix[] = ".shard"; + +enum Mode { READER = 0, WRITER = 1, PASSTHROUGH = 2 }; + +// Returns the name of the "hash" directory for the given base path and hash ID. +std::string HashDirectory(const std::string& path, uint64 hash); + +// Returns the name of the "run" directory for the given base path and run ID. +std::string RunDirectory(const std::string& hash_directory, uint64 run_id); +std::string RunDirectory(const std::string& hash_directory, + const std::string& run_id); + +// Returns the name of the "shard" directory for the given base path and shard +// ID. +std::string ShardDirectory(const std::string& run_directory, int64_t shard_id); + +// Returns the checkpoint file name for the given directory and checkpoint ID. +std::string GetCheckpointFileName(const std::string& shard_directory, + uint64 checkpoint_id); + +// This is a interface class that exposes snapshot writing functionality. +class Writer { + public: + // Creates a new writer object. + static absl::Status Create(Env* env, const std::string& filename, + const std::string& compression_type, int version, + const DataTypeVector& dtypes, + std::unique_ptr* out_writer); + + // Writes a vector of tensors to the snapshot writer file. + virtual absl::Status WriteTensors(const std::vector& tensors) = 0; + + // Flushes any in-memory buffers to disk. + virtual absl::Status Sync() = 0; + + // Closes and finalizes the snapshot file. All calls to any other method will + // be invalid after this call. + virtual absl::Status Close() = 0; + + virtual ~Writer() = default; + + protected: + virtual absl::Status Initialize(tensorflow::Env* env) = 0; +}; + +// Writes snapshots with the standard TFRecord file format. 
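// A minimal sketch of the Writer factory declared above, producing a
// TFRecord-format snapshot file like the TFRecordWriter defined below. The
// compression constant and the version value are assumptions; real callers
// derive both from the snapshot metadata.
#include <memory>
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/data/snapshot_utils.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/io/compression.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"

absl::Status WriteOneElement(const std::string& filename,
                             const std::vector<tensorflow::Tensor>& element) {
  using tensorflow::data::snapshot_util::Writer;
  tensorflow::DataTypeVector dtypes;
  for (const tensorflow::Tensor& t : element) dtypes.push_back(t.dtype());

  std::unique_ptr<Writer> writer;
  TF_RETURN_IF_ERROR(Writer::Create(tensorflow::Env::Default(), filename,
                                    tensorflow::io::compression::kSnappy,
                                    /*version=*/2, dtypes, &writer));
  TF_RETURN_IF_ERROR(writer->WriteTensors(element));  // One dataset element.
  TF_RETURN_IF_ERROR(writer->Sync());
  return writer->Close();  // Finalizes the file; the writer is unusable after.
}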
+class TFRecordWriter : public Writer { + public: + TFRecordWriter(const std::string& filename, + const std::string& compression_type); + + absl::Status Initialize(tensorflow::Env* env) override; + + absl::Status WriteTensors(const std::vector& tensors) override; + + absl::Status Sync() override; + + absl::Status Close() override; + + ~TFRecordWriter() override; + + private: + const std::string filename_; + const std::string compression_type_; + + std::unique_ptr dest_; + std::unique_ptr record_writer_; +}; + +// Writes snapshot with a custom (legacy) file format. +class CustomWriter : public Writer { + public: + static constexpr const size_t kHeaderSize = sizeof(uint64); + + static constexpr const char* const kClassName = "SnapshotWriter"; + static constexpr const char* const kWriteStringPiece = "WriteStringPiece"; + static constexpr const char* const kWriteCord = "WriteCord"; + static constexpr const char* const kSeparator = "::"; + + CustomWriter(const std::string& filename, const std::string& compression_type, + const DataTypeVector& dtypes); + + absl::Status WriteTensors(const std::vector& tensors) override; + + absl::Status Sync() override; + + absl::Status Close() override; + + ~CustomWriter() override; + + protected: + absl::Status Initialize(tensorflow::Env* env) override; + + private: + absl::Status WriteRecord(const absl::string_view& data); + +#if defined(TF_CORD_SUPPORT) + absl::Status WriteRecord(const absl::Cord& data); +#endif // TF_CORD_SUPPORT + + std::unique_ptr dest_; + const std::string filename_; + const std::string compression_type_; + const DataTypeVector dtypes_; + // We hold zlib_dest_ because we may create a ZlibOutputBuffer and put that + // in dest_ if we want compression. ZlibOutputBuffer doesn't own the original + // dest_ and so we need somewhere to store the original one. + std::unique_ptr zlib_underlying_dest_; + std::vector simple_tensor_mask_; // true for simple, false for complex. + int num_simple_ = 0; + int num_complex_ = 0; +}; + +// Interface class for reading snapshot files previous written with Writer. +class Reader { + public: + // Op kernel that creates an instance of `Reader::Dataset` needed to support + // serialization and deserialization of `Reader::Dataset`. + class DatasetOp : public DatasetOpKernel { + public: + explicit DatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; + std::string compression_; + int64_t version_; + }; + + // Op kernel that creates an instance of `Reader::NestedDataset` needed to + // support serialization and deserialization of `Reader::NestedDataset`. + class NestedDatasetOp : public DatasetOpKernel { + public: + explicit NestedDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; + }; + + // Creates a new Reader object that reads data from `filename`. Note that + // the `version`, `compression_type`, and `dtypes` arguments passed into + // `Writer` and `Reader` must be the same for the reading to succeed. + static absl::Status Create(Env* env, const std::string& filename, + const string& compression_type, int version, + const DataTypeVector& dtypes, + std::unique_ptr* out_reader); + + // Returns a nested dataset for a set of given snapshot file names. 
+ // + // This function takes a vector of snapshot files, and returns a nested + // dataset. Each element within the nested dataset is itself a dataset, and + // contains all the elements written out to each individual snapshot file. + static absl::Status MakeNestedDataset( + Env* env, const std::vector& shard_dirs, + const string& compression_type, int version, const DataTypeVector& dtypes, + const std::vector& shapes, int64_t start_index, + DatasetBase** output); + + // Returns a nested dataset for the given datasets. + static void MakeNestedDataset(const std::vector& datasets, + DatasetBase** output); + + // Reads a vector of Tensors from the snapshot file. + virtual absl::Status ReadTensors(std::vector* read_tensors) = 0; + + // Skips `num_records`. Equivalent to calling `ReadTensors` `num_records` + // times then discarding the results. + virtual absl::Status SkipRecords(int64_t num_records); + + virtual ~Reader() = default; + + protected: + virtual absl::Status Initialize(Env* env) = 0; + + class Dataset; + class NestedDataset; +}; + +class TFRecordReaderImpl { + public: + // Constructs a `TFRecordReaderImpl`. + // `filename` is the file to read from. + // `compression_type` is the compression method, as defined in + // tensorflow/compiler/xla/tsl/lib/io/compression.h. + // `output_buffer_size` specifies the buffer size required by Snappy/Zlib + // compression algorithms. Ignored if compression is not enabled. + TFRecordReaderImpl(const std::string& filename, const string& compression, + std::optional output_buffer_size = std::nullopt); + + // Initializes the reader. Callers must initialize the reader before calling + // `GetNext` or `GetTensors`. + absl::Status Initialize(Env* env); + + // Reads the next Tensor in the input file. + absl::StatusOr GetNext(); + + // Reads all Tensors in the input file. + absl::StatusOr> GetTensors(); + + // Returns the number of bytes read. + uint64_t BytesRead() const { return bytes_read_; } + + private: + // Parses `record` into a Tensor. + absl::StatusOr Parse(const tstring& record); + + std::string filename_; + std::unique_ptr file_; + std::unique_ptr record_reader_; + uint64_t offset_ = 0; + uint64_t bytes_read_ = 0; + + const string compression_; + const std::optional output_buffer_size_; +}; + +// Reads snapshots previously written with `TFRecordWriter`. +class TFRecordReader : public Reader { + public: + TFRecordReader(const std::string& filename, const string& compression, + const DataTypeVector& dtypes, + std::optional output_buffer_size = std::nullopt) + : reader_impl_(filename, compression, output_buffer_size), + dtypes_(dtypes) {} + + // Initializes the reader. Callers must initialize the reader before calling + // `ReadTensors`. + absl::Status Initialize(Env* env) override { + return reader_impl_.Initialize(env); + } + + // Reads Tensors into `read_tensors`. Returns OK on success, OutOfRange for + // end of file, or an error status if there is an error. + absl::Status ReadTensors(std::vector* read_tensors) override; + + // Returns the number of bytes read. + uint64_t BytesRead() const { return reader_impl_.BytesRead(); } + + private: + TFRecordReaderImpl reader_impl_; + const DataTypeVector dtypes_; +}; + +// Reads snapshots previously written with `CustomWriter`. +class CustomReader : public Reader { + public: + // The reader input buffer size is deliberately large because the input reader + // will throw an error if the compressed block length cannot fit in the input + // buffer. 
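// A hedged sketch of reading a shard back with the TFRecordReaderImpl declared
// above; it assumes the file was written by TFRecordWriter with the same
// compression, and treats OutOfRange as end-of-file, mirroring the
// ReadTensors() contract documented for TFRecordReader.
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/data/snapshot_utils.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"

absl::Status ReadAllRecords(const std::string& filename,
                            const std::string& compression,
                            std::vector<tensorflow::Tensor>* out) {
  tensorflow::data::snapshot_util::TFRecordReaderImpl reader(filename,
                                                             compression);
  TF_RETURN_IF_ERROR(reader.Initialize(tensorflow::Env::Default()));
  while (true) {
    auto tensor = reader.GetNext();
    if (absl::IsOutOfRange(tensor.status())) return absl::OkStatus();  // EOF.
    if (!tensor.ok()) return tensor.status();
    out->push_back(*std::move(tensor));
  }
}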
+ static constexpr const int64_t kSnappyReaderInputBufferSizeBytes = + 1 << 30; // 1 GiB + // TODO(b/148804377): Set this in a smarter fashion. + static constexpr const int64_t kSnappyReaderOutputBufferSizeBytes = + 32 << 20; // 32 MiB + static constexpr const size_t kHeaderSize = sizeof(uint64); + + static constexpr const char* const kClassName = "SnapshotReader"; + static constexpr const char* const kReadString = "ReadString"; + static constexpr const char* const kReadCord = "ReadCord"; + static constexpr const char* const kSeparator = "::"; + + CustomReader(const std::string& filename, const string& compression_type, + int version, const DataTypeVector& dtypes); + + absl::Status ReadTensors(std::vector* read_tensors) override; + + ~CustomReader() override = default; + + protected: + absl::Status Initialize(Env* env) override; + + private: + absl::Status ReadTensorsV0(std::vector* read_tensors); + + absl::Status SnappyUncompress( + const experimental::SnapshotTensorMetadata* metadata, + std::vector* simple_tensors, + std::vector, size_t>>* + tensor_proto_strs); + + absl::Status ReadRecord(tstring* record); + +#if defined(TF_CORD_SUPPORT) + absl::Status ReadRecord(absl::Cord* record); +#endif + + std::string filename_; + std::unique_ptr file_; + std::unique_ptr input_stream_; + const string compression_type_; + const int version_; + const DataTypeVector dtypes_; + int num_simple_ = 0; + int num_complex_ = 0; + std::vector simple_tensor_mask_; // true for simple, false for complex. +}; + +// Writes snapshot metadata to the given directory. +absl::Status WriteMetadataFile( + Env* env, const string& dir, + const experimental::SnapshotMetadataRecord* metadata); + +// Writes distributed snapshot metadata to the given directory. An error is +// returned if `dir` is unable to be created or if `metadata` is unable to be +// written. +absl::Status WriteMetadataFile( + Env* env, const string& dir, + const experimental::DistributedSnapshotMetadata* metadata); + +// Reads snapshot metadata from the given directory. +absl::Status ReadMetadataFile(Env* env, const string& dir, + experimental::SnapshotMetadataRecord* metadata, + bool* file_exists); + +// Reads distributed snapshot metadata from the given directory. If the file +// doesn't exist in `dir`, `file_exists` is set to true and an ok status is +// returned. If the file exists in `dir` but is unable to be opened, an error +// is returned. +absl::Status ReadMetadataFile( + Env* env, const string& dir, + experimental::DistributedSnapshotMetadata* metadata, bool* file_exists); + +// Writes a dataset graph to the given directory. +absl::Status DumpDatasetGraph(Env* env, const std::string& path, uint64 hash, + const GraphDef* graph); + +absl::Status DetermineOpState( + const std::string& mode_string, bool file_exists, + const experimental::SnapshotMetadataRecord* metadata, + uint64 pending_snapshot_expiry_seconds, Mode* mode); + +// Represents a dataset element or EOF. +struct ElementOrEOF { + std::vector value; + bool end_of_sequence = false; +}; + +// AsyncWriter provides API for asynchronously writing dataset elements +// (each represented as a vector of tensors) to a file. +// +// The expected use of this API is: +// +// std::unique_ptr writer = absl_make_unique(...); +// +// while (data_available()) { +// std::vector data = read_data() +// writer->Write(data); +// } +// writer->SignalEOF(); +// writer = nullptr; // This will block until writes are flushed. 
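// Expanding the comment sketch above with the constructor arguments and the
// completion callback of AsyncWriter (declared below). The callback signature,
// the compression constant, and the version value are reconstructions and
// should be treated as assumptions.
#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/data/snapshot_utils.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/io/compression.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"

void AsyncWriterSketch(
    const std::string& shard_directory,
    const tensorflow::DataTypeVector& output_types,
    const std::vector<std::vector<tensorflow::Tensor>>& elements) {
  using tensorflow::data::snapshot_util::AsyncWriter;
  auto writer = std::make_unique<AsyncWriter>(
      tensorflow::Env::Default(), /*file_index=*/0, shard_directory,
      /*checkpoint_id=*/0, tensorflow::io::compression::kSnappy,
      /*version=*/2, output_types,
      /*done=*/[](absl::Status s) {  // Runs once the writer thread finishes.
        if (!s.ok()) LOG(ERROR) << "Snapshot write failed: " << s;
      });
  for (const auto& element : elements) writer->Write(element);  // Non-blocking.
  writer->SignalEOF();
  writer = nullptr;  // Blocks until buffered elements are flushed to disk.
}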
+class AsyncWriter { + public: + explicit AsyncWriter(Env* env, int64_t file_index, + const std::string& shard_directory, uint64 checkpoint_id, + const std::string& compression, int64_t version, + const DataTypeVector& output_types, + std::function done); + + // Writes the given tensors. The method is non-blocking and returns without + // waiting for the element to be written. + void Write(const std::vector& tensors) TF_LOCKS_EXCLUDED(mu_); + + // Signals the end of input. The method is non-blocking and returns without + // waiting for the writer to be closed. + void SignalEOF() TF_LOCKS_EXCLUDED(mu_); + + private: + void Consume(ElementOrEOF* be) TF_LOCKS_EXCLUDED(mu_); + bool ElementAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + absl::Status WriterThread(Env* env, const std::string& shard_directory, + uint64 checkpoint_id, + const std::string& compression, int64_t version, + DataTypeVector output_types); + + mutex mu_; + std::deque deque_ TF_GUARDED_BY(mu_); + + // This has to be last. During destruction, we need to make sure that the + // Thread object is destroyed first as its destructor blocks on thread + // completion. If there are other member variables after this, they may get + // destroyed first before the thread finishes, potentially causing the + // thread to access invalid memory. + std::unique_ptr thread_; +}; + +} // namespace snapshot_util +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SNAPSHOT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/split_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/split_utils.h new file mode 100644 index 00000000..a0fdef8d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/split_utils.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SPLIT_UTILS_H_ +#define TENSORFLOW_CORE_DATA_SPLIT_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { + +// A class which produces splits for a dataset of size N that can be indexed +// into. 
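// A short sketch of the IndexSplitProvider declared below: for n == 3 it hands
// out index splits 0, 1, 2 and then sets end_of_splits. This illustrates the
// declared contract; the helper name is ours, not part of the vendored header.
#include "absl/status/status.h"
#include "tensorflow/core/data/split_utils.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/errors.h"

absl::Status EnumerateSplits() {
  tensorflow::data::IndexSplitProvider provider(/*n=*/3);
  while (true) {
    tensorflow::Tensor split;
    bool end_of_splits = false;
    TF_RETURN_IF_ERROR(provider.GetNext(&split, &end_of_splits));
    if (end_of_splits) break;
    // `split` holds the next index in [0, n).
  }
  return provider.Reset();  // Rewinds so the splits can be handed out again.
}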
+class IndexSplitProvider : public SplitProvider { + public: + explicit IndexSplitProvider(int64_t n); + absl::Status GetNext(Tensor* split, bool* end_of_splits) override; + absl::Status Reset() override; + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override; + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override; + int64_t Cardinality() const override; + + private: + tsl::mutex mu_; + int64_t i_ TF_GUARDED_BY(mu_); + const int64_t n_; +}; + +// A SplitProvider which wraps another split provider, but drops all splits +// where `index != shard_index % num_shards` +class ShardingSplitProvider : public SplitProvider { + public: + ShardingSplitProvider(int64_t num_shards, int64_t shard_index, + std::shared_ptr split_provider); + + absl::Status GetNext(Tensor* split, bool* end_of_splits) override; + absl::Status Reset() override; + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override; + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override; + + private: + const int64_t num_shards_; + const int64_t shard_index_; + tsl::mutex mu_; + std::shared_ptr split_provider_ TF_GUARDED_BY(mu_); + int64_t num_to_skip_ TF_GUARDED_BY(mu_); +}; + +// Returns split providers for all sources of the given dataset. +absl::StatusOr>> GetSplitProviders( + const DatasetBase* dataset); + +// Gets the single split provider from the context, or returns an error if the +// context has zero or multiple split providers. The `dataset` argument is used +// to produce a more useful error message. +absl::StatusOr> GetSingleSplitProvider( + IteratorContext* ctx, const DatasetBase* dataset); + +// Creates iterator contexts for datasets inputs. The split providers +// in `ctx` will be divided among the inputs of `dataset`, so that each input +// gets a number of split providers that matches its number of source datasets. +// If no split providers are defined, the contexts will be the same as `ctx`. +absl::StatusOr> CreateInputIteratorContexts( + IteratorContext* ctx, const DatasetBase* dataset); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_SPLIT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/standalone.h b/third_party/tflite-hdrs/tensorflow/core/data/standalone.h new file mode 100644 index 00000000..5b2b2b2c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/standalone.h @@ -0,0 +1,163 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_STANDALONE_H_ +#define TENSORFLOW_CORE_DATA_STANDALONE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/data/tfdataz_metrics.h" +#include "tensorflow/core/data/unbounded_thread_pool.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/public/session_options.h" +#include "tsl/platform/status.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace data { +namespace standalone { + +// The purpose of the API in this file is to facilitate standalone execution of +// a tf.data input pipeline graph. +// +// The API exposes two abstractions -- a `Dataset` and an `Iterator` -- which +// encapsulate TensorFlow runtime. +// +// The `Dataset` abstraction represents an input pipeline as a collection +// of data sources and a logical plan of transformations that operate over the +// data. +// +// The `Iterator` abstraction represents an execution of an input pipeline that +// can be used to enumerate its elements. +// +// Example usage: +// +// // Create a `Dataset` by running the `graph_def` graph. +// tensorflow::data:standalone::Dataset::Params params; +// std::unique_ptr dataset; +// Status s = tensorflow::data::standalone::Dataset::FromGraph( +// params, graph_def, &dataset); +// if (!s.ok()) { /* error handling */ } +// +// std::unique_ptr iterator; +// s = dataset->MakeIterator(&iterator); +// if (!s.ok()) { /* error handling */ } +// +// bool end_of_input = false; +// while (!end_of_input) { +// std::vector outputs; +// s = iterator->GetNext(&outputs, &end_of_input); +// if (!s.ok()) { /* error handling */ } +// if (!end_of_input) { /* output handling */ } +// } + +class Dataset; + +// Represents an execution of an input pipeline that can be used to enumerate +// its elements. +class Iterator { + public: + virtual ~Iterator(); + + // Returns the next element of the input pipeline (if there is one) and an + // indication of whether the end of the input pipeline has been reached. + absl::Status GetNext(std::vector* outputs, bool* end_of_input); + + // Saves a checkpoint of the iterator. Returns Tensors that can be called with + // `Restore()`. + absl::StatusOr> Save(); + + // Restores the iterator from a checkpoint. `saved_iterator` is the serialized + // iterator saved by calling `Save()`. + absl::Status Restore(const std::vector& saved_iterator); + + // Returns the dataset model for performance analysis. + std::shared_ptr model() const; + + private: + friend class Dataset; + + Iterator(IteratorBase* iterator, IteratorContext* ctx, + SerializationContext* serialization_ctx); + + std::unique_ptr iterator_; + std::unique_ptr ctx_; + std::unique_ptr serialization_ctx_; + std::shared_ptr tf_dataz_metrics_collector_; +}; + +// Represents an input pipeline as a collection of data sources and a logical +// plan of transformations that operate over the data. +class Dataset { + public: + // Parameters for `Dataset` creation (e.g. TensorFlow runtime configuration). 
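// A hedged sketch combining the Params struct defined below with the
// Save()/Restore() checkpointing methods of Iterator declared above. The
// intra-op threading knob on SessionOptions::config is an illustrative
// assumption, not something this header requires.
#include <memory>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/data/standalone.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/errors.h"

absl::Status RunWithCheckpoint(const tensorflow::GraphDef& graph_def) {
  namespace standalone = tensorflow::data::standalone;

  standalone::Dataset::Params params;
  params.session_options.config.set_intra_op_parallelism_threads(4);

  std::unique_ptr<standalone::Dataset> dataset;
  TF_RETURN_IF_ERROR(
      standalone::Dataset::FromGraph(params, graph_def, &dataset));

  std::unique_ptr<standalone::Iterator> iterator;
  TF_RETURN_IF_ERROR(dataset->MakeIterator(&iterator));

  std::vector<tensorflow::Tensor> outputs;
  bool end_of_input = false;
  TF_RETURN_IF_ERROR(iterator->GetNext(&outputs, &end_of_input));

  // Checkpoint the iterator state, then restore it into a fresh iterator.
  auto checkpoint = iterator->Save();
  if (!checkpoint.ok()) return checkpoint.status();
  TF_RETURN_IF_ERROR(dataset->MakeIterator(&iterator));
  return iterator->Restore(*checkpoint);
}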
+ struct Params { + SessionOptions session_options; + }; + + // Creates a new `Dataset` instance by running the given dataset graph. + static absl::Status FromGraph(Params params, const GraphDef& graph_def, + std::unique_ptr* result); + + ~Dataset(); + + // Creates an iterator for this dataset. + absl::Status MakeIterator(std::unique_ptr* result); + // Creates an iterator, optionally with a split provider. + absl::Status MakeIterator( + std::vector> split_providers, + std::unique_ptr* result); + + // Creates split providers for this dataset. + absl::Status MakeSplitProviders( + std::vector>* result); + // Returns a pointer to the underlying dataset. + const DatasetBase* Get() const; + + private: + Dataset(DatasetBase* finalized_dataset, DatasetBase* original_dataset, + DeviceMgr* device_mgr, ProcessFunctionLibraryRuntime* pflr, + FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool, + std::function)> runner); + + DatasetBase* finalized_dataset_; // owned + DatasetBase* original_dataset_; // owned + std::unique_ptr device_mgr_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + std::unique_ptr interop_threadpool_; + std::unique_ptr function_handle_cache_; + std::function)> runner_; + ResourceMgr resource_mgr_; + CancellationManager cancellation_manager_; + UnboundedThreadPool unbounded_thread_pool_; +}; + +} // namespace standalone +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_STANDALONE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/stats_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/stats_utils.h new file mode 100644 index 00000000..5fa1eae3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/stats_utils.h @@ -0,0 +1,68 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_STATS_UTILS_H_ +#define TENSORFLOW_CORE_DATA_STATS_UTILS_H_ + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace data { +namespace stats_utils { +extern const char kDelimiter[]; +extern const char kExecutionTime[]; +extern const char kThreadUtilization[]; +extern const char kBufferSize[]; +extern const char kBufferCapacity[]; +extern const char kBufferUtilization[]; +extern const char kFilteredElements[]; +extern const char kDroppedElements[]; +extern const char kFeaturesCount[]; +extern const char kFeatureValuesCount[]; +extern const char kExamplesCount[]; + +// Name for tf.data function execution time (in ns) histogram metrics. +string ExecutionTimeHistogramName(const string& prefix); + +// Name for thread utilization (ratio of threads being used and maximum number +// of threads allocated) scalar metrics. +string ThreadUtilizationScalarName(const string& prefix); + +// Name for buffer size scalar metrics. +string BufferSizeScalarName(const string& prefix); + +// Name for buffer capacity (maximum allocated buffer size) scalar metrics. 
+string BufferCapacityScalarName(const string& prefix); + +// Name for buffer utilization (ratio of buffer size and maximum allocated +// buffer size.) histogram metrics. +string BufferUtilizationHistogramName(const string& prefix); + +// Name for filtered elements scalar metrics. +string FilterdElementsScalarName(const string& prefix); + +// Name for dropped elements scalar mereics. +string DroppedElementsScalarName(const string& prefix); + +// Name for features count histogram metrics. +string FeatureHistogramName(const string& prefix); + +// Name for feature-values count histogram metrics. +string FeatureValueHistogramName(const string& prefix); + +} // namespace stats_utils +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_STATS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/test_utils.h b/third_party/tflite-hdrs/tensorflow/core/data/test_utils.h new file mode 100644 index 00000000..61da1807 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/test_utils.h @@ -0,0 +1,54 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_TEST_UTILS_H_ +#define TENSORFLOW_CORE_DATA_TEST_UTILS_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace data { + +class TestContext { + public: + static absl::StatusOr> Create(); + virtual ~TestContext() = default; + + OpKernelContext* op_ctx() const { return op_ctx_.get(); } + IteratorContext* iter_ctx() const { return iter_ctx_.get(); } + + private: + TestContext() = default; + + std::unique_ptr device_mgr_; + std::unique_ptr lib_def_; + std::unique_ptr pflr_; + std::function)> runner_; + OpKernelContext::Params params_; + std::unique_ptr op_ctx_; + std::unique_ptr iter_ctx_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/tf_data_memory_logger.h b/third_party/tflite-hdrs/tensorflow/core/data/tf_data_memory_logger.h new file mode 100644 index 00000000..7978fefc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/tf_data_memory_logger.h @@ -0,0 +1,27 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_TF_DATA_MEMORY_LOGGER_H_ +#define TENSORFLOW_CORE_DATA_TF_DATA_MEMORY_LOGGER_H_ + +namespace tensorflow { +namespace data { + +// Starts the iterator memory logger if it is not already started. The logger is +// only active at VLOG level 4. +void EnsureIteratorMemoryLoggerStarted(); +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_TF_DATA_MEMORY_LOGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/tfdataz_metrics.h b/third_party/tflite-hdrs/tensorflow/core/data/tfdataz_metrics.h new file mode 100644 index 00000000..e37daf89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/tfdataz_metrics.h @@ -0,0 +1,150 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_TFDATAZ_METRICS_H_ +#define TENSORFLOW_CORE_DATA_TFDATAZ_METRICS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/time/time.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace data { + +// Calculates the approximate average latency for past 1, 5 and 60 minutes. +// The implementation uses ring buffers to maintain the cumulative latency +// values and count for the past 60 minutes. +class ApproximateLatencyEstimator { + public: + enum class Duration { + kMinute = 1, + kFiveMinutes = 5, + kSixtyMinutes = 60, + }; + + explicit ApproximateLatencyEstimator(const Env& env); + + // Records the latency with the current timestamp. + void AddLatency(int64_t latency_usec); + + // Returns the average latency for the duration (1,5 and 60 minutes) + // specified. + absl::Duration GetAverageLatency(Duration duration); + + private: + static constexpr int64_t kSecondsPerMinute = 60; + static constexpr int64_t kMinutesPerHour = 60; + static constexpr int64_t kSlots = kMinutesPerHour; + + // Updates the latency value and count ring buffers with the latest cumulative + // value and count. Resets the entire ring buffer with the last cumulative + // values stored if the elapsed time duration is greater than 60 minutes. 
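// A small sketch of the public surface declared above: record per-call
// latencies and query the rolling averages. Passing *Env::Default() to satisfy
// the const Env& constructor parameter is an assumption about typical usage.
#include <cstdint>

#include "absl/time/time.h"
#include "tensorflow/core/data/tfdataz_metrics.h"
#include "tensorflow/core/platform/env.h"

void LatencyEstimatorSketch() {
  using tensorflow::data::ApproximateLatencyEstimator;
  ApproximateLatencyEstimator estimator(*tensorflow::Env::Default());

  estimator.AddLatency(/*latency_usec=*/1200);  // e.g. one GetNext() call
  estimator.AddLatency(/*latency_usec=*/800);

  absl::Duration last_minute = estimator.GetAverageLatency(
      ApproximateLatencyEstimator::Duration::kMinute);
  (void)last_minute;  // TfDatazMetricsCollector exports these to /tfdataz.
}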
+ void UpdateRingBuffer() TF_LOCKS_EXCLUDED(mu_); + // Moves the `next_slot_` to the next index in the ring buffer. + void IncrementNextSlot() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Returns the slot index which is behind the current slot in ring buffer by + // `steps` indices. + int PrevSlot(int steps) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + const Env& env_; + + // The time when the ring buffer was last updated. + int64_t last_updated_time_mins_ TF_GUARDED_BY(mu_); + + mutex mu_; + + // Counters storing the cumulative sums of latency values and counts recorded + // so far. + int64_t latency_value_counter_ TF_GUARDED_BY(mu_); + int64_t latency_count_counter_ TF_GUARDED_BY(mu_); + + // Next slot in the ring buffer. + int next_slot_ TF_GUARDED_BY(mu_); + + // Ring buffer storing the cumulative sum of latency values and counts for the + // last 60 minutes. + int64_t latency_value_[kSlots] TF_GUARDED_BY(mu_); + int64_t latency_count_[kSlots] TF_GUARDED_BY(mu_); +}; + +// Collects and exports the tf.data performance metrics to /tfdataz. +class TfDatazMetricsCollector { + public: + // Constructs a `TfDatazMetricsCollector`. + // We only collect metrics for CPU devices. This is a heuristic to avoid + // collecting metrics for device-side iterators created by the multi-device + // iterator mechanism. + TfDatazMetricsCollector(const Env& env, DatasetBaseIterator* iterator, + std::shared_ptr model); + + // Records `GetNext` call latency. + void RecordGetNextLatency(int64_t get_next_latency_usec); + + // Returns the average `GetNext` latency for past 1 minute. + absl::Duration GetAverageLatencyForLastOneMinute(); + + // Returns the average `GetNext` latency for past 5 minutes. + absl::Duration GetAverageLatencyForLastFiveMinutes(); + + // Returns the average `GetNext` latency for past 60 minutes. + absl::Duration GetAverageLatencyForLastSixtyMinutes(); + + // Returns the dataset name if one was set. + std::optional DatasetName(); + + // Returns the total memory (in bytes) used by the iterator. + // Total memory used by the iterator includes the total number of bytes + // buffered in all nodes in the subtree. + int64_t GetIteratorTotalMemoryUsage(); + + std::shared_ptr GetModel(); + + private: + DatasetBaseIterator* iterator_; // not owned + std::shared_ptr model_; + ApproximateLatencyEstimator latency_estimator_; +}; + +// Thread-safe global registry for the /tfdataz metrics. All callers to +// `TfDatazMetricsRegistry` use the same instance to register and deregister +// iterator's `TfDatazMetricsCollector`. +class TfDatazMetricsRegistry { + public: + // Registers the iterator specific `TfDatazMetricsCollector` in the global + // TfDatazMetricsRegistry. + static void Register(std::shared_ptr collector); + + // Deregisters the iterator specific `TfDatazMetricsCollector` from the global + // TfDatazMetricsRegistry. + static void Deregister(std::shared_ptr collector); + + // Returns all the registered `TfDatazMetricsCollector`s. + static absl::flat_hash_set> + GetIteratorMetricCollectors(); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_TFDATAZ_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/unbounded_thread_pool.h b/third_party/tflite-hdrs/tensorflow/core/data/unbounded_thread_pool.h new file mode 100644 index 00000000..f790c938 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/unbounded_thread_pool.h @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_UNBOUNDED_THREAD_POOL_H_ +#define TENSORFLOW_CORE_DATA_UNBOUNDED_THREAD_POOL_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/thread_factory.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool_interface.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" + +namespace tensorflow { +namespace data { + +// An `UnboundedThreadPool` provides a mechanism for temporally multiplexing a +// potentially large number of "logical" threads onto a smaller number of +// "physical" threads. The multiplexing is achieved by using an +// `UnboundedWorkQueue`. +class UnboundedThreadPool : public thread::ThreadPoolInterface { + public: + UnboundedThreadPool(Env* env, const string& thread_name) + : unbounded_work_queue_(env, thread_name) {} + UnboundedThreadPool(Env* env, const string& thread_name, + const ThreadOptions& thread_options) + : unbounded_work_queue_(env, thread_name, thread_options) {} + ~UnboundedThreadPool() override = default; + + // Returns an implementation of `ThreadFactory` that can be used to create + // logical threads in this pool. + std::shared_ptr get_thread_factory(); + + void Schedule(std::function fn) override; + int NumThreads() const override; + int CurrentThreadId() const override; + + private: + class LogicalThreadFactory; + class LogicalThreadWrapper; + + void ScheduleOnWorkQueue(std::function fn, + std::shared_ptr done); + + UnboundedWorkQueue unbounded_work_queue_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_UNBOUNDED_THREAD_POOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/data/utils.h b/third_party/tflite-hdrs/tensorflow/core/data/utils.h new file mode 100644 index 00000000..64b8e6f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/data/utils.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_UTILS_H_ +#define TENSORFLOW_CORE_DATA_UTILS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/protobuf/data_service.pb.h" + +namespace tensorflow { +namespace data { + +// Records latency of fetching data from tf.data iterator. +void AddLatencySample(int64_t microseconds); + +// Records bytes produced by a tf.data iterator. +void IncrementThroughput(int64_t bytes); + +// Returns a modified file name that can be used to do implementation specific +// file name manipulation/optimization. +std::string TranslateFileName(const std::string& fname); + +// Returns the data transfer protocol to use if one is not specified by the +// user. +std::string DefaultDataTransferProtocol(); + +// Returns a path pointing to the same file as `path` with a potential locality +// optimization. +std::string LocalityOptimizedPath(const std::string& path); + +// Returns `true` if tf.data service compression should be disabled at runtime +// based on (1) the inputs or (2) the properties of the calling trainer. +absl::StatusOr DisableCompressionAtRuntime( + const std::string& data_transfer_protocol, DeploymentMode deployment_mode, + DataServiceMetadata::Compression compression); + +// Log filenames into TfDataLogger. Uses the same TfDataFileLoggerClient at +// every call. Thread safe. +// TODO (shushanik) Implement streamz error reporting in case the logging is not +// successful +void LogFilenames(const std::vector& files); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DATA_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debug_callback_registry.h b/third_party/tflite-hdrs/tensorflow/core/debug/debug_callback_registry.h new file mode 100644 index 00000000..94b57401 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debug_callback_registry.h @@ -0,0 +1,71 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/debug/debug_node_key.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// Supports exporting observed debug events to clients using registered +// callbacks. Users can register a callback for each debug_url stored using +// DebugTensorWatch. The callback key be equivalent to what follows +// "memcbk:///". +// +// All events generated for a watched node will be sent to the call back in the +// order that they are observed. +// +// This callback router should not be used in production or training steps. 
It +// is optimized for deep inspection of graph state rather than performance. +class DebugCallbackRegistry { + public: + using EventCallback = std::function; + + // Provides singleton access to the in memory event store. + static DebugCallbackRegistry* singleton(); + + // Returns the registered callback, or nullptr, for key. + EventCallback* GetCallback(const string& key); + + // Associates callback with key. This must be called by clients observing + // nodes to be exported by this callback router before running a session. + void RegisterCallback(const string& key, EventCallback callback); + + // Removes the callback associated with key. + void UnregisterCallback(const string& key); + + private: + DebugCallbackRegistry(); + + // Mutex to ensure that keyed events are never updated in parallel. + mutex mu_; + + // Maps debug_url keys to callbacks for routing observed tensors. + std::map keyed_callback_ TF_GUARDED_BY(mu_); + + static DebugCallbackRegistry* instance_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debug_graph_utils.h b/third_party/tflite-hdrs/tensorflow/core/debug/debug_graph_utils.h new file mode 100644 index 00000000..27cfb357 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debug_graph_utils.h @@ -0,0 +1,124 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/debugger_state_interface.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/protobuf/debug.pb.h" + +namespace tensorflow { + +class DebugNodeInserter { + public: + // EXPERIMENTAL: Insert special debug ops (e.g., DebugIdentity) to graph for + // debugging. Currently, such ops need to take exactly one input and has the + // string attribute "tensor_name" to indicate what tensor it watches. + // For example, before the node insertion, the graph may look like: + // + // A:0 -----------1----------> B + // | + // ---------2-----------> C + // + // wherein the output slot 0 of node A feeds as the input to nodes B through + // edge 1 and to node C through edge 2. 
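The DebugCallbackRegistry interface above amounts to a mutex-guarded, process-wide map from debug_url keys to callbacks. A minimal standalone sketch of that pattern follows; the callback signature and class name are simplified stand-ins rather than the actual TensorFlow types.

#include <functional>
#include <map>
#include <mutex>
#include <string>
#include <utility>

class CallbackRegistrySketch {
 public:
  // Simplified payload: the real registry passes a DebugNodeKey and a Tensor.
  using EventCallback = std::function<void(const std::string& tensor_dump)>;

  // Process-wide singleton access, mirroring the singleton() accessor above.
  static CallbackRegistrySketch* singleton() {
    static CallbackRegistrySketch* instance = new CallbackRegistrySketch();
    return instance;
  }

  // Returns the registered callback for `key`, or nullptr if none exists.
  // (Like the original interface, the returned pointer stays valid only while
  // the entry is not unregistered.)
  EventCallback* GetCallback(const std::string& key) {
    std::lock_guard<std::mutex> lock(mu_);
    auto it = keyed_callback_.find(key);
    return it == keyed_callback_.end() ? nullptr : &it->second;
  }

  // Associates `callback` with `key`; intended to happen before events flow.
  void RegisterCallback(const std::string& key, EventCallback callback) {
    std::lock_guard<std::mutex> lock(mu_);
    keyed_callback_[key] = std::move(callback);
  }

  void UnregisterCallback(const std::string& key) {
    std::lock_guard<std::mutex> lock(mu_);
    keyed_callback_.erase(key);
  }

 private:
  CallbackRegistrySketch() = default;

  std::mutex mu_;
  std::map<std::string, EventCallback> keyed_callback_;
};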
+ // After the node insertion, assuming both B and C have non-Ref input, the + // graph becomes: + // A:0 ---3---> Copy -----------4----------> B + // | + // ---------5--------> C + // | + // ---------6--------> X + // + // If a node (e.g., B) has Ref input, the graph becomes: + // + // --------------------------------> B + // | + // A:0 ---3-----> Copy -----------4----------> C + // | + // -----------5--------> X + // + // In other words, we do not feed Refs to deep-copies to downstream nodes. + // + // Copy is the inserted deep-copy node that copies the input tensor on-device + // (e.g., CPU-to-CPU or GPU-to-GPU deep copy) that reduces the likelihood of + // racy updates during the debug watches. X is the newly created debug node + // that transforms the input (copy of the watched tensor) into a debug signal. + // + // DebugIdentity is the simplest debugging paradigm, in which the debug signal + // (i.e., X:0) equals the tensor itself. More sophisticated debug ops can be + // used to transform the tensor into other debug signals. An example is the + // DebugNanCounter op. + // + // If the nodes (A, B and C) are located on GPU and the edges from A to B or C + // is HOST_MEMORY, then the CopyHost op will be used instead of the Copy op. + static absl::Status InsertNodes( + const protobuf::RepeatedPtrField& watches, Graph* graph, + Device* device); + + // Set the parallel_iterations attribute of TensorFlow while loops + // (specifically the nodes for which IsEnter() returns true) to 1 to prevent + // any node from being executed multiple times concurrently and + // generating temporally-overlapping debug Tensor dumps. + static void DeparallelizeWhileLoops(Graph* graph, Device* device); + + // Get canonical name of a copy node. + static const string GetCopyNodeName(const string& node_name, + const int output_slot); + + // Get canonical name of a debug node. + static const string GetDebugNodeName(const string& tensor_name, + const int debug_op_num, + const string& debug_op_name); + + private: + static absl::Status CreateCopyNode( + Graph* graph, const DeviceType device_type, const bool is_host_memory, + const string& src_node_name, const int src_output, const DataType src_dt, + const string& tensor_name, const std::vector& debug_ops, + const std::vector& debug_urls, Node** copy_node); + + // Parse the debug_op_name string to extract proper op name and attributes. + // debug_op_name can be the proper op name only, e.g., "DebugNumericSummary". + // It can also contain customizable keys and values. Each key-value pair is + // connected with an equal sign ("="). Multiple key-value pairs are separated + // with semicolons (";"), which optional whitespace in between, e.g., + // "DebugNumericSummary(mute_if_healthy=true, lower_bound=-100.0)". + static absl::Status ParseDebugOpName( + const string& debug_op_name, string* debug_op_name_proper, + std::unordered_map* attributes); + + static absl::Status SetDebugNodeAttributes( + Node* debug_node, const std::unordered_map& attributes); + + static absl::Status CreateDebugNode( + Graph* graph, const Device& device, const string& src_copy_node_name, + const DataType src_dt, const string& tensor_name, + const std::vector& debug_urls, const int debug_op_num, + const string& debug_op_name, Node** debug_node); + // TODO(cais): Cut down the number of args to this method. 
+ + friend class DebugGraphUtilsTest; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debug_grpc_testlib.h b/third_party/tflite-hdrs/tensorflow/core/debug/debug_grpc_testlib.h new file mode 100644 index 00000000..2a57df8d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debug_grpc_testlib.h @@ -0,0 +1,87 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_ + +#include +#include + +#include "grpcpp/grpcpp.h" +#include "tensorflow/core/debug/debug_io_utils.h" +#include "tensorflow/core/debug/debug_service.grpc.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +namespace test { + +class TestEventListenerImpl final : public grpc::EventListener::Service { + public: + TestEventListenerImpl() : stop_requested_(false), stopped_(false) {} + + void RunServer(const int server_port); + void StopServer(); + + ::grpc::Status SendEvents( + ::grpc::ServerContext* context, + ::grpc::ServerReaderWriter< ::tensorflow::EventReply, + ::tensorflow::Event>* stream) override; + + // Clear debug data (e.g., Tensors) received so far. + void ClearReceivedDebugData(); + + void RequestDebugOpStateChangeAtNextStream( + const EventReply::DebugOpStateChange::State new_state, + const DebugNodeKey& debug_node_key); + + std::vector debug_metadata_strings; + std::vector encoded_graph_defs; + std::vector device_names; + std::vector node_names; + std::vector output_slots; + std::vector debug_ops; + std::vector debug_tensors; + + private: + std::atomic_bool stop_requested_; + std::atomic_bool stopped_; + + std::vector debug_node_keys_ TF_GUARDED_BY(states_mu_); + std::vector new_states_ + TF_GUARDED_BY(states_mu_); + + std::unordered_set write_enabled_debug_node_keys_; + + mutex states_mu_; +}; + +// Poll a gRPC debug server by sending a small tensor repeatedly till success. +// +// Args: +// server_url: gRPC URL of the server to poll, e.g., "grpc://foo:3333". +// max_attempts: Maximum number of attempts. +// +// Returns: +// Whether the polling succeeded within max_attempts. +bool PollTillFirstRequestSucceeds(const string& server_url, + const size_t max_attempts); + +} // namespace test + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debug_io_utils.h b/third_party/tflite-hdrs/tensorflow/core/debug/debug_io_utils.h new file mode 100644 index 00000000..95864c71 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debug_io_utils.h @@ -0,0 +1,446 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
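ParseDebugOpName, documented above, accepts either a bare op name or an op name followed by parenthesized key=value attributes separated by semicolons, with optional whitespace between pairs. The standalone sketch below parses that documented format; it is an illustration, not the TensorFlow parser, and the function and struct names are invented for the example.

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

struct ParsedDebugOp {
  std::string op_name;
  std::map<std::string, std::string> attributes;
};

// Parses "OpName" or "OpName(key1=val1; key2=val2)" into name + attributes.
bool ParseDebugOpNameSketch(const std::string& spec, ParsedDebugOp* out) {
  const std::size_t open = spec.find('(');
  if (open == std::string::npos) {
    out->op_name = spec;  // Bare op name, no attributes.
    return true;
  }
  if (spec.back() != ')') return false;
  out->op_name = spec.substr(0, open);
  const std::string body = spec.substr(open + 1, spec.size() - open - 2);
  std::size_t start = 0;
  while (start < body.size()) {
    std::size_t end = body.find(';', start);
    if (end == std::string::npos) end = body.size();
    std::string pair = body.substr(start, end - start);
    // The documented format allows optional whitespace around the pairs.
    const std::size_t first = pair.find_first_not_of(" \t");
    const std::size_t last = pair.find_last_not_of(" \t");
    if (first != std::string::npos) {
      pair = pair.substr(first, last - first + 1);
      const std::size_t eq = pair.find('=');
      if (eq == std::string::npos) return false;
      out->attributes[pair.substr(0, eq)] = pair.substr(eq + 1);
    }
    start = end + 1;
  }
  return true;
}

int main() {
  ParsedDebugOp parsed;
  if (ParseDebugOpNameSketch(
          "DebugNumericSummary(mute_if_healthy=true; lower_bound=-100.0)",
          &parsed)) {
    std::cout << parsed.op_name << " parsed with " << parsed.attributes.size()
              << " attributes\n";
  }
  return 0;
}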
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/debug/debug_node_key.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { + +absl::Status ReadEventFromFile(const string& dump_file_path, Event* event); + +struct DebugWatchAndURLSpec { + DebugWatchAndURLSpec(const string& watch_key, const string& url, + const bool gated_grpc) + : watch_key(watch_key), url(url), gated_grpc(gated_grpc) {} + + const string watch_key; + const string url; + const bool gated_grpc; +}; + +// TODO(cais): Put static functions and members in a namespace, not a class. +class DebugIO { + public: + static const char* const kDebuggerPluginName; + + static const char* const kCoreMetadataTag; + static const char* const kGraphTag; + static const char* const kHashTag; + + static const char* const kFileURLScheme; + static const char* const kGrpcURLScheme; + static const char* const kMemoryURLScheme; + + static absl::Status PublishDebugMetadata( + const int64_t global_step, const int64_t session_run_index, + const int64_t executor_step_index, const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes, + const std::unordered_set& debug_urls); + + // Publishes a tensor to a debug target URL. + // + // Args: + // debug_node_key: A DebugNodeKey identifying the debug node. If + // `debug_node_key.io_of_node` is non-empty, publish for node + // inputs/outputs dumping feature. + // tensor: The Tensor object being published. + // wall_time_us: Time stamp for the Tensor. Unit: microseconds (us). + // debug_urls: An array of debug target URLs, e.g., + // "file:///foo/tfdbg_dump", "grpc://localhost:11011" + // gated_grpc: Whether this call is subject to gRPC gating. + // step_id: Step ID associated with the tensor. + static absl::Status PublishDebugTensor( + const DebugNodeKey& debug_node_key, const Tensor& tensor, + const uint64 wall_time_us, const absl::Span debug_urls, + bool gated_grpc, int64_t step_id = -1); + + // Convenience overload of the method above for no gated_grpc by default. + static absl::Status PublishDebugTensor( + const DebugNodeKey& debug_node_key, const Tensor& tensor, + const uint64 wall_time_us, const absl::Span debug_urls); + + // Publishes a graph to a set of debug URLs. + // + // Args: + // graph: The graph to be published. + // debug_urls: The set of debug URLs to publish the graph to. 
+ static absl::Status PublishGraph( + const Graph& graph, const string& device_name, + const std::unordered_set& debug_urls); + + // Determines whether a copy node needs to perform deep-copy of input tensor. + // + // The input arguments contain sufficient information about the attached + // downstream debug ops for this method to determine whether all the said + // ops are disabled given the current status of the gRPC gating. + // + // Args: + // specs: A vector of DebugWatchAndURLSpec carrying information about the + // debug ops attached to the Copy node, their debug URLs and whether + // they have the attribute value gated_grpc == True. + // + // Returns: + // Whether any of the attached downstream debug ops is enabled given the + // current status of the gRPC gating. + static bool IsCopyNodeGateOpen( + const std::vector& specs); + + // Determines whether a debug node needs to proceed given the current gRPC + // gating status. + // + // Args: + // watch_key: debug tensor watch key, in the format of + // tensor_name:debug_op, e.g., "Weights:0:DebugIdentity". + // debug_urls: the debug URLs of the debug node. + // + // Returns: + // Whether this debug op should proceed. + static bool IsDebugNodeGateOpen(const string& watch_key, + const std::vector& debug_urls); + + // Determines whether debug information should be sent through a grpc:// + // debug URL given the current gRPC gating status. + // + // Args: + // watch_key: debug tensor watch key, in the format of + // tensor_name:debug_op, e.g., "Weights:0:DebugIdentity". + // debug_url: the debug URL, e.g., "grpc://localhost:3333", + // "file:///tmp/tfdbg_1". + // + // Returns: + // Whether the sending of debug data to the debug_url should + // proceed. + static bool IsDebugURLGateOpen(const string& watch_key, + const string& debug_url); + + static absl::Status CloseDebugURL(const string& debug_url); +}; + +// Helper class for debug ops. +class DebugFileIO { + public: + // Encapsulates the Tensor in an Event protobuf and write it to a directory. + // The actual path of the dump file will be a contactenation of + // dump_root_dir, tensor_name, along with the wall_time. + // + // For example: + // let dump_root_dir = "/tmp/tfdbg_dump", + // node_name = "foo/bar", + // output_slot = 0, + // debug_op = DebugIdentity, + // and wall_time_us = 1467891234512345, + // the dump file will be generated at path: + // /tmp/tfdbg_dump/foo/bar_0_DebugIdentity_1467891234512345. + // + // Args: + // debug_node_key: A DebugNodeKey identifying the debug node. + // wall_time_us: Wall time at which the Tensor is generated during graph + // execution. Unit: microseconds (us). + // dump_root_dir: Root directory for dumping the tensor. + // dump_file_path: The actual dump file path (passed as reference). + static absl::Status DumpTensorToDir(const DebugNodeKey& debug_node_key, + const Tensor& tensor, + const uint64 wall_time_us, + const string& dump_root_dir, + string* dump_file_path); + + // Similar to the above, but for node inputs/outputs dumping feature. + static absl::Status DumpTensorToDirForNodeDumping( + const DebugNodeKey& debug_node_key, const Tensor& tensor, + uint64 wall_time_us, const string& dump_root_dir, string* dump_file_path, + int64_t step_id); + + // Get the full path to the dump file. + // + // Args: + // dump_root_dir: The dump root directory, e.g., /tmp/tfdbg_dump + // node_name: Name of the node from which the dumped tensor is generated, + // e.g., foo/bar/node_a + // output_slot: Output slot index of the said node, e.g., 0. 
+ // debug_op: Name of the debug op, e.g., DebugIdentity. + // wall_time_us: Time stamp of the dumped tensor, in microseconds (us). + static string GetDumpFilePath(const string& dump_root_dir, + const DebugNodeKey& debug_node_key, + const uint64 wall_time_us); + + // Similar to the above, but for node inputs/outputs dumping feature. + static string GetDumpFilePathForNodeDumping( + const string& dump_root_dir, const DebugNodeKey& debug_node_key, + uint64 wall_time_us, int64_t step_id); + + // Dumps an Event proto to a file. + // + // Args: + // event_prot: The Event proto to be dumped. + // dir_name: Directory path. + // file_name: Base file name. + static absl::Status DumpEventProtoToFile(const Event& event_proto, + const string& dir_name, + const string& file_name); + + // Request additional bytes to be dumped to the file system. + // + // Does not actually dump the bytes, but instead just performs the + // bookkeeping necessary to prevent the total dumped amount of data from + // exceeding the limit (default 100 GBytes or set customly through the + // environment variable TFDBG_DISK_BYTES_LIMIT). + // + // Args: + // bytes: Number of bytes to request. + // + // Returns: + // Whether the request is approved given the total dumping + // limit. + static bool requestDiskByteUsage(uint64 bytes); + + // Reset the disk byte usage to zero. + static void resetDiskByteUsage(); + + static uint64 global_disk_bytes_limit_; + + private: + // Encapsulates the Tensor in an Event protobuf and write it to file. + static absl::Status DumpTensorToEventFile(const DebugNodeKey& debug_node_key, + const Tensor& tensor, + const uint64 wall_time_us, + const string& file_path); + + // Implemented ad hoc here for now. + // TODO(cais): Replace with shared implementation once http://b/30497715 is + // fixed. + static absl::Status RecursiveCreateDir(Env* env, const string& dir); + + // Tracks how much disk has been used so far. + static uint64 disk_bytes_used_; + // Mutex for thread-safe access to disk_bytes_used_. + static mutex bytes_mu_; + // Default limit for the disk space. + static const uint64 kDefaultGlobalDiskBytesLimit; + + friend class DiskUsageLimitTest; +}; + +} // namespace tensorflow + +namespace std { + +template <> +struct hash<::tensorflow::DebugNodeKey> { + size_t operator()(const ::tensorflow::DebugNodeKey& k) const { + return ::tensorflow::Hash64( + ::tensorflow::strings::StrCat(k.device_name, ":", k.node_name, ":", + k.output_slot, ":", k.debug_op, ":")); + } +}; + +} // namespace std + +// TODO(cais): Support grpc:// debug URLs in open source once Python grpc +// genrule becomes available. See b/23796275. +#ifndef PLATFORM_WINDOWS +#include "grpcpp/channel.h" +#include "tensorflow/core/debug/debug_service.grpc.pb.h" + +namespace tensorflow { + +class DebugGrpcChannel { + public: + // Constructor of DebugGrpcChannel. + // + // Args: + // server_stream_addr: Address (host name and port) of the debug stream + // server implementing the EventListener service (see + // debug_service.proto). E.g., "127.0.0.1:12345". + explicit DebugGrpcChannel(const string& server_stream_addr); + + virtual ~DebugGrpcChannel() {} + + // Attempt to establish connection with server. + // + // Args: + // timeout_micros: Timeout (in microseconds) for the attempt to establish + // the connection. + // + // Returns: + // OK Status iff connection is successfully established before timeout, + // otherwise return an error Status. 
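requestDiskByteUsage, described above, only performs budget bookkeeping: a request is approved if it keeps the running total of dumped bytes under a global limit, which defaults to 100 GB and can be overridden through the TFDBG_DISK_BYTES_LIMIT environment variable. A minimal standalone sketch of that bookkeeping follows; the class and member names are assumptions, and details of the real implementation may differ.

#include <cstdint>
#include <cstdlib>
#include <mutex>

class DiskByteBudgetSketch {
 public:
  // Returns true and records the usage iff `bytes` still fits in the budget.
  bool RequestDiskByteUsage(uint64_t bytes) {
    std::lock_guard<std::mutex> lock(mu_);
    if (limit_ == 0) {
      // Lazily resolve the limit: environment override or the default 100 GB.
      const char* env = std::getenv("TFDBG_DISK_BYTES_LIMIT");
      limit_ = (env != nullptr) ? std::strtoull(env, nullptr, 10)
                                : kDefaultLimitBytes;
    }
    if (bytes_used_ + bytes > limit_) return false;
    bytes_used_ += bytes;
    return true;
  }

  // Resets the accumulated usage back to zero.
  void ResetDiskByteUsage() {
    std::lock_guard<std::mutex> lock(mu_);
    bytes_used_ = 0;
  }

 private:
  static constexpr uint64_t kDefaultLimitBytes = 100ULL * 1024 * 1024 * 1024;

  std::mutex mu_;
  uint64_t limit_ = 0;  // 0 means "not resolved yet".
  uint64_t bytes_used_ = 0;
};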
+ absl::Status Connect(const int64_t timeout_micros); + + // Write an Event proto to the debug gRPC stream. + // + // Thread-safety: Safe with respect to other calls to the same method and + // calls to ReadEventReply() and Close(). + // + // Args: + // event: The event proto to be written to the stream. + // + // Returns: + // True iff the write is successful. + bool WriteEvent(const Event& event); + + // Read an EventReply proto from the debug gRPC stream. + // + // This method blocks and waits for an EventReply from the server. + // Thread-safety: Safe with respect to other calls to the same method and + // calls to WriteEvent() and Close(). + // + // Args: + // event_reply: the to-be-modified EventReply proto passed as reference. + // + // Returns: + // True iff the read is successful. + bool ReadEventReply(EventReply* event_reply); + + // Receive and process EventReply protos from the gRPC debug server. + // + // The processing includes setting debug watch key states using the + // DebugOpStateChange fields of the EventReply. + // + // Args: + // max_replies: Maximum number of replies to receive. Will receive all + // remaining replies iff max_replies == 0. + void ReceiveAndProcessEventReplies(size_t max_replies); + + // Receive EventReplies from server (if any) and close the stream and the + // channel. + absl::Status ReceiveServerRepliesAndClose(); + + private: + string server_stream_addr_; + string url_; + ::grpc::ClientContext ctx_; + std::shared_ptr<::grpc::Channel> channel_; + std::unique_ptr stub_; + std::unique_ptr<::grpc::ClientReaderWriterInterface> + reader_writer_; + + mutex mu_; +}; + +class DebugGrpcIO { + public: + static const size_t kGrpcMessageSizeLimitBytes; + static const size_t kGrpcMaxVarintLengthSize; + + // Sends a tensor through a debug gRPC stream. + static absl::Status SendTensorThroughGrpcStream( + const DebugNodeKey& debug_node_key, const Tensor& tensor, + const uint64 wall_time_us, const string& grpc_stream_url, + const bool gated); + + // Sends an Event proto through a debug gRPC stream. + // Thread-safety: Safe with respect to other calls to the same method and + // calls to CloseGrpcStream(). + // + // Args: + // event_proto: The Event proto to be sent. + // grpc_stream_url: The grpc:// URL of the stream to use, e.g., + // "grpc://localhost:11011", "localhost:22022". + // receive_reply: Whether an EventReply proto will be read after event_proto + // is sent and before the function returns. + // + // Returns: + // The Status of the operation. + static absl::Status SendEventProtoThroughGrpcStream( + const Event& event_proto, const string& grpc_stream_url, + const bool receive_reply = false); + + // Receive an EventReply proto through a debug gRPC stream. + static absl::Status ReceiveEventReplyProtoThroughGrpcStream( + EventReply* event_reply, const string& grpc_stream_url); + + // Check whether a debug watch key is read-activated at a given gRPC URL. + static bool IsReadGateOpen(const string& grpc_debug_url, + const string& watch_key); + + // Check whether a debug watch key is write-activated (i.e., read- and + // write-activated) at a given gRPC URL. + static bool IsWriteGateOpen(const string& grpc_debug_url, + const string& watch_key); + + // Closes a gRPC stream to the given address, if it exists. + // Thread-safety: Safe with respect to other calls to the same method and + // calls to SendTensorThroughGrpcStream(). + static absl::Status CloseGrpcStream(const string& grpc_stream_url); + + // Set the gRPC state of a debug node key. 
+ // TODO(cais): Include device information in watch_key. + static void SetDebugNodeKeyGrpcState( + const string& grpc_debug_url, const string& watch_key, + const EventReply::DebugOpStateChange::State new_state); + + private: + using DebugNodeName2State = + std::unordered_map; + + // Returns a global map from grpc debug URLs to the corresponding + // DebugGrpcChannels. + static std::unordered_map>* + GetStreamChannels(); + + // Get a DebugGrpcChannel object at a given URL, creating one if necessary. + // + // Args: + // grpc_stream_url: grpc:// URL of the stream, e.g., "grpc://localhost:6064" + // debug_grpc_channel: A pointer to the DebugGrpcChannel object, passed as a + // a pointer to the pointer. The DebugGrpcChannel object is owned + // statically elsewhere, not by the caller of this function. + // + // Returns: + // Status of this operation. + static absl::Status GetOrCreateDebugGrpcChannel( + const string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel); + + // Returns a map from debug URL to a map from debug op name to enabled state. + static std::unordered_map* + GetEnabledDebugOpStates(); + + // Returns a map from debug op names to enabled state, for a given debug URL. + static DebugNodeName2State* GetEnabledDebugOpStatesAtUrl( + const string& grpc_debug_url); + + // Clear enabled debug op state from all debug URLs (if any). + static void ClearEnabledWatchKeys(); + + static mutex streams_mu_; + static int64_t channel_connection_timeout_micros_; + + friend class GrpcDebugTest; + friend class DebugNumericSummaryOpTest; +}; + +} // namespace tensorflow +#endif // #ifndef(PLATFORM_WINDOWS) + +#endif // TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debug_node_key.h b/third_party/tflite-hdrs/tensorflow/core/debug/debug_node_key.h new file mode 100644 index 00000000..5decb5cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debug_node_key.h @@ -0,0 +1,56 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Encapsulates debug information for a node that was observed. +struct DebugNodeKey { + static const char* const kMetadataFilePrefix; + static const char* const kDeviceTag; + + DebugNodeKey(const string& device_name, const string& node_name, + int32_t output_slot, const string& debug_op, + const string& io_of_node = "", bool is_input = false, + int32_t io_index = -1); + + // Converts a device name string to a device path string. + // E.g., /job:localhost/replica:0/task:0/cpu:0 will be converted to + // ,job_localhost,replica_0,task_0,cpu_0. 
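The IsReadGateOpen / IsWriteGateOpen / SetDebugNodeKeyGrpcState methods above imply a per-URL map from watch keys (e.g. "Weights:0:DebugIdentity") to an activation state, where the read gate opens for read- or read/write-activated keys and the write gate only for read/write-activated ones. The sketch below illustrates that gating logic with an assumed three-value state enum; it is not tied to the actual EventReply::DebugOpStateChange proto.

#include <string>
#include <unordered_map>

enum class GateState { kDisabled, kReadOnly, kReadWrite };  // assumed values

class GrpcGateSketch {
 public:
  // Records the state of `watch_key` for the given grpc:// debug URL.
  void SetState(const std::string& grpc_debug_url,
                const std::string& watch_key, GateState state) {
    states_[grpc_debug_url][watch_key] = state;
  }

  // Read gate: open if the watch key is read- or read/write-activated.
  bool IsReadGateOpen(const std::string& grpc_debug_url,
                      const std::string& watch_key) const {
    const GateState s = Lookup(grpc_debug_url, watch_key);
    return s == GateState::kReadOnly || s == GateState::kReadWrite;
  }

  // Write gate: open only if the watch key is read- and write-activated.
  bool IsWriteGateOpen(const std::string& grpc_debug_url,
                       const std::string& watch_key) const {
    return Lookup(grpc_debug_url, watch_key) == GateState::kReadWrite;
  }

 private:
  GateState Lookup(const std::string& url, const std::string& key) const {
    auto url_it = states_.find(url);
    if (url_it == states_.end()) return GateState::kDisabled;
    auto key_it = url_it->second.find(key);
    return key_it == url_it->second.end() ? GateState::kDisabled
                                          : key_it->second;
  }

  std::unordered_map<std::string,
                     std::unordered_map<std::string, GateState>>
      states_;
};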
+ static const string DeviceNameToDevicePath(const string& device_name); + + bool operator==(const DebugNodeKey& other) const; + bool operator!=(const DebugNodeKey& other) const; + + const string device_name; + const string node_name; + const int32 output_slot; + const string debug_op; + const string debug_node_name; + const string device_path; + const string io_of_node; + const bool is_input; + const int32 io_index; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/debug/debugger_state_impl.h b/third_party/tflite-hdrs/tensorflow/core/debug/debugger_state_impl.h new file mode 100644 index 00000000..c34aa8bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/debug/debugger_state_impl.h @@ -0,0 +1,61 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_ +#define TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_ + +#include "tensorflow/core/common_runtime/debugger_state_interface.h" + +#include +#include + +namespace tensorflow { + +class DebuggerState : public DebuggerStateInterface { + public: + DebuggerState(const DebugOptions& debug_options); + ~DebuggerState() override; + + // Publish metadata about the debugged Session::Run() call. + // + // See the doc string of DebuggerStateInterface::PublishDebugMetadata() for + // details. + absl::Status PublishDebugMetadata( + const int64_t global_step, const int64_t session_run_count, + const int64_t executor_step_count, const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_names) override; + + private: + std::unordered_set debug_urls_; +}; + +class DebugGraphDecorator : public DebugGraphDecoratorInterface { + public: + DebugGraphDecorator(const DebugOptions& debug_options) + : debug_options_(debug_options) {} + ~DebugGraphDecorator() override {} + + absl::Status DecorateGraph(Graph* graph, Device* device) override; + absl::Status PublishGraph(const Graph& graph, + const string& device_name) override; + + private: + DebugOptions debug_options_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h new file mode 100644 index 00000000..4713f3be --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h @@ -0,0 +1,297 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/common_runtime/eager/rendezvous_cache.h" +#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/distributed_runtime/worker_session.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/local_rendezvous.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tsl/platform/refcount.h" + +namespace tensorflow { + +class BaseRemoteRendezvous; +class BaseRecvTensorCall; + +// RendezvousMgr keeps track of a set of local rendezvous instances. +// All tensors sent by this worker are buffered in a RendezvousMgr +// until the tensor is received. Each global unique "step_id" +// corresponds to one local rendezvous instance managed by a +// RendezvousMgr. +// RendezvousMgr holds weak references to rendezvous. When a rendezvous is +// destructed, it will create a new instance to fulfill the Find. +// +// E.g., +// Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935); +// fork execution of a graph executor using "rendez" on thread 1; +// fork execution of another graph executor using "rendez" on thread 2; +// ... +// join threads 1 and 2; +// +// In the example above, execution in thread 1 and 2 communicates with +// each other by send/recv operations through `rendez`. +// +// Tensors sent and received through a rendezvous managed by this +// RendezvousMgr must have keys generated by Rendezvous::CreateKey(). +class BaseRendezvousMgr : public RendezvousMgrInterface { + public: + explicit BaseRendezvousMgr(const WorkerEnv* worker_env); + + ~BaseRendezvousMgr() override; + + // Returns Rendezvous supporting send and recv among workers in the + // "step_id". The caller takes ownership of one reference on the + // returned Rendezvous instance. + // + // Note: the caller must guarantee to eventually call Initialize on the + // returned RemoteRendezvous + tsl::core::RefCountPtr Find(int64_t step_id) override; + + // Finds the local rendezvous instance for the "step_id". Runs + // "done" when the tensor for "key" is produced or an error occurs. + // + // This method is used by the rpc handler of RecvTensor. + void RecvLocalAsync(int64_t step_id, const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) override; + + // Synchronous wrapper for RecvLocalAsync. 
+ absl::Status RecvLocal(int64_t step_id, const Rendezvous::ParsedKey& parsed, + Tensor* val, bool* is_dead) override; + + // Removes rendezvous for "step_id". + void Cleanup(int64_t step_id) override { cache_->RemoveAndAbort(step_id); } + + // Remove all rendezvous instances owned by the rendezvous_mgr. + void CleanupAll() override { cache_->RemoveAll(); } + + protected: + virtual tsl::core::RefCountPtr Create( + int64_t step_id, const WorkerEnv* worker_env) = 0; + + private: + tsl::core::RefCountPtr> cache_; + + // Not owned. + const WorkerEnv* const worker_env_; + + tsl::core::RefCountPtr FindOrCreate(int64_t step_id); + + BaseRendezvousMgr(const BaseRendezvousMgr&) = delete; + void operator=(const BaseRendezvousMgr&) = delete; +}; + +// RemoteRendezvous is a Rendezvous which can handle either +// the producer or consumer being in a remote process. +// +// Buffering of Tensor values is delegated to a "local" Rendezvous +// obtained from NewLocalRendezvous(). This class just adds +// functionality to coordinate with remote workers. +class BaseRemoteRendezvous : public RemoteRendezvous { + public: + BaseRemoteRendezvous(const WorkerEnv* env, int64_t step_id); + + // Upgrades the BaseRemoteRendezvous to full initialization. + absl::Status Initialize(WorkerSession* session) override; + + void SetRemoteEagerContextDefault() override { + remote_eager_context_default_ = true; + } + bool IsRemoteEagerContextDefault() override { + return remote_eager_context_default_; + } + + // Forwards to local_, where the Tensor "val" will be buffered and + // any waiting callback stored. + absl::Status Send(const ParsedKey& key, const Rendezvous::Args& args, + const Tensor& val, const bool is_dead) override; + + // This method is called only by the RecvOp. It tests to see + // whether the value will be produced by a local or remote device + // and handles accordingly. In the local case it forwards to + // local_, in the remote case it initiates an RPC request. + void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, + DoneCallback done) override; + + void StartAbort(const absl::Status& status) override; + + // This method is called only by the local Worker, forwarded through + // the same method on RendezvousMgr. This occurs when the Worker + // has received a RecvTensor request, either locally or over the + // network. In either case it needs to retrieve a locally buffered + // value from local_, and give it to its caller. + // + // Runs "done" as soon as the tensor for "parsed" is available or an error + // is detected. + // + // REQUIRES: "parsed" is one that will be Saved into the local rendezvous. + void RecvLocalAsync(const ParsedKey& parsed, DoneCallback done); + + protected: + virtual void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, + DoneCallback done) = 0; + + // Returns true if "src" and "dst" are located in the same worker, + // and hence may use a local rendezvous. + virtual bool IsSameWorker(DeviceNameUtils::ParsedName src, + DeviceNameUtils::ParsedName dst); + + // If aborted, aborts "call". Otherwise, adds "call" into calls_. + void RegisterCall(BaseRecvTensorCall* call, const Rendezvous::Args& args); + + // Removes "call" from calls_ if "call" is in calls_. + void DeregisterCall(BaseRecvTensorCall* call, const Rendezvous::Args& args); + + WorkerSession* session(); + + bool is_initialized(); + + ~BaseRemoteRendezvous() override; + + const WorkerEnv* const env_; // Not owned. 
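BaseRendezvousMgr, as described above, keys rendezvous instances by step_id, holds only weak references, and creates a fresh instance to fulfill Find once the previous one has been destructed. The standalone sketch below shows that FindOrCreate pattern with std::weak_ptr; the types are stand-ins, and Cleanup here merely drops the entry rather than aborting the rendezvous as the real RemoveAndAbort does.

#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct RendezvousSketch {
  explicit RendezvousSketch(int64_t step_id) : step_id(step_id) {}
  int64_t step_id;
};

class RendezvousMgrSketch {
 public:
  // Returns a shared reference for `step_id`, creating a fresh instance if no
  // live one exists (mirroring "holds weak references to rendezvous").
  std::shared_ptr<RendezvousSketch> FindOrCreate(int64_t step_id) {
    std::lock_guard<std::mutex> lock(mu_);
    if (auto live = cache_[step_id].lock()) return live;
    auto created = std::make_shared<RendezvousSketch>(step_id);
    cache_[step_id] = created;
    return created;
  }

  // Drops the manager's entry for `step_id`; callers holding shared_ptrs keep
  // their instance alive until they release it.
  void Cleanup(int64_t step_id) {
    std::lock_guard<std::mutex> lock(mu_);
    cache_.erase(step_id);
  }

  void CleanupAll() {
    std::lock_guard<std::mutex> lock(mu_);
    cache_.clear();
  }

 private:
  std::mutex mu_;
  std::unordered_map<int64_t, std::weak_ptr<RendezvousSketch>> cache_;
};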
+ const int64_t step_id_; + + private: + int num_shards_; + LocalRendezvous local_; + // Indicates whether this remote rendezvous instance is used as the default + // rendezvous for remote eager op-by-op execution. Errors in eager op-by-op + // execution should not abort the rendezvous since it is a context-wide + // instance and needs to be reused; instead, the errors are propagated through + // eager executors. + bool remote_eager_context_default_ = false; + + mutable mutex mu_; + mutable mutex calls_mu_; + + // Status given by StartAbort() if any. + absl::Status status_ TF_GUARDED_BY(mu_); + + WorkerSession* session_ TF_GUARDED_BY(mu_); // Not owned. + + // Data structures to handle calls when partially initialized. + struct DeferredCall { + const ParsedKey parsed; + DoneCallback done; + + // Keeps a reference to the rendezvous, to keep it alive. + tsl::core::RefCountPtr rendezvous; + + DeferredCall(const ParsedKey& parsed, DoneCallback done, + tsl::core::RefCountPtr rendez); + }; + std::vector deferred_calls_ TF_GUARDED_BY(mu_); + + struct CallBucket { + mutex mu; + + absl::flat_hash_set calls TF_GUARDED_BY(mu); + }; + + struct PendingCalls { + PendingCalls(CancellationToken token, int num_calls, int num_buckets, + tsl::core::RefCountPtr rendez) + : token(token), + num_calls(num_calls), + buckets(num_buckets), + rendezvous(std::move(rendez)) {} + CancellationToken token = CancellationManager::kInvalidToken; + std::atomic num_calls = 0; + std::vector buckets; + + // Keeps a reference to the rendezvous, to keep it alive. + tsl::core::RefCountPtr rendezvous; + }; + + // "CancellationToken" is stored here so that when there's no active + // RecvTensorCalls, we can de-register the callback in the cancellation + // manager. RecvTensorCalls are managed in multiple buckets since in large + // scaled distributed training, lots of Send/Recv may be triggered + // concurrently. + // + // Note: pointer to CancellationManager can be nullptr in certain use cases. + absl::flat_hash_map> + calls_ TF_GUARDED_BY(calls_mu_); + + // Callback for CancellationManager. + void CancelledByManager(CancellationManager* cm); + + bool is_initialized_locked() TF_SHARED_LOCKS_REQUIRED(mu_) { + return session_ != nullptr; + } + + // If "is_src" is true, checks that the rendezvous key "parsed"'s + // source is in this process. If "is_src" is false, checks that the + // rendezvous key "parsed"'s destination is in this process. + absl::Status ValidateDevices(const Rendezvous::ParsedKey& parsed, + bool is_src); + + // Callback handling the case when a rendezvous has been + // accomplished in local_ and the consumer is local to this process. + // Tensor "in" will be copied into "out". The key "parsed" encodes + // the src and dst devices. + void SameWorkerRecvDone(const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& in_args, + const Rendezvous::Args& out_args, const Tensor& in, + Tensor* out, StatusCallback done); + + // Must be called only if fully initialized. 
+ void RecvLocalAsyncInternal(const ParsedKey& parsed, DoneCallback done); + + BaseRemoteRendezvous(const BaseRemoteRendezvous&) = delete; + void operator=(const BaseRemoteRendezvous&) = delete; +}; + +class BaseRecvTensorCall { + public: + BaseRecvTensorCall() {} + virtual ~BaseRecvTensorCall() {} + + virtual void Start(std::function recv_done) = 0; + + virtual void StartAbort(const absl::Status& s) = 0; + + virtual absl::Status status() const = 0; + + private: + BaseRecvTensorCall(const BaseRecvTensorCall&) = delete; + void operator=(const BaseRecvTensorCall&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/call_options.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/call_options.h new file mode 100644 index 00000000..a845bcdc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/call_options.h @@ -0,0 +1,27 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CALL_OPTIONS_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CALL_OPTIONS_H_ + +#include "xla/tsl/distributed_runtime/call_options.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CallOptions; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CALL_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cancellable_call.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cancellable_call.h new file mode 100644 index 00000000..7311c8e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cancellable_call.h @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_ + +#include +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// Supports client side cancellation of WorkerInterface calls via +// registration with a CancellationManager. +class CancellableCall { + public: + CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker, + WorkerCacheInterface* wc) + : is_cancelled_(false), + cancel_mgr_(cancel_mgr), + remote_worker_(remote_worker), + wc_(wc), + wi_(wc_->GetOrCreateWorker(remote_worker_)) {} + + virtual ~CancellableCall() { wc_->ReleaseWorker(remote_worker_, wi_); } + + virtual void IssueCall(const StatusCallback& done) = 0; + + void Start(const StatusCallback& done); + + // Cancels the RPC if it's not cancelled yet. This must be called after + // Start(). This is normally used if there's a needed to cancel the RPC from a + // sideband. If appliable, pass a cancellation manager to the constructor + // instead of using this method. + void Cancel() TF_LOCKS_EXCLUDED(mu_); + + protected: + mutex mu_; + bool is_cancelled_; + CancellationManager* const cancel_mgr_; // Not owned + const string remote_worker_; + WorkerCacheInterface* const wc_; // Not owned + WorkerInterface* const wi_; // Owned by wc_, must be released. + CallOptions opts_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h new file mode 100644 index 00000000..a016a5ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h @@ -0,0 +1,106 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { + +class WorkerSession; + +// ClusterFunctionLibraryRuntime contains methods to Instantiate and Run +// functions across processes by making RPCs through worker service. 
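CancellableCall, defined above, wires an RPC into a CancellationManager so it can be cancelled either through the manager or through a sideband Cancel() after Start(). The standalone sketch below shows that wiring with a minimal stand-in manager; both classes and their members are illustrative assumptions, not TensorFlow's API.

#include <functional>
#include <mutex>
#include <utility>
#include <vector>

// Minimal stand-in for a cancellation manager: callers register callbacks that
// are invoked once when cancellation is started.
class CancellationManagerSketch {
 public:
  void RegisterCallback(std::function<void()> cb) {
    std::lock_guard<std::mutex> lock(mu_);
    callbacks_.push_back(std::move(cb));
  }

  void StartCancel() {
    std::vector<std::function<void()>> cbs;
    {
      std::lock_guard<std::mutex> lock(mu_);
      cbs.swap(callbacks_);
    }
    for (auto& cb : cbs) cb();
  }

 private:
  std::mutex mu_;
  std::vector<std::function<void()>> callbacks_;
};

// RPC-like call that can be cancelled either via the manager or via a direct
// sideband Cancel(), but aborts at most once.
class CancellableCallSketch {
 public:
  CancellableCallSketch(CancellationManagerSketch* cancel_mgr,
                        std::function<void()> issue_call,
                        std::function<void()> abort_call)
      : cancel_mgr_(cancel_mgr),
        issue_call_(std::move(issue_call)),
        abort_call_(std::move(abort_call)) {}

  // Hooks up cancellation (if a manager was provided), then issues the call.
  void Start() {
    if (cancel_mgr_ != nullptr) {
      cancel_mgr_->RegisterCallback([this] { Cancel(); });
    }
    issue_call_();
  }

  // Sideband cancellation; intended to be called after Start().
  void Cancel() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      if (is_cancelled_) return;
      is_cancelled_ = true;
    }
    abort_call_();
  }

 private:
  CancellationManagerSketch* const cancel_mgr_;  // not owned
  std::function<void()> issue_call_;
  std::function<void()> abort_call_;
  std::mutex mu_;
  bool is_cancelled_ = false;
};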
+class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { + public: + ClusterFunctionLibraryRuntime(WorkerSession* worker_session, + bool create_worker_session_called, + DeviceMgr* remote_device_mgr) + : worker_session_(worker_session), + create_worker_session_called_(create_worker_session_called), + remote_device_mgr_(remote_device_mgr) {} + + ~ClusterFunctionLibraryRuntime() override; + + void Instantiate(const string& function_name, + const FunctionLibraryDefinition& lib_def, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::LocalHandle* handle, + FunctionLibraryRuntime::DoneCallback done) override; + + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) override; + + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) override; + + void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + FunctionLibraryRuntime::DoneCallback done) override; + + DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; } + + private: + static absl::Status ConstructFunctionGraph( + const OpDef& sig, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + const FunctionLibraryDefinition& flib_def, GraphDef* g, + std::vector* send_keys, std::vector* recv_keys); + friend class ClusterFunctionLibraryRuntimeTest; + + mutable mutex mu_; + WorkerSession* const worker_session_ = nullptr; // not owned. + const bool create_worker_session_called_; + + DeviceMgr* remote_device_mgr_; // not owned. + + struct FunctionData { + const string graph_handle; + const string target; + // Hold a shared pointer to the underlying worker cache to avoid it being + // deleted in potential cluster update. + const std::shared_ptr worker_cache; + WorkerInterface* wi = nullptr; + const std::vector send_keys; + const std::vector recv_keys; + + FunctionData(const string& graph_handle, const string& target, + std::shared_ptr worker_cache, + WorkerInterface* wi, const std::vector& send_keys, + const std::vector& recv_keys) + : graph_handle(graph_handle), + target(target), + worker_cache(std::move(worker_cache)), + wi(wi), + send_keys(send_keys), + recv_keys(recv_keys) {} + }; + + std::vector function_data_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h new file mode 100644 index 00000000..63006c12 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h @@ -0,0 +1,96 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_PARAM_RESOLVER_DISTRIBUTED_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_PARAM_RESOLVER_DISTRIBUTED_H_ + +#include "tensorflow/core/common_runtime/collective_param_resolver_local.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +class ConfigProto; +class WorkerCacheInterface; +class DeviceResolverDistributed; +class DeviceMgr; + +class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal { + public: + CollectiveParamResolverDistributed( + const ConfigProto& config, const DeviceMgr* dev_mgr, + DeviceResolverDistributed* dev_resolver, + NcclCommunicatorInterface* nccl_communicator, + WorkerCacheInterface* worker_cache, const string& task_name); + + void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + void CompleteGroupAsync(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + void CompleteInstanceAsync(const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + CancellationManager* cancel_mgr, + const StatusCallback& done) override; + + void StartAbort(const absl::Status& s) override; + + protected: + // Returns the cached group iff there's an entry for this group_key in the + // local group_table_; returns nullptr otherwise. + GroupRec* GetCachedGroup(int32_t group_key) TF_LOCKS_EXCLUDED(group_mu_); + + // Updates group_table_ with contents of resp. + absl::Status UpdateGroupCache(const CompleteGroupResponse& resp) + TF_LOCKS_EXCLUDED(group_mu_); + + // Finds the GroupRec that corresponds to cp->group_key and also + // populates cp->group from that GroupRec. + // + // Semantics are like those of CompleteGroupLocal but will make a + // remote call to the group leader if necessary. + void CompleteGroupDistributed(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + const StatusCallback& done); + + // Returns true iff there's an entry for this instance_key in the + // local instance_table_. + bool InstanceIsCached(int32_t group_key, const CollInstanceParams& instance) + TF_LOCKS_EXCLUDED(instance_mu_); + + // Updates instance_table_ with contents of resp. + absl::Status UpdateInstanceCache(CollectiveParams* cp, + const CompleteInstanceResponse& resp) + TF_LOCKS_EXCLUDED(instance_mu_, group_mu_); + + // Finish populating *cp. Semantics are like those of + // CompleteInstanceLocal but will make a remote call to the group + // leader if necessary. 
+ void CompleteInstanceDistributed(const string& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, + const StatusCallback& done) + TF_LOCKS_EXCLUDED(instance_mu_, group_mu_); + + WorkerCacheInterface* worker_cache_; // Not owned + const string group_leader_; + CancellationManager abortion_cancel_mgr_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_PARAM_RESOLVER_DISTRIBUTED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_rma_distributed.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_rma_distributed.h new file mode 100644 index 00000000..22d4d6f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/collective_rma_distributed.h @@ -0,0 +1,64 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_ + +#include "tensorflow/core/common_runtime/collective_rma_local.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" + +namespace tensorflow { +class WorkerCacheInterface; + +// Extend CollectiveRemoteAccessLocal with access to remote peers. +class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { + public: + CollectiveRemoteAccessDistributed( + const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver, + std::shared_ptr work_queue, + WorkerCacheInterface* worker_cache, int64_t step_id, string task_name) + : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id), + worker_cache_(worker_cache), + work_queue_(std::move(work_queue)), + task_name_(std::move(task_name)) {} + + ~CollectiveRemoteAccessDistributed() override {} + + void RecvFromPeer(const string& peer_device, const string& peer_task, + bool peer_is_local, const string& key, Device* to_device, + DeviceContext* to_device_ctx, + const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, + const DeviceLocality& client_locality, + int dev_to_dev_stream_index, + CancellationManager* cancellation_manager, + const StatusCallback& done) override; + + void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms, + const StatusCallback& done) override; + + void StartAbort(const absl::Status& s) override; + + protected: + WorkerCacheInterface* worker_cache_; // Not owned + // Ownership of `work_queue_` is shared between `this` and + // `CollectiveExecutorMgr`. 
+ std::shared_ptr work_queue_; + CancellationManager abortion_cancel_mgr_; + string task_name_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_client.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_client.h new file mode 100644 index 00000000..0901d56b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_client.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_CLIENT_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_CLIENT_H_ + +#include +#include + +#include "xla/tsl/distributed_runtime/coordination/coordination_client.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CoordinationClient; +using tsl::CoordinationClientCache; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h new file mode 100644 index 00000000..3e0243ab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h @@ -0,0 +1,126 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_BARRIER_PROXY_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_BARRIER_PROXY_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/time/time.h" +#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" +#include "xla/tsl/protobuf/coordination_service.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// A local proxy connecting the coordination service's barrier. +// The barrier provided by coordination service can only block at tasks (i.e., +// TPU workers), but sometimes we need a barrier that can block at different +// threads. The proxy first waits at threads on a participating +// task and then issues a barrier wait to the coordination service once all the +// threads at that task have arrived. +// Usage: +// // Main thread creates a `BarrierProxy`: +// barrier = new BarrierProxy(agent, tasks, key, num_local_threads); +// +// // Each participating thread could then call: +// auto [status, last_exit] = barrier.Wait(); +// // The last exited thread is responsible for deleting the barrier. +// if (last_exit) { +// delete barrier; +// } +class BarrierProxy { + public: + BarrierProxy(const BarrierProxy&) = delete; + void operator=(const BarrierProxy&) = delete; + // Construct a BarrierProxy connected to the coordination service via `agent`. + // `tasks` specifies all participating coordinated tasks and + // `num_local_threads` specifies the number of threads in this task to + // particiate. If no tasks are specified, the barrier will block for all the + // connected tasks. + BarrierProxy(tsl::CoordinationServiceAgent* agent, + std::vector tasks, int num_local_threads, + absl::string_view key, absl::Duration timeout) + : key_(key), + agent_(agent), + tasks_(std::move(tasks)), + timeout_(timeout), + num_local_threads_(num_local_threads) {} + + ~BarrierProxy() = default; + + // Waits at the barrier. The first return value is the status when exiting the + // barrier and the second returns `true` for precisely one caller, which may + // then destroy the barrier. + std::pair Wait(); + + private: + const std::string key_; + tsl::CoordinationServiceAgent* agent_; + const std::vector tasks_; + absl::Duration timeout_; + + mutex mu_; + condition_variable cv_ TF_GUARDED_BY(mu_); + const int num_local_threads_; + int num_entered_ TF_GUARDED_BY(mu_) = 0; + int num_to_exit_ TF_GUARDED_BY(mu_) = 0; + absl::Status status_ TF_GUARDED_BY(mu_); + bool status_set_ TF_GUARDED_BY(mu_) = false; +}; + +// Manages the life cycle of BarrierProxies automatically. +// Usage: +// // Main thread creates a `BarrierProxy`: +// BarrierProxyManager barrier_mgr; +// +// // Exactly `num_local_threads` threads call: +// Status s = barrier_mgr.Wait(agent, task, num_local_threads, key, timeout); +class BarrierProxyManager { + public: + BarrierProxyManager(const BarrierProxyManager&) = delete; + void operator=(const BarrierProxyManager&) = delete; + BarrierProxyManager() = default; + ~BarrierProxyManager() = default; + + // Waits at the barrier backed by the coord service `agent` and keyed by + // `key`. 
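A quick sketch of the usage pattern documented above: every participating thread on this task calls Wait(), exactly one caller observes last_exit == true and deletes the proxy. The element type of `tasks` and the exact pair returned by Wait() are elided in this vendored copy; they are assumed here to be tensorflow::CoordinatedTask and std::pair<absl::Status, bool>. The key and timeout values are arbitrary.

#include <thread>
#include <utility>
#include <vector>

#include "absl/time/time.h"
#include "tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h"
#include "tensorflow/core/platform/logging.h"

void WaitAtBarrierSketch(tsl::CoordinationServiceAgent* agent,
                         std::vector<tensorflow::CoordinatedTask> tasks,
                         int num_local_threads) {
  // Heap-allocate: the proxy must outlive every thread still inside Wait().
  auto* barrier = new tensorflow::BarrierProxy(
      agent, std::move(tasks), num_local_threads,
      /*key=*/"train_step_barrier", /*timeout=*/absl::Minutes(5));
  std::vector<std::thread> threads;
  threads.reserve(num_local_threads);
  for (int i = 0; i < num_local_threads; ++i) {
    threads.emplace_back([barrier] {
      auto [status, last_exit] = barrier->Wait();
      if (!status.ok()) {
        LOG(WARNING) << "Barrier wait failed: " << status;
      }
      // Exactly one thread sees last_exit == true and owns the cleanup.
      if (last_exit) delete barrier;
    });
  }
  for (auto& t : threads) t.join();
}

BarrierProxyManager::Wait (declared just below) packages exactly this lifetime handling, so most call sites would go through the manager rather than deleting the proxy by hand.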
`tasks` specifies all participating coordinated tasks and + // `num_local_threads` specifies the number of threads in this task to + // participate. If no tasks are specified, the barrier will block for all the + // connected tasks. + absl::Status Wait(tsl::CoordinationServiceAgent* agent, + const std::vector& tasks, + int num_local_threads, absl::string_view key, + absl::Duration timeout); + // The number of active BarrierProxies. + size_t size() const; + + private: + mutable mutex mu_; + absl::flat_hash_map> barriers_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_BARRIER_PROXY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_error_util.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_error_util.h new file mode 100644 index 00000000..aa6dfa41 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_error_util.h @@ -0,0 +1,27 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_ERROR_UTIL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_ERROR_UTIL_H_ + +#include "xla/tsl/distributed_runtime/coordination/coordination_service_error_util.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::CoordinationErrorPayloadKey; +using ::tsl::MakeCoordinationError; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_ERROR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_rpc_handler.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_rpc_handler.h new file mode 100644 index 00000000..d378684d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/coordination/coordination_service_rpc_handler.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_RPC_HANDLER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_RPC_HANDLER_H_ + +#include "xla/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CoordinationServiceRpcHandler; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COORDINATION_COORDINATION_SERVICE_RPC_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/device_resolver_distributed.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/device_resolver_distributed.h new file mode 100644 index 00000000..b46c288c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/device_resolver_distributed.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_DEVICE_RESOLVER_DISTRIBUTED_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_DEVICE_RESOLVER_DISTRIBUTED_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +class DeviceMgr; +class WorkerCacheInterface; + +class DeviceResolverDistributed : public DeviceResolverInterface { + public: + explicit DeviceResolverDistributed(const DeviceMgr* dev_mgr); + + absl::Status GetDeviceAttributes(const string& device, + DeviceAttributes* attributes) override; + + absl::Status GetAllDeviceAttributes( + const string& task, std::vector* attributes) override; + + absl::Status UpdateDeviceAttributes( + const std::vector& attributes) override; + + protected: + const string task_name_; + mutex mu_; + absl::flat_hash_map attr_table_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_DEVICE_RESOLVER_DISTRIBUTED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h new file mode 100644 index 00000000..58af5ed9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h @@ -0,0 +1,115 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" + +namespace tensorflow { + +class WorkerSession; + +namespace eager { + +// EagerClusterFunctionLibraryRuntime contains methods to Instantiate and Run +// functions across processes by making RPCs through eager service. +class EagerClusterFunctionLibraryRuntime + : public DistributedFunctionLibraryRuntime { + public: + EagerClusterFunctionLibraryRuntime(const uint64 context_id, EagerContext* ctx, + DeviceMgr* remote_device_mgr) + : context_id_(context_id), + ctx_(ctx), + remote_device_mgr_(remote_device_mgr) {} + + ~EagerClusterFunctionLibraryRuntime() override{}; + + // Register a partition (i.e., component function) of a multi-device function + // on the remote target specified in `options.target`. This should be + // triggered as part of instantiating a multi-device function in + // ProcessFunctionLibraryRuntime. + void Instantiate(const string& function_name, + const FunctionLibraryDefinition& lib_def, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::LocalHandle* handle, + FunctionLibraryRuntime::DoneCallback done) override; + + // Execute the component function specified by `handle` on its instantiated + // remote target. This should be triggered as part of driving a multi-device + // function execution in ProcessFunctionLibraryRuntime. Running the component + // function remotely is purely asynchronous, and multiple component functions + // with the same remote target are not executed in any particular ordering. + // The main function side must wait for all component functions to finish + // (i.e., the done callbacks triggered) before finishing its execution. + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) override; + + // The component function inputs `args` and outputs `rets` may refer to remote + // tensors on a remote device, which will be lazily resolved remotely where + // the inputs/outputs are actually consumed. 
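A minimal sketch of the contract spelled out above: the caller fans out one Run() per component function and must not finish until every done callback has fired. BlockingCounter comes from tensorflow/core/platform/blocking_counter.h; the element types of `args`/`rets` are elided in this vendored copy and are assumed to be Tensor here, and all caller-side inputs (handles, per-component args) are assumed to exist.

#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/blocking_counter.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/status.h"

void RunComponentsAndWaitSketch(
    tensorflow::eager::EagerClusterFunctionLibraryRuntime* flr,
    const tensorflow::FunctionLibraryRuntime::Options& opts,
    const std::vector<tensorflow::FunctionLibraryRuntime::LocalHandle>& handles,
    const std::vector<std::vector<tensorflow::Tensor>>& args,
    std::vector<std::vector<tensorflow::Tensor>>* rets) {
  tensorflow::BlockingCounter pending(static_cast<int>(handles.size()));
  tensorflow::mutex mu;
  absl::Status overall;
  for (size_t i = 0; i < handles.size(); ++i) {
    // Each Run() is purely asynchronous; completion is signalled only through
    // its done callback, in no particular order across components.
    flr->Run(opts, handles[i], args[i], &(*rets)[i],
             [&pending, &mu, &overall](const absl::Status& s) {
               if (!s.ok()) {
                 tensorflow::mutex_lock l(mu);
                 overall.Update(s);
               }
               pending.DecrementCount();
             });
  }
  // The "main function" side may only finish after every component function
  // has reported back.
  pending.Wait();
  TF_CHECK_OK(overall);  // a real caller would propagate instead of CHECK-ing
}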
+ void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) override; + + void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + FunctionLibraryRuntime::DoneCallback done) override; + + DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; } + + private: + const uint64 context_id_; + EagerContext* ctx_; + DeviceMgr* remote_device_mgr_; // not owned. + + struct FunctionData { + const string target; + const absl::optional> ret_indices; + core::RefCountPtr eager_client; + std::unique_ptr op; + + FunctionData(const string& target, + const absl::optional>& ret_indices, + EagerClient* eager_client, std::unique_ptr op) + : target(target), + ret_indices(ret_indices), + eager_client(core::RefCountPtr(eager_client)), + op(std::move(op)) { + eager_client->Ref(); + } + }; + + mutable mutex mu_; + std::vector function_data_ TF_GUARDED_BY(mu_); +}; + +DistributedFunctionLibraryRuntime* CreateClusterFLR( + const uint64 context_id, EagerContext* ctx, WorkerSession* worker_session); + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h new file mode 100644 index 00000000..a9b9ead8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h @@ -0,0 +1,90 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_DESTROY_TENSOR_HANDLE_NODE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_DESTROY_TENSOR_HANDLE_NODE_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/protobuf/eager_service.pb.h" + +namespace tensorflow { +namespace eager { + +// DestroyTensorHandleNode is an implementation of EagerNode which enqueues a +// request to destroy a remote tensor handle. +class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { + public: + DestroyTensorHandleNode(std::unique_ptr request, + core::RefCountPtr eager_client, + bool ready) + : tensorflow::AsyncEagerNode(), + request_(std::move(request)), + eager_client_(std::move(eager_client)), + ready_(ready) {} + + ~DestroyTensorHandleNode() override {} + + void RunAsync(StatusCallback done) override { + EnqueueResponse* response = new EnqueueResponse; + bool ready = ready_; + // NOTE(fishx): Don't use StreamingEnqueueAsync here. 
When a + // StreamingEnqueueAsync request fails all following requests will fail as + // well. We don't want this request poison following requests since it is + // safe to ignore a failing destroy tensor handle request. + eager_client_->EnqueueAsync( + /*call_opts=*/nullptr, request_.get(), response, + [response, ready, done](const absl::Status& s) { + // Omit the warning if: + // 1. The remote tensor isn't ready. + // 2. Lost connection to remote worker. In this case client will + // crash. We don't want to spam user with redundant warning logs. + if (!s.ok() && ready && !absl::IsUnavailable(s)) { + LOG_EVERY_N_SEC(WARNING, 60) + << "Ignoring an error encountered when deleting " + "remote tensors handles: " + << s.ToString(); + } + done(absl::OkStatus()); + delete response; + }); + } + + void Abort(absl::Status status) override {} + + // Remote node deletions are best effort + bool Fatal() const override { return false; } + + string DebugString() const override { + string out = "[DestroyTensorHandleNode]"; + strings::StrAppend(&out, " request: ", request_->DebugString()); + return out; + } + + private: + std::unique_ptr request_; + core::RefCountPtr eager_client_; + const string remote_task_; + bool ready_; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_DESTROY_TENSOR_HANDLE_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_client.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_client.h new file mode 100644 index 00000000..6fc95601 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -0,0 +1,102 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ + +#include + +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/eager_service.pb.h" + +namespace tensorflow { +namespace eager { + +// This is a base class that can be implemented by a variety of +// transports (e.g. gRPC which for each of the client methods makes an RPC). 
+class EagerClient : public core::RefCounted { + public: + ~EagerClient() override {} +#define CLIENT_METHOD(method) \ + virtual void method##Async(const method##Request* request, \ + method##Response* response, \ + StatusCallback done) = 0; + + CLIENT_METHOD(CreateContext); + CLIENT_METHOD(UpdateContext); + CLIENT_METHOD(WaitQueueDone); + CLIENT_METHOD(KeepAlive); + CLIENT_METHOD(CloseContext); + +#undef CLIENT_METHOD + +#define CLIENT_METHOD_WITH_TIMEOUT_AND_RETRIES(method) \ + virtual void method##Async(const method##Request* request, \ + method##Response* response, StatusCallback done, \ + int64_t init_timeout_in_ms, int retries) = 0; + + CLIENT_METHOD_WITH_TIMEOUT_AND_RETRIES(CreateContext); + +#undef CLIENT_METHOD_WITH_TIMEOUT_AND_RETRIES + +#define CLIENT_CANCELABLE_METHOD(method) \ + virtual void method##Async( \ + CallOptions* call_opts, const method##Request* request, \ + method##Response* response, StatusCallback done) = 0; + + CLIENT_CANCELABLE_METHOD(Enqueue); + CLIENT_CANCELABLE_METHOD(RunComponentFunction); + +#undef CLIENT_CANCELABLE_METHOD + + // Feeds `request` into the request stream of EagerService::StreamingEnqueue. + // `response` will be filled with the response for this `request`. The + // 1-to-1 correspondence between requests and responses is a property + // of the current service implementation. When the response is received, + // `done` is invoked with the current status of the StreamingEnqueue call. + // The status can contain an error because of an earlier request in the + // current streaming call. + // The client initiates a streaming call the first time StreamingEnqueueAsync + // is invoked and keeps it open until some error condition. + // Similarly to the methods above, the request can be deleted as soon as + // StreamingEnqueueAsync returns. + virtual void StreamingEnqueueAsync(bool enable_streaming_enqueue, + CallOptions* call_opts, + const EnqueueRequest* request, + EnqueueResponse* response, + StatusCallback done) = 0; + + virtual bool allow_multiple_pending_requests() const = 0; +}; + +// Simple wrapper class that can be used to retrieve EagerClients. +class EagerClientCache { + public: + virtual ~EagerClientCache() {} + + // If the `target` exists, assign the EagerClient pointer to `client` and + // increment the refcount of the client. The reference ownership is + // transferred to the caller, and the unref should automatically happen when + // destructing the RefCountPtr object from the caller's side. + virtual absl::Status GetClient(const string& target, + core::RefCountPtr* client) = 0; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_service_impl.h new file mode 100644 index 00000000..924a99dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -0,0 +1,243 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
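For reference, CLIENT_METHOD(KeepAlive) above expands to the pure virtual KeepAliveAsync(const KeepAliveRequest*, KeepAliveResponse*, StatusCallback). Below is a small sketch of driving that method through an EagerClientCache, leaning on the ownership rule documented above (GetClient hands back an owned reference that RefCountPtr releases automatically). The context id is a made-up value.

#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/refcount.h"

void PingRemoteWorkerSketch(tensorflow::eager::EagerClientCache* cache,
                            const std::string& target) {
  tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> client;
  absl::Status s = cache->GetClient(target, &client);
  if (!s.ok()) {
    LOG(ERROR) << "No eager client for " << target << ": " << s;
    return;
  }
  // Request/response must stay alive until the callback runs, so they are
  // heap-allocated and released inside the callback.
  auto* request = new tensorflow::eager::KeepAliveRequest;
  auto* response = new tensorflow::eager::KeepAliveResponse;
  request->set_context_id(42);  // hypothetical context id
  client->KeepAliveAsync(request, response,
                         [request, response](const absl::Status& status) {
                           LOG(INFO) << "KeepAlive finished: " << status;
                           delete request;
                           delete response;
                         });
  // `client` goes out of scope here; the Unref happens in RefCountPtr's
  // destructor, as described in the GetClient comment above.
}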
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" +#include "tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" + +namespace tensorflow { +namespace eager { + +// A TensorFlow Eager Worker runs ops and supports worker to worker +// Tensor transfer. +// +// See eager_service.proto for more details about each method. +// This class can be wrapped by specific classes that implement rpc transports +// over this (e.g. gRPC). +class EagerServiceImpl { + public: + explicit EagerServiceImpl(WorkerEnv* env) : env_(env) { + gc_thread_.reset( + env_->env->StartThread({}, "EagerServiceContextGC", [this]() { + while (true) { + { + mutex_lock l(gc_thread_shutdown_mu_); + gc_thread_cv_.wait_for(l, std::chrono::seconds(1)); + + if (shutting_down_) { + return; + } + } + { + mutex_lock l(contexts_mu_); + for (auto it = contexts_.begin(); it != contexts_.end();) { + if (it->second->IsStale()) { + it->second->Unref(); + it = contexts_.erase(it); + } else { + it++; + } + } + } + } + })); + } + virtual ~EagerServiceImpl() { + { + mutex_lock l(gc_thread_shutdown_mu_); + shutting_down_ = true; + gc_thread_cv_.notify_all(); + } + gc_thread_.reset(); + + mutex_lock l(contexts_mu_); + for (auto& entry : contexts_) { + entry.second->Unref(); + } + } + + absl::Status CreateContext(const CreateContextRequest* request, + CreateContextResponse* response); + + absl::Status UpdateContext(const UpdateContextRequest* request, + UpdateContextResponse* response); + + // Create a ServerContext for master eager context. + absl::Status CreateMasterContext(const tensorflow::uint64 context_id, + EagerContext* context); + + static constexpr uint64 kInvalidStreamId = 0; + + // Used by both Enqueue and StreamingEnqueue RPCs. + absl::Status Enqueue(CallOptions* call_opts, const EnqueueRequest* request, + EnqueueResponse* response, + uint64 stream_id = kInvalidStreamId); + + absl::Status WaitQueueDone(const WaitQueueDoneRequest* request, + WaitQueueDoneResponse* response); + + void RunComponentFunction(CallOptions* call_opts, + const RunComponentFunctionRequest* request, + RunComponentFunctionResponse* response, + StatusCallback done); + + absl::Status KeepAlive(const KeepAliveRequest* request, + KeepAliveResponse* response); + + absl::Status CloseContext(const CloseContextRequest* request, + CloseContextResponse* response); + + protected: + // This is the server-side execution context. All state regarding execution of + // a client's ops is held in this server-side context (all generated tensors, + // and the EagerContext). + class ServerContext : public core::RefCounted { + public: + // Create a ServerContext for local master. 
+ static ServerContext* CreateMasterContext(tensorflow::EagerContext* ctx, + const WorkerEnv* env) { + return new ServerContext(ctx, -1, env, /* is_master= */ true); + } + + explicit ServerContext(tensorflow::EagerContext* ctx, + int64_t destroy_after_secs, const WorkerEnv* env, + const bool is_master = false) + : ctx_(ctx), env_(env), is_master_(is_master) { + ctx->Ref(); + destroy_after_micros_ = + destroy_after_secs * tensorflow::EnvTime::kSecondsToMicros; + RecordAccess(); + } + + ~ServerContext() override { + // TFE_Context is responsible for shutting down master eager context. + if (!is_master_) { + ctx_->WaitForAndCloseRemoteContexts(); + } + // ctx_->RefCountIsOne() should be true here when is_master_ = false. + // TODO(iga): Remove EagerContext refcounting. + ctx_->Unref(); + } + + tensorflow::EagerContext* Context() const { return ctx_; } + + void RecordAccess() { + mutex_lock l(last_accessed_mu_); + last_accessed_micros_ = env_->env->NowMicros(); + } + + bool IsStale() { + mutex_lock l(last_accessed_mu_); + const int64_t time_passed = + env_->env->NowMicros() - last_accessed_micros_; + return (destroy_after_micros_ > 0 && time_passed > destroy_after_micros_); + } + + private: + // The context for this execution. + tensorflow::EagerContext* ctx_; + + const WorkerEnv* const env_; // Not owned. + + mutex last_accessed_mu_; + int64_t last_accessed_micros_ TF_GUARDED_BY(last_accessed_mu_); + int64_t destroy_after_micros_; + + const bool is_master_; + }; + // The returned ServerContext will need to be Unrefed. + absl::Status GetServerContext(uint64, ServerContext**); + + class ClientTensorHandleDeleteNode : public EagerNode { + public: + ClientTensorHandleDeleteNode( + ServerContext* context, + std::unique_ptr handle_to_delete) + : tensorflow::EagerNode(), + context_(context), + handle_to_delete_(std::move(handle_to_delete)) { + context_->Ref(); + } + + ~ClientTensorHandleDeleteNode() override { context_->Unref(); } + + absl::Status Run() override { + VLOG(3) << "ServerContext: Deleting tensor handle " + << handle_to_delete_->op_id << ":" + << handle_to_delete_->output_num; + return context_->Context()->RemoteMgr()->DeleteTensorHandle( + *handle_to_delete_); + } + + void Abort(absl::Status status) override {} + + // Remote node deletions are best effort + bool Fatal() const override { return false; } + + string DebugString() const override { + string out = "[ClientTensorHandleDeleteNode]"; + strings::StrAppend(&out, " op_id: ", handle_to_delete_->op_id); + strings::StrAppend(&out, ", output_num: ", handle_to_delete_->output_num); + return out; + } + + private: + // Owns one reference. + ServerContext* const context_; + const std::unique_ptr handle_to_delete_; + }; + + private: + absl::Status ExecuteOp(CallOptions* call_opts, const Operation& operation, + EagerContext* eager_context, + EagerExecutor* eager_executor, + QueueResponse* queue_response); + absl::Status SendTensor(const SendTensorOp& send_tensor, + EagerContext* eager_context); + absl::Status SendPackedHandle(const SendPackedHandleOp& send_packed_handle, + EagerContext* eager_context); + absl::Status RegisterFunction(const RegisterFunctionOp& register_function, + EagerContext* eager_context); + absl::Status RemoveFunction(const RemoveFunctionOp& remove_function, + EagerContext* eager_context); + absl::Status CleanupFunction(const CleanupFunctionOp& cleanup_function); + + WorkerEnv* const env_; // Not owned. 
+ + mutex contexts_mu_; + std::unordered_map contexts_ + TF_GUARDED_BY(contexts_mu_); + + std::unique_ptr gc_thread_; + mutex gc_thread_shutdown_mu_; + condition_variable gc_thread_cv_; + bool shutting_down_ TF_GUARDED_BY(gc_thread_shutdown_mu_) = false; + + EagerServiceImpl(const EagerServiceImpl&) = delete; + void operator=(const EagerServiceImpl&) = delete; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_copy_node.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_copy_node.h new file mode 100644 index 00000000..32f3befd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_copy_node.h @@ -0,0 +1,179 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_COPY_NODE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_COPY_NODE_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace eager { + +// This node supports copying a tensor in the following way: +// - Remote -> Local: +// We don't block on the remote _Send op and start executing the local +// _Recv immediately after issuing the remote _Send. The local _Recv +// kernel (or rather the special _Recv handling in KernelAndDeviceOp::Run) +// blocks until the tensor is received. If the remote _Send (or some op +// before it) fails, the local callback we give to EnqueueAsync will run +// and call CancellationManager.StartCancel(). The blocked local _Recv will +// get this notification and return with a cancelled error. +// +// - Local -> Remote: +// The local _Send op is synchronous and non-blocking, thus it should complete +// quickly. We issue remote _Recv RPC only after local _Send completes +// successfully. At this point, the tensor to be sent is in the local +// Rendezvous, hence, remote _Recv op will not deadlock waiting for the tensor +// to appear. +// When ctx->UseSendTensorRPC() is true, we use EagerService::Enqueue +// SendTensor instead of _Send/_Recv. +// +// - Remote -> Remote: +// We could issue both remote ops asynchronously, but if remote _Send (or some +// op before it) fails, we don't have a good way of cancelling the remote +// _Recv. The remote _Recv will deadlock in this case. The current approach +// to deal with this issue is to wait for remote _Send to complete before +// issuing remote _Recv RPC. Another option is to close the whole streaming +// RPC that contains the deadlocked remote _Recv. 
This would not unblock the +// deadlocked RPC on the remote machine without some extra code. Luckily, the +// remote -> remote case seems to be fairly rare at this point. So, the +// current partially synchronous approach seems fine. +// +// To copy a tensor within a host, please use copy_to_device_node instead. +class RemoteCopyNode : public AsyncEagerNode { + public: + RemoteCopyNode(EagerContext* ctx, EagerExecutor* executor, TensorHandle* src, + TensorHandle* dst, Device* recv_device, uint64 recv_op_id); + + ~RemoteCopyNode() override; + + absl::Status Prepare() override; + + void RunAsync(StatusCallback done) override; + + void Abort(absl::Status status) override; + + string DebugString() const override { + string out = "[RemoteCopyNode]"; + strings::StrAppend(&out, " send_device: ", send_device_->name()); + strings::StrAppend(&out, ", recv_device: ", recv_device_->name()); + strings::StrAppend(&out, ", send_tensor: ", src_->DebugString()); + strings::StrAppend( + &out, ", recv_tensor: ", captured_state_->dst()->DebugString()); + return out; + } + + private: + // Runs the _Send operation locally or remotely. + // StartSend() makes sure that captured_state_->send_status_ is set to the + // final _Send status after captured_state->send_done_.WaitForNotification() + // returns. + void StartSend(); + + // Synchronously runs local send `op` and returns its status. + absl::Status RunLocalSend(EagerOperation* op); + + // Runs the _Recv operation locally or remotely. + // An error return value indicates that _Recv did not run successfully. It + // does not indicate that _Send op has completed since StartRecv could have + // encountered an error before waiting for _Send's completion. + // An OK return value does NOT necessarily indicate that _Recv has completed + // successfully (it does now, but won't when streaming RPCs are turned on). + // StartRecv() makes sure that dst_ tensor handle is handled correctly + // (potentially after this methods returns); a tensor is set in the local + // case, a remote shape is set in the remote case, the dst_ handle is + // poisoned in either case if there is an error. + void StartRecv(StatusCallback done); + + // Synchronously runs local receive `op` and returns its status. + // Does not wait for the send to complete before running receive. + absl::Status RunLocalRecv(EagerOperation* op, std::vector* outputs); + + // Waits for send to complete, then issues remote receive `op` and + // returns its status. + void RunRemoteRecv(EagerOperation* op, StatusCallback done); + + // When !ctx->UseSendTensorRPC(), then tensors are shipped between remote + // devices by the receiver invoking the WorkerService.RecvTensor RPC *on the + // sender* (Rendezvous::RecvAsync() invoked by the _Recv kernel). + // + // However, in some configurations the node that has the tensor to be copied + // isn't running a server (WorkerService RPC interface). For such cases, + // this function enables sending tensors using the EagerService.Enqueue + // SendTensor RPC *on the receiver*. + void StartRemoteSendTensor(StatusCallback done); + + // Send a local packed TensorHandle to a remote device. + void StartSendPackedHandle(StatusCallback done); + + // State that is captured by Send and/or Recv callbacks (depending on which + // one(s) is remote) and outlives this node in the case of remote->remote + // copy. 
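A stand-alone sketch of the remote -> remote ordering described above, with absl primitives standing in for the CapturedSharedState declared just below: the send callback publishes its final status, and the receive path blocks on that status before issuing the remote _Recv, so a failed send can never strand the _Recv. The names here are illustrative, not the node's real helpers.

#include <utility>

#include "absl/status/status.h"
#include "absl/synchronization/notification.h"

struct SendStateSketch {  // plays the role of CapturedSharedState's send half
  absl::Notification send_done;
  absl::Status send_status;
};

// Called from the _Send enqueue callback (compare SetSendStatus below).
inline void OnRemoteSendFinishedSketch(SendStateSketch* state,
                                       const absl::Status& s) {
  state->send_status = s;
  state->send_done.Notify();
}

// Called on the receive path (compare GetSendStatus below).
template <typename IssueRemoteRecvFn, typename DoneFn>
void StartRecvAfterSendSketch(SendStateSketch* state,
                              IssueRemoteRecvFn issue_remote_recv,
                              DoneFn done) {
  state->send_done.WaitForNotification();  // wait for the remote _Send result
  if (!state->send_status.ok()) {
    // Never issue the remote _Recv for a failed send: it would block forever
    // waiting for a tensor that is never produced.
    done(state->send_status);
    return;
  }
  issue_remote_recv(std::move(done));
}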
+ class CapturedSharedState { + public: + explicit CapturedSharedState(TensorHandle* d) : dst_(d) { dst_->Ref(); } + ~CapturedSharedState() { dst_->Unref(); } + + void SetSendStatus(absl::Status status) { + send_status_.Update(status); + send_done_.Notify(); + } + + absl::Status GetSendStatus() { + send_done_.WaitForNotification(); + return send_status_; + } + + // src_shape_ is not thread-safe. It should only be set in one thread. + void SetSrcShape(const TensorShape& shape) { src_shape_ = shape; } + + const TensorShape& GetSrcShape() { return src_shape_; } + + TensorHandle* dst() { return dst_; } + CancellationManager* recv_cancellation() { return &recv_cancellation_; } + + private: + TensorHandle* const dst_; + CancellationManager recv_cancellation_; + // send_status_ is safe to read only after send_done_.WaitForNotification() + // has returned. + absl::Status send_status_; + Notification send_done_; + TensorShape src_shape_; + }; + + TensorHandle* const src_; + EagerContext* const ctx_; + EagerExecutor* const executor_; + Device* const send_device_; + Device* const recv_device_; + const string wire_id_; + const uint64 recv_op_id_; + + std::shared_ptr captured_state_; + bool started_; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_COPY_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_execute_node.h new file mode 100644 index 00000000..d1c5359d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_execute_node.h @@ -0,0 +1,145 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_ + +#include +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/protobuf/eager_service.pb.h" + +namespace tensorflow { +namespace eager { + +// RemoteExecuteNode is an implementation of EagerNode which enqueues +// an operation via RPC in a remote EagerService. 
+class RemoteExecuteNode : public AsyncRemoteExecuteNode { + public: + RemoteExecuteNode(EagerContext* eager_context, + std::unique_ptr request, Device* device, + uint64 context_view_id, EagerClient* eager_client, + CancellationManager* cancellation_manager, + const NodeDef& ndef, + const FunctionLibraryDefinition* lib_def, + const absl::InlinedVector& inputs, + absl::Span retvals) + : AsyncRemoteExecuteNode(), + eager_context_(eager_context), + request_(std::move(request)), + device_(device), + context_view_id_(context_view_id), + eager_client_(eager_client), + cancellation_manager_(cancellation_manager), + ndef_(ndef), + lib_def_(lib_def), + inputs_(inputs) { + // Copy the output handles, since the container for them might get + // destroyed. + for (auto handle : retvals) { + handle->Ref(); + retvals_.push_back(handle); + } + + // This is required to ensure that the tensor handles stay alive across the + // execution. + for (auto handle : inputs_) { + handle->Ref(); + } + eager_client_->Ref(); + + needs_remote_inputs_ = false; + for (const TensorHandle* input : inputs_) { + // TODO(bramandia): Should this be op_device() instead? + if (input->resource_device() != nullptr && + input->resource_device() != device_) { + needs_remote_inputs_ = true; + break; + } + } + } + + ~RemoteExecuteNode() override { + for (auto handle : retvals_) { + handle->Unref(); + } + + for (auto handle : inputs_) { + handle->Unref(); + } + eager_client_->Unref(); + } + + absl::Status Prepare() override { + return RunShapeInference(ndef_, *lib_def_, inputs_, retvals_); + } + + void RunAsync(StatusCallback done) override; + + absl::Status SyncExecutors() override { + return eager_context_->SyncExecutors(); + } + + void Abort(absl::Status status) override { + int i = 0; + for (auto handle : retvals_) { + handle->PoisonRemote(status, device_, context_view_id_); + ++i; + } + } + + const EagerClient* eager_client() const override { return eager_client_; } + + bool needs_remote_inputs() const override { return needs_remote_inputs_; } + + bool allow_multiple_pending_requests() const override { + return eager_client_->allow_multiple_pending_requests(); + } + + string DebugString() const override { + string out = "[RemoteExecuteNode]"; + strings::StrAppend(&out, " request: ", request_->DebugString()); + strings::StrAppend(&out, ", target_device: ", device_->name()); + return out; + } + + private: + EagerContext* eager_context_; // Not owned, and must outlive this node. + std::unique_ptr request_; + Device* device_; // Not owned + uint64 context_view_id_; + bool needs_remote_inputs_; + EagerClient* eager_client_; // Not owned, and must outlive this node. + CancellationManager* cancellation_manager_; + const NodeDef ndef_; + const FunctionLibraryDefinition* lib_def_; + absl::InlinedVector inputs_; + absl::InlinedVector retvals_; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_mgr.h new file mode 100644 index 00000000..b62134cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_mgr.h @@ -0,0 +1,139 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace eager { + +// This class manages the states required to setup an eager cluster. +// TODO(fishx): Move remote state from context to this class. +class RemoteMgr { + public: + RemoteMgr(bool is_master, EagerContext* ctx) + : is_master_(is_master), parent_(ctx) {} + + ~RemoteMgr() { + for (const auto& entry : remote_tensor_handle_map_) { + entry.second->Unref(); + } + } + + bool IsMaster() { return is_master_; } + + void AddOperationOutputs( + const absl::Span handles, + int64_t operation_id); + + void AddOperationOutput(tensorflow::TensorHandle* handles, + int64_t operation_id, int32_t output_num); + + absl::Status GetTensorHandle(const RemoteTensorHandleInternal& remote_handle, + tensorflow::TensorHandle** handle); + + absl::Status DeleteTensorHandle( + const RemoteTensorHandleInternal& remote_handle); + + // Helper function to create monotonically increasing ids unique to this + // context. + uint64 NextOpId() { + DCHECK(is_master_); + mutex_lock l(next_id_mutex_); + return next_op_id_++; + } + + // Serialize a remote TensorHandle to a RemoteTensorHandle. + // If wait_until_ready is true, block until the remote handle is ready on a + // remote worker. + absl::Status SerializeRemoteTensorHandle( + TensorHandle* in, const bool wait_until_ready, RemoteTensorHandle* out, + Device* device, absl::string_view device_name = "", + const bool serialize_resource_dtype_and_shape = false); + + // Deserialize a RemoteTensorHandle to a TensorHandle(local/remote). + // The output holds a reference to the TensorHandle. + absl::Status DeserializeRemoteTensorHandle(const RemoteTensorHandle& in, + TensorHandle** out); + + EagerExecutor& GetOrCreateExecutorForStream(uint64 stream_id); + + void DeleteExecutorForStream(uint64 stream_id); + + protected: + mutex next_id_mutex_; + uint64 next_op_id_ TF_GUARDED_BY(next_id_mutex_) = 1; + + private: + // Returns the op_id and output_num if the given local TensorHandle exists in + // remote_tensor_handle_map_. 
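A sketch of the master-side bookkeeping this API implies: mint a unique op id, record which TensorHandle that op produced, then serialize it into the proto form carried by eager RPCs. The span/element types elided in this vendored copy are assumed to be TensorHandle*, and `handle`/`device` are assumed to come from the surrounding eager execution.

#include "absl/status/status.h"
#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
#include "tensorflow/core/distributed_runtime/eager/remote_mgr.h"

absl::Status ExportHandleSketch(tensorflow::eager::RemoteMgr* remote_mgr,
                                tensorflow::TensorHandle* handle,
                                tensorflow::Device* device,
                                tensorflow::eager::RemoteTensorHandle* out) {
  // Ids are only minted on the master (NextOpId DCHECKs is_master_).
  const tensorflow::uint64 op_id = remote_mgr->NextOpId();

  // Make the handle addressable by (op_id, output_num) for remote workers;
  // per the comment above, the map owns a reference on the handle.
  remote_mgr->AddOperationOutput(handle, op_id, /*output_num=*/0);

  // Produce the wire form. wait_until_ready=true blocks until the tensor is
  // actually ready on its worker, as documented on this method above.
  return remote_mgr->SerializeRemoteTensorHandle(
      handle, /*wait_until_ready=*/true, out, device);
}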
+ absl::Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, + const bool wait_until_ready, + int64_t* op_id, int32* output_num) + TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_); + + absl::Status GetTensorHandleImpl( + const RemoteTensorHandleInternal& remote_handle, + tensorflow::TensorHandle** handle) + TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_); + + absl::Status GetMirroredResourceShape( + const RemoteTensorHandleInternal& remote_handle, + std::vector* handle); + + bool is_master_; + + using RemoteTensorHandleMap = + gtl::FlatMap; + using MirroredResourceShapeMap = gtl::FlatMap< + RemoteTensorHandleInternal, std::vector, + RemoteTensorHandleInternalHash, RemoteTensorHandleInternalEquals>; + + mutex remote_tensor_handle_mu_; + // This map maintains the TensorHandles that are required by remote workers + // in the cluster. Each map key is generated by the master, so it should be + // globally unique. This map owns references on the handles it contains. + RemoteTensorHandleMap remote_tensor_handle_map_ + TF_GUARDED_BY(remote_tensor_handle_mu_); + + mutex mirrored_resource_shape_mu_; + // This map maintains the data types and shapes of resource variables required + // by remote workers in the cluster. Each map key is generated by the master, + // so it should be globally unique. + MirroredResourceShapeMap mirrored_resource_shape_map_ + TF_GUARDED_BY(mirrored_resource_shape_mu_); + + EagerContext* parent_; // not owned. + + mutex executor_map_mu_; + std::unordered_map executor_map_ + TF_GUARDED_BY(executor_map_mu_); +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h new file mode 100644 index 00000000..903d0191 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_ + +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" + +namespace tensorflow { +namespace eager { + +struct RemoteTensorHandleInternal { + explicit RemoteTensorHandleInternal(const RemoteTensorHandle& tensor_handle) + : op_id(tensor_handle.op_id()), output_num(tensor_handle.output_num()) {} + RemoteTensorHandleInternal(int64_t op_id, int32_t output_num) + : op_id(op_id), output_num(output_num) {} + int64_t op_id; + int32 output_num; +}; + +struct RemoteTensorHandleInternalHash { + std::size_t operator()(const RemoteTensorHandleInternal& handle) const { + return FingerprintCat64(handle.op_id, handle.output_num); + } +}; + +struct RemoteTensorHandleInternalEquals { + bool operator()(const RemoteTensorHandleInternal& first, + const RemoteTensorHandleInternal& second) const { + return first.op_id == second.op_id && first.output_num == second.output_num; + } +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h new file mode 100644 index 00000000..892d82bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -0,0 +1,84 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_DATA_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_DATA_H_ + +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Remote Tensor Handle: A handle to a Tensor on a remote host. Note that only +// the shape is known. +class RemoteTensorHandleData { + public: + // Constructor for lazy remote handles. A lazy remote handle is created on + // a remote worker with an op_id and an output_num. It doesn't control the + // lifetime of a remote handle that it refers to. If it refers to a remote + // function input, it's sent by a client which won't serialize it until + // the corresponding remote tensor is ready. So the remote tensor should be + // ready when we create a lazy remote handle. If it refers to a remote output, + // it's not ready until the shape is set. + RemoteTensorHandleData(int64_t op_id, int output_num, uint64 context_view_id, + bool is_ready); + // Constructor for unshaped remote handles. It controls the lifetime of a + // remote handle that it refers to. 
+ RemoteTensorHandleData(int64_t op_id, int output_num, + const string& remote_task, EagerContext* ctx); + ~RemoteTensorHandleData(); + + // A remote tensor handle does not have a Tensor object, hence it can only + // support the shape requests. + absl::Status Shape(TensorShape* shape) const; + absl::Status NumDims(int* num_dims) const; + absl::Status Dim(int dim_index, int64_t* dim) const; + absl::Status NumElements(int64_t* num_elements) const; + absl::Status Unprotect() { return absl::OkStatus(); } + + bool IsReady() const; + absl::Status WaitReady(const char* caller) const; + absl::Status SetShape(const TensorShape& shape); + absl::Status SetShapeAndRemoteTask(const TensorShape& shape, + const string& remote_task); + void Poison(absl::Status status); + absl::Status IsPoisoned() const; + + string DebugString() const; + + // Return the op id and output num. If wait_until_ready is true, block until + // the remote tensor is ready on a remote worker. + absl::Status OpIdAndOutputNum(bool wait_until_ready, int64_t* op_id, + int32* output_num) const; + + uint64 context_view_id() const { return context_view_id_; } + + private: + mutable mutex mu_; + bool is_ready_ TF_GUARDED_BY(mu_); + absl::Status is_poisoned_ TF_GUARDED_BY(mu_); + TensorShape shape_ TF_GUARDED_BY(mu_); + + // IDs required when this class is representing a remote tensor handle. + const int64_t op_id_; + const int32 output_num_; + string remote_task_ TF_GUARDED_BY(mu_); + uint64 context_id_; + uint64 context_view_id_; + EagerContext* ctx_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/error_payloads.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/error_payloads.h new file mode 100644 index 00000000..ae3b3e5e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/error_payloads.h @@ -0,0 +1,34 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_ERROR_PAYLOADS_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_ERROR_PAYLOADS_H_ + +// This file lists the proto payloads that may be inserted by the code within +// `tensorflow/core/distributed_runtime/` into Status instances. + +namespace tensorflow { +// Proto: tensorflow::distributed_runtime::WorkerPossiblyRestarted +// Location: tensorflow/core/protobuf/distributed_runtime_payloads.proto +// Usage: Flags the Status to be a possible outcome of a worker restart. 
+constexpr char kWorkerPossiblyRestarted[] = + "type.googleapis.com/" + "tensorflow.distributed_runtime.WorkerPossiblyRestarted"; + +constexpr char kWorkerPreemption[] = + "type.googleapis.com/tensorflow.distributed_runtime.WorkerPreemption"; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_ERROR_PAYLOADS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/graph_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/graph_mgr.h new file mode 100644 index 00000000..5c8c7ce0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/graph_mgr.h @@ -0,0 +1,214 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_GRAPH_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_GRAPH_MGR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/costmodel_manager.h" +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/distributed_runtime/message_wrappers.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/debug.pb.h" +#include "tensorflow/core/protobuf/worker.pb.h" +#include "tsl/platform/thread_annotations.h" + +namespace tsl { +class CoordinationServiceAgent; +} + +namespace tensorflow { + +class ExecutorOpts; +class StepStatsCollector; +class RendezvousMgrInterface; +class DeviceMgr; +class WorkerSession; + +// GraphMgr keeps track of a set of graphs that are registered with a +// TensorFlow worker. Each registered graph is identified by a handle +// that is generated by GraphMgr and returned to the caller. +// +// After a successful registration, the caller executes a graph using +// the graph handle. Each execution is distinguished from others by a +// caller generated global unique id "step_id". Multiple executions +// can use the same graph concurrently and independently as long as +// "step_id" used are different. +// +// Multiple threads can call GraphMgr methods concurrently. 
+//
+// E.g.,
+//   GraphMgr gmgr(worker_env);
+//   string handle;
+//   TF_CHECK_OK(gmgr.Register("session", { graph computes c = a + b },
+//                             &handle));
+//   GraphMgr::NamedTensors in = { { "a", Tensor({1, 2}) },
+//                                 { "b", Tensor({3, 4}) } };
+//   GraphMgr::NamedTensors out = { { "c", Tensor() } };
+//   TF_CHECK_OK(gmgr.Execute(handle, 0x0001, in, &out));
+//   EXPECT_EQ(out["c"], Tensor({4, 6}));
+class GraphMgr {
+ public:
+  explicit GraphMgr(const WorkerEnv* worker_env, const DeviceMgr* device_mgr);
+  ~GraphMgr();
+
+  // Registers a graph. Fills in "handle". The registered graph retains a
+  // reference to cluster_flr to do cross process function calls.
+  absl::Status Register(const string& handle, const GraphDef& gdef,
+                        const GraphOptions& graph_options,
+                        const DebugOptions& debug_options,
+                        const ConfigProto& config_proto,
+                        int64_t collective_graph_key, WorkerSession* session,
+                        DistributedFunctionLibraryRuntime* cluster_flr,
+                        string* graph_handle);
+
+  // Executes one step of a registered graph "handle".
+  //
+  // If "out" is not nullptr, "out" specifies all keys the execution
+  // should receive upon finish.
+  typedef std::map<string, Tensor> NamedTensors;
+  typedef std::function<void(const absl::Status&)> StatusCallback;
+  void ExecuteAsync(const string& handle, const int64_t step_id,
+                    const ExecutorOpts& opts, const NamedTensors& in,
+                    WorkerSession* session, StepStatsCollector* collector,
+                    MutableRunGraphResponseWrapper* response,
+                    CancellationManager* cancellation_manager,
+                    tsl::CoordinationServiceAgent* coordination_service_agent,
+                    StatusCallback done);
+
+  absl::Status SendInputs(const int64_t step_id, const NamedTensors& in);
+  absl::Status RecvOutputs(const int64_t step_id, NamedTensors* out);
+  void RecvOutputsAsync(const int64_t step_id, NamedTensors* out,
+                        StatusCallback done);
+
+  // Deregisters a graph.
+  absl::Status Deregister(const string& handle);
+
+  // Deregister all graphs.
+  absl::Status DeregisterAll();
+
+ private:
+  typedef GraphMgr ME;
+
+  struct ExecutionUnit {
+    std::unique_ptr<Graph> graph = nullptr;
+    Device* device = nullptr;               // not owned.
+    Executor* root = nullptr;               // not owned.
+    FunctionLibraryRuntime* lib = nullptr;  // not owned.
+    // Build the cost model if this value is strictly positive.
+    int64_t build_cost_model = 0;
+  };
+
+  struct Item : public core::RefCounted {
+    // TODO(zhifengc): Keeps a copy of the original graph if the need arises.
+    // TODO(zhifengc): Stats, updated by multiple runs potentially.
+    // TODO(zhifengc): Dup-detection. Ensure step_id only run once.
+    ~Item() override;
+
+    // Session handle.
+    string session;
+
+    // Graph handle.
+    string handle;
+
+    // Session configuration options for the graph.
+    ConfigProto session_config;
+
+    std::unique_ptr<FunctionLibraryDefinition> lib_def;
+    // Owns the FunctionLibraryRuntime objects needed to execute functions, one
+    // per device.
+    std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr;
+    // A graph is partitioned over multiple devices. Each partition
+    // has a root executor which may call into the runtime library.
+    std::vector<ExecutionUnit> units;
+
+    // Used to deregister a cost model when cost model is required in graph
+    // manager.
+    GraphMgr* graph_mgr;
+
+    int64_t collective_graph_key;
+  };
+
+  const WorkerEnv* worker_env_;  // Not owned.
+  const DeviceMgr* device_mgr_;
+
+  CostModelManager cost_model_manager_;
+
+  // Owned.
+  mutex mu_;
+  int64_t next_id_ TF_GUARDED_BY(mu_) = 0;
+
+  // If true, blocks until device has finished all queued operations in a step.
+  bool sync_on_finish_ = true;
+
+  // Table mapping graph handles to registered graphs.
+ // + // TODO(zhifengc): If the client does not call Deregister, we'll + // lose memory over time. We should implement a timeout-based + // mechanism to gc these graphs. + std::unordered_map table_; + + void StartParallelExecutors( + const string& handle, int64_t step_id, Item* item, Rendezvous* rendezvous, + CollectiveExecutor::Handle* ce_handle, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, + WorkerSession* session, int64_t start_time_usecs, + tsl::CoordinationServiceAgent* coordination_service_agent, + StatusCallback done); + + // Don't attempt to process cost models unless explicitly requested for at + // least one of the items. + bool skip_cost_models_ = true; + + void BuildCostModel(Item* item, StepStatsCollector* collector, + CostGraphDef* cost_graph); + + absl::Status InitItem(const string& handle, const GraphDef& gdef, + const GraphOptions& graph_options, + const DebugOptions& debug_options, + const ConfigProto& config_proto, + int64_t collective_graph_key, WorkerSession* session, + DistributedFunctionLibraryRuntime* cluster_flr, + Item* item); + + absl::Status DecorateAndPublishGraphForDebug( + const DebugOptions& debug_options, Graph* graph, Device* device); + + GraphMgr(const GraphMgr&) = delete; + void operator=(const GraphMgr&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_GRAPH_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/local_master.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/local_master.h new file mode 100644 index 00000000..e4fc37e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/local_master.h @@ -0,0 +1,113 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_ + +#include + +#include "tensorflow/core/distributed_runtime/master_interface.h" + +namespace tensorflow { + +class Master; + +// An implementation of the TensorFlow master interface that enables direct +// intraprocess communication between the client and the master implementation. +// +// This master implementation is intended to provide more efficient access to +// a master service that has been created in the same process as the client. +// +// TODO(mrry): Add methods that avoid protobuf encoding the request/response +// objects where this affects performance. +// TODO(mrry): Avoid closure creation/context switch overhead for synchronous +// invocation of Master methods. +// TODO(mrry): Make all potentially blocking Master methods take CallOptions +// for cancellation. 
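+//
+// A minimal usage sketch, reading only the declarations below (the target
+// string, ownership notes, and control flow here are illustrative
+// placeholders, not upstream documentation):
+//
+//   Master* master = ...;  // owned by the server; must outlive all wrappers
+//   LocalMaster::Register("grpc://localhost:2222", master,
+//                         /*default_timeout_in_ms=*/0);
+//
+//   auto local = LocalMaster::Lookup("grpc://localhost:2222");
+//   if (local != nullptr) {
+//     // Issue CreateSession/RunStep/CloseSession calls without an RPC
+//     // round trip.
+//   } else {
+//     // No master was registered for this target; fall back to an
+//     // RPC-based MasterInterface implementation.
+//   }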
+class LocalMaster : public MasterInterface { + public: + ~LocalMaster() override {} + + absl::Status CreateSession(CallOptions* call_options, + const CreateSessionRequest* request, + CreateSessionResponse* response) override; + + absl::Status ExtendSession(CallOptions* call_options, + const ExtendSessionRequest* request, + ExtendSessionResponse* response) override; + + absl::Status PartialRunSetup(CallOptions* call_options, + const PartialRunSetupRequest* request, + PartialRunSetupResponse* response) override; + + absl::Status RunStep(CallOptions* call_options, + RunStepRequestWrapper* request, + MutableRunStepResponseWrapper* response) override; + + MutableRunStepRequestWrapper* CreateRunStepRequest() override; + + MutableRunStepResponseWrapper* CreateRunStepResponse() override; + + absl::Status CloseSession(CallOptions* call_options, + const CloseSessionRequest* request, + CloseSessionResponse* response) override; + + absl::Status ListDevices(CallOptions* call_options, + const ListDevicesRequest* request, + ListDevicesResponse* response) override; + + // See tensorflow::Reset() and the comment on ResetRequest. + absl::Status Reset(CallOptions* call_options, const ResetRequest* request, + ResetResponse* response) override; + + absl::Status MakeCallable(CallOptions* call_options, + const MakeCallableRequest* request, + MakeCallableResponse* response) override; + absl::Status RunCallable(CallOptions* call_options, + const RunCallableRequest* request, + RunCallableResponse* response) override; + absl::Status ReleaseCallable(CallOptions* call_options, + const ReleaseCallableRequest* request, + ReleaseCallableResponse* response) override; + + // Registers the mapping from the given `target` to the given `master`. + // + // WARNING: The `master` pointer remains owned by the caller. It is + // the responsibility of the caller to ensure that `master` outlives + // any LocalMaster objects that may wrap this master. There is no + // corresponding deregister method, since clean server shutdown is + // not currently implemented for any server type. + static void Register(const string& target, Master* master, + int64_t default_timeout_in_ms); + + // Returns a pointer to the local master associated with the given + // `target`, or nullptr if none exists. + static std::unique_ptr Lookup(const string& target); + + private: + Master* master_impl_; // Not owned. + const int64_t default_timeout_in_ms_; + + // See `LocalMaster::Lookup` for the factory function that creates + // objects of this type. + LocalMaster(Master* master_impl, const int64_t default_timeout_in_ms); + + LocalMaster(const LocalMaster&) = delete; + void operator=(const LocalMaster&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master.h new file mode 100644 index 00000000..a3930249 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master.h @@ -0,0 +1,118 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_H_ + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/master_session.h" +#include "tensorflow/core/distributed_runtime/recent_request_ids.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/master.pb.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +class Master { + public: + explicit Master(MasterEnv* env, double session_gc_seconds); + virtual ~Master(); + + // Convenient typedef for a closure passing a Status. + typedef std::function MyClosure; + + void CreateSession(const CreateSessionRequest* req, + CreateSessionResponse* resp, MyClosure done); + + void ExtendSession(const ExtendSessionRequest* req, + ExtendSessionResponse* resp, MyClosure done); + + void PartialRunSetup(const PartialRunSetupRequest* req, + PartialRunSetupResponse* resp, MyClosure done); + + void RunStep(CallOptions* opts, const RunStepRequestWrapper* req, + MutableRunStepResponseWrapper* resp, MyClosure done); + + void CloseSession(const CloseSessionRequest* req, CloseSessionResponse* resp, + MyClosure done); + + void ListDevices(const ListDevicesRequest* req, ListDevicesResponse* resp, + MyClosure done); + + // See tensorflow::Reset() and the comment on ResetRequest. + void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done); + + void MakeCallable(const MakeCallableRequest* req, MakeCallableResponse* resp, + MyClosure done); + void RunCallable(CallOptions* opts, const RunCallableRequest* req, + RunCallableResponse* resp, MyClosure done); + void ReleaseCallable(const ReleaseCallableRequest* req, + ReleaseCallableResponse* resp, MyClosure done); + + private: + typedef Master ME; + + // Not owned. + MasterEnv* env_ = nullptr; + + // Owned. + mutex mu_; + + // shutdown_ is set to true by the dtor. + condition_variable shutdown_cv_; + bool shutdown_ TF_GUARDED_BY(mu_) = false; + Thread* gc_thread_; + + // Maps session handles to sessions. + std::unordered_map sessions_ TF_GUARDED_BY(mu_); + + // Moving average of step times. + MovingAverage last_1000_steps_ TF_GUARDED_BY(mu_); + + // Cumulative number of steps executed. + int64_t step_count_ TF_GUARDED_BY(mu_); + + // If a session is not active for this many seconds, it will be + // closed automatically. + const double session_gc_seconds_; + + // Used to track ids for incoming requests so we can detect duplicates. + RecentRequestIds recent_request_ids_; + + // Call CleanupAll on all workers. + void CleanupWorkers(const ResetRequest& reset); + + // Cleanup unused session. 
+ void GC(); + + // Find master session by session handle, and increments the reference count + // on the returned MasterSession if not null. + MasterSession* FindMasterSession(const string& handle); + + Master(const Master&) = delete; + void operator=(const Master&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_env.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_env.h new file mode 100644 index 00000000..b8dcf196 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_env.h @@ -0,0 +1,113 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_ENV_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_ENV_H_ + +#include +#include + +#include "xla/tsl/protobuf/rpc_options.pb.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; + +class CollectiveExecutorMgrInterface; +class Device; +class DeviceSet; +class MasterSession; +class OpRegistryInterface; + +// Options passed to the worker_cache_factory function. +struct WorkerCacheFactoryOptions { + ClusterDef cluster_def; + string job_name; + int task_index; + int replica_index = 0; + RPCOptions rpc_options; + + explicit WorkerCacheFactoryOptions() = default; + + // Construct from a ServerDef proto. + explicit WorkerCacheFactoryOptions(const ServerDef& server_def) { + if (server_def.has_cluster() && !server_def.job_name().empty()) { + cluster_def = server_def.cluster(); + job_name = server_def.job_name(); + task_index = server_def.task_index(); + rpc_options = server_def.default_session_config().rpc_options(); + replica_index = server_def.replica(); + } + } +}; + +// The master environment class, which holds a bag of pointers to +// per-master state. +// +// MasterEnv does not own its member pointers. +struct MasterEnv { + Env* env = nullptr; + + // Object from which WorkerInterface instances can be obtained. Not owned. + WorkerCacheInterface* worker_cache = nullptr; + + // The operation definitions to use. Must be filled before use. + const OpRegistryInterface* ops = nullptr; + + // Local devices co-located with this master. Devices are not owned + // by the master service. + // + // REQUIRES: !local_devices.empty(). + std::vector local_devices; + + // In large scaled distributed training, many singleton components (e.g. + // Rendezvous) can becomes the bottleneck of the system. This field allows + // us to shard the single components. 
This number will scale up with number + // of tasks in this cluster. It is always greater than 1. + int experimental_num_shards = 1; + + // Factory for creating master sessions, given session options and a + // vector of devices. + // + // The caller of the function takes ownership of the returned + // `MasterSession`, which may not be null. Ownership of the + // `MasterEnv*` is retained by the caller. + std::function>>, + std::unique_ptr, + std::unique_ptr device_set, + std::vector filtered_worker_list)> + master_session_factory; + + std::function + worker_cache_factory; + + // Generates per-step CollectiveExecutors and has access to utilities + // supporting collective operations. Not owned. + CollectiveExecutorMgrInterface* collective_executor_mgr = nullptr; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_interface.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_interface.h new file mode 100644 index 00000000..df9894f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_interface.h @@ -0,0 +1,118 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_INTERFACE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_INTERFACE_H_ + +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/message_wrappers.h" +#include "tensorflow/core/distributed_runtime/request_id.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/master.pb.h" + +namespace tensorflow { + +// Abstract interface for communicating with the TensorFlow Master service. +// +// This interface supports both RPC-based master implementations, and +// in-process master implementations that do not require an RPC +// roundtrip. 
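+//
+// A minimal calling-convention sketch based on the declarations below (the
+// session handle, feed tensor, and tensor names are illustrative
+// placeholders): wrappers returned by CreateRunStepRequest() and
+// CreateRunStepResponse() must be used with the same MasterInterface
+// instance that created them.
+//
+//   MasterInterface* master = ...;
+//   CallOptions call_opts;
+//   std::unique_ptr<MutableRunStepRequestWrapper> req(
+//       master->CreateRunStepRequest());
+//   req->set_session_handle(session_handle);
+//   req->add_feed("a:0", a_tensor);
+//   req->add_fetch("c:0");
+//   std::unique_ptr<MutableRunStepResponseWrapper> resp(
+//       master->CreateRunStepResponse());
+//   absl::Status s = master->RunStep(&call_opts, req.get(), resp.get());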
+class MasterInterface { + public: + virtual ~MasterInterface() {} + virtual absl::Status CreateSession(CallOptions* call_options, + const CreateSessionRequest* request, + CreateSessionResponse* response) = 0; + + virtual absl::Status ExtendSession(CallOptions* call_options, + const ExtendSessionRequest* request, + ExtendSessionResponse* response) = 0; + + virtual absl::Status PartialRunSetup(CallOptions* call_options, + const PartialRunSetupRequest* request, + PartialRunSetupResponse* response) { + return errors::Unimplemented("Partial run not implemented for this master"); + } + + virtual absl::Status RunStep(CallOptions* call_options, + RunStepRequestWrapper* request, + MutableRunStepResponseWrapper* response) = 0; + + virtual absl::Status RunStep(CallOptions* call_options, + const RunStepRequest* request, + RunStepResponse* response) { + std::unique_ptr wrapped_request( + new ProtoRunStepRequest(request)); + std::unique_ptr wrapped_response( + new NonOwnedProtoRunStepResponse(response)); + return RunStep(call_options, wrapped_request.get(), wrapped_response.get()); + } + + // Returns a request object for use in calls to + // `RunStep()`. Ownership is transferred to the caller. + // + // The message returned from this method must only be used in a + // `RunStep()` call on the same `MasterInterface` instance. + virtual MutableRunStepRequestWrapper* CreateRunStepRequest() { + MutableProtoRunStepRequest* ret = new MutableProtoRunStepRequest; + ret->request_.set_request_id(GetUniqueRequestId()); + return ret; + } + + // Returns a response object for use in calls to + // `RunStep()`. Ownership is transferred to the caller. + // + // The message returned from this method must only be used in a + // `RunStep()` call on the same `MasterInterface` instance. + virtual MutableRunStepResponseWrapper* CreateRunStepResponse() { + return new OwnedProtoRunStepResponse; + } + + virtual absl::Status CloseSession(CallOptions* call_options, + const CloseSessionRequest* request, + CloseSessionResponse* response) = 0; + + virtual absl::Status ListDevices(CallOptions* call_options, + const ListDevicesRequest* request, + ListDevicesResponse* response) = 0; + + virtual absl::Status Reset(CallOptions* call_options, + const ResetRequest* request, + ResetResponse* response) = 0; + + virtual absl::Status MakeCallable(CallOptions* call_options, + const MakeCallableRequest* request, + MakeCallableResponse* response) = 0; + virtual absl::Status RunCallable(CallOptions* call_options, + const RunCallableRequest* request, + RunCallableResponse* response) = 0; + virtual absl::Status ReleaseCallable(CallOptions* call_options, + const ReleaseCallableRequest* request, + ReleaseCallableResponse* response) = 0; + + protected: + // NOTE: This should only be called by implementations of this + // interface whose CreateRunStepResponse() method returns a + // proto-based wrappers for the RunStepResponse message. + RunStepResponse* get_proto_from_wrapper( + MutableRunStepResponseWrapper* wrapper) { + return wrapper->get_proto(); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_session.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_session.h new file mode 100644 index 00000000..f7016518 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/master_session.h @@ -0,0 +1,265 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_SESSION_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_SESSION_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/debugger_state_interface.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/graph_execution_state.h" +#include "tensorflow/core/common_runtime/stats_publisher_interface.h" +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/message_wrappers.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/master.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class Device; +struct MasterEnv; + +// A session encapsulates a graph computation (resource allocation, +// placement, execution, etc.). +class MasterSession : public core::RefCounted { + public: + // This session encapsulates the graph computation for a graph. + // + // The session places nodes on devices in "remote_devs" and executes + // operations on these devices. + // + // The caller takes ownership of all remote devices. + MasterSession( + const SessionOptions& options, const MasterEnv* env, + std::unique_ptr>> remote_devs, + std::unique_ptr worker_cache, + std::unique_ptr device_set, + std::vector filtered_worker_list, + StatsPublisherFactory stats_publisher_factory); + + // Initialize the MasterSession for "def". Must be called before Extend(), + // Run(), or Close(). + absl::Status Create(GraphDef&& def, const ClusterDef& cluster_def); + + // Returns the session handle. + const string& handle() const { return handle_; } + + // Returns the last access time (the number of micro-seconds since + // some fixed point in time) of this session. + uint64 last_access_time_usec() const { return last_access_time_usec_.load(); } + + // Attempt to extend the graph according to the given "req". + // (See master.proto for details of valid extensions.) + // + // PRECONDITION: The current version of this session's graph + // is "req->current_graph_version". + // + // POSTCONDITION: The current version of this session's graph + // is "resp->new_graph_version". + // + // Extend() may block the caller thread for a long time. + absl::Status Extend(const ExtendSessionRequest* req, + ExtendSessionResponse* resp); + + // Setup a partial run call. + absl::Status PartialRunSetup(const PartialRunSetupRequest* req, + PartialRunSetupResponse* resp); + + // Run one step. 
+  absl::Status Run(CallOptions* opts, const RunStepRequestWrapper& req,
+                   MutableRunStepResponseWrapper* resp);
+
+  absl::Status ListDevices(ListDevicesResponse* resp) const;
+
+  absl::Status MakeCallable(const MakeCallableRequest& req,
+                            MakeCallableResponse* resp);
+
+  absl::Status RunCallable(CallOptions* opts, const RunCallableRequest& req,
+                           RunCallableResponse* resp);
+
+  absl::Status ReleaseCallable(const ReleaseCallableRequest& req,
+                               ReleaseCallableResponse* resp);
+
+  // Close this session and delete "*this". Returns OK if all known
+  // states are cleaned up successfully.
+  //
+  // Close() may block the caller thread for a long time.
+  absl::Status Close();
+
+  // Close this session and release a reference on "*this".
+  //
+  // Note that, unlike Close(), this method does not block on the
+  // completion of all work.
+  void GarbageCollect();
+
+ private:
+  SessionOptions session_opts_;
+
+  // Not owned.
+  const MasterEnv* env_;
+
+  // The opaque session handle.
+  const string handle_;
+
+  std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs_;
+
+  // The optional session-specific worker cluster.
+  // TODO(saeta): Convert to std::optional when available.
+  const std::unique_ptr<WorkerCacheInterface> worker_cache_;
+  // Retrieves either worker_cache_ or the env_->worker_cache as appropriate.
+  WorkerCacheInterface* get_worker_cache() const;
+
+  // The device set used by this session.
+  std::unique_ptr<DeviceSet> devices_;
+
+  // The (partial device) names of remote worker tasks that this
+  // session will contact.
+  const std::vector<string> filtered_worker_list_;
+
+  StatsPublisherFactory stats_publisher_factory_;
+
+  std::atomic_ulong last_access_time_usec_;
+
+  std::atomic<int64_t> partial_run_handle_counter_ = {0};
+
+  uint64 NewStepId(int64_t graph_key);
+
+  mutex mu_;
+  std::unique_ptr<GraphExecutionState> execution_state_ TF_GUARDED_BY(mu_);
+  int64_t graph_version_;
+
+  // We keep a map from a signature of a run request to the
+  // ReffedClientGraph that can execute it. We keep up to one old copy
+  // of each ReffedClientGraph around because if it gets deallocated
+  // before a new substitute has been created, Variables can go out of
+  // scope and lose their state.
+  class ReffedClientGraph;
+  typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
+  RCGMap run_graphs_ TF_GUARDED_BY(mu_);
+  RCGMap partial_run_graphs_ TF_GUARDED_BY(mu_);
+  int64_t next_callable_handle_ TF_GUARDED_BY(mu_) = 0;
+  RCGMap callables_ TF_GUARDED_BY(mu_);
+
+  struct PerStepState {
+    bool collect_costs = false;
+    bool collect_timeline = false;
+    bool collect_rpcs = false;
+    bool collect_partition_graphs = false;
+    bool report_tensor_allocations_upon_oom = false;
+    Microseconds start_micros = Microseconds(0);
+    Microseconds end_micros = Microseconds(0);
+    std::vector<StepStats> step_stats;  // per partition
+    StepStats rpc_stats;                // for RPC layer
+    CostGraphDef cost_graph;
+  };
+
+  struct RunState {
+    std::unordered_map<string, bool> pending_inputs;   // true if fed
+    std::unordered_map<string, bool> pending_outputs;  // true if fetched
+    ReffedClientGraph* rcg = nullptr;
+    uint64 step_id;
+    int64_t collective_graph_key;
+    int64_t count = 0;
+    PerStepState pss;
+    std::unique_ptr<ProfileHandler> ph;
+    bool step_started = false;
+
+    RunState(const std::vector<string>& input_names,
+             const std::vector<string>& output_names, ReffedClientGraph* rcg,
+             const uint64 step_id, const int64_t count);
+
+    bool PendingDone() const;
+
+    ~RunState();
+  };
+  std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
+      TF_GUARDED_BY(mu_);
+
+  // Active RunStep calls.
+ condition_variable num_running_is_zero_; + int32 num_running_ TF_GUARDED_BY(mu_) = 0; + + bool closed_ TF_GUARDED_BY(mu_) = false; + bool garbage_collected_ TF_GUARDED_BY(mu_) = false; + + std::unordered_map subgraph_execution_counts_ + TF_GUARDED_BY(mu_); + + // We need to ensure that certain nodes added (e.g., send and recv + // nodes) are unique across all sub-graphs within this session. + int64_t next_node_id_ TF_GUARDED_BY(mu_) = 0; + + // Used to cancel running steps on Close(). + CancellationManager cancellation_manager_; + + // Private dtor. The client must call Close(). + ~MasterSession() override; + + // Creates sessions on all workers. + // + // If this session is operating using the new ClusterSpec propagation behavior + // call this method in order to propagate the cluster membership to all + // workers. + absl::Status CreateWorkerSessions(const ClusterDef& cluster_def); + + bool should_delete_worker_sessions_ = false; + absl::Status DeleteWorkerSessions(); + + absl::Status StartStep(const BuildGraphOptions& opts, bool is_partial, + ReffedClientGraph** out_rcg, int64_t* out_count); + void ClearRunsTable(std::vector* to_unref, + RCGMap* rcg_map) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + void FillPerStepState(MasterSession::ReffedClientGraph* rcg, + const RunOptions& run_options, uint64 step_id, + int64_t count, PerStepState* out_pss, + std::unique_ptr* out_ph); + absl::Status DoRunWithLocalExecution(CallOptions* opts, + const RunStepRequestWrapper& req, + MutableRunStepResponseWrapper* resp); + absl::Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req, + MutableRunStepResponseWrapper* resp); + absl::Status DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg, + const RunCallableRequest& req, + RunCallableResponse* resp); + absl::Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg, + uint64 step_id, const RunOptions& run_options, + PerStepState* pss, + const std::unique_ptr& ph, + const absl::Status& run_status, + RunMetadata* out_run_metadata); + + void MarkRunCompletion(); + void UpdateLastAccessTime(); + + absl::Status BuildAndRegisterPartitions(ReffedClientGraph* rcg); + + absl::Status CreateDebuggerState( + const DebugOptions& debug_options, const RunStepRequestWrapper& req, + int64_t rcg_execution_count, + std::unique_ptr* debugger_state); + + MasterSession(const MasterSession&) = delete; + void operator=(const MasterSession&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/message_wrappers.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/message_wrappers.h new file mode 100644 index 00000000..d4b07fb5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/message_wrappers.h @@ -0,0 +1,746 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/master.pb.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +//////////////////////////////////////////////////////////////////////////////// +// +// Wrapper classes for the `MasterService.RunStep` request message. +// +// The `RunStepRequest` message can contain potentially large tensor +// data as part of its `feed` submessages. Here we provide specialized +// wrappers that avoid copying the tensor data wherever possible. +// +// See `RunStepRequest` in tensorflow/core/protobuf/master.proto for the +// protocol buffer definition. +// +//////////////////////////////////////////////////////////////////////////////// + +// Abstract interface for an immutable RunStepRequest message. +// +// This interface is typically used by server-side components in the +// TensorFlow master. +class RunStepRequestWrapper { + public: + virtual ~RunStepRequestWrapper() {} + + // REQUIRED: session_handle must be returned by a CreateSession call + // to the same master service. + virtual const string& session_handle() const = 0; + + // Partial run handle (optional). If specified, this will be a partial run + // execution, run up to the specified fetches. + virtual const string& partial_run_handle() const = 0; + + // Tensors to be fed in the step. Each feed is a named tensor. + virtual size_t num_feeds() const = 0; + virtual const string& feed_name(size_t i) const = 0; + + // Stores the content of the feed value at index `i` in `tensor`. + virtual absl::Status FeedValue(size_t i, Tensor* out_tensor) const = 0; + virtual absl::Status FeedValue(size_t i, TensorProto* out_tensor) const = 0; + + // Fetches. A list of tensor names. The caller expects a tensor to + // be returned for each fetch[i] (see RunStepResponse.tensor). The + // order of specified fetches does not change the execution order. + virtual size_t num_fetches() const = 0; + virtual const string& fetch_name(size_t i) const = 0; + + // Target Nodes. A list of node names. The named nodes will be run + // to but their outputs will not be fetched. + virtual size_t num_targets() const = 0; + virtual const string& target_name(size_t i) const = 0; + + // Options for the run call. + virtual const RunOptions& options() const = 0; + + // If true then some errors, e.g., execution errors that have long + // error messages, may return an OK RunStepResponse with the actual + // error saved in the status_code/status_error_message fields of the + // response body. This is a workaround since the RPC subsystem may + // truncate long metadata messages. + virtual bool store_errors_in_response_body() const = 0; + + // Unique identifier for this request. Every RunGraphRequest must have a + // unique request_id, and retried RunGraphRequests must have the same + // request_id. If request_id is zero, retry detection is disabled. 
+  virtual int64_t request_id() const = 0;
+
+  // Returns a human-readable representation of this message for debugging.
+  virtual string DebugString() const = 0;
+
+  // Returns the wrapped data as a protocol buffer message.
+  virtual const RunStepRequest& ToProto() const = 0;
+};
+
+// Abstract interface for a mutable RunStepRequest message.
+//
+// See `RunStepRequestWrapper` above for a description of the fields.
+class MutableRunStepRequestWrapper : public RunStepRequestWrapper {
+ public:
+  virtual void set_session_handle(const string& handle) = 0;
+  virtual void set_partial_run_handle(const string& handle) = 0;
+  virtual void add_feed(const string& name, const Tensor& value) = 0;
+  virtual void add_fetch(const string& name) = 0;
+  virtual void add_target(const string& name) = 0;
+  virtual RunOptions* mutable_options() = 0;
+  virtual void set_store_errors_in_response_body(bool store_errors) = 0;
+};
+
+// Specialized (and mutable) wrapper for RunStep requests between a client and
+// master in the same address space.
+class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
+ public:
+  // RunStepRequestWrapper methods.
+  const string& session_handle() const override;
+  const string& partial_run_handle() const override;
+  size_t num_feeds() const override;
+  const string& feed_name(size_t i) const override;
+  absl::Status FeedValue(size_t i, Tensor* out_tensor) const override;
+  absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override;
+  size_t num_fetches() const override;
+  const string& fetch_name(size_t i) const override;
+  size_t num_targets() const override;
+  const string& target_name(size_t i) const override;
+  const RunOptions& options() const override;
+  string DebugString() const override;
+  const RunStepRequest& ToProto() const override;
+  bool store_errors_in_response_body() const override;
+  int64_t request_id() const override;
+
+  // MutableRunStepRequestWrapper methods.
+  void set_session_handle(const string& handle) override;
+  void set_partial_run_handle(const string& handle) override;
+  void add_feed(const string& name, const Tensor& value) override;
+  void add_fetch(const string& name) override;
+  void add_target(const string& name) override;
+  RunOptions* mutable_options() override;
+  void set_store_errors_in_response_body(bool store_errors) override;
+
+ private:
+  string session_handle_;
+  string partial_run_handle_;
+  absl::InlinedVector<std::pair<string, Tensor>, 4UL> feeds_;
+  absl::InlinedVector<string, 4UL> fetches_;
+  absl::InlinedVector<string, 4UL> targets_;
+  RunOptions options_;
+  bool store_errors_in_response_body_ = false;
+
+  // Holds a cached and owned representation of the proto
+  // representation of this request, if needed, so that `ToProto()`
+  // can return a const RunStepRequest&.
+  // NOTE(mrry): Although calls to `ToProto()` on this class are
+  // expected to be rare, retaining ownership of the returned message
+  // makes it easier to return a reference from the proto-backed
+  // representations.
+  mutable std::unique_ptr<RunStepRequest> proto_version_;
+};
+
+// Wrapper for mutable RunStep requests that uses a protobuf message.
+//
+// This wrapper class should be used for RunStep requests between a
+// client and master in different address spaces.
+class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
+ public:
+  // RunStepRequestWrapper methods.
+ const string& session_handle() const override; + const string& partial_run_handle() const override; + size_t num_feeds() const override; + const string& feed_name(size_t i) const override; + absl::Status FeedValue(size_t i, Tensor* out_tensor) const override; + absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override; + size_t num_fetches() const override; + const string& fetch_name(size_t i) const override; + size_t num_targets() const override; + const string& target_name(size_t i) const override; + const RunOptions& options() const override; + string DebugString() const override; + const RunStepRequest& ToProto() const override; + bool store_errors_in_response_body() const override; + int64_t request_id() const override; + + // MutableRunStepRequestWrapper methods. + void set_session_handle(const string& handle) override; + void set_partial_run_handle(const string& handle) override; + void add_feed(const string& name, const Tensor& value) override; + void add_fetch(const string& name) override; + void add_target(const string& name) override; + RunOptions* mutable_options() override; + void set_store_errors_in_response_body(bool store_errors) override; + + private: + RunStepRequest request_; + friend class MasterInterface; +}; + +// Wrapper for immutable RunStep requests that use a non-owned +// protobuf message. +// +// This interface is typically used by server-side components in the +// TensorFlow master, where the incoming message is a (possibly const) +// `RunStepRequest*`. +class ProtoRunStepRequest : public RunStepRequestWrapper { + public: + ProtoRunStepRequest(const RunStepRequest* request); + + // RunStepRequestWrapper methods. + const string& session_handle() const override; + const string& partial_run_handle() const override; + size_t num_feeds() const override; + const string& feed_name(size_t i) const override; + absl::Status FeedValue(size_t i, Tensor* out_tensor) const override; + absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override; + size_t num_fetches() const override; + const string& fetch_name(size_t i) const override; + size_t num_targets() const override; + const string& target_name(size_t i) const override; + const RunOptions& options() const override; + string DebugString() const override; + const RunStepRequest& ToProto() const override; + bool store_errors_in_response_body() const override; + int64_t request_id() const override; + + private: + const RunStepRequest* const request_; // Not owned. +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Wrapper classes for the `WorkerService.RunGraph` request message. +// +// The `RunGraphRequest` message can contain potentially large tensor +// data as part of its `send` submessages. Here we provide specialized +// wrappers that avoid copying the tensor data wherever possible. +// +// See `RunGraphRequest` in tensorflow/core/protobuf/worker.proto for the +// protocol buffer definition. +// +//////////////////////////////////////////////////////////////////////////////// + +// Abstract interface for an immutable RunGraphRequest message. +// +// This interface is typically used by server-side components in the +// TensorFlow worker. +class RunGraphRequestWrapper { + public: + virtual ~RunGraphRequestWrapper() {} + + // The session handle used to register the graph. If empty, a single global + // namespace is used. 
+ virtual const string& session_handle() const = 0; + + // Set to true if `CreateWorkerSession` was called for `session_handle`. + virtual bool create_worker_session_called() const = 0; + + // REQUIRED: graph_handle must be returned by a RegisterGraph call + // to the same WorkerService. + virtual const string& graph_handle() const = 0; + + // A unique ID to distinguish different runs of the same graph. + // + // The master generates a global unique `step_id` to distinguish + // different runs of the graph computation. Subgraphs communicate + // (e.g., send/recv ops) with each other using `step_id` to + // distinguish tensors generated by different runs. + virtual int64_t step_id() const = 0; + + // Options for this step. + virtual const ExecutorOpts& exec_opts() const = 0; + + // Sends the tensors in "send" into the graph before the run. + virtual size_t num_sends() const = 0; + virtual const string& send_key(size_t i) const = 0; + virtual absl::Status SendValue(size_t i, Tensor* out_tensor) const = 0; + + // Fetches the keys into `RunGraphResponse.recv` after the run. + virtual size_t num_recvs() const = 0; + virtual const string& recv_key(size_t i) const = 0; + + // True if the RunGraphRequest is a partial run request. + virtual bool is_partial() const = 0; + + // True if this is the last partial run request in a sequence of requests. + virtual bool is_last_partial_run() const = 0; + + // If true then some errors, e.g., execution errors that have long + // error messages, may return an OK RunStepResponse with the actual + // error saved in the status_code/status_error_message fields of the + // response body. This is a workaround since the RPC subsystem may + // truncate long metadata messages. + virtual bool store_errors_in_response_body() const = 0; + + virtual int64_t request_id() const = 0; + + // Returns the wrapped data as a protocol buffer message. + virtual const RunGraphRequest& ToProto() const = 0; +}; + +// Abstract interface for a mutable RunGraphRequest message. +// +// See `RunGraphRequestWrapper` above for a description of the fields. +class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper { + public: + virtual void set_session_handle(const string& handle) = 0; + virtual void set_create_worker_session_called(bool called) = 0; + virtual void set_graph_handle(const string& handle) = 0; + virtual void set_step_id(int64_t step_id) = 0; + virtual ExecutorOpts* mutable_exec_opts() = 0; + + // Stores the i^{th} feed value in `run_step_request` in this + // request with the given `send_key`. + virtual absl::Status AddSendFromRunStepRequest( + const RunStepRequestWrapper& run_step_request, size_t i, + const string& send_key) = 0; + virtual absl::Status AddSendFromRunCallableRequest( + const RunCallableRequest& run_callable_request, size_t i, + const string& send_key) = 0; + + virtual void add_recv_key(const string& recv_key) = 0; + virtual void set_is_partial(bool is_partial) = 0; + virtual void set_is_last_partial_run(bool is_last_partial_run) = 0; + virtual void set_store_errors_in_response_body(bool store_errors) = 0; + virtual void set_request_id(int64_t request_id) = 0; +}; + +class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper { + public: + // RunGraphRequestWrapper methods. 
+ const string& session_handle() const override; + const string& graph_handle() const override; + bool create_worker_session_called() const override; + int64_t step_id() const override; + const ExecutorOpts& exec_opts() const override; + size_t num_sends() const override; + const string& send_key(size_t i) const override; + absl::Status SendValue(size_t i, Tensor* out_tensor) const override; + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + bool is_partial() const override; + bool is_last_partial_run() const override; + const RunGraphRequest& ToProto() const override; + bool store_errors_in_response_body() const override; + int64_t request_id() const override; + + // MutableRunGraphRequestWrapper methods. + void set_session_handle(const string& handle) override; + void set_create_worker_session_called(bool called) override; + void set_graph_handle(const string& handle) override; + void set_step_id(int64_t step_id) override; + ExecutorOpts* mutable_exec_opts() override; + absl::Status AddSendFromRunStepRequest( + const RunStepRequestWrapper& run_step_request, size_t i, + const string& send_key) override; + absl::Status AddSendFromRunCallableRequest( + const RunCallableRequest& run_callable_request, size_t i, + const string& send_key) override; + void add_recv_key(const string& recv_key) override; + void set_is_partial(bool is_partial) override; + void set_is_last_partial_run(bool is_last_partial_run) override; + void set_store_errors_in_response_body(bool store_errors) override; + void set_request_id(int64_t request_id) override; + + private: + string session_handle_; + bool create_worker_session_called_ = false; + string graph_handle_; + int64_t step_id_; + ExecutorOpts exec_opts_; + absl::InlinedVector, 4UL> sends_; + absl::InlinedVector recvs_; + bool is_partial_ = false; + bool is_last_partial_run_ = false; + bool store_errors_in_response_body_ = false; + int64_t request_id_ = 0; + + // Holds a cached and owned representation of the proto + // representation of this request, if needed, so that `ToProto()` + // can return a const RunGraphRequest&. + // NOTE(mrry): Although calls to `ToProto()` on this class are + // expected to be rare, retaining ownership of the returned message + // makes it easier to return a reference from the proto-backed + // representations. + mutable std::unique_ptr proto_version_; +}; + +class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper { + public: + // RunGraphRequestWrapper methods. + const string& session_handle() const override; + bool create_worker_session_called() const override; + const string& graph_handle() const override; + int64_t step_id() const override; + const ExecutorOpts& exec_opts() const override; + size_t num_sends() const override; + const string& send_key(size_t i) const override; + absl::Status SendValue(size_t i, Tensor* out_tensor) const override; + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + bool is_partial() const override; + bool is_last_partial_run() const override; + bool store_errors_in_response_body() const override; + int64_t request_id() const override; + const RunGraphRequest& ToProto() const override; + + // MutableRunGraphRequestWrapper methods. 
+ void set_session_handle(const string& handle) override; + void set_create_worker_session_called(bool called) override; + void set_graph_handle(const string& handle) override; + void set_step_id(int64_t step_id) override; + ExecutorOpts* mutable_exec_opts() override; + absl::Status AddSendFromRunStepRequest( + const RunStepRequestWrapper& run_step_request, size_t i, + const string& send_key) override; + absl::Status AddSendFromRunCallableRequest( + const RunCallableRequest& run_callable_request, size_t i, + const string& send_key) override; + void add_recv_key(const string& recv_key) override; + void set_is_partial(bool is_partial) override; + void set_is_last_partial_run(bool is_last_partial_run) override; + void set_store_errors_in_response_body(bool store_errors) override; + void set_request_id(int64_t request_id) override; + + private: + RunGraphRequest request_; +}; + +class ProtoRunGraphRequest : public RunGraphRequestWrapper { + public: + ProtoRunGraphRequest(const RunGraphRequest* request); + + // RunGraphRequestWrapper methods. + const string& session_handle() const override; + bool create_worker_session_called() const override; + const string& graph_handle() const override; + int64_t step_id() const override; + const ExecutorOpts& exec_opts() const override; + size_t num_sends() const override; + const string& send_key(size_t i) const override; + absl::Status SendValue(size_t i, Tensor* out_tensor) const override; + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + bool is_partial() const override; + bool is_last_partial_run() const override; + bool store_errors_in_response_body() const override; + int64_t request_id() const override; + const RunGraphRequest& ToProto() const override; + + private: + const RunGraphRequest* const request_; // Not owned. +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Wrapper classes for the `WorkerService.RunGraph` response message. +// +// The `RunGraphResponse` message can contain potentially large tensor +// data as part of its `recv` submessages. Here we provide specialized +// wrappers that avoid copying the tensor data wherever possible. +// +// See `RunGraphResponse` in tensorflow/core/protobuf/worker.proto for the +// protocol buffer definition. +// +//////////////////////////////////////////////////////////////////////////////// + +// Abstract interface for a mutable RunGraphResponse message. +// +// Note that there is no corresponding (immutable) +// RunGraphResponseWrapper class, because the RunGraphResponse object +// is always used as a mutable pointer. +class MutableRunGraphResponseWrapper { + public: + virtual ~MutableRunGraphResponseWrapper() {} + + // A list of tensors corresponding to those requested by + // `RunGraphRequest.recv_key`. + virtual size_t num_recvs() const = 0; + virtual const string& recv_key(size_t i) const = 0; + // NOTE: The following methods may perform a destructive read, for + // efficiency. + virtual absl::Status RecvValue(size_t i, TensorProto* out_tensor) = 0; + virtual absl::Status RecvValue(size_t i, Tensor* out_tensor) = 0; + virtual void AddRecv(const string& key, const Tensor& value) = 0; + + // Submessages that store performance statistics about the subgraph + // execution, if necessary. 
+ virtual StepStats* mutable_step_stats() = 0; + virtual CostGraphDef* mutable_cost_graph() = 0; + virtual size_t num_partition_graphs() const = 0; + virtual GraphDef* mutable_partition_graph(size_t i) = 0; + virtual void AddPartitionGraph(const GraphDef& partition_graph) = 0; + + // Returned status if requested. + virtual absl::Status status() const = 0; + virtual absl::StatusCode status_code() const = 0; + virtual void set_status(const absl::Status& status) = 0; + + protected: + // Returns a mutable protobuf message that represents the contents of + // this wrapper, for passing to an RPC subsystem that will populate + // the message. + // + // NOTE: Only `WorkerInterface` subclasses may call this method. The + // `InMemoryRunGraphResponse` subclass does not implement this + // method, and attempts to call it will fail with a fatal + // error. However, as long as callers always call + // `WorkerInterface::RunGraphAsync()` with a wrapper object returned + // from `WorkerInterface::CreateRunGraphResponse()` called on the + // *same* WorkerInterface object, this error will never trigger. + virtual RunGraphResponse* get_proto() = 0; + friend class WorkerInterface; +}; + +class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper { + public: + // MutableRunGraphResponseWrapper methods. + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; + absl::Status RecvValue(size_t i, Tensor* out_tensor) override; + void AddRecv(const string& key, const Tensor& value) override; + StepStats* mutable_step_stats() override; + CostGraphDef* mutable_cost_graph() override; + size_t num_partition_graphs() const override; + GraphDef* mutable_partition_graph(size_t i) override; + void AddPartitionGraph(const GraphDef& partition_graph) override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + // NOTE: This method is not implemented. See + // MutableRunGraphResponseWrapper for an explanation. + RunGraphResponse* get_proto() override; + + private: + absl::InlinedVector, 4UL> recvs_; + StepStats step_stats_; + CostGraphDef cost_graph_; + std::vector partition_graphs_; + // Store the code and message separately so that they can be updated + // independently by setters. + absl::Status status_; +}; + +// Proto-based message wrapper for use on the client side of the RunGraph RPC. +class OwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper { + public: + // MutableRunGraphResponseWrapper methods. + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; + absl::Status RecvValue(size_t i, Tensor* out_tensor) override; + void AddRecv(const string& key, const Tensor& value) override; + StepStats* mutable_step_stats() override; + CostGraphDef* mutable_cost_graph() override; + size_t num_partition_graphs() const override; + GraphDef* mutable_partition_graph(size_t i) override; + void AddPartitionGraph(const GraphDef& partition_graph) override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + RunGraphResponse* get_proto() override; + + private: + RunGraphResponse response_; +}; + +// Proto-based message wrapper for use on the server side of the RunGraph RPC. 
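// Illustrative sketch (not part of this header): because `RecvValue()` may
// perform a destructive read, a caller can drain received tensors from any
// response wrapper without forcing a copy of the underlying buffers; the
// wrapper decides whether that is a move (in-memory case) or a proto decode
// (proto-backed cases). The helper name and the std::pair output format are
// assumptions for illustration only, and <utility>/<vector> are assumed to be
// available.
inline absl::Status DrainReceivedTensors(
    MutableRunGraphResponseWrapper* response,
    std::vector<std::pair<string, Tensor>>* out) {
  out->reserve(out->size() + response->num_recvs());
  for (size_t i = 0; i < response->num_recvs(); ++i) {
    Tensor value;
    // May consume the wrapper's stored tensor; do not call RecvValue twice
    // for the same index.
    absl::Status s = response->RecvValue(i, &value);
    if (!s.ok()) return s;
    out->emplace_back(response->recv_key(i), std::move(value));
  }
  return absl::OkStatus();
}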
+class NonOwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper { + public: + NonOwnedProtoRunGraphResponse(RunGraphResponse* response); + + // MutableRunGraphResponseWrapper methods. + size_t num_recvs() const override; + const string& recv_key(size_t i) const override; + absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; + absl::Status RecvValue(size_t i, Tensor* out_tensor) override; + void AddRecv(const string& key, const Tensor& value) override; + StepStats* mutable_step_stats() override; + CostGraphDef* mutable_cost_graph() override; + size_t num_partition_graphs() const override; + GraphDef* mutable_partition_graph(size_t i) override; + void AddPartitionGraph(const GraphDef& partition_graph) override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + RunGraphResponse* get_proto() override; + + private: + RunGraphResponse* const response_; +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Wrapper classes for the `MasterService.RunStep` response message. +// +// The `RunStepResponse` message can contain potentially large tensor +// data as part of its `tensor` submessages. Here we provide specialized +// wrappers that avoid copying the tensor data wherever possible. +// +// See `RunStepResponse` in tensorflow/core/protobuf/master.proto for the +// protocol buffer definition. +// +//////////////////////////////////////////////////////////////////////////////// + +// Abstract interface for a mutable RunStepResponse message. +// +// Note that there is no corresponding (immutable) +// RunStepResponseWrapper class, because the RunStepResponse object is +// always used as a mutable pointer. +class MutableRunStepResponseWrapper { + public: + virtual ~MutableRunStepResponseWrapper(); + + // The values of the tensors whose fetching was requested in the + // RunStep call. + // + // NOTE: The order of the returned tensors may or may not match + // the fetch order specified in RunStepRequest. + virtual size_t num_tensors() const = 0; + virtual const string& tensor_name(size_t i) const = 0; + virtual absl::Status TensorValue(size_t i, Tensor* out_tensor) const = 0; + + // Stores the i^{th} recv value in `run_graph_response` in this + // response with the given `name`. + virtual absl::Status AddTensorFromRunGraphResponse( + const string& name, MutableRunGraphResponseWrapper* run_graph_response, + size_t i) = 0; + + // Returned metadata if requested in the options. + virtual const RunMetadata& metadata() const = 0; + virtual RunMetadata* mutable_metadata() = 0; + + // Returned status if requested. + virtual absl::Status status() const = 0; + virtual absl::StatusCode status_code() const = 0; + virtual void set_status(const absl::Status& status) = 0; + + protected: + // Returns a mutable protobuf message that represents the contents of + // this wrapper, for passing to an RPC subsystem that will populate + // the message. + // + // NOTE: Only `MasterInterface` subclasses may call this method. The + // `InMemoryRunStepResponse` subclass does not implement this + // method, and attempts to call it will fail with a fatal + // error. However, as long as callers always call + // `MasterInterface::RunStep()` with a wrapper object returned + // from `MasterInterface::CreateRunStepResponse()` called on the + // *same* MasterInterface object, this error will never trigger. 
+ virtual RunStepResponse* get_proto() = 0; + friend class MasterInterface; +}; + +class InMemoryRunStepResponse : public MutableRunStepResponseWrapper { + public: + // MutableRunStepResponseWrapper methods. + size_t num_tensors() const override; + const string& tensor_name(size_t i) const override; + absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; + absl::Status AddTensorFromRunGraphResponse( + const string& name, MutableRunGraphResponseWrapper* run_graph_response, + size_t i) override; + const RunMetadata& metadata() const override; + RunMetadata* mutable_metadata() override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + // NOTE: This method is not implemented. See + // MutableRunGraphResponseWrapper for an explanation. + RunStepResponse* get_proto() override; + + private: + absl::InlinedVector, 4UL> tensors_; + RunMetadata metadata_; + // Store the code and message separately so that they can be updated + // independently by setters. + absl::Status status_; +}; + +// Proto-based message wrapper for use on the client side of the RunStep RPC. +class OwnedProtoRunStepResponse : public MutableRunStepResponseWrapper { + public: + // MutableRunStepResponseWrapper methods. + size_t num_tensors() const override; + const string& tensor_name(size_t i) const override; + absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; + absl::Status AddTensorFromRunGraphResponse( + const string& name, MutableRunGraphResponseWrapper* run_graph_response, + size_t i) override; + const RunMetadata& metadata() const override; + RunMetadata* mutable_metadata() override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + RunStepResponse* get_proto() override; + + private: + RunStepResponse response_; +}; + +// Proto-based message wrapper for use on the server side of the RunStep RPC. +class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper { + public: + NonOwnedProtoRunStepResponse(RunStepResponse* response); + + // MutableRunStepResponseWrapper methods. + size_t num_tensors() const override; + const string& tensor_name(size_t i) const override; + absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; + absl::Status AddTensorFromRunGraphResponse( + const string& name, MutableRunGraphResponseWrapper* run_graph_response, + size_t i) override; + const RunMetadata& metadata() const override; + RunMetadata* mutable_metadata() override; + absl::Status status() const override; + absl::StatusCode status_code() const override; + void set_status(const absl::Status& status) override; + + protected: + RunStepResponse* get_proto() override; + + private: + RunStepResponse* response_; // Not owned. +}; + +bool ParseTensorProtoToTensor(const TensorProto& tensor_proto, + Tensor* out_tensor); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/partial_run_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/partial_run_mgr.h new file mode 100644 index 00000000..bf2b2b1a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/partial_run_mgr.h @@ -0,0 +1,88 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_ + +#include + +#include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// PartialRunMgr keeps track of pending partial run requests, and ensures that +// the partial run is only marked complete when the corresponding executor is +// run to completion. +// +// In tensorflow workers, the executor runs operations asynchronously until +// specified fetches (operations that return tensors) or targets (operations +// that don't return tensors) are reached. A PartialRun has two components: a +// setup which specifies all desired fetches and targets, and run calls that +// specify fetch values (from the setup calls) to retrieve. +// On the last partial run call, it is possible to satisfy the +// required fetches before the executor has completed running the graph to all +// the desired targets. +// PartialRunMgr is used to ensure that we don't complete and return the final +// partial run call to the user until both the partial run and executor have +// completed. +// +// PartialRunMgr is thread-safe. +class PartialRunMgr { + public: + // Find or create the CancellationManager associated with step_id. + // The PartialRunMgr owns the cancellation_manager. + // Returns true if a new CancellationManager was created + // (i.e this is a new partial run). + bool FindOrCreate(int step_id, CancellationManager** cancellation_manager); + + // Calls the final callback if the PartialRunRequest has already completed. + // Otherwise stores the executor_status to be propagated when the + // PartialRunRequest completes (PartialRunDone has been called). + void ExecutorDone(int step_id, const absl::Status& executor_status); + + // Calls done if the executor has already completed (ExecutorDone has been + // called). Otherwise, stores the status and done callback, calling them when + // ExecutorDone is called. The callback will either be called by the calling + // thread of either PartialRunDone or ExecutorDone. + // If executor_status in ExecutorDone is not OK, it takes precedence over + // status and is passed to the done callback. + void PartialRunDone(int step_id, StatusCallback done, + const absl::Status& status); + + private: + // PartialRunState stores state associated with a pending partial run request. + // This is protected by the mutex in PartialRunMgr. 
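  // Illustrative sketch (not part of this class): the expected interplay of the
  // three public methods above, as seen from a worker handling partial runs.
  // `partial_run_mgr`, `executor_args` and `done` are hypothetical locals, and
  // the executor callback shape is an assumption for illustration.
  //
  //   CancellationManager* cm = nullptr;
  //   if (partial_run_mgr->FindOrCreate(step_id, &cm)) {
  //     // First RunGraph call for this step: start the executor exactly once.
  //     RunExecutorAsync(executor_args, cm,
  //                      [partial_run_mgr, step_id](const absl::Status& s) {
  //                        partial_run_mgr->ExecutorDone(step_id, s);
  //                      });
  //   }
  //   // ... on the request marked is_last_partial_run():
  //   // Completion is deferred until ExecutorDone has also fired; a non-OK
  //   // executor status takes precedence over `status`.
  //   partial_run_mgr->PartialRunDone(step_id, std::move(done), status);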
+ struct PartialRunState { + std::unique_ptr cancellation_manager; + + bool executor_done = false; + StatusCallback final_callback = nullptr; + absl::Status final_status; + }; + + mutex mu_; + + std::unordered_map> + step_id_to_partial_run_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/preemption/preemption_sync_manager.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/preemption/preemption_sync_manager.h new file mode 100644 index 00000000..cbe03db6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/preemption/preemption_sync_manager.h @@ -0,0 +1,27 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PREEMPTION_PREEMPTION_SYNC_MANAGER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PREEMPTION_PREEMPTION_SYNC_MANAGER_H_ + +#include "xla/tsl/distributed_runtime/preemption/preemption_sync_manager.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CreatePreemptionSyncManager; +using tsl::PreemptionSyncManager; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PREEMPTION_PREEMPTION_SYNC_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/recent_request_ids.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/recent_request_ids.h new file mode 100644 index 00000000..2eb35ac7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/recent_request_ids.h @@ -0,0 +1,104 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+
+#include
+#include
+#include
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+
+namespace tensorflow {
+
+// RecentRequestIds tracks recent 64-bit request_ids. When maximum capacity is
+// reached, the oldest request_id is evicted. Thread safe.
+//
+// Some RPCs like RecvTensor are unsafe to retry. For example, RecvTensor pairs
+// one sender and one receiver, and the receiver waits for the sender's tensor.
+// Retried RecvTensor requests are problematic, because the original RecvTensor
+// request may have consumed the sender's tensor, so a retried request might
+// block forever. RecentRequestIds identifies retried requests, so we can fail
+// them instead of blocking forever.
+//
+// Internally, recent request_ids are stored in two data structures: a set and a
+// circular buffer. The set is used for efficient lookups, and the circular
+// buffer tracks the oldest request_id. When the buffer is full, the new
+// request_id replaces the oldest request_id in the circular buffer, and the
+// oldest request_id is removed from the set.
+class RecentRequestIds {
+ public:
+  // num_tracked_request_ids should be much larger than the number of RPCs that
+  // can be received in a small time window. For example, we observed a peak RPC
+  // rate of ~700 RecvTensor RPC/s when training inception v3 on TPUs, so we
+  // currently set num_tracked_request_ids to 100,000 for RecvTensor.
+  // Using a large `num_shards` helps avoid lock contention in this class.
+  explicit RecentRequestIds(int num_tracked_request_ids, int num_shards = 1);
+
+  // Returns OK iff request_id has not been seen in the last
+  // num_tracked_request_ids insertions. For backwards compatibility, this
+  // always returns OK for request_id 0. The method_name and the request's
+  // ShortDebugString are added to returned errors.
+  absl::Status TrackUnique(int64_t request_id, const string& method_name,
+                           const protobuf::Message& request);
+  // Overloaded version of the above function for wrapped protos.
+  template <typename RequestWrapper>
+  absl::Status TrackUnique(int64_t request_id, const string& method_name,
+                           const RequestWrapper* wrapper);
+
+ private:
+  bool Insert(int64_t request_id);
+
+  struct IndexBucket {
+    mutex mu;
+    // next_index indexes into circular_buffer_, and points to the next storage
+    // space to use. When the buffer is full, next_index_ points at the oldest
+    // request_id.
+    int next_index TF_GUARDED_BY(mu) = 0;
+    std::vector<int64_t> circular_buffer TF_GUARDED_BY(mu);
+    absl::flat_hash_set<int64_t> set TF_GUARDED_BY(mu);
+  };
+
+  // This vector is immutable so we don't need to use a mutex to protect it.
+  std::vector<IndexBucket> index_buckets_;
+};
+
+// Implementation details
+
+template <typename RequestWrapper>
+absl::Status RecentRequestIds::TrackUnique(int64_t request_id,
+                                           const string& method_name,
+                                           const RequestWrapper* wrapper) {
+  if (Insert(request_id)) {
+    return absl::OkStatus();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. 
", + wrapper->ToProto().ShortDebugString()); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/remote_device.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/remote_device.h new file mode 100644 index 00000000..591531f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/remote_device.h @@ -0,0 +1,72 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REMOTE_DEVICE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REMOTE_DEVICE_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; +class DeviceAttributes; +class Device; +class WorkerCacheInterface; + +// This callback should have the same definition as DeviceMgr::LookupDevice +// It assigns *device with pointer to Device of the given 'name', where 'name' +// is either a full device name, or just the replica-local suffix. +typedef std::function + LookupLocalDevice; + +// Creates Remote Devices for the provided device attributes. Helpful when the +// list of attributes is known, and doesn't need to be discovered via RPC. +void AsRemoteDevices( + Env* env, + const protobuf::RepeatedPtrField& device_attributes, + LookupLocalDevice lookup_local_device, + std::vector>* remote_devices); + +// NewRemoteDevices discovers available devices on the +// 'worker_name'. The implementation uses 'channel_cache' to +// discover how to communicate with the 'worker_name' (via gRPC, for +// example). +// +// NewRemoteDevices does not block. +// +// On success, the 'done' callback is given the OK status and a vector +// of Device*. The caller should take ownership of these devices. +// +// Otherwise, the 'done' callback is given an error status and the +// vector is empty. +typedef std::function*)> + NewRemoteDevicesDone; +void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache, + const string& worker_name, NewRemoteDevicesDone done); + +// Create Remote Device based on the given attributes. +std::unique_ptr NewRemoteDevice(Env* env, + DeviceAttributes device_attribute); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REMOTE_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h new file mode 100644 index 00000000..6ec759d4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h @@ -0,0 +1,110 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_
+
+#include
+
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class WorkerSession;
+
+// RemoteRendezvous follows a two-part initialization. First the objects are
+// constructed. Eventually, they will be initialized. Clients of the
+// RendezvousMgrInterface must guarantee to call Initialize on the returned
+// RemoteRendezvous eventually.
+//
+// A partially initialized RemoteRendezvous must respect the Rendezvous
+// interface (i.e. Send() must never block); however, implementations are not
+// expected to actually perform the underlying operations until after the
+// RemoteRendezvous has been Initialize'd.
+class RemoteRendezvous : public Rendezvous {
+ public:
+  // Fully construct the RemoteRendezvous.
+  virtual absl::Status Initialize(WorkerSession* session) = 0;
+
+  // In remote eager, set the current instance as the context-default
+  // rendezvous, which will be used for eager op-by-op execution.
+  virtual void SetRemoteEagerContextDefault() = 0;
+  // In remote eager, check whether the current instance is the context-default
+  // rendezvous.
+  virtual bool IsRemoteEagerContextDefault() = 0;
+
+ protected:
+  bool is_cross_process() override { return true; }
+};
+
+// RendezvousMgr keeps track of a set of local rendezvous instances.
+// All tensors sent by this worker are buffered in a RendezvousMgr
+// until the tensor is received. Each globally unique "step_id"
+// corresponds to one local rendezvous instance managed by a
+// RendezvousMgr.
+//
+// E.g.,
+//   Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935);
+//   fork execution of a graph executor using "rendez" on thread 1;
+//   fork execution of another graph executor using "rendez" on thread 2;
+//   ...
+//   join threads 1 and 2;
+//
+// In the example above, execution in threads 1 and 2 communicates
+// via send/recv operations through "rendez".
+//
+// Tensors sent and received through a rendezvous managed by this
+// RendezvousMgr must have keys generated by Rendezvous::CreateKey.
+class RendezvousMgrInterface {
+ public:
+  RendezvousMgrInterface() = default;
+  virtual ~RendezvousMgrInterface() {}
+
+  // Returns a Rendezvous supporting send and recv among workers in the
+  // "step_id". The caller takes ownership of one reference on the
+  // returned Rendezvous instance.
+  //
+  // Note: the caller must guarantee to eventually call Initialize on the
+  // returned RemoteRendezvous.
+  virtual tsl::core::RefCountPtr<RemoteRendezvous> Find(int64_t step_id) = 0;
+
+  // Finds the local rendezvous instance for the "step_id". Runs
+  // "done" when the tensor for "key" is produced or an error occurs.
+ // + // This method is used by the rpc handler of RecvTensor. + virtual void RecvLocalAsync(int64_t step_id, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) = 0; + + // Synchronous wrapper for RecvLocalAsync. + virtual absl::Status RecvLocal(int64_t step_id, + const Rendezvous::ParsedKey& parsed, + Tensor* val, bool* is_dead) = 0; + + // Removes rendezvous for "step_id". + // + // TODO(zhifengc): Have a background thread in worker that + // periodically calls CleanupAll(). + virtual void Cleanup(int64_t step_id) = 0; + + // Remove all rendezvous instances owned by the rendezvous_mgr. + virtual void CleanupAll() = 0; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/request_id.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/request_id.h new file mode 100644 index 00000000..2f7b3b46 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/request_id.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/random.h" + +namespace tensorflow { + +// Returns a request_id for use with RecentRequestIds. This number will not be +// zero, and must be unique over RecentRequestIds' window of +// num_tracked_request_ids. See recent_request_ids.h for more details. +int64_t GetUniqueRequestId(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_client.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_client.h new file mode 100644 index 00000000..b692ce70 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_client.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_CLIENT_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_CLIENT_H_ + +#include + +#include "xla/tsl/distributed_runtime/rpc/coordination/grpc_coordination_client.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::NewGrpcCoordinationClient; +using tsl::NewGrpcCoordinationClientCache; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h new file mode 100644 index 00000000..9e0a218a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_SERVICE_IMPL_H_ + +#include "xla/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::GrpcCoordinationServiceImpl; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COORDINATION_GRPC_COORDINATION_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h new file mode 100644 index 00000000..2eb41b8a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_ + +#include + +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" + +namespace tensorflow { +namespace eager { +// The GrpcChannelCache is not owned. +EagerClientCache* NewGrpcEagerClientCache( + std::shared_ptr channel); +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h new file mode 100644 index 00000000..24cd17a4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h @@ -0,0 +1,21 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_ + +#include "tensorflow/core/protobuf/eager_service.grpc.pb.h" + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h new file mode 100644 index 00000000..7acc2955 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h @@ -0,0 +1,175 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_ + +#include + +#include "grpcpp/alarm.h" +#include "grpcpp/completion_queue.h" +#include "grpcpp/server_builder.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "xla/tsl/distributed_runtime/rpc/async_service_interface.h" +#include "xla/tsl/distributed_runtime/rpc/grpc_call.h" +#include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h" +#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/protobuf/eager_service.pb.h" + +namespace tensorflow { +namespace eager { + +// This class is a wrapper that handles communication for gRPC. +class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface { + public: + template + using EagerCall = + tsl::Call; + template + using StreamingCall = + tsl::ServerBidirectionalStreamingCall; + + GrpcEagerServiceImpl(WorkerEnv* env, ::grpc::ServerBuilder* server_builder); + virtual ~GrpcEagerServiceImpl() {} + + // Create a master context in eager service. + absl::Status CreateMasterContext(tensorflow::uint64 context_id, + EagerContext* context); + + void HandleRPCsLoop() override; + void Shutdown() override; + + private: +#define HANDLER(method) \ + void method##Handler(EagerCall* call) { \ + env_->compute_pool->Schedule([this, call]() { \ + call->SendResponse( \ + ToGrpcStatus(local_impl_.method(&call->request, &call->response))); \ + }); \ + tsl::Call:: \ + EnqueueRequest(&service_, cq_.get(), \ + &grpc::EagerService::AsyncService::Request##method, \ + &GrpcEagerServiceImpl::method##Handler, false); \ + } + HANDLER(CreateContext); + HANDLER(UpdateContext); + HANDLER(WaitQueueDone); + HANDLER(KeepAlive); + HANDLER(CloseContext); +#undef HANDLER + + void EnqueueHandler(EagerCall* call) { + env_->compute_pool->Schedule([this, call]() { + auto call_opts = std::make_shared(); + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + call->SendResponse(ToGrpcStatus(local_impl_.Enqueue( + call_opts.get(), &call->request, &call->response))); + }); + tsl::Call:: + EnqueueRequest(&service_, cq_.get(), + &grpc::EagerService::AsyncService::RequestEnqueue, + &GrpcEagerServiceImpl::EnqueueHandler, + /*supports_cancel=*/true); + } + + void RunComponentFunctionHandler( + EagerCall* + call) { + env_->compute_pool->Schedule([this, call]() { + auto call_opts = std::make_shared(); + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + local_impl_.RunComponentFunction( + call_opts.get(), &call->request, &call->response, + [call, call_opts](const absl::Status& s) { + call->ClearCancelCallback(); + call->SendResponse(ToGrpcStatus(s)); + }); + }); + tsl::Call:: + EnqueueRequest( + &service_, cq_.get(), + &grpc::EagerService::AsyncService::RequestRunComponentFunction, + &GrpcEagerServiceImpl::RunComponentFunctionHandler, + /*supports_cancel=*/true); + } + + // Called when a new request has been received as part of a StreamingEnqueue + // call. + // StreamingEnqueueHandler gets the request from the `call` and fills the + // response (also found in `call`) by invoking the local EagerServiceImpl. + // The local EagerServiceImpl is invoked in a single-threaded thread pool. We + // do this to preserve request order. 
The local service can parallelize based + // on context_id in request if necessary. Remote contexts are created in async + // mode by default, so the local service impl just puts the request on eager + // executor queue. + void StreamingEnqueueHandler( + StreamingCall* call) { + call->Ref(); + enqueue_streaming_thread_.Schedule([this, call]() { + if (call->RefCountIsOne()) { + // This StreamingCall has already been shutdown. Don't need to anything. + call->Unref(); + return; + } + // NOTE(fishx): Use the address of StreamingCall as the stream_id since we + // reuse the same StreamingCall for multiple requests in the same + // streaming connection. + absl::Status status = local_impl_.Enqueue( + /*call_opts=*/nullptr, &call->request(), call->mutable_response(), + reinterpret_cast(static_cast(call))); + + if (status.ok()) { + VLOG(1) << "local_impl_.Enqueue completed successfully"; + call->SendResponse(); + } else { + VLOG(1) << "local_impl_.Enqueue failed with " << status.ToString() + << " on request " << call->request().DebugString(); + call->Finish(ToGrpcStatus(status)); + } + call->Unref(); + + // We do not tell gRPC to accept a new StreamingEnqueue request because + // this method can be called multiple times for a given streaming call. + // The StreamingCall does this per call instead, after a call has been + // opened. + }); + } + + WorkerEnv* const env_; // Not owned. + EagerServiceImpl local_impl_; + + // A single-threaded thread pool to handle streaming enqueue rpc request. + thread::ThreadPool enqueue_streaming_thread_; + std::unique_ptr<::grpc::Alarm> shutdown_alarm_; + + std::unique_ptr<::grpc::ServerCompletionQueue> cq_; + grpc::EagerService::AsyncService service_; + + GrpcEagerServiceImpl(const GrpcEagerServiceImpl&) = delete; + void operator=(const GrpcEagerServiceImpl&) = delete; +}; + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_channel.h new file mode 100644 index 00000000..b9bc118e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_channel.h @@ -0,0 +1,33 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_ + +#include "xla/tsl/distributed_runtime/rpc/grpc_channel.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::ChannelCreationFunction; +using tsl::ConvertToChannelCreationFunction; +using tsl::GetChannelArguments; +using tsl::GrpcChannelCache; +using tsl::GrpcChannelSpec; +using tsl::NewGrpcChannelCache; +using tsl::NewHostPortGrpcChannel; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h new file mode 100644 index 00000000..30822036 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h @@ -0,0 +1,27 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_ + +#include "xla/tsl/distributed_runtime/rpc/grpc_client_cq_tag.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::GrpcClientCQTag; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h new file mode 100644 index 00000000..bd203163 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_ + +#include +#include "grpcpp/server_builder.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/master.pb.h" + +namespace tsl { +class AsyncServiceInterface; +} +namespace tensorflow { +class Master; + +tsl::AsyncServiceInterface* NewGrpcMasterService( + Master* master, const ConfigProto& default_session_config, + ::grpc::ServerBuilder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h new file mode 100644 index 00000000..bdf683fd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h @@ -0,0 +1,218 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_ + +#include "grpcpp/impl/codegen/async_stream.h" +#include "grpcpp/impl/codegen/async_unary_call.h" +#include "grpcpp/impl/codegen/client_context.h" +#include "grpcpp/impl/codegen/completion_queue.h" +#include "grpcpp/impl/codegen/proto_utils.h" +#include "grpcpp/impl/codegen/rpc_method.h" +#include "grpcpp/impl/codegen/server_context.h" +#include "grpcpp/impl/codegen/service_type.h" +#include "grpcpp/impl/codegen/status.h" +#include "grpcpp/impl/codegen/stub_options.h" +#include "grpcpp/impl/codegen/sync_stream.h" +#include "tensorflow/core/protobuf/master.pb.h" + +namespace tensorflow { + +namespace grpc { + +// Implementation of `tensorflow.MasterService`, based on the +// definition in "//tensorflow/core/protobuf/master_service.proto", +// and the gRPC generated stub and service classes. +// See that file for the definition of methods and messages. 
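// Illustrative sketch (not part of this header): the hand-written AsyncService
// below follows the standard gRPC completion-queue protocol. The server
// registers interest in a call via Request*() with a tag, and the tag comes
// back from the completion queue when a client invocation arrives. The local
// variable names and the single-tag handling are assumptions for illustration.
//
//   MasterService::AsyncService service;
//   ::grpc::ServerBuilder builder;
//   builder.RegisterService(&service);
//   std::unique_ptr<::grpc::ServerCompletionQueue> cq =
//       builder.AddCompletionQueue();
//   std::unique_ptr<::grpc::Server> server = builder.BuildAndStart();
//
//   ::grpc::ServerContext ctx;
//   CreateSessionRequest request;
//   ::grpc::ServerAsyncResponseWriter<CreateSessionResponse> responder(&ctx);
//   void* tag = reinterpret_cast<void*>(1);
//   service.RequestCreateSession(&ctx, &request, &responder, cq.get(),
//                                cq.get(), tag);
//
//   void* got_tag = nullptr;
//   bool ok = false;
//   while (cq->Next(&got_tag, &ok)) {
//     if (ok && got_tag == tag) {
//       // Handle the CreateSession call, then:
//       // responder.Finish(response, ::grpc::Status::OK, finish_tag);
//       break;
//     }
//   }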
+class MasterService final { + public: + class StubInterface { + public: + virtual ~StubInterface() {} + virtual ::grpc::Status CreateSession(::grpc::ClientContext* context, + const CreateSessionRequest& request, + CreateSessionResponse* response) = 0; + virtual ::grpc::Status ExtendSession(::grpc::ClientContext* context, + const ExtendSessionRequest& request, + ExtendSessionResponse* response) = 0; + virtual ::grpc::Status PartialRunSetup( + ::grpc::ClientContext* context, const PartialRunSetupRequest& request, + PartialRunSetupResponse* response) = 0; + virtual ::grpc::Status RunStep(::grpc::ClientContext* context, + const RunStepRequest& request, + RunStepResponse* response) = 0; + virtual ::grpc::Status CloseSession(::grpc::ClientContext* context, + const CloseSessionRequest& request, + CloseSessionResponse* response) = 0; + virtual ::grpc::Status ListDevices(::grpc::ClientContext* context, + const ListDevicesRequest& request, + ListDevicesResponse* response) = 0; + virtual ::grpc::Status Reset(::grpc::ClientContext* context, + const ResetRequest& request, + ResetResponse* response) = 0; + virtual ::grpc::Status MakeCallable(::grpc::ClientContext* context, + const MakeCallableRequest& request, + MakeCallableResponse* response) = 0; + virtual ::grpc::Status RunCallable(::grpc::ClientContext* context, + const RunCallableRequest& request, + RunCallableResponse* response) = 0; + virtual ::grpc::Status ReleaseCallable( + ::grpc::ClientContext* context, const ReleaseCallableRequest& request, + ReleaseCallableResponse* response) = 0; + }; + class Stub final : public StubInterface { + public: + Stub(const std::shared_ptr< ::grpc::ChannelInterface>& channel); + ::grpc::Status CreateSession(::grpc::ClientContext* context, + const CreateSessionRequest& request, + CreateSessionResponse* response) override; + ::grpc::Status ExtendSession(::grpc::ClientContext* context, + const ExtendSessionRequest& request, + ExtendSessionResponse* response) override; + ::grpc::Status PartialRunSetup(::grpc::ClientContext* context, + const PartialRunSetupRequest& request, + PartialRunSetupResponse* response) override; + ::grpc::Status RunStep(::grpc::ClientContext* context, + const RunStepRequest& request, + RunStepResponse* response) override; + ::grpc::Status CloseSession(::grpc::ClientContext* context, + const CloseSessionRequest& request, + CloseSessionResponse* response) override; + ::grpc::Status ListDevices(::grpc::ClientContext* context, + const ListDevicesRequest& request, + ListDevicesResponse* response) override; + ::grpc::Status Reset(::grpc::ClientContext* context, + const ResetRequest& request, + ResetResponse* response) override; + ::grpc::Status MakeCallable(::grpc::ClientContext* context, + const MakeCallableRequest& request, + MakeCallableResponse* response) override; + ::grpc::Status RunCallable(::grpc::ClientContext* context, + const RunCallableRequest& request, + RunCallableResponse* response) override; + ::grpc::Status ReleaseCallable(::grpc::ClientContext* context, + const ReleaseCallableRequest& request, + ReleaseCallableResponse* response) override; + + private: + std::shared_ptr< ::grpc::ChannelInterface> channel_; + const ::grpc::internal::RpcMethod rpcmethod_CreateSession_; + const ::grpc::internal::RpcMethod rpcmethod_ExtendSession_; + const ::grpc::internal::RpcMethod rpcmethod_PartialRunSetup_; + const ::grpc::internal::RpcMethod rpcmethod_RunStep_; + const ::grpc::internal::RpcMethod rpcmethod_CloseSession_; + const ::grpc::internal::RpcMethod rpcmethod_ListDevices_; + const 
::grpc::internal::RpcMethod rpcmethod_Reset_; + const ::grpc::internal::RpcMethod rpcmethod_MakeCallable_; + const ::grpc::internal::RpcMethod rpcmethod_RunCallable_; + const ::grpc::internal::RpcMethod rpcmethod_ReleaseCallable_; + }; + static std::unique_ptr NewStub( + const std::shared_ptr< ::grpc::ChannelInterface>& channel, + const ::grpc::StubOptions& options = ::grpc::StubOptions()); + + class AsyncService : public ::grpc::Service { + public: + AsyncService(); + virtual ~AsyncService(); + void RequestCreateSession( + ::grpc::ServerContext* context, CreateSessionRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(0, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestExtendSession( + ::grpc::ServerContext* context, ExtendSessionRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(1, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestPartialRunSetup( + ::grpc::ServerContext* context, PartialRunSetupRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(2, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestRunStep( + ::grpc::ServerContext* context, RunStepRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(3, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestCloseSession( + ::grpc::ServerContext* context, CloseSessionRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(4, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestListDevices( + ::grpc::ServerContext* context, ListDevicesRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(5, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestReset( + ::grpc::ServerContext* context, ResetRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(6, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestMakeCallable( + ::grpc::ServerContext* context, MakeCallableRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(7, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestRunCallable( + ::grpc::ServerContext* context, RunCallableRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + 
::grpc::Service::RequestAsyncUnary(8, context, request, response, + new_call_cq, notification_cq, tag); + } + void RequestReleaseCallable( + ::grpc::ServerContext* context, ReleaseCallableRequest* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(9, context, request, response, + new_call_cq, notification_cq, tag); + } + }; +}; + +} // namespace grpc + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h new file mode 100644 index 00000000..c80668e8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h @@ -0,0 +1,27 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_ + +#include "tensorflow/core/distributed_runtime/master_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" + +namespace tensorflow { +// Returns a MasterInterface wrapped around the gRPC channel `channel`. +MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h new file mode 100644 index 00000000..97e590e0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_ + +#include + +#include "grpcpp/completion_queue.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/lib/core/threadpool.h" + +namespace tensorflow { +class WorkerCacheLogger; +class WorkerInterface; + +WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, + ::grpc::CompletionQueue* completion_queue, + thread::ThreadPool* callback_threadpool, + WorkerCacheLogger* logger, + const string& target); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h new file mode 100644 index 00000000..ca162c19 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -0,0 +1,242 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_ + +// GrpcServer manages the lifecycle of an Eager, Worker and Master service. + +#include +#include +#include + +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "xla/tsl/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/common_runtime/stats_publisher_interface.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/platform/env.h" +#include "tsl/profiler/protobuf/profiler_service.grpc.pb.h" + +namespace tensorflow { + +class GrpcWorker; +class Master; + +// function that creates a RendezvousMgr. +typedef std::function + RendezvousMgrCreationFunction; + +// function that creates a CollectiveExecutorMgr. +typedef std::function + CollectiveMgrCreationFunction; + +// function that registers a service to the server. The service needs to +// be registered before builder.BuildAndStart(). +typedef std::function + ServiceInitFunction; + +// function that creates a grpc based worker implementation. 
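The std::function typedefs above (their template parameters appear to have been stripped in transit) are the hooks an embedder uses to swap in custom components through GrpcServerOptions, defined just below. A minimal sketch, assuming the upstream signature in which the rendezvous factory takes a const WorkerEnv* and returns a newly allocated manager; RpcRendezvousMgr is the type declared in rpc_rendezvous_mgr.h later in this patch.

#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"

// Builds server options that plug in the RPC rendezvous manager; ownership of
// the returned pointer follows the factory convention described above.
tensorflow::GrpcServerOptions MakeServerOptions() {
  tensorflow::GrpcServerOptions opts;
  opts.rendezvous_mgr_func = [](const tensorflow::WorkerEnv* env) {
    return new tensorflow::RpcRendezvousMgr(env);
  };
  return opts;
}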
+typedef std::function(WorkerEnv*, + const ConfigProto& config)> + WorkerCreationFunction; + +struct GrpcServerOptions { + ServiceInitFunction service_func = nullptr; + RendezvousMgrCreationFunction rendezvous_mgr_func = nullptr; + CollectiveMgrCreationFunction collective_mgr_func = nullptr; + WorkerCreationFunction worker_func = nullptr; + StatsPublisherFactory stats_factory = CreateNoOpStatsPublisher; + GrpcWorkerServiceOptions worker_service_options; + DeviceMgr* local_device_mgr = nullptr; +}; + +class GrpcServer : public ServerInterface { + protected: + GrpcServer(const ServerDef& server_def, Env* env); + GrpcServer(const ServerDef& server_def, DeviceMgr* local_device_mgr, + Env* env); + // Allow children classes to override this and provide custom args to the + // server before it is constructed. Default behavior is to do nothing. + // requested_port provides the port requested by caller as bound_port() is + // not available till BuildAndStart has been called. + virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder, + int requested_port) {} + + public: + static absl::Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + static absl::Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + // Reuse the local_device_mgr. + static absl::Status Create(const ServerDef& server_def, Env* env, + DeviceMgr* local_device_mgr, + std::unique_ptr* out_server); + + // Destruction is only supported in the factory method. Clean + // shutdown is not currently implemented for this server type. + virtual ~GrpcServer(); + + // Implementations of ServerInterface methods. + absl::Status Start() override; + absl::Status Stop() override; + absl::Status Join() override; + const string target() const override; + + WorkerEnv* worker_env() override { return &worker_env_; } + MasterEnv* master_env() override { return &master_env_; } + + // Add master eager context to local eager service in order to handle enqueue + // requests from remote workers. + absl::Status AddMasterEagerContextToEagerService( + const tensorflow::uint64 context_id, + tensorflow::EagerContext* context) override; + // Update the set of workers that can be reached by the GRPC server + absl::Status UpdateServerDef(const ServerDef& server_def) override; + // Pass coordination service agent instance to server's RPC handler + absl::Status SetCoordinationServiceAgentInstance( + tsl::CoordinationServiceAgent* agent) override; + // TODO(hanyangtay): Remove this method once gRPC server clean shutdown is + // supported. + absl::Status StopCoordinationService() override; + + protected: + virtual absl::Status GetHostAndPort(const ServerDef& server_def, + string* host_name, int* port) const; + absl::Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); + + // A subclass can override this method to support secure credentials. + virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials( + const ServerDef& server_def) const; + + virtual ChannelCreationFunction GetChannelCreationFunction() const; + + virtual std::unique_ptr CreateMaster(MasterEnv* master_env); + + // Creates a WorkerCacheInterface for a session. + virtual absl::Status WorkerCacheFactory( + const WorkerCacheFactoryOptions& options, + WorkerCacheInterface** worker_cache); + + // Override to return extra services to be brought up and managed along with + // the standard {master, worker, eager} services. 
The map key is an aribtrary + // string and the value is a pointer to the service to be brought up. + // Ownership of the pointer is transferred to GrpcServer after this call + // returns, and the service will be destroyed during the destruction of + // GrpcServer. Each service will have its HandleRPCsLoop called in a separate + // thread. An example usage would be to add a RDMA based partial worker + // service to offload tensor and data buffer transfers. + virtual std::map ExtraServices( + ::grpc::ServerBuilder*) { + return {}; + } + + virtual std::map + GetExtraServices() { + return extra_services_; + } + + // Parses a WorkerCacheFactoryOptions into a GrpcChannelSpec. + absl::Status ParseChannelSpec(const WorkerCacheFactoryOptions& options, + GrpcChannelSpec* channel_spec); + + // Returns the port to which this server is bound. + // This method may only be called after `this->Init()` returns successfully. + int bound_port() const { return bound_port_; } + + // Returns hostname. + const string& host_name() const { return host_name_; } + + const ServerDef& server_def() const { return server_def_; } + GrpcWorker* worker_impl() const { return worker_impl_.get(); } + GrpcWorkerEnv* grpc_worker_env() const { return grpc_worker_env_.get(); } + + absl::Status SetCoordinationServiceInstance( + tsl::CoordinationServiceInterface* service); + + private: + Env* env_; + + // The port to which this server is bound. + int bound_port_ = 0; + + // The host name of this server + string host_name_; + + // Guards server configuration, server, and state. + mutex mu_; + + // Represents the current state of the server, which changes as follows: + // + // Join() Join() + // ___ ___ + // Start() \ / Stop() \ / + // NEW ---------> STARTED --------> STOPPED + // \ / + // \________________________/ + // Stop(), Join() + enum State { NEW, STARTED, STOPPED }; + State state_ TF_GUARDED_BY(mu_); + + // Implementation of a TensorFlow master, and RPC polling thread. + MasterEnv master_env_; + std::unique_ptr master_impl_; + tsl::AsyncServiceInterface* master_service_ = nullptr; + std::unique_ptr master_thread_ TF_GUARDED_BY(mu_); + + std::map extra_services_; + std::vector> extra_service_threads_ + TF_GUARDED_BY(mu_); + + // Implementation of a TensorFlow worker, and RPC polling thread. + WorkerEnv worker_env_; + std::unique_ptr owned_device_manager_; + std::unique_ptr worker_impl_; + tsl::AsyncServiceInterface* worker_service_ = nullptr; + std::unique_ptr worker_thread_ TF_GUARDED_BY(mu_); + std::unique_ptr grpc_worker_env_; + + // TensorFlow Eager implementation, and RPC polling thread. + tsl::AsyncServiceInterface* eager_service_ = nullptr; + std::unique_ptr eager_thread_ TF_GUARDED_BY(mu_); + std::shared_ptr worker_session_; + + // Experimental coordination service implementation, and RPC polling thread. + tsl::AsyncServiceInterface* coordination_service_ = nullptr; + std::unique_ptr coordination_thread_ TF_GUARDED_BY(mu_); + + // TensorFlow profiler service implementation. + std::unique_ptr profiler_service_ = nullptr; + + // The overall server configuration. 
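A minimal sketch of the lifecycle implied by the state diagram above (NEW -> STARTED -> STOPPED), assuming the Create() overload that fills a std::unique_ptr of the server interface; the template arguments in the declarations above appear to have been lost in transit.

#include <memory>

// Creates, starts and joins a gRPC server; Join() blocks until the server
// transitions to STOPPED. TF_RETURN_IF_ERROR is TensorFlow's usual
// early-return macro for absl::Status.
absl::Status RunGrpcServer(const tensorflow::ServerDef& server_def) {
  std::unique_ptr<tensorflow::ServerInterface> server;
  TF_RETURN_IF_ERROR(tensorflow::GrpcServer::Create(
      server_def, tensorflow::Env::Default(), &server));
  TF_RETURN_IF_ERROR(server->Start());  // NEW -> STARTED
  return server->Join();                // returns once the server is STOPPED
}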
+ ServerDef server_def_ TF_GUARDED_BY(mu_); + + std::unique_ptr<::grpc::Server> server_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_session.h new file mode 100644 index 00000000..fe92f7c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_session.h @@ -0,0 +1,156 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_ + +#include +#include +#include + +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/message_wrappers.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/master.pb.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class MasterInterface; + +// A Session instance lets the caller drive a TensorFlow graph +// computation on potentially remote sets of devices. This is a thin +// wrapper around tensorflow::grpc::MasterService. +// +// Multiple threads must synchronize their accesses to a single +// session. +class GrpcSession : public Session { + protected: + explicit GrpcSession(const SessionOptions& options); + + public: + static absl::Status Create(const SessionOptions& options, + std::unique_ptr* out_session); + // Resets the resource containers. + static absl::Status Reset(const SessionOptions& options, + const std::vector& containers); + + ~GrpcSession() override; + + // Creates a session with the "target". The session carries out + // the graph computation defined by "graph", and will have version + // number "initial_version". + absl::Status Create(const GraphDef& graph) override; + absl::Status Create(const RunOptions& run_options, + const GraphDef& graph) override; + absl::Status Create(GraphDef&& graph) override; + absl::Status Create(const RunOptions& run_options, GraphDef&& graph) override; + + // Runs with and without RunOptions. 
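A hedged usage sketch for the session API declared here, assuming Create(options, &session) fills a std::unique_ptr with the new session (the template arguments in the declarations above appear stripped). The target address and fetch name are placeholders.

#include <memory>
#include <vector>

// Connects to a remote master, installs a graph, runs one fetch and closes.
absl::Status RunGraphOnce(const tensorflow::GraphDef& graph_def,
                          std::vector<tensorflow::Tensor>* outputs) {
  tensorflow::SessionOptions options;
  options.target = "grpc://localhost:2222";  // placeholder master address
  std::unique_ptr<tensorflow::GrpcSession> session;
  TF_RETURN_IF_ERROR(tensorflow::GrpcSession::Create(options, &session));
  TF_RETURN_IF_ERROR(session->Create(graph_def));
  TF_RETURN_IF_ERROR(session->Run(/*inputs=*/{},
                                  /*output_tensor_names=*/{"y:0"},
                                  /*target_node_names=*/{}, outputs));
  return session->Close();
}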
+ absl::Status Run(const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs) override; + absl::Status Run(const RunOptions& run_options, + const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs, + RunMetadata* run_metadata) override; + + absl::Status Extend(const GraphDef& graph) override; + absl::Status Extend(const RunOptions& run_options, + const GraphDef& graph) override; + absl::Status Extend(GraphDef&& graph) override; + absl::Status Extend(const RunOptions& run_options, GraphDef&& graph) override; + + absl::Status Close() override; + + // NOTE: This API is still experimental and may change. + absl::Status PRunSetup(const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes, + string* handle) override; + + // NOTE: This API is still experimental and may change. + absl::Status PRun(const string& handle, + const std::vector >& inputs, + const std::vector& output_names, + std::vector* outputs) override; + + absl::Status ListDevices(std::vector* response) override; + + absl::Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) override; + absl::Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) override; + absl::Status ReleaseCallable(CallableHandle handle) override; + + protected: + // Takes ownership of `*master`. + void SetRemoteMaster(std::unique_ptr master); + // Allows subclasses to customize Session creation. + void SetHandleAndGraphVersion(string handle, int64_t graph_version) + TF_LOCKS_EXCLUDED(mu_); + + private: + const SessionOptions options_; + std::unique_ptr master_; + mutex mu_; + + // handle_ returned by the master to identify this session. + string handle_ TF_GUARDED_BY(mu_); + + // The current version of the graph. + int64_t current_graph_version_ TF_GUARDED_BY(mu_); + + bool is_local_ = false; + + absl::Status Handle(string* out_handle) TF_LOCKS_EXCLUDED(mu_); + + absl::Status RunHelper(const RunOptions& run_options, + const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs, + RunMetadata* run_metadata, const string& prun_handle); + + absl::Status RunProto(CallOptions* call_options, + MutableRunStepRequestWrapper* req, + MutableRunStepResponseWrapper* resp); + + // Implementations for all the public interfaces. + absl::Status CreateImpl(CallOptions* call_options, GraphDef graph); + absl::Status ExtendImpl(CallOptions* call_options, GraphDef graph); + + GrpcSession(const GrpcSession&) = delete; + void operator=(const GrpcSession&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_state.h new file mode 100644 index 00000000..4c5f560e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_state.h @@ -0,0 +1,541 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_ + +#include +#include + +#include "grpcpp/generic/generic_stub.h" +#include "grpcpp/grpcpp.h" +#include "xla/tsl/distributed_runtime/rpc/grpc_state.h" +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/distributed_runtime/tensor_coding.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::RPCState; +// NOLINTEND(misc-unused-using-decls) + +// Represents state associated with one streaming RPC call. +// Similarly to above, we extract the methods of StreamingRPCState that don't +// need to be templated into this abstract class. +// Currently, *StreamingRPCState does not support client closing the call as +// there is no use case for it - current clients keep the streaming call open +// as long as possible. If/when the need arises, support can be added +// by calling GenericClientAsyncReaderWriter::WritesDone with a new tag +// TagType::kClientFinished and handling the completion in a new callback. +class UntypedStreamingRPCState : public core::RefCounted { + public: + virtual void CallStarted(bool ok) = 0; + virtual void RequestWriteCompleted(bool ok) = 0; + virtual void ResponseReadCompleted(bool ok) = 0; + virtual void CallFinished(bool ok) = 0; + + virtual string DebugString() const = 0; + + class Tag : public GrpcClientCQTag { + public: + // One enum value per supported callback. + enum class TagType { + kCallStarted, + kRequestWriteCompleted, + kResponseReadCompleted, + kCallFinished, + }; + + Tag(UntypedStreamingRPCState* streaming_state, Tag::TagType type); + + // Calls the callback associated with this tag and Unrefs + // `this->streaming_state_`. + void OnCompleted(bool ok) override; + + private: + // OnCompleted() consumes on reference each time it is called. + UntypedStreamingRPCState* const streaming_state_; + const Tag::TagType type_; + }; +}; + +const char* ToString(UntypedStreamingRPCState::Tag::TagType tag_type); + +// Represents a single request/response exchange between client and the server. +// A single streaming call contains a sequence of exchanges. Besides the +// messages, exchange contains: +// - the user callback to invoke when exchange completes (response is received +// or an error occurs). +// - The current state of the exchange. 
+class Exchange { + public: + enum class State { + kExchangeCreated, + kRequestWriteIssued, + kRequestWriteCompleted, + kResponseReadIssued, + }; + + Exchange(const ::grpc::ByteBuffer& request_buf, protobuf::Message* response, + StatusCallback cb, string debug_string) + : state_(State::kExchangeCreated), + request_buf_(request_buf), + response_(response), + cb_(std::move(cb)), + debug_string_(std::move(debug_string)) {} + + const ::grpc::ByteBuffer& request_buf() { return request_buf_; } + ::grpc::ByteBuffer* response_buf() { return &response_buf_; } + + void MarkRequestWriteIssued() { + DCHECK(state_ == State::kExchangeCreated); + state_ = State::kRequestWriteIssued; + } + void MarkRequestWriteCompleted() { + DCHECK(state_ == State::kRequestWriteIssued); + state_ = State::kRequestWriteCompleted; + } + void MarkResponseReadIssued() { + DCHECK(state_ == State::kRequestWriteCompleted); + state_ = State::kResponseReadIssued; + } + + // If `status` is success, completes this exchange by parsing the + // response_buf_ and invoking cb_ with OkStatus(). Else, invokes the + // callback with `status`. + void Complete(absl::Status status); + + const State& state() const { return state_; } + + string DebugString() const; + + private: + State state_; + ::grpc::ByteBuffer request_buf_; + ::grpc::ByteBuffer response_buf_; + protobuf::Message* response_; + StatusCallback cb_; + string debug_string_; +}; + +const char* ToString(Exchange::State s); + +std::ostream& operator<<(std::ostream& os, const Exchange::State& state); + +// Represents a queue of exchanges. +// When a client sends a new request a new exchange is created and added to the +// end of the queue. Completed exchanges are popped from the front of the queue. +// An explicit exchange queue is needed to brdige the client, which can send new +// requests at any time, with gRPC infrastructure, which can handle a single +// read and a single write request at a time. +// +// As the exchange progresses (request sending initiated, request sending +// completed, response reading initiated) the queue helps to make sure that the +// right operation is issued on the right exchange at the right time. +// +// To satisfy gRPC constraints, the states of exchanges must be as follows +// starting from the front of the queue: +// - 0 or 1 exchange in kResponseReadIssued state +// - 0 or more exchanges in kRequestWriteCompleted state +// - 0 or 1 exchange in kRequestWriteIssued state +// - 0 or more exchanges in kExchangeCreated state +// +// Thread-compatible. +class ExchangeQueue { + public: + // Creates a new exchange and adds it to the end of the queue. + void Emplace(const ::grpc::ByteBuffer& request_buf, + protobuf::Message* response, StatusCallback cb, + std::string debug_string); + + // Returns an exchange for which we can initiate request writing, if any. + // Returns nullptr if there is no such exchange. + Exchange* GetReadyForRequestWriting(); + + // Returns an exchange for which we can initiate response reading, if any. + // Returns nullptr if there is no such exchange. + Exchange* GetReadyForResponseReading(); + + // Changes the state of the exchange that is current in kRequestWriteIssued + // state to kRequestWriteCompleted state. + // REQUIRES: There is an exchange in kRequestWriteIssued state. + void MarkRequestWriteCompleted(); + + // Returns the exchange at the front of the queue. + // REQUIRES: ExchangeQueue is not empty. + Exchange& GetFront(); + + // Removes the exchange at the front of the queue. + // REQUIRES: ExchangeQueue is not empty. 
+ void PopFront(); + + // Returns a string containing addresses and states of all exchanges in this + // queue. + string DebugString() const; + + // Swaps the contents of this and `other`. + void Swap(ExchangeQueue* other); + + // Completes all exchanges in this with `status`. + void CompleteAll(absl::Status status); + + void CallStarted() { call_started_ = true; } + + private: + // Does nothing by default. Turn on VLOG(5) to enable. + // Checks that this ExchangeQueue is in a valid state. + // Kills the process if not. + void CheckInvariants(); + + // We can't process any exchanges until the call has started. + bool call_started_ = false; + + // std::queue is based on std::deque by default. std::deque provides + // fairly strong iterator stability. + std::deque exchanges_; +}; // namespace tensorflow + +// Represents state associated with one streaming RPC call. +// Thread-safe +template +class StreamingRPCState : public UntypedStreamingRPCState { + public: + // Default behavior is to set fail_fast = False and handle timeouts + // manually. + StreamingRPCState( + std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call, + const std::shared_ptr<::grpc::ClientContext>& context) + : context_(context), call_(std::move(call)), call_state_(State::kActive) { + Ref(); + VLOG(3) << "Created new StreamingRPCState " << this; + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::StartCall"; + call_->StartCall(&call_started_tag_); + } + + ~StreamingRPCState() override { + VLOG(3) << "Destructing StreamingRPCState " << this; + } + + // Attempts to send the next request. `done` is invoked when + // `response` has been filled with the data from the server, or if there + // is an error. `done` can be invoked before SendNextRequest returns. + // Return `true` if the call is alive and the `done` callback has or + // will be invoked. If the call is dead, returns `false`. `done` callback + // will not be invoked in this case. + // REQUIRES: The call has been started, i.e. WaitForCallStarted() has + // returned. + bool SendNextRequest(const protobuf::Message& request, Response* response, + const StatusCallback& done) { + ::grpc::ByteBuffer request_buf; + ::grpc::Status s = tsl::GrpcMaybeUnparseProto(request, &request_buf); + if (!s.ok()) { + absl::Status status = FromGrpcStatus(s); + LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: " + << status.ToString(); + done(status); + return true; + } + + mutex_lock l(mu_); + if (call_state_ != State::kActive) { + // `done` is not invoked intentionally. + return false; + } + if (VLOG_IS_ON(3)) { + // If vlog 3 is enabled, include first 100 chars of request as debug + // string. + exchanges_.Emplace(request_buf, response, done, + request.ShortDebugString().substr(0, 100)); + } else { + exchanges_.Emplace(request_buf, response, done, ""); + } + MaybeIssueRequestWriteLocked(); + return true; + } + + void CallStarted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this << ")::CallStarted(ok=" << ok + << ")"; + mutex_lock l(mu_); + if (!ok) { + call_state_ = State::kDone; + return; + } + exchanges_.CallStarted(); + // Now that the call has started, we can write our first request, if any. 
+ MaybeIssueRequestWriteLocked(); + } + + void RequestWriteCompleted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this + << ")::RequestWriteCompleted(ok=" << ok << ")"; + mu_.lock(); + if (call_state_ != State::kActive) { + mu_.unlock(); + return; + } + exchanges_.MarkRequestWriteCompleted(); + // Issue ResponseRead regardless of OK status on completing RequestWrite. + // If the underlying completion queue is in Not-OK status due to previous + // request failuress (i.e., `ok` from `Next` call on completion queue is + // False), delay the error in ResponseRead so we can get the remote error + // message from response buffer. + MaybeIssueResponseReadLocked(); + + if (ok) { + MaybeIssueRequestWriteLocked(); + } + mu_.unlock(); + } + + void ResponseReadCompleted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this + << ")::ResponseReadCompleted(ok=" << ok << ")"; + mu_.lock(); + if (call_state_ != State::kActive) { + mu_.unlock(); + return; + } + if (!ok) { + IssueCallFinishLocked(); + mu_.unlock(); + return; + } + + // Complete the exchange without holding the lock because user's + // callback can call back into this RPC code resulting in a deadlock. + // No other thread can pop this exchange while we release the lock because + // this is the only method that pops exchanges and it is called from a + // single thread that waits on completion queue events. + Exchange* e; + e = &exchanges_.GetFront(); + mu_.unlock(); + + e->Complete(absl::OkStatus()); + + { + mutex_lock l(mu_); + exchanges_.PopFront(); + MaybeIssueResponseReadLocked(); + } + } + + void CallFinished(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this << ")::CallFinished(ok=" << ok + << ")"; + mu_.lock(); + DCHECK(call_state_ != State::kActive); + if (call_state_ != State::kFinishing) { + mu_.unlock(); + return; + } + + absl::Status s = FromGrpcStatus(call_status_); + if (s.ok() && !ok) { + s.Update( + errors::Internal("GRPC status is okay but CompletionQueueStatus is " + "not. This should never happen.", + context_->debug_error_string())); + } + // unlocks mu_ + MarkDoneAndCompleteExchanges(s); + } + + string DebugString() const override { + mutex_lock l(mu_); + return exchanges_.DebugString(); + } + + private: + enum class State { + kActive, + kFinishing, + kDone, + }; + + void MarkDoneAndCompleteExchanges(absl::Status status) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) TF_UNLOCK_FUNCTION(mu_) { + call_state_ = State::kDone; + VLOG(2) << "Ending gRPC streaming call on the client side due to " + << status.ToString(); + // Swap the exchanges_ into a temporary ExchangeQueue so that we can + // complete all exchanges without holding mu_ in case user callback + // reach back into this. This should be impossible now, but safer for + // the future. + ExchangeQueue queue; + exchanges_.Swap(&queue); + mu_.unlock(); + queue.CompleteAll(status); + } + + void MaybeIssueRequestWriteLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + Exchange* exchange = exchanges_.GetReadyForRequestWriting(); + if (exchange == nullptr) { + // There are no queued exchanges, there is already an outstanding write, + // or there are no just created exchanges. 
+ return; + } + exchange->MarkRequestWriteIssued(); + Ref(); + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::Write"; + call_->Write(exchange->request_buf(), &request_write_completed_tag_); + } + + void MaybeIssueResponseReadLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + Exchange* exchange = exchanges_.GetReadyForResponseReading(); + if (exchange == nullptr) { + return; + } + exchange->MarkResponseReadIssued(); + Ref(); + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::Read"; + call_->Read(exchange->response_buf(), &response_read_completed_tag_); + } + + void IssueCallFinishLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + call_state_ = State::kFinishing; + Ref(); + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::Finish"; + // We call finish in response to completed (with error) response reading tag + // on some exchange. We let this exchange hang in ResponseReadIssued state. + // ExchangeQueue makes sure that there is at most one exchange in this + // state. So, no new reads will be issued. + call_->Finish(&call_status_, &finished_tag_); + } + + // Holds state for a single request/response exchange between the client + // and the server. + typedef typename UntypedStreamingRPCState::Tag Tag; + + // Order of context_ and call_ is important because context_ must outlive + // call_. + const std::shared_ptr context_; + std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call_; + + mutable mutex mu_; + ExchangeQueue exchanges_ TF_GUARDED_BY(mu_); + State call_state_ TF_GUARDED_BY(mu_); + ::grpc::Status call_status_ TF_GUARDED_BY(mu_); + + // We can get away with having single instances of these tags per + // StreamingRPCState because we make sure (as gRPC requires) that + // there is at most one outstanding Read and at most one outstanding Write + // in the completion queue. + // Tags are immutable. No need to guard them. + Tag call_started_tag_{this, Tag::TagType::kCallStarted}; + Tag request_write_completed_tag_{this, Tag::TagType::kRequestWriteCompleted}; + Tag response_read_completed_tag_{this, Tag::TagType::kResponseReadCompleted}; + Tag finished_tag_{this, Tag::TagType::kCallFinished}; +}; + +// Creates streaming calls and dispatches requests to them. +// In the common case, the client would create a StreamingRPCDispatcher for +// each bidirectional streaming RPC it might want to make. The first time, it +// calls SendNextRequest, a streaming call is initiated and the request is +// sent within this call. Initiation of the call blocks the client. If there are +// no errors, subsequent calls to SendNextRequest would use the already active +// call. If there was an error, the call object will be destroyed after all +// the callbacks for outstanding requests have been invoked. The next call to +// SendNextRequest will initiate a new call. +// +// Callbacks that are part of the same call, are invoked in the order they were +// provided, but callbacks across calls (a failed and a new one) can be invoked +// in any order. +// +// Thread-safe. +template +class StreamingRPCDispatcher { + public: + StreamingRPCDispatcher(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq, + const ::grpc::string& method) + : stub_(stub), cq_(cq), method_(method) {} + + // Attempts to send the next request. If there is no active streaming call, + // starts one and sends the request on top of it. `done` is invoked when + // `response` has been filled with the data from the server, or if there + // is an error. `done` can be invoked before SendNextRequest returns. 
+ void SendNextRequest(const protobuf::Message& request, Response* response, + StatusCallback done) { + mutex_lock l(mu_); + if (state_ == nullptr) { + CreateStreamingState(); + } + + bool is_call_alive = state_->SendNextRequest(request, response, done); + if (is_call_alive) { + return; + } + + // The attempt to send failed because the call was dead, create a new + // call and try again. When the call is dead SendNextRequest does not call + // `done`. + CreateStreamingState(); + + is_call_alive = state_->SendNextRequest(request, response, done); + if (!is_call_alive) { + // Consider retrying to create and start a call few more times. + done(errors::Unknown("gRPC call failed right after it was created")); + } + } + + // Request to cancel the current streaming call. Non-blocking. + void CancelCall() { + mutex_lock l(mu_); + if (state_ == nullptr) { + return; + } + context_->TryCancel(); + state_ = nullptr; + } + + private: + void CreateStreamingState() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + // ClientContext cannot be reused across calls. + context_ = std::make_shared<::grpc::ClientContext>(); + // Don't immediately fail StartCall if the channel is not ready. Wait for + // the channel to become ready. + context_->set_wait_for_ready(true); + + std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call = + stub_->PrepareCall(context_.get(), method_, cq_); + + state_.reset(new StreamingRPCState(std::move(call), context_)); + } + + mutable mutex mu_; + + // Both are thread-safe + ::grpc::GenericStub* const stub_; + ::grpc::CompletionQueue* const cq_; + + // Does not need synchronization since it is constant. + const ::grpc::string method_; + + std::shared_ptr<::grpc::ClientContext> context_ TF_GUARDED_BY(mu_); + core::RefCountPtr> state_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h new file mode 100644 index 00000000..393ef2a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TENSOR_CODING_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TENSOR_CODING_H_ + +#include "grpcpp/impl/codegen/byte_buffer.h" +#include "absl/status/status.h" + +namespace tensorflow { +class Tensor; +class RecvTensorResponse; + +// TODO(jeff,sanjay): this should not be grpc specific. Instead of +// grpc::ByteBuffer*, it should accept an object of an interface type +// to which owned byte-arrays can be added. 
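A hedged sketch of driving the StreamingRPCDispatcher declared above; its template parameter, the response proto type, appears to have been dropped in transit, and the pairing with RecvBuf messages here is purely illustrative.

// Sends one request over the (possibly re-created) streaming call; the
// callback fires once `response` has been filled or the call has failed.
void SendOneStreamingRequest(
    tensorflow::StreamingRPCDispatcher<tensorflow::RecvBufResponse>* dispatcher,
    const tensorflow::RecvBufRequest& request,
    tensorflow::RecvBufResponse* response) {
  dispatcher->SendNextRequest(request, response, [](const absl::Status& s) {
    if (!s.ok()) {
      LOG(WARNING) << "streaming request failed: " << s.ToString();
    }
  });
}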
+namespace grpc { + +// Encode a RecvTensorResponse protocol buffer into a byte buffer in a +// format that is parseable as a RecvTensorResponse protocol buffer +// holding "proto". +// +// Discards original contents of *result. +void EncodeRecvTensorResponseToByteBuffer(const RecvTensorResponse& proto, + ::grpc::ByteBuffer* result); + +// Encode a Tensor into a byte buffer in a format that is parseable +// as a RecvTensorResponse protocol buffer holding "val". +// +// "is_dead" is the value to encode for "RecvTensorResponse::is_dead" +// (tensor is the output of a dead node and content is invalid because +// control flow operations elsewhere caused the path on which this +// Tensor exists to not be taken). +// +// "val" holds the tensor value to be encoded. +// +// Discards original contents of *result. +absl::Status EncodeTensorToByteBuffer(bool is_dead, const Tensor& val, + bool require_ack, + ::grpc::ByteBuffer* result); + +} // namespace grpc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TENSOR_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h new file mode 100644 index 00000000..9101ca92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h @@ -0,0 +1,100 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/subprocess.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class Device; + +namespace test { + +struct TestJob { + std::string name; + int num_tasks; + int num_replicas = 1; +}; + +struct TestClusterConfig { + std::string binary_path; + SessionOptions options; + std::vector jobs; + + TestClusterConfig& Options(const SessionOptions& options) { + this->options = options; + return *this; + } + TestClusterConfig& Jobs(const std::vector& jobs) { + this->jobs = jobs; + return *this; + } +}; + +// Provides a handle to a set of TensorFlow servers (masters and +// workers) for testing purposes. +// +// This class currently runs the servers in separate processes; the +// lifetime of this object is coterminous with the lifetimes of those +// processes. +class TestCluster { + public: + // Creates a new test cluster based on the given `options` (which + // configure the number of devices of each type) and a count of + // processes `n`. 
On success, the test cluster is stored in + // *out_cluster, and this function returns OK. Otherwise an error is + // returned. + static absl::Status MakeTestCluster( + const TestClusterConfig& config, + std::unique_ptr* out_cluster); + ~TestCluster(); + + // Returns a vector of string ":" pairs that may be + // used as targets to construct a GrpcSession. + const std::vector& targets(std::string job_name = "localhost") { + return targets_.at(job_name); + } + + // Returns a vector of devices available in this test cluster. + const std::vector& devices() const { return devices_; } + + private: + TestCluster() = default; + + std::vector> subprocesses_; + absl::flat_hash_map> targets_; + std::vector devices_; + + TestCluster(const TestCluster&) = delete; + void operator=(const TestCluster&) = delete; +}; + +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_util.h new file mode 100644 index 00000000..0db18382 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_util.h @@ -0,0 +1,72 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_ + +#include +#include + +#include "grpcpp/grpcpp.h" +#include "grpcpp/impl/codegen/proto_utils.h" +#include "grpcpp/support/byte_buffer.h" +#include "xla/tsl/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/distributed_runtime/tensor_coding.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::FromGrpcStatus; +using tsl::SharedGrpcChannelPtr; +using tsl::ToGrpcStatus; +// NOLINTEND(misc-unused-using-decls) + +// Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse +// an efficient byte reader from which to decode a RecvTensorResponse. +class GrpcByteSource : public TensorResponse::Source { + public: + explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {} + ~GrpcByteSource() override { DeleteStream(); } + + typedef ::grpc::ProtoBufferReader Reader; + + protobuf::io::ZeroCopyInputStream* contents() override { + DeleteStream(); + stream_ = new (&space_) Reader(buffer_); + return stream_; + } + + private: + void DeleteStream() { + if (stream_) { + stream_->~Reader(); + } + } + + ::grpc::ByteBuffer* buffer_; // Not owned + Reader* stream_ = nullptr; // Points into space_ if non-nullptr + char space_[sizeof(Reader)]; +}; + +inline string GrpcIdKey() { return "tf-rpc"; } + +// Decode a TensorResponse without extra copying. 
This function is an optimized +// variant of tsl::GrpcMaybeParseProto. +bool GrpcMaybeParseTensorResponse(::grpc::ByteBuffer* src, TensorResponse* dst); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h new file mode 100644 index 00000000..2dfbc79a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h @@ -0,0 +1,76 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_ + +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/threadpool.h" + +namespace tensorflow { + +class GrpcWorkerEnv { + public: + GrpcWorkerEnv(size_t num_completion_queues, size_t num_threads); + + ~GrpcWorkerEnv(); + + thread::ThreadPool* GetThreadPool() const { return threadpool_.get(); } + + size_t CompletionQueueSize() const { return threads_.size(); } + + ::grpc::CompletionQueue* GetCompletionQueue(size_t index) const { + return threads_.at(index).completion_queue(); + } + + private: + // Thread wrapping class that drives work over a single gRPC + // CompletionQueue. + class GrpcWorkerCacheThread { + public: + GrpcWorkerCacheThread(); + + ~GrpcWorkerCacheThread(); + + ::grpc::CompletionQueue* completion_queue() const { + return &completion_queue_; + } + + private: + mutable ::grpc::CompletionQueue completion_queue_; + std::unique_ptr thread_; + }; + + std::unique_ptr threadpool_; + std::vector threads_; +}; + +// Create a GrpcWorkerEnv instance that can be used as argument to create +// gRPC worker cache. Caller should take the ownership of the returned instance. +GrpcWorkerEnv* CreateGrpcWorkerEnv(); + +// The returned WorkerCacheInterface object takes the ownership of "cc". 
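A hedged sketch tying together CreateGrpcWorkerEnv() above and the cache factory declared just below, assuming the stripped std::shared_ptr element type is the channel cache from grpc_channel.h (included at the top of this header).

#include <memory>
#include <utility>

// Builds a worker cache; the returned cache takes ownership of the channel
// cache per the comment above, while the GrpcWorkerEnv must outlive it.
std::unique_ptr<tensorflow::WorkerCacheInterface> MakeWorkerCache(
    std::shared_ptr<tensorflow::GrpcChannelCache> channel_cache,
    tensorflow::GrpcWorkerEnv* worker_env) {
  return std::unique_ptr<tensorflow::WorkerCacheInterface>(
      tensorflow::NewGrpcWorkerCache(std::move(channel_cache), worker_env));
}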
+WorkerCacheInterface* NewGrpcWorkerCache(std::shared_ptr cc, + GrpcWorkerEnv* worker_env); + +WorkerCacheInterface* NewGrpcWorkerCacheWithLocalWorker( + std::shared_ptr cc, GrpcWorkerEnv* worker_env, + WorkerInterface* local_worker, const string& local_target); + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h new file mode 100644 index 00000000..ebb1ac91 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h @@ -0,0 +1,92 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_ + +#include +#include + +#include "grpcpp/server_builder.h" +#include "xla/tsl/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h" +#include "tensorflow/core/distributed_runtime/rpc/rpc_response_cache.h" +#include "tensorflow/core/distributed_runtime/worker.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace grpc { +class ByteBuffer; +} // namespace grpc + +namespace tsl { +class AsyncServiceInterface; +} + +namespace tensorflow { + +class ConfigProto; +struct WorkerEnv; +class WorkerSession; +class RpcResponseCache; + +class GrpcWorker : public Worker { + public: + GrpcWorker(WorkerEnv* env, const ConfigProto& config); + + // Specialized version of RecvTensor for gRPC, which avoids a copy. + virtual void GrpcRecvTensorAsync(CallOptions* opts, + const RecvTensorRequest* request, + ::grpc::ByteBuffer* response, + StatusCallback done); + + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, + StatusCallback done) override; + + void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request, + RecvBufResponse* response, StatusCallback done) override; + + void CleanupGraphAsync(const CleanupGraphRequest* request, + CleanupGraphResponse* response, + StatusCallback done) override; + + WorkerEnv* env(); + + void EnableResponseCache(); + + void RemoveCacheEntryForId(int64_t request_id); + + private: + std::unique_ptr response_cache_; + const int32 recv_buf_max_chunk_; +}; + +std::unique_ptr NewGrpcWorker(WorkerEnv* worker_env, + const ConfigProto& config); + +struct GrpcWorkerServiceOptions { + // Map from GrpcWorkerMethod id to queue depth. If set this overrides the + // default queue depth for a method. + std::unordered_map queue_depth; + int num_serving_threads = 8; +}; + +// Returns an implementation of WorkerService rpc service. 
+std::unique_ptr NewGrpcWorkerService( + GrpcWorker* worker, ::grpc::ServerBuilder* builder, + GrpcWorkerServiceOptions options = GrpcWorkerServiceOptions()); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h new file mode 100644 index 00000000..25f5ec97 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h @@ -0,0 +1,118 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_ + +#include "grpcpp/impl/codegen/async_stream.h" +#include "grpcpp/impl/codegen/async_unary_call.h" +#include "grpcpp/impl/codegen/proto_utils.h" +#include "grpcpp/impl/codegen/rpc_method.h" +#include "grpcpp/impl/codegen/service_type.h" +#include "grpcpp/impl/codegen/status.h" +#include "grpcpp/impl/codegen/stub_options.h" +#include "grpcpp/impl/codegen/sync_stream.h" +#include "grpcpp/support/byte_buffer.h" + +#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/distributed_runtime/tensor_coding.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace grpc { + +// Support parsing/unparsing of tensorflow::TensorResponse. +// Wire-format is identical to RecvTensorResponse. +// This is specializing an existing template, so it's okay to do this in a +// namespace that we don't own. +template <> +class SerializationTraits { + public: + static Status Serialize(const tensorflow::TensorResponse& msg, ByteBuffer* bp, + bool* own_buffer) { + LOG(FATAL) << "TODO(sanjay,jeff): Implement"; + return Status(); + } + static Status Deserialize(ByteBuffer* buffer, + tensorflow::TensorResponse* msg) { + if (buffer == nullptr) { + return Status(StatusCode::INTERNAL, "No payload"); + } + Status result = Status::OK; + if (result.ok()) { + ::tensorflow::GrpcByteSource source(buffer); + auto s = msg->ParseFrom(&source); + if (!s.ok()) { + result = Status(StatusCode::INTERNAL, + ::tensorflow::strings::StrCat( + "TensorResponse parse error", s.message())); + } + } + buffer->Clear(); + return result; + } +}; + +} // namespace grpc + +namespace tensorflow { + +// Names of worker methods. 
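A hedged sketch of configuring the worker service declared above. The queue_depth map's key and value types were stripped above; the sketch assumes depths are keyed by the integer value of the GrpcWorkerMethod ids listed just below, and that NewGrpcWorkerService returns a std::unique_ptr of the tsl::AsyncServiceInterface forward-declared earlier in this header. The numeric values are illustrative only.

#include <memory>

// Builds the async worker service with a deeper RecvTensor queue and more
// serving threads than the defaults.
std::unique_ptr<tsl::AsyncServiceInterface> BuildWorkerService(
    tensorflow::GrpcWorker* worker, ::grpc::ServerBuilder* builder) {
  tensorflow::GrpcWorkerServiceOptions options;
  options.num_serving_threads = 16;
  options.queue_depth[static_cast<int>(
      tensorflow::GrpcWorkerMethod::kRecvTensor)] = 512;
  return tensorflow::NewGrpcWorkerService(worker, builder, options);
}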
+enum class GrpcWorkerMethod { + kGetStatus, + kCreateWorkerSession, + kDeleteWorkerSession, + kRegisterGraph, + kDeregisterGraph, + kRunGraph, + kCleanupGraph, + kCleanupAll, + kRecvTensor, + kRecvBuf, + kLogging, + kTracing, + kCompleteGroup, + kCompleteInstance, + kGetStepSequence, + kMarkRecvFinished, +}; + +static const int kGrpcNumWorkerMethods = + static_cast(GrpcWorkerMethod::kMarkRecvFinished) + 1; + +const char* GrpcWorkerMethodName(GrpcWorkerMethod id); + +namespace grpc { + +// Implementation of `tensorflow.WorkerService`, based on the +// definition in "//tensorflow/core/protobuf/worker_service.proto", +// and the gRPC generated stub and service classes. +// See the proto file for the definition of methods and messages. +class WorkerService final { + public: + class AsyncService : public ::grpc::Service { + public: + AsyncService(); + virtual ~AsyncService(); + + // Make RequestAsyncUnary public for grpc_call.h + using ::grpc::Service::RequestAsyncUnary; + }; +}; + +} // namespace grpc + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h new file mode 100644 index 00000000..42eda4ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h @@ -0,0 +1,60 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_ + +#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class DeviceMgr; + +// RendezvousMgr keeps track of a set of local rendezvous instances. +// All tensors sent by this worker are buffered in a RendezvousMgr +// until the tensor is received. Each global unique "step_id" +// corresponds to one local rendezvous instance managed by a +// RendezvousMgr. +// +// E.g., +// Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935); +// fork execution of an graph executor using "rendez" on thread 1; +// fork execution of another graph executor using "rendez" on thread 2; +// ... +// join threads 1 and 2; +// +// In the example above, execution in thread 1 and 2 communicates with +// each other by send/recv operations through the "rend". +// +// Tensors sent and recved through rendezvous managed by this +// RendezvousMgr must have keys generated by Rendezvous::CreateKey. 
+class RpcRendezvousMgr : public BaseRendezvousMgr { + public: + explicit RpcRendezvousMgr(const WorkerEnv* env); + + protected: + tsl::core::RefCountPtr Create( + int64_t step_id, const WorkerEnv* worker_env) override; + + private: + RpcRendezvousMgr(const RpcRendezvousMgr&) = delete; + void operator=(const RpcRendezvousMgr&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_response_cache.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_response_cache.h new file mode 100644 index 00000000..0f31ddaf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc/rpc_response_cache.h @@ -0,0 +1,97 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RESPONSE_CACHE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RESPONSE_CACHE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/mutex.h" + +// gRPC response caching. Most WorkerService methods cannot be retried directly +// as they will fail or deadlock. To enable retrying, we can instead cache +// responses and reply to duplicate requests from the cache. The cache will be +// cleaned when the MarkRecvFinishedRequest is received from the receiver or the +// session step is completed. +namespace tensorflow { + +// Track and cache the state of worker service RPCs. An RPC can be in 3 states: +// +// * PENDING: this is the first call of the RPC, and it will transition to +// * ACTIVE: another thread is active processing this RPC +// * FINISHED: the worker has finished processing the method + +class RpcResponseCache { + public: + using FinishResponseCB = std::function; + + // Add the given request to the cache. + // If the request is in the cache, + // If it is finished, invoke `cb` immediately + // If active, cb will be invoked when the current call completes. + // In either case, return true. + // Otherwise, store the request and cb in the cache, and return false. + // Note FinishResponseCB is assumed to be thread-safe. + bool QueueRequest(int64_t request_id, int64_t step_id, + const FinishResponseCB& cb); + + // Fill the response cache for the given request_id and respond to all + // pending request. 
+ void RequestFinished(int64_t request_id, const Tensor& tensor, bool is_dead, + const absl::Status& status); + + // Erase the cache entry with the given request_id + void EraseRequestId(int64_t request_id); + + // Erase cache entries with the given step_id + void CleanEntriesForStep(int64_t step_id); + + int64_t size(); + + private: + struct ResponseCacheEntry { + enum class State { + PENDING = 0, + ACTIVE = 1, + FINISHED = 2, + }; + + State state = State::PENDING; + int64_t step_id = -1; + Tensor tensor; + bool is_dead = false; + absl::Status response_status; + + void FinishResponse(const FinishResponseCB& cb) const { + cb(tensor, is_dead, response_status); + } + std::vector callbacks; + }; + + mutex mu_; + // response_cache_ is expected to be small, as entries are cleared immediately + // on ack from the receiver. + gtl::FlatMap response_cache_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RESPONSE_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h new file mode 100644 index 00000000..6836204c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h @@ -0,0 +1,94 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_ + +#include "tensorflow/core/common_runtime/collective_executor_mgr.h" +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +class CollectiveParamResolverDistributed; +class ConfigProto; +class DeviceMgr; +class DeviceResolverDistributed; +class WorkerCacheInterface; +class StepSequenceRequest; +class StepSequenceResponse; + +// An implementation of CollectiveExecutorMgr for a distributed environment +// that uses WorkerInterface::RecvBufAsync to route data transfers over RPCs. +// +// In some execution environments it may be possible to implement a +// higher-performance solution and use it in place of this class. +class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr { + public: + RpcCollectiveExecutorMgr( + const ConfigProto& config, const DeviceMgr* dev_mgr, + std::unique_ptr dev_resolver, + std::unique_ptr param_resolver, + std::unique_ptr nccl_communicator, + WorkerCacheInterface* worker_cache, const string& task_name); + + virtual ~RpcCollectiveExecutorMgr(); + + // This function should only be called at the group_leader, by an RPC. + // Other needs for StepIds should be satisfied by NextStepId. 
+ void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + const StatusCallback& done) override; + + void RefreshStepIdSequenceAsync(int64_t graph_key, + const StatusCallback& done) override; + + int64_t NextStepId(int64_t graph_key) override; + + void RetireStepId(int64_t graph_key, int64_t step_id) override; + + protected: + virtual CollectiveExecutor* Create(int64_t step_id) override; + + WorkerCacheInterface* const worker_cache_; // Not owned. + const string task_name_; + string group_leader_; + friend class RpcCollectiveExecutorMgrTest; + + private: + absl::Status UpdateStepSequences(const GetStepSequenceResponse& resp); + + // This class maintains the step_id sequencing for a single + // collective_graph_key. + struct GraphKeySequence { + explicit GraphKeySequence(int64_t k) + : graph_key_(k), next_step_id_(CollectiveExecutor::kInvalidId) {} + + const int64_t graph_key_; + int64_t next_step_id_; + }; + + mutex sequence_mu_; + gtl::FlatMap sequence_table_ + TF_GUARDED_BY(sequence_mu_); +}; + +// Creates a distributed CollectiveExecutorMgr with production implementations +// of each components. Cases that need to inject other implementations of these +// components should call CollectiveExecutorMgr constructor directly. +std::unique_ptr CreateProdRpcCollectiveExecutorMgr( + const ConfigProto& config, const DeviceMgr* device_mgr, + std::unique_ptr nccl_communicator, + WorkerCacheInterface* worker_cache, const string& default_worker_name); + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/scheduler.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/scheduler.h new file mode 100644 index 00000000..4385db78 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/scheduler.h @@ -0,0 +1,121 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/costmodel.h" + +namespace tensorflow { + +class SlackAnalysis { + public: + SlackAnalysis(const Graph* g, const CostModel* cost_model); + + ~SlackAnalysis() {} + + // Compute the earliest possible start time for each node, based on + // a given cost model. 'asap_time' is indexed by node id. + Microseconds ComputeAsap(std::vector* asap_times); + + // Compute the latest possible start time for each node, based on + // a given cost model. 'alap_time' is indexed by node id. + Microseconds ComputeAlap(std::vector* alap_times); + + // Compute the "slack" of each node. 'slacks' is indexed by node id. 
+ void ComputeSlack(std::vector* slacks); + + private: + const Graph* graph_; + const CostModel* cost_model_; + + SlackAnalysis(const SlackAnalysis&) = delete; + void operator=(const SlackAnalysis&) = delete; +}; + +class GreedyScheduler { + public: + struct Sim { + int degree_parallelism; + int num_running; + std::vector ready_nodes; + }; + + struct Event { + const Node* node; + Microseconds time; + bool is_completion; + + bool operator<(const Event& other) const { return time < other.time; } + }; + + GreedyScheduler(const DeviceSet* devices, const CostModel* cost_model, + const Graph* g, std::vector* priority); + + ~GreedyScheduler(); + + // Computes the start time of each node given the priorities of + // the nodes. + Microseconds ComputeSchedule(std::vector* start_times); + + private: + // Returns the ready node with the highest priority for a sim. + const Node* GetNodeWithHighestPriority(const std::vector& nodes); + + const DeviceSet* devices_; + const CostModel* cost_model_; + const Graph* graph_; + std::vector* priority_; + std::unordered_map device_states_; + + GreedyScheduler(const GreedyScheduler&) = delete; + void operator=(const GreedyScheduler&) = delete; +}; + +class PriorityScheduler { + public: + PriorityScheduler(const DeviceSet* devices, const CostModel* cost_model, + const Graph* g); + + ~PriorityScheduler() {} + + // Computes a schedule of the ideal start time for each node. + // Returns the makespan (the total running time). + Microseconds ComputeSchedule(std::vector* start_times); + + // Computes a schedule and assigns priorities to the nodes based on + // the schedule. Returns the makespan. + Microseconds AssignPriorities(std::vector* priorities); + + private: + const DeviceSet* devices_; + const CostModel* cost_model_; + const Graph* graph_; + + PriorityScheduler(const PriorityScheduler&) = delete; + void operator=(const PriorityScheduler&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/server_lib.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/server_lib.h new file mode 100644 index 00000000..cc92d0ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/server_lib.h @@ -0,0 +1,135 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_ + +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +namespace tsl { +class CoordinationServiceAgent; +} // namespace tsl + +namespace tensorflow { + +class DeviceMgr; +class EagerContext; +class WorkerEnv; +class MasterEnv; + +// This library supports a registration/factory-based mechanism for +// creating TensorFlow server objects. Each server implementation must +// have an accompanying implementation of ServerFactory, and create a +// static "registrar" object that calls `ServerFactory::Register()` +// with an instance of the factory class. See "rpc/grpc_server_lib.cc" +// for an example. + +// Represents a single TensorFlow server that exports Master and Worker +// services. +class ServerInterface { + public: + ServerInterface() {} + virtual ~ServerInterface() {} + + // Starts the server running asynchronously. Returns OK on success, otherwise + // returns an error. + virtual absl::Status Start() = 0; + + // Stops the server asynchronously. Returns OK on success, otherwise returns + // an error. + // + // After calling `Stop()`, the caller may call `Join()` to block until the + // server has stopped. + virtual absl::Status Stop() = 0; + + // Blocks until the server has stopped. Returns OK on success, otherwise + // returns an error. + virtual absl::Status Join() = 0; + + // Returns a target string that can be used to connect to this server using + // `tensorflow::NewSession()`. + virtual const string target() const = 0; + + virtual WorkerEnv* worker_env() = 0; + virtual MasterEnv* master_env() = 0; + + // Update the set of workers that can be reached by the server + virtual absl::Status UpdateServerDef(const ServerDef& server_def) = 0; + + // Functions to operate on service-specific properties. + // + // Add master eager context to local eager service in order to handle enqueue + // requests from remote workers. + virtual absl::Status AddMasterEagerContextToEagerService( + const tensorflow::uint64 context_id, EagerContext* context) = 0; + // Set coordination service agent instance to coordination service RPC handler + virtual absl::Status SetCoordinationServiceAgentInstance( + tsl::CoordinationServiceAgent* agent) = 0; + // TODO(hanyangtay): Remove this method once gRPC server clean shutdown is + // supported. + virtual absl::Status StopCoordinationService() = 0; + + private: + ServerInterface(const ServerInterface&) = delete; + void operator=(const ServerInterface&) = delete; +}; + +class ServerFactory { + public: + struct Options { + // Local DeviceMgr to use. + tensorflow::DeviceMgr* local_device_mgr; + }; + // Creates a new server based on the given `server_def`, and stores + // it in `*out_server`. Returns OK on success, otherwise returns an + // error. + virtual absl::Status NewServer( + const ServerDef& server_def, const Options& options, + std::unique_ptr* out_server) = 0; + + // Returns true if and only if this factory can create a server + // based on the given `server_def`. + virtual bool AcceptsOptions(const ServerDef& server_def) = 0; + + virtual ~ServerFactory() {} + + // For each `ServerFactory` subclass, an instance of that class must + // be registered by calling this method. + // + // The `server_type` must be unique to the server factory. 
+ static void Register(const string& server_type, ServerFactory* factory); + + // Looks up a factory that can create a server based on the given + // `server_def`, and stores it in `*out_factory`. Returns OK on + // success, otherwise returns an error. + static absl::Status GetFactory(const ServerDef& server_def, + ServerFactory** out_factory); +}; + +// Creates a server based on the given `server_def`, and stores it in +// `*out_server`. Returns OK on success, otherwise returns an error. +absl::Status NewServer(const ServerDef& server_def, + std::unique_ptr* out_server); +absl::Status NewServerWithOptions(const ServerDef& server_def, + const ServerFactory::Options& options, + std::unique_ptr* out_server); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/session_mgr.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/session_mgr.h new file mode 100644 index 00000000..55c64f45 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/session_mgr.h @@ -0,0 +1,169 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_ + +#include +#include + +#include "xla/tsl/distributed_runtime/coordination/coordination_service.h" +#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" +#include "xla/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.h" +#include "tensorflow/core/distributed_runtime/worker_session.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +class WorkerCacheInterface; +struct WorkerEnv; + +// SessionMgr keeps track of information related to a given session. +// +// SessionMgr runs on the workers. +// +// SessionMgr is threadsafe. +class SessionMgr { + public: + typedef std::function + WorkerCacheFactory; + + explicit SessionMgr( + WorkerEnv* worker_env, const std::string& default_worker_name, + std::unique_ptr default_worker_cache, + WorkerCacheFactory worker_cache_factory, + tsl::CoordinationServiceRpcHandler* coordination_handler); + ~SessionMgr() {} + + // Allocates state for a new session. 
+ absl::Status CreateSession( + const std::string& session, const ServerDef& server_def, + bool isolate_session_state, + StatusCallback coordination_error_callback = [](absl::Status s) { + LOG(ERROR) << "Coordination agent is set to error: " << s; + }); + absl::Status CreateSession( + const std::string& session, const ServerDef& server_def, + const protobuf::RepeatedPtrField& device_attributes, + bool isolate_session_state); + + // Create WorkerSession from the master with the given `master_task` and + // `master_incarnation`. We first look for existing WorkerSessions associated + // with the specified master task. If there are sessions created by the same + // master but with a different incarnation, it indicates that the remote + // master has restarted before deleting the sessions on worker. When it + // happens, old sessions associated with the master will be automatically + // removed before the new session is created. + absl::Status CreateSession( + const std::string& session, const ServerDef& server_def, + const protobuf::RepeatedPtrField& device_attributes, + bool isolate_session_state, std::string master_task, + int64_t master_incarnation, + StatusCallback coordination_error_callback = [](absl::Status s) { + LOG(ERROR) << "Coordination agent is set to error: " << s; + }); + + void ResetDefaultWorkerCache(WorkerCacheInterface* worker_cache); + + // Updates state (worker cache, devices) of worker session identified by + // session name (`session`) based on a new server_def and set of devices. + absl::Status UpdateSession(const std::string& session, + const ServerDef& server_def, + const protobuf::RepeatedPtrField& + cluster_device_attributes); + + // Locates the worker session for a given session handle + absl::Status WorkerSessionForSession( + const std::string& session_handle, + std::shared_ptr* out_session); + std::shared_ptr LegacySession(); + + absl::Status DeleteSession(const std::string& session); + + // Deletes all existing sessions. + absl::Status DeleteAllSessions(); + + // Provides access to the coordination service agent. This method should only + // be called after the agent has been initialized during session creation, or + // an invalid nullptr is returned. Note: the agent is thread-safe and mutable. + tsl::CoordinationServiceAgent* GetCoordinationServiceAgent(); + + static std::string WorkerNameFromServerDef(const ServerDef& server_def); + + void SetLogging(bool active); + + void RetrieveLogs(int64_t step_id, LoggingResponse* response); + + void ClearLogs(); + + // Agent should be torn down before service as it needs to disconnect first. + void TeardownCoordinationServiceAgent(); + void TeardownCoordinationService(); + + private: + WorkerEnv* const worker_env_; // Not owned. + + // A note about destruction: + // We must delete graph_mgr before device_mgr, due to shared + // ownership of OpKernels in the executors. (The graph_mgr will + // free all stateless OpKernels, and pass over borrowed stateful + // OpKernels, which are also held in their respective devices' + // OpSegments.) + // + // legacy_session_ owns the worker_env_.device_mgr, and so we must ensure + // that sessions_'s WorkerSessions are deleted (which do not own the + // underlying devices, but instead own RenamedDevices) before + // legacy_session_ is deleted. Further, we must ensure that WorkerSession's + // device_mgr is deleted after WorkerSession's graph_mgr. 
+ + std::unique_ptr default_worker_cache_; + std::shared_ptr legacy_session_; + std::unique_ptr coordination_service_; + std::unique_ptr coordination_service_agent_; + + bool is_logging_active_ = false; + + const WorkerCacheFactory worker_cache_factory_; + + // Not owned. And should only be used for setting the coordination service. + tsl::CoordinationServiceRpcHandler* coordination_handler_ = nullptr; + + absl::Status WorkerSessionForSessionLocked( + const std::string& session_handle, + std::shared_ptr* out_session) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + mutex mu_; + // A map from session identifier to internal session structure. + std::map> sessions_ + TF_GUARDED_BY(mu_); + + // Incarnation and WorkerSession handle associated with a master task. + struct MasterAssociatedSession { + const int64_t master_incarnation; + const std::string session_handle; + }; + // A map from master task name to its associated worker sessions. + std::unordered_multimap + master_to_associated_sessions_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/tensor_coding.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/tensor_coding.h new file mode 100644 index 00000000..1fd40d95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/tensor_coding.h @@ -0,0 +1,110 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +class DeviceBase; +class TensorProto; + +// TensorResponse can be used as the destination of an RPC that returns +// a RecvTensorResponse. It efficiently decodes the incoming data +// into Tensor contents as well as associated metadata. +class TensorResponse { + public: + TensorResponse() {} + + // Reset to initial state. + void Clear(); + + // Clear just tensor_ and meta_ members without setting allocation + // related members. + void ClearTensor(); + + // Initialize memory allocation related members. + void InitAlloc(DeviceBase* d, const AllocatorAttributes& aa); + + // Source provides a way for a particular RPC implementation to provide + // received data to ParseFrom. + class Source { + public: + virtual ~Source(); + + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. 
+ // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. + virtual ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() = 0; + }; + + // Parse the RecvTensorResponse encoded in the data yielded by + // source->contents() into *this. + absl::Status ParseFrom(Source* source); + + // Initialize tensor from *response. + // Leaves *response with unspecified contents. + absl::Status InitFrom(RecvTensorResponse* response); + + // Initialize tensor metadata from response and allocate + // uninitialized backing storage for actual contents. + void InitPartial(const RecvTensorResponse& response, + const AllocationAttributes& allocation_attr); + + // Return a reference to the parsed tensor. The tensor will remain + // live only until *this is destroyed or modified. + const Tensor& tensor() const { return tensor_; } + + // Return a reference to the parsed tensor metadata (no contents). + // The result will remain live only until *this is destroyed or + // modified. + const RecvTensorResponse& metadata() const { return meta_; } + + // Return pointer to the device hosting the tensor. + DeviceBase* device() const { return device_; } + + private: + bool ParseTensorSubmessage(protobuf::io::CodedInputStream* input, + TensorProto* tensor_meta); + bool ParseFast(Source* source); + bool ParseSlow(Source* source); + + bool on_host_ = false; + DeviceBase* device_ = nullptr; + AllocatorAttributes alloc_attrs_; + Allocator* allocator_ = nullptr; + bool already_used_ = false; + Tensor tensor_; + RecvTensorResponse meta_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/test_utils.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/test_utils.h new file mode 100644 index 00000000..e7ad1041 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/test_utils.h @@ -0,0 +1,202 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TEST_UTILS_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TEST_UTILS_H_ + +#include +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +// Some utilities for testing distributed-mode components in a single process +// without RPCs. + +// Implements the worker interface with methods that just respond with +// "unimplemented" status. Override just the methods needed for +// testing. 
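As an example of the pattern this comment describes, a test might subclass the TestWorkerInterface declared below and override only the single method it exercises; this is a sketch, and the FakeStatusWorker name and the reported device are hypothetical.

// Hypothetical test double: only GetStatusAsync is overridden, every other
// method keeps the "unimplemented" default from TestWorkerInterface.
class FakeStatusWorker : public TestWorkerInterface {
 public:
  void GetStatusAsync(CallOptions* opts, const GetStatusRequest* request,
                      GetStatusResponse* response, bool fail_fast,
                      StatusCallback done) override {
    // Report a single fake CPU device and complete the call successfully.
    response->add_device_attributes()->set_name(
        "/job:worker/replica:0/task:0/device:CPU:0");
    done(absl::OkStatus());
  }
};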
+class TestWorkerInterface : public WorkerInterface { + public: + void GetStatusAsync(CallOptions* opts, const GetStatusRequest* request, + GetStatusResponse* response, bool fail_fast, + StatusCallback done) override { + done(errors::Unimplemented("GetStatusAsync")); + } + + void CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request, + CreateWorkerSessionResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("CreateWorkerSessionAsync")); + } + + void DeleteWorkerSessionAsync(CallOptions* opts, + const DeleteWorkerSessionRequest* request, + DeleteWorkerSessionResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("DeleteWorkerSessionAsync")); + } + + void RegisterGraphAsync(const RegisterGraphRequest* request, + RegisterGraphResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("RegisterGraphAsync")); + } + + void DeregisterGraphAsync(const DeregisterGraphRequest* request, + DeregisterGraphResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("DeregisterGraphAsync")); + } + + void RunGraphAsync(CallOptions* opts, RunGraphRequestWrapper* request, + MutableRunGraphResponseWrapper* response, + StatusCallback done) override { + done(errors::Unimplemented("RunGraphAsync")); + } + + void CleanupGraphAsync(const CleanupGraphRequest* request, + CleanupGraphResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("CleanupGraphAsync")); + } + + void CleanupAllAsync(const CleanupAllRequest* request, + CleanupAllResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("CleanupAllAsync")); + } + + void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request, + TensorResponse* response, StatusCallback done) override { + done(errors::Unimplemented("RecvTensorAsync")); + } + + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("LoggingAsync")); + } + + void TracingAsync(const TracingRequest* request, TracingResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("TracingAsync")); + } + + void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request, + RecvBufResponse* response, StatusCallback done) override { + done(errors::Unimplemented("RecvBufAsync")); + } + + void CompleteGroupAsync(CallOptions* opts, + const CompleteGroupRequest* request, + CompleteGroupResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("CompleteGroupAsync")); + } + + void CompleteInstanceAsync(CallOptions* ops, + const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("CompleteInstanceAsync")); + } + + void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + StatusCallback done) override { + done(errors::Unimplemented("GetStepSequenceAsync")); + } +}; + +class TestWorkerCache : public WorkerCacheInterface { + public: + virtual ~TestWorkerCache() {} + + void AddWorker(const string& target, WorkerInterface* wi) { + workers_[target] = wi; + } + + void AddDevice(const string& device_name, const DeviceLocality& dev_loc) { + localities_[device_name] = dev_loc; + } + + void ListWorkers(std::vector* workers) const override { + workers->clear(); + for (auto it : workers_) { + workers->push_back(it.first); + } + } + + void ListWorkersInJob(const string& job_name, + std::vector* 
workers) const override { + workers->clear(); + for (auto it : workers_) { + DeviceNameUtils::ParsedName device_name; + CHECK(DeviceNameUtils::ParseFullName(it.first, &device_name)); + CHECK(device_name.has_job); + if (job_name == device_name.job) { + workers->push_back(it.first); + } + } + } + + WorkerInterface* GetOrCreateWorker(const string& target) override { + auto it = workers_.find(target); + if (it != workers_.end()) { + return it->second; + } + return nullptr; + } + + void ReleaseWorker(const string& target, WorkerInterface* worker) override {} + + absl::Status GetEagerClientCache( + std::unique_ptr* eager_client_cache) override { + return errors::Unimplemented("Unimplemented."); + } + + absl::Status GetCoordinationClientCache( + std::unique_ptr* coord_client_cache) override { + return errors::Unimplemented("Unimplemented."); + } + + bool GetDeviceLocalityNonBlocking(const string& device, + DeviceLocality* locality) override { + auto it = localities_.find(device); + if (it != localities_.end()) { + *locality = it->second; + return true; + } + return false; + } + + void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + StatusCallback done) override { + auto it = localities_.find(device); + if (it != localities_.end()) { + *locality = it->second; + done(absl::OkStatus()); + return; + } + done(errors::Internal("Device not found: ", device)); + } + + protected: + std::unordered_map workers_; + std::unordered_map localities_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker.h new file mode 100644 index 00000000..4c55e1b9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker.h @@ -0,0 +1,143 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_ + +#include + +#include "tensorflow/core/distributed_runtime/graph_mgr.h" +#include "tensorflow/core/distributed_runtime/partial_run_mgr.h" +#include "tensorflow/core/distributed_runtime/recent_request_ids.h" +#include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/framework/cancellation.h" + +namespace tensorflow { + +class Device; +struct WorkerEnv; +class WorkerSession; + +// A TensorFlow Worker runs registered graphs and supports worker-to-worker +// Tensor transfer. +// +// See `../protobuf/worker_service.proto` for more details about each method. +// +// This class may be subclassed to provide specialized implementations of +// particular methods for different transport mechanism. 
For example, +// `GrpcWorker` specializes the `RecvTensorAsync()` method to support a more +// efficient gRPC data structure for handling large binary data. +class Worker : public WorkerInterface { + public: + Worker(WorkerEnv* env); + virtual ~Worker() {} + + void GetStatusAsync(CallOptions* opts, const GetStatusRequest* request, + GetStatusResponse* response, bool fail_fast, + StatusCallback done) override; + + void CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request, + CreateWorkerSessionResponse* response, + StatusCallback done) override; + + void DeleteWorkerSessionAsync(CallOptions* opts, + const DeleteWorkerSessionRequest* request, + DeleteWorkerSessionResponse* response, + StatusCallback done) override; + + void RegisterGraphAsync(const RegisterGraphRequest* request, + RegisterGraphResponse* response, + StatusCallback done) override; + + void DeregisterGraphAsync(const DeregisterGraphRequest* request, + DeregisterGraphResponse* response, + StatusCallback done) override; + + void RunGraphAsync(CallOptions* opts, RunGraphRequestWrapper* request, + MutableRunGraphResponseWrapper* response, + StatusCallback done) override; + + MutableRunGraphRequestWrapper* CreateRunGraphRequest() override; + + MutableRunGraphResponseWrapper* CreateRunGraphResponse() override; + + void CleanupGraphAsync(const CleanupGraphRequest* request, + CleanupGraphResponse* response, + StatusCallback done) override; + + void CleanupAllAsync(const CleanupAllRequest* request, + CleanupAllResponse* response, + StatusCallback done) override; + + void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request, + TensorResponse* response, StatusCallback done) override; + + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, + StatusCallback done) override; + + void TracingAsync(const TracingRequest* request, TracingResponse* response, + StatusCallback done) override; + + void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request, + RecvBufResponse* response, StatusCallback done) override; + + void CompleteGroupAsync(CallOptions* opts, + const CompleteGroupRequest* request, + CompleteGroupResponse* response, + StatusCallback done) override; + + void CompleteInstanceAsync(CallOptions* opts, + const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + StatusCallback done) override; + + void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + StatusCallback done) override; + + protected: + WorkerEnv* const env_; // Not owned. 
+ RecentRequestIds recent_request_ids_; + + absl::Status PrepareRecvTensor(const Rendezvous::ParsedKey& parsed, + Device** src_dev); + + void AbortStep(int64_t); + + private: + PartialRunMgr partial_run_mgr_; + + CancellationManager cancellation_manager_; + + absl::Status PrepareRunGraph(RunGraphRequestWrapper* req, + GraphMgr::NamedTensors* in, + GraphMgr::NamedTensors* out); + + void DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, + MutableRunGraphResponseWrapper* response, + StatusCallback done); + + void DoPartialRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, + MutableRunGraphResponseWrapper* response, + StatusCallback done); + + Worker(const Worker&) = delete; + void operator=(const Worker&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache.h new file mode 100644 index 00000000..1ac4de35 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache.h @@ -0,0 +1,96 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_H_ + +#include +#include + +#include "tensorflow/core/distributed_runtime/coordination/coordination_client.h" +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/framework/device_attributes.pb.h" // for DeviceLocality +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +typedef std::function StatusCallback; + +class ChannelCache; +class StepStats; + +class WorkerCacheInterface { + public: + virtual ~WorkerCacheInterface() {} + + // Updates *workers with strings naming the remote worker tasks to + // which open channels have been established. + virtual void ListWorkers(std::vector* workers) const = 0; + virtual void ListWorkersInJob(const string& job_name, + std::vector* workers) const = 0; + + // If "target" names a remote task for which an RPC channel exists + // or can be constructed, returns a pointer to a WorkerInterface object + // wrapping that channel. The returned value must be destroyed by + // calling `this->ReleaseWorker(target, ret)` + virtual WorkerInterface* GetOrCreateWorker(const string& target) = 0; + + // Release a worker previously returned by this->GetOrCreateWorker(target). + // + // TODO(jeff,sanjay): Consider moving target into WorkerInterface. + // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a + // per-rpc-subsystem WorkerInterface creator. + virtual void ReleaseWorker(const string& target, WorkerInterface* worker) { + // Subclasses may override to reuse worker objects. 
+ delete worker; + } + + // Set *locality with the DeviceLocality of the specified remote device + // within its local environment. Returns true if *locality + // was set, using only locally cached data. Returns false + // if status data for that device was not available. Never blocks. + virtual bool GetDeviceLocalityNonBlocking(const string& device, + DeviceLocality* locality) = 0; + + // Set *locality with the DeviceLocality of the specified remote device + // within its local environment. Callback gets Status::OK if *locality + // was set. + virtual void GetDeviceLocalityAsync(const string& device, + DeviceLocality* locality, + StatusCallback done) = 0; + + // TODO(b/189159585): Define a general client cache maker function to + // construct client cache of different types sharing the same underling RPC + // channels, to replace the eager and coordination cache function. + // Build and return a EagerClientCache object wrapping that channel. + virtual absl::Status GetEagerClientCache( + std::unique_ptr* eager_client_cache) = 0; + + // Build and return a CoordinationClientCache object wrapping that channel. + virtual absl::Status GetCoordinationClientCache( + std::unique_ptr* coordination_client_cache) = 0; + + // Start/stop logging activity. + virtual void SetLogging(bool active) {} + + // Discard any saved log data. + virtual void ClearLogs() {} + + // Return logs for the identified step in *ss. Any returned data will no + // longer be stored. + virtual bool RetrieveLogs(int64_t step_id, StepStats* ss) { return false; } +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_logger.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_logger.h new file mode 100644 index 00000000..f5ef19bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_logger.h @@ -0,0 +1,89 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_LOGGER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_LOGGER_H_ + +#include +#include + +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +class StepStatsCollector; + +// WorkerCacheLogger is a thread-safe utility for use by a WorkerCache +// to optionally log some selected RPC activity. A single instance +// should be owned by a WorkerCache, for use by its RemoteWorker +// instances. + +class WorkerCacheLogger { + public: + // Start/Stop logging activity. 
This function increments/decrements + // a counter so that if two separate steps turn logging on/off, + // logging should be on for the union of the durations of both, + // regardless of relative timing. + void SetLogging(bool v); + + // Discard any saved log data. + void ClearLogs(); + + // Return logs for the identified step in *ss. Any returned data will no + // longer be stored. Returns true iff *ss was modified. + bool RetrieveLogs(int64_t step_id, StepStats* ss); + + // Return true if there is any outstanding request for logging on + // the RPC channels. + bool LoggingActive() { + mutex_lock l(count_mu_); + return want_logging_count_ > 0; + } + + // Generates a NodeExecStats record with the given data, and saves for + // later retrieval by RetrieveLogs(). + void RecordRecvTensor(int64_t step_id, int64_t start_usecs, int64_t end_usecs, + const string& tensor_name, const string& src_device, + const string& dst_device, int64_t bytes); + + // Generates a NodeExecStats record with the given data, and saves for + // later retrieval by RetrieveLogs(). + void RecordDataTransfer(int64_t step_id, int64_t start_usecs, + int64_t end_usecs, const string& tensor_name, + const string& src_device, const string& dst_device, + int64_t bytes, const string& details, + const string& transfer_method_name); + + private: + mutex count_mu_; + int32 want_logging_count_ TF_GUARDED_BY(count_mu_) = 0; + + struct StepLog { + StepStats step_stats; + StepStatsCollector* collector; + }; + typedef std::unordered_map LogMap; + mutex mu_; + LogMap log_map_ TF_GUARDED_BY(mu_); + + // Records "ns" in log_map_ under the given device and step. + void Save(const string& device, int64_t step_id, NodeExecStats* ns); + + void ClearLogsWithLock() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_LOGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_partial.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_partial.h new file mode 100644 index 00000000..b5a500b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_partial.h @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_PARTIAL_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_PARTIAL_H_ + +#include +#include + +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +// Implements the part of the interface that caches and returns remote +// device status attributes. 
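Referring back to the WorkerCacheLogger declared above in worker_cache_logger.h, its intended call pattern is roughly the following sketch; the step id, timestamps, tensor name, byte count, and device names are placeholders.

WorkerCacheLogger logger;
logger.SetLogging(true);  // some step requested logging
logger.RecordRecvTensor(/*step_id=*/42, /*start_usecs=*/1000, /*end_usecs=*/1500,
                        "edge_17_tensor_a",
                        "/job:worker/replica:0/task:0/device:CPU:0",
                        "/job:worker/replica:0/task:1/device:CPU:0",
                        /*bytes=*/4096);
StepStats ss;
if (logger.RetrieveLogs(/*step_id=*/42, &ss)) {
  // ss now holds the generated NodeExecStats; the entry has been removed
  // from the logger's internal map.
}
logger.SetLogging(false);  // balance the earlier SetLogging(true)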
+class WorkerCachePartial : public WorkerCacheInterface { + public: + bool GetDeviceLocalityNonBlocking(const string& device, + DeviceLocality* locality) override; + + void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + StatusCallback) override; + + ~WorkerCachePartial() override {} + + // Clear all entries from the DeviceStatus cache. + void FlushStatusCache(); + + private: + mutex mu_; + + // Initiate a GetStatusAsync to the remote task named by "task", and + // update the cache with all the DeviceAttributes reported. + absl::Status RefreshDeviceStatus(const string& device_name); + + typedef std::unordered_map StatusMap; + StatusMap device_status_cache_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_PARTIAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_wrapper.h new file mode 100644 index 00000000..7f709b4f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_cache_wrapper.h @@ -0,0 +1,101 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_ + +#include +#include + +#include "tensorflow/core/distributed_runtime/worker_cache.h" + +namespace tensorflow { + +class WorkerCacheWrapper : public WorkerCacheInterface { + public: + WorkerCacheWrapper(WorkerCacheInterface* wrapped) : wrapped_(wrapped) {} + + // Updates *workers with strings naming the remote worker tasks to + // which open channels have been established. + void ListWorkers(std::vector* workers) const override { + return wrapped_->ListWorkers(workers); + } + void ListWorkersInJob(const string& job_name, + std::vector* workers) const override { + return wrapped_->ListWorkersInJob(job_name, workers); + } + + // If "target" names a remote task for which an RPC channel exists + // or can be constructed, returns a pointer to a WorkerInterface object + // wrapping that channel. The returned value must be destroyed by + // calling `this->ReleaseWorker(target, ret)` + WorkerInterface* GetOrCreateWorker(const string& target) override { + return wrapped_->GetOrCreateWorker(target); + } + + // Release a worker previously returned by this->GetOrCreateWorker(target). + // + // TODO(jeff,sanjay): Consider moving target into WorkerInterface. + // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a + // per-rpc-subsystem WorkerInterface creator. 
+ void ReleaseWorker(const string& target, WorkerInterface* worker) override { + return wrapped_->ReleaseWorker(target, worker); + } + + absl::Status GetEagerClientCache( + std::unique_ptr* eager_client_cache) override { + return wrapped_->GetEagerClientCache(eager_client_cache); + } + + absl::Status GetCoordinationClientCache( + std::unique_ptr* coordination_client_cache) + override { + return wrapped_->GetCoordinationClientCache(coordination_client_cache); + } + + // Set *locality with the DeviceLocality of the specified remote device + // within its local environment. Returns true if *locality + // was set, using only locally cached data. Returns false + // if status data for that device was not available. Never blocks. + bool GetDeviceLocalityNonBlocking(const string& device, + DeviceLocality* locality) override { + return wrapped_->GetDeviceLocalityNonBlocking(device, locality); + } + + // Set *locality with the DeviceLocality of the specified remote device + // within its local environment. Callback gets Status::OK if *locality + // was set. + void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + StatusCallback done) override { + return wrapped_->GetDeviceLocalityAsync(device, locality, std::move(done)); + } + + // Start/stop logging activity. + void SetLogging(bool active) override { wrapped_->SetLogging(active); } + + // Discard any saved log data. + void ClearLogs() override { wrapped_->ClearLogs(); } + + // Return logs for the identified step in *ss. Any returned data will no + // longer be stored. + bool RetrieveLogs(int64_t step_id, StepStats* ss) override { + return wrapped_->RetrieveLogs(step_id, ss); + } + + private: + WorkerCacheInterface* wrapped_; // Not owned. +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_env.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_env.h new file mode 100644 index 00000000..350c3e5f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_env.h @@ -0,0 +1,79 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ + +#include + +#include "tensorflow/core/platform/types.h" + +namespace tsl { +class Env; +namespace thread { +class ThreadPool; +} // namespace thread +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; + +namespace thread { +using tsl::thread::ThreadPool; +} // namespace thread + +class CollectiveExecutorMgrInterface; +class Device; +class DeviceMgr; +class RendezvousMgrInterface; +class SessionMgr; + +// The worker environment class, which holds a bag of pointers to +// per-worker singletons. +// +// WorkerEnv does not own its member pointers. 
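Because the WorkerCacheWrapper above is a plain delegating base, a cache decorator only has to override the calls it wants to observe. A hypothetical sketch (the CountingWorkerCache name is made up; only the wrapper interface above is assumed):

// Counts worker lookups while delegating everything else to the wrapped cache.
class CountingWorkerCache : public WorkerCacheWrapper {
 public:
  explicit CountingWorkerCache(WorkerCacheInterface* wrapped)
      : WorkerCacheWrapper(wrapped) {}

  WorkerInterface* GetOrCreateWorker(const string& target) override {
    ++lookups_;
    return WorkerCacheWrapper::GetOrCreateWorker(target);
  }

  int64_t lookups() const { return lookups_; }

 private:
  int64_t lookups_ = 0;
};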
+struct WorkerEnv { + Env* env = nullptr; + + // session_mgr encapsulates state for each session. + SessionMgr* session_mgr = nullptr; + + // In large scaled distributed training, many singleton components (e.g. + // Rendezvous) can becomes the bottleneck of the system. This field allows + // us to shard the single components. This number will scale up with number + // of tasks in this cluster. It is always greater than 1. + int experimental_num_shards = 1; + + // device_mgr manages local devices (cpu and gpu). The WorkerService + // is the network interface for managed devices. + // + // Note: Please use the device_mgr associated with your session if appropriate + // instead of this one. Using this device_mgr does not support ClusterSpec + // propagated sessions. + DeviceMgr* device_mgr = nullptr; + + // A set of rendezvous keyed by step ids. + RendezvousMgrInterface* rendezvous_mgr = nullptr; + + // Generates per-step CollectiveExecutors and has access to utilities + // supporting collective operations. + std::unique_ptr collective_executor_mgr; + + // A pool of threads for scheduling compute work. + thread::ThreadPool* compute_pool = nullptr; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_interface.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_interface.h new file mode 100644 index 00000000..382425bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_interface.h @@ -0,0 +1,236 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_INTERFACE_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_INTERFACE_H_ + +#include + +#include "tensorflow/core/distributed_runtime/call_options.h" +#include "tensorflow/core/distributed_runtime/message_wrappers.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +// Status callback. +typedef std::function StatusCallback; + +// Custom decoder for a response to RecvTensorAsync. +class TensorResponse; + +// Interface for talking with the TensorFlow Worker service. 
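A rough sketch of how the WorkerEnv struct above gets wired together; the helper name is hypothetical, and the device manager, session manager, and thread pool are assumed to be created elsewhere and to outlive the env, since WorkerEnv does not own its members.

void InitWorkerEnv(WorkerEnv* worker_env, DeviceMgr* device_mgr,
                   SessionMgr* session_mgr, thread::ThreadPool* compute_pool) {
  worker_env->env = Env::Default();
  worker_env->device_mgr = device_mgr;
  worker_env->session_mgr = session_mgr;
  // RpcRendezvousMgr (declared in rpc/rpc_rendezvous_mgr.h above) keeps a
  // pointer back to the env; ownership/cleanup is omitted in this sketch.
  worker_env->rendezvous_mgr = new RpcRendezvousMgr(worker_env);
  worker_env->compute_pool = compute_pool;
}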
+class WorkerInterface { + public: + virtual void GetStatusAsync(CallOptions* opts, + const GetStatusRequest* request, + GetStatusResponse* response, bool fail_fast, + StatusCallback done) = 0; + + virtual void CreateWorkerSessionAsync( + const CreateWorkerSessionRequest* request, + CreateWorkerSessionResponse* response, StatusCallback done) = 0; + + virtual void DeleteWorkerSessionAsync( + CallOptions* opts, const DeleteWorkerSessionRequest* request, + DeleteWorkerSessionResponse* response, StatusCallback done) = 0; + + virtual void RegisterGraphAsync(const RegisterGraphRequest* request, + RegisterGraphResponse* response, + StatusCallback done) = 0; + + virtual void DeregisterGraphAsync(const DeregisterGraphRequest* request, + DeregisterGraphResponse* response, + StatusCallback done) = 0; + + virtual void RunGraphAsync(CallOptions* opts, RunGraphRequestWrapper* request, + MutableRunGraphResponseWrapper* response, + StatusCallback done) = 0; + + virtual void RunGraphAsync(CallOptions* opts, const RunGraphRequest* request, + RunGraphResponse* response, StatusCallback done) { + RunGraphRequestWrapper* wrapped_request = new ProtoRunGraphRequest(request); + MutableRunGraphResponseWrapper* wrapped_response = + new NonOwnedProtoRunGraphResponse(response); + RunGraphAsync(opts, wrapped_request, wrapped_response, + [wrapped_request, wrapped_response, + done = std::move(done)](const absl::Status& s) { + done(s); + delete wrapped_request; + delete wrapped_response; + }); + } + + // Returns a request object for use in calls to + // `RunGraphAsync()`. Ownership is transferred to the caller. + // + // The message returned from this method must only be used in a + // `RunGraph()` call on the same `WorkerInterface` instance. + virtual MutableRunGraphRequestWrapper* CreateRunGraphRequest() { + return new MutableProtoRunGraphRequest; + } + + // Returns a response object for use in calls to + // `RunGraphAsync()`. Ownership is transferred to the caller. + // + // The message returned from this method must only be used in a + // `RunGraph()` call on the same `WorkerInterface` instance. 
+ virtual MutableRunGraphResponseWrapper* CreateRunGraphResponse() { + return new OwnedProtoRunGraphResponse; + } + + virtual void CleanupGraphAsync(const CleanupGraphRequest* request, + CleanupGraphResponse* response, + StatusCallback done) = 0; + + virtual void CleanupAllAsync(const CleanupAllRequest* request, + CleanupAllResponse* response, + StatusCallback done) = 0; + + virtual void RecvTensorAsync(CallOptions* opts, + const RecvTensorRequest* request, + TensorResponse* response, + StatusCallback done) = 0; + + virtual void LoggingAsync(const LoggingRequest* request, + LoggingResponse* response, StatusCallback done) = 0; + + virtual void TracingAsync(const TracingRequest* request, + TracingResponse* response, StatusCallback done) = 0; + + virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request, + RecvBufResponse* response, StatusCallback done) = 0; + + virtual void CompleteGroupAsync(CallOptions* opts, + const CompleteGroupRequest* request, + CompleteGroupResponse* response, + StatusCallback done) = 0; + + virtual void CompleteInstanceAsync(CallOptions* ops, + const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + StatusCallback done) = 0; + + virtual void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + StatusCallback done) = 0; + + absl::Status GetStatus(const GetStatusRequest* request, + GetStatusResponse* response) { + absl::Status ret; + Notification n; + GetStatusAsync(/*opts=*/nullptr, request, response, /*fail_fast=*/true, + [&ret, &n](const absl::Status& s) { + ret = s; + n.Notify(); + }); + n.WaitForNotification(); + return ret; + } + + absl::Status CreateWorkerSession(const CreateWorkerSessionRequest* request, + CreateWorkerSessionResponse* response) { + return CallAndWait(&ME::CreateWorkerSessionAsync, request, response); + } + + absl::Status DeleteWorkerSession(const DeleteWorkerSessionRequest* request, + DeleteWorkerSessionResponse* response) { + return CallAndWaitWithOptions(&ME::DeleteWorkerSessionAsync, request, + response); + } + + absl::Status RegisterGraph(const RegisterGraphRequest* request, + RegisterGraphResponse* response) { + return CallAndWait(&ME::RegisterGraphAsync, request, response); + } + + absl::Status DeregisterGraph(const DeregisterGraphRequest* request, + DeregisterGraphResponse* response) { + return CallAndWait(&ME::DeregisterGraphAsync, request, response); + } + + absl::Status CleanupGraph(const CleanupGraphRequest* request, + CleanupGraphResponse* response) { + return CallAndWait(&ME::CleanupGraphAsync, request, response); + } + + absl::Status CleanupAll(const CleanupAllRequest* request, + CleanupAllResponse* response) { + return CallAndWait(&ME::CleanupAllAsync, request, response); + } + + absl::Status Logging(const LoggingRequest* request, + LoggingResponse* response) { + return CallAndWait(&ME::LoggingAsync, request, response); + } + + absl::Status Tracing(const TracingRequest* request, + TracingResponse* response) { + return CallAndWait(&ME::TracingAsync, request, response); + } + + absl::Status GetStepSequence(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response) { + return CallAndWait(&ME::GetStepSequenceAsync, request, response); + } + + protected: + // Instances of WorkerInterface must be deleted by a call to + // WorkerCacheInterface::ReleaseWorker(). 
+ virtual ~WorkerInterface() {} + friend class WorkerCacheInterface; + + // NOTE: This should only be called by implementations of this + // interface whose CreateRunGraphResponse() method returns a + // proto-based wrappers for the RunGraphResponse message. + RunGraphResponse* get_proto_from_wrapper( + MutableRunGraphResponseWrapper* wrapper) { + return wrapper->get_proto(); + } + + private: + typedef WorkerInterface ME; + + template + absl::Status CallAndWait(Method func, const Req* req, Resp* resp) { + absl::Status ret; + Notification n; + (this->*func)(req, resp, [&ret, &n](const absl::Status& s) { + ret = s; + n.Notify(); + }); + n.WaitForNotification(); + return ret; + } + + template + absl::Status CallAndWaitWithOptions(Method func, const Req* req, Resp* resp) { + CallOptions call_opts; + absl::Status ret; + Notification n; + (this->*func)(&call_opts, req, resp, [&ret, &n](const absl::Status& s) { + ret = s; + n.Notify(); + }); + n.WaitForNotification(); + return ret; + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_session.h b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_session.h new file mode 100644 index 00000000..e366accf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/distributed_runtime/worker_session.h @@ -0,0 +1,134 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/distributed_runtime/graph_mgr.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { + +class ClusterFunctionLibraryRuntime; +class GraphMgr; +class WorkerCacheInterface; + +// WorkerSession encapsulates all of the state relating to a given session. +class WorkerSession { + public: + using DistributedFunctionLibraryRuntimeCreator = + std::function( + WorkerSession* worker_session, bool create_worker_session_called, + DeviceMgr* remote_device_mgr)>; + + // Collection of local devices. These devices are typically + // RenamedDevices in all except the SessionMgr.legacy_session_ and + // sessions created with `isolate_session_state == false`. In the + // those cases, this method returns a pointer to a borrowed + // DeviceMgr (typically the `worker_env.device_mgr`). + DeviceMgr* device_mgr() { + return device_mgr_ ? 
device_mgr_.get() : borrowed_device_mgr_; + } + + DynamicDeviceMgr* remote_device_mgr() { return remote_device_mgr_.get(); } + + const string& session_name() const { return session_name_; } + const string& worker_name() const { return worker_name_; } + + WorkerCacheInterface* worker_cache() const { + tf_shared_lock l(worker_session_state_mu_); + return worker_cache_.get(); + } + GraphMgr* graph_mgr() const { return graph_mgr_.get(); } + + DistributedFunctionLibraryRuntime* cluster_flr() const { + return cluster_flr_.get(); + } + + WorkerSession(const string& session_name, const string& worker_name, + std::unique_ptr worker_cache, + std::unique_ptr device_mgr, + std::unique_ptr graph_mgr, + std::unique_ptr remote_device_mgr, + DistributedFunctionLibraryRuntimeCreator cluster_flr_creator); + + static std::shared_ptr CreateWithBorrowedDeviceMgr( + const string& session_name, const string& worker_name, + std::unique_ptr worker_cache, + DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr, + std::unique_ptr remote_device_mgr, + DistributedFunctionLibraryRuntimeCreator cluster_flr_creator); + + // In the eager runtime we allow WorkerSession to be updated, where the + // worker cache will be recreated. If WorkerSession upate is expected and a + // worker in the cache is used in RPCs, the caller should hold a shared + // pointer to avoid the workers getting deleted. + std::shared_ptr GetSharedWorkerCache() { + tf_shared_lock l(worker_session_state_mu_); + return worker_cache_; + } + + // Update an existing worker session with new set of remote workers and + // devices. Added devices will be owned by the worker session, and removed + // devices will be freed by their names. + absl::Status UpdateWorkerCacheAndDevices( + std::unique_ptr new_worker_cache, + std::vector> added_remote_devices, + const std::vector& removed_remote_devices); + + ~WorkerSession(); + + private: + WorkerSession(const string& session_name, const string& worker_name, + std::unique_ptr worker_cache, + DeviceMgr* borrowed_device_mgr, + std::unique_ptr graph_mgr, + std::unique_ptr remote_device_mgr, + DistributedFunctionLibraryRuntimeCreator cluster_flr_creator); + + // The name of the session. + const string session_name_; + + // The name of the worker. E.g., /job:mnist/replica:0/task:1. + const string worker_name_; + + mutable mutex worker_session_state_mu_; + // Object from which WorkerInterface instances can be obtained. + std::shared_ptr worker_cache_ + TF_GUARDED_BY(worker_session_state_mu_); + + // graph_mgr keeps track of the registered graphs of this session. + // + // Note: graph_mgr must be deleted before rendezvous_mgr! + // Note: graph_mgr must be deleted before device_mgr! + const std::unique_ptr graph_mgr_; + + std::unique_ptr cluster_flr_; + + const std::unique_ptr device_mgr_; + DeviceMgr* const borrowed_device_mgr_; // Not owned. + std::unique_ptr remote_device_mgr_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/example/example_parser_configuration.h b/third_party/tflite-hdrs/tensorflow/core/example/example_parser_configuration.h new file mode 100644 index 00000000..dd2aacae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/example/example_parser_configuration.h @@ -0,0 +1,56 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_ +#define TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_ + +#include +#include + +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/example/example_parser_configuration.pb.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/util/example_proto_helper.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +// This is a set of helper methods that will make it possible to share +// tensorflow::Example proto Tensor conversion code inside the ExampleParserOp +// OpKernel as well as in external code. +namespace tensorflow { + +// Given a graph and the node_name of a ParseExample op, +// extract the FixedLenFeature/VarLenFeature configurations. +absl::Status ExtractExampleParserConfiguration( + const tensorflow::GraphDef& graph, const string& node_name, + tensorflow::Session* session, + std::vector* fixed_len_features, + std::vector* var_len_features); + +// Given a config proto, ostensibly extracted via python, +// fill a vector of C++ structs suitable for calling +// the tensorflow.Example -> Tensor conversion code. +absl::Status ExampleParserConfigurationProtoToFeatureVectors( + const ExampleParserConfiguration& config_proto, + std::vector* fixed_len_features, + std::vector* var_len_features); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/example/feature_util.h b/third_party/tflite-hdrs/tensorflow/core/example/feature_util.h new file mode 100644 index 00000000..092fabe6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/example/feature_util.h @@ -0,0 +1,644 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A set of lightweight wrappers which simplify access to Feature protos. +// +// TensorFlow Example proto uses associative maps on top of oneof fields. +// SequenceExample proto uses associative map of FeatureList. +// So accessing feature values is not very convenient. 
+// +// For example, to read a first value of integer feature "tag": +// int id = example.features().feature().at("tag").int64_list().value(0); +// +// to add a value: +// auto features = example->mutable_features(); +// (*features->mutable_feature())["tag"].mutable_int64_list()->add_value(id); +// +// For float features you have to use float_list, for string - bytes_list. +// +// To do the same with this library: +// int id = GetFeatureValues("tag", example).Get(0); +// GetFeatureValues("tag", &example)->Add(id); +// +// Modification of bytes features is slightly different: +// auto tag = GetFeatureValues("tag", &example); +// *tag->Add() = "lorem ipsum"; +// +// To copy multiple values into a feature: +// AppendFeatureValues({1,2,3}, "tag", &example); +// +// GetFeatureValues gives you access to underlying data - RepeatedField object +// (RepeatedPtrField for byte list). So refer to its documentation of +// RepeatedField for full list of supported methods. +// +// NOTE: Due to the nature of oneof proto fields setting a feature of one type +// automatically clears all values stored as another type with the same feature +// key. +// +// This library also has tools to work with SequenceExample protos. +// +// To get a value from SequenceExample.context: +// int id = GetFeatureValues("tag", se.context()).Get(0); +// To add a value to the context: +// GetFeatureValues("tag", se.mutable_context())->Add(42); +// +// To add values to feature_lists: +// AppendFeatureValues({4.0}, +// GetFeatureList("images", &se)->Add()); +// AppendFeatureValues({5.0, 3.0}, +// GetFeatureList("images", &se)->Add()); +// This will create a feature list keyed as "images" with two features: +// feature_lists { +// feature_list { +// key: "images" +// value { +// feature { float_list { value: [4.0] } } +// feature { float_list { value: [5.0, 3.0] } } +// } +// } +// } +// For string-valued features, note that the Append... and Set... functions +// support absl::string_view containers. This allows you to copy existing +// buffers into a Feature with only one copy: +// std::vector image; +// image.push_back(image_buffer); // No copy. +// SetFeatureValues(image, "image", &example); // Copy. +// +// Functions exposed by this library: +// HasFeature<[FeatureType]>(key, proto) -> bool +// Returns true if a feature with the specified key, and optionally +// FeatureType, belongs to the Features or Example proto. +// HasFeatureList(key, sequence_example) -> bool +// Returns true if SequenceExample has a feature_list with the key. +// +// GetFeatureValues(key, proto) -> RepeatedField +// Returns values for the specified key and the FeatureType. +// Supported types for the proto: Example, Features. +// GetFeatureList(key, sequence_example) -> RepeatedPtrField +// Returns Feature protos associated with a key. +// +// AppendFeatureValues(begin, end, feature) +// AppendFeatureValues(container or initializer_list, feature) +// Copies values into a Feature. +// AppendFeatureValues(begin, end, key, proto) +// AppendFeatureValues(container or initializer_list, key, proto) +// Copies values into Features and Example protos with the specified key. +// +// ClearFeatureValues(feature) +// Clears the feature's repeated field of the given type. +// +// SetFeatureValues(begin, end, feature) +// SetFeatureValues(container or initializer_list, feature) +// Clears a Feature, then copies values into it. 
+// SetFeatureValues(begin, end, key, proto) +// SetFeatureValues(container or initializer_list, key, proto) +// Clears Features or Example protos with the specified key, +// then copies values into them. +// +// Auxiliary functions, it is unlikely you'll need to use them directly: +// GetFeatures(proto) -> Features +// A convenience function to get Features proto. +// Supported types for the proto: Example, Features. +// GetFeature(key, proto) -> Feature +// Returns a Feature proto for the specified key. +// Supported types for the proto: Example, Features. +// GetFeatureValues(feature) -> RepeatedField +// Returns values of the feature for the FeatureType. + +#ifndef TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_ +#define TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_ + +#include +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/stringpiece.h" + +// Must come after the import for absl::string_view. +#ifdef ABSL_HAVE_STD_STRING_VIEW +#include +#endif + +namespace tensorflow { +namespace internal { + +// TODO(gorban): Update all clients in a followup CL. +// Returns a reference to a feature corresponding to the name. +// Note: it will create a new Feature if it is missing in the example. +ABSL_DEPRECATED("Use GetFeature instead.") +Feature& ExampleFeature(absl::string_view name, Example* example); + +// Specializations of RepeatedFieldTrait define a type of RepeatedField +// corresponding to a selected feature type. +template +struct RepeatedFieldTrait; + +template <> +struct RepeatedFieldTrait { + using Type = protobuf::RepeatedField; +}; + +template <> +struct RepeatedFieldTrait { + using Type = protobuf::RepeatedField; +}; + +template <> +struct RepeatedFieldTrait { + using Type = protobuf::RepeatedPtrField; +}; + +template <> +struct RepeatedFieldTrait { + using Type = protobuf::RepeatedPtrField; +}; + +// Specializations of FeatureTrait define a type of feature corresponding to a +// selected value type. +template +struct FeatureTrait; + +template +struct FeatureTrait::value>::type> { + using Type = protobuf_int64; +}; + +template +struct FeatureTrait< + ValueType, + typename std::enable_if::value>::type> { + using Type = float; +}; + +template +struct is_string + : public std::integral_constant< + bool, + std::is_same::type>::value || + std::is_same::type>::value> { +}; + +template <> +struct is_string : std::true_type {}; + +template <> +struct is_string : std::true_type {}; + +template <> +struct is_string : std::true_type {}; + +template +struct FeatureTrait< + ValueType, typename std::enable_if::value>::type> { + using Type = std::string; +}; + +// Port of the C++20 `requires` expressions. +template +constexpr bool Requires(F) { + return std::is_invocable::value; +} + +struct NoneSuch {}; + +// True if the Feature map in a tf.Example supports heterogenous lookup. +// See https://abseil.io/tips/144. +// TODO(b/365531379): this cannot be replaced by a lambda because it exposes a +// Clang bug when used in modules. +struct CheckFindFunctor { + template + auto operator()(Container&& c) -> decltype(c.find(NoneSuch{})) {} +}; +inline constexpr bool kFeatureMapHasHeterogeneousLookup = + Requires( + CheckFindFunctor()); + +// Converts an `absl::string_view` into a string-type compatible for use in the +// protobuf library (e.g. 
as lookup keys in `proto2::Map` or as elements addable +// to a `proto2::RepeatedPtrField`) depending on the BUILD mode. +// +// NOTE: While the newest versions of `proto2::Map` support heterogenous lookup, +// it does so through `std::string_view`. If the type is just an alias (as noted +// by `ABSL_USES_STD_STRING_VIEW`) then nothing more needs to be done; however, +// when the type is not an alias an explicit conversion to is necessary. +// +// NOTE: This conversion is only necessary until the migration for protobuf to +// take a dependency on ABSL is complete. +inline auto ProtoMapKey(absl::string_view str) { + if constexpr (kFeatureMapHasHeterogeneousLookup) { +#ifdef ABSL_USES_STD_STRING_VIEW + return str; +#else +#ifdef ABSL_HAVE_STD_STRING_VIEW + return std::string_view(str.data(), str.size()); +#else + return std::string(str); +#endif +#endif + } else { + return std::string(str); + } +} + +} // namespace internal + +// Returns true if sequence_example has a feature_list with the specified key. +bool HasFeatureList(absl::string_view key, + const SequenceExample& sequence_example); + +template +struct TypeHasFeatures : std::false_type {}; + +template <> +struct TypeHasFeatures : std::true_type {}; + +template <> +struct TypeHasFeatures : std::true_type {}; + +template <> +struct TypeHasFeatures : std::true_type {}; + +// A family of template functions to return mutable Features proto from a +// container proto. Supported ProtoTypes: SequenceExample, Example, Features. +template +typename std::enable_if::value, Features*>::type +GetFeatures(ProtoType* proto); + +template <> +Features* GetFeatures(Features* proto); +template <> +Features* GetFeatures(Example* proto); +template <> +Features* GetFeatures(SequenceExample* proto); + +template +typename std::enable_if::value, + const Features&>::type +GetFeatures(const ProtoType& proto); + +template <> +const Features& GetFeatures(const Features& proto); +template <> +const Features& GetFeatures(const Example& proto); +template <> +const Features& GetFeatures(const SequenceExample& proto); + +// Base declaration of a family of template functions to return a read only +// repeated field of feature values. +template +const typename internal::RepeatedFieldTrait::Type& +GetFeatureValues(const Feature& feature); + +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); +template <> +const protobuf::RepeatedPtrField& GetFeatureValues( + const Feature& feature); +template <> +const protobuf::RepeatedPtrField& GetFeatureValues( + const Feature& feature); + +// Returns a read only repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: SequenceExample, +// Example, Features. +template +const typename internal::RepeatedFieldTrait::Type& +GetFeatureValues(absl::string_view key, const ProtoType& proto) { + return GetFeatureValues( + GetFeatures(proto).feature().at(internal::ProtoMapKey(key))); +} + +// Returns a mutable repeated field of a feature values. 
+template +typename internal::RepeatedFieldTrait::Type* GetFeatureValues( + Feature* feature); + +template <> +protobuf::RepeatedField* GetFeatureValues( + Feature* feature); +template <> +protobuf::RepeatedField* GetFeatureValues(Feature* feature); +template <> +protobuf::RepeatedPtrField* GetFeatureValues( + Feature* feature); +template <> +protobuf::RepeatedPtrField* GetFeatureValues( + Feature* feature); + +// Returns a mutable repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: SequenceExample, +// Example, Features. +template +typename internal::RepeatedFieldTrait::Type* GetFeatureValues( + absl::string_view key, ProtoType* proto) { + ::tensorflow::Feature& feature = + (*GetFeatures(proto)->mutable_feature())[internal::ProtoMapKey(key)]; + return GetFeatureValues(&feature); +} + +// Returns a read-only Feature proto for the specified key, throws +// std::out_of_range if the key is not found. Supported types for the proto: +// SequenceExample, Example, Features. +template +const Feature& GetFeature(absl::string_view key, const ProtoType& proto) { + return GetFeatures(proto).feature().at(internal::ProtoMapKey(key)); +} + +// Returns a read-only Feature proto for the specified key, returns nullptr +// if the key is not found. Supported types for the proto: SequenceExample, +// Example, Features. +template +const Feature* MaybeGetFeature(absl::string_view key, const ProtoType& proto) { + const protobuf::Map& feature_map = + GetFeatures(proto).feature(); + auto it = feature_map.find(internal::ProtoMapKey(key)); + + if (it == feature_map.end()) { + return nullptr; + } + + return &it->second; +} + +// Base declaration of a family of template functions to return a read only +// repeated field of feature values or nullptr. +template +const typename internal::RepeatedFieldTrait::Type* +MaybeGetFeatureValues(const Feature& feature); + +template <> +const protobuf::RepeatedField* +MaybeGetFeatureValues(const Feature& feature); +template <> +const protobuf::RepeatedField* MaybeGetFeatureValues( + const Feature& feature); +template <> +const protobuf::RepeatedPtrField* MaybeGetFeatureValues( + const Feature& feature); +template <> +const protobuf::RepeatedPtrField* +MaybeGetFeatureValues(const Feature& feature); + +// Returns a read only repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: SequenceExample, +// Example, Features. +template +const typename internal::RepeatedFieldTrait::Type* +MaybeGetFeatureValues(absl::string_view key, const ProtoType& proto) { + const Feature* feature = MaybeGetFeature(key, proto); + if (feature == nullptr) { + return nullptr; + } + return &GetFeatureValues(*feature); +} + +// Returns a mutable Feature proto for the specified key, creates a new if +// necessary. Supported types for the proto: SequenceExample, Example, Features. +template +Feature* GetFeature(absl::string_view key, ProtoType* proto) { + return &(*GetFeatures(proto)->mutable_feature())[internal::ProtoMapKey(key)]; +} + +// Returns a repeated field with features corresponding to a feature_list key. +const protobuf::RepeatedPtrField& GetFeatureList( + absl::string_view key, const SequenceExample& sequence_example); + +// Returns a mutable repeated field with features corresponding to a +// feature_list key. It will create a new FeatureList if necessary. 
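The angle-bracketed template arguments in the usage examples of the file comment above appear to have been dropped by the diff rendering. As a reminder of what a typical call site looks like, here is a small sketch; `FillExample` is a hypothetical function, and the explicit `tensorflow::protobuf_int64` argument restores what those examples presumably showed.

#include <string>

#include "tensorflow/core/example/example.pb.h"
#include "tensorflow/core/example/feature_util.h"

namespace example {

void FillExample(tensorflow::Example* ex) {
  // Append three int64 values under the key "tag"; the value type selects
  // the int64_list branch of the Feature oneof.
  tensorflow::AppendFeatureValues({1, 2, 3}, "tag", ex);

  // String values go through the bytes_list branch.
  tensorflow::AppendFeatureValues({"lorem", "ipsum"}, "label", ex);

  // Read the first value back; GetFeatureValues returns the underlying
  // RepeatedField, so the usual protobuf accessors apply.
  tensorflow::protobuf_int64 first =
      tensorflow::GetFeatureValues<tensorflow::protobuf_int64>("tag", *ex)
          .Get(0);
  (void)first;
}

}  // namespace example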
+protobuf::RepeatedPtrField* GetFeatureList( + absl::string_view feature_list_key, SequenceExample* sequence_example); + +template +void AppendFeatureValues(IteratorType first, IteratorType last, + Feature* feature) { + using FeatureType = typename internal::FeatureTrait< + typename std::iterator_traits::value_type>::Type; + auto& values = *GetFeatureValues(feature); + values.Reserve(std::distance(first, last)); + for (auto it = first; it != last; ++it) { + *values.Add() = *it; + } +} + +template +void AppendFeatureValues(std::initializer_list container, + Feature* feature) { + using FeatureType = typename internal::FeatureTrait::Type; + auto& values = *GetFeatureValues(feature); + values.Reserve(container.size()); + for (auto& elt : container) { + *values.Add() = std::move(elt); + } +} + +namespace internal { + +// HasSize::value is true_type if T has a size() member. +template +struct HasSize : std::false_type {}; + +template +struct HasSize().size())>> + : std::true_type {}; + +// Reserves the container's size, if a container.size() method exists. +template +auto ReserveIfSizeAvailable(const ContainerType& container, + RepeatedFieldType& values) -> + typename std::enable_if_t::value, void> { + values.Reserve(container.size()); +} + +template +auto ReserveIfSizeAvailable(const ContainerType& container, + RepeatedFieldType& values) -> + typename std::enable_if_t::value, void> {} + +} // namespace internal + +template +void AppendFeatureValues(const ContainerType& container, Feature* feature) { + using IteratorType = typename ContainerType::const_iterator; + using FeatureType = typename internal::FeatureTrait< + typename std::iterator_traits::value_type>::Type; + auto* values = GetFeatureValues(feature); + internal::ReserveIfSizeAvailable(container, *values); + // This is equivalent to std::copy into `values` with a + // RepeatedFieldBackInserter, the difference is RFBI isn't compatible with + // types that we want to convert (e.g. absl::string_view -> std::string). + for (const auto& elt : container) { + if constexpr (internal::is_string::value) { + *values->Add() = std::string(elt); + } else { + *values->Add() = elt; + } + } +} + +// Copies elements from the range, defined by [first, last) into the feature +// obtainable from the (proto, key) combination. +template +void AppendFeatureValues(IteratorType first, IteratorType last, + absl::string_view key, ProtoType* proto) { + AppendFeatureValues(first, last, GetFeature(key, GetFeatures(proto))); +} + +// Copies all elements from the container into a feature. +template +void AppendFeatureValues(const ContainerType& container, absl::string_view key, + ProtoType* proto) { + AppendFeatureValues(container, + GetFeature(key, GetFeatures(proto))); +} + +// Copies all elements from the initializer list into a Feature contained by +// Features or Example proto. +template +void AppendFeatureValues(std::initializer_list container, + absl::string_view key, ProtoType* proto) { + AppendFeatureValues(container, + GetFeature(key, GetFeatures(proto))); +} + +// Clears the feature's repeated field (int64, float, or string). +template +void ClearFeatureValues(Feature* feature); + +template <> +void ClearFeatureValues(Feature* feature); +template <> +void ClearFeatureValues(Feature* feature); +template <> +void ClearFeatureValues(Feature* feature); +template <> +void ClearFeatureValues(Feature* feature); + +// Clears the feature's repeated field (int64, float, or string). 
Copies +// elements from the range, defined by [first, last) into the feature's repeated +// field. +template +void SetFeatureValues(IteratorType first, IteratorType last, Feature* feature) { + using FeatureType = typename internal::FeatureTrait< + typename std::iterator_traits::value_type>::Type; + ClearFeatureValues(feature); + AppendFeatureValues(first, last, feature); +} + +// Clears the feature's repeated field (int64, float, or string). Copies all +// elements from the initializer list into the feature's repeated field. +template +void SetFeatureValues(std::initializer_list container, + Feature* feature) { + using FeatureType = typename internal::FeatureTrait::Type; + ClearFeatureValues(feature); + AppendFeatureValues(container, feature); +} + +// Clears the feature's repeated field (int64, float, or string). Copies all +// elements from the container into the feature's repeated field. +template +void SetFeatureValues(const ContainerType& container, Feature* feature) { + using IteratorType = typename ContainerType::const_iterator; + using FeatureType = typename internal::FeatureTrait< + typename std::iterator_traits::value_type>::Type; + ClearFeatureValues(feature); + AppendFeatureValues(container, feature); +} + +// Clears the feature's repeated field (int64, float, or string). Copies +// elements from the range, defined by [first, last) into the feature's repeated +// field. +template +void SetFeatureValues(IteratorType first, IteratorType last, + absl::string_view key, ProtoType* proto) { + SetFeatureValues(first, last, GetFeature(key, GetFeatures(proto))); +} + +// Clears the feature's repeated field (int64, float, or string). Copies all +// elements from the container into the feature's repeated field. +template +void SetFeatureValues(const ContainerType& container, absl::string_view key, + ProtoType* proto) { + SetFeatureValues(container, + GetFeature(key, GetFeatures(proto))); +} + +// Clears the feature's repeated field (int64, float, or string). Copies all +// elements from the initializer list into the feature's repeated field. +template +void SetFeatureValues(std::initializer_list container, + absl::string_view key, ProtoType* proto) { + SetFeatureValues(container, GetFeature(key, GetFeatures(proto))); +} + +// Returns true if a feature with the specified key belongs to the Features. +// The template parameter pack accepts zero or one template argument - which +// is FeatureType. If the FeatureType not specified (zero template arguments) +// the function will not check the feature type. Otherwise it will return false +// if the feature has a wrong type. +template +bool HasFeature(absl::string_view key, const Features& features); + +template <> +bool HasFeature<>(absl::string_view key, const Features& features); +template <> +bool HasFeature(absl::string_view key, + const Features& features); +template <> +bool HasFeature(absl::string_view key, const Features& features); +template <> +bool HasFeature(absl::string_view key, const Features& features); +template <> +bool HasFeature(absl::string_view key, const Features& features); + +// Returns true if a feature with the specified key belongs to the Example. +// Doesn't check feature type if used without FeatureType, otherwise the +// specialized versions return false if the feature has a wrong type. +template +bool HasFeature(absl::string_view key, const Example& example) { + return HasFeature(key, GetFeatures(example)); +} + +// Returns true if a feature with the specified key belongs to the +// SequenceExample. 
Doesn't check feature type if used without FeatureType, +// otherwise the specialized versions return false if the feature has a wrong +// type. +template +bool HasFeature(absl::string_view key, + const SequenceExample& sequence_example) { + return HasFeature(key, GetFeatures(sequence_example)); +} + +// TODO(gorban): update all clients in a followup CL. +template +ABSL_DEPRECATED("Use HasFeature instead.") +bool ExampleHasFeature(absl::string_view key, const Example& example) { + return HasFeature(key, example); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/allocator.h b/third_party/tflite-hdrs/tensorflow/core/framework/allocator.h new file mode 100644 index 00000000..dbf2c29f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/allocator.h @@ -0,0 +1,55 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ + +#include + +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "xla/tsl/framework/allocator.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/numa.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::AllocationAttributes; +using tsl::Allocator; +using tsl::AllocatorAttributes; +using tsl::AllocatorMemoryType; +using tsl::AllocatorStats; +using tsl::AllocatorWrapper; +using tsl::cpu_allocator; +using tsl::cpu_allocator_base; +using tsl::CPUAllocatorFullStatsEnabled; +using tsl::CPUAllocatorStatsEnabled; +using tsl::DisableCPUAllocatorStats; +using tsl::EnableCPUAllocatorFullStats; +using tsl::EnableCPUAllocatorStats; +using tsl::SubAllocator; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/allocator_registry.h b/third_party/tflite-hdrs/tensorflow/core/framework/allocator_registry.h new file mode 100644 index 00000000..7bc03241 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/allocator_registry.h @@ -0,0 +1,40 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Classes to maintain a static registry of memory allocator factories. +#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ + +#include +#include + +#include "xla/tsl/framework/allocator_registry.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/numa.h" + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::AllocatorFactory; +using tsl::AllocatorFactoryRegistration; +using tsl::AllocatorFactoryRegistry; +using tsl::ProcessStateInterface; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/attr_value_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/attr_value_util.h new file mode 100644 index 00000000..b6f7c972 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/attr_value_util.h @@ -0,0 +1,142 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +namespace attr_value_util_internal { +// Return the size of the tensor represented by this TensorProto. If shape is +// not fully defined return -1. +int64_t TensorByteSize(const TensorProto& t); +} // namespace attr_value_util_internal + +// Forward declare protos so their symbols can be removed from .so exports +class AttrValue; +class NameAttrList; + +// A human-readable rendering of attr_value, that is more concise than a +// text-format proto. +std::string SummarizeAttrValue(const AttrValue& attr_value); + +// Generates an error if attr_value doesn't have the indicated attr type. 
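A brief sketch of how the attr_value_util helpers declared below compose; `Demo` is a hypothetical function, and the `"int"` type string follows the header's own `ParseAttrValue` example.

#include <cstdint>
#include <iostream>

#include "absl/status/status.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/attr_value_util.h"

namespace example {

void Demo() {
  tensorflow::AttrValue attr;
  // Store an int attribute; SetAttrValue picks the matching oneof field.
  tensorflow::SetAttrValue(static_cast<int64_t>(-14), &attr);

  // Human-readable rendering, more concise than text-format proto.
  std::cout << tensorflow::SummarizeAttrValue(attr) << std::endl;

  // Verify the stored value against an expected attr type string.
  absl::Status ok = tensorflow::AttrValueHasType(attr, "int");
  (void)ok;
}

}  // namespace example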
+absl::Status AttrValueHasType(const AttrValue& attr_value, + absl::string_view type); + +// Converts a text proto value from "text" into the field of *out +// indicated by "type" (e.g. from the type field of an AttrDef). +// Examples: +// * If type:"int" and text:"-14", then *out is set to "i: -14" +// * If type:"list(string)" and text:"['foo', 'bar']", +// then *out is set to "list { s: ['foo', 'bar'] }" +// Returns true on success. +bool ParseAttrValue(absl::string_view type, absl::string_view text, + AttrValue* out); + +// Sets *out based on the type of value. +void SetAttrValue(const std::string& value, AttrValue* out); +void SetAttrValue(const tstring& value, AttrValue* out); +void SetAttrValue(const char* value, AttrValue* out); +void SetAttrValue(absl::string_view value, AttrValue* out); +void SetAttrValue(int64_t value, AttrValue* out); +void SetAttrValue(int32_t value, AttrValue* out); +void SetAttrValue(float value, AttrValue* out); +void SetAttrValue(double value, AttrValue* out); +void SetAttrValue(bool value, AttrValue* out); +void SetAttrValue(DataType value, AttrValue* out); +void SetAttrValue(const TensorShape& value, AttrValue* out); +void SetAttrValue(const TensorShapeProto& value, AttrValue* out); +void SetAttrValue(const PartialTensorShape& value, AttrValue* out); +void SetAttrValue(const Tensor& value, AttrValue* out); +void SetAttrValue(const TensorProto& value, AttrValue* out); +void SetAttrValue(const NameAttrList& value, AttrValue* out); + +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(const std::vector& value, AttrValue* out); +void SetAttrValue(std::initializer_list value, AttrValue* out); +void SetAttrValue(DataTypeSlice value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); +void SetAttrValue(absl::Span value, AttrValue* out); + +void SetAttrValue(const AttrValue& value, AttrValue* out); + +void MoveAttrValue(std::vector&& value, AttrValue* out); + +// Returns a hash of `a` that is consistent with AreAttrValuesEqual. In other +// words, if two AttrValues compare equal according to AreAttrValuesEqual, +// they will have the same hash value. +// Similarly to protobuf deterministic serialization, hash value is +// guaranteed to be stable only for a given binary. In particular, one should +// probably not persist the returned value. +uint64 AttrValueHash(const AttrValue& a); + +// WARNING: Equality check might return false-negative for large (> 32mb) +// tensors defined with different TensorProto representations. +// +// A pair of consistent hash and equals functions that are guaranteed to be fast +// with AttrValues that potentially can have very large Tensors (larger than +// 32mb) defined by TensorProto. If large identical Tensors are defined using +// different representations (e.g. 
one with tensor content, and second with +// bool_val), they will have different hash code and equals will return false. +// Small (less than 32mb) tensors with different TensorProto representations +// hashed/compared by their tensor content. +uint64 FastAttrValueHash(const AttrValue& a); +// Returns true if a and b have the same value. If false negatives are allowed, +// then compares proto representation to avoid construction of large (> 32mb) +// tensors. +bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b, + bool allow_false_negatives = false); + +// Returns true if "val" has a placeholder. +bool HasPlaceHolder(const AttrValue& val); + +// SubstitutePlaceholders recursively replaces placeholders in 'value' +// with an attr value by calling SubstituteFunc. Returns true iff all +// placeholders in "value" are replaced with a value. +// +// SubstituteFunc is given a placeholder string. If the placeholder is +// unknown, SubstituteFunc returns false. Otherwise, overwrites the +// attr value and returns true. +using SubstituteFunc = std::function; +bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/bfloat16.h b/third_party/tflite-hdrs/tensorflow/core/framework/bfloat16.h new file mode 100644 index 00000000..4f13039d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/bfloat16.h @@ -0,0 +1,61 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_ +#define TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_ + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/platform/types.h" + +// Compact 16-bit encoding of floating point numbers. This representation uses +// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. It +// is assumed that floats are in IEEE 754 format so the representation is just +// bits 16-31 of a single precision float. +// +// NOTE: The IEEE floating point standard defines a float16 format that +// is different than this format (it has fewer bits of exponent and more +// bits of mantissa). We don't use that format here because conversion +// to/from 32-bit floats is more complex for that format, and the +// conversion for this format is very simple. +// +// Because of the existing IEEE float16 type, we do not name our representation +// "float16" but just use "uint16". +// +// <-----our 16bits float-------> +// s e e e e e e e e f f f f f f f f f f f f f f f f f f f f f f f +// <------------------------------float--------------------------> +// 3 3 2 2 1 1 0 +// 1 0 3 2 5 4 0 +// +// +// This type only supports conversion back and forth with float. +// +// This file must be compilable by nvcc. +// +// The type is defined in framework/numeric_types.h. 
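As an illustration of the bit layout described above (and only that; the library's declarations follow below), a bfloat16 value is simply the upper 16 bits of an IEEE-754 binary32 float. The stand-alone helpers in this sketch are hypothetical names showing the truncating conversion; the header's `RoundFloatToBFloat16` additionally rounds to nearest even rather than truncating.

#include <cstdint>
#include <cstring>

// Truncating float -> bfloat16: keep bits 16..31 of the binary32 encoding.
static inline uint16_t TruncateToBFloat16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // type-pun safely via memcpy
  return static_cast<uint16_t>(bits >> 16);
}

// Lossless bfloat16 -> float: re-extend with zeroed low mantissa bits.
static inline float BFloat16BitsToFloat(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}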
+ +namespace tensorflow { + +// Convert from float to bfloat16 with rounding-to-nearest-even. +void RoundFloatToBFloat16(const float* src, bfloat16* dst, int64_t size); +// Convert from float to bfloat16 with truncation. Notice this conversion is +// lossy since it truncates the float to 7 mantissa bits without rounding. +void FloatToBFloat16(const float* src, bfloat16* dst, int64_t size); +// Convert from bfloat16 to float. This conversion is lossless. +void BFloat16ToFloat(const bfloat16* src, float* dst, int64_t size); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/bounds_check.h b/third_party/tflite-hdrs/tensorflow/core/framework/bounds_check.h new file mode 100644 index 00000000..76e6e6dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/bounds_check.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_ +#define TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_ + +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Check that 0 <= index < limit using a single comparison, assuming +// that 0 <= limit if Index is signed. Intended for use in performance +// critical contexts where 0 <= index < limit is almost always true. +template +EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool FastBoundsCheck(const Ta index, + const Tb limit) { + static_assert(std::is_integral::value && std::is_integral::value, + "FastBoundsCheck can only be used on integer types."); + typedef typename std::make_unsigned::type UIndex; + return TF_PREDICT_TRUE(static_cast(index) < + static_cast(limit)); +} + +namespace internal { +// Ensure that the compiler cannot elide a copy into a local, for +// bounds checking on source tensors that might be updated asynchronously. +// This function may only be used on primitive integral types (int32, int64, +// etc). It does not guarantee any atomicity or barriers. +template +EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) { + static_assert(std::is_integral::value, + "SubtleMustCopy can only be used on integer types."); + auto *to_x = reinterpret_cast(&x); + return *to_x; +} +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/cancellation.h b/third_party/tflite-hdrs/tensorflow/core/framework/cancellation.h new file mode 100644 index 00000000..522de22c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/cancellation.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_ +#define TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_ + +#include "xla/tsl/framework/cancellation.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CancelCallback; +using tsl::CancellationManager; +using tsl::CancellationToken; +using tsl::RegisterCancellationCallback; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/collective.h b/third_party/tflite-hdrs/tensorflow/core/framework/collective.h new file mode 100644 index 00000000..8fca00f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/collective.h @@ -0,0 +1,522 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_ + +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/intrusive_ptr.h" + +namespace tensorflow { + +class BufRendezvous; +class CompleteGroupRequest; +class CompleteGroupResponse; +class CompleteInstanceRequest; +class CompleteInstanceResponse; +class Device; +class DeviceMgr; +class GetStepSequenceRequest; +class GetStepSequenceResponse; +class NcclManager; +class Tensor; + +// Types of supported collective operations. +enum CollectiveType { + REDUCTION_COLLECTIVE = 0, + BROADCAST_COLLECTIVE, + GATHER_COLLECTIVE, + PERMUTE_COLLECTIVE, + ALL_TO_ALL_COLLECTIVE, + REDUCE_SCATTER_COLLECTIVE, + UNDEFINED_COLLECTIVE, +}; + +// Some collective op implementations require runtime group configuration from +// the OpKernel. 
Currently, this struct is used to set communicator key for +// NCCL-based collective implementation. +struct CollGroupRuntimeDetails { + string communicator_key; // for communicator-based techniques e.g. NCCL + string ToString() const; +}; + +struct CollGroupMember { + DeviceAttributes device; + string task; + bool is_local; + // User provided rank + int32 rank = -1; +}; + +// Data common to all members of a device group. +// All members share the same device set but its order is +// particular to an instance so it is stored there. +struct CollGroupParams { + // Inputs from Collective ops: + int32 group_key; + int32 group_size; + DeviceType device_type; + int user_specified_rank = -1; // rank provided by the user. + // Generated from Collective Group Resolver: + // Members in this group, in default rank order. + std::vector members; + // True if every task has the same number of devices. + bool same_num_devices_per_task = false; + // Task -> number of devices on that task. + std::unordered_map num_devices_per_task; + int32 num_tasks; // number of distinct tasks in group + CollGroupRuntimeDetails runtime_details; + string ToString() const; + CollGroupParams() + : group_key(0), group_size(0), device_type(DEVICE_CPU), num_tasks(0) {} +}; + +// The best implementation of a collective op depends on many factors +// including the number of devices involved, the topology of +// interconnects between them and the sizes of inputs. This structure +// is used in generating and representing data movement choreography +// for each specific algorithm, hence it does not have a single, fixed +// interpretation. On first execution the runtime will update this +// structure with decisions that will guide all subsequent executions. +struct CollImplDetails { + string collective_name; + std::vector> subdiv_permutations; + // subdiv_offsets and max_subdivs_per_device are used together as follows: + // When subdiv_offsets is provided (non-empty) it is used as is. When + // subdiv_offsets is not provided subdivisons are generated dynamically + // constrained by max_subdivs_per_device. When subdiv_offsets is empty AND + // max_subdivs_per_device = 0 an internal default kMaxSubdivsPerDeviceDefault + // is used. When max_subdivs_per_device = -1, no subivision is done. + int max_subdivs_per_device = -1; // Upper bound on subdivisions per device. + std::vector subdiv_offsets; + std::vector subdiv_source_rank; // rank of source in each subdiv + std::vector + dependencies; // collective instances on which this node depends + string communication_hint; // user-supplied hint for implementation choice, + // e.g. ring or nccl + float timeout_seconds; // If non zero, set a completion timeout for the + // collective op to detect staleness. +}; + +// Data common to all members of a collective instance. +// TODO(b/163171014) Refactor this struct to not be a union of all fields. +struct CollInstanceParams { + // Identifies all participating graph nodes. + int32 instance_key = -1; + // The full identifier includes both instance_key and step_id. + int64_t step_id = 0; + CollectiveType type = UNDEFINED_COLLECTIVE; + DataType data_type = DT_FLOAT; + TensorShape shape = {0}; + CollImplDetails impl_details; + string ToString() const; + CollInstanceParams& operator=(const struct CollInstanceParams& other); + std::vector devices; // permuter only + + // For permuter only + // Each rank in the permutation is a receiver. + // Indices of each rank means a sender to that rank. 
+ // Example: permutation = {2,0,1} means + // rank 0 sends to rank 2 + // rank 1 sends to rank 0 + // rank 2 sends to rank 1 + std::vector permutation; +}; + +// Unique to a single CollectiveOp node. +struct CollectiveParams : public core::RefCounted { + CollGroupParams group; + CollInstanceParams instance; + + string name = ""; // node name used only for log or error messages + int default_rank = -1; // index of this op within device_names + bool is_source = false; // broadcast only + int source_rank = -1; // broadcast only + // Rank of this device in each subdivision permutation. + std::vector subdiv_rank; + OpKernel* merge_op = nullptr; // reduction only + OpKernel* final_op = nullptr; // reduction only + string ToString() const; + bool run_group_initialization = true; + bool is_stateless = false; +}; + +class CollectiveExecutor; + +// Interface that provides resolution of device localities. +class DeviceResolverInterface { + public: + virtual ~DeviceResolverInterface() {} + + // Populates *attributes with the DeviceAttributes of the specified device. + virtual absl::Status GetDeviceAttributes(const string& device, + DeviceAttributes* attributes) = 0; + + // Returns all device attributes of a task. + virtual absl::Status GetAllDeviceAttributes( + const string& task, std::vector* attributes) = 0; + + // Updates device attributes. It returns error if any device already + // exists in the DeviceResolver and has a different incarnation. + virtual absl::Status UpdateDeviceAttributes( + const std::vector& attributes) = 0; +}; + +// Interface that provides resolution of shared CollectiveParams fields. +class ParamResolverInterface { + public: + virtual ~ParamResolverInterface() {} + + // Called by each collective op at first execution in order to fill out + // the CollectiveParams structure with data gathered from the full + // (maybe distributed) collection of peer nodes. + virtual void CompleteParamsAsync(const DeviceAttributes& device, + CollectiveParams* cp, + CancellationManager* cancel_mgr, + const StatusCallback& done) = 0; + + // Completes group_params with data gathered from all devices in the group. + // This blocks until all devices are there. + virtual void CompleteGroupAsync(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + const StatusCallback& done) = 0; + + // Used within a distributed implementation to discover/verify data + // shared across an instance group. + // Note: this works differently from CompleteGroupAsync as a refactor is in + // progress. + virtual void CompleteInstanceAsync(const CompleteInstanceRequest* request, + CompleteInstanceResponse* response, + CancellationManager* cancel_mgr, + const StatusCallback& done) = 0; + + // Looks up a group. It returns an error if the group is not ready or not + // found. + virtual absl::Status LookupGroup(int32_t group_key, + CollGroupParams* group) = 0; + + // Aborts the resolver. After abortion the resolver can no longer be used. + virtual void StartAbort(const absl::Status& s) = 0; +}; + +// Graphs which utilize Collective Ops in a common instance must +// execute with identical step_ids even if they are disjoint graphs +// run by otherwise independent tasks. This interface supplies +// coordinated step_ids to use in such cases. +class StepSequenceInterface { + public: + virtual ~StepSequenceInterface() {} + + // Used with a distributed implementation to coordinate step_id + // sequences across tasks. 
+ virtual void GetStepSequenceAsync(const GetStepSequenceRequest* request, + GetStepSequenceResponse* response, + const StatusCallback& done) = 0; + + // Refresh the local per-graph_key step_id sequence from collective + // group leader, if applicable. + virtual void RefreshStepIdSequenceAsync(int64_t graph_key, + const StatusCallback& done) = 0; + + // Returns the step_id that should be used for initiating a new execution + // on the specified graph. May return the same step_id multiple times if + // RetireStepId or RefreshStepIdReservation is not called. + virtual int64_t NextStepId(int64_t graph_key) = 0; + + // Reports that execution of the given step has completed successfully. + // Should be called immediately after a step completes with OK status, + // prior to calling NextStepId(). If the step fails, don't call. + virtual void RetireStepId(int64_t graph_key, int64_t step_id) = 0; +}; + +class NcclCommunicatorInterface; + +// Interface that provides access to per-step CollectiveExecutor +// instances and various distributed resolution capabilities. +class CollectiveExecutorMgrInterface : public StepSequenceInterface { + public: + ~CollectiveExecutorMgrInterface() override {} + + // Returns the step-specific CollectiveExecutor, creating if one does not + // already exist. The caller assumes ownership of one Ref on the object. + virtual CollectiveExecutor* FindOrCreate(int64_t step_id) = 0; + + // If there is a CollectiveExecutor for step_id, remove it from the + // table. + virtual void Cleanup(int64_t step_id) = 0; + + // Cleanup the entire table, removing all entries for step_ids. + virtual void CleanupAll() = 0; + + virtual ParamResolverInterface* GetParamResolver() const = 0; + + virtual DeviceResolverInterface* GetDeviceResolver() const = 0; + + virtual NcclCommunicatorInterface* GetNcclCommunicator() const = 0; +}; + +// Interface that a Collective Op implementation uses to exchange data +// with peers. Note that data exchange is currently limited to types +// for which DMAHelper::CanUseDMA() returns true, i.e. dense numeric +// types. +class CollectiveRemoteAccess { + public: + virtual ~CollectiveRemoteAccess() {} + + virtual void RecvFromPeer(const string& peer_device, const string& peer_task, + bool peer_is_local, const string& key, + Device* to_device, DeviceContext* to_device_ctx, + const AllocatorAttributes& to_alloc_attr, + Tensor* to_tensor, + const DeviceLocality& client_locality, + int dev_to_dev_stream_index, + CancellationManager* cancellation_manager, + const StatusCallback& done) = 0; + + virtual void PostToPeer(const string& peer_device, const string& peer_task, + const string& key, Device* from_device, + DeviceContext* from_device_ctx, + const AllocatorAttributes& from_alloc_attr, + const Tensor* from_tensor, + const DeviceLocality& client_locality, + CancellationManager* cancellation_manager, + const StatusCallback& done) = 0; + + // Checks the health of a collective peer. It probes the peer to see if it is + // alive. Note that if a peer has restarted, it's considered a different one, + // so CheckPeerHealth fails. + virtual void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms, + const StatusCallback& done) = 0; + + virtual BufRendezvous* buf_rendezvous() = 0; + + virtual void StartAbort(const absl::Status& s) = 0; +}; + +// A step-specific object that can execute a collective operation completely +// described by a CollectiveParams object. 
+class CollectiveExecutor : public core::RefCounted { + public: + virtual void StartAbort(const absl::Status& s) {} + + virtual void ExecuteAsync(OpKernelContext* ctx, + const CollectiveParams* col_params, + const string& exec_key, StatusCallback done) { + done(errors::Internal( + "A collective Op has been called in a context in which " + "a CollectiveExecutor has not been provided.")); + } + + virtual void CompleteParamsAsync(const DeviceAttributes& device, + CollectiveParams* cp, + CancellationManager* cancel_mgr, + StatusCallback done) { + done(errors::Internal( + "A collective Op has been called in a context in which " + "a CollectiveExecutor has not been provided.")); + } + + virtual void CompleteGroupAsync(const DeviceAttributes& device, + CollGroupParams* group_params, + CancellationManager* cancel_mgr, + StatusCallback done) { + return cem_->GetParamResolver()->CompleteGroupAsync(device, group_params, + cancel_mgr, done); + } + + virtual absl::Status LookupGroup(int32_t group_key, CollGroupParams* group) { + return cem_->GetParamResolver()->LookupGroup(group_key, group); + } + + // Runs the potentially-blocking closure/expensive callback. + virtual void RunClosure(std::function closure) = 0; + + virtual CollectiveRemoteAccess* remote_access() { return nullptr; } + + // `WaitForDependencies` and `Launched` are used for fine-grained control of + // execution order between collective instances. These functions are intended + // to be called in `Run` function of collective implementations, and may be + // used to make part, or whole, of the collective execution ordered with + // respect to other collective instances. + // + // `WaitForDependencies` will block until it is safe to continue the callee's + // execution, where safety is defined as: ordered with respect to the + // collective instances defined in the callee's `wait_for` attribute. + virtual void WaitForDependencies(const CollectiveParams& col_params) {} + // `UnblockDependencies` unblocks the dependent collective instances by + // recording that this caller's device has completed the critical portion of + // the collective execution. + virtual void UnblockDependencies(const CollectiveParams& col_params) {} + + // Used to designate an invalid group or instance key. + static int64_t kInvalidId; + + // Lexically scoped handle for Ref. 
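+  // As a usage sketch (the `cem` pointer below is illustrative, not part of
+  // this header): an executor returned by
+  // CollectiveExecutorMgrInterface::FindOrCreate() already carries one Ref,
+  // so wrapping it in a Handle with inherit_ref=true releases that Ref when
+  // the Handle leaves scope:
+  //
+  //   CollectiveExecutor::Handle handle(cem->FindOrCreate(step_id),
+  //                                     /*inherit_ref=*/true);
+  //   handle.get()->ExecuteAsync(...);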
+ class Handle { + public: + explicit Handle(CollectiveExecutor* ce, bool inherit_ref) : ce_(ce) { + if (!inherit_ref) ce->Ref(); + } + ~Handle() { ce_->Unref(); } + CollectiveExecutor* get() const { return ce_; } + + private: + CollectiveExecutor* ce_; + }; + + protected: + explicit CollectiveExecutor(CollectiveExecutorMgrInterface* cem) + : cem_(cem) {} + + // For use only by derived classes + static OpKernelContext::Params* CtxParams(OpKernelContext* ctx); + CollectiveExecutorMgrInterface* cem_; + + CollectiveExecutor(const CollectiveExecutor&) = delete; + void operator=(const CollectiveExecutor&) = delete; +}; + +struct CollectiveContext { + CollectiveExecutor* col_exec; // Not owned + NcclCommunicatorInterface* nccl_communicator; // Not owned + const DeviceMgr* dev_mgr; // Not owned + OpKernelContext* op_ctx; // Not owned + OpKernelContext::Params* op_params; // Not owned + core::IntrusivePtr col_params; + const string exec_key; + const int64_t step_id; + const Tensor* input; // Not owned + Tensor* output; // Not owned + Device* device; // The device for which this instance labors + const string device_name; + DeviceLocality device_locality; + + CollectiveContext(CollectiveExecutor* col_exec, + NcclCommunicatorInterface* nccl_communicator, + const DeviceMgr* dev_mgr, OpKernelContext* ctx, + OpKernelContext::Params* op_params, + const CollectiveParams* col_params, const string& exec_key, + int64_t step_id, const Tensor* input, Tensor* output); +}; + +class NcclCommunicatorInterface { + public: + virtual ~NcclCommunicatorInterface() = default; + + virtual string GenerateCommunicatorKey() = 0; + + virtual void Enqueue(std::shared_ptr col_ctx, + StatusCallback done) = 0; + + virtual void StartAbort(const absl::Status& s) = 0; +}; + +// Interface of a Collective Op implementation. Each specific CollectiveOp will +// implement this interface and register the implementation via the +// CollectiveRegistry detailed below. See common_runtime/ring_reducer and +// common_runtime/hierarchical_tree_broadcaster for examples. +class CollectiveImplementationInterface : public core::RefCounted { + public: + ~CollectiveImplementationInterface() override = default; + + // Initializes the portions of `col_params` specific to this + // implementation. Called exactly once for every Collective instance during + // the CollectiveParams resolution process when the graph is first executed, + // at the end of `CompleteInstanceLocal()`. + // NOTE(ayushd): This is effectively a static function because it modifies the + // `col_params` passed in and should not manipulate any data members. However + // because it is virtual and needs to be implemented by every derived class we + // do not mark it as static. + virtual absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) = 0; + + // Prepares the CollectiveContext for executing this CollectiveImplementation. + // Called from CollectiveExecutor right before calling Run(). The + // CollectiveContext passed in must outlive the CollectiveImplementation + // object. + virtual absl::Status InitializeCollectiveContext( + std::shared_ptr col_ctx) = 0; + + // Processes and moves data according to the logic of this Collective + // implementation. Relies on appropriate initialization of op-specific + // CollectiveParams in InitializeCollectiveParams(), as well as appropriate + // context initialization in InitializeCollectiveContext(). 
+ virtual void Run(StatusCallback done) = 0; +}; + +// Static-methods only class for registering and looking up collective +// implementations. +class CollectiveRegistry { + public: + using Factory = std::function; + // Looks up a previously registered CollectiveImplementation under + // `collective_name`. If found, creates an instance of the implementation and + // assign to `implementation`. + static absl::Status Lookup( + const string& collective_name, + CollectiveImplementationInterface** implementation); + + // Looks up a previously registered CollectiveImplementation under + // `collective_name`. If found, returns the static instance of this + // implementation via `implementation`. This instance should only be used to + // call InitializateCollectiveParams. + static absl::Status LookupParamResolverInstance( + const string& collective_name, + CollectiveImplementationInterface** implementation); + + // Returns all registered collective implementations. + static void GetAll( + std::vector* implementations); + + private: + friend class CollectiveRegistration; + // Registers a CollectiveImplementation with name `collective_name` and + // factory `factory`. The latter is a function used to create instances of + // the CollectiveImplementation. Also creates a static instance of the + // implementation - this instance is used during param resolution and should + // only be used to call InitializeCollectiveParams. + static absl::Status Register(const string& collective_name, Factory factory); + + static absl::Status LookupHelper( + const string& collective_name, + CollectiveImplementationInterface** implementation, bool param_resolver); +}; + +// Class used to call CollectiveRegistry::Register. This should only be used to +// create a global static object. +class CollectiveRegistration { + public: + CollectiveRegistration(const string& collective_name, + CollectiveRegistry::Factory factory) { + TF_CHECK_OK(CollectiveRegistry::Register(collective_name, factory)); + } +}; + +#define REGISTER_COLLECTIVE(name, implementation) \ + static CollectiveRegistration register_##name##_collective( \ + #name, []() { return new implementation; }); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/common_shape_fns.h b/third_party/tflite-hdrs/tensorflow/core/framework/common_shape_fns.h new file mode 100644 index 00000000..1be1633f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/common_shape_fns.h @@ -0,0 +1,313 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_ + +#include + +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +namespace shape_inference { + +// Like GetWindowedOutputSize, but deals with DimensionHandles. Does not support +// EXPLICIT padding. +absl::Status GetWindowedOutputSizeFromDims(InferenceContext* c, + DimensionHandle input_size, + DimensionOrConstant filter_size, + int64_t stride, Padding padding_type, + DimensionHandle* output_size); + +// The V2 version computes the same outputs with arbitrary dilation_rate, and +// supports EXPLICIT padding. For detailed equations, refer to the comments +// for GetWindowedOutputSize(). The 'padding_before' and 'padding_after' +// parameters are only used if padding_type == EXPLICIT. +absl::Status GetWindowedOutputSizeFromDimsV2( + InferenceContext* c, DimensionHandle input_size, + DimensionOrConstant filter_size, int64_t dilation_rate, int64_t stride, + Padding padding_type, int64_t padding_before, int64_t padding_after, + DimensionHandle* output_size); + +// Transfers shape of input(0) to output(0). +absl::Status UnchangedShape(shape_inference::InferenceContext* c); + +// Transfers shape of input(0) to output(0), after asserting its rank is . +inline absl::Status UnchangedShapeWithRank(shape_inference::InferenceContext* c, + int32_t rank) { + ShapeHandle out; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &out)); + c->set_output(0, out); + return absl::OkStatus(); +} + +// Transfers shape of input(0) to output(0), after asserting its rank >= . +inline absl::Status UnchangedShapeWithRankAtLeast( + shape_inference::InferenceContext* c, int32_t rank) { + ShapeHandle out; + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), rank, &out)); + c->set_output(0, out); + return absl::OkStatus(); +} + +// Transfers shape of input(0) to output(0), after asserting its rank <= . +inline absl::Status UnchangedShapeWithRankAtMost( + shape_inference::InferenceContext* c, int32_t rank) { + ShapeHandle out; + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), rank, &out)); + c->set_output(0, out); + return absl::OkStatus(); +} + +// Shape function for use with ops no outputs. +inline absl::Status NoOutputs(shape_inference::InferenceContext* c) { + return absl::OkStatus(); +} + +// Shape function for ops that output a single scalar value. +inline absl::Status ScalarShape(shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return absl::OkStatus(); +} + +// Shape function for binary ops where both inputs and the output match. +inline absl::Status MergeBothInputsShapeFn(InferenceContext* c) { + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(1), &out)); + c->set_output(0, out); + return absl::OkStatus(); +} + +// Shape function for dataset iterators. +absl::Status DatasetIteratorShape(shape_inference::InferenceContext* c); + +// Returns a new shape with the specified dims arranged in the specified +// format. The returned value is owned by this context. +// Note: if format = "FORMAT_NCHW_VECT_C" then C represents the outer_depth. 
+absl::Status MakeShapeFromFormat( + TensorFormat format, DimensionOrConstant N, + const std::vector& spatial, DimensionOrConstant C, + ShapeHandle* out, shape_inference::InferenceContext* context); + +// Shape function for MatMul-like operations. +absl::Status MatMulShape(shape_inference::InferenceContext* c); + +// Shape function for Batched MatMul-like operations with broadcasting across +// batch dimensions. +absl::Status BatchMatMulV2Shape(shape_inference::InferenceContext* c); + +// Shape function for BatchMatMul-like operations +absl::Status BatchMatMulShape(shape_inference::InferenceContext* c); + +// Shape function for Einsum. +absl::Status EinsumShape(shape_inference::InferenceContext* c); + +// Shape function for BiasAdd-like operations. +absl::Status BiasAddShape(shape_inference::InferenceContext* c); + +// Shape function for BiasAddGrad-like operations. +absl::Status BiasAddGradShape(shape_inference::InferenceContext* c); + +// Shape function for general Convolution operation +absl::Status ConvShape(shape_inference::InferenceContext* c); + +// Shape function for Conv2D-like operations that support explicit padding. +absl::Status Conv2DShapeWithExplicitPadding( + shape_inference::InferenceContext* c); + +// Shape function for Conv2D-like operations that do not support explicit +// padding. +absl::Status Conv2DShape(shape_inference::InferenceContext* c); + +// Shape function for Conv3D-like operations. +absl::Status Conv3DShape(shape_inference::InferenceContext* c); + +// Shape function for DepthwiseConv2D-like operations that support explicit +// padding. +absl::Status DepthwiseConv2DNativeShapeWithExplicitPadding( + shape_inference::InferenceContext* c); + +// Shape function for DepthwiseConv2D-like operations that do not support +// explicit padding. +absl::Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c); + +// Shape function for Conv2DBackpropInput. +absl::Status Conv2DBackpropInputShape(shape_inference::InferenceContext* c); + +// Shape function for Conv2DBackpropFilterWithBias. +absl::Status Conv2DBackpropFilterWithBiasShape( + shape_inference::InferenceContext* c); + +// Shape function for AvgPool-like operations. +absl::Status AvgPoolShape(shape_inference::InferenceContext* c); + +// Shape function for AvgPoolGrad-like operations. +absl::Status AvgPoolGradShape(shape_inference::InferenceContext* c); + +// Shape function for FusedBatchNorm and FusedBatchNormV2 operations. +absl::Status FusedBatchNormShape(shape_inference::InferenceContext* c); + +// Shape function for FusedBatchNormV3 operations. +absl::Status FusedBatchNormV3Shape(shape_inference::InferenceContext* c); + +// Shape function for _FusedBatchNormEx operations. +absl::Status FusedBatchNormExShape(shape_inference::InferenceContext* c); + +// Shape function for FusedBatchNormGrad and FusedBatchNormGradV2 operations. +absl::Status FusedBatchNormGradShape(shape_inference::InferenceContext* c); + +// Shape function for _FusedBatchNormGradEx operations. +absl::Status FusedBatchNormGradExShape(shape_inference::InferenceContext* c); + +// Shape function for MatrixDiagPartV2 and MatrixDiagPartV3 operations. +absl::Status MatrixDiagPartV2Shape(shape_inference::InferenceContext* c); + +// Shape function for MatrixDiagV2 and MatrixDiagV3 operations. +absl::Status MatrixDiagV2Shape(shape_inference::InferenceContext* c); + +// Shape function for MatrixSetDiagV2 and MatrixSetDiagV3 operations. 
+absl::Status MatrixSetDiagV2Shape(shape_inference::InferenceContext* c); + +// Shape function for MaxPool-like operations that support explicit padding. +absl::Status MaxPoolShapeWithExplicitPadding( + shape_inference::InferenceContext* c); + +// Shape function for MaxPool-like operations that do not support explicit +// padding. +absl::Status MaxPoolShape(shape_inference::InferenceContext* c); + +// Shape function for MaxPoolV2-like operations. +absl::Status MaxPoolV2Shape(shape_inference::InferenceContext* c, + int num_inputs); + +// Shape function for MaxPoolGrad-like operations. +absl::Status MaxPoolGradShape(shape_inference::InferenceContext* c); + +// Shape function for 3D Pooling operations. +absl::Status Pool3DShape(shape_inference::InferenceContext* c); + +// Shape function for MaxPool3DGrad-like operations. +absl::Status MaxPool3DGradShape(shape_inference::InferenceContext* c); + +// Shape function for AvgPool3DGrad-like operations. +absl::Status AvgPool3DGradShape(shape_inference::InferenceContext* c); + +// Shape function for use with ops whose output shapes are unknown. +absl::Status UnknownShape(shape_inference::InferenceContext* c); + +// Shape function for reduction operations. +absl::Status ReductionShape(shape_inference::InferenceContext* c); + +// Shape function for unsorted segment operations. +absl::Status SegmentReductionWithNumSegmentsShapeFn(InferenceContext* c); + +// Shape function for concat operations. +// is the number of inputs to concatenate and are taken +// from inputs +// [1,num_inputs_to_concat] of the op. Input 0 is the concat_dim input. +absl::Status ConcatShape(shape_inference::InferenceContext* c, + int num_inputs_to_concat); + +// Shape function for concat operations. +absl::Status ConcatV2Shape(shape_inference::InferenceContext* c); + +absl::Status QuantizedConcatV2Shape(InferenceContext* c, + int num_inputs_to_concat); + +// Shape function for binary operators that broadcast their inputs +// and with output to output_index. +// Note: out cannot be NULL. +absl::Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c, + ShapeHandle shape_x, + ShapeHandle shape_y, + bool incompatible_shape_error, + ShapeHandle* out); + +// Shape function for binary operators that broadcast their inputs +// and with output to output_index. +inline absl::Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, + int output_index) { + ShapeHandle out; + TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper( + c, c->input(0), c->input(1), true, &out)); + c->set_output(output_index, out); + return absl::OkStatus(); +} + +// Shape function for binary operators that broadcast their inputs. +// Tested by ops/math_ops_test.cc. +inline absl::Status BroadcastBinaryOpShapeFn(InferenceContext* c) { + return BroadcastBinaryOpOutputShapeFn(c, 0); +} + +// Shape function for random operations. +absl::Status RandomShape(shape_inference::InferenceContext* c); + +// Shape function for Slice operations. +absl::Status SliceShape(shape_inference::InferenceContext* c); + +// Validates the 3 component tensors of a sparse tensor have the proper +// shapes. This mimics SparseTensor.__init__ in python/framework/ops.py. +absl::Status ValidateSparseTensor(InferenceContext* c, + ShapeHandle indices_shape, + ShapeHandle values_shape, + ShapeHandle shape_shape); + +absl::Status ValidateVariableResourceHandle( + InferenceContext* c, std::vector* shape_and_type); + +// Shape function for GatherNd operations. 
+absl::Status GatherNdShape(InferenceContext* c); + +// Helper shape function for ScatterNd.../TensorScatter... operations. +absl::Status ScatterNdShapeHelper(InferenceContext* c, + ShapeHandle indices_shape, + ShapeHandle updates_shape, + ShapeHandle input_shape); + +// Shape function for ops with an explicit "shape" attribute. +absl::Status ExplicitShape(InferenceContext* c); + +// Shape function for multiple-output ops with an explicit "shapes" attribute. +absl::Status ExplicitShapes(InferenceContext* c); + +// Shape function for SparseReduceMax and SparseReduceSum. +absl::Status SparseReduceShapeFn(InferenceContext* c); + +// Shape function for QuantizedConv2D op. +absl::Status QuantizedConv2DShape(InferenceContext* c); + +// Shape function for _QuantizedConv2D op/fusion. +absl::Status FusedQuantizedConv2DShape(InferenceContext* c); + +// Shape function for _QuantizedDepthwiseConv2D op/fusion. +absl::Status FusedQuantizedDepthwiseConv2D(InferenceContext* c); + +// Shape function for QuantizedAvgPool op +absl::Status QuantizedAvgPoolShape(InferenceContext* c); + +// Shape function for QuantizeV2 op +absl::Status QuantizeV2Shape(InferenceContext* c); + +// Shape function for ReduceScatter ops +absl::Status ReduceScatterShape(shape_inference::InferenceContext* c); + +} // namespace shape_inference + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/control_flow.h b/third_party/tflite-hdrs/tensorflow/core/framework/control_flow.h new file mode 100644 index 00000000..3cc270b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/control_flow.h @@ -0,0 +1,58 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_ +#define TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_ + +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +const uint64 kIllegalFrameId = ~0uLL; +const int64_t kIllegalIterId = -1; + +// For the purpose of control flow, every tensor produced by TensorFlow is +// conceptually tagged by a 'FrameAndIter'. FrameAndIter consists of a +// 'frame_id' and an 'iter_id'. The tensor value it represents is produced +// in the frame with frame_id at the iteration of iter_id. 
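+//
+// For illustration: a tensor produced in frame 3 at iteration 7 would be
+// tagged FrameAndIter(3, 7), while a default-constructed FrameAndIter
+// carries the sentinel values kIllegalFrameId and kIllegalIterId.
+// FrameAndIterHash below allows FrameAndIter to be used as a key in
+// hash-based containers.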
+struct FrameAndIter { + uint64 frame_id = kIllegalFrameId; + int64_t iter_id = kIllegalIterId; + + FrameAndIter() {} + + FrameAndIter(uint64 frame, int64_t iter) { + frame_id = frame; + iter_id = iter; + } + + bool operator==(const FrameAndIter& other) const { + return (frame_id == other.frame_id && iter_id == other.iter_id); + } +}; + +struct FrameAndIterHash { + size_t operator()(const FrameAndIter& key) const { + // Make sure there are no padding bytes that we don't want + CHECK_EQ(sizeof(uint64) + sizeof(int64_t), sizeof(FrameAndIter)); + return Hash64(reinterpret_cast(&key), sizeof(FrameAndIter)); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/dataset.h b/third_party/tflite-hdrs/tensorflow/core/framework/dataset.h new file mode 100644 index 00000000..70ebc12a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/dataset.h @@ -0,0 +1,1846 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "xla/tsl/framework/allocator.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/dataset_metadata.pb.h" +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/dataset_stateful_op_allowlist.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/thread_factory.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/core/threadpool_interface.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include 
"tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/thread_annotations.h" + +// Polymorphic datasets should support all primitive TensorFlow +// types. Use this macro to expand `m(T)` once for each primitive type +// `T`, e.g. to build a `switch` statement. +#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) + +namespace tensorflow { + +// Forward declarations to avoid introducing a dependency on headers in +// "tensorflow/core/graph/...". +class GraphDefBuilder; +class Node; + +namespace data { + +namespace internal { +// Merges Options from source to destination. If there is a conflict on a field, +// the field value from the source takes precedence. +void MergeOptions(const protobuf::Message& source, + protobuf::Message* destination); +void MergeOptions(const protobuf::MessageLite& source, + protobuf::MessageLite* destination); +} // namespace internal + +using TraceMeMetadata = std::vector>; + +// Maps the index of dataset elements to a globally shuffled index. See the +// comment for IteratorContext::Params::index_mapper for more details. +// Notes: +// * `absl::OutOfRangeError` indicates the input index argument exceeds +// the cardinality of the dataset. +// * `absl::NotFoundError` indicates we should skip this element. +// This happens in the case we mix multiple datasets into one. For example, +// `dataset1.concatenate(dataset2)`. +// See go/tf-data-random-access-iterator and +// go/tf-data-random-access-iterator-for-concatenate for more info. +using IndexMapperFn = std::function(size_t)>; + +constexpr char kTFDataFunction[] = "_tf_data_function"; + +constexpr int kInfiniteCardinality = -1; +constexpr int kUnknownCardinality = -2; + +// This constant is a magic number that is used (as a prefix) to identify keys +// used for serialization of iterator state. +constexpr char kFullNameRandomHex[] = "60d899aa0d8ce4351e7c3b419e92d25b"; +constexpr int kFullNameRandomHexLen = std::size(kFullNameRandomHex) - 1; +constexpr char kPipe[] = "|"; +constexpr char kColon[] = ":"; + +constexpr char kTFDataResourceTag[] = "tfdata"; +constexpr char kTraceInfoUnavailable[] = "unavailable"; +constexpr char kMetadata[] = "metadata"; + +constexpr char kCardinalityAttrForRewrite[] = "_cardinality"; + +class DatasetBase; +class IteratorContext; +class SerializationContext; + +inline bool IsTFDataFunction(const FunctionDef& func) { + auto iter = func.attr().find(data::kTFDataFunction); + return (iter != func.attr().end() && iter->second.b()); +} + +// Interface for reading values from a key-value store. +// Used for restoring iterator state. This class is thread safe. +// Please see comment on IteratorStateWriter for guidance around using the +// Read*(key, val) vs Read*(name, key, val). +class IteratorStateReader { + public: + // Determines whether the iterator state contains the given key. + virtual bool Contains(absl::string_view key) const = 0; + virtual bool Contains(absl::string_view name, + absl::string_view key) const = 0; + + // Reads an integer for the given key. + virtual absl::Status ReadScalar(absl::string_view key, + int64_t* val) const = 0; + virtual absl::Status ReadScalar(absl::string_view name, absl::string_view key, + int64_t* val) const = 0; + + // Reads a string for the given key. 
+ virtual absl::Status ReadScalar(absl::string_view key, + tstring* val) const = 0; + virtual absl::Status ReadScalar(absl::string_view name, absl::string_view key, + tstring* val) const = 0; + + // Reads a tensor for the given key. + // TODO(jsimsa): Remove non-FLR overrides once all callers are updated. + virtual absl::Status ReadTensor(absl::string_view key, Tensor* val) const = 0; + virtual absl::Status ReadTensor(FunctionLibraryRuntime* flr, + absl::string_view key, Tensor* val) const = 0; + virtual absl::Status ReadTensor(absl::string_view name, absl::string_view key, + Tensor* val) const = 0; + virtual absl::Status ReadTensor(FunctionLibraryRuntime* flr, + absl::string_view name, absl::string_view key, + Tensor* val) const = 0; + + virtual ~IteratorStateReader() {} +}; + +// Interface for writing values to a key-value store. +// Used for saving iterator state. Not thread safe. +// The IteratorStateWriter creates a tensor for each unique iterator name it +// sees. For the Write*(key, val) API's the key is expected to encode this +// name as keys are required to be produced using the full_name() method. +// Each tensor has an upper limit of 2 GB and so if the state for an iterator +// might exceed the 2 GB limit, you can pass an explicit name in via the +// Write*(name, key, val) APIs allowing you to further split up the state +// into more manageable chunks. +class IteratorStateWriter { + public: + // Writes an integer for the given key. + virtual absl::Status WriteScalar(absl::string_view key, + const int64_t val) = 0; + virtual absl::Status WriteScalar(absl::string_view name, + absl::string_view key, + const int64_t val) = 0; + + // Writes a string for the given key. + virtual absl::Status WriteScalar(absl::string_view key, + const tstring& val) = 0; + virtual absl::Status WriteScalar(absl::string_view name, + absl::string_view key, + const tstring& val) = 0; + + // Writes a tensor for the given key. + virtual absl::Status WriteTensor(absl::string_view key, + const Tensor& val) = 0; + virtual absl::Status WriteTensor(absl::string_view name, + absl::string_view key, + const Tensor& val) = 0; + + virtual ~IteratorStateWriter() {} + + protected: + // Accessible only through derived concrete class's copy/move constructors + IteratorStateWriter() = default; + IteratorStateWriter(const IteratorStateWriter&) = default; + IteratorStateWriter(IteratorStateWriter&&) = default; +}; + +// Generates a full name key for iterator checkpointing. All keys generated for +// iterator checkpoints should go through this function. +std::string FullName(const std::string& prefix, const std::string& name); + +// Extracts iterator prefix from key generated by `FullName`. +absl::Status ExtractIteratorPrefix(absl::string_view key, string* prefix); + +// Interface for objects that can be checkpointed. +class Checkpointable { + public: + Checkpointable() = default; + virtual ~Checkpointable() = default; + + virtual absl::Status Save(SerializationContext* ctx, + IteratorStateWriter* writer) = 0; + virtual absl::Status Restore(IteratorContext* ctx, + IteratorStateReader* reader) = 0; +}; + +// Wrapper around GraphDefBuilder. Used to serialize Dataset graph. +class GraphDefBuilderWrapper { + public: + explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {} + + // Adds a Const node with scalar value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. 
+ // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + template + absl::Status AddScalar(const T& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({})); + val_t.scalar()() = val; + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddScalar: Failed to build Const op."); + } + return absl::OkStatus(); + } + + // Adds a Const node with vector value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice? + template + absl::Status AddVector(const std::vector& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), + TensorShape({static_cast(val.size())})); + for (size_t i = 0; i < val.size(); i++) { + val_t.flat()(i) = val[i]; + } + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddVector: Failed to build Const op."); + } + return absl::OkStatus(); + } + + absl::Status AddVector(const std::vector& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), + TensorShape({static_cast(val.size())})); + for (size_t i = 0; i < val.size(); i++) { + val_t.flat()(i) = val[i]; + } + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddVector: Failed to build Const op."); + } + return absl::OkStatus(); + } + + // Adds a `Const` node for the given tensor value to the graph. + // + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. The returned `Node` + // pointer is owned by the backing graph of `GraphDefBuilder`. + absl::Status AddTensor(const Tensor& val, Node** output) { + AddTensorInternal(val, output); + if (*output == nullptr) { + return errors::Internal("AddTensor: Failed to build Const op."); + } + return absl::OkStatus(); + } + + // Adds a `Placeholder` node for the given tensor value to the graph. + // + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. The returned `Node` + // pointer is owned by the backing graph of `GraphDefBuilder`. + absl::Status AddPlaceholder(const Tensor& val, Node** output) { + AddPlaceholderInternal(val, output); + if (*output == nullptr) { + return errors::Internal( + "AddPlaceholder: Failed to build Placeholder op."); + } + return absl::OkStatus(); + } + + // Adds a node for the given dataset to the `Graph`. The value of + // `DatasetBase::type_string()` is used as the op type for the node. Values + // for the `output_types` and `output_shapes` node attributes are also written + // if those attributes are defined in the `OpDef`. + // + // If `use_dataset_name` is set, the value of `DatasetBase::node_name()` is + // used as the op name for the node. This argument should only be set when + // serializing `DatasetBase` instances which might not have been created + // through op kernel execution to make sure the dataset op name is preserved + // across serialization boundaries, which is in turn needed to make sure + // iterator checkpoints are valid across serialization boundaries. When + // `use_dataset_name` is set, the caller is responsible for making sure that + // the op name is unique across the graph. + // + // `*output` contains a pointer to the output `Node`. 
It is guaranteed to be + // non-null if the method returns with an OK status. The returned `Node` + // pointer is owned by the backing `Graph` of `GraphDefBuilder`. + absl::Status AddDataset(const DatasetBase* dataset, + const std::vector& inputs, Node** output); + absl::Status AddDataset( + const DatasetBase* dataset, const std::vector& inputs, + const std::vector>& attrs, + Node** output); + absl::Status AddDataset( + const DatasetBase* dataset, + const std::vector>& inputs, + const std::vector>>& + list_inputs, + const std::vector>& attrs, + Node** output); + absl::Status AddDataset( + const DatasetBase* dataset, + const std::vector>& inputs, + const std::vector>>& + list_inputs, + const std::vector>& attrs, + bool use_dataset_name, Node** output); + + // Adds a user-defined function with name `function_name` to the graph and + // recursively adds all functions it references. If a function with a matching + // name has already been added, returns with OK status. If a user-defined with + // name `function_name` is not found in the context's function library, + // returns an InvalidArgumentError. If the function with name `function_name` + // or any of its dependent functions are stateful, and the context does not + // explicitly permit stateful functions, returns an InvalidArgument error. + absl::Status AddFunction(SerializationContext* ctx, + const string& function_name, + const FunctionLibraryDefinition& lib_def); + + template + void BuildAttrValue(const T& value, AttrValue* attr) { + SetAttrValue(value, attr); + } + + template + AttrValue BuildAttrValue(const T& value) { + AttrValue attr; + SetAttrValue(value, &attr); + return attr; + } + + protected: + GraphDefBuilder* builder() { return b_; } + + private: + void AddPlaceholderInternal(const Tensor& val, Node** output); + void AddTensorInternal(const Tensor& val, Node** output); + bool HasAttr(const string& op_type_name, const string& attr_name) const; + + bool HasAttr(const OpDef* op_def, const string& attr_name) const { + for (const auto& attr : op_def->attr()) { + if (attr.name() == attr_name) { + return true; + } + } + return false; + } + + absl::Status AddAttrFunctions(SerializationContext* ctx, + const AttrValue& attr_value, + const FunctionLibraryDefinition& lib_def) { + if (attr_value.has_func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name(), lib_def)); + } else if (attr_value.has_list()) { + for (const NameAttrList& name_attr_list : attr_value.list().func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name(), lib_def)); + } + } + return absl::OkStatus(); + } + + GraphDefBuilder* b_; +}; + +class StatsAggregator; + +// A utility class for running a function and ensuring that there is always a +// `tensorflow::data` symbol on the stack. +class Runner { + public: + virtual ~Runner() {} + + // Runs the given function. + virtual void Run(const std::function& f) = 0; + + // Returns a global singleton Runner. + static Runner* get(); +}; + +// A class which provides a sequence of splits. Splits represent subdivisions of +// a dataset, e.g. filenames or ranges within files. We use splitting to +// partition input data into smaller pieces for distributed processing (see +// go/tf-data-splitting-design). The SplitProvider subclasses are expected to be +// thread-safe. +// +// Datasets provide a `MakeSplitProvider` method to expose a listing of their +// splits. +// +// Iterators created with a split provider will only iterate over the splits +// provided by the split provider. 
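+//
+// A minimal consumption sketch, assuming `provider` was obtained from a
+// dataset's MakeSplitProvider method (the variable names are illustrative
+// only):
+//
+//   Tensor split;
+//   bool end_of_splits = false;
+//   while (provider->GetNext(&split, &end_of_splits).ok() && !end_of_splits) {
+//     // Process `split`, e.g. a filename or a range within a file.
+//   }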
+class SplitProvider { + public: + virtual ~SplitProvider() {} + // Stores the next split in `*split`, setting `*end_of_splits` to indicate + // whether there were any splits left. + virtual absl::Status GetNext(Tensor* split, bool* end_of_splits) = 0; + // Resets the split provider to its beginning. + virtual absl::Status Reset() = 0; + // Saves the state of this split provider. + virtual absl::Status Save(std::function full_name, + IteratorStateWriter* writer) = 0; + // Restores the state of this split provider. + virtual absl::Status Restore( + std::function full_name, + IteratorStateReader* reader) = 0; + // Returns the number of splits: + // - If there are a finite number of splits, returns a non-negative count. + // - If there are an infinite number of splits, returns kInfiniteCardinality. + // - If the number of splits is unknown or can't be efficiently computed, + // returns kUnknownCardinality. + virtual int64_t Cardinality() const { return kUnknownCardinality; } + // Cancels the split provider. After cancelling, all other existing and future + // calls should return quickly without blocking. + virtual void Cancel() {} + // Used to determine if the split provider is dynamic. Dynamic split providers + // are expected to be non-deterministic and may return different splits upon + // reinitialization. + virtual bool IsDynamic() const { return false; } +}; + +// Returns the runner threadpool size from an OpKernelContext. +int32_t GetRunnerThreadpoolSizeFromOpKernelContext(OpKernelContext* ctx); + +// In-memory representation of a checkpoint. The checkpoint is represented as a +// collection of key-value pairs and are expected to be written using the +// `IteratorStateWriter` interface. +// +// The implementation is not thread-safe. +class MemoryCheckpoint final : public IteratorStateWriter { + public: + // IdRegistry maintains a bi-directional mapping between string and integer + // representations of checkpoint keys. + // + // The reason we need both is that integer ids are used for fast lookups and + // comparisons, while string ids are used for prefix matching. + class IdRegistry { + public: + IdRegistry() = default; + + // Adds the given string id to the registry, generating a unique integer id + // for it. If the string id already exists, its integer id is returned. + int64_t Add(const std::string& prefix, const std::string& key); + + // Gets all integer ids for string ids matching the given prefix. + std::vector GetMatchingIds(const std::string& prefix_to_match); + + // Gets the string id for the given integer id. + std::pair Get(int64_t id); + + // Removes the entries matching the given integer ids from the registry. 
+ void RemoveIds(const std::vector& ids); + + private: + mutex mu_; + int64_t next_id_ TF_GUARDED_BY(mu_) = 0; + absl::flat_hash_map> + int_to_string_ TF_GUARDED_BY(mu_); + absl::flat_hash_map, int64_t> + string_to_int_ TF_GUARDED_BY(mu_); + }; + + MemoryCheckpoint() = delete; + explicit MemoryCheckpoint(std::shared_ptr registry) + : id_registry_(registry) {} + + MemoryCheckpoint(MemoryCheckpoint&& other) = default; + MemoryCheckpoint(const MemoryCheckpoint& other) = default; + + static MemoryCheckpoint CreateRootCheckpoint( + std::shared_ptr registry) { + return MemoryCheckpoint(/*id_registry*/ registry, /*is_root=*/true); + } + + // BEGIN implementation of `IteratorStateWriter` interface + absl::Status WriteScalar(absl::string_view key, int64_t val) override { + string prefix; + TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); + return WriteScalar(prefix, key, val); + } + absl::Status WriteScalar(absl::string_view name, absl::string_view key, + int64_t val) override { + auto id = id_registry_->Add(string(name), string(key)); + int_values_[id] = val; + return absl::OkStatus(); + } + absl::Status WriteScalar(absl::string_view key, const tstring& val) override { + string prefix; + TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); + return WriteScalar(prefix, key, val); + } + absl::Status WriteScalar(absl::string_view name, absl::string_view key, + const tstring& val) override { + auto id = id_registry_->Add(string(name), string(key)); + str_values_[id] = val; + return absl::OkStatus(); + } + absl::Status WriteTensor(absl::string_view key, const Tensor& val) override { + string prefix; + TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); + return WriteTensor(prefix, key, val); + } + absl::Status WriteTensor(absl::string_view name, absl::string_view key, + const Tensor& val) override { + auto id = id_registry_->Add(string(name), string(key)); + tensor_values_[id] = val; + return absl::OkStatus(); + } + // END implementation of `IteratorStateWriter` interface + + // String representation for the in-memory checkpoint suitable for debugging. + std::string DebugString() const; + + // Returns the status of the in-memory checkpoint. + absl::Status GetStatus() const { return status_; } + + // Merges state of another checkpoint into this checkpoint, overwriting + // existing state (if applicable). + // + // Merge also garbage collects state that is no longer needed. + void Merge(MemoryCheckpoint* other); + + // Purge removes all keys with given prefix from checkpoint. It also adds the + // prefix for tracking unless it is the root checkpoint. + void Purge(const std::string& prefix); + + // Stores the in-memory checkpoint to the given writer. + absl::Status Save(IteratorStateWriter* writer) const; + + // Updates the status of the in-memory checkpoint with the given status. + void UpdateStatus(absl::Status status) { status_.Update(status); } + + private: + explicit MemoryCheckpoint(std::shared_ptr registry, bool is_root) + : is_root_(is_root), id_registry_(registry) {} + void operator=(const MemoryCheckpoint&) = delete; + + absl::Status status_ = absl::OkStatus(); + // Only set to true for the checkpoint in IteratorResource. + // Root checkpoint does not track expired prefixes. + const bool is_root_ = false; + absl::flat_hash_map int_values_; + absl::flat_hash_map str_values_; + absl::flat_hash_map tensor_values_; + + // Keeps track of expired prefixes for propagation. Cleaned after it's merged. 
+ absl::flat_hash_set expired_prefixes_; + + std::shared_ptr id_registry_; +}; + +// Aggregates runtime support needed for dataset and iterator serialization. +class SerializationContext { + public: + // Handles the external state according to the external state policy. + absl::Status HandleCheckExternalStateStatus(absl::Status s) { + if (s.ok()) { + return s; + } + switch (params_.external_state_policy) { + case ExternalStatePolicy::POLICY_WARN: + LOG(WARNING) << s.ToString(); + return absl::OkStatus(); + case ExternalStatePolicy::POLICY_IGNORE: + VLOG(2) << "Ignoring error status: " << s.ToString(); + return absl::OkStatus(); + case ExternalStatePolicy::POLICY_FAIL: + return s; + default: + return errors::InvalidArgument("Unexpected value of external policy: ", + params_.external_state_policy); + } + } + + struct Params { + explicit Params() = default; + + explicit Params(OpKernelContext* ctx) + : resource_mgr(ctx->resource_manager()), + device_name(ctx->device()->attributes().name()) {} + + std::vector>* input_list = nullptr; // Not owned. + + // Indicates what to do if the dataset depends on external state. + ExternalStatePolicy external_state_policy = + ExternalStatePolicy::POLICY_WARN; + + // Indicates whether the serialization is for rewrites. + // + // If true: + // * A dataset that doesn't implement serialization is replaced with a + // placeholder returned in `input_list`. + // * Data tensors are replaced with a placeholder returned in + // `input_list`. + // * Datasets that use random seeds should not serialize the random seeds. + // This doesn't affect datasets that use fixed seeds; fixed seeds will + // always be preserved. + // * Cardinality is serialized as an unregistered attribute + // `_cardinality`. + // If false: + // * A dataset that doesn't implement serialization should result in an + // error. + // * Data tensors (potentially large) should be serialized. + // * Datasets that use random seeds should serialize the random seeds. + bool is_graph_rewrite = false; + + // A resource manager for looking up resources during serialization. + ResourceMgr* resource_mgr; + + // The name of the device doing the serialization. + std::string device_name; + + // Determines whether checkpointing should represent input pipeline state + // symbolically, using cursors into source iterators, or explicitly, by + // storing internal state of each iterator. + bool symbolic_checkpoint = false; + }; + + explicit SerializationContext(Params params) : params_(params) {} + + std::vector>* input_list() { + return params_.input_list; + } + + ExternalStatePolicy external_state_policy() const { + return params_.external_state_policy; + } + + bool is_graph_rewrite() const { return params_.is_graph_rewrite; } + + const ResourceMgr* resource_mgr() const { return params_.resource_mgr; } + + const std::string& device_name() const { return params_.device_name; } + + bool symbolic_checkpoint() const { return params_.symbolic_checkpoint; } + + private: + Params params_; + + SerializationContext(const SerializationContext&) = delete; + void operator=(const SerializationContext&) = delete; +}; + +// Specifies the tf.data pipeline run mode. +enum RunMode { DEFAULT, STANDALONE }; + +// A cut-down version of `OpKernelContext` for running computations in +// iterators. Note that we cannot simply use `OpKernelContext` here because we +// might run computation in an iterator whose lifetime is not nested within the +// lifetime of a single `OpKernelContext` (e.g. asynchronous prefetching). 
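+//
+// As an illustrative sketch grounded in the constructors below, an
+// IteratorContext is typically created either directly from an
+// OpKernelContext or by copying the Params of a parent context:
+//
+//   IteratorContext iter_ctx(ctx);  // ctx is an OpKernelContext*.
+//   IteratorContext child(IteratorContext::Params{&iter_ctx});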
+// +// TODO(mrry): We're making some daring assumptions about the lifetime of the +// runner passed in here. A runner will be deleted when the original step ends, +// but all existing runners only close over session-lifetime (or longer-lived) +// state, so we can make a copy of the function. There's nothing in the +// definition of the API from which we took the runner to guarantee that what we +// are doing is safe. We should formalize the properties here. +class IteratorContext { + public: + struct Params { + explicit Params(IteratorContext* ctx) + : accelerator_device_info(ctx->accelerator_device_info()), + allocator_getter(ctx->allocator_getter()), + cancellation_manager(ctx->cancellation_manager()), + collective_executor(ctx->collective_executor()), + env(ctx->env()), + flr(ctx->flr()), + function_handle_cache(ctx->function_handle_cache()), + interleave_depth(ctx->interleave_depth()), + is_restoring(ctx->is_restoring()), + model(ctx->model()), + options(ctx->options()), + ram_budget_manager(ctx->ram_budget_manager()), + resource_mgr(ctx->resource_mgr()), + runner(*(ctx->runner())), + runner_threadpool_size(ctx->runner_threadpool_size()), + split_providers(ctx->split_providers()), + stats_aggregator(ctx->stats_aggregator()), + symbolic_checkpoint(ctx->symbolic_checkpoint()), + thread_factory(ctx->thread_factory()), + thread_pool(ctx->thread_pool()), + id_registry(ctx->id_registry()), + warm_start(ctx->warm_start()), + index_mapper(ctx->index_mapper()) {} + + explicit Params(OpKernelContext* ctx) + : collective_executor(ctx->collective_executor()), + env(ctx->env()), + flr(ctx->function_library()) { + // NOTE: need reinterpret_cast because function.h forward-declares Device. + DeviceBase* device = + reinterpret_cast(ctx->function_library()->device()); + accelerator_device_info = device->tensorflow_accelerator_device_info(); + allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + + runner_threadpool_size = GetRunnerThreadpoolSizeFromOpKernelContext(ctx); + + // NOTE: Wrap every runner invocation in a call to Runner()->Run(), so + // that a symbol in the tensorflow::data namespace is always on the stack + // when executing a function inside a Dataset. + runner = std::bind( + []( + // Note: `runner` is a const reference to avoid copying it. + const std::function)>& ctx_runner, + std::function fn) { + std::function wrapped_fn = std::bind( + [](const std::function& fn) { Runner::get()->Run(fn); }, + std::move(fn)); + ctx_runner(std::move(wrapped_fn)); + }, + *ctx->runner(), std::placeholders::_1); + } + + // If non-null, information about the GPU or TPU on which the op is placed. + const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info = nullptr; + + // The Allocator to be used to allocate the output of an iterator. + std::function allocator_getter = nullptr; + + // The CancellationManager to be used to cancel execution of ops. + CancellationManager* cancellation_manager = nullptr; + + // Collective support. + CollectiveExecutor* collective_executor = nullptr; + + // Interface to operating system functionality. + Env* env = nullptr; + + // The FunctionLibraryRuntime object to be used to make function calls. + FunctionLibraryRuntime* flr = nullptr; + + // A FunctionHandleCache that owns all the function handles. Not owned. 
+ FunctionHandleCache* function_handle_cache = nullptr; + + // Records the number of ParallelInterleave operations in the path from the + // root node to this node (not including this node) in the input pipeline + // tree. + int64 interleave_depth = 0; + + // Marks whether the iterator is restored from a checkpoint. + bool is_restoring = false; + + // If non-null, identifies the object used for performance modeling. + std::shared_ptr model = nullptr; + + // The input pipeline options. + const Options* options = nullptr; + + // Manager for the ram budget when using autotune. + std::shared_ptr ram_budget_manager = nullptr; + + // A resource manager for storing dataset-related state, e.g. random + // seeds or cached tensors. Not owned. + ResourceMgr* resource_mgr = nullptr; + + // Function call support. + std::function)> runner = nullptr; + + // Number of threads used for executing user-defined functions. + int32 runner_threadpool_size = 0; + + // Split providers indicating which splits to process. May be empty, + // indicating that the iterator should process all splits. + std::vector> split_providers; + + // The `StatsAggregator` object to record statistics about the iterator. + // + // TODO(b/147325552): Remove this API and any of its uses after we switch to + // using C++ based implementation for tf.data options (on 4/12/2021). + std::shared_ptr stats_aggregator = nullptr; + + // Indicates whether to use symbolic checkpointing. + bool symbolic_checkpoint = false; + + // A factory for creating threads to perform blocking work. + std::shared_ptr thread_factory = nullptr; + + // A shared thread pool to schedule computation into. + thread::ThreadPoolInterface* thread_pool = nullptr; + + std::shared_ptr id_registry = + std::make_shared(); + + // If `true` background threads of asynchronous operations are started when + // the iterator is created. Otherwise, they are started upon first `GetNext` + // request. Default value is set to false to ensure backward compatibility. + bool warm_start = false; + + // Specifies the tf.data pipeline run mode. + RunMode run_mode = RunMode::DEFAULT; + + // Maps the index of dataset elements to a shuffled index. In other words, + // given an index i, returns the permuted index p(i) for the iterator. Used + // to support global shuffling of datasets that support random access. + IndexMapperFn index_mapper = nullptr; + + // Records the number of elements that have been produced prior to a + // checkpoint. This is set by globally shuffled iterators so that upstream + // iterators can restore the element counts in the random access mode. + std::optional restored_element_count = std::nullopt; + }; + + explicit IteratorContext(IteratorContext* ctx) + : IteratorContext(Params{ctx}) {} + + explicit IteratorContext(OpKernelContext* ctx) + : IteratorContext(Params{ctx}) {} + + explicit IteratorContext(Params params) + : params_(std::move(params)), + checkpoint_(MemoryCheckpoint{params_.id_registry}) {} + + IteratorContext(const IteratorContext& other) + : IteratorContext(Params{other.params_}) { + // MemoryCheckpoint should not be copied over as the child context should + // not care what's in the checkpoint of parent context. 
+  }
+
+  std::shared_ptr<MemoryCheckpoint::IdRegistry> id_registry() {
+    return params_.id_registry;
+  }
+
+  const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info() {
+    return params_.accelerator_device_info;
+  }
+
+  Allocator* allocator(AllocatorAttributes attrs) {
+    return params_.allocator_getter(attrs);
+  }
+
+  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
+    return params_.allocator_getter;
+  }
+
+  CancellationManager* cancellation_manager() {
+    return params_.cancellation_manager;
+  }
+
+  CollectiveExecutor* collective_executor() {
+    return params_.collective_executor;
+  }
+
+  Env* env() const { return params_.env; }
+
+  FunctionLibraryRuntime* flr() { return params_.flr; }
+
+  FunctionHandleCache* function_handle_cache() {
+    return params_.function_handle_cache;
+  }
+
+  MemoryCheckpoint* checkpoint() { return &checkpoint_; }
+
+  int64 interleave_depth() { return params_.interleave_depth; }
+
+  bool is_restoring() { return params_.is_restoring; }
+
+  const std::shared_ptr<model::Model>& model() const { return params_.model; }
+
+  const Options* options() const { return params_.options; }
+
+  const std::shared_ptr<model::RamBudgetManager>& ram_budget_manager() {
+    return params_.ram_budget_manager;
+  }
+
+  ResourceMgr* resource_mgr() { return params_.resource_mgr; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
+  }
+
+  int32 runner_threadpool_size() { return params_.runner_threadpool_size; }
+
+  std::vector<std::shared_ptr<SplitProvider>> split_providers() const {
+    return params_.split_providers;
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    return params_.stats_aggregator;
+  }
+
+  bool symbolic_checkpoint() { return params_.symbolic_checkpoint; }
+
+  const std::shared_ptr<ThreadFactory>& thread_factory() {
+    return params_.thread_factory;
+  }
+
+  thread::ThreadPoolInterface* thread_pool() { return params_.thread_pool; }
+
+  bool warm_start() { return params_.warm_start; }
+
+  RunMode run_mode() { return params_.run_mode; }
+
+  IndexMapperFn index_mapper() const { return params_.index_mapper; }
+
+  void set_restored_element_count(size_t element_count) {
+    params_.restored_element_count.emplace(element_count);
+  }
+
+  std::optional<int64_t> restored_element_count() const {
+    return params_.restored_element_count;
+  }
+
+  void SetModel(std::shared_ptr<model::Model> model) { params_.model = model; }
+
+  void SetIndexMapper(const IndexMapperFn& index_mapper) {
+    params_.index_mapper = index_mapper;
+  };
+
+  std::unique_ptr<thread::ThreadPool> CreateThreadPool(const string& name,
+                                                       int num_threads) {
+    if (params_.thread_pool) {
+      // Create a `ThreadPool` instance by wrapping `params_.thread_pool` (which
+      // is an instance of `thread::ThreadPoolInterface`). Notably, the
+      // ownership of `params_.thread_pool` is *not* transferred onto the newly
+      // created `ThreadPool` instance.
+      return absl::make_unique<thread::ThreadPool>(params_.thread_pool);
+    } else {
+      return absl::make_unique<thread::ThreadPool>(params_.env, ThreadOptions(),
+                                                   name, num_threads,
+                                                   /*low_latency_hint=*/false);
+    }
+  }
+
+  // Merges the given checkpoint with the checkpoint of this context.
+  //
+  // The intended use of this API is that methods such as
+  // `IteratorBase::Initialize`, `IteratorBase::GetNextInternal`, or
+  // `IteratorBase::RestoreInternal` that store data in the in-memory
+  // checkpoint use a separate instance of `IteratorContext` for a nested call;
+  // the checkpoint collected by the `IteratorContext` instance passed into
+  // the callee should then be merged into the `IteratorContext` of the caller:
+  //
+  // ```
+  // Status GetNextInternal(IteratorContext* ctx, ...) {
+  //   ...
+ // IteratorContext nested_ctx(...); + // TF_RETURN_IF_ERROR(input_impl_->GetNext(&nested_ctx, ...)); + // ctx->MergeCheckpoint(nested_ctx->checkpoint()); + // ... + // } + // ``` + void MergeCheckpoint(MemoryCheckpoint* checkpoint) { + if (symbolic_checkpoint()) { + checkpoint_.Merge(checkpoint); + } + } + + // Removes any keys with the given prefix from the checkpoint. + // + // The intended use for this API is to clean the stale state in checkpoint, + // e.g. when a pipeline created by `flat_map` is exhausted, the state + // associated with the iterator of that pipeline is no longer needed and + // should be removed. + void PurgeCheckpoint(const std::string& prefix) { + if (symbolic_checkpoint()) { + checkpoint_.Purge(prefix); + } + } + + // Saves the state of the given iterator into the checkpoint. + void SaveCheckpoint(Checkpointable* iterator) { + if (symbolic_checkpoint()) { + SerializationContext::Params params; + params.symbolic_checkpoint = true; + SerializationContext ctx(std::move(params)); + checkpoint_.UpdateStatus(iterator->Save(&ctx, &checkpoint_)); + } + } + + std::unique_ptr StartThread(const string& name, + std::function fn) { + if (params_.thread_factory) { + return params_.thread_factory->StartThread(name, std::move(fn)); + } else { + return absl::WrapUnique( + Env::Default()->StartThread({}, name, std::move(fn))); + } + } + + // Updates the status of the checkpoint with the given status. + void UpdateCheckpointStatus(std::function status_fn) { + if (symbolic_checkpoint()) { + checkpoint_.UpdateStatus(status_fn()); + } + } + + private: + Params params_; + MemoryCheckpoint checkpoint_; +}; + +// Generic context that can be constructed with either an `OpKernelContext` or +// `IteratorContext`. +struct AnyContext { + Allocator* allocator; + std::function)>* runner; + int64_t runner_threadpool_size; + + explicit AnyContext(IteratorContext* ctx) { + allocator = ctx->allocator({}); + runner = ctx->runner(); + runner_threadpool_size = ctx->runner_threadpool_size(); + } + + explicit AnyContext(OpKernelContext* ctx) { + allocator = ctx->get_allocator({}); + runner = ctx->runner(); + runner_threadpool_size = GetRunnerThreadpoolSizeFromOpKernelContext(ctx); + } +}; + +// Represents the current position in a range of outputs, where the +// range of outputs is typically represented by an `DatasetBase`, +// defined below. +class IteratorBase : public Checkpointable { + public: + ~IteratorBase() override { + for (auto rit = cleanup_fns_.rbegin(); rit != cleanup_fns_.rend(); ++rit) { + (*rit)(); + } + } + + // Gets the next output from the range that this iterator is traversing. + // + // If at least one output remains in this iterator's range, that + // output will be stored in `*out_tensors` and `false` will be + // stored in `*end_of_sequence`. + // + // If no more outputs remain in this iterator's range, `true` will be stored + // in `*end_of_sequence`, and `*out_tensors` will be empty. + // + // Implementations should never return `OutOfRange` error. If at end of + // sequence, set `*end_of_sequence = true` and return `OkStatus()`. + // Internally raised `OutOfRange` errors that do not imply end of sequence + // should be converted to a different error type before being propagated to + // the caller. + // + // Implementations must explicitly set `*end_of_sequence = false` if an + // `OkStatus()` status is returned and the iterator is not at the end of the + // sequence. + // + // `out_tensors` and `end_of_sequence` are output parameters. 
`*out_tensors` + // and `*end_of_sequence` should not be read by implementations of `GetNext` + // before they are assigned. + // + // This method is thread-safe. + // + // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and + // potentially remove this method. + virtual absl::Status GetNext(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) = 0; + + absl::Status GetNext(IteratorContext&& ctx, std::vector* out_tensors, + bool* end_of_sequence) { + return GetNext(&ctx, out_tensors, end_of_sequence); + } + + // If a dataset needs to provide its own index mapper behavior to support + // global shuffling, implement this method. + virtual IndexMapperFn GetIndexMapper( + IndexMapperFn parent_index_mapper) const { + return parent_index_mapper; + } + + // Skips the next `num_to_skip` outputs from the range that this iterator + // is traversing. + // + // If there are not enough outputs to skip, it will set + // `*end_of_sequence = true` and return `OkStatus()`. `*num_skipped` will + // store the number of outputs that are skipped. When `*end_of_sequence` is + // `false`, `*num_skipped` should equal to `num_to_skip`. + virtual absl::Status Skip(IteratorContext* ctx, int num_to_skip, + bool* end_of_sequence, int* num_skipped) = 0; + + virtual absl::Status Skip(IteratorContext&& ctx, int num_to_skip, + bool* end_of_sequence, int* num_skipped) { + return Skip(&ctx, num_to_skip, end_of_sequence, num_skipped); + } + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // iterator. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this iterator. + virtual const std::vector& output_shapes() const = 0; + + // Returns a string that identifies the sequence of iterators leading up to + // this iterator. + virtual const string& prefix() const = 0; + + // Indicates whether the iterator is compatible with symbolic checkpointing. + virtual bool SymbolicCheckpointCompatible() const { return false; } + + // Performs initialization that needs to happen outside of a constructor to + // properly propagate errors. + virtual absl::Status Initialize(IteratorContext* ctx) { + return absl::OkStatus(); + } + + // Performs initialization of the base iterator. + absl::Status InitializeBase(IteratorContext* ctx, const IteratorBase* parent); + + // Saves the state of this iterator. + absl::Status Save(SerializationContext* ctx, + IteratorStateWriter* writer) override { + int64_t start_us = EnvTime::NowMicros(); + TF_RETURN_IF_ERROR(SaveInternal(ctx, writer)); + VLOG(1) << "Saved " << prefix() << " in " + << (EnvTime::NowMicros() - start_us) << "us"; + return absl::OkStatus(); + } + + // Restores the state of this iterator. + absl::Status Restore(IteratorContext* ctx, + IteratorStateReader* reader) override { + int64_t start_us = EnvTime::NowMicros(); + TF_RETURN_IF_ERROR(RestoreInternal(ctx, reader)); + ctx->SaveCheckpoint(this); + VLOG(1) << "Restored " << prefix() << " in " + << (EnvTime::NowMicros() - start_us) << "us"; + return absl::OkStatus(); + } + + // Returns the total number of bytes buffered by the iterator across all nodes + // in the subtree for which autotuning is enabled. 
+ int64_t TotalBufferedBytes() const { + if (node_) return node_->TotalBufferedBytes(); + return 0; + } + + protected: + // Returns a node that models this iterator. + virtual std::shared_ptr CreateNode( + IteratorContext* ctx, model::Node::Args args) const = 0; + + // This is needed so that sub-classes of IteratorBase can call + // `SaveInternal` on their input iterators. + absl::Status SaveInput(SerializationContext* ctx, IteratorStateWriter* writer, + const std::unique_ptr& input) { + if (ctx->symbolic_checkpoint()) { + return absl::OkStatus(); + } + return input->Save(ctx, writer); + } + + // This is needed so that sub-classes of IteratorBase can call + // `RestoreInternal` on their input iterators. + absl::Status RestoreInput(IteratorContext* ctx, IteratorStateReader* reader, + const std::unique_ptr& input) { + return input->Restore(ctx, reader); + } + + absl::Status RestoreInput(IteratorContext&& ctx, IteratorStateReader* reader, + const std::unique_ptr& input) { + return RestoreInput(&ctx, reader, input); + } + + // Saves the state of this iterator. + // + // This method is used to store the state of the iterator in a checkpoint. + // implementations have an override. + virtual absl::Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) = 0; + + // Restores the state of this iterator. + // + // This method is used to restore the state of the iterator from a checkpoint. + // + // Implementations may assume that the iterator is in a clean state. That is, + // its `Initialize` method has been called, but its `GetNext` method has + // never been called. + // implementations have an override. + virtual absl::Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) = 0; + + // Returns a pointer to the node representing this iterator in the performance + // model. It may be null, if performance modeling is not enabled for this + // iterator. + std::shared_ptr model_node() const { return node_; } + + // Returns the number of elements produced by this iterator. + int64_t num_elements() const { + if (node_) return node_->num_elements(); + return 0; + } + + std::shared_ptr node_ = nullptr; + + private: + // For access to `AddCleanupFunction` and `Restore`. + friend class DatasetBase; + friend class DatasetBaseIterator; // for access to `node_` + + std::vector> cleanup_fns_; + const IteratorBase* parent_ = nullptr; // Not owned. + uint64_t id_ = 0; + uint64_t parent_id_ = 0; +}; + +// Represents runtime information needed to construct a dataset. +class DatasetContext { + public: + struct Params { + string type_string; // op type name of this dataset. + string node_name; // graph node name of this dataset op, uniquely + // identifying the dataset in the graph. + }; + + explicit DatasetContext(Params params) : params_(std::move(params)) {} + + explicit DatasetContext(OpKernelContext* ctx) { + params_.type_string = ctx->op_kernel().type_string(); + params_.node_name = ctx->op_kernel().name(); + } + + const string& type_string() const { return params_.type_string; } + const string& node_name() const { return params_.node_name; } + + private: + Params params_; +}; + +// Returns the number of bytes allocated for the given tensor. +int64_t GetAllocatedBytes(const std::vector& element); + +// Returns the estimated memory usage in bytes of the given tensor. +int64_t GetTotalBytes(const std::vector& element); + +// Validates and extracts a `DatasetBase` object from `tensor`. +// +// `tensor` must have been written by a call to SetVariantTensorToDataset(). 
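+//
+// Illustrative sketch (not from the upstream header; the surrounding kernel
+// code and variable names are assumptions): a kernel consuming a dataset
+// input typically does
+//
+// ```
+// DatasetBase* dataset;
+// OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+// dataset->Ref();  // Take a reference if `dataset` must outlive the input.
+// ```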
+// +// The retrieved pointer is a borrowed reference to the dataset, which is owned +// by the tensor. The consumer must either acquire its own reference to the +// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not +// destroyed or mutated while the retrieved pointer is in use. +absl::Status GetDatasetFromVariantTensor(const Tensor& tensor, + DatasetBase** out_dataset); + +// Stores a `DatasetBase` object in `tensor`. +// +// The ownership of `dataset` is transferred to `tensor`. +absl::Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor); + +// Represents a (potentially infinite) range of outputs, where each +// output is a tuple of tensors. +class DatasetBase : public core::RefCounted { + public: + // Key for storing the Dataset graph in the serialized format. + TF_EXPORT static const char kDatasetGraphKey[]; + + // Key for storing the output node of the Dataset graph in the serialized + // format. + TF_EXPORT static const char kDatasetGraphOutputNodeKey[]; + + explicit DatasetBase(DatasetContext&& ctx) + : type_string_(ctx.type_string()), node_name_(ctx.node_name()) {} + + // Op type name of this dataset. + const string& type_string() const { return type_string_; } + + // Graph node name of this dataset op, uniquely identifying the dataset in + // the graph. + const string& node_name() const { return node_name_; } + + const Metadata& metadata() const { return metadata_; } + + const Options& options() const { return options_; } + + int64_t num_sources() const { return num_sources_; } + + // Initializes the dataset using the given metadata. + void Initialize(const Metadata& metadata); + + // Returns a new iterator for iterating over the range of elements in + // this dataset. + // + // This method may be called multiple times on the same instance, + // and the resulting iterators will have distinct state. Each + // iterator will traverse all elements in this dataset from the + // start. + // + // The prefix identifies the sequence of iterators leading up to the newly + // created iterator. + absl::Status MakeIterator(IteratorContext* ctx, const IteratorBase* parent, + const string& output_prefix, + std::unique_ptr* iterator) const; + + absl::Status MakeIterator(IteratorContext&& ctx, const IteratorBase* parent, + const string& output_prefix, + std::unique_ptr* iterator) const { + return MakeIterator(&ctx, parent, output_prefix, iterator); + } + + // Returns a new iterator restored from the checkpoint data in `reader`. + absl::Status MakeIteratorFromCheckpoint( + IteratorContext* ctx, const string& output_prefix, + IteratorStateReader* reader, + std::unique_ptr* iterator) const { + std::unique_ptr it; + IteratorContext::Params params(ctx); + params.is_restoring = true; + IteratorContext restore_ctx(std::move(params)); + TF_RETURN_IF_ERROR(MakeIterator(&restore_ctx, + /*parent=*/nullptr, output_prefix, &it)); + TF_RETURN_IF_ERROR(it->Restore(&restore_ctx, reader)); + ctx->MergeCheckpoint(restore_ctx.checkpoint()); + *iterator = std::move(it); + return absl::OkStatus(); + } + + absl::Status MakeIteratorFromCheckpoint( + IteratorContext&& ctx, const string& output_prefix, + IteratorStateReader* reader, + std::unique_ptr* iterator) const { + return MakeIteratorFromCheckpoint(&ctx, output_prefix, reader, iterator); + } + + // Returns a split provider which partitions the dataset's data into splits + // and provides them in a sequence. The split provider is stored in + // `*split_provider`. 
+ virtual absl::Status MakeSplitProviders( + std::vector>* split_providers) const; + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // dataset. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this dataset. + virtual const std::vector& output_shapes() const = 0; + + // Returns the number of bytes allocated for tensors of this dataset. + virtual int64_t AllocatedBytes() const { return 0; } + + // Returns the estimated element size based on `output_shapes()` and + // `output_dtypes()`. + virtual std::optional GetEstimatedElementSize() const; + + // Returns the estimated number of bytes used for tensors of this dataset. + virtual int64_t TotalBytes() const { return 0; } + + // Returns the cardinality of this dataset. + // TODO(shilpakrish): Remove this overload once all callers are migrated + // to the API which passes in the options parameter. + ABSL_DEPRECATED("Use the overload that passes in the options parameter.") + int64_t Cardinality() const; + + // Returns the cardinality of this dataset based on the options. + int64_t Cardinality(CardinalityOptions options) const; + + // Internal implementation of cardinality for a dataset based on the options. + virtual int64_t CardinalityInternal(CardinalityOptions options) const + TF_EXCLUSIVE_LOCKS_REQUIRED(cardinality_mu_) { + return kUnknownCardinality; + } + + // A human-readable debug string for this dataset. + virtual string DebugString() const = 0; + + // Stores the dataset's input datasets in `*inputs`. The pointers stored in + // `*inputs` are borrowed. The only valid non-ok return status is + // UNIMPLEMENTED in case `InputDatasets` is not implemented by a dataset + // subclass. Implementing `InputDatasets` enables `DatasetBase` to provide a + // default implementation of `MakeSplitProvider` when there is a single input + // dataset. + virtual absl::Status InputDatasets( + std::vector* inputs) const; + + // Indicates whether the dataset depends on any external state which would + // prevent it from being serializable. If so, the method returns + // `errors::FailedPrecondition` with a message that identifies the external + // state. Otherwise, the method returns `OkStatus()`. + virtual absl::Status CheckExternalState() const = 0; + + // Indicates whether the dataset is compatible with random access. + absl::Status CheckRandomAccessCompatible(const int64 index) const; + + // Return the element at a particular index for a randomly accessible dataset. + virtual absl::Status Get(OpKernelContext* ctx, int64 index, + std::vector* out_tensors) const; + + // Same as above, but with an `AnyContext`, which can be constructed from + // either an `OpKernelContext` or `IteratorContext`. Used to support datasets + // that provide random access through both the dataset and iterator APIs. + virtual absl::Status Get(AnyContext ctx, int64 index, + std::vector* out_tensors) const; + + // Returns true if the dataset and its inputs support random access. + virtual absl::Status RandomIndexingCompatible() const { + return absl::FailedPreconditionError( + absl::StrCat(type_string(), " does not support random access.")); + } + + // Return a finalized version of the dataset. The returned DatasetBase is + // unowned and lives for as long as this dataset. 
+ virtual absl::StatusOr Finalize( + OpKernelContext* ctx, + std::function>()> + make_finalized_dataset) const; + + // Wrapper around a GraphDefBuilder which provides support for serializing + // Datasets as GraphDefs. + class DatasetGraphDefBuilder : public GraphDefBuilderWrapper { + public: + explicit DatasetGraphDefBuilder(GraphDefBuilder* b) + : GraphDefBuilderWrapper(b) {} + absl::Status AddInputDataset(SerializationContext* ctx, + const DatasetBase* dataset, Node** output); + absl::Status AddDatasetOrTensor(SerializationContext* ctx, + const Tensor& val, Node** output); + absl::Status AddIdentity(SerializationContext* ctx, + const std::string& name_prefix, Node** input, + Node** output); + + private: + absl::Status AddDatasetOrTensorHelper(SerializationContext* ctx, + const Tensor& val, Node** output); + absl::Status AddResourceHelper(SerializationContext* ctx, const Tensor& val, + Node** output); + }; + + protected: + friend class CapturedFunction; + + // Serializes the dataset into a `GraphDef`, which has two uses: + // + // 1) To perform static input pipeline optimizations, tf.data serializes the + // dataset graph, applies graph rewrites, and then deserializes the graph. + // If a subclass of `DatasetBase` does not implement this method, then it will + // be excluded from static optimizations (and so will any upstream datasets). + // + // 2) To save the dataset so that it can restore at a later point (possibly in + // different environment). If a subclass of `DatasetBase` does not implement + // this method, then this migration will not be possible. + virtual absl::Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** node) const = 0; + + virtual std::unique_ptr MakeIteratorInternal( + const string& prefix) const = 0; + + void set_options(const Options& options) { options_ = options; } + + private: + // Computes and stores the cardinality of a given dataset. + absl::Status ComputeCardinality(); + + // Computes the number of source datasets feeding into this dataset. A source + // dataset is a leaf in the subtree of dataset inputs. + absl::Status ComputeNumSources(); + + // Merges options from inputs to this dataset. If there is a conflict in a + // field value, the options set on this dataset takes precedence over those in + // the inputs. The order of precedence on the inputs is in the same order as + // how they appear for this dataset. + absl::Status MergeOptionsFromInputs(); + + const string type_string_; + const string node_name_; + Metadata metadata_; + Options options_; + mutable mutex mu_; + mutable mutex cardinality_mu_; + mutable core::RefCountPtr finalized_dataset_; + // The number of source datasets feeding into the dataset. A source dataset + // is a leaf in the subtree of dataset inputs. + int64_t num_sources_ = -1; + mutable int64_t cardinality_ TF_GUARDED_BY(cardinality_mu_) = + kUnknownCardinality; +}; + +// Represents an iterator that is associated with a particular dataset. +class DatasetBaseIterator : public IteratorBase { + public: + struct BaseParams { + // Owns one reference on the shared dataset object. + const DatasetBase* dataset; + + // Identifies the sequence of iterators leading up to this iterator. 
+ const string prefix; + }; + + explicit DatasetBaseIterator(const BaseParams& params); + + ~DatasetBaseIterator() override; + + virtual const DatasetBase* dataset() const { return params_.dataset; } + + const DataTypeVector& output_dtypes() const override { + return params_.dataset->output_dtypes(); + } + + const std::vector& output_shapes() const override { + return params_.dataset->output_shapes(); + } + + const string& prefix() const override { return params_.prefix; } + + // Returns a name to be used for the TraceMe event. + // + // NOTE: TraceMe supports passing key-value pairs of "arguments" using the + // following format "name#arg_1=value_,...,arg_n=value_n". + string BuildTraceMeName(); + + absl::Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) final; + + absl::Status GetNext(IteratorContext&& ctx, std::vector* out_tensors, + bool* end_of_sequence) { + return GetNext(&ctx, out_tensors, end_of_sequence); + } + + absl::Status Skip(IteratorContext* ctx, int num_to_skip, + bool* end_of_sequence, int* num_skipped) final; + + absl::Status Save(SerializationContext* ctx, + IteratorStateWriter* writer) final { + VLOG(2) << "Attempting to save checkpoints on iterator (prefix: " + << prefix() << ") from " << dataset()->DebugString(); + return IteratorBase::Save(ctx, writer); + } + + // Returns a copy of the `status` where the error message is prepended with + // dataset name and the iterator prefix. + absl::Status AddErrorContext(const absl::Status& status) const { + return absl::Status( + status.code(), + strings::StrCat("Error in user-defined function passed to ", + dataset()->metadata().name(), + " transformation with iterator: ", prefix(), ": ", + status.message())); + } + + protected: + absl::Status Restore(IteratorContext* ctx, + IteratorStateReader* reader) final { + VLOG(2) << "Attempting to restore checkpoints on iterator (prefix: " + << prefix() << ") from " << dataset()->DebugString(); + return IteratorBase::Restore(ctx, reader); + } + + // Internal implementation of GetNext that is wrapped in tracing logic. + // + // See the docstring of `GetNext` method regaring the contract for + // `out_tensors` and `end_of_sequence`. Implementations may assume that + // `*out_tensors` is empty. + virtual absl::Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) = 0; + + // Internal implementation of Skip that is wrapped in tracing logic + virtual absl::Status SkipInternal(IteratorContext* ctx, int num_to_skip, + bool* end_of_sequence, int* num_skipped); + + string full_name(const string& name) const { + return FullName(params_.prefix, name); + } + + // Returns a map of key-value pairs to included in the TraceMe string. + virtual TraceMeMetadata GetTraceMeMetadata() const { return {}; } + + // By default we model iterators using an unknown node, which acts as + // pass-through with respect to performance modeling. + std::shared_ptr CreateNode( + IteratorContext* ctx, model::Node::Args args) const override { + return model::MakeUnknownNode(std::move(args)); + } + + // When modeling is enabled, this method disables autotuning for the given + // iterator (and the transitive closure of its inputs). + void DisableAutotune(IteratorContext* ctx, IteratorBase* iterator) { + if (iterator->node_) { + iterator->node_->set_autotune(false); + } + } + + // When modeling is enabled, this method enables autotuning for the given + // iterator (and the transitive closure of its inputs). 
+ void EnableAutotune(IteratorContext* ctx, IteratorBase* iterator) { + if (iterator->node_) { + iterator->node_->set_autotune(true); + } + } + + // When modeling is enabled, this method records the fact that this iterator + // has dequeued an element from an internal buffer. + void RecordBufferDequeue(IteratorContext* ctx, + const std::vector& element) { + if (collect_resource_usage(ctx)) { + node_->record_buffer_event(-GetAllocatedBytes(element), -1); + DCHECK_GE(node_->buffered_elements(), 0); + } + } + + // When modeling is enabled, this method records the fact that this iterator + // has enqueued an element in an internal buffer. + void RecordBufferEnqueue(IteratorContext* ctx, + const std::vector& element) { + if (collect_resource_usage(ctx)) { + node_->record_buffer_event(GetAllocatedBytes(element), 1); + } + } + + // When modeling is enabled, this method records the fact that this iterator + // has produced an element and its size in bytes. + void RecordElement(IteratorContext* ctx, std::vector* out_tensors) { + if (collect_resource_usage(ctx)) { + int64_t num_bytes = GetAllocatedBytes(*out_tensors); + node_->record_element(); + node_->record_bytes_produced(num_bytes); + if (node_->output()) { + node_->output()->record_bytes_consumed(num_bytes); + } + } + } + + // When modeling is enabled, this method records the fact that a thread of + // this iterator has started work. + void RecordStart(IteratorContext* ctx) { + if (collect_resource_usage(ctx)) { + int64_t now_nanos = EnvTime::NowNanos(); + node_->record_start(now_nanos); + } + } + + // When modeling is enabled, this method records the fact that a thread of + // this iterator has stopped work. + void RecordStop(IteratorContext* ctx) { + if (collect_resource_usage(ctx)) { + int64_t now_nanos = EnvTime::NowNanos(); + node_->record_stop(now_nanos); + } + } + + // Returns whether work is currently being recorded, i.e. whether we are + // currently between a `RecordStart` and a `RecordStop`. + bool IsRecording(IteratorContext* ctx) { + return node_ && node_->is_recording(); + } + + private: + bool collect_resource_usage(IteratorContext* ctx) { + return ctx->model() && node_; + } + + string traceme_metadata_; + BaseParams params_; +}; + +// Represents an iterator that is associated with a particular dataset +// with a particular type. +template +class DatasetIterator : public DatasetBaseIterator { + public: + struct Params { + // Borrowed pointer to the dataset. + const DatasetType* dataset; + + // Identifies the sequence of iterators leading up to this iterator. + const string prefix; + }; + + explicit DatasetIterator(const Params& params) + : DatasetBaseIterator({params.dataset, params.prefix}), + typed_dataset_(params.dataset) {} + + // The dataset from which this iterator was created. + const DatasetType* dataset() const final { return typed_dataset_; } + + private: + const DatasetType* const typed_dataset_; // Not owned. 
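+
+  // Illustrative sketch (not from the upstream header; `MyDataset` is a
+  // hypothetical dataset type): a concrete dataset usually nests an iterator
+  // deriving from this template, e.g.
+  //
+  // ```
+  // class MyDataset::Iterator : public DatasetIterator<MyDataset> {
+  //  public:
+  //   explicit Iterator(const Params& params)
+  //       : DatasetIterator<MyDataset>(params) {}
+  //
+  //  protected:
+  //   absl::Status GetNextInternal(IteratorContext* ctx,
+  //                                std::vector<Tensor>* out_tensors,
+  //                                bool* end_of_sequence) override;
+  //   // ... SaveInternal/RestoreInternal overrides, etc.
+  // };
+  // ```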
+}; + +template +absl::Status ParseScalarArgument(OpKernelContext* ctx, + const absl::string_view& argument_name, + T* output) { + const Tensor* argument_t; + TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); + if (!TensorShapeUtils::IsScalar(argument_t->shape())) { + return errors::InvalidArgument(argument_name, " must be a scalar"); + } + *output = argument_t->scalar()(); + return absl::OkStatus(); +} + +template +absl::Status ParseVectorArgument(OpKernelContext* ctx, + const absl::string_view& argument_name, + std::vector* output) { + const Tensor* argument_t; + TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); + if (!TensorShapeUtils::IsVector(argument_t->shape())) { + return errors::InvalidArgument(argument_name, " must be a vector"); + } + int size = argument_t->vec().size(); + output->reserve(size); + for (int i = 0; i < size; ++i) { + output->push_back(argument_t->vec()(i)); + } + return absl::OkStatus(); +} + +// Encapsulates the work required to plug a DatasetBase into the core TensorFlow +// graph execution engine. +class DatasetOpKernel : public OpKernel { + public: + explicit DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) { + if (ctx->HasAttr(kMetadata)) { + std::string serialized_metadata; + OP_REQUIRES_OK(ctx, ctx->GetAttr(kMetadata, &serialized_metadata)); + OP_REQUIRES(ctx, metadata_.ParseFromString(serialized_metadata), + errors::InvalidArgument(absl::StrCat( + "Could not parse the 'metadata' attribute."))); + } + } + + void Compute(OpKernelContext* ctx) final; + + // Checks whether the given op is a tf.data operation. + // + // NOTE: The check uses a heuristic and can produce both false positives and + // false negatives. In particular, tf.data operations are expected to use + // names that end with "Dataset" or "DatasetV[0-9]+". + static bool IsDatasetOp(const OpDef& op_def); + + string TraceString(const OpKernelContext& ctx, bool verbose) const override; + + protected: + // Subclasses should implement this method. It will be called during Compute + // execution. + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0; + + private: + Metadata metadata_; +}; + +// Encapsulates the work required to plug unary Datasets into the core +// TensorFlow graph execution engine. +class UnaryDatasetOpKernel : public DatasetOpKernel { + public: + explicit UnaryDatasetOpKernel(OpKernelConstruction* ctx) + : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) = 0; +}; + +// Encapsulates the work required to plug binary Datasets into the core +// TensorFlow graph execution engine. +class BinaryDatasetOpKernel : public DatasetOpKernel { + public: + explicit BinaryDatasetOpKernel(OpKernelConstruction* ctx) + : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase* another_input, + DatasetBase** output) = 0; +}; + +// A simple background worker that executes closures asynchronously and without +// blocking. +// +// A `BackgroundWorker` is used to offload blocking work from an `AsyncOpKernel` +// to avoid blocking an executor thread that may be required by the blocking +// work. 
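+//
+// Illustrative sketch (not from the upstream header; the kernel and member
+// names are assumptions): an `AsyncOpKernel` typically owns a
+// `BackgroundWorker` and schedules its blocking body onto it:
+//
+// ```
+// class MyBlockingOp : public AsyncOpKernel {
+//  public:
+//   explicit MyBlockingOp(OpKernelConstruction* ctx)
+//       : AsyncOpKernel(ctx),
+//         background_worker_(Env::Default(), "my_blocking_op") {}
+//
+//   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+//     background_worker_.Schedule([ctx, done]() {
+//       // ... do the blocking work, then signal completion.
+//       done();
+//     });
+//   }
+//
+//  private:
+//   BackgroundWorker background_worker_;
+// };
+// ```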
+// +// NOTE(mrry): We do not use a regular `tensorflow::thread::ThreadPool` for this +// purpose because its current implementation (in Eigen) uses a finite-length +// queue and will block the caller when full. This can lead to deadlock under +// heavy load. Since the number of concurrent work items in each user of a +// `BackgroundWorker` is at most one per op invocation, the dynamic allocation +// overhead is tolerable. +class BackgroundWorker { + public: + BackgroundWorker(Env* env, const char* name); + + ~BackgroundWorker(); + + void Schedule(std::function work_item); + + private: + void WorkerLoop(); + + Env* const env_; + const char* const name_; + + std::unique_ptr thread_; + mutex mu_; + condition_variable cond_var_; + bool cancelled_ TF_GUARDED_BY(mu_) = false; + std::deque> work_queue_ TF_GUARDED_BY(mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/dataset_stateful_op_allowlist.h b/third_party/tflite-hdrs/tensorflow/core/framework/dataset_stateful_op_allowlist.h new file mode 100644 index 00000000..cc25c801 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/dataset_stateful_op_allowlist.h @@ -0,0 +1,81 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_ALLOWLIST_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_ALLOWLIST_H_ + +#include +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace data { +// Registry for stateful ops that need to be used in dataset functions. +// See below macro for usage details. +class AllowlistedStatefulOpRegistry { + public: + absl::Status Add(string op_name) { + op_names_.insert(std::move(op_name)); + return absl::OkStatus(); + } + + absl::Status Remove(string op_name) { + op_names_.erase(op_name); + return absl::OkStatus(); + } + + bool Contains(const string& op_name) { return op_names_.count(op_name); } + + static AllowlistedStatefulOpRegistry* Global() { + static auto* reg = new AllowlistedStatefulOpRegistry; + return reg; + } + + private: + AllowlistedStatefulOpRegistry() = default; + AllowlistedStatefulOpRegistry(AllowlistedStatefulOpRegistry const& copy) = + delete; + AllowlistedStatefulOpRegistry operator=( + AllowlistedStatefulOpRegistry const& copy) = delete; + + std::unordered_set op_names_; +}; + +} // namespace data + +// Use this macro to allowlist an op that is marked stateful but needs to be +// used inside a map_fn in an input pipeline. This is only needed if you wish +// to be able to checkpoint the state of the input pipeline. We currently +// do not allow stateful ops to be defined inside of map_fns since it is not +// possible to save their state. 
+// Note that the state of the allowlisted ops inside functions will not be +// saved during checkpointing, hence this should only be used if the op is +// marked stateful for reasons like to avoid constant folding during graph +// optimization but is not stateful. +// If possible, try to remove the stateful flag on the op first. +// Example usage: +// +// ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader"); +// +#define ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \ + ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name) +#define ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \ + ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) +#define ALLOW_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) \ + static ::tensorflow::Status allowlist_op##ctr TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::data::AllowlistedStatefulOpRegistry::Global()->Add(name) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_ALLOWLIST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/device.h b/third_party/tflite-hdrs/tensorflow/core/framework/device.h new file mode 100644 index 00000000..7b5bfcb1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/device.h @@ -0,0 +1,230 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A Device is a something that can perform computations as part of a +// model. Devices can be local (runs computation on this machine), or +// remote (contacts a device local to another machine using an RPC to +// do the work). Devices are registered in a DeviceSet, which is also +// responsible for the Device <-> id mapping. +// +// Device names +// * Every Device should have a unique name with the format: +// /job:___/replica:___/task:___/(gpu|cpu):___ +// An example name would be "/job:train/replica:0/task:3/device:GPU:2". +// * Task numbers are within the specified replica, so there are as +// many "task zeros" as replicas. 
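+//
+// Illustrative sketch (not from the upstream header): names in the format
+// above can be parsed with `DeviceNameUtils` (see the include below), e.g.
+//
+//   DeviceNameUtils::ParsedName parsed;
+//   bool ok = DeviceNameUtils::ParseFullName(
+//       "/job:train/replica:0/task:3/device:GPU:2", &parsed);
+//   // On success: parsed.job == "train", parsed.type == "GPU", parsed.id == 2.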
+ +#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_H_ + +#include +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +class Device : public DeviceBase { + public: + // Callback type that takes a Status and returns void. + typedef std::function DoneCallback; + + Device(Env* env, const DeviceAttributes& device_attributes); + ~Device() override; + + // A compare function that orders devices by their parsed name. + static bool LessByParsedName(const Device& a, const Device& b) { + return a.parsed_name() < b.parsed_name(); + } + + // Full name of this device (see top comment). + const std::string& name() const override { return device_attributes_.name(); } + + // Parsed name of this device + const DeviceNameUtils::ParsedName& parsed_name() const override { + return parsed_name_; + } + + // Describes what kind of device this is. This is intended to be + // human-readable and not computer-parsed, except that two devices + // with the same device_type() are expected to perform similarly + // (both from a computation and communication perspective). + const std::string& device_type() const override { + return device_attributes_.device_type(); + } + + // Returns an aggregation of device attributes. + const DeviceAttributes& attributes() const override { + return device_attributes_; + } + + // Performs the actual compute function. + // + // Subclasses may override this function if they wish to perform + // some initialization before each compute. + virtual void Compute(OpKernel* op_kernel, OpKernelContext* context) { + op_kernel->Compute(context); + } + + // Asynchronous kernel's compute. + virtual void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, + AsyncOpKernel::DoneCallback done) { + op_kernel->ComputeAsync(context, std::move(done)); + } + + // Blocks until all operations queued on the device at the time of + // the call have completed. Returns any error pending on the device + // at completion. + virtual absl::Status Sync() = 0; + + // Calls the given callback when all operations queued on the device at the + // time of the call have completed. The callback is passed any error pending + // on the device at completion. + // TODO(b/112409994): Consolidate these two APIs, removing the synchronous + // version. + virtual void Sync(const DoneCallback& done); + + // On session completion, the executor may call Device::Sync() depending on + // flag settings. Override this to return false for devices that don't allow + // such calls. Instead, these devices must use other mechanisms (such as + // num_deferred_ops) to ensure the device has finished processing necessary + // work at session completion. 
In addition, for these devices, RefreshStatus + // must be called at session completion to retrieve execution result status. + // + // Devices that override this function must also implement RefreshStatus. + virtual bool AllowsSyncOnCompletion() const { return true; } + + // This is used in conjunction with AllowsSyncOnCompletion to allow the + // executor to get execution result status at session completion. + // + // For supported devices, this call returns the underlying device stream's + // current status in a non-blocking way, without using blocking calls such as + // Stream::BlockHostUntilDone or Device::Sync. When applicable, the device + // status is also updated with the retrieved stream status. + virtual absl::Status RefreshStatus() { + return errors::Unimplemented( + "RefreshStatus is not supported on this device."); + } + + // Optionally modify the device's GraphDef before execution. + // + // This method should be considered experimental and is supplied to enable + // prototyping of TensorFlow device implementations that need to modify + // the GraphDef before execution. + // + // 'graph' supplies the partition of the graph assigned to this + // device. + virtual absl::Status MaybeRewriteGraph(std::unique_ptr* /*graph*/) { + return absl::OkStatus(); + } + + // Sets `out_context` a new DeviceContext* for executing a graph, or nullptr + // if the device does not support contexts. Returns an error status if any + // error occurred while trying to create a context, otherwise OK. + // + // The caller takes ownership of one reference on the output DeviceContext*, + // and should call Unref(). + virtual absl::Status TryGetDeviceContext(DeviceContext** out_context) { + *out_context = nullptr; + return absl::OkStatus(); + } + + // Returns the op segment of this device. The caller can reuse op + // kernels registered for the same session running on this device. + OpSegment* op_segment() { return &op_seg_; } + + // Returns the resource manager associated w/ this device. + virtual ResourceMgr* resource_manager() { return rmgr_; } + + // Summarizes the status of this Device, for debugging. + std::string DebugString() const { return device_attributes_.DebugString(); } + + // Assembles the parameter components into a complete DeviceAttributes value. + static DeviceAttributes BuildDeviceAttributes( + const std::string& name, DeviceType device, Bytes memory_limit, + const DeviceLocality& locality, const std::string& physical_device_desc); + + static DeviceAttributes BuildDeviceAttributes( + const std::string& name, DeviceType device, Bytes memory_limit, + const DeviceLocality& locality) { + // Pass in an empty string as physical device name. + return BuildDeviceAttributes(name, device, memory_limit, locality, ""); + } + + // Updates `attributes()`, indicating the XLA global ID associated with this + // device. This ID is unique across clients in a multi-client setup. For TPUs + // this does not happen until the TPU system has been initialized. + void set_xla_global_id(int64_t id) override { + device_attributes_.set_xla_global_id(id); + } + + // Clears the resource manager associated with this device. + void ClearResourceMgr() { rmgr_->Clear(); } + + virtual bool IsLocal() const { return true; } + + // Informs if this Device can be used as a caller in RemoteCall operation. + virtual bool IsRemoteCallAllowed() const; + + // Whether to merge the host_to_device copy stream with the compute stream. + // Only useful for GPU devices. 
+ virtual bool merge_host_to_device_stream() const { return false; } + + // Whether to merge the device_to_host copy stream with the compute stream. + // Only useful for GPU devices. + virtual bool merge_device_to_host_stream() const { return false; } + + // Whether to merge the device_to_device copy streams with the compute stream. + // Only useful for GPU devices. + virtual bool merge_device_to_device_stream() const { return false; } + + protected: + void DeleteResourceMgr() { + delete rmgr_; + rmgr_ = nullptr; + } + + private: + DeviceAttributes device_attributes_; + DeviceNameUtils::ParsedName parsed_name_; + + // op_seg_ maps session handle and op name to OpKernel objects. + OpSegment op_seg_; + + // Resources associated w/ this device. E.g., shared variables, etc. + ResourceMgr* rmgr_ = nullptr; + + Device(const Device&) = delete; + void operator=(const Device&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/device_base.h b/third_party/tflite-hdrs/tensorflow/core/framework/device_base.h new file mode 100644 index 00000000..fe5099fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/device_base.h @@ -0,0 +1,313 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ + +#include +#include +#include + +#include "absl/base/macros.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace Eigen { +struct ThreadPoolDevice; +} // end namespace Eigen + +namespace stream_executor { +class Stream; +} // namespace stream_executor + +namespace tsl { +class Env; +namespace thread { +class ThreadPool; +} // namespace thread +} // namespace tsl +namespace tensorflow { + +class Device; +class DeviceAttributes; +class EventMgr; +class OpKernelContext; +class ResourceMgr; +class ScopedAllocatorMgr; +class TensorProto; + +// A wrapper for an Eigen Gpu Device that includes per-op state. The +// class is defined even for non-GPU devices since the +// OpKernelContext::Params structure wants to fill it in. +class PerOpGpuDevice { + public: + virtual ~PerOpGpuDevice() {} + virtual const Eigen::GpuDevice& device() const = 0; +}; + +// A class that devices can subclass to pass around +// Device-specific context to OpKernels. 
+class DeviceContext : public core::RefCounted { + public: + ~DeviceContext() override {} + virtual stream_executor::Stream* stream() const { return nullptr; } + virtual void MaintainLifetimeOnStream(const Tensor* t, + stream_executor::Stream* stream) const { + } + + // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into + // "device_tensor" which is on a non-CPU device "device". "device_tensor" + // must be allocated to be of the same size as "cpu_tensor". + virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute = true) const { + done(errors::Internal("Unrecognized device type in CPU-to-device Copy")); + } + + // Same as CopyCPUTensorToDevice, but in a synchronous way. + absl::Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, + Device* device, + Tensor* device_tensor) const; + + // Copies a tensor in this device. + virtual void CopyTensorInSameDevice(const Tensor* input_tensor, + Device* device, Tensor* output_tensor, + StatusCallback done) const { + done(errors::Unimplemented("Copy in same device not implemented.")); + } + + // "device_tensor" is a tensor on a non-CPU device. Copies + // device_tensor into "cpu_tensor". "cpu_tensor" must be allocated + // to be of the same size as "device_tensor". + virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor, + absl::string_view tensor_name, + Device* device, Tensor* cpu_tensor, + StatusCallback done) { + done(errors::Internal("Unrecognized device type in device-to-CPU Copy")); + } + + // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done. + absl::Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor, + absl::string_view tensor_name, + Device* device, Tensor* cpu_tensor); + + // If possible, wait for all events on *stream to complete then execute func. + // A non-OK Status is returned otherwise. The stream argument should be the + // one provided by AcceleratorDeviceInfo. This function is not applicable to + // devices that don't provide such a value. + virtual absl::Status ThenExecute(Device* device, + stream_executor::Stream* stream, + std::function func) { + return errors::Internal("ThenExecute not supported by device"); + } + + // check if device is a pluggable device + virtual bool IsPluggableDevice() { return false; } + + // Returns the pinned host memory allocator for the device. + virtual Allocator* host_memory_allocator() const { return nullptr; } +}; + +class DeviceBase { + public: + explicit DeviceBase(tsl::Env* env) : env_(env) {} + virtual ~DeviceBase(); + + tsl::Env* env() const { return env_; } + + struct CpuWorkerThreads { + int num_threads = 0; + tsl::thread::ThreadPool* workers = nullptr; + }; + + // Does not take ownership. + void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) { + cpu_worker_threads_ = t; + } + + virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const { + CHECK(cpu_worker_threads_ != nullptr); + return cpu_worker_threads_; + } + + // "stream" is used in special circumstances (such as the + // constructors of Ops) where there is no available OpKernelContext. + // "default_context" is used by OpKernelContext whenever a device does not + // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only + // using a single stream.) + // "event_mgr" is used to delay deallocation of temporary GPU buffers. + // TODO(pbar) Work out how to move this out of DeviceBase. 
+ struct AcceleratorDeviceInfo { + // Make sure all the defaults are NULL, so we can spot missing assignments. + stream_executor::Stream* stream = nullptr; + DeviceContext* default_context = nullptr; + DeviceContext* pjrt_context = nullptr; + bool use_pjrt_tensor_buffer = false; + EventMgr* event_mgr = nullptr; + int gpu_id = -1; + }; + + // Does not take ownership. + void set_tensorflow_accelerator_device_info( + AcceleratorDeviceInfo* device_info) { + accelerator_device_info_ = device_info; + } + + virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info() + const { + return accelerator_device_info_; + } + + // The preferred thread pool for this device. If it is nullptr, the system + // automatically assigns a thread pool for execution. + virtual tsl::thread::ThreadPool* tensorflow_device_thread_pool() { + return device_thread_pool_; + } + + // Does not take ownership. + void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d); + + // Return the Allocator implementation to use based on the allocator + // attributes requested. See allocator.h for more details. + virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) { + LOG(FATAL) << "GetAllocator() is not implemented."; + return nullptr; + } + + // This method is provided for backwards compatibility, and will be removed + // in a future release. + ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.") + Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) { + return GetAllocator(attr); + } + + // Return an Allocator prepared for use in particular places by graph + // optimization + virtual Allocator* GetScopedAllocator(AllocatorAttributes attr, + int64_t step_id) { + LOG(FATAL) << "Device does not implement GetScopedAllocator()"; + return nullptr; + } + + virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; } + + virtual bool has_eigen_cpu_device() const { + return !eigen_cpu_devices_.empty(); + } + + virtual const Eigen::ThreadPoolDevice* eigen_cpu_device(); + + // Caller owns the return value. The OpKernelContext calls this even + // for devices that do not implement an eigen_gpu_device. Overridden + // by GPU devices to return a derived type. + virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; } + + virtual DeviceBase* UnderlyingDevice() { return this; } + virtual const DeviceBase* UnderlyingDevice() const { return this; } + + // This is overridden by GPU devices to reinitialize the derived + // type returned by MakeGpuDevice. + virtual absl::Status ReinitializeGpuDevice(OpKernelContext* /*context*/, + PerOpGpuDevice* /*device*/, + DeviceContext* /*dc*/, + Allocator* /*allocator*/) { + return absl::OkStatus(); + } + + // Unimplemented by default + virtual const DeviceAttributes& attributes() const; + virtual int NumaNode() const { return attributes().locality().numa_node(); } + virtual const std::string& name() const; + virtual const DeviceNameUtils::ParsedName& parsed_name() const; + virtual const std::string& device_type() const; + + // Updates `attributes()`, indicating the XLA global ID associated with this + // device. This ID is unique across clients in a multi-client setup. For TPUs + // this does not happen until the TPU system has been initialized. + // + // Implemented in Device. + virtual void set_xla_global_id(int64_t id) {} + + // Materializes the given TensorProto into 'tensor' stored in Device + // memory. Most devices will want to override this. 
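A rough usage sketch (not part of the vendored header) of the blocking DeviceContext copy helpers declared above, CopyCPUTensorToDeviceSync and CopyDeviceTensorToCPUSync; the wrapper function and tensor names are illustrative, and the device-side tensor is assumed to be pre-allocated with the same size as the host tensor, as the method comments require.

#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {

// Sketch only: round-trip a host tensor through a non-CPU device with the
// blocking helpers; the asynchronous CopyCPUTensorToDevice and
// CopyDeviceTensorToCPU overloads take a StatusCallback instead.
absl::Status RoundTripThroughDevice(DeviceContext* dc, Device* device,
                                    const Tensor& host_in, Tensor* device_tmp,
                                    Tensor* host_out) {
  // "device_tmp" must already be allocated on "device" with host_in's size,
  // per the contract documented on the copy methods above.
  absl::Status s = dc->CopyCPUTensorToDeviceSync(&host_in, device, device_tmp);
  if (!s.ok()) return s;
  return dc->CopyDeviceTensorToCPUSync(device_tmp, /*tensor_name=*/"round_trip",
                                       device, host_out);
}

}  // namespace tensorflow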
+ // + // TODO(vrv): We should be able to put this function into + // OpKernelContext and handle the copies from device memory via send + // and receive nodes, instead of requiring that each device handle + // the copies here as well as in copy ops. + virtual absl::Status MakeTensorFromProto( + const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, + Tensor* tensor) { + return errors::Internal("Device does not implement MakeTensorFromProto()"); + } + + // Some devices (i.e. GPUs) may free device memory prior to its actual use + // being completed on the assumption that subsequent allocations can only be + // used serially with respect to pending uses. If this function returns a + // non-zero value it is the value of a device-specific counter such that any + // device memory tagged with an earlier freed-at count is really unencumbered + // by pending uses. For this to be useful the device memory allocator must + // be tagging deallocated memory chunks using the same counter. + virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; } + + // Copies `input_tensor` to `output_tensor`, where both tensors are on this + // device. This function assumes that `output_tensor` has already been + // allocated with a buffer that is large enough to hold `input_tensor`'s data. + // Calls `done` from a device-specific thread after copy is finished, which + // may be the same as calling thread. + // + // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy + // is discouraged and should not be used in OpKernels. + virtual void CopyTensorInSameDevice(const Tensor* input_tensor, + Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) { + done(errors::Internal("Device ", name(), " does not implement ", + "CopyTensorInSameDevice")); + } + + protected: + // Does not take ownership. + void set_tensorflow_device_thread_pool(tsl::thread::ThreadPool* thread_pool) { + device_thread_pool_ = thread_pool; + } + + private: + tsl::Env* const env_; + CpuWorkerThreads* cpu_worker_threads_ = nullptr; + // Set by GPUs as well as by TPU devices. + AcceleratorDeviceInfo* accelerator_device_info_ = nullptr; + tsl::thread::ThreadPool* device_thread_pool_ = nullptr; + std::vector eigen_cpu_devices_; +}; + +// Methods to create and check for Symbolic execution devices. +// Such devices are mostly used for TF-XLA bridge. TF should not treat these as +// normal devices. +void AddSymbolicExecutionDevice(absl::string_view device_name); +bool IsSymbolicExecutionDevice(absl::string_view device_name); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/device_factory.h b/third_party/tflite-hdrs/tensorflow/core/framework/device_factory.h new file mode 100644 index 00000000..8b07d15c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/device_factory.h @@ -0,0 +1,173 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_FACTORY_H_ + +#include +#include + +#include "absl/base/attributes.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Device; +struct SessionOptions; + +class DeviceFactory { + public: + virtual ~DeviceFactory() {} + static void Register(const std::string& device_type, + std::unique_ptr factory, int priority, + bool is_pluggable_device); + ABSL_DEPRECATED("Use the `Register` function above instead") + static void Register(const std::string& device_type, DeviceFactory* factory, + int priority, bool is_pluggable_device) { + Register(device_type, std::unique_ptr(factory), priority, + is_pluggable_device); + } + static DeviceFactory* GetFactory(const std::string& device_type); + + // Append to "*devices" CPU devices. + static absl::Status AddCpuDevices( + const SessionOptions& options, const std::string& name_prefix, + std::vector>* devices); + + // Append to "*devices" all suitable devices, respecting + // any device type specific properties/counts listed in "options". + // + // CPU devices are added first. + static absl::Status AddDevices(const SessionOptions& options, + const std::string& name_prefix, + std::vector>* devices); + + // Helper for tests. Create a single device of type "type". The + // returned device is always numbered zero, so if creating multiple + // devices of the same type, supply distinct name_prefix arguments. + static std::unique_ptr NewDevice(const string& type, + const SessionOptions& options, + const string& name_prefix); + + // Iterate through all device factories and build a list of all of the + // possible physical devices. + // + // CPU is are added first. + static absl::Status ListAllPhysicalDevices(std::vector* devices); + + // Iterate through all device factories and build a list of all of the + // possible pluggable physical devices. + static absl::Status ListPluggablePhysicalDevices( + std::vector* devices); + + // Get details for a specific device among all device factories. + // 'device_index' indexes into devices from ListAllPhysicalDevices. + static absl::Status GetAnyDeviceDetails( + int device_index, std::unordered_map* details); + + // For a specific device factory list all possible physical devices. + virtual absl::Status ListPhysicalDevices(std::vector* devices) = 0; + + // Get details for a specific device for a specific factory. Subclasses + // can store arbitrary device information in the map. 'device_index' indexes + // into devices from ListPhysicalDevices. + virtual absl::Status GetDeviceDetails( + int device_index, std::unordered_map* details) { + return absl::OkStatus(); + } + + // Most clients should call AddDevices() instead. + virtual absl::Status CreateDevices( + const SessionOptions& options, const std::string& name_prefix, + std::vector>* devices) = 0; + + // Return the device priority number for a "device_type" string. + // + // Higher number implies higher priority. + // + // In standard TensorFlow distributions, GPU device types are + // preferred over CPU, and by default, custom devices that don't set + // a custom priority during registration will be prioritized lower + // than CPU. 
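A rough sketch of the DeviceFactory entry points declared above, in the style of TensorFlow test code; the "/job:localhost/replica:0/task:0" name prefix and the wrapper function names are illustrative assumptions, not part of this header.

#include <memory>
#include <vector>

#include "tensorflow/core/framework/device.h"
#include "tensorflow/core/framework/device_factory.h"
#include "tensorflow/core/public/session_options.h"

namespace tensorflow {

// Sketch only: the single-device helper intended for tests, plus the general
// AddDevices() path that honors per-type device settings in SessionOptions.
std::unique_ptr<Device> MakeTestCpuDevice() {
  SessionOptions options;
  return DeviceFactory::NewDevice("CPU", options,
                                  "/job:localhost/replica:0/task:0");
}

absl::Status MakeAllLocalDevices(
    std::vector<std::unique_ptr<Device>>* devices) {
  SessionOptions options;
  return DeviceFactory::AddDevices(options, "/job:localhost/replica:0/task:0",
                                   devices);
}

}  // namespace tensorflow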
Custom devices that want a higher priority can set the + // 'priority' field when registering their device to something + // higher than the packaged devices. See calls to + // REGISTER_LOCAL_DEVICE_FACTORY to see the existing priorities used + // for built-in devices. + static int32 DevicePriority(const std::string& device_type); + + // Returns true if 'device_type' is registered from plugin. Returns false if + // 'device_type' is a first-party device. + static bool IsPluggableDevice(const std::string& device_type); +}; + +namespace dfactory { + +template +class Registrar { + public: + // Multiple registrations for the same device type with different priorities + // are allowed. Priorities are used in two different ways: + // + // 1) When choosing which factory (that is, which device + // implementation) to use for a specific 'device_type', the + // factory registered with the highest priority will be chosen. + // For example, if there are two registrations: + // + // Registrar("CPU", 125); + // Registrar("CPU", 150); + // + // then CPUFactory2 will be chosen when + // DeviceFactory::GetFactory("CPU") is called. + // + // 2) When choosing which 'device_type' is preferred over other + // DeviceTypes in a DeviceSet, the ordering is determined + // by the 'priority' set during registration. For example, if there + // are two registrations: + // + // Registrar("CPU", 100); + // Registrar("GPU", 200); + // + // then DeviceType("GPU") will be prioritized higher than + // DeviceType("CPU"). + // + // The default priority values for built-in devices is: + // GPU: 210 + // GPUCompatibleCPU: 70 + // ThreadPoolDevice: 60 + // Default: 50 + explicit Registrar(const std::string& device_type, int priority = 50) { + DeviceFactory::Register(device_type, std::make_unique(), priority, + /*is_pluggable_device*/ false); + } +}; + +} // namespace dfactory + +#define REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, ...) \ + INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, \ + __COUNTER__, ##__VA_ARGS__) + +#define INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY(device_type, device_factory, \ + ctr, ...) \ + static ::tensorflow::dfactory::Registrar \ + INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY_NAME(ctr)(device_type, ##__VA_ARGS__) + +// __COUNTER__ must go through another macro to be properly expanded +#define INTERNAL_REGISTER_LOCAL_DEVICE_FACTORY_NAME(ctr) ___##ctr##__object_ + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DEVICE_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/fake_input.h b/third_party/tflite-hdrs/tensorflow/core/framework/fake_input.h new file mode 100644 index 00000000..c3062762 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/fake_input.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// These functions return values that may be passed to +// NodeDefBuilder::Input() to add an input for a test. Use them when +// you don't care about the node names/output indices providing the +// input. They also allow you to omit the input types and/or +// list length when they may be inferred. +FakeInputFunctor FakeInput(); // Infer everything +FakeInputFunctor FakeInput(DataType dt); +FakeInputFunctor FakeInput(int n); // List of length n +FakeInputFunctor FakeInput(int n, DataType dt); +FakeInputFunctor FakeInput(DataTypeSlice dts); +inline FakeInputFunctor FakeInput(std::initializer_list dts) { + return FakeInput(DataTypeSlice(dts)); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/full_type_inference_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/full_type_inference_util.h new file mode 100644 index 00000000..3117613b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/full_type_inference_util.h @@ -0,0 +1,159 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_INFERENCE_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_INFERENCE_UTIL_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +namespace full_type { + +// TODO(mdan): Specific helpers won't get too far. Use a parser instead. + +// Helpers that allow shorthand expression for the more common kinds of type +// inference functions. +// TODO(mdan): Break into separate header if it grows. +// Note: The information contained in these functions is also expressed to some +// extent by opdef attributes of the kind "input: T, output T". But in that +// context, T has strong DType semantics (i.e. T is DT_VARIANT for most +// interesting cases). The logic here extends to the op's FullType, so it's best +// to keep them separate, even though it leads to some redundancy. The +// same can be said about the shape inference function. + +// Note: Unlike type constructors, which describe op definitions, type inference +// functions are meant to modify the type information of specific nodes (i.e. +// NodeDef proto). + +// Helper for a no-op type inference function that indicates type inference +// should never alter the node's existing type. +// This is the same as not defining a type inference function at all, but +// explicitly communicates that intent. 
+TypeInferenceFn KeepExisting(); + +// A helper for a type inference function that indicates a single output that +// is a tensor of type t. This is the equivalent of a type construtor since it +// does not depend on inputs. This can be used with Tuple. +TypeInferenceFn Tensor(FullTypeId t); + +// Helper for a type inference function which has the same type as the i'th +// input. +// The n arg allows multiple outputs, e.g. (T -> Product[T, T]). +// TODO(mdan): Drop defaults for readability if more non-(0, 1) cases appear. +// TODO(mdan): Rename to just Replicate. +TypeInferenceFn ReplicateInput(int i = 0, int n = 1); + +// Helper for a type inference function which has the same type as a variadic +// number of inputs, e.g. (T, T -> Product[T]), (T, T, T -> Product[T]), etc. +// Infers the meet of the input types, in the sense of type meets (see +// https://en.wikipedia.org/wiki/Join_and_meet). This implementation is +// simplified to require the two inputs are a subtype of another. +TypeInferenceFn Merge(); + +// Helper for ops with semantics of encoding an input, that is, +// `T -> Encoded[T, ]`, where is the encoded type. +TypeInferenceFn Encode(FullTypeId t, int i); + +// Helper for ops with semantics of encoding an input, that is, +// `Encoded[T, ] -> T`, where is the encoded type. +TypeInferenceFn Decode(FullTypeId t, int i); + +// Helper for the type inference counterpart of Unary, that is (U -> +// PRODUCT[[U]]), where is parameterized by this factory, and U is the +// type of the input specified by element_idx. +// Note: when we migrate to a more formal type definition of an op, these two +// functions will naturally merge. +TypeInferenceFn UnaryContainerCreate(FullTypeId t, int element_idx); + +// Helper for ops with semantics of adding an element to a container ([T]), +// that is ([U], V -> PRODUCT[[Union[U, V]]]), where is parameterized +// by this factory, U is the type of the input specified by container_idx, and V +// is the type of the input specified by element_idx. The homogeneous arg allows +// for constraints which guarantee that U and V must have a subtyping +// relationship, case in which either V or U is selected, whichever is the +// supertype. +TypeInferenceFn UnaryContainerAdd(FullTypeId t, int container_idx, + int element_idx, bool homogeneous); + +// Helper for ops with semantics of unstacking multiple inputs into a container +// `[T1, ..., Tn]`, that is `T1, ..., Tn -> [PRODUCT[U1, ..., Un]]` +// where Ui is obtained from an "unstack" mapping T -> U. Both and the +// "unstack" mapping are parameterized by this factory. +// Note that when the "unstack" function is the identity function, this becomes +// equivalent to ContainerCreate. +TypeInferenceFn MultiaryUnstack( + FullTypeId t, std::function unstack); + +// Helper for ops with semantics of applying some transformation to the +// elements of a container: +// `[PRODUCT[T1, ..., Tn]] -> [PRODUCT[U1, ..., Un]]`, +// where Ui is obtained by applying a map T -> U. Both and the "map" +// function are parameterized by this factory. See BatchTensor and ShardTensor +// for examples of "map". +TypeInferenceFn ContainerMap( + FullTypeId t, int input_idx, + std::function map); + +// Helper for ops with semantics of repacking some element from a container to +// another ` -> `, in a covariant way, that is, `[T] -> [T]`. +// and are parameterized by this factory. The input type is specified by +// element_idx. 
+TypeInferenceFn MapCovariant(FullTypeId t, FullTypeId u, int input_idx); + +// Helper for ops with semantics of calling a function. The function is +// specified indirectly, as the name of an attribute that holds the actual +// function name. +TypeInferenceFn FunctionCall(const string& func_attr_name); + +// Compose the type of a function by concatenating the outputs of multiple +// type inference functions. If func_list is {type inference function 1, type +// inference function 2} which return PRODUCT[T1], PRODUCT[T2] resprectively, +// the result is PRODUCT[T1, T2], This supports the Merge op that has an index +// output in addition to the result of the Merge type inference function. +TypeInferenceFn Tuple(const std::vector& func_list); + +// Auxiliary constructs to help creation of type inference functions. +// TODO(mdan): define these as type inference functions as well. + +// Mapping function representing the type function for unstacking of +// Tensor (or Tensor-like) types. Note that this is a helper to use with +// other type inference functions; it's not a function itself. +// TODO(mdan): Replace with a trait, when available. +FullTypeDef UnstackTensor(const FullTypeDef& t); + +// Mapping function representing the type function for an op that changes the +// batch size of dataset. Note that this is a helper to use with other type +// inference functions; it's not a function itself. +// TODO(mdan): Replace with a trait, when available. +FullTypeDef BatchTensor(const FullTypeDef& t); + +// Mapping function representing the type function for an op that creates a +// fixed (given) number of tensors of a size calculated based on the input. Note +// that this is a helper to use with other type inference functions; it's not a +// function itself. +// TODO(mdan): Replace with a trait, when available. +FullTypeDef ShardTensor(const FullTypeDef& t); +} // namespace full_type + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_INFERENCE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/full_type_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/full_type_util.h new file mode 100644 index 00000000..4039f3c8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/full_type_util.h @@ -0,0 +1,130 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_UTIL_H_ + +#include +#include + +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +namespace full_type { + +// TODO(mdan): Specific helpers won't get too far. Use a parser instead. 
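As a rough illustration of how the inference helpers above are consumed, the sketch below registers a hypothetical pass-through op whose output FullType is a copy of its input's, assuming the SetForwardTypeFn hook on OpDefBuilder (op_def_builder.h is already included by this header); the op name is made up for illustration.

#include "tensorflow/core/framework/full_type_inference_util.h"
#include "tensorflow/core/framework/op.h"

namespace tensorflow {

// Sketch only: output "y" gets the same FullType as input "x" via
// ReplicateInput() declared above.
REGISTER_OP("ExamplePassThrough")
    .Input("x: T")
    .Output("y: T")
    .Attr("T: type")
    .SetForwardTypeFn(full_type::ReplicateInput());

}  // namespace tensorflow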
+// TODO(mdan): Move constructors into a separate file. + +// Helpers that allow shorthand expression for the more common kinds of type +// constructors. +// Note: The arity below refers to the number of arguments of parametric types, +// not to the number of return values from a particular op. +// Note: Type constructors are meant to create static type definitions in the +// op definition (i.e. the OpDef proto). + +// Helper for a no-op type constructor that indicates that the node's type +// should be set by external means (typically by the user). +OpTypeConstructor NoOp(); + +// Helper for a trivial type constructor that indicates a node has no +// outputs (that is, its output type is an empty TFT_PRODUCT). +OpTypeConstructor NoOutputs(); + +// Helper for a type constructor of [] (with no parameters). +OpTypeConstructor Nullary(FullTypeId t); + +// Helper for a type constructor of [FT_VAR[]]. +OpTypeConstructor Unary(FullTypeId t, const string& var_name); + +// Helper for a type constructor of [FT_ANY]. +OpTypeConstructor UnaryGeneric(FullTypeId t); + +// Helper for a type constructor of [FT_TENSOR[]]. +OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype); + +// Helper for a type constructor of [FT_VAR[]]. +OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name); + +// Helper for a type constructor of +// [FT_FOR_EACH[ +// FT_PRODUCT, +// FT_TENSOR[FT_VAR[]], +// FT_VAR[]]. +// Multi-valued type variables will expand the template (see full_type.proto). +OpTypeConstructor VariadicTensorContainer(FullTypeId t, const string& var_name); + +// Type specialization and inference logic. This function narrows the type +// specified in an op definition. Such types are usually generic and dependent +// on input types. This function resolves the output types based on the input +// types specified in a given node def. +absl::Status SpecializeType(const AttrSlice& attrs, const OpDef& op_def, + FullTypeDef& target); + +const FullTypeDef& GetArgDefaultUnset(const FullTypeDef& t, int i); +const FullTypeDef& GetArgDefaultAny(const FullTypeDef& t, int i); + +bool IsEqual(const FullTypeDef& lhs, const FullTypeDef& rhs); + +bool IsSubtype(const FullTypeDef& lhs, const FullTypeDef& rhs, + bool covariant = true); + +uint64_t Hash(const FullTypeDef& arg); + +// Determine if the given fulltype is a host memory type. +// While it is prefered that Placer (placer.cc and colocation_graph.cc) make +// all host memory type placement decisions, any decision made elsewhere +// should use this function (e.g. instead of assuming that all variants never +// contain host memory types). 
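A small sketch of the comparison and hashing helpers declared above, applied to hand-built FullTypeDef protos; the function name is illustrative and the TFT_* values are the standard full_type.proto enumerators.

#include <cstdint>

#include "tensorflow/core/framework/full_type.pb.h"
#include "tensorflow/core/framework/full_type_util.h"

namespace tensorflow {

// Sketch only: build TFT_TENSOR[TFT_FLOAT] by hand, hash it, and compare it
// against a copy with the predicates declared above.
bool ExampleFullTypeChecks() {
  FullTypeDef tensor_of_float;
  tensor_of_float.set_type_id(TFT_TENSOR);
  tensor_of_float.add_args()->set_type_id(TFT_FLOAT);

  FullTypeDef copy = tensor_of_float;
  const uint64_t hash = full_type::Hash(tensor_of_float);  // e.g. a cache key
  return full_type::IsEqual(tensor_of_float, copy) &&
         hash == full_type::Hash(copy);
}

}  // namespace tensorflow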
+inline bool IsHostMemoryType(const FullTypeDef& t) { + switch (t.type_id()) { + case TFT_TENSOR: + return IsHostMemoryType(full_type::GetArgDefaultAny(t, 0)); + case TFT_ARRAY: + return IsHostMemoryType(full_type::GetArgDefaultAny(t, 0)); + case TFT_DATASET: + return true; + case TFT_MUTEX_LOCK: + return true; + case TFT_RAGGED: + return IsHostMemoryType(full_type::GetArgDefaultAny(t, 0)); + case TFT_STRING: + return true; + case TFT_ITERATOR: + return IsHostMemoryType(full_type::GetArgDefaultAny(t, 0)); + case TFT_OPTIONAL: + return IsHostMemoryType(full_type::GetArgDefaultAny(t, 0)); + case TFT_PRODUCT: + for (int i = 0; i < t.args_size(); i++) { + if (IsHostMemoryType(full_type::GetArgDefaultAny(t, i))) { + return true; + } + } + return false; + default: + return false; + } +} + +} // namespace full_type + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FULL_TYPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/function.h b/third_party/tflite-hdrs/tensorflow/core/framework/function.h new file mode 100644 index 00000000..8c77af38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/function.h @@ -0,0 +1,1260 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_ + +#include +#include +#include +#include +#include + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#include "xla/tsl/protobuf/error_codes.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/optimized_function_graph.pb.h" +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph_debug_info_builder.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/stack_frame.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/protobuf/config.pb.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" +#endif // IS_MOBILE_PLATFORM + +namespace tensorflow { + +class CollectiveExecutor; +class DeviceSet; +class Graph; +class GraphDef; +class OpKernel; +class ProcessFunctionLibraryRuntime; +class ResourceMgr; +class Rendezvous; +class ScopedStepContainer; +class StepStatsCollectorInterface; +class Node; + +// FunctionDefHelper::Create is a convenient helper to construct a +// FunctionDef proto. +// E.g., +// FunctionDef my_func = FunctionDefHelper::Create( +// "my_func_name", +// {"x:T", "y:T" /* one string per argument */}, +// {"z:T" /* one string per return value */}, +// {"T: {float, double}" /* one string per attribute */}, +// { +// {{"o"}, "Mul", {"x", "y"}, {{"T", "$T"}}} +// /* one entry per function node */ +// }, +// /* Mapping between function returns and function node outputs. */ +// {{"z", "o:z"}}); +// +// For the old Function::Node approach, use FunctionDefHelper::Define() +// E.g., +// FunctionDef my_func = FunctionDefHelper::Define( +// "my_func_name", +// {"x:T", "y:T" /* one string per argument */}, +// {"z:T" /* one string per return value */}, +// {"T: {float, double}" /* one string per attribute */}, +// { +// {{"z"}, "Mul", {"x", "y"}, {{"T", "$T"}}} +// /* one entry per function node */ +// }); +class FunctionDefHelper { + public: + // AttrValueWrapper has copy constructors for the type T so that + // it's easy to construct a simple AttrValue proto. + // + // If T is a string type (const char*, string, or StringPiece), and + // it starts with "$", we construct a AttrValue of "placeholder". + // + // E.g., + // std:: x = {"T", "$T"} + // is a named attr value placeholder. 
+ struct AttrValueWrapper { + AttrValue proto; + + AttrValueWrapper() {} + + template + AttrValueWrapper(T val) { // NOLINT(runtime/explicit) + SetAttrValue(val, &proto); + } + + private: + void InitFromString(absl::string_view val); + }; + + // Constructs an AttrValue.func given the "name" and "attrs". + static AttrValueWrapper FunctionRef( + const std::string& name, + absl::Span> attrs); + static AttrValueWrapper FunctionRef(const std::string& name) { + return FunctionRef(name, {}); + } + + // Node is used to construct FunctionDef.Node using initialization + // lists. E.g., + // Node n = {{"z"}, "Mul", {"x", "y"}, {{"T", "$T"}}}; // z = x * y + // + // If the op has no inputs, then name is be specified. + // Node n = {{}, "AssignVariable", {"resource", "val"}, {{"dtype", + // "DT_FLOAT"}, + // {"update0"}, "CPU:0", "update1"}} + struct Node { + // When constructing a NodeDef, the first entry in ret is used as + // the node name, the remaining values are ignored. + std::vector ret; + std::string op; + std::vector arg; + std::vector> attr; + std::vector dep; + std::string device; + + // Required if the op has zero outputs. Otherwise, ret[0] used as name if + // name is left empty. + std::string name; + + std::string GetName() const { + if (!name.empty()) return name; + CHECK(!ret.empty()); + return ret[0]; + } + std::vector original_node_names; + std::vector original_func_names; + + NodeDef ToNodeDef() const; + }; + + // Creates a FunctionDef from the given parameters. Node inputs must use + // function encoding (node_name:output_name[:output_index]). + // - `ret_def` holds a mapping from the function output names from `out_def` + // to the node outputs from `node_def`. + // - `control_ret_def` holds a mapping from the function control + // output names to the nodes from `node_def`. + static FunctionDef Create( + const std::string& function_name, absl::Span in_def, + absl::Span out_def, absl::Span attr_def, + absl::Span node_def, + absl::Span> ret_def, + absl::Span> control_ret_def); + + // Creates a FunctionDef from the given parameters. Node inputs must use + // function encoding (node_name:output_name[:output_index]). + // - `ret_def` holds a mapping from the function output names from `out_def` + // to the node outputs from `node_def`. + static FunctionDef Create( + const std::string& function_name, absl::Span in_def, + absl::Span out_def, absl::Span attr_def, + absl::Span node_def, + absl::Span> ret_def); + + // TODO(josh11b): Get rid of these and transition to the one above. + static FunctionDef Define(const std::string& function_name, + absl::Span arg_def, + absl::Span ret_def, + absl::Span attr_def, + absl::Span node_def); + + // Defines an anonymous function. I.e., its name is not relevant. + static FunctionDef Define(absl::Span arg_def, + absl::Span ret_def, + absl::Span attr_def, + absl::Span node_def); + + // Helpers to construct a constant scalar. 
+ template + static Node Const(const std::string& name, const T& val) { + Node n = {{name}, "Const"}; + const DataType dtype = DataTypeToEnum::value; + n.attr.push_back({"dtype", dtype}); + Tensor t(dtype, TensorShape({})); + t.scalar()() = val; + n.attr.push_back({"value", t}); + return n; + } + + template + static Node Const(const std::string& name, gtl::ArraySlice vals) { + Node n = {{name}, "Const"}; + const DataType dtype = DataTypeToEnum::value; + n.attr.push_back({"dtype", dtype}); + int64_t num = vals.size(); + Tensor t(dtype, TensorShape({num})); + for (size_t i = 0; i < vals.size(); ++i) { + t.flat()(i) = vals[i]; + } + n.attr.push_back({"value", t}); + return n; + } +}; + +template <> +inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper(const char* val) { + InitFromString(val); +} + +template <> +inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper( + const std::string& val) { + InitFromString(val); +} + +template <> +inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper( + absl::string_view val) { + InitFromString(val); +} + +// Instantiate a function. +// +// "fdef" encodes a TF function with some attrs in fdef.signature.attr +// containing placeholders. InstantiateFunction binds these +// placeholders and produces an instantiated function encoded in +// "result.gdef". The value to substitute a placeholder is given by +// "attr_values", which is a map from a placeholder name to an attr +// value. +// +// InstantiateFunction calls "get_function" to find signatures of other +// functions and primitive ops. + +// GetFunctionSignature(func name, opdef) returns OK if the func name is found +// and opdef is filled with a pointer to the corresponding signature +// (a OpDef proto). Otherwise, returns an error. +typedef std::function + GetFunctionSignature; + +struct InstantiationResult { + DataTypeVector arg_types; + DataTypeVector ret_types; + std::vector nodes; +}; +absl::Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values, + GetFunctionSignature get_function, + InstantiationResult* result); + +// Returns a debug string for a function definition. +// +// The returned text is multiple-line. It is intended to be +// human-readable rather than being friendly to parsers. It is _NOT_ +// intended to be the canonical string representation of "func_def". +// Particularly, it may not include all information presented in +// "func_def" (e.g., comments, description of the function arguments, +// etc.) +std::string DebugString(const FunctionDef& func_def); +std::string DebugString(const GraphDef& instantiated_func_def); +std::string DebugString(absl::Span instantiated_func_nodes); + +// Returns a debug string for a top level graph (the main program and +// its supporting functions defined in its library). +std::string DebugStringWhole(const GraphDef& gdef); + +// Returns true if f1 == f2. Compares all fields, including descriptions. Order +// of NodeDefs doesn't matter. +bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2); + +// Return a hash of `fdef` that is consistent with FunctionDefsEqual method. +// In other words, if two fdefs compare equal, their hash values will be the +// same. 
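The comment block at the top of FunctionDefHelper already shows the initializer-list shape; as a slightly more concrete sketch, the function below builds z = x * 2 with Create() and the scalar Const helper declared above. The function name and node names are illustrative only.

#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/types.pb.h"

namespace tensorflow {

// Sketch only: z = x * 2, assembled with Create() and the scalar Const
// helper. Node outputs use the "node_name:output_name:index" encoding
// described in the Create() comment above.
FunctionDef ExampleTimesTwo() {
  return FunctionDefHelper::Create(
      "ExampleTimesTwo",
      /*in_def=*/{"x: float"},
      /*out_def=*/{"z: float"},
      /*attr_def=*/{},
      /*node_def=*/
      {
          FunctionDefHelper::Const<float>("two", 2.0f),
          {{"mul"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}},
      },
      /*ret_def=*/{{"z", "mul:z:0"}});
}

}  // namespace tensorflow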
+uint64 FunctionDefHash(const FunctionDef& fdef); + +class CallFrameInterface { + public: + virtual ~CallFrameInterface() {} + + virtual size_t num_args() const = 0; + virtual size_t num_retvals() const = 0; + + virtual absl::Status GetArg(int index, const Tensor** val) = 0; + + // Optimized implementation of `GetArg()` that allows the caller to take + // ownership of the tensor. This method may only be called once per + // value of `index` and `CallFrameInterface` instance. + // + // REQUIRES: `this->CanConsumeArg(index) == true`. + virtual void ConsumeArg(int index, Tensor* val) { + LOG(ERROR) << "This `CallFrameInterface` implementation does not support " + "consuming arguments."; + } + virtual bool CanConsumeArg(int index) const { return false; } + + virtual absl::Status SetRetval(int index, const Tensor& val) = 0; +}; + +// Represents a function call frame. I.e., the data structure used to +// pass arguments to a function and retrieve its results. +// +// Runtime must arrange accesses to one FunctionCallFrame s.t. +// 1. SetArgs() happens before any GetArg(); +// 2. GetRetvals happens after all SetRetval(); +class FunctionCallFrame : public CallFrameInterface { + public: + FunctionCallFrame(DataTypeSlice arg_types, DataTypeSlice ret_types); + ~FunctionCallFrame() override; + + // Caller methods. + absl::Status SetArgs(absl::Span args); + absl::Status GetRetvals(std::vector* rets) const; + + // Moves the return values from the frame to rets. If allow_dead_tensors is + // false it will fail if any of the retvals do not have a value. + absl::Status ConsumeRetvals(std::vector* rets, + bool allow_dead_tensors); + + size_t num_args() const override { return arg_types_.size(); } + size_t num_retvals() const override { return ret_types_.size(); } + + // Callee methods. + absl::Status GetArg(int index, const Tensor** val) override; + absl::Status SetRetval(int index, const Tensor& val) override; + + private: + DataTypeVector arg_types_; + DataTypeVector ret_types_; + absl::InlinedVector args_; + struct Retval { + bool has_val = false; + Tensor val; + }; + absl::InlinedVector rets_; + + FunctionCallFrame(const FunctionCallFrame&) = delete; + void operator=(const FunctionCallFrame&) = delete; +}; + +// Map of function names to StackTracesMaps. +using FunctionDefLibraryStackTraces = + absl::flat_hash_map; + +// Holds Function information that can be shared in multiple places. +// FunctionRecord must be explicitly finalized before being saved in +// FunctionLibraryDefinition or any other place that expects immutability. +class FunctionRecord : public core::RefCounted { + public: + FunctionRecord(const FunctionDef& fdef, const StackTracesMap& stack_traces, + bool finalized); + FunctionRecord(FunctionDef&& fdef, StackTracesMap&& stack_traces, + bool finalized); + + // Mark FunctionRecord as finalized (disable mutation). + void finalize(); + + // Get a mutable reference to the FunctionDef owned by the record. + // Will fail if record is finalized. + absl::StatusOr mutable_fdef(); + + // Get an immutable access to FunctionRecord properties. + const FunctionDef& fdef() const; + const StackTracesMap& stack_traces() const; + const OpRegistrationData& op_registration_data() const; + const bool finalized() const; + + private: + bool finalized_ = false; + + FunctionDef fdef_; + const StackTracesMap stack_traces_; + const OpRegistrationData op_registration_data_; +}; + +// Helper to maintain a map between function names in a given +// FunctionDefLibrary and function definitions. 
+// +// This class is thread-safe. +class FunctionLibraryDefinition : public OpRegistryInterface { + public: + // Ops created for function arguments bear the name given by `kArgOp`; those + // created for return values bear the name given by `kRetOp`. + static constexpr const char* const kArgOp = "_Arg"; + static constexpr const char* const kDeviceArgOp = "_DeviceArg"; + static constexpr const char* const kRetOp = "_Retval"; + static constexpr const char* const kDeviceRetOp = "_DeviceRetval"; + static constexpr const char* const kIntsOnDeviceAttr = + "experimental_ints_on_device"; + static constexpr const char* const kSharedRendezvousAttr = + "shared_rendezvous"; + + static constexpr const char* const kGradientOp = "SymbolicGradient"; + static constexpr const char* const kFuncAttr = "f"; + + // Note: This constructor grabs `lib_def`'s lock in shared mode. + FunctionLibraryDefinition(const FunctionLibraryDefinition& lib_def); + explicit FunctionLibraryDefinition( + const OpRegistryInterface* default_registry, + const FunctionDefLibrary& lib_def = {}, + const FunctionDefLibraryStackTraces& library_traces = {}); + FunctionLibraryDefinition(const OpRegistryInterface* default_registry, + const GraphDef& graph_def); + ~FunctionLibraryDefinition() override; + + FunctionLibraryDefinition& operator=(const FunctionLibraryDefinition&) = + delete; + FunctionLibraryDefinition& operator=(FunctionLibraryDefinition&& other); + + // Returns True if the library contains `func`, False otherwise. + bool Contains(const std::string& func) const TF_LOCKS_EXCLUDED(mu_); + + // Returns nullptr if "func" is not defined in "lib_def". Otherwise, + // returns its definition proto. + // + // NB: This function returns a borrowed pointer, which can be invalidated by a + // subsequent call to `ReplaceFunction()` with the given name. + const FunctionDef* Find(const std::string& func) const TF_LOCKS_EXCLUDED(mu_); + + // Returns nullptr if "func" is not defined in "lib_def". Otherwise, + // returns a strong reference pointer to the FunctionRecord in the library. + core::RefCountPtr FindRecord(const std::string& func) const + TF_LOCKS_EXCLUDED(mu_); + + // Adds function definition 'fdef' to this function library. + // Returns status 'ok' on success, or error otherwise. This is a no-op if + // 'fdef' already exists in this function library. + // If 'fdef' is successfully added to the library, it will be accessible + // from 'LookUp' and included in the proto returned by 'ToProto'. + // This operation is atomic. + // + // Associates `graph` with a function `func_name`. Lifetime assumption: + // `graph` has to outlive all instantiated graphs. + absl::Status AddFunctionDef(const FunctionDef& fdef, + const StackTracesMap& stack_traces = {}) + TF_LOCKS_EXCLUDED(mu_); + absl::Status AddFunctionDef(FunctionDef&& fdef, + StackTracesMap&& stack_traces = {}) + TF_LOCKS_EXCLUDED(mu_); + absl::Status AddFunctionRecord(core::RefCountPtr record) + TF_LOCKS_EXCLUDED(mu_); + + // Adds gradient definition 'grad' to this function library. + // This is a no-op if 'grad' already exists in this function library. + // If 'grad' is successfully added, it will be accessible via 'FindGradient' + // and included in the proto returned by 'ToProto'. + // This operation is atomic. + absl::Status AddGradientDef(const GradientDef& grad) TF_LOCKS_EXCLUDED(mu_); + + // Replaces the function corresponding to `func` with `fdef`. Returns + // a non-OK status if "func" was not found in the library, OK otherwise. 
+ // Please be careful when replacing function: make sure all previous pointers + // returned by `Find()` are no longer in use. + absl::Status ReplaceFunction(const std::string& func, const FunctionDef& fdef, + const StackTracesMap& stack_traces = {}) + TF_LOCKS_EXCLUDED(mu_); + + // Replaces the gradient corresponding to `grad.function_name()`. Returns + // a non-OK status if "grad.function_name()" was not found in the library, OK + // otherwise. + absl::Status ReplaceGradient(const GradientDef& grad) TF_LOCKS_EXCLUDED(mu_); + + // Removes the function corresponding to 'func'. Returns a non-OK status if + // 'func' was not found in the library, OK otherwise. + // Please be careful when removing function: make sure there are no other + // nodes using the function, and all previous pointers returned by `Find()` + // are no longer in use. + absl::Status RemoveFunction(const std::string& func) TF_LOCKS_EXCLUDED(mu_); + + // Removes all the functions and gradient functions. + void Clear() TF_LOCKS_EXCLUDED(mu_); + + // Adds the functions and gradients in 'other' to this function library. + // Duplicate functions and gradients are ignored. + // This operation is atomic. + absl::Status AddLibrary(const FunctionLibraryDefinition& other) + TF_LOCKS_EXCLUDED(mu_); + absl::Status AddLibrary(FunctionLibraryDefinition&& other) + TF_LOCKS_EXCLUDED(mu_); + + // Adds the functions and gradients in 'lib_def' to this function library. + // Duplicate functions and gradients are ignored. This overload adds the + // functions with no stack traces. This operation is atomic. + absl::Status AddLibrary(const FunctionDefLibrary& lib_def) + TF_LOCKS_EXCLUDED(mu_); + absl::Status AddLibrary(FunctionDefLibrary&& lib_def) TF_LOCKS_EXCLUDED(mu_); + + // Adds the functions and gradients in 'lib_def' to this function library. + // Duplicate functions and gradients are ignored. + // This operation is atomic. + absl::Status AddLibrary(const FunctionDefLibrary& lib_def, + const FunctionDefLibraryStackTraces& library_traces) + TF_LOCKS_EXCLUDED(mu_); + absl::Status AddLibrary(FunctionDefLibrary&& lib_def, + const FunctionDefLibraryStackTraces& library_traces) + TF_LOCKS_EXCLUDED(mu_); + + // If the gradient function for 'func' is specified explicitly in + // the library, returns the gradient function name. Otherwise, + // returns an empty string. + std::string FindGradient(const std::string& func) const + TF_LOCKS_EXCLUDED(mu_); + + // OpRegistryInterface method. Useful for constructing a Graph. + // + // If "op" is defined in the library, returns its signature. + // Otherwise, assume "op" is a primitive op and returns its op + // signature and shape inference function. + // + // NB: This function outputs a borrowed pointer, which can be invalidated by a + // subsequent call to `ReplaceFunction()` with the given name. + absl::Status LookUp(const std::string& op_type_name, + const OpRegistrationData** op_reg_data) const override + TF_LOCKS_EXCLUDED(mu_); + + // Generates new function name with the specified prefix that is unique + // across this library. + std::string UniqueFunctionName(absl::string_view prefix) const + TF_LOCKS_EXCLUDED(mu_); + + // Given a node def 'ndef', inspects attributes of the callee + // function to derive the attribute 'value' for 'attr'. Returns OK + // iff the attribute is given by the function's definition. + // TODO(irving): Remove; keep only the const Node& version. 
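A rough sketch of the add-then-look-up flow documented above: construct a FunctionLibraryDefinition over the global op registry, add one FunctionDef (for instance the ExampleTimesTwo sketch earlier, or any real one), and fetch it back with Find(). The wrapper function name is illustrative.

#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/op.h"

namespace tensorflow {

// Sketch only: Find() returns a borrowed pointer that a later
// ReplaceFunction()/RemoveFunction() call can invalidate, per the notes above.
bool ExampleRegisterAndFind(const FunctionDef& fdef) {
  FunctionLibraryDefinition lib_def(OpRegistry::Global());
  if (!lib_def.AddFunctionDef(fdef).ok()) return false;
  return lib_def.Find(fdef.signature().name()) != nullptr;
}

}  // namespace tensorflow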
+ template + absl::Status GetAttr(const NodeDef& ndef, const std::string& attr, + T* value) const; + + // Given a node, inspects attributes of the callee function to derive the + // attribute 'value' for 'attr'. Returns OK iff the attribute is given by the + // function's definition. + template + absl::Status GetAttr(const Node& node, const std::string& attr, + T* value) const; + + // Returns a proto representation of the state of this function library. + FunctionDefLibrary ToProto() const TF_LOCKS_EXCLUDED(mu_); + + size_t num_functions() const TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + return records_.size(); + } + + // Returns all the function names in the FunctionLibraryDefinition. + std::vector ListFunctionNames() const TF_LOCKS_EXCLUDED(mu_); + + const OpRegistryInterface* default_registry() const { + return default_registry_; + } + void set_default_registry(const OpRegistryInterface* registry) { + default_registry_ = registry; + } + + // Returns a copy of `*this` with only the subset of functions that are + // reachable from the nodes of `graph` or `func`. + FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const; + FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const; + FunctionLibraryDefinition ReachableDefinitions(const Graph& graph) const; + absl::StatusOr ReachableDefinitions( + const std::string& function_name) const; + + // Copies the function named `func` from `other` to this + // FunctionLibraryDefinition. + // REQUIRES: `this->default_registry() == other.default_registry()`. + // Returns OK on success, or error otherwise. This is a no-op if a function + // name `func` already exists in this function library, and has the same + // implementation as in `other`. If the implementations conflict, an invalid + // argument error is returned. + absl::Status CopyFunctionDefFrom(const std::string& name, + const FunctionLibraryDefinition& other); + + // Returns graph with debug stack traces for the given function, or `nullptr` + // if none found. + const StackTracesMap* GetStackTraces(const std::string& func_name) const { + core::RefCountPtr entry = FindRecord(func_name); + if (entry.get() != nullptr) { + return &entry->stack_traces(); + } + return nullptr; + } + + // Adds or updates an OptimizedFunctionGraph. Key is `function_name`. + // + // NOTE: This overload will lead to a copy of a potentially large graph + // being stored in memory for the lifetime of the library. Using the lazy + // `creator` function overload is recommended in new code. + ABSL_DEPRECATED("Use the lazy `creator` function overload in new code.") + void AddOptimizedFunctionGraph(const std::string& function_name, + const OptimizedFunctionGraph& graph) + TF_LOCKS_EXCLUDED(mu_) { + std::function()> creator = + [graph]() { return graph; }; + AddOptimizedFunctionGraph(function_name, std::move(creator)); + } + + // Adds or updates an OptimizedFunctionGraph, using a `creator` that can + // lazily build or load the graph on demand. Key is `function_name`. + void AddOptimizedFunctionGraph( + const std::string& function_name, + std::function()> creator) + TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + optimized_function_graph_creator_map_.emplace(function_name, + std::move(creator)); + } + + // Look up for OptimizedFunctionGraph given `function_name`. Returns nullopt + // if not found. 
+ std::optional> + FindOptimizedFunctionGraph(const std::string& function_name) const + TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + if (auto it = optimized_function_graph_creator_map_.find(function_name); + it != optimized_function_graph_creator_map_.end()) { + return it->second(); + } + return std::nullopt; + } + + // Creates a map of function names to stack traces for a FunctionDefLibrary. + static FunctionDefLibraryStackTraces CreateStackTracesForFunctionDefLibrary( + const FunctionDefLibrary& library, const GraphDebugInfo& debug_info); + + private: + void Initialize(const FunctionDefLibrary& library, + const FunctionDefLibraryStackTraces& library_traces); + + core::RefCountPtr FindHelper(const string& func) const + TF_SHARED_LOCKS_REQUIRED(mu_); + std::string FindGradientHelper(const std::string& func) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + absl::Status AddHelper(FunctionRecord* registration, bool* added) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Same as AddFunctionDef/AddGradientDef except these methods set + // `added` to true if the `fdef`/`grad` were actually added to this. + absl::Status AddFunctionDefHelper(FunctionDef&& fdef, + StackTracesMap&& stack_traces, bool* added) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + absl::Status AddGradientDefHelper(const GradientDef& grad, bool* added) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Helper function for GetAttr. Returns the FunctionDef* to get the + // attr from. + const FunctionDef* GetAttrImpl(const NodeDef& ndef) const + TF_LOCKS_EXCLUDED(mu_); + + // Remove all functions in `funcs` and all gradients of functions in + // `funcs_with_grads` from this library. + absl::Status Remove(const std::vector& funcs, + const std::vector& funcs_with_grads) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Remove `func` from the library. Returns non-OK Status unless `func` is in + // the library. This should only be called when there is a guarantee that the + // function being removed hasn't been retrieved with `Find`. + absl::Status RemoveFunctionHelper(const std::string& func) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Remove gradient of function `func` from the library. Returns non-OK Status + // unless `func` has a gradient. + absl::Status RemoveGradient(const std::string& func) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + mutable mutex mu_; + const OpRegistryInterface* default_registry_; + gtl::FlatMap records_ TF_GUARDED_BY(mu_); + gtl::FlatMap func_grad_ TF_GUARDED_BY(mu_); + // Maps from function name to optimized function graph. + gtl::FlatMap()>> + optimized_function_graph_creator_map_ TF_GUARDED_BY(mu_); +}; + +// Forward declare. Defined in common_runtime/function.h +struct FunctionBody; + +// Forward declare. Defined in common_runtime/device.h +class Device; +// Forward declare. Defined in common_runtime/device_mgr.h +class DeviceMgr; + +// Index of an _Arg node. +struct FunctionArgIndex { + explicit FunctionArgIndex(const int index) : index(index) {} + FunctionArgIndex(const int index, const int sub_index) + : index(index), sub_index(sub_index) {} + + // The value of the attribute "Index" of the _Arg node. + int index; + // Set only when the _Arg node represents multiple arguments (e.g. an _Arg + // node is replicated to multiple devices/subgraphs). Use sub-index to + // distinguish arguments with the same index. + int sub_index = -1; +}; + +class FunctionLibraryRuntime : public core::WeakRefCounted { + public: + ~FunctionLibraryRuntime() override {} + + // Instantiate a function with the given "attrs". 
+ // + // Returns OK and fills in "handle" if the instantiation succeeds. + // Otherwise returns an error and "handle" is undefined. + struct InstantiateOptions { + // The canonical device name of the device on which the function + // should be instantiated. If empty, the function will be + // instantiated on the local device. + std::string target; + + // Should the function be instantiated as a multi-device function? + bool is_multi_device_function = false; + + // If true, graph passes will be skipped when instantiating the function + // since they have already run on the main function side. + bool is_component_function = false; + + // For multi-device functions, a vector of canonical device names for + // function's inputs. The device of resource inputs must be the device + // backing the resource, not the CPU device backing the resource handle. + // Must have the same length as number of inputs to the function. + std::vector input_devices; + + // For multi-device functions, a vector of canonical device names for + // function's outputs. + // + // (a) If specified (must have the same length as number of outputs): + // + // Specified devices will be assigned to Retval nodes inserted into the + // function body graph in place of function outputs. It is allowed to + // specify output device as empty string, in this case Retval device + // assignment will be inferred later when function graph will be placed + // before partitioning (this is required for resource outputs). Placer will + // respect colocation constraints. + // + // (b) If not specified: + // + // Function runtime will infer Retval device by following input edges, until + // it will reach a node with a device specification. This device + // specification must identify a unique device, i.e. a general specification + // like "job:foo" matching multiple devices will result in an error. + // + // IMPORTANT: Resource outputs + // + // Multi device functions might return resources on a devices different from + // the function call device. If output device is not specified for the + // resource output, and node producing that resource is a function call, + // runtime will leave device specification empty and will rely on Placer to + // infer correct device. + std::vector output_devices; + + // If set, it indicates the original output indices of a component function. + absl::optional> ret_indices = absl::nullopt; + + // Maps from a CompositeDevice name to a list of underlying physical + // devices. + absl::flat_hash_map*> composite_devices; + + // This interface is EXPERIMENTAL and subject to change. + // + // For multi-device functions, a mapping from _Arg node index to type and + // shape for input resources. + // REQUIRES: if input_resource_dtypes_and_shapes.count(i) > 0 then i-th + // argument type must be DT_RESOURCE. + std::unordered_map + input_resource_dtypes_and_shapes; + + // This interface is EXPERIMENTAL and subject to change. + // + // If non-null, the runtime will use `lib_def` to resolve function(s) named + // in `function_name` and `attrs`. Otherwise, the runtime will use its + // internal library. + // + // NOTE(mrry): If provided, all functions defined in `lib_def` must be + // self-contained, and cannot refer to functions defined in other libraries. + const FunctionLibraryDefinition* lib_def = nullptr; + + // This interface is EXPERIMENTAL and subject to change. + // + // If non-empty, the runtime will use `state_handle` to identify + // cached state related the instantiated function. 
Two functions + // of the same name and attrs, instantiated with the same + // `state_handle` will have the same handle and share the same + // state (in stateful kernels); and two functions with different + // values for `state_handle` will have independent state. + std::string state_handle; + + // This interface is EXPERIMENTAL and subject to change. + // + // Instantiates the function using an executor of the given type. If empty, + // the default TensorFlow executor will be used. + std::string executor_type; + + // If true, the runtime will attempt to create kernels for the function at + // instantiation time, rather than on the first run. This can be used to + // surface errors earlier. + bool create_kernels_eagerly = false; + + // This interface is EXPERIMENTAL and subject to change. + // + // Instantiates the function with the provided config_proto. + ConfigProto config_proto; + + // If provided, this optimization function will be invoked before + // the placer for multi-device functions. + std::function /*ret_node_names*/, + std::vector /*keep_node_names*/, + FunctionLibraryDefinition*, const DeviceSet&, + Device* /*cpu_device*/, std::unique_ptr*)> + optimize_graph_fn; + + // If set, partitioned functions will be added to `graph_collector`. + // `graph_collector` must be alive during the call to Instantiate. + GraphCollector* graph_collector = nullptr; + + // Indicates whether the multi-device function backend should default the + // placement of ops without request device to `target`. + bool default_device_to_target = true; + + // If true, the optimized Graph will be stored so that + // `FunctionLibraryRuntime::DebugString(handle)` contains the optimized + // Graph. Otherwise, the unoptimized function Graph will be returned. + bool include_optimized_graph_in_debug_string = false; + + // If true, the function library runtime cache the function instantiation. + bool use_function_cache = false; + + // This interface is EXPERIMENTAL and subject to change. + // + // If True, allow optimizations which should be targeted at a limited + // set of small functions. For example, running kernels synchronously can + // be faster under some conditions. + bool allow_small_function_optimizations = false; + + // This interface is EXPERIMENTAL and subject to change. + // + // If True, allow graphs containing control flow nodes to be run on the + // single threaded executor. + bool allow_control_flow_sync_execution = false; + + // TODO(b/176491312): Remove this if shape inference on import flag is + // removed. If True, allows mlir roundtrip to run shape inference on import. + bool shape_inference_on_tfe_dialect_import = true; + + // Force int32 _Arg and _Retvals nodes to be left on device instead of + // pinning to host. + // + // Note that we do not pin int32 nodes to host for subgraphs running in + // TPU/XLA devices. So this is mainly used to handle the case of multi-CPU + // and GPU (non-XLA) graphs. + bool int_args_and_retvals_on_device = false; + + // This interface is EXPERIMENTAL and subject to change. + // + // Instantiates the function for XLA compilation on device_type. If empty, + // function is not compiled. + std::string xla_compile_device_type; + + // This interface is EXPERIMENTAL and subject to change. + // + // Instantiates the function enabling soft placement or outside compilation. 
+ bool allow_soft_placement = false; + }; + typedef uint64 Handle; + virtual absl::Status Instantiate(const std::string& function_name, + AttrSlice attrs, + const InstantiateOptions& options, + Handle* handle) = 0; + absl::Status Instantiate(const std::string& function_name, AttrSlice attrs, + Handle* handle) { + auto opts = absl::make_unique(); + return Instantiate(function_name, attrs, *opts, handle); + } + + // Releases state associated with the handle. + virtual absl::Status ReleaseHandle(Handle handle) = 0; + + // Returns the function body for the instantiated function given its + // handle 'h'. Returns nullptr if "h" is not found. + // + // *this keeps the ownership of the returned object, which remains alive + // as long as *this. + virtual const FunctionBody* GetFunctionBody(Handle h) = 0; + + // Returns the return types for the function identified by handle `h`. + virtual absl::Status GetRetTypes(Handle h, DataTypeVector* ret_types) = 0; + + // Asynchronously invokes the instantiated function identified by + // "handle". + // + // If function execution succeeds, "done" is called with OK and + // "*rets" is filled with the function's return values. Otherwise, + // "done" is called with an error status. + // + // Does not take ownership of "rets". + // In the cross-process scenario, runner isn't used for making the Async + // RPC calls. + struct Options { + Options() {} + explicit Options(const int64_t step_id) : step_id(step_id) {} + + // Choose a step ID that is guaranteed not to clash with any + // Session-generated step ID. DirectSession only generates + // non-negative step IDs (contiguous, starting from 0), and + // MasterSession generates 56-bit random step IDs whose MSB is + // always 0, so a negative random step ID should suffice. + const int64_t step_id = -std::abs(static_cast(random::New64())); + + // op_id of the function running in eager mode. Set when we want to copy + // remote outputs lazily. All components of a remote multi-device function + // should use the same op_id, in order to correctly map remote output + // tensors to the remote TensorHandles in the default device. + absl::optional op_id = absl::nullopt; + + // Not owned. Caller makes sure that the rendezvous outlives this Options. + RendezvousInterface* rendezvous = nullptr; + CancellationManager* cancellation_manager = nullptr; + CollectiveExecutor* collective_executor = nullptr; + ScopedStepContainer* step_container = nullptr; + StepStatsCollectorInterface* stats_collector = nullptr; + tsl::CoordinationServiceAgent* coordination_service_agent = nullptr; + + absl::optional stack_trace = absl::nullopt; + + std::function)>* runner = nullptr; + + // Parameters for remote function execution. + bool remote_execution = false; + std::string source_device = ""; // Fully specified device name. + + // Allocator attributes specifying where the args are / rets should be put. + // These should either be {} or match the length of args / retvals. If {}, + // the default allocator attributes will be assumed for all args / retvals. + std::vector args_alloc_attrs; + std::vector rets_alloc_attrs; + + // If true, we create a new IntraProcessRendezvous, else use the existing + // one. + bool create_rendezvous = false; + + // If True, allow returning dead tensors. + bool allow_dead_tensors = false; + + // If True, hint that all kernels should be treated as "inexpensive", and + // hence executed on the scheduling thread. + bool run_all_kernels_inline = false; + + // If not null, use this thread pool for intra op scheduling. 
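A rough usage sketch of the interface declared above. The function name "MyFunction" and the wrapper are assumptions; Instantiate, RunSync, and ReleaseHandle are the methods of this class, and the Options fields left at their defaults (runner, rendezvous, and the thread pool that continues just below) are documented inline:

// Sketch: instantiate a function by name on the local device and run it once.
absl::Status RunMyFunctionOnce(FunctionLibraryRuntime* flr, const Tensor& arg,
                               std::vector<Tensor>* rets) {
  FunctionLibraryRuntime::InstantiateOptions i_opts;
  i_opts.create_kernels_eagerly = true;  // surface kernel errors at instantiation
  FunctionLibraryRuntime::Handle handle;
  TF_RETURN_IF_ERROR(flr->Instantiate("MyFunction", AttrSlice(), i_opts, &handle));

  FunctionLibraryRuntime::Options run_opts;  // step_id defaults to a random negative id
  std::vector<Tensor> args = {arg};
  absl::Status run_status = flr->RunSync(run_opts, handle, args, rets);
  TF_RETURN_IF_ERROR(flr->ReleaseHandle(handle));
  return run_status;
}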
+ thread::ThreadPoolInterface* user_intra_op_threadpool = nullptr; + + // Returns a human readable representation of this. + std::string DebugString() const; + }; + typedef std::function DoneCallback; + virtual void Run(const Options& opts, Handle handle, + absl::Span args, std::vector* rets, + DoneCallback done) = 0; + virtual void Run(const Options& opts, Handle handle, + CallFrameInterface* call_frame, DoneCallback done) = 0; + + virtual absl::Status RunSync(Options opts, Handle handle, + absl::Span args, + std::vector* rets) = 0; + virtual absl::Status RunSync(Options opts, Handle handle, + CallFrameInterface* call_frame) = 0; + + // Creates a "kernel" for the given NodeProperties "props". + // + // If succeeds, returns OK and the caller takes the ownership of the + // returned "*kernel". Otherwise, returns an error. + virtual absl::Status CreateKernel( + const std::shared_ptr& props, + OpKernel** kernel) = 0; + + // Returns true iff the function named `function_name` is stateful. + // + // NOTE(mrry): This method assumes that the runtime is associated with a + // default function library, and looks up `function_name` in that library. + // It does not support overriding the function library. + virtual bool IsStateful(const std::string& function_name) const = 0; + + // Returns the device on which the function executes. + virtual Device* device() = 0; + virtual const Device* device() const = 0; + + // Returns the default runner in which the ops should be launched. If the + // device on which the function executes has a private thread pool, return + // runner on the device local thread pool. + virtual std::function)>* runner() = 0; + + // Get the DeviceMgr from which the device was obtained. + virtual const DeviceMgr* device_mgr() const = 0; + + // Returns the function library definition that backs this runtime. + // + // NOTE(mrry): The returned library definition is the default function library + // for this runtime. The caller may override the function library used by the + // runtime to instantiate functions, which will not be reflected in the return + // value of this function. + virtual const FunctionLibraryDefinition* GetFunctionLibraryDefinition() + const = 0; + + // Returns the environment on which the function executes. + virtual Env* env() = 0; + + // Returns the ConfigProto passed to the session used to create the function. + virtual const ConfigProto* const config_proto() = 0; + + // Returns a debug string showing the definition of the function of + // 'handle'. + virtual std::string DebugString(Handle handle) = 0; + + // Returns the graph version number. + virtual int graph_def_version() const = 0; + + typedef uint64 LocalHandle; + + // Creates a copy of ProcessFunctionLibraryRuntime (transferring ownership to + // the caller), FunctionLibraryRuntime (owned by the returned + // ProcessFunctionLibraryRuntime), FunctionLibraryDefinition (transferring + // ownership to the caller). Note that both the ProcessFunctionLibraryRuntime + // and FunctionLibraryRuntime borrow a pointer to the + // FunctionLibraryDefinition and so the FunctionLibraryDefinition should + // outlive both. + // + // The `skip_flib_def` argument controls whether the method should clone the + // FunctionLibraryDefinition (default behavior) or return an empty function + // library. The latter is used by tf.data, which manages + // FunctionLibraryDefinitions for its functions independently (and passes + // these into the FunctionLibraryRuntime through an overlay), to avoid linear + // runtime w.r.t. 
to number of functions in the current function library. + virtual absl::Status Clone( + std::unique_ptr* out_lib_def, + std::unique_ptr* out_pflr, + FunctionLibraryRuntime** out_flr, bool skip_flib_def = false) = 0; + + // Returns the name of the executor class (in the sense of + // `ExecutorFactory::GetFactory()`) that will be used based on the given + // dynamic `options` and static `attrs`. If none is specified, this method + // will return an empty string, which leaves the decision up to the runtime. + static std::string ExecutorType(const InstantiateOptions& options, + AttrSlice attrs); +}; + +// Returns the device of the `arg_index`-th function input. Update +// `composite_devices` if the input device is a composite device. +std::string GetFunctionResourceInputDevice( + const Tensor& input, const int arg_index, const FunctionDef& function_def, + absl::flat_hash_map>* composite_devices); + +// Returns a canonicalized string for the instantiation of the function of the +// given "name", attributes "attrs", and "options". +// +// The returned string is guaranteed to be stable within one address space. But +// it may be change as the implementation evolves. Therefore, it should not be +// persisted or compared across address spaces. +std::string Canonicalize( + const std::string& funcname, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options); +std::string Canonicalize(const std::string& funcname, AttrSlice attrs); + +const FunctionLibraryRuntime::Handle kInvalidHandle = -1; +const FunctionLibraryRuntime::LocalHandle kInvalidLocalHandle = -1; + +class CustomKernelCreator { + public: + virtual ~CustomKernelCreator() {} + + // Given a NodeDef 'node_def' and the function library runtime 'flr', + // validate if the class supports creating such a kernel. + virtual bool CanCreateKernel( + const FunctionLibraryRuntime& flr, + const std::shared_ptr& props) const = 0; + + // Given a supported NodeDef, returns a kernel that computes the node. + virtual absl::Status CreateKernel( + FunctionLibraryRuntime* flr, + const std::shared_ptr& props, + std::unique_ptr* kernel) const = 0; +}; + +typedef +#if !defined(IS_MOBILE_PLATFORM) + absl::variant + FunctionArg; +#else + absl::variant + FunctionArg; +#endif + +// Either a local tensor or the shape of a remote tensor. +typedef absl::variant FunctionRet; + +// Used to instantiate and run functions in a distributed system. +class DistributedFunctionLibraryRuntime { + public: + virtual ~DistributedFunctionLibraryRuntime() {} + + // Instantiate a function on a remote target specified in `options.target`, by + // sending the name and definition of the function to the remote worker. The + // local `handle` is filled for the instantiated function data and can be used + // for subsequent run function calls on the remote target. + virtual void Instantiate( + const std::string& function_name, + const FunctionLibraryDefinition& lib_def, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options, + FunctionLibraryRuntime::LocalHandle* handle, + FunctionLibraryRuntime::DoneCallback done) = 0; + + // Run an instantiated remote function (specified by `handle`) with a list of + // input Tensors in `args` and get its output Tensors in `rets`. The input + // tensor data will be sent with the function execution request, and must be + // available on the current caller side. + // opts.runner isn't used for execution. 
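The Canonicalize helpers declared above collapse (name, attrs, options) into a stable string key so repeated instantiations can be de-duplicated in a map. A self-contained sketch of the idea only, not the actual implementation (which folds in far more of InstantiateOptions):

#include <map>
#include <sstream>
#include <string>

// Builds a deterministic key from a function name, its attributes and a target
// device. Iterating a std::map keeps the key independent of insertion order.
std::string MakeInstantiationKey(const std::string& func_name,
                                 const std::map<std::string, std::string>& attrs,
                                 const std::string& target) {
  std::ostringstream key;
  key << func_name;
  for (const auto& [name, value] : attrs) {
    key << '[' << name << '=' << value << ']';
  }
  key << "@device:" << target;
  return key.str();
}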
+ virtual void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) = 0; + + // Run an instantiated remote function (specified by `handle`) with a list of + // input Tensors or RemoteTensorHandles as `args` and get its output Tensors + // or TensorShapes in `rets`. When using RemoteTensorHandles as function + // inputs or TensorShapes as outputs, the corresponding tensor data will be + // resolved on the remote worker, so it is not required to be locally + // available on the caller side. Using RemoteTensorHandle inputs is not + // supported in TensorFlow v1 runtime. + virtual void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::LocalHandle handle, + absl::Span args, + std::vector* rets, + FunctionLibraryRuntime::DoneCallback done) = 0; + + // Clean up a previously instantiated function on remote worker. + virtual void CleanUp(uint64 step_id, + FunctionLibraryRuntime::LocalHandle handle, + FunctionLibraryRuntime::DoneCallback done) = 0; + + // DeviceMgr with *all* available devices (i.e., local and remote). + virtual DeviceMgr* remote_device_mgr() const = 0; +}; + +// Extracts the actual type from "attr_values" based on its definition +// "arg_def". +// +// If "arg_def" is a N*T type, *is_type_list is set to false, and +// *dtypes is set to be a vector of size N and each element is T. +// +// If "arg_def" is a list(type), *is_type_list is set to true, and +// *dtypes is set to be a vector of types specified in attrs for +// arg_def. +// +// Otherwise (arg_def is a simple type T), *is_type_list is set to +// false, and *dtypes is set to a single element vector, whose only +// element is T. +absl::Status ArgNumType(AttrSlice attrs, const OpDef::ArgDef& arg_def, + bool* is_type_list, DataTypeVector* dtypes); + +// To register a gradient function for a builtin op, one should use +// REGISTER_OP_GRADIENT(, ); +// +// Typically, the c++ grad factory is a plan function that can be +// converted into ::tensorflow::gradient::Creator, which is +// std::function. +// +// A ::tensorflow::gradient::Creator should populate in FunctionDef* with a +// definition of a brain function which compute the gradient for the +// when the is instantiated with the given attrs. +// +// E.g., +// +// Status MatMulGrad(const AttrSlice& attrs, FunctionDef* g) { +// bool transpose_a; +// TF_RETURN_IF_ERROR(attrs.Get("transpose_a", &transpose_a)); +// bool transpose_b; +// TF_RETURN_IF_ERROR(attrs.Get("transpose_b", &transpose_b)); +// DataType dtype; +// TF_RETURN_IF_ERROR(attrs.Get("dtype", &dtype)); +// if (!transpose_a && !transpose_b) { +// *g = FunctionDefHelper::Define( +// "MatMulGrad", +// {"x:T ", "y:T", "dz:T"}, // Inputs to this function +// {"dx:T", "dy:T"}, // Outputs from this function +// {"T: {float, double}"}, // Attributes needed by this function +// { +// {{"x_t"}, "Transpose", {"x"}, {{"T", "$T"}}}, +// {{"y_t"}, "Transpose", {"y"}, {{"T", "$T"}}}, +// {{"dx"}, "MatMul", {"dz", "y_t"}, {{"T", "$T"}}}, +// {{"dy"}, "MatMul", {"x_", "dz"}, {{"T", "$T"}}}, +// }); +// } else { +// ... ... +// } +// return OkStatus(); +// } +// +// NOTE: $T is substituted with the type variable "T" when the +// gradient function MatMul is instantiated. +// +// TODO(zhifengc): Better documentation somewhere. + +// Macros to define a gradient function factory for a primitive +// operation. 
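For example, a gradient factory for a hypothetical op can be registered with the macros that follow; this is a sketch in the style of the MatMulGrad example above ("MyIdentity" and its gradient body are made up):

// Sketch: gradient factory for a hypothetical op "MyIdentity" whose gradient
// simply forwards the incoming gradient dz.
absl::Status MyIdentityGrad(const AttrSlice& attrs, FunctionDef* g) {
  *g = FunctionDefHelper::Define(
      "MyIdentityGrad",
      {"x:T", "dz:T"},         // inputs: forward input and incoming gradient
      {"dx:T"},                // output: gradient w.r.t. x
      {"T: {float, double}"},  // attributes
      {
          {{"dx"}, "Identity", {"dz"}, {{"T", "$T"}}},
      });
  return absl::OkStatus();
}
REGISTER_OP_GRADIENT("MyIdentity", MyIdentityGrad);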
+#define REGISTER_OP_GRADIENT(name, fn) \ + REGISTER_OP_GRADIENT_UNIQ_HELPER(__COUNTER__, name, fn) + +#define REGISTER_OP_NO_GRADIENT(name) \ + REGISTER_OP_GRADIENT_UNIQ_HELPER(__COUNTER__, name, nullptr) + +#define REGISTER_OP_GRADIENT_UNIQ_HELPER(ctr, name, fn) \ + REGISTER_OP_GRADIENT_UNIQ(ctr, name, fn) + +#define REGISTER_OP_GRADIENT_UNIQ(ctr, name, fn) \ + static bool unused_grad_##ctr TF_ATTRIBUTE_UNUSED = \ + SHOULD_REGISTER_OP_GRADIENT && \ + ::tensorflow::gradient::RegisterOp(name, fn) + +namespace gradient { +// Register a gradient creator for the "op". +typedef std::function + Creator; +bool RegisterOp(const std::string& op, Creator func); + +// Returns OK the gradient creator for the "op" is found (may be +// nullptr if REGISTER_OP_NO_GRADIENT is used. +absl::Status GetOpGradientCreator(const std::string& op, Creator* creator); +}; // namespace gradient + +// Declare explicit instantiations of GetAttr +#define GET_ATTR(T) \ + extern template Status FunctionLibraryDefinition::GetAttr( \ + const Node&, const string&, T*) const; \ + extern template Status FunctionLibraryDefinition::GetAttr( \ + const NodeDef&, const string&, T*) const; +GET_ATTR(string) +GET_ATTR(bool) +#undef GET_ATTR + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/function_handle_cache.h b/third_party/tflite-hdrs/tensorflow/core/framework/function_handle_cache.h new file mode 100644 index 00000000..1bd67138 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/function_handle_cache.h @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_ + +#include + +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { + +// Thread-safe data structure for caching function instantiations. +class FunctionHandleCache { + public: + explicit FunctionHandleCache(FunctionLibraryRuntime* lib); + + ~FunctionHandleCache(); + + // Looks up the function to be instantiated in the cache first. If present, + // returns handle from there. Otherwise, instantiates a new function + // and stores handle in the cache. + // + // The cache retains the ownership of the handle. In particular, the caller + // should not invoke `ReleaseHandle`. + absl::Status Instantiate(const string& function_name, AttrSlice attrs, + FunctionLibraryRuntime::InstantiateOptions options, + FunctionLibraryRuntime::Handle* handle); + + // Releases all the handles in the cache, clearing out the state for all + // functions involved. 
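An illustrative use of the cache, assuming an existing FunctionLibraryRuntime* flr; Clear(), declared just below, releases every cached handle at once:

// Sketch: two identical Instantiate calls hit the cache and yield one handle.
absl::Status UseCache(FunctionLibraryRuntime* flr) {
  FunctionHandleCache cache(flr);
  FunctionLibraryRuntime::InstantiateOptions opts;
  FunctionLibraryRuntime::Handle h1, h2;
  TF_RETURN_IF_ERROR(cache.Instantiate("MyFunction", AttrSlice(), opts, &h1));
  TF_RETURN_IF_ERROR(cache.Instantiate("MyFunction", AttrSlice(), opts, &h2));
  // h1 == h2; the cache owns both handles, so the caller must not call
  // flr->ReleaseHandle() on them.
  return cache.Clear();
}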
+ absl::Status Clear(); + + private: + mutex mu_; + FunctionLibraryRuntime* lib_ = nullptr; // not owned + const string state_handle_; + std::unordered_map handles_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/function_testlib.h b/third_party/tflite-hdrs/tensorflow/core/framework/function_testlib.h new file mode 100644 index 00000000..93cae697 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/function_testlib.h @@ -0,0 +1,187 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_TESTLIB_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_TESTLIB_H_ + +#include + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace test { +namespace function { + +// A helper class to make AttrSlice from initializer lists +class Attrs { + public: + Attrs(const std::initializer_list< // NOLINT(runtime/explicit) + std::pair>& attrs) { + for (const auto& aval : attrs) { + map_.insert({aval.first, aval.second.proto}); + } + } + + Attrs( + const std::vector>& + attrs) { + for (const auto& aval : attrs) { + map_.insert({aval.first, aval.second.proto}); + } + } + + operator AttrSlice() { return AttrSlice(&map_); } // NOLINT(runtime/explicit) + + private: + AttrValueMap map_; +}; + +// Helper to construct a NodeDef. +NodeDef NDef( + absl::string_view name, absl::string_view op, + absl::Span inputs, + absl::Span> + attrs = {}, + const string& device = ""); + +// Helper to construct a GraphDef proto. +GraphDef GDef(absl::Span nodes, + absl::Span funcs = {}); + +// For testing convenience, we provide a few simple functions that can +// be easily executed and tested. + +// x: T -> x * 2. +FunctionDef XTimesTwo(); +// Same as `XTimesTwo` above, but with the `x` input as a control dependency. +FunctionDef XTimesTwoWithControlInput(); +// Same as `XTimesTwo` above, but with a `dummy` control output node. +FunctionDef XTimesTwoWithControlOutput(); +// Same as `XTimesTwo` above, but with a dangling `FloorDiv` node. +FunctionDef XTimesTwoWithDanglingFloorDivNode(); + +// x: T -> cpu(x * 2) + cpu(x * 3). +FunctionDef TwoDeviceTimesFive(); + +// x: T -> cpu(x * 2), gpu(x * 3). +FunctionDef TwoDeviceMult(); + +// cpu(x): T, gpu(y): T -> cpu(x * 2), gpu(y * 3). +FunctionDef TwoDeviceInputOutput(); + +// Function taking a list of Tensors as input. +FunctionDef FuncWithListInput(); + +// Function returning a list of Tensors as output. +FunctionDef FuncWithListOutput(); + +// x: T -> x + x. 
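As a quick illustration of the NDef/GDef helpers above, a tiny test graph calling one of the canned functions might be assembled as follows (a sketch mirroring common TensorFlow test usage; the remaining factory declarations such as XAddX() continue below):

using test::function::GDef;
using test::function::NDef;

// A two-node graph: a float placeholder feeding a call to the canned
// XTimesTwo function, which is attached to the GraphDef's function library.
GraphDef gdef = GDef(
    {
        NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}),
        NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}),
    },
    /*funcs=*/{test::function::XTimesTwo()});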
+FunctionDef XAddX(); + +// x: T, y: T -> x + y. +FunctionDef XAddY(); + +// x: T -> x * 2, where x is int32. +FunctionDef XTimesTwoInt32(); + +// x: T -> (x * 2) * 2. +FunctionDef XTimesFour(); + +// x: T -> (x * 2) * 2, where x is int32 +FunctionDef XTimesFourInt32(); + +// x: T -> ((x * 2) * 2) * 2. +FunctionDef XTimes16(); + +// w: T, x: T, b: T -> MatMul(w, x) + b +FunctionDef WXPlusB(); + +// x: T -> x: T, T is a type which we automatically converts to a bool. +FunctionDef NonZero(); + +// x: T -> bool. +FunctionDef IsZero(); + +// x: T -> int64 +FunctionDef RandomUniform(); + +// x: T, y:T -> y: T, x: T +FunctionDef Swap(); + +// x: T, y: T -> y: T, x: T, the body has no nodes. +FunctionDef EmptyBodySwap(); + +// x: float, y: resource -> y: resource, 2*x: float. +FunctionDef ResourceOutput(); + +// x: resource -> x: resource +FunctionDef ResourceIdentity(); + +// x: resource -> y: float. +FunctionDef ReadResourceVariable(); + +// Contains simple control flow returning the input via an Enter op. +FunctionDef ControlFlow(); + +// Contains malformed control flow which can't be run by the executor. +FunctionDef InvalidControlFlow(); + +// x: T -> x <= N. +FunctionDef LessThanOrEqualToN(int64_t N); + +// x: T, y: T -> x + 1, x * y +FunctionDef XPlusOneXTimesY(); + +// x: T, y: T -> x <= N +FunctionDef XYXLessThanOrEqualToN(int64_t N); + +// x: T -> bool +FunctionDef RandomUniformLess(); + +// start: int64, stop: int64, step: int64 -> y: RangeDatasetOp::Dataset +FunctionDef MakeRangeDataset(); + +// input_dataset: variant, batch_size: int64, drop_remainder: bool +// -> y: BatchDatasetV2::Dataset +FunctionDef MakeBatchDataset(); + +// input_dataset: variant, other_arguments: Targuments, f: func, +// Targuments: list(type), output_types: list(type), output_shapes: list(shape), +// use_inter_op_parallelism: bool, preserve_cardinality: bool +// -> y: MapDatasetOp::Dataset +FunctionDef MakeMapDataset(bool has_other_args); + +// input_dataset: variant, count: int64 -> y: TakeDataset::Dataset +FunctionDef MakeTakeDataset(); + +// x: T -> y: TensorSliceDatasetOp::Dataset +FunctionDef MakeTensorSliceDataset(); + +// x: T -> y: T, idx: out_idx +FunctionDef Unique(); + +void FunctionTestSchedClosure(std::function fn); + +} // end namespace function +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/graph_def_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/graph_def_util.h new file mode 100644 index 00000000..a164ac31 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/graph_def_util.h @@ -0,0 +1,135 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_ + +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Forward declare proto so that it's symbols can be removed from .so exports +class GraphDef; +class NodeDef; + +// Produce a human-readable version of a GraphDef that is more concise +// than a text-format proto. +string SummarizeGraphDef(const GraphDef& graph_def); + +// Validates the syntax of a GraphDef provided externally. +// +// The following is an EBNF-style syntax for GraphDef objects. Note that +// Node objects are actually specified as tensorflow::NodeDef protocol buffers, +// which contain many other fields that are not (currently) validated. +// +// Graph = Node * +// Node = NodeName, Inputs +// Inputs = ( DataInput * ), ( ControlInput * ) +// DataInput = NodeName, ( ":", [1-9], [0-9] * ) ? +// ControlInput = "^", NodeName +// NodeName = [A-Za-z0-9.], [A-Za-z0-9_./] * +absl::Status ValidateExternalGraphDefSyntax(const GraphDef& graph_def); + +// Adds default attributes to NodeDefs in 'graph_def' starting +// from the 'node_offset' node in 'graph_def'. +// +// Default attributes are defined by 'op_registry'. +// +// Returns OK on success, an error if 'graph_def' has a NodeDef +// that cannot be found in 'op_registry'. +// +// REQUIRES: 'graph_def' and 'op_registry' are not nullptr. +absl::Status AddDefaultAttrsToGraphDef(GraphDef* graph_def, + const OpRegistryInterface& op_registry, + int node_offset); + +// Same as above, except for the fact that it skips nodes that aren't found in +// op_registry if skip_unknown_ops is true. +absl::Status AddDefaultAttrsToGraphDef(GraphDef* graph_def, + const OpRegistryInterface& op_registry, + int node_offset, bool skip_unknown_ops); + +// Remove attrs from 'graph_def' that have the default value according +// to 'producer_op_registry', but don't exist according to +// 'consumer_op_registry'. This can allow 'graph_def' to run on the +// consumer even if consumer was built at an earlier CL (before an +// attr with a default was added). Note that this will not affect +// attrs with non-default values, so you must run a +// ValidateGraphDef...() function to see if the result is in fact +// compatible. If not nullptr, the op/attr pairs that were removed +// are added to '*op_attr_removed'. +// +// Expected usage, for a producer that wants to prepare a graph for +// a consumer: +// // For each consumer, update 'graph_def': +// OpListOpRegistry consumer_op_registry(consumer_server_op_list); +// std::unordered_set> op_attr_removed; +// TF_RETURN_IF_ERROR(RemoveNewDefaultAttrsFromGraphDef( +// &graph_def, consumer_op_registry, *OpRegistry::Global(), +// &op_attr_removed)); +// // Validate that each consumer can understand the resulting 'graph_def' +// TF_RETURN_IF_ERROR(graph::ValidateGraphDefAgainstOpRegistry( +// graph_def, consumer_op_registry)); +// // Consumer can use 'graph_def', and 'op_attr_removed' summarizes +// // what changes had to be made to 'graph_def' for it to work. 
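A small sketch tying together the two helpers declared above (graph_def is assumed to be an externally supplied GraphDef; the documentation for RemoveNewDefaultAttrsFromGraphDef continues below):

// Sketch: validate an externally supplied GraphDef, then fill in default attrs
// for every node, starting from node 0, against the process-global registry.
absl::Status PrepareGraph(GraphDef* graph_def) {
  TF_RETURN_IF_ERROR(ValidateExternalGraphDefSyntax(*graph_def));
  return AddDefaultAttrsToGraphDef(graph_def, *OpRegistry::Global(),
                                   /*node_offset=*/0);
}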
+// +// Expected usage, for a consumer that has a graph and a +// (optionally-stripped) op_list from a producer (say from a call to +// StrippedOpListForGraph(), or in the MetaGraphDef): +// OpListOpRegistry producer_op_registry(producer_stripped_op_list); +// TF_RETURN_IF_ERROR(RemoveNewDefaultAttrsFromGraphDef( +// &graph_def, *OpRegistry::Global(), producer_op_registry, nullptr)); +absl::Status RemoveNewDefaultAttrsFromGraphDef( + GraphDef* graph_def, const OpRegistryInterface& consumer_op_registry, + const OpRegistryInterface& producer_op_registry, + std::set>* op_attr_removed); + +// Goes over the `nodes` and removes attributes that are set to their +// default values according to op_registry. +// If some node's definition is not found in the `op_registry`, this node is +// simply skipped. In most cases, these nodes would be function calls. +// If a stricter behavior is desired, one can add FunctionLibraryDefinition +// argument to check for functions and their attributes. +// This is obvious from signature, but as a warning, if `nodes` contain +// nodes calling functions, e.g. PartitionCallOp or FunctionalIf, this +// function does not "recurse" into them. +void StripDefaultAttributes(const OpRegistryInterface& op_registry, + protobuf::RepeatedPtrField* nodes); + +// Two functions that collect the ops used by a graph. +// +// This returns the ops used as a set of strings. +void OpsUsedByGraph(const GraphDef& graph_def, + std::set* ops_used_in_graph); + +// This function computes the stripped_op_list field of MetaGraphDef +// and similar protos. The op_registry should contain the ops used to +// produce graph_def. The resulting stripped_op_list can be +// communicated from the producer to the consumer, which can use +// RemoveNewDefaultAttrsFromGraphDef() to improve forwards compatibility +// (using an OpListOpRegistry as indicated in the example above). +// +// Most users will pass *OpRegistry::Global() for op_registry to strip against +// the list of ops registered in this process. +absl::Status StrippedOpListForGraph(const GraphDef& graph_def, + const OpRegistryInterface& op_registry, + OpList* stripped_op_list); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/graph_to_functiondef.h b/third_party/tflite-hdrs/tensorflow/core/framework/graph_to_functiondef.h new file mode 100644 index 00000000..369b86ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/graph_to_functiondef.h @@ -0,0 +1,71 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_ +#define TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Graph to FunctionDef conversion. This code is closely modeled on the Python +// function graph_to_function_def(), which is located in +// tensorflow/python/framework/graph_to_function_def.py. +absl::Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, + bool append_hash_to_fn_name, + bool set_stateful_from_nodes, + bool copy_placeholder_attrs_from_nodes, + const std::vector& body_nodes, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& output_names, + const std::vector& control_outputs, + const std::vector& control_output_names, + const char* description, FunctionDef* fdef); + +// Converts 'graph' to a FunctionDef 'fdef', with name 'name': +// +// (1) 'node->IsArg()' nodes converted to function inputs. +// (2) 'node->IsRetval()' nodes converted to function output. +// (3) 'control_ret' returns an optional with a control output name, that will +// be added to the function `control_ret` map (see FunctionDef) and +// `control_output` in Op definition (see OpDef). Control output name must +// be unique for all control output nodes. +absl::Status GraphToFunctionDef( + const Graph& graph, const string& name, + const std::function(const Node*)>& control_ret, + FunctionDef* fdef); + +absl::Status GraphToFunctionDef(const Graph& graph, const string& name, + FunctionDef* fdef); + +absl::Status GraphToFunctionDef(const Graph& graph, const string& name, + const std::vector& output_names, + FunctionDef* fdef); + +absl::Status GraphToFunctionDef( + std::unique_ptr graph, const string& name, + const std::function(const Node*)>& control_ret, + FunctionDef* fdef); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_builder.h b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_builder.h new file mode 100644 index 00000000..b7629c8d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_builder.h @@ -0,0 +1,102 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_ + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Forward declare proto so that kernels don't need to depend on it +class KernelDef; + +// Builder class passed to the REGISTER_KERNEL_BUILDER() macro. +class KernelDefBuilder { + public: + // Starts with just the name field set. + // Caller MUST call Build() and take ownership of the result. + explicit KernelDefBuilder(const char* op_name); + ~KernelDefBuilder(); + + // Required: specify the type of device this kernel supports. + // Returns *this. + KernelDefBuilder& Device(const char* device_type); + + // Specify that this kernel supports a limited set of values for a + // particular type or list(type) attr (a further restriction than + // what the Op allows). + // Returns *this. + template + KernelDefBuilder& AttrConstraint(const char* attr_name, + gtl::ArraySlice allowed); + + // Like AttrConstraint above but supports just a single value. + template + KernelDefBuilder& AttrConstraint(const char* attr_name, T allowed); + + // Specify that this kernel supports a limited set of values for a + // particular type or list(type) attr (a further restriction than + // what the Op allows). + // Returns *this. + KernelDefBuilder& TypeConstraint(const char* attr_name, + absl::Span allowed); + + // Like TypeConstraint but supports just a single type. + KernelDefBuilder& TypeConstraint(const char* attr_name, DataType allowed); + + // Like TypeConstraint, but (a) gets the type from a template parameter + // and (b) only supports a constraint to a single type. + template + KernelDefBuilder& TypeConstraint(const char* attr_name) TF_ATTRIBUTE_NOINLINE; + // TODO(josh11b): Support other types of attr constraints as needed. + + // Specify that this kernel requires/provides an input/output arg + // in host memory (instead of the default, device memory). + // Returns *this. + KernelDefBuilder& HostMemory(const char* arg_name); + + // Specify that this kernel requires a particular value for the + // "_kernel" attr. May only be specified once. Returns *this. + KernelDefBuilder& Label(const char* label); + + // Specify a priority number for this kernel. + KernelDefBuilder& Priority(int32_t priority); + + // Returns a pointer to a KernelDef with fields set based on the + // above calls to this instance. + // Caller takes ownership of the result. + const KernelDef* Build(); + + private: + KernelDef* kernel_def_; + + KernelDefBuilder(const KernelDefBuilder&) = delete; + void operator=(const KernelDefBuilder&) = delete; +}; + +// IMPLEMENTATION + +template +KernelDefBuilder& KernelDefBuilder::TypeConstraint(const char* attr_name) { + return this->TypeConstraint(attr_name, DataTypeToEnum::v()); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_util.h new file mode 100644 index 00000000..b60b3b2c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_def_util.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
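In practice the builder above is rarely invoked directly; kernels are registered through the REGISTER_KERNEL_BUILDER macro and the Name helper from op_kernel.h (not part of this header). A sketch, where MyOpKernel and the "axis" argument are made up:

// Sketch: register a CPU kernel for op "MyOp", restricted to float "T",
// with one argument pinned to host memory.
REGISTER_KERNEL_BUILDER(Name("MyOp")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("axis"),
                        MyOpKernel);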
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_UTIL_H_ + +#include "tensorflow/core/framework/kernel_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" + +namespace tensorflow { + +// Returns whether the attrs satisfy the constraints in the kernel_def. Returns +// an error if attrs in kernel_def are not found, or have a mismatching type. +absl::Status KernelAttrsMatch(const KernelDef& kernel_def, AttrSlice attrs, + bool* match); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/kernel_shape_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_shape_util.h new file mode 100644 index 00000000..6d444e18 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/kernel_shape_util.h @@ -0,0 +1,109 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_KERNEL_SHAPE_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_KERNEL_SHAPE_UTIL_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { +// GetWindowedOutputSize(): Given an input tensor, kernel, stride and padding +// type, the function computes the output and padding dimensions. +// +// For example, ignoring batches or multiple features, a 1D convolution +// takes as input a 1D tensor of shape (H), and convolves it with a filter of +// shape (K). +// +// It also takes in a few additional parameters: +// +// Stride (S): the stride with which we apply the filters. This is the offset +// between locations where we apply the filters. A larger stride +// means that the output will be spatially smaller. +// +// Padding (P): the padding we apply to the input tensor along each +// dimension. This is usually used to make sure that the spatial dimensions +// do not shrink when we progress with convolutions. This function supports two +// types of padding. +// SAME: the pad value is computed so that the output will have size H/S. +// VALID: no padding is carried out. +// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerbose must be +// called instead. Note the padded area is zero-filled. 
+// +// The output dimensions are computed as follows: +// - When adding dilation_rate (D), we compute an effective filter size (K'): +// K' = (K - 1) * D + 1 +// - When Padding = SAME: the output size is (H'), where +// H' = ceil(float(H) / float(S)) +// where ceil is the ceiling function. The number of padded cells +// is computed as: +// Pc = ((H' - 1) * S + K' - H) / 2 +// When the stride is 1, the expression simplifies to +// H' = H, Pc = (K'-1)/2. +// This is where SAME comes from - the output has the same size as the input +// has. +// +// - When Padding = VALID: the output size is computed as +// H' = ceil(float(H - K' + 1) / float(S)) +// and the number of padded cells is always zero. +// When the stride is 1, the expression simplifies to +// H' = H-K'+1. +// +// For convolution, mathematically, the output value at location (r') +// is the inner product of two vectors: the chunk of input at +// ((r'*S-Pr) : (r'*S-Pr+K)), +// and the filter. +// +// For 2D and 3D convolutions, the spatial dimensions are orthogonal, so the +// size and padding of each spatial dimension can be computed by calling +// GetWindowedOutputSize separately for each dimension. +// +absl::Status GetWindowedOutputSize(int64_t input_size, int64_t filter_size, + int dilation_rate, int64_t stride, + Padding padding_type, int64_t* output_size, + int64_t* padding_size); + +// Returns the same output dimensions as in GetWindowedOutputSize, but returns +// verbose padding dimensions (before/after), and EXPLICIT padding is supported. +// When padding_type is EXPLICIT, *padding_before and *padding_after must +// already point to initialized integers with the padding amounts. Otherwise, +// *padding_before and *padding_after are set by this function, and any +// excess padding (caused by an odd padding size value) is added to the +// 'padding_after' dimension. +absl::Status GetWindowedOutputSizeVerbose( + int64_t input_size, int64_t filter_size, int64_t dilation_rate, + int64_t stride, Padding padding_type, int64_t* output_size, + int64_t* padding_before, int64_t* padding_after); + +// Given an input tensor, kernel, stride and padding type, populates the 3D size +// of the output tensor and padding to be applied to the input tensor at the +// lower end of every dimension. Use for 3D convolutions, where the input data +// is padded with zeros, as well as for 3D avg/max pooling, where the input data +// is padded with invalid values that are not considered for pooling. EXPLICIT +// padding is not supported. +// The V2 version computes the same outputs with arbitrary dilation_rate. For +// detailed equations, refer to the comments for GetWindowedOutputSize(). +absl::Status Get3dOutputSizeV2(const std::array& input, + const std::array& window, + const std::array& dilations, + const std::array& strides, + Padding padding_type, + std::array* output_ptr, + std::array* padding_ptr); + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_KERNEL_SHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/local_rendezvous.h b/third_party/tflite-hdrs/tensorflow/core/framework/local_rendezvous.h new file mode 100644 index 00000000..332daaa6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/local_rendezvous.h @@ -0,0 +1,121 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
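To make the SAME/VALID formulas above concrete, here is a small self-contained check that reproduces the arithmetic in plain C++ (it does not call the TensorFlow helpers):

#include <cassert>
#include <cstdint>

// Effective filter size with dilation: K' = (K - 1) * D + 1.
int64_t EffectiveFilterSize(int64_t k, int64_t d) { return (k - 1) * d + 1; }

// H' = ceil(H / S) for SAME, H' = ceil((H - K' + 1) / S) for VALID.
int64_t SameOutputSize(int64_t h, int64_t s) { return (h + s - 1) / s; }
int64_t ValidOutputSize(int64_t h, int64_t k_eff, int64_t s) {
  return (h - k_eff + 1 + s - 1) / s;
}

int main() {
  const int64_t H = 10, K = 3, D = 1, S = 1;
  const int64_t Keff = EffectiveFilterSize(K, D);  // 3
  assert(SameOutputSize(H, S) == 10);              // SAME keeps the spatial size
  assert(ValidOutputSize(H, Keff, S) == 8);        // VALID: H - K' + 1
  // Padded cells for SAME with stride 1: Pc = (K' - 1) / 2 = 1.
  const int64_t Pc = ((SameOutputSize(H, S) - 1) * S + Keff - H) / 2;
  assert(Pc == 1);
  return 0;
}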
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_LOCAL_RENDEZVOUS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_LOCAL_RENDEZVOUS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Implements the basic logic of matching Send and Recv operations. See +// RendezvousInterface for more details. +// +// NOTE: Most users will use a class that wraps LocalRendezvous, such as +// IntraProcessRendezvous or RemoteRendezvous. This class does not implement +// RendezvousInterface because virtual dispatch to LocalRendezvous methods +// is not expected to be needed. +class LocalRendezvous { + public: + // If the class wrapping LocalRendezvous is refcounted (i.e., extending + // Rendezvous), pass in its pointer in constructor so the LocalRendezvous + // can make sure it outlives the async recv requests. + // Pass in nullptr if the wrapping class is not refcounted. + explicit LocalRendezvous(Rendezvous* owner, int num_shards) + : num_buckets_(num_shards > 0 ? num_shards : 1), + rc_owner_(owner), + table_buckets_(std::make_unique(num_buckets_)) {} + ~LocalRendezvous(); + + absl::Status Send(const Rendezvous::ParsedKey& key, + const Rendezvous::Args& send_args, const Tensor& val, + bool is_dead); + void RecvAsync(const Rendezvous::ParsedKey& key, + const Rendezvous::Args& recv_args, + Rendezvous::DoneCallback done); + void StartAbort(const absl::Status& status); + absl::Status status(); + + // Releases all the references to the aborted rendezvous. Used in unit tests. + static void ReleaseAbortedRendezvous() { + mutex_lock l(aborted_rendezs_mu_); + aborted_rendezs_.clear(); + } + + private: + void DoAbort(const absl::Status& status); + + tsl::core::RefCountPtr GetOwnerRefCountPtr(); + + struct Item; + + // By invariant, the item queue under each key is of the form + // [item.type == kSend]* meaning each item is a sent message. + // or + // [item.type == kRecv]* meaning each item is a waiter. + struct ItemQueue { + void push_back(Item* item); + + Item* head = nullptr; + Item* tail = nullptr; + }; + + typedef gtl::FlatMap Table; + + const int num_buckets_; + // Pointer to the owner class of this LocalRendezvous if it is refcounted, + // nullptr otherwise. + Rendezvous* rc_owner_; + + struct TableBucket { + mutex mu; + Table table TF_GUARDED_BY(mu); + + // Track the number of pening callbacks using a counter. + int pending_callback_counter TF_GUARDED_BY(mu) = 0; + condition_variable pending_callback_cond_var TF_GUARDED_BY(mu); + }; + + // Immutable set of buckets. This uses less memory than std::vector. 
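A self-contained sketch of that per-key matching invariant (illustrative only, with no locking, abort handling, or dead tensors; table_buckets_, declared just below, shards many such queues by key hash):

#include <deque>
#include <functional>
#include <string>

// Per-key queue: holds either unmatched sends or unmatched receivers, never both.
struct KeyQueue {
  std::deque<std::string> pending_sends;                       // buffered values
  std::deque<std::function<void(std::string)>> pending_recvs;  // waiting callbacks

  void Send(std::string value) {
    if (!pending_recvs.empty()) {  // a receiver is already waiting
      auto done = std::move(pending_recvs.front());
      pending_recvs.pop_front();
      done(std::move(value));
    } else {
      pending_sends.push_back(std::move(value));
    }
  }

  void RecvAsync(std::function<void(std::string)> done) {
    if (!pending_sends.empty()) {  // a value is already buffered
      done(std::move(pending_sends.front()));
      pending_sends.pop_front();
    } else {
      pending_recvs.push_back(std::move(done));
    }
  }
};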
+ const std::unique_ptr table_buckets_; + mutex mu_; + absl::Status status_ TF_GUARDED_BY(mu_); + + // We deliberately leak one reference of the aborted rendezvous here, so that + // they won't be destructed, and lose the status_. + // This is necessary because subsequent calls to RendezvousMgr::Find() will + // return the aborted rendezvous, and proper errors will be propagated. + // TODO(hhb): find a better way to manage rendezvous lifespan. + static mutex& aborted_rendezs_mu_; + static std::vector >& aborted_rendezs_ + TF_GUARDED_BY(aborted_rendezs_mu_); + + LocalRendezvous(const LocalRendezvous&) = delete; + void operator=(const LocalRendezvous&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_LOCAL_RENDEZVOUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/log_memory.h b/third_party/tflite-hdrs/tensorflow/core/framework/log_memory.h new file mode 100644 index 00000000..f6c2b07d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/log_memory.h @@ -0,0 +1,112 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// LogMemory contains methods for recording memory allocations and +// frees, associating each allocation with a step identified by a +// process-wide id. For now, logging is enabled whenever VLOG_IS_ON(1) +// for the log_memory module. +// +// Limitations: We don't log memory allocations by Eigen on the CPU +// since that would require major changes to plumb through to the +// Eigen::{DefaultDevice,ThreadPoolDevice} allocate and deallocate +// methods. We do log Eigen allocations on GPU since the plumbing was +// already in place. +class LogMemory { + public: + // Allocations sometimes happen outside any computation step, and + // SpecialStepIds lists the ids used for those steps. + enum SpecialStepIds { + // Used when performing a just-in-time constant folding optimization. + CONSTANT_FOLDING_STEP_ID = -1, + // Used when constructing an Op kernel before executing a step. + OP_KERNEL_CONSTRUCTION_STEP_ID = -2, + // Used when allocating a tensor buffer from external code, e.g., + // the C API. + EXTERNAL_TENSOR_ALLOCATION_STEP_ID = -3, + // Used when allocating a buffer for network transfer. + NETWORK_BUFFER_STEP_ID = -4, + // Used when allocating a buffer to fill a Proto from the GPU. + PROTO_BUFFER_STEP_ID = -5, + // Used when allocating a Tensor where the caller has not indicated + // the step. + UNKNOWN_STEP_ID = -6, + }; + + static const std::string kLogMemoryLabel; + + // Test to see if memory logging is enabled. For now, logging is + // enabled whenever VLOG_IS_ON(2) for the log_memory module. + static bool IsEnabled(); + + // Log the beginning of a step. 
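Call sites normally guard the recording calls on IsEnabled() so the serialization cost is only paid when memory logging is active; roughly as sketched here (kernel_name, step_id, and tensor are assumed to be in scope; RecordStep and RecordTensorAllocation are declared below):

if (LogMemory::IsEnabled()) {
  LogMemory::RecordStep(step_id, /*handle=*/"train_step");
  LogMemory::RecordTensorAllocation(kernel_name, step_id, tensor);
}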
+ static void RecordStep(int64_t step_id, const std::string& handle); + + // Log a tensor buffer allocation. The name indicates which kernel + // made the allocation. If the allocation is made through an + // OpKernelContext the step_id indicates which step is executing, + // otherwise step_id is one of the SpecialStepIds defined in + // op_kernel.h, e.g. Op Kernel construction or an optimization pass + // such as constant folding. + static void RecordTensorAllocation(const std::string& kernel_name, + int64_t step_id, const Tensor& tensor); + + // Log a tensor buffer deallocation. The deallocation is triggered + // when the buffer's refcount falls to zero, and the tracking + // mechanism does not associate it with a particular step or + // kernel. The allocation_id/allocator_name should match a + // corresponding tensor previously passed in to + // RecordTensorAllocation. + static void RecordTensorDeallocation(int64_t allocation_id, + const std::string& allocator_name); + + // Log the use of a tensor as an output from a kernel. + static void RecordTensorOutput(const std::string& kernel_name, + int64_t step_id, int index, + const Tensor& tensor); + + // Log a "raw" allocation, which is just a buffer sized in + // bytes. The Eigen allocator, and memory copies, record their + // allocations this way, since they do not allocate TensorFlow + // tensors. The operation is set to the OpKernel name if this is + // called from within an Op execution, otherwise it indicates an + // operation such as memcpy. The step_id if >=0 indicates which step + // is executing, otherwise step_id is one of the SpecialStepIds + // defined in op_kernel.h, e.g. Op Kernel construction or an + // optimization pass such as constant folding. + static void RecordRawAllocation(const std::string& operation, int64_t step_id, + size_t num_bytes, void* ptr, + Allocator* allocator); + + // Log a "raw" deallocation of a buffer. When deferred is true, the + // buffer won't be used again, but a GPU kernel may still be + // enqueued using the buffer. A deferred deallocation should always + // be followed by a matching non-deferred deallocation when the + // buffer is actually returned and can be reused. + static void RecordRawDeallocation(const std::string& operation, + int64_t step_id, void* ptr, + Allocator* allocator, bool deferred); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/logging.h b/third_party/tflite-hdrs/tensorflow/core/framework/logging.h new file mode 100644 index 00000000..9bde3d51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/logging.h @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ +#define TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ + +#include + +namespace tensorflow { + +namespace logging { + +// Register a listener method to call on any printed messages. +// Returns true if it is successfully registered. +bool RegisterListener(void (*listener)(const char*)); + +// Log string to active listeners. Returns true if any listeners were +// registered. +bool LogToListeners(std::string msg, std::string end = "\n"); + +} // namespace logging + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/lookup_interface.h b/third_party/tflite-hdrs/tensorflow/core/framework/lookup_interface.h new file mode 100644 index 00000000..9d673fbc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/lookup_interface.h @@ -0,0 +1,164 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_ + +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace lookup { + +// Forward declaration so we can define GetInitializableLookupTable() in +// LookupInterface. +class InitializableLookupTable; + +// Lookup interface for batch lookups used by table lookup ops. +class LookupInterface : public ResourceBase { + public: + // Performs batch lookups, for every element in the key tensor, Find returns + // the corresponding value into the values tensor. + // If an element is not present in the table, the given default value is used. + + // For tables that require initialization, Find is available once the table + // is marked as initialized. + + // Returns the following statuses: + // - OK: when the find finishes successfully. + // - FailedPrecondition: if the table is not initialized. + // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - In addition, other implementations may provide another non-OK status + // specific to their failure modes. + virtual absl::Status Find(OpKernelContext* ctx, const Tensor& keys, + Tensor* values, const Tensor& default_value) = 0; + + // Inserts elements into the table. Each element of the key tensor is + // associated with the corresponding element in the value tensor. + // This method is only implemented in mutable tables that can be updated over + // the execution of the graph. It returns Status::NotImplemented for read-only + // tables that are initialized once before they can be looked up. + + // Returns the following statuses: + // - OK: when the insert finishes successfully. 
+ // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - Unimplemented: if the table does not support insertions. + virtual absl::Status Insert(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) = 0; + + // Removes elements from the table. + // This method is only implemented in mutable tables that can be updated over + // the execution of the graph. It returns Status::NotImplemented for read-only + // tables that are initialized once before they can be looked up. + + // Returns the following statuses: + // - OK: when the remove finishes successfully. + // - InvalidArgument: if any of the preconditions on the lookup key fails. + // - Unimplemented: if the table does not support removals. + virtual absl::Status Remove(OpKernelContext* ctx, const Tensor& keys) = 0; + + // Returns the number of elements in the table. + virtual size_t size() const = 0; + + // Exports the values of the table to two tensors named keys and values. + // Note that the shape of the tensors is completely up to the implementation + // of the table and can be different than the tensors used for the Insert + // function above. + virtual absl::Status ExportValues(OpKernelContext* ctx) = 0; + + // Imports previously exported keys and values. + // As mentioned above, the shape of the keys and values tensors are determined + // by the ExportValues function above and can be different than for the + // Insert function. + virtual absl::Status ImportValues(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) = 0; + + // Returns the data type of the key. + virtual DataType key_dtype() const = 0; + + // Returns the data type of the value. + virtual DataType value_dtype() const = 0; + + // Returns the shape of a key in the table. + virtual TensorShape key_shape() const = 0; + + // Returns the shape of a value in the table. + virtual TensorShape value_shape() const = 0; + + // Check format of the key and value tensors for the Insert function. + // Returns OK if all the following requirements are satisfied, otherwise it + // returns InvalidArgument: + // - DataType of the tensor keys equals to the table key_dtype + // - DataType of the tensor values equals to the table value_dtype + // - the values tensor has the required shape given keys and the tables's + // value shape. + virtual absl::Status CheckKeyAndValueTensorsForInsert(const Tensor& keys, + const Tensor& values); + + // Similar to the function above but instead checks eligibility for the Import + // function. + virtual absl::Status CheckKeyAndValueTensorsForImport(const Tensor& keys, + const Tensor& values); + + // Check format of the key tensor for the Remove function. + // Returns OK if all the following requirements are satisfied, otherwise it + // returns InvalidArgument: + // - DataType of the tensor keys equals to the table key_dtype + virtual absl::Status CheckKeyTensorForRemove(const Tensor& keys); + + // Check the arguments of a find operation. Returns OK if all the following + // requirements are satisfied, otherwise it returns InvalidArgument: + // - DataType of the tensor keys equals to the table key_dtype + // - DataType of the tensor default_value equals to the table value_dtype + // - the default_value tensor has the required shape given keys and the + // tables's value shape. 
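// --- Editorial usage sketch (not part of the upstream header). A batched
// lookup as kernel code might issue it; `table`, `ctx`, and the tensors are
// assumed to come from the surrounding op, and TF_RETURN_IF_ERROR is the usual
// TensorFlow status-propagation macro. Only Find() and CheckFindArguments()
// are taken from the interface declared here.
inline absl::Status LookupBatch(tensorflow::lookup::LookupInterface* table,
                                tensorflow::OpKernelContext* ctx,
                                const tensorflow::Tensor& keys,
                                const tensorflow::Tensor& default_value,
                                tensorflow::Tensor* values) {
  // Validate dtypes and shapes up front so callers see InvalidArgument rather
  // than a failure deep inside the table implementation.
  TF_RETURN_IF_ERROR(table->CheckFindArguments(keys, default_value));
  // For initializable tables this returns FailedPrecondition until the table
  // has been marked as initialized.
  return table->Find(ctx, keys, values, default_value);
}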
+ absl::Status CheckFindArguments(const Tensor& keys, + const Tensor& default_value); + + string DebugString() const override { + return strings::StrCat("A lookup table of size: ", size()); + } + + // Returns an InitializableLookupTable, a subclass of LookupInterface, if the + // current object is an InitializableLookupTable. Otherwise, returns nullptr. + virtual InitializableLookupTable* GetInitializableLookupTable() { + return nullptr; + } + + protected: + ~LookupInterface() override = default; + + // Makes sure that the key and value tensor DataType's match the table + // key_dtype and value_dtype. + absl::Status CheckKeyAndValueTypes(const Tensor& keys, const Tensor& values); + + // Makes sure that the provided shape is consistent with the table keys shape. + absl::Status CheckKeyShape(const TensorShape& shape); + + private: + absl::Status CheckKeyAndValueTensorsHelper(const Tensor& keys, + const Tensor& values); +}; + +} // namespace lookup +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/memory_types.h b/third_party/tflite-hdrs/tensorflow/core/framework/memory_types.h new file mode 100644 index 00000000..e1247222 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/memory_types.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +class NodeDef; + +// Returns into *{input,output}_memory_types the memory type of each +// {input,output} tensor. +// +// REQUIRES: * '*_memory_types' is not nullptr. +// * def has all attrs specified (e.g. using AddDefaultsToNodeDef()). +absl::Status MemoryTypesForNode(const OpRegistryInterface* op_registry, + const DeviceType& device_type, + const NodeDef& ndef, + MemoryTypeVector* input_memory_types, + MemoryTypeVector* output_memory_types); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/metrics.h b/third_party/tflite-hdrs/tensorflow/core/framework/metrics.h new file mode 100644 index 00000000..18b52c49 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/metrics.h @@ -0,0 +1,550 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/data_service.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace metrics { +enum class GraphOptimizationSource { + kUnknown, + kJit, + kAot, +}; + +// Records when a data-fetching tf.data operation is executed. +// +// The `name` argument identifies the operation type (e.g. "ToSingleElementOp"). +void RecordTFDataFetchOp(const string& name); + +// Records that a tf.data.Dataset executed by the program used autotuning. +// +// The `name` argument identifies the Dataset type (e.g. "ParallelMap"). +void RecordTFDataAutotune(const string& name); + +// Returns a counter that can be used to record the number of bytes produced by +// a tf.data.Dataset. +// +// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). +monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name); + +// Returns a counter that can be used to record the number of bytes produced by +// a tf.data.Dataset. +// +// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). +monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); + +// Returns a counter than can be used to record the number of bytes read from +// the filesystem by a tf.data.Dataset source. +// +// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). +// +// TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter? +monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name); + +// Returns a counter than can be used to record the number of elements produced +// by a tf.data.Dataset. +// +// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). +monitoring::CounterCell* GetTFDataElementsCounter(const string& name); + +// Returns a gauge than can be used to record the performance model information. +// +// The `id` argument represents the (unique) model ID. +monitoring::GaugeCell>* GetTFDataModelGauge( + const string& id); + +// Records the number of bytes fetched from tf.data.Dataset iterator. +void RecordTFDataBytesFetched(int64_t num_bytes); + +// Records the number of times a tf.data experiment was applied. +void RecordTFDataExperiment(const string& name); + +// Records the number of times a tf.data experiment could have been applied. +void RecordTFDataExperimentLive(const string& name); + +// Records the number of times a tf.data experiment was opted into. +void RecordTFDataExperimentOptIn(const string& experiment_name); + +// Records the number of times a tf.data experiment was opted out of. +void RecordTFDataExperimentOptOut(const string& experiment_name); + +// Records the time (in microseconds) spent generating an element and +// transferring it over the network for the given protocol. 
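// --- Editorial usage sketch (not part of the upstream header). How a
// file-backed dataset source might feed the counters declared above, assuming
// the code sits inside namespace tensorflow::metrics (or qualifies the calls).
// "TFRecordDataset" is one of the example names from the comments, and
// CounterCell::IncrementBy() is the same call ScopedCounter uses further down;
// caching the cells in statics simply avoids a registry lookup per batch.
inline void RecordTFRecordRead(int64_t bytes_read, int64_t elements_produced) {
  static monitoring::CounterCell* bytes_cell =
      GetTFDataBytesReadCounter("TFRecordDataset");
  static monitoring::CounterCell* elements_cell =
      GetTFDataElementsCounter("TFRecordDataset");
  bytes_cell->IncrementBy(bytes_read);
  elements_cell->IncrementBy(elements_produced);
}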
+void RecordTFDataServiceGetElementDuration(const string& data_transfer_protocol, + uint64 duration_us); + +// Records the time (in microseconds) spent in a single invocation of +// `ItertatorResource::GetNext()`. +void RecordTFDataGetNextDuration(uint64 duration_us); + +// Records the histogram of ratios of tf.data autotune algorithm used RAM over +// the ram budget. +void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio); + +// Records the histogram of ratios of tf.data autotune algorithm max buffer +// bytes over the ram budget. +void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio); + +// Records the number of times each tf.data fingerprint is used +// to measure duplicate pre-processing. +// +// The `name` argument identifies the Dataset graph fingerprint, +// created using GraphHash(). +void RecordTFDataFingerprint(const string& name); + +// Records the event of a tf.data service pipeline getting a runtime +// compression decision. +void RecordTFDataServiceRuntimeCompressionDecision(bool compression_decision); + +// Records the event of a tf.data service pipeline making the compression +// related action. +void RecordTFDataServiceCompressionAction(const string& action); + +// Records the time (in microseconds) during which `IteratorResource` was busy +// processing at least one `GetNext()` request. +void RecordTFDataIteratorBusy(uint64 duration_us); + +// Records the time (in microseconds) between `IteratorResource` receiving the +// first `GetNext()` request and responding to the last `GetNext()` request. +void RecordTFDataIteratorLifetime(uint64 duration_us); + +// Records the time histogram (in microseconds) between `IteratorResource` +// responding to a `GetNext()` request and receiving the next `GetNext()` +// request. +void RecordTFDataIteratorGap(uint64 duration_us); + +// Records the number of independent graph changes resulting from the +// application of a tf.data optimization. +// +// The `name` argument identifies the optimization (e.g. "noop_elimination"). +void RecordTFDataOptimization(const string& name, int64_t num_changes); + +// Records that a tf.data service worker has been created. +void RecordTFDataServiceWorkerCreated(); + +// Records that a tf.data service job has been created. +void RecordTFDataServiceJobsCreated( + const data::ProcessingModeDef& processing_mode, bool is_coordinated_read); + +// Records tf.data service iterators created by clients. +void RecordTFDataServiceClientIterators( + int64_t worker_uid, data::DeploymentMode deployment_mode, + const data::ProcessingModeDef& processing_mode, bool is_coordinated_read); + +// Records that a tf.data service worker client has been created that will use +// `data_transfer_protocol` to get data from the worker server and whether or +// not the user explicitly specified the protocol. +void RecordTFDataServiceDataTransferProtocolUsed( + const string& data_transfer_protocol, bool user_specified); + +// Records that a tf.data service worker client fell back to gRPC rather than +// use `data_transfer_protocol` because of an error of type `code` with message +// `error_message`. +void RecordTFDataServiceDataTransferProtocolFallback( + const string& data_transfer_protocol, error::Code code, + const string& error_message); + +// Records that a tf.data service worker client got an error of non-retriable +// type `code` with message `error_message` when trying to transfer data over +// `data_transfer_protocol`. 
+void RecordTFDataServiceDataTransferProtocolError( + const string& data_transfer_protocol, error::Code code, + const string& error_message); + +// Records tf.data service cross-trainer cache queries. +void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit); + +// Records tf.data service cross-trainer cache memory usage in bytes. +void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes); + +// Records tf.data distributed snapshot bytes committed. +void RecordTFDataServiceSnapshotBytesCommitted(int64_t bytes); + +// Records tf.data distributed snapshot save/load ops. +void RecordTFDataServiceSnapshotOp(const std::string& path, + const std::string& op); + +// Records the current estimated optimal number of tf.data service workers. +void RecordTFDataServiceOptimalNumberOfWorkers(int64_t number_of_workers); + +// Records the file name read by a tf.data Dataset. +// +// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). +void RecordTFDataFilename(const string& name, const string& filename); + +// Records the total attempts made by file logger. +void RecordTFDataFileLoggerAttempts(); + +// Records an error of type `code` with message `error_message` encountered by +// file logger. +void RecordTFDataFileLoggerErrors(error::Code code, + const string& error_message); + +// Records the total number of files attempted to be logged by file logger. +void RecordTFDataFileLoggerAttemptedNumFiles(size_t num_files); + +// Records the number of files that encountered an error of type +// `code` with message `error_message` during logging by file logger with this +// error code. +void RecordTFDataFileLoggerErrorsNumFiles(size_t num_files, error::Code code, + const string& error_message); + +// Records statistics of tf.data auto sharding. +// +// The `id` is a unique identifier of the input pipeline. The `policy` +// identifies the auto-sharding policy used, the `num_workers` identifies the +// number of workers, and `num_replicas` identifies the number of replicas. +void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, + int64 num_workers, int64 num_replicas); + +// Records statistics of whether we can rewrite batch size in tf.data auto +// sharding. +// +// The `id` is a unique identifier of the input pipeline. The `eligible` +// indicates whether the input pipeline is eligible for the rewrite. The +// `ineligible_reason` is the reason if the input pipeline is ineligible. +void RecordTFDataAutoShardRewriteBatchSize( + bool eligible, const std::vector& ineligible_reason); + +// Records the number of times each tf.data autotuning algorithm stopping +// criterion is met. +void RecordTFDataAutotuneStoppingCriteria(const string& name); + +// Records the number of times this event occured, for debugging. +void RecordTFDataDebug(const string& event); + +// Records the number of times an error of this type occurred with this status +// code. +void RecordTFDataError(const string& error_type, const string& error_code); + +// Records the framework type used to build the tf.data.Dataset. +void RecordTFDataFrameworkType(const std::string& framework_type); + +// Records the number of times tf.data file logger encountered an error of this +// type occurred with this status code. +void RecordTFDataFileLoggerError(const string& error_type, + const string& error_code); + +// Records parsing of dense tensor features. +void RecordParseDenseFeature(int64_t num_features); + +// Records parsing of sparse tensor features. 
+void RecordParseSparseFeature(int64_t num_features); + +// Records parsing of ragged tensor features. +void RecordParseRaggedFeature(int64_t num_features); + +// Records the size of input/output tensors in bytes. +void RecordGraphInputTensors(const size_t size); +void RecordGraphOutputTensors(const size_t size); + +// Records the number of cores requested by graphs with XLA SPMD enabled. +void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica); + +void UpdateGraphExecTime(const uint64 running_time_usecs); +void UpdateGraphPendingQueueLength(uint64 len); + +// Records that one output of an op of type `op_name` was unused. +void RecordUnusedOutput(const string& op_name); + +// Records the pipeline processing time in microseconds +void RecordPipelineProcessingTime(const string& id, + double pipeline_processing_time_usec); + +// Increments the count of binaries loaded from the persistent cache. +void UpdatePersistentCacheLoadCount(); + +// Increments the count of BEF and MLIR deserialized. +void UpdateAotBefMlirLoadCount(); + +// Updates the metrics stored about time spent building graphs. +// +// By "GraphBuild", we refer to building a client graph, which is a sub-graph of +// the full graph, induced by a set of options. In particular, these options +// include the feeds and fetches requested. +// +// This includes time spent: +// * optimizing the graphs with Grappler +// * pruning the sub-graph (unless the place_pruned_graph option is set) +// +// When executing eagerly, this will not record any activity. +// +// TODO(jtkeeling): Should we record building/optimizing tf.functions? +void UpdateGraphBuildTime(const uint64 running_time_usecs); + +// Updates the metric stored for time spent optimizing function graphs. +void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs); + +// Updates the metric stored for time saved by caching graph optimization. +void UpdateFunctionGraphOptimizationSavingTime(uint64 saving_time_usec, + GraphOptimizationSource source); + +// Retrieves the total time saved by the graph optimization caching. +uint64 GetFunctionGraphOptimizationSavingTimeUsecs( + GraphOptimizationSource source); + +// Increments the hit count for the graph optimization cache. +void IncrementFunctionGraphOptimizationCacheHitCount( + int count, GraphOptimizationSource source); + +// Gets the hit count for the graph optimization cache. +int64_t GetFunctionGraphOptimizationCacheHitCount( + GraphOptimizationSource source); + +// Increments the failure count for the graph optimization cache restoring. +void IncrementFunctionGraphOptimizationCacheFailureCount( + int count, GraphOptimizationSource source); + +// Gets the failure count for the graph optimization cache. +int64_t GetFunctionGraphOptimizationCacheFailureCount( + GraphOptimizationSource source); + +// Increments the miss count for the graph optimization cache. +void IncrementFunctionGraphOptimizationCacheMissCount( + int count, GraphOptimizationSource source); + +// Gets the miss count for the graph optimization cache. +int64_t GetFunctionGraphOptimizationCacheMissCount( + GraphOptimizationSource source); + +// Increments the number of restoring function graph optimization cache. +void IncrementFunctionGraphOptimizationCacheLoadCount( + int count, GraphOptimizationSource source); + +int64_t GetFunctionGraphOptimizationCacheLoadCount( + GraphOptimizationSource source); + +// Records the activity of the first phase of the mlir bridge using the +// tf_metadata.tf_mlir_bridge_first_phase_v2_count metric. 
+// bridge_type: replicated, nonreplicated, etc. +// bridge_version: v1 compat, v2, etc. +// device_type: tpu, cpu, gpu, etc. +// fallback_enabled: true if fallback will happen, false if not +// result: outcome of bridge (success, failure, disabled, invalid_graph, etc.) +void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& bridge_type, + const std::string& bridge_version, + const std::string& device_type, + bool fallback_enabled, + const std::string& result); + +enum class Phase2XlaCompilerMetric { + // Bridge phase 2 CompileSingleOp Xla Builder (old version) was successful + kCompileSingleOpXlaBuilderSuccess, + // Bridge phase 2 CompileSingleOp Xla Builder (old version) failed + kCompileSingleOpXlaBuilderFailure, + // Bridge phase 2 CompileSingleOp MLIR version was successful + kCompileSingleOpMlirSuccess, + // Bridge phase 2 CompileSingleOp MLIR version failed + kCompileSingleOpMlirFailure, + // Bridge phase 2 CompileFunction Xla Builder (old version) was successful + kCompileFunctionXlaBuilderSuccess, + // Bridge phase 2 CompileFunction Xla Builder (old version) failed + kCompileFunctionXlaBuilderFailure, + // Bridge phase 2 CompileFunction MLIR version was successful + kCompileFunctionMlirSuccess, + // Bridge phase 2 CompileFunction MLIR version failed + kCompileFunctionMlirFailure, +}; + +// Records the activity of the XlaCompiler entry points. +void IncrementPhase2XlaCompilerCounter(Phase2XlaCompilerMetric metric); + +enum class MlirBridgeSecondPhaseMetric { + // MLIR bridge phase 2 was executed and the graph was processed successfully + // (fallback enabled). + kMlirWithFallbackModeSuccess, + // MLIR bridge phase 2 compilation was failure (fallback enabled). + kMlirWithFallbackModeFailure, + // MLIR bridge phase 2 compilation was successful (manually enabled). + kMlirModeSuccess, + // MLIR bridge phase 2 compilation fails (manually enabled) + kMlirModeFailure, + // Old bridge compilation was run successfully (was run because MLIR bridge + // could not process the graph). + kOldBridgeMlirFilteredSuccess, + // Old bridge failed (was run b/c MLIR bridge could not process the graph). + kOldBridgeMlirFilteredFailure, + // Old bridge compilation was successfully run after MLIR bridge ran and + // failed. + kOldBridgeWithFallbackModeSuccess, + // Old Bridge failed in fallback (was run because MLIR bridge failed first). + kOldBridgeWithFallbackModeFailure, + // MLIR bridge phase 2 Combined Bridge MLIR was successful + kMlirCombinedMlirSuccess, + // MLIR bridge phase 2 Combined Bridge MLIR failed + kMlirCombinedMlirFailure, + // MLIR bridge phase 2 Combined Bridge Old bridge was successful + kMlirCombinedOldSuccess, + // MLIR bridge phase 2 Combined Bridge Old bridge was successful + kMlirCombinedOldFailure, +}; + +// Records the activity of the second phase of the mlir bridge. +void IncrementTfMlirBridgeSecondPhaseCounter( + MlirBridgeSecondPhaseMetric metric); + +// Records the activity per op using the +// tf_metadata.tf_mlir_bridge_graph_analysis_per_op. +// op_name: the name of op. +// construction_context: eager, session, Not tracked. +// is_single_core_inference_mode: true, false. +// unsupported_reason: the reason why the graph is not supported in MLIR-based +// bridge, like invalid graph, has unsupported ops, etc. +// has_unsupported_features: true indicates MLIR-based bridge is disabled, +// false indicates MLIR-based bridge is enabled. 
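// --- Editorial usage sketch (not part of the upstream header). Recording one
// phase-1 outcome and one phase-2 outcome; the label strings follow the
// examples listed in the comment above ("nonreplicated", "v2", "tpu",
// "success"), and the call site itself is hypothetical.
inline void RecordBridgeSuccessExample() {
  UpdateTfMlirBridgeFirstPhaseCounter(/*bridge_type=*/"nonreplicated",
                                      /*bridge_version=*/"v2",
                                      /*device_type=*/"tpu",
                                      /*fallback_enabled=*/true,
                                      /*result=*/"success");
  IncrementTfMlirBridgeSecondPhaseCounter(
      MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeSuccess);
}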
+ +void UpdateTfMlirBridgeGraphAnalysisPerOp( + const std::string& op_name, const std::string& construction_context, + bool is_single_core_inference_mode, const std::string& num_replicas, + const std::string& num_cores_per_replica, const std::string& use_tpu, + const std::string& allow_soft_placement, + const std::string& use_spmd_for_xla_partitioning, + const std::string& unsupported_reason, bool has_unsupported_features); + +// Records whether a graph contains any of the TF1 features +void RecordTFVersionByGraphFeatures(const std::string& device, + const std::string& context, + bool hasControlFlowV1, + bool hasReferenceVariables, + bool hasManualControlDeps); + +// Convenience class allowing RAII style of reporting for a monitoring::Counter. +template +class ScopedCounter final { + public: + ScopedCounter(monitoring::Counter* const counter, + const std::array& labels) + : counter_(counter), labels_(labels) { + Init(); + } + + // Report counter and stop it. Counter needs to be reset to perform + // next measurement. + void ReportAndStop() { + if (started_) { + started_ = false; + ReportInternal(std::make_index_sequence()); + } + } + + // Start the measurement with the new set of labels. + void Reset(const std::array& labels) { + labels_ = labels; + Init(); + } + + // Start the measurement with the existing set of labels. + void Reset() { Init(); } + + // Returns duration of the current interval in case the timer has started. + // Returns nullopt otherwise. + std::optional DurationMicroSec() const { + return started_ ? std::optional(accumulated_time_ + + Env::Default()->NowMicros() - + start_time_) + : std::nullopt; + } + + // Temporarily stop the timer, but keep accumulated time. + void AccumulateAndStop() { + if (started_) { + accumulated_time_ = Env::Default()->NowMicros() - start_time_; + started_ = false; + } + } + + // Start previously stopped timer. + void Start() { + if (started_) return; + + // Keep previously accumulated time if any. + start_time_ = Env::Default()->NowMicros(); + started_ = true; + } + + ~ScopedCounter() { ReportAndStop(); } + + private: + template + void ReportInternal(std::index_sequence) { + uint64 time_interval = Env::Default()->NowMicros() - start_time_; + time_interval += accumulated_time_; + if (time_interval > 0) { + counter_->GetCell(labels_[S]...)->IncrementBy(time_interval); + } + } + + void Init() { + start_time_ = Env::Default()->NowMicros(); + started_ = true; + accumulated_time_ = 0; + } + + monitoring::Counter* counter_; + std::array labels_; + bool started_{false}; + uint64 start_time_; + uint64 accumulated_time_; +}; + +// Returns a counter used to capture timing metrics for graph optimization +// passes. +monitoring::Counter<2>* GetGraphOptimizationCounter(); + +// Updates metrics for time to distribute variables to all TPU hosts. +void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs); + +// Updates the metrics stored about time XLA spents compiling graphs. +void UpdateXlaCompilationTime(const uint64 compilation_time_usecs); + +// Increments (by 1) a simple integer counter that is exposed for testing. +void IncrementTestCounter(const string& name, const string& label); + +// Read-only access to a counter for testing. +const monitoring::CounterCell* TestCounter(const string& name, + const string& label); + +// Read-only wrapper for a TestCounter to track increments between calls. 
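// --- Editorial usage sketch (not part of the upstream header). RAII timing
// with ScopedCounter: the template arguments were stripped in this vendored
// copy, so the sketch assumes the upstream form in which ScopedCounter is
// parameterized by the number of labels and GetGraphOptimizationCounter()
// returns a two-label counter. The label values are illustrative.
inline void TimeOptimizationPassExample() {
  ScopedCounter<2> timer(GetGraphOptimizationCounter(),
                         {"GraphOptimizationPass", "MyExamplePass"});
  // ... run the pass being timed ...
  timer.AccumulateAndStop();  // Pause the timer around untimed bookkeeping.
  timer.Start();              // Resume; previously accumulated time is kept.
  // ... more timed work ...
  timer.ReportAndStop();      // Or simply rely on the destructor to report.
}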
+class TestDelta { + public: + TestDelta(const string& name, const string& label); + void Reset(); + int64 Get(); + + private: + const monitoring::CounterCell* cell_; + int64 last_value_; +}; +void UpdateTpuErrorCounter(const string& op, const string& error_type); +void UpdateEagerClientErrorCounter(const string& error_source, + const string& error_type); + +} // namespace metrics +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/model.h b/third_party/tflite-hdrs/tensorflow/core/framework/model.h new file mode 100644 index 00000000..4c78ec7a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/model.h @@ -0,0 +1,1294 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_MODEL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_MODEL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +// TODO(b/114492873): Move this include into core/platform. +#include +#include // NOLINT +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/types/optional.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/metrics.h" +#include "tensorflow/core/framework/model.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/stringprintf.h" +#include "tsl/platform/mutex.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { +namespace model { + +// A constant that can be used to enable auto-tuning. +constexpr int64_t kAutotune = -1; +constexpr char kParallelism[] = "parallelism"; +constexpr char kBufferSize[] = "buffer_size"; +constexpr char kCycleLength[] = "cycle_length"; +constexpr char kDeterministic[] = "deterministic"; +constexpr char kMaxBufferedElements[] = "max_buffered_elements"; + +// A key used to identify the input time of the model. +constexpr char kModelInputTimeKey[] = "model_input_time"; + +// Default share of available RAM that can be used by model's internal buffers. +constexpr double kRamBudgetShare = 0.5; + +// Weight of the latest processing time used in computing the exponential moving +// average of processing time per element. 
+constexpr double kProcessingTimeEmaWeight = 0.1; + +enum class TraversalOrder { + BFS = 0, + REVERSE_BFS = 1, +}; + +// Represents thread-safe state that can be shared between an input pipeline and +// the performance model. +struct SharedState { + public: + SharedState(int64_t value, std::shared_ptr mu, + std::shared_ptr cond_var) + : value(value), + mu(std::move(mu)), + cond_var(std::move(cond_var)), + tunable(value == kAutotune) {} + + double value; + const std::shared_ptr mu; + const std::shared_ptr cond_var; + const bool tunable; +}; + +// Represents a parameter. +struct Parameter { + Parameter(const string& name, std::shared_ptr state, double min, + double max) + : name(name), + // Sometimes non-autotune nodes (with `autotune_=false`) may contain + // parameters (for example inputs of parallel interleave dataset which + // are not in the current cycle). To avoid unrealistic situation + // (say `buffer_size=-1` or `parallelism=-1`) in the optimization + // computation, if the state value is `kAutotune=-1` (just to indicate + // the `SharedState` is tunable), we initialize the parameter value to + // be the minimal value of the state. + value(state == nullptr || state->value == kAutotune ? min + : state->value), + min(min), + max(max), + state(std::move(state)) {} + + explicit Parameter(const std::shared_ptr parameter) + : name(parameter->name), + value(parameter->value), + min(parameter->min), + max(parameter->max), + state(parameter->state) {} + + // Human-readable name of the parameter. + const string name; + + // Identifies the model value of the parameter. This can be different from + // the actual value (e.g. during optimization search). + double value; + + // Identifies the minimum value of the parameter. + const double min; + + // Identifies the maximum value of the parameter. + const double max; + + // Shared state of the parameter. + std::shared_ptr state; +}; + +// Returns a new tunable parameter with the value set to `min`. +std::shared_ptr MakeParameter(const string& name, + std::shared_ptr state, + double min, double max); + +// Returns a new tunable parameter with the value set to `value` instead +// of `min`. +std::shared_ptr MakeParameter(const string& name, + std::shared_ptr state, + double min, double max, double value); + +// Returns a new non-tunable parameter. +std::shared_ptr MakeNonTunableParameter(const string& name, + double value); + +// Class for managing the ram budget of an iterator. This is necessary for +// coordinating ram usage between the model-based autotuner and the legacy +// prefetch autotuner. Once the legacy autotuner is retired we can remove this +// class and move all ram budget management to the model autotuner. +class RamBudgetManager { + public: + explicit RamBudgetManager(int64_t budget) : budget_(budget) { + if (budget <= 0) { + LOG(WARNING) << "RAM budget is " << budget + << " which could prevent autotuner from properly adjusting " + "buffer sizes."; + } + } + + // Requests a new total memory allocation for the parts of the dataset + // tuned by the model. + // + // The autotuner is expected to follow a pattern like + // + // int64_t budget = ram_budget_manager.AvailableModelRam(); + // NewModel potential_new_params = OptimizeModel(budget); + // int64_t new_ram_used = potential_new_params.RamUsed(); + // if (ram_budget_manager.RequestModelAllocation(new_ram_used)) { + // ApplyModel(potential_new_params); + // } + // + // Returns whether the request succeeded. 
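// --- Editorial usage sketch (not part of the upstream header; it would live
// in autotuner code rather than inside this class). It walks through how the
// budget is shared between the legacy prefetch autotuner and the model
// autotuner; the byte values are arbitrary.
inline void RamBudgetManagerExample() {
  RamBudgetManager mgr(/*budget=*/100 << 20);            // 100 MiB in total.
  mgr.RequestLegacyPrefetchBytes(30 << 20);              // Prefetch reserves 30 MiB.
  int64_t model_budget = mgr.AvailableModelRam();        // 70 MiB left for the model.
  bool fits = mgr.RequestModelAllocation(80LL << 20);    // false: exceeds the 70 MiB left.
  fits = mgr.RequestModelAllocation(60LL << 20);         // true: 60 MiB is reserved.
  (void)model_budget;
  (void)fits;
}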
+ bool RequestModelAllocation(int64_t total_bytes) { + mutex_lock l(mu_); + if (total_bytes > budget_ - legacy_prefetch_allocated_) { + return false; + } + model_allocated_ = total_bytes; + return true; + } + + // Requests `delta_elements` allocated to the model where each element is of + // size `element_size` bytes. `delta_elements` can be negative. + // Returns the actual allocated delta elements. + int64_t RequestModelBytes(int64_t delta_elements, double element_size) { + if (delta_elements == 0) { + return 0; + } + int64_t allocated_delta_elements = delta_elements; + mutex_lock l(mu_); + // If `delta_elements` is positive, allocate only up to the available + // memory. + if (delta_elements > 0) { + int64_t max_delta_elements = static_cast( + (budget_ - legacy_prefetch_allocated_ - model_allocated_) / + element_size); + if (max_delta_elements < 0) { + return 0; + } + allocated_delta_elements = std::min(max_delta_elements, delta_elements); + } + model_allocated_ += + static_cast(allocated_delta_elements * element_size); + return allocated_delta_elements; + } + + // Requests `bytes` additional bytes for the purpose of legacy prefetch + // autotuning. + // + // Unlike RequestModelAllocation, we use a delta number of bytes, since there + // can only be one model per iterator but there may be multiple legacy + // prefetch autotuners. + // + // Returns whether there were enough bytes left in the budget to serve the + // request. If not, no bytes are allocated. + bool RequestLegacyPrefetchBytes(int64_t delta_bytes) { + mutex_lock l(mu_); + if (delta_bytes > budget_ - legacy_prefetch_allocated_ - model_allocated_) { + return false; + } + legacy_prefetch_allocated_ += delta_bytes; + return true; + } + + // The total number of bytes that the model could potentially use. + int64_t AvailableModelRam() const { + tf_shared_lock l(mu_); + return budget_ - legacy_prefetch_allocated_; + } + + void UpdateBudget(int64_t budget) { + mutex_lock l(mu_); + budget_ = budget; + VLOG(2) << "Updated ram budget to " << budget; + } + + std::string DebugString() { + mutex_lock l(mu_); + return absl::StrCat("RamBudgetManager: budget_: ", budget_, + " prefetch allocated: ", legacy_prefetch_allocated_, + " model allocated: ", model_allocated_); + } + + private: + mutable mutex mu_; + int64_t budget_ TF_GUARDED_BY(mu_) = 0; + // Number of bytes allocated by legacy prefetch autotuner. + int64_t legacy_prefetch_allocated_ TF_GUARDED_BY(mu_) = 0; + // Number of bytes allocated by the model. + int64_t model_allocated_ TF_GUARDED_BY(mu_) = 0; +}; + +// Abstract representation of a TensorFlow input pipeline node. It collects +// information about inputs to this node, processing time spent executing the +// node logic, number of elements produced by the node, various other +// information (e.g. batch size or execution parallelism). +// +// Developers of tf.data transformations are not expected to interact with +// this class directly. Boiler plate code for creating the abstract +// representation of the input pipeline and collecting common information has +// been added to the implementation of `DatasetBase` and `DatasetBaseIterator` +// respectively. +// +// In addition, `DatasetBaseIterator` provides wrappers that can be used for +// transformation-specific information collection. 
The `SetMetadata` wrapper +// can be used to pass arbitrary metadata to the modeling framework, while the +// `StartWork` and `StopWork` wrappers should be used to correctly account for +// processing time of multi-threaded transformation that yield the CPU; such +// transformations should invoke `StartWork()` when a transformation thread +// starts executing (e.g. when created or woken up) and `StopWork()` when a +// transformation thread stops executing (e.g. when returning or waiting). +class Node { + public: + // Arguments for `Node` constructor. + struct Args { + int64_t id; + string name; + std::shared_ptr output; + }; + + using Factory = std::function(Args)>; + using NodeVector = std::vector>; + using NodePairList = + std::list, std::shared_ptr>>; + using ModelParameters = + std::vector>>; + using NodeValues = absl::flat_hash_map; + using ParameterGradients = + absl::flat_hash_map, double>; + + explicit Node(Args args) + : id_(args.id), + name_(std::move(args.name)), + autotune_(true), + buffered_bytes_(0), + peak_buffered_bytes_(0), + buffered_elements_(0), + buffered_elements_low_(std::numeric_limits::max()), + buffered_elements_high_(std::numeric_limits::min()), + bytes_consumed_(0), + bytes_produced_(0), + num_elements_(0), + processing_time_(0), + record_metrics_(true), + metrics_(name_), + output_(args.output.get()), + output_weak_ptr_(args.output) {} + + virtual ~Node() { + // Clear the sub-nodes instead of relying on implicit shared pointer + // destructor to avoid potential stack overflow when the tree is deep. + std::deque> queue; + { + mutex_lock l(mu_); + while (!inputs_.empty()) { + queue.push_back(inputs_.front()); + inputs_.pop_front(); + } + } + while (!queue.empty()) { + auto node = queue.back(); + queue.pop_back(); + { + mutex_lock l(node->mu_); + while (!node->inputs_.empty()) { + queue.push_back(node->inputs_.front()); + node->inputs_.pop_front(); + } + } + } + + FlushMetrics(); + } + + // Adds an input. + void add_input(std::shared_ptr node) TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + inputs_.push_back(node); + } + + // Increments the aggregate processing time by the given delta. + void add_processing_time(int64_t delta) TF_LOCKS_EXCLUDED(mu_) { + processing_time_ += delta; + } + + // Returns an indication whether autotuning is enabled for this node. + bool autotune() const TF_LOCKS_EXCLUDED(mu_) { return autotune_; } + + // Returns the number of bytes stored in this node's buffer. + int64_t buffered_bytes() const TF_LOCKS_EXCLUDED(mu_) { + return buffered_bytes_; + } + + // Returns the peak number of bytes stored in this node's buffer. + int64_t peak_buffered_bytes() const TF_LOCKS_EXCLUDED(mu_) { + return peak_buffered_bytes_; + } + + // Returns the number of elements stored in this node's buffer. + int64_t buffered_elements() const TF_LOCKS_EXCLUDED(mu_) { + return buffered_elements_; + } + + // Returns the low watermark of the number of elements stored in this node's + // buffer. The watermarks are reset at the beginning of the execution time and + // each time the buffer is upsized or downsized. + int64_t buffered_elements_low() const TF_LOCKS_EXCLUDED(mu_) { + return buffered_elements_low_; + } + + // Returns the high watermark of the number of elements stored in this node's + // buffer. The watermarks are reset at the beginning of the execution time and + // each time the buffer is upsized or downsized. 
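// --- Editorial usage sketch (not part of the upstream header; this accounting
// is normally done by DatasetBaseIterator boilerplate). An asynchronous
// iterator would pair calls like these around its internal buffer so that the
// byte/element counters and the watermarks above reflect real occupancy;
// `element_bytes` and the helper names are illustrative.
inline void OnElementBuffered(const std::shared_ptr<Node>& node,
                              int64_t element_bytes) {
  node->record_buffer_event(element_bytes, /*elements_delta=*/1);
}

inline void OnElementDequeued(const std::shared_ptr<Node>& node,
                              int64_t element_bytes) {
  node->record_buffer_event(-element_bytes, /*elements_delta=*/-1);
}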
+ int64_t buffered_elements_high() const TF_LOCKS_EXCLUDED(mu_) { + return buffered_elements_high_; + } + + // Returns the number of bytes consumed by the node. + int64_t bytes_consumed() const TF_LOCKS_EXCLUDED(mu_) { + return bytes_consumed_; + } + + // Returns the number of bytes produced by the node. + int64_t bytes_produced() const TF_LOCKS_EXCLUDED(mu_) { + return bytes_produced_; + } + + // Indicates whether the node has tunable parameters. + bool has_tunable_parameters() const TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + for (const auto& pair : parameters_) { + if (pair.second->state->tunable) return true; + } + return false; + } + + // Returns the unique node ID. + int64_t id() const TF_LOCKS_EXCLUDED(mu_) { return id_; } + + // Returns the node inputs. + std::list> inputs() const TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + return inputs_; + } + + // Returns a longer node name that is guaranteed to be unique. + string long_name() const { return strings::StrCat(name_, "(id:", id_, ")"); } + + // Returns the node name. + const string& name() const { return name_; } + + // Returns the number of elements produced by the node. + int64_t num_elements() const TF_LOCKS_EXCLUDED(mu_) { return num_elements_; } + + // Returns the node output. + Node* output() const { return output_; } + std::shared_ptr output_shared() { return output_weak_ptr_.lock(); } + + // Returns the parameter value. + double parameter_value(const string& name) const TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + return parameters_.at(name)->state->value; + } + + // Returns the aggregate processing time. + int64_t processing_time() const TF_LOCKS_EXCLUDED(mu_) { + return processing_time_; + } + + // Records that the node consumed the given number of bytes. + void record_bytes_consumed(int64_t num_bytes) { + bytes_consumed_ += num_bytes; + } + + // Records that the node produced the given number of bytes. + void record_bytes_produced(int64_t num_bytes) { + bytes_produced_ += num_bytes; + } + + // Records the change in this node's buffer. + void record_buffer_event(int64_t bytes_delta, int64_t elements_delta) { + buffered_bytes_ += bytes_delta; + peak_buffered_bytes_.store(std::max(peak_buffered_bytes_, buffered_bytes_)); + buffered_elements_ += elements_delta; + // There is no need to maintain watermarks for synchronous ops because we + // will not upsize or downsize the buffers of synchronous ops. + if (IsAsync()) { + int64_t low_watermark = + std::min(buffered_elements_low_, buffered_elements_); + buffered_elements_low_ = low_watermark; + int64_t high_watermark = + std::max(buffered_elements_high_, buffered_elements_); + buffered_elements_high_ = high_watermark; + } + } + + // Records that the node produced an element. + void record_element() TF_LOCKS_EXCLUDED(mu_) { + num_elements_++; + { + mutex_lock l(mu_); + UpdateProcessingTimeEma(); + } + } + + // Records that a node thread has started executing. + void record_start(int64_t time_nanos) TF_LOCKS_EXCLUDED(mu_) { + DCHECK_EQ(work_start_, 0); + work_start_ = time_nanos; + } + + // Records that a node thread has stopped executing. + void record_stop(int64_t time_nanos) TF_LOCKS_EXCLUDED(mu_) { + // TODO(jsimsa): Use DCHECK_NE(work_start_, 0) here. + if (work_start_ != 0) { + processing_time_ += time_nanos - work_start_; + work_start_ = 0; + } else { + VLOG(1) << "Encountered a stop event without a matching start event."; + } + } + + // Returns whether work is currently being recorded, i.e. 
whether we are + // currently between a `record_start` and a `record_stop`. + bool is_recording() TF_LOCKS_EXCLUDED(mu_) { return work_start_ > 0; } + + // Removes an input. + void remove_input(std::shared_ptr input) TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + inputs_.remove(input); + } + + // Sets the value that determines whether autotuning is enabled for this node. + void set_autotune(bool autotune) TF_LOCKS_EXCLUDED(mu_) { + autotune_.store(autotune); + } + + // Resets buffer watermarks to the current buffered elements. + void ResetBufferWatermarks() { + if (!IsAsync()) { + return; + } + int64_t current_buffer_size = buffered_elements_; + buffered_elements_low_ = current_buffer_size; + buffered_elements_high_ = current_buffer_size; + } + + // Returns true for asynchronous nodes; false otherwise. + virtual bool IsAsync() const { return false; } + + // Returns the ratio of the node, which is defined as the number of elements + // per input needed by the node to produce an element, e.g. batch size of a + // `Batch`. It can be 0 if the ratio is unknown. + virtual double Ratio() const { return 1.0; } + + // Computes the self time in nanoseconds of the node to produce one element. + virtual double ComputeSelfTime() const; + + // Returns the parameter value if it exists, not ok status otherwise. + absl::StatusOr ParameterValue(const std::string& parameter_name) const + TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + if (parameters_.contains(parameter_name)) { + return parameters_.at(parameter_name)->value; + } + return errors::NotFound("Parameter ", parameter_name, + " was not found in model node ", long_name()); + } + + // Given the average time between events when the elements in the buffer are + // produced (`producer_time`), the average time between events when elements + // in the buffer are consumed (`consumer_time`) and the buffer size, the + // method computes the expected time a consumer event will have to wait. + // + // The wait time is approximated as the product of the probability the buffer + // will be empty and the time it takes to produce an element into the buffer. + // + // The formula used for computing the probability is derived by modeling the + // problem as an M/M/1/K queue + // (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue). + // + // Collects derivatives of `ComputeWaitTime` w.r.t `producer_time`, + // `consumer_time' and `buffer_size` if the corresponding pointers are not + // `nullptr`. + static double ComputeWaitTime(double producer_time, double consumer_time, + double buffer_size, + double* producer_time_derivative, + double* consumer_time_derivative, + double* buffer_size_derivative); + + // Collects tunable parameters in the subtree rooted in this node. + ModelParameters CollectTunableParameters() const TF_LOCKS_EXCLUDED(mu_); + + // Collects tunable parameters in this node. + ModelParameters CollectNodeTunableParameters() const TF_LOCKS_EXCLUDED(mu_); + + // Returns a human-readable representation of this node. + string DebugString() const TF_LOCKS_EXCLUDED(mu_); + + // Flushes the metrics recorded by this node. + void FlushMetrics() TF_LOCKS_EXCLUDED(mu_); + + // Returns the per-element output time for this node and if `gradients` is not + // `nullptr`, collects the output time gradient w.r.t. tunable parameters of + // the subtree rooted in this node. 
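// --- Editorial usage sketch (not part of the upstream header). Querying the
// expected consumer wait time for a hypothetical prefetch buffer; passing
// nullptr for the derivative outputs skips gradient collection. Times are in
// nanoseconds and the numbers are made up.
inline double ExpectedWaitTimeExample() {
  // The producer takes ~2ms per element, the consumer asks every ~1ms, and
  // the buffer holds 8 elements.
  return Node::ComputeWaitTime(/*producer_time=*/2.0e6,
                               /*consumer_time=*/1.0e6,
                               /*buffer_size=*/8.0,
                               /*producer_time_derivative=*/nullptr,
                               /*consumer_time_derivative=*/nullptr,
                               /*buffer_size_derivative=*/nullptr);
}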
+ double OutputTime(NodeValues* input_times, + ParameterGradients* gradients) const TF_LOCKS_EXCLUDED(mu_); + + // Returns a copy of this node, making a deep copy of its inputs and a + // shallow copy of its tunable parameters. + // + // The purpose for this method is to allow the model optimization logic to + // operate over immutable state while allowing concurrent model updates. + std::shared_ptr Snapshot() const TF_LOCKS_EXCLUDED(mu_); + + // Returns the per-element processing time in nanoseconds spent in this node. + double SelfProcessingTime() const TF_LOCKS_EXCLUDED(mu_); + + // Returns the total number of bytes buffered in all nodes in the subtree for + // which autotuning is enabled. + double TotalBufferedBytes() const TF_LOCKS_EXCLUDED(mu_); + + // Collects the total buffer limit of all nodes in the subtree for which + // autotuning is enabled. This number represents the amount of memory that + // would be used by the subtree nodes if all of their buffers were full. + double TotalMaximumBufferedBytes() const TF_LOCKS_EXCLUDED(mu_); + + // Returns the per-element CPU time in nanoseconds spent in the subtree rooted + // in this node. If `processing_times` is not `nullptr`, collects the + // per-element CPU time spent in each node of the subtree. + double TotalProcessingTime(NodeValues* processing_times) + TF_LOCKS_EXCLUDED(mu_); + + // Produces a proto for this node. Does not produce a proto for input nodes. + virtual absl::Status ToProto(ModelProto::Node* node_proto) const; + + // Restores a node from the proto. Does not restore input nodes. + static absl::Status FromProto(ModelProto::Node node_proto, + std::shared_ptr output, + std::shared_ptr* node); + + // Returns a vector of nodes of the subtree rooted in this node. The nodes are + // either in breadth-first search or reverse breadth-first search order + // depending on the `order` argument. The nodes are collected based on the + // results of the `collect_node` predicate: if the predicate returns `false` + // for a given node, then the subtree rooted in this node is excluded. The + // root node itself is not collected. + NodeVector CollectNodes(TraversalOrder order, + bool collect_node(const std::shared_ptr)) const + TF_LOCKS_EXCLUDED(mu_); + + // Downsizes buffer parameters of this node. Returns true if any buffer is + // downsized. + bool TryDownsizeBuffer(); + + // Collects buffer parameters of this node that should be upsized. + void CollectBufferParametersToUpsize( + absl::flat_hash_map& node_parameters); + + // Returns the average size of an element buffered in this node. + double AverageBufferedElementSize() const { + tf_shared_lock l(mu_); + return AverageBufferedElementSizeLocked(); + } + + // Copies node's parameter state value to parameter value if the parameter + // name matches `parameter_name`. + void SyncStateValuesToParameterValues(const std::string& parameter_name); + + void SetEstimatedElementSize(std::optional estimated_element_size) { + mutex_lock l(mu_); + estimated_element_size_ = estimated_element_size; + } + + protected: + // Used for (incrementally) recording metrics. The class is thread-safe. 
+ class Metrics { + public: + explicit Metrics(const string& name) + : bytes_consumed_counter_(metrics::GetTFDataBytesConsumedCounter(name)), + bytes_produced_counter_(metrics::GetTFDataBytesProducedCounter(name)), + num_elements_counter_(metrics::GetTFDataElementsCounter(name)), + recorded_bytes_consumed_(0), + recorded_bytes_produced_(0), + recorded_num_elements_(0) {} + + // Expects the total number of bytes consumed and records the delta since + // last invocation. + void record_bytes_consumed(int64_t total_bytes) { + int64_t delta = + total_bytes - recorded_bytes_consumed_.exchange(total_bytes); + bytes_consumed_counter_->IncrementBy(delta); + } + + // Expects the total number of bytes produced and records the delta since + // last invocation. + void record_bytes_produced(int64_t total_bytes) { + int64_t delta = + total_bytes - recorded_bytes_produced_.exchange(total_bytes); + bytes_produced_counter_->IncrementBy(delta); + } + + // Expects the total number of elements produced and records the delta since + // last invocation. + void record_num_elements(int64_t total_elements) { + int64_t delta = + total_elements - recorded_num_elements_.exchange(total_elements); + num_elements_counter_->IncrementBy(delta); + } + + private: + monitoring::CounterCell* const bytes_consumed_counter_; + monitoring::CounterCell* const bytes_produced_counter_; + monitoring::CounterCell* const num_elements_counter_; + std::atomic recorded_bytes_consumed_; + std::atomic recorded_bytes_produced_; + std::atomic recorded_num_elements_; + }; + + // Computes the exponential moving average of processing time per element. + void UpdateProcessingTimeEma() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (previous_processing_time_ == 0) { + if (num_elements_ > 0) { + processing_time_ema_ = + static_cast(processing_time_) / + static_cast(num_elements_ + buffered_elements_); + } else { + processing_time_ema_ = static_cast(processing_time_); + } + } else { + processing_time_ema_ = + (1.0 - kProcessingTimeEmaWeight) * processing_time_ema_ + + kProcessingTimeEmaWeight * + static_cast(processing_time_ - previous_processing_time_); + } + previous_processing_time_ = processing_time_; + } + + // Returns the number of inputs. + int64_t num_inputs() const TF_SHARED_LOCKS_REQUIRED(mu_) { + int64_t num_inputs = 0; + for (auto& input : inputs_) { + // Inputs for which autotuning is disabled are excluded. + if (input->autotune()) { + ++num_inputs; + } + } + return num_inputs; + } + + // Creates a clone of this node. + virtual std::shared_ptr Clone(std::shared_ptr output) const + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + + // Returns the average size of an element buffered in this node. + double AverageBufferedElementSizeLocked() const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Returns the sum of per-element output time for the tunable inputs of this + // node. + double OutputTimeForInputs(const NodeValues& output_times) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Returns the sum of output time gradient w.r.t. input time for the tunable + // inputs of this node. + double OutputTimeGradientsForInputs(const NodeValues& output_time_gradients) + const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Computes the input time for this node and stores it in `input_times`. + virtual void InputTimeLocked(NodeValues* input_times) const + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + + // Computes the per-element output time for this node and stores it in + // `output_times`. If `gradients` is not `nullptr`, computes the output time + // gradient w.r.t. 
tunable parameters of the subtree rooted in this node and + // stores it in `gradients`, also computes the output time gradient w.r.t. + // input time and stores it in `output_time_gradients`. + virtual void OutputTimeLocked(const NodeValues& input_times, + ParameterGradients* gradients, + NodeValues* output_times, + NodeValues* output_time_gradients) const + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + + // Returns the sum of per-element processing time for the inputs of this node + // by adding values for input nodes in `total_processing_times`. Processing + // time for a given input is a weighted combination of a statistic based on + // history of input processing time and the actual time. This is done to + // improve accuracy of processing time estimation for newly created inputs. + // + // Uniform distribution of per-element processing times across different + // inputs is assumed. + double TotalProcessingTimeForInputs(const NodeValues& total_processing_times) + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Returns the per-element processing time spent in this node. + double SelfProcessingTimeLocked() const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Computes the per-element CPU time spent in the subtree rooted in this node + // and stores it in `total_processing_times`. If `processing_times` is not + // `nullptr`, collects the per-element CPU time spent in each node of the + // subtree. + virtual void TotalProcessingTimeLocked(NodeValues* processing_times, + NodeValues* total_processing_times) + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + + // This is the locked version of the public `CollectNodes`. + NodeVector CollectNodesLocked(TraversalOrder order, + bool collect_node(const std::shared_ptr)) + const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Collects tunable parameters in the subtree rooted in this node assuming + // mutex locked. + ModelParameters CollectTunableParametersLocked() const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Collect tunable parameters on the nodes which have recorded + // elements. + void CollectTunableParametersHelper(ModelParameters* parameters) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Build up debug string for the node and store in the debug strings map. + void DebugStringHelper(absl::flat_hash_map* debug_strings) + const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Copy the node and add the (input, copy) pairs to the NodePairList. + std::shared_ptr SnapshotHelper(std::shared_ptr cloned_output, + NodePairList* node_pairs) const; + + // Compute total buffered bytes for the node and store in the total bytes map. + void TotalBufferedBytesHelper(NodeValues* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Compute total maximum buffered bytes for the node and store in the total + // bytes map. + void TotalMaximumBufferedBytesHelper(NodeValues* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Compute and return the maximum buffered bytes on the node itself. By + // default non-tunable nodes are assumed not to buffer any bytes, so the + // tunable nodes as subclasses are expected to override this method to ensure + // that the optimization algorithm respects the memory budget. + virtual double MaximumBufferedBytes() const TF_SHARED_LOCKS_REQUIRED(mu_); + + // Restores node from the proto. Note that this is not done recursively, i.e. + // input nodes are not restored. + static absl::Status FromProtoHelper(ModelProto::Node node_proto, + std::shared_ptr node); + + // Stores the time passed to the last call to `Node::record_start()` on the + // current thread. 
+ // + // NOTE: This thread-local variable is shared between all instances of `Node` + // on which the same thread calls `record_start()` or `record_stop()`. It + // relies on the invariant that at most one `Node` can be "active" on a + // particular thread at any time. Therefore if `n->record_start()` is called + // on thread `t`, then `n->record_stop()` must be called before another call + // to `Node::record_start()` (for any node). + static thread_local int64_t work_start_; // Will be initialized to zero. + + mutable mutex mu_; + const int64_t id_; + const string name_; + + // Indicates whether the subtree rooted in this node should be included in + // autotuning. In particular, if this is `false`, then the subtree is excluded + // from computation of output time and processing time. + std::atomic autotune_; + std::atomic buffered_bytes_; + std::atomic peak_buffered_bytes_; + std::atomic buffered_elements_; + std::atomic buffered_elements_low_; + std::atomic buffered_elements_high_; + std::atomic bytes_consumed_; + std::atomic bytes_produced_; + std::atomic num_elements_; + std::atomic processing_time_; + std::atomic record_metrics_; + Metrics metrics_; + absl::flat_hash_map> parameters_ + TF_GUARDED_BY(mu_); + + // Statistic of inputs processing time history. + double input_processing_time_sum_ = 0.0L; + int64_t input_processing_time_count_ = 0; + + // Holds the previous processing time and the per element processing time + // exponential moving average. + int64_t previous_processing_time_ TF_GUARDED_BY(mu_) = 0; + double processing_time_ema_ TF_GUARDED_BY(mu_) = 0.0; + + // Inputs of this node. These can represent an iterator created from the input + // dataset but also other input iterators (e.g. created by the user-defined + // functions of `flat_map` or `interleave`). + std::list> inputs_ TF_GUARDED_BY(mu_); + + // The reference to the output node is not owned so that deletion of a + // node results in recursive deletion of the subtree rooted in the node. + Node* const output_; + std::weak_ptr output_weak_ptr_; + std::optional estimated_element_size_ TF_GUARDED_BY(mu_) = + std::nullopt; +}; + +// InterleaveMany is used to model datasets whose inputs are used to create +// datasets whose elements are then interleaved. +std::shared_ptr MakeInterleaveManyNode( + Node::Args args, std::vector> parameters); + +// AsyncInterleaveMany nodes are the asynchronous version of InterleaveMany +// nodes. +std::shared_ptr MakeAsyncInterleaveManyNode( + Node::Args args, std::vector> parameters); + +// KnownMany nodes model datasets that synchronously consume known number of +// input element per output element. +std::shared_ptr MakeKnownRatioNode(Node::Args args, double ratio); + +// AsyncKnownRatio nodes are the asynchronous version of KnownRate nodes. +std::shared_ptr MakeAsyncKnownRatioNode( + Node::Args args, double ratio, double memory_ratio, + std::vector> parameters, + bool is_legacy_prefetch_autotuned = false); + +// Makes an AsyncKnownRatioNode. If `estimated_element_size` is provided, +// it will be used during the estimation of maximum buffered bytes. +std::shared_ptr MakeAsyncKnownRatioNode( + Node::Args args, double ratio, + std::vector> parameters, + bool is_legacy_prefetch_autotuned = false, + std::optional estimated_element_size = std::nullopt); + +// Source nodes represent data sources. +std::shared_ptr MakeSourceNode(Node::Args args); + +// UnknownMany nodes represent datasets that synchronously consume an +// unknown number of input elements per output. 
+// +// Unlike KnownRatio nodes which expect the ratio between inputs and outputs is +// specified as a parameter, UnknownRatio estimates the ratio empirically. +std::shared_ptr MakeUnknownRatioNode(Node::Args args); + +// AsyncUnknownRatio nodes are the asynchronous version of unknown ratio nodes. +std::shared_ptr MakeAsyncUnknownRatioNode( + Node::Args args, std::vector> parameters); + +// Unknown nodes represent datasets for which we do not have a model. It acts +// as pass-through between inputs and output. +std::shared_ptr MakeUnknownNode(Node::Args args); + +// Abstract representation of a TensorFlow input pipeline that can be used +// for collecting runtime information and optimizing performance. It collects +// runtime information about execution of the input pipeline that is used to +// create a performance model, which is in turn used to identify optimal values +// of tunable parameters. +// +// Developers of tf.data transformations are not expected to interact with this +// class directly. Boiler plate code for creating the abstract representation of +// the input pipeline and collecting runtime information has been added to the +// implementation of `DatasetBase` and `DatasetBaseIterator` respectively. +// +// The order of locks acquired is SharedState lock, Model lock, Node lock. +// SharedState lock is acquired first because it shares the same lock as the +// dataset iterator that contains it. +class Model { + public: + using OptimizationParams = ModelProto::OptimizationParams; + using ModelParameters = Node::ModelParameters; + using NodeValues = Node::NodeValues; + using ParameterGradients = Node::ParameterGradients; + + explicit Model(std::optional dataset_name); + explicit Model() : Model(std::nullopt) {} + ~Model(); + + // Returns a pointer to the model's output node. + std::shared_ptr output() const { + mutex_lock l(mu_); + return output_; + } + + // Set the experiment that this job is part of. + void AddExperiment(const std::string& experiment) { + experiments_.insert(experiment); + } + + // Adds a node with the given name and given parent. + void AddNode(Node::Factory factory, const string& name, + std::shared_ptr parent, std::shared_ptr* out_node) + TF_LOCKS_EXCLUDED(mu_); + + // Returns a human-readable string representation of the model. This method + // can be invoked automatically by monitoring gauges and to avoid frequent + // recomputation, the implementation caches the result. + std::string DebugString(); + + // Uses the given algorithm and resource budgets to periodically perform the + // autotuning optimization. + // + // `cpu_budget_func` can be used to provide the optimizer with up-to-date + // values in cases where CPUs budgets may be changed by the runtime + // dynamically. + // + // `ram_budget_func` is similar to `cpu_budget_func`. This lambda takes a + // parameter that is the total number of bytes currently buffered by the + // model. + // + // To terminate the execution of the optimization loop, the caller needs to + // invoke `cancellation_mgr->StartCancel()`. + absl::Status OptimizeLoop(AutotuneAlgorithm algorithm, + std::function cpu_budget_func, + double ram_budget_share, + std::optional fixed_ram_budget, + RamBudgetManager& ram_budget_manager, + CancellationManager* cancellation_manager); + + // Uses the given algorithm and resource budgets to perform the autotuning + // optimization. 
+  void Optimize(AutotuneAlgorithm algorithm,
+                std::function cpu_budget_func,
+                double ram_budget_share,
+                std::optional fixed_ram_budget,
+                double model_input_time, RamBudgetManager& ram_budget_manager,
+                CancellationManager* cancellation_manager);
+
+  // Optimizes buffers in the pipeline rooted at `snapshot`. It downsizes
+  // buffers that are too large and upsizes buffers that are too small while
+  // respecting the ram budget. If any node is downsized or upsized, the
+  // watermarks of all nodes are reset to the buffered elements.
+  void OptimizeBuffers(std::shared_ptr snapshot, int64_t ram_budget);
+
+  // Collects the output time and if `gradients` is not `nullptr`, the output
+  // time gradient w.r.t. tunable parameters of the subtree rooted in the given
+  // node.
+  double OutputTime(std::shared_ptr node, double model_input_time,
+                    ParameterGradients* gradients);
+
+  // Removes the given node.
+  void RemoveNode(std::shared_ptr node) TF_LOCKS_EXCLUDED(mu_);
+
+  // Produces a proto for this model.
+  absl::Status ToProto(ModelProto* model_proto);
+
+  // Restores a model from the proto.
+  static absl::Status FromProto(ModelProto model_proto,
+                                std::unique_ptr* model);
+
+  // Saves this model with a given snapshot and its optimization parameters to a
+  // file. Note that the file directory must already exist.
+  absl::Status Save(const string& fname, std::shared_ptr snapshot,
+                    const OptimizationParams& optimization_params);
+
+  // Loads a model and its optimization parameters from a file with the given
+  // name.
+  static absl::Status Load(const string& fname, std::unique_ptr* model,
+                           OptimizationParams* optimization_params);
+
+  // Records gap time between consecutive `GetNext()` calls.
+  void RecordIteratorGapTime(uint64_t duration_usec);
+
+  // Computes the target time in nsecs to use for `STAGE_BASED` autotune
+  // algorithm. Returns 0 if there are not sufficient recorded iterator
+  // gap times to produce a good estimate.
+  double ComputeTargetTimeNsec();
+
+  // Computes the target time in nsecs to use for estimating input bottlenecks.
+  // Returns 0 if there are not sufficient recorded iterator gap times to
+  // produce a good estimate.
+  double ComputeExperimentalTargetTimeNsec();
+
+  // Returns the time in nanoseconds it takes the pipeline to produce an
+  // element, according to the latest model snapshot obtained from optimization.
+  // Returns 0 if the model snapshot is empty or null. This may be caused by not
+  // having executed an optimization round before.
+  double ComputeSnapshotProcessingTimeNsec() const;
+
+ private:
+  // Determines whether optimization should stop given total processing time,
+  // estimated output time, and estimated number of buffered bytes.
+  using StopPredicate =
+      std::function;
+
+  static constexpr int64_t kOptimizationPeriodMinMs = 10;
+  static constexpr int64_t kOptimizationPeriodMaxMs =
+      60 * EnvTime::kSecondsToMillis;
+
+  // Collects tunable parameters in the tree rooted in the given node, returning
+  // a vector which contains pairs of node names and tunable parameters.
+  ModelParameters CollectTunableParameters(std::shared_ptr node);
+
+  // Copy parameter state values to parameter values if necessary. For some
+  // nodes, the parameter state values are not tuned by Autotune and hence the
+  // parameter values can be stale. We do not sync all parameters because it may
+  // increase mutex contention with `GetNext()`.
+  void MaybeSyncStateValuesToValues(std::shared_ptr snapshot);
+
+  // Downsizes buffers that are too large for all nodes rooted at `snapshot`.
+  // Returns true if any buffer is downsized.
+  bool DownsizeBuffers(std::shared_ptr snapshot);
+
+  // Upsizes buffers that are too small for all nodes rooted at `snapshot` while
+  // respecting the ram budget. Returns true if any buffer is upsized.
+  bool UpsizeBuffers(std::shared_ptr snapshot, int64_t ram_budget);
+
+  // Reset buffer watermarks of all asynchronous nodes to their buffered
+  // elements.
+  void ResetBufferWatermarks();
+
+  // Collects buffer parameters of all nodes in the model that should be
+  // upsized.
+  absl::flat_hash_map CollectBufferParametersToUpsize(
+      std::shared_ptr snapshot);
+
+  // Flushes metrics recorded by the model.
+  void FlushMetrics() TF_LOCKS_EXCLUDED(mu_);
+
+  // This optimization algorithm starts by setting all tunable parallelism
+  // parameters to the minimum value. It then improves current parameters by
+  // making a step in the direction opposite to the gradient of `OutputTime` and
+  // projecting resulting values on the feasible intervals. Improvement step is
+  // repeated until either the output time improvement is smaller than threshold
+  // value or the output time is less than the processing time needed to produce
+  // an element divided by CPU budget.
+  void OptimizeGradientDescent(std::shared_ptr snapshot,
+                               const OptimizationParams& optimization_params,
+                               CancellationManager* cancellation_manager);
+
+  // Helper method for implementing hill-climb optimization that can be
+  // parametrized by a predicate to use for stopping the optimization.
+  void OptimizeHillClimbHelper(std::shared_ptr snapshot,
+                               const OptimizationParams& optimization_params,
+                               CancellationManager* cancellation_manager,
+                               int64_t ram_budget,
+                               RamBudgetManager& ram_budget_manager,
+                               StopPredicate should_stop);
+
+  // This optimization algorithm starts by setting all tunable parallelism
+  // parameters to the minimum value. It then repeatedly identifies the
+  // parameter whose increase in parallelism decreases the output time the most.
+  // This process is repeated until all parameters reach their maximum values or
+  // the projected output time is less than or equal to the processing time
+  // needed to produce an element divided by CPU budget.
+  void OptimizeHillClimb(std::shared_ptr snapshot,
+                         const OptimizationParams& optimization_params,
+                         CancellationManager* cancellation_manager,
+                         RamBudgetManager& ram_budget_manager);
+
+  // This optimization behaves similarly to the hill climb optimization but uses
+  // a relaxed stopping condition, allowing the optimization to oversubscribe
+  // CPU.
+  void OptimizeMaxParallelism(std::shared_ptr snapshot,
+                              const OptimizationParams& optimization_params,
+                              CancellationManager* cancellation_manager,
+                              RamBudgetManager& ram_budget_manager);
+
+  // This optimization starts by setting all tunable parallelism parameters to
+  // their minimum values. It then repeatedly increases the parallelism
+  // parameter of the longest stage by 1 until either the longest stage is
+  // faster than the target time or the memory or CPU budget is fully utilized.
+  // TODO(b/226910071): The second part of this algorithm optimizes the buffer
+  // sizes of parallel ops.
+ void OptimizeStageBased(std::shared_ptr snapshot, + const OptimizationParams& optimization_params, + CancellationManager* cancellation_manager, + RamBudgetManager& ram_budget_manager); + + // This is the first part of the stage-based optimization that optimizes + // tunable parallelism parameters for async interleave many nodes only. We + // separately optimize async interleave many nodes more aggressively because + // the variance of IO is difficult to predict. + void OptimizeStageBasedAsyncInterleaveManyNodes( + std::shared_ptr snapshot, + const OptimizationParams& optimization_params, + CancellationManager* cancellation_manager, + RamBudgetManager& ram_budget_manager); + + // This is the second part of the stage-based optimization that optimizes + // tunable parallelism parameters for all nodes other than async interleave + // many nodes. + void OptimizeStageBasedNonAsyncInterleaveManyNodes( + std::shared_ptr snapshot, double target_time_nsec, + const OptimizationParams& optimization_params, + CancellationManager* cancellation_manager, + RamBudgetManager& ram_budget_manager); + + // Determines if we should stop the gradient descent optimization iterations + // based on number of increasable parameters, CPU budget, RAM budget and + // current resource usage. + bool ShouldStop(int64_t cpu_budget, int64_t ram_budget, + const ModelParameters& parameters, + const ModelParameters& parallelism_parameters, + const ModelParameters& buffer_size_parameters, + std::shared_ptr snapshot, bool* cpu_budget_reached); + + // Collects the processing time for the given node. + double TotalProcessingTime(std::shared_ptr node); + + // Collects the total number of bytes buffered in all nodes in the subtree + // rooted in the given node for which autotuning is enabled. + double TotalBufferedBytes(std::shared_ptr node); + + // Collects the total buffer limit of all nodes in the subtree rooted in the + // given node for which autotuning is enabled. This number represents the + // amount of memory that would be used by the subtree nodes if all of their + // buffers were full. + double TotalMaximumBufferedBytes(std::shared_ptr node); + + std::optional dataset_name_; + // Used for coordination between different input pipeline threads. Exclusive + // access is required only when adding or removing nodes. Concurrent access to + // existing nodes is protected by a node mutex. + mutable mutex mu_; + // Used for coordinating the optimization loop and model modifications. + condition_variable optimize_cond_var_; + int64_t id_counter_ TF_GUARDED_BY(mu_) = 1; + std::shared_ptr output_ TF_GUARDED_BY(mu_) = nullptr; + + // Determines the time the optimization loop should wait between + // running optimizations. + int64_t optimization_period_ms_ TF_GUARDED_BY(mu_); + + // Gauge cell that can be used to collect the state of the model. + monitoring::GaugeCell>* model_gauge_cell_ = + nullptr; + // Used to synchronize metrics collection attempts against the model's + // destruction. + struct GuardedBool { + explicit GuardedBool(bool val) : val(val) {} + bool val TF_GUARDED_BY(mu); + mutex mu; + }; + std::shared_ptr safe_to_collect_metrics_; + + // Time use for rate limiting the recomputation of human-readable string + // representation of the model. + absl::Time cache_until_ = absl::InfinitePast(); + // Cached result of the `DebugString()` invocation used to implement rate + // limiting of the computation. + std::string cached_debug_string_ = ""; + // Used to coordinate gap time updates between different threads. 
Gap time is + // the time between the completion of the previous `GetNext()` and the start + // of the next `GetNext()`. + mutable mutex gap_mu_; + // Stores the latest gap times between consecutive `GetNext()`. + std::deque gap_times_usec_ TF_GUARDED_BY(gap_mu_); + // The experiment that this job is part of. + absl::flat_hash_set experiments_; + // Stores the optimization snapshot of the Model. + std::shared_ptr snapshot_ TF_GUARDED_BY(mu_); + // Stores the optimization parameters used by autotune. + OptimizationParams optimization_params_ TF_GUARDED_BY(mu_); + // Stores the model id in the string format + std::string model_id_; +}; + +// Class to compute timing information for a model. +class ModelTiming { + public: + struct NodeTiming { + // Pipeline ratio is the number of elements this node needs to produce in + // order to produce an element at the root of the pipeline. + double pipeline_ratio = 0.0; + // The self time it takes this node to produce the elements needed to + // produce one element of the root of the pipeline. + double self_time_nsec = 0.0; + // The total time it takes this node and the subtree rooted at this node to + // produce the elements needed to produce one element at the root of the + // pipeline. + double total_time_nsec = 0.0; + }; + + explicit ModelTiming(std::shared_ptr root); + + // Returns the timing data for `node`. + const NodeTiming* GetTiming(const Node* node) const; + + // Returns the root nodes of all stages. + std::vector> GetStageRoots() const; + + // Returns all the nodes of a stage given the stage root. + std::vector> GetStageNodes( + std::shared_ptr stage_root) const; + + // Computes the total time for a node. + void ComputeNodeTotalTime(const Node& node); + + private: + // Computes the pipeline ratios of all nodes. + void ComputePipelineRatios(const Node::NodeVector& bfs_nodes); + + // Computes the total time for all nodes. The `reverse_bfs_nodes` are assumed + // to be a vector of model nodes in reversed BFS manner. + void ComputeTotalTimes(const Node::NodeVector& reverse_bfs_nodes); + + // Computes the first input total time of an interleave node. + double ComputeInterleaveManyFirstInputTotalTime(const Node& node); + + // Computes the total time of a node of any type other than async interleave. + void ComputeNonAsyncInterleaveManyTotalTime(const Node& node); + + // Computes the total time of an async interleave node. + void ComputeAsyncInterleaveManyTotalTime(const Node& node); + // Computes the interleaved inputs' total time of an async interleave node. + double ComputeAsyncInterleaveManyInterleavedInputsTotalTime(const Node& node); + + // Returns a vector of all nodes in the model. The nodes are either in + // breadth-first search or reverse breadth-first search order depending on the + // `order` argument. The nodes are collected based on the results of the + // `collect_node` predicate: if the predicate returns `false` for a given + // node, then the subtree rooted in this node is excluded. The root node + // itself is not collected. + Node::NodeVector CollectNodes( + std::shared_ptr root, TraversalOrder order, + bool collect_node(const std::shared_ptr)) const; + + // Stores a pointer to the root of a model. + std::shared_ptr root_; + + // Holds a mapping from node to its timing node. 
+ absl::flat_hash_map timing_nodes_; +}; + +} // namespace model +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/node_def_builder.h b/third_party/tflite-hdrs/tensorflow/core/framework/node_def_builder.h new file mode 100644 index 00000000..47b14f18 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/node_def_builder.h @@ -0,0 +1,198 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_ + +#include +#include + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_node_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { + +class NodeDefBuilder; +typedef std::function + FakeInputFunctor; + +// This is a helper for creating a NodeDef. Automatically sets attrs +// that can be inferred from the inputs, and uses default values +// (where they exist) for unspecified attrs. Example usage: +// +// NodeDef node_def; +// Status status = NodeDefBuilder(node_name, op_name) +// .Input(...) +// .Attr(...) +// .Finalize(&node_def); +// if (!status.ok()) return status; +// // Use node_def here. +class NodeDefBuilder { + public: + // To specify an output to be consumed by one of the Input() methods below. + struct NodeOut { + NodeOut(absl::string_view n, int i, DataType dt); + NodeOut(); // uninitialized, call Reset() before use. + void Reset(absl::string_view n, int i, DataType dt); + string node; + int index; + DataType data_type; + }; + + // Specify the name and the Op (either via an OpDef or the name of + // the Op plus a registry) for the NodeDef. Other fields are + // specified by calling the methods below. + // REQUIRES: The OpDef must satisfy ValidateOpDef(). + NodeDefBuilder(absl::string_view name, absl::string_view op_name, + const OpRegistryInterface* op_registry = OpRegistry::Global(), + const NodeDebugInfo* debug = nullptr); + NodeDefBuilder(absl::string_view name, absl::string_view op_name, + const NodeDebugInfo& debug); + // REQUIRES: in addition, *op_def must outlive *this. + NodeDefBuilder(absl::string_view name, const OpDef* op_def); + + // You must call one Input() function per input_arg in the Op, + // *and in the same order as the input_args appear in the OpDef.* + + // For inputs that take a single tensor. 
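A minimal sketch of the per-input_arg calling pattern described above, using the single-tensor Input() overload declared next; the op name "MyMatMul" and its attr names are hypothetical placeholders, not ops defined by this patch:

    NodeDef node_def;
    absl::Status status = NodeDefBuilder("matmul_node", "MyMatMul")
                              .Input("a", 0, DT_FLOAT)   // first input_arg
                              .Input("b", 0, DT_FLOAT)   // second input_arg
                              .Attr("transpose_a", false)
                              .Device("/device:CPU:0")
                              .Finalize(&node_def);
    if (!status.ok()) {
      // Handle the error; node_def is not usable here.
    }

As the comment above notes, the Input() calls must appear in the same order as the op's input_args; Finalize() reports any mismatch via the returned status.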
+ NodeDefBuilder& Input(absl::string_view src_node, int src_index, DataType dt); + NodeDefBuilder& Input(const NodeOut& src); + + // For inputs that take a list of tensors. + NodeDefBuilder& Input(absl::Span src_list); + + // To create inputs in tests, see fake_input.h. + NodeDefBuilder& Input(FakeInputFunctor fake_input); + + // Specify that this node must only run after src_node. + NodeDefBuilder& ControlInput(absl::string_view src_node); + + // Constrains what devices this node may be scheduled on. + NodeDefBuilder& Device(absl::string_view device_spec); + + // Sets the attr, if not already set. If already set with a different + // value, an error will be returned from Finalize(). + NodeDefBuilder& Attr(absl::string_view name, const AttrValue& value); + NodeDefBuilder& Attr(absl::string_view name, AttrValue&& value); + NodeDefBuilder& Attr(absl::string_view name, absl::string_view value); + NodeDefBuilder& Attr(absl::string_view name, const char* value); + NodeDefBuilder& Attr(absl::string_view name, int32_t value); + NodeDefBuilder& Attr(absl::string_view name, int64_t value); + NodeDefBuilder& Attr(absl::string_view name, float value); + NodeDefBuilder& Attr(absl::string_view name, double value); + NodeDefBuilder& Attr(absl::string_view name, bool value); + NodeDefBuilder& Attr(absl::string_view name, DataType value); + NodeDefBuilder& Attr(absl::string_view name, const PartialTensorShape& value); + NodeDefBuilder& Attr(absl::string_view name, const Tensor& value); + NodeDefBuilder& Attr(absl::string_view name, const TensorProto& value); + NodeDefBuilder& Attr(absl::string_view name, const NameAttrList& value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, const std::vector& value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, absl::Span value); + NodeDefBuilder& Attr(absl::string_view name, + absl::Span value); + + template + NodeDefBuilder& Attr(absl::string_view name, std::initializer_list value) { + return Attr(name, gtl::ArraySlice(value)); + } + + // Finish building the NodeDef, returning any errors or setting + // *node_def if none. + // If `consume` is true, the builder state will be moved into `node_def`, + // and the builder will be left in an undefined state. + // WARNING: Not all problems are detected! The resulting NodeDef may + // not be valid! Call ValidateNodeDef() from node_def_utils to be sure. + absl::Status Finalize(NodeDef* node_def, bool consume = false); + + // Accessors for the values set in the constructor. + const string& node_name() const { return node_def_.name(); } + const OpDef& op_def() const { return *op_def_; } + + private: + // Called in the constructors. + void Initialize(); + + // Get the current ArgDef and advance to the next one. 
Returns nullptr + // if no more inputs are available. + const OpDef::ArgDef* NextArgDef(); + + // Returns true if there is still an input_arg available in *op_def_, + // otherwise adds to error_ and returns false. + bool NextArgAvailable(); + + // These do the main work of the Input() methods. + void SingleInput(const OpDef::ArgDef* input_arg, absl::string_view src_node, + int src_index, DataType dt); + void ListInput(const OpDef::ArgDef* input_arg, + absl::Span src_list); + + // Add "src_node:src_index" to the list of inputs in the node_def_. + void AddInput(absl::string_view src_node, int src_index); + + // Generate an error if you can't pass dt when expected is expected. + void VerifyInputType(const OpDef::ArgDef* input_arg, DataType expected, + DataType dt); + + // If input_arg->is_ref() is true, generate an error if dt is not a ref. + void VerifyInputRef(const OpDef::ArgDef* input_arg, DataType dt); + + // Makes dt a ref type if that is what the input_arg specifies. + DataType MaybeAddRef(const OpDef::ArgDef* input_arg, DataType dt) { + return input_arg->is_ref() ? MakeRefType(dt) : dt; + } + + // Returns true if an attr named `name` is already present in the node_def_. + // If such an attr is already present and `value` is not equal to the present + // value, an error is generated. + bool AttrValueAlreadyPresent(absl::string_view name, const AttrValue& value); + + const OpDef* op_def_; + NodeDef node_def_; + int inputs_specified_; + std::vector control_inputs_; + std::vector errors_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/node_def_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/node_def_util.h new file mode 100644 index 00000000..2b82c596 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/node_def_util.h @@ -0,0 +1,462 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/hash.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { + +class AttrSlice; +// We forward declare protos so that kernels don't need to depend on them +class OpDef; +class AttrValue; +class NameAttrList; +class TensorProto; +class TensorShapeProto; + +// Name of the attribute used to encode node colocation constraints. +// +// Nodes can be co-located on the same device. Desire for explicit co-location +// is described by list(string) attribute containing the name of colocation +// groups. +extern const char* const kColocationAttrName; + +// String prefix applied to the operation name for colocation constraints. +extern const char* const kColocationGroupPrefix; + +// Constants for host CPU staging op for TPUExecute. +extern const char* const kTpuExecuteStagingOp; +extern const char* const kTpuExecuteStagingNodeName; + +// Produce a human-readable version of a Node or NodeDef that is more concise +// than a text-format proto. +// +// The parameter `max_inputs_in_summary` specifies how many inputs at most to +// serialize in the output (in order not to get a string which is overly large). +// The value `-1` specifies that all inputs will be shown. +std::string SummarizeNodeDef(const NodeDef& node_def, + int max_inputs_in_summary = -1); +std::string SummarizeAttrs(const NodeDef& node_def); +std::string SummarizeAttrsHelper(AttrSlice attrs, absl::string_view device); + +// Produces a formatted string pattern from the node which can uniquely identify +// this node upstream to produce an informative error message. The pattern +// followed is: {{node }} +std::string FormatNodeDefForError(const NodeDef& node_def); +std::string FormatNodeDefForError( + absl::string_view node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info); + +typedef protobuf::Map AttrValueMap; + +// Adds an attr with name and value to *node_def. +// The type of the attr is based on the type of value. 
+void AddNodeAttr(absl::string_view name, const AttrValue& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, AttrValue&& value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::string_view value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const char* value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, int32_t value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, int64_t value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, float value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, double value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, bool value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, DataType value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const PartialTensorShape& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const Tensor& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const TensorProto& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const NameAttrList& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, + absl::Span value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, const std::vector& value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, + absl::Span value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, + absl::Span value, NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); +void AddNodeAttr(absl::string_view name, absl::Span value, + NodeDef* node_def); + +// Version to workaround C++'s "perfect" forwarding not being able to +// forward {...} initialization. +template +void AddNodeAttr(absl::string_view name, std::initializer_list value, + NodeDef* node_def) { + AddNodeAttr(name, gtl::ArraySlice(value), node_def); +} + +// Adds an attr to an attr value map. +void AddAttr(absl::string_view name, const AttrValue& value, AttrValueMap* map); +void AddAttr(absl::string_view name, bool value, AttrValueMap* map); + +class AttrSlice { + public: + AttrSlice(const NodeDef& node_def); // NOLINT(runtime/explicit) + + AttrSlice(); // Empty + explicit AttrSlice(const AttrValueMap* a); + + int size() const { return attrs()->size(); } + + // Returns the attr with attr_name if found. Otherwise, returns + // nullptr. + const AttrValue* Find(absl::string_view attr_name) const; + const AttrValue* FindByString(const std::string& attr_name) const; + + // Returns the attr_value for attr_name if found. Otherwise, returns a + // NotFound status. + absl::Status Find(absl::string_view attr_name, + const AttrValue** attr_value) const; + absl::Status FindByString(const std::string& attr_name, + const AttrValue** attr_value) const; + + // Helper class to avoid allocations in EqualAttrs. 
+ // TODO(irving): Will go away once NodeInfo is used. + struct Scratch { + std::string a; + std::string b; + }; + + // Check if all attrs and attr values match. Does not take defaults into + // account. + // + // TODO(irving): There is a bug in this routine inherited from its + // OptimizerCSE::EqualAttrs predecessor. The same tensor attr can be + // represented in more than one way as an AttrValue, since TensorProto is + // not 1-1. This bug will go away once I replace everything with NodeInfo, + // which stores a Tensor object directly. The Scratch object will also go + // away. + bool EqualAttrs(AttrSlice other, Scratch* scratch) const; + + // If this AttrSlice has an attached NodeDef, summarize it. This is for + // error messages only: we intentionally do not provide direct access to the + // NodeDef, since it is not always there. + std::string SummarizeNode() const; + + // Iteration over all attrs + AttrValueMap::const_iterator begin() const { return attrs()->begin(); } + AttrValueMap::const_iterator end() const { return attrs()->end(); } + + std::string DebugString() const; + + private: + const AttrValueMap* attrs() const { + return ndef_ != nullptr ? &ndef_->attr() : attrs_; + } + + absl::Status CheckFind(absl::string_view attr_name, + const AttrValue* attr_value) const; + + const NodeDef* ndef_; + const AttrValueMap* attrs_; +}; + +// Return true if the attr with the name attr_name is defined in node_def. +bool HasNodeAttr(const NodeDef& node_def, absl::string_view attr_name); + +// Look up the attr with name attr_name and set *value to its value. If no +// attr with attr_name is found in node_def, or the attr does not have +// a matching type, a non-ok status will be returned. +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::string* value); // type: "string" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + tstring* value); // type: "tstring" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + int64_t* value); // type: "int" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + int32* value); // type: "int" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + float* value); // type: "float" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + bool* value); // type: "bool" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + DataType* value); // type: "type" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + TensorShapeProto* value); // type: "shape" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + TensorShape* value); // type: "shape" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + PartialTensorShape* value); // type: "shape" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + Tensor* value); // type: "tensor" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(string)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(tstring)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(int)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(int)" +absl::Status GetNodeAttr(const AttrSlice& attrs, 
absl::string_view attr_name, + std::vector* value); // type "list(float)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(bool)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(type)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + DataTypeVector* value); // type "list(type)" +absl::Status GetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(shape)" +absl::Status GetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(shape)" +absl::Status GetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type "list(shape)" +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(tensor)" + +template +StatusOr GetNodeAttr(const NodeDef& ndef, absl::string_view attr_name) { + T val; + TF_RETURN_IF_ERROR(GetNodeAttr(ndef, attr_name, &val)); + return val; +} + +// This version avoids copying the TensorProto. +// REQUIRES: Must not use *value beyond the lifetime of node_def. +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + const TensorProto** value); // type: "tensor" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + const TensorProto** value); // type: "tensor" + +// This version avoids copying the NameAttrList. +// REQUIRES: Must not use *value beyond the lifetime of node_def. +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + const NameAttrList** value); // type: "func" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + const NameAttrList** value); // type: "func" + +// These versions copies the NameAttrList(s). +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + NameAttrList* value); // type: "func" +absl::Status GetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(func)" + +// Look up the attr with name attr_name and set *value to its value. If no +// attr with attr_name is found in node_def, or the attr does not have +// a matching type, false is returned. 
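A small illustration of how the status-returning and bool-returning lookups compare, assuming a `NodeDef node_def` in scope; the attr names "N" and "shared_name" are hypothetical examples, not attrs defined by this patch:

    int64_t n;
    // Status form: a non-ok status is returned for a missing or mismatched attr.
    absl::Status s = GetNodeAttr(AttrSlice(node_def), "N", &n);

    std::string shared_name;
    // Try form: returns false instead of an error status.
    if (!TryGetNodeAttr(AttrSlice(node_def), "shared_name", &shared_name)) {
      shared_name = "";  // fall back to a default
    }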
+bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::string* value); // type: "string" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + int64_t* value); // type: "int" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "int" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + int32* value); // type: "int" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + float* value); // type: "float" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + bool* value); // type: "bool" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + DataType* value); // type: "type" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + TensorShape* value); // type: "shape" + +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(string)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(tstring)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(int)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(float)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(bool)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(type)" +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector value); // type: "shape" + +// Overloads of TryGetNodeAttr() that avoid copying the non-POD attribute +// values. +bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(string)" +bool TryGetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector* value); // type: "list(shape)" + +// Look up the attr with name attr_name and return a reference to its value. +// If no attr with attr_name is found in node_def, or the attr does not have +// a matching type, a reference to an empty string is returned. +// REQUIRES: Must not use the returned value beyond the lifetime of node_def. +const std::string& GetNodeAttrString(const AttrSlice& attrs, + absl::string_view attr_name); + +// Specialization to parse an attribute directly into a Padding enum. +absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, + Padding* value); + +// Computes the input type for a specific node input. +// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status InputTypeForNode(const NodeDef& node_def, const OpDef& op_def, + int input_port, DataType* input_type); +// Computes the input types for a specific node. +// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status InputTypesForNode(const NodeDef& node_def, const OpDef& op_def, + DataTypeVector* inputs); +// Computes the output type for a specific node output. +// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status OutputTypeForNode(const NodeDef& node_def, const OpDef& op_def, + int output_port, DataType* output_type); +// Computes the output types for a specific node. 
+// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status OutputTypesForNode(const NodeDef& node_def, const OpDef& op_def, + DataTypeVector* outputs); +absl::Status OutputTypesForNode(const AttrSlice& attrs, const OpDef& op_def, + DataTypeVector* outputs); + +// Computes the input and output types for a specific node. +// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def, + DataTypeVector* inputs, DataTypeVector* outputs); +// Computes the number of outputs for a specific node. +// REQUIRES: ValidateOpDef(op_def).ok() +absl::Status NumOutputsForNode(const NodeDef& node_def, const OpDef& op_def, + int* num_outputs); + +// Map a node/op's input/output port_id to arg_id. +// +// The port_id refers to the n-th tensor of the node, while the arg_id refers to +// the n-th arg of the op. These two can be different if an op's arg is a list +// of tensors. +// +// We return -1 for any invalid port_id (i.e., no corresponding arg_id). +int OpPortIdToArgId(const NodeDef& node, + const protobuf::RepeatedPtrField& args, + int port_id); + +// Validates that the NodeDef: +// * Defines all expected attrs from the OpDef. +// * All attrs satisfies constraints from the OpDef. +// * Has a signature matching SignatureForNode(). +// etc. +absl::Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def); + +// Computes the mapping from input/output argument name to the +// corresponding input/output index range. For example, +// input "foo" corresponds to input indices +// [ (*inputs)["foo"].first, (*inputs)["foo"].second ). +// NOTE(mrry): To reduce allocations when the map is used and save +// space, the returned `NameRangeMap` objects borrow the input/output +// argument names from `op_def`. The `op_def` must outlive the +// returned `NameRangeMap` objects. +typedef gtl::FlatMap, + hash> + NameRangeMap; +absl::Status NameRangesForNode(const AttrSlice& attrs, const OpDef& op_def, + NameRangeMap* inputs, NameRangeMap* outputs); +// Adds default values to *node_def for unspecified attrs from op_def. +void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def); + +// Remove attributes from node_def when the value is the default from the +// op_def. +void StripDefaultsFromNodeDef(const OpDef& op_def, NodeDef* node_def); + +// Validates the syntax of a NodeDef provided externally. +// +// The following is an EBNF-style syntax for NodeDef objects. Note that +// Node objects are actually specified as tensorflow::NodeDef protocol buffers, +// which contain many other fields that are not (currently) validated. +// +// Node = NodeName, Inputs +// Inputs = ( DataInput * ), ( ControlInput * ) +// DataInput = NodeName, ( ":", [1-9], [0-9] * ) ? +// ControlInput = "^", NodeName +// NodeName = [A-Za-z0-9.], [A-Za-z0-9_./] * +absl::Status ValidateExternalNodeDefSyntax(const NodeDef& node_def); + +// Returns "status" with formatted NodeDef attached as additional text +// in the error message. If 'allow_multiple_formatted_node' is false and there +// is already a formatted NodeDef present in 'status', we simply attach the name +// of the NodeDef instead of the formatted string. +absl::Status AttachDef(const absl::Status& status, const NodeDef& node_def, + bool allow_multiple_formatted_node = false); +// Appends the given prefix and suffix to the original node name in order to +// make the name unique. If it's an "Enter" node and uniquify_frame_name is +// true, use the same way to reset attribute "frame_name". 
+absl::Status AddPrefixAndSuffixToNode(absl::string_view prefix, + absl::string_view suffix, + NodeDef* node_def, + bool uniquify_frame_name = true); + +// Appends the given prefix to the colocation group name if the name exists +// in `to_match`. +absl::Status MaybeAddPrefixToColocationConstraints( + const std::unordered_set& match, absl::string_view prefix, + NodeDef* node_def); + +// Updates the colocation constraint name with the one provided in the map (if +// it exists in the map) for node_def. +absl::Status MaybeUpdateColocationConstraintsWithMap( + const std::map& node_name_map, + NodeDef* node_def); + +// For replacing a existing node with a NoOp, change the op and clear full type +// information (since a NoOp has no output). Note that (duplicate control or +// all) inputs, (regular, output or all) attributes and output properperties are +// NOT cleared (and should be cleared if appropriate elsewhere). +void ChangeToNoOp(NodeDef* node_def); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/node_properties.h b/third_party/tflite-hdrs/tensorflow/core/framework/node_properties.h new file mode 100644 index 00000000..91c495bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/node_properties.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class OpRegistryInterface; + +struct NodeProperties { + public: + NodeProperties(const OpDef* op_def, NodeDef node_def, + const DataTypeSlice inputs, const DataTypeSlice outputs) + : NodeProperties(op_def, std::move(node_def), + DataTypeVector(inputs.begin(), inputs.end()), + DataTypeVector(outputs.begin(), outputs.end())) {} + + NodeProperties(const OpDef* _op_def, NodeDef&& _node_def, + DataTypeVector inputs, DataTypeVector outputs) + : op_def(_op_def), + node_def(std::move(_node_def)), + input_types(std::move(inputs)), + input_types_slice(input_types), + output_types(std::move(outputs)), + output_types_slice(output_types) {} + + // Resets the 'props' shared pointer to point to a new NodeProperties created + // from the given NodeDef. 'op_registry' is used to look up the OpDef + // corresponding to node_def.op(). Returns an error if OpDef lookup or + // creation failed. + static absl::Status CreateFromNodeDef( + NodeDef node_def, const OpRegistryInterface* op_registry, + std::shared_ptr* props); + + const OpDef* op_def; // not owned. 
+ NodeDef node_def; + DataTypeVector input_types; + DataTypeSlice input_types_slice; + DataTypeVector output_types; + DataTypeSlice output_types_slice; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/numeric_op.h b/third_party/tflite-hdrs/tensorflow/core/framework/numeric_op.h new file mode 100644 index 00000000..0167e21f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/numeric_op.h @@ -0,0 +1,113 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// One input and one output, both the same type. +template +class UnaryOp : public OpKernel { + public: + explicit UnaryOp(OpKernelConstruction* context) : OpKernel(context) { + const DataType dt = DataTypeToEnum::v(); + OP_REQUIRES_OK(context, context->MatchSignature({dt}, {dt})); + } +}; + +// Two inputs and one output, all the same type. +template +class BinaryOp : public OpKernel { + public: + explicit BinaryOp(OpKernelConstruction* context) : OpKernel(context) { + const DataType dt = DataTypeToEnum::v(); + OP_REQUIRES_OK(context, context->MatchSignature({dt, dt}, {dt})); + } +}; + +// For operations where the input and output are the same shape. +// +// For usage, see ../framework/elementwise_ops.cc. +template +class UnaryElementWiseOp : public UnaryOp { + public: + using UnaryOp::UnaryOp; + + void Compute(OpKernelContext* context) override { + // Output shape is the same as input shape. + const Tensor& input = context->input(0); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {0}, 0, input.shape(), &output)); + static_cast(this)->Operate(context, input, output); + } +}; + +// For binary elementwise operations. +template +class BinaryElementWiseOp : public BinaryOp { + public: + using BinaryOp::BinaryOp; + + void Compute(OpKernelContext* context) override { + const Tensor& a = context->input(0); + const Tensor& b = context->input(1); + + if (!context->ValidateInputsAreSameShape(this)) { + return; + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {0, 1}, 0, a.shape(), &output)); + + // Dispatch to the descendant's Operate() function. 
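For context, a sketch of the CRTP contract this dispatch relies on: a descendant supplies a rank-templated Operate() that the switch below instantiates for each supported number of dimensions. The class "MyAddOp" is a hypothetical example for illustration only, not part of this header:

    class MyAddOp : public BinaryElementWiseOp<float, MyAddOp> {
     public:
      using BinaryElementWiseOp<float, MyAddOp>::BinaryElementWiseOp;

      template <int NDIMS>
      void Operate(OpKernelContext* context, const Tensor& a, const Tensor& b,
                   Tensor* output) {
        // Element-wise sum; input shapes were already validated by Compute().
        output->flat<float>() = a.flat<float>() + b.flat<float>();
      }
    };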
+ switch (a.dims()) { +#define NDIM_CASE(NDIMS) \ + case NDIMS: { \ + static_cast(this)->template Operate(context, a, b, output); \ + break; \ + } + + NDIM_CASE(0); + NDIM_CASE(1); + NDIM_CASE(2); + NDIM_CASE(3); + NDIM_CASE(4); + NDIM_CASE(5); + NDIM_CASE(6); + NDIM_CASE(7); + NDIM_CASE(8); +#undef NDIM_CASE + + default: + context->SetStatus(errors::InvalidArgument( + "We only handle up to Tensor::dims() up to 8, not ", a.dims())); + break; + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/numeric_types.h b/third_party/tflite-hdrs/tensorflow/core/framework/numeric_types.h new file mode 100644 index 00000000..0b22dbaf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/numeric_types.h @@ -0,0 +1,44 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_ + +#include + +// clang-format off +// This include order is required to avoid instantiating templates +// quantized types in the Eigen namespace before their specialization. +#include "xla/tsl/framework/numeric_types.h" +#include "tensorflow/core/platform/types.h" +// clang-format on + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::complex128; +using tsl::complex64; + +// We use Eigen's QInt implementations for our quantized int types. +using tsl::qint16; +using tsl::qint32; +using tsl::qint8; +using tsl::quint16; +using tsl::quint8; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op.h b/third_party/tflite-hdrs/tensorflow/core/framework/op.h new file mode 100644 index 00000000..41b39fc2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op.h @@ -0,0 +1,330 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/full_type_inference_util.h" // IWYU pragma: export +#include "tensorflow/core/framework/full_type_util.h" // IWYU pragma: export +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/op_def_util.h" // IWYU pragma: export +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Users that want to look up an OpDef by type name should take an +// OpRegistryInterface. Functions accepting a +// (const) OpRegistryInterface* may call LookUp() from multiple threads. +class OpRegistryInterface { + public: + virtual ~OpRegistryInterface() = default; + + // Returns an error status and sets *op_reg_data to nullptr if no OpDef is + // registered under that name, otherwise returns the registered OpDef. + // Caller must not delete the returned pointer. + virtual absl::Status LookUp(const std::string& op_type_name, + const OpRegistrationData** op_reg_data) const = 0; + + // Shorthand for calling LookUp to get the OpDef. + absl::Status LookUpOpDef(const std::string& op_type_name, + const OpDef** op_def) const; +}; + +// The standard implementation of OpRegistryInterface, along with a +// global singleton used for registering ops via the REGISTER +// macros below. Thread-safe. +// +// Example registration: +// OpRegistry::Global()->Register( +// [](OpRegistrationData* op_reg_data)->Status { +// // Populate *op_reg_data here. +// return OkStatus(); +// }); +class OpRegistry : public OpRegistryInterface { + public: + typedef std::function + OpRegistrationDataFactory; + + OpRegistry(); + + void Register(const OpRegistrationDataFactory& op_data_factory); + + absl::Status LookUp(const std::string& op_type_name, + const OpRegistrationData** op_reg_data) const override; + + // Returns OpRegistrationData* of registered op type, else returns nullptr. + const OpRegistrationData* LookUp(const std::string& op_type_name) const; + + // Fills *ops with all registered OpDefs (except those with names + // starting with '_' if include_internal == false) sorted in + // ascending alphabetical order. + void Export(bool include_internal, OpList* ops) const; + + // Returns ASCII-format OpList for all registered OpDefs (except + // those with names starting with '_' if include_internal == false). + std::string DebugString(bool include_internal) const; + + // A singleton available at startup. + static OpRegistry* Global(); + + // Get all registered ops. + void GetRegisteredOps(std::vector* op_defs); + + // Get all `OpRegistrationData`s. + void GetOpRegistrationData(std::vector* op_data); + + // Registers a function that validates op registry. + void RegisterValidator( + std::function validator) { + op_registry_validator_ = std::move(validator); + } + + // Watcher, a function object. + // The watcher, if set by SetWatcher(), is called every time an op is + // registered via the Register function. 
The watcher is passed the Status + // obtained from building and adding the OpDef to the registry, and the OpDef + // itself if it was successfully built. A watcher returns a Status which is in + // turn returned as the final registration status. + typedef std::function + Watcher; + + // An OpRegistry object has only one watcher. This interface is not thread + // safe, as different clients are free to set the watcher any time. + // Clients are expected to atomically perform the following sequence of + // operations : + // SetWatcher(a_watcher); + // Register some ops; + // op_registry->ProcessRegistrations(); + // SetWatcher(nullptr); + // Returns a non-OK status if a non-null watcher is over-written by another + // non-null watcher. + absl::Status SetWatcher(const Watcher& watcher); + + // Process the current list of deferred registrations. Note that calls to + // Export, LookUp and DebugString would also implicitly process the deferred + // registrations. Returns the status of the first failed op registration or + // OkStatus() otherwise. + absl::Status ProcessRegistrations() const; + + // Defer the registrations until a later call to a function that processes + // deferred registrations are made. Normally, registrations that happen after + // calls to Export, LookUp, ProcessRegistrations and DebugString are processed + // immediately. Call this to defer future registrations. + void DeferRegistrations(); + + // Clear the registrations that have been deferred. + void ClearDeferredRegistrations(); + + private: + // Ensures that all the functions in deferred_ get called, their OpDef's + // registered, and returns with deferred_ empty. Returns true the first + // time it is called. Prints a fatal log if any op registration fails. + bool MustCallDeferred() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Calls the functions in deferred_ and registers their OpDef's + // It returns the Status of the first failed op registration or OkStatus() + // otherwise. + absl::Status CallDeferred() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Add 'def' to the registry with additional data 'data'. On failure, or if + // there is already an OpDef with that name registered, returns a non-okay + // status. + absl::Status RegisterAlreadyLocked( + const OpRegistrationDataFactory& op_data_factory) const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + const OpRegistrationData* LookUpSlow(const std::string& op_type_name) const; + + mutable mutex mu_; + // Functions in deferred_ may only be called with mu_ held. + mutable std::vector deferred_ TF_GUARDED_BY(mu_); + // Values are owned. + mutable absl::flat_hash_map> + registry_ TF_GUARDED_BY(mu_); + mutable bool initialized_ TF_GUARDED_BY(mu_); + + // Registry watcher. + mutable Watcher watcher_ TF_GUARDED_BY(mu_); + + std::function + op_registry_validator_; +}; + +// An adapter to allow an OpList to be used as an OpRegistryInterface. +// +// Note that shape inference functions are not passed in to OpListOpRegistry, so +// it will return an unusable shape inference function for every op it supports; +// therefore, it should only be used in contexts where this is okay. +class OpListOpRegistry : public OpRegistryInterface { + public: + // Does not take ownership of op_list, *op_list must outlive *this. + explicit OpListOpRegistry(const OpList* op_list); + absl::Status LookUp(const std::string& op_type_name, + const OpRegistrationData** op_reg_data) const override; + + // Returns OpRegistrationData* of op type in list, else returns nullptr. 
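// Illustrative sketch (not part of the vendored TensorFlow header): the
// typical read path through the registry described above. "Identity" is just
// a convenient built-in op name; the wrapper function is hypothetical.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"

namespace example {

absl::Status LookUpIdentity(const tensorflow::OpDef** op_def) {
  // The registry keeps ownership of the returned OpDef; callers must not
  // delete it (see LookUp()/LookUpOpDef() above).
  return tensorflow::OpRegistry::Global()->LookUpOpDef("Identity", op_def);
}

}  // namespace example
// (end of sketch)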
+ const OpRegistrationData* LookUp(const std::string& op_type_name) const; + + private: + // Values are owned. + absl::flat_hash_map> index_; +}; + +// Support for defining the OpDef (specifying the semantics of the Op and how +// it should be created) and registering it in the OpRegistry::Global() +// registry. Usage: +// +// REGISTER_OP("my_op_name") +// .Attr(":") +// .Attr(":=") +// .Input(":") +// .Input(":Ref()") +// .Output(":") +// .Doc(R"( +// <1-line summary> +// +// : +// : +// )"); +// +// Note: .Doc() should be last. +// For details, see the OpDefBuilder class in op_def_builder.h. + +namespace register_op { + +class OpDefBuilderWrapper { + public: + explicit OpDefBuilderWrapper(const char name[]) : builder_(name) {} + OpDefBuilderWrapper& Attr(std::string spec) { + builder_.Attr(std::move(spec)); + return *this; + } + OpDefBuilderWrapper& Attr(const char* spec) TF_ATTRIBUTE_NOINLINE { + return Attr(std::string(spec)); + } + OpDefBuilderWrapper& Input(std::string spec) { + builder_.Input(std::move(spec)); + return *this; + } + OpDefBuilderWrapper& Input(const char* spec) TF_ATTRIBUTE_NOINLINE { + return Input(std::string(spec)); + } + OpDefBuilderWrapper& Output(std::string spec) { + builder_.Output(std::move(spec)); + return *this; + } + OpDefBuilderWrapper& Output(const char* spec) TF_ATTRIBUTE_NOINLINE { + return Output(std::string(spec)); + } + OpDefBuilderWrapper& SetIsCommutative() { + builder_.SetIsCommutative(); + return *this; + } + OpDefBuilderWrapper& SetIsAggregate() { + builder_.SetIsAggregate(); + return *this; + } + OpDefBuilderWrapper& SetIsStateful() { + builder_.SetIsStateful(); + return *this; + } + OpDefBuilderWrapper& SetDoNotOptimize() { + // We don't have a separate flag to disable optimizations such as constant + // folding and CSE so we reuse the stateful flag. 
+ builder_.SetIsStateful(); + return *this; + } + OpDefBuilderWrapper& SetAllowsUninitializedInput() { + builder_.SetAllowsUninitializedInput(); + return *this; + } + OpDefBuilderWrapper& Deprecated(int version, std::string explanation) { + builder_.Deprecated(version, std::move(explanation)); + return *this; + } + OpDefBuilderWrapper& Doc(std::string text) { + builder_.Doc(std::move(text)); + return *this; + } + OpDefBuilderWrapper& SetShapeFn(OpShapeInferenceFn fn) { + builder_.SetShapeFn(std::move(fn)); + return *this; + } + OpDefBuilderWrapper& SetIsDistributedCommunication() { + builder_.SetIsDistributedCommunication(); + return *this; + } + + OpDefBuilderWrapper& SetTypeConstructor(OpTypeConstructor fn) { + builder_.SetTypeConstructor(std::move(fn)); + return *this; + } + + OpDefBuilderWrapper& SetForwardTypeFn(TypeInferenceFn fn) { + builder_.SetForwardTypeFn(std::move(fn)); + return *this; + } + + OpDefBuilderWrapper& SetReverseTypeFn(int input_number, TypeInferenceFn fn) { + builder_.SetReverseTypeFn(input_number, std::move(fn)); + return *this; + } + + const ::tensorflow::OpDefBuilder& builder() const { return builder_; } + + InitOnStartupMarker operator()(); + + private: + mutable ::tensorflow::OpDefBuilder builder_; +}; + +} // namespace register_op + +#define REGISTER_OP_IMPL(ctr, name, is_system_op) \ + static ::tensorflow::InitOnStartupMarker const register_op##ctr \ + TF_ATTRIBUTE_UNUSED = \ + TF_INIT_ON_STARTUP_IF(is_system_op || SHOULD_REGISTER_OP(name)) \ + << ::tensorflow::register_op::OpDefBuilderWrapper(name) + +#define REGISTER_OP(name) \ + TF_ATTRIBUTE_ANNOTATE("tf:op") \ + TF_NEW_ID_FOR_INIT(REGISTER_OP_IMPL, name, false) + +// The `REGISTER_SYSTEM_OP()` macro acts as `REGISTER_OP()` except +// that the op is registered unconditionally even when selective +// registration is used. +#define REGISTER_SYSTEM_OP(name) \ + TF_ATTRIBUTE_ANNOTATE("tf:op") \ + TF_ATTRIBUTE_ANNOTATE("tf:op:system") \ + TF_NEW_ID_FOR_INIT(REGISTER_OP_IMPL, name, true) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_def_builder.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_def_builder.h new file mode 100644 index 00000000..8009135d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_def_builder.h @@ -0,0 +1,280 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class and associated machinery for specifying an Op's OpDef and shape +// inference function for Op registration. 
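// Illustrative sketch (not part of the vendored TensorFlow header): an op
// registration in the style documented for REGISTER_OP above, which funnels
// into the OpDefBuilder defined in this file. The op name "ExampleScale" and
// its signature are made up; the shape function simply propagates the input
// shape.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"

REGISTER_OP("ExampleScale")
    .Input("x: T")
    .Output("y: T")
    .Attr("T: {float, double}")
    .Attr("scale: float = 1.0")
    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));  // output shape == input shape
      return absl::OkStatus();
    })
    .Doc(R"doc(
Scales the input tensor by a constant factor.

x: The tensor to scale.
scale: The multiplier applied to every element.
y: The scaled tensor.
)doc");
// (end of sketch)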
+ +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// TODO(b/62899350): Refactor without proto dependencies. +typedef std::function OpTypeConstructor; + +typedef std::vector> TypeRefVector; + +// A callback into the type inference process, allowing type inference functions +// to request inferring the type of some function (assumed to exist in the +// runtime). The function is specified by name. +typedef std::function(const string&, + const TypeRefVector&)> + FunctionTypeInferrer; + +// A type inference function, called for each node during type inference +// (possibly multiple times). +// The first argument (input_types) will hold the type of each of the node's +// inputs. The second argument (type_vars) will hold the return type of +// each function referred from any type variable (e.g. `FuncVar`) present +// in the node's corresponding op definition. +// +// TODO(mdan): Consider a vector-in, vector-out contract. +typedef std::function(const TypeRefVector&, + const FunctionTypeInferrer&)> + TypeInferenceFn; + +class FunctionDefHelper; + +namespace shape_inference { +class InferenceContext; +} +typedef std::function + OpShapeInferenceFn; + +struct OpRegistrationData { + public: + OpRegistrationData() {} + OpRegistrationData(const OpDef& def) : op_def(def) {} + OpRegistrationData(const OpDef& def, const OpShapeInferenceFn& fn, + bool is_function = false) + : op_def(def), shape_inference_fn(fn), is_function_op(is_function) {} + + OpDef op_def; + OpShapeInferenceFn shape_inference_fn; + + // Type constructor. This callable initializes the type of this op. + // It is provided as a programmatic mechanism for defining an op's + // type, as part of its registration. It is to be eventually replaced by a + // textual language. + // + // Important: historically, op registrations only contained partial + // input/output type information in non-standardized attribute declarations + // (e.g. typically, input types were held in a `dtype` attribute). The type + // constructor currently duplicates such attribute information, with the aim + // of entirely subsuming it, and eventually deprecating all type-related + // attributes. + // + // Since ops are typically parametrized, the type created by this constructor + // is also parametric. + // + // Example: for an op `Foo(x: T) -> Bar[T]`: + // + // * typically, its op registration included a single attribute `T: type`; + // then the respective input was defined as `x: T`; the output type `Bar` + // was implied by the op name. + // * the type constructor creates a FullType object containing `Bar[T]`; this + // still relies on the `T` attribute which it references. + // * in the future, the type constructor will create a FullType containing + // `Callable[(x: T), Bar[T]]`, and the attribute `T` will be deprecated. + OpTypeConstructor type_ctor; + + // Forward type inference function. This callable infers the return type of an + // op based on its input types. 
+ // + // Note that the type constructor and forward inference functions need not be + // mutually exclusive: if there is some static information that can be set + // based on attributes, then that should be set in the constructor. If more + // information can be extracted from inputs, that should be done in the + // forward inference function. + // + // This is similar to the shape function, but is more general, and applied + // directly to NodeDefs, rather than working on the ShapeAndType structures. + // Note that the op input/output declarations may specify some implicit type + // constraints through attribute references (i.e. two inputs pointing to the + // same type attribute). Those constraints may duplicate what this function + // specifies in its body. That's intended, for a gradual transition to a more + // formal type system. + // + // These type inference functions are intermediate solutions as well: once the + // op registration has a complete, formal type definition, along with + // a solver-based type inference, it will replace these functions. + // + // TODO(mdan): Merge with shape inference. + // TODO(mdan): Replace with a union-based type inference algorithm. + TypeInferenceFn fwd_type_fn; + + // Reverse type inference function. This callable infers some input types + // based on the return type. + // + // TODO(mdan): Replace with a union-based type inference algorithm. + TypeInferenceFn rev_type_fn; + + // The input number affected by reverse type inference. Only one input may be + // updated in this manner. + // TODO(mdan): Encode in a manner more consistent with the forward version. + int rev_type_input; + + bool is_function_op = false; +}; + +// Builder class passed to the REGISTER_OP() macro. +class OpDefBuilder { + public: + // Constructs an OpDef with just the name field set. + explicit OpDefBuilder(std::string op_name); + + // Adds an attr to this OpDefBuilder (and returns *this). The spec has + // format ":" or ":=" + // where matches regexp [a-zA-Z][a-zA-Z0-9_]* + // (by convention only using capital letters for attrs that can be inferred) + // can be: + // "string", "int", "float", "bool", "type", "shape", or "tensor" + // "numbertype", "realnumbertype", "quantizedtype" + // (meaning "type" with a restriction on valid values) + // "{int32,int64}" or {realnumbertype,quantizedtype,string}" + // (meaning "type" with a restriction containing unions of value types) + // "{\"foo\", \"bar\n baz\"}", or "{'foo', 'bar\n baz'}" + // (meaning "string" with a restriction on valid values) + // "list(string)", ..., "list(tensor)", "list(numbertype)", ... + // (meaning lists of the above types) + // "int >= 2" (meaning "int" with a restriction on valid values) + // "list(string) >= 2", "list(int) >= 2" + // (meaning "list(string)" / "list(int)" with length at least 2) + // , if included, should use the Proto text format + // of . For lists use [a, b, c] format. + // + // Note that any attr specifying the length of an input or output will + // get a default minimum of 1 unless the >= # syntax is used. + // + // TODO(josh11b): Perhaps support restrictions and defaults as optional + // extra arguments to Attr() instead of encoding them in the spec string. + // TODO(josh11b): Would like to have better dtype handling for tensor attrs: + // * Ability to say the type of an input/output matches the type of + // the tensor. + // * Ability to restrict the type of the tensor like the existing + // restrictions for type attrs. 
+ // Perhaps by linking the type of the tensor to a type attr? + OpDefBuilder& Attr(std::string spec); + + // Adds an input or output to this OpDefBuilder (and returns *this). + // The spec has form ":" or ":Ref()" + // where matches regexp [a-z][a-z0-9_]* and can be: + // * For a single tensor: + // * For a sequence of tensors with the same type: * + // * For a sequence of tensors with different types: + // Where: + // is either one of "float", "int32", "string", ... + // or the name of an attr (see above) with type "type". + // is the name of an attr with type "int". + // is the name of an attr with type "list(type)". + // TODO(josh11b): Indicate Ref() via an optional argument instead of + // in the spec? + // TODO(josh11b): SparseInput() and SparseOutput() matching the Python + // handling? + OpDefBuilder& Input(std::string spec); + OpDefBuilder& Output(std::string spec); + + // Turns on the indicated boolean flag in this OpDefBuilder (and + // returns *this). + OpDefBuilder& SetIsCommutative(); + OpDefBuilder& SetIsAggregate(); + OpDefBuilder& SetIsStateful(); + OpDefBuilder& SetAllowsUninitializedInput(); + OpDefBuilder& SetIsDistributedCommunication(); + + // Deprecate the op at a certain GraphDef version. + OpDefBuilder& Deprecated(int version, std::string explanation); + + // Adds docs to this OpDefBuilder (and returns *this). + // Docs have the format: + // <1-line summary> + // + // : + // : + // + // Where is the name of an attr, input, or output. Please + // wrap docs at 72 columns so that it may be indented in the + // generated output. For tensor inputs or outputs (not attrs), you + // may start the description with an "=" (like name:= ) + // to suppress the automatically-generated type documentation in + // generated output. + OpDefBuilder& Doc(std::string text); + + // Sets the function to be used as type constructor. + // See OpRegistrationData::type_ctor. + OpDefBuilder& SetTypeConstructor(OpTypeConstructor c); + + // Sets the function to be used for forward type inference. + // See OpRegistrationData::fwd_type_fn. + OpDefBuilder& SetForwardTypeFn(TypeInferenceFn f); + + // Sets the function to be used for reverse type inference. + // See OpRegistrationData::rew_type_fn. + OpDefBuilder& SetReverseTypeFn(int input_number, TypeInferenceFn f); + + // Sets the shape function to be used for shape inference. + // + // Note that currently (October 2016), python code still requires a + // RegisterShape call to invoke this; see call_cpp_shape_fn in + // python/framework/common_shapes.py + OpDefBuilder& SetShapeFn(OpShapeInferenceFn fn); + + // Allows the `` in calls to `Attr()` to be "any". + // This is used by PythonAPIWrapper for pass-through parameters. + OpDefBuilder& AllowAttrTypeAny(); + + // Sets op_reg_data->op_def to the requested OpDef and + // op_reg_data->shape_inference_fn to the requested shape inference function, + // or returns an error. + // Must be called after all of the above methods. + // + // Note that OpDefBuilder only reports parsing errors. You should also + // call ValidateOpDef() to detect other problems. + absl::Status Finalize(OpRegistrationData* op_reg_data) const; + + private: + friend class FunctionDefHelper; + + // Adds control output to this OpDefBuilder (and returns *this). + // The must be a valid node name (matches regexp + // [a-zA-Z][a-zA-Z0-9_]*). Named control output can only exist for functions. 
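// Illustrative sketch (not part of the vendored TensorFlow header): exercising
// the attr/input spec grammar described above directly through OpDefBuilder,
// outside the REGISTER_OP macro. The op name and specs are made up; Finalize()
// (declared above) reports any parse errors through the returned status.
#include "tensorflow/core/framework/op_def_builder.h"

namespace example {

absl::Status BuildExampleConcatOpDef(tensorflow::OpRegistrationData* out) {
  return tensorflow::OpDefBuilder("ExampleConcat")
      .Attr("N: int >= 2")        // length attr with a minimum of 2
      .Attr("T: {float, int32}")  // type attr restricted to a union
      .Attr("axis: int = 0")      // attr with a default value
      .Input("values: N * T")     // sequence of N tensors, all of type T
      .Output("output: T")
      .Finalize(out);
}

}  // namespace example
// (end of sketch)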
+ OpDefBuilder& ControlOutput(std::string name); + + OpDef* op_def() { return &op_reg_data_.op_def; } + + OpRegistrationData op_reg_data_; + std::vector attrs_; + std::vector inputs_; + std::vector outputs_; + std::vector control_outputs_; + std::string doc_; + std::vector errors_; + bool allow_attr_type_any_ = false; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_def_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_def_util.h new file mode 100644 index 00000000..be1f0822 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_def_util.h @@ -0,0 +1,110 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// TODO(josh11b): Probably not needed for OpKernel authors, so doesn't +// need to be as publicly accessible as other files in framework/. + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_ + +#include + +#include "tensorflow/core/framework/api_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// Performs a consistency check across the fields of the op_def. +absl::Status ValidateOpDef(const OpDef& op_def); + +// Check if an op is deprecated at the given GraphDef version. If the op is +// deprecated at a future version, a warning will be logged. +absl::Status CheckOpDeprecation(const OpDef& op_def, int graph_def_version); + +// Validates that attr_value satisfies the type and constraints from attr. +// REQUIRES: attr has already been validated. +absl::Status ValidateAttrValue(const AttrValue& attr_value, + const OpDef::AttrDef& attr); + +// The following search through op_def for an attr with the indicated name. +// Returns nullptr if no such attr is found. +const OpDef::AttrDef* FindAttr(absl::string_view name, const OpDef& op_def); +OpDef::AttrDef* FindAttrMutable(absl::string_view name, OpDef* op_def); + +// Searches op_def for input argument with the indicated name. +// Returns nullptr if no such attr is found. +const OpDef::ArgDef* FindInputArg(absl::string_view name, const OpDef& op_def); + +// Searches api_def for input argument with the indicated name. +// Returns nullptr if no such attr is found. +const ApiDef::Arg* FindInputArg(absl::string_view name, const ApiDef& api_def); + +// Produce a human-readable version of an op_def that is more concise +// than a text-format proto. Excludes descriptions. +std::string SummarizeOpDef(const OpDef& op_def); + +// Returns an error if new_op is not backwards-compatible with (more +// accepting than) old_op. +// REQUIRES: old_op and new_op must pass validation. 
+absl::Status OpDefCompatible(const OpDef& old_op, const OpDef& new_op); + +// Returns an error if any attr in penultimate_op that is not in old_op +// has a different default value in new_op. In general it is not safe +// to change the default for an attr that has been added to an op. +absl::Status OpDefAddedDefaultsUnchanged(const OpDef& old_op, + const OpDef& penultimate_op, + const OpDef& new_op); + +// Returns an error if the default value for any attr is removed or modified +// in new_op compared to old_op. Adding new default values is safe, and does +// not raise an error. +absl::Status OpDefAttrDefaultsUnchanged(const OpDef& old_op, + const OpDef& new_op); + +// Remove all docs from *op_def / *op_list. +void RemoveDescriptionsFromOpDef(OpDef* op_def); +void RemoveDescriptionsFromOpList(OpList* op_list); + +// Remove docs from *op_def but leave explanations of deprecations. +void RemoveNonDeprecationDescriptionsFromOpDef(OpDef* op_def); + +// Returns true if `a1` is equal to `a2`. +// Equality includes all the fields. +bool AttrDefEqual(const OpDef::AttrDef& a1, const OpDef::AttrDef& a2); + +// Returns hash of `a` that is consistent with AttrDefEqual. +uint64 AttrDefHash(const OpDef::AttrDef& a); + +// Returns true if all AttrDefs in `a1` equal corresponding AttrDefs in +// `a2`. Correspondence is established by name. +bool RepeatedAttrDefEqual(const protobuf::RepeatedPtrField& a1, + const protobuf::RepeatedPtrField& a2); + +// Returns hash of `a` that is consistent with RepeatedAttrDefEqual +uint64 RepeatedAttrDefHash(const protobuf::RepeatedPtrField& a); + +// Returns true if `o1` is equal to `o2`. +// Equality includes all the fields. OpDef.attr field is treated as a set. +bool OpDefEqual(const OpDef& o1, const OpDef& o2); + +// Returns hash of `o` that is consistent with AttrDefEqual. +uint64 OpDefHash(const OpDef& o); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_gen_lib.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_gen_lib.h new file mode 100644 index 00000000..27ffe522 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_gen_lib.h @@ -0,0 +1,100 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_ + +#include +#include +#include "tensorflow/core/framework/api_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +// Forward declare protos so their symbols can be removed from .so exports +class OpDef; + +inline string Spaces(int n) { return string(n, ' '); } + +// Wrap prefix + str to be at most width characters, indenting every line +// after the first by prefix.size() spaces. Intended use case is something +// like prefix = " Foo(" and str is a list of arguments (terminated by a ")"). +// TODO(josh11b): Option to wrap on ", " instead of " " when possible. +string WordWrap(absl::string_view prefix, absl::string_view str, int width); + +// Looks for an "=" at the beginning of *description. If found, strips it off +// (and any following spaces) from *description and return true. Otherwise +// returns false. +bool ConsumeEquals(absl::string_view* description); + +// Convert text-serialized protobufs to/from multiline format. +string PBTxtToMultiline(absl::string_view pbtxt, + const std::vector& multi_line_fields); +string PBTxtFromMultiline(absl::string_view multiline_pbtxt); + +// Takes a list of files with ApiDefs text protos, and allows you to +// look up the specific ApiDef for any given op. +class ApiDefMap { + public: + // OpList must be a superset of ops of any subsequently loaded + // ApiDef. + explicit ApiDefMap(const OpList& op_list); + ~ApiDefMap(); + + // You can call this method multiple times to load multiple + // sets of files. Api definitions are merged if the same + // op definition is loaded multiple times. Later-loaded + // definitions take precedence. + // ApiDefs loaded from files must contain a subset of ops defined + // in the OpList passed to the constructor. + absl::Status LoadFileList(Env* env, const std::vector& filenames); + + // Load a single file. Api definitions are merged if the same + // op definition is loaded multiple times. Later-loaded + // definitions take precedence. + // ApiDefs loaded from file must contain a subset of ops defined + // in the OpList passed to the constructor. + absl::Status LoadFile(Env* env, const string& filename); + + // Load ApiDefs from string containing ApiDefs text proto. + // api_def_file_contents is expected to be in "multiline format". + // ApiDefs must contain a subset of ops defined in OpsList + // passed to the constructor. + absl::Status LoadApiDef(const string& api_def_file_contents); + + // Updates ApiDef docs. For example, if ApiDef renames an argument + // or attribute, applies these renames to descriptions as well. + // UpdateDocs should only be called once after all ApiDefs are loaded + // since it replaces original op names. + void UpdateDocs(); + + // Look up ApiDef proto based on the given graph op name. + // If graph op name is not in this ApiDefMap, returns nullptr. + // + // Note: Returned ApiDef pointer should stay valid even after calling + // Load* functions defined above. Subsequent calls to Load* might modify + // returned ApiDef contents, but should never remove the ApiDef itself. 
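// Illustrative sketch (not part of the vendored TensorFlow header): the
// typical ApiDefMap flow described above: export the registered ops, overlay
// an ApiDef text proto from disk, then query by graph op name via GetApiDef()
// (declared just below). The file path is a placeholder and the wrapper
// function is hypothetical.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/op_gen_lib.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"

namespace example {

absl::Status LookUpIdentityApiDef() {
  tensorflow::OpList ops;
  tensorflow::OpRegistry::Global()->Export(/*include_internal=*/false, &ops);

  tensorflow::ApiDefMap api_map(ops);
  TF_RETURN_IF_ERROR(api_map.LoadFile(tensorflow::Env::Default(),
                                      "/tmp/api_def_Identity.pbtxt"));
  api_map.UpdateDocs();  // apply argument/attr renames to the docs once

  const tensorflow::ApiDef* api_def = api_map.GetApiDef("Identity");
  if (api_def == nullptr) {
    return tensorflow::errors::NotFound("No ApiDef for Identity");
  }
  return absl::OkStatus();
}

}  // namespace example
// (end of sketch)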
+ const ApiDef* GetApiDef(const string& name) const; + + private: + std::unordered_map map_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel.h new file mode 100644 index 00000000..d925bc21 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel.h @@ -0,0 +1,1736 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/time/time.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/kernel_def.pb.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/node_properties.h" +#include "tensorflow/core/framework/op.h" // TODO(b/62899350): Remove +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/session_state.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" // TODO(b/62899350): Remove +#include "tensorflow/core/framework/tracking_allocator.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/manual_constructor.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/profile_utils/cpu_utils.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +// Used to match ops to kernel sources (and eventually to kernel targets) +#ifdef TF_LOG_KERNEL_SOURCES +#define LOG_KERNEL_SOURCES(name) \ + LOG(INFO) << "Kernel found: " << name << " " << __FILE__ << "\n"; +#else +#define LOG_KERNEL_SOURCES(name) +#endif + +namespace Eigen { +struct ThreadPoolDevice; +struct GpuDevice; +} // end 
namespace Eigen + +namespace tsl { +class CoordinationServiceAgent; +} + +namespace tensorflow { + +namespace checkpoint { +class TensorSliceReaderCacheWrapper; +} // namespace checkpoint + +class AsyncOpKernel; +class CallFrameInterface; +class DeviceMgr; +class FunctionLibraryRuntime; +class OpKernelConstruction; // declared below +class OpKernelContext; // declared below, +class OpRegistryInterface; +class ResourceMgr; +class ScopedStepContainer; +class CollectiveExecutor; +class StepStatsCollectorInterface; + +// A label that is added to kernels that are JIT compiled. These labels will be +// removed before kernels are looked up, so they can be used without specifying +// the label. This label is a temporary measure to allow JIT kernels to be +// disabled if needed. +extern const char* kJitKernelLabel; +extern const char* kDisableJitKernelsEnvVar; + +class OpKernel { + public: + // OpKernel won't be instantiated by the scheduler, so you may perform + // expensive initialization in the descendant's constructor. + explicit OpKernel(OpKernelConstruction* context); + + // Specialized constructor that allows a kernel implementation to mark itself + // as a "deferred" op. If true, the executor will provide access to the + // `OpKernelContext::inc_num_deferred_ops_function()` and + // `OpKernelContext::dec_num_deferred_ops_function()` methods at run-time. + OpKernel(OpKernelConstruction* context, bool is_deferred); + + // Specialized constructor that enables the descendant to provide a custom + // `NodeDef` value. For example, this constructor can be used to provide a + // stripped-down `NodeDef` that does not contain the full set of attrs (such + // as tensor values) if the descendant stores them in a different form. + OpKernel(OpKernelConstruction* context, NodeDef&& custom_def, + bool is_deferred); + + virtual ~OpKernel(); + + // An OpKernel's computation can be either synchronous or + // asynchronous. All OpKernel Compute() methods must be thread-safe as they + // may be called concurrently (e.g. by multiple executions of the same graph + // concurrently). + // + // Most OpKernels should compute synchronously. They should + // subclass OpKernel and override the Compute() method and have it + // return after completing the supplied work. + // + // A synchronous OpKernel *MUST NOT* block the calling thread on a + // synchronization mechanism (condition variable, Notification, etc.) that + // will be unblocked by the execution of another OpKernel. Execution may + // deadlock in that case, because the executor may use a bounded number of + // threads. + // + // If an OpKernel must block on the execution of another OpKernel (e.g. a + // RecvOp, or a DequeueOp), the implementation *MUST* subclass AsyncOpKernel, + // and override `AsyncOpKernel::ComputeAsync()`. In addition, because the + // unblocking kernel may never run (due to an error or cancellation), in most + // cases the AsyncOpKernel should implement cancellation support via + // `ctx->cancellation_manager()`. + // + // In both cases, implementations of Compute() and ComputeAsync() + // get inputs and write outputs through the given OpKernelContext + // and returns a status via context->SetStatus(). They must be + // thread-safe. + + // Synchronous compute. + // + // "context" is guaranteed to be alive until Compute() returns. + virtual void Compute(OpKernelContext* context) = 0; + + // Returns nullptr iff this op kernel is synchronous. 
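// Illustrative sketch (not part of the vendored TensorFlow header): a minimal
// synchronous kernel following the Compute() contract above; it does all of
// its work inline, reports failures through OP_REQUIRES_OK, and never blocks
// on other kernels. The op name "ExampleAddOne" is made up, and
// allocate_output()/REGISTER_KERNEL_BUILDER are declared further down in this
// header and in kernel_def_builder.h.
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"

namespace example {

class AddOneOp : public tensorflow::OpKernel {
 public:
  explicit AddOneOp(tensorflow::OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(tensorflow::OpKernelContext* context) override {
    const tensorflow::Tensor& input = context->input(0);
    tensorflow::Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));
    auto in = input.flat<float>();
    auto out = output->flat<float>();
    for (int64_t i = 0; i < in.size(); ++i) {
      out(i) = in(i) + 1.0f;
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("ExampleAddOne").Device(tensorflow::DEVICE_CPU),
                        AddOneOp);

}  // namespace example
// (end of sketch)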
+ virtual AsyncOpKernel* AsAsync() { return nullptr; } + + // Returns true iff this op kernel is considered "expensive". The + // runtime may use this flag to optimize graph execution for example + // to "inline" inexpensive kernels. + virtual bool IsExpensive() { return expensive_; } + + // Returns a pointer to the tensor stored inside constant ops. + virtual const Tensor* const_tensor() const { return nullptr; } + + // Accessors. + const NodeDef& def() const { return props_->node_def; } + const std::string& name() const { return props_->node_def.name(); } + absl::string_view name_view() const { return name_view_; } + const std::string& type_string() const { return props_->node_def.op(); } + absl::string_view type_string_view() const { return type_string_view_; } + const std::string& requested_input(int i) const { + return props_->node_def.input(i); + } + const std::string& requested_device() const { + return props_->node_def.device(); + } + + int num_inputs() const { return props_->input_types.size(); } + DataType input_type(int i) const { return props_->input_types[i]; } + const DataTypeVector& input_types() const { return props_->input_types; } + const MemoryTypeVector& input_memory_types() const { + return input_memory_types_; + } + + int num_outputs() const { return props_->output_types.size(); } + DataType output_type(int o) const { return props_->output_types[o]; } + const DataTypeVector& output_types() const { return props_->output_types; } + const MemoryTypeVector& output_memory_types() const { + return output_memory_types_; + } + + absl::Status InputRange(StringPiece input_name, int* start, int* stop) const; + absl::Status OutputRange(StringPiece output_name, int* start, + int* stop) const; + + // Returns `true` if and only if this kernel uses deferred execution. + bool is_deferred() const { return is_deferred_; } + + // Returns a trace string for current computation, op name/type and input + // tensor shape/dtype are encoded for profiler cost analysis. Most OpKernel + // should use the default implementation. + virtual std::string TraceString(const OpKernelContext& ctx, + bool verbose) const; + + protected: + std::string ShapeTraceString(const OpKernelContext& ctx) const; + + private: + const std::shared_ptr props_; + const MemoryTypeVector input_memory_types_; + const MemoryTypeVector output_memory_types_; + NameRangeMap input_name_map_; + NameRangeMap output_name_map_; + const absl::string_view name_view_; + const absl::string_view type_string_view_; + const int graph_def_version_; + const bool is_deferred_; + bool expensive_; + + OpKernel(const OpKernel&) = delete; + void operator=(const OpKernel&) = delete; +}; + +class AsyncOpKernel : public OpKernel { + public: + using OpKernel::OpKernel; // Lift OpKernel constructors. + + // Asynchronous compute. + // + // Implementations of ComputeAsync() must ensure that `done` is (eventually) + // called exactly once to signal the completion of the computation. The + // implementation of ComputeAsync() must not block on the execution of another + // OpKernel. `done` may be called by the current thread, or by another thread. + // `context` is guaranteed to stay alive until the `done` callback starts. + // + // Since it is possible that the unblocking kernel may never run (due to an + // error or cancellation), in most cases the AsyncOpKernel should implement + // cancellation support via `context->cancellation_manager()`. + // + // WARNING: As soon as the `done` callback starts, `context` and `this` may be + // deleted. 
No code depending on these objects should execute after the call + // to `done`. + typedef std::function DoneCallback; + virtual void ComputeAsync(OpKernelContext* context, DoneCallback done) = 0; + + AsyncOpKernel* AsAsync() override { return this; } + + void Compute(OpKernelContext* context) override; +}; + +class OpKernelConstruction { + public: + OpKernelConstruction(DeviceType device_type, DeviceBase* device, + Allocator* allocator, FunctionLibraryRuntime* flib, + ResourceMgr* resource_mgr, + const std::shared_ptr& props, + const MemoryTypeSlice& input_memory_types, + const MemoryTypeSlice& output_memory_types, + int graph_def_version, absl::Status* status); + + Env* env() const { return device_->env(); } + + // Allocation of tensors during kernel construction: + // + // It is legal to temporarily allocate scratch tensor storage during + // Op kernel construction. Scratch tensors should be allocated using + // allocate_temp below. Some kernels need to keep tensors in between + // invocations. If such a Tensor is allocated during kernel + // construction this also must be done using allocate_temp, and the + // Op may only store the returned Tensor object. + + // Allocates a temporary Tensor of the specified type and shape. The + // Tensor must not be used after kernel construction is + // complete. See comment above. + absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp); + absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp, + AllocatorAttributes allocator_attr); + + // User-supplied configuration of this operation. + const NodeDef& def() const { return props_->node_def; } + + // For inspecting the inputs to this operation. + int num_inputs() const { return props_->input_types.size(); } + DataType input_type(int i) const { return props_->input_types[i]; } + const DataTypeSlice& input_types() const { return props_->input_types_slice; } + const MemoryTypeSlice& input_memory_types() const { + return input_memory_types_; + } + + // For inspecting the outputs expected from this operation. + int num_outputs() const { return props_->output_types.size(); } + DataType output_type(int i) const { return props_->output_types[i]; } + const DataTypeSlice& output_types() const { + return props_->output_types_slice; + } + const MemoryTypeSlice& output_memory_types() const { + return output_memory_types_; + } + + // If expected_inputs == inputs() and expected_outputs == output_types(), + // returns OK, else returns INVALID_ARGUMENT with an error message. + // Recommended for Ops with dynamic signatures. + absl::Status MatchSignature(const DataTypeSlice expected_inputs, + const DataTypeSlice expected_outputs); + + // For recording configuration errors during construction. + void SetStatus(const absl::Status& status); + const absl::Status& status() const { return *status_; } + + // Look up the attr with name attr_name and set *value to its value. If no + // attr with attr_name is found in def(), or the attr does not have + // a matching type, a non-ok status will be returned. + template + absl::Status GetAttr(StringPiece attr_name, + T* value) const TF_ATTRIBUTE_NOINLINE; + + // Return true if the attr_name is defined in def(). + bool HasAttr(StringPiece attr_name) const; + + // Return the device type. + const DeviceType& device_type() const { return device_type_; } + + // If not nullptr, the kernel can instantiate functions defined in + // the library. E.g., + // CHECK_NOTNULL(function_library())->Instantiate("Foo", ...). 
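// Illustrative sketch (not part of the vendored TensorFlow header): reading an
// attr during kernel construction with GetAttr() as described above. The attr
// name "scale" and the kernel are hypothetical; a missing or mistyped attr
// surfaces as a construction-time error via OP_REQUIRES_OK.
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"

namespace example {

class ScaleOp : public tensorflow::OpKernel {
 public:
  explicit ScaleOp(tensorflow::OpKernelConstruction* context)
      : OpKernel(context) {
    // Fails kernel construction if the op has no float attr named "scale".
    OP_REQUIRES_OK(context, context->GetAttr("scale", &scale_));
  }

  void Compute(tensorflow::OpKernelContext* context) override {
    const tensorflow::Tensor& input = context->input(0);
    tensorflow::Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));
    output->flat<float>() = input.flat<float>() * scale_;
  }

 private:
  float scale_ = 1.0f;
};

}  // namespace example
// (end of sketch)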
+ FunctionLibraryRuntime* function_library() const { return flib_; } + + // Shared resources accessible to this kernel. + ResourceMgr* resource_manager() const { return resource_mgr_; } + + // The GraphDef version whose behavior we should follow. + int graph_def_version() const { return graph_def_version_; } + + // Helper routines for the OP_REQUIRES macros + void CtxFailure(const absl::Status& s); + void CtxFailureWithWarning(const absl::Status& s); + void CtxFailure(const char* file, int line, const absl::Status& s); + void CtxFailureWithWarning(const char* file, int line, const absl::Status& s); + + // Unrecommended functions: these are functions that have some + // current uses but are not recommended for use, and may go away at + // some future major version release. + + // May be used, e.g., to get GPU handles, etc. + // + // Currently only used to call MakeTensorFromProto() for + // implementing ConstantOp for every device. See comments + // on Device::MakeTensorFromProto for longer-term replacement + // ideas. + DeviceBase* device() const { return device_; } + + private: + const DeviceType device_type_; + DeviceBase* const device_; + Allocator* allocator_; + FunctionLibraryRuntime* flib_; + ResourceMgr* const resource_mgr_; + std::shared_ptr props_; + MemoryTypeSlice input_memory_types_; + MemoryTypeSlice output_memory_types_; + const int graph_def_version_; + absl::Status* status_; + + // Allow access from OpKernel ctor. + friend class OpKernel; + + OpKernelConstruction(const OpKernelConstruction&) = delete; + void operator=(const OpKernelConstruction&) = delete; +}; + +// TODO(mrry): Consider converting to a random_access_iterator, and upgrading +// tensorflow::gtl::iterator_range to make the below container classes +// unnecessary. +template +class OpArgIterator { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = ElementType; + using pointer = ElementType*; + using const_pointer = const ElementType*; + using reference = ElementType&; + using const_reference = const ElementType&; + using difference_type = ptrdiff_t; + + OpArgIterator(const ListType* list, int i) : list_(list), i_(i) {} + + bool operator==(const OpArgIterator& rhs) { + DCHECK(list_ == rhs.list_); + return i_ == rhs.i_; + } + + bool operator!=(const OpArgIterator& rhs) { + DCHECK(list_ == rhs.list_); + return i_ != rhs.i_; + } + + OpArgIterator operator++() { // prefix ++it + ++i_; + return *this; + } + + OpArgIterator operator++(int) { // postfix it++ + OpArgIterator old_value = *this; + ++i_; + return old_value; + } + + reference operator*() { return (*list_)[i_]; } + pointer operator->() { return &(*list_)[i_]; } + + const_reference operator*() const { return (*list_)[i_]; } + const_pointer operator->() const { return &(*list_)[i_]; } + + private: + const ListType* const list_; + int i_; +}; + +// Utility class for representing a list of immutable input tensors +// that are passed to the op as a single named argument. 
+class OpInputList { + public: + typedef OpArgIterator Iterator; + OpInputList() : ctx_(nullptr), start_(0), stop_(0) {} + OpInputList(OpKernelContext* ctx, int start, int stop) + : ctx_(ctx), start_(start), stop_(stop) {} + OpInputList& operator=(const OpInputList& other) = default; + const Tensor& operator[](int i) const; + int size() const { return stop_ - start_; } + Iterator begin() const { return Iterator(this, 0); } + Iterator end() const { return Iterator(this, size()); } + + private: + OpKernelContext* ctx_; // not owned + int start_; + int stop_; +}; + +// Utility class for representing a list of mutable ("ref") input tensors +// that are passed to the op as a single named argument. +class OpMutableInputList { + public: + typedef OpArgIterator Iterator; + OpMutableInputList(OpKernelContext* ctx, int start, int stop) + : ctx_(ctx), start_(start), stop_(stop) {} + OpMutableInputList() : ctx_(nullptr), start_(0), stop_(0) {} + OpMutableInputList& operator=(const OpMutableInputList& other) = default; + Tensor at(int i, bool lock_held); + mutex* ref_mutex(int i); + int size() const { return stop_ - start_; } + Iterator begin() const { return Iterator(this, 0); } + Iterator end() const { return Iterator(this, size()); } + + private: + OpKernelContext* ctx_; // not owned + int start_; + int stop_; +}; + +// Utility class for representing a list of output tensors that are +// grouped as a single named output. +class OpOutputList { + public: + typedef OpArgIterator Iterator; + OpOutputList() : ctx_(nullptr), start_(0), stop_(0) {} + OpOutputList(OpKernelContext* ctx, int start, int stop) + : ctx_(ctx), start_(start), stop_(stop) {} + OpOutputList& operator=(const OpOutputList& other) = default; + Tensor* operator[](int i); + bool required(int i) const; + DataType expected_output_dtype(int i) const; + absl::Status allocate(int i, const TensorShape& shape, Tensor** output); + void set(int i, const Tensor& tensor); + void set(int i, Tensor&& tensor); + void set_ref(int i, mutex* mu, Tensor* tensor_for_ref); + int size() const { return stop_ - start_; } + Iterator begin() const { return Iterator(this, 0); } + Iterator end() const { return Iterator(this, size()); } + + private: + OpKernelContext* ctx_; // not owned + int start_; + int stop_; +}; + +// Holds a tensor or tensor reference. For tensor references, we need +// a mutex to prevent concurrent access to the tensor. +struct TensorValue { + TensorValue() : mutex_if_ref(nullptr), tensor(nullptr) {} + explicit TensorValue(Tensor* t) : mutex_if_ref(nullptr), tensor(t) {} + TensorValue(mutex* mu, Tensor* t) : mutex_if_ref(mu), tensor(t) {} + Tensor* operator->() const { return tensor; } + bool is_ref() const { return mutex_if_ref != nullptr; } + + // Return the dtype of the Tensor. For references, return the underlying type. + DataType dtype() const { + if (is_ref()) { + return MakeRefType(tensor->dtype()); + } else { + return tensor->dtype(); + } + } + + // Return the dtype of the Tensor. For references, return the underlying type. + // This variation on the dtype() acquires the lock for references. + // + // TODO(b/133843385): Disallow dtype modifications + DataType dtype_safe() const { + if (is_ref()) { + tf_shared_lock ml(*mutex_if_ref); + return MakeRefType(tensor->dtype()); + } else { + return tensor->dtype(); + } + } + + mutex* mutex_if_ref; // nullptr if not a ref, != nullptr if a ref + Tensor* tensor; +}; + +// Used to store partitioned graphs from function-calling ops. 
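// Illustrative sketch (not part of the vendored TensorFlow header): using the
// list wrappers above from inside a kernel's Compute(). The argument names
// ("values", "outputs") are assumptions and must match the op's registered
// signature; input_list()/output_list() are declared further down in
// OpKernelContext, and the helper function itself is hypothetical.
#include "tensorflow/core/framework/op_kernel.h"

namespace example {

void PassThroughList(tensorflow::OpKernelContext* context) {
  tensorflow::OpInputList values;
  OP_REQUIRES_OK(context, context->input_list("values", &values));

  tensorflow::OpOutputList outputs;
  OP_REQUIRES_OK(context, context->output_list("outputs", &outputs));

  for (int i = 0; i < values.size(); ++i) {
    outputs.set(i, values[i]);  // forward each input tensor unchanged
  }
}

}  // namespace example
// (end of sketch)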
+struct GraphCollector { + mutex mu; + std::vector partitioned_graphs TF_GUARDED_BY(mu); + GraphDef raw_graph TF_GUARDED_BY(mu); + GraphDef optimized_graph TF_GUARDED_BY(mu); + + bool dirty TF_GUARDED_BY(mu); + + GraphCollector() : dirty(false) {} + + void CollectRawGraph(const GraphDef& graph) { + mutex_lock ml(mu); + raw_graph.MergeFrom(graph); + dirty = true; + } + + void CollectOptimizedGraph(const GraphDef& graph) { + mutex_lock ml(mu); + optimized_graph.MergeFrom(graph); + dirty = true; + } + + void CollectPartitionedGraph(const GraphDef& graph) { + mutex_lock ml(mu); + partitioned_graphs.push_back(graph); + dirty = true; + } + + void ClearGraphs() TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { + raw_graph.Clear(); + optimized_graph.Clear(); + partitioned_graphs.clear(); + dirty = false; + } + + bool HasUpdatedGraphs() { + mutex_lock ml(mu); + return dirty; + } +}; + +class OpKernelContext { + public: + // The first element of a WrappedAllocator is a "base" Allocator and + // the second element is that Allocator wrapped by a + // TrackingAllocator + typedef std::pair WrappedAllocator; + + // TODO(zhifengc): Do some cleanup of Params. + // The Params struct is passed in to initialize an OpKernelContext, + // and must outlive the OpKernelContext. + struct Params { + ~Params() { delete eigen_gpu_device; } + + // The step being executed. + int64_t step_id = 0; + + // Timestamp for the start of graph execution. Used for latency metrics. + int64_t start_time_usecs = 0; + + // The deadline for the session to complete by. Empty if unspecified. + std::optional deadline; + + // The op kernel being computed. + OpKernel* op_kernel = nullptr; + + // The device on which the kernel is running. + DeviceBase* device = nullptr; + + // The Eigen GPU device wrapper, which may include a per-op + // wrapped allocator. The concrete type of this object depends on + // the type of this->device, so eigen_gpu_device can't be an + // inline member and must be heap allocated. However, we don't + // want to allocate a new eigen_gpu_device for every Op that is + // executed. Instead this member is allocated on first use using + // ensure_eigen_gpu_device, and then if the Params structure is + // re-used for subsequent Ops, the eigen_gpu_device is + // ReInitialized in the OpKernelContext constructor. Unlike the + // other pointers in Params, this one is owned by Params. + PerOpGpuDevice* eigen_gpu_device = nullptr; + + inline void ensure_eigen_gpu_device() { + DCHECK(device); + if (nullptr == eigen_gpu_device) { + // Surprisingly, MakeGpuDevice will return nullptr if the + // device is not a GPU device. This is ok, since those devices + // will never use eigen_gpu_device. It seems better to have + // ensure_eigen_gpu_device fall through and regenerate the + // nullptr every time an OpKernelContext is instantiated, than + // to do an unnecessary allocation of a dummy eigen GPU + // device for CPU device Ops. + eigen_gpu_device = device->MakeGpuDevice(); + } + } + + bool track_allocations = false; + bool log_memory = false; + + // Array indexed by output number for this node + const AllocatorAttributes* output_attr_array = nullptr; + + // Shared resources accessible by this op kernel invocation. + ResourceMgr* resource_manager = nullptr; + + // Per-step resources accessible by this op kernel invocation should be + // stored in this container.. + ScopedStepContainer* step_container = nullptr; + + // Mechanism used by this op kernel invocation to communicate with + // computations running on other devices. 
+ RendezvousInterface* rendezvous = nullptr; + + // Mechanism for executing a collective op that needs to coordinate + // with parallel instances running on other devices. + CollectiveExecutor* collective_executor = nullptr; + + // Session configuration parameters. Can be nullptr. + const ConfigProto* session_config = nullptr; + + // The session state for this op. + SessionState* session_state = nullptr; + + // Unique session identifier. Can be empty. + std::string session_handle; + + // Metadata about the session. Can be nullptr. + const SessionMetadata* session_metadata = nullptr; + + // The tensor store for this op. + TensorStore* tensor_store = nullptr; + + // Mechanism used by this op kernel invocation to register a callback + // for its cancellation. + CancellationManager* cancellation_manager = nullptr; + + // Inputs to this op kernel. + absl::Span inputs; + bool is_input_dead = false; + + absl::Span input_alloc_attrs; + + // Device context. + DeviceContext* op_device_context = nullptr; + + // Control-flow op supports. + FrameAndIter frame_iter; + + // Function call supports. + CallFrameInterface* call_frame = nullptr; + FunctionLibraryRuntime* function_library = nullptr; + std::function)>* runner = nullptr; + StepStatsCollectorInterface* stats_collector = nullptr; + GraphCollector* graph_collector = nullptr; + bool run_all_kernels_inline = false; + const std::string* executor_type = nullptr; + + // TensorSliceReaderCache support. + checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr; + + // Support for forwarding reservations (used by ScopedAllocator). + static constexpr int kNeverForward = -2; + static constexpr int kNoReservation = -1; + // Values in [0,...) represent reservations for the indexed output. + const int* forward_from_array = nullptr; + + // For tracking actively running deferred ops. + std::function inc_num_deferred_ops_function; + std::function dec_num_deferred_ops_function; + + std::optional stack_trace = {}; + + // For implementing `OpKernelContext::output_required()`. If null, all + // outputs are required. + bool* outputs_required_array = nullptr; + + // For access to distributed coordination service. + tsl::CoordinationServiceAgent* coordination_service_agent = nullptr; + }; + + // params must outlive the OpKernelContext. + explicit OpKernelContext(Params* params); + OpKernelContext(Params* params, int num_outputs); + ~OpKernelContext(); + + Env* env() const { return params_->device->env(); } + + int64_t step_id() const { return params_->step_id; } + + int64_t start_time_usecs() const { return params_->start_time_usecs; } + + const ConfigProto* session_config() const { return params_->session_config; } + + // The deadline for the session to complete by. Empty if unspecified in + // RunOptions. + std::optional deadline() const { return params_->deadline; } + + const OpKernel& op_kernel() const { return *params_->op_kernel; } + + // Stack trace of where the op was defined (if defined in eager mode). + const absl::optional& stack_trace() const { + return params_->stack_trace; + } + + // Input/output signature. + + int num_inputs() const { return params_->inputs.size(); } + DataType input_dtype(int index) const; + absl::Status input_dtype(StringPiece name, DataType* dtype) const; + MemoryType input_memory_type(int index) const; + + int num_outputs() const { return outputs_.size(); } + DataType expected_output_dtype(int index) const; + MemoryType output_memory_type(int index) const; + + // Input + + // Returns an immutable input tensor by index. 
May only be used for non-Ref + // inputs. For Ref inputs use mutable_input below. + // REQUIRES: !IsRefType(input_dtype(index)) + // TODO(mrry): Convert this to return Status. + const Tensor& input(int index) const; + + // Returns an immutable input tensor in "tensor" by index. May only be used + // for non-Ref inputs. For Ref inputs use mutable_input below. + // REQUIRES: !IsRefType(input_dtype(index)) + absl::StatusOr get_input(int index) const; + + // Returns the named immutable input tensor in "tensor", as defined + // in the OpDef. May only be used for non-Ref inputs. For Ref inputs + // use mutable_input below. + // REQUIRES: !IsRefType(input_dtype(index)) + // REQUIRES: the named input must not be a list. + absl::Status input(StringPiece name, const Tensor** tensor); + + // Returns the named list-valued immutable input in "list", as + // defined in the OpDef. If the named output is not list-valued, + // returns a one-element list. May only be used for non-Ref + // inputs. For Ref inputs use mutable_input below. + // REQUIRES: !IsRefType(input_dtype(index)) + absl::Status input_list(StringPiece name, OpInputList* list); + + // For mutable inputs, use the following together to make sure there + // is no concurrent access to mutable_input(), e.g.: + // { + // Tensor& t = context->mutable_input(index); + // mutex_lock lock(*context->input_ref_mutex(index)); + // // modify the values in t + // } + // REQUIRES: IsRefType(input_dtype(index)) + absl::Status input_ref_mutex(StringPiece name, mutex** out_mutex); + + // Returns a mutable input tensor. Must be used to access Ref + // inputs. REQUIRES: IsRefType(input_dtype(index)). The caller may + // modify the values stored in the Tensor buffer, and modifications + // will be visible to other Ops reading the same ref tensor. If + // !lock_held the input mutex will be acquired before returning the + // Tensor. + // TODO(mrry): Convert this to return Status. + Tensor mutable_input(int index, bool lock_held); + + // Returns the named mutable input tensor in "tensor", as defined in + // the OpDef. Must be used to access Ref inputs. The values stored + // in the Tensor buffer may be modified, and modifications will be + // visible to other Ops reading the same ref tensor. If !lock_held + // the input mutex will be acquired before returning the Tensor. + // REQUIRES: the named input must not be a list. + // REQUIRES: the named input must be a ref tensor. + absl::Status mutable_input(StringPiece name, Tensor* tensor, bool lock_held); + + // Returns the named list-valued mutable input in "list", as defined + // in the OpDef. If the named input is not list-valued, returns a + // one-element list. Must be used to access Ref inputs. The values + // stored in the Tensor buffer may be modified, and modifications + // will be visible to other Ops reading the same ref tensor. + // REQUIRES: the named input must be a ref tensor. + absl::Status mutable_input_list(StringPiece name, OpMutableInputList* list); + + // Replace the corresponding Ref Input to use the storage buffer + // used by tensor. If !lock_held the input mutex will be acquired + // before returning the Tensor. + // REQUIRES: IsRefType(input_dtype(index)). + void replace_ref_input(int index, const Tensor& tensor, bool lock_held); + + // Replace the corresponding named Ref Input to use the storage + // buffer used by tensor. If !lock_held the input mutex will be + // acquired before returning the Tensor. + // REQUIRES: IsRefType(input_dtype(index)). 
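To make the ref-input locking pattern described above concrete, here is a minimal sketch (not part of the header, and not the library's own code) of a hypothetical kernel whose input 0 is a ref (variable-like) float tensor and whose ref output 0 aliases the same buffer; the class name and index choices are assumptions for the example.

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

// Hypothetical kernel: adds 1.0f in place to a ref input and forwards it.
class IncrementVariableOp : public OpKernel {
 public:
  explicit IncrementVariableOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    // Hold the ref mutex for the whole update and tell mutable_input()
    // not to re-acquire it.
    mutex_lock l(*ctx->input_ref_mutex(0));
    Tensor var = ctx->mutable_input(0, /*lock_held=*/true);
    auto flat = var.flat<float>();
    for (int64_t i = 0; i < flat.size(); ++i) flat(i) += 1.0f;
    // The ref output shares storage with the ref input.
    ctx->forward_ref_input_to_ref_output(0, 0);
  }
};

}  // namespace tensorflow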
+ absl::Status replace_ref_input(StringPiece name, const Tensor& tensor, + bool lock_held); + + // Deletes the Tensor object used as the Ref Input at + // input_index. This is not usually necessary and should be used + // with caution. If !lock_held the input mutex will be acquired + // before returning the Tensor. + // REQUIRES: IsRefType(input_dtype(input_index)). + void delete_ref_input(int input_index, bool lock_held); + + // Return true if there is input at the given index. An operator has no + // input at index if its tensor is null. This is primarily used by the + // merge operator. + // TODO(mrry): Convert this to return Status. + bool has_input(int index) const; + + // Returns true if all inputs are the same shape, otherwise sets the + // status to a non-OK value and returns false. + // Usage: if (!context->ValidateInputsAreSameShape(this)) return; + bool ValidateInputsAreSameShape(OpKernel* op); + + // If non-null, kernels should populate with any partition subgraphs created. + GraphCollector* graph_collector() { return params_->graph_collector; } + + // If True, hint that all kernels in functions called by this kernel, should + // be treated as "inexpensive", and hence executed on the scheduling thread. + bool run_all_kernels_inline() const { + return params_->run_all_kernels_inline; + } + + // Returns the registered name for the executor type that is executing the + // current kernel. If empty, the default executor is used. + const std::string& executor_type() const; + + // Input to output forwarding. + + // Set the output Ref Tensor at output_index to be an alias of the + // input Ref Tensor at input_index. + // REQUIRES: IsRefType(input_dtype(input_index)). + // REQUIRES: IsRefType(output_dtype(output_index)). + void forward_ref_input_to_ref_output(int input_index, int output_index); + + // Returns true when an alias to input[input_index], reshaped to output_shape, + // which is safe to use for in-place computation was written to *output. + // Returns false if input[input_index] has a refcount greater than one, or if + // its type does not match the expected output type of output[output_index], + // or the number of elements in input[input_index] does not equal the number + // of elements in output_shape. + bool forward_input_to_output_with_shape(int input_index, int output_index, + const TensorShape& output_shape, + Tensor** output) TF_MUST_USE_RESULT; + absl::Status forward_input_to_output_with_shape( + StringPiece input_name, StringPiece output_name, + const TensorShape& output_shape, Tensor** output); + + // Returns a pointer to a Tensor aliasing the underlying buffer backing + // input[input_index] iff + // * input[input_index] is not a ref, + // * the data type, shape, memory type, and allocator attributes of + // input[input_index] are compatible with those given in dtype, shape, + // memory_type, and attr, + // * refcount on the underlying buffer is one. + // * Either there is no forwarding reservation for either input_index + // or output_index or the specified input is reserved for the specified + // output. 
More precisely: + // + // These cases mean neither input nor output has a reservation: + // forward_from_array = nullptr + // OR (input_index is not in forward_from_array AND + // (output_index == kNoReservation OR + // forward_from_array[output_index] == kNoReservation)) + // + // This case means that input_index is reserved for output_index: + // forward_from_array[output_index] == input_index + // + // This case means the output is reserved to always be allocated, + // never assigned a forwarded input: + // forward_from_array[output_index] == kNeverForward + // + // Otherwise returns nullptr. + // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic, + // forwarding is only safe if there are no reads via __ldg() after writes + // to the same address. + std::unique_ptr forward_input( + int input_index, int output_index, DataType output_dtype, + const TensorShape& output_shape, MemoryType output_memory_type, + const AllocatorAttributes& output_attr) TF_MUST_USE_RESULT; + + // Tries to forward one of the inputs given in input_indices to + // output[output_index]. If none of the given inputs can be forwarded, calls + // allocate_output() to allocate a new output buffer. The index of the + // forwarded input will be assign to output argument forwarded_input (if it's + // not nullptr). If no inputs are forwarded, forwarded_input will be assigned + // -1. + absl::Status forward_input_or_allocate_output( + absl::Span candidate_input_indices, int output_index, + const TensorShape& output_shape, Tensor** output, + int* forwarded_input = nullptr); + absl::Status forward_input_or_allocate_output( + absl::Span candidate_input_names, + StringPiece output_name, const TensorShape& output_shape, + Tensor** output); + + // Tries to reuse one of the inputs given in input_indices as a temporary. + // If none of the given inputs can be forwarded, calls + // allocate_temp() to allocate a new temporary buffer. + absl::Status forward_input_or_allocate_temp( + absl::Span candidate_input_indices, DataType type, + const TensorShape& shape, const AllocatorAttributes& allocator_attr, + Tensor* out_temp); + + absl::Status forward_input_or_allocate_temp( + absl::Span candidate_input_indices, DataType type, + const TensorShape& shape, Tensor* out_temp) { + return forward_input_or_allocate_temp(candidate_input_indices, type, shape, + AllocatorAttributes(), out_temp); + } + + // Output + + // Returns the named list-valued output in "list", as defined in the OpDef. + // If the named output is not list-valued, returns a one-element list. + absl::Status output_list(StringPiece name, OpOutputList* list); + + // If output_required(index) returns true, the OpKernel's Compute() method + // should call allocate_output(index, ...), set_output(index, ...), + // set_output_ref(index, ...), or set the status to a non-ok value. + // If it returns false, it may output, but is not required to do so. + bool output_required(int index) const { + return !params_->outputs_required_array || + params_->outputs_required_array[index]; + } + + // If output_expects_forwarding returns true, the OpKernel's Compute() method + // should not allocate the output with allocate_output but instead needs to + // use forward_input. 
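As a small illustration of the forwarding path documented above, the following Compute() sketch assumes a hypothetical unary float op (CPU-style, expression evaluated inline) whose output 0 has the same shape and dtype as input 0.

void Compute(OpKernelContext* ctx) override {
  const Tensor& input = ctx->input(0);
  Tensor* output = nullptr;
  // Reuses input 0's buffer for output 0 when dtype/shape match, the buffer's
  // refcount is one and no forwarding reservation forbids it; otherwise a
  // fresh output is allocated.
  OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
                          {0}, 0, input.shape(), &output));
  output->flat<float>() = input.flat<float>().abs();
}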
+ bool output_expects_forwarding(int index) const { + return params_->forward_from_array != nullptr && + params_->forward_from_array[index] >= 0; + } + + // Allocation of tensors during kernel execution inside the Compute + // method: + // + // There are two methods to allocate Tensors when an Op kernel + // executes. + // + // 1) allocate_output. This should be used to allocate any tensor + // that is going to be used as an output from the Op at the end of + // the current execution. The caller indicates which output the + // Tensor will be assigned to, and the call returns the + // newly-allocated Tensor. The Tensor can subsequently be assigned + // to during kernel execution, and will be used as the designated + // output when the kernel execution completes. + // + // 2) allocate_temp. This should be used to allocate any scratch + // storage that is needed while the kernel is executing, and will + // not be retained by the Op. + // + // In some cases a Tensor needs to be used as an output even though + // it was previously allocated elsewhere. The Tensor may have been + // passed as an input, or stored in a Tensor during a + // previous kernel execution, or allocated earlier in the kernel + // execution at a time when it was not known which output it would + // be assigned to. In this case the kernel can use set_output or + // set_output_ref to indicate that the tensor should be used as the + // designated output. It is legal to use any previously-allocated + // Tensor as an argument to set_output or set_output_ref, including + // Tensors allocated via allocate_temp. There may be a performance + // penalty to using a Tensor that was not allocated using + // allocate_output. This is because allocate_output uses the + // AllocatorAttributes stored in output_attr_array for the + // designated output. In some cases, using the wrong attributes may + // cause an extra copy of the Tensor's buffer. + + // Allocates output for the specified output index with shape. + // OpKernelContext retains ownership of the returned pointer. See + // comment above. + // + // If memory allocation fails, returns an error status. + // + // REQUIRES: !IsRefType(expected_output_dtype(index)) + absl::Status allocate_output(int index, const TensorShape& shape, + Tensor** tensor); + absl::Status allocate_output(StringPiece name, const TensorShape& shape, + Tensor** tensor); + // The following methods use the supplied attributes instead of + // those in output_attr_array. The caller is responsible for + // ensuring that the attributes are "compatible" with the + // output_attr_array, e.g. the tensor is allocated on the correct + // device. See comment above. + absl::Status allocate_output(int index, const TensorShape& shape, + Tensor** tensor, AllocatorAttributes attr); + absl::Status allocate_output(StringPiece name, const TensorShape& shape, + Tensor** tensor, AllocatorAttributes attr); + + // Allocates a temporary Tensor of the specified type and + // shape. Devices such as GPUs that enqueue Ops for lazy execution + // may retain references to the temporary tensors after the Op's + // Compute method has run. See comment above. 
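To make the allocate_output / allocate_temp distinction above concrete, here is a minimal Compute() sketch for a hypothetical float op that allocates one real output and one scratch tensor; the computation itself is a placeholder.

void Compute(OpKernelContext* ctx) override {
  const Tensor& input = ctx->input(0);

  // 1) Real output: allocated with the AllocatorAttributes the runtime
  //    selected for output 0 (output_attr_array).
  Tensor* output = nullptr;
  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));

  // 2) Scratch space: not retained after Compute() returns.
  Tensor scratch;
  OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, input.shape(), &scratch));

  scratch.flat<float>() = input.flat<float>() * 2.0f;
  output->flat<float>() = scratch.flat<float>() + 1.0f;
}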
+ absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp, + AllocatorAttributes allocator_attr, + const AllocationAttributes& allocation_attr); + absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp, + AllocatorAttributes allocator_attr); + absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp); + + // Copies a tensor (allocated by the caller) to the specified output + // index. REQUIRES: !IsRefType(expected_output_dtype(index)) + // REQUIRES: 'tensor' must have the same MemoryType as + // output_memory_types[index]. See comment above. + absl::Status set_output(StringPiece name, const Tensor& tensor); + absl::Status set_output(StringPiece name, Tensor&& tensor); + void set_output(int index, const Tensor& tensor); + void set_output(int index, Tensor&& tensor); + + // To output a reference. Caller retains ownership of mu and tensor_for_ref, + // and they must outlive all uses within the step. See comment above. + // REQUIRES: IsRefType(expected_output_dtype(index)) + absl::Status set_output_ref(StringPiece name, mutex* mu, + Tensor* tensor_for_ref); + + // Returns nullptr if allocate_output() or set_output() have not been called. + absl::Status mutable_output(StringPiece name, Tensor** tensor); + + // Return the DeviceContext that should be used for this Op. + // + // If using the templated function, the type must be a subclass + // of DeviceContext. + // + // Returns nullptr if the device did not provide one. + template + T* op_device_context(); + DeviceContext* op_device_context() { + DeviceContext* ret = params_->op_device_context; + if (ret == nullptr) { + auto* dev_info = device()->tensorflow_accelerator_device_info(); + if (dev_info) ret = dev_info->default_context; + } + return ret; + } + + AllocatorAttributes input_alloc_attr(int index) const { + if (params_->input_alloc_attrs.empty()) { + return AllocatorAttributes(); + } else { + DCHECK_GE(index, 0); + DCHECK_LT(index, params_->input_alloc_attrs.size()); + return params_->input_alloc_attrs[index]; + } + } + + AllocatorAttributes output_alloc_attr(int index) const { + return params_->output_attr_array[index]; + } + + absl::InlinedVector ConsumeWrappedAllocators() { + absl::InlinedVector retrieved; + if (tracking_state_) { + mutex_lock lock(tracking_state_->mu); + retrieved.swap(tracking_state_->wrapped_allocators); + } + return retrieved; + } + + // Communication. + // + // An op kernel communicates with outside environment through + // Rendezvous Send() and Recv(). + RendezvousInterface* rendezvous() const { return params_->rendezvous; } + + CollectiveExecutor* collective_executor() const { + return params_->collective_executor; + } + + // An op kernel can access the session state it belongs to. + SessionState* session_state() const { return params_->session_state; } + + // Unique identifier of the session it belongs to. Can be empty. + std::string session_handle() const { return params_->session_handle; } + + // Metadata about the session. Can be nullptr. + const SessionMetadata* session_metadata() const { + return params_->session_metadata; + } + + // An op kernel can access the tensor store of the run it belongs to. + TensorStore* tensor_store() const { return params_->tensor_store; } + + // Function call support. + // + // If this kernel invocation is within a function execution, + // call_frame() returns the call frame for the function call. 
+ CallFrameInterface* call_frame() const { return params_->call_frame; } + + // If not nullptr, the kernel invoke functions defined in the + // library. E.g., CHECK_NOTNULL(function_library())->Run("Foo", ...). + FunctionLibraryRuntime* function_library() const { + return params_->function_library; + } + + std::function)>* runner() const { + return params_->runner; + } + StepStatsCollectorInterface* stats_collector() const { + return params_->stats_collector; + } + + // Shared resources accessible to this kernel. + ResourceMgr* resource_manager() const { return params_->resource_manager; } + + checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache() const { + return params_->slice_reader_cache; + } + + // Execution. + // + // OpKernels can use these eigen devices to carry out their + // numerical computation. + const Eigen::ThreadPoolDevice& eigen_cpu_device() const { + return *device()->eigen_cpu_device(); + } + const Eigen::GpuDevice& eigen_gpu_device() const { + return params_->eigen_gpu_device->device(); + } + template + const EigenDeviceType& eigen_device() const; + + // Error handling. + + // If expected_inputs == inputs() and expected_outputs == output_types(), + // returns OK, else returns INVALID_ARGUMENT with an error message. + // Recommended for Ops with dynamic signatures, where validation can only + // be performed at runtime. + absl::Status MatchSignature(const DataTypeSlice expected_inputs, + const DataTypeSlice expected_outputs); + + // An OpKernel should call SetStatus() if Compute() encounters an + // error. + void SetStatus(const absl::Status& status); + const absl::Status& status() const { return status_; } + + // Cancellation. + // + // EXPERIMENTAL. See the implementation in tensorflow::FIFOQueue for an + // example of how to use this API. + CancellationManager* cancellation_manager() const { + return params_->cancellation_manager; + } + + // Other accessors. + + // For control flow. + FrameAndIter frame_iter() const { return params_->frame_iter; } + bool is_input_dead() const { return params_->is_input_dead; } + + // May be used, e.g., to get GPU handles, etc. + // TODO(tucker): Add example usage. + DeviceBase* device() const { return params_->device; } + + // Per-step container for use by white-listed internal ops. + ScopedStepContainer* step_container() const { + return params_->step_container; + } + + // Access to distributed coordination service. + tsl::CoordinationServiceAgent* coordination_service_agent() const { + return params_->coordination_service_agent; + } + + // Helper routines for the OP_REQUIRES macros + void CtxFailure(const absl::Status& s); + void CtxFailureWithWarning(const absl::Status& s); + void CtxFailure(const char* file, int line, const absl::Status& s); + void CtxFailureWithWarning(const char* file, int line, const absl::Status& s); + + // Unrecommended functions: these are functions that have some + // current uses but are not recommended for use, and may go away at + // some future major version release. + // + // The following functions all have versions that return Status + // to capture error conditions, and are strongly preferred. + Tensor* mutable_output(int index); + mutex* input_ref_mutex(int index); + void set_output_ref(int index, mutex* mu, Tensor* tensor_for_ref); + TensorValue release_output(int index); + + bool track_allocations() const { return params_->track_allocations; } + + // Records temp memory allocation. Tensor object is recorded to identify the + // case where temp memory is used as output memory. 
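A typical way kernels use the Eigen devices exposed above, sketched for a hypothetical element-wise op templated on the device type (Eigen::ThreadPoolDevice on CPU, Eigen::GpuDevice on GPU); the class name is an assumption for the example.

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

template <typename Device>  // Eigen::ThreadPoolDevice or Eigen::GpuDevice
class ScaleByTwoOp : public OpKernel {
 public:
  explicit ScaleByTwoOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    Tensor* output = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
    // Evaluate the expression on the kernel's Eigen device: the shared
    // thread pool on CPU, the stream-backed GpuDevice on GPU.
    output->flat<float>().device(ctx->eigen_device<Device>()) =
        input.flat<float>() * 2.0f;
  }
};

}  // namespace tensorflow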
+ void record_temp_memory_allocation(int64_t size, const Tensor& t) + TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + // Returns recorded size of temporary memory; + int64_t temp_memory_allocated() const + TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + // Records persistent memory allocation, size can be negative indicating + // deallocation. + void record_persistent_memory_allocation(int64_t size, int64_t alloc_id = -1) + TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + // Returns recorded size and ids of persistent memory. + int64_t persistent_memory_allocated() const + TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + std::vector persistent_alloc_ids() const + TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + // Resets counters for temp and persistent memory and recorded ids. + void clear_recorded_memory() TF_LOCKS_EXCLUDED(tracking_state_->stats_mu); + + bool input_is_ref(int index) const; + + void set_record_memory_consumption(bool v); + + // Used by OpKernel implementations to track actively running deferred ops. + // + // A deferred op is one whose Compute method returns (or whose ComputeAsync + // method invokes the callback) when work is scheduled onto a device. At that + // point, we don't know when the work will actually complete (or if it has + // already completed) on the device. These functions allow the executor to + // track the status of deferred ops and act accordingly. + // + // Deferred OpKernel implementations must use these methods to get two + // functions. It then must call these two functions in pairs, before and after + // device execution, respectively. + TF_MUST_USE_RESULT std::function inc_num_deferred_ops_function() { + DCHECK(params_->op_kernel->is_deferred()); + return params_->inc_num_deferred_ops_function + ? params_->inc_num_deferred_ops_function + : []() {}; + } + TF_MUST_USE_RESULT std::function dec_num_deferred_ops_function() { + DCHECK(params_->op_kernel->is_deferred()); + return params_->dec_num_deferred_ops_function + ? params_->dec_num_deferred_ops_function + : []() {}; + } + + Allocator* get_allocator(AllocatorAttributes attr); + + Params* params() const { return params_; } + void set_params(Params* params) { params_ = params; } + + void ResetOutputs(int num_outputs = 0) { + for (TensorValue& value : outputs_) { + DCHECK(!value.is_ref()); + delete value.tensor; + value.tensor = nullptr; + } + outputs_.resize(num_outputs); + } + + private: + bool record_memory_consumption_ = false; + + // Internal common method used when allocating tensor memory + absl::Status allocate_tensor(DataType type, const TensorShape& shape, + Tensor* out_tensor, + AllocatorAttributes allocator_attr) { + return allocate_tensor(type, shape, out_tensor, allocator_attr, + AllocationAttributes()); + } + + absl::Status allocate_tensor(DataType type, const TensorShape& shape, + Tensor* out_tensor, + AllocatorAttributes allocator_attr, + const AllocationAttributes& allocation_attr); + + // Helpers for `set_output()`. + + // Returns `true` if the tensor was copied into an allocated output. + bool maybe_set_output_by_allocate_and_copy(int index, const Tensor& tensor); + + void maybe_track_allocations_for_set_output(const Tensor& tensor); + + absl::Status get_input_index(StringPiece name, int* out_index) const; + absl::Status get_output_index(StringPiece name, int* out_index) const; + + // Initialize the allocated_scope_ids_ set the first time this method is + // called. 
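A sketch of how a deferred kernel (an AsyncOpKernel subclass that the runtime recognizes as deferred) might pair the two functions described above around work it enqueues on a device. EnqueueOnDevice is a placeholder, not a real API, standing in for whatever mechanism actually schedules the work and invokes the completion callback.

void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
  // Grab both functions up front; they must be invoked as a pair around
  // the deferred device work.
  auto inc = ctx->inc_num_deferred_ops_function();
  auto dec = ctx->dec_num_deferred_ops_function();
  inc();
  // Placeholder: schedules work on the device and runs the callback when
  // that work has completed.
  EnqueueOnDevice(ctx, [dec, done]() {
    dec();
    done();
  });
}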
+ void maybe_initialize_scope_id_set(); + + absl::Status status_; + friend class CollectiveExecutor; // for access to params_ + Params* params_; // not owned + absl::InlinedVector outputs_; + + // Keep track of calls to ScopedAllocator. + // TODO(ayushd): change to absl::flat_hash_set. + std::unique_ptr> allocated_scope_ids_; + + // The following data members are only used when allocation tracking is + // enabled, memory consumption is being recorded, or tensor access is being + // recorded. + struct TrackingState { + mutable mutex mu; + absl::InlinedVector wrapped_allocators + TF_GUARDED_BY(mu); + + mutable mutex stats_mu; + int64_t temp_memory_allocated TF_GUARDED_BY(stats_mu) = 0; + + int64_t persistent_memory_allocated TF_GUARDED_BY(stats_mu) = 0; + absl::InlinedVector, 2UL> + temp_tensor_buffer_and_size TF_GUARDED_BY(stats_mu); + absl::InlinedVector persistent_alloc_ids + TF_GUARDED_BY(stats_mu); + }; + std::unique_ptr tracking_state_; + + // For access to `params_->op_kernel`. + friend void CheckNotInComputeAsync(OpKernelContext* ctx, + const char* correct_macro_name); + + OpKernelContext(const OpKernelContext&) = delete; + void operator=(const OpKernelContext&) = delete; +}; + +template <> +const Eigen::ThreadPoolDevice& OpKernelContext::eigen_device() const; + +template <> +const Eigen::GpuDevice& OpKernelContext::eigen_device() const; + +// Register your OpKernel by specifying the Op's name, the device the +// kernel runs on, any type attr constraints for this kernel, any +// host-memory args, and the class to instantiate. Examples: +// +// // A kernel that supports all types. +// REGISTER_KERNEL_BUILDER(Name("Save").Device(DEVICE_CPU), SaveOp); +// +// // The following are equivalent ways of specifying that the kernel only +// // works if the "T" type attr is set to DT_FLOAT. +// REGISTER_KERNEL_BUILDER( +// Name("Sub").Device(DEVICE_CPU).TypeConstraint("T"), +// SubOp); +// // (You would then repeat this for every type supported by "Sub".) +// +// // This form allows you to specify a list of types as the constraint. +// REGISTER_KERNEL_BUILDER(Name("Sub") +// .Device(DEVICE_CPU) +// .TypeConstraint("T", {DT_FLOAT}), +// SubOp); +// +// // A kernel that expects one of the input tensors in host memory. +// REGISTER_KERNEL_BUILDER( +// Name("Reshape").Device(DEVICE_GPU).HostMemory("shape"), ReshapeOp); +// +// // A kernel that works on any device. Kernels using DEVICE_DEFAULT +// // must aways run on host and all inputs and outputs must use `HostMemory`. +// // Kernels for data management, control-flow primitives or working with +// // tensor shapes for various devices (including `PluggableDevices`) are +// // typical uses. +// REGISTER_KERNEL_BUILDER( +// Name("TensorListLength").Device(DEVICE_DEFAULT).HostMemory("length"), +// TensorListLength); +// +// See kernel_def_builder for details. + +// Instantiate an OpKernel that has been registered. Returns nullptr +// if no operation for that type of device / input signature combination +// (and a NOT_FOUND *status), or there is an error in construction (and +// an INVALID_ARGUMENT *status). Otherwise, the caller takes ownership +// of the returned pointer. +// EXPECTED USAGE: unique_ptr op = CreateOpKernel(...); +// REQUIRES: def has all attrs specified (e.g. using AddDefaultsToNodeDef()). 
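For reference, a complete if minimal registration example tying the pieces above together. It assumes a hypothetical op named "ZeroOut" with a "T" type attr has been registered elsewhere with REGISTER_OP; the registration macro must appear at namespace scope.

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

class ZeroOutOp : public OpKernel {
 public:
  explicit ZeroOutOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    Tensor* output = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
    output->flat<float>().setZero();
  }
};

// Registers the kernel for the float instantiation of the op's "T" attr
// on CPU.
REGISTER_KERNEL_BUILDER(
    Name("ZeroOut").Device(DEVICE_CPU).TypeConstraint<float>("T"), ZeroOutOp);

}  // namespace tensorflow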
+std::unique_ptr CreateOpKernel( + DeviceType device_type, DeviceBase* device, Allocator* allocator, + const NodeDef& node_def, int graph_def_version, absl::Status* status); + +std::unique_ptr CreateOpKernel( + DeviceType device_type, DeviceBase* device, Allocator* allocator, + const std::shared_ptr& props, int graph_def_version, + absl::Status* status); + +absl::Status CreateOpKernel(DeviceType device_type, DeviceBase* device, + Allocator* allocator, FunctionLibraryRuntime* flib, + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel); + +absl::Status CreateOpKernel(DeviceType device_type, DeviceBase* device, + Allocator* allocator, FunctionLibraryRuntime* flib, + ResourceMgr* resource_mgr, + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel); + +// Returns into 'device_types' the subset of prioritized_types that this +// binary has registered for the given NodeDef. +// +// REQUIRES: * 'device_types' is not nullptr. +// * def has all attrs specified (e.g. using AddDefaultsToNodeDef()). +absl::Status SupportedDeviceTypesForNode( + const std::vector& prioritized_types, const NodeDef& def, + PrioritizedDeviceTypeVector* device_types, + const DeviceNameUtils::ParsedName* local_address_spec = nullptr); + +// Returns a message with a description of the kernels registered for op +// `op_name`. +std::string KernelsRegisteredForOp(StringPiece op_name); + +// Call once after Op registration has completed. +absl::Status ValidateKernelRegistrations( + const OpRegistryInterface& op_registry); + +// ----------------------------------------------------------------------------- +// OpKernel registration implementation follows, please ignore. + +// Allow the REGISTER_KERNEL_BUILDER(Name("op_name").Device(...)...) syntax. +namespace register_kernel { + +class Name : public KernelDefBuilder { + public: + explicit Name(const char* op); +}; + +} // namespace register_kernel + +// Kernel registration appears as: +// REGISTER_KERNEL_BUILDER(Name("OpName").Device(DEVICE_CPU)..., OpImpl) +// We'd like to have "OpName" as a constant-expression, without requiring that +// of the overall KernelDefBuilder expression (beginning with the +// register_kernel::Name constructor above). +// +// So, we pull the "OpName" part to a separate macro-level argument. This +// involves treating Name("OpName") as a macro call, via token-pasting (e.g. +// M_## => M_Name("OpName")), and having it expand to '"OpName", +// Name("OpName")' which is then usable as two arguments. +#define TF_EXTRACT_KERNEL_NAME_Name(name_str) \ + name_str, ::tensorflow::register_kernel::Name(name_str) +#define TF_EXTRACT_KERNEL_NAME_IMPL(m, ...) m(__VA_ARGS__) +#define TF_EXTRACT_KERNEL_NAME(m, kernel_builder, ...) \ + TF_EXTRACT_KERNEL_NAME_IMPL(m, TF_EXTRACT_KERNEL_NAME_##kernel_builder, \ + __VA_ARGS__) + +// REGISTER_KERNEL_BUILDER_IMPL_2, with a unique 'ctr' as the first argument. +// TODO(dodgen): There are some uses of this macro inside functions, where +// kernel_builder refers to (non-const) locals (they should be fixed). To +// accommodate those, kernel_builder.Build() appears as an argument to an +// immediately-called lambda (not in the lambda itself). +#define REGISTER_KERNEL_BUILDER_IMPL_3(ctr, op_name, kernel_builder_expr, \ + is_system_kernel, ...) 
\ + static ::tensorflow::InitOnStartupMarker const register_kernel_##ctr \ + TF_ATTRIBUTE_UNUSED = \ + TF_INIT_ON_STARTUP_IF(is_system_kernel || \ + (SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) && \ + SHOULD_REGISTER_OP(op_name))) \ + << ([](::tensorflow::KernelDef const* kernel_def) { \ + ::tensorflow::kernel_factory::OpKernelRegistrar registrar( \ + kernel_def, #__VA_ARGS__, \ + [](::tensorflow::OpKernelConstruction* context) \ + -> ::tensorflow::OpKernel* { \ + return new __VA_ARGS__(context); \ + }); \ + (void)registrar; \ + LOG_KERNEL_SOURCES(op_name) \ + return ::tensorflow::InitOnStartupMarker{}; \ + })(kernel_builder_expr.Build()); + +// REGISTER_KERNEL_BUILDER_IMPL, but with kernel_builder split to op_name, +// kernel_builder_expr. +#define REGISTER_KERNEL_BUILDER_IMPL_2(op_name, kernel_builder_expr, \ + is_system_kernel, ...) \ + TF_NEW_ID_FOR_INIT(REGISTER_KERNEL_BUILDER_IMPL_3, op_name, \ + kernel_builder_expr, is_system_kernel, __VA_ARGS__) + +// REGISTER_KERNEL_BUILDER, but with is_system_kernel bound. +#define REGISTER_KERNEL_BUILDER_IMPL(kernel_builder, is_system_kernel, ...) \ + TF_EXTRACT_KERNEL_NAME(REGISTER_KERNEL_BUILDER_IMPL_2, kernel_builder, \ + is_system_kernel, __VA_ARGS__) + +#define REGISTER_KERNEL_BUILDER(kernel_builder, ...) \ + TF_ATTRIBUTE_ANNOTATE("tf:kernel") \ + REGISTER_KERNEL_BUILDER_IMPL(kernel_builder, false, __VA_ARGS__) + +// The `REGISTER_SYSTEM_KERNEL_BUILDER()` macro acts as +// `REGISTER_KERNEL_BUILDER()` except that the kernel is registered +// unconditionally even when selective registration is used. +#define REGISTER_SYSTEM_KERNEL_BUILDER(kernel_builder, ...) \ + TF_ATTRIBUTE_ANNOTATE("tf:kernel") \ + TF_ATTRIBUTE_ANNOTATE("tf:kernel:system") \ + REGISTER_KERNEL_BUILDER_IMPL(kernel_builder, true, __VA_ARGS__) + +// Checks whether a given kernel is registered on device_type. +bool KernelDefAvailable(const DeviceType& device_type, const NodeDef& node_def); + +// If node of node_name, experimental_debug_info, node_op, node_device and +// node_attrs has a corresponding kernel registered on device_type, returns OK +// and fill in the kernel def and kernel_class_name. and +// may be null. +absl::Status FindKernelDef( + const DeviceType& device_type, StringPiece node_name, + bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + StringPiece node_op, StringPiece node_device, AttrSlice node_attrs, + const KernelDef** def, std::string* kernel_class_name); + +// If node_def has a corresponding kernel registered on device_type, +// returns OK and fill in the kernel def and kernel_class_name. and +// may be null. +absl::Status FindKernelDef(const DeviceType& device_type, + const NodeDef& node_def, const KernelDef** def, + std::string* kernel_class_name); + +// Writes a list of all registered kernels to LOG(INFO), to help users debug +// missing kernel errors. +void LogAllRegisteredKernels(); + +// Gets a list of all registered kernels. +KernelList GetAllRegisteredKernels(); + +// Gets a list of all registered kernels for which predicate returns true +KernelList GetFilteredRegisteredKernels( + const std::function& predicate); + +// Gets a list of all registered kernels for a given op +KernelList GetRegisteredKernelsForOp(StringPiece op_name); + +namespace kernel_factory { + +// OpKernelFactory is responsible for creating OpKernels when TensorFlow needs +// them. You register factories with the TensorFlow core by constructing an +// OpKernelRegistrar and passing the factory as a constructor parameter. 
+class OpKernelFactory { + public: + virtual OpKernel* Create(OpKernelConstruction* context) = 0; + virtual ~OpKernelFactory() = default; +}; + +class OpKernelRegistrar { + public: + // Registers the given kernel factory with TensorFlow. TF will call the + // factory Create() method when it determines that a kernel matching the given + // KernelDef is required. + OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name, + std::unique_ptr factory) + TF_ATTRIBUTE_NOINLINE { + InitInternal(kernel_def, kernel_class_name, std::move(factory)); + } + + // Registers the given factory function with TensorFlow. This is equivalent + // to registering a factory whose Create function invokes `create_fn`. + OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name, + OpKernel* (*create_fn)(OpKernelConstruction*)) + TF_ATTRIBUTE_NOINLINE { + InitInternal(kernel_def, kernel_class_name, + std::make_unique(create_fn)); + } + + private: + struct PtrOpKernelFactory : public OpKernelFactory { + explicit PtrOpKernelFactory(OpKernel* (*create_func)(OpKernelConstruction*)) + : create_func_(create_func) {} + + OpKernel* Create(OpKernelConstruction* context) override; + + OpKernel* (*create_func_)(OpKernelConstruction*); + }; + + void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name, + std::unique_ptr factory); +}; + +} // namespace kernel_factory + +// ----------------------------------------------------------------------------- +// Template and inline method implementations, please ignore + +template +absl::Status OpKernelConstruction::GetAttr(StringPiece attr_name, + T* value) const { + return GetNodeAttr(def(), attr_name, value); +} + +inline DataType OpKernelContext::input_dtype(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_inputs()); + const TensorValue& value(params_->inputs[index]); + return value.dtype(); +} + +inline MemoryType OpKernelContext::input_memory_type(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_inputs()); + return op_kernel().input_memory_types()[index]; +} + +inline DataType OpKernelContext::expected_output_dtype(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_outputs()); + return params_->op_kernel->output_type(index); +} + +inline MemoryType OpKernelContext::output_memory_type(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_outputs()); + return op_kernel().output_memory_types()[index]; +} + +inline bool OpKernelContext::input_is_ref(int index) const { + const TensorValue& value(params_->inputs[index]); + return value.is_ref(); +} + +// no input if tensor == nullptr. 
+inline bool OpKernelContext::has_input(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_inputs()); + return params_->inputs[index].tensor != nullptr; +} + +inline mutex* OpKernelContext::input_ref_mutex(int index) { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_inputs()); + DCHECK(input_is_ref(index)); + return params_->inputs[index].mutex_if_ref; +} + +inline Tensor* OpKernelContext::mutable_output(int index) { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_outputs()); + return outputs_[index].tensor; +} + +inline TensorValue OpKernelContext::release_output(int index) { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_outputs()); + TensorValue value = outputs_[index]; + outputs_[index] = TensorValue(); + return value; +} + +template +T* OpKernelContext::op_device_context() { + static_assert(std::is_base_of::value, + "T is not a subclass of DeviceContext"); + return static_cast(op_device_context()); +} + +inline const Tensor& OpInputList::operator[](int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->input(start_ + i); +} + +inline mutex* OpMutableInputList::ref_mutex(int i) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->input_ref_mutex(start_ + i); +} + +inline Tensor OpMutableInputList::at(int i, bool lock_held) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->mutable_input(start_ + i, lock_held); +} + +inline Tensor* OpOutputList::operator[](int i) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->mutable_output(start_ + i); +} + +inline bool OpOutputList::required(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->output_required(start_ + i); +} + +inline DataType OpOutputList::expected_output_dtype(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->expected_output_dtype(start_ + i); +} + +inline absl::Status OpOutputList::allocate(int i, const TensorShape& shape, + Tensor** output) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + return ctx_->allocate_output(start_ + i, shape, output); +} + +inline void OpOutputList::set(int i, const Tensor& tensor) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + ctx_->set_output(start_ + i, tensor); +} + +inline void OpOutputList::set(int i, Tensor&& tensor) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + ctx_->set_output(start_ + i, std::move(tensor)); +} + +inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) { + DCHECK_GE(i, 0); + DCHECK_LT(i, stop_ - start_); + ctx_->set_output_ref(i, mu, tensor_for_ref); +} + +// Generate a fatal error if OP_REQUIRES or OP_REQUIRES_OK are used in +// AsyncOpKernel implementations. If these macros are used and the condition +// does not hold, the `done` callback will never be called and the system will +// deadlock, so a crash failure is preferable. Since the OP_REQUIRES[_OK] macros +// are legal to use in AsyncOpKernel constructors, we use overload resolution +// to distinguish between OpKernelConstruction* and OpKernelContext* context +// types. 
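A sketch of how a kernel might consume a list-valued input and produce a list-valued output through the OpInputList/OpOutputList accessors implemented above; the argument names "values" and "doubled" are placeholders for names defined by the op's OpDef.

void Compute(OpKernelContext* ctx) override {
  OpInputList values;
  OP_REQUIRES_OK(ctx, ctx->input_list("values", &values));
  OpOutputList doubled;
  OP_REQUIRES_OK(ctx, ctx->output_list("doubled", &doubled));
  for (int i = 0; i < values.size(); ++i) {
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, doubled.allocate(i, values[i].shape(), &out));
    out->flat<float>() = values[i].flat<float>() * 2.0f;
  }
}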
+class XlaOpKernelContext; +inline void CheckNotInComputeAsync(XlaOpKernelContext*, const char*) {} +inline void CheckNotInComputeAsync(OpKernelConstruction*, const char*) {} +void CheckNotInComputeAsync(OpKernelContext* ctx, + const char* correct_macro_name); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel_test_base.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel_test_base.h new file mode 100644 index 00000000..7b3951e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_kernel_test_base.h @@ -0,0 +1,177 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_TEST_BASE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_TEST_BASE_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +static std::vector DeviceTypes() { + return {DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)}; +} + +class OpKernelBuilderTest : public ::testing::Test { + protected: + // Each attr is described by a "name|type|value". 
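For example, a test built on the fixture declared below might describe attrs with such "name|type|value" strings and use the ExpectSuccess helper defined further down; the op name "MyOp" is a placeholder for an op registered in the test binary.

TEST_F(OpKernelBuilderTest, BuildsFloatKernel) {
  // "T|type|DT_FLOAT" sets the type attr T; "N|int|3" would set an int attr.
  ExpectSuccess("MyOp", DeviceType(DEVICE_CPU), {"T|type|DT_FLOAT"});
}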
+ NodeDef CreateNodeDef(const string& op_type, + const std::vector& attrs) { + NodeDef node_def; + node_def.set_name(op_type + "-op"); + node_def.set_op(op_type); + for (const string& attr_desc : attrs) { + std::vector parts = str_util::Split(attr_desc, '|'); + CHECK_EQ(parts.size(), 3); + AttrValue attr_value; + CHECK(ParseAttrValue(parts[1], parts[2], &attr_value)) << attr_desc; + node_def.mutable_attr()->insert( + AttrValueMap::value_type(parts[0], attr_value)); + } + return node_def; + } + + std::unique_ptr ExpectSuccess(const string& op_type, + const DeviceType& device_type, + const std::vector& attrs, + DataTypeSlice input_types = {}) { + absl::Status status; + NodeDef def = CreateNodeDef(op_type, attrs); + for (size_t i = 0; i < input_types.size(); ++i) { + def.add_input("a:0"); + } + + Env* env = Env::Default(); + DeviceBase device(env); + + // Test CreateOpKernel() + std::unique_ptr op(CreateOpKernel(device_type, &device, + cpu_allocator(), def, + TF_GRAPH_DEF_VERSION, &status)); + EXPECT_TRUE(status.ok()) << status; + EXPECT_TRUE(op != nullptr); + if (op != nullptr) { + EXPECT_EQ(input_types.size(), op->num_inputs()); + EXPECT_EQ(0, op->num_outputs()); + } + + // Test SupportedDeviceTypesForNode() + PrioritizedDeviceTypeVector devices; + TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices)); + bool found = false; + for (const auto& dt : devices) { + if (dt.first == device_type) { + found = true; + } + } + EXPECT_TRUE(found) << "Missing " << device_type << " from " + << devices.size() << " devices."; + + // In case the caller wants to use the OpKernel + return op; + } + + void ExpectFailure(const string& op_type, const DeviceType& device_type, + const std::vector& attrs, error::Code code) { + absl::Status status; + const NodeDef def = CreateNodeDef(op_type, attrs); + Env* env = Env::Default(); + DeviceBase device(env); + + // Test CreateOpKernel(). + std::unique_ptr op(CreateOpKernel(device_type, &device, + cpu_allocator(), def, + TF_GRAPH_DEF_VERSION, &status)); + EXPECT_TRUE(op == nullptr); + EXPECT_FALSE(status.ok()); + if (!status.ok()) { + LOG(INFO) << "Status message: " << status.message(); + EXPECT_EQ(code, status.code()); + + // Test SupportedDeviceTypesForNode(). 
+ PrioritizedDeviceTypeVector devices; + if (absl::IsNotFound(status)) { + TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices)); + for (const auto& dt : devices) { + EXPECT_NE(dt.first, device_type); + } + } else { + absl::Status status2 = + SupportedDeviceTypesForNode(DeviceTypes(), def, &devices); + EXPECT_EQ(status.code(), status2.code()); + } + } + } + + string GetKernelClassName(const string& op_type, + const DeviceType& device_type, + const std::vector& attrs, + DataTypeSlice input_types = {}) { + NodeDef def = CreateNodeDef(op_type, attrs); + for (size_t i = 0; i < input_types.size(); ++i) { + def.add_input("a:0"); + } + + const KernelDef* kernel_def = nullptr; + string kernel_class_name; + const absl::Status status = + FindKernelDef(device_type, def, &kernel_def, &kernel_class_name); + if (status.ok()) { + return kernel_class_name; + } else if (absl::IsNotFound(status)) { + return "not found"; + } else { + return status.ToString(); + } + } +}; + +class BaseKernel : public ::tensorflow::OpKernel { + public: + explicit BaseKernel(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(::tensorflow::OpKernelContext* context) override {} + virtual int Which() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_TEST_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_requires.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_requires.h new file mode 100644 index 00000000..d9a7e35c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_requires.h @@ -0,0 +1,159 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ + +#include + +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Convenience macros for asserting and handling exceptional conditions. +// Analogous to the CHECK* macros provided by logging.h. +// +// Example use: +// void Compute(OperationContext* context) { +// OP_REQUIRES(context, context->num_inputs() == 2, +// errors::InvalidArgument("FooOp requires 2 arguments")); +// ... +// absl::Status status = SomeUncertainMethod(); +// OP_REQUIRES_OK(context, status); +// +// // Or in one go: +// OP_REQUIRES_OK(context, SomeUncertainMethod()); +// ... +// } +// +// The *_ASYNC versions take a CALLBACK macro argument which is called just +// before the return in the failure case; the expression in the macro itself +// is evaluated only in the failure case, and can therefore be expensive or +// have side effects that must not occur in the successful case. For example: +// +// auto done = MakeCleanup([&]() { /* necessary continuation */ }); +// OP_REQUIRES_OK_ASYNC(context, SomeUncertainMethod(), done.release()); +// // `done` is still engaged if and only if control reaches here. 
+// +// These macros depend on CheckNotInComputeAsync and on absl::Status, both +// of which must be defined before invoking the macros. We specifically don't +// include op_kernel.h or the Abseil headers from this header to reduce this +// header's dependencies. These macros may be used with alternative +// implementations of OpKernelContext with fewer dependencies. + +#define OP_REQUIRES(CTX, EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + CheckNotInComputeAsync((CTX), "OP_REQUIRES_ASYNC"); \ + (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ + return; \ + } \ + } while (0) + +// The macro arguements passed to the ellipsis must combine to a single +// expression that is convertible to absl::Status. We accept a variable +// number of macro arguments only so as to support interior commas. +#define OP_REQUIRES_OK(CTX, ...) \ + do { \ + if (!TF_PREDICT_TRUE( \ + ::tensorflow::op_requires_internal::OkImpl<::absl::Status>( \ + (CTX), __FILE__, __LINE__, \ + static_cast(__VA_ARGS__)))) { \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_OK_OR_SET_PAYLOAD(CTX, PAYLOAD_KEY, PAYLOAD_VALUE, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(STATUS.ok())) { \ + CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \ + if (!PAYLOAD_VALUE.empty()) { \ + STATUS.SetPayload(PAYLOAD_KEY, absl::Cord(PAYLOAD_VALUE)); \ + } \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, STATUS); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ + (CALLBACK)(); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \ + do { \ + if (!TF_PREDICT_TRUE( \ + ::tensorflow::op_requires_internal::OkAsyncImpl<::absl::Status>( \ + (CTX), __FILE__, __LINE__, (STATUS)))) { \ + (CALLBACK)(); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_VALUE(lhs, ctx, rexpr) \ + OP_REQUIRES_VALUE_IMPL( \ + TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, ctx, \ + rexpr) + +#define OP_REQUIRES_VALUE_IMPL(statusor, lhs, ctx, rexpr) \ + auto statusor = (rexpr); \ + OP_REQUIRES_OK(ctx, statusor.status()); \ + lhs = std::move(statusor.value()) + +// The "Impl" functions are implementation details for the above macros. They +// accept values constructed by the macros, and the values are guaranteed to +// be alive for the duration of the function call. Passing the macro arguments +// through a function call is important to support macro arguments that expand +// to short-lived values (which could not be bound to a reference directly). +// +// We use a template parameter S instead of the concrete type absl::Status +// so as to not require the inclusion of the Abseil header in this file. +// The header must be included before the macros are used. + +namespace op_requires_internal { + +// ctx is usually a plain pointer, but could be a smart pointer, so we accept it +// by const ref. +template +bool OkImpl(const Ctx& ctx, const char* file, int line, const S& s) { + if (!TF_PREDICT_TRUE(s.ok())) { + CheckNotInComputeAsync(ctx, "OP_REQUIRES_OK_ASYNC"); + ctx->CtxFailureWithWarning(file, line, s); + return false; + } else { + return true; + } +} + +// ctx is usually a plain pointer, but could be a smart pointer, so we accept it +// by const ref. 
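A small usage sketch combining the macros above in a synchronous Compute(); it assumes OpKernelContext::get_input() yields absl::StatusOr<const Tensor*>, as its documentation in op_kernel.h suggests.

void Compute(OpKernelContext* ctx) override {
  OP_REQUIRES(ctx, ctx->num_inputs() == 2,
              errors::InvalidArgument("expected exactly 2 inputs"));

  // OP_REQUIRES_VALUE unwraps a StatusOr, returning early on error.
  OP_REQUIRES_VALUE(const Tensor* a, ctx, ctx->get_input(0));

  Tensor* out = nullptr;
  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, a->shape(), &out));
  out->flat<float>() = a->flat<float>();
}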
+template +bool OkAsyncImpl(const Ctx& ctx, const char* file, int line, const S& s) { + if (!TF_PREDICT_TRUE(s.ok())) { + ctx->CtxFailureWithWarning(file, line, s); + return false; + } else { + return true; + } +} + +} // namespace op_requires_internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/op_segment.h b/third_party/tflite-hdrs/tensorflow/core/framework/op_segment.h new file mode 100644 index 00000000..10c4fa46 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/op_segment.h @@ -0,0 +1,90 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_SEGMENT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_SEGMENT_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// OpSegment keeps track of OpKernels registered for sessions running +// on a device. +// +// The implementation maintains a two-level map. The 1st level maps +// session handle to the map of registered OpKernels. The 2nd level +// map maps node names to instantiated OpKernel objects. +// +// Each 2-nd level map is reference-counted and the caller can call +// AddHold to obtain a reference on all kernels of a session and +// ensure these kernels are alive until a corresponding RemoveHold is +// called on the same session. +class OpSegment { + public: + OpSegment(); + ~OpSegment(); + + // A hold can be placed on a session, preventing all its kernels + // from being deleted. + void AddHold(const std::string& session_handle); + void RemoveHold(const std::string& session_handle); + + // If the kernel for "node_name" has been created in the + // "session_handle", returns the existing op kernel in "*kernel". + // Otherwise, creates the kernel by calling create_fn(), cache it, + // and returns it in "*kernel". If create_fn() fails, returns the + // error. + // + // OpSegment keeps the ownership of the returned "*kernel". + typedef std::function CreateKernelFn; + absl::Status FindOrCreate(const std::string& session_handle, + const std::string& node_name, OpKernel** kernel, + CreateKernelFn create_fn); + + // Returns true if OpSegment should own the kernel. + static bool ShouldOwnKernel(FunctionLibraryRuntime* lib, + const std::string& node_op); + + private: + // op name -> OpKernel + typedef std::unordered_map KernelMap; + struct Item { + int num_holds = 1; // Num of holds put on the session. + KernelMap name_kernel; // op name -> kernel. + ~Item(); + }; + + // session handle -> item. 
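A sketch of the intended FindOrCreate call pattern for the OpSegment declared above. The session_handle, node_name, device_type, device, allocator, flib, props and graph_def_version variables are assumed to be in scope (they are placeholders here), and the create callback is assumed to take an OpKernel** and return absl::Status, consistent with the comment on CreateKernelFn.

OpSegment segment;
segment.AddHold(session_handle);  // keep this session's kernels alive

OpKernel* kernel = nullptr;
absl::Status s = segment.FindOrCreate(
    session_handle, node_name, &kernel,
    [&](OpKernel** out) {
      // Invoked only when no kernel is cached for node_name in this session.
      return CreateOpKernel(device_type, device, allocator, flib, props,
                            graph_def_version, out);
    });
// ... *kernel remains owned by the OpSegment ...

segment.RemoveHold(session_handle);  // cached kernels may now be deleted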
+ // Session handles are produced by strings::FpToString() + typedef std::unordered_map SessionMap; + + mutable mutex mu_; + SessionMap sessions_ TF_GUARDED_BY(mu_); + + OpSegment(const OpSegment&) = delete; + void operator=(const OpSegment&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_SEGMENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/ops_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/ops_util.h new file mode 100644 index 00000000..ae73a562 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/ops_util.h @@ -0,0 +1,116 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_ + +// This file contains utilities for various operations. + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { + +// Calculates broadcast starting index and size. For SAME padding, addition +// padding could be applied to right, left, top and bottom. Depending on the +// current index, input size, kernel size, stride, padding size, the starting +// index and size for broadcast for that dimension are different from the +// current index and kernel size. +// This is mainly used by gradient algorithms for pooling operations. +absl::Status GetBroadcastSize(const int index, const int in_size, + const int ksize, const int stride, + const int pad_size, int* bindex, int* bsize); + +// Converts Brain's Padding to Eigen's PaddingType. +Eigen::PaddingType BrainPadding2EigenPadding(Padding padding); + +// Given a shape 's' of a tensor of type T. Returns true iff the +// number of bytes occupied by each dim 0 (i.e., &tensor(i + 1, ...) - +// &tensor(i, ...)) is multiple of EIGEN_MAX_ALIGN_BYTES. +template +bool IsInnerDimsSizeAligned(const TensorShape& s) { + if (s.dims() == 0) return false; + const int64_t dim0_size = s.dim_size(0); + if (dim0_size == 0) return false; +#if EIGEN_MAX_ALIGN_BYTES == 0 + return true; +#else + const int64_t bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T); + return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0; +#endif +} + +// Given a shape 's' of a tensor of type T and the `start` and `end` index of a +// dim 0 slice, returns true iff slice is aligned with respect to original +// tensor. Here aligned implies the address is a multiple of +// EIGEN_MAX_ALIGN_BYTES. 
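A small worked example of the IsInnerDimsSizeAligned check above, assuming a build where EIGEN_MAX_ALIGN_BYTES is 64 (IsDim0SliceAligned, whose comment appears just above, reuses this check for rank > 1):

TensorShape aligned({8, 16});   // 16 floats * 4 bytes = 64 bytes per dim-0 slice
TensorShape unaligned({8, 3});  //  3 floats * 4 bytes = 12 bytes per dim-0 slice
bool ok1 = IsInnerDimsSizeAligned<float>(aligned);    // true  (64 % 64 == 0)
bool ok2 = IsInnerDimsSizeAligned<float>(unaligned);  // false (12 % 64 != 0)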
+template +bool IsDim0SliceAligned(const TensorShape& s, int64_t start, + int64_t end_or_size) { + if (s.dims() == 1) { +#if EIGEN_MAX_ALIGN_BYTES == 0 + return true; +#else + bool start_aligned = (start * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0; + // End is aligned if either the explicit end index is passed and is a + // a multiple of EIGEN_MAX_ALIGN_BYTES, or the start index is aligned and + // the size is aligned. So for convenience we can either pass start and + // index, or start and size. + bool end_aligned = (end_or_size * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0; + return start_aligned && end_aligned; +#endif + } else { + return IsInnerDimsSizeAligned(s); + } +} + +// Returns sanitized to have only [a-zA-Z0-9-_]. +std::string SanitizeThreadSuffix(std::string suffix); + +// Helper to compute 'strides' given a tensor 'shape'. I.e., +// strides[i] = prod(shape.dim_size[(i+1):]) +template +gtl::InlinedVector ComputeStride(const TensorShape& shape) { + const int ndims = shape.dims(); + gtl::InlinedVector strides(ndims); + T stride = 1; + for (int i = ndims - 1; i >= 0; --i) { + strides[i] = stride; + stride *= static_cast(shape.dim_size(i)); + } + return strides; +} + +// Helper to compute 'strides' given an Eigen TensorDimensions +template +gtl::InlinedVector ComputeEigenStrides(const EigenDimensions& shape) { + const int ndims = shape.rank(); + gtl::InlinedVector strides(ndims); + T stride = 1; + for (int i = ndims - 1; i >= 0; --i) { + strides[i] = stride; + stride *= static_cast(shape[i]); + } + return strides; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/partial_tensor_shape.h b/third_party/tflite-hdrs/tensorflow/core/framework/partial_tensor_shape.h new file mode 100644 index 00000000..fa1ce07d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/partial_tensor_shape.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_PARTIAL_TENSOR_SHAPE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_PARTIAL_TENSOR_SHAPE_H_ + +// TODO(irving): Remove this forwarding header +#include "tensorflow/core/framework/tensor_shape.h" + +#endif // TENSORFLOW_CORE_FRAMEWORK_PARTIAL_TENSOR_SHAPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/queue_interface.h b/third_party/tflite-hdrs/tensorflow/core/framework/queue_interface.h new file mode 100644 index 00000000..e916b506 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/queue_interface.h @@ -0,0 +1,102 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// All implementations must be thread-safe. +class QueueInterface : public ResourceBase { + public: + typedef std::vector Tuple; + typedef AsyncOpKernel::DoneCallback DoneCallback; + typedef std::function CallbackWithTuple; + + virtual absl::Status ValidateTuple(const Tuple& tuple) = 0; + virtual absl::Status ValidateManyTuple(const Tuple& tuple) = 0; + + // Stashes a function object for future execution, that will eventually + // enqueue the tuple of tensors into the queue, and returns immediately. The + // function object is guaranteed to call 'callback'. + virtual void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) = 0; + + // Same as above, but the component tensors are sliced along the 0th dimension + // to make multiple queue-element components. + virtual void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) = 0; + + // Stashes a function object for future execution, that will eventually + // dequeue an element from the queue and call 'callback' with that tuple + // element as argument. + virtual void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) = 0; + + // Same as above, but the stashed function object will attempt to dequeue + // num_elements items. If allow_small_batch is true, and the Queue is + // closed but at least 1 element is available, there is no blocking + // and between 1 and num_elements items are immediately returned. + // If the queue does not support the allow_small_batch flag will + // return an Unimplemented error. + virtual void TryDequeueMany(int num_elements, OpKernelContext* ctx, + bool allow_small_batch, + CallbackWithTuple callback) = 0; + + // Signals that no more elements will be enqueued, and optionally + // cancels pending Enqueue(Many) operations. + // + // After calling this function, subsequent calls to Enqueue(Many) + // will fail. If `cancel_pending_enqueues` is true, all pending + // calls to Enqueue(Many) will fail as well. + // + // After calling this function, all current and subsequent calls to + // Dequeue(Many) will fail instead of blocking (though they may + // succeed if they can be satisfied by the elements in the queue at + // the time it was closed). + virtual void Close(OpKernelContext* ctx, bool cancel_pending_enqueues, + DoneCallback callback) = 0; + + // Returns true if a given queue is closed and false if it is open. + virtual bool is_closed() const = 0; + + // Assuming *this represents a shared queue, verify that it matches + // another instantiation indicated by node_def. 
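To illustrate the callback style of this interface, a helper one might call from an AsyncOpKernel's ComputeAsync is sketched below; the output wiring is illustrative and assumes the queue's element components map one-to-one onto the kernel's outputs:

// Forwards one dequeued element to the kernel's outputs, then signals done.
void DequeueOneAndForward(QueueInterface* queue, OpKernelContext* ctx,
                          AsyncOpKernel::DoneCallback done) {
  queue->TryDequeue(ctx, [ctx, done](const QueueInterface::Tuple& tuple) {
    if (!ctx->status().ok()) {  // e.g. queue closed and empty, or cancelled
      done();
      return;
    }
    for (size_t i = 0; i < tuple.size(); ++i) {
      ctx->set_output(static_cast<int>(i), tuple[i]);
    }
    done();
  });
}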
+ virtual absl::Status MatchesNodeDef(const NodeDef& node_def) = 0; + + // Returns the number of elements in the queue. + virtual int32 size() const = 0; + + virtual const DataTypeVector& component_dtypes() const = 0; + + string DebugString() const override { + return strings::StrCat("A Queue of size: ", size()); + } + + protected: + ~QueueInterface() override {} +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/reader_base.h b/third_party/tflite-hdrs/tensorflow/core/framework/reader_base.h new file mode 100644 index 00000000..73842644 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/reader_base.h @@ -0,0 +1,139 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_ + +#include +#include +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/reader_interface.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +class ReaderBaseState; + +// Default implementation of ReaderInterface. +class ReaderBase : public ReaderInterface { + public: + // name: For use in error messages, should mention both the name of + // the op and the node. + explicit ReaderBase(const string& name); + + // Note that methods with names ending in "Locked" are called while + // the ReaderBase's mutex is held. + + // Implement this function in descendants ----------------------------------- + + // Produce the next key/value pair from the current work item. + // This is called "Locked" since it is executed under a mutex + // that serializes all Reader calls. + // Usage: + // a) If a record was successfully produced, set *produced = true, + // and fill in *key and *value. + // b) If no more records will be produced for this work item, set + // *at_end = true. + // c) If a record was produced, but no more will be produced, you + // may either do both (a) and (b), or do (a) in this call and do (b) in + // the next call to ReadLocked(). + // d) If there was an error producing (e.g. an error reading the file, + // data corruption), return a non-OK() status. ReadLocked may be + // called again if the user reruns this part of the graph. + virtual absl::Status ReadLocked(tstring* key, tstring* value, bool* produced, + bool* at_end) = 0; + + // Descendants may optionally implement these ------------------------------- + + // Produce up to num_records next key/value pairs from the current + // work item, in the same manner of ReadLocked. + virtual absl::Status ReadUpToLocked(int64_t num_records, + std::vector* keys, + std::vector* values, + int64_t* num_read, bool* at_end); + + // Called when work starts / finishes. 
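To make the (a)/(b) contract of ReadLocked concrete, here is a hypothetical reader that emits exactly one record per work item; only the required override and one optional hook (which the declarations right after this sketch correspond to) are shown:

class OneShotReader : public ReaderBase {
 public:
  explicit OneShotReader(const string& node_name)
      : ReaderBase(strings::StrCat("OneShotReader '", node_name, "'")) {}

  // Case (a) on the first call for a work item, case (b) afterwards.
  absl::Status ReadLocked(tstring* key, tstring* value, bool* produced,
                          bool* at_end) override {
    if (emitted_) {
      *at_end = true;                  // (b): no more records for this work item
      return absl::OkStatus();
    }
    *key = KeyName(current_work());    // (a): produce one key/value record
    *value = current_work();
    *produced = true;
    emitted_ = true;
    return absl::OkStatus();
  }

  absl::Status OnWorkStartedLocked() override {
    emitted_ = false;                  // new work item -> allow one record again
    return absl::OkStatus();
  }

 private:
  bool emitted_ = false;
};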
+ virtual absl::Status OnWorkStartedLocked() { return absl::OkStatus(); } + virtual absl::Status OnWorkFinishedLocked() { return absl::OkStatus(); } + + // Called to reset the Reader to a newly constructed state. + virtual absl::Status ResetLocked(); + + // Default implementation generates an Unimplemented error. + // See the protected helper methods below. + virtual absl::Status SerializeStateLocked(tstring* state); + virtual absl::Status RestoreStateLocked(const tstring& state); + + // Accessors ---------------------------------------------------------------- + + // Always true during a call to ReadLocked(). + bool work_in_progress() const { return work_finished_ < work_started_; } + + // Returns the name of the current work item (valid if + // work_in_progress() returns true). May change between calls to + // ReadLocked(). + const tstring& current_work() const { return work_; } + + // What was passed to the constructor. + const string& name() const { return name_; } + + // Produce the key name (from current_work and the actual key). + tstring KeyName(const tstring& key) const; + + protected: + // For descendants wishing to implement serialize & restore state. + + // Writes ReaderBase state to *state. + void SaveBaseState(ReaderBaseState* state) const; + + // Restores ReaderBase state from state. Assumes state was filled + // using SaveBaseState() above. + absl::Status RestoreBaseState(const ReaderBaseState& state); + + private: + // For descendants that wish to obtain the next work item in a different way. + // For implementing Read(). Dequeues the next work item from + // *queue, and if successful returns "work" (a string). May block. + virtual string GetNextWorkLocked(QueueInterface* queue, + OpKernelContext* context) const; + + // Implementations of ReaderInterface methods. These ensure thread-safety + // and call the methods above to do the work. + void Read(QueueInterface* queue, tstring* key, tstring* value, + OpKernelContext* context) override; + + // Produces up to num_records. + // In this implementation all the records come from the same work unit. + int64_t ReadUpTo(const int64_t num_records, QueueInterface* queue, + std::vector* keys, std::vector* value, + OpKernelContext* context) override; + + absl::Status Reset() override; + int64_t NumRecordsProduced() override; + int64_t NumWorkUnitsCompleted() override; + absl::Status SerializeState(tstring* state) override; + absl::Status RestoreState(const tstring& state) override; + + mutable mutex mu_; + const string name_; + int64_t work_started_ = 0; + int64_t work_finished_ = 0; + int64_t num_records_produced_ = 0; + tstring work_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/reader_interface.h b/third_party/tflite-hdrs/tensorflow/core/framework/reader_interface.h new file mode 100644 index 00000000..6210b68f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/reader_interface.h @@ -0,0 +1,88 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_ + +#include +#include +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class QueueInterface; +class ReaderInterface; + +// Readers are the mechanism for reading records from files in +// TensorFlow graphs. Each supported file format has a corresponding +// ReaderInterface descendant and a corresponding Op & OpKernel +// (implemented using ReaderOpKernel from reader_op_kernel.h). +// +// To use a Reader, you first encode "work" (some string, typically a +// filename) in the Reader's "work queue". It then processes the +// "work" (reading records from the file), to produce key/value +// strings. The methods of this class are called by ReaderFoo ops, +// so see ../ops/io_ops.cc for detailed descriptions. +// +// All descendants of this class must be thread-safe. +class ReaderInterface : public ResourceBase { + public: + // Read a single record into *key / *value. May get more work from + // *queue if the current work is complete. Sets the status on + // *context with an OutOfRange Status if the current work is + // complete and the queue is done (closed and empty). + // This method may block. + virtual void Read(QueueInterface* queue, tstring* key, tstring* value, + OpKernelContext* context) = 0; + + // Read up to num_records records into keys / values. May get more work from + // *queue if the current work is complete. Sets the status on + // *context with an OutOfRange Status if the current work is + // complete and the queue is done (closed and empty). + // This method may block. + // The std::vector keys/value pointers are assumed to point to empty + // structures (that have most likely been reserve(num_records)). + // Returns how many records were actually read. + virtual int64_t ReadUpTo(const int64_t num_records, QueueInterface* queue, + std::vector* keys, + std::vector* value, + OpKernelContext* context) = 0; + + // Restore this reader to its newly-constructed state. + virtual absl::Status Reset() = 0; + + // Accessors + virtual int64_t NumRecordsProduced() = 0; + virtual int64_t NumWorkUnitsCompleted() = 0; + + // -- Serialization/Restoration support -- + // Not all readers will support saving and restoring state. + virtual absl::Status SerializeState(tstring* state) = 0; + // Note: Must Reset on error. + virtual absl::Status RestoreState(const tstring& state) = 0; + + string DebugString() const override { return "a reader"; } + + protected: + ~ReaderInterface() override {} +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/reader_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/framework/reader_op_kernel.h new file mode 100644 index 00000000..bc1a7629 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/reader_op_kernel.h @@ -0,0 +1,87 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/reader_interface.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// NOTE: This is now a very thin layer over ResourceOpKernel. +// TODO(sjhwang): Remove dependencies to this class, then delete this. + +// Implementation for ops providing a Reader. +class ReaderOpKernel : public ResourceOpKernel { + public: + using ResourceOpKernel::ResourceOpKernel; + + // Must be called by descendants before the first call to Compute() (typically + // called during construction). factory must return a ReaderInterface + // descendant allocated with new that ReaderOpKernel will take ownership of. + void SetReaderFactory(std::function factory) + TF_LOCKS_EXCLUDED(mu_) { + DCHECK(get_resource() == nullptr); + mutex_lock l(mu_); + factory_ = factory; + } + + void Compute(OpKernelContext* context) override { + if (!IsCancellable()) { + ResourceOpKernel::Compute(context); + } else { + // Install cancellation + CancellationManager* cm = context->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled = + !cm->RegisterCallback(token, [this]() { this->Cancel(); }); + + if (!already_cancelled) { + ResourceOpKernel::Compute(context); + } else { + context->SetStatus(errors::Cancelled("read operation was cancelled")); + } + } + } + + private: + virtual bool IsCancellable() const { return false; } + virtual void Cancel() {} + + absl::Status CreateResource(ReaderInterface** reader) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) override { + *reader = factory_(); + if (*reader == nullptr) { + return errors::ResourceExhausted("Failed to allocate reader"); + } + std::function temp = nullptr; + factory_.swap(temp); + return absl::OkStatus(); + } + + std::function factory_ TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/ref_var.h b/third_party/tflite-hdrs/tensorflow/core/framework/ref_var.h new file mode 100644 index 00000000..8e423e81 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/ref_var.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
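Connecting ReaderOpKernel (above) to the hypothetical OneShotReader sketched earlier: a reader op would install its factory from the constructor, before any call to Compute(); the op class name is illustrative and ReaderOpKernel takes ownership of the returned reader:

class OneShotReaderOp : public ReaderOpKernel {
 public:
  explicit OneShotReaderOp(OpKernelConstruction* context)
      : ReaderOpKernel(context) {
    // OneShotReader is the hypothetical ReaderBase subclass sketched above.
    SetReaderFactory([this]() { return new OneShotReader(name()); });
  }
};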
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_REF_VAR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_REF_VAR_H_ + +#include + +namespace tensorflow { +class OpKernelContext; + +void AssignRefVariable( + OpKernelContext* context, int input_ref_index, int output_ref_index, + int value_index, bool use_locking, bool validate_shape, + bool relax_constraints, + std::function copy); +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_REF_VAR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/register_types.h b/third_party/tflite-hdrs/tensorflow/core/framework/register_types.h new file mode 100644 index 00000000..eba2ae88 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/register_types.h @@ -0,0 +1,233 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_ +// This file is used by cuda code and must remain compilable by nvcc. + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/platform/types.h" + +// Two sets of macros: +// - TF_CALL_float, TF_CALL_double, etc. which call the given macro with +// the type name as the only parameter - except on platforms for which +// the type should not be included. +// - Macros to apply another macro to lists of supported types. These also call +// into TF_CALL_float, TF_CALL_double, etc. so they filter by target platform +// as well. +// If you change the lists of types, please also update the list in types.cc. +// +// See example uses of these macros in core/ops. +// +// +// Each of these TF_CALL_XXX_TYPES(m) macros invokes the macro "m" multiple +// times by passing each invocation a data type supported by TensorFlow. +// +// The different variations pass different subsets of the types. +// TF_CALL_ALL_TYPES(m) applied "m" to all types supported by TensorFlow. +// The set of types depends on the compilation platform. +//. +// This can be used to register a different template instantiation of +// an OpKernel for different signatures, e.g.: +/* + #define REGISTER_PARTITION(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Partition").Device(DEVICE_CPU).TypeConstraint("T"), \ + PartitionOp); + TF_CALL_ALL_TYPES(REGISTER_PARTITION) + #undef REGISTER_PARTITION +*/ + +#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \ + defined(ANDROID_TEGRA) + +// All types are supported, so all macros are invoked. +// +// Note: macros are defined in same order as types in types.proto, for +// readability. 
+#define TF_CALL_float(m) m(float) +#define TF_CALL_double(m) m(double) +#define TF_CALL_int32(m) m(::tensorflow::int32) +#define TF_CALL_uint32(m) m(::tensorflow::uint32) +#define TF_CALL_uint8(m) m(::tensorflow::uint8) +#define TF_CALL_int16(m) m(::tensorflow::int16) + +#define TF_CALL_int8(m) m(::tensorflow::int8) +#define TF_CALL_string(m) m(::tensorflow::tstring) +#define TF_CALL_tstring(m) m(::tensorflow::tstring) +#define TF_CALL_resource(m) m(::tensorflow::ResourceHandle) +#define TF_CALL_variant(m) m(::tensorflow::Variant) +#define TF_CALL_complex64(m) m(::tensorflow::complex64) +#define TF_CALL_int64(m) m(::int64_t) +#define TF_CALL_uint64(m) m(::tensorflow::uint64) +#define TF_CALL_bool(m) m(bool) + +#define TF_CALL_qint8(m) m(::tensorflow::qint8) +#define TF_CALL_quint8(m) m(::tensorflow::quint8) +#define TF_CALL_qint32(m) m(::tensorflow::qint32) +#define TF_CALL_bfloat16(m) m(::tensorflow::bfloat16) +#define TF_CALL_qint16(m) m(::tensorflow::qint16) + +#define TF_CALL_quint16(m) m(::tensorflow::quint16) +#define TF_CALL_uint16(m) m(::tensorflow::uint16) +#define TF_CALL_complex128(m) m(::tensorflow::complex128) +#define TF_CALL_half(m) m(Eigen::half) + +#define TF_CALL_float8_e5m2(m) m(::tensorflow::float8_e5m2) +#define TF_CALL_float8_e4m3fn(m) m(::tensorflow::float8_e4m3fn) + +#define TF_CALL_int4(m) m(::tensorflow::int4) +#define TF_CALL_uint4(m) m(::tensorflow::uint4) + +#elif defined(__ANDROID_TYPES_FULL__) + +// Only string, half, float, int32, int64, bool, and quantized types +// supported. +#define TF_CALL_float(m) m(float) +#define TF_CALL_double(m) +#define TF_CALL_int32(m) m(::tensorflow::int32) +#define TF_CALL_uint32(m) +#define TF_CALL_uint8(m) +#define TF_CALL_int16(m) + +#define TF_CALL_int8(m) +#define TF_CALL_string(m) m(::tensorflow::tstring) +#define TF_CALL_tstring(m) m(::tensorflow::tstring) +#define TF_CALL_resource(m) +#define TF_CALL_variant(m) +#define TF_CALL_complex64(m) +#define TF_CALL_int64(m) m(::int64_t) +#define TF_CALL_uint64(m) +#define TF_CALL_bool(m) m(bool) + +#define TF_CALL_qint8(m) m(::tensorflow::qint8) +#define TF_CALL_quint8(m) m(::tensorflow::quint8) +#define TF_CALL_qint32(m) m(::tensorflow::qint32) +#define TF_CALL_bfloat16(m) +#define TF_CALL_qint16(m) m(::tensorflow::qint16) + +#define TF_CALL_quint16(m) m(::tensorflow::quint16) +#define TF_CALL_uint16(m) +#define TF_CALL_complex128(m) +#define TF_CALL_half(m) m(Eigen::half) + +#define TF_CALL_float8_e5m2(m) +#define TF_CALL_float8_e4m3fn(m) + +#define TF_CALL_int4(m) +#define TF_CALL_uint4(m) + +#else // defined(IS_MOBILE_PLATFORM) && !defined(__ANDROID_TYPES_FULL__) + +// Only float, int32, and bool are supported. 
+#define TF_CALL_float(m) m(float) +#define TF_CALL_double(m) +#define TF_CALL_int32(m) m(::tensorflow::int32) +#define TF_CALL_uint32(m) +#define TF_CALL_uint8(m) +#define TF_CALL_int16(m) + +#define TF_CALL_int8(m) +#define TF_CALL_string(m) +#define TF_CALL_tstring(m) +#define TF_CALL_resource(m) +#define TF_CALL_variant(m) +#define TF_CALL_complex64(m) +#define TF_CALL_int64(m) +#define TF_CALL_uint64(m) +#define TF_CALL_bool(m) m(bool) + +#define TF_CALL_qint8(m) +#define TF_CALL_quint8(m) +#define TF_CALL_qint32(m) +#define TF_CALL_bfloat16(m) +#define TF_CALL_qint16(m) + +#define TF_CALL_quint16(m) +#define TF_CALL_uint16(m) +#define TF_CALL_complex128(m) +#define TF_CALL_half(m) + +#define TF_CALL_float8_e5m2(m) +#define TF_CALL_float8_e4m3fn(m) + +#define TF_CALL_int4(m) +#define TF_CALL_uint4(m) + +#endif // defined(IS_MOBILE_PLATFORM) - end of TF_CALL_type defines + +// Defines for sets of types. +#define TF_CALL_INTEGRAL_TYPES_NO_INT32(m) \ + TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_uint16(m) \ + TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m) + +#define TF_CALL_INTEGRAL_TYPES(m) \ + TF_CALL_INTEGRAL_TYPES_NO_INT32(m) TF_CALL_int32(m) + +#define TF_CALL_FLOAT_TYPES(m) \ + TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) + +#define TF_CALL_REAL_NUMBER_TYPES(m) \ + TF_CALL_INTEGRAL_TYPES(m) TF_CALL_FLOAT_TYPES(m) + +#define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \ + TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) + +#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ + TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) \ + TF_CALL_INTEGRAL_TYPES_NO_INT32(m) + +#define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m) + +// Call "m" for all number types, including complex types +#define TF_CALL_NUMBER_TYPES(m) \ + TF_CALL_REAL_NUMBER_TYPES(m) TF_CALL_COMPLEX_TYPES(m) + +#define TF_CALL_NUMBER_TYPES_NO_INT32(m) \ + TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) TF_CALL_COMPLEX_TYPES(m) + +#define TF_CALL_POD_TYPES(m) TF_CALL_NUMBER_TYPES(m) TF_CALL_bool(m) + +// Call "m" on all types. +#define TF_CALL_ALL_TYPES(m) \ + TF_CALL_POD_TYPES(m) TF_CALL_tstring(m) TF_CALL_resource(m) TF_CALL_variant(m) + +// Call "m" on POD and string types. +#define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_tstring(m) + +// Call "m" on all number types supported on GPU. +#define TF_CALL_GPU_NUMBER_TYPES(m) \ + TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) + +// Call "m" on all types supported on GPU. +#define TF_CALL_GPU_ALL_TYPES(m) \ + TF_CALL_GPU_NUMBER_TYPES(m) TF_CALL_COMPLEX_TYPES(m) TF_CALL_bool(m) + +#define TF_CALL_GPU_NUMBER_TYPES_NO_HALF(m) TF_CALL_float(m) TF_CALL_double(m) + +// Call "m" on all quantized types. +// TODO(cwhipkey): include TF_CALL_qint16(m) TF_CALL_quint16(m) +#define TF_CALL_QUANTIZED_TYPES(m) \ + TF_CALL_qint8(m) TF_CALL_quint8(m) TF_CALL_qint32(m) + +// Types used for save and restore ops. 
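Beyond the REGISTER_PARTITION example in the header comment, these aggregate lists are also commonly used to stamp out explicit template instantiations; MyFunctor here is a hypothetical functor template (sketch):

#define INSTANTIATE_GPU(T) template struct MyFunctor<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_GPU)
TF_CALL_COMPLEX_TYPES(INSTANTIATE_GPU)
#undef INSTANTIATE_GPU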
+#define TF_CALL_SAVE_RESTORE_TYPES(m) \ + TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \ + TF_CALL_COMPLEX_TYPES(m) \ + TF_CALL_QUANTIZED_TYPES(m) TF_CALL_bool(m) TF_CALL_tstring(m) + +#endif // TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/register_types_traits.h b/third_party/tflite-hdrs/tensorflow/core/framework/register_types_traits.h new file mode 100644 index 00000000..b2847d84 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/register_types_traits.h @@ -0,0 +1,93 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ +// This file is used by cuda code and must remain compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Remap POD types by size to equivalent proxy types. This works +// since all we are doing is copying data around. +struct UnusableProxyType; +template +struct proxy_type_pod { + typedef UnusableProxyType type; +}; +template <> +struct proxy_type_pod { + typedef ::tensorflow::complex128 type; +}; +template <> +struct proxy_type_pod { + typedef ::int64_t type; +}; +template <> +struct proxy_type_pod { + typedef ::tensorflow::int32 type; +}; +template <> +struct proxy_type_pod { + typedef ::tensorflow::int16 type; +}; +template <> +struct proxy_type_pod { + typedef ::tensorflow::int8 type; +}; +template <> +struct proxy_type_pod { + typedef double type; +}; +template <> +struct proxy_type_pod { + typedef float type; +}; +template <> +struct proxy_type_pod { + typedef Eigen::half type; +}; +template <> +struct proxy_type_pod { + typedef ::tensorflow::int8 type; +}; + + +/// If POD we use proxy_type_pod, otherwise this maps to identity. 
+template +struct proxy_type { + typedef typename std::conditional< + std::is_arithmetic::value, + typename proxy_type_pod::type, T>::type type; + static_assert(sizeof(type) == sizeof(T), "proxy_type_pod is not valid"); +}; + +/// The active proxy types +#define TF_CALL_CPU_PROXY_TYPES(m) \ + TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \ + TF_CALL_int8(m) TF_CALL_complex128(m) +#define TF_CALL_GPU_PROXY_TYPES(m) \ + TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_bfloat16(m) \ + TF_CALL_int32(m) TF_CALL_int8(m) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/registration/registration.h b/third_party/tflite-hdrs/tensorflow/core/framework/registration/registration.h new file mode 100644 index 00000000..27f2ec2b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/registration/registration.h @@ -0,0 +1,152 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file provides some common support for 'registration' of e.g. ops and +// kernels. In particular, it relates to the REGISTER_OP (op registration) and +// REGISTER_KERNEL_BUILDER (kernel registration) macros. +// +// Note that there are two sides to 'registration': +// - Definition (compile-time): making op and kernel definitions _available_. +// - Usage (run-time): adding particular (available) definitions of ops and +// kernels to the global OpRegistry / KernelRegistry, to be found when +// constructing and executing graphs. +// +// Currently, definition and usage happen to be coupled together: all +// 'available' definitions (from the REGISTER_*' macros) are added to the global +// registries on startup / library load. + +#ifndef TENSORFLOW_CORE_FRAMEWORK_REGISTRATION_REGISTRATION_H_ +#define TENSORFLOW_CORE_FRAMEWORK_REGISTRATION_REGISTRATION_H_ + +#include + +#include +#include + +#include "tensorflow/core/framework/registration/options.h" + +#if !TF_OPTION_REGISTRATION_V2() + +#ifdef SELECTIVE_REGISTRATION + +// Experimental selective registration support to reduce binary size. +// +// To use selective registration, when building: +// 1. define SELECTIVE_REGISTRATION, e.g. in gcc by passing +// -DSELECTIVE_REGISTRATION to compilation. +// 2. Provide ops_to_register.h. This file is not included in the repo and must +// be placed by the user or a tool where the compiler can find it. It must +// define the constants and functions used in the macros below. The +// functions should be defined as valid constexpr functions, so that they are +// evaluated at compile time: this is needed to make symbols referenced by +// un-registered objects unused, and therefore allow the linker to strip them +// out. See python/tools/print_selective_registration_header.py for a tool +// that can be used to generate ops_to_register.h. 
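As a rough illustration of the macros that ops_to_register.h must provide (they are listed just below), a hand-rolled version keeping only two ops might look like the following; the constexpr string helper and the op names are purely illustrative, not the format emitted by the generator tool:

constexpr bool OpNameIs(const char* a, const char* b) {
  return (*a == *b) && (*a == '\0' || OpNameIs(a + 1, b + 1));
}
#define SHOULD_REGISTER_OP(op) (OpNameIs(op, "NoOp") || OpNameIs(op, "Identity"))
#define SHOULD_REGISTER_OP_GRADIENT false
#define SHOULD_REGISTER_OP_KERNEL(clz) true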
+// +// ops_to_register.h should define macros for: +// // Ops for which this is false will not be registered. +// SHOULD_REGISTER_OP(op) +// // If this is false, then no gradient ops are registered. +// SHOULD_REGISTER_OP_GRADIENT +// // Op kernel classes where this is false won't be registered. +// SHOULD_REGISTER_OP_KERNEL(clz) +// The macros should be defined using constexprs. + +#include "ops_to_register.h" + +#if (!defined(SHOULD_REGISTER_OP) || !defined(SHOULD_REGISTER_OP_GRADIENT) || \ + !defined(SHOULD_REGISTER_OP_KERNEL)) +static_assert(false, "ops_to_register.h must define SHOULD_REGISTER macros"); +#endif +#else // SELECTIVE_REGISTRATION +#define SHOULD_REGISTER_OP(op) true +#define SHOULD_REGISTER_OP_GRADIENT true +#define SHOULD_REGISTER_OP_KERNEL(clz) true +#endif // SELECTIVE_REGISTRATION + +#else // ! TF_OPTION_REGISTRATION_V2() + +#ifdef SELECTIVE_REGISTRATION +#error TF_OPTION_REGISTRATION_V2(): Compile-time selective registration is not supported +#endif + +#endif // ! TF_OPTION_REGISTRATION_V2() + +namespace tensorflow { + +// An InitOnStartupMarker is 'initialized' on program startup, purely for the +// side-effects of that initialization - the struct itself is empty. (The type +// is expected to be used to define globals.) +// +// The '<<' operator should be used in initializer expressions to specify what +// to run on startup. The following values are accepted: +// - An InitOnStartupMarker. Example: +// InitOnStartupMarker F(); +// InitOnStartupMarker const kInitF = +// InitOnStartupMarker{} << F(); +// - Something to call, which returns an InitOnStartupMarker. Example: +// InitOnStartupMarker const kInit = +// InitOnStartupMarker{} << []() { G(); return +// +// See also: TF_INIT_ON_STARTUP_IF +struct InitOnStartupMarker { + constexpr InitOnStartupMarker operator<<(InitOnStartupMarker) const { + return *this; + } + + template + constexpr InitOnStartupMarker operator<<(T&& v) const { + return std::forward(v)(); + } +}; + +// Conditional initializer expressions for InitOnStartupMarker: +// TF_INIT_ON_STARTUP_IF(cond) << f +// If 'cond' is true, 'f' is evaluated (and called, if applicable) on startup. +// Otherwise, 'f' is *not evaluated*. Note that 'cond' is required to be a +// constant-expression, and so this approximates #ifdef. +// +// The implementation uses the ?: operator (!cond prevents evaluation of 'f'). +// The relative precedence of ?: and << is significant; this effectively expands +// to (see extra parens): +// !cond ? InitOnStartupMarker{} : (InitOnStartupMarker{} << f) +// +// Note that although forcing 'cond' to be a constant-expression should not +// affect binary size (i.e. the same optimizations should apply if it 'happens' +// to be one), it was found to be necessary (for a recent version of clang; +// perhaps an optimizer bug). +// +// The parens are necessary to hide the ',' from the preprocessor; it could +// otherwise act as a macro argument separator. +#define TF_INIT_ON_STARTUP_IF(cond) \ + (::std::integral_constant::value) \ + ? ::tensorflow::InitOnStartupMarker{} \ + : ::tensorflow::InitOnStartupMarker {} + +// Wrapper for generating unique IDs (for 'anonymous' InitOnStartup definitions) +// using __COUNTER__. The new ID (__COUNTER__ already expanded) is provided as a +// macro argument. +// +// Usage: +// #define M_IMPL(id, a, b) ... +// #define M(a, b) TF_NEW_ID_FOR_INIT(M_IMPL, a, b) +#define TF_NEW_ID_FOR_INIT_2(m, c, ...) m(c, __VA_ARGS__) +#define TF_NEW_ID_FOR_INIT_1(m, c, ...) 
TF_NEW_ID_FOR_INIT_2(m, c, __VA_ARGS__) +#define TF_NEW_ID_FOR_INIT(m, ...) \ + TF_NEW_ID_FOR_INIT_1(m, __COUNTER__, __VA_ARGS__) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_REGISTRATION_REGISTRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/rendezvous.h b/third_party/tflite-hdrs/tensorflow/core/framework/rendezvous.h new file mode 100644 index 00000000..97a5daff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/rendezvous.h @@ -0,0 +1,177 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RENDEZVOUS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RENDEZVOUS_H_ + +#include +#include + +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +class DeviceMgr; + +// A Rendezvous is an abstraction for passing tensors from producers +// to consumers. A rendezvous is a table of channels. Each channel is +// keyed by a rendezvous key. The key encodes a pair of , where the producer and the consumer are tensorflow +// devices. +// +// The producer calls the Send() method to send one tensor over one +// named channel. The consumer calls the Recv() method to receive one +// tensor from a named channel. A sequence of tensors can be passed +// from the producer to the consumer. The consumer receives them in +// the order as the producer sends them. +// +// A consumer may safely request the tensor before or after it has +// been produced. A consumer has the choice of making a blocking call +// or providing a callback: in either case, the consumer receives the +// Tensor as soon as it is available. A producer never blocks. +class RendezvousInterface { + public: + struct Args { + DeviceContext* device_context = nullptr; + AllocatorAttributes alloc_attrs; + CancellationManager* cancellation_manager = nullptr; // not owned. + }; + + // Parses the key constructed by CreateKey and parse src/dst device + // names into structures respectively. + struct ParsedKey { + absl::string_view src_device; + DeviceNameUtils::ParsedName src; + uint64 src_incarnation = 0; + absl::string_view dst_device; + DeviceNameUtils::ParsedName dst; + absl::string_view edge_name; + + ParsedKey() {} + ParsedKey(const ParsedKey& b) { *this = b; } + + ParsedKey& operator=(const ParsedKey& b); + absl::string_view FullKey() const { return buf_; } + + private: + friend class Rendezvous; + friend class SendOp; + friend class RecvOp; + std::string buf_; + }; + + // The caller is a tensor producer and it sends a message (a tensor + // "val" and a bool "is_dead") under the given "key". 
+ // + // {val, is_dead} is bundled as a message sent and received. + // Typically, is_dead is set by some control flow nodes + // (e.g., a not-taken branch). args is passed by Send to the + // Recv function to communicate any information that the Recv + // function might need. This is typically only necessary for + // Send/Recv on the same worker. + // + // Send() never blocks. + virtual absl::Status Send(const ParsedKey& key, const Args& args, + const Tensor& val, const bool is_dead) = 0; + + // Callback provided by a tensor consumer waiting on the rendezvous. + // It will be invoked when the tensor is available, or when a non-OK + // status arises in the production of that tensor. It also gets + // two Rendezvous::Args, one provided by the sender, the other by the + // receiver, which may be needed when a non-CPU device is in use + // by either side. + typedef std::function + DoneCallback; + + virtual void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) = 0; + + // Synchronous wrapper for RecvAsync. + absl::Status Recv(const ParsedKey& key, const Args& args, Tensor* val, + bool* is_dead, int64_t timeout_ms); + absl::Status Recv(const ParsedKey& key, const Args& args, Tensor* val, + bool* is_dead); + + // Aborts all pending and future Send/Recv with the given "status". + // + // StartAbort() does not wait for ongoing calls to finish. + // REQUIRES: !status.ok() + virtual void StartAbort(const absl::Status& status) = 0; + + virtual ~RendezvousInterface(); + + protected: + virtual bool is_cross_process() { return false; } + friend class ProcessFunctionLibraryRuntime; +}; + +// A reference-counted implementation of RendezvousInterface. +// +// This class is used in cases where a rendezvous may be shared between multiple +// threads with no clear owner. +class Rendezvous : public RendezvousInterface, public core::WeakRefCounted { + public: + class Factory { + public: + // Default to a factory that evaluates to false. + Factory() : valid_(false) {} + + explicit Factory( + std::function*)> + create_fn) + : valid_(true), create_fn_(std::move(create_fn)) {} + + explicit operator bool() const { return valid_; } + + absl::Status operator()(const int64_t step_id, const DeviceMgr* device_mgr, + tsl::core::RefCountPtr* rendez) const { + return create_fn_(step_id, device_mgr, rendez); + } + + private: + bool valid_; + std::function*)> + create_fn_; + }; + + // Constructs a rendezvous key for the tensor of "name" sent from + // "src_device" to "dst_device". The tensor is generated in the frame + // and iteration specified by "frame_iter". + static std::string CreateKey(const std::string& src_device, + uint64 src_incarnation, + const std::string& dst_device, + const std::string& name, + const FrameAndIter& frame_iter); + + static absl::Status ParseKey(absl::string_view key, ParsedKey* out); +}; + +// Returns a Rendezvous instance that is limited to use only by +// producers and consumers in the local process. The caller assumes +// ownership of one Ref() on the returned object. +Rendezvous* NewLocalRendezvous(int num_shards = 1); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RENDEZVOUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/resource_base.h b/third_party/tflite-hdrs/tensorflow/core/framework/resource_base.h new file mode 100644 index 00000000..c22adb55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/resource_base.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
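To give the key/Send/Recv flow of the rendezvous above a concrete shape, a purely local round trip might look like this; the device names, edge name, and scalar value are illustrative:

void LocalRendezvousRoundTrip() {
  Rendezvous* rendez = NewLocalRendezvous();

  const std::string key = Rendezvous::CreateKey(
      "/job:localhost/replica:0/task:0/device:CPU:0", /*src_incarnation=*/1,
      "/job:localhost/replica:0/task:0/device:CPU:0", "edge_0",
      FrameAndIter(0, 0));
  Rendezvous::ParsedKey parsed;
  TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));

  Tensor val(DT_FLOAT, TensorShape({}));
  val.scalar<float>()() = 42.0f;
  TF_CHECK_OK(rendez->Send(parsed, Rendezvous::Args(), val, /*is_dead=*/false));

  Tensor out;
  bool is_dead = false;
  TF_CHECK_OK(rendez->Recv(parsed, Rendezvous::Args(), &out, &is_dead));

  rendez->Unref();
}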
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_BASE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_BASE_H_ + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +// Forward declaration to avoid introducing a dependency on headers in +// "tensorflow/core/graph/...". +class GraphDefBuilder; +class Node; + +// This is the base class of all resource classes. Each resource must be +// represented as a sub-class of ResourceBase (which is reference counted) to be +// able to work with resource facilities such ResourceHandle and ResourceMgr. +class ResourceBase : public core::WeakRefCounted { + public: + // Returns a debug string for *this. + virtual std::string DebugString() const = 0; + + // Returns a name for ref-counting handles. + virtual std::string MakeRefCountingHandleName(int64_t resource_id) const { + return absl::StrFormat("Resource-%d-at-%p", resource_id, this); + } + + // Returns memory used by this resource. + virtual int64_t MemoryUsed() const { return 0; } + + // Writes a representation of this resource into `builder`, so that executing + // `*out` will recreate this resource. The lifetime of the created resource + // should not be tied to the graph that created it, since the graph may be + // destroyed before the resource is used. To avoid this lifetime issue, you + // can usually set a unique `shared_name` attribute for the resource. + virtual absl::Status AsGraphDef(GraphDefBuilder* builder, Node** out) const { + return errors::Unimplemented("AsGraphDef not implemented for resource ", + DebugString()); + } +}; +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/resource_handle.h b/third_party/tflite-hdrs/tensorflow/core/framework/resource_handle.h new file mode 100644 index 00000000..393a8998 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/resource_handle.h @@ -0,0 +1,206 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_HANDLE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_HANDLE_H_ + +#include +#include + +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/intrusive_ptr.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/tensor_coding.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/managed_stack_trace.h" + +namespace tensorflow { + +class ResourceHandleProto; + +// Class representing a handle to a tensorflow resource. Handles are +// not valid across executions, but can be serialized back and forth from within +// a single run (except for those created from MakeRefCountingHandle i.e. whose +// resource_ field is not empty). +// +// This is the native C++ class equivalent of ResourceHandleProto. They are +// separate so that kernels do not need to depend on protos. +class ResourceHandle { + public: + ResourceHandle(); + ResourceHandle(const ResourceHandleProto& proto); + ~ResourceHandle(); + + // Use this factory method if the `proto` comes from user controlled input, to + // prevent a denial of service. + static absl::Status BuildResourceHandle(const ResourceHandleProto& proto, + ResourceHandle* out); + + // Unique name for the device containing the resource. + const std::string& device() const { return device_; } + + void set_device(const std::string& device) { device_ = device; } + + // Container in which this resource is placed. + const std::string& container() const { return container_; } + void set_container(const std::string& container) { container_ = container; } + + // Unique name of this resource. + const std::string& name() const { return name_; } + void set_name(const std::string& name) { name_ = name; } + + // Hash code for the type of the resource. Is only valid in the same device + // and in the same execution. + uint64 hash_code() const { return hash_code_; } + void set_hash_code(uint64 hash_code) { hash_code_ = hash_code; } + + // For debug-only, the name of the type pointed to by this handle, if + // available. + const std::string& maybe_type_name() const { return maybe_type_name_; } + void set_maybe_type_name(const std::string& value) { + maybe_type_name_ = value; + } + + // Data types and shapes for the underlying resource. + std::vector dtypes_and_shapes() const { + return dtypes_and_shapes_; + } + void set_dtypes_and_shapes( + const std::vector& dtypes_and_shapes) { + dtypes_and_shapes_ = dtypes_and_shapes; + } + + void set_definition_stack_trace( + const absl::optional& definition_stack_trace) { + definition_stack_trace_ = definition_stack_trace; + } + + const absl::optional& definition_stack_trace() const { + return definition_stack_trace_; + } + + // Conversion to and from ResourceHandleProto + void AsProto(ResourceHandleProto* proto) const; + absl::Status FromProto(const ResourceHandleProto& proto); + + // Serialization via ResourceHandleProto + std::string SerializeAsString() const; + bool ParseFromString(const std::string& s); + + std::string DebugString() const; + + std::string SummarizeValue() const; + + // GUID for anonymous resources. 
Resources with this shared_name will have + // their shared_name replaced with a GUID at creation time + static constexpr const char* ANONYMOUS_NAME = + "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; + + // Creates a `ResourceHandle` that holds a pointer to a resource and takes + // ownership of it. Normally a `ResourceHandle` only contains the name (and + // some other metadata) of the resource. When created via this function, + // the handle will own the resource, in the sense that it will destroy the + // resource automatically when the resource is no longer needed. It does this + // via automatic ref-counting on the resource: when the handle is copied, it + // will call `Ref` on the resource (remember that all resources inherit from + // `ResourceBase` which inherits from `RefCounted`), and when the handle is + // destroyed, it will call `Unref` on the resource. When the last handle goes + // out of scope, the resource's ref-count will go down to zero and the + // resource will be destroyed. When calling this function, the `resource` + // argument should have a ref-count of one (which is the case when the + // resource is newly created). + // + // For those familiar with `ResourceMgr`, when you create a handle by the + // `MakeResourceHandle` function in resource_mgr.h, the handle doesn't hold a + // strong reference to the resource, and the resource is owned by the + // resource manager whose strong reference must be manually deleted by + // calling `ResourceMgr::Delete`. In contrast, a handle created by this + // function holds a strong reference to the resource. The resource manager + // does not hold a strong reference to the resource. + template + static ResourceHandle MakeRefCountingHandle( + T* resource, const string& device_name, + const std::vector& dtypes_and_shapes = {}, + const absl::optional& definition_stack_trace = {}) { + return MakeRefCountingHandle(resource, device_name, TypeIndex::Make(), + dtypes_and_shapes, definition_stack_trace); + } + + static ResourceHandle MakeRefCountingHandle( + ResourceBase* resource, const string& device_name, + const TypeIndex& type_index, + const std::vector& dtypes_and_shapes = {}, + const absl::optional& definition_stack_trace = {}); + + // Pointer to the resource. + const core::IntrusivePtr& resource() const { return resource_; } + + // Gets the resource pointer in `handle` as `T*`, or an error if the actual + // resource type is not `T`. + template + StatusOr GetResource() const { + TF_RETURN_IF_ERROR(ValidateType()); + return down_cast(resource_.get()); + } + + // Returns True if the resource handle is ref-counting. + // See MakeRefCountingHandle. + bool IsRefCounting() const { return resource_.get() != nullptr; } + + // Validates that the resource type in `handle` is `T`. + template + absl::Status ValidateType() const { + return ValidateType(TypeIndex::Make()); + } + + absl::Status ValidateType(const TypeIndex& type_index) const; + + // Generates unique IDs (e.g. for names of anonymous variables) + static int64_t GenerateUniqueId(); + + private: + std::string device_; + std::string container_; + std::string name_; + uint64 hash_code_ = 0; + std::string maybe_type_name_; + std::vector dtypes_and_shapes_; + std::optional definition_stack_trace_; + // A smart pointer to the actual resource. When this field is not empty, the + // handle is in a "ref-counting" mode, owning the resource; otherwise it's in + // a "weak-ref" mode, only containing the name of the resource (conceptually a + // weak reference). 
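A sketch of the ref-counting mode described above, with a hypothetical resource type and an illustrative device name:

class MyCounter : public ResourceBase {
 public:
  std::string DebugString() const override { return "MyCounter"; }
  int64_t value = 0;
};

void RefCountingHandleExample() {
  MyCounter* counter = new MyCounter;  // ref-count is 1 (newly created)
  ResourceHandle handle = ResourceHandle::MakeRefCountingHandle(
      counter, "/job:localhost/replica:0/task:0/device:CPU:0");
  // The handle now owns the resource; copies Ref()/Unref() it automatically.
  auto got = handle.GetResource<MyCounter>();  // StatusOr<MyCounter*>
  if (got.ok()) {
    (*got)->value += 1;
  }
}  // last handle destroyed -> resource Unref'd and deleted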
+ core::IntrusivePtr resource_; + static std::atomic current_id_; +}; + +// For backwards compatibility for when this was a proto +std::string ProtoDebugString(const ResourceHandle& handle); + +// Encodes a list of ResourceHandle protos in the given StringListEncoder. +void EncodeResourceHandleList(const ResourceHandle* p, int64_t n, + std::unique_ptr e); + +// Decodes a list of ResourceHandle protos from the given StringListDecoder. +bool DecodeResourceHandleList(std::unique_ptr d, + ResourceHandle* ps, int64_t n); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/resource_mgr.h b/third_party/tflite-hdrs/tensorflow/core/framework/resource_mgr.h new file mode 100644 index 00000000..74e26b43 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/resource_mgr.h @@ -0,0 +1,1042 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/types/variant.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// A ResourceMgr instance keeps track of named and typed resources +// grouped into containers. +// +// Each named resource is +// registered with ResourceMgr under a named "container" name. At any +// time, there is at most one instance of a resource given the container +// name, the resource type and the resource name. +// +// All resources for a given container can be dropped by one call of +// Cleanup(). +// +// E.g., +// struct MyVar : public ResourceBase { +// mutex mu; +// Tensor val; +// } +// +// ResourceMgr rm; +// +// // Create a var. +// MyVar* my_var = new MyVar; +// my_var->val = Tensor(DT_FLOAT, my_shape); +// my_var->val.flat().setZeros(); // 0 initialized. +// ctx->SetStatus(rm.Create("my_container", "my_name", my_var)); +// +// // += a variable. 
+// MyVar* my_var = nullptr; +// Status s = rm.Lookup("my_container", "my_name", &my_var); +// if (s.ok()) { +// my_var->val.flat() += grad; +// } +// my_var->Unref(); // Or use ScopedUnref(). +// ctx->SetStatus(s); + +// Container used for per-step resources. +class ScopedStepContainer { + public: + // step_id: the unique ID of this step. Doesn't have to be sequential, just + // has to be unique. + // cleanup: callback to delete a container of this name. + // prefix: optional string prefix to disambiguate step containers. + ScopedStepContainer(const int64_t step_id, + std::function cleanup) + : step_id_(step_id), + container_(strings::StrCat("__per_step_", step_id)), + cleanup_(cleanup), + dirty_(false) {} + + ScopedStepContainer(const int64_t step_id, + std::function cleanup, + const std::string& prefix) + : step_id_(step_id), + container_(strings::StrCat("__", prefix, "_per_step_", step_id)), + cleanup_(cleanup), + dirty_(false) {} + + ~ScopedStepContainer() { CleanUp(); } + + void CleanUp() TF_NO_THREAD_SAFETY_ANALYSIS { + // NOTE(mrry): Avoid acquiring the mutex in the case that the container is + // clean. + if (dirty_) { + mutex_lock ml(mu_); + cleanup_(container_); + dirty_ = false; + } + } + + // Pass through functions for resource lookup and creation. We do this to + // ensure that we can appropriately set the dirty_ bit in the + // ScopedStepContainer if the name of the container is used to create + // resources. + + // Pass through to MakeResourceHandle with the container name + template + ResourceHandle MakeResourceHandle( + const std::string& name, const DeviceBase& device) TF_MUST_USE_RESULT; + // Pass through to ResourceMgr::Create with the container name + template + absl::Status Create(ResourceMgr* rm, const std::string& name, T* resource); + // Pass through to ResourceMgr::Delete with the container name + template + absl::Status Delete(ResourceMgr* rm, const std::string& name); + // Pass through to ResourceMgr::Lookup with the container name + template + absl::Status Lookup(ResourceMgr* rm, const std::string& name, + T** resource) const; + // Pass through to ResourceMgr::LookupOrCreate with the container name + template + absl::Status LookupOrCreate(ResourceMgr* rm, const std::string& name, + T** resource, + std::function creator); + int64_t StepId() const { return step_id_; } + + private: + const int64_t step_id_; + const std::string container_; + const std::function cleanup_; + mutex mu_; + mutable std::atomic dirty_ TF_GUARDED_BY(mu_); +}; + +class ResourceMgr { + public: + ResourceMgr(); + explicit ResourceMgr(const std::string& default_container); + ~ResourceMgr(); + + // Returns the default container name for *this. + const std::string& default_container() const { return default_container_; } + + // Creates a resource "name" in the "container". The caller transfers + // the ownership of one ref on "resource" to *this, regardless of whether this + // operation succeeds or fails. + // + // REQUIRES: std::is_base_of + // REQUIRES: resource != nullptr. + template + absl::Status Create(const std::string& container, const std::string& name, + T* resource); + + // Creates a unowned resource "name" in the "container". The caller does NOT + // transfer the ownership of any ref on "resource" to *this, regardless of + // whether this operation succeeds or fails. + // + // After the resource is destroyed, lookups from the manager fail. + // The caller must call this->Delete() on the name to free up the memory + // entry of the name. 
+ // + // REQUIRES: std::is_base_of + // REQUIRES: resource != nullptr. + template + absl::Status CreateUnowned(const std::string& container, + const std::string& name, T* resource); + + // If "container" has a resource "name", returns it in "*resource" and + // the caller takes the ownership of one ref on "*resource". + // + // REQUIRES: std::is_base_of + // REQUIRES: resource != nullptr + template + absl::Status Lookup(const std::string& container, const std::string& name, + T** resource) const; + + // If the resource manager has a resource matching "handle", returns it in + // "*resource" and the caller takes the ownership of one ref on "*resource". + // + // REQUIRES: resource != nullptr + absl::Status Lookup(const ResourceHandle& handle, + ResourceBase** resource) const; + + // Similar to Lookup, but looks up multiple resources at once, with only a + // single lock acquisition. If containers_and_names[i] is uninitialized + // then this function does not modify resources[i]. + template + absl::Status LookupMany( + absl::Span const> + containers_and_names, + std::vector>* resources) const; + + // If "container" has a resource "name", returns it in + // "*resource". Otherwise, invokes creator() to create the resource. + // The caller takes the ownership of one ref on "*resource". + // + // WARNING: creator() must not call any methods on ResourceMgr during its + // execution, because a non-reentrant lock is held during the creator() call + // in order to guarantee atomicity of LookupOrCreate(). + // + // REQUIRES: std::is_base_of + // REQUIRES: resource != nullptr + template + absl::Status LookupOrCreate(const std::string& container, + const std::string& name, T** resource, + std::function creator); + + // Deletes the resource "name" from the "container". + // + // REQUIRES: std::is_base_of + template + absl::Status Delete(const std::string& container, const std::string& name); + + // Deletes the resource pointed by "handle". + absl::Status Delete(const ResourceHandle& handle); + + // Deletes all resources from the "container" and removes the container. + absl::Status Cleanup(const std::string& container); + + // Deletes all resources in all containers. + void Clear(); + + // Returns a text description for all resources. + std::string DebugString() const; + + private: + typedef std::pair Key; + struct KeyHash { + std::size_t operator()(const Key& k) const { + return Hash64(k.second.data(), k.second.size(), k.first); + } + }; + struct KeyEqual { + bool operator()(const Key& x, const Key& y) const { + return (x.second == y.second) && (x.first == y.first); + } + }; + struct ResourceAndName { + std::variant, core::WeakPtr> + resource; + std::unique_ptr name; + + ResourceAndName(); + explicit ResourceAndName(const string& name); + ResourceAndName(ResourceAndName&& other) noexcept; + ~ResourceAndName(); + + ResourceAndName& operator=(ResourceAndName&&) noexcept; + + // Returns a strong reference to resource, or nullptr if the resource is + // no longer valid. 
+ core::RefCountPtr GetResource() const; + + private: + ResourceAndName(const ResourceAndName&) = delete; + void operator=(const ResourceAndName&) = delete; + }; + typedef absl::flat_hash_map + Container; + + const std::string default_container_; + mutable mutex mu_; + absl::flat_hash_map containers_ TF_GUARDED_BY(mu_); + + template + absl::Status LookupInternal(const std::string& container, + const std::string& name, T** resource) const + TF_SHARED_LOCKS_REQUIRED(mu_); + absl::Status LookupInternal(const std::string& container, + uint64 type_hash_code, const std::string& name, + ResourceBase** resource) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + absl::Status DoCreate(const std::string& container, TypeIndex type, + const std::string& name, ResourceBase* resource, + bool owns_resource) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Status DoLookup(const std::string& container, TypeIndex type, + const std::string& name, ResourceBase** resource) const + TF_SHARED_LOCKS_REQUIRED(mu_); + absl::Status DoLookup(const std::string& container, uint64 type_hash_code, + const std::string& type_name, + const std::string& resource_name, + ResourceBase** resource) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + absl::Status DoDelete(const std::string& container, uint64 type_hash_code, + const std::string& resource_name, + const std::string& type_name); + absl::Status DoDelete(const std::string& container, TypeIndex type, + const std::string& resource_name); + + // Pops the ResourceAndName entry. The entry is moved from the list to + // the output argument `resource_and_name`. + absl::Status PopResourceAndName(const std::string& container, + uint64 type_hash_code, + const std::string& resource_name, + const std::string& type_name, + ResourceAndName& resource_and_name); + // Inserts the type name for 'hash_code' into the hash_code to type name map. + absl::Status InsertDebugTypeName(uint64 hash_code, + const std::string& type_name) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns the type name for the 'hash_code'. + // Returns "" if a resource with such a type was never inserted into + // the container. + const char* DebugTypeName(uint64 hash_code) const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Map from type hash_code to type name. + std::unordered_map debug_type_names_ TF_GUARDED_BY(mu_); + + ResourceMgr(const ResourceMgr&) = delete; + void operator=(const ResourceMgr&) = delete; +}; + +// Makes a resource handle with the specified type for a given container / +// name. +ResourceHandle MakeResourceHandle( + const std::string& container, const std::string& name, + const DeviceBase& device, const TypeIndex& type_index, + const std::vector& dtypes_and_shapes = {}, + const absl::optional& definition_stack_trace = {}) + TF_MUST_USE_RESULT; + +template +ResourceHandle MakeResourceHandle( + OpKernelContext* ctx, const std::string& container, const std::string& name, + const std::vector& dtypes_and_shapes = {}, + const absl::optional& definition_stack_trace = {}) { + return MakeResourceHandle(container.empty() + ? ctx->resource_manager()->default_container() + : container, + name, *ctx->device(), TypeIndex::Make(), + dtypes_and_shapes, definition_stack_trace); +} + +template +ResourceHandle MakeResourceHandle( + OpKernelConstruction* ctx, const std::string& container, + const std::string& name, + const std::vector& dtypes_and_shapes = {}, + const absl::optional& definition_stack_trace = {}) { + return MakeResourceHandle(container.empty() + ? 
ctx->resource_manager()->default_container() + : container, + name, *ctx->device(), TypeIndex::Make(), + dtypes_and_shapes, definition_stack_trace); +} + +absl::Status MakeResourceHandleToOutput(OpKernelContext* context, + int output_index, + const std::string& container, + const std::string& name, + const TypeIndex& type_index); + +// Returns a resource handle from a numbered op input. +const ResourceHandle& HandleFromInput(OpKernelContext* ctx, int input); + +// Safely returns a resource handle from a numbered op input. +// Prevents segfault by checking for empty resource handle. +absl::Status HandleFromInput(OpKernelContext* ctx, int input, + ResourceHandle* handle); +// Returns a resource handle by name, as defined in the OpDef. +// Also prevents segfault by checking for empty resource handle. +absl::Status HandleFromInput(OpKernelContext* ctx, absl::string_view input, + ResourceHandle* handle); + +// Create a resource pointed by a given resource handle. +// +// If successful, the caller transfers the ownership of one ref on `resource` to +// `ctx->resource_mgr()`. +template +absl::Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, + T* value); + +// Looks up a resource pointed by a given resource handle. +// +// If the lookup is successful, the caller takes the ownership of one ref on +// `*value`, and must call its `Unref()` method when it has finished using it. +template +absl::Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, + T** value); + +// Looks up a resource pointed by a given resource handle. +// +// Prefer usage of LookupResource taking `core::RefCountPtr` to avoid +// requiring the caller to explicitly call `Unref()`. +template +absl::Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, + core::RefCountPtr* value); + +// Looks up multiple resources pointed by a sequence of resource handles. If +// p[i] is uninitialized then values[i] is unmodified. +template +absl::Status LookupResources(OpKernelContext* ctx, + absl::Span p, + std::vector>* values); + +// Looks up or creates a resource. +// +// If successful, the caller takes the ownership of one ref on `*value`, and +// must call its `Unref()` method when it has finished using it. If the +// `creator` is invoked, its reference on the created resource is transferred +// to `ctx->resource_mgr()`. +// +// Prefer usage of LookupOrCreateResource taking `core::RefCountPtr` to avoid +// requiring the caller to explicitly call `Unref()`. +template +absl::Status LookupOrCreateResource(OpKernelContext* ctx, + const ResourceHandle& p, T** value, + std::function creator); + +// Looks up or creates a resource. +template +absl::Status LookupOrCreateResource(OpKernelContext* ctx, + const ResourceHandle& p, + core::RefCountPtr* value, + std::function creator); + +// Destroys a resource pointed by a given resource handle. +template +absl::Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p); + +// Same as above, but uses the hash code of the type directly. +// The type name information will be missing in the debug output when the +// resource is not present in the container. +absl::Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p); + +// Policy helper to decide which container/shared_name to use for a +// stateful kernel that accesses shared resource. +class ContainerInfo { + public: + // Analyze the node attribute of 'ndef' and decides the container and + // resource name the kernel should use for accessing the shared + // resource. 
+ // + // 'ndef' is expected to have node attribute "container" and + // "shared_name". Returns non-OK if they are not provided or they are + // invalid. + // + // The policy is as following: + // * If the attribute "container" is non-empty, it is used as is. + // Otherwise, uses the resource manager's default container. + // * If the attribute "shared_name" is non-empty, it is used as is. + // Otherwise, if "use_node_name_as_default" is true, the kernel's + // node name is used as the resource name. Otherwise, a string + // unique to this process is used. + absl::Status Init(ResourceMgr* rmgr, const NodeDef& ndef, + bool use_node_name_as_default); + absl::Status Init(ResourceMgr* rmgr, const NodeDef& ndef) { + return Init(rmgr, ndef, false); + } + + // The policy decides that the kernel should access the resource in + // resource_manager(), the resource is in the container() and its + // name is name(). If resource_is_private_to_kernel() is true, the + // kernel should delete the resource when the kernel is deleted. + ResourceMgr* resource_manager() const { return rmgr_; } + const std::string& container() const { return container_; } + const std::string& name() const { return name_; } + bool resource_is_private_to_kernel() const { + return resource_is_private_to_kernel_; + } + + // Returns a readable string for *this. + std::string DebugString() const; + + private: + ResourceMgr* rmgr_ = nullptr; + std::string container_; + std::string name_; + bool resource_is_private_to_kernel_ = false; +}; + +// Helper for kernels to obtain 'resource' from the +// ctx->resource_manager(). +// +// "input_name" specifies the kernel's ref input which gives a string +// tensor with two elements, which specifies the container and +// resource name. +// +// Returns OK if the resource is found and transfers one ref of +// *resource to the caller. Otherwise, returns an error. +template +absl::Status GetResourceFromContext(OpKernelContext* ctx, + const std::string& input_name, + T** resource); + +// Utility op kernel to check if a handle to resource type T is initialized. +template +class IsResourceInitialized : public OpKernel { + public: + explicit IsResourceInitialized(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override; +}; + +// Registers an op which produces just a resource handle to a resource of the +// specified type. The type will be a part of the generated op name. +// TODO(apassos): figure out how to get non-cpu-allocated tensors to work +// through constant folding so this doesn't have to be marked as stateful. +#define REGISTER_RESOURCE_HANDLE_OP(Type) \ + REGISTER_OP(#Type "HandleOp") \ + .Attr("container: string = ''") \ + .Attr("shared_name: string = ''") \ + .Output("resource: resource") \ + .SetIsStateful() \ + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + +// Utility op kernel to produce a handle to a resource of type T. +template +class ResourceHandleOp : public OpKernel { + public: + explicit ResourceHandleOp(OpKernelConstruction* context); + + void Compute(OpKernelContext* ctx) override; + + bool IsExpensive() override { return false; } + + private: + std::string container_; + std::string name_; + mutex mutex_; + Tensor resource_; + std::atomic initialized_{false}; +}; + +// Utility op kernel to produce a handle to a resource of type T. 
+template +class ResourceHandlesOp : public OpKernel { + public: + explicit ResourceHandlesOp(OpKernelConstruction* context); + + void Compute(OpKernelContext* ctx) override; + + bool IsExpensive() override { return false; } + + private: + std::vector containers_; + std::vector names_; + mutex mutex_; + std::vector resources_; + std::atomic initialized_{false}; +}; + +// Registers a kernel for an op which produces a handle to a resource of the +// specified type. +#define REGISTER_RESOURCE_HANDLE_KERNEL(Type) \ + REGISTER_KERNEL_BUILDER(Name(#Type "HandleOp").Device(DEVICE_CPU), \ + ResourceHandleOp) + +// This class is used to guarantee that an anonymous resource is deleted +// (irrespective of whether a resource deleter op is called explicitly or +// the execution encounters an error before the op runs). +// +// This is achieved by wrapping an instance of this class into a variant +// tensor which is passed as an input to a resource deleter op. If the +// execution encounters an error before the op runs, the tensor will be +// destroyed, essentially triggering the iterator deletion. +// NOTE: This is not a feature-complete implementation of the DT_VARIANT +// specification. In particular, we cannot serialize the `ResourceMgr` +// object, so the `Encode()` and `Decode()` methods are not implemented. +class ResourceDeleter { + public: + ResourceDeleter() : deleter_() {} + + ResourceDeleter(ResourceHandle handle, ResourceMgr* resource_manager) + : deleter_(std::make_shared(handle, resource_manager)) {} + + ResourceDeleter(ResourceDeleter&& rhs) : deleter_(std::move(rhs.deleter_)) { + VLOG(3) << "ResourceDeleter move constructor called."; + } + + ResourceDeleter(const ResourceDeleter& rhs) : deleter_(rhs.deleter_) { + VLOG(3) << "ResourceDeleter copy constructor called."; + } + + ResourceDeleter& operator=(const ResourceDeleter& rhs) = delete; + + ResourceDeleter& operator=(ResourceDeleter&& rhs) = default; + + virtual ~ResourceDeleter() { + VLOG(3) << "ResourceDeleter destructor called."; + } + + void Encode(VariantTensorData*) const { + LOG(ERROR) << "The Encode() method is not implemented for ResourceDeleter " + "objects."; + } + + bool Decode(const VariantTensorData&) { + LOG(ERROR) << "The Decode() method is not implemented for ResourceDeleter " + "objects"; + return false; // Not supported. + } + + private: + // Helper that performs reference counting for the parent class and deletes + // the iterator resource when the refcount goes to zero. + // + // NOTE: The object is borrowing a pointer to the resource manager. + // Consequently, the tensor containing this object should not escape the + // function in which was created (so that it is guaranteed that the resource + // manager will outlive it). + struct Helper { + Helper(ResourceHandle handle, ResourceMgr* resource_manager) + : handle(handle), resource_manager(resource_manager) {} + + Helper(const Helper& rhs) = delete; + Helper(Helper&& rhs) = delete; + + ~Helper() { + VLOG(3) << "Deleting Resource: " << handle.DebugString(); + resource_manager->Delete(handle).IgnoreError(); + } + + ResourceHandle handle; + ResourceMgr* resource_manager; // not owned + }; + + std::shared_ptr deleter_; +}; + +// Implementation details below. 
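+
+// A minimal usage sketch of the lookup helpers declared above (illustrative
+// only; `MyVar` is the example ResourceBase subclass from the comment at the
+// top of this header, and `grad` is assumed to be a compatible tensor
+// expression already available in the kernel):
+//
+//   // Inside an OpKernel::Compute() whose input 0 is a DT_RESOURCE scalar:
+//   core::RefCountPtr<MyVar> var;
+//   OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &var));
+//   mutex_lock l(var->mu);
+//   var->val.flat<float>() += grad;
+//   // No explicit Unref() is needed: RefCountPtr releases its reference
+//   // when `var` goes out of scope.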
+ +template +void CheckDeriveFromResourceBase() { + static_assert(std::is_base_of::value, + "T must derive from ResourceBase"); +} + +template +absl::Status ResourceMgr::Create(const std::string& container, + const std::string& name, T* resource) { + CheckDeriveFromResourceBase(); + CHECK(resource != nullptr); + mutex_lock l(mu_); + return DoCreate(container, TypeIndex::Make(), name, resource, + /* owns_resource */ true); +} + +template +absl::Status ResourceMgr::CreateUnowned(const std::string& container, + const std::string& name, T* resource) { + CheckDeriveFromResourceBase(); + mutex_lock l(mu_); + return DoCreate(container, TypeIndex::Make(), name, resource, + /* owns_resource */ false); +} + +template +absl::Status ResourceMgr::Lookup(const std::string& container, + const std::string& name, T** resource) const { + CheckDeriveFromResourceBase(); + tf_shared_lock l(mu_); + return LookupInternal(container, name, resource); +} + +template +absl::Status ResourceMgr::LookupMany( + absl::Span const> + containers_and_names, + std::vector>* resources) const { + CheckDeriveFromResourceBase(); + tf_shared_lock l(mu_); + resources->resize(containers_and_names.size()); + for (size_t i = 0; i < containers_and_names.size(); ++i) { + T* resource; + absl::Status s = LookupInternal( + *containers_and_names[i].first, *containers_and_names[i].second, + &resource); + if (s.ok()) { + (*resources)[i].reset(resource); + } + } + return absl::OkStatus(); +} + +// Simple wrapper to allow conditional dynamic / static casts. +template +struct TypeCastFunctor { + static T* Cast(ResourceBase* r) { return static_cast(r); } +}; + +template +struct TypeCastFunctor { + static T* Cast(ResourceBase* r) { return dynamic_cast(r); } +}; + +template +absl::Status ResourceMgr::LookupInternal(const std::string& container, + const std::string& name, + T** resource) const { + ResourceBase* found = nullptr; + absl::Status s = DoLookup(container, TypeIndex::Make(), name, &found); + if (s.ok()) { + // It's safe to down cast 'found' to T* since + // typeid(T).hash_code() is part of the map key. 
+ *resource = TypeCastFunctor::Cast(found); + } + return s; +} + +template +absl::Status ResourceMgr::LookupOrCreate( + const std::string& container, const std::string& name, T** resource, + std::function creator) { + CheckDeriveFromResourceBase(); + *resource = nullptr; + absl::Status s; + { + tf_shared_lock l(mu_); + s = LookupInternal(container, name, resource); + if (s.ok()) return s; + } + mutex_lock l(mu_); + s = LookupInternal(container, name, resource); + if (s.ok()) return s; + TF_RETURN_IF_ERROR(creator(resource)); + s = DoCreate(container, TypeIndex::Make(), name, *resource, + /* owns_resource */ true); + if (!s.ok()) { + return errors::Internal("LookupOrCreate failed unexpectedly"); + } + (*resource)->Ref(); + return s; +} + +template +absl::Status ResourceMgr::Delete(const std::string& container, + const std::string& name) { + CheckDeriveFromResourceBase(); + return DoDelete(container, TypeIndex::Make(), name); +} + +template +absl::Status GetResourceFromContext(OpKernelContext* ctx, + const std::string& input_name, + T** resource) { + DataType dtype; + TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &dtype)); + if (dtype == DT_RESOURCE) { + const Tensor* handle; + TF_RETURN_IF_ERROR(ctx->input(input_name, &handle)); + return LookupResource(ctx, handle->scalar()(), resource); + } + std::string container; + std::string shared_name; + { + mutex* mu; + TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu)); + mutex_lock l(*mu); + Tensor tensor; + TF_RETURN_IF_ERROR(ctx->mutable_input(input_name, &tensor, true)); + if (tensor.NumElements() != 2) { + return errors::InvalidArgument( + "Resource handle must have 2 elements, but had shape: ", + tensor.shape().DebugString()); + } + container = tensor.flat()(0); + shared_name = tensor.flat()(1); + } + return ctx->resource_manager()->Lookup(container, shared_name, resource); +} + +namespace internal { + +absl::Status ValidateDevice(OpKernelContext* ctx, const ResourceHandle& p); + +template +absl::Status ValidateDeviceAndType(OpKernelContext* ctx, + const ResourceHandle& p) { + TF_RETURN_IF_ERROR(internal::ValidateDevice(ctx, p)); + TF_RETURN_IF_ERROR(p.ValidateType()); + return absl::OkStatus(); +} + +} // namespace internal + +// Creates the resource pointed at by "p". The caller transfers the ownership of +// one ref on "*value" to the resource manager in "ctx", regardless of whether +// this operation succeeds or fails. +template +absl::Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, + T* value) { + TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType(ctx, p)); + return ctx->resource_manager()->Create(p.container(), p.name(), value); +} + +// Finds the resource as "*value" from the handle. If the handle is +// ref-counting, returns the resource owned by the handle. Otherwise, looks up +// the resource matching "p" from resource manager associated with ctx. +// Always returns a new reference to the resource in "*value". The caller shall +// call (*value)->Unref(). +template +absl::Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, + T** value) { + TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType(ctx, p)); + if (p.IsRefCounting()) { + TF_ASSIGN_OR_RETURN(*value, p.GetResource()); + // Transfers out a new reference. + (*value)->Ref(); + return absl::OkStatus(); + } + + return ctx->resource_manager()->Lookup(p.container(), + p.name(), value); +} + +// Finds the resource as "*value" from the handle. This is a type-erased +// variant of LookupResource above. 
+absl::Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, + ResourceBase** value); + +// If the resource manager in "ctx" has a resource matching "p", returns it in +// "*value". +template +absl::Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, + core::RefCountPtr* value) { + T* raw_ptr = nullptr; + TF_RETURN_IF_ERROR(LookupResource(ctx, p, &raw_ptr)); + value->reset(raw_ptr); + + return absl::OkStatus(); +} + +// Similar to Lookup, but looks up multiple resources at once, with only a +// single lock acquisition. +template +absl::Status LookupResources(OpKernelContext* ctx, + absl::Span p, + std::vector>* values) { + std::vector> containers_and_names( + p.size()); + for (size_t i = 0; i < p.size(); ++i) { + TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType(ctx, *p[i])); + containers_and_names[i] = {&p[i]->container(), &p[i]->name()}; + } + return ctx->resource_manager()->LookupMany(containers_and_names, values); +} + +// If the resource manager in "ctx" has a resource pointed at by "p", returns +// it in "*value". Otherwise, invokes creator() to create the resource. +// The caller takes the ownership of one ref on "*value". +// +// WARNING: creator() must not call any methods on the resource manager during +// its execution, because a non-reentrant lock is held during the creator() call +// in order to guarantee atomicity of LookupOrCreateResource(). +template +absl::Status LookupOrCreateResource(OpKernelContext* ctx, + const ResourceHandle& p, T** value, + std::function creator) { + TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType(ctx, p)); + return ctx->resource_manager()->LookupOrCreate(p.container(), p.name(), value, + creator); +} + +// If the resource manager in "ctx" has a resource pointed at by "p", returns +// it in "*value". Otherwise, invokes creator() to create the resource. +// +// WARNING: creator() must not call any methods on the resource manager during +// its execution, because a non-reentrant lock is held during the creator() call +// in order to guarantee atomicity of LookupOrCreateResource(). +template +absl::Status LookupOrCreateResource(OpKernelContext* ctx, + const ResourceHandle& p, + core::RefCountPtr* value, + std::function creator) { + T* raw_ptr = nullptr; + TF_RETURN_IF_ERROR(LookupOrCreateResource(ctx, p, &raw_ptr, creator)); + value->reset(raw_ptr); + + return absl::OkStatus(); +} + +// Deletes the resource pointed by "p", using the resource manager in "ctx". +template +absl::Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p) { + TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType(ctx, p)); + // This is a noop because ResourceMgr does not hold a reference. + // NOTE(feyu): if we can convert all resources handle to ref-counting, then + // DeleteResource can be removed. + if (p.IsRefCounting()) { + return absl::OkStatus(); + } + return ctx->resource_manager()->Delete(p.container(), p.name()); +} + +// Deletes the resource pointed by "p", using the resource manager in "ctx". 
+absl::Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p); + +template +void IsResourceInitialized::Compute(OpKernelContext* ctx) { + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output)); + T* object; + bool found; + if (LookupResource(ctx, HandleFromInput(ctx, 0), &object).ok()) { + found = true; + object->Unref(); + } else { + found = false; + } + + output->flat()(0) = found; +} + +template +ResourceHandleOp::ResourceHandleOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("container", &container_)); + OP_REQUIRES_OK(context, context->GetAttr("shared_name", &name_)); +} + +template +void ResourceHandleOp::Compute(OpKernelContext* ctx) { + if (name_ == ResourceHandle::ANONYMOUS_NAME) { + AllocatorAttributes attr; + attr.set_on_host(true); + Tensor handle; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle, attr)); + handle.scalar()() = MakeResourceHandle( + ctx, container_, name_, /*dtypes_and_shapes=*/{}, ctx->stack_trace()); + ctx->set_output(0, handle); + } else { + if (!initialized_.load()) { + mutex_lock ml(mutex_); + // Checking again to see if another thread has initialized the resource. + if (!initialized_.load()) { + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), + &resource_, attr)); + resource_.scalar()() = + MakeResourceHandle(ctx, container_, name_, + /*dtypes_and_shapes=*/{}, ctx->stack_trace()); + initialized_.store(true); + } + } + ctx->set_output(0, resource_); + } +} + +template +ResourceHandlesOp::ResourceHandlesOp(OpKernelConstruction* context) + : OpKernel(context) { + int n; + OP_REQUIRES_OK(context, context->GetAttr("N", &n)); + OP_REQUIRES_OK(context, context->GetAttr("containers", &containers_)); + OP_REQUIRES_OK(context, context->GetAttr("shared_names", &names_)); + OP_REQUIRES( + context, containers_.size() == n, + errors::InvalidArgument("Number of containers (", containers_.size(), + ") must be equal to N (", n, ")")); + OP_REQUIRES(context, names_.size() == n, + errors::InvalidArgument("Number of names (", containers_.size(), + ") must be equal to N (", n, ")")); + resources_.resize(n); +} + +template +void ResourceHandlesOp::Compute(OpKernelContext* ctx) { + if (!initialized_.load()) { + mutex_lock ml(mutex_); + // Checking again to see if another thread has initialized the resource. 
+ if (!initialized_.load()) { + AllocatorAttributes attr; + attr.set_on_host(true); + for (size_t i = 0; i < resources_.size(); ++i) { + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), + &resources_[i], attr)); + ResourceHandle h = + MakeResourceHandle(ctx, containers_[i], names_[i]); + resources_[i].template scalar()() = h; + } + initialized_.store(true); + } + } + for (size_t i = 0; i < resources_.size(); ++i) { + ctx->set_output(i, resources_[i]); + } +} + +template +ResourceHandle ScopedStepContainer::MakeResourceHandle( + const std::string& name, const DeviceBase& device) { + mutex_lock ml(mu_); + dirty_ = true; + return tensorflow::MakeResourceHandle(container_, name, device, + TypeIndex::Make(), {}); +} + +template +absl::Status ScopedStepContainer::Lookup(ResourceMgr* rm, + const std::string& name, + T** resource) const { + return rm->Lookup(container_, name, resource); +} + +template +absl::Status ScopedStepContainer::LookupOrCreate( + ResourceMgr* rm, const std::string& name, T** resource, + std::function creator) { + mutex_lock ml(mu_); + dirty_ = true; + return rm->LookupOrCreate(container_, name, resource, creator); +} + +template +absl::Status ScopedStepContainer::Create(ResourceMgr* rm, + const std::string& name, T* resource) { + mutex_lock ml(mu_); + dirty_ = true; + return rm->Create(container_, name, resource); +} + +template +absl::Status ScopedStepContainer::Delete(ResourceMgr* rm, + const std::string& name) { + return rm->Delete(container_, name); +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/resource_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/framework/resource_op_kernel.h new file mode 100644 index 00000000..9982c02f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/resource_op_kernel.h @@ -0,0 +1,153 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// ResourceOpKernel is a virtual base class for resource op implementing +// interface type T. The inherited op looks up the resource name (determined by +// ContainerInfo), and creates a new resource if necessary. +// +// Requirements: +// - Op must be marked as stateful. +// - Op must have `container` and `shared_name` attributes. 
Empty `container` +// means using the default container. Empty `shared_name` means private +// resource. +// - Subclass must override CreateResource(). +// - Subclass is encouraged to override VerifyResource(). +template +class ResourceOpKernel : public OpKernel { + public: + explicit ResourceOpKernel(OpKernelConstruction* context) : OpKernel(context) { + has_resource_type_ = (context->output_type(0) == DT_RESOURCE); + if (!has_resource_type_) { + // The resource variant of the op may be placed on non-CPU devices, but + // this allocation is always on the host. Fortunately we don't need it in + // the resource case. + OP_REQUIRES_OK(context, context->allocate_temp( + DT_STRING, TensorShape({2}), &tensor_)); + } + } + + // The resource is deleted from the resource manager only when it is private + // to kernel. Ideally the resource should be deleted when it is no longer held + // by anyone, but it would break backward compatibility. + ~ResourceOpKernel() override { + if (cinfo_.resource_is_private_to_kernel()) { + if (!cinfo_.resource_manager() + ->template Delete(cinfo_.container(), cinfo_.name()) + .ok()) { + // Do nothing; the resource can have been deleted by session resets. + } + } + } + + void Compute(OpKernelContext* context) override TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + core::RefCountPtr resource_ref_ptr = weak_resource_.GetNewRef(); + if (resource_ref_ptr == nullptr) { + ResourceMgr* mgr = context->resource_manager(); + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); + + T* resource; + OP_REQUIRES_OK(context, + mgr->LookupOrCreate( + cinfo_.container(), cinfo_.name(), &resource, + [this](T** ret) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + absl::Status s = CreateResource(ret); + if (!s.ok() && *ret != nullptr) { + CHECK((*ret)->Unref()); + } + return s; + })); + // Here the code releases the reference to the resource created by this op + // and only holds a WeakPtr to the resource. This way the lifetime of the + // resource is owned by the container; otherwise the container may be + // cleared (e.g. a Session::Reset()) but the resource lives on inside this + // op, causing later lookups in the container by handle to fail. + core::ScopedUnref resource_unref(resource); + OP_REQUIRES_OK(context, VerifyResource(resource)); + weak_resource_ = core::WeakPtr(resource); + // TODO(b/243544755): delete after scam migrates ResourceKernelOp + // subclasses to get_resource() in TF 2.11. + resource_ = resource; + + if (!has_resource_type_) { + auto h = tensor_.template flat(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + } + } + if (has_resource_type_) { + OP_REQUIRES_OK(context, MakeResourceHandleToOutput( + context, 0, cinfo_.container(), cinfo_.name(), + TypeIndex::Make())); + } else { + context->set_output_ref(0, &mu_, &tensor_); + } + } + + protected: + // Variables accessible from subclasses. + mutex mu_; + ContainerInfo cinfo_ TF_GUARDED_BY(mu_); + // TODO(b/243544755): delete after scam migrates ResourceKernelOp subclasses + // to get_resource() in TF 2.11. + ABSL_DEPRECATED("Use get_resource() instead.") + T* resource_ TF_GUARDED_BY(mu_) = nullptr; + + core::RefCountPtr get_resource() TF_LOCKS_EXCLUDED(mu_) { + mutex_lock lock(mu_); + return weak_resource_.GetNewRef(); + } + + private: + core::WeakPtr weak_resource_ TF_GUARDED_BY(mu_) = + core::WeakPtr(nullptr); + + // Must return a T descendant allocated with new that ResourceOpKernel will + // take ownership of. 
+ virtual absl::Status CreateResource(T** resource) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) = 0; + + // During the first Compute(), resource is either created or looked up using + // shared_name. In the latter case, the resource found should be verified if + // it is compatible with this op's configuration. The verification may fail in + // cases such as two graphs asking queues of the same shared name to have + // inconsistent capacities. + virtual absl::Status VerifyResource(T* resource) { return absl::OkStatus(); } + + Tensor tensor_ TF_GUARDED_BY(mu_); + + // Is the output of the operator of type DT_RESOURCE? + bool has_resource_type_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/resource_var.h b/third_party/tflite-hdrs/tensorflow/core/framework/resource_var.h new file mode 100644 index 00000000..6c0a8d96 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/resource_var.h @@ -0,0 +1,153 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_ + +#include + +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +// Forward declarations to avoid introducing a dependency on headers in +// "tensorflow/core/graph/...". +class GraphDefBuilder; + +namespace tensorflow { + +// Resource stored by variables in the resource manager (new, resource-style +// version). +// +// These variables have a mixed access mode: they can operate on copy-on-write +// mode (the default) or copy-on-read mode (used only for sparse access). +// +// When copy-on-write mode is enabled reading the value of the variable involves +// grabbing its mutex in shared mode and aliasing the internal tensor as the +// output of the read operation, increasing its reference count. Writing, +// conversely, works by, under an exclusive lock, detecting whether there are +// outstanding aliases of the tensor, using the reference count, copying the +// tensor if they exist, and writing to either the original or a copy with no +// outstanding aliases. Sparse operations are not supported in copy-on-write +// mode. +// +// When a variable is accessed sparsely it switches to copy-on-read mode. To +// switch we need to grab an exclusive lock and might (if there are aliases) +// need to copy the entire tensor. Once copy-on-read mode is enabled, no tensor +// is allowed to alias the variable's internal tensor. This means dense reads +// must return a copy of the variable, done while holding a shared lock. Dense +// writes do not need to check whether aliases exist, and can always write +// directly to the buffer without making a copy, while holding an exclusive +// lock. 
Sparse reads and sparse writes, on the other hand, can be done under a +// shared or exclusive mutex (the damage from writes under a shared mutex is +// limited since no other buffer is allowed to alias the variable's +// buffer). Using an exclusive mutex disallows concurrent writes and concurrent +// sparse reads, providing some extra safety at the expense of performance, +// while shared mutex allow for "hogwild" behavior. Doing sparse writes under a +// shared mutex prevents them from overlapping with dense writes, which is +// necessary as dense writes can change the shape the of the tensor. +// +// Transitioning a variable from copy-on-read mode to copy-on-write mode is +// currently not supported. To upgrade a variable from copy-on-write to +// copy-on-read use `EnsureSparseVariableAccess()`, and then grab the variable's +// mutex as desired. To access the variable in dense mode grab the mutex either +// directly or via `MaybeLockVariableInputMutexesInOrder` on all variables being +// modified and then call `PrepareToUpdateVariable` on them in any order. +class Var : public ResourceBase { + public: + explicit Var(DataType dtype) : tensor_(dtype) {} + explicit Var(DataType dtype, std::string& debug_name) : tensor_(dtype) { + debug_name_ = debug_name; + } + + // When locking multiple variables, the locks must be acquired in order of + // increasing mu() address. + // TODO(ebrevdo): Use LockSet instead of exposing mu. + mutex* mu() { return &mu_; } + Tensor* tensor() { return &tensor_; } + + // Uninitializes the variable, by reverting the state of the tensor to + // the state when the variable is first created. + void Uninitialize() { + // move frees the buffer of the tensor after unused goes out of scope. + Tensor unused = std::move(tensor_); + is_initialized = false; + } + + absl::Status AsGraphDef(GraphDefBuilder* builder, Node** out) const override; + + std::string DebugString() const override { + return strings::StrCat(DataTypeString(tensor_.dtype()), "/", + tensor_.shape().DebugString()); + } + + std::string MakeRefCountingHandleName(int64_t resource_id) const override; + + // Only used in the resource variable path. In resource variables, + // tensor.IsInitialized() can be true (i.e. have memory allocated to it) while + // there is not a good value there due to a race condition, and it's possible + // to stumble upon this during variable.initialized_value(). So it's best to + // just store directly whether the variable is initialized. + bool is_initialized = false; // TF_GUARDED_BY(mu_) but annotalysis doesn't + // like it. + + // Also fake-guarded by mu_. Should be set to True whenever any sparse + // operation uses the variable. Once this is true no tensor is allowed to + // alias the memory of the variable, and we always copy the variable on + // reads. This allows sparse operations to happen with only a shared lock if + // so desired. + std::atomic copy_on_read_mode{false}; + + private: + mutex mu_; + Tensor tensor_; + std::string debug_name_; + + ~Var() override {} + Var(const Var&) = delete; + void operator=(const Var&) = delete; +}; + +// Does unlock and unref automatically when going out of scope, and also +// supports early manual release. 
+class TF_SCOPED_LOCKABLE ScopedUnlockUnrefVar { + public: + explicit ScopedUnlockUnrefVar(Var* var) TF_EXCLUSIVE_LOCK_FUNCTION(var_->mu()) + : var_(var) { + if (var_) { + var_->mu()->lock(); + } + } + void Release() TF_UNLOCK_FUNCTION() { + if (var_) { + var_->mu()->unlock(); + var_->Unref(); + var_ = nullptr; + } + } + ~ScopedUnlockUnrefVar() TF_UNLOCK_FUNCTION() { Release(); } + + private: + Var* var_; + + ScopedUnlockUnrefVar(const ScopedUnlockUnrefVar&) = delete; + ScopedUnlockUnrefVar(ScopedUnlockUnrefVar&&) = delete; + ScopedUnlockUnrefVar& operator=(const ScopedUnlockUnrefVar&) = delete; + ScopedUnlockUnrefVar& operator=(ScopedUnlockUnrefVar&&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/rng_alg.h b/third_party/tflite-hdrs/tensorflow/core/framework/rng_alg.h new file mode 100644 index 00000000..fd756c87 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/rng_alg.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RNG_ALG_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RNG_ALG_H_ + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +enum Algorithm { + // The Philox algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + RNG_ALG_PHILOX = 1, + // The ThreeFry algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + RNG_ALG_THREEFRY = 2, + // An algorithm auto-selected by the system according to device type. + RNG_ALG_AUTO_SELECT = 3 +}; + +// Same as `Algorithm`, but without AUTO_SELECT. We use C++ compiler's -Wswitch +// and -Werror to check that `switch` covers all cases. When the algorithm +// auto-selection has been resolved, we use this type so that +// we don't need to (unnecessarily) handle the AUTO_SELECT case. +enum class ConcreteRngAlgorithm { + RNG_ALG_PHILOX = 1, + RNG_ALG_THREEFRY = 2, +}; + +// Gets the counter size (in unit of uint64) for a counter-based RNG +// algorithm `alg`. Callers of this function must ensure that `alg` doesn't have +// non-enumerator values. 
+inline int GetCounterSize(ConcreteRngAlgorithm alg) { + switch (alg) { + case ConcreteRngAlgorithm::RNG_ALG_PHILOX: + return 2; + case ConcreteRngAlgorithm::RNG_ALG_THREEFRY: + return 1; + } + LOG(ERROR) << "This point shouldn't have been reached."; +} +static constexpr int RNG_MAX_COUNTER_SIZE = 2; + +static constexpr int RNG_KEY_SIZE = 1; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_RNG_ALG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/run_handler.h b/third_party/tflite-hdrs/tensorflow/core/framework/run_handler.h new file mode 100644 index 00000000..148378bc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/run_handler.h @@ -0,0 +1,315 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/platform/context.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace Eigen { +struct ThreadPoolDevice; +} + +namespace tensorflow { + +class RunHandler; + +// RunHandlerPool is a fixed size pool of pre-allocated RunHandlers +// that can be used for tracking inter-op work for a given Session::Run(). +// RunHandler(s) in the pool are initially 'inactive'. A RunHandler becomes +// 'active' when its unique_ptr is returned by Get() and is being used by a +// client. It becomes 'inactive' once more when its unique_ptr gets destroyed. +// +// Expected usage: +// +// * Create a single RunHandlerPool (say run_handler_pool_). +// +// * When a Session::Run() is invoked, obtain a handler by: +// auto handler = run_handler_pool_->Get(); +// +// * Use handler for scheduling all inter-op work by: +// handler->ScheduleInterOpClosure(closure); +// +// This class is thread safe. +class RunHandlerPool { + public: + explicit RunHandlerPool(int num_inter_op_threads); + + RunHandlerPool(int num_inter_op_threads, int num_intra_op_threads); + ~RunHandlerPool(); + + // Returns an inactive RunHandler from the pool. + // + // RunHandlers in RunHandlerPool are initially 'inactive'. + // A RunHandler becomes 'active' when its unique_ptr its returned by Get() + // and is being used by a client. It becomes 'inactive' once more when the + // unique_ptr is destroyed. + // + // Will block unless there is an inactive handler. + std::unique_ptr Get( + int64_t step_id = 0, int64_t timeout_in_ms = 0, + const RunOptions::Experimental::RunHandlerPoolOptions& options = + RunOptions::Experimental::RunHandlerPoolOptions()); + + // Get the priorities for active handlers. The return result is with the same + // order of the active handler list. 
+ std::vector GetActiveHandlerPrioritiesForTesting() const; + + private: + class Impl; + friend class RunHandler; + + std::unique_ptr impl_; +}; + +// RunHandler can be used to schedule inter/intra-op closures to run on a global +// pool shared across all Session::Run(s). The closures are enqueued to a +// handler specific queue, from which the work is stolen in a priority order +// (time of the Get() call). +// +// It can only be created via RunHandlerPool::Get(). +// +// This class can be used instead of directly scheduling closures on a global +// pool since it maintains a global view across all sessions and optimizes pool +// scheduling to improve (median and tail) latency. +// +// This class is thread safe. +class RunHandler { + public: + void ScheduleInterOpClosure(std::function fn); + thread::ThreadPoolInterface* AsIntraThreadPoolInterface(); + + ~RunHandler(); + + private: + class Impl; + friend class RunHandlerPool::Impl; + + explicit RunHandler(Impl* impl); + + Impl* impl_; // NOT OWNED. +}; + +namespace internal { + +// TODO(azaks): Refactor with thread:ThreadPool +class RunHandlerEnvironment { + typedef Thread EnvThread; + struct TaskImpl { + std::function f; + Context context; + uint64 trace_id; + }; + Env* const env_; + const ThreadOptions thread_options_; + const string name_; + + public: + struct Task { + std::unique_ptr f; + }; + + RunHandlerEnvironment(Env* env, const ThreadOptions& thread_options, + const string& name); + + EnvThread* CreateThread(std::function f, + const std::string& thread_name); + + Task CreateTask(std::function f); + + void ExecuteTask(const Task& t); +}; + +typedef typename RunHandlerEnvironment::Task Task; +typedef Eigen::RunQueue Queue; + +// To reduce cache misses, we use a doubly-linked list of Waiter structs and +// queue them in LIFO order rather than the FIFO order used by a single +// condition variable. 
+struct Waiter { + Waiter() { + next = this; + prev = this; + } + condition_variable cv; + mutex mu; + Waiter* next; + Waiter* prev; +}; + +class ThreadWorkSource { + public: + ThreadWorkSource(); + + ~ThreadWorkSource(); + + Task EnqueueTask(Task t, bool is_blocking); + + Task PopBlockingTask(); + + Task PopNonBlockingTask(int start_index, bool search_from_all_queue); + + void WaitForWork(int max_sleep_micros); + + int TaskQueueSize(bool is_blocking); + + int64_t GetTracemeId(); + + void SetTracemeId(int64_t value); + + void SetWaiter(uint64 version, Waiter* waiter, mutex* mutex); + + int64_t GetInflightTaskCount(bool is_blocking); + + void IncrementInflightTaskCount(bool is_blocking); + + void DecrementInflightTaskCount(bool is_blocking); + + unsigned NonBlockingWorkShardingFactor(); + + std::string ToString(); + + private: + struct NonBlockingQueue { + mutex queue_op_mu; + char pad[128]; + Queue queue; + }; + + int32 non_blocking_work_sharding_factor_; + Eigen::MaxSizeVector non_blocking_work_queues_; + + std::atomic blocking_inflight_; + std::atomic non_blocking_inflight_; + + Queue blocking_work_queue_; + mutex blocking_queue_op_mu_; + char pad_[128]; + mutex waiters_mu_; + Waiter queue_waiters_ TF_GUARDED_BY(waiters_mu_); + std::atomic traceme_id_; + + mutex run_handler_waiter_mu_; + uint64 version_ TF_GUARDED_BY(run_handler_waiter_mu_); + mutex* sub_thread_pool_waiter_mu_ TF_GUARDED_BY(run_handler_waiter_mu_); + Waiter* sub_thread_pool_waiter_ TF_GUARDED_BY(run_handler_waiter_mu_); +}; + +class RunHandlerThreadPool { + public: + struct PerThread { + constexpr PerThread() : pool(nullptr), thread_id(-1) {} + RunHandlerThreadPool* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. + }; + + RunHandlerThreadPool(int num_blocking_threads, int num_non_blocking_threads, + Env* env, const ThreadOptions& thread_options, + const string& name, + Eigen::MaxSizeVector* waiters_mu, + Eigen::MaxSizeVector* queue_waiters); + + ~RunHandlerThreadPool(); + + void Start(); + + void StartOneThreadForTesting(); + + void AddWorkToQueue(ThreadWorkSource* tws, bool is_blocking, + std::function fn); + + // Set work queues from which the thread 'tid' can steal its work. + // The request with start_request_idx will be attempted first. Other requests + // will be attempted in FIFO order based on their arrival time. + void SetThreadWorkSources( + int tid, int start_request_idx, uint64 version, + const Eigen::MaxSizeVector& thread_work_sources); + + PerThread* GetPerThread(); + + int CurrentThreadId() const; + + int NumThreads() const; + + int NumBlockingThreads() const; + + int NumNonBlockingThreads() const; + + void WorkerLoop(int thread_id, bool may_steal_blocking_work); + + // Search tasks from Requets range searching_range_start to + // searching_range_end. If there is no tasks in the search range and + // may_steal_blocking_work is true, then search from all requests. 
+ Task FindTask( + int searching_range_start, int searching_range_end, int thread_id, + int sub_thread_pool_id, int max_blocking_inflight, + bool may_steal_blocking_work, + const Eigen::MaxSizeVector& thread_work_sources, + bool* task_from_blocking_queue, ThreadWorkSource** tws); + + void WaitForWork(bool is_blocking, int thread_id, + int32_t max_blocking_inflight); + + void WaitForWorkInSubThreadPool(bool is_blocking, int sub_thread_pool_id); + + private: + struct ThreadData { + ThreadData(); + mutex mu; + uint64 new_version; + condition_variable sources_not_empty; + std::unique_ptr thread; + int current_index; + std::unique_ptr> + new_thread_work_sources TF_GUARDED_BY(mu); + + uint64 current_version; + // Should only be accessed by one thread. + std::unique_ptr> + current_thread_work_sources; + + int sub_thread_pool_id; + }; + + const int num_threads_; + const int num_blocking_threads_; + const int num_non_blocking_threads_; + Eigen::MaxSizeVector thread_data_; + internal::RunHandlerEnvironment env_; + std::atomic cancelled_; + string name_; + Eigen::MaxSizeVector* waiters_mu_; + Eigen::MaxSizeVector* queue_waiters_; + + bool use_sub_thread_pool_; + std::vector num_threads_in_sub_thread_pool_; + + // Threads in each sub thread pool will search tasks from the given + // start_request_percentage to end_request_percentage in a round robin + // fashion. + std::vector sub_thread_pool_start_request_percentage_; + std::vector sub_thread_pool_end_request_percentage_; +}; + +} // namespace internal + +} // end namespace tensorflow. + +#endif // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/run_handler_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/run_handler_util.h new file mode 100644 index 00000000..c63583da --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/run_handler_util.h @@ -0,0 +1,78 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_ + +#include +#include +#include + +namespace tensorflow { + +// Assign thread ranges to requests. +// Requests are numbered 0...num_active_requests-1, and +// threads are numbered 0...num_threads-1. +// On return, the range [start_vec->at(i), end_vec->at(i)) +// indicates the subrange of the threads available to request i. +// The ranges given to different requests may overlap. +// Lower numbered requests will tend to be assigned more threads. +// Thus, a client might associate older requests with lower +// array indices so they receive access to more threads. +// However, the routine ensures that each request is given access +// to at least min(min_threads_per_request, num_threads) threads. +// Every thread will be assigned to at least one request range, +// assuming there is at least one request. 
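The comment just above states the contract of ComputeInterOpSchedulingRanges, whose declaration opens the next hunk. The self-contained toy below is not TensorFlow's algorithm; it only demonstrates one scheme that satisfies the stated properties: lower-numbered (older) requests see wider thread ranges, every request sees at least min(min_threads_per_request, num_threads) threads, and every thread belongs to at least one range.

```c++
// Toy scheme (not TensorFlow's implementation) satisfying the contract
// documented above.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

void ToySchedulingRanges(int num_active_requests, int num_threads,
                         int min_threads_per_request,
                         std::vector<int>* start_vec,
                         std::vector<int>* end_vec) {
  const int min_width = std::min(min_threads_per_request, num_threads);
  start_vec->assign(num_active_requests, 0);  // every range starts at thread 0
  end_vec->assign(num_active_requests, num_threads);
  for (int i = 0; i < num_active_requests; ++i) {
    // Newer requests (larger i) get narrower ranges, never below min_width.
    (*end_vec)[i] = std::max(min_width, num_threads - i);
  }
}

int main() {
  std::vector<int> start, end;
  ToySchedulingRanges(/*num_active_requests=*/4, /*num_threads=*/8,
                      /*min_threads_per_request=*/2, &start, &end);
  for (std::size_t i = 0; i < start.size(); ++i)
    std::printf("request %zu -> threads [%d, %d)\n", i, start[i], end[i]);
  return 0;
}
```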
+void ComputeInterOpSchedulingRanges(int num_active_requests, int num_threads, + int min_threads_per_request, + std::vector* start_vec, + std::vector* end_vec); + +// Assign thread steal ranges to threads.Threads are numbered 0...num_threads-1. +// On return, the range [start_vec->at(i), end_vec->at(i)) indicates the steal +// range of the thread i. The ranges given to different threads may overlap. +void ComputeInterOpStealingRanges(int num_threads, int min_threads_per_domain, + std::vector* start_vec, + std::vector* end_vec); + +// For each of the num_threads determine the index of the active_request whose +// work queue should be attempted first by that the thread. Return a vector of +// size num_threads which represents how threads should be distributed across +// requests. +std::vector ChooseRequestsWithExponentialDistribution( + int num_active_requests, int num_threads); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. Return 'default_value' otherwise. +double ParamFromEnvWithDefault(const char* var_name, double default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. The value must be in format val1,val2... Return +// 'default_value' otherwise. +std::vector ParamFromEnvWithDefault(const char* var_name, + std::vector default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. The value must be in format val1,val2... Return +// 'default_value' otherwise. +std::vector ParamFromEnvWithDefault(const char* var_name, + std::vector default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. Return 'default_value' otherwise. +bool ParamFromEnvBoolWithDefault(const char* var_name, bool default_value); + +} // end namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_RUN_HANDLER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/session_state.h b/third_party/tflite-hdrs/tensorflow/core/framework/session_state.h new file mode 100644 index 00000000..d102e153 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/session_state.h @@ -0,0 +1,91 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// The session state remembers the tensors we choose to keep across +// multiple run calls. +class SessionState { + public: + // Get a tensor from the session state. + absl::Status GetTensor(const std::string& handle, Tensor* tensor); + + // Store a tensor in the session state. 
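For the environment-variable helpers in run_handler_util.h above, a hedged usage sketch: the TF_EXAMPLE_* variable names are made up for illustration, and the element types of the vector overloads are assumed to be double and int, mirroring the scalar variants.

```c++
// Usage sketch; TF_EXAMPLE_* names are hypothetical, and the vector
// overloads are assumed to be std::vector<double> / std::vector<int>.
#include <vector>

#include "tensorflow/core/framework/run_handler_util.h"

void ReadTuningParams() {
  // Scalar: falls back to 128.0 if the variable is unset or unparsable.
  double queue_depth =
      tensorflow::ParamFromEnvWithDefault("TF_EXAMPLE_QUEUE_DEPTH", 128.0);
  // Comma-separated list, e.g. "1,2,4".
  std::vector<int> shards = tensorflow::ParamFromEnvWithDefault(
      "TF_EXAMPLE_SHARDS", std::vector<int>({1, 2, 4}));
  // Boolean variant.
  bool enabled =
      tensorflow::ParamFromEnvBoolWithDefault("TF_EXAMPLE_ENABLED", true);
  (void)queue_depth;
  (void)shards;
  (void)enabled;
}
```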
+ absl::Status AddTensor(const std::string& handle, const Tensor& tensor); + + // Delete a tensor from the session state. + absl::Status DeleteTensor(const std::string& handle); + + int64_t GetNewId(); + + static const char* kTensorHandleResourceTypeName; + + private: + mutex state_lock_; + + // For generating unique ids for tensors stored in the session. + int64_t tensor_id_ = 0; + + // The live tensors in the session. A map from tensor handle to tensor. + std::unordered_map tensors_; +}; + +// The tensor store remembers the tensors we choose to keep for the +// current run call. It is available to every op kernel. +class TensorStore { + public: + struct TensorAndKey { + Tensor tensor; + int64_t id; + std::string device_name; + + std::string GetHandle(const std::string& tensor_name) { + return strings::StrCat(tensor_name, ";", id, ";", device_name); + } + }; + + // Add the named tensor to the tensor store for this run. + absl::Status AddTensor(const std::string& name, const TensorAndKey& tk); + + // Save the tensors in the tensor store of this run to the session. + absl::Status SaveTensors(const std::vector& output_names, + SessionState* session_state); + + // Returns true if no tensors have been added to this store. + bool empty() TF_NO_THREAD_SAFETY_ANALYSIS { return !dirty_; } + + private: + mutex lock_; + std::atomic dirty_ TF_GUARDED_BY(lock_){false}; + + // The tensors that will be saved to session state when this run completes. + // A map from tensor string name to tensor. + std::unordered_map tensors_ TF_GUARDED_BY(lock_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference.h b/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference.h new file mode 100644 index 00000000..8bfd301d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference.h @@ -0,0 +1,924 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_ + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +namespace grappler { +class GraphProperties; +class SymbolicShapeManager; +} // namespace grappler + +namespace shape_inference { + +struct DimensionOrConstant; +class InferenceContext; + +// This header contains the InferenceContext that is used to infer the shape of +// the results of an operation or flag an operation with invalid inputs (e.g., +// mismatched shapes for elementwise operation) by ShapeRefiner. 
The shape of an +// operation is computed using the OpShapeInferenceFn set via SetShapeFn in op +// registration. The OpShapeInferenceFn uses a per op InferenceContext populated +// with input shapes to compute resultant shape (including resource shapes). +// +// The shapes created in the InferenceContext are bound to the lifetime of the +// InferenceContext in which it was created. E.g., in +// +// ```c++ +// InferenceContext c; +// // Below a ShapeHandle is returned by MakeShape, while UnknownDim returns a +// // DimensionHandle. +// ShapeHandle in0 = c.MakeShape({10, c.UnknownDim()}); +// ``` +// +// the ShapeHandle `in0` (and the nested unknown dim inside) is only valid while +// `c` is in scope, as ShapeHandle and DimensionHandle are effectively +// wrappers around pointers stored inside the context with the lifetime of the +// value pointed to managed by the context. The result from one operation's +// inference context will be passed as input to the inference of consumer +// operations. Hence it is possible for ShapeHandles produced by inference on a +// node to consist of ShapeHandles owned by different InferenceContexts. While +// inferring the shapes of a Graph, the InferenceContext of all nodes/operations +// in the Graph remain resident for the lifetime of the Graph (e.g, there is a +// map from each node to its InferenceContext, technically its +// ExtendedInferencContext which additionally stores the element types of inputs +// & outputs, which remains resident). +// +// For functions, the body of the function is instantiated as a Graph while +// inferring the result shapes of a function call node. The rules above apply +// while the function's shape is being inferred, but the contexts associated +// with nodes in the function body are released once the function call's +// resultant shapes are inferred. The shapes of results returned by a function +// are propagated to the InferenceContext of the function call's op (which is +// associated with a Graph of nodes whose shape is being inferred) as the return +// values of a function call node are the inputs of its consumer, but the return +// values are produced by nodes inside the function whose InferenceContexts +// (which owns the values pointed to by ShapeHandle and DimensionHandle) are +// reclaimed after inferring function result shapes. Recursive user-defined +// function are not supported hence inference of functions are fully nested with +// the InferenceContext's of function calls forming a stack. +// +// For example, consider the following call and function: +// +// ```python +// @tf.function +// def g(st): +// d = tf.add(st, st) +// return d +// +// @tf.function +// def f(): +// st = tf.A() +// result = g(st) +// return h(result) +// ``` +// +// During inference of f, the shape of `A` will be inferred and the results from +// its InferenceContext used as inputs to function call `g(st)`. The call node +// will have an InferenceContext created (call it outer context) and the graph +// corresponding to function `g` will be instantiated. The result shape of the +// Arg nodes of the function will be associated with input from outer context. +// During inference of `g` (for the callsite `g(st)` in `f`), the +// InferenceContext of all nodes inside `g` will remain alive. Thus, when shape +// of `tf.add` is computed it may rely on all inputs. 
Once the RetVal nodes of a +// function is reached, we know the shape of its input may correspond to a shape +// queried in the outer context and it is explicitly copied to outer context. In +// this case that means that the shape of `d` is copied to the InferenceContext +// of `g(st)` and so when `h(result)` is executed this shape may be queried. +// Furthermore, no shapes computed due to call `g(st)` can be queried post this +// point and, as the RetVal shapes have been coppied into outer context, all +// InferenceContexts associated with nodes in function `g` instantiated for +// `g(st)` may be and are released. + +// Dimension values are accessed through InferenceContext. +class Dimension { + private: + Dimension(); + Dimension(int64_t value); + ~Dimension() {} + + const int64_t value_; + + friend class InferenceContext; + friend class ShapeManager; + Dimension(const Dimension&) = delete; + void operator=(const Dimension&) = delete; +}; + +class DimensionHandle { + public: + DimensionHandle() {} + bool SameHandle(DimensionHandle d) const { return ptr_ == d.ptr_; } + std::size_t Handle() const { return reinterpret_cast(ptr_); } + + private: + DimensionHandle(const Dimension* dim) { ptr_ = dim; } + + const Dimension* operator->() const { return ptr_; } + bool IsSet() const { return ptr_ != nullptr; } + + const Dimension* ptr_ = nullptr; + + friend struct DimensionOrConstant; + friend class InferenceContext; + friend class ShapeInferenceTest; + friend class ShapeInferenceTestutil; + friend class ::tensorflow::grappler::GraphProperties; + friend class ::tensorflow::grappler::SymbolicShapeManager; + + // Intentionally copyable. +}; + +// Shape rank and dimensions are accessed through InferenceContext. +class Shape { + private: + Shape(); + Shape(const std::vector& dims); + ~Shape() {} + + const int32 rank_; + const std::vector dims_; + + friend class InferenceContext; + friend class ::tensorflow::grappler::SymbolicShapeManager; + + Shape(const Shape&) = delete; + void operator=(const Shape&) = delete; +}; + +class ShapeHandle { + public: + ShapeHandle() {} + bool SameHandle(ShapeHandle s) const { return ptr_ == s.ptr_; } + std::size_t Handle() const { return reinterpret_cast(ptr_); } + + private: + ShapeHandle(const Shape* shape) { ptr_ = shape; } + const Shape* operator->() const { return ptr_; } + bool IsSet() const { return ptr_ != nullptr; } + + const Shape* ptr_ = nullptr; + + friend class InferenceContext; + friend class ShapeInferenceTest; + friend class ShapeInferenceTestutil; + friend class ::tensorflow::grappler::SymbolicShapeManager; + + // Intentionally copyable. +}; + +// Struct used to allow functions to take DimensionHandle or a dimension value. +// Not meant to be constructed directly. +struct DimensionOrConstant { + public: + // Intentionally not explicit. + DimensionOrConstant(DimensionHandle dim); + + // val must be non-negative or InferenceContext::kUnknownDim. + DimensionOrConstant(int64_t val); + + // dim takes precedence. If dim != nullptr, val is ignored. + DimensionHandle dim; + int64_t val; + + private: + DimensionOrConstant(); +}; + +struct ShapeAndType { + ShapeAndType() {} + ShapeAndType(ShapeHandle s, DataType t) : shape(s), dtype(t) {} + // TODO(mdan): Remove dtype from constructor, and use type_ instead. + // dtype is kept here for backward compatibiity. 
Its information should + // be redundant to that in type; + ShapeAndType(ShapeHandle s, DataType t, FullTypeDef type_) + : shape(s), dtype(t), type(type_) {} + + ShapeHandle shape; + DataType dtype = DT_INVALID; + FullTypeDef type; +}; + +// Shape inference functions registered on ops in REGISTER_OP implement +// their shape functions in terms of this InferenceContext. An InferenceContext +// is created by the framework and passed to a shape inference function. The +// shape inference function calls functions on the context, and should call +// set_output() to set the shape on all outputs. +// +// To infer shapes for user-defined functions see ShapeRefiner. +// +// All Shape* and Dimension* returned by functions of InferenceContext are owned +// by the InferenceContext. +class InferenceContext { + public: + static constexpr int64_t kUnknownDim = -1; + static constexpr int32_t kUnknownRank = -1; + + // is NULL-padded to be the same size as . + // + // Elements of are used for when a shape function + // makes a call to MakeShapeFromShapeTensor; in particular, when the + // input_tensors[i] is nullptr but the shape represented by it is partially + // known from analysis of the graph. + // can have fewer elements than . + // Values of do not need to outlive the context. + InferenceContext(int graph_def_version, const AttrSlice& attrs, + const OpDef& op_def, + const std::vector& input_shapes, + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes, + std::vector>> + input_handle_shapes_and_types); + + // is NULL-padded to be the same size as . + // + // Elements of are used for when a shape + // function makes a call to MakeShapeFromShapeTensor; in particular, when + // the input_tensors[i] is nullptr but the shape represented by it is + // partially known from analysis of the graph. + // can have fewer elements than . Values of + // do not need to outlive the context. + InferenceContext( + int graph_def_version, const AttrSlice& attrs, const OpDef& op_def, + const std::vector& input_shapes, + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes, + const std::vector>>>& + input_handle_shapes_and_types); + + ~InferenceContext(); + + // Runs the shape inference function 'fn' with 'this' as the + // argument, returns the status of the inference. + // + // On error, additional context is provided in the error message. + absl::Status Run( + const std::function& + fn); + + // Merge the stored shape of the input in position idx with according + // to the following rules: + // + // - If the ShapeHandles are the same or is unknown, there will be no + // change. Otherwise if the stored shape is unknown, the new shape will be + // . + // - If both shapes are known, then they must have the same rank. + // - For any one dimension, if the values for that dimension in both shapes + // are known, then the values must match. + // - If one shape has equal or more information than the other shape in every + // dimension, the new shape will become the shape with more information. + // - Example: merging [2,?] and [?,2] results in [2,2] + // - Example: [2,2] cannot be merged with [1,2] + // + // This requires idx to be in the [0, num_inputs) range. If the merge is + // successful, return true. Return false otherwise. 
+ bool MergeInput(int idx, ShapeHandle shape) { + ShapeHandle new_shape; + if (!Merge(inputs_[idx], shape, &new_shape).ok()) return false; + inputs_[idx] = new_shape; + return true; + } + + // Relax the stored shape of the input in position idx with according + // to the following rules: + // + // - If the ShapeHandles are the same then the stored shape will be returned. + // - If either of the ShapeHandles are unknown, then a new UnknownShape will + // be returned. A new shape must be returned because we cannot claim that + // the resulting shape is necessarily the same as either of the input + // shapes. + // - If the shapes both have known ranks but their ranks are different, a new + // UnknownShape will be returned. + // - For any one dimension, if the value for that dimension in either of the + // shapes is unknown, a new shape will be returned with a new UnknownDim in + // that dimension. + // - For any one dimension, if the values for that dimension in both shapes + // are known but do not match, a new shape will be returned with a new + // UnknownDim in that dimension. + // - If both shapes have the same known rank and match in every dimension, + // the stored shape will be returned. + // - Example: relaxing [2,?] and [?,2] results in [?,?] + // - Example: relaxing [2,2] and [3,2] results in [?,2] + // - Example: relaxing [2,2] with [1,2,3] results in ? + // + // This requires idx to be in the [0, num_inputs) range. If the relax is + // successful and the new shape differs from the old one, store the new + // shape and return true. Return false otherwise. + bool RelaxInput(int idx, ShapeHandle shape) { + ShapeHandle new_shape; + Relax(inputs_[idx], shape, &new_shape); + if (inputs_[idx].SameHandle(new_shape)) { + return false; + } + inputs_[idx] = new_shape; + return true; + } + + void SetInput(int idx, ShapeHandle shape) { inputs_[idx] = shape; } + + ShapeHandle input(int64_t idx) const { return inputs_[idx]; } + absl::Status input(absl::string_view input_name, + std::vector* output) const; + int num_inputs() const { return inputs_.size(); } + + // Returns the input tensor at index , or nullptr if the input tensor is + // not available at the time of shape inference. + const Tensor* input_tensor(int idx) { + // Mark that this idx was requested. + request_input_tensor(idx); + return input_tensors_[idx]; + } + + // Notifies the shape refiner that the value of the tensor at index + // is needed. The shape refiner tries to statically compute this tensor, + // and if successful re-runs the shape function with this tensor available + // in the call to 'input_tensor(idx)'. + void request_input_tensor(int idx) { requested_input_tensor_[idx] = true; } + + // Returns true iff input_tensor(idx) was called by the shape function. + bool requested_input_tensor(int idx) const { + return requested_input_tensor_[idx]; + } + + // Notifies the shape refiner that the value of the tensor at index + // as a partial shape is needed. The shape refiner tries to statically compute + // this, and if successful re-runs the shape function with the + // computed PartialTensorShape available in the call to + // 'MakeShapeFromShapeTensor(idx, handle)' or + // 'MakeShapeFromShapeTensorTreatScalarAsUnknownShape(idx, handle)'. + void request_input_tensor_as_partial_shape(int idx) { + requested_input_tensor_as_partial_shape_[idx] = true; + } + + // Returns true if MakeShapeFromInputTensor was called but the constant + // input_tensor was not present. 
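The Merge and Relax rules spelled out above lend themselves to a small standalone illustration. The toy below is not InferenceContext's implementation; it only encodes the same rules for the simple case where both ranks are known, using -1 for an unknown dimension.

```c++
// Toy encoding of the Merge/Relax dimension rules documented above
// (both ranks known; -1 means "unknown dimension"). Not TF code.
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

using ToyShape = std::vector<long long>;  // -1 == unknown dim

// Merge: both-known values must match; unknown takes the known value.
std::optional<ToyShape> MergeShapes(const ToyShape& a, const ToyShape& b) {
  if (a.size() != b.size()) return std::nullopt;  // ranks must match
  ToyShape out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (a[i] == -1) out[i] = b[i];
    else if (b[i] == -1 || a[i] == b[i]) out[i] = a[i];
    else return std::nullopt;  // incompatible known values
  }
  return out;
}

// Relax: any disagreement or unknown becomes unknown.
ToyShape RelaxShapes(const ToyShape& a, const ToyShape& b) {
  if (a.size() != b.size()) return {};  // rank mismatch -> unknown shape
  ToyShape out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i)
    out[i] = (a[i] == b[i]) ? a[i] : -1;
  return out;
}

int main() {
  auto merged = MergeShapes({2, -1}, {-1, 2});  // -> [2, 2]
  auto relaxed = RelaxShapes({2, 2}, {3, 2});   // -> [?, 2]
  if (merged)
    std::printf("merged:  [%lld, %lld]\n", (*merged)[0], (*merged)[1]);
  std::printf("relaxed: [%lld, %lld]\n", relaxed[0], relaxed[1]);
  return 0;
}
```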
+ bool requested_input_tensor_as_partial_shape(int idx) const { + return requested_input_tensor_as_partial_shape_[idx]; + } + + void set_input_tensors(const std::vector& input_tensors) { + input_tensors_ = input_tensors; + } + + void set_input_tensors_as_shapes( + const std::vector& input_tensors_as_shapes) { + input_tensors_as_shapes_ = input_tensors_as_shapes; + } + + const std::vector& input_tensors_as_shapes() const { + return input_tensors_as_shapes_; + } + + ShapeHandle output(int64_t idx) const { return outputs_.at(idx); } + void set_output(int idx, ShapeHandle shape) { outputs_.at(idx) = shape; } + absl::Status set_output(absl::string_view output_name, + const std::vector& shapes); + + int num_outputs() const { return outputs_.size(); } + ShapeHandle output(int idx) const { return outputs_.at(idx); } + absl::Status output(absl::string_view output_name, + std::vector* output) const; + + // Returns the value for attribute named `attr_name`. + absl::Status GetAttr(absl::string_view attr_name, + const AttrValue** attr_value) const { + return attrs_.Find(attr_name, attr_value); + } + const AttrValue* GetAttr(absl::string_view attr_name) const { + return attrs_.Find(attr_name); + } + + const FullTypeDef& ret_types() const { return ret_types_; } + + // idx can be negative for an offset from end of dimensions. + // idx must be in the range [-1 * s.rank, s.rank). + DimensionHandle Dim(ShapeHandle s, int64_t idx) { + if (!s.Handle() || s->rank_ == kUnknownRank) { + return UnknownDim(); + } + return DimKnownRank(s, idx); + } + // As above, but asserts that the rank of the shape is known. + static DimensionHandle DimKnownRank(ShapeHandle s, int64_t idx) { + CHECK_NE(s->rank_, kUnknownRank); + if (idx < 0) { + return s->dims_[s->dims_.size() + idx]; + } + return s->dims_[idx]; + } + + static int32 Rank(ShapeHandle s) { + return s.IsSet() ? s->rank_ : kUnknownRank; + } + static bool RankKnown(ShapeHandle s) { + return (s.IsSet() && (Rank(s) != kUnknownRank)); + } + static inline int64_t Value(DimensionOrConstant d) { + return d.dim.IsSet() ? d.dim->value_ : d.val; + } + static inline bool ValueKnown(DimensionOrConstant d) { + return Value(d) != kUnknownDim; + } + + // Fills the output proto with the shape defined by the handle. + // "proto" is expected to be empty prior to the call. + void ShapeHandleToProto(ShapeHandle handle, TensorShapeProto* proto); + TensorShapeProto ShapeHandleToProto(ShapeHandle handle); + + // Returns true if the rank and all dimensions of the Shape are known. + bool FullyDefined(ShapeHandle s); + + // Returns the total number of elements, or an unknown dimension for an + // incomplete shape. + DimensionHandle NumElements(ShapeHandle s); + + std::string DebugString(ShapeHandle s); + std::string DebugString(DimensionHandle d); + std::string DebugString(const ShapeAndType& shape_and_type); + std::string DebugString(absl::Span shape_and_types); + + // Describes the whole context, for debugging purposes. + std::string DebugString() const; + + // If has rank , or its rank is unknown, return OK and return + // the shape with asserted rank in <*out>. Otherwise return an error. + // + // Note that <*out> may be set to . + absl::Status WithRank(ShapeHandle shape, int64_t rank, ShapeHandle* out); + absl::Status WithRankAtLeast(ShapeHandle shape, int64_t rank, + ShapeHandle* out); + absl::Status WithRankAtMost(ShapeHandle shape, int64_t rank, + ShapeHandle* out); + + // If has value , or its value is unknown, returns OK and returns + // the dimension with asserted value in <*out>. 
Otherwise returns an error. + // + // Note that <*out> may be set to . + absl::Status WithValue(DimensionHandle dim, int64_t value, + DimensionHandle* out); + + // Merges and and returns the merged shape in <*out>. See + // 'MergeInput' function for full details and examples. + absl::Status Merge(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out); + + // Asserts that 's rank >= 's rank, and the first + // dimensions of are compatible with the dimensions of + // . + // Returns the merged results in <*s_out> and <*prefix_out>. + absl::Status MergePrefix(ShapeHandle s, ShapeHandle prefix, + ShapeHandle* s_out, ShapeHandle* prefix_out); + + // Merges and and returns the merged dimension in <*out>. If + // and have incompatible values, returns an error. + // + // Note that <*out> may be set to or . + absl::Status Merge(DimensionHandle d0, DimensionHandle d1, + DimensionHandle* out); + + // Returns in <*out> a sub-shape of with dimensions [start:]. + // can be negative to index from the end of the shape. If > + // rank of , then an empty subshape is returned. + absl::Status Subshape(ShapeHandle s, int64_t start, ShapeHandle* out); + + // Returns in <*out> a sub-shape of , with dimensions [start:end]. + // and can be negative, to index from the end of the shape. + // and are set to the rank of if > rank of . + absl::Status Subshape(ShapeHandle s, int64_t start, int64_t end, + ShapeHandle* out); + + // Returns in <*out> a sub-shape of , with dimensions [start:end:stride]. + // and can be negative, to index from the end of the shape. + // and are set to the rank of if > rank of . + // can be negative, to reverse the . + absl::Status Subshape(ShapeHandle s, int64_t start, int64_t end, + int64_t stride, ShapeHandle* out); + + // Returns in <*out> the result of appending the dimensions of to those + // of . + absl::Status Concatenate(ShapeHandle s1, ShapeHandle s2, ShapeHandle* out); + + // Returns in the shape from replacing with + // . + absl::Status ReplaceDim(ShapeHandle s, int64_t dim_index, + DimensionHandle new_dim, ShapeHandle* out); + + // Returns a new shape with the given dims. The returned value is owned by + // this context. + ShapeHandle MakeShape(const std::vector& dims); + ShapeHandle MakeShape(std::initializer_list dims); + + // Returns a new unknown shape. + ShapeHandle UnknownShape(); + + // Returns a shape with specified rank but unknown dims. + ShapeHandle UnknownShapeOfRank(int64_t rank); + + // Returns a new shape of zero dimensions. + ShapeHandle Scalar(); + + // Returns a new shape of one dimension. + ShapeHandle Vector(DimensionOrConstant dim); + + // Returns a new shape of two dimensions. + ShapeHandle Matrix(DimensionOrConstant dim1, DimensionOrConstant dim2); + + // Returns in a new shape whose dimension sizes come from input tensor + // . The tensor must be a 1-dimensional int32 or int64 tensor. If + // the input tensor is NULL, then an unknown shape is returned. + absl::Status MakeShapeFromShapeTensor(int input_idx, ShapeHandle* out); + + // Like the function above, but treats scalar values as unknown + // shapes. **NOTE** If the scalar is statically known, its value + // must be -1 or an error is returned. + absl::Status MakeShapeFromShapeTensorTreatScalarAsUnknownShape( + int input_idx, ShapeHandle* out); + + // Returns in a new shape corresponding to . + absl::Status MakeShapeFromShapeProto(const TensorShapeProto& proto, + ShapeHandle* out); + + // Returns in a new shape corresponding to . 
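Putting the pieces above together, a typical shape function combines WithRank, Dim, Merge, and the shape constructors, then calls set_output. The sketch below follows the familiar MatMul pattern (no transpose attributes) and assumes the usual TF_RETURN_IF_ERROR macro is available; it is an illustration, not the registered MatMul shape function.

```c++
// Sketch of a shape function written against InferenceContext, loosely
// following the common MatMul pattern; assumes the standard shape-fn
// signature used with REGISTER_OP's SetShapeFn.
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/platform/errors.h"  // TF_RETURN_IF_ERROR

namespace example {
using ::tensorflow::shape_inference::DimensionHandle;
using ::tensorflow::shape_inference::InferenceContext;
using ::tensorflow::shape_inference::ShapeHandle;

absl::Status MatMulLikeShape(InferenceContext* c) {
  ShapeHandle a, b;
  // Both inputs must be rank 2 (or have unknown rank).
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &a));
  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &b));
  // Inner dimensions must agree.
  DimensionHandle inner;
  TF_RETURN_IF_ERROR(c->Merge(c->Dim(a, 1), c->Dim(b, 0), &inner));
  // Output is [rows(a), cols(b)].
  c->set_output(0, c->Matrix(c->Dim(a, 0), c->Dim(b, 1)));
  return absl::OkStatus();
}
}  // namespace example
```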
+ absl::Status MakeShapeFromPartialTensorShape( + const PartialTensorShape& partial_shape, ShapeHandle* out); + + // Returns in a new shape corresponding to . + absl::Status MakeShapeFromTensorShape(const TensorShape& shape, + ShapeHandle* out); + absl::StatusOr MakeShapeFromShapeTensor( + const TensorShape& shape); + + // Returns a new dimension of the given size. The returned value is owned by + // this context. + inline DimensionHandle MakeDim(DimensionOrConstant d) { + return shape_manager_.MakeDim(d); + } + + inline DimensionHandle UnknownDim() { return MakeDim(kUnknownDim); } + + // Returns in a scalar value from an input tensor . The input tensor + // must be a 0-dimensional int32 or int64 tensor. Caller must ensure that the + // input tensor is not NULL. + absl::Status GetScalarFromTensor(const Tensor* t, int64_t* val); + + // Returns in a scalar value from a 1D input tensor with int32 or + // int64 elements. Caller must ensure that the input tensor is not NULL. + absl::Status GetScalarFromTensor(const Tensor* t, int64_t idx, int64_t* val); + + // Returns a new dimension whose value is given by a scalar input tensor. + // The input tensor must be in host memory, since it is dereferenced to get + // the value. + absl::Status MakeDimForScalarInput(int idx, DimensionHandle* out); + + // Returns a new dimension whose value is given by a scalar input tensor. + // This allows for a negative input dimension given the rank of a separate + // tensor. This rank can be negative if unknown. + // The input tensor must be in host memory, since it is dereferenced to get + // the value. + absl::Status MakeDimForScalarInputWithNegativeIndexing(int idx, + int input_rank, + DimensionHandle* out); + + // Look up the attr being evaluated with name attr_name and set *value to its + // value. If no attr with attr_name is found in def(), or the attr does not + // have a matching type, a non-ok status will be returned. + template + absl::Status GetAttr(absl::string_view attr_name, T* value) const; + + // Returns in the result of dividing by . + // Returns an error if is not positive or if + // and does not evenly divide . + absl::Status Divide(DimensionHandle dividend, DimensionOrConstant divisor, + bool evenly_divisible, DimensionHandle* out); + + // Returns in the sum of and . + absl::Status Add(DimensionHandle first, DimensionOrConstant second, + DimensionHandle* out); + + // Returns in the dimension that is minus . + absl::Status Subtract(DimensionHandle first, DimensionOrConstant second, + DimensionHandle* out); + + // Returns in the product of and . + absl::Status Multiply(DimensionHandle first, DimensionOrConstant second, + DimensionHandle* out); + + // Returns in the minimum of and . If either or + // is zero the results is zero. Otherwise, if either or + // is unknown the results is unknown. + absl::Status Min(DimensionHandle first, DimensionOrConstant second, + DimensionHandle* out); + + // Returns in the maximum of and . If either or + // is unknown the results is unknown. + absl::Status Max(DimensionHandle first, DimensionOrConstant second, + DimensionHandle* out); + + absl::Status construction_status() const { return construction_status_; } + + // Methods to propagate shape and dtype on edges of handles. Handles are the + // dtype DT_RESOURCE which can be used to access state stored in a + // ResourceManager. 
When ops (such as variables) consume these handles to + // produce tensors they might need to know side-information about the shapes + // and dtypes of tensors which can be accessed via the handle. These methods + // propagate that information. Output handle dtypes and shapes are ignored if + // the output tensor is not of type DT_RESOURCE. + + // Merge the stored shapes and types corresponding to the input handle in + // position idx with the specified shapes and types. This requires idx to be + // in the [0, num_inputs) range. + // + // If the merge is successful and any of the new shapes differs from the old + // one, or any of the old dtypes was DT_INVALID, store the new shapes and + // return true. Return false otherwise. + // + // See 'MergeInput' function for full details and examples. + bool MergeInputHandleShapesAndTypes( + int idx, + const std::vector& shapes_and_types) TF_MUST_USE_RESULT; + + // As MergeInputHandleShapesAndTypes, but for an output. + bool MergeOutputHandleShapesAndTypes( + int idx, + const std::vector& shapes_and_types) TF_MUST_USE_RESULT; + + // Relaxes the stored shapes and types corresponding to the input handle in + // position idx with the specified shapes and types. This requires idx to be + // in the [0, num_inputs) range. + // + // If the relax is successful (sizes are the same, old dtypes match new ones + // or are DT_INVALID), then store the relaxed shapes and return true. + // Return false otherwise. + // + // See 'RelaxInput' function for full details and examples. + bool RelaxInputHandleShapesAndMergeTypes( + int idx, + const std::vector& shapes_and_types) TF_MUST_USE_RESULT; + + // As RelaxInputHandleShapesAndTypes, but for an output. + bool RelaxOutputHandleShapesAndMergeTypes( + int idx, + const std::vector& shapes_and_types) TF_MUST_USE_RESULT; + + void set_input_handle_shapes_and_types( + int idx, const std::vector& shapes_and_types) { + CHECK_GE(idx, 0) << "idx must be non-negative. Got idx: " << idx << "."; + CHECK_LT(idx, input_handle_shapes_and_types_.size()) + << "Got idx: " << idx << " but only " + << input_handle_shapes_and_types_.size() << " inputs."; + input_handle_shapes_and_types_[idx] = + absl::make_unique>(shapes_and_types); + } + + // Returns the output handle shapes and types, for the resource tensor output + // at index . Returns NULL if the shape and types were never set. + const std::vector* output_handle_shapes_and_types(int idx) { + CHECK_GE(idx, 0) << "idx must be non-negative. Got idx: " << idx << "."; + CHECK_LT(idx, output_handle_shapes_and_types_.size()) + << "Got idx: " << idx << " but only " + << output_handle_shapes_and_types_.size() << " outputs."; + return output_handle_shapes_and_types_[idx].get(); + } + + // Returns the inputs handle shapes and types, for the resource tensor input + // at index . Returns NULL if the shape and types were not available. + const std::vector* input_handle_shapes_and_types(int idx) { + CHECK_GE(idx, 0) << "idx must be non-negative. Got idx: " << idx << "."; + CHECK_LT(idx, input_handle_shapes_and_types_.size()) + << "Got idx: " << idx << " but only " + << input_handle_shapes_and_types_.size() << " inputs."; + return input_handle_shapes_and_types_[idx].get(); + } + + void set_output_handle_shapes_and_types( + int idx, const std::vector& shapes_and_types) { + CHECK_GE(idx, 0) << "idx must be non-negative. 
Got idx: " << idx << "."; + CHECK_LT(idx, output_handle_shapes_and_types_.size()) + << "Got idx: " << idx << " but only " + << output_handle_shapes_and_types_.size() << " inputs."; + output_handle_shapes_and_types_[idx] = + absl::make_unique>(shapes_and_types); + } + + // Note that shape functions should usually call MakeShapeFromShapeTensor, + // as it does more analysis to provide partial shapes. + // + // Returns in a new shape whose dimension sizes come from tensor . + // The tensor must be a 1-dimensional int32 or int64 tensor. If is NULL, + // then an unknown shape is returned. + absl::Status MakeShapeFromTensor(const Tensor* t, ShapeHandle tensor_shape, + ShapeHandle* out); + + int graph_def_version() const { return graph_def_version_; } + + const std::vector>& MergedShapes() const { + return merged_shapes_; + } + const std::vector>& MergedDims() + const { + return merged_dims_; + } + + // Adds new outputs; useful when mutating the graph. + absl::Status ExpandOutputs(int new_output_size); + + private: + // Creates and stores shapes for use in InferenceContext. + class ShapeManager { + public: + ShapeManager(); + ~ShapeManager(); + + // Returns a new shape with the given dims. The returned value is owned by + // this class. + ShapeHandle MakeShape(const std::vector& dims); + + // Returns a new unknown shape. + ShapeHandle UnknownShape(); + + // Returns a new dimension of the given size. The returned value + // is owned by this class. + inline DimensionHandle MakeDim(DimensionOrConstant d) { + if (d.dim.IsSet()) { + return d.dim; + } else { + all_dims_.push_back(new Dimension(d.val)); + return all_dims_.back(); + } + } + + private: + std::vector all_shapes_; // values are owned. + std::vector all_dims_; // values are owned. + }; + + friend class ::tensorflow::grappler::GraphProperties; + + friend class ShapeInferenceTest; // For testing Relax functions. + friend class ShapeInferenceTestutil; // For testing shapes. + + // Shared initialization across the two constructors. Remove + // once we get rid of one of them. + void PreInputInit(const OpDef& op_def, + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes); + void PostInputInit(std::vector>> + input_handle_data); + + absl::Status ReturnUnknownShape(ShapeHandle* out) { + *out = UnknownShape(); + return absl::OkStatus(); + } + absl::Status ReturnCreatedShape(const std::vector& dims, + ShapeHandle* out) { + *out = MakeShape(dims); + return absl::OkStatus(); + } + + // Adds additional context to the given status. + absl::Status AttachContext(const absl::Status& status); + + // Relaxes an existing value with a new value and returns the + // relaxed dimension in <*out>. If and have incompatible + // values, returns an error. + // + // Note that <*out> may be set to or . + void Relax(DimensionHandle d_old, DimensionHandle d_new, + DimensionHandle* out); + // Relaxes an existing shape with a new shape and returns the + // relaxed shape in <*out>. See 'RelaxInput' function for full details and + // examples. + void Relax(ShapeHandle s_old, ShapeHandle s_new, ShapeHandle* out); + + // Used to implement MergeInputHandleShapesAndTypes and + // MergeOutputHandleShapesAndTypes. + bool MergeHandleShapesAndTypes( + const std::vector& shapes_and_types, + std::vector* to_update) TF_MUST_USE_RESULT; + // Used to implement RelaxInputHandleShapesAndMergeTypes and + // RelaxOutputHandleShapesAndMergeTypes. 
+ bool RelaxHandleShapesAndMergeTypes( + const std::vector& shapes_and_types, + std::vector* to_update) TF_MUST_USE_RESULT; + + // Forget all the previous merged shapes and dims. + void ForgetMerges() { + merged_shapes_.clear(); + merged_dims_.clear(); + } + + // Helper method for MakeShapeFromTensor and MakeShapeFromShapeTensor. + absl::Status InternalMakeShapeFromTensor( + bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t, + ShapeHandle tensor_shape, ShapeHandle* out); + + ShapeManager shape_manager_; + + // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from + // `shape_manager_`. + std::vector inputs_; + std::vector input_tensors_; + std::vector requested_input_tensor_; + std::vector outputs_; + // Can have fewer elements than inputs_. + std::vector input_tensors_as_shapes_; + std::vector requested_input_tensor_as_partial_shape_; + + // input_handle_shapes_and_types_[i] is the list of shape/type pairs available + // through the resource handle passed along input i of the node. + // + // Values may be NULL. + std::vector>> + input_handle_shapes_and_types_; + + // output_handle_shapes_and_types_[i] is the list of shape/type pairs + // available through the resource handle passed along output i of the node. + // + // Values may be NULL. + std::vector>> + output_handle_shapes_and_types_; + + // Return types for the node this context is associated with. This information + // is to eventually consolidate all the dtype and shape info, allowing for + // output_handle_shapes_and_types_ to be removed. + FullTypeDef ret_types_; + + const int graph_def_version_; + AttrSlice attrs_; + NameRangeMap input_name_map_; + NameRangeMap output_name_map_; + + // An error set during construction. TODO(cwhipkey): remove when test + // constructor is removed. + absl::Status construction_status_; + + // Pair of shape or dim handles that are equivalent, ie that represent the + // same underlying shape of dimension. Note that for each pair at least one of + // the handles must contain an unknown shape, since we don't keep track of + // known shapes or dims here. 
+ std::vector> merged_shapes_; + std::vector> merged_dims_; + + InferenceContext(const InferenceContext&) = delete; + void operator=(const InferenceContext&) = delete; +}; + +// ----------------------------------------------------------------------------- +// Template and inline method implementations, please ignore + +inline Dimension::Dimension() : value_(InferenceContext::kUnknownDim) {} +inline Dimension::Dimension(int64_t value) : value_(value) { + DCHECK(value >= 0 || value == InferenceContext::kUnknownDim) + << "Dimension must be non-negative or equal to " + "InferenceContext::kUnknownDim but got " + << value; +} + +inline Shape::Shape() : rank_(InferenceContext::kUnknownRank) {} +inline Shape::Shape(const std::vector& dims) + : rank_(dims.size()), dims_(dims) {} + +inline DimensionOrConstant::DimensionOrConstant(DimensionHandle dim) + : dim(dim) { + DCHECK(dim.IsSet()) << "Internal error: Got nullptr for Dimension."; +} + +inline DimensionOrConstant::DimensionOrConstant(int64_t val) : val(val) { + DCHECK(val >= 0 || val == InferenceContext::kUnknownDim) + << "Dimension must be non-negative or equal to " + "InferenceContext::kUnknownDim but got " + << val; +} + +template +absl::Status InferenceContext::GetAttr(absl::string_view attr_name, + T* value) const { + return GetNodeAttr(attrs_, attr_name, value); +} + +} // namespace shape_inference +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference_testutil.h b/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference_testutil.h new file mode 100644 index 00000000..c9b9bd74 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/shape_inference_testutil.h @@ -0,0 +1,103 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_ + +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/version.h" + +// Contains utilities for writing tests for shape inference functions. + +namespace tensorflow { + +class Tensor; + +struct ShapeInferenceTestOp { + typedef std::pair ShapeAndType; + explicit ShapeInferenceTestOp(absl::string_view name) : name(string(name)) {} + string name; + NodeDef node_def; + std::vector input_tensors; + std::vector*> + input_resource_handle_shapes_and_types; + int graph_def_version = TF_GRAPH_DEF_VERSION; +}; + +namespace shape_inference { + +class ShapeInferenceTestutil { + public: + // Run shape inference for , given inputs specified by + // and returns an error if the inferred shape does not match expected_outs. 
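Tests against InferShapes normally go through the INFER_OK / INFER_ERROR macros declared at the bottom of this test-utility header. A schematic example, assuming a registered Identity-style op whose output shape is its input shape and the usual googletest setup:

```c++
// Schematic only: assumes an "Identity"-style registered op and gtest.
#include "tensorflow/core/framework/shape_inference_testutil.h"
#include "tensorflow/core/platform/test.h"

TEST(IdentityShapeTest, PassesInputShapeThrough) {
  tensorflow::ShapeInferenceTestOp op("Identity");
  // "in0" in the expected output means: exactly the first input's shape.
  INFER_OK(op, "[2,3]", "in0");
  // "?" as an input means an unknown shape; the output is still tied to it.
  INFER_OK(op, "?", "in0");
}
```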
+ // + // is a semicolon separated list of shapes. Each shape is formatted + // according to the formatting per + // shape_inference::InferenceContext::InferenceContext. + // + // is a semicolon separated list of shapes. Each shape is + // formatted as one of: + // * ? - an unknown shape, but not matching an input shape + // * in0|in2|... - output shape must be the same as one of these input shapes. + // * [1,?,d0_0|d0_1] - output shape is of known rank, with comma-separated + // dimension values. + // Each dimension value is one of: + // * a constant, which means that constant not equal to a specific input + // * ?, which means an unknown dim size not equal to a specific input + // * d0_0|d1_2, indicating that the dim size must be equal to one of + // the given input dimensions; the first number is the input # and + // the second is which dimension in that input it corresponds to. + // can be "e"; this is used to indicate that shape inference + // should have failed. + static absl::Status InferShapes(ShapeInferenceTestOp op, const string& ins, + const string& expected_outs); + + private: + ShapeInferenceTestutil() = default; + + // Makes a shape out of 'spec'. + static absl::Status MakeShapeFromString( + InferenceContext::ShapeManager* manager, const string& spec, + ShapeHandle* output); +}; + +} // namespace shape_inference + +#define INFER_OK(op, i, o) \ + EXPECT_EQ(tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \ + op, i, o), \ + absl::OkStatus()) + +#define INFER_ERROR(error_substring, op, i) \ + { \ + absl::Status status = \ + (tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \ + op, i, "e")); \ + std::string error_message = status.ToString(); \ + EXPECT_NE(status, absl::OkStatus()); \ + EXPECT_TRUE(absl::StrContains(error_message, error_substring)) \ + << "Expected to see '" << error_substring << "' in '" << error_message \ + << "'"; \ + } + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/shared_ptr_variant.h b/third_party/tflite-hdrs/tensorflow/core/framework/shared_ptr_variant.h new file mode 100644 index 00000000..337d51d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/shared_ptr_variant.h @@ -0,0 +1,75 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_SHARED_PTR_VARIANT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_SHARED_PTR_VARIANT_H_ + +#include + +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +template +struct SharedPtrVariant { + std::shared_ptr shared_ptr; + + SharedPtrVariant() : shared_ptr() {} + + explicit SharedPtrVariant(std::shared_ptr&& ptr) + : shared_ptr(std::forward(ptr)) { + VLOG(3) << "Creating shared_ptr of " << shared_ptr.get() + << " count is: " << shared_ptr.use_count(); + } + + SharedPtrVariant(SharedPtrVariant&& rhs) + : shared_ptr(std::move(rhs.shared_ptr)) { + VLOG(3) << "Moving SharedPtrVariant of " << shared_ptr.get() + << " count is: " << shared_ptr.use_count(); + } + + SharedPtrVariant& operator=(const SharedPtrVariant& rhs) = delete; + + SharedPtrVariant& operator=(SharedPtrVariant&& rhs) { + if (&rhs == this) return *this; + std::swap(shared_ptr, rhs.shared_ptr); + VLOG(3) << "Move-assign of SharedPtrVariant of " << shared_ptr.get() + << " count is: " << shared_ptr.use_count(); + return *this; + } + + SharedPtrVariant(const SharedPtrVariant& rhs) : shared_ptr(rhs.shared_ptr) { + VLOG(3) << "Copying SharedPtrVariant of " << shared_ptr.get() + << " count is: " << shared_ptr.use_count(); + } + + ~SharedPtrVariant() { + VLOG(3) << "Destroying SharedPtrVariant of " << shared_ptr.get() + << " count is: " << shared_ptr.use_count(); + } + + void Encode(VariantTensorData*) const { + // Not supported. + } + + bool Decode(const VariantTensorData&) { + return false; // Not supported. + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_SHARED_PTR_VARIANT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/stats_aggregator.h b/third_party/tflite-hdrs/tensorflow/core/framework/stats_aggregator.h new file mode 100644 index 00000000..5b89a82f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/stats_aggregator.h @@ -0,0 +1,98 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_ + +#include +#include + +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +class Summary; +class SummaryWriterInterface; +namespace data { + +// A `StatsAggregator` accumulates statistics incrementally. A +// `StatsAggregator` can accumulate multiple different statistics, distinguished +// by a string name. +// +// The class currently supports accumulating `Histogram`, `scalar` objects and +// tfstreamz metrics, and we expect to add other methods in future. 
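For the SharedPtrVariant wrapper defined just above, a short illustration (the payload type MyState is hypothetical). The wrapper only manages the shared_ptr's reference count; Encode() is a no-op and Decode() always fails, so values of this type cannot round-trip through VariantTensorData.

```c++
// Illustration only; MyState is a hypothetical payload type.
#include <memory>
#include <utility>

#include "tensorflow/core/framework/shared_ptr_variant.h"

struct MyState {
  int value = 0;
};

void SharedPtrVariantDemo() {
  tensorflow::SharedPtrVariant<MyState> wrapped(std::make_shared<MyState>());
  wrapped.shared_ptr->value = 42;

  // Copying shares ownership (use_count becomes 2) ...
  tensorflow::SharedPtrVariant<MyState> copy(wrapped);
  // ... while moving transfers it and leaves `wrapped` empty.
  tensorflow::SharedPtrVariant<MyState> moved(std::move(wrapped));

  (void)copy;
  (void)moved;
}
```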
+// +// NOTE(mrry): `StatsAggregator` is a virtual interface because we anticipate +// that many different implementations will have the same interface. For +// example, we have different implementations in "stats_aggregator_ops.cc" for +// simple in-memory implementation that integrates with the pull-based summary +// API, and for the push-based `SummaryWriterInterface`, and we may add +// implementations that work well with other custom monitoring services. +class StatsAggregator { + public: + virtual ~StatsAggregator() {} + + // Add the given `values` to the histogram with the given `name`. Each + // element of `values` will be treated as a separate sample in the histogram. + virtual void AddToHistogram(const string& name, + absl::Span values, + int64_t global_step) = 0; + + // TODO(shivaniagrawal): consistency in double and float usage. + // Add the given `value` as Scalar with the given `name`. + virtual void AddScalar(const string& name, float value, + int64_t global_step) = 0; + + // Stores a protocol buffer representation of the aggregator state in the + // given `out_summary`. + virtual void EncodeToProto(Summary* out_summary) = 0; + + // Sets a `summary_writer` with this stats_aggregator. + virtual absl::Status SetSummaryWriter( + SummaryWriterInterface* summary_writer) = 0; + + // Increment the `label` cell of metrics mapped with `name` by given `value`. + virtual void IncrementCounter(const string& name, const string& label, + int64_t val) = 0; +}; + +// A `StatsAggregatorResource` wraps a sharable `StatsAggregator` as a resource +// in the TensorFlow resource manager. +// +// NOTE(mrry): This class is separate from `StatsAggregator` in order to +// simplify the memory management of the shared object. Most users of +// `StatsAggregator` interact with a `std::shared_ptr` whereas +// the `ResourceBase` API requires explicit reference counting. +class StatsAggregatorResource : public ResourceBase { + public: + // Creates a new resource from the given `stats_aggregator`. + StatsAggregatorResource(std::unique_ptr stats_aggregator) + : stats_aggregator_(stats_aggregator.release()) {} + + // Returns the wrapped `StatsAggregator`. + std::shared_ptr stats_aggregator() const { + return stats_aggregator_; + } + + string DebugString() const override { return "StatsAggregatorResource"; } + + private: + const std::shared_ptr stats_aggregator_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor.h new file mode 100644 index 00000000..8f80ea7c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor.h @@ -0,0 +1,1104 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
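A usage sketch against the StatsAggregator interface above: the object behind the shared_ptr is whichever implementation the pipeline has installed, the metric names are examples, and AddToHistogram's span element type is assumed to be double, matching the scalar overloads.

```c++
// Sketch; assumes AddToHistogram accepts a span of doubles.
#include <cstdint>
#include <memory>

#include "tensorflow/core/framework/stats_aggregator.h"

void RecordIterationStats(
    const std::shared_ptr<tensorflow::data::StatsAggregator>& stats,
    int64_t global_step) {
  stats->AddScalar("batch_size", 32.0f, global_step);
  const double latencies_ms[] = {1.5, 2.0, 7.25};
  stats->AddToHistogram("latency_ms", latencies_ms, global_step);
  stats->IncrementCounter("records_produced", "train", 1);
}
```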
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ + +#include +#include +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Forward declarations. In particular, we forward declare protos so that their +// symbols can be removed from .so exports. +class AllocationDescription; +class OpKernelContext; +class Tensor; +class TensorBuffer; +class TensorCApi; +class TensorInterface; +class TensorCord; +class TensorDescription; +class TensorProto; +class Var; + +namespace batch_util { +absl::Status CopyElementToSlice(Tensor element, Tensor* parent, int64_t index); +absl::Status CopySliceToElement(const Tensor& parent, Tensor* element, + int64_t index); +absl::Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, + int64_t index); +absl::Status CopyContiguousSlices(const Tensor& src, int64_t src_offset, + int64_t dst_offset, int64_t num_slices, + Tensor* dst); +absl::Status MaybeMoveContiguousSlices(Tensor& src, int64_t src_offset, + int64_t dst_offset, int64_t num_slices, + Tensor* dst); +} // namespace batch_util + +/// @ingroup core + +/// Interface to access the raw ref-counted data buffer. +class TensorBuffer : public core::RefCounted { + public: + explicit TensorBuffer(void* data_ptr) : data_(data_ptr) {} + ~TensorBuffer() override {} + + /// \brief data() points to a memory region of size() bytes. + /// + /// NOTE(mrry): The `data()` method is not virtual for performance reasons. + /// It can be called multiple times when the contents of a `Tensor` are + /// accessed, and so making it non-virtual allows the body to be inlined. + void* data() const { return data_; } + + /// \brief Size (in bytes) of the buffer. + virtual size_t size() const = 0; + + /// \brief If this TensorBuffer is sub-buffer of another TensorBuffer, + /// returns that TensorBuffer. Otherwise, returns this. + virtual TensorBuffer* root_buffer() = 0; + + /// \brief Fills metadata about the allocation into the proto. + virtual void FillAllocationDescription( + AllocationDescription* proto) const = 0; + + virtual bool GetAllocatedBytes(size_t* out_bytes) const; + + /// \brief Helper method to reinterpret the buffer as an array of `T`. + template + T* base() const { + return reinterpret_cast(data()); + } + + /// \brief Whether this TensorBuffer owns the underlying memory. + virtual bool OwnsMemory() const { return true; } + + /// \brief The type of the underlying memory. + virtual AllocatorMemoryType GetMemoryType() const { + return AllocatorMemoryType::kUnknown; + } + + private: + void* const data_; +}; + +/// Represents an n-dimensional array of values. +class Tensor { + public: + /// \brief Creates a 1-dimensional, 0-element float tensor. + /// + /// The returned Tensor is not a scalar (shape {}), but is instead + /// an empty one-dimensional Tensor (shape {0}, NumElements() == + /// 0). 
Since it has no elements, it does not need to be assigned a + /// value and is initialized by default (IsInitialized() is + /// true). If this is undesirable, consider creating a one-element + /// scalar which does require initialization: + /// + /// ```c++ + /// + /// Tensor(DT_FLOAT, TensorShape({})) + /// + /// ``` + Tensor(); + + /// \brief Creates a Tensor of the given `type` and `shape`. If + /// LogMemory::IsEnabled() the allocation is logged as coming from + /// an unknown kernel and step. Calling the Tensor constructor + /// directly from within an Op is deprecated: use the + /// OpKernelConstruction/OpKernelContext allocate_* methods to + /// allocate a new tensor, which record the kernel and step. + /// + /// The underlying buffer is allocated using a `CPUAllocator`. + Tensor(DataType type, const TensorShape& shape); + + /// \brief Creates a tensor with the input `type` and `shape`, using + /// the allocator `a` to allocate the underlying buffer. If + /// LogMemory::IsEnabled() the allocation is logged as coming from + /// an unknown kernel and step. Calling the Tensor constructor + /// directly from within an Op is deprecated: use the + /// OpKernelConstruction/OpKernelContext allocate_* methods to + /// allocate a new tensor, which record the kernel and step. + /// + /// `a` must outlive the lifetime of this Tensor. + Tensor(Allocator* a, DataType type, const TensorShape& shape); + + /// \brief Creates a tensor with the input `type` and `shape`, using + /// the allocator `a` and the specified "allocation_attr" to + /// allocate the underlying buffer. If the kernel and step are known + /// allocation_attr.allocation_will_be_logged should be set to true + /// and LogMemory::RecordTensorAllocation should be called after the + /// tensor is constructed. Calling the Tensor constructor directly + /// from within an Op is deprecated: use the + /// OpKernelConstruction/OpKernelContext allocate_* methods to + /// allocate a new tensor, which record the kernel and step. + /// + /// `a` must outlive the lifetime of this Tensor. + Tensor(Allocator* a, DataType type, const TensorShape& shape, + const AllocationAttributes& allocation_attr); + + /// \brief Creates a tensor with the input datatype, shape and buf. + /// + /// Acquires a ref on buf that belongs to this Tensor. + Tensor(DataType type, const TensorShape& shape, TensorBuffer* buf); + + /// \brief Creates a tensor with the input datatype, shape and buf. + /// + /// Takes an ownership of the bufffer from the reference counted pointer. + Tensor(DataType type, TensorShape shape, core::RefCountPtr buf); + + /// \brief Creates an empty Tensor of the given data type. + /// + /// Like Tensor(), returns a 1-dimensional, 0-element Tensor with + /// IsInitialized() returning True. See the Tensor() documentation + /// for details. + explicit Tensor(DataType type); + + /// \brief Initializes a tensor with the input `type` and `shape`, or returns + /// an error and leaves `out_tensor` unmodified. This factory method should be + /// used instead of the corresponding constructor if calling code cannot + /// validate that the `DataType` is valid and supported. + /// + /// The underlying buffer is allocated using a `CPUAllocator`. + static absl::Status BuildTensor(DataType type, const TensorShape& shape, + Tensor* out_tensor); + + private: + // A tag type for selecting the `Tensor` constructor overload that creates a + // scalar tensor in host memory. 
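
A quick sketch of the constructors documented above (the shape and dtype values are arbitrary): the direct constructor is fine when the arguments are known to be valid, while the `BuildTensor` factory is preferable when the dtype or shape comes from untrusted input.

  tensorflow::Tensor t(tensorflow::DT_FLOAT, tensorflow::TensorShape({2, 3}));
  CHECK(t.IsInitialized());
  CHECK_EQ(t.NumElements(), 6);

  tensorflow::Tensor validated;
  absl::Status s = tensorflow::Tensor::BuildTensor(
      tensorflow::DT_FLOAT, tensorflow::TensorShape({2, 3}), &validated);
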
+ struct host_scalar_tag {}; + + class HostScalarTensorBufferBase; + template + struct ValueAndTensorBuffer; + + // Creates a tensor with the given scalar `value` in CPU memory. + template + Tensor(T value, host_scalar_tag tag); + + public: + // A series of specialized constructors for scalar tensors in host memory. + // + // NOTE: The `Variant` host-scalar constructor is not defined, because Variant + // is implicitly constructible from many different types, and this causes + // ambiguities with some compilers. + explicit Tensor(float scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(double scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int32_t scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint32 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int16_t scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int8_t scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(tstring scalar_value) + : Tensor(std::move(scalar_value), host_scalar_tag{}) {} + explicit Tensor(complex64 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(complex128 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int64_t scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint64 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(bool scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(quint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(quint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint32 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(bfloat16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(Eigen::half scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(ResourceHandle scalar_value) + : Tensor(std::move(scalar_value), host_scalar_tag{}) {} + + // NOTE: The `const char*` host-scalar constructor is provided as a + // convenience because otherwise passing a string literal would surprisingly + // construct a DT_BOOL tensor. + explicit Tensor(const char* scalar_value) + : Tensor(tstring(scalar_value), host_scalar_tag{}) {} + + /// Copy constructor. + Tensor(const Tensor& other); + + /// \brief Move constructor. After this call, is safely destructible + /// can be assigned to, and IsInitialized() can be called and will return + /// false. Other calls on (e.g. shape manipulation) are not valid. + Tensor(Tensor&& other); + + // Explicitly delete constructor that take a pointer (except char*) + // so that the pointer doesn't get implicitly cast to bool. + template ::value, + T>::type* = nullptr> + explicit Tensor(T* t) = delete; + + ~Tensor(); + + // I/O operators. + friend std::ostream& // NOLINT: iosfwd + operator<<(std::ostream& out, const Tensor& tensor); + + /// Returns the data type. + DataType dtype() const { return shape_.data_type(); } + + /// Returns the shape of the tensor. 
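
A short sketch of the host-scalar constructors listed above; each produces a 0-dimensional tensor in CPU memory, and a `const char*` argument is routed to `tstring` rather than `bool`.

  tensorflow::Tensor flag(true);     // DT_BOOL scalar
  tensorflow::Tensor rate(0.5f);     // DT_FLOAT scalar
  tensorflow::Tensor name("batch");  // DT_STRING scalar via the const char* overload
  CHECK_EQ(rate.dims(), 0);
  CHECK_EQ(rate.scalar<float>()(), 0.5f);
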
+ const TensorShape& shape() const { return shape_; } + + /// \brief Convenience accessor for the tensor shape. + /// + /// For all shape accessors, see comments for relevant methods of + /// `TensorShape` in `tensor_shape.h`. + int dims() const { return shape().dims(); } + + /// Convenience accessor for the tensor shape. + int64_t dim_size(int d) const { return shape().dim_size(d); } + + /// Convenience accessor for the tensor shape. + int64_t NumElements() const { return shape().num_elements(); } + + bool IsSameSize(const Tensor& b) const { + return shape().IsSameSize(b.shape()); + } + + // True iff the two tensors use the same underlying refcounted storage + bool SharesBufferWith(const Tensor& b) const; + + /// \brief If necessary, has this Tensor been initialized? + /// + /// Zero-element Tensors are always considered initialized, even if they + /// have never been assigned to and do not have any memory allocated. + bool IsInitialized() const; + + /// Returns the estimated memory usage of this tensor. + size_t TotalBytes() const; + + // Returns the size of allocated memory for this tensor. + size_t AllocatedBytes() const; + + /// Returns true iff this tensor is aligned. + bool IsAligned() const { +#if EIGEN_MAX_ALIGN_BYTES == 0 + return true; +#else + void* ptr = base(); + return dtype() == DT_STRING || NumElements() == 0 || + (reinterpret_cast(ptr) % EIGEN_MAX_ALIGN_BYTES == 0); +#endif + } + + /// Assign operator. This tensor shares other's underlying storage. + Tensor& operator=(const Tensor& other) { + CopyFromInternal(other, other.shape()); + return *this; + } + + /// Move operator. See move constructor for details. + Tensor& operator=(Tensor&& other); + + /// \brief Copy the other tensor into this tensor and reshape it. + /// + /// This tensor shares other's underlying storage. Returns `true` + /// iff `other.shape()` has the same number of elements of the given + /// `shape`. + bool CopyFrom(const Tensor& other, + const TensorShape& shape) TF_MUST_USE_RESULT { + if (other.NumElements() != shape.num_elements()) return false; + CopyFromInternal(other, shape); + return true; + } + + /// \brief Slice this tensor along the 1st dimension. + + /// I.e., the returned tensor satisfies + /// returned[i, ...] == this[dim0_start + i, ...]. + /// The returned tensor shares the underlying tensor buffer with this + /// tensor. + /// + /// NOTE: The returned tensor may not satisfy the same alignment + /// requirement as this tensor depending on the shape. The caller + /// must check the returned tensor's alignment before calling certain + /// methods that have alignment requirement (e.g., `flat()`, `tensor()`). + /// + /// NOTE: When fed with an N-dimensional tensor, this method returns a tensor + /// also with N dimensions. If you want to select a sub tensor, see SubSlice. + /// + /// REQUIRES: `dims()` >= 1 + /// REQUIRES: `0 <= dim0_start <= dim0_limit <= dim_size(0)` + Tensor Slice(int64_t dim0_start, int64_t dim0_limit) const; + + /// \brief Select a subslice from this tensor along the 1st dimension. + /// + /// When fed with an N-dimensional tensor, this method returns a tensor with + /// N-1 dimensions, where the returned tensor is a subslice of the input + /// tensor along the first dimension. The N-1 dimensions of the returned + /// tensor are the last N-1 dimensions of the input tensor. + /// + /// NOTE: The returned tensor may not satisfy the same alignment + /// requirement as this tensor depending on the shape. 
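
A sketch of `CopyFrom` as documented above: the destination shares the source buffer and only the shape metadata changes, so the call returns false when the element counts differ.

  tensorflow::Tensor src(tensorflow::DT_INT32, tensorflow::TensorShape({2, 3}));
  tensorflow::Tensor dst;
  bool ok = dst.CopyFrom(src, tensorflow::TensorShape({3, 2}));  // same 6 elements
  CHECK(ok);
  CHECK(dst.SharesBufferWith(src));
  CHECK_EQ(dst.dim_size(0), 3);
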
The caller + /// must check the returned tensor's alignment before calling certain + /// methods that have alignment requirement (e.g., `flat()`, `tensor()`). + /// + /// REQUIRES: `dims()` >= 1 + /// REQUIRES: `0 <= index < dim_size(0)` + Tensor SubSlice(int64_t index) const; + + /// \brief Parse `other` and construct the tensor. + + /// Returns `true` iff the parsing succeeds. If the parsing fails, + /// the state of `*this` is unchanged. + bool FromProto(const TensorProto& other) TF_MUST_USE_RESULT; + bool FromProto(Allocator* a, const TensorProto& other) TF_MUST_USE_RESULT; + + /// \brief Fills in `proto` with `*this` tensor's content. + /// + /// `AsProtoField()` fills in the repeated field for `proto.dtype()`, while + /// `AsProtoTensorContent()` encodes the content in `proto.tensor_content()` + /// in a compact form. + void AsProtoField(TensorProto* proto) const; + void AsProtoTensorContent(TensorProto* proto) const; + + /// \brief Return the tensor data as an `Eigen::Tensor` with the type and + /// sizes of this `Tensor`. + /// + /// Use these methods when you know the data type and the number of + /// dimensions of the Tensor and you want an `Eigen::Tensor` + /// automatically sized to the `Tensor` sizes. The implementation check + /// fails if either type or sizes mismatch. + /// + /// Example: + /// + /// ```c++ + /// + /// typedef float T; + /// Tensor my_mat(...built with Shape{rows: 3, cols: 5}...); + /// auto mat = my_mat.matrix(); // 2D Eigen::Tensor, 3 x 5. + /// auto mat = my_mat.tensor(); // 2D Eigen::Tensor, 3 x 5. + /// auto vec = my_mat.vec(); // CHECK fails as my_mat is 2D. + /// auto vec = my_mat.tensor(); // CHECK fails as my_mat is 2D. + /// auto mat = my_mat.matrix();// CHECK fails as type mismatch. + /// + /// ``` + template + typename TTypes::Vec vec() { + return tensor(); + } + + template + typename TTypes::Matrix matrix() { + return tensor(); + } + + template + typename TTypes::Tensor tensor() TF_ATTRIBUTE_NOINLINE; + + /// \brief Return the tensor data to an `Eigen::Tensor` with the + /// same size but a bitwise cast to the specified dtype `T`. + /// + /// Using a bitcast is useful for move and copy operations. + /// NOTE: this is the same as `tensor()` except a bitcast is allowed. + template + typename TTypes::Tensor bit_casted_tensor(); + + /// \brief Return the tensor data to an `Eigen::Tensor` with the + /// last dimension elements converted into single elements of a larger type. + /// + /// For example, this is useful for kernels that can treat NCHW_VECT_C int8 + /// tensors as NCHW int32 tensors. The sizeof(T) should equal the size of + /// the original element type * num elements in the original last dimension. + /// NDIMS should be 1 less than the original number of dimensions. + template + typename TTypes::Tensor reinterpret_last_dimension(); + + /// \brief Return the tensor data as an `Eigen::Tensor` of the data type and a + /// specified shape. + /// + /// These methods allow you to access the data with the dimensions + /// and sizes of your choice. You do not need to know the number of + /// dimensions of the Tensor to call them. However, they `CHECK` that + /// the type matches and the dimensions requested creates an + /// `Eigen::Tensor` with the same number of elements as the tensor. 
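
A sketch of `Slice` and `SubSlice` on a hypothetical batch tensor; both share the original buffer, and the alignment caveat above is why the sketch checks `IsAligned()` before taking an Eigen view.

  tensorflow::Tensor batch(tensorflow::DT_FLOAT,
                           tensorflow::TensorShape({8, 28, 28}));
  tensorflow::Tensor rows = batch.Slice(2, 5);  // shape {3, 28, 28}
  tensorflow::Tensor one = batch.SubSlice(0);   // shape {28, 28}
  CHECK_EQ(rows.dim_size(0), 3);
  CHECK_EQ(one.dims(), 2);
  if (one.IsAligned()) {
    auto image = one.matrix<float>();  // safe: alignment was checked
  }
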
+ /// + /// Example: + /// + /// ```c++ + /// + /// typedef float T; + /// Tensor my_ten(...built with Shape{planes: 4, rows: 3, cols: 5}...); + /// // 1D Eigen::Tensor, size 60: + /// auto flat = my_ten.flat(); + /// // 2D Eigen::Tensor 12 x 5: + /// auto inner = my_ten.flat_inner_dims(); + /// // 2D Eigen::Tensor 4 x 15: + /// auto outer = my_ten.shaped({4, 15}); + /// // CHECK fails, bad num elements: + /// auto outer = my_ten.shaped({4, 8}); + /// // 3D Eigen::Tensor 6 x 5 x 2: + /// auto weird = my_ten.shaped({6, 5, 2}); + /// // CHECK fails, type mismatch: + /// auto bad = my_ten.flat(); + /// + /// ``` + template + typename TTypes::Flat flat(); + + template + typename TTypes::UnalignedFlat unaligned_flat() { + return unaligned_shaped({NumElements()}); + } + + /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing all + /// Tensor dimensions but the last NDIMS-1 into the first dimension of the + /// result. If NDIMS > dims() then leading dimensions of size 1 will be + /// added to make the output rank NDIMS. + template + typename TTypes::Tensor flat_inner_dims(); + + /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing all + /// Tensor dimensions but the first NDIMS-1 into the last dimension of the + /// result. If NDIMS > dims() then trailing dimensions of size 1 will be + /// added to make the output rank NDIMS. + template + typename TTypes::Tensor flat_outer_dims(); + + /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing the + /// first 'begin' Tensor dimensions into the first dimension of the result and + /// the Tensor dimensions of the last dims() - 'begin' - NDIMS into the last + /// dimension of the result. If 'begin' < 0 then the |'begin'| leading + /// dimensions of size 1 will be added. If 'begin' + NDIMS > dims() then + /// 'begin' + NDIMS - dims() trailing dimensions of size 1 will be added. + template + typename TTypes::Tensor flat_inner_outer_dims(int64_t begin); + + template + typename TTypes::Tensor shaped(absl::Span new_sizes); + + /// \brief Return the tensor data to an `Eigen::Tensor` with the new + /// shape specified in `new_sizes` and cast to a new dtype `T`. + /// + /// Using a bitcast is useful for move and copy operations. + /// The allowed bitcast is the only difference from `shaped()`. + template + typename TTypes::Tensor bit_casted_shaped( + absl::Span new_sizes); + + template + typename TTypes::UnalignedTensor unaligned_shaped( + absl::Span new_sizes); + + /// \brief Return the Tensor data as a `TensorMap` of fixed size 1: + /// `TensorMap>`. + + /// Using `scalar()` allows the compiler to perform optimizations as + /// the size of the tensor is known at compile time. + template + typename TTypes::Scalar scalar(); + + /// Const versions of all the methods above. + template + typename TTypes::ConstVec vec() const { + return tensor(); + } + + template + typename TTypes::ConstMatrix matrix() const { + return tensor(); + } + + template + typename TTypes::ConstTensor tensor() const TF_ATTRIBUTE_NOINLINE; + + /// \brief Return the tensor data to an `Eigen::Tensor` with the + /// same size but a bitwise cast to the specified dtype `T`. + /// + /// Using a bitcast is useful for move and copy operations. + /// NOTE: this is the same as `tensor()` except a bitcast is allowed. + template + typename TTypes::ConstTensor bit_casted_tensor() const; + + /// \brief Return the tensor data to an `Eigen::Tensor` with the + /// last dimension elements converted into single elements of a larger type. 
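
A sketch of the reshaping accessors documented above; all of them are views over the same buffer, so a write through one view is visible through the others.

  tensorflow::Tensor t(tensorflow::DT_FLOAT, tensorflow::TensorShape({4, 3, 5}));
  auto flat = t.flat<float>();                 // 1-D view, 60 elements
  auto outer = t.shaped<float, 2>({4, 15});    // 2-D view, 4 x 15
  auto inner = t.flat_inner_dims<float, 2>();  // 2-D view, 12 x 5
  flat(0) = 1.0f;
  CHECK_EQ(outer(0, 0), 1.0f);
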
+ /// + /// For example, this is useful for kernels that can treat NCHW_VECT_C int8 + /// tensors as NCHW int32 tensors. The sizeof(T) should equal the size of + /// the original element type * num elements in the original last dimension. + /// NDIMS should be 1 less than the original number of dimensions. + template + typename TTypes::ConstTensor reinterpret_last_dimension() const; + + template + typename TTypes::ConstFlat flat() const; + + template + typename TTypes::UnalignedConstFlat unaligned_flat() const { + return unaligned_shaped({NumElements()}); + } + + template + typename TTypes::ConstTensor shaped( + absl::Span new_sizes) const; + + /// \brief Return the tensor data to an `Eigen::Tensor` with the new + /// shape specified in `new_sizes` and cast to a new dtype `T`. + /// + /// Using a bitcast is useful for move and copy operations. + /// The allowed bitcast is the only difference from `shaped()`. + template + typename TTypes::ConstTensor bit_casted_shaped( + absl::Span new_sizes) const; + + template + typename TTypes::UnalignedConstTensor unaligned_shaped( + absl::Span new_sizes) const; + + template + typename TTypes::ConstScalar scalar() const; + + template + typename TTypes::ConstTensor flat_inner_dims() const; + + template + typename TTypes::ConstTensor flat_outer_dims() const; + + template + typename TTypes::ConstTensor flat_inner_outer_dims( + int64_t begin) const; + + /// Render the first `max_entries` values in `*this` into a string. + std::string SummarizeValue(int64_t max_entries, bool print_v2 = false) const; + + /// A human-readable summary of the tensor suitable for debugging. + // `num_values` is the number of actual data values in the tensor + // included in the message. If the tensor might be resident in + // GPU/TPU memory use DeviceSafeDebugString instead. + std::string DebugString(int num_values) const; + std::string DebugString() const { return DebugString(3); } + + // Variant of DebugString() that should be used for possibly non-CPU tensors. + // If the tensor is not resident on CPU, we can't read its values as + // DebugString() does. + std::string DeviceSafeDebugString() const; + + /// Fill in the `TensorDescription` proto with metadata about the + /// tensor that is useful for monitoring and debugging. + void FillDescription(TensorDescription* description) const; + + /// \brief Returns a `StringPiece` mapping the current tensor's buffer. + /// + /// The returned `StringPiece` may point to memory location on devices + /// that the CPU cannot address directly. + /// + /// NOTE: The underlying tensor buffer is refcounted, so the lifetime + /// of the contents mapped by the `StringPiece` matches the lifetime of + /// the buffer; callers should arrange to make sure the buffer does + /// not get destroyed while the `StringPiece` is still used. + /// + /// REQUIRES: `DataTypeCanUseMemcpy(dtype())`. + absl::string_view tensor_data() const; + void* data() const; + + /// Copy the other tensor into this tensor, reshape it and reinterpret the + /// buffer's datatype. If an ok Status is returned, the two tensors now share + /// the same underlying storage. + /// + /// This call requires that the `other` tensor and the given type and shape + /// are "compatible" (i.e. they occupy the same number of bytes). 
+ /// + /// Specifically: + /// + /// shape.num_elements() * DataTypeSize(type) + /// + /// must equal + /// + /// other.num_elements() * DataTypeSize(other.dtype()) + /// + /// In addition, this function requires: + /// * DataTypeSize(other.dtype()) != 0 + /// * DataTypeSize(type) != 0 + /// + /// If any of the requirements are not met, errors::InvalidArgument is + /// returned. + absl::Status BitcastFrom(const Tensor& other, DataType dtype, + const TensorShape& shape); + + /// Like BitcastFrom, but CHECK fails if any preconditions are not met. + /// + /// Deprecated. Use BitcastFrom instead and check the returned Status. + void UnsafeCopyFromInternal(const Tensor& other, DataType dtype, + const TensorShape& shape) { + TF_CHECK_OK(BitcastFrom(other, dtype, shape)); + } + + // Returns true if the refcount on buf_ and any possible underlying root + // buffer is one. + bool RefCountIsOne() const; + + // Experimental. Returns the refcount on buf_ if it points to a regular + // TensorBuffer. If buf_ points to a SubBuffer, returns -1. + int RefCount() const; + + // Returns the type of the underlying memory. + AllocatorMemoryType GetMemoryType() const { return buf_->GetMemoryType(); } + + private: + void CheckType(DataType expected_dtype) const; + void CheckTypeAndIsAligned(DataType expected_dtype) const; + void CheckIsAlignedAndSingleElement() const; + void set_dtype(DataType t) { shape_.set_data_type(t); } + + // TensorShape's InlineVector. + static absl::InlinedVector ComputeFlatInnerDims( + absl::Span orig, int64_t num_out_dims); + static absl::InlinedVector ComputeFlatOuterDims( + absl::Span orig, int64_t num_out_dims); + + TensorShape shape_; + TensorBuffer* buf_; + + friend class DMAHelper; // For access to buf_. + friend class TensorCApi; // For access to buf_. + friend class TensorCord; // For access to buf_. + friend class TensorReference; // For access to buf_. + friend class VariableOp; // For access to set_shape. + friend class AutoReloadVariableOp; // For access to set_shape. + friend class TensorTestHelper; // For access to set_shape. + friend class TensorInterface; // For access to set_shape. + friend class CastOpBase; // For access to set_dtype. + friend class ScopedAllocator; // For access to buf_. + friend class PjRtTensorBufferUtil; // For access to buf_. + friend absl::Status batch_util::CopyElementToSlice( + Tensor element, Tensor* parent, + int64_t index); // For access to base(). + friend absl::Status batch_util::CopySliceToElement( + const Tensor& parent, Tensor* element, + int64_t index); // For access to base(). + friend absl::Status batch_util::MaybeMoveSliceToElement( + Tensor* parent, Tensor* element, + int64_t index); // For access to base(). + friend absl::Status batch_util::CopyContiguousSlices( + const Tensor& src, int64_t src_offset, int64_t dst_offset, + int64_t num_slices, + Tensor* dst); // For access to base(). + friend absl::Status batch_util::MaybeMoveContiguousSlices( + Tensor& src, int64_t src_offset, int64_t dst_offset, int64_t num_slices, + Tensor* dst); // For access to base(). + + bool CanUseDMA() const; + + // Only needed by variable op to set the shape of an uninitialized + // Tensor. + // TODO: Remove this when we have a better story for detecting + // uninitialized tensors. 
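
A sketch of `BitcastFrom` as documented above: the same 16 bytes are reinterpreted under a different dtype, and the call returns `InvalidArgument` instead of crashing when the byte counts do not line up.

  tensorflow::Tensor floats(tensorflow::DT_FLOAT, tensorflow::TensorShape({4}));
  tensorflow::Tensor ints;
  absl::Status s = ints.BitcastFrom(floats, tensorflow::DT_INT32,
                                    tensorflow::TensorShape({4}));
  CHECK(s.ok());
  CHECK(ints.SharesBufferWith(floats));
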
+ void set_shape(const TensorShape& shape) { + DataType dt = dtype(); + shape_ = shape; + set_dtype(dt); + } + + inline void CopyFromInternal(const Tensor& other, const TensorShape& shape) { + DCHECK_EQ(shape.num_elements(), other.NumElements()); + // Data type will be overwritten if this == &other, since dtype is part of + // shape. + DataType other_dtype = other.dtype(); + shape_ = shape; + set_dtype(other_dtype); + if (buf_ != other.buf_) { + if (buf_) buf_->Unref(); + buf_ = other.buf_; + if (buf_) buf_->Ref(); + } + } + + template + T* base() const; + + template + void FillDimsAndValidateCompatibleShape( + absl::Span new_sizes, + Eigen::array* dims) const; + + template + void FillDimsAndValidateCompatibleShape( + absl::Span new_sizes, + Eigen::array* dims) const; +}; + +// Implementation details + +// START_SKIP_DOXYGEN + +template +T* Tensor::base() const { + return buf_ == nullptr ? nullptr : buf_->base(); +} + +// This routine is defined out of line for code-space savings +template +typename TTypes::Tensor Tensor::tensor() { + CheckTypeAndIsAligned(DataTypeToEnum::v()); + return typename TTypes::Tensor(base(), + shape().AsEigenDSizes()); +} + +// This routine is defined out of line for code-space savings +template +typename TTypes::ConstTensor Tensor::tensor() const { + CheckTypeAndIsAligned(DataTypeToEnum::v()); + return typename TTypes::ConstTensor(base(), + shape().AsEigenDSizes()); +} + +template +typename TTypes::Tensor Tensor::bit_casted_tensor() { + CHECK(IsAligned()); + return typename TTypes::Tensor(base(), + shape().AsEigenDSizes()); +} + +template +typename TTypes::ConstTensor Tensor::bit_casted_tensor() const { + CHECK(IsAligned()); + return typename TTypes::ConstTensor(base(), + shape().AsEigenDSizes()); +} + +template +typename TTypes::Tensor Tensor::reinterpret_last_dimension() { + if (NDIMS == dims()) { + return tensor(); + } + CHECK(IsAligned()); + CHECK_EQ(static_cast(NDIMS), dims() - 1); + CHECK_EQ(static_cast(sizeof(T)), + shape_.dim_sizes()[NDIMS] * DataTypeSize(dtype())); + Eigen::array dims; + for (int d = 0; d < NDIMS; ++d) { + dims[d] = shape_.dim_sizes()[d]; + } + return typename TTypes::Tensor(base(), dims); +} + +template +typename TTypes::ConstTensor Tensor::reinterpret_last_dimension() + const { + if (NDIMS == dims()) { + return tensor(); + } + CHECK(IsAligned()); + CHECK_EQ(static_cast(NDIMS), dims() - 1); + CHECK_EQ(static_cast(sizeof(T)), + shape_.dim_sizes()[NDIMS] * DataTypeSize(dtype())); + Eigen::array dims; + for (int d = 0; d < NDIMS; ++d) { + dims[d] = shape_.dim_sizes()[d]; + } + return typename TTypes::ConstTensor(base(), dims); +} + +template +void Tensor::FillDimsAndValidateCompatibleShape( + absl::Span new_sizes, + Eigen::array* dims) const { + CHECK_EQ(NDIMS, new_sizes.size()); + int64_t new_num_elements = 1; + for (size_t d = 0; d < NDIMS; d++) { + new_num_elements *= new_sizes[d]; + (*dims)[d] = new_sizes[d]; + } + CHECK_EQ(new_num_elements, NumElements()); +} + +template +void Tensor::FillDimsAndValidateCompatibleShape( + absl::Span new_sizes, + Eigen::array* dims) const { + CHECK_EQ(NDIMS, new_sizes.size()); + int64_t new_num_elements = 1; + for (size_t d = 0; d < NDIMS; d++) { + new_num_elements *= new_sizes[d]; + (*dims)[d] = new_sizes[d]; + } + const int element_size = DataTypeSize(BaseType(dtype())); + if (element_size > 0) { + CHECK_EQ(new_num_elements * static_cast(sizeof(T)), + NumElements() * element_size); + } else { + // DataTypeSize() returns 0 for some data types. 
In this case, assume that T + // has the same size as the buffer type. + // NOTE: If we can be sure that DataTypeSize() does not return 0 for all POD + // types, then we should check DataTypeToEnum::v() == dtype(). Or simply + // check if `element_size > 0` to err when bit cast is attempted on Tensor + // of unknown data type size. + CHECK_EQ(new_num_elements, NumElements()); + } +} + +template +typename TTypes::Flat Tensor::flat() { + // Equivalent to 'return shaped({NumElements()});' + CheckTypeAndIsAligned(DataTypeToEnum::v()); + Eigen::array dims; + dims[0] = NumElements(); + return typename TTypes::Tensor(base(), dims); +} + +template +typename TTypes::ConstFlat Tensor::flat() const { + // Equuivalent to 'return shaped({NumElements()});' + CheckTypeAndIsAligned(DataTypeToEnum::v()); + Eigen::array dims; + dims[0] = NumElements(); + return typename TTypes::ConstTensor(base(), dims); +} + +template +typename TTypes::Tensor Tensor::shaped( + absl::Span new_sizes) { + CheckTypeAndIsAligned(DataTypeToEnum::v()); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::Tensor(base(), dims); +} + +template +typename TTypes::Tensor Tensor::bit_casted_shaped( + absl::Span new_sizes) { + CHECK(IsAligned()); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::Tensor(base(), dims); +} + +template +typename TTypes::UnalignedTensor Tensor::unaligned_shaped( + absl::Span new_sizes) { + CheckType(DataTypeToEnum::v()); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::UnalignedTensor(base(), dims); +} + +template +typename TTypes::ConstTensor Tensor::shaped( + absl::Span new_sizes) const { + CheckType(DataTypeToEnum::v()); + CHECK(IsAligned()) << "ptr = " << base(); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::ConstTensor(base(), dims); +} + +template +typename TTypes::ConstTensor Tensor::bit_casted_shaped( + absl::Span new_sizes) const { + CHECK(IsAligned()); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::ConstTensor(base(), dims); +} + +template +typename TTypes::UnalignedConstTensor Tensor::unaligned_shaped( + absl::Span new_sizes) const { + CheckType(DataTypeToEnum::v()); + Eigen::array dims; + FillDimsAndValidateCompatibleShape(new_sizes, &dims); + return typename TTypes::UnalignedConstTensor(base(), dims); +} + +template +typename TTypes::Scalar Tensor::scalar() { + static_assert( + !std::is_same::value, + "std::string is no longer a scalar type, use tensorflow::tstring"); + CheckIsAlignedAndSingleElement(); + return typename TTypes::Scalar(base()); +} + +template +typename TTypes::ConstScalar Tensor::scalar() const { + static_assert( + !std::is_same::value, + "std::string is no longer a scalar type, use tensorflow::tstring"); + CheckIsAlignedAndSingleElement(); + return typename TTypes::ConstScalar(base()); +} + +template +typename TTypes::Tensor Tensor::flat_inner_dims() { + return shaped(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS)); +} + +template +typename TTypes::Tensor Tensor::flat_outer_dims() { + return shaped(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS)); +} + +template +typename TTypes::Tensor Tensor::flat_inner_outer_dims(int64_t begin) { + absl::InlinedVector flat_outer = + ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS); + return shaped(ComputeFlatInnerDims(flat_outer, NDIMS)); +} + +template +typename 
TTypes::ConstTensor Tensor::flat_inner_dims() const { + return shaped(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS)); +} + +template +typename TTypes::ConstTensor Tensor::flat_outer_dims() const { + return shaped(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS)); +} + +template +typename TTypes::ConstTensor Tensor::flat_inner_outer_dims( + int64_t begin) const { + absl::InlinedVector flat_outer = + ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS); + return shaped(ComputeFlatInnerDims(flat_outer, NDIMS)); +} + +inline Tensor::Tensor(const Tensor& other) + : shape_(other.shape()), buf_(other.buf_) { + if (buf_) buf_->Ref(); +} + +inline Tensor::Tensor(Tensor&& other) + : shape_(std::move(other.shape_)), buf_(other.buf_) { + other.buf_ = nullptr; +} + +class Tensor::HostScalarTensorBufferBase : public TensorBuffer { + public: + using TensorBuffer::TensorBuffer; + bool GetAllocatedBytes(size_t* out_bytes) const final; + void FillAllocationDescription(AllocationDescription* proto) const final; +}; + +// A packed representation for a single scalar value of type `T`, and a +// `TensorBuffer` implementation that describes (and manages the lifetime of) +// that value. +template +struct Tensor::ValueAndTensorBuffer { + class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase { + public: + explicit HostScalarTensorBuffer(void* data) + : HostScalarTensorBufferBase(data) {} + size_t size() const final { return sizeof(T); } + TensorBuffer* root_buffer() final { return this; } + + // Override `operator delete` so that calling `delete this` in + // `core::Refcounted::Unref()` for an object of this type will free + // the enclosing `ValueAndTensorBuffer` for the tensor buffer. + // + // NOTE(mrry): The definition of this method must be outside the class + // definition in order to satisfy some compilers. + static void operator delete(void* ptr); + + static void operator delete(void*, void*) { + // Some compilers require an overridden class-specific deallocation + // function, which will be called if placement `new` throws an + // exception. + } + + private: + ~HostScalarTensorBuffer() override { static_cast(data())->~T(); } + }; + + T value; + HostScalarTensorBuffer tensor_buffer; +}; + +/* static */ +template +void Tensor::ValueAndTensorBuffer::HostScalarTensorBuffer::operator delete( + void* ptr) { + // Use a dummy object to compute to offset of + // `ValueAndTensorBuffer::tensor_buffer`, because `offsetof()` is not + // necessarily defined on this non-POD type (until C++17). + // + // NOTE(mrry): Using `sizeof(Tensor::ValueAndTensorBuffer)` here requires + // us to define this method outside the class definition, so that it is not + // considered an incomplete type. 
+ typename std::aligned_storage), + alignof(Tensor::ValueAndTensorBuffer)>::type + dummy_storage_; + Tensor::ValueAndTensorBuffer* dummy_object = + reinterpret_cast*>(&dummy_storage_); + intptr_t offset = reinterpret_cast(&dummy_object->tensor_buffer) - + reinterpret_cast(dummy_object); + + port::AlignedFree(static_cast(ptr) - offset); +} + +template +Tensor::Tensor(T value, host_scalar_tag tag) { + auto* value_and_buf = static_cast*>( + port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer), + EIGEN_MAX_ALIGN_BYTES)); + new (&value_and_buf->value) T(std::move(value)); + new (&value_and_buf->tensor_buffer) + typename Tensor::ValueAndTensorBuffer::HostScalarTensorBuffer( + value_and_buf); + buf_ = &value_and_buf->tensor_buffer; + set_dtype(DataTypeToEnum::value); +} + +inline Tensor& Tensor::operator=(Tensor&& other) { + // Avoid self-assignment, since we might destroy our underlying buffer. + if (&other != this) { + shape_ = std::move(other.shape_); + if (buf_) buf_->Unref(); + buf_ = other.buf_; + other.buf_ = nullptr; + } + return *this; +} + +// END_SKIP_DOXYGEN + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_key.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_key.h new file mode 100644 index 00000000..3bde6fce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_key.h @@ -0,0 +1,77 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_KEY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_KEY_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +class TensorKey : public Tensor { + public: + using Tensor::Tensor; + + TensorKey(const Tensor& t) : Tensor(t) {} + + // Equality operator. Needed for absl hashing. + friend bool operator==(const TensorKey& t1, const TensorKey& t2) { + if (t1.dtype() != t2.dtype() || t1.shape() != t2.shape()) { + return false; + } + if (DataTypeCanUseMemcpy(t1.dtype())) { + return t1.tensor_data() == t2.tensor_data(); + } else if (t1.dtype() == DT_STRING) { + const auto s1 = t1.unaligned_flat(); + const auto s2 = t2.unaligned_flat(); + for (int64_t i = 0, n = t1.NumElements(); i < n; ++i) { + if (TF_PREDICT_FALSE(s1(i) != s2(i))) { + return false; + } + } + return true; + } else { + DCHECK(false) << "Unimplemented dtype " << DataTypeString(t1.dtype()) + << std::endl; + } + return false; + } + + friend bool operator!=(const TensorKey& t1, const TensorKey& t2) { + return !(t1 == t2); + } + + // Needed for absl hash function. 
+ template + friend H AbslHashValue(H h, const TensorKey& k) { + if (DataTypeCanUseMemcpy(k.dtype())) { + return H::combine(std::move(h), k.tensor_data()); + } else if (k.dtype() == DT_STRING) { + const auto strs = k.unaligned_flat(); + for (int64_t i = 0, n = k.NumElements(); i < n; ++i) { + h = H::combine(std::move(h), strs(i)); + } + return h; + } else { + DCHECK(false) << "Unimplemented dtype " << DataTypeString(k.dtype()) + << std::endl; + } + return h; + } +}; + +} // namespace tensorflow + +#endif diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_matcher.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_matcher.h new file mode 100644 index 00000000..e89cfc15 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_matcher.h @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_MATCHER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_MATCHER_H_ + +#include +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace test { + +// Matcher for tensorflow::Tensor instances. Two tensors match iff +// +// - their dtypes are equal, +// - their shapes are equal, +// - and their contents are equal. +// +// Their contents are matched by ::testing::Pointwise() after calling .flat() +// method where the type T satisfies: +// +// ::tensorflow::DataTypeToEnum::value == dtype +// +// Use this like: +// +// EXPECT_THAT(lhs, TensorEq(rhs)); +// +// All POD types and DT_STRING type tensors are supported. Note that this +// utility requires Tensors to point to CPU memory. +class TensorEq { + public: + explicit TensorEq(const tensorflow::Tensor& target) : target_(target) {} + + // Matchers depend on implicit casts. Do not make explicit. + operator ::testing::Matcher() const; // NOLINT + + private: + const tensorflow::Tensor& target_; +}; + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_MATCHER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_reference.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_reference.h new file mode 100644 index 00000000..59ccd281 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_reference.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
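
Because `TensorKey` defines both `operator==` and `AbslHashValue`, it can be used directly as the key of an `absl::flat_hash_map`. A small sketch, with an illustrative map and contents:

  #include "absl/container/flat_hash_map.h"

  absl::flat_hash_map<tensorflow::TensorKey, int> counts;
  tensorflow::Tensor key(tensorflow::DT_INT32, tensorflow::TensorShape({2}));
  key.flat<int32_t>()(0) = 1;
  key.flat<int32_t>()(1) = 2;
  counts[tensorflow::TensorKey(key)] += 1;
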
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_REFERENCE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_REFERENCE_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace tensorflow { + +// An opaque class that holds a reference to an underlying TensorBuffer. +// Unlike Tensor, it does not have any shape or type information, so +// it is cheaper to construct/move, but the only thing you can really do +// with it is Unref it, which releases one of the references to the underlying +// TensorBuffer. +// IMPORTANT: If you do not call Unref(), you will likely leak tensor memory. +class TensorReference { + public: + // Take the reference of the root buffer so the size will be more accurate + explicit TensorReference(const Tensor& tensor) + : buf_(tensor.buf_ ? tensor.buf_->root_buffer() : nullptr) { + if (buf_) buf_->Ref(); + } + + ~TensorReference() {} + + void Unref() const { + if (buf_) buf_->Unref(); + } + + void FillDescription(AllocationDescription* description) const { + if (buf_) buf_->FillAllocationDescription(description); + } + + private: + TensorBuffer* buf_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_REFERENCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_shape.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_shape.h new file mode 100644 index 00000000..0bcf1fc5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_shape.h @@ -0,0 +1,795 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_SHAPE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_SHAPE_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { + +// START_SKIP_DOXYGEN +template +class TensorShapeIter; +class TensorShape; +class TensorShapeProto; +class PartialTensorShape; +// END_SKIP_DOXYGEN + +/// Internal representation for both TensorShape and PartialTensorShape. +class TensorShapeRep { + public: + ~TensorShapeRep(); + + /// Copy the specified shape + TensorShapeRep(const TensorShapeRep& b); + void operator=(const TensorShapeRep& b); + + /// Move the specified shape. 
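
A sketch of the intended `TensorReference` usage: keep the underlying buffer alive across asynchronous work without copying shape or dtype metadata, then release it explicitly.

  tensorflow::Tensor t(tensorflow::DT_FLOAT, tensorflow::TensorShape({1024}));
  tensorflow::TensorReference ref(t);  // takes a ref on the root buffer
  // ... hand t's buffer to asynchronous work (e.g. a device copy) ...
  ref.Unref();  // required; otherwise the buffer leaks
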
After moving, `b` is safe for destruction and + // can be reassigned into, but its dimensions and number of elements can be + // nonsensical (e.g., negative dimension sizes, or number of elements not + // properly recomputed). + TensorShapeRep(TensorShapeRep&& b); + void operator=(TensorShapeRep&& b); + + /// Clear a tensor shape, producing the scalar shape. + void Clear(); + + // Maximum number of dimensions in a tensor. + // It's 254 because 255 = kUnknownRank is used to represent unknown rank. + static constexpr int MaxDimensions() { return 254; } + + /// \brief Returns the number of elements in the tensor. + /// + /// We use `int64` and not `size_t` to be compatible with `Eigen::Tensor` + /// which uses `ptrdiff_t`. For PartialTensorShape, -1 means not fully + /// defined. + int64_t num_elements() const { return num_elements_; } + + /// For error messages. + std::string DebugString() const; + static std::string DebugString(const TensorShapeProto& proto); + + protected: + // Constructable only via TensorShapeBase + TensorShapeRep() = default; + + void ClearAllButDataType(); + + // We use 16 bytes to represent a TensorShape. Because we need to + // be able to support full 64-bit dimension sizes and an arbitrary + // number of dimensions for a Tensor, but most tensor dimensions are + // significantly smaller than 64 bits and most tensors are 1, 2, or 3 + // dimensions, we have several representations. + // Rep16: Supports up to 6 dimensions where each dimension is < 2^16 - 1 + // Rep32: Supports up to 3 dimensions where each dimension is < 2^32 - 1 + // Rep64: Supports arbitrary dimensionality, 64-bit dimensions using + // an out of line vector. + // For PartialTensorShape, a dimension of static_cast(-1) is unknown. + // This value is not allowed in TensorShape either for format compatibility. + struct Rep16 { + uint16 dims_[6]; + }; + struct Rep32 { + uint32 dims_[3]; + }; + struct Rep64 { + absl::InlinedVector* dims_; + }; + + // We use the max value of uint16 or uint32 to represent unknown shapes, so + // the maximum representable valid shape in these representations is one less. + static constexpr int64_t kMaxRep16 = std::numeric_limits::max() - 1; + static constexpr int64_t kMaxRep32 = std::numeric_limits::max() - 1; + static constexpr uint16 kUnknownRep16 = std::numeric_limits::max(); + static constexpr uint32 kUnknownRep32 = std::numeric_limits::max(); + + Rep16* as16() { return reinterpret_cast(buf()); } + Rep32* as32() { return reinterpret_cast(buf()); } + Rep64* as64() { return reinterpret_cast(buf()); } + + const Rep16* as16() const { return reinterpret_cast(buf()); } + const Rep32* as32() const { return reinterpret_cast(buf()); } + const Rep64* as64() const { return reinterpret_cast(buf()); } + + enum RepTag { REP16 = 0, REP32 = 1, REP_OUT_OF_LINE = 2 }; + + // Since we have a convenient extra byte available, we allow the + // Tensor class to store an 8-bit value in this extra storage. This + // allows it to store the Tensor's datatype enum value here and avoid + // an extra word of storage. + friend class Tensor; + friend class TensorShapeTestHelper; + DataType data_type() const { return static_cast(buf()[13]); } + void set_data_type(DataType dt) { + // We only have 8 bits available to store DataType, so make sure it fits + DCHECK_LT(static_cast(dt), 256u); + buf()[13] = static_cast(dt); + } + + // We store the number of dimensions in byte 14, and the RepTag in byte 15. + // Bytes [0..13] vary depending on the representation. 
+ // A value of 255 indicates unknown rank in the PartialTensorShape case. + static constexpr uint8 kUnknownRank = 255; + uint8 ndims_byte() const { return buf()[14]; } + void set_ndims_byte(uint8 nd) { buf()[14] = nd; } + + RepTag tag() const { return static_cast(buf()[15]); } + void set_tag(RepTag tag) { buf()[15] = static_cast(tag); } + + void set_num_elements(int64_t n) { num_elements_ = n; } + + private: + void DestructorOutOfLine(); + void SlowCopyFrom(const TensorShapeRep& b); + + uint8* buf() { return &u_.buf[0]; } + const uint8* buf() const { return &u_.buf[0]; } + + union { + uint8 buf[16]; + // Force data to be aligned enough for a pointer. + Rep64* unused_aligner; + } u_; + int64_t num_elements_; +}; + +/// Base class for TensorShape and PartialTensorShape. +/// The class is templatized by either TensorShape or PartialTensorShape to +/// allow skipping known/unknown checks in the TensorShape case, but the +/// representation is shared exactly for fast conversion. +template +class TensorShapeBase : public TensorShapeRep { + public: + /// \brief Construct a `TensorShapeBase` from the provided sizes. + /// REQUIRES: `dim_sizes[i] >= 0` (or >= -1 for PartialTensorShape) + explicit TensorShapeBase(absl::Span dim_sizes); + TensorShapeBase(std::initializer_list dim_sizes) + : TensorShapeBase(absl::Span(dim_sizes)) {} + + /// Construct an empty TensorShape, or an unknown rank PartialTensorShape + TensorShapeBase(); + + // Cannot be made explicit because we rely on conversion between proto and + // `TensorShapeBase` throughtout the codebase (needs bigger cleanup) + TensorShapeBase(const TensorShapeProto& proto); + + // These factory methods should be used instead of the constructors that take + // an array of sizes if calling code cannot validate that the sizes specify a + // valid `TensorShape`. + // The value in `*out` is valid iff the returned value is `Status::OK`. + static absl::Status BuildTensorShapeBase(absl::Span dim_sizes, + TensorShapeBase* out); + static absl::Status BuildTensorShapeBase( + std::initializer_list dim_sizes, TensorShapeBase* out) { + return BuildTensorShapeBase(absl::Span(dim_sizes), out); + } + static absl::Status BuildTensorShapeBase(const TensorShapeProto& proto, + TensorShapeBase* out); + + /// Returns `true` iff `proto` is a valid tensor shape. + // For TensorShape, the proto shape must be fully defined. + static bool IsValid(const TensorShapeProto& proto); + + /// Returns `OK` iff `proto` is a valid tensor shape, and a descriptive error + /// status otherwise. + static absl::Status IsValidShape(const TensorShapeProto& proto); + + /// Returns `true` iff this is a valid tensor shape. + bool IsValid(); + + /// \brief Add a dimension to the end ("inner-most"). + /// REQUIRES: `size >= 0` + void AddDim(int64_t size); + + /// Same as `AddDim` but returns a `Status`. + /// Use if unsure is `size >= 0`, to prevent `CHECK`-crashes. + absl::Status AddDimWithStatus(int64_t size); + + /// Appends all the dimensions from `shape`. + void AppendShape(const TensorShapeBase& shape); + + /// Same as `RemoveDim` but returns a `Status`. + /// Use if you cannot validate all invariants, to prevent `CHECK`-fail. + absl::Status AppendShapeWithStatus(const TensorShapeBase& shape); + + /// \brief Insert a dimension somewhere in the `TensorShape`. + /// REQUIRES: `0 <= d <= dims()` + /// REQUIRES: `size >= 0` + void InsertDim(int d, int64_t size); + + /// Same as `InsertDim` but returns a `Status`. 
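
A sketch of growing a shape with the `AddDim` family; the `WithStatus` variant reports invalid sizes as a `Status` instead of a `CHECK` failure, which is the right choice for sizes taken from untrusted input.

  tensorflow::TensorShape shape;  // scalar shape, 1 element
  shape.AddDim(32);
  shape.AddDim(128);              // now [32, 128]
  CHECK_EQ(shape.num_elements(), 32 * 128);

  absl::Status s = shape.AddDimWithStatus(-7);  // error, no crash
  CHECK(!s.ok());
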
+ /// Use if unsure if requirements in `InsertDim` are satistified, to prevent + /// `CHECK`-fail crashes. + absl::Status InsertDimWithStatus(int d, int64_t size); + + /// \brief Modifies the size of the dimension `d` to be `size` + /// REQUIRES: `0 <= d < dims()` + /// REQUIRES: `size >= 0` + void set_dim(int d, int64_t size); + + /// Same as `set_dim` but returns a `Status`. + /// Use if unsure if requirements in `set_dim` are satistified, to prevent + /// `CHECK`-fail crashes. + absl::Status SetDimWithStatus(int d, int64_t size); + + /// \brief Removes dimension `d` from the `TensorShape`. + /// REQUIRES: `0 <= d < dims()` + void RemoveDim(int d) { + CHECK_GE(d, 0); + RemoveDimRange(d, d + 1); + } + + /// Same as `RemoveDim` but returns a `Status`. + /// Use if unsure is `0 <= d < dims()`, to prevent `CHECK`-crashes. + absl::Status RemoveDimWithStatus(int64_t d) { + if (TF_PREDICT_FALSE(d < 0)) { + return errors::Internal( + "Expected dimension index to be non-negative, got ", d); + } + return RemoveDimRangeWithStatus(d, d + 1); + } + + /// \brief Removes last `n` dimensions from the `TensorShape`. + /// REQUIRES: `0 <= n <= dims()` + void RemoveLastDims(int n) { + CHECK_LE(n, dims()); + RemoveDimRange(dims() - n, dims()); + } + + /// Same as `RemoveLastDims` but returns a `Status`. + /// Use if unsure is `0 <= n <= dims()`, to prevent `CHECK`-crashes. + absl::Status RemoveLastDimsWithStatus(int64_t n) { + if (TF_PREDICT_FALSE(n > dims())) { + return errors::Internal("Expected dimension index to be at most ", dims(), + " got ", n); + } + return RemoveDimRangeWithStatus(dims() - n, dims()); + } + + /// \brief Removes the dimensions in range `[begin:end)` from `TensorShape`. + /// Negative values of `end` are interpreted as `dims() + end + 1` (as in + /// Python). The same is true for negative values of `begin`. + /// REQUIRES: `-(dims()+1) <= begin <= dims()` + /// REQUIRES: `-(dims()+1) <= end <= dims()` + void RemoveDimRange(int begin, int end); + + /// Same as `RemoveDimRange` but returns a `Status`. + /// Use if unsure if requirements in `RemoveDimRange` are satistified, to + /// prevent `CHECK`-fail crashes. + absl::Status RemoveDimRangeWithStatus(int begin, int end); + + /// Return whether the rank is unknown + bool unknown_rank() const { + return kIsPartial && ndims_byte() == kUnknownRank; + } + + /// Return the number of dimensions in the tensor. + /// Can be -1 meaning unknown rank for PartialTensorShape. + int dims() const { + uint8 dims = ndims_byte(); + return kIsPartial && dims == kUnknownRank ? -1 : dims; + } + + /// \brief Returns the number of elements in dimension `d`. + /// REQUIRES: `0 <= d < dims()` + // TODO(touts): Rename to `dimension()` to match + // `Eigen::Tensor::dimension()`? + int64_t dim_size(int d) const; + + /// Returns sizes of all dimensions. + // Returns an empty list for unknown rank PartialTensorShape. + absl::InlinedVector dim_sizes() const; + + /// Return true iff the rank and all of the dimensions are well defined + // TODO(irving): Rename to is_fully_defined now that it's fast. + bool IsFullyDefined() const { return !kIsPartial || num_elements() != -1; } + + /// Fill `*proto` from `*this`. + void AsProto(TensorShapeProto* proto) const; + TensorShapeProto AsProto() const; + + /// For iterating through the dimensions. + TensorShapeIter begin() const; + TensorShapeIter end() const; + + protected: + // Optimized constructor for a shape representing an empty vector. 
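
A sketch of the dimension-editing helpers and of iterating a shape; `TensorShapeDim::size` carries the size of each dimension.

  tensorflow::TensorShape shape({2, 3, 4, 5});
  shape.set_dim(0, 8);      // [8, 3, 4, 5]
  shape.RemoveLastDims(2);  // [8, 3]
  shape.RemoveDim(1);       // [8]
  int64_t product = 1;
  for (auto d : shape) product *= d.size;
  CHECK_EQ(product, shape.num_elements());
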
+ // + // This constructor is provided to optimize the default constructor for + // `Tensor`. + explicit TensorShapeBase(DataType dt); + + private: + absl::Status RecomputeNumElements(); + absl::Status InitDims(absl::Span dim_sizes); + + // True for PartialTensorShape, false for TensorShape + static constexpr bool kIsPartial = + std::is_same::value; + static_assert(kIsPartial || std::is_same::value, + "Shape is neither TensorShape nor PartialTensorShape"); + + // Used by AddDim and MakeShapeHelper. Does no error checking. + void UnsafeAddDim(int64_t size, int64_t new_num_elements); + + // For use by TensorShapeUtils::MakeShape + template + friend absl::Status MakeShapeHelper(const T*, int64_t, S*); +}; + +/// Outputs `TensorShapeBase` to `std::ostream`. +template +std::ostream& operator<<(std::ostream& os, const TensorShapeBase& tsb) { + return os << tsb.DebugString(); +} + +/// Represents the shape of a Tensor. +/// +/// A tensor's shape is denoted by its number of dimensions and a size for each +/// dimension. For example, a Tensor represented by a 3 x 4 matrix would have +/// a shape of 2-D, [3,4]. +/// +/// If you know the exact shape of your Tensor when you create the TensorShape +/// object, you can specify it then, or you can create a TensorShape with +/// zero dimensions and one element, and call AddDim() to add dimensions later. +class TensorShape : public TensorShapeBase { + public: + using TensorShapeBase::TensorShapeBase; + + // These factory methods should be used instead of the constructors that take + // an array of sizes if calling code cannot validate that the sizes specify a + // valid `TensorShape`. + // The value in `*out` is valid iff the returned value is `Status::OK`. + static absl::Status BuildTensorShape(absl::Span dim_sizes, + TensorShape* out) { + return BuildTensorShapeBase(dim_sizes, out); + } + static absl::Status BuildTensorShape(std::initializer_list dim_sizes, + TensorShape* out) { + return BuildTensorShape(absl::Span(dim_sizes), out); + } + static absl::Status BuildTensorShape(const TensorShapeProto& proto, + TensorShape* out) { + return BuildTensorShapeBase(proto, out); + } + + static absl::StatusOr BuildTensorShape( + const TensorShapeProto& proto) { + TensorShape out; + TF_RETURN_IF_ERROR(BuildTensorShape(proto, &out)); + return out; + } + + /// Allow a TensorShape to be used as a PartialTensorShape without copying + operator const PartialTensorShape&() const; // NOLINT(runtime/explicit) + + /// Returns true if `*this` and `b` have the same sizes. Ignores + /// dimension names. + bool IsSameSize(const TensorShape& b) const; + + /// Fill `*dsizes` from `*this`. + /// Notice: Using IndexType=int32 in combination with To32Bit() can + /// significantly improve performance on GPU. + template + Eigen::DSizes AsEigenDSizes() const; + + // Same as `AsEigenDSizes()` but returns a `Status` instead. + // Use this method to surface error to user instead of crashing if `NDMIS` is + // not equal to `dims()`. + // Caller must take ownership of `out`. + template + absl::Status AsEigenDSizesWithStatus( + Eigen::DSizes* out) const; + + /// Same as `AsEigenDSizes()` but allows for `NDIMS > dims()` -- in + /// which case we pad the rest of the sizes with 1. + /// Notice: Using IndexType=int32 in combination with To32Bit() can + /// significantly improve performance on GPU. + template + Eigen::DSizes AsEigenDSizesWithPadding() const; + + // Same as `AsEigenDSizesWithPadding()` but returns a `Status` instead. 
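
A sketch of converting a `TensorShape` to the Eigen sizes used by the accessors in `tensor.h`; the padded variant appends size-1 dimensions when the requested rank exceeds `dims()`.

  tensorflow::TensorShape shape({3, 4});
  auto sizes = shape.AsEigenDSizes<2>();              // {3, 4}
  auto padded = shape.AsEigenDSizesWithPadding<4>();  // {3, 4, 1, 1}
  CHECK_EQ(sizes[1], 4);
  CHECK_EQ(padded[3], 1);
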
+ // Use this method to surface error to user instead of crashing if `NDMIS` is + // not equal to `dims()`. + // Caller must take ownership of `out`. + template + absl::Status AsEigenDSizesWithPaddingWithStatus( + Eigen::DSizes* out) const; + + private: + // These CHECK fail to ease debugging. + // REQUIRES: dims() == NDIMS + void CheckDimsEqual(int NDIMS) const; + // REQUIRES: dims() <= NDIMS + void CheckDimsAtMost(int NDIMS) const; + + // Fill output from `*this`. + // Helper method for common code between `AsEigenDSize()` and + // `AsEigenDSizeWithStatus()`. + template + Eigen::DSizes AsEigenDSizesCopy() const; + + // Fill output from `*this`. + // Helper method for common code between `AsEigenDSizesWithPadding()` and + // `AsEigenDSizeWithPaddingWithStatus()`. + template + Eigen::DSizes AsEigenDSizesCopyAndPad() const; + + // For access to TensorShapeBase(DataType). + friend class Tensor; +}; + +inline bool operator==(const TensorShape& a, const TensorShape& b) { + return a.IsSameSize(b); +} +inline bool operator!=(const TensorShape& a, const TensorShape& b) { + return !(a == b); +} + +/// Outputs `TensorShapeBase` to `std::ostream`. +inline std::ostream& operator<<(std::ostream& os, const TensorShape& ts) { + return os << ts.DebugString(); +} + +/// Represents the value of one dimension in a TensorShape. +struct TensorShapeDim { + explicit TensorShapeDim(int64_t s) : size(s) {} + int64_t size; +}; + +// START_SKIP_DOXYGEN +template +class TensorShapeIter { + public: + TensorShapeIter(const Shape* shape, int d) : shape_(shape), d_(d) {} + bool operator==(const TensorShapeIter& rhs) { + DCHECK(shape_ == rhs.shape_); + return d_ == rhs.d_; + } + bool operator!=(const TensorShapeIter& rhs) { + DCHECK(shape_ == rhs.shape_); + return d_ != rhs.d_; + } + void operator++() { ++d_; } + TensorShapeDim operator*() { return TensorShapeDim(shape_->dim_size(d_)); } + + private: + const Shape* shape_; + int d_; +}; +// END_SKIP_DOXYGEN + +/// \brief Static helper routines for `TensorShape`. Includes a few common +/// predicates on a tensor shape. +class TensorShapeUtils { + public: + static bool IsScalar(const TensorShape& shape) { return shape.dims() == 0; } + + static bool IsVector(const TensorShape& shape) { return shape.dims() == 1; } + + static bool IsVectorOrHigher(const TensorShape& shape) { + return shape.dims() >= 1; + } + + static bool IsMatrix(const TensorShape& shape) { return shape.dims() == 2; } + + static bool IsSquareMatrix(const TensorShape& shape) { + return shape.dims() == 2 && shape.dim_size(0) == shape.dim_size(1); + } + + static bool IsMatrixOrHigher(const TensorShape& shape) { + return shape.dims() >= 2; + } + + /// \brief Returns a `TensorShape` whose dimensions are + /// `dims[0]`, `dims[1]`, ..., `dims[n-1]`. + static absl::Status MakeShape(const int32* dims, int64_t n, TensorShape* out); + static absl::Status MakeShape(const int64_t* dims, int64_t n, + TensorShape* out); + static absl::Status MakeShape(absl::Span shape, + TensorShape* out); + static absl::Status MakeShape(absl::Span shape, + TensorShape* out); + static absl::Status MakeShape(const int32* dims, int64_t n, + PartialTensorShape* out); + static absl::Status MakeShape(const int64_t* dims, int64_t n, + PartialTensorShape* out); + static absl::Status MakeShape(absl::Span shape, + PartialTensorShape* out); + static absl::Status MakeShape(absl::Span shape, + PartialTensorShape* out); + + static std::string ShapeListString( + const absl::Span& shapes); + + /// \brief Returns true iff `shape` starts with `prefix`. 
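+  /// For example, a shape `[2, 3, 4]` starts with `[]`, `[2]`, and `[2, 3]`,
+  /// but not with `[3]` or `[2, 3, 4, 5]`.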
+ static bool StartsWith(const TensorShape& shape, const TensorShape& prefix); + + /// \brief Returns true iff `shape` ends with `suffix`. + static bool EndsWith(const TensorShape& shape, const TensorShape& suffix); + + /// \brief Returns the product of values in an int64 array, + /// or a failing Status if the array represents a value larger than + /// a `TensorShape` can hold. + static absl::Status NumElements(absl::Span shape, + int64_t* num_elements); +}; + +/// Manages the partially known dimensions of a Tensor and their sizes. +class PartialTensorShape : public TensorShapeBase { + public: + PartialTensorShape() {} + using TensorShapeBase::TensorShapeBase; + + // These factory methods should be used instead of the constructors that take + // an array of sizes if calling code cannot validate that the sizes specify a + // valid `PartialTensorShape`. + // The value in `*out` is valid iff the returned value is `Status::OK`. + static absl::Status BuildPartialTensorShape( + absl::Span dim_sizes, PartialTensorShape* out) { + return BuildTensorShapeBase(dim_sizes, out); + } + static absl::Status BuildPartialTensorShape( + std::initializer_list dim_sizes, PartialTensorShape* out) { + return BuildPartialTensorShape(absl::Span(dim_sizes), out); + } + static absl::Status BuildPartialTensorShape(const TensorShapeProto& proto, + PartialTensorShape* out) { + return BuildTensorShapeBase(proto, out); + } + + static absl::StatusOr BuildPartialTensorShape( + const TensorShapeProto& proto) { + PartialTensorShape out; + TF_RETURN_IF_ERROR(BuildTensorShapeBase(proto, &out)); + return out; + } + + /// Add a dimension to the end ("inner-most"), returns a new + /// PartialTensorShape. + /// REQUIRES: `size >= -1`, where -1 means unknown + PartialTensorShape Concatenate(int64_t size) const; + + /// Similar to `Concatenate` but returning `Status`. + /// Use if calling code cannot validate all requirements and if `CHECK`-fails + /// are to be avoided. + absl::Status ConcatenateWithStatus(int64_t size, + PartialTensorShape* out) const; + + /// Appends all the dimensions from `shape`. Returns a new + /// PartialTensorShape. + PartialTensorShape Concatenate(const PartialTensorShape& shape) const; + + /// Similar to `Concatenate` but returning `Status`. + /// Use if calling code cannot validate all requirements and if `CHECK`-fails + /// are to be avoided. + absl::Status ConcatenateWithStatus(const PartialTensorShape& shape, + PartialTensorShape* out) const; + + /// Merges all the dimensions from `shape`. Returns + /// `InvalidArgument` error if either `shape` has a different rank + /// or if any of the dimensions are incompatible. + absl::Status MergeWith(const PartialTensorShape& shape, + PartialTensorShape* result) const; + + /// Exact equality test. Returns true iff the ranks match (i.e., both are + /// unknown, or both are known and equal), and all dimensions are equal (i.e., + /// both dimensions are known, or both are known and equal). This is a + /// stronger condition that IsCompatibleWith. + bool IsIdenticalTo(const PartialTensorShape& shape) const; + + /// Return true iff the ranks match, and if the + /// dimensions all either match or one is unknown. + bool IsCompatibleWith(const PartialTensorShape& shape) const; + + // Fill `*shape` from `*this`. + // If `*this` is not fully defined, returns false and + // `*shape` is left in an intermediate state. Otherwise + // returns true. 
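+  // For example, a PartialTensorShape of `[2, ?, 3]` (unknown second
+  // dimension) is not fully defined, so the conversion returns false, whereas
+  // `[2, 4, 3]` is converted to the TensorShape `[2, 4, 3]`.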
+ bool AsTensorShape(TensorShape* shape) const; + + /// \brief Returns a `PartialTensorShape` whose dimensions are + /// `dims[0]`, `dims[1]`, ..., `dims[n-1]`. Values of -1 are + /// considered "unknown". + template + static absl::Status MakePartialShape(const T* dims, int n, + PartialTensorShape* out) { + return TensorShapeUtils::MakeShape(dims, n, out); + } +}; + +inline bool operator==(const PartialTensorShape& a, + const PartialTensorShape& b) { + return a.IsIdenticalTo(b); +} + +/// \brief Static helper routines for `PartialTensorShape`. Includes a few +/// common predicates on a partially known tensor shape. +class PartialTensorShapeUtils { + public: + static std::string PartialShapeListString( + const absl::Span& shapes); + + static bool AreIdentical(const absl::Span& shapes0, + const absl::Span& shapes1); + + static bool AreCompatible( + const absl::Span& shapes0, + const absl::Span& shapes1); +}; + +// ---------------------------------------------------------------------------- +// Template method implementation details below +// ---------------------------------------------------------------------------- + +template +Eigen::DSizes TensorShape::AsEigenDSizesCopy() const { + Eigen::DSizes dsizes; + for (int d = 0; d < NDIMS; d++) { + dsizes[d] = static_cast(dim_size(d)); + } + return dsizes; +} + +template +Eigen::DSizes TensorShape::AsEigenDSizesCopyAndPad() const { + static_assert(NDIMS <= TensorShape::MaxDimensions(), "Too many dimensions"); + Eigen::DSizes dsizes; + for (int d = 0; d < dims(); d++) { + dsizes[d] = static_cast(dim_size(d)); + } + for (int d = dims(); d < NDIMS; d++) { + dsizes[d] = 1; + } + return dsizes; +} + +template +Eigen::DSizes TensorShape::AsEigenDSizes() const { + CheckDimsEqual(NDIMS); + return AsEigenDSizesCopy(); +} + +template +absl::Status TensorShape::AsEigenDSizesWithStatus( + Eigen::DSizes* out) const { + if (TF_PREDICT_FALSE(NDIMS != dims())) { + return errors::Internal("Asking for tensor of ", NDIMS, + " dimensions from a tensor of ", dims(), + " dimensions"); + } + *out = AsEigenDSizesCopy(); + return absl::OkStatus(); +} + +template +Eigen::DSizes TensorShape::AsEigenDSizesWithPadding() const { + CheckDimsAtMost(NDIMS); + return AsEigenDSizesCopyAndPad(); +} + +template +absl::Status TensorShape::AsEigenDSizesWithPaddingWithStatus( + Eigen::DSizes* out) const { + if (TF_PREDICT_FALSE(NDIMS < dims())) { + return errors::Internal("Asking for tensor of at most ", NDIMS, + " dimensions from a tensor of ", dims(), + " dimensions"); + } + *out = AsEigenDSizesCopyAndPad(); + return absl::OkStatus(); +} + +// ---------------------------------------------------------------------------- +// Inlining of some performance critical routines +// ---------------------------------------------------------------------------- + +inline TensorShapeRep::TensorShapeRep(const TensorShapeRep& b) { + num_elements_ = b.num_elements_; + if (b.tag() != REP_OUT_OF_LINE) { + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above Implicitly does: + // set_ndims_byte(b.ndims_byte()); + // set_tag(b.tag()); + } else { + set_tag(REP16); // So that SlowCopyFrom does not try to deallocate + SlowCopyFrom(b); + } +} + +inline TensorShapeRep::TensorShapeRep(TensorShapeRep&& b) { + num_elements_ = b.num_elements_; + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above Implicitly does: + // set_ndims_byte(b.ndims_byte()); + // set_tag(b.tag()); + b.set_tag(REP16); // other shape no longer owns out-of-line data, if any. 
+} + +inline TensorShapeRep::~TensorShapeRep() { + if (tag() == REP_OUT_OF_LINE) { + DestructorOutOfLine(); + } +} + +inline void TensorShapeRep::operator=(const TensorShapeRep& b) { + num_elements_ = b.num_elements_; + if (tag() != REP_OUT_OF_LINE && b.tag() != REP_OUT_OF_LINE) { + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above implicitly also does: + // set_tag(b.tag()); + // set_ndims_byte(b.ndims_byte()); + } else { + SlowCopyFrom(b); + } +} + +inline void TensorShapeRep::operator=(TensorShapeRep&& b) { + if (tag() == REP_OUT_OF_LINE) { + DestructorOutOfLine(); + } + num_elements_ = b.num_elements_; + memcpy(buf(), b.buf(), sizeof(u_.buf)); + // memcpy above Implicitly does: + // set_ndims_byte(b.ndims_byte()); + // set_tag(b.tag()); + b.set_tag(REP16); // other shape no longer owns out-of-line data, if any. +} + +inline TensorShape::operator const PartialTensorShape&() const { + // Downcast to the shared representation and upcast to PartialTensorShape + const TensorShapeRep* rep = this; + return *static_cast(rep); +} + +template +inline TensorShapeBase::TensorShapeBase(DataType dt) { + set_tag(REP16); + set_data_type(dt); + + // Optimized implementation of InitDims() where the shape is statically known + // to be {0}. + set_ndims_byte(1); + uint16* dst = as16()->dims_; + *dst = 0; + set_num_elements(0); +} + +// Declare explicit instantiations in .cc file +extern template class TensorShapeBase; +extern template class TensorShapeBase; + +// A convenient struct to represent a (DataType, PartialTensorShape) pair. It's +// often used in shape inference. +struct DtypeAndPartialTensorShape { + DataType dtype; + PartialTensorShape shape; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_SHAPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_slice.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_slice.h new file mode 100644 index 00000000..4ada28d1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_slice.h @@ -0,0 +1,231 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_ + +#include +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +// A tensor slice represents a slice of a given tensor. It is represented by a +// list of (start, length) pairs, where the size of the list is the rank of the +// tensor. 
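+//
+// For example, for a rank-3 tensor of shape {10, 4, 6}, the (start, length)
+// pairs {(0, 5), (0, -1), (2, 3)} describe a slice that keeps indices [0, 5)
+// of the first dimension, all of the second dimension (a length of -1,
+// kFullExtent, marks a full extent), and indices [2, 5) of the last one.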
+ +class TensorSlice { + public: + // Construct a tensor slice: you have a number of ways: + // -- creating an empty slice + // -- from just a dimension (in this case it will create a full slice) + // -- from an array of pairs of integers. + // -- from a TensorSliceProto protocol buffer + // -- from a string format of "start,length:start,length..." where each + // "start,length" pair represents the slice on one dimension. We allow a + // special "-" that means "everything for this dimension". One such example + // is: 0,10:-:14,1:-:- + TensorSlice() {} + explicit TensorSlice(int dim); + explicit TensorSlice(const TensorSliceProto& proto); + explicit TensorSlice( + std::initializer_list> extents); + + // This factory methods should be used instead of the constructor that takes a + // `TensorSliceProto` if calling code cannot validate that the sizes specify a + // valid `TensorSlice`. + static absl::Status BuildTensorSlice(const TensorSliceProto& proto, + TensorSlice* output); + + static absl::Status Parse(const string& str, TensorSlice* output); + static TensorSlice ParseOrDie(const string& str) { + TensorSlice ret; + absl::Status s = Parse(str, &ret); + if (!s.ok()) { + LOG(FATAL) << "Could not parse TensorSlice"; + } + return ret; + } + + void Clear(); + + // Accessors + int dims() const { return starts_.size(); } + + int64_t start(int d) const { + DCHECK_GE(d, 0); + DCHECK_LT(d, dims()); + return starts_[d]; + } + + int64_t length(int d) const { + DCHECK_GE(d, 0); + DCHECK_LT(d, dims()); + return lengths_[d]; + } + + int64_t end(int d) const { + DCHECK_GE(d, 0); + DCHECK_LT(d, dims()); + return start(d) + length(d); + } + + void set_start(int d, int64_t x) { + DCHECK_GE(d, 0); + DCHECK_LT(d, dims()); + DCHECK_GE(x, 0); + starts_[d] = x; + } + + void set_length(int d, int64_t x) { + DCHECK_GE(d, 0); + DCHECK_LT(d, dims()); + lengths_[d] = x; + } + + // If we have a full slice along dimension "d". + bool IsFullAt(int d) const { + return lengths_[d] == kFullExtent && starts_[d] == 0; + } + + // If this is a full slice, i.e. IsFullAt(d) for every d. + bool IsFull() const; + + // Set the slice to be a full slice of "dim" dimensions + void SetFullSlice(int dim); + + // Extend a slice to "dim" dimensions: all the added dimensions are full. + // Requires: dim >= dims(). + void Extend(int dim); + + // Conversion of a TensorSlice to other formats + void AsProto(TensorSliceProto* proto) const; + string DebugString() const; + + // Fill *indices and *sizes from *this (so that we can use the slice() + // function in eigen tensor). We need a tensor shape in case some of the + // slices are full slices. + // We allow NDIMS to be greater than dims(), in which case we will pad the + // higher dimensions with trivial dimensions. + template + void FillIndicesAndSizes( + const TensorShape& shape, + Eigen::DSizes* indices, + Eigen::DSizes* sizes) const; + + // Interaction with other TensorSlices. + + // Compute the intersection with another slice and if "result" is not + // nullptr, store the results in *result; returns true if there is any real + // intersection. + bool Intersect(const TensorSlice& other, TensorSlice* result) const; + // A short hand. + bool Overlaps(const TensorSlice& other) const { + return Intersect(other, nullptr); + } + + // Equals iff "*this" and "other" are logically equivalent. + bool operator==(const TensorSlice& other) const; + bool operator!=(const TensorSlice& other) const { return !(*this == other); } + + // Interaction with TensorShape. 
+ + // Slices a shape and stores the result into *result_shape. + // Requires that the shape and *this have the same rank. + // For example, given a tensor shape of {3, 4, 5}, and a slice of + // 1,2:-:0,2, the result shape is {2, 4, 2}. + absl::Status SliceTensorShape(const TensorShape& shape, + TensorShape* result_shape) const; + + // Given slice "sub" where "sub" is fully contained in *this, + // (meaning that the intersection of "sub" and *this equals "sub"), computes + // the "relative" slice of "sub" with respect to *this. + // + // In other words, if we use A>S to denote slicing a shape S with a slice A, + // then the function is computing a slice X such that: + // X > (this > S) = sub > S + // for any shape S. + // + // In general, along every dimension, the start of the relative slice is the + // start of the "sub" slice minus the start of *this; the length of the + // relative slice is the length of the "sub" slice. + // + // For example, say we have a shape of {3, 4, 5}, "this" is 0,2:-:1,2, and + // "sub" is 1,1:2:2,1,2, then the related slice is 1,1:2,2:0,2. + // + // The caller needs to make sure that "sub" is indeed a sub-slice of *this; + // otherwise the result is undefined. + void ComputeRelative(const TensorSlice& sub, TensorSlice* relative) const; + + // Updates the slice in such a way that it fully covers "other" slice. + // Note, "other" slice should refer to the same tensor shape. + // Example: + // given a slice [2:4, :, 3:] and "other" slice [:, 1:4, 2:4] the + // updated slice would be [:, :, 2:]. Here is why: + // dim 0: "2:4" U ":" -> ":" + // dim 1: ":" U "1-4" -> ":" + // dim 2: "3:" U "2:4" -> "2:" + void UpdateToCover(const TensorSlice& other); + + // Returns true if the length field was specified in an Extent. + static bool HasExtentLength(const TensorSliceProto::Extent& extent); + + // Returns the value of the length field in an Extent, or -1 if it + // is not present. + static int64_t GetExtentLength(const TensorSliceProto::Extent& extent); + + private: + // a length value of kFullExtent (-1) means we have a full slice at this + // dimension. It's defined in tensor_slice.cc. + static const int64_t kFullExtent; + + // TODO(yangke): switch to Eigen once it supports variable size arrays. + // A value of + absl::InlinedVector starts_; + absl::InlinedVector lengths_; +}; + +template +void TensorSlice::FillIndicesAndSizes( + const TensorShape& shape, Eigen::DSizes* indices, + Eigen::DSizes* sizes) const { + CHECK_EQ(shape.dims(), dims()) << "Incompatible dimensions between shape " + << "slices: shape = " << shape.DebugString() + << ", slice = " << DebugString(); + CHECK_GE(NDIMS, dims()) << "Asking for a " << NDIMS << "-dim slice from " + << "a slice of dimension " << dims(); + for (int d = 0; d < dims(); ++d) { + if (IsFullAt(d)) { + (*indices)[d] = 0; + (*sizes)[d] = shape.dim_size(d); + } else { + (*indices)[d] = starts_[d]; + (*sizes)[d] = lengths_[d]; + } + } + for (int d = dims(); d < NDIMS; ++d) { + (*indices)[d] = 0; + (*sizes)[d] = 1; + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_testutil.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_testutil.h new file mode 100644 index 00000000..53ad5969 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_testutil.h @@ -0,0 +1,162 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace test { + +// Constructs a scalar tensor with 'val'. +template +Tensor AsScalar(const T& val) { + Tensor ret(DataTypeToEnum::value, {}); + ret.scalar()() = val; + return ret; +} + +// Constructs a flat tensor with 'vals'. +template +Tensor AsTensor(gtl::ArraySlice vals) { + Tensor ret(DataTypeToEnum::value, {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; +} + +// Constructs a tensor of "shape" with values "vals". +template +Tensor AsTensor(gtl::ArraySlice vals, const TensorShape& shape) { + Tensor ret; + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; +} + +// Fills in '*tensor' with 'vals'. E.g., +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillValues(&x, {11, 21, 21, 22}); +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +// Fills in '*tensor' with 'vals', converting the types as needed. +template +void FillValues(Tensor* tensor, std::initializer_list vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + size_t i = 0; + for (auto itr = vals.begin(); itr != vals.end(); ++itr, ++i) { + flat(i) = T(*itr); + } + } +} + +// Fills in '*tensor' with a sequence of value of val, val+1, val+2, ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillIota(&x, 1.0); +template +void FillIota(Tensor* tensor, const T& val) { + auto flat = tensor->flat(); + std::iota(flat.data(), flat.data() + flat.size(), val); +} + +// Fills in '*tensor' with a sequence of value of fn(0), fn(1), ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillFn(&x, [](int i)->float { return i*i; }); +template +void FillFn(Tensor* tensor, std::function fn) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) flat(i) = fn(i); +} + +// Expects "x" and "y" are tensors of the same type, same shape, and identical +// values (within 4 ULPs for floating point types unless explicitly disabled). +enum class Tolerance { + kNone, + kDefault, +}; +void ExpectEqual(const Tensor& x, const Tensor& y, + Tolerance t = Tolerance ::kDefault); + +// Expects "x" and "y" are tensors of the same (floating point) type, +// same shape and element-wise difference between x and y is no more +// than atol + rtol * abs(x). If atol or rtol is negative, the data type's +// epsilon * kSlackFactor is used. 
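+// For example, ExpectClose(x, y) compares with both tolerances derived from
+// the element type's epsilon, while ExpectClose(x, y, 1e-6, 0.0) enforces an
+// absolute tolerance of 1e-6 with no relative component.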
+void ExpectClose(const Tensor& x, const Tensor& y, double atol = -1.0, + double rtol = -1.0); + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// equal values. Consider using ExpectEqual above instead. +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectEqual(x, y); +} + +::testing::AssertionResult IsSameType(const Tensor& x, const Tensor& y); +::testing::AssertionResult IsSameShape(const Tensor& x, const Tensor& y); + +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y, + std::function is_equal) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(is_equal(Tx[i], Ty[i])) << "i = " << (++num_failures, i); + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; + } +} + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// approximate equal values. Consider using ExpectClose above instead. +template +void ExpectTensorNear(const Tensor& x, const Tensor& y, double atol) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectClose(x, y, atol, /*rtol=*/0.0); +} + +// For tensor_testutil_test only. +namespace internal_test { +::testing::AssertionResult IsClose(Eigen::half x, Eigen::half y, + double atol = -1.0, double rtol = -1.0); +::testing::AssertionResult IsClose(float x, float y, double atol = -1.0, + double rtol = -1.0); +::testing::AssertionResult IsClose(double x, double y, double atol = -1.0, + double rtol = -1.0); +} // namespace internal_test + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_types.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_types.h new file mode 100644 index 00000000..2381d6b7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_types.h @@ -0,0 +1,199 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +// Helper to define Tensor types given that the scalar is of type T. +template +struct TTypes { + // Rank- tensor of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Tensor; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstTensor; + + // Unaligned Rank- tensor of scalar type T. 
+ typedef Eigen::TensorMap > + UnalignedTensor; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstTensor; + + typedef Eigen::TensorMap, + Eigen::Aligned> + Tensor32Bit; + + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, + Eigen::Aligned> + Scalar; + typedef Eigen::TensorMap, + Eigen::RowMajor, IndexType>, + Eigen::Aligned> + ConstScalar; + + // Unaligned Scalar tensor of scalar type T. + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType> > + UnalignedScalar; + typedef Eigen::TensorMap, + Eigen::RowMajor, IndexType> > + UnalignedConstScalar; + + // Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Flat; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstFlat; + typedef Eigen::TensorMap, + Eigen::Aligned> + Vec; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstVec; + + // Unaligned Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap > + UnalignedFlat; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstFlat; + typedef Eigen::TensorMap > + UnalignedVec; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstVec; + + // Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Matrix; + typedef Eigen::TensorMap< + Eigen::Tensor, Eigen::Aligned> + ConstMatrix; + + // Unaligned Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap > + UnalignedMatrix; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstMatrix; +}; + +typedef typename TTypes::Tensor32Bit::Index Index32; + +template +bool SafeFor32BitIndexing(const Eigen::DSizes& in) { + for (int i = 0; i < NumDims; ++i) { + if (in[i] > std::numeric_limits::max()) return false; + } + return true; +} + +template +bool SafeFor32BitIndexing(const Eigen::array& in) { + for (size_t i = 0; i < NumDims; ++i) { + if (in[i] > std::numeric_limits::max()) return false; + } + return true; +} + +template ::Tensor32Bit> +bool SafeFor32BitIndexing(TensorType in) { + return in.size() <= std::numeric_limits::max(); +} + +template +Eigen::DSizes To32Bit( + const Eigen::DSizes& in) { + DCHECK(SafeFor32BitIndexing(in)); + Eigen::DSizes out; + for (int i = 0; i < NumDims; ++i) { + out[i] = static_cast(in[i]); + } + return out; +} + +template +Eigen::array To32Bit(const Eigen::array& in) { + DCHECK(SafeFor32BitIndexing(in)); + Eigen::array out; + for (size_t i = 0; i < NumDims; ++i) { + out[i] = static_cast(in[i]); + } + return out; +} + +template +typename TTypes::Tensor32Bit +To32Bit(TensorType in) { + typedef typename TTypes::Tensor32Bit RetType; + DCHECK(SafeFor32BitIndexing(in)); + return RetType(in.data(), To32Bit(in.dimensions())); +} + +namespace internal { + +template +struct MaybeWith32BitIndexingImpl { + template + void operator()(Func func, Args&&... args) const { + func(std::forward(args)...); + } +}; + +template <> +struct MaybeWith32BitIndexingImpl { + template + void operator()(Func func, Args&&... args) const { + auto all = [](const auto&... bool_vals) { + for (bool b : {bool_vals...}) { + if (!b) return false; + } + return true; + }; + if (all(SafeFor32BitIndexing(std::forward(args))...)) { + func(To32Bit(std::forward(args))...); + } else { + func(std::forward(args)...); + } + } +}; + +} // namespace internal + +template +void MaybeWith32BitIndexing(Func func, Args&&... 
args) { + return internal::MaybeWith32BitIndexingImpl()( + func, std::forward(args)...); +} + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tensor_util.h b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_util.h new file mode 100644 index 00000000..eec2bd3f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tensor_util.h @@ -0,0 +1,358 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_ + +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tensor { + +// DeepCopy returns a tensor whose contents are a deep copy of the +// contents of 'other'. This function is intended only for +// convenience, not speed. +// +// REQUIRES: 'other' must point to data stored in CPU memory. +// REQUIRES: 'other' must be a Tensor of a copy-able type if +// 'other' is not appropriately memory-aligned. +Tensor DeepCopy(const Tensor& other); + +// Deep copies input to output. This function is similar to above, but assumes +// that the memory for the output has already been allocated. +void DeepCopy(const Tensor& input, Tensor* output); + +// Concatenates 'tensors' into a single tensor, along their 0th dimension. +// +// REQUIRES: All members of 'tensors' must have the same data type parameter. +// REQUIRES: Each member of 'tensors' must have at least one dimension. +// REQUIRES: Each member of 'tensors' must point to data stored in CPU memory. +// REQUIRES: Each member of 'tensors' must be a Tensor of a copy-able type if it +// is not appropriately memory-aligned. +absl::Status Concat(absl::Span tensors, Tensor* result); + +// Splits 'tensor' into 'sizes.size()' individual tensors, along the 0th +// dimension. The ith output tensor has 0th-dimension size 'sizes[i]'. +// +// REQUIRES: 'tensor' must have at least one dimension. +// REQUIRES: 'tensor.dim_size(0)' must equal the sum of the elements of 'sizes'. +// REQUIRES: 'tensor' must point to data stored in CPU memory. +// REQUIRES: 'tensor' must be a Tensor of a copy-able type if it is not +// appropriately memory-aligned. +// +// Split() and Concat() are inverse operations. 
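+// For example, splitting a tensor of shape [6, 3] with sizes {2, 4} produces
+// two tensors of shapes [2, 3] and [4, 3]; concatenating those two tensors
+// again reproduces the original [6, 3] tensor.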
+absl::Status Split(const Tensor& tensor, absl::Span sizes, + std::vector* result); + +namespace internal { +void SetTensorProtoShape(absl::Span shape, + TensorShapeProto* shape_proto); + +template +class TensorProtoFieldHelper : public std::false_type {}; + +#define DEFINE_PROTO_FIELD_HELPER(TYPE, FIELDNAME) \ + template <> \ + class TensorProtoFieldHelper : public std::true_type { \ + public: \ + typedef decltype( \ + std::declval().FIELDNAME##_val(0)) FieldType; \ + typedef decltype( \ + std::declval().FIELDNAME##_val()) RepeatedFieldType; \ + typedef decltype(std::declval().mutable_##FIELDNAME##_val()) \ + MutableRepeatedFieldType; \ + static MutableRepeatedFieldType GetMutableField(TensorProto* proto) { \ + return proto->mutable_##FIELDNAME##_val(); \ + } \ + static RepeatedFieldType& GetField(const TensorProto& proto) { \ + return proto.FIELDNAME##_val(); \ + } \ + } + +// The argument pairs in the following macro instantiations encode the +// mapping from C++ type ($1) to repeated field name "$2_val" used for storing +// values in TensorProto. See tensorflow/core/framework/tensor.proto. +DEFINE_PROTO_FIELD_HELPER(float, float); +DEFINE_PROTO_FIELD_HELPER(double, double); +DEFINE_PROTO_FIELD_HELPER(int8, int); +DEFINE_PROTO_FIELD_HELPER(uint8, int); +DEFINE_PROTO_FIELD_HELPER(int16, int); +DEFINE_PROTO_FIELD_HELPER(uint16, int); +DEFINE_PROTO_FIELD_HELPER(int32, int); +DEFINE_PROTO_FIELD_HELPER(uint32, uint32); +DEFINE_PROTO_FIELD_HELPER(int64_t, int64); +DEFINE_PROTO_FIELD_HELPER(uint64, uint64); +DEFINE_PROTO_FIELD_HELPER(bool, bool); +DEFINE_PROTO_FIELD_HELPER(qint8, int); +DEFINE_PROTO_FIELD_HELPER(quint8, int); +DEFINE_PROTO_FIELD_HELPER(qint16, int); +DEFINE_PROTO_FIELD_HELPER(quint16, int); +DEFINE_PROTO_FIELD_HELPER(qint32, int); +DEFINE_PROTO_FIELD_HELPER(Eigen::half, half); +DEFINE_PROTO_FIELD_HELPER(bfloat16, half); +DEFINE_PROTO_FIELD_HELPER(complex64, scomplex); +DEFINE_PROTO_FIELD_HELPER(complex128, dcomplex); + +#undef DEFINE_PROTO_HELPER + +template +struct CopyHelper { + template + static void ToArray(SrcIter begin, SrcIter end, DstIter dst) { + using SrcType = typename std::iterator_traits::value_type; + using DstType = typename std::iterator_traits::value_type; + std::transform(begin, end, dst, [](const SrcType& x) -> DstType { + return static_cast(x); + }); + } + template + static void ToArray(SrcIter begin, SrcIter end, SrcIter dst) { + std::copy(begin, end, dst); + } + template + static void FromArray(SrcIter begin, SrcIter end, DstIter dst) { + ToArray(begin, end, dst); + } +}; + +// Overloads for Eigen::half and bfloat16 that are 16 bits in size but are +// stored in an int32 field. 
+template <> +struct CopyHelper { + template + static void ToArray(SrcIter begin, SrcIter end, Eigen::half* dst) { + std::transform(begin, end, dst, [](int x) -> Eigen::half { + return Eigen::numext::bit_cast(static_cast(x)); + }); + } + template + static void FromArray(SrcIter begin, SrcIter end, DstIter dst) { + std::transform(begin, end, dst, [](Eigen::half h) -> int { + return static_cast(Eigen::numext::bit_cast(h)); + }); + } +}; + +template <> +struct CopyHelper { + template + static void ToArray(SrcIter begin, SrcIter end, bfloat16* dst) { + std::transform(begin, end, dst, [](int x) -> bfloat16 { + return Eigen::numext::bit_cast(static_cast(x)); + }); + } + template + static void FromArray(SrcIter begin, SrcIter end, DstIter dst) { + std::transform(begin, end, dst, [](bfloat16 bf16) -> int { + return static_cast(Eigen::numext::bit_cast(bf16)); + }); + } +}; + +// Overloads for complex types that store real and imaginary parts +// at indices 2*i and 2*i+1 in float or double field. +template +struct CopyHelper> { + template + static void ToArray(SrcIter begin, SrcIter end, std::complex* dst) { + RealType* real_dst = reinterpret_cast(dst); + std::copy(begin, end, real_dst); + } + + template + static void FromArray(SrcIter begin, SrcIter end, DstIter dst) { + size_t n = std::distance(begin, end); + const RealType* real_begin = reinterpret_cast(&(*begin)); + std::copy_n(real_begin, 2 * n, dst); + } +}; + +// Helper class to extract and insert values into TensorProto represented as +// repeated fields. +template +class TensorProtoHelper : public std::true_type { + public: + using FieldHelper = TensorProtoFieldHelper; + using FieldType = typename TensorProtoFieldHelper::FieldType; + + static DataType GetDataType() { return DataTypeToEnum::value; } + + // Returns the number of values of type T encoded in the proto. + static size_t NumValues(const TensorProto& proto) { + size_t raw_size = FieldHelper::GetField(proto).size(); + return is_complex::value ? raw_size / 2 : raw_size; + } + + static void AddValue(const T& value, TensorProto* proto) { + const T* val_ptr = &value; + AddValues(val_ptr, val_ptr + 1, proto); + } + + static T GetValue(size_t index, const TensorProto& proto) { + const size_t stride = is_complex::value ? 2 : 1; + T val; + CopyHelper::ToArray( + FieldHelper::GetField(proto).begin() + stride * index, + FieldHelper::GetField(proto).begin() + stride * (index + 1), &val); + return val; + } + + template + static void AddValues(IterType begin, IterType end, TensorProto* proto) { + size_t n = std::distance(begin, end); + FieldType* dst = AppendUninitialized(n, proto); + CopyHelper::FromArray(begin, end, dst); + } + + template + static void CopyValues(IterType dst, const TensorProto& proto) { + CopyHelper::ToArray(FieldHelper::GetField(proto).begin(), + FieldHelper::GetField(proto).end(), dst); + } + + static void Truncate(size_t new_size, TensorProto* proto) { + if (is_complex::value) new_size *= 2; + FieldHelper::GetMutableField(proto)->Truncate(new_size); + } + + static FieldType* AppendUninitialized(size_t n, TensorProto* proto) { + if (is_complex::value) n *= 2; + auto* field = FieldHelper::GetMutableField(proto); + field->Reserve(field->size() + n); + return reinterpret_cast(field->AddNAlreadyReserved(n)); + } +}; + +// Specialization for string. 
+template <> +class TensorProtoHelper : public std::true_type { + public: + static DataType GetDataType() { return DataType::DT_STRING; } + static void AddValue(const string& value, TensorProto* proto) { + *proto->mutable_string_val()->Add() = value; + } + template + static void AddValues(IterType begin, IterType end, TensorProto* proto) { + for (IterType it = begin; it != end; ++it) { + AddValue(*it, proto); + } + } + template + static void CopyToTensorContent(IterType begin, IterType end, + TensorProto* proto) { + AddValues(begin, end, proto); + } +}; + +template +typename std::enable_if::value, + TensorProto>::type +CreateTensorProto(IterType values_begin, IterType values_end, + const size_t values_size, + const absl::Span shape) { + TensorProto tensor; + TensorShapeProto tensor_shape_proto; + internal::SetTensorProtoShape(shape, &tensor_shape_proto); + if (TensorShape(tensor_shape_proto).num_elements() != values_size) { + LOG(ERROR) << "Shape and number of values (" << values_size + << ") are incompatible."; + return tensor; + } + using TypeHelper = internal::TensorProtoHelper; + tensor.set_dtype(TypeHelper::GetDataType()); + *tensor.mutable_tensor_shape() = std::move(tensor_shape_proto); + TypeHelper::AddValues(values_begin, values_end, &tensor); + return tensor; +} + +} // namespace internal + +// Creates a 'TensorProto' with the specified shape and values. The dtype and a +// field to represent data values of the returned 'TensorProto' are determined +// based on Type. Note that unless the argument provided to `values` is already +// an absl::Span, `Type` will need to be provided as a template parameter--the +// compiler can't infer it: +// auto proto = CreateTensorProtoSpan(my_array, shape); +template +typename std::enable_if::value, + TensorProto>::type +CreateTensorProtoSpan(const absl::Span values, + const absl::Span shape) { + return internal::CreateTensorProto(values.begin(), values.end(), + values.size(), shape); +} + +// Version of the above that's more convenient if `values` is an std::vector, in +// which case Type can automatically be inferred: +// auto proto = CreateTensorProto(my_vector, shape); +template +typename std::enable_if::value, + TensorProto>::type +CreateTensorProto(const std::vector& values, + const absl::Span shape) { + // This awkward iterator passing is essentially just to support vector, + // otherwise we could just represent the vector as a Span. + return internal::CreateTensorProto(values.begin(), values.end(), + values.size(), shape); +} + +// Converts values in tensor to run-length encoded compressed form. +// +// The elements of a tensor can be stored in a TensorProto in one of the +// following two forms: +// 1. As a raw byte string in the field `tensor_content` containing the +// serialized in-memory representation of the tensor. +// 2. As values of a repeated field depending on the datatype, e.g. that +// values of a DT_FLOAT tensor would be stored in the repeated field +// `float_val`. +// Storage scheme 2 may use a simple form of run-length encoding to compress +// data: If the values contains a tail of identical values, the repeated field +// will be truncated such that the number of values in the repeated field is +// less than the number of elements implied by the field`tensor_shape`. The +// original tensor can be recovered by repeating the final value in the repeated +// field. 
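+//
+// For example, a DT_FLOAT tensor of shape [6] with values
+// {1, 2, 5, 5, 5, 5} may be stored with `float_val` holding only {1, 2, 5};
+// the trailing 5 is implied for the remaining elements.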
+// +// The TensorProto will be compressed if a) the tensor contains at least +// min_num_elements elements and b) the compressed tensor proto is would be at +// most the size of the original tensor proto divided by min_compression_ratio. +// +// Returns true if the tensor was compressed. +bool CompressTensorProtoInPlace(int64_t min_num_elements, + float min_compression_ratio, + TensorProto* tensor); + +inline bool CompressTensorProtoInPlace(TensorProto* tensor) { + static const int64_t kDefaultMinNumElements = 64; + static const float kDefaultMinCompressionRatio = 2.0f; + return CompressTensorProtoInPlace(kDefaultMinNumElements, + kDefaultMinCompressionRatio, tensor); +} + +// Make a TensorShape from the contents of shape_t. Shape_t must be a +// 1-dimensional tensor of type int32 or int64. +absl::Status MakeShape(const Tensor& shape_t, TensorShape* out); + +} // namespace tensor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/thread_factory.h b/third_party/tflite-hdrs/tensorflow/core/framework/thread_factory.h new file mode 100644 index 00000000..769ada29 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/thread_factory.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_ + +#include +#include + +#include "tensorflow/core/platform/types.h" + +namespace tsl { +class Thread; +} // namespace tsl +namespace tensorflow { +using tsl::Thread; // NOLINT + +// Virtual interface for an object that creates threads. +class ThreadFactory { + public: + virtual ~ThreadFactory() {} + + // Runs `fn` asynchronously in a different thread. `fn` may block. + // + // NOTE: The caller is responsible for ensuring that this `ThreadFactory` + // outlives the returned `Thread`. + virtual std::unique_ptr StartThread(const string& name, + std::function fn) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/tracking_allocator.h b/third_party/tflite-hdrs/tensorflow/core/framework/tracking_allocator.h new file mode 100644 index 00000000..ba54b2c5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/tracking_allocator.h @@ -0,0 +1,37 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_ + +#include + +#include "xla/tsl/framework/tracking_allocator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// NOLINTEND(misc-unused-using-decls) +using tsl::AllocRecord; +using tsl::TrackingAllocator; +// NOLINTEND(misc-unused-using-decls) + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/type_index.h b/third_party/tflite-hdrs/tensorflow/core/framework/type_index.h new file mode 100644 index 00000000..d73ca527 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/type_index.h @@ -0,0 +1,95 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_ + +#include + +#if defined(__GXX_RTTI) || defined(_CPPRTTI) +#include +#endif // __GXX_RTTI + +#include "tensorflow/core/platform/hash.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// On some platforms, we would like to avoid using RTTI in order to have smaller +// binary sizes. This file provides a thin TypeIndex class that mimics +// std::type_index but does not use RTTI (with a minimal set of functions needed +// by the TensorFlow framework, and more can be added if necessary). In the +// absence of RTTI, it does not provide the actual name of the type, and only +// returns a pre-baked string specifying that RTTI is disabled. The hash code +// provided in this class is unique for each class. However, it is generated at +// runtime so this hash code should not be serialized - the value for the same +// type can change from run to run. +class TypeIndex { + public: + TypeIndex(const TypeIndex& src) : hash_(src.hash_), name_(src.name_) {} + TypeIndex& operator=(const TypeIndex& src) { + hash_ = src.hash_; + name_ = src.name_; + return *this; + } + bool operator==(const TypeIndex& rhs) const { return (hash_ == rhs.hash_); } + bool operator!=(const TypeIndex& rhs) const { return (hash_ != rhs.hash_); } + ~TypeIndex() {} + + const char* name() const { return name_; } + + uint64 hash_code() const { return hash_; } + + // Returns a TypeIndex object that corresponds to a typename. 
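+  // For example (`MyType` below stands for any C++ type):
+  //   TypeIndex idx = TypeIndex::Make<MyType>();
+  //   uint64 h = idx.hash_code();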
+ template + static TypeIndex Make() { +#ifdef PLATFORM_CLOUD_TPU + static bool hash_bit[1]; + return TypeIndex(static_cast(reinterpret_cast(hash_bit)), + typeid(T).name()); +#endif +#if defined(__GXX_RTTI) || defined(_CPPRTTI) + + // Use a hash based on the type name to avoid issues due to RTLD_LOCAL on + // MacOS (b/156979412). + return TypeIndex(Hash64(typeid(T).name()), typeid(T).name()); + +#else + static bool hash_bit[1]; +#if TARGET_OS_OSX + // Warn MacOS users that not using RTTI can cause problems (b/156979412). +#warning \ + "Compiling with RTTI disabled on MacOS can cause problems when comparing " \ + "types across shared libraries." +#endif // TARGET_OS_OSX + + // No type names available. + return TypeIndex(static_cast(reinterpret_cast(hash_bit)), + "[RTTI disabled]"); +#endif // __GXX_RTTI + } + + private: + // We hide the constructor of the TypeIndex class. Use the templated + // Make() function to create a TypeIndex object. + explicit TypeIndex(const uint64 hash, const char* name) + : hash_(hash), name_(name) {} + uint64 hash_; + const char* name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/type_traits.h b/third_party/tflite-hdrs/tensorflow/core/framework/type_traits.h new file mode 100644 index 00000000..ac1c9e86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/type_traits.h @@ -0,0 +1,38 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_ + +#include +#include + +#include "xla/tsl/framework/type_traits.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::false_type; +using tsl::is_complex; +using tsl::is_quantized; +using tsl::is_simple_type; +using tsl::true_type; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/typed_allocator.h b/third_party/tflite-hdrs/tensorflow/core/framework/typed_allocator.h new file mode 100644 index 00000000..6d89983b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/typed_allocator.h @@ -0,0 +1,135 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ + +#include + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Variant; + +// Convenience functions to do typed allocation. C++ constructors +// and destructors are invoked for complex types if necessary. +class TypedAllocator { + public: + // May return NULL if the tensor has too many elements to represent in a + // single allocation. + template + static T* Allocate(Allocator* raw_allocator, size_t num_elements, + const AllocationAttributes& allocation_attr) { + // TODO(jeff): Do we need to allow clients to pass in alignment + // requirements? + + if (num_elements > (std::numeric_limits::max() / sizeof(T))) { + return nullptr; + } + + void* p = + raw_allocator->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(T) * num_elements, allocation_attr); + T* typed_p = reinterpret_cast(p); + if (typed_p) RunCtor(raw_allocator, typed_p, num_elements); + return typed_p; + } + + template + static void Deallocate(Allocator* raw_allocator, T* ptr, + size_t num_elements) { + if (ptr) { + RunDtor(raw_allocator, ptr, num_elements); + raw_allocator->DeallocateRaw(ptr, Allocator::kAllocatorAlignment, + sizeof(T) * num_elements); + } + } + + private: + // No constructors or destructors are run for simple types + template + static void RunCtor(Allocator* raw_allocator, T* p, size_t n) { + static_assert(is_simple_type::value, "T is not a simple type."); + } + + template + static void RunDtor(Allocator* raw_allocator, T* p, size_t n) {} + + static void RunVariantCtor(Variant* p, size_t n); + + static void RunVariantDtor(Variant* p, size_t n); +}; + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, tstring* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) new (p) tstring(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, tstring* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) p->~tstring(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, ResourceHandle* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, ResourceHandle* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, Variant* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + RunVariantCtor(p, n); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, Variant* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + RunVariantDtor(p, n); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/types.h 
b/third_party/tflite-hdrs/tensorflow/core/framework/types.h new file mode 100644 index 00000000..c91e262c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/types.h @@ -0,0 +1,530 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TYPES_H_ + +#include +#include +#include +#include + +#include "absl/numeric/bits.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "xla/tsl/framework/device_type.h" +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Variant; + +// MemoryType is used to describe whether input or output Tensors of +// an OpKernel should reside in "Host memory" (e.g., CPU memory) or +// "Device" Memory (CPU memory for CPU devices, GPU memory for GPU +// devices). +enum MemoryType { + DEVICE_MEMORY = 0, + HOST_MEMORY = 1, +}; + +using tsl::DeviceType; // NOLINT + +// Convenient constants that can be passed to a DeviceType constructor. +// See comments for CreateOpKernel in op_kernel.h for uses of DEVICE_DEFAULT +// and other device types. 
+TF_EXPORT extern const char* const DEVICE_DEFAULT; // "DEFAULT" +TF_EXPORT extern const char* const DEVICE_CPU; // "CPU" +TF_EXPORT extern const char* const DEVICE_GPU; // "GPU" +TF_EXPORT extern const char* const DEVICE_TPU; // "TPU" +TF_EXPORT extern const char* const DEVICE_TPU_SYSTEM; // "TPU_SYSTEM" + +template +struct DeviceName {}; + +template <> +struct DeviceName { + static const std::string value; +}; + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +template <> +struct DeviceName { + static const std::string value; +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +typedef absl::InlinedVector MemoryTypeVector; +typedef absl::Span MemoryTypeSlice; + +typedef absl::InlinedVector DataTypeVector; +typedef absl::Span DataTypeSlice; + +typedef absl::InlinedVector DeviceTypeVector; +typedef absl::InlinedVector, 4UL> + PrioritizedDeviceTypeVector; + +// Convert the enums to strings for errors: +std::string DataTypeString(DataType dtype); +std::string DeviceTypeString(const DeviceType& device_type); +std::string DataTypeSliceString(const DataTypeSlice dtypes); +inline std::string DataTypeVectorString(const DataTypeVector& dtypes) { + return DataTypeSliceString(dtypes); +} + +// DataTypeSet represents a set of DataType values as a simple and efficient +// bit mask. Note that DataTypeSet cannot represent all DataType values; it +// cannot represent any of the DT_*_REF values. +class DataTypeSet { + private: + const uint32 mask_; + + static constexpr uint32 kNumBits = 32; + + public: + constexpr DataTypeSet(const DataTypeSet& other) : mask_(other.mask_) {} + explicit constexpr DataTypeSet(uint32 mask) : mask_(mask) {} + + constexpr bool Contains(DataType dt) const { + return (static_cast(dt) < kNumBits) && + ((mask_ >> static_cast(dt)) & 1u) != 0u; + } + + class Iterator { + const DataTypeSet& set_; + uint32 pos_; + + public: + Iterator(const DataTypeSet& set, uint32 pos) : set_(set), pos_(pos) { + DCHECK_LE(pos, kNumBits); + } + DataType operator*() const { return static_cast(pos_); } + Iterator& operator++() { + ++pos_; + DCHECK_LE(pos_, kNumBits); + if (pos_ < kNumBits) { + uint32 remaining_mask = set_.mask_ >> pos_; + if (remaining_mask != 0u) { + pos_ += absl::countr_zero(remaining_mask); + } + } + DCHECK_LE(pos_, kNumBits); + return *this; + } + bool operator==(const Iterator& other) const { return pos_ == other.pos_; } + bool operator!=(const Iterator& other) const { return !(*this == other); } + size_t operator-(const Iterator& other) const { + return this->pos_ - other.pos_; + } + }; + + Iterator begin() const { + // The begin position is the index of the first bit set to 1 in the entire + // bit mask. If there are no bits set to 1, then the index is 0. + if (mask_ != 0) { + return Iterator(*this, absl::countr_zero(mask_)); + } + // The set is empty. + return Iterator(*this, 0); + } + + Iterator end() const { + // The end position is the index of the highest bit that is set, plus 1. + // If there are no bits set to 1, then the index is 0. + if (mask_ != 0) { + return Iterator(*this, kNumBits - absl::countl_zero(mask_)); + } + // The set is empty. + return Iterator(*this, 0); + } + + size_t size() const { return absl::popcount(mask_); } + + constexpr DataTypeSet operator|(const DataTypeSet& other) const { + return DataTypeSet(mask_ | other.mask_); + } +}; + +// If "sp" names a valid type, store it in "*dt" and return true. Otherwise, +// return false. 
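`DataTypeSet` above packs a set of `DataType` values into a 32-bit mask, and its iterator advances by jumping to the next set bit with `absl::countr_zero`. A standalone sketch of that bit-mask-as-set idea (the C++20 `<bit>` header replaces Abseil here, and the enum values are made up rather than the real `DT_*` constants):

```cpp
#include <bit>  // std::countr_zero, std::popcount (assumes C++20)
#include <cstdint>
#include <iostream>

// Bit i set means "enum value i is in the set".
constexpr std::uint32_t ToBit(int enum_value) { return 1u << enum_value; }

int main() {
  // Hypothetical enum values 1, 4 and 9 as stand-ins for DT_* constants.
  std::uint32_t mask = ToBit(1) | ToBit(4) | ToBit(9);

  std::cout << "size = " << std::popcount(mask) << "\n";

  // Iterate by repeatedly finding the lowest set bit, mirroring
  // Iterator::operator++ in the header above.
  for (std::uint32_t m = mask; m != 0; m &= m - 1) {
    std::cout << "contains enum value " << std::countr_zero(m) << "\n";
  }
  return 0;
}
```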
+bool DataTypeFromString(absl::string_view sp, DataType* dt); + +constexpr inline DataTypeSet ToSet(DataType dt) { + return DataTypeSet(1u << static_cast(dt)); +} + +// DT_FLOAT + kDataTypeRefOffset == DT_FLOAT_REF, etc. +enum { kDataTypeRefOffset = 100 }; +inline bool IsRefType(DataType dtype) { + return dtype > static_cast(kDataTypeRefOffset); +} +inline DataType MakeRefType(DataType dtype) { + DCHECK(!IsRefType(dtype)); + return static_cast(dtype + kDataTypeRefOffset); +} +inline DataType RemoveRefType(DataType dtype) { + DCHECK(IsRefType(dtype)); + return static_cast(dtype - kDataTypeRefOffset); +} +inline DataType BaseType(DataType dtype) { + return IsRefType(dtype) ? RemoveRefType(dtype) : dtype; +} + +// Returns true if the actual type is the same as or ref of the expected type. +inline bool TypesCompatible(DataType expected, DataType actual) { + return expected == actual || expected == BaseType(actual); +} + +// Does not include _ref types. +constexpr DataTypeSet kAllTypes = + ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_UINT8) | + ToSet(DT_INT16) | ToSet(DT_UINT16) | ToSet(DT_INT8) | ToSet(DT_STRING) | + ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_INT64) | + ToSet(DT_BOOL) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) | ToSet(DT_QINT16) | + ToSet(DT_QUINT16) | ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_RESOURCE) | + ToSet(DT_VARIANT) | ToSet(DT_UINT32) | ToSet(DT_UINT64) | + ToSet(DT_BFLOAT16) | ToSet(DT_FLOAT8_E5M2) | ToSet(DT_FLOAT8_E4M3FN) | + ToSet(DT_INT4) | ToSet(DT_UINT4); + +inline const DataTypeSet& AllTypes() { return kAllTypes; } + +#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) + +// Types that support '<' and '>'. +constexpr DataTypeSet kRealNumberTypes = + ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_INT64) | + ToSet(DT_UINT8) | ToSet(DT_INT16) | ToSet(DT_INT8) | ToSet(DT_UINT16) | + ToSet(DT_HALF) | ToSet(DT_UINT32) | ToSet(DT_UINT64) | ToSet(DT_BFLOAT16); +inline const DataTypeSet& RealNumberTypes() { return kRealNumberTypes; } + +// Return the list of all numeric types. +// Includes complex and quantized types. +// NOTE: On Android, we only include the float and int32 types for now. +const DataTypeSet kNumberTypes = + ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT64) | ToSet(DT_INT32) | + ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) | + ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_QINT8) | + ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32) | + ToSet(DT_HALF) | ToSet(DT_UINT32) | ToSet(DT_UINT64) | ToSet(DT_BFLOAT16); +inline const DataTypeSet& NumberTypes() { return kNumberTypes; } + +constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT16) | ToSet(DT_QUINT16) | + ToSet(DT_QINT32); +inline const DataTypeSet& QuantizedTypes() { return kQuantizedTypes; } + +// Types that support '<' and '>', including quantized types. 
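The `kDataTypeRefOffset` scheme above turns reference types into plain integer arithmetic: `DT_FLOAT + 100 == DT_FLOAT_REF`, and "compatible" means the actual type equals the expected type or is a ref of it. A small sketch with the same offset and two representative enum values (the full numbering lives in the generated `types.pb.h`):

```cpp
#include <cassert>

// Representative stand-ins for the proto-generated enum; only the offset
// relationship matters for this sketch.
enum DataType { DT_FLOAT = 1, DT_FLOAT_REF = 101 };
enum { kDataTypeRefOffset = 100 };

constexpr bool IsRefType(int dt) { return dt > kDataTypeRefOffset; }
constexpr int MakeRefType(int dt) { return dt + kDataTypeRefOffset; }
constexpr int RemoveRefType(int dt) { return dt - kDataTypeRefOffset; }
constexpr int BaseType(int dt) { return IsRefType(dt) ? RemoveRefType(dt) : dt; }

// Equal, or the actual type is a ref of the expected base type.
constexpr bool TypesCompatible(int expected, int actual) {
  return expected == actual || expected == BaseType(actual);
}

int main() {
  static_assert(MakeRefType(DT_FLOAT) == DT_FLOAT_REF, "offset scheme");
  static_assert(BaseType(DT_FLOAT_REF) == DT_FLOAT, "round trip");
  assert(TypesCompatible(DT_FLOAT, DT_FLOAT_REF));   // ref satisfies base
  assert(!TypesCompatible(DT_FLOAT_REF, DT_FLOAT));  // base does not satisfy ref
  return 0;
}
```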
+const DataTypeSet kRealAndQuantizedTypes = + ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_INT64) | + ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) | + ToSet(DT_QINT8) | ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) | + ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_BFLOAT16); +inline const DataTypeSet& RealAndQuantizedTypes() { + return kRealAndQuantizedTypes; +} + +#elif defined(__ANDROID_TYPES_FULL__) + +constexpr DataTypeSet kRealNumberTypes = + ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_HALF); +inline DataTypeSet RealNumberTypes() { return kRealNumberTypes; } + +constexpr DataTypeSet kNumberTypes = + ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_QINT8) | + ToSet(DT_QUINT8) | ToSet(DT_QINT32) | ToSet(DT_HALF); +inline DataTypeSet NumberTypes() { return kNumberTypes; } + +constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT16) | ToSet(DT_QUINT16) | + ToSet(DT_QINT32); +inline DataTypeSet QuantizedTypes() { return kQuantizedTypes; } + +constexpr DataTypeSet kRealAndQuantizedTypes = + ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_QINT8) | + ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32) | + ToSet(DT_HALF); +inline DataTypeSet RealAndQuantizedTypes() { return kRealAndQuantizedTypes; } + +#else // defined(IS_MOBILE_PLATFORM) && !defined(__ANDROID_TYPES_FULL__) + +constexpr DataTypeSet kRealNumberTypes = ToSet(DT_FLOAT) | ToSet(DT_INT32); +inline DataTypeSet RealNumberTypes() { return kRealNumberTypes; } + +constexpr DataTypeSet kNumberTypes = ToSet(DT_FLOAT) | ToSet(DT_INT32) | + ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT32); +inline DataTypeSet NumberTypes() { return kNumberTypes; } + +constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT16) | ToSet(DT_QUINT16) | + ToSet(DT_QINT32); +inline DataTypeSet QuantizedTypes() { return kQuantizedTypes; } + +constexpr DataTypeSet kRealAndQuantizedTypes = + ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32); +inline DataTypeSet RealAndQuantizedTypes() { return kRealAndQuantizedTypes; } + +#endif // defined(IS_MOBILE_PLATFORM) + +// Validates type T for whether it is a supported DataType. +template +struct IsValidDataType; + +// DataTypeToEnum::v() and DataTypeToEnum::value are the DataType +// constants for T, e.g. DataTypeToEnum::v() is DT_FLOAT. +template +struct DataTypeToEnum { + static_assert(IsValidDataType::value, "Specified Data Type not supported"); +}; // Specializations below + +// EnumToDataType::Type is the type for DataType constant VALUE, e.g. +// EnumToDataType::Type is float. +template +struct EnumToDataType {}; // Specializations below + +// Template specialization for both DataTypeToEnum and EnumToDataType. 
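`DataTypeToEnum<T>` and `EnumToDataType<VALUE>` above form a two-way compile-time mapping between C++ types and `DataType` enum values. A self-contained miniature of that trait pair, using illustrative enum values and `int` in place of the real fixed-width typedefs:

```cpp
#include <type_traits>

enum DataType { DT_FLOAT = 1, DT_INT32 = 3 };  // illustrative values only

template <typename T> struct DataTypeToEnum;   // undefined for unsupported T
template <DataType V> struct EnumToDataType;   // undefined for unmapped values

template <> struct DataTypeToEnum<float> {
  static constexpr DataType value = DT_FLOAT;
};
template <> struct EnumToDataType<DT_FLOAT> { using Type = float; };

template <> struct DataTypeToEnum<int> {
  static constexpr DataType value = DT_INT32;
};
template <> struct EnumToDataType<DT_INT32> { using Type = int; };

// Typical use: pick the runtime enum from a template parameter, or the C++
// type from a compile-time enum value.
template <typename T>
constexpr DataType Dtype() { return DataTypeToEnum<T>::value; }

static_assert(Dtype<float>() == DT_FLOAT, "type -> enum");
static_assert(std::is_same<EnumToDataType<DT_INT32>::Type, int>::value,
              "enum -> type");

int main() { return 0; }
```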
+#define MATCH_TYPE_AND_ENUM(TYPE, ENUM) \ + template <> \ + struct DataTypeToEnum { \ + static DataType v() { return ENUM; } \ + static DataType ref() { return MakeRefType(ENUM); } \ + static constexpr DataType value = ENUM; \ + }; \ + template <> \ + struct IsValidDataType { \ + static constexpr bool value = true; \ + }; \ + template <> \ + struct EnumToDataType { \ + typedef TYPE Type; \ + } + +MATCH_TYPE_AND_ENUM(float, DT_FLOAT); +MATCH_TYPE_AND_ENUM(double, DT_DOUBLE); +MATCH_TYPE_AND_ENUM(int32, DT_INT32); +MATCH_TYPE_AND_ENUM(uint32, DT_UINT32); +MATCH_TYPE_AND_ENUM(uint16, DT_UINT16); +MATCH_TYPE_AND_ENUM(uint8, DT_UINT8); +MATCH_TYPE_AND_ENUM(int16, DT_INT16); +MATCH_TYPE_AND_ENUM(int8, DT_INT8); +MATCH_TYPE_AND_ENUM(tstring, DT_STRING); +MATCH_TYPE_AND_ENUM(complex64, DT_COMPLEX64); +MATCH_TYPE_AND_ENUM(complex128, DT_COMPLEX128); +MATCH_TYPE_AND_ENUM(bool, DT_BOOL); +MATCH_TYPE_AND_ENUM(qint8, DT_QINT8); +MATCH_TYPE_AND_ENUM(quint8, DT_QUINT8); +MATCH_TYPE_AND_ENUM(qint16, DT_QINT16); +MATCH_TYPE_AND_ENUM(quint16, DT_QUINT16); +MATCH_TYPE_AND_ENUM(qint32, DT_QINT32); +MATCH_TYPE_AND_ENUM(bfloat16, DT_BFLOAT16); +MATCH_TYPE_AND_ENUM(Eigen::half, DT_HALF); +MATCH_TYPE_AND_ENUM(float8_e5m2, DT_FLOAT8_E5M2); +MATCH_TYPE_AND_ENUM(float8_e4m3fn, DT_FLOAT8_E4M3FN); +MATCH_TYPE_AND_ENUM(int4, DT_INT4); +MATCH_TYPE_AND_ENUM(uint4, DT_UINT4); +MATCH_TYPE_AND_ENUM(ResourceHandle, DT_RESOURCE); +MATCH_TYPE_AND_ENUM(Variant, DT_VARIANT); + +template <> +struct DataTypeToEnum { + static DataType v() { return value; } + static DataType ref() { return MakeRefType(value); } + static constexpr DataType value = sizeof(long) == 4 ? DT_INT32 : DT_INT64; +}; +template <> +struct IsValidDataType { + static constexpr bool value = true; +}; +template <> +struct EnumToDataType { + typedef int64_t Type; +}; + +template <> +struct DataTypeToEnum { + static DataType v() { return value; } + static DataType ref() { return MakeRefType(value); } + static constexpr DataType value = + sizeof(unsigned long) == 4 ? DT_UINT32 : DT_UINT64; +}; +template <> +struct IsValidDataType { + static constexpr bool value = true; +}; +template <> +struct EnumToDataType { + typedef tensorflow::uint64 Type; +}; + +template <> +struct DataTypeToEnum { + static DataType v() { return DT_INT64; } + static DataType ref() { return MakeRefType(DT_INT64); } + static constexpr DataType value = DT_INT64; +}; +template <> +struct IsValidDataType { + static constexpr bool value = true; +}; + +template <> +struct DataTypeToEnum { + static DataType v() { return DT_UINT64; } + static DataType ref() { return MakeRefType(DT_UINT64); } + static constexpr DataType value = DT_UINT64; +}; +template <> +struct IsValidDataType { + static constexpr bool value = true; +}; + +#undef MATCH_TYPE_AND_ENUM + +// All types not specialized are marked invalid. +template +struct IsValidDataType { + static constexpr bool value = false; +}; + +// Extra validity checking; not part of public API. +static_assert(IsValidDataType::value, "Incorrect impl for int64"); +static_assert(IsValidDataType::value, "Incorrect impl for int32"); + +// TODO(jeff): Maybe unify this with Tensor::CanUseDMA, or the underlying +// is_simple in tensor.cc (and possible choose a more general name?) 
+constexpr DataTypeSet kDataTypesCanUseMemcpy = + ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_UINT32) | + ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) | + ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_INT64) | + ToSet(DT_UINT64) | ToSet(DT_BOOL) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) | + ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32) | + ToSet(DT_BFLOAT16) | ToSet(DT_HALF) | ToSet(DT_FLOAT8_E5M2) | + ToSet(DT_FLOAT8_E4M3FN) | ToSet(DT_INT4) | ToSet(DT_UINT4); +inline bool DataTypeCanUseMemcpy(DataType dt) { + return kDataTypesCanUseMemcpy.Contains(dt); +} + +// Returns true iff 'dt' is a real, non-quantized floating point type. +constexpr DataTypeSet kDataTypeIsFloating = + ToSet(DT_HALF) | ToSet(DT_BFLOAT16) | ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | + ToSet(DT_FLOAT8_E4M3FN) | ToSet(DT_FLOAT8_E5M2); +inline bool DataTypeIsFloating(DataType dt) { + return kDataTypeIsFloating.Contains(dt); +} + +// Returns true iff 'dt' is a numeric type. +inline bool DataTypeIsNumeric(DataType dt) { return kNumberTypes.Contains(dt); } + +// Returns true iff 'dt' is a complex type. +constexpr DataTypeSet kDataTypeIsComplex = + ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128); +inline bool DataTypeIsComplex(DataType dt) { + return kDataTypeIsComplex.Contains(dt); +} + +inline bool DataTypeIsQuantized(DataType dt) { + return kQuantizedTypes.Contains(dt); +} + +// Is the dtype nonquantized integral? +constexpr DataTypeSet kDataTypeIsInteger = + ToSet(DT_INT4) | ToSet(DT_UINT4) | ToSet(DT_INT8) | ToSet(DT_UINT8) | + ToSet(DT_INT16) | ToSet(DT_UINT16) | ToSet(DT_INT32) | ToSet(DT_UINT32) | + ToSet(DT_INT64) | ToSet(DT_UINT64); +inline bool DataTypeIsInteger(DataType dt) { + return kDataTypeIsInteger.Contains(dt); +} + +// Is the dtype a signed integral type? +constexpr DataTypeSet kDataTypeIsSigned = ToSet(DT_INT4) | ToSet(DT_INT8) | + ToSet(DT_INT16) | ToSet(DT_INT32) | + ToSet(DT_INT64); +inline bool DataTypeIsSigned(DataType dt) { + return kDataTypeIsSigned.Contains(dt); +} + +// Is the dtype an unsigned integral type? +constexpr DataTypeSet kDataTypeIsUnsigned = ToSet(DT_UINT4) | ToSet(DT_UINT8) | + ToSet(DT_UINT16) | + ToSet(DT_UINT32) | ToSet(DT_UINT64); +inline bool DataTypeIsUnsigned(DataType dt) { + return kDataTypeIsUnsigned.Contains(dt); +} + +// Returns a 0 on failure +int DataTypeSize(DataType dt); + +// Returns HOST_MEMORY if `dtype` is always on host or is a DT_INT32, +// DEVICE_MEMORY otherwise. +MemoryType MTypeFromDType(const DataType dtype); + +// Returns HOST_MEMORY if `dtype` is always on host, DEVICE_MEMORY otherwise. +// The reason we have MTypeFromDType() and MTypeFromDTypeIntsOnDevice(): for +// GPUs, we would like to keep int operations on host for performance concerns. +// But for TPUs (and other devices), int operations are placed on device. +MemoryType MTypeFromDTypeIntsOnDevice(const DataType dtype); + +// Types that always sit on host: DT_STRING, DT_STRING_REF, DT_RESOURCE. +// For DT_RESOURCE, the handle always sits on host (even if the underlying +// object has device-allocated resources). +bool DataTypeAlwaysOnHost(DataType dt); + +// FullType implementation. + +// Reference container for a type definition. These values are usually interned. +// These containers admit a notion of ordering for efficient access. The +// ordering has no semantic otherwise. +struct TypeRef { + std::shared_ptr full_type; + + bool operator==(const TypeRef& other) const { + // TODO(mdan): This should be more efficient. 
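Earlier in this hunk, `kDataTypesCanUseMemcpy` whitelists dtypes whose buffers can be copied byte-for-byte, while string, resource and variant payloads need real copy logic. The standard-library notion that whitelist approximates is trivial copyability; a hedged sketch of that check for ordinary value types:

```cpp
#include <cstring>
#include <string>
#include <type_traits>

// A complex64-like pair of floats, used as a stand-in for DT_COMPLEX64.
struct ComplexLike { float re, im; };

template <typename T>
constexpr bool CanUseMemcpySketch() {
  return std::is_trivially_copyable<T>::value;
}

static_assert(CanUseMemcpySketch<float>(), "float buffers can be memcpy'd");
static_assert(CanUseMemcpySketch<ComplexLike>(),
              "two packed floats are still trivially copyable");
static_assert(!CanUseMemcpySketch<std::string>(),
              "string-like payloads (DT_STRING, DT_VARIANT) need real copies");

int main() {
  float src[4] = {1, 2, 3, 4}, dst[4];
  std::memcpy(dst, src, sizeof(src));  // valid: trivially copyable elements
  return dst[3] == 4 ? 0 : 1;
}
```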
+ return full_type->SerializeAsString() == + other.full_type->SerializeAsString(); + } + bool operator<(const TypeRef& other) const { + return full_type->SerializeAsString() < + other.full_type->SerializeAsString(); + } +}; + +struct TypeHasher { + std::size_t operator()(const TypeRef& k) const { + return std::hash()(k.full_type->SerializeAsString()); + } +}; + +// Maps a legacy DType proto enum to an equivalent FullType ID, +// i.e. sets the type_id of t based on dtype. +void map_dtype_to_tensor(const DataType& dtype, FullTypeDef& t); + +// Set the type id_of t to TFT_TENSOR and add a child arg by mapping +// a legacy DType proto enun to an equivalent FullType ID, e.g. +// if dtype is DT_FLOAT, sets t to TFT_TENSOR[TFT_FLOAT]. +void map_dtype_to_child_of_tensor(const DataType& dtype, FullTypeDef& t); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/variant.h b/third_party/tflite-hdrs/tensorflow/core/framework/variant.h new file mode 100644 index 00000000..152e0538 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/variant.h @@ -0,0 +1,629 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/strcat.h" + +namespace tensorflow { + +template +std::string TypeNameVariant(const T& value); + +template +std::string DebugStringVariant(const T& value); + +// Allows for specializations of Variant Decoding. `data` may be modified in +// the process of decoding to `value`. +template +bool DecodeVariant(VariantTensorData* data, T* value); + +template +bool DecodeVariant(std::string* buf, T* value); + +template +void EncodeVariant(const T& value, VariantTensorData* data); + +template +void EncodeVariant(const T& value, std::string* buf); + +// This is an implementation of a type-erased container that can store an +// object of any type. The implementation is very similar to std::any, but has +// restrictions on the types of objects that can be stored, and eschews some of +// the fancier constructors available for std::any. An object of +// tensorflow::Variant is intended to be used as the value that will be stored +// in a tensorflow::Tensor object when its type is DT_VARIANT. +// +// tensorflow::Variant can store an object of a class that satisfies the +// following constraints: +// +// * The class is CopyConstructible. +// * The class has a default constructor. 
+// * It's either a protocol buffer, a tensorflow::Tensor, or defines the +// following functions: +// +// string TypeName() const; +// void Encode(VariantTensorData* data) const; +// bool Decode(VariantTensorData data); +// +// Simple POD types can elide the Encode/Decode functions, they are provided by +// helper methods. +// Here are some typical usage patterns: +// +// Variant x = 10; +// EXPECT_EQ(*x.get(), 10); +// +// Tensor t(DT_FLOAT, TensorShape({})); +// t.flat()(0) = 42.0f; +// Variant x = t; +// EXPECT_EQ(x.get()->flat()(0), 42.0f); +// +// Accessing the stored object: +// +// The get function is the main mechanism to access the object +// stored in the container. It is type-safe, that is, calling +// get when the stored object's type is not T, returns a +// nullptr. A raw pointer to the stored object can be obtained by calling +// get(). +// +// Serializing/deserializing Variant object: +// +// The Variant class delegates serializing and deserializing operations to the +// contained object. Helper functions to do these operations are provided for +// POD data types, tensorflow::Tensor, and protocol buffer objects. However, +// other classes have to provide Encode/Decode functions to handle +// serialization. +// +// Objects stored in a Variant object often contain references to other +// tensorflow::Tensors of primitive types (Eg., a list of tensorflow::Tensors). +// To efficiently support those use cases, a structure is imposed on the +// serialization format. Namely, classes should serialize their contents into a +// VariantTensorData object: +// +// struct VariantTensorData { +// string type_name; +// string metadata; +// std::vector tensors; +// }; +// +// Objects with references to other Tensors can simply store those tensors in +// the `tensors` field, and serialize other metadata content in to the +// `metadata` field. +// +// Serialization example: +// +// Foo f = Foo {...}; +// Variant x = f; +// string serialized_f; +// x.Encode(&serialized_f); +// +// Variant y = Foo(); // default constructed Foo. +// y.Decode(std::move(serialized_f)); +// EXPECT_EQ(*x.get(), *y.get()); +// +// +// A Variant storing serialized Variant data (a value of type +// VariantTensorDataProto) has different behavior from a standard Variant. +// Namely, its TypeName matches the TypeName of the original Variant; +// and its non-const get method performs lazy deserialization. +// +// Decode and copy example: +// +// Foo f = Foo {...}; +// Variant x = f; +// +// VariantTensorData serialized_data_f; +// VariantTensorDataProto serialized_proto_f; +// x.Encode(&serialized_data_f); +// serialized_data_f.ToProto(&serialized_proto_f); +// +// Variant y_type_unknown = serialized_proto_f; // Store serialized Variant. +// +// EXPECT_EQ(x.TypeName(), y_type_unknown.TypeName()); // Looks like Foo. +// EXPECT_EQ(TypeIndex::Make(), +// y_type_unknown.TypeId()); +// +class Variant { + public: + // Constructs a Variant holding no value (aka `is_empty()`). + // + // This is done by pointing at nullptr via the heap value. + Variant() noexcept : heap_value_(/*pointer=*/nullptr), is_inline_(false) {} + + ~Variant(); + + Variant(const Variant& other); + Variant(Variant&& other) noexcept; + + // Make sure that the type is CopyConstructible and not a + // tensorflow::Variant object itself. We want the copy constructor to be + // chosen for the tensorflow::Variant case. 
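The comment block above spells out the contract for user types stored in a `Variant`: copy- and default-constructible, and either a Tensor, a protocol buffer, or a type providing `TypeName()`/`Encode()`/`Decode()`. A hedged sketch of one such conforming type; `MyCounter` and its field are hypothetical, and the snippet assumes these vendored headers are on the include path and that the translation unit links against a TensorFlow build:

```cpp
#include <cstdint>
#include <string>

#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_tensor_data.h"

struct MyCounter {
  int64_t count = 0;

  // The three members the Variant documentation above asks for.
  std::string TypeName() const { return "MyCounter"; }
  void Encode(tensorflow::VariantTensorData* data) const {
    data->set_metadata(count);
  }
  bool Decode(tensorflow::VariantTensorData data) {
    return data.get_metadata(&count);
  }
};

int main() {
  tensorflow::Variant v = MyCounter{42};

  // Type-safe access: get<T>() returns nullptr on a type mismatch.
  if (const MyCounter* c = v.get<MyCounter>()) {
    return c->count == 42 ? 0 : 1;
  }
  return 1;
}
```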
+ template ::type, + typename std::enable_if::value && + std::is_move_constructible::value, + void>::type* = nullptr> + Variant(T&& value); + + template ::type, + typename std::enable_if::value && + std::is_copy_constructible::value, + void>::type* = nullptr> + Variant(const T& value); + + template ::type, + typename std::enable_if::value && + std::is_copy_constructible::value, + void>::type* = nullptr> + Variant& operator=(const T& value); + + template ::type, + typename std::enable_if::value && + std::is_move_constructible::value, + void>::type* = nullptr> + Variant& operator=(T&& value); + + Variant& operator=(const Variant& rhs) { + if (&rhs == this) return *this; + Variant(rhs).swap(*this); + return *this; + } + + Variant& operator=(Variant&& rhs) noexcept { + if (&rhs == this) return *this; + Variant(std::move(rhs)).swap(*this); + return *this; + } + + // Constructs a value of type T with the given args in-place in this Variant. + // Returns a reference to the newly constructed value. + // The signature is based on std::variant::emplace() in C++17. + template + T& emplace(Args&&... args) { + ResetMemory(); + is_inline_ = CanInlineType(); + if (is_inline_) { + new (&inline_value_) + InlineValue(InlineValue::Tag{}, std::forward(args)...); + return static_cast*>(inline_value_.AsValueInterface()) + ->value; + } else { + new (&heap_value_) HeapValue( + absl::make_unique>(InPlace(), std::forward(args)...)); + return static_cast*>(heap_value_.get())->value; + } + } + + bool is_empty() const { return GetValue() == nullptr; } + + void clear() noexcept; + + void swap(Variant& other) noexcept; + + // Note, unlike TypeName(), TypeId() does not return the TypeIndex + // of the original type when a TensorValueDataProto is stored as the + // value. In this case, it returns the TypeIndex of TensorValueDataProto. + TypeIndex TypeId() const { + const TypeIndex VoidTypeIndex = TypeIndex::Make(); + if (is_empty()) { + return VoidTypeIndex; + } + return GetValue()->TypeId(); + } + + std::string DebugString() const { + return strings::StrCat("Variant"); + } + + std::string SummarizeValue() const { + return is_empty() ? "[empty]" : GetValue()->DebugString(); + } + + // Returns a pointer to the stored value if it is type T, or nullptr + // otherwise. + template + T* get() { + const TypeIndex TTypeIndex = TypeIndex::Make(); + if (is_empty() || (TTypeIndex != TypeId())) return nullptr; + return std::addressof(static_cast*>(GetValue())->value); + } + + // Returns a pointer to the stored value if it is type T, or nullptr + // otherwise. + template + const T* get() const { + const TypeIndex TTypeIndex = TypeIndex::Make(); + if (is_empty() || (TTypeIndex != TypeId())) return nullptr; + return std::addressof( + static_cast*>(GetValue())->value); + } + + // Returns TypeNameVariant(value). + // + // In the special case that a serialized Variant is stored (value + // is a VariantTensorDataProto), returns value.TypeName(), the + // TypeName field stored in the VariantTensorDataProto buffer. + std::string TypeName() const { + if (is_empty()) { + return ""; + } + return GetValue()->TypeName(); + } + + // Serialize the contents of the stored object into `data`. + void Encode(VariantTensorData* data) const { + if (!is_empty()) { + GetValue()->Encode(data); + } + } + + // Deserialize `data` and update the stored object. + bool Decode(VariantTensorData data); + + // Helper methods to directly serialize/deserialize from strings. 
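`emplace<T>()` above mirrors `std::variant::emplace`: it destroys the current contents, decides between inline and heap storage for `T`, constructs the value in place, and returns a reference to it. A hedged usage sketch against that documented signature (`Point` is a hypothetical aggregate payload, and the same build assumptions as the previous sketch apply):

```cpp
#include "tensorflow/core/framework/variant.h"

// Hypothetical payload: a plain aggregate, so the helpers for simple types
// cover Encode/Decode and no member functions are required.
struct Point {
  int x;
  int y;
};

int main() {
  tensorflow::Variant v;

  // emplace<T>() constructs the value in place and returns a reference to the
  // stored object, picking inline vs. heap storage based on T's size/alignment.
  Point& p = v.emplace<Point>(Point{3, 4});
  p.x = 5;  // mutates the value now owned by the Variant

  return (v.get<Point>() != nullptr && v.get<Point>()->x == 5) ? 0 : 1;
}
```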
+ void Encode(std::string* buf) const { + if (!is_empty()) { + GetValue()->Encode(buf); + } + } + bool Decode(std::string buf) { + if (!is_empty()) { + return GetValue()->Decode(std::move(buf)); + } + return true; + } + + template + static constexpr bool CanInlineType() { + return ((sizeof(Value) <= InlineValue::kMaxValueSize) && + (alignof(Value) <= kMaxInlineValueAlignSize)); + } + + private: + struct in_place_t {}; + static constexpr in_place_t InPlace() { return in_place_t{}; } + + struct ValueInterface { + virtual ~ValueInterface() = default; + virtual TypeIndex TypeId() const = 0; + virtual void* RawPtr() = 0; + virtual const void* RawPtr() const = 0; + virtual std::unique_ptr Clone() const = 0; + virtual void CloneInto(ValueInterface* memory) const = 0; + virtual void MoveAssign(ValueInterface* memory) = 0; + virtual void MoveInto(ValueInterface* memory) = 0; + virtual std::string TypeName() const = 0; + virtual std::string DebugString() const = 0; + virtual void Encode(VariantTensorData* data) const = 0; + virtual bool Decode(VariantTensorData data) = 0; + virtual void Encode(std::string* buf) const = 0; + virtual bool Decode(std::string data) = 0; + }; + + template + struct Value final : ValueInterface { + template + explicit Value(in_place_t /*tag*/, Args&&... args) + : value(std::forward(args)...) {} + + // NOTE(ebrevdo): Destructor must be explicitly defined for CUDA to happily + // build `alignof(Variant)`. + ~Value() final = default; + + TypeIndex TypeId() const final { + const TypeIndex value_type_index = + TypeIndex::Make::type>(); + return value_type_index; + } + + void* RawPtr() final { return &value; } + + const void* RawPtr() const final { return &value; } + + std::unique_ptr Clone() const final { + return absl::make_unique(InPlace(), value); + } + + void MoveAssign(ValueInterface* memory) final { + CHECK(TypeId() == memory->TypeId()) + << TypeId().name() << " vs. " << memory->TypeId().name(); + static_cast(memory)->value = std::move(value); + } + + void CloneInto(ValueInterface* memory) const final { + new (memory) Value(InPlace(), value); + } + + void MoveInto(ValueInterface* memory) final { + new (memory) Value(InPlace(), std::move(value)); + } + + std::string TypeName() const final { return TypeNameVariant(value); } + + std::string DebugString() const final { return DebugStringVariant(value); } + + void Encode(VariantTensorData* data) const final { + EncodeVariant(value, data); + } + + bool Decode(VariantTensorData data) final { + return DecodeVariant(&data, &value); + } + + void Encode(std::string* buf) const final { EncodeVariant(value, buf); } + + bool Decode(std::string buf) final { return DecodeVariant(&buf, &value); } + + T value; + }; + static constexpr int kMaxInlineValueAlignSize = alignof(Value); + + using HeapValue = std::unique_ptr; + + struct InlineValue { + // We try to size InlineValue so that sizeof(Variant) <= 64 and it can fit + // into the aligned space of a TensorBuffer. + static constexpr int kMaxValueSize = (64 - /*some extra padding=*/8); + + typedef char ValueDataArray[kMaxValueSize]; + alignas(kMaxInlineValueAlignSize) ValueDataArray value_data; + + // Tag is used for deducing the right type when constructing a Value in + // place. + template + struct Tag {}; + + template + explicit InlineValue(Tag /*tag*/, Args&&... 
args) noexcept { + Value* inline_value_data = reinterpret_cast*>(value_data); + new (inline_value_data) Value(InPlace(), std::forward(args)...); + } + + InlineValue(const InlineValue& other) noexcept { + other.AsValueInterface()->CloneInto(AsValueInterface()); + } + + InlineValue(InlineValue&& other) noexcept { + other.AsValueInterface()->MoveInto(AsValueInterface()); + } + + void ResetMemory() { AsValueInterface()->~ValueInterface(); } + + InlineValue& operator=(const InlineValue& other) { + if (&other == this) return *this; + ResetMemory(); + other.AsValueInterface()->CloneInto(AsValueInterface()); + return *this; + } + + InlineValue& operator=(InlineValue&& other) { + if (&other == this) return *this; + if (AsValueInterface()->TypeId() == other.AsValueInterface()->TypeId()) { + other.AsValueInterface()->MoveAssign(AsValueInterface()); + } else { + ResetMemory(); + other.AsValueInterface()->MoveInto(AsValueInterface()); + } + return *this; + } + + ValueInterface* AsValueInterface() { + return reinterpret_cast(value_data); + } + + const ValueInterface* AsValueInterface() const { + return reinterpret_cast(value_data); + } + + ~InlineValue() { ResetMemory(); } + }; + + union { + HeapValue heap_value_; + InlineValue inline_value_; + }; + // is_inline_ provides discrimination between which member of the prior union + // is currently within it's lifetime. To switch from one member to the other, + // the destructor must be called on the currently alive member before calling + // the constructor on the other member. In effect, a member is expected to be + // live at any given time and that member is tracked via this boolean. + bool is_inline_; + + bool IsInlineValue() const { return is_inline_; } + + // ResetMemory causes the destructor of the currently active member of the + // union to be run. This must be follwed with a placement new call on the + // member whose lifetime is to start. Additionally, is_inline_ needs to be set + // accordingly. ResetAndSetInline and ResetAndSetHeap are simple helper + // functions for performing the actions that are required to follow. + void ResetMemory() { + if (IsInlineValue()) { + inline_value_.~InlineValue(); + } else { + heap_value_.~HeapValue(); + } + } + + // ResetAndSetInline clears the current state and then constructs a new value + // inline with the provided arguments. + template + void ResetAndSetInline(Args&&... args) noexcept { + ResetMemory(); + new (&inline_value_) InlineValue(std::forward(args)...); + is_inline_ = true; + } + + // ResetAndSetHeap clears the current state then constructs a new value on the + // heap with the provided arguments. + template + void ResetAndSetHeap(Args&&... args) noexcept { + ResetMemory(); + new (&heap_value_) HeapValue(std::forward(args)...); + is_inline_ = false; + } + + ValueInterface* GetValue() { + if (IsInlineValue()) { + return inline_value_.AsValueInterface(); + } else { + return heap_value_.get(); + } + } + + const ValueInterface* GetValue() const { + if (IsInlineValue()) { + return inline_value_.AsValueInterface(); + } else { + return heap_value_.get(); + } + } + + // PRECONDITION: Called on construction or ResetMemory() has been called + // before this method. + template + void InsertValue(T&& value) { + if (IsInlineValue()) { + new (&inline_value_) + InlineValue(InlineValue::Tag{}, std::forward(value)); + } else { + new (&heap_value_) HeapValue( + absl::make_unique>(InPlace(), std::forward(value))); + } + } +}; + +// Make sure that a Variant object can reside in a 64-byte aligned Tensor +// buffer. 
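The union of `HeapValue` and `InlineValue` above, discriminated by `is_inline_`, is a small-buffer optimization: values whose `Value<T>` wrapper fits the 56-byte aligned buffer are stored in place, everything else goes behind a `unique_ptr`. A reduced standalone sketch of just the sizing decision (`alignof(std::max_align_t)` is an assumed stand-in for `kMaxInlineValueAlignSize`, and the real check measures `sizeof(Value<T>)`, which also carries a vtable pointer):

```cpp
#include <array>
#include <cstddef>
#include <iostream>

// 64 bytes total minus ~8 bytes of bookkeeping, as in the header above.
constexpr std::size_t kMaxValueSize = 64 - 8;
constexpr std::size_t kMaxAlign = alignof(std::max_align_t);  // assumed stand-in

template <typename T>
constexpr bool CanInline() {
  return sizeof(T) <= kMaxValueSize && alignof(T) <= kMaxAlign;
}

int main() {
  std::cout << std::boolalpha
            << "int inlines: " << CanInline<int>() << "\n"
            << "array<char,48> inlines: " << CanInline<std::array<char, 48>>()
            << "\n"
            << "array<char,128> inlines (no, goes to heap): "
            << CanInline<std::array<char, 128>>() << "\n";
  return 0;
}
```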
+static_assert(sizeof(Variant) <= 64, + "Expected internal representation to be 64 bytes."); + +inline Variant::Variant(const Variant& other) + : is_inline_(other.IsInlineValue()) { + if (IsInlineValue()) { + new (&inline_value_) InlineValue(other.inline_value_); + } else { + new (&heap_value_) + HeapValue(other.heap_value_ ? other.heap_value_->Clone() : nullptr); + } +} + +inline Variant::Variant(Variant&& other) noexcept + : is_inline_(other.IsInlineValue()) { + if (IsInlineValue()) { + new (&inline_value_) InlineValue(std::move(other.inline_value_)); + } else { + new (&heap_value_) HeapValue(std::move(other.heap_value_)); + } +} + +template ::value && + std::is_move_constructible::value, + void>::type*> +inline Variant::Variant(T&& value) : is_inline_(CanInlineType()) { + InsertValue(std::forward(value)); +} + +template ::value && + std::is_copy_constructible::value, + void>::type*> +inline Variant::Variant(const T& value) : is_inline_(CanInlineType()) { + InsertValue(value); +} + +template ::value && + std::is_move_constructible::value, + void>::type*> +inline Variant& Variant::operator=(T&& value) { + ResetMemory(); + is_inline_ = CanInlineType(); + InsertValue(std::forward(value)); + return *this; +} + +template ::value && + std::is_copy_constructible::value, + void>::type*> +inline Variant& Variant::operator=(const T& value) { + ResetMemory(); + is_inline_ = CanInlineType(); + InsertValue(value); + return *this; +} + +inline void Variant::clear() noexcept { + // We set the internal unique_ptr to nullptr so that we preserve the + // invariant that one of the two states must be set at all times. nullptr + // indicates that the variant is empty. + ResetAndSetHeap(/*pointer=*/nullptr); +} + +inline void Variant::swap(Variant& other) noexcept { + if (is_empty()) { + if (other.IsInlineValue()) { + ResetAndSetInline(std::move(other.inline_value_)); + } else { + ResetAndSetHeap(std::move(other.heap_value_)); + } + other.clear(); + } else if (other.is_empty()) { + if (IsInlineValue()) { + other.ResetAndSetInline(std::move(inline_value_)); + } else { + other.ResetAndSetHeap(std::move(heap_value_)); + } + clear(); + } else { // Both Variants have values. + if (other.IsInlineValue() && IsInlineValue()) { + std::swap(inline_value_, other.inline_value_); + } else if (!other.IsInlineValue() && !IsInlineValue()) { + std::swap(heap_value_, other.heap_value_); + } else if (other.IsInlineValue() && !IsInlineValue()) { + HeapValue v = std::move(heap_value_); + ResetAndSetInline(std::move(other.inline_value_)); + other.ResetAndSetHeap(std::move(v)); + } else { // !other.IsInlineValue() && IsInlineValue() + HeapValue v = std::move(other.heap_value_); + other.ResetAndSetInline(std::move(inline_value_)); + ResetAndSetHeap(std::move(v)); + } + } +} + +template <> +void* Variant::get(); + +template <> +const void* Variant::get() const; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/variant_encode_decode.h b/third_party/tflite-hdrs/tensorflow/core/framework/variant_encode_decode.h new file mode 100644 index 00000000..20ceeb93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/variant_encode_decode.h @@ -0,0 +1,284 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/abi.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// Type used for tag-dispatch of the Encode/Decode Variant implementations. This +// template can determine whether the first type parameter `T` is one of the +// following: +// +// * A POD type (TypeResolver) +// * A tensorflow::Tensor (TypeResolver) +// * A protocol buffer (TypeResolver) +// * None of the above (TypeResolver) +// +template ::type>::value, + bool = std::is_same::type, + ::tensorflow::Tensor>::value, + bool = std::is_base_of::type>::value> +struct TypeResolver {}; + +// Specialization for POD type +template +void EncodeVariantImpl(const T& value, TypeResolver, + VariantTensorData* data) { + data->set_metadata(value); +} + +// Specialization for tensorflow::Tensor +template +void EncodeVariantImpl(const T& value, + TypeResolver, + VariantTensorData* data) { + data->tensors_.clear(); + data->tensors_.push_back(value); +} + +// Specialization for protobuf +template +void EncodeVariantImpl(const T& value, + TypeResolver, + VariantTensorData* data) { + if (!value.SerializeToString(&data->metadata_)) { + data->metadata_.clear(); + LOG(ERROR) << "Failed to encode variant " << value.DebugString(); + } +} + +// Specialization for other types +template +void EncodeVariantImpl(const T& value, + TypeResolver, + VariantTensorData* data) { + value.Encode(data); +} + +// Specialization for POD type +template +bool DecodeVariantImpl(VariantTensorData data, + TypeResolver, + T* value) { + return data.get_metadata(value); +} + +// Specialization for tensorflow::Tensor +template +bool DecodeVariantImpl(VariantTensorData data, + TypeResolver, + T* value) { + *value = data.tensors(0); + return true; +} + +// Specialization for protobuf +template +bool DecodeVariantImpl(VariantTensorData data, + TypeResolver, + T* value) { + std::string metadata; + data.get_metadata(&metadata); + return value->ParseFromString(std::move(metadata)); +} + +// Specialization for other types +template +bool DecodeVariantImpl(VariantTensorData data, + TypeResolver, + T* value) { + return value->Decode(std::move(data)); +} + +template +struct has_type_name : std::false_type {}; + +template +struct has_type_name< + C, typename std::enable_if().TypeName()), string>::value>::type> + : std::true_type {}; + +template ::type>::value, + bool = std::is_same::type, + ::tensorflow::Tensor>::value, + bool = std::is_base_of::type>::value> +struct TypeNameResolver {}; + +template +std::string TypeNameVariantImpl(const T& value, + TypeNameResolver) { + return value.TypeName(); +} + +template +std::string TypeNameVariantImpl( + 
const T& value, + TypeNameResolver) { + return "tensorflow::Tensor"; +} + +template +std::string TypeNameVariantImpl( + const T& value, TypeNameResolver) { + return std::string(value.GetTypeName()); +} + +template +std::string TypeNameVariantImpl( + const T& value, + TypeNameResolver) { + return port::MaybeAbiDemangle(TypeIndex::Make().name()); +} + +template +std::string TypeNameVariant(const T& value) { + return TypeNameVariantImpl(value, TypeNameResolver()); +} + +template +struct has_debug_string : std::false_type {}; + +template +struct has_debug_string< + C, typename std::enable_if().DebugString()), string>::value>::type> + : std::true_type {}; + +template +struct can_strcat : std::false_type {}; + +template +struct can_strcat< + C, typename std::enable_if())), string>::value>::type> + : std::true_type {}; + +template ::type>::value, + bool = can_strcat::type>::value> +struct DebugStringResolver {}; + +// TODO(ebrevdo): Expand DebugStringResolver to return TypeString if +// there is no StrCat() constructor. +template +std::string DebugStringVariantImpl( + const T& value, DebugStringResolver) { + return value.DebugString(); +} + +template +std::string DebugStringVariantImpl( + const T& value, DebugStringResolver) { + return strings::StrCat(value); +} + +template +std::string DebugStringVariantImpl( + const T& value, DebugStringResolver) { + return "?"; +} + +template +std::string DebugStringVariant(const T& value) { + return DebugStringVariantImpl(value, DebugStringResolver()); +} + +template +void EncodeVariant(const T& value, VariantTensorData* data) { + EncodeVariantImpl(value, TypeResolver(), data); + data->set_type_name(TypeNameVariant(value)); +} + +template +bool DecodeVariant(VariantTensorData* data, T* value) { + return DecodeVariantImpl(std::move(*data), TypeResolver(), value); +} + +template +void EncodeVariant(const T& value, std::string* buf) { + VariantTensorData data; + EncodeVariantImpl(value, TypeResolver(), &data); + data.set_type_name(TypeNameVariant(value)); + DCHECK(buf != nullptr); + data.SerializeToString(buf); +} + +template +bool DecodeVariant(std::string* buf, T* value) { + VariantTensorData data; + if (!data.ParseFromString(*buf)) return false; + if (!DecodeVariantImpl(std::move(data), TypeResolver(), value)) { + return false; + } + return true; +} + +// Specializations for VariantTensorDataProto +template <> +std::string TypeNameVariant(const VariantTensorDataProto& value); + +template <> +void EncodeVariant(const VariantTensorDataProto& value, + VariantTensorData* data); + +template <> +bool DecodeVariant(VariantTensorData* data, VariantTensorDataProto* value); + +template <> +void EncodeVariant(const VariantTensorDataProto& value, std::string* buf); + +template <> +bool DecodeVariant(std::string* buf, VariantTensorDataProto* value); + +// Encodes an array of Variant objects in to the given StringListEncoder. +// `variant_array` is assumed to point to an array of `n` Variant objects. +void EncodeVariantList(const Variant* variant_array, int64_t n, + std::unique_ptr e); + +// Decodes an array of Variant objects from the given StringListDecoder. +// `variant_array` is assumed to point to an array of `n` Variant objects. 
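`EncodeVariantImpl`/`DecodeVariantImpl` above select an overload at compile time by folding three predicates (POD, `tensorflow::Tensor`, protobuf subclass) into the bool parameters of an empty `TypeResolver` tag. A standalone miniature of that tag-dispatch technique, with two made-up categories standing in for the real predicates:

```cpp
#include <iostream>
#include <string>
#include <type_traits>

// Empty tag whose bool parameters encode which category T falls into,
// mirroring TypeResolver<T, is_pod, is_tensor, is_proto> above.
template <typename T,
          bool is_trivial = std::is_trivially_copyable<T>::value,
          bool is_string =
              std::is_same<typename std::decay<T>::type, std::string>::value>
struct ResolverSketch {};

// Chosen for trivially copyable payloads (the "POD" branch).
template <typename T>
std::string EncodeSketch(const T&, ResolverSketch<T, true, false>) {
  return "encoded as raw metadata bytes";
}

// Chosen for strings (stand-in for the Tensor/proto branches).
template <typename T>
std::string EncodeSketch(const T& value, ResolverSketch<T, false, true>) {
  return "encoded by copying the string: " + value;
}

// Fallback: delegate to a member Encode(), like the "other types" branch.
template <typename T>
std::string EncodeSketch(const T& value, ResolverSketch<T, false, false>) {
  return value.Encode();
}

struct CustomPayload {
  std::string name;  // non-trivial member, so the fallback branch is chosen
  std::string Encode() const { return "encoded by the type itself"; }
};

int main() {
  std::cout << EncodeSketch(42, ResolverSketch<int>{}) << "\n";
  std::cout << EncodeSketch(std::string("hi"), ResolverSketch<std::string>{})
            << "\n";
  std::cout << EncodeSketch(CustomPayload{}, ResolverSketch<CustomPayload>{})
            << "\n";
  return 0;
}
```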
+bool DecodeVariantList(std::unique_ptr d, + Variant* variant_array, int64_t n); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/variant_op_registry.h b/third_party/tflite-hdrs/tensorflow/core/framework/variant_op_registry.h new file mode 100644 index 00000000..c7d8680d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/variant_op_registry.h @@ -0,0 +1,596 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_ + +#include +#include +#include + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/abi.h" + +namespace tensorflow { + +class OpKernelContext; +// A global UnaryVariantOpRegistry is used to hold callback functions +// for different variant types. To be used by ShapeOp, RankOp, and +// SizeOp, decoding, etc. + +enum VariantUnaryOp { + INVALID_VARIANT_UNARY_OP = 0, + ZEROS_LIKE_VARIANT_UNARY_OP = 1, + CONJ_VARIANT_UNARY_OP = 2, +}; + +const char* VariantUnaryOpToString(VariantUnaryOp op); + +enum VariantBinaryOp { + INVALID_VARIANT_BINARY_OP = 0, + ADD_VARIANT_BINARY_OP = 1, +}; + +const char* VariantBinaryOpToString(VariantBinaryOp op); + +enum VariantDeviceCopyDirection { + INVALID_DEVICE_COPY_DIRECTION = 0, + HOST_TO_DEVICE = 1, + DEVICE_TO_HOST = 2, + DEVICE_TO_DEVICE = 3, +}; + +class UnaryVariantOpRegistry; +extern UnaryVariantOpRegistry* UnaryVariantOpRegistryGlobal(); + +class UnaryVariantOpRegistry { + public: + typedef std::function VariantDecodeFn; + typedef std::function + VariantUnaryOpFn; + typedef std::function + VariantBinaryOpFn; + + // An AsyncTensorDeviceCopyFn is a function provided to + // the user-provided DeviceCopyFn callback as the third argument ("copier"). + // + // Expected inputs: + // from: A Tensor on the host (if performing cpu->gpu copy), or + // device (if performing gpu->cpu or gpu->gpu copy). + // to: An empty/uninitialized tensor. It will be updated upon + // successful return of the function with the correct dtype and shape. + // However, the copied data will not be available until the compute + // stream has been synchronized. + // + // Returns: + // The status upon memory allocation / initialization of the + // "to" tensor, and enqueue of the copy onto the compute stream. + // Any failure of the copy itself will update the underlying + // stream status and propagate through the runtime independent + // of the caller. 
+ typedef std::function + AsyncTensorDeviceCopyFn; + + // The AsyncVariantDeviceCopyFn is the signature of the 'device_copy_fn' + // expected to be passed to the registration macro + // INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION. + typedef std::function + AsyncVariantDeviceCopyFn; + + // Add a decode function to the registry. + void RegisterDecodeFn(const std::string& type_name, + const VariantDecodeFn& decode_fn); + + // Returns nullptr if no decode function was found for the given TypeName. + VariantDecodeFn* GetDecodeFn(absl::string_view type_name); + + // Add a copy-to-GPU function to the registry. + void RegisterDeviceCopyFn(const VariantDeviceCopyDirection direction, + const TypeIndex& type_index, + const AsyncVariantDeviceCopyFn& device_copy_fn) { + AsyncVariantDeviceCopyFn* existing = GetDeviceCopyFn(direction, type_index); + CHECK_EQ(existing, nullptr) + << "UnaryVariantDeviceCopy for direction: " << direction + << " and type_index: " << port::MaybeAbiDemangle(type_index.name()) + << " already registered"; + device_copy_fns.insert( + std::pair, + AsyncVariantDeviceCopyFn>( + std::make_pair(direction, type_index), device_copy_fn)); + } + + // Returns nullptr if no copy function was found for the given + // TypeName and direction. + AsyncVariantDeviceCopyFn* GetDeviceCopyFn( + const VariantDeviceCopyDirection direction, const TypeIndex& type_index) { + auto found = device_copy_fns.find(std::make_pair(direction, type_index)); + if (found == device_copy_fns.end()) return nullptr; + return &found->second; + } + + // Add a unary op function to the registry. + void RegisterUnaryOpFn(VariantUnaryOp op, const std::string& device, + const TypeIndex& type_index, + const VariantUnaryOpFn& unary_op_fn) { + VariantUnaryOpFn* existing = GetUnaryOpFn(op, device, type_index); + CHECK_EQ(existing, nullptr) + << "Unary VariantUnaryOpFn for type_index: " + << port::MaybeAbiDemangle(type_index.name()) + << " already registered for device type: " << device; + unary_op_fns.insert(std::pair, VariantUnaryOpFn>( + {op, GetPersistentStringPiece(device), type_index}, unary_op_fn)); + } + + // Returns nullptr if no unary op function was found for the given + // op, device, and TypeName. + VariantUnaryOpFn* GetUnaryOpFn(VariantUnaryOp op, absl::string_view device, + const TypeIndex& type_index) { + auto found = unary_op_fns.find({op, device, type_index}); + if (found == unary_op_fns.end()) return nullptr; + return &found->second; + } + + // Add a binary op function to the registry. + void RegisterBinaryOpFn(VariantBinaryOp op, const std::string& device, + const TypeIndex& type_index, + const VariantBinaryOpFn& add_fn) { + VariantBinaryOpFn* existing = GetBinaryOpFn(op, device, type_index); + CHECK_EQ(existing, nullptr) + << "Unary VariantBinaryOpFn for type_index: " + << port::MaybeAbiDemangle(type_index.name()) + << " already registered for device type: " << device; + binary_op_fns.insert( + std::pair, VariantBinaryOpFn>( + {op, GetPersistentStringPiece(device), type_index}, add_fn)); + } + + // Returns nullptr if no binary op function was found for the given + // op, device and TypeName. 
+ VariantBinaryOpFn* GetBinaryOpFn(VariantBinaryOp op, absl::string_view device, + const TypeIndex& type_index) { + auto found = binary_op_fns.find({op, device, type_index}); + if (found == binary_op_fns.end()) return nullptr; + return &found->second; + } + + // Get a pointer to a global UnaryVariantOpRegistry object + static UnaryVariantOpRegistry* Global() { + return UnaryVariantOpRegistryGlobal(); + } + + // Get a pointer to a global persistent string storage object. + // ISO/IEC C++ working draft N4296 clarifies that insertion into an + // std::unordered_set does not invalidate memory locations of + // *values* inside the set (though it may invalidate existing + // iterators). In other words, one may safely point a StringPiece to + // a value in the set without that StringPiece being invalidated by + // future insertions. + static std::unordered_set* PersistentStringStorage(); + + private: + struct TypeIndexHash { + std::size_t operator()(const TypeIndex& x) const { return x.hash_code(); } + }; + + gtl::FlatMap + decode_fns; + + // Map std::pair to function. + struct PairHash { + template + std::size_t operator()(const std::pair& x) const { + // The hash of an enum is just its value as a std::size_t. + std::size_t ret = static_cast(std::get<0>(x)); + ret = Hash64Combine(ret, std::get<1>(x).hash_code()); + return ret; + } + }; + + gtl::FlatMap, + AsyncVariantDeviceCopyFn, PairHash> + device_copy_fns; + + // Map std::tuple to function. + + // this breaks by falling victim to "too perfect forwarding" + // see https://stackoverflow.com/questions/44475317/variadic-template-issue + // and references therein + template + struct FuncTuple { + FuncTuple(const Op& op, const absl::string_view& dev, + const TypeIndex& type_index) + : op_type_(op), device_(dev), type_index_(type_index) {} + Op op_type_; + absl::string_view device_; + TypeIndex type_index_; + }; + // friend declaration for operator== + // needed for clang + template + friend bool operator==(const FuncTuple& l, const FuncTuple& r); + struct TupleHash { + template + std::size_t operator()( + const std::tuple& x) const { + // The hash of an enum is just its value as a std::size_t. + std::size_t ret = static_cast(std::get<0>(x)); + ret = Hash64Combine(ret, sp_hasher_(std::get<1>(x))); + ret = Hash64Combine(ret, std::get<2>(x).hash_code()); + return ret; + } + + template + std::size_t operator()(const FuncTuple& x) const { + // The hash of an enum is just its value as a std::size_t. + std::size_t ret = static_cast(x.op_type_); + ret = Hash64Combine(ret, sp_hasher_(x.device_)); + ret = Hash64Combine(ret, x.type_index_.hash_code()); + return ret; + } + StringPieceHasher sp_hasher_; + }; + gtl::FlatMap, VariantUnaryOpFn, TupleHash> + unary_op_fns; + gtl::FlatMap, VariantBinaryOpFn, TupleHash> + binary_op_fns; + + // Find or insert a string into a persistent string storage + // container; return the StringPiece pointing to the permanent string + // location. 
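The `PersistentStringStorage()` comment above notes that inserting into a `std::unordered_set` never invalidates references to existing elements, which is what lets the interning helper that follows hand out stable `string_view`s into stored device names. A standalone sketch of that interning trick (`std::string_view` plays the role of `absl::string_view`):

```cpp
#include <cassert>
#include <string>
#include <string_view>  // assumes C++17
#include <unordered_set>

// Node-based containers never move their elements, so views into the stored
// strings stay valid even as more names are inserted later.
std::string_view Intern(std::unordered_set<std::string>& storage,
                        const std::string& s) {
  auto inserted = storage.insert(s);          // no-op if already present
  return std::string_view(*inserted.first);   // points at the stored copy
}

int main() {
  std::unordered_set<std::string> storage;
  std::string_view cpu = Intern(storage, "CPU");

  // Force rehashing / further insertions; `cpu` must remain valid.
  for (int i = 0; i < 1000; ++i) Intern(storage, "DEVICE_" + std::to_string(i));

  assert(cpu == "CPU");
  assert(Intern(storage, "CPU").data() == cpu.data());  // same interned buffer
  return 0;
}
```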
+ static absl::string_view GetPersistentStringPiece(const std::string& str) { + const auto string_storage = PersistentStringStorage(); + auto found = string_storage->find(str); + if (found == string_storage->end()) { + auto inserted = string_storage->insert(str); + return absl::string_view(*inserted.first); + } else { + return absl::string_view(*found); + } + } +}; +template +inline bool operator==(const UnaryVariantOpRegistry::FuncTuple& lhs, + const UnaryVariantOpRegistry::FuncTuple& rhs) { + return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) && + (lhs.type_index_ == rhs.type_index_); +} + +// Decodes the Variant whose data_type has a registered decode +// function. Returns an Internal error if the Variant does not have a +// registered decode function, or if the decoding function fails. +// +// REQUIRES: +// variant is not null. +// +bool DecodeUnaryVariant(Variant* variant); + +// Copies a variant between CPU<->GPU, or between GPU<->GPU. +// The variant 'from' must have a registered DeviceCopyFn for the +// given direction. The returned variant 'to' will have +// (some subset of its) tensors stored on destination according to the +// registered DeviceCopyFn function for the given direction. Returns +// an Internal error if the Variant does not have a registered +// DeviceCopyFn function for the given direction, or if initiating the +// copy fails. +// +// REQUIRES: +// 'to' is not null. +// +absl::Status VariantDeviceCopy( + const VariantDeviceCopyDirection direction, const Variant& from, + Variant* to, + const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy_fn); + +// Sets *v_out = unary_op(v). The variant v must have a registered +// UnaryOp function for the given Device. Returns an Internal error +// if v does not have a registered unary_op function for this device, or if +// UnaryOp fails. +// +// REQUIRES: +// v_out is not null. +// +template +absl::Status UnaryOpVariant(OpKernelContext* ctx, VariantUnaryOp op, + const Variant& v, Variant* v_out) { + const std::string& device = DeviceName::value; + UnaryVariantOpRegistry::VariantUnaryOpFn* unary_op_fn = + UnaryVariantOpRegistry::Global()->GetUnaryOpFn(op, device, v.TypeId()); + if (unary_op_fn == nullptr) { + return errors::Internal("No unary variant unary_op function found for op ", + VariantUnaryOpToString(op), + " Variant type_name: ", v.TypeName(), + " for device type: ", device); + } + return (*unary_op_fn)(ctx, v, v_out); +} + +// Sets *out = binary_op(a, b). The variants a and b must be the same type +// and have a registered binary_op function for the given Device. Returns an +// Internal error if a and b are not the same type_name or if +// if a does not have a registered op function for this device, or if +// BinaryOp fails. +// +// REQUIRES: +// out is not null. +// +template +absl::Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op, + const Variant& a, const Variant& b, + Variant* out) { + if (a.TypeId() != b.TypeId()) { + return errors::Internal( + "BinaryOpVariants: Variants a and b have different " + "type ids. Type names: '", + a.TypeName(), "' vs. 
'", b.TypeName(), "'"); + } + const std::string& device = DeviceName::value; + UnaryVariantOpRegistry::VariantBinaryOpFn* binary_op_fn = + UnaryVariantOpRegistry::Global()->GetBinaryOpFn(op, device, a.TypeId()); + if (binary_op_fn == nullptr) { + return errors::Internal("No unary variant binary_op function found for op ", + VariantBinaryOpToString(op), + " Variant type_name: '", a.TypeName(), + "' for device type: ", device); + } + return (*binary_op_fn)(ctx, a, b, out); +} + +namespace variant_op_registry_fn_registration { + +template +class UnaryVariantDecodeRegistration { + public: + UnaryVariantDecodeRegistration(const std::string& type_name) { + // The Variant is passed by pointer because it should be + // mutable: get below may Decode the variant, which + // is a self-mutating behavior. The variant is not modified in + // any other way. + UnaryVariantOpRegistry::Global()->RegisterDecodeFn( + type_name, [type_name](Variant* v) -> bool { + DCHECK_NE(v, nullptr); + VariantTensorDataProto* t = v->get(); + if (t == nullptr) { + return false; + } + Variant decoded = T(); + VariantTensorData data(std::move(*t)); + if (!decoded.Decode(std::move(data))) { + return false; + } + std::swap(decoded, *v); + return true; + }); + } +}; + +template +class UnaryVariantDeviceCopyRegistration { + public: + typedef std::function + LocalVariantDeviceCopyFn; + UnaryVariantDeviceCopyRegistration( + const VariantDeviceCopyDirection direction, const TypeIndex& type_index, + const LocalVariantDeviceCopyFn& device_copy_fn) { + const std::string type_index_name = + port::MaybeAbiDemangle(type_index.name()); + UnaryVariantOpRegistry::Global()->RegisterDeviceCopyFn( + direction, type_index, + [type_index_name, device_copy_fn]( + const Variant& from, Variant* to, + UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn + device_copy_tensor_fn) -> absl::Status { + DCHECK_NE(to, nullptr); + *to = T(); + if (from.get() == nullptr) { + return errors::Internal( + "VariantCopyToGPUFn: Could not access object, type_index: ", + type_index_name); + } + const T& t = *from.get(); + T* t_out = to->get(); + return device_copy_fn(t, t_out, device_copy_tensor_fn); + }); + } +}; + +template +class UnaryVariantUnaryOpRegistration { + typedef std::function + LocalVariantUnaryOpFn; + + public: + UnaryVariantUnaryOpRegistration(VariantUnaryOp op, const std::string& device, + const TypeIndex& type_index, + const LocalVariantUnaryOpFn& unary_op_fn) { + const std::string type_index_name = + port::MaybeAbiDemangle(type_index.name()); + UnaryVariantOpRegistry::Global()->RegisterUnaryOpFn( + op, device, type_index, + [type_index_name, unary_op_fn](OpKernelContext* ctx, const Variant& v, + Variant* v_out) -> absl::Status { + DCHECK_NE(v_out, nullptr); + *v_out = T(); + if (v.get() == nullptr) { + return errors::Internal( + "VariantUnaryOpFn: Could not access object, type_index: ", + type_index_name); + } + const T& t = *v.get(); + T* t_out = v_out->get(); + return unary_op_fn(ctx, t, t_out); + }); + } +}; + +template +class UnaryVariantBinaryOpRegistration { + typedef std::function + LocalVariantBinaryOpFn; + + public: + UnaryVariantBinaryOpRegistration(VariantBinaryOp op, + const std::string& device, + const TypeIndex& type_index, + const LocalVariantBinaryOpFn& binary_op_fn) { + const std::string type_index_name = + port::MaybeAbiDemangle(type_index.name()); + UnaryVariantOpRegistry::Global()->RegisterBinaryOpFn( + op, device, type_index, + [type_index_name, binary_op_fn](OpKernelContext* ctx, const Variant& a, + const Variant& b, + Variant* out) 
-> absl::Status { + DCHECK_NE(out, nullptr); + *out = T(); + if (a.get() == nullptr) { + return errors::Internal( + "VariantBinaryOpFn: Could not access object 'a', type_index: ", + type_index_name); + } + if (b.get() == nullptr) { + return errors::Internal( + "VariantBinaryOpFn: Could not access object 'b', type_index: ", + type_index_name); + } + const T& t_a = *a.get(); + const T& t_b = *b.get(); + T* t_out = out->get(); + return binary_op_fn(ctx, t_a, t_b, t_out); + }); + } +}; + +}; // namespace variant_op_registry_fn_registration + +// Register a unary decode variant function for the given type. +#define REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, type_name) \ + REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ_HELPER(__COUNTER__, T, type_name) + +#define REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ_HELPER(ctr, T, type_name) \ + REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ(ctr, T, type_name) + +#define REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ(ctr, T, type_name) \ + static ::tensorflow::variant_op_registry_fn_registration:: \ + UnaryVariantDecodeRegistration \ + register_unary_variant_op_decoder_fn_##ctr(type_name) + +// ****** NOTE ****** +// FOR INTERNAL USE ONLY. IF YOU USE THIS WE MAY BREAK YOUR CODE. +// ****** NOTE ****** +// +// Register a device copy variant function for the given copy +// direction and type; where direction is the enum +// VariantDeviceCopyDirection, and the device_copy_fn has signature: +// +// Status device_copy_fn( +// const T& t, T* t_out, +// const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copier); +// +// And device_copy_fn calls copier 0 or more times. For details on +// the behavior of the copier function, see the comments at the +// declaration of UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn. +// +// Note, the device_copy_fn may choose to keep some tensors +// on host, e.g. by assigning to->tensor = from.tensor (assuming +// from.tensor is already on host); or by setting +// to->tensor = Tensor(cpu_allocator(), ...) +// and manually updating its values. +// +// If this is the case, the CopyFns for HOST_TO_DEVICE, +// DEVICE_TO_HOST, and DEVICE_TO_DEVICE must perform host-to-host +// copies in a consistent manner. For example, one must always +// manually copy any "always on host" tensors in all directions instead of e.g. +// - performing a host-to-host copy in one direction, +// - using the provided copier function in the reverse direction. +// Doing the latter will cause program failures. +// +// ****** NOTE ****** +// FOR INTERNAL USE ONLY. IF YOU USE THIS WE MAY BREAK YOUR CODE. 
+// ****** NOTE ****** +#define INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(T, direction, \ + device_copy_fn) \ + INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ_HELPER( \ + __COUNTER__, T, direction, TypeIndex::Make(), device_copy_fn) + +#define INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ_HELPER( \ + ctr, T, direction, type_index, device_copy_fn) \ + INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ( \ + ctr, T, direction, type_index, device_copy_fn) + +#define INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ( \ + ctr, T, direction, type_index, device_copy_fn) \ + static variant_op_registry_fn_registration:: \ + UnaryVariantDeviceCopyRegistration \ + register_unary_variant_op_device_copy_fn_##ctr( \ + direction, type_index, device_copy_fn) + +// Register a unary unary_op variant function with the signature: +// Status UnaryOpFn(OpKernelContext* ctx, const T& t, T* t_out); +// to Variants having TypeIndex type_index, for device string device, +// for UnaryVariantOp enum op. +#define REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(op, device, T, \ + unary_op_function) \ + REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ_HELPER( \ + __COUNTER__, op, device, T, TypeIndex::Make(), unary_op_function) + +#define REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ_HELPER( \ + ctr, op, device, T, type_index, unary_op_function) \ + REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ(ctr, op, device, T, \ + type_index, unary_op_function) + +#define REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ( \ + ctr, op, device, T, type_index, unary_op_function) \ + static ::tensorflow::variant_op_registry_fn_registration:: \ + UnaryVariantUnaryOpRegistration \ + register_unary_variant_op_decoder_fn_##ctr(op, device, type_index, \ + unary_op_function) + +// Register a binary_op variant function with the signature: +// Status BinaryOpFn(OpKernelContext* ctx, const T& a, const T& b, T* out); +// to Variants having TypeIndex type_index, for device string device, +// for BinaryVariantOp enum OP. +#define REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(op, device, T, \ + binary_op_function) \ + REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ_HELPER( \ + __COUNTER__, op, device, T, TypeIndex::Make(), binary_op_function) + +#define REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ_HELPER( \ + ctr, op, device, T, type_index, binary_op_function) \ + REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ( \ + ctr, op, device, T, type_index, binary_op_function) + +#define REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ( \ + ctr, op, device, T, type_index, binary_op_function) \ + static ::tensorflow::variant_op_registry_fn_registration:: \ + UnaryVariantBinaryOpRegistration \ + register_unary_variant_op_decoder_fn_##ctr(op, device, type_index, \ + binary_op_function) + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/variant_tensor_data.h b/third_party/tflite-hdrs/tensorflow/core/framework/variant_tensor_data.h new file mode 100644 index 00000000..bfe5899d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/variant_tensor_data.h @@ -0,0 +1,144 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
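[Editorial aside, not part of the patch] The decode-registration macro above is easiest to see with a concrete payload. The sketch below is illustrative only: `example::MyCounter` is a made-up type that follows the Variant contract (TypeName / Encode / Decode), so the lambda installed by REGISTER_UNARY_VARIANT_DECODE_FUNCTION can round-trip it via DecodeUnaryVariant().

// Illustrative sketch only -- not part of the vendored header.
#include <cstdint>
#include <string>

#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/framework/variant_tensor_data.h"

namespace example {

struct MyCounter {
  int64_t value = 0;

  std::string TypeName() const { return "example::MyCounter"; }

  void Encode(tensorflow::VariantTensorData* data) const {
    data->set_type_name(TypeName());
    data->set_metadata(value);  // POD path: raw bytes in the metadata string
  }

  bool Decode(tensorflow::VariantTensorData data) {
    return data.get_metadata(&value);
  }
};

}  // namespace example

// Keyed by the same string MyCounter reports from TypeName(), so
// DecodeUnaryVariant() can find the decode function after deserialization.
REGISTER_UNARY_VARIANT_DECODE_FUNCTION(example::MyCounter, "example::MyCounter");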
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_ +#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_ + +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class VariantTensorDataProto; + +// The serialization format for Variant objects. Objects with references to +// other Tensors can simply store those tensors in the `tensors` field, and +// serialize other metadata content in to the `metadata` field. Objects can +// optionally set the `type_name` for type-checking before deserializing an +// object. +// +// This is the native C++ class equivalent of VariantTensorDataProto. They are +// separate so that kernels do not need to depend on protos. +class VariantTensorData { + public: + VariantTensorData() = default; + + // TODO(b/118823936): This silently returns if the proto is invalid. + // Consider calling FromProto explicitly instead. + VariantTensorData(VariantTensorDataProto proto); + + // Name of the type of objects being serialized. + const std::string& type_name() const { return type_name_; } + void set_type_name(const std::string& type_name) { type_name_ = type_name; } + + template ::type>::value> + struct PODResolver {}; + + // Portions of the object that are not Tensors. + // Directly supported types include string POD types. + template + void set_metadata(const T& value) { + SetMetadata(value, PODResolver()); + } + + template + bool get_metadata(T* value) const { + return GetMetadata(value, PODResolver()); + } + + std::string& metadata_string() { return metadata_; } + + const std::string& metadata_string() const { return metadata_; } + + // Tensors contained within objects being serialized. + int tensors_size() const; + const Tensor& tensors(int index) const; + const std::vector& tensors() const; + Tensor* add_tensors(); + + // A more general version of add_tensors. Parameters are perfectly forwarded + // to the constructor of the tensor added here. + template + Tensor* add_tensor(TensorConstructorArgs&&... args); + + // Conversion to and from VariantTensorDataProto + void ToProto(VariantTensorDataProto* proto) const; + // This allows optimizations via std::move. + bool FromProto(VariantTensorDataProto proto); + bool FromConstProto(const VariantTensorDataProto& proto); + + // Serialization via VariantTensorDataProto + std::string SerializeAsString() const; + bool SerializeToString(std::string* buf); + bool ParseFromString(std::string s); + + std::string DebugString() const; + + public: + std::string type_name_; + std::string metadata_; + std::vector tensors_; + + private: + void SetMetadata(const std::string& value, + PODResolver) { + metadata_ = value; + } + + bool GetMetadata(std::string* value, + PODResolver) const { + *value = metadata_; + return true; + } + + // Specialize for bool, it is undefined behvaior to assign a non 0/1 value to + // a bool. Now we coerce a non-zero value to true. 
+ bool GetMetadata(bool* value, PODResolver) const { + if (metadata_.size() != sizeof(bool)) return false; + *value = false; + for (size_t i = 0; i < sizeof(bool); ++i) + *value = *value || (metadata_.data()[i] != 0); + return true; + } + + template + void SetMetadata(const T& value, PODResolver) { + metadata_.assign(reinterpret_cast(&value), sizeof(T)); + } + + template + bool GetMetadata(T* value, PODResolver) const { + if (metadata_.size() != sizeof(T)) return false; + std::copy_n(metadata_.data(), sizeof(T), reinterpret_cast(value)); + return true; + } +}; + +// For backwards compatibility for when this was a proto +std::string ProtoDebugString(const VariantTensorData& object); + +template +Tensor* VariantTensorData::add_tensor(TensorConstructorArgs&&... args) { + tensors_.emplace_back(std::forward(args)...); + return &tensors_.back(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/framework/versions.h b/third_party/tflite-hdrs/tensorflow/core/framework/versions.h new file mode 100644 index 00000000..a63ff703 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/framework/versions.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_VERSIONS_H_ +#define TENSORFLOW_CORE_FRAMEWORK_VERSIONS_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class VersionDef; + +// Check whether data with the given versions is compatible with the given +// consumer and min producer. upper_name and lower_name are used to form +// error messages upon failure. Example usage: +// +// #include "tensorflow/core/public/version.h" +// +// TF_RETURN_IF_ERROR(CheckVersions(versions, TF_GRAPH_DEF_VERSION, +// TF_GRAPH_DEF_VERSION_MIN_PRODUCER, +// "GraphDef", "graph")); +absl::Status CheckVersions(const VersionDef& versions, int consumer, + int min_producer, const char* upper_name, + const char* lower_name); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_VERSIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/function/runtime_client/runtime_client.h b/third_party/tflite-hdrs/tensorflow/core/function/runtime_client/runtime_client.h new file mode 100644 index 00000000..789788fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/function/runtime_client/runtime_client.h @@ -0,0 +1,100 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
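[Editorial aside, not part of the patch] A quick sketch of the POD metadata path declared above; the type name string and helper name are made up for illustration.

// Illustrative sketch only -- not part of the vendored header.
#include <cstdint>
#include <string>

#include "tensorflow/core/framework/variant_tensor_data.h"

std::string RoundTripPodMetadata() {
  tensorflow::VariantTensorData data;
  data.set_type_name("example::Payload");  // hypothetical type name
  data.set_metadata(int64_t{42});          // POD path: copied as sizeof(int64_t) raw bytes
  int64_t restored = 0;
  if (!data.get_metadata(&restored)) {     // false when the stored byte count mismatches
    return std::string();
  }
  return data.SerializeAsString();         // serializes via VariantTensorDataProto
}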
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FUNCTION_RUNTIME_CLIENT_RUNTIME_CLIENT_H_ +#define TENSORFLOW_CORE_FUNCTION_RUNTIME_CLIENT_RUNTIME_CLIENT_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace core { +namespace function { + +// TODO(mdan): Get rid of this once pybind can depend on MLIR headers. +// This empty struct serves to hide a pointer to an actual MLIR TFG dialect +// FuncOp object. +struct OpaqueTfgGraphFuncOp; + +// TODO(xjun): Get rid of this once pybind can depend on MLIR headers. +// This empty struct serves to hide a pointer to an actual MLIR TF dialect +// FuncOp object. +struct OpaqueTfFuncOp; + +// This is the current global context managed by the Python API. For historical +// reasons, the Python runtime controls this context and all other clients must +// use it. See tensorflow/python/eager/pywrap_tfe.h and +// tensorflow/python/eager/context.py. +// +// This must always be called after the Python eager context was initialized. +// +// If the Python runtime isn't involved, or when writing code that exclusively +// relies on functions defined in this namespace, users are encouraged to +// maintain their own EagerContext or use GlobalEagerContext. +EagerContext& GlobalPythonEagerContext(); + +// This global context is available for testing and to be shared among various +// APIs. +EagerContext& GlobalEagerContext(); + +using ReturnValues = std::vector; + +// A public API for manipulating and executing functions in a TensorFlow +// runtime. +class Runtime { + public: + explicit Runtime(EagerContext& eager_ctx) : eager_ctx_(eager_ctx) {} + + enum class Dialect { + TFG, + TF, + }; + + absl::StatusOr GetFunctionProto(absl::string_view name); + + // TODO(mdan): Enforce creation or rename to SetFunction. + absl::Status CreateFunction(const FunctionDef& fdef); + // TODO(mdan): Change to mlir::tfg::GraphFuncOp once pybind can depend on it. + absl::Status CreateFunction(OpaqueTfgGraphFuncOp* fop); + // TODO(xjun): Change to mlir::func::FuncOp once pybind can depend on it. + absl::Status CreateFunction(OpaqueTfFuncOp* fop); + // Applies a MLIR pipeline to an existing function. + // The pipeline may rename the function. If it does so, the old function + // remains unchanged. If the new name specifies an existing function, it will + // be overwritten. 
+ absl::Status TransformFunction(absl::string_view name, + absl::string_view pipeline_name, + Dialect dialect = Dialect::TFG); + + absl::StatusOr CallFunction( + absl::string_view name, absl::Span args); + + private: + EagerContext& eager_ctx_; +}; + +} // namespace function +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FUNCTION_RUNTIME_CLIENT_RUNTIME_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/function/testing/test_pass.h b/third_party/tflite-hdrs/tensorflow/core/function/testing/test_pass.h new file mode 100644 index 00000000..93c2116f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/function/testing/test_pass.h @@ -0,0 +1,133 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FUNCTION_TESTING_TEST_PASS_H_ +#define TENSORFLOW_CORE_FUNCTION_TESTING_TEST_PASS_H_ + +#include + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/ir/tf_op_wrapper.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace core { +namespace function { +namespace testing { + +// A simple testing pass for BinaryFunction that replaces an AddV2 node named +// `x_plus_y` with a Mul one. +struct TestPassTfgDialect + : public mlir::PassWrapper> { + TestPassTfgDialect() = default; + + llvm::StringRef getArgument() const final { return "test-pass"; } + + void runOnOperation() override { + auto module = getOperation(); + mlir::OpBuilder builder(module); + mlir::tfg::TFGraphDialect* dialect = + builder.getContext()->getOrLoadDialect(); + + mlir::Operation* target = nullptr; + module->walk([&target](mlir::tfg::TFOp op) { + if (op.nameAttr() == nullptr) { + return; + } + if (op.name() != "x_plus_y") { + return; + } + target = op.getOperation(); + }); + DCHECK(target != nullptr); + + builder.setInsertionPoint(target); + mlir::OperationState opstate(builder.getUnknownLoc(), "tfg.Mul"); + opstate.operands.append(target->getOperands().begin(), + target->getOperands().end()); + opstate.types.append(target->getResultTypes().begin(), + target->getResultTypes().end()); + opstate.addAttribute("T", target->getAttr("T")); + opstate.addAttribute(dialect->getNameAttrIdentifier(), + builder.getStringAttr("x_times_y")); + + mlir::Operation* replacement = builder.create(opstate); + target->replaceAllUsesWith(replacement->getResults()); + target->erase(); + } +}; + +// A simple testing pass that replaces the first Mul node in the module +// to a AddV2 node and names it `x_plus_y`. 
+struct TestPassTfDialect + : public mlir::PassWrapper> { + TestPassTfDialect() = default; + + llvm::StringRef getArgument() const final { return "test-pass-tf-dialect"; } + + void runOnOperation() override { + auto module = getOperation(); + mlir::OpBuilder builder(module); + + mlir::Operation* target = nullptr; + module->walk([&target](mlir::Operation* op) { + if (op->getName().getStringRef() == "tf.Mul") { + target = op; + return; + } + }); + DCHECK(target != nullptr); + + builder.setInsertionPoint(target); + auto replacement = builder.create( + mlir::NameLoc::get( + mlir::StringAttr::get(builder.getContext(), "x_plus_y")), + target->getResultTypes(), target->getOperand(0), target->getOperand(1)); + target->replaceAllUsesWith(replacement->getResults()); + target->erase(); + } +}; + +inline std::unique_ptr> +CreateTfgDialectTestPass() { + return std::make_unique(); +} + +inline std::unique_ptr> +CreateTfDialectTestPass() { + return std::make_unique(); +} + +inline void RegisterTestPass() { + mlir::registerPass([] { return CreateTfgDialectTestPass(); }); + mlir::registerPass([] { return CreateTfDialectTestPass(); }); +} + +} // namespace testing +} // namespace function +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FUNCTION_TESTING_TEST_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/algorithm.h b/third_party/tflite-hdrs/tensorflow/core/graph/algorithm.h new file mode 100644 index 00000000..e20d6823 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/algorithm.h @@ -0,0 +1,154 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_ALGORITHM_H_ +#define TENSORFLOW_CORE_GRAPH_ALGORITHM_H_ + +#include +#include +#include + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +// Comparator for two nodes. This is used in order to get a stable ording. +using NodeComparator = std::function; + +using EdgeFilter = std::function; + +// Compares two node based on their ids. +struct NodeComparatorID { + bool operator()(const Node* n1, const Node* n2) const { + return n1->id() < n2->id(); + } +}; + +// Compare two nodes based on their names. +struct NodeComparatorName { + bool operator()(const Node* n1, const Node* n2) const { + return n1->name() < n2->name(); + } +}; + +// Perform a depth-first-search on g starting at the source node. +// If enter is not empty, calls enter(n) before visiting any children of n. +// If leave is not empty, calls leave(n) after visiting all children of n. +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// If edge_filter is set then ignores edges for which edge_filter returns false. 
+void DFS(const Graph& g, const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +// Perform a depth-first-search on g starting at the 'start' nodes. +// If enter is not empty, calls enter(n) before visiting any children of n. +// If leave is not empty, calls leave(n) after visiting all children of n. +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// If edge_filter is set then ignores edges for which edge_filter returns false. +void DFSFrom(const Graph& g, absl::Span start, + const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); +void DFSFrom(const Graph& g, absl::Span start, + const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +// Perform a reverse depth-first-search on g starting at the sink node. +// If enter is not empty, calls enter(n) before visiting any parents of n. +// If leave is not empty, calls leave(n) after visiting all parents of n. +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// If edge_filter is set then ignores edges for which edge_filter returns false. +void ReverseDFS(const Graph& g, const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +// Perform a reverse depth-first-search on g starting at the 'start' nodes. +// If enter is not empty, calls enter(n) before visiting any parents of n. +// If leave is not empty, calls leave(n) after visiting all parents of n. +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// If edge_filter is set then ignores edges for which edge_filter returns false. +void ReverseDFSFrom(const Graph& g, absl::Span start, + const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); +void ReverseDFSFrom(const Graph& g, absl::Span start, + const std::function& enter, + const std::function& leave, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +void BreadthFirstTraversal( + const Graph& g, absl::Span start, + const std::function& visit, + NodeComparator stable_comparator = NodeComparatorID()); + +void BreadthFirstTraversal( + Graph& g, absl::Span start, + const std::function& visit, + NodeComparator stable_comparator = NodeComparatorID()); + +// Stores in *order the post-order numbering of all nodes +// in graph found via a depth first search starting at the source node. +// +// Note that this is equivalent to reverse topological sorting when the +// graph does not have cycles. +// +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// +// If edge_filter is set then ignores edges for which edge_filter returns +// false. +// +// REQUIRES: order is not NULL. 
+void GetPostOrder(const Graph& g, std::vector* order, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +// Stores in *order the reverse post-order numbering of all nodes +// If stable_comparator is set, a stable ordering of visit is achieved by +// sorting a node's neighbors first before visiting them. +// +// If edge_filter is set then ignores edges for which edge_filter returns +// false. +void GetReversePostOrder(const Graph& g, std::vector* order, + const NodeComparator& stable_comparator = {}, + const EdgeFilter& edge_filter = {}); + +// Prune nodes in "g" that are not in some path from the source node +// to any node in 'nodes'. Returns true if changes were made to the graph. +// Does not fix up source and sink edges. +bool PruneForReverseReachability(Graph* g, + std::unordered_set nodes); + +// Connect all nodes with no incoming edges to source. +// Connect all nodes with no outgoing edges to sink. +// +// Returns true if and only if 'g' is mutated. +bool FixupSourceAndSinkEdges(Graph* g); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_ALGORITHM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/benchmark_testlib.h b/third_party/tflite-hdrs/tensorflow/core/graph/benchmark_testlib.h new file mode 100644 index 00000000..54716405 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/benchmark_testlib.h @@ -0,0 +1,191 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
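[Editorial aside, not part of the patch] A minimal sketch of the traversal helpers above, assuming a tensorflow::Graph built elsewhere; the function name is made up for illustration.

// Illustrative sketch only -- not part of the vendored header.
#include <vector>

#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"

// Reverse-topological visit of an acyclic graph with a deterministic order
// (NodeComparatorName sorts each node's neighbors by name before visiting).
void VisitInPostOrder(const tensorflow::Graph& graph) {
  std::vector<tensorflow::Node*> order;
  tensorflow::GetPostOrder(graph, &order, tensorflow::NodeComparatorName());
  for (tensorflow::Node* n : order) {
    VLOG(1) << n->name() << " (" << n->type_string() << ")";
  }
}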
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_ +#define TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_ + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/simple_philox.h" + +namespace tensorflow { +namespace test { + +REGISTER_OP("Input").Output("y: float"); +REGISTER_OP("Output") + .Input("x: N * float") + .Attr("N: int >= 1") + .Output("y: float"); +REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float"); +REGISTER_OP("In4Out1") + .Input("a: float") + .Input("b: float") + .Input("c: float") + .Input("d: float") + .Output("y: float"); +REGISTER_OP("In8Out1") + .Input("a: float") + .Input("b: float") + .Input("c: float") + .Input("d: float") + .Input("e: float") + .Input("f: float") + .Input("g: float") + .Input("h: float") + .Output("y: float"); +REGISTER_OP("In16Out1") + .Input("a: float") + .Input("b: float") + .Input("c: float") + .Input("d: float") + .Input("e: float") + .Input("f: float") + .Input("g: float") + .Input("h: float") + .Input("i: float") + .Input("j: float") + .Input("k: float") + .Input("l: float") + .Input("m: float") + .Input("n: float") + .Input("o: float") + .Input("p: float") + .Output("y: float"); + +inline GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) { + const int kNumInNodes = 10 * num_edges_per_node; + GraphDef graph_def; + + auto create_node = [](const string& name, const string& op) { + NodeDef node; + node.set_name(name); + node.set_op(op); + return node; + }; + + NodeDef node; + for (int in = 0; in < kNumInNodes; ++in) { + node = create_node(/*name=*/absl::StrFormat("in%04d", in), /*op=*/"Input"); + *graph_def.add_node() = std::move(node); + } + + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + for (int op = 0; op < num_nodes; ++op) { + node = create_node(/*name=*/absl::StrFormat("op%05d", op), + /*op=*/absl::StrFormat("In%dOut1", num_edges_per_node)); + for (int edge = 0; edge < num_edges_per_node; ++edge) { + node.add_input(absl::StrFormat("in%04d", rnd.Uniform(kNumInNodes))); + } + *graph_def.add_node() = std::move(node); + } + + // Add a single sink node. Otherwise a lot of time is spent in + // FixupSourceAndSinkEdges(). 
+ node = create_node(/*name=*/"out", /*op=*/"Output"); + for (int op = 0; op < num_nodes; ++op) { + node.add_input(absl::StrFormat("op%05d", op)); + } + AttrValue attr; + attr.set_i(num_nodes); + node.mutable_attr()->insert({"N", std::move(attr)}); + *graph_def.add_node() = std::move(node); + + return graph_def; +} + +inline GraphDef CreateRandomGraph(int size) { + random::PhiloxRandom philox(0x12345); + random::SimplePhilox rnd(&philox); + + string prefix = "long_node_name_prefix_to_measure_string_copy_overhead"; + + GraphDef graph; + for (int i = 0; i < size; ++i) { + const string name = absl::StrCat(prefix, i); + const uint32 num_inputs = rnd.Uniform(std::min(i, 5)); + + NodeDef node; + node.set_name(name); + for (int n = 0; n < num_inputs; ++n) { + const uint32 input_node = rnd.Uniform(i); + node.add_input(absl::StrCat(prefix, input_node)); + } + + *graph.add_node() = std::move(node); + } + + return graph; +} + +inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins, + int num_regular_fanouts, + int num_controlling_fanins, + int num_controlled_fanouts, + bool fanout_unique_index) { + GraphDef graph; + + auto create_node = [](const string& name) { + NodeDef node; + node.set_name(name); + return node; + }; + + NodeDef node = create_node(/*name=*/"node"); + + for (int i = 0; i < num_regular_fanins; ++i) { + const string input_node_name = absl::StrFormat("in%05d", i); + NodeDef input_node = create_node(/*name=*/input_node_name); + *graph.add_node() = std::move(input_node); + node.add_input(input_node_name); + } + + for (int i = 0; i < num_controlling_fanins; ++i) { + const string input_node_name = absl::StrFormat("control_in%05d", i); + NodeDef input_node = create_node(/*name=*/input_node_name); + *graph.add_node() = std::move(input_node); + node.add_input(absl::StrCat("^", input_node_name)); + } + + for (int i = 0; i < num_regular_fanouts; ++i) { + NodeDef output_node = create_node(/*name=*/absl::StrFormat("out%05d", i)); + const string input_node_index = + fanout_unique_index ? absl::StrCat(node.name(), ":", i) : node.name(); + output_node.add_input(input_node_index); + *graph.add_node() = std::move(output_node); + } + + const string controlled_fanout_input = absl::StrCat("^", node.name()); + for (int i = 0; i < num_controlled_fanouts; ++i) { + NodeDef output_node = + create_node(/*name=*/absl::StrFormat("control_out%05d", i)); + output_node.add_input(controlled_fanout_input); + *graph.add_node() = std::move(output_node); + } + + *graph.add_node() = std::move(node); + + return graph; +} + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/collective_order.h b/third_party/tflite-hdrs/tensorflow/core/graph/collective_order.h new file mode 100644 index 00000000..c62017bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/collective_order.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_ +#define TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +enum class GraphCollectiveOrder { kNone, kEdges, kAttrs }; + +// Introduces a deterministic execution order between potentially concurrent +// CollectiveOps. This may be used to execute collectives in the same order +// across all workers in a distributed execution, if all workers are executing +// the same graph. +// If `order_type` is `kEdges`, introduce the ordering in the form of explicit +// control edges between collective graph nodes. If `order_type` is `kAttrs`, +// add an attribute to the node which may be used by collective executor to +// ensure the required ordering. +absl::Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/colors.h b/third_party/tflite-hdrs/tensorflow/core/graph/colors.h new file mode 100644 index 00000000..43d22255 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/colors.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_COLORS_H_ +#define TENSORFLOW_CORE_GRAPH_COLORS_H_ + +namespace tensorflow { + +// Return a color drawn from a palette to represent an entity +// identified by "i". The return value has the form "#RRGGBB" Note +// that the palette has a limited set of colors and therefore colors +// will be reused eventually. +const char* ColorFor(int dindex); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_COLORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/control_flow.h b/third_party/tflite-hdrs/tensorflow/core/graph/control_flow.h new file mode 100644 index 00000000..c1e2db33 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/control_flow.h @@ -0,0 +1,61 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_ +#define TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_ + +#include + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Control flow info for a graph node. +struct ControlFlowInfo { + // 'frame' and 'parent_frame' are pointers to: + // + // a) One of the Enter nodes corresponding to the loop body, if the node + // executes inside a loop. If multiple tensors enter the while loop, it's + // undefined which Enter node will be used. + // + // b) SOURCE node (node.id() == Graph::kSourceId), if the node is not inside + // any of the while loops. + + const Node* frame = nullptr; // frame of a node + const Node* parent_frame = nullptr; // parent frame of a node + string frame_name; // frame name of a node +}; + +// Clear and populate `info` with each node's frame and the level it belongs to. +// We check the well-formedness of the graph: +// 1) All inputs to a node must come from the same frame and have the same +// "static" iteration level. +// 2) Each frame has at most one LoopCond node. +// 3) Each frame has a single parent frame. +// If `unreachable_nodes` is set, return names of nodes unreachable from the +// source node. We cannot build ControlFlowInfo for such nodes. They might be +// pruned later. +// +// NOTE(yuanbyu): For now, we require all sends/recvs have iteration level 0. +// This essentially means there can't be multiple serial Nexts in an iteration, +// which all sane front-ends should satisfy. +absl::Status BuildControlFlowInfo( + const Graph* g, std::vector* info, + std::vector* unreachable_nodes = nullptr); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/costmodel.h b/third_party/tflite-hdrs/tensorflow/core/graph/costmodel.h new file mode 100644 index 00000000..795d9472 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/costmodel.h @@ -0,0 +1,241 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
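[Editorial aside, not part of the patch] A small sketch of BuildControlFlowInfo from the header above; `graph` is assumed to be a valid tensorflow::Graph and the logging is illustrative only.

// Illustrative sketch only -- not part of the vendored header.
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow/core/graph/control_flow.h"
#include "tensorflow/core/graph/graph.h"

absl::Status LogFrameNames(const tensorflow::Graph& graph) {
  std::vector<tensorflow::ControlFlowInfo> info;      // indexed by node id
  std::vector<std::string> unreachable;
  absl::Status s = tensorflow::BuildControlFlowInfo(&graph, &info, &unreachable);
  if (!s.ok()) return s;
  for (const tensorflow::Node* n : graph.nodes()) {
    VLOG(1) << n->name() << " -> frame '" << info[n->id()].frame_name << "'";
  }
  VLOG(1) << unreachable.size() << " node(s) unreachable from SOURCE";
  return absl::OkStatus();
}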
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_COSTMODEL_H_ +#define TENSORFLOW_CORE_GRAPH_COSTMODEL_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { +typedef std::unordered_map + NodeNameToCostIdMap; + +class StepStats; + +// CostModel keeps track of the following runtime statistics for nodes +// of a single Graph: +// * The total number of times a node has executed. +// * The accumulated execution time (in microseconds) of a node. +// * The accumulated size (in bytes) of each node's output. +// +// This class is NOT thread-safe. +class CostModel { + public: + // If "global" is true, maintains costs based on Node::cost_id, otherwise + // maintains costs based on Node::id. + explicit CostModel(bool is_global) : is_global_(is_global) { + unknown_shape_.set_unknown_rank(true); + } + + // Assigns min_count_ as a function of the median count for a Node. + // This value is then used for suppressing the time/size costs of + // infrequent operations. + // NOTE(tucker): Maybe this should move to a subclass of CostModel. + void SuppressInfrequent(); + + bool is_global() const { return is_global_; } + + inline int Id(const Node* n) const { + if (is_global_) { + return n->cost_id(); + } else { + return n->id(); + } + } + + inline int GlobalId(const Node* n, int offset) const { + if (is_global_) { + return n->cost_id(); + } else { + return n->id() + offset; + } + } + + // Initializes cost model for 'g'. + void InitFromGraph(const Graph& g); + + // Merges costs from cm. + // REQUIRES: is_global_ is true for this and for "cm" + void MergeFromGlobal(const CostModel& cm); + + // Merges costs from "cm", which has been computed relative to "g". + // REQUIRES: is_global_ is true for this, and false for "cm". + void MergeFromLocal(const Graph& g, const CostModel& cm); + + void MergeFromStats(const NodeNameToCostIdMap& map, const StepStats& ss); + + // Sets the number of outputs of "node". + void SetNumOutputs(const Node* node, int num_outputs); + + // Records that "node" has executed "num_count" more times. + void RecordCount(const Node* node, int num_count); + + // Returns how many times "node" has been executed. + int32 TotalCount(const Node* node) const; + + // Records that "output_slot" of "node" has produced tensors of + // aggregated "bytes". + void RecordSize(const Node* node, int output_slot, Bytes bytes); + + // Returns total bytes of tensors produced by "node"s output slot. + Bytes TotalBytes(const Node* node, int output_slot) const; + + // Returns a prediction for the size of the tensor at the + // output_slot produced by one execution of "node". + Bytes SizeEstimate(const Node* node, int output_slot) const; + + // Records that Executions of "node" have taken "time" microseconds. + void RecordTime(const Node* node, Microseconds time); + + // Returns the total execution time for "node". + Microseconds TotalTime(const Node* node) const; + + // Returns a prediction for one execution of "node". 
+ Microseconds TimeEstimate(const Node* node) const; + + // Check that an estimate is available for every OP node in graph. + void CheckInitialized(const Graph& graph) const; + + // Records the maximum size in bytes and optionally the corresponding shape of + // the tensor generated by "output_slot" of "node". If + void RecordMaxMemorySize(const Node* node, int output_slot, Bytes bytes, + const TensorShapeProto& tensor_shape, + const DataType& dtype); + + // Returns the maximum size in bytes of the tensor generated by "output_slot" + // of "node". + Bytes MaxMemorySize(const Node* node, int output_slot) const; + + // Returns the shape corresponding to the largest memory size of the tensor + // generated by "output_slot" of "node". + const TensorShapeProto& MaxMemoryShape(const Node* node, + int output_slot) const; + + // Returns the shape corresponding to the largest memory size of the tensor + // generated by "output_slot" of "node". + DataType MaxMemoryType(const Node* node, int output_slot) const; + + // Returns the size in bytes of temporary memory consumed by "node". + Bytes TempMemorySize(const Node* node) const; + + // Returns the size of persistent memory allocated by "node". + Bytes PersistentMemorySize(const Node* node) const; + + // Records memory stats such as temp momory and persistent memory. + void RecordMemoryStats(const Node* node, const MemoryStats& memory_stats); + + // Records the maximum execution time (in microseconds) of "node". + void RecordMaxExecutionTime(const Node* node, Microseconds time); + + // Returns the maximum execution time (in microseconds) of "node". + Microseconds MaxExecutionTime(const Node* node) const; + + // Record the unique id of the tensor generated by "output_slot" of "node". + // Any other tensor sharing the same id will be an alias, i.e. it will share + // the same underlying memory storage area. + void RecordAllocationId(const Node* node, int output_slot, int64_t alloc_id); + + // Return the unique id of the tensor generated by "output_slot" of "node". + int64_t AllocationId(const Node* node, int output_slot) const; + + bool IsPersistentTensor(const Node* node, int64_t alloc_id) const; + + // Helper routines to encapsulate static estimation heuristics + + // Compute an estimate of the time to copy "b" bytes over the network, + // given a fixed cost of "network_latency_millis" milliseconds and + // an estimated bandwidth of "estimated_gbps" gigabits per second (note that + // this value is in gigabits, not gigabytes). + static Microseconds CopyTimeEstimate(Bytes b, double network_latency_millis, + double estimated_gbps); + static Microseconds ComputationTimeEstimate(int64_t mathops); + + // Add this CostModel into the CostGraphDef. + void AddToCostGraphDef(const Graph* graph, CostGraphDef* cost_graph) const; + + // Write the contents of the CostModel to the INFO log. + void WriteSummaryToLog() const; + + // Increment the times that the cost model is updated. + void IncrementUpdateTimes(); + + // Get the times that the cost model is updated. + int32 GetUpdateTimes() const; + + private: + static Bytes MinTensorMemoryUsage(const TensorShapeProto& tensor_shape, + const DataType& dtype); + + const bool is_global_; + + // Resizes vectors so that they are large enough for "id" and id's outputs. + void Ensure(int id, int num_outputs); + + // Nodes and Edges whose count is < this value + // get type/byte estimates of 0. + int32 min_count_ = 0; + + // The number of times the cost model is updated. 
+ int32 update_times_ = 0; + + // Number of times each Node has been executed. + std::vector count_; + // Cumulative execution time. + std::vector time_; + // Cumulative Bytes output on each channel. + std::vector> slot_bytes_; + + // Maximum execution time + std::vector max_exec_time_; + + // Maximum memory usage + struct MemUsage { + MemUsage() : temp_memory_size(0), persistent_memory_size(0) {} + + // TODO(yuefengz): temp_memory_size is not being used, remove it. + Bytes temp_memory_size; + Bytes persistent_memory_size; + + absl::InlinedVector output_port_mem; + absl::InlinedVector output_port_shape; + absl::InlinedVector output_port_type; + }; + std::vector max_mem_usage_; + + std::vector> output_port_alloc_ids_; + + std::set persistent_alloc_ids_; + + TensorShapeProto unknown_shape_; + + CostModel(const CostModel&) = delete; + void operator=(const CostModel&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_COSTMODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/default_device.h b/third_party/tflite-hdrs/tensorflow/core/graph/default_device.h new file mode 100644 index 00000000..011b7c11 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/default_device.h @@ -0,0 +1,41 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_ +#define TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_ + +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace tensorflow { +namespace graph { + +// Sets the default device for all nodes in graph_def to "device", +// only if not already set. +inline void SetDefaultDevice(const std::string& device, GraphDef* graph_def) { + for (int i = 0; i < graph_def->node_size(); ++i) { + auto node = graph_def->mutable_node(i); + if (node->device().empty()) { + node->set_device(device); + } + } +} + +} // namespace graph +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/edgeset.h b/third_party/tflite-hdrs/tensorflow/core/graph/edgeset.h new file mode 100644 index 00000000..6d6cb3ff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/edgeset.h @@ -0,0 +1,246 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
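[Editorial aside, not part of the patch] A hedged sketch of how the CostModel recording and estimation calls above fit together; the helper and its arguments are made up, and it assumes InitFromGraph() was called earlier for the graph that owns `node`.

// Illustrative sketch only -- not part of the vendored header.
#include "tensorflow/core/graph/costmodel.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/types.h"

void RecordOneExecution(tensorflow::CostModel* cm, const tensorflow::Node* node,
                        tensorflow::Microseconds elapsed,
                        tensorflow::Bytes output_bytes) {
  cm->RecordCount(node, 1);                        // one more observed run
  cm->RecordTime(node, elapsed);                   // accumulated wall time
  cm->RecordSize(node, /*output_slot=*/0, output_bytes);
  VLOG(1) << "time estimate: " << cm->TimeEstimate(node).value() << "us, "
          << "size estimate: " << cm->SizeEstimate(node, 0).value() << "B";
}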
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_EDGESET_H_ +#define TENSORFLOW_CORE_GRAPH_EDGESET_H_ + +#include + +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { + +class Edge; + +// An unordered set of edges. Uses very little memory for small sets. +// Unlike gtl::FlatSet, EdgeSet does NOT allow mutations during +// iteration. +class EdgeSet { + public: + EdgeSet(); + ~EdgeSet(); + + typedef const Edge* key_type; + typedef const Edge* value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + + class const_iterator; + typedef const_iterator iterator; + + bool empty() const; + size_type size() const; + void clear(); + std::pair insert(value_type value); + size_type erase(key_type key); + void reserve(size_type new_size) { + if (new_size > kInline) { + auto s = new gtl::FlatSet(new_size); + s->insert(reinterpret_cast(std::begin(ptrs_)), + reinterpret_cast(&ptrs_[0] + size())); + ptrs_[0] = this; + ptrs_[1] = s; + } + } + + // Caller is not allowed to mutate the EdgeSet while iterating. + const_iterator begin() const; + const_iterator end() const; + + private: + // Up to kInline elements are stored directly in ptrs_ (nullptr means none). + // If ptrs_[0] == this then ptrs_[1] points to a set. + // kInline must be >= 2, and is chosen such that ptrs_ fills a 64 byte + // cacheline. + static constexpr int kInline = 64 / sizeof(const void*); + const void* ptrs_[kInline]; + + gtl::FlatSet* get_set() const { + if (ptrs_[0] == this) { + return static_cast*>( + const_cast(ptrs_[1])); + } else { + return nullptr; + } + } + +// To detect mutations while iterating. 
+#ifdef NDEBUG + void RegisterMutation() {} +#else + uint32 mutations_ = 0; + void RegisterMutation() { mutations_++; } +#endif + + EdgeSet(const EdgeSet&) = delete; + void operator=(const EdgeSet&) = delete; +}; + +class EdgeSet::const_iterator { + public: + typedef typename EdgeSet::value_type value_type; + typedef const typename EdgeSet::value_type& reference; + typedef const typename EdgeSet::value_type* pointer; + typedef typename EdgeSet::difference_type difference_type; + typedef std::forward_iterator_tag iterator_category; + + const_iterator() {} + + const_iterator& operator++(); + const_iterator operator++(int /*unused*/); + const value_type* operator->() const; + value_type operator*() const; + bool operator==(const const_iterator& other) const; + bool operator!=(const const_iterator& other) const { + return !(*this == other); + } + + private: + friend class EdgeSet; + + void const* const* array_iter_ = nullptr; + typename gtl::FlatSet::const_iterator tree_iter_; + +#ifdef NDEBUG + inline void Init(const EdgeSet* e) {} + inline void CheckNoMutations() const {} +#else + inline void Init(const EdgeSet* e) { + owner_ = e; + init_mutations_ = e->mutations_; + } + inline void CheckNoMutations() const { + CHECK_EQ(init_mutations_, owner_->mutations_); + } + const EdgeSet* owner_ = nullptr; + uint32 init_mutations_ = 0; +#endif +}; + +inline EdgeSet::EdgeSet() { + for (int i = 0; i < kInline; i++) { + ptrs_[i] = nullptr; + } +} + +inline EdgeSet::~EdgeSet() { delete get_set(); } + +inline bool EdgeSet::empty() const { return size() == 0; } + +inline EdgeSet::size_type EdgeSet::size() const { + auto s = get_set(); + if (s) { + return s->size(); + } else { + size_t result = 0; + for (int i = 0; i < kInline; i++) { + if (ptrs_[i]) result++; + } + return result; + } +} + +inline void EdgeSet::clear() { + RegisterMutation(); + delete get_set(); + for (int i = 0; i < kInline; i++) { + ptrs_[i] = nullptr; + } +} + +inline EdgeSet::const_iterator EdgeSet::begin() const { + const_iterator ci; + ci.Init(this); + auto s = get_set(); + if (s) { + ci.tree_iter_ = s->begin(); + } else { + ci.array_iter_ = &ptrs_[0]; + } + return ci; +} + +inline EdgeSet::const_iterator EdgeSet::end() const { + const_iterator ci; + ci.Init(this); + auto s = get_set(); + if (s) { + ci.tree_iter_ = s->end(); + } else { + ci.array_iter_ = &ptrs_[size()]; + } + return ci; +} + +inline EdgeSet::const_iterator& EdgeSet::const_iterator::operator++() { + CheckNoMutations(); + if (array_iter_ != nullptr) { + ++array_iter_; + } else { + ++tree_iter_; + } + return *this; +} + +inline EdgeSet::const_iterator EdgeSet::const_iterator::operator++( + int /*unused*/) { + CheckNoMutations(); + const_iterator tmp = *this; + operator++(); + return tmp; +} + +// gcc's set and multiset always use const_iterator since it will otherwise +// allow modification of keys. +inline const EdgeSet::const_iterator::value_type* EdgeSet::const_iterator:: +operator->() const { + CheckNoMutations(); + if (array_iter_ != nullptr) { + return reinterpret_cast(array_iter_); + } else { + return tree_iter_.operator->(); + } +} + +// gcc's set and multiset always use const_iterator since it will otherwise +// allow modification of keys. 
+inline EdgeSet::const_iterator::value_type EdgeSet::const_iterator::operator*() + const { + CheckNoMutations(); + if (array_iter_ != nullptr) { + return static_cast(*array_iter_); + } else { + return *tree_iter_; + } +} + +inline bool EdgeSet::const_iterator::operator==( + const const_iterator& other) const { + DCHECK((array_iter_ == nullptr) == (other.array_iter_ == nullptr)) + << "Iterators being compared must be from same set that has not " + << "been modified since the iterator was constructed"; + CheckNoMutations(); + if (array_iter_ != nullptr) { + return array_iter_ == other.array_iter_; + } else { + return other.array_iter_ == nullptr && tree_iter_ == other.tree_iter_; + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_EDGESET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/graph.h b/third_party/tflite-hdrs/tensorflow/core/graph/graph.h new file mode 100644 index 00000000..6e70b0cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/graph.h @@ -0,0 +1,1116 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A Graph describes a set of computations that are to be +// performed, as well as the dependencies between those +// computations. The basic model is a DAG (directed acyclic graph) with +// * internal nodes representing computational operations to be performed; +// * edges represent dependencies, indicating the target may only be +// executed once the source has completed; and +// * predefined "source" (start) and "sink" (finish) nodes -- the source +// should be the only node that doesn't depend on anything, and the sink +// should be the only node that nothing depends on. +// +// Note: Node ids are intended to be relatively dense in the +// 0..max_id range, but there may be gaps since ids won't be reused. +// +// Note: Some dependencies between operations are due to one operation +// consuming the output of another. In fact operations can produce +// multiple outputs and consume multiple inputs, and some +// optimizations will care about which specific outputs are connected +// to which specific inputs. We therefore represent data dependency +// between output O of layer A and input I of layer B using +// "input index" and "output index" labels per edge. 
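+//
+// Illustrative sketch (an editorial addition, not part of the upstream
+// header): the per-edge output/input labels described above are exposed via
+// Edge::src_output() and Edge::dst_input(), so the data dependencies of a
+// Graph `g` can be walked as follows:
+//
+//   for (const Edge* e : g.edges()) {
+//     if (e->IsControlEdge()) continue;  // control edges carry no data
+//     // Output e->src_output() of node e->src() feeds input
+//     // e->dst_input() of node e->dst().
+//   }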
+ +#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_H_ +#define TENSORFLOW_CORE_GRAPH_GRAPH_H_ + +#include +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/framework/full_type.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/edgeset.h" +#include "tensorflow/core/lib/core/arena.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/iterator_range.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Edge; +class EdgeSetTest; +class Graph; +class GraphDebugInfo; +class GraphDef; +class GraphTest; +class Node; +struct OutputTensor; +class VersionDef; +class WhileContext; + +class NeighborIter; // Declared below +class NodeIter; // Declared below + +// Indicates where the graph instance is originated from. +enum class ConstructionContext { + kNotTracked, // Not tracked. + kDirectSession, // From `tensorflow::DirectSession`, TF1 session API. + kEagerRuntime, // Registered from TF2 eager runtime. +}; + +class Node { + public: + std::string DebugString() const; + int id() const { return id_; } + int cost_id() const { return cost_id_; } + const std::string& name() const; + void set_name(std::string name); + const std::string& type_string() const; + + // def() provides the NodeDef the user supplied, but the specifics + // of this Node may have changed due to placement, optimization, etc. + // In particular: + // * def().name() will match name(); + // * def().op() will match type_string() and op_def().name(); + // * def().input() is not reliable, use "in_edges()" below instead; + // * def().device() is the "user's requested device" and may not match + // the actual assigned device, see assigned_device_name() below; + // * def().attr() is authoritative. + // TODO(irving): Replace with NodeInfo. + const NodeDef& def() const; + const OpDef& op_def() const; + + NodeDef* mutable_def(); + + // input and output types + int32 num_inputs() const; + DataType input_type(int32_t i) const; + const DataTypeVector& input_types() const; + + int32 num_outputs() const; + DataType output_type(int32_t o) const; + const DataTypeVector& output_types() const; + + // The device requested by the user. For the actual assigned device, + // use assigned_device_name() below. + const std::string& requested_device() const; + + // This changes the user requested device but not necessarily the device that + // on which the operation will run. + void set_requested_device(const std::string& device); + + // This gives the device the runtime has assigned this node to. If + // you want the device the user requested, use def().device() instead. + // TODO(josh11b): Validate that the assigned_device, if not empty: + // fully specifies a device, and satisfies def().device(). + // TODO(josh11b): Move assigned_device_name outside of Node into a + // NodeId->DeviceName map. 
+ const std::string& assigned_device_name() const; + void set_assigned_device_name(const std::string& device_name); + bool has_assigned_device_name() const { + return assigned_device_name_index_ > 0; + } + int assigned_device_name_index() const { return assigned_device_name_index_; } + void set_assigned_device_name_index(int index); + + // Sets 'original_node_names' field of this node's DebugInfo proto to + // 'names'. + void set_original_node_names(const std::vector& names); + void set_original_func_names(const std::vector& names); + + // Read only access to attributes + AttrSlice attrs() const; + + // Inputs requested by the NodeDef. For the actual inputs, use in_edges. + const protobuf::RepeatedPtrField& requested_inputs() const; + + // Get the neighboring nodes via edges either in or out of this node. This + // includes control edges. + gtl::iterator_range in_nodes() const; + gtl::iterator_range out_nodes() const; + const EdgeSet& in_edges() const { return in_edges_; } + const EdgeSet& out_edges() const { return out_edges_; } + + // Node type helpers. + bool IsSource() const { return id() == 0; } + bool IsSink() const { return id() == 1; } + // Anything other than the special Source & Sink nodes. + bool IsOp() const { return id() > 1; } + + // Node class helpers + bool IsSwitch() const { return class_ == NC_SWITCH; } + bool IsMerge() const { return class_ == NC_MERGE; } + bool IsEnter() const { return class_ == NC_ENTER; } + bool IsExit() const { return class_ == NC_EXIT; } + bool IsNextIteration() const { return class_ == NC_NEXT_ITERATION; } + bool IsLoopCond() const { return class_ == NC_LOOP_COND; } + bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } + bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND; } + bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV; } + bool IsConstant() const { return class_ == NC_CONSTANT; } + bool IsVariable() const { return class_ == NC_VARIABLE; } + bool IsIdentity() const { return class_ == NC_IDENTITY; } + bool IsGetSessionHandle() const { return class_ == NC_GET_SESSION_HANDLE; } + bool IsGetSessionTensor() const { return class_ == NC_GET_SESSION_TENSOR; } + bool IsDeleteSessionTensor() const { + return class_ == NC_DELETE_SESSION_TENSOR; + } + bool IsControlFlow() const { + return (class_ != NC_OTHER) && // Fast path + (IsSwitch() || IsMerge() || IsEnter() || IsExit() || + IsNextIteration()); + } + bool IsHostSend() const { return class_ == NC_HOST_SEND; } + bool IsHostRecv() const { return class_ == NC_HOST_RECV; } + bool IsScopedAllocator() const { return class_ == NC_SCOPED_ALLOCATOR; } + bool IsCollective() const { return class_ == NC_COLLECTIVE; } + + bool IsMetadata() const { return class_ == NC_METADATA; } + bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; } + bool IsPartitionedCall() const { return class_ == NC_PARTITIONED_CALL; } + + // Returns true if this node is any kind of function call node. + // + // NOTE: "function call nodes" include partitioned call ops, symbolic gradient + // ops, and ops whose type_string is the name of a function ("function ops"). 
+ bool IsFunctionCall() const { + return class_ == NC_PARTITIONED_CALL || class_ == NC_FUNCTION_OP || + class_ == NC_SYMBOLIC_GRADIENT; + } + + bool IsIfNode() const { return class_ == NC_IF; } + bool IsWhileNode() const { return class_ == NC_WHILE; } + bool IsCaseNode() const { return class_ == NC_CASE; } + // Is this node a function input + bool IsArg() const { return class_ == NC_ARG; } + // Is this node a function output + bool IsRetval() const { return class_ == NC_RETVAL; } + + bool IsDistributedCommunication() const { + return op_def().is_distributed_communication(); + } + + template + void AddAttr(const std::string& name, const T& val) { + SetAttrValue(val, AddAttrHelper(name)); + UpdateProperties(); + } + + void AddAttr(const std::string& name, std::vector&& val) { + MoveAttrValue(std::move(val), AddAttrHelper(name)); + UpdateProperties(); + } + + void ClearAttr(const std::string& name); + + // Returns into '*e' the edge connecting to the 'idx' input of this Node. + absl::Status input_edge(int idx, const Edge** e) const; + + // Returns into '*edges' the input data edges of this Node, indexed by input + // number. Does not return control edges. + absl::Status input_edges(std::vector* edges) const; + + // Returns into '*n' the node that has an output connected to the + // 'idx' input of this Node. + absl::Status input_node(int idx, const Node** n) const; + absl::Status input_node(int idx, Node** n) const; + + // Returns into '*t' the idx-th input tensor of this node, represented as the + // output tensor of input_node(idx). + absl::Status input_tensor(int idx, OutputTensor* t) const; + + WhileContext* while_ctx() const { return while_ctx_; } + void set_while_ctx(WhileContext* while_ctx) { + DCHECK(IsExit()); + DCHECK(while_ctx_ == nullptr); + while_ctx_ = while_ctx; + } + + std::shared_ptr properties() const { return props_; } + + // Sets the stack trace for the node. Assumes that getting and setting the + // stack trace for a given node will not race. + void SetStackTrace(const std::shared_ptr& stack_trace) { + stack_trace_ = stack_trace; + } + + // Get the stack trace for when the node was instantiated. + const std::shared_ptr& GetStackTrace() const { + return stack_trace_; + } + + // Called after an attr has changed. Decides whether we need to update some + // property of the node (stored in props_). + void UpdateProperties(); + + // Erases type information from the node. + void ClearTypeInfo(); + + // Update type information for a node with a list of inputs and/or outputs + // described by its TYPE_ATTR_NAME attr when removing some of these. The keys + // of INDEX_MAPPING are the indexes of the inputs/outputs that are not + // removed. dtype information in the TYPE_ATTR_NAME attr is always updated. + // Use UPDATE_FULL_TYPE=true when this changes the node's outputs to also + // update the node's full type information (if present). + absl::Status ShrinkTypeInfo( + const absl::flat_hash_map& index_mapping, + const string& type_attr_name, bool update_full_type); + + // Called after an incident non-control edge has changed. Does nothing if not + // all input edges are defined. + void RunForwardTypeInference(); + + private: + // TODO(mdan): Drop this. + friend class Graph; + Node(); + + // Stack trace for the user code for node instantiation. Can be shared across + // multiple nodes (e.g. when inlining). + std::shared_ptr stack_trace_; + + // Releases memory from props_, in addition to restoring *this to its + // uninitialized state. 
+ void Clear(); + + // Make a copy of the Node's props_ if props_ is shared with + // other nodes. This must be called before mutating properties, + // e.g. in AddAttr. + void MaybeCopyOnWrite(); + + AttrValue* AddAttrHelper(const std::string& name); + + // A set of mutually exclusive classes for different kinds of nodes, + // class_ is initialized in the Node::Initialize routine based on the + // node's type_string(). + enum NodeClass { + NC_UNINITIALIZED, + NC_SWITCH, + NC_MERGE, + NC_ENTER, + NC_EXIT, + NC_NEXT_ITERATION, + NC_LOOP_COND, + NC_CONTROL_TRIGGER, + NC_SEND, + NC_HOST_SEND, + NC_RECV, + NC_HOST_RECV, + NC_CONSTANT, + NC_VARIABLE, + NC_IDENTITY, + NC_GET_SESSION_HANDLE, + NC_GET_SESSION_TENSOR, + NC_DELETE_SESSION_TENSOR, + NC_METADATA, + NC_SCOPED_ALLOCATOR, + NC_COLLECTIVE, + NC_FAKE_PARAM, + NC_PARTITIONED_CALL, + NC_FUNCTION_OP, + NC_SYMBOLIC_GRADIENT, + NC_IF, + NC_WHILE, + NC_CASE, + NC_ARG, + NC_RETVAL, + NC_OTHER // Not a special kind of node + }; + + void Initialize(int id, int cost_id, std::shared_ptr props, + NodeClass node_class); + + static NodeClass GetNodeClassForOp(const std::string& ts); + + int id_; // -1 until Initialize() is called + int cost_id_; // -1 if there is no corresponding cost accounting node + NodeClass class_; + + EdgeSet in_edges_; + EdgeSet out_edges_; + + // NOTE(skyewm): inheriting from core::RefCounted may have a slight + // performance benefit over using shared_ptr, at the cost of manual ref + // counting + std::shared_ptr props_; + + // Index within Graph::device_names_ of the name of device assigned + // to perform this computation. + int assigned_device_name_index_; + + // A back-pointer to the Graph that owns this node. Currently, this exists + // solely to allow Node::[set_]assigned_device_name() to work. However, if all + // callers of Node::[set_]assigned_device_name() are modified to use the + // equivalent methods defined directly on Graph, then we can remove this + // field and reclaim that memory. + Graph* graph_; + + // Set if this is an exit node of a while loop with an associated + // WhileContext. Otherwise null. (This is only set for exit nodes because + // they're the first nodes of a loop encountered while creating the gradient + // graph. Exit nodes that are part of while loop gradient graphs will not have + // this set.) + WhileContext* while_ctx_; + + Node(const Node&) = delete; + void operator=(const Node&) = delete; +}; + +// Stores debug information associated with the Node. +struct NodeDebugInfo { + const std::string name; + std::vector original_node_names; + std::vector original_func_names; + + NodeDebugInfo(const Node& n); + NodeDebugInfo(const NodeDef& ndef); + NodeDebugInfo(absl::string_view node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info); +}; + +// Represents an input of a node, i.e., the `index`-th input to `node`. +struct InputTensor { + Node* node; + int index; + + InputTensor(Node* n, int i) : node(n), index(i) {} + InputTensor() : node(nullptr), index(0) {} + + // Returns true if this InputTensor is identical to 'other'. Nodes are + // compared using pointer equality. + bool operator==(const InputTensor& other) const; + + // A hash function for InputTensors. Nodes are hashed based on their pointer + // value. + struct Hash { + uint64 operator()(InputTensor const& s) const; + }; +}; + +// Represents an output of a node, i.e., the `index`-th output of `node`. 
Note +// that a single `OutputTensor` can correspond to multiple `Edge`s if the output +// is consumed by multiple destination nodes. +struct OutputTensor { + Node* node; + int index; + + OutputTensor(Node* n, int i) : node(n), index(i) {} + OutputTensor() : node(nullptr), index(0) {} + + // Returns true if this OutputTensor is identical to 'other'. Nodes are + // compared using pointer equality. + bool operator==(const OutputTensor& other) const; + + // A hash function for OutputTensors. Nodes are hashed based on their pointer + // value. + struct Hash { + uint64 operator()(OutputTensor const& s) const; + }; +}; + +class Edge { + public: + Node* src() const { return src_; } + Node* dst() const { return dst_; } + int id() const { return id_; } + + // Return the index of the source output that produces the data + // carried by this edge. The special value kControlSlot is used + // for control dependencies. + int src_output() const { return src_output_; } + + // Return the index of the destination input that consumes the data + // carried by this edge. The special value kControlSlot is used + // for control dependencies. + int dst_input() const { return dst_input_; } + + // Return true iff this is an edge that indicates a control-flow + // (as opposed to a data-flow) dependency. + bool IsControlEdge() const; + + std::string DebugString() const; + + private: + Edge() {} + + friend class EdgeSetTest; + friend class GraphTest; + friend class Graph; + Node* src_; + Node* dst_; + int id_; + int src_output_; + int dst_input_; +}; + +// Allows for iteration of the edges of a Graph, by iterating the underlying +// Graph.edges_ vector while skipping over null entries. +class GraphEdgesIterable { + private: + const std::vector& edges_; + + public: + explicit GraphEdgesIterable(const std::vector& edges) + : edges_(edges) {} + + typedef Edge* value_type; + + class const_iterator { + private: + // The underlying iterator. + std::vector::const_iterator iter_; + + // The end of the underlying iterator. + std::vector::const_iterator end_; + + // Advances iter_ until it reaches a non-null item, or reaches the end. + void apply_filter() { + while (iter_ != end_ && *iter_ == nullptr) { + ++iter_; + } + } + + public: + const_iterator(std::vector::const_iterator iter, + std::vector::const_iterator end) + : iter_(iter), end_(end) { + apply_filter(); + } + + bool operator==(const const_iterator& other) const { + return iter_ == other.iter_; + } + + bool operator!=(const const_iterator& other) const { + return iter_ != other.iter_; + } + + // This is the prefix increment operator (++x), which is the operator + // used by C++ range iteration (for (x : y) ...). We intentionally do not + // provide a postfix increment operator. + const_iterator& operator++() { + ++iter_; + apply_filter(); + return *this; + } + + value_type operator*() { return *iter_; } + }; + + const_iterator begin() { + return const_iterator(edges_.begin(), edges_.end()); + } + const_iterator end() { return const_iterator(edges_.end(), edges_.end()); } +}; + +// Thread compatible but not thread safe. +class Graph { + public: + // Constructs a graph with a single SOURCE (always id kSourceId) and a + // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. + // + // The graph can hold ops found in the registry. `ops`s lifetime must be at + // least that of the constructed graph's. 
+ explicit Graph(const OpRegistryInterface* ops); + + // Constructs a graph with a single SOURCE (always id kSourceId) and a + // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. + // + // The graph can hold ops found in `flib_def`. Unlike the constructor taking + // an OpRegistryInterface, this constructor copies the function definitions in + // `flib_def` so its lifetime may be shorter than that of the graph's. The + // OpRegistryInterface backing `flib_def` must still have the lifetime of the + // graph though. + explicit Graph(const FunctionLibraryDefinition& flib_def); + + ~Graph(); + + // Clone the current graph into a new one. + std::unique_ptr Clone(); + + static constexpr int kControlSlot = -1; + + // The GraphDef version range of this graph (see graph.proto). + const VersionDef& versions() const; + void set_versions(const VersionDef& versions); + + // Adds a new node to this graph, and returns it. Infers the Op and + // input/output types for the node. *this owns the returned instance. + // Returns nullptr and sets *status on error. + Node* AddNode(NodeDef node_def, absl::Status* status); + + // Same as above, but using StatusOr. This method is always preferred. + absl::StatusOr AddNode(NodeDef node_def); + + // Copies *node, which may belong to another graph, to a new node, + // which is returned. Does not copy any edges. *this owns the + // returned instance. + Node* CopyNode(const Node* node); + + // Removes a node from this graph, including all edges from or to it. + // *node should not be accessed after calling this function. + // REQUIRES: node->IsOp() + void RemoveNode(Node* node); + + void Copy(const Graph& src); + + // Removes all nodes from this graph, including all edges from or to them. + // No Node* references to the Graph are valid post. + void Clear(); + + // Adds an edge that connects the xth output of `source` to the yth input of + // `dest` and returns it. Does not update dest's NodeDef. + const Edge* AddEdge(Node* source, int x, Node* dest, int y); + + // Adds a control edge (no data flows along this edge) that connects `source` + // to `dest`. If `dest`s NodeDef is missing the corresponding control input, + // adds the control input. + // + // If such a control edge already exists and `allow_duplicates` is false, no + // edge is added and the function returns nullptr. Otherwise the edge is + // unconditionally created and returned. The NodeDef is not updated if + // `allow_duplicates` is true. + // TODO(skyewm): // TODO(skyewm): allow_duplicates is needed only by + // graph_partition.cc. Figure out if we can do away with it. + const Edge* AddControlEdge(Node* source, Node* dest, + bool allow_duplicates = false); + + // Removes edge from the graph. Does not update the destination node's + // NodeDef. Does not update the full type information of the source node's + // NodeDef. (See ShrinkTypeInfo for an example of updating full type + // information when removing some outputs from a node.) + // REQUIRES: The edge must exist. + void RemoveEdge(const Edge* edge); + + // Removes control edge `edge` from the graph. Note that this also updates + // the corresponding NodeDef to reflect the change. + // REQUIRES: The control edge must exist. + void RemoveControlEdge(const Edge* e); + + // Updates the input to a node. The existing edge to `dst` is removed and an + // edge from `new_src` to `dst` is created. The NodeDef associated with `dst` + // is also updated. 
+ absl::Status UpdateEdge(Node* new_src, int new_src_index, Node* dst, + int dst_index); + + // Add an input to dst that comes from the "src_slot" output of the + // node named by "src_name". + static void AddInput(NodeDef* dst, absl::string_view src_name, int src_slot); + + // Like AddEdge but updates dst's NodeDef. Used to add an input edge to a + // "While" op during gradient construction, see AddInputWhileHack in + // python_api.h for more details. + absl::Status AddWhileInputHack(Node* new_src, int new_src_index, Node* dst); + + // Adds the function and gradient definitions in `fdef_lib` to this graph's op + // registry. Ignores duplicate functions, and returns a bad status if an + // imported function differs from an existing function or op with the same + // name. This overload adds the function definitions with no stack traces. + absl::Status AddFunctionLibrary(const FunctionDefLibrary& fdef_lib); + absl::Status AddFunctionLibrary(FunctionDefLibrary&& fdef_lib); + + // Adds the function and gradient definitions in `fdef_lib` to this graph's op + // registry. Ignores duplicate functions, and returns a bad status if an + // imported function differs from an existing function or op with the same + // name. + absl::Status AddFunctionLibrary( + const FunctionDefLibrary& fdef_lib, + const FunctionDefLibraryStackTraces& stack_traces); + absl::Status AddFunctionLibrary( + FunctionDefLibrary&& fdef_lib, + const FunctionDefLibraryStackTraces& stack_traces); + + // Adds the function definition and its stacktraces to this graph's op + // registry. Ignores duplicate functions, and returns a bad status if an + // imported function differs from an existing function or op with the same + // name. + absl::Status AddFunctionDef(const FunctionDef& fdef, + const StackTracesMap& stack_traces); + + // Adds the gradient definition to this graph's op registry. Ignores duplicate + // gradients of the same function, and returns a bad status if an imported + // gradient differs from an existing gradient of the same function name. + absl::Status AddGradientDef(const GradientDef& gdef); + + // The number of live nodes in the graph. + // + // Because nodes can be removed from the graph, num_nodes() is often + // smaller than num_node_ids(). If one needs to create an array of + // nodes indexed by node ids, num_node_ids() should be used as the + // array's size. + int num_nodes() const { return num_nodes_; } + + // The number of live nodes in the graph, excluding the Source and Sink nodes. + int num_op_nodes() const { + DCHECK_GE(num_nodes_, 2); + return num_nodes_ - 2; + } + + // The number of live edges in the graph. + // + // Because edges can be removed from the graph, num_edges() is often + // smaller than num_edge_ids(). If one needs to create an array of + // edges indexed by edge ids, num_edge_ids() should be used as the + // array's size. + int num_edges() const { return num_edges_; } + + // Serialize the nodes starting at `from_node_id` to a GraphDef. + // `include_flib_def` indicates whether the function library will be populated + // in the `graph_def`. `include_flib_def` should be usually set to true so + // that the populated `graph_def` will be complete. Setting `include_flib_def` + // to false would mean that the returned `graph_def` is incomplete and may + // contain references to functions whose definition is not included. It can + // make sense to do this in cases where the caller already has a copy of the + // function library. 
+ // If `include_debug_info` is true, the `debug_info` field of the GraphDef + // will be populated with stack traces from the nodes and the function + // library. Note that if `include_debug_info` is true and `include_flib_def` + // is false, then `debug_info` will contain stack traces for nodes in the + // function library, which will not itself be included in the GraphDef. + void ToGraphDefSubRange(GraphDef* graph_def, int from_node_id, + bool include_flib_def = true, + bool include_debug_info = false) const; + + // Serialize to a GraphDef. `include_flib_def` indicates whether the function + // library will be populated in the `graph_def`. `include_flib_def` should be + // usually set to true so that the populated `graph_def` will be complete. + // Setting `include_flib_def` to false would mean that the returned + // `graph_def` is incomplete and may contain references to functions whose + // definition is not included. It can make sense to do this in cases where the + // caller already has a copy of the function library. + // If `include_debug_info` is true, the `debug_info` field of the GraphDef + // will be populated with stack traces from the nodes and the function + // library. Note that if `include_debug_info` is true and `include_flib_def` + // is false, then `debug_info` will contain stack traces for nodes in the + // function library, which will not itself be included in the GraphDef. + void ToGraphDef(GraphDef* graph_def, bool include_flib_def = true, + bool include_debug_info = false) const; + + // This version can be called from debugger to inspect the graph content. + // Use the previous version outside debug context for efficiency reasons. + // + // Note: We do not expose a DebugString() API, since GraphDef.DebugString() is + // not defined in some TensorFlow builds. + GraphDef ToGraphDefDebug() const; + + // Generate new node name with the specified prefix that is unique + // across this graph. + std::string NewName(absl::string_view prefix); + + // Access to the list of all nodes. Example usage: + // for (Node* node : graph.nodes()) { ... } + gtl::iterator_range nodes() const; + + // Access to the list of all nodes, excluding the Source and Sink nodes. + gtl::iterator_range op_nodes() const; + + // Returns one more than the maximum id assigned to any node. + int num_node_ids() const { return nodes_.size(); } + + // Returns the node associated with an id, or nullptr if no node + // with that id (the node with that id was removed and the id has + // not yet been re-used). *this owns the returned instance. + // REQUIRES: 0 <= id < num_node_ids(). + Node* FindNodeId(int id) const { return nodes_[id]; } + + // Returns one more than the maximum id assigned to any edge. + int num_edge_ids() const { return edges_.size(); } + + // Returns the Edge associated with an id, or nullptr if no edge + // with that id (the edge with that id was removed and the id has + // not yet been re-used). *this owns the returned instance. + // REQUIRES: 0 <= id < num_edge_ids(). + const Edge* FindEdgeId(int id) const { return edges_[id]; } + + // Access to the set of all edges. Example usage: + // for (const Edge* e : graph.edges()) { ... } + GraphEdgesIterable edges() const { return GraphEdgesIterable(edges_); } + + // The pre-defined nodes. 
+ enum { kSourceId = 0, kSinkId = 1 }; + Node* source_node() const { return FindNodeId(kSourceId); } + Node* sink_node() const { return FindNodeId(kSinkId); } + + const OpRegistryInterface* op_registry() const { return &ops_; } + const FunctionLibraryDefinition& flib_def() const { return ops_; } + + FunctionLibraryDefinition* mutable_flib_def() { return &ops_; } + + void CheckDeviceNameIndex(int index) { + DCHECK_GE(index, 0); + DCHECK_LT(index, static_cast(device_names_.size())); + } + + int InternDeviceName(const std::string& device_name); + + const std::string& get_assigned_device_name(const Node& node) const { + return device_names_[node.assigned_device_name_index()]; + } + + void set_assigned_device_name_index(Node* node, int device_name_index) { + CheckDeviceNameIndex(device_name_index); + node->assigned_device_name_index_ = device_name_index; + } + + void set_assigned_device_name(Node* node, const std::string& device_name) { + node->assigned_device_name_index_ = InternDeviceName(device_name); + } + + // Returns OK if `node` is non-null and belongs to this graph + absl::Status IsValidNode(const Node* node) const; + + // Returns OK if IsValidNode(`node`) and `idx` is a valid output. Does not + // accept control outputs. + absl::Status IsValidOutputTensor(const Node* node, int idx) const; + + // Returns OK if IsValidNode(`node`) and `idx` a valid input. Does not accept + // control inputs. + absl::Status IsValidInputTensor(const Node* node, int idx) const; + + // Create and return a new WhileContext owned by this graph. This is called + // when a new while loop is created. `frame_name` must be unique among + // WhileContexts in this graph. + absl::Status AddWhileContext(absl::string_view frame_name, + std::vector enter_nodes, + std::vector exit_nodes, + OutputTensor cond_output, + std::vector body_inputs, + std::vector body_outputs, + WhileContext** result); + + // Builds a node name to node pointer index for all nodes in the graph. + std::unordered_map BuildNodeNameIndex() const; + + absl::optional>& GetConstArgIndicesCache() const { + return const_arg_indices_cache_; + } + + // TODO(kkb): Add to the constructor when it becomes managable. + // Sets the graph construction context. + void SetConstructionContext(ConstructionContext construction_context) { + construction_context_ = construction_context; + } + + // TODO(kkb): Rename to `GetConstructionContext` once we're comfortable + // making this stable and make it available widely. + // Returns the graph construction context. It's `kUnknown` if not set. + ConstructionContext GetConstructionContextInternal() const { + return construction_context_; + } + + // Set full type information for a node given its name. + // Note that if this is called in a loop iterating over all the nodes + // elsewhere it would be O(n^2) complexity. If this case was important in the + // future, an alternative method could be added that takes in a flat_hash_map + // of name: type and simply iterates through the graph once and annotates all + // nodes. + void SetNodeType(absl::string_view name, const FullTypeDef& type); + + // Get full type information for a node given its name. + // Note that if this is called in a loop iterating over all the nodes + // elsewhere it would be O(n^2) complexity. If this case was important in the + // future, an alternative method could be added that takes in flat_hash_map of + // name: type and simply iterates through the graph once and stores all the + // information in the map. 
+ void NodeType(absl::string_view name, const FullTypeDef** result); + + // Builds a GraphDebugInfo from the functions and nodes in this graph. Stack + // traces associated with function definitions will have a key of the form + // '@' . Stack traces associated with other Nodes + // will use the node name as the key. + GraphDebugInfo BuildDebugInfo() const; + + // TODO(josh11b): uint64 hash() const; + + private: + // If cost_node is non-null, then cost accounting (in CostModel) + // will be associated with that node rather than the new one being + // created. + // + // Ownership of the returned Node is not transferred to caller. + Node* AllocateNode(std::shared_ptr props, + const Node* cost_node, Node::NodeClass node_class); + void ReleaseNode(Node* node); + // Insert edge in free_edges_ for possible reuse. + void RecycleEdge(const Edge* edge); + // Registry of all known ops, including functions. + FunctionLibraryDefinition ops_; + + // GraphDef versions + const std::unique_ptr versions_; + + // Allocator which will give us good locality. + core::Arena arena_; + + // Map from node ids to allocated nodes. nodes_[id] may be nullptr if + // the node with that id was removed from the graph. + std::vector nodes_; + + // Number of nodes alive. + int64_t num_nodes_ = 0; + + // Map from edge ids to allocated edges. edges_[id] may be nullptr if + // the edge with that id was removed from the graph. + std::vector edges_; + + // The number of entries in edges_ that are not nullptr. + int num_edges_ = 0; + + // Allocated but free nodes and edges. + std::vector free_nodes_; + std::vector free_edges_; + + // For generating unique names. + int name_counter_ = 0; + + // In most graphs, the number of unique values used for the + // Node::assigned_device_name() property is quite small. If the graph is + // large, then this duplication of values can consume a significant amount of + // memory. Instead, we represent the same information using an interning + // table, which consists of a vector of unique strings (device_names_), as + // well a map (device_names_map_) from unique strings to indices within the + // unique string table. + // + // The InternDeviceName() method handles adding a new entry into the table, + // or locating the index of an existing entry. + // + // The fact that Node::assigned_device_name() is implemented using an + // interning table is intentionally public. This allows algorithms that + // frequently access this field to do so efficiently, especially for the case + // where the assigned_device_name of one Node is copied directly from that + // of another Node. + + // A table of the unique assigned device names. Indices do NOT correspond + // to node IDs. Index 0 is always the empty string. + std::vector device_names_; + + // Maps unique device names to indices within device_names_[i]. + std::unordered_map device_names_map_; + + // All the while contexts owned by this graph, keyed by frame name, + // corresponding to all the while loops contained in this graph (including + // nested loops). The stored contexts are usually accessed via + // AddWhileContext() or Node::while_ctx(), but this manages the lifetime. + std::map while_ctxs_; + + // Cache of the indices of the arguments which need to be constant for the XLA + // compilation. + mutable absl::optional> const_arg_indices_cache_; + + // Indicates the context that this Graph instance is constructed. 
+ ConstructionContext construction_context_ = ConstructionContext::kNotTracked; + + Graph(const Graph&) = delete; + void operator=(const Graph&) = delete; +}; + +// TODO(josh11b): We may want to support keeping an index on various +// node/edge attributes in a graph, particularly node names. + +// Helper routines + +inline bool IsSource(const Node* node) { return node->IsSource(); } +inline bool IsSink(const Node* node) { return node->IsSink(); } +inline bool IsSwitch(const Node* node) { return node->IsSwitch(); } +inline bool IsMerge(const Node* node) { return node->IsMerge(); } +inline bool IsEnter(const Node* node) { return node->IsEnter(); } +inline bool IsExit(const Node* node) { return node->IsExit(); } +inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } +inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } +inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } +inline bool IsSend(const Node* node) { return node->IsSend(); } +inline bool IsRecv(const Node* node) { return node->IsRecv(); } +inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } +inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } + +// True for Nodes that mediate the transfer of values between processes. +inline bool IsTransferNode(const Node* n) { return IsSend(n) || IsRecv(n); } + +inline bool IsConstant(const Node* node) { return node->IsConstant(); } +inline bool IsVariable(const Node* node) { return node->IsVariable(); } +inline bool IsIdentity(const Node* node) { return node->IsIdentity(); } + +// Returns true iff 'n' is a control flow node. +inline bool IsControlFlow(const Node* n) { return n->IsControlFlow(); } + +// Returns true if the node only depends on its input's metadata +// (shape). Specifically, returns true for "Size", "Shape" and "Rank" ops. +inline bool IsMetadata(const Node* n) { return n->IsMetadata(); } + +inline bool IsScopedAllocator(const Node* n) { return n->IsScopedAllocator(); } + +inline bool IsHostMemoryPreserving(const Node* node) { + return IsIdentity(node) || IsControlFlow(node); +} + +inline bool IsDistributedCommunication(const Node* n) { + return n->IsDistributedCommunication(); +} + +// NOTE: We declare Reference type of NodeIter and NeighborIter as Node* (see +// https://en.cppreference.com/w/cpp/iterator/iterator). + +// Iterator for stepping through the nodes of a graph. +class NodeIter { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = Node; + using difference_type = std::ptrdiff_t; + using pointer = Node*; + using reference = Node*; + + NodeIter(const Graph* graph, int id); + bool operator==(const NodeIter& rhs) const; + bool operator!=(const NodeIter& rhs) const; + void operator++(); + reference operator*() const; + pointer operator->() const; + + private: + // Invariant: id_ == graph_->num_node_ids() || graph_->FindId(id_) != nullptr + const Graph* graph_; + int id_; +}; + +// Iterator for stepping through the neighbors of a node. 
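+// Dereferencing yields the node on the other end of each edge: the source
+// node for incoming edges and the destination node for outgoing ones (see
+// NeighborIter::operator* below).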
+class NeighborIter { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = Node; + using difference_type = std::ptrdiff_t; + using pointer = Node*; + using reference = Node*; + + NeighborIter(EdgeSet::const_iterator iter, bool incoming); + bool operator==(const NeighborIter& rhs) const; + bool operator!=(const NeighborIter& rhs) const; + void operator++(); + reference operator*() const; + pointer operator->() const; + + private: + EdgeSet::const_iterator iter_; + bool incoming_; +}; + +// IMPLEMENTATION DETAILS, PLEASE IGNORE + +inline NodeIter::NodeIter(const Graph* graph, int id) + : graph_(graph), id_(id) {} + +inline bool NodeIter::operator==(const NodeIter& rhs) const { + DCHECK(graph_ == rhs.graph_); + return id_ == rhs.id_; +} + +inline bool NodeIter::operator!=(const NodeIter& rhs) const { + return !(*this == rhs); +} + +inline void NodeIter::operator++() { + while (true) { + DCHECK_LE(id_, graph_->num_node_ids()); + ++id_; + if (id_ >= graph_->num_node_ids() || graph_->FindNodeId(id_) != nullptr) { + return; + } + } +} + +inline Node* NodeIter::operator*() const { return graph_->FindNodeId(id_); } + +inline Node* NodeIter::operator->() const { return graph_->FindNodeId(id_); } + +inline NeighborIter::NeighborIter(EdgeSet::const_iterator iter, bool incoming) + : iter_(iter), incoming_(incoming) {} + +inline bool NeighborIter::operator==(const NeighborIter& rhs) const { + return iter_ == rhs.iter_ && incoming_ == rhs.incoming_; +} + +inline bool NeighborIter::operator!=(const NeighborIter& rhs) const { + return !(*this == rhs); +} + +inline void NeighborIter::operator++() { ++iter_; } + +inline Node* NeighborIter::operator*() const { + const Edge* e = *iter_; + return incoming_ ? e->src() : e->dst(); +} + +inline Node* NeighborIter::operator->() const { + const Edge* e = *iter_; + return incoming_ ? e->src() : e->dst(); +} + +inline bool Edge::IsControlEdge() const { + // Note that if either src_output_ or dst_input_ is kControlSlot, + // so is the other one (AddEdge checks this). + return src_output_ == Graph::kControlSlot; +} + +inline gtl::iterator_range Graph::nodes() const { + // Note that NodeId 0 is always valid since we don't let the source + // node be removed from the graph. + return gtl::make_range(NodeIter(this, 0), NodeIter(this, num_node_ids())); +} + +inline gtl::iterator_range Graph::op_nodes() const { + // Note that NodeId 0 is always valid since we don't let the source + // node be removed from the graph. + // + // The current implementation of Graph maintains the invariant that the + // first two nodes are the source and sink nodes, and all other nodes are op + // nodes. This method (op_nodes()) relies on this invariant. 
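+  // (Editorial note: the two unconditional increments below skip the source
+  // node (id 0) and the sink node (id 1), which always occupy the first two
+  // ids, so iteration starts at the first op node.)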
+ NodeIter begin(this, 0); + NodeIter end(this, num_node_ids()); + if (begin != end) { + ++begin; + } + if (begin != end) { + ++begin; + } + return gtl::make_range(begin, end); +} + +inline void Node::set_assigned_device_name_index(int index) { + graph_->CheckDeviceNameIndex(index); + assigned_device_name_index_ = index; +} + +inline void Node::set_assigned_device_name(const std::string& device_name) { + graph_->set_assigned_device_name(this, device_name); +} + +inline const std::string& Node::assigned_device_name() const { + return graph_->get_assigned_device_name(*this); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/graph_debug_info_builder.h b/third_party/tflite-hdrs/tensorflow/core/graph/graph_debug_info_builder.h new file mode 100644 index 00000000..b1c8fcef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/graph_debug_info_builder.h @@ -0,0 +1,210 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_DEBUG_INFO_BUILDER_H_ +#define TENSORFLOW_CORE_GRAPH_GRAPH_DEBUG_INFO_BUILDER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/platform/stack_frame.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { + +// Language agnostic stack traces. +class AbstractStackTrace { + public: + struct TracePrintingOptions { + // Show inline the contents of each stack line. + bool show_line_contents = false; + + // Drop the common largest prefix of all filenames in stack frames. + bool filter_common_prefix = false; + + // Do not show internal frames. + bool drop_internal_frames = false; + }; + + virtual ~AbstractStackTrace() = default; + + // The returned span is alive as long as the AbstractStackTrace is alive. + virtual absl::Span ToFrames() const = 0; + + // Returns the stack frames without caching any generated data. + virtual std::vector ToUncachedFrames() const = 0; + + // Returns the last stack frame from user code, attempting to ignore the + // framework code. Returns an empty frame if no such stack frame was found. + virtual StackFrame LastUserFrame() const = 0; + + // Returns stack trace from user code (instead of op creation ones returned in + // ToFrames). + virtual std::vector GetUserFrames(int limit) const = 0; + + virtual std::string ToString(const TracePrintingOptions& opts) const = 0; +}; + +// A frozen sequence of StackFrames; an adapter for a span of StackFrames that +// conforms to the AbstractStackTrace contract. 
+class FrozenStackTrace : public AbstractStackTrace { + public: + // Constructs a FrozenStackTrace from a span of StackFrames by making a copy + // of each stack frame. + explicit FrozenStackTrace(absl::Span frames, + absl::Span user_frames = {}); + + explicit FrozenStackTrace(std::vector&& frames) + : frames_(std::move(frames)), user_frames_({}) {} + + FrozenStackTrace(FrozenStackTrace&&) = default; + + // Constructs a FrozenStackTrace from serialized proto data. + FrozenStackTrace(const GraphDebugInfo::StackTrace& stack_trace, + const GraphDebugInfo& debug_info); + + ~FrozenStackTrace() override = default; + + absl::Span ToFrames() const override; + + std::vector ToUncachedFrames() const override; + + StackFrame LastUserFrame() const override; + + std::vector GetUserFrames(int limit) const override; + + std::string ToString(const TracePrintingOptions& opts) const override; + + private: + std::vector frames_; + std::vector user_frames_; +}; + +// Holder type to use `AbstractStackTrace` as a key. +struct StackTracePointer { + std::shared_ptr trace; + + template + friend H AbslHashValue(H h, const StackTracePointer& p) { + for (const auto& frame : p.trace->ToFrames()) { + h = H::combine(std::move(h), frame); + } + return h; + } + + bool operator==(const StackTracePointer& other) const { + absl::Span other_frames = other.trace->ToFrames(); + absl::Span frames = trace->ToFrames(); + return frames == other_frames; + } +}; + +using StackTracesMap = + absl::flat_hash_map>; + +// Load all stack traces from `debug_info`. +StackTracesMap LoadTracesFromDebugInfo(const GraphDebugInfo& debug_info); +absl::StatusOr LoadTracesFromDebugInfoStr( + absl::string_view debug_info_str); + +// Generates a GraphDebugInfo proto from a StackTracesMap object. Returns user +// frames by default. If `user_frames` is false, returns all frames. +GraphDebugInfo StackTracesMapToGraphDebugInfo(const StackTracesMap& map, + bool user_frames = true); + +// Builder for GraphDebugInfo protos from either an existing map of string keys +// to stack traces, or individual stack traces, or both. All stack traces in a +// GraphDebugInfo are stored with a string key in the `traces` field. In the +// case of an existing map, its keys are used, appended with a key suffix, +// which may be empty. If it is not empty, it is conventionally of the form +// "@function_name", although this class doesn't care. In the case of an +// individual stack trace, a key for `traces` must be provided. +// +// This builder will create a list of the unique file names across all stack +// traces and store it in the `files` field. When storing stack traces into the +// proto, file names are replaced by their index into `files`. +// +// Typical usage is to call one or both of the accumulate methods one or more +// times and then to call the Build(). +class GraphDebugInfoBuilder { + public: + struct Options { + // Call the AbstractTraceMap GetUserFrames method rather than ToFrames + bool user_frames; + // Value of `limit` to pass to GetUserFrames if `user_frames` is true, + // otherwise ignored + int user_frames_limit; + }; + + GraphDebugInfoBuilder(); + virtual ~GraphDebugInfoBuilder() = default; + + // Adds a map of stack traces to the GraphDebugInfo proto. For each key (node + // id) and stack traces entry in `stack_traces_map`, combine the key with + // `key_suffix` to form a new key and use that to add the stack traces to the + // `traces` field of the proto. 
If not empty, the suffix is typically of the + // form "@function_name", although this function doesn't care. + void AccumulateStackTracesMap(const StackTracesMap& stack_traces_map, + absl::string_view key_suffix = "", + const GraphDebugInfoBuilder::Options& options = + GraphDebugInfoBuilder::Options()); + + // Adds one stack trace to the GraphDebugInfo proto, using `traces_key` as the + // key for the `traces` field of the proto. + void AccumulateStackTrace(std::shared_ptr trace, + absl::string_view traces_key, + const GraphDebugInfoBuilder::Options& options = + GraphDebugInfoBuilder::Options()); + + void AppendGraphDebugInfo(absl::string_view prefix, + const GraphDebugInfo& new_info); + + // These string methods are used in the Python bindings to avoid symbol + // resolution errors with pybind on Windows. + absl::Status AppendGraphDebugInfoStr(absl::string_view prefix, + absl::string_view new_info_str); + + std::string ToGraphDebugInfoStr() const; + + // Returns the GraphDebugInfo proto. + GraphDebugInfo Build() const; + + private: + void AppendToStackTraceProto(const StackFrame& stack_frame, + GraphDebugInfo::StackTrace& stack_trace_proto); + + std::unique_ptr debug_info_; + absl::flat_hash_map file_name_to_index_; + + absl::flat_hash_map trace_to_index_; + absl::flat_hash_map frame_to_index_; + int new_name_index_ = 0; + + GraphDebugInfoBuilder(const GraphDebugInfoBuilder&) = delete; + void operator=(const GraphDebugInfoBuilder&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_GRAPH_DEBUG_INFO_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/graph_def_builder.h b/third_party/tflite-hdrs/tensorflow/core/graph/graph_def_builder.h new file mode 100644 index 00000000..b635ece0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/graph_def_builder.h @@ -0,0 +1,216 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_ +#define TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_ + +#include +#include + +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +// Given a function like: +// namespace ops { +// Node* Identity(NodeOut input, const GraphDefBuilder::Options& opts) { +// if (opts.HaveError()) return nullptr; +// static const string kOpName = "Identity"; +// NodeBuilder node_builder(opts.GetNameForOp(kOpName), kOpName, +// opts.op_registry()); +// node_builder.Input(input); +// return opts.FinalizeBuilder(&node_builder); +// } +// } // namespace ops +// +// // Or, alternatively: +// namespace ops { +// Node* Identity(NodeOut input, const GraphDefBuilder::Options& opts) { +// static const string kOpName = "Identity"; +// return UnaryOp(kOpName, input, opts); +// } +// } // namespace ops +// +// You call it like: +// GraphDefBuilder b; +// using namespace ::tensorflow::ops; // NOLINT(build/namespaces) +// Node* na = Const(7, b.opts()); +// // Note: WithName() returns a copy, opts is unchanged. +// Node* nb = Const(5, b.opts().WithName("control-input")); +// Node* nc = Identity(na, b.opts().WithControlInput(nb)); +// GraphDef graph_def; +// Status status = b.ToGraphDef(&graph_def); +// if (!status.ok()) { /* Handle error */ } +// +// In tests you can skip the status handling via: +// GraphDefBuilder b(GraphDefBuilder::kFailImmediately); +// ... +// b.ToGraphDef(&graph_def); + +class GraphDefBuilder { + public: + // Options for adding a Node to a Graph. + class Options { + public: + // Sets the Graph (that Nodes will be added to) and the status. The + // status may be set to nullptr, in which case errors cause CHECK + // failures. The graph and status must outlive *this. + Options(Graph* graph, absl::Status* status); + ~Options(); + + // Methods for setting options. These are const methods: they + // return a copy of *this with the option set. + Options WithName(absl::string_view name) const; + Options WithDevice(absl::string_view device) const; + Options WithControlInput(Node* control_input) const; + Options WithControlInputs(absl::Span control_inputs) const; + + // Override the default value for an optional attr. + template + Options WithAttr(absl::string_view attr_name, T&& value) const { + return Options(*this).WithAttrImpl(attr_name, std::forward(value)); + } + // Note: overload needed to allow {...} expressions for value. + template + Options WithAttr(absl::string_view attr_name, + std::initializer_list value) const { + return WithAttr>(attr_name, std::move(value)); + } + + // Methods for using options from a function that creates a Node. + + // Returns true if the status associated with *this has an error. + // Use this to skip processing that may depend on prior results. + bool HaveError() const { return status_ != nullptr && !status_->ok(); } + + // Returns a string representation of the status associated with *this. + // Returns the string `"OK"` if the status doesn't have any error. + string StatusToString() const { + return status_->ok() ? 
"OK" : std::string(status_->message()); + } + + // Given the Op type name, return a name for a node of that type. + // Uses the value set in WithName() if that has been called. Otherwise, + // returns a name built out of the Op type name. + string GetNameForOp(absl::string_view op) const; + + // Sets the device, adds control inputs, adds attrs, and calls Finalize(). + // If Finalize returns an error, it is saved and this function returns + // nullptr. + Node* FinalizeBuilder(NodeBuilder* builder) const; + + // Updates the associated status, if any, or calls TF_CHECK_OK if none. + void UpdateStatus(const absl::Status& status) const; + + // Accessor + const OpRegistryInterface* op_registry() const { + return graph_->op_registry(); + } + + private: + Options WithNameImpl(absl::string_view name); + Options WithDeviceImpl(absl::string_view device); + Options WithControlInputImpl(Node* control_input); + Options WithControlInputsImpl(absl::Span control_inputs); + template + Options WithAttrImpl(absl::string_view name, T&& value) { + attrs_.emplace_back(string(name), AttrValue()); + SetAttrValue(std::forward(value), &attrs_.back().second); + return *this; + } + + Graph* const graph_; + absl::Status* const status_; + string name_; + string device_; + std::vector control_inputs_; + std::vector> attrs_; + }; + + // Start building a new graph. + explicit GraphDefBuilder( + const OpRegistryInterface* op_registry = OpRegistry::Global()) + : graph_(op_registry), flib_def_(op_registry), opts_(&graph_, &status_) {} + + // For use in tests, where you want to fail immediately on error instead + // of checking the status at the end. + enum TestFailImmediatelyType { kFailImmediately }; + explicit GraphDefBuilder( + TestFailImmediatelyType, + const OpRegistryInterface* op_registry = OpRegistry::Global()) + : graph_(op_registry), flib_def_(op_registry), opts_(&graph_, nullptr) {} + + // Gets the Options with the associated Graph and Status. + const Options& opts() const { return opts_; } + + // Once all the nodes have been added, call this to get whether it was + // successful, and if so fill *graph_def. + absl::Status ToGraphDef(GraphDef* graph_def) const; + + // Adds the function and gradient definitions in `fdef_lib` to this graph's op + // registry. Ignores duplicate functions, and returns a bad status if an + // imported function differs from an existing function or op with the same + // name. + absl::Status AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) { + return flib_def_.AddLibrary(fdef_lib); + } + + // Returns whether a user-defined function with `name` already exists in the + // graph. + bool HasFunction(const string& name) { + return flib_def_.Find(name) != nullptr; + } + + private: + Graph graph_; + FunctionLibraryDefinition flib_def_; + absl::Status status_; + Options opts_; +}; + +namespace ops { + +// A NodeOut may either be a regular input or back input. Regular +// inputs are specified via either a Node* or a Node* and an output +// index. Back inputs are specified by a node name, output index, and +// output type. +typedef NodeBuilder::NodeOut NodeOut; + +// For adding an Op with no inputs to a GraphDefBuilder. +Node* SourceOp(const string& op_name, const GraphDefBuilder::Options& opts); + +// For adding an Op with one input to a GraphDefBuilder. +Node* UnaryOp(const string& op_name, NodeOut input, + const GraphDefBuilder::Options& opts); + +// For adding an Op with two inputs to a GraphDefBuilder. 
+Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b, + const GraphDefBuilder::Options& opts); + +// For adding an Op with three inputs to a GraphDefBuilder. +Node* TernaryOp(const string& op_name, NodeOut a, NodeOut b, NodeOut c, + const GraphDefBuilder::Options& opts); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/graph_node_util.h b/third_party/tflite-hdrs/tensorflow/core/graph/graph_node_util.h new file mode 100644 index 00000000..146c4c07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/graph_node_util.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_NODE_UTIL_H_ +#define TENSORFLOW_CORE_GRAPH_GRAPH_NODE_UTIL_H_ + +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +class Node; +struct NodeDebugInfo; + +// We forward declare protos so that kernels don't need to depend on them +class NodeDef; +class OpDef; + +// Produce a human-readable version of a Node or NodeDef that is more concise +// than a text-format proto. +string SummarizeNode(const Node& node); + +// Produces a formatted string pattern from the node which can uniquely identify +// this node upstream to produce an informative error message. The pattern +// followed is: {{node }} +string FormatNodeForError(const Node& node); + +// Merges the original node names from the debug information of 'from' to the +// debug information of 'to'. +void MergeDebugInfo(const NodeDebugInfo& from, Node* to); +void MergeDebugInfo(const NodeDebugInfo& from, NodeDef* to); +void MergeDebugInfo(const NodeDef& from, NodeDef* to); + +// Computes the mapping from input/output argument name to the +// corresponding input/output index range. For example, +// input "foo" corresponds to input indices +// [ (*inputs)["foo"].first, (*inputs)["foo"].second ). +// NOTE(mrry): To reduce allocations when the map is used and save +// space, the returned `NameRangeMap` objects borrow the input/output +// argument names from `op_def`. The `op_def` must outlive the +// returned `NameRangeMap` objects. +absl::Status NameRangesForNode(const Node& node, const OpDef& op_def, + NameRangeMap* inputs, NameRangeMap* outputs); + +// Returns "status" with formatted Node attached as additional text +// in the error message. If 'allow_multiple_formatted_node' is false and there +// is already a formatted Node present in 'status', we simply attach the name +// of the Node instead of the formatted string. 
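// Hedged illustration (an added sketch, not upstream documentation; the node
// and error text are placeholders): FormatNodeForError() wraps the node name
// in the "{{node <name>}}" pattern described above, and AttachDef() appends
// that pattern to an existing status so tooling can trace the error back to
// the offending node:
//
//   Node* matmul = ...;  // a node named "my_matmul"
//   absl::Status s = errors::InvalidArgument("incompatible shapes");
//   s = AttachDef(s, *matmul);
//   // s.message() now also references {{node my_matmul}}.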
+absl::Status AttachDef(const absl::Status& status, const Node& node, + bool allow_multiple_formatted_node = false); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_GRAPH_NODE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/graph_partition.h b/third_party/tflite-hdrs/tensorflow/core/graph/graph_partition.h new file mode 100644 index 00000000..59e9fe0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/graph_partition.h @@ -0,0 +1,109 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_ +#define TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/costmodel.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +struct PartitionOptions { + // A function that returns a location for the execution of a given + // Node. + typedef std::function NodeToLocFunc; + NodeToLocFunc node_to_loc = nullptr; + + // A function that returns a unique graph node name with the given + // prefix. + typedef std::function NewNameFunc; + NewNameFunc new_name = nullptr; + + // A function that returns the incarnation of a device given the + // device's fullname. If not found, GetIncarnationFunc should return + // kIllegalIncarnation. + static constexpr uint64 kIllegalIncarnation = 0; + typedef std::function GetIncarnationFunc; + GetIncarnationFunc get_incarnation = nullptr; + + // If specified, flib_def defines a function library that should be + // partitioned and replicated into each resulting partition graphs. + const FunctionLibraryDefinition* flib_def = nullptr; + + // True if all the control flow "code" has already been added. The + // control flow code needs to be added when we still have the entire + // graph before any partitioning. So this flag should be false for + // the first partitioning but true for all subsequent partitioning. + // + // TODO(yuanbyu): We could also make the addition of the control + // flow code incremental based on 'node_to_loc'. This makes the + // communication a broadcast tree, which could be more efficient when + // the number of participating devices is large. + bool control_flow_added = false; + + // A function that returns the data type into which the tensor + // should be cast before sent over the wire. + typedef std::function ShouldCastFunc; + ShouldCastFunc should_cast = nullptr; + + // Schedule the execution of the recvs based on their start times + // computed by some scheduling algorithm. The recvs are divided into + // epochs based on their start times. A recv is enabled only when + // execution reaches its epoch - N for some predefined N. + bool scheduling_for_recvs = false; + // The start time for each node in the graph computed by some scheduling + // algorithm. 
If 'need_to_record_start_times' is true, we record them + // in the graph as a node attribute. + bool need_to_record_start_times = false; + std::vector start_times; + + // Optional customized function to compute the "tensor_name" attr value of + // Send/Recv ops inserted during partitioning. + std::function get_tensor_name_attr = nullptr; + + // If true, the `Partition()` function can make destructive changes to the + // passed-in `Graph`. + // + // TODO(b/327983931): Add wrapper functions for partitioning that clearly + // signal this intent by taking a `Graph` or `Graph&&`. + bool can_make_destructive_changes = false; +}; + +// Partition "input" graph into a set of graphs, one per location. +// The location for node n is derived by calling opts.node_to_loc(n). +// New nodes added by Partition use "opts.new_name(old_name)" to +// generate node names. +// +// Stores the partitions in *partitions. +absl::Status Partition(const PartitionOptions& opts, Graph* input, + std::unordered_map* partitions); + +// Add control edges to the partitions to control the ordering +// and timing of the recv nodes based on the start times calculated +// using some scheduling algorithm. +absl::Status AddControlEdges(const PartitionOptions& opts, + std::unordered_map* partitions); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/mkl_graph_util.h b/third_party/tflite-hdrs/tensorflow/core/graph/mkl_graph_util.h new file mode 100644 index 00000000..00e2e74b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/mkl_graph_util.h @@ -0,0 +1,284 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ +#define TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ +#ifdef INTEL_MKL + +#include "absl/base/call_once.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { +// Since our ops are going to produce and also consume N addition tensors +// (Mkl) for N Tensorflow tensors, we can have following different +// orderings among these 2N tensors. +// +// E.g., for Tensorflow tensors A, B, and C, our ops will produce and +// consume A_m, B_m, and C_m additionally. +// +// INTERLEAVED: in this case 2N tensors are interleaved. So for above +// example, the ordering looks like: A, A_m, B, B_m, C, C_m. +// +// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed +// by N Mkl tensors. So for above example, the ordering looks +// like: A, B, C, A_m, B_m, C_m +// +// Following APIs map index of original Tensorflow tensors to their +// appropriate position based on selected ordering. 
For contiguous ordering, +// we need to know the total number of tensors (parameter total). +// +typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; +// NOTE: Currently, we use contiguous ordering. If you change this, then you +// would need to change Mkl op definitions in nn_ops.cc. +static const MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; + +// Get index of MetaData tensor from index 'n' of Data tensor. +inline int DataIndexToMetaDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + // For interleaved ordering, Mkl tensor follows immediately after + // Tensorflow tensor. + return n + 1; + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. + return n + total_tensors / 2; + } +} + +int inline GetTensorDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + return 2 * n; // index corresponding to nth input/output tensor + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + return n; + } +} + +int inline GetTensorMetaDataIndex(int n, int total_tensors) { + // Get index for TensorData first and then use mapping function + // to get TensorMetaData index from TensorData index. + int tidx = GetTensorDataIndex(n, total_tensors); + return DataIndexToMetaDataIndex(tidx, total_tensors); +} + +// check if the control between src and dst nodes already exists +bool inline DoesControlEdgeExist(const Node* src, const Node* dst) { + for (const Edge* edge : src->out_edges()) { + if (edge->IsControlEdge() && edge->dst() == dst) { + return true; + } + } + return false; +} + +// In TF 2.8, oneDNN blocked format will not be supported. +// TODO(intel_tf): Cleanup shall be done in future: +// (1) Remove this method; +// (2) Update related code wherever it is called. +bool inline NativeFormatEnabled() { return true; } + +// Check if the data_format attribute in the node def represents 5D tensor +bool inline Check5DFormat(const NodeDef& ndef) { + string data_format; + TF_CHECK_OK(GetNodeAttr(ndef, "data_format", &data_format)); + if (data_format.compare("NCDHW") == 0 || data_format.compare("NDHWC") == 0) { + return true; + } + return false; +} + +namespace mkl_op_registry { +// MKL operators whose kernels are registered with 'MklLayoutDependentOp' label +// (e.g., MklConv2D) understand input tensors in MKL layout. These operators +// get additional meta-tensors for actual input tensors. +static const char* kMklLayoutDependentOpLabel = "MklLayoutDependentOp"; +static const char* kMklLayoutDependentOpLabelPattern = + "label='MklLayoutDependentOp'"; +// MKL operators whose kernels are registered with 'MklNameChangeOp' label +// (e.g., MklMatMul, MklTranspose) do not understand input tensors in MKL +// layout. These operators do not get additional meta-tensors. The signatures of +// these operators are the same as the original TensorFlow operators that they +// correspond to. So these ops just go through a name change during graph +// rewrite pass. +static const char* kMklNameChangeOpLabel = "MklNameChangeOp"; +static const char* kMklNameChangeOpLabelPattern = "label='MklNameChangeOp'"; +static const char* kMklQuantizedOpLabel = "QuantizedMklOp"; +static const char* kMklQuantizedOpLabelPattern = "label='QuantizedMklOp'"; + +// Prefix that we add to Tensorflow op name to construct Mkl op name. 
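// Illustrative sketch (assumed usage; kernel and variable names are
// placeholders): with the contiguous ordering selected above and three
// TensorFlow tensors A, B, C, a kernel sees six inputs laid out as
// A, B, C, A_m, B_m, C_m. The index helpers above then map tensor 1 (B) to
// data index 1 and metadata index 1 + 6 / 2 == 4:
//
//   const int total = 6;  // 3 TF tensors + 3 Mkl metadata tensors
//   const Tensor& b_data = ctx->input(GetTensorDataIndex(1, total));      // B
//   const Tensor& b_meta = ctx->input(GetTensorMetaDataIndex(1, total));  // B_m
//
// where `ctx` is the OpKernelContext of a hypothetical Mkl kernel.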
+static const char* const kMklOpPrefix = "_Mkl"; +// TODO(intel-tf): PR review feedback (penpornk) +// Can we add eager_mode (or is_eager) as an op attribute instead? +// This way we don't need to rename the op just to pass eager_mode +// through template parameter. +static const char* const kMklEagerOpPrefix = "_MklEager"; + +// Prefix that we add to TF op name to construct MKL op that does not +// depend on layout propagation. It will be used in both Eager and graph +// modes unless there is a reason to have additional op name with +// _MklEager prefix. +static const char* const kMklNativeOpPrefix = "_MklNative"; + +// Get the name of Mkl Native (does not depend on layout propagation) op +// from original TensorFlow op. +inline string GetMklNativeOpName(const string& name) { + // There are few operators that don't depend on layout propagation but are + // prefixed with _Mkl instead of _MklNative. + bool result = + (0 == name.compare("ConjugateTranspose") || + 0 == name.compare("SparseTensorDenseMatMul") || + 0 == name.compare("BatchMatMul") || 0 == name.compare("BatchMatMulV2") || + 0 == name.compare("Einsum") || 0 == name.compare("MatMul") || + 0 == name.compare("Transpose") || 0 == name.compare("QuantizeV2") || + 0 == name.compare("Dequantize") || 0 == name.compare("Softmax") || + 0 == name.rfind("Quantized", 0)); + + if (result) { + return string(kMklOpPrefix) + name; + } else { + return string(kMklNativeOpPrefix) + name; + } +} + +// Get the name of Mkl op from original TensorFlow op +// We prefix the original op with _Mkl or _MklNative to get Mkl op. +inline string GetMklOpName(const string& name) { + if (!NativeFormatEnabled()) { + return string(kMklOpPrefix) + name; + } else { + return GetMklNativeOpName(name); + } +} + +// Get the name of Mkl Eager op from original TensorFlow op +// We prefix 'MklEager' to the original op to get Mkl Eager op. +inline string GetMklEagerOpName(const string& name) { + return string(kMklEagerOpPrefix) + name; +} + +// Check whether opname with type T is registered as MKL operator +// that will go through name change or layout change pass. +// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as MKL op that will go through name +// change or layout change pass; false otherwise +static inline bool IsMklOp(const string& op_name, DataType T, + bool is_native_op) { + string label = is_native_op ? 
kMklNameChangeOpLabelPattern + : kMklLayoutDependentOpLabelPattern; + string registered_kernels_key = op_name + label + std::to_string(T); + thread_local static auto registered_kernels_map = + std::make_unique>(); + auto kernel_element = registered_kernels_map->find(registered_kernels_key); + bool kernel_registered = false; + + if (kernel_element == registered_kernels_map->end()) { + string registered_kernels = KernelsRegisteredForOp(op_name); + // String returned by KernelsRegisteredForOp looks like below: + // + // Op = _MklMatMul, kernels = + // device='CPU'; label='MklNameChangeOp'; T in [DT_COMPLEX128] + // device='CPU'; label='MklNameChangeOp'; T in [DT_COMPLEX64] + // device='CPU'; label='MklNameChangeOp'; T in [DT_DOUBLE] + // device='CPU'; label='MklNameChangeOp'; T in [DT_FLOAT] + + if (is_native_op && + registered_kernels.find(kMklQuantizedOpLabelPattern) != string::npos) { + // Restrict quantized ops to QUINT8, QINT8 and DT_QINT32 + kernel_registered = (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32); + } + + // Now we just construct a search string to match what we are looking for. + string search_string = + label + string("; T in [") + DataType_Name(T) + string("]"); + + if (registered_kernels.find(search_string) != string::npos) { + kernel_registered = is_native_op + ? (T == DT_COMPLEX128 || T == DT_COMPLEX64 || + T == DT_DOUBLE || T == DT_FLOAT) + : T == DT_FLOAT; + if (!kernel_registered) { + if ((T == DT_BFLOAT16 || T == DT_HALF) && + IsDataTypeSupportedByOneDNNOnThisCPU(T)) { + kernel_registered = true; + } else { + DataTypeUnsupportedWarning(T); + } + } + } + registered_kernels_map->insert( + std::make_pair(registered_kernels_key, kernel_registered)); + } else { + // Kernel is visited at least once. Return stored registration result. + kernel_registered = kernel_element->second; + } + return kernel_registered; +} + +// TODO(intel-tf): QuantizedConv2D is registered with input: QUINT8 +// filter:QINT8 for oneDNN integration. First a dummy kernel is created +// and then it is replaced by an actual kernel. +static inline bool IsMklQuantizedOp(const string& op_name, DataType Tinput, + DataType Tfilter) { + // Restrict quantized ops to QUINT8 and QINT8 for now + if (IsMklOp(op_name, Tinput, kMklQuantizedOpLabelPattern)) { + return (Tfilter == DT_QINT8); + } + return false; +} + +// Check if the operator with 'op_name' and type 'T' is an MKL operator that +// will either understand input tensors in MKL layout or will go through name +// rewrite that some operators go through. +static inline bool IsMklOp(const string& op_name, DataType T) { + return IsMklOp(op_name, T, true) || IsMklOp(op_name, T, false); +} + +static inline bool IsMklOp(const Node* n) { + DataType T; + return GetNodeAttr(n->def(), "T", &T).ok() && IsMklOp(n->type_string(), T); +} + +// Check whether opname with type T is registered as MKL-compliant and +// is element-wise. 
+// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as element-wise Mkl op; +// false otherwise +static inline bool IsMklElementWiseOp(const string& op_name, DataType T) { + if (!IsMklOp(op_name, T)) { + return false; + } + bool result = (0 == op_name.compare(GetMklOpName("Add")) || + 0 == op_name.compare(GetMklOpName("AddV2")) || + 0 == op_name.compare(GetMklOpName("Sub")) || + 0 == op_name.compare(GetMklOpName("Mul")) || + 0 == op_name.compare(GetMklOpName("Maximum")) || + 0 == op_name.compare(GetMklOpName("Sigmoid")) || + 0 == op_name.compare(GetMklOpName("SquaredDifference"))); + + return result; +} +} // namespace mkl_op_registry +} // namespace tensorflow +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/mkl_testlib.h b/third_party/tflite-hdrs/tensorflow/core/graph/mkl_testlib.h new file mode 100644 index 00000000..3dffded1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/mkl_testlib.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_MKL_TESTLIB_H_ +#define TENSORFLOW_CORE_GRAPH_MKL_TESTLIB_H_ + +#ifdef INTEL_MKL + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +namespace test { +namespace graph { + +Node* oneDNNSoftmax(Graph* g, Node* input); + +#ifdef ENABLE_ONEDNN_V3 +Node* oneDNNSparseCSRMatmul(Graph* g, Node* csr_matrix_t, Node* b); +#endif // ENABLE_ONEDNN_V3 + +} // namespace graph +} // namespace test +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_GRAPH_MKL_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/node_builder.h b/third_party/tflite-hdrs/tensorflow/core/graph/node_builder.h new file mode 100644 index 00000000..6f249371 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/node_builder.h @@ -0,0 +1,181 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_ +#define TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_ + +#include +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +// This is a helper for creating a Node and adding it to a Graph. +// Internally, it uses a NodeDefBuilder to automatically set attrs +// that can be inferred from the inputs, and use default values +// (where they exist) for unspecified attrs. Example usage: +// +// Node* node; +// Status status = NodeBuilder(node_name, op_name) +// .Input(...) +// .Attr(...) +// .Finalize(&graph, &node); +// if (!status.ok()) return status; +// // Use node here. +class NodeBuilder { + public: + // For specifying the output of a Node to provide to one of the Input() + // functions below. It supports both regular inputs (where you are + // connecting to an existing Node*), and inputs from outside the graph + // (or haven't been added to the graph yet, like back edges, where + // you don't have a Node*). Both types can be mixed, e.g. in an + // ArraySlice. + struct NodeOut { + // For referencing an existing Node. + NodeOut(Node* n, int32_t i = 0); + NodeOut(OutputTensor t); + + // For referencing Nodes not in the graph being built. It is + // useful when preparing a graph for ExtendSession or creating a + // back edge to a node that hasn't been added to the graph yet, + // but will be. + NodeOut(absl::string_view name, int32_t i, DataType t); + + // Default constructor for std::vector. + NodeOut(); + + Node* node; + // error is set to true if: + // * the NodeOut was default constructed and never overwritten, + // * a nullptr Node* was passed to the NodeOut constructor, or + // * an out-of-range index was passed to the NodeOut constructor. + bool error; + string name; + int32 index; + DataType dt; + }; + + // Specify the name and the Op (either via an OpDef or the name of + // the Op plus a registry) for the Node. Other fields are + // specified by calling the methods below. + // REQUIRES: The OpDef must satisfy ValidateOpDef(). + NodeBuilder(absl::string_view name, absl::string_view op_name, + const OpRegistryInterface* op_registry = OpRegistry::Global(), + const NodeDebugInfo* debug = nullptr); + NodeBuilder(absl::string_view name, const OpDef* op_def); + + // Create a NodeBuilder from an existing NodeDefBuilder. + NodeBuilder(const NodeDefBuilder& def_builder); + + // You must call one Input() function per input_arg in the Op, + // *and in the same order as the input_args appear in the OpDef.* + + // For inputs that take a single tensor. + NodeBuilder& Input(Node* src_node, int src_index = 0); + NodeBuilder& Input(NodeOut src); + + // For inputs that take a list of tensors. + NodeBuilder& Input(absl::Span src_list); + + // Require that this node run after src_node(s). + NodeBuilder& ControlInput(Node* src_node); + NodeBuilder& ControlInputs(absl::Span src_nodes); + + // Sets the "requested device spec" in the NodeDef (not the + // "assigned device" in the Node). + NodeBuilder& Device(absl::string_view device_spec); + + // Sets the device name in the "assigned device" field in tensorflow::Node. 
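  // Hedged sketch (assumed usage; node and op names are placeholders): the
  // (name, index, type) form of NodeOut can reference a node that has not
  // been added to the graph yet, e.g. the back edge of a while loop:
  //
  //   std::vector<NodeBuilder::NodeOut> merge_inputs;
  //   merge_inputs.emplace_back(enter_node, 0);                       // existing node
  //   merge_inputs.emplace_back("while/NextIteration", 0, DT_FLOAT);  // back edge
  //   Node* merge = nullptr;
  //   absl::Status s = NodeBuilder("while/Merge", "Merge")
  //                        .Input(merge_inputs)
  //                        .Finalize(graph, &merge);
  //
  // Finalize() records the back edge by name and adds graph edges only for
  // the regular inputs.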
+ NodeBuilder& AssignedDevice(absl::string_view device); + + // Sets the _XlaCluster attribute in created node to `xla_cluster`. + NodeBuilder& XlaCluster(absl::string_view xla_cluster); + + // Set the value of an attr. attr_name must match the name of one of + // attrs defined by the Op, and value must have the corresponding type + // (see SetAttrValue() in ../framework/attr_value_util.h for legal + // types for value). Note that attrs will be set automatically if + // they can be determined by the inputs. + template + NodeBuilder& Attr(absl::string_view attr_name, T&& value); + template + NodeBuilder& Attr(absl::string_view attr_name, + std::initializer_list value); + + // Validates the described node and adds it to *graph, adding edges + // for all (non-back) inputs. If created_node is not nullptr, + // *created_node will be set to the new node (or nullptr on error). + // If `consume` is true, the builder state will be moved into `node_def`, + // and the builder will be left in an undefined state. + absl::Status Finalize(Graph* graph, Node** created_node, + bool consume = false); + + // Same as `Finalize` above, but using StatusOr to return value. Preferred + // form. + absl::StatusOr Finalize(Graph* graph, bool consume = false); + + // Accessors for the values set in the constructor. + const string& node_name() const { return def_builder_.node_name(); } + const OpDef& op_def() const { return def_builder_.op_def(); } + + private: + static DataType SafeGetOutput(const Node* node, int i, bool* error) { + if (node != nullptr && i >= 0 && i < node->num_outputs()) { + *error = false; + return node->output_type(i); + } else { + *error = true; + return DT_FLOAT; + } + } + + // If SafeGetOutput indicates a range error, add it to errors_. + void AddIndexError(const Node* node, int i); + + // Set *dt and returns true if i is in range. Combines + // SafeGetOutput() and AddIndexError(). + bool GetOutputType(const Node* node, int i, DataType* dt); + + NodeDefBuilder def_builder_; + const OpRegistryInterface* op_registry_; + std::vector inputs_; + std::vector control_inputs_; + std::vector errors_; + string assigned_device_; +}; + +// IMPLEMENTATION ------------------------------------------------------------- + +template +NodeBuilder& NodeBuilder::Attr(absl::string_view attr_name, T&& value) { + def_builder_.Attr(attr_name, std::forward(value)); + return *this; +} + +template +NodeBuilder& NodeBuilder::Attr(absl::string_view attr_name, + std::initializer_list value) { + def_builder_.Attr(attr_name, value); + return *this; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/optimizer_cse.h b/third_party/tflite-hdrs/tensorflow/core/graph/optimizer_cse.h new file mode 100644 index 00000000..ef466fb7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/optimizer_cse.h @@ -0,0 +1,37 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// An optimization pass that performs common subexpression elimination. + +#ifndef TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_ +#define TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_ + +#include +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Perform common-subexpression elimination on the graph "*g". If +// "consider_fn" is not nullptr, then only nodes for which +// consider_fn(node) returns true will be considered for combining +// during the common subexpression elimination. +// +// Returns true if and only if 'g' is mutated. +extern bool OptimizeCSE(Graph* g, + const std::function& consider_fn); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/regularization/simple_delete.h b/third_party/tflite-hdrs/tensorflow/core/graph/regularization/simple_delete.h new file mode 100644 index 00000000..07ebd00e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/regularization/simple_delete.h @@ -0,0 +1,28 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_REGULARIZATION_SIMPLE_DELETE_H_ +#define TENSORFLOW_CORE_GRAPH_REGULARIZATION_SIMPLE_DELETE_H_ + +#include "tensorflow/core/framework/graph.pb.h" + +namespace tensorflow::graph_regularization { + +// Regularizes the graph_def by deleting non-deterministic sections. +void SimpleDelete(GraphDef& graph_def); + +} // namespace tensorflow::graph_regularization + +#endif // TENSORFLOW_CORE_GRAPH_REGULARIZATION_SIMPLE_DELETE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/regularization/util.h b/third_party/tflite-hdrs/tensorflow/core/graph/regularization/util.h new file mode 100644 index 00000000..2fff6452 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/regularization/util.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_REGULARIZATION_UTIL_H_ +#define TENSORFLOW_CORE_GRAPH_REGULARIZATION_UTIL_H_ + +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow::graph_regularization { + +// Computes the Fingerprint64 hash of the GraphDef. +uint64 ComputeHash(const GraphDef& graph_def); + +// Returns the suffix UID of `function_name`, returns an error if there is none. +absl::StatusOr GetSuffixUID(absl::string_view function_name); + +} // namespace tensorflow::graph_regularization + +#endif // TENSORFLOW_CORE_GRAPH_REGULARIZATION_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/subgraph.h b/third_party/tflite-hdrs/tensorflow/core/graph/subgraph.h new file mode 100644 index 00000000..37013b8f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/subgraph.h @@ -0,0 +1,165 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_ +#define TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_ + +#include + +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { +namespace subgraph { + +// Information about a graph rewritten by `RewriteGraphForExecution()`. +struct RewriteGraphMetadata { + // The element type of each tensor fed to this subgraph. The order + // of types corresponds to the order of tensor names in + // `fed_outputs` when calling `RewriteGraphForExecution()`. + DataTypeVector feed_types; + // The element type of each tensor fetched from this subgraph. The + // order of types corresponds to the order of tensor names in + // `fetch_outputs` when calling `RewriteGraphForExecution()`. + DataTypeVector fetch_types; +}; + +// Describes the action to take on a particular tensor endpoint (described by +// a ":" pair) when pruning the graph. +// +// The `AddNode()` method must be overridden to describe this action. The method +// will be invoked once during `RewriteGraphForExecution()` with tensor endpoint +// named by `endpoint_name`, and it may either create a single new node, or fail +// with an error if the resulting graph would be invalid. +class PruneRewrite { + public: + // `endpoint_name` and `device_info` must outlive this object. 
+ PruneRewrite(const string* endpoint_name, const DeviceAttributes* device_info) + : endpoint_name_(endpoint_name), device_info_(device_info) {} + virtual ~PruneRewrite() {} + + // Creates a new node whose output replaces the given `tensor` in graph `g`. + // The node will be assigned to the device named in `device_info`. + virtual absl::Status AddNode(Graph* g, NodeBuilder::NodeOut tensor, + Node** out_node) = 0; + + // Returns the name of the tensor to which this rewrite applies. + const string& endpoint_name() { return *endpoint_name_; } + + protected: + // The device on which the new node will be created. + const DeviceAttributes& device_info() { return *device_info_; } + + private: + const string* const endpoint_name_; // Not owned. + const DeviceAttributes* const device_info_; // Not owned. +}; + +// Rewrite the graph structure of "*g" to deal with feeding node +// outputs, fetching node outputs, and only running a subset of the +// graph. "fed_outputs" and "fetch_outputs" are both lists of +// output tensor identifiers in the form of +// "[:]", and "target_nodes_str" is a +// lists of target node names in "*g" "g". +// +// In the resulting graph "*g", output edges in "fed_outputs" have +// been redirected to special "_recv" nodes introduced into the graph. +// If these fed nodes are not needed in order to compute the effects +// of the nodes in "target_node_names" and "fetch_outputs", then these may +// be omitted from the graph. +// +// In the resulting graph "*g", additional "_send" nodes are connected +// to every output in "fetch_outputs". These "_send" nodes are set up +// to execute on the device described by device_info. +// +// On success, returns OK, and sets "*g" to a version of "*g" +// that represents the portions of the graph necessary for producing +// the output of all nodes listed in "target_node_names" and fetching the +// specific node outputs specified in "fetch_outputs". +// +// On failure, returns the error status. Possible errors include: +// - fed output "node:output_index" does not exist in "*g" +// - fetch output "node:output_index" does not exist in "*g" +// - target node "node" does not exist in "*g" +absl::Status RewriteGraphForExecution( + Graph* g, const absl::Span& fed_outputs, + const absl::Span& fetch_outputs, + const absl::Span& target_node_names, + const DeviceAttributes& device_info, bool use_function_convention, + RewriteGraphMetadata* out_metadata); + +// A more general version of the above function that supports +// customizable rewriting actions for each fed and fetched tensor. +absl::Status RewriteGraphForExecution( + Graph* g, const std::vector>& feed_rewrites, + const std::vector>& fetch_rewrites, + const absl::Span& target_node_names, + RewriteGraphMetadata* out_metadata); + +///////////////////////////////////////////////////////// +// Custom rewrite actions for fed and fetched tensors. // +///////////////////////////////////////////////////////// + +// A rewrite action that adds an _Arg node for a fed tensor. +class ArgFeedRewrite : public PruneRewrite { + public: + ArgFeedRewrite(const string* endpoint_name, + const DeviceAttributes* device_info, int32_t arg_index) + : PruneRewrite(endpoint_name, device_info), arg_index_(arg_index) {} + absl::Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor, + Node** out_node) override; + + private: + const int32 arg_index_; +}; + +// A rewrite action that adds a client-terminated _Recv node for a fed tensor. 
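// Hedged sketch (assumed usage; tensor names and the DeviceAttributes value
// are placeholders): pruning a graph so that only the nodes needed to feed
// "input:0" and fetch "logits:0" remain:
//
//   std::vector<string> feeds = {"input:0"};
//   std::vector<string> fetches = {"logits:0"};
//   std::vector<string> targets;  // no extra target nodes
//   RewriteGraphMetadata metadata;
//   absl::Status s = RewriteGraphForExecution(
//       &graph, feeds, fetches, targets, device_attributes,
//       /*use_function_convention=*/false, &metadata);
//   // On success, metadata.feed_types and metadata.fetch_types hold the
//   // element types of the fed and fetched tensors, in the order given.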
+class RecvFeedRewrite : public PruneRewrite { + public: + using PruneRewrite::PruneRewrite; + absl::Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor, + Node** out_node) override; +}; + +// A rewrite action that adds a _Retval node for a fetched tensor. +class RetvalFetchRewrite : public PruneRewrite { + public: + RetvalFetchRewrite(const string* endpoint_name, + const DeviceAttributes* device_info, int32_t retval_index) + : PruneRewrite(endpoint_name, device_info), retval_index_(retval_index) {} + absl::Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor, + Node** out_node) override; + + private: + const int32 retval_index_; +}; + +// A rewrite action that adds a client-terminated _Send node for a +// fetched tensor. +class SendFetchRewrite : public PruneRewrite { + public: + using PruneRewrite::PruneRewrite; + absl::Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor, + Node** out_node) override; +}; + +} // namespace subgraph +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/tensor_id.h b/third_party/tflite-hdrs/tensorflow/core/graph/tensor_id.h new file mode 100644 index 00000000..0cdfb7d9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/tensor_id.h @@ -0,0 +1,94 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_ +#define TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_ + +#include + +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { + +struct SafeTensorId; + +// Identifier for a tensor within a step. +// first == operation_name, second == output_index +// Note: does not own backing storage for name. +struct TensorId : public std::pair { + typedef std::pair Base; + + // Inherit the set of constructors. + using Base::pair; + + // NOTE(skyewm): this is required on some platforms. I'm not sure why the + // using statement above isn't always sufficient. + TensorId() : Base() {} + TensorId(const SafeTensorId& id); + + const absl::string_view node() const { return first; } + int index() const { return second; } + + string ToString() const { + if (second == Graph::kControlSlot) return strings::StrCat("^", first); + return strings::StrCat(first, ":", second); + } + + struct Hasher { + public: + std::size_t operator()(const TensorId& x) const { + return Hash32(x.first.data(), x.first.size(), x.second); + } + }; +}; + +TensorId ParseTensorName(const string& name); +TensorId ParseTensorName(absl::string_view name); + +bool IsTensorIdControl(const TensorId& tensor_id); + +// Same as TensorId, except owns the backing storage for the op name. This makes +// the memory management simpler at the expense of a copy. 
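// Illustrative behavior (assumed, with placeholder names):
//
//   string name = "foo:2";
//   TensorId id = ParseTensorName(name);
//   // id.node() == "foo", id.index() == 2, id.ToString() == "foo:2"
//
//   string ctrl_name = "^foo";
//   TensorId ctrl = ParseTensorName(ctrl_name);
//   // ctrl.index() == Graph::kControlSlot; IsTensorIdControl(ctrl) is true.
//
// Because TensorId does not own its name, the string passed to
// ParseTensorName() must outlive the returned id; SafeTensorId below copies
// the name instead.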
+struct SafeTensorId : public std::pair { + typedef std::pair Base; + + // NOTE(skyewm): this is required on some platforms. I'm not sure why the + // using "using Base::pair;" isn't always sufficient. + SafeTensorId() : Base() {} + SafeTensorId(const string& str, int idx) : Base(str, idx) {} + SafeTensorId(const TensorId& id); + + const string& node() const { return first; } + int index() const { return second; } + + string ToString() const { + if (second == Graph::kControlSlot) return strings::StrCat("^", first); + return strings::StrCat(first, ":", second); + } + + struct Hasher { + public: + std::size_t operator()(const TensorId& x) const { + return Hash32(x.first.data(), x.first.size(), x.second); + } + }; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/testlib.h b/third_party/tflite-hdrs/tensorflow/core/graph/testlib.h new file mode 100644 index 00000000..b2d1a416 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/testlib.h @@ -0,0 +1,230 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// DEPRECATED: Use the C++ API defined in tensorflow/cc instead. + +#ifndef TENSORFLOW_CORE_GRAPH_TESTLIB_H_ +#define TENSORFLOW_CORE_GRAPH_TESTLIB_H_ + +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace test { +namespace graph { + +// Converts "g" into its corresponding GraphDef "def". +ABSL_DEPRECATED("Call g->ToGraphDef(def) instead.") +void ToGraphDef(Graph* g, GraphDef* def); + +// A few helpers to construct a graph. + +// Adds a node in "g" producing a constant "tensor". +Node* Constant(Graph* g, const Tensor& tensor); +Node* Constant(Graph* g, const Tensor& tensor, const string& name); + +// Adds a node in "g" producing a constant "tensor" on the host. +// The given node which, unlike the regular Constant above, always +// stores its output on the host. This is necessary for use +// in GPU tests where the test Op in question runs on the device +// but requires some arguments to be pinned to the host. +Node* HostConstant(Graph* g, const Tensor& tensor); +Node* HostConstant(Graph* g, const Tensor& tensor, const string& name); + +// Adds a variable in "g" of the given "shape" and "dtype". +Node* Var(Graph* g, const DataType dtype, const TensorShape& shape); +Node* Var(Graph* g, const DataType dtype, const TensorShape& shape, + const string& name); + +// Adds an assign node in "g" which assigns "val" into "var". +Node* Assign(Graph* g, Node* var, Node* val); + +// Adds a send node "g" sending "input" as a named "tensor" from +// "sender" to "receiver". 
+Node* Send(Graph* g, Node* input, const string& tensor, const string& sender, + const uint64 sender_incarnation, const string& receiver); + +// Adds a recv node in "g" receiving a named "tensor" from "sender" +// to "receiver". +Node* Recv(Graph* g, const string& tensor, const string& type, + const string& sender, const uint64 sender_incarnation, + const string& receiver); + +// Adds a cumsum "node" in "g" doing cumsum(data, axes). +Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false, + bool reverse = false); + +// Adds a reduction "node" in "g" doing sum(data, axes). "reduce" is +// a reduction, e.g., Sum, Max, Min, Mean, etc. +Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes, + bool keep_dims = false); + +// Adds a Matmul node in g doing in0.contract(in1). +Node* Matmul(Graph* g, Node* in0, Node* in1, bool transpose_a, + bool transpose_b); + +// Adds a Matmul node in g doing in0.contract(in1). +Node* BatchMatmul(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y); + +// Adds a Quantize node into g that quantize floats into QUINT8. The range of +// the input float tensor is assumed to be [-1, 1]. +Node* QuantizeToUINT8(Graph* g, Node* data); + +// Adds a unary function "func" "node" in "g" taking "input". +Node* Unary(Graph* g, const string& func, Node* input, int index = 0); + +// Adds an identity node in "g" taking "input" and producing an +// identity copy. +Node* Identity(Graph* g, Node* input, int index = 0); + +// Adds a binary function "func" node in "g" taking "in0" and "in1". +Node* Binary(Graph* g, const string& func, Node* in0, Node* in1); + +// Adds a function "func" node in "g" taking inputs "ins". +Node* Multi(Graph* g, const string& func, absl::Span ins); + +// Adds a binary add node in "g" doing in0 + in1. +Node* Add(Graph* g, Node* in0, Node* in1); + +// Reverses dimensions of > +Node* Reverse(Graph* g, Node* tensor, Node* axis); + +// Generates random unit uniform distribution of the input shape. +Node* RandomUniform(Graph* g, Node* input, DataType dtype); + +// Generates random unit normal distribution of the input shape. +Node* RandomGaussian(Graph* g, Node* input, DataType dtype); + +// Generates random gamma distribution with the given shape and alpha[s]. +// Output dtype determined by alpha. +Node* RandomGamma(Graph* g, Node* shape, Node* alpha); + +// Generates random poisson distribution with the given shape and lam[s]. +// Output dtype determined by lam. +Node* RandomPoisson(Graph* g, Node* shape, Node* lam); + +// Rolls tensor by an offset of along the corresponding +// dimensions. +Node* Roll(Graph* g, Node* input, Node* shift, Node* axis); + +// Generates random parameters from the truncated standard normal distribution +// of the input shape +Node* TruncatedNormal(Graph* g, Node* input, DataType dtype); + +// Adds an error node in "g". The node's computation always +// generates an error with the given error message "errmsg". +Node* Error(Graph* g, Node* input, const string& errmsg, + bool log_error = false); + +// Adds a node that generates a invalid ref output. +Node* InvalidRefType(Graph* g, DataType out_type, DataType invalid_type); + +// Adds a node in "g". Its Compute() sleeps a while and outputs the +// input (i.e., same as identity). +Node* Delay(Graph* g, Node* input, Microseconds delay_micros); + +// Adds a no-op "node" in "g", with control inputs from all nodes in +// control_inputs vector. +Node* NoOp(Graph* g, const std::vector& control_inputs); + +// Adds a Switch node in "g". 
If "in1" is true, it forwards "in0" to +// output 1. Otherwise, it forwards "in0" to output 0. +Node* Switch(Graph* g, Node* in0, Node* in1); + +// Adds an Enter node in "g", which enters a new frame. +Node* Enter(Graph* g, Node* input, const string& frame_name); + +// Adds an Exit node in "g", which exits a frame. +Node* Exit(Graph* g, Node* input); + +// Adds a Merge node in "g" with two inputs "in0" and "in1". +Node* Merge(Graph* g, Node* in0, Node* in1); + +// Adds a Merge node in "g". The first input is "in0", the remaining +// inputs are only given by their names in remaining_in. +Node* Merge(Graph* g, Node* in0, absl::Span remaining_in); + +// Adds a NextIteration node in "g", which makes its input available +// to the next iteration. +Node* Next(Graph* g, const string& name, Node* input); + +// Adds a LoopCond node in "g", representing the "pivot" termination +// condition of a loop. +Node* LoopCond(Graph* g, Node* input); + +// Adds a less node in "g", which returns true iff "in0" < "in1". +Node* Less(Graph* g, Node* in0, Node* in1); + +// Adds a select node in "g", which outputs either "inx" or "iny" +// depending on the boolean value of "c". +Node* Select(Graph* g, Node* c, Node* inx, Node* iny); + +// Casts "in" into data type "dst". +Node* Cast(Graph* g, Node* in, DataType dst); + +// Perform gather op on params "in0" with indices "in1" and axis "axis". +Node* Gather(Graph* g, Node* in0, Node* in1, Node* axis); + +// Gets a tensor stored in the session state. +Node* GetSessionTensor(Graph* g, Node* in); + +// Adds a Concat node in "g". The first input is "concat_dim", the +// dimension to concatenate on, and the tensors to concatenate are +// given in "tensors". +Node* Concat(Graph* g, Node* concat_dim, absl::Span tensors); + +// Adds a ConcatV2 node in "g". The last input is "concat_dim", the +// dimension to concatenate on, and the tensors to concatenate are +// given in "tensors". +Node* ConcatV2(Graph* g, absl::Span tensors, Node* concat_dim); + +// Add a Relu node in "g". +Node* Relu(Graph* g, Node* in); + +// Add a Relu6 node in "g". +Node* Relu6(Graph* g, Node* in); + +// Add a BiasAdd node in "g". +Node* BiasAdd(Graph* g, Node* value, Node* bias); + +// Add a Conv2D node in "g". +Node* Conv2D(Graph* g, Node* in0, Node* in1); + +// Add a Diag node in "g". +Node* Diag(Graph* g, Node* in, DataType type); + +// Add a DiagPart node in "g". +Node* DiagPart(Graph* g, Node* in, DataType type); + +// Add a CheckNumerics node in "g". +Node* CheckNumerics(Graph* g, Node* in, const string& message); + +// Add an _Arg node in "g". +Node* Arg(Graph* g, int64_t index, DataType type); + +// Add a _Retval node in "g". +Node* Retval(Graph* g, int64_t index, Node* in, int64_t in_index = 0); + +} // end namespace graph +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_TESTLIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/types.h b/third_party/tflite-hdrs/tensorflow/core/graph/types.h new file mode 100644 index 00000000..05dd03ab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/types.h @@ -0,0 +1,35 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_TYPES_H_ +#define TENSORFLOW_CORE_GRAPH_TYPES_H_ + +#include "tensorflow/core/lib/gtl/int_type.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// We model running time in microseconds. +TSL_LIB_GTL_DEFINE_INT_TYPE(Microseconds, int64_t); + +// We can also model running time in nanoseconds for more accuracy. +TSL_LIB_GTL_DEFINE_INT_TYPE(Nanoseconds, int64_t); + +// We model size in bytes. +TSL_LIB_GTL_DEFINE_INT_TYPE(Bytes, int64_t); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/validate.h b/third_party/tflite-hdrs/tensorflow/core/graph/validate.h new file mode 100644 index 00000000..3d59219b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/validate.h @@ -0,0 +1,68 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_VALIDATE_H_ +#define TENSORFLOW_CORE_GRAPH_VALIDATE_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace graph { + +// Returns OK if every NodeDef in `graph_def` is valid with respect to +// its corresponding OpDef (as defined by ValidateNodeDef()) as +// registered in `op_registry`. Also checks for deprecated ops. +// +// REQUIRES: +// * `op_registry` is not nullptr. +// * `graph_def` has default attrs filled in (see AddDefaultAttrsToGraphDef()). +absl::Status ValidateGraphDef(const GraphDef& graph_def, + const OpRegistryInterface& op_registry); + +// Like ValidateGraphDef() except it makes a copy of `graph_def` and calls +// AddDefaultAttrsToGraphDef() on the copy, removing that requirement from the +// caller. +absl::Status ValidateGraphDefAgainstOpRegistry( + const GraphDef& graph_def, const OpRegistryInterface& op_registry); + +// Like ValidateGraphDefAgainstOpRegistry() except it takes an OpList +// instead of an OpRegistryInterface. Note that the OpList need not +// have descriptions, which can be a big space savings, see +// GetOpListForValidation() below. +absl::Status ValidateGraphDefAgainstOpList(const GraphDef& graph_def, + const OpList& op_list); + +// Get an OpList from `*op_registry` with all the descriptions removed. 
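// Hedged sketch (assumed usage): validating a GraphDef against a
// description-free OpList built by GetOpListForValidation() below:
//
//   OpList op_list;
//   GetOpListForValidation(&op_list);  // defaults to *OpRegistry::Global()
//   absl::Status s = ValidateGraphDefAgainstOpList(graph_def, op_list);
//   if (!s.ok()) { /* reject graph_def */ }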
+void GetOpListForValidation( + OpList* op_list, const OpRegistry& op_registry = *OpRegistry::Global()); + +// Validate that the graph has no cycle except for legal while loop cycles. +// This traverses the specified nodes in topological order to verify there are +// no cycles. Starting with inputless nodes, it visits nodes whose inputs have +// all been visited, and counts the total number of visited nodes. If there is a +// cycle, nodes in the cycle will never be visited, and the visited count will +// be less than the total node count. +absl::Status ValidateGraphHasNoCycle(const Graph& graph); + +// Returns OK if the graph has no duplicate node names. +absl::Status VerifyNoDuplicateNodeNames(const GraphDef& graph); + +} // namespace graph +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_VALIDATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/while_context.h b/third_party/tflite-hdrs/tensorflow/core/graph/while_context.h new file mode 100644 index 00000000..e23e9df9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/while_context.h @@ -0,0 +1,76 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_ +#define TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Information about a while loop. Every user-defined while loop has an +// associated WhileContext, i.e., there is a WhileContext for every execution +// frame. Created with the while loop and used during gradient +// construction. Note that the gradient graph of while loop contains while loops +// itself, but these do not generate separate WhileContexts. +// +// TODO(skyewm): this is currently insufficient to handle nested loops and +// conditionals (and possibly other requirements). This may change a lot in the +// future to support these features. +// +// TODO(skyewm): de/serialize in MetaGraphDef so imported while loops will be +// differentiable. Figure out backwards compatibility story. +class WhileContext { + public: + WhileContext(absl::string_view frame_name, std::vector enter_nodes, + std::vector exit_nodes, OutputTensor cond_output, + std::vector body_inputs, + std::vector body_outputs); + + const string& frame_name() const { return frame_name_; } + const std::vector& enter_nodes() const { return enter_nodes_; } + const std::vector& exit_nodes() const { return exit_nodes_; } + const OutputTensor& cond_output() const { return cond_output_; } + const std::vector& body_inputs() const { return body_inputs_; } + const std::vector& body_outputs() const { + return body_outputs_; + } + + private: + // Each user-defined while loop defines a new execution frame, which is + // uniquely identified by its frame name. Frames are used by the executor to + // manage the iterations of a loop. 
See the FrameState comment in + // core/common_runtime/executor.cc for more details. + const string frame_name_; + + // The enter nodes defining the input loop variables to the while loop. This + // vector defines the order of the loop variables. + const std::vector enter_nodes_; + + // The exit nodes defining the outputs of the while loop. These are in loop + // variable order. + const std::vector exit_nodes_; + + // The boolean output of the loop predicate. + const OutputTensor cond_output_; + + // The inputs and outputs to the loop body. + const std::vector body_inputs_; + const std::vector body_outputs_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/graph/zen_graph_util.h b/third_party/tflite-hdrs/tensorflow/core/graph/zen_graph_util.h new file mode 100644 index 00000000..7dc23fbc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/graph/zen_graph_util.h @@ -0,0 +1,83 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPH_ZEN_GRAPH_UTIL_H_ +#define TENSORFLOW_CORE_GRAPH_ZEN_GRAPH_UTIL_H_ +#ifdef AMD_ZENDNN + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace zen_op_registry { + +// Prefix that we add to Tensorflow op name to construct Zen op name. +static const char* const kZenNodePrefix = "_Zen"; + +// Get the name of Zen op from original TensorFlow op. +// We prefix the original op with "Zen" to get Zen op. +inline string GetZenOpName(const string& name) { + return string(kZenNodePrefix) + name; +} + +// Check whether op name with type T is registered as Zen operator +// that will go through name change or layout change pass. +// +// @input op_name - name of the op. +// @input T - datatype to be used for checking op. +// @return true if op name is registered as Zen op that will go through name +// change or layout change pass; false otherwise. +static inline bool IsZenOpKernelRegistered(const string& op_name, DataType T) { + string registered_kernels_key = op_name + string(DataType_Name(T)); + thread_local static auto* registered_kernels_map = + new absl::flat_hash_map(); + auto kernel_element = registered_kernels_map->find(registered_kernels_key); + bool kernel_registered = false; + + if (kernel_element == registered_kernels_map->end()) { + string registered_kernels = KernelsRegisteredForOp(op_name); + // String returned by KernelsRegisteredForOp looks like below: + // + // Op = ZenMatMul, kernels = + // device='CPU'; T in [DT_FLOAT] + // device='CPU'; T in [DT_DOUBLE] + + // If we have multiple kernels registered for the op. 
We need to verify + // our datatype + if (registered_kernels.find(string(DataType_Name(T))) != string::npos) { + kernel_registered = true; + } + registered_kernels_map->insert( + std::make_pair(registered_kernels_key, kernel_registered)); + } else { + // Kernel is visited at least once. Return stored registration result. + kernel_registered = kernel_element->second; + } + return kernel_registered; +} + +} // namespace zen_op_registry +} // namespace tensorflow + +#endif // AMD_ZENDNN +#endif // TENSORFLOW_CORE_GRAPH_ZEN_GRAPH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/cluster.h b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/cluster.h new file mode 100644 index 00000000..36aec54c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/cluster.h @@ -0,0 +1,149 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_ +#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/device_properties.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace grappler { + +// A cluster represents of collection of hardware resources available to run +// the TensorFlow model. +// A process can only create a single cluster at a time. +class Cluster { + public: + explicit Cluster(int timeout_s); + virtual ~Cluster(); + + // Returns a string that represent the type of cluster that was instantiated. + virtual string type() const = 0; + + // Provision the hardware resources needed to run TensorFlow and start a + // TensorFlow session that can take advantage of these resources. + // The actual resources that are leveraged depend on the type of cluster + // instantiated. + // Returns OK iff all the requested resources could be reserved and a + // TensorFlow session successfully created. Returns an error otherwise. + // There is no graceful degradation to handle the case where only a subset + // of the requested resources are available. + virtual absl::Status Provision() = 0; + + // Attempts to shutdown the cluster. + // Returns OK iff there are no pending calls to the Run() method and all the + // resources used by the cluster could be released. Returns an error + // otherwise. + virtual absl::Status Shutdown() { return absl::OkStatus(); } + + // Whether soft placement is allowed. 
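The kernel-registration check in zen_graph_util.h above avoids repeated registry lookups by memoizing results in a leaked thread_local map. A standalone sketch of the same caching idiom, with hypothetical names and a stubbed-in expensive query (nothing below comes from the patch itself):

```cpp
#include <string>
#include <unordered_map>

// Stand-in for a costly query such as KernelsRegisteredForOp().
bool ExpensiveLookup(const std::string& key) { return !key.empty(); }

bool CachedLookup(const std::string& key) {
  // One cache per thread: no locking is needed and entries live for the
  // lifetime of the thread (the map is deliberately never freed).
  thread_local static auto* cache =
      new std::unordered_map<std::string, bool>();
  auto it = cache->find(key);
  if (it != cache->end()) return it->second;  // hit: reuse the stored answer
  const bool result = ExpensiveLookup(key);
  cache->emplace(key, result);
  return result;
}
```

Trading a small per-thread leak for lock-free lookups is the same design choice the header makes.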
If allow_soft_placement is true, + // an op will be placed on CPU if there's no GPU implementation for the OP + // or if no GPU devices are known or registered or if we need to co-locate + // with reftype input(s) which are from CPU. + void AllowSoftPlacement(bool soft_placement_state); + + // Update the number of inter-op threads for each per-session threadpool + void SetNumInterOpThreads(int num_threads); + + // Set the number of steps required to warmup TensorFlow. Must be called + // before Provision(). + void SetNumWarmupSteps(int num_steps); + + // Set executor type to instantiate + void SetExecutorType(const string* executor_type); + + // Returns the number of warmup steps. + int NumWarmupSteps() const; + + // Disable the collection of detailed statistics. Must be called + // before Provision(). + void DisableDetailedStats(bool disable); + + // Returns true iff the collection of detailed statistics is enabled. + bool DetailedStatsEnabled() const; + + // Disable the TensorFlow optimizer. This ensures that the graph that TF + // executes is similar to the input graph. Must be called before Provision(). + void DisableOptimizer(bool disable); + + // Return the list of TensorFlow devices that are available to execute a + // graph. This is empty until provision() is called. + const std::unordered_map& GetDevices() const { + return devices_; + } + + // Convenience method that returns the set of device names. These names are + // sorted alphabetically. + const std::vector GetDeviceNames() const; + + // The DeviceSet is not always available, but when it is it contains a + // superset of the devices listed in GetDevices/GetDeviceNames(). + virtual const DeviceSet* GetDeviceSet() const { return nullptr; } + + // Enables collecting the allocator stats. If called, must be called before + // Provision(). + virtual absl::Status EnablePeakMemoryStats() { + return absl::UnimplementedError(strings ::StrCat( + "Peak Memory Stats are not supported on ", type(), " clusters")); + } + + // Returns peak memory of all devices during the session creation and session + // runs. + virtual absl::Status GetPeakMemoryUsage( + std::unordered_map* device_peak_memory) const { + return absl::UnimplementedError( + "GetPeakMemoryUsage is not implemented for this type of cluster."); + } + + // Prepare the session to run the specified grappler item. This include + // initializing all the model variables. + virtual absl::Status Initialize(const GrapplerItem& item) = 0; + + // Run the specified graph_def and return the corresponding metadata. + virtual absl::Status Run(const GraphDef& graph_def, + const std::vector>& feed, + const std::vector& fetch, + RunMetadata* metadata) = 0; + + // Run the specified GrapplerItem and return the corresponding metadata. + virtual absl::Status Run(const GrapplerItem& item, RunMetadata* metadata) { + return Run(item.graph, item.feed, item.fetch, metadata); + } + + protected: + std::unordered_map devices_; + const int timeout_s_; + SessionOptions options_; + RunOptions run_options_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/single_machine.h b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/single_machine.h new file mode 100644 index 00000000..f3f36626 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/single_machine.h @@ -0,0 +1,104 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ +#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/cc/training/coordinator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace grappler { + +// Create a simple cluster that makes available to grappler a subset of the +// nodes available on a single local computer. +class SingleMachine : public Cluster { + public: + SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus); + ~SingleMachine() override; + + string type() const override { return "single_machine"; } + + absl::Status Provision() override; + absl::Status Shutdown() override; + + absl::Status Initialize(const GrapplerItem& item) override; + absl::Status Run(const GraphDef& item, + const std::vector>& feed, + const std::vector& fetch, + RunMetadata* metadata) override; + + const DeviceSet* GetDeviceSet() const override { return device_set_.get(); } + + absl::Status EnablePeakMemoryStats() override; + + // It requires EnableAllocatorStats(true) be called before Provision(). 
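As a usage sketch, the Cluster interface above can be exercised end to end through SingleMachine. The timeout and core counts below are invented, error handling is reduced to early returns, and the helper name is hypothetical; this is illustrative rather than part of the patch:

```cpp
#include "tensorflow/core/grappler/clusters/single_machine.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/protobuf/config.pb.h"

namespace tensorflow {
namespace grappler {

absl::Status RunOnLocalMachine(const GrapplerItem& item,
                               RunMetadata* metadata) {
  // 60s per-call timeout, 4 CPU cores, no GPUs; all three values are made up.
  SingleMachine cluster(/*timeout_s=*/60, /*num_cpu_cores=*/4, /*num_gpus=*/0);
  absl::Status status = cluster.Provision();
  if (!status.ok()) return status;
  status = cluster.Initialize(item);
  if (!status.ok()) return status;
  status = cluster.Run(item.graph, item.feed, item.fetch, metadata);
  // Always try to release the resources, but report the first failure.
  const absl::Status shutdown_status = cluster.Shutdown();
  return status.ok() ? shutdown_status : status;
}

}  // namespace grappler
}  // namespace tensorflow
```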
+ absl::Status GetPeakMemoryUsage( + std::unordered_map* device_peak_memory) const override; + + private: + absl::Status RunWithTimeout( + const std::vector>& feed, + const std::vector& fetch, RunMetadata* run_metadata); + absl::Status RunWithTimeout( + const std::vector>& feed, + const std::vector& fetch, RunMetadata* run_metadata, + int64_t timeout_s); + absl::Status ResetSession(); + absl::Status CloseSession(bool use_timeout); + absl::Status ShutdownSession(); + void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs, + const CostGraphDef& queue_costs); + + absl::Status ClearAllocatorStats() const; + + std::unique_ptr session_; + std::vector queue_runner_defs_; + string last_graph_id_; + mutex last_graph_mu_; + const GraphDef* last_graph_ TF_GUARDED_BY(last_graph_mu_) = nullptr; + std::vector init_ops_; + int64_t expected_init_time_s_; + std::unique_ptr coordinator_; + std::unique_ptr thread_pool_; + std::unique_ptr device_set_; + + RunMetadata init_metadata_; + + mutex close_mu_; + bool closing_ TF_GUARDED_BY(close_mu_); + + bool cpu_allocator_stats_enabled_ = false; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/utils.h new file mode 100644 index 00000000..8a597854 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/utils.h @@ -0,0 +1,39 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_ + +#include "tensorflow/core/common_runtime/gpu/gpu_id.h" +#include "tensorflow/core/protobuf/device_properties.pb.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace grappler { + +// Returns the DeviceProperties of the CPU on which grappler is running. +DeviceProperties GetLocalCPUInfo(); + +// Returns the DeviceProperties for the specified GPU attached to the server on +// which grappler is running. +DeviceProperties GetLocalGPUInfo(PlatformDeviceId platform_device_id); + +// Returns the DeviceProperties of the specified device +DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/virtual_cluster.h b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/virtual_cluster.h new file mode 100644 index 00000000..1204a34c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/clusters/virtual_cluster.h @@ -0,0 +1,69 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_ +#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/analytical_cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h" +#include "tensorflow/core/grappler/costs/virtual_scheduler.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/device_properties.pb.h" + +namespace tensorflow { +namespace grappler { + +// Create a simple cluster that lists the devices (and their properties) +// available in a TensorFlow session. This cluster simulates the execution of +// actual graphs. +class VirtualCluster : public Cluster { + public: + explicit VirtualCluster( + const std::unordered_map& devices); + VirtualCluster(const std::unordered_map& devices, + std::unique_ptr node_estimator, + std::unique_ptr node_manager); + explicit VirtualCluster(const DeviceSet* device_set); + + ~VirtualCluster() override; + + string type() const override { return "virtual"; } + + absl::Status Provision() override; + absl::Status Initialize(const GrapplerItem& item) override; + absl::Status Run(const GraphDef& graph, + const std::vector>& feed, + const std::vector& fetch, + RunMetadata* metadata) override; + absl::Status Run(const GrapplerItem& item, RunMetadata* metadata) override; + const DeviceSet* GetDeviceSet() const override { return device_set_; } + + private: + std::unique_ptr estimator_; + const DeviceSet* device_set_ = nullptr; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/analytical_cost_estimator.h new file mode 100644 index 00000000..b31ce39e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/analytical_cost_estimator.h @@ -0,0 +1,83 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_ + +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h" +#include "tensorflow/core/grappler/costs/virtual_scheduler.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +class CostGraphDef; +class GraphDef; +} // namespace tensorflow + +namespace tensorflow { +namespace grappler { + +class Cluster; +struct GrapplerItem; + +// Estimate the cost of running a Grappler item based on the theoretical +// performance of the hardware that will run the model. Note that this +// internally uses static shape inference. An option for aggressive shape +// inference is provided to minimize unknown shapes, and this is only applicable +// with static shape inference. +class AnalyticalCostEstimator : public CostEstimator { + public: + AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes, + bool use_aggressive_shape_inference); + AnalyticalCostEstimator(Cluster* cluster, + std::unique_ptr node_estimator, + std::unique_ptr node_manager, + bool use_static_shapes, + bool use_aggressive_shape_inference); + AnalyticalCostEstimator(Cluster* cluster, + std::unique_ptr node_estimator, + std::unique_ptr node_manager, + std::unique_ptr placer, + bool use_static_shapes, + bool use_aggressive_shape_inference); + ~AnalyticalCostEstimator() override {} + + // This implementation always returns OK. + absl::Status Initialize(const GrapplerItem& item) override; + + // Predict the performance of each node of the optimized graph and annotate + // the RunMetadata with the corresponding estimates. Also returns the + // expected cost for the whole graph. + absl::Status PredictCosts(const GraphDef& optimized_graph, + RunMetadata* run_metadata, + Costs* cost) const override; + + const VirtualScheduler* GetScheduler() const { return scheduler_.get(); } + + private: + const GrapplerItem* item_; + std::unique_ptr node_estimator_; + std::unique_ptr node_manager_; + std::unique_ptr scheduler_; + + bool use_static_shapes_; + bool use_aggressive_shape_inference_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/cost_estimator.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/cost_estimator.h new file mode 100644 index 00000000..b133b369 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/cost_estimator.h @@ -0,0 +1,259 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { +class GraphDef; +class CostGraphDef; + +namespace grappler { +struct GrapplerItem; + +constexpr uint64_t kMemoryUnknown = std::numeric_limits::max(); +constexpr uint64_t kZeroMemory = 0ULL; + +struct DeviceInfo { + // Billions of operations executed per second. + double gigaops; + + // Bandwidth to main memory in GB per second. + double gb_per_sec; + + // Read bandwidth to intermediate memory in GB per second. + double intermediate_read_gb_per_sec; + + // Write bandwidth to intermediate memory in GB per second. + double intermediate_write_gb_per_sec; + + DeviceInfo() + : gigaops(INFINITY), + gb_per_sec(INFINITY), + intermediate_read_gb_per_sec(INFINITY), + intermediate_write_gb_per_sec(INFINITY) {} + + DeviceInfo(const DeviceInfo& input) + : gigaops(input.gigaops), + gb_per_sec(input.gb_per_sec), + intermediate_read_gb_per_sec(input.intermediate_read_gb_per_sec), + intermediate_write_gb_per_sec(input.intermediate_write_gb_per_sec) {} + + DeviceInfo(double gigaops, double gb_per_sec, + double intermediate_read_gb_per_sec = INFINITY, + double intermediate_write_gb_per_sec = INFINITY) + : gigaops(gigaops), + gb_per_sec(gb_per_sec), + intermediate_read_gb_per_sec(intermediate_read_gb_per_sec), + intermediate_write_gb_per_sec(intermediate_write_gb_per_sec) {} +}; + +// Holds the set of things we might want to estimate or measure in Grappler. +// Always produce execution time. Other fields are optional depending on the +// estimator being used. +struct Costs { + // Returns a Costs structure with default values for all of the fields. + inline Costs(); + + // Builds a Costs structure with all zero values, rather than unknowns. 
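DeviceInfo reduces a device to a handful of throughput numbers, and an estimate falls out of simple roofline arithmetic: execution time is bounded by both compute and memory traffic. The standalone sketch below only mirrors the two main fields named above; it is not the estimator's actual formula:

```cpp
#include <algorithm>
#include <cstdio>

struct SimpleDeviceInfo {
  double gigaops;     // 10^9 operations per second
  double gb_per_sec;  // 10^9 bytes per second to main memory
};

// Assumes compute and memory transfers overlap perfectly, so the slower of
// the two dominates.
double EstimateSeconds(const SimpleDeviceInfo& dev, double ops, double bytes) {
  const double compute_s = ops / (dev.gigaops * 1e9);
  const double memory_s = bytes / (dev.gb_per_sec * 1e9);
  return std::max(compute_s, memory_s);
}

int main() {
  const SimpleDeviceInfo cpu{/*gigaops=*/100.0, /*gb_per_sec=*/50.0};
  // 2e9 floating-point ops touching 8e8 bytes: compute-bound on this device.
  std::printf("%.4f s\n", EstimateSeconds(cpu, 2e9, 8e8));
  return 0;
}
```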
+ static inline Costs ZeroCosts(bool inaccurate = false); + + struct MilliSeconds : std::chrono::milliseconds { + MilliSeconds() : std::chrono::milliseconds(0) {} + MilliSeconds(double d) + : std::chrono::milliseconds(static_cast(d)) {} + MilliSeconds(const std::chrono::milliseconds& d) + : std::chrono::milliseconds(d) {} + MilliSeconds& operator=(const std::chrono::milliseconds& d) { + std::chrono::milliseconds::operator=(d); + return *this; + } + }; + struct MicroSeconds : std::chrono::microseconds { + MicroSeconds() : std::chrono::microseconds(0) {} + MicroSeconds(double d) + : std::chrono::microseconds(static_cast(d)) {} + MicroSeconds(const std::chrono::microseconds& d) + : std::chrono::microseconds(d) {} + MicroSeconds& operator=(const std::chrono::microseconds& d) { + std::chrono::microseconds::operator=(d); + return *this; + } + MilliSeconds asMilliSeconds() const { + return std::chrono::duration_cast(*this); + } + }; + struct NanoSeconds : std::chrono::nanoseconds { + NanoSeconds() : std::chrono::nanoseconds(0) {} + NanoSeconds(double d) : std::chrono::nanoseconds(static_cast(d)) {} + NanoSeconds(const std::chrono::nanoseconds& d) + : std::chrono::nanoseconds(d) {} + NanoSeconds& operator=(const std::chrono::nanoseconds& d) { + std::chrono::nanoseconds::operator=(d); + return *this; + } + MicroSeconds asMicroSeconds() const { + return std::chrono::duration_cast(*this); + } + MilliSeconds asMilliSeconds() const { + return std::chrono::duration_cast(*this); + } + static NanoSeconds infinity() { + return NanoSeconds(std::chrono::nanoseconds::max()); + } + }; + + // We store all our times in nanoseconds. If needs be, we can always switch to + // picoseconds in the future by updating this typedef. + typedef NanoSeconds Duration; + + // Overall cost of running the graph; latency. + Duration execution_time; + + // Computation cost of running the graph. + Duration compute_time; + + // Memory access cost of running the graph. + Duration memory_time; + + // Intermediate memory access cost of running the graph + Duration intermediate_memory_time; + Duration intermediate_memory_read_time; // Intermediate memory read cost. + Duration intermediate_memory_write_time; // Intermediate memory write cost. + + // Network time (colelctived ops - all gather, all reduce, etc.) + Duration network_time; + + // This field can be a very pessimistic estimate of the main memory + // requirements of a graph. For example, it might assume that all activations + // are live for all of a graph's execution. + uint64_t max_memory; // Max main memory requirement in bytes over all ops. + uint64_t persistent_memory; + uint64_t temporary_memory; + + // Output memory usage per port. + absl::flat_hash_map output_tensor_size_bytes; + + // Track persistent versus temporary memory. + absl::flat_hash_set persistent_output_ports; + + // These fields are used for TPU-related estimations. They are per-op + // maximums, so each op is evaluated independently, but we want the maximum of + // the value over all ops. + int64_t max_per_op_buffers; // Sum of all buffers used by the ops. + int64_t max_per_op_streaming; // Ignore largest input buffer, assuming it + // streams from main memory. + + // Number of ops included in this Costs in total. + // Default initialized to be one. + int64_t num_ops_total = 1; + // If the time estimation is inaccurate. + bool inaccurate = false; + // Number of ops that are estimated with unknown shapes. 
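The MilliSeconds/MicroSeconds/NanoSeconds wrappers above are thin layers over std::chrono, with streaming operators defined a little further down in this header. A short sketch of the unit conversions, assuming the cost_estimator.h from this patch is available to compile against:

```cpp
#include <iostream>

#include "tensorflow/core/grappler/costs/cost_estimator.h"

int main() {
  using tensorflow::grappler::Costs;
  Costs::NanoSeconds t(1500000.0);              // 1.5 ms stored as nanoseconds
  Costs::MicroSeconds us = t.asMicroSeconds();  // 1500us
  Costs::MilliSeconds ms = t.asMilliSeconds();  // 1ms (the cast truncates)
  std::cout << t << " = " << us << " = " << ms << std::endl;
  return 0;
}
```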
+ int64_t num_ops_with_unknown_shapes = 0; + // TODO(pcma): include a counter for total inaccurate ops and counters for + // other reasons causing the inaccuracy + + // Max possible memory usage per device. + std::unordered_map estimated_max_memory_per_device; +}; + +inline std::ostream& operator<<(std::ostream& os, const Costs::MilliSeconds d) { + os << d.count() << "ms"; + return os; +} +inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) { + os << d.count() << "us"; + return os; +} +inline std::ostream& operator<<(std::ostream& os, const Costs::NanoSeconds d) { + os << d.count() << "ns"; + return os; +} + +Costs::Costs() { + execution_time = Duration::zero(); + compute_time = Duration::zero(); + memory_time = Duration::zero(); + intermediate_memory_time = Duration::zero(); + network_time = Duration::zero(); + max_memory = kMemoryUnknown; + persistent_memory = kMemoryUnknown; + temporary_memory = kMemoryUnknown; + max_per_op_buffers = kMemoryUnknown; + max_per_op_streaming = kMemoryUnknown; +} + +Costs Costs::ZeroCosts(bool inaccurate) { + Costs costs; + costs.execution_time = Duration::zero(); + costs.compute_time = Duration::zero(); + costs.memory_time = Duration::zero(); + costs.intermediate_memory_time = Duration::zero(); + costs.network_time = Duration::zero(); + costs.max_memory = kZeroMemory; + costs.persistent_memory = kZeroMemory; + costs.temporary_memory = kZeroMemory; + costs.max_per_op_buffers = kZeroMemory; + costs.max_per_op_streaming = kZeroMemory; + costs.inaccurate = inaccurate; + return costs; +} + +Costs CombineCosts(const Costs& left, const Costs& right); + +// Multiplies Costs by a scalar. +// Equivalent to applying CombineCosts "multiplier" times. +Costs MultiplyCosts(const Costs& costs, int multiplier); + +// Given a GrapperItem and an optimized implementation of the corresponding +// TensorFlow graph, the CostEstimator attempts to predicts the actual cost of +// running the graph. +class CostEstimator { + public: + virtual ~CostEstimator() {} + + // Initializes the estimator for the specified grappler item. + // The estimator shouldn't be used if this function returns any status other + // that OK. + virtual absl::Status Initialize(const GrapplerItem& item) = 0; + + // Predicts the cost of running the given optimized version of the grappler + // item. + // If a RunMetadata is passed, it will be populated with detailed information + // about the cost of running each operation of the optimized graph. + // if a double value is passed, it will be set to a value that reflects the + // overall cost of running the graph (e.g. the latency of the computation). + // Returns a status that indicate is the performance could be estimated or + // not. + virtual absl::Status PredictCosts(const GraphDef& optimized_graph, + RunMetadata* run_metadata, + Costs* cost) const = 0; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_memory.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_memory.h new file mode 100644 index 00000000..fcd9eaeb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_memory.h @@ -0,0 +1,81 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { +namespace grappler { + +// Infer the worst case memory usage for a given grappler item. +class GraphMemory { + public: + struct LiveTensor { + string node; + int output_id; + size_t memory_used; + Costs::Duration allocation_time; + Costs::Duration deallocation_time; + }; + struct MemoryUsage { + int64_t used_memory; + std::vector live_tensors; + }; + + explicit GraphMemory(const GrapplerItem& item) + : item_(item), unknown_usage_({-1, {}}) {} + + absl::Status InferStatically( + const std::unordered_map& devices); + absl::Status InferDynamically(Cluster* cluster); + + // Worst case memory usage in bytes, or -1 if the usage is unknown. If there + // are multiple devices, returns the highest per device memory usage. + int64_t GetWorstCaseMemoryUsage() const; + + // Returns the peak memory usage for the specified device. + const MemoryUsage& GetPeakMemoryUsage(const string& device) const { + auto it = peak_usage_.find(device); + if (it == peak_usage_.end()) { + return unknown_usage_; + } + return it->second; + } + + private: + void InferMemUsageForNodes(const std::vector& nodes, + GraphProperties* properties, int64_t* worst_case, + int64_t* best_case) const; + int64_t InferMemUsageForNeighbors( + const std::vector& props) const; + + void InferFromTrace(const StepStats& timeline); + + const GrapplerItem& item_; + std::unordered_map worst_case_memory_usage_; + std::unordered_map peak_usage_; + const MemoryUsage unknown_usage_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_properties.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_properties.h new file mode 100644 index 00000000..1d9575e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/graph_properties.h @@ -0,0 +1,226 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { + +namespace grappler { + +// Optional attributes that tell about node output information. +// We use these side information, if provided, for static shape inference +// and VirtualScheduler scheduling. + +// Switch op attribute as a vector of int that tells which branch the +// Switch output is taken on every round of execution. +// Used for scheduling ops after Switch correctly (e.g., While loop). +ABSL_CONST_INIT const char kOutputSlots[] = "_output_slot_vector"; + +// Example: +// Assume a node has two outputs and iterated for three times. Then it has: +// _execution_count = 3 +// _output_sizes_vector = [2, 2, 2] +// _output_dtype_vector.size = 6 +// _output_shape_vector.size = 6 + +// If all the iterations have same output shapes, then +// _execution_count = 3 +// _same_output_for_iterations = true +// _output_sizes_vector = [2] +// _output_dtype_vector.size = 2 +// _output_shape_vector.size = 2 + +// How many times this node has been executed. +ABSL_CONST_INIT const char kExecutionCount[] = "_execution_count"; + +// Records the output sizes for each round of execution. +ABSL_CONST_INIT const char kOutputSizes[] = "_output_sizes_vector"; + +// The node has been scheduled multiple times with outputs that have the same +// shape. +ABSL_CONST_INIT const char kOutputSame[] = "_same_output_for_iterations"; + +// Outputs DataType vector. +ABSL_CONST_INIT const char kOutputTypes[] = "_output_dtype_vector"; + +// Outputs TensorShapeProto vector. +ABSL_CONST_INIT const char kOutputShapes[] = "_output_shape_vector"; + +class SymbolicShapeRefiner; +class TopoQueue; + +// Infer OpInfo::TensorProperties for graph nodes inputs/outputs. +// +// Typical use case, is to infer tensor properties from a graph, before doing +// optimization pass. Nodes modified during optimization pass have to be +// invalidated, to prevent further incorrect optimizations based on wrong shape +// and data type properties. +class GraphProperties { + public: + // The item must outlive the properties + explicit GraphProperties(const GrapplerItem& item) : item_(item) {} + + // Infer the shapes through abstract interpretation. Feed information can be + // incorrect so it should be discarded to ensure correctness of the analysis. + // However, it can help infer shapes in the fanout of fed nodes (even though + // the correctness of these shapes can't be guaranteed), so in some cases + // (such as simulation or scheduling) it makes sense of keep these shapes. + // aggressive_shape_inference option executes nodes on the host to identify + // output values when possible and does other aggressive strategies. + // Similar to assuming_valid_feeds, this may cause incorrectness in graph + // analyses, but is useful for simulation or scheduling. + // If include_input_tensor_values is true, the values of constant tensors + // will included in the input properties. + // If include_output_tensor_values is true, the values of constant tensors + // will be included in the output properties. 
+ absl::Status InferStatically(bool assume_valid_feeds, + bool aggressive_shape_inference, + bool include_input_tensor_values, + bool include_output_tensor_values); + absl::Status InferStatically(bool assume_valid_feeds, + bool aggressive_shape_inference, + bool include_tensor_values) { + return InferStatically( + assume_valid_feeds, + /*aggressive_shape_inference=*/aggressive_shape_inference, + /*include_input_tensor_values=*/include_tensor_values, + /*include_output_tensor_values=*/include_tensor_values); + } + absl::Status InferStatically(bool assume_valid_feeds) { + return InferStatically(assume_valid_feeds, + /*aggressive_shape_inference=*/false, + /*include_tensor_values=*/true); + } + // Infer the shape by running the graph on the specified cluster and recording + // the shapes of the processed tensors. + absl::Status InferDynamically(Cluster* cluster); + // Extract the properties from a cost graph. For testing only since there is + // no way to ensure that the cost graph match the item. + absl::Status InferFromCostGraph(const CostGraphDef& cost_graph); + + // Stores `item_.graph` with the inferred output shapes to `output_graph_def`. + absl::Status AnnotateOutputShapes(GraphDef* output_graph_def) const; + + // Return the properties of node inputs/outputs, including data types and + // shapes. Note that the dimensions in the shapes can be negative. We use the + // -1 value to denote that we don't know anything about a dimension. We use + // values strictly less than -1 to encode symbolic dimensions: although we + // don't know the actual value of the symbolic dimension, we know that all the + // dimensions denoted by the same negative value are the equal. + bool HasInputProperties(const string& node_name) const; + bool HasOutputProperties(const string& node_name) const; + const std::vector& GetInputProperties( + const string& node_name) const; + const std::vector& GetOutputProperties( + const string& node_name) const; + + // Invalidate input/output properties for nodes modified during graph + // optimization pass, to prevent potential optimizations, based on incorrect + // shape information. + void ClearInputProperties(const string& node_name); + void ClearOutputProperties(const string& node_name); + // Returns true if we have *any* properties. + bool has_properties() const { + return !input_properties_.empty() || !output_properties_.empty(); + } + + bool CheckShapeIncompatible(const string& node_name) const { + return incompatible_shape_nodes_.find(node_name) != + incompatible_shape_nodes_.end(); + } + + // Clear all infered properties. + void Clear() { + input_properties_.clear(); + output_properties_.clear(); + } + + private: + // Relaxes shapes , determined from an EnqueueV2 node, into + // <*queue_shapes_and_types>. + static absl::Status RelaxEnqueueShapesAndMergeTypes( + SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode, + const std::vector& shapes_and_types, + std::vector* queue_shapes_and_types); + + // Update the shapes of the enqueue node, port them over to the corresponding + // queue, and schedule the reprocessing of the queue if needed. + static absl::Status UpdateEnqueue( + const NodeDef* enqueue_node, + const absl::flat_hash_map& + resource_handles, + SymbolicShapeRefiner* shape_refiner, bool* new_shapes); + + // Update the shapes and types of the Queue node, if not set by Enqueue node. 
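A sketch of the typical static-inference flow described above: build GraphProperties over a GrapplerItem, run InferStatically, then read the per-node output properties. It assumes the grappler headers from this patch; the rank reporting and function name are purely illustrative:

```cpp
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace grappler {

absl::Status ReportOutputRanks(const GrapplerItem& item) {
  GraphProperties properties(item);  // the item must outlive `properties`
  absl::Status status =
      properties.InferStatically(/*assume_valid_feeds=*/false);
  if (!status.ok()) return status;
  for (const NodeDef& node : item.graph.node()) {
    if (!properties.HasOutputProperties(node.name())) continue;
    const auto& outputs = properties.GetOutputProperties(node.name());
    // dim_size() is the number of dimensions recorded for the shape.
    LOG(INFO) << node.name() << ": " << outputs.size() << " output(s), "
              << "rank of output 0 = "
              << (outputs.empty() ? -1 : outputs[0].shape().dim_size());
  }
  return absl::OkStatus();
}

}  // namespace grappler
}  // namespace tensorflow
```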
+ static absl::Status UpdateQueue(const NodeDef* queue_node, + SymbolicShapeRefiner* shape_refiner, + bool* new_shapes); + + // Update the output shapes of a Merge node, and enqueue its fanout in + // new_shapes if needed. + absl::Status UpdateMerge(SymbolicShapeRefiner* shape_refiner, + const NodeDef* node, bool* new_shapes) const; + // Process the Enter node, and enqueue its fanout in new_shapes if needed. + static absl::Status UpdateEnter(SymbolicShapeRefiner* shape_refiner, + const NodeDef* node, bool* new_shapes); + // Update the shapes for node 'n'. If output shapes for n have changed, + // enqueue its fanout in 'new_shapes'. + absl::Status UpdateShapes( + SymbolicShapeRefiner* shape_refiner, + const absl::flat_hash_map& + resource_handles, + const NodeDef* n, bool* new_shapes) const; + // Propagate the shapes for the nodes enqueued in new_shapes and their + // transitive fanout until a fixed point is reached. + absl::Status PropagateShapes( + SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes, + const absl::flat_hash_map& + resource_handles, + int num_loops) const; + + // Data members + const GrapplerItem& item_; + absl::flat_hash_map> + input_properties_; + absl::flat_hash_map> + output_properties_; + const std::vector missing_properties_; + + // Nodes with output shape incompatible between shape inference and + // annotation. + std::unordered_set incompatible_shape_nodes_; +}; + +// Helper function for GraphProperties. +bool IsShapeFullyDefinedIntegerVectorOrScalar( + shape_inference::InferenceContext* ic, + const shape_inference::ShapeHandle& shape, + const shape_inference::ShapeHandle& tensor_as_shape, const DataType& dtype); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/measuring_cost_estimator.h new file mode 100644 index 00000000..5da9bac9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/measuring_cost_estimator.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +class CostGraphDef; +class GraphDef; +} // namespace tensorflow + +namespace tensorflow { +namespace grappler { + +class Cluster; +struct GrapplerItem; + +// Estimate the cost of running a Grappler item by actually running the +// corresponding TensorFlow graph on the specified cluster and measuring the +// runtimes. +class MeasuringCostEstimator : public CostEstimator { + public: + // Run the model for measurement_steps to measure its average cost. + // When measurement_threads is greater than 0, use a threadpool of as many + // threads to run the measurements; otherwise, run them serially. Does not + // take ownership of cluster. + explicit MeasuringCostEstimator(Cluster* cluster, int measurement_steps, + int measurement_threads); + ~MeasuringCostEstimator() override {} + + // Initializes the estimator for the specified grappler item. + // This implementation always returns OK. + absl::Status Initialize(const GrapplerItem& item) override; + + // Runs the optimized version of the graph on the cluster, measures + // the runtimes of each operation, and annotates the CostGraphDef of + // RunMetadata with the corresponding measurements. + // Returns the average latency for the whole graph. + absl::Status PredictCosts(const GraphDef& optimized_graph, + RunMetadata* run_metadata, + Costs* cost) const override; + + private: + Cluster* cluster_; // Not owned. + int measurement_steps_; + int measurement_threads_; + std::vector> feed_; + std::vector fetch_; + std::unique_ptr thread_pool_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_context.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_context.h new file mode 100644 index 00000000..90063333 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_context.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" + +namespace tensorflow { +namespace grappler { + +// A structure to keep the context of op execution, including its shape, +// execution context, and other relevant information. +struct OpContext { + std::string name; + std::string device_name; + OpInfo op_info; + const FunctionDefLibrary* function_library; // Not owned. + // This map is used to stash meta attributes so that they may be + // communicated, for instance, from the scheduler that creates them to the + // CostEstimator or EventCostManager that uses them. + absl::flat_hash_map> + op_meta_attributes; + OpContext() { function_library = nullptr; } +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_level_cost_estimator.h new file mode 100644 index 00000000..cd160d6d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/op_level_cost_estimator.h @@ -0,0 +1,346 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_context.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { +namespace grappler { + +bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto, + TensorShapeProto* tensor_shape_proto); +std::vector MaybeGetMinimumShape( + const TensorShapeProto& original_shape, int rank, + bool* found_unknown_shapes); + +// Node costs; an intermediate structure used within op level cost estimator. +struct NodeCosts { + // If this FLAG is true, override calculated compute time with a minimum + // value, instead of calculating it from num_compute_ops and compute ops/sec. + // For example, PredictIdentity, PredictVariable, PredictMetadata set this + // FLAG. + bool minimum_cost_op = false; + + // Compute ops. + int64_t num_compute_ops = 0; + + // Memory bytes accessed; note that these may be different to the size of + // tensors. + std::vector num_input_bytes_accessed; // ordered by input tensors. 
+ std::vector num_output_bytes_accessed; // ordered by output ports. + int64_t internal_read_bytes = 0; + int64_t internal_write_bytes = 0; + + // Convenience functions. + int64_t num_total_input_bytes() const { + return std::accumulate(num_input_bytes_accessed.begin(), + num_input_bytes_accessed.end(), 0LL); + } + int64_t num_total_read_bytes() const { + return num_total_input_bytes() + internal_read_bytes; + } + int64_t num_total_output_bytes() const { + return std::accumulate(num_output_bytes_accessed.begin(), + num_output_bytes_accessed.end(), 0LL); + } + int64_t num_total_write_bytes() const { + return num_total_output_bytes() + internal_write_bytes; + } + int64_t num_bytes_accessed() const { + return num_total_read_bytes() + num_total_write_bytes(); + } + + // Memory usage. + int64_t max_memory = 0; + int64_t persistent_memory = 0; + int64_t temporary_memory = 0; + + // Stats. + int64_t num_nodes = 1; + int64_t num_nodes_with_unknown_shapes = 0; + int64_t num_nodes_with_unknown_op_type = 0; + int64_t num_nodes_with_pure_memory_op = 0; + bool inaccurate = false; + + // TODO(dyoon): this is added for compatibility; some old code is hard to + // migrate; hence, using these as a backup. Once we clean up, we'll delete + // these fields. New code should not use these. + bool has_costs = false; + Costs costs; +}; + +class OpLevelCostEstimator { + public: + OpLevelCostEstimator(); + virtual ~OpLevelCostEstimator() {} + + virtual Costs PredictCosts(const OpContext& op_context) const; + + // Returns basic device performance info. + virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const; + + protected: + // TODO(dyoon): Consider to remove PredictOpCountBasedCosts() with OpInfo. + // Naive cost estimate based on the given operations count and total + // input/output tensor sizes of the given op_info combined. + Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const; + + // Naive cost estimate based on the given operations count and the given total + // io size in bytes. Sizes of op_info inputs and outputs are not taken into + // consideration. + Costs PredictOpCountBasedCost(double operations, double input_io_bytes, + double output_io_bytes, + const OpInfo& op_info) const; + + // Top-level method cost function (PredictCosts calls this method to get + // NodeCosts, and then converts it to Costs). PredictNodeCosts() calls other + // Predict methods depending on op types. + absl::Status PredictNodeCosts(const OpContext& op_context, + NodeCosts* node_costs) const; + + // Predict cost of an op for which no accurate estimator is defined. + absl::Status PredictCostOfAnUnknownOp(const OpContext& op_context, + NodeCosts* node_costs) const; + + // This family of routines predicts the costs to + // perform the specified TensorFlow Op on the + // device represented by a subclass. The default + // implementation just divides the operations to + // perform the op (from the "Count" routines, + // above) by the device peak operations per + // second. + // Implementation of costs other than + // execution_time is optional, depending on the + // device. 
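Tying the pieces together, the estimator can be queried one node at a time through PredictCosts. The sketch below builds an OpContext for a single MatMul by hand; the 1024x1024 shapes and the CPU device are invented, and the headers are assumed to come from this patch:

```cpp
#include <iostream>

#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/grappler/costs/op_context.h"
#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"

namespace tensorflow {
namespace grappler {

void PredictMatMulCost() {
  OpContext op_context;
  op_context.op_info.set_op("MatMul");
  // Two 1024x1024 float inputs.
  for (int i = 0; i < 2; ++i) {
    auto* input = op_context.op_info.add_inputs();
    input->set_dtype(DT_FLOAT);
    input->mutable_shape()->add_dim()->set_size(1024);
    input->mutable_shape()->add_dim()->set_size(1024);
  }
  op_context.op_info.mutable_device()->set_type("CPU");

  OpLevelCostEstimator estimator;
  const Costs costs = estimator.PredictCosts(op_context);
  std::cout << "execution: " << costs.execution_time
            << ", compute: " << costs.compute_time
            << ", memory: " << costs.memory_time << std::endl;
}

}  // namespace grappler
}  // namespace tensorflow
```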
+ absl::Status PredictNaryOp(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictConv2D(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictCwiseOp(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictConv2DBackpropInput(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictConv2DBackpropFilter(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictFusedConv2DBiasActivation(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictMatMul(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictSparseTensorDenseMatMul(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictNoOp(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictIdentity(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictVariable(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictBatchMatMul(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictMetadata(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictGatherOrSlice(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictScatter(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictMaxPool(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictMaxPoolGrad(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictAvgPool(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictAvgPoolGrad(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictFusedBatchNorm(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictFusedBatchNormGrad(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictEinsum(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictAssignVariableOps(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictPureMemoryOp(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictSoftmax(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictResizeBilinear(const OpContext& op_context, + NodeCosts* node_costs) const; + absl::Status PredictCropAndResize(const OpContext& op_context, + NodeCosts* node_costs) const; + + int64_t GetSoftmaxComputeOps(const OpContext& op_context) const; + + // Generic cost prediction method for fused operations. + absl::Status PredictFusedOp(const OpContext& op_context, + const std::vector& fused_op_contexts, + NodeCosts* node_costs) const; + + // Utility function for safe division. Returns 0 + // if rhs is 0 or negative. + static double SafeDiv(const double lhs, const double rhs) { + if (rhs > 0) { + return lhs / rhs; + } else { + return 0.0; + } + } + + // This family of routines counts the number of operations to perform the + // specified TensorFlow Op. + struct MatMulDimensions { + int m; + int n; + int k; + }; + struct BatchMatMulDimensions { + std::vector batch_dims; + MatMulDimensions matmul_dims; + }; + struct ConvolutionDimensions { + int64_t batch; // Batch size. + int64_t ix; // Input size x. + int64_t iy; // Input size y. + int64_t iz; // Input depth. + int64_t kx; // Kernel x. + int64_t ky; // Kernel y. 
+ int64_t kz; // Kernel depth (in case of group convolution, this will be + // smaller than input depth). + int64_t oz; // Output depth. + int64_t ox; // Output size x. + int64_t oy; // Output size y. + int64_t sx; // Stride x. + int64_t sy; // Stride y. + Padding padding; // SAME or VALID. + }; + static int64_t CountConv2DOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64_t CountConv2DOperations(const OpInfo& op_info, + ConvolutionDimensions* conv_info, + bool* found_unknown_shapes); + static int64_t CountMatMulOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64_t CountMatMulOperations(const OpInfo& op_info, + MatMulDimensions* mat_mul, + bool* found_unknown_shapes); + static int64_t CountMatMulOperations(const OpInfo& op_info, bool transpose_a, + bool transpose_b, + MatMulDimensions* mat_mul, + bool* found_unknown_shapes); + bool GenerateBatchMatmulContextFromEinsum(const OpContext& einsum_context, + OpContext* batch_matmul_context, + bool* found_unknown_shapes) const; + static int64_t CountBatchMatMulOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64_t CountBatchMatMulOperations( + const OpInfo& op_info, BatchMatMulDimensions* batch_mat_mul, + bool* found_unknown_shapes); + static int64_t CountConv2DBackpropInputOperations( + const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, + bool* found_unknown_shapes); + static int64_t CountConv2DBackpropFilterOperations( + const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, + bool* found_unknown_shapes); + + // Calculate the element count of an input/output tensor. + static int64_t CalculateTensorElementCount( + const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes); + + // Calculate the total size in bytes of an input/output tensor. + static int64_t CalculateTensorSize(const OpInfo::TensorProperties& tensor, + bool* found_unknown_shapes); + + // Calculate the element count of the largest + // input of specified TensorFlow op. + static int64_t CalculateLargestInputCount(const OpInfo& op_info, + bool* found_unknown_shapes); + + // Calculate the total size in bytes of the all + // the inputs of specified TensorFlow op. + static int64_t CalculateInputSize(const OpInfo& op_info, + bool* found_unknown_shapes); + + // Same, but a vector format: one for each input. + static std::vector CalculateInputTensorSize( + const OpInfo& op_info, bool* found_unknown_shapes); + + // Calculate the total size in bytes of the all + // the outputs of specified TensorFlow op. + static int64_t CalculateOutputSize(const OpInfo& op_info, + bool* found_unknown_shapes); + + // Same, but a vector format: one for each output. + static std::vector CalculateOutputTensorSize( + const OpInfo& op_info, bool* found_unknown_shapes); + + // For convolution and its grad ops. + static ConvolutionDimensions ConvolutionDimensionsFromInputs( + const TensorShapeProto& original_image_shape, + const TensorShapeProto& original_filter_shape, const OpInfo& op_info, + bool* found_unknown_shapes); + + // For Pooling, FusedBatchNorm, and their grad ops. + static absl::StatusOr OpDimensionsFromInputs( + const TensorShapeProto& original_image_shape, const OpInfo& op_info, + bool* found_unknown_shapes); + + // Helper to construct child operation contexts for the component operations + // of fused ops. 
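As a rough illustration of what the Count*Operations and CalculateTensorSize helpers above compute, a standalone sketch follows. The 2*m*k*n convention (one multiply and one add per multiply-accumulate) and the treat-unknown-dimensions-as-one rule are stated assumptions; the vendored implementation may count differently.

// Standalone sketch of op-count and tensor-size arithmetic; conventions are
// assumptions, not the vendored code.
#include <cstdint>
#include <vector>

// FLOPs for C[m,n] = A[m,k] * B[k,n], counting one multiply and one add per
// multiply-accumulate.
int64_t MatMulFlops(int64_t m, int64_t k, int64_t n) { return 2 * m * k * n; }

// Bytes for a dense tensor: product of dimensions times element size.
// Unknown dimensions (reported as <= 0) are treated as size one, as the
// header's comments describe for CalculateTensorSize.
int64_t TensorBytes(const std::vector<int64_t>& dims,
                    int64_t bytes_per_element) {
  int64_t elements = 1;
  for (int64_t d : dims) elements *= (d > 0 ? d : 1);
  return elements * bytes_per_element;
}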
+ static OpContext FusedChildContext( + const OpContext& parent, const string& op_name, + const OpInfo::TensorProperties& output, + const std::vector& inputs); + + // Helper to construct tensor shapes. + static OpInfo::TensorProperties DescribeTensor( + DataType type, const std::vector& dims); + + // Helper method for building common case NodeCosts. + static absl::Status PredictDefaultNodeCosts(int64_t num_compute_ops, + const OpContext& op_context, + bool* found_unknown_shapes, + NodeCosts* node_costs); + + protected: + std::map elementwise_ops_; + typedef std::function + CostImpl; + std::map device_cost_impl_; + // If true, assume compute and memory overlap; hence, the op cost is max of + // compute_time and memory_time, instead of sum of those two. + bool compute_memory_overlap_; + std::set persistent_ops_; + + private: + friend class OpLevelCostEstimatorTest; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/robust_stats.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/robust_stats.h new file mode 100644 index 00000000..f11e608c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/robust_stats.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_ + +#include +namespace tensorflow { +namespace grappler { +class RobustStats { + public: + explicit RobustStats(const std::vector& values); + explicit RobustStats(std::vector&& values); + + double lo() const { return lo_; } + double hi() const { return hi_; } + double mean() const { return mean_; } + + private: + void HuberMAD(const std::vector& values); + + double lo_; + double hi_; + double mean_; + double stddev_; +}; +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/utils.h new file mode 100644 index 00000000..94f5c240 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/utils.h @@ -0,0 +1,132 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/device_properties.pb.h" + +namespace tensorflow { +namespace grappler { + +// Returns a vector of InputProperties for 'node'. The vector will contain one +// entry for each input of 'node'. +// For each node in the graph, the 'name_to_cost' map stores a pointer to the +// corresponding cost graph node indexed by node name. The 'name_to_node' maps a +// node name to its node definition. +std::vector FindInputFeatures( + const NodeDef& node, + const std::unordered_map& name_to_cost, + const std::unordered_map& name_to_node); + +// Returns the size of tensor (unit: bytes). For tensor shape with unknown rank, +// it assumes the tensor to be scalar. For any unknown dimension, it assumes +// size one. +int64_t CalculateTensorSize(const OpInfo::TensorProperties& prop); + +// Returns the size of output at port_num (unit: bytes). A special case is +// port_num -1, which is for control dependency and assumed to be 4 bytes. +int64_t CalculateOutputSize( + const std::vector& output_properties, + int port_num); + +// Returns the DeviceProperties of the device on which 'node' runs. +DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node); +DeviceProperties GetDeviceInfo(const string& device_str); + +// Return a string describing a node given a nodeinfo. +string GetOpDescription(const OpInfo& op_info); + +// Builds the OpInfo for node without filling its device information, given all +// nodes in the graph and its input properties. +OpInfo BuildOpInfoWithoutDevice( + const NodeDef& node, + const std::unordered_map& name_to_node, + const std::vector& inputs); + +// Gather performance data from a cost graph. +OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph, + const GraphDef& graph); + +// Simple histogram for profiling Tensor size; histogram uses logarithmic +// buckets. +class TensorSizeHistogram { + public: + TensorSizeHistogram() : buckets_(kMaxBuckets, 0) {} + + void Add(const uint64 value); + void Merge(const TensorSizeHistogram& src); + double Average() const { + if (num_elem_ > 0) { + return static_cast(sum_elem_) / num_elem_; + } else { + return 0.0; + } + } + uint64 Min() const { return min_; } + uint64 Max() const { return max_; } + uint64 NumElem() const { return num_elem_; } + uint64 SumElem() const { return sum_elem_; } + string ToString() const; + + protected: + const int Index(const uint64 value) const; + const std::vector& GetBuckets() const { return buckets_; } + + private: + const int kMaxBuckets = 64; + uint64 num_elem_ = 0; + uint64 sum_elem_ = 0; + // min_ and max_ are initialized to a very large value and zero, respectively, + // so that any value added can replace the initial min_ and max_. + uint64 min_ = kuint64max; + uint64 max_ = 0; + // Buckets are logarithmic: + // 0B, 1B, 2-3B, 4-7B, 8-15B, ..., 2^N - 2^(N+1)-1B, ... 
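A standalone sketch of the logarithmic bucketing described in the comment above (value 0 in bucket 0, 1 in bucket 1, 2-3 in bucket 2, 4-7 in bucket 3, and so on). It illustrates the scheme only; it is not the vendored Index() implementation.

// Standalone sketch of logarithmic histogram bucketing.
#include <cstdint>
#include <vector>

int LogBucketIndex(uint64_t value) {
  int index = 0;
  while (value > 0) {  // counts the number of significant bits
    value >>= 1;
    ++index;
  }
  return index;  // 0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3, ...
}

void AddToHistogram(std::vector<uint64_t>* buckets, uint64_t value) {
  size_t index = static_cast<size_t>(LogBucketIndex(value));
  if (index >= buckets->size()) buckets->resize(index + 1);
  ++(*buckets)[index];
}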
+ std::vector buckets_; +}; + +// Helper functions for aggregating per-device stats into per-device-class +// stats. +string GetDeviceClassForNonChannelDevice(const string& device_name); +string GetDeviceClass(const string& device_name); + +// Get stats in string format from RunMetadata. +string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata, + bool verbosity); + +// This method calculates the execution time depending on whether IO can +// overlap with computation. It assumes the memory and the compute times have +// already been calculated. +void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap, + Costs* costs); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_placer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_placer.h new file mode 100644 index 00000000..5f6119ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_placer.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_ + +#include + +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/device_properties.pb.h" + +namespace tensorflow { +class NodeDef; + +namespace grappler { +class Cluster; + +// The virtual placer emulates the behavior of the TF placer. +class VirtualPlacer { + public: + explicit VirtualPlacer( + const std::unordered_map& devices); + + const DeviceProperties& get_device(const NodeDef& node) const; + + // Returns device name from cluster, which best matches the node.device() + // specification. Returns default device if no match was found or the + // node.device() could not be parsed. + string get_canonical_device_name(const NodeDef& node) const; + + private: + // Converts given device name to Lowercase Fully-Qualified Name (LFQN) string. + // This helps us disambiguate device names internally and simplify matching. + // If device_name couldn't be parsed successfully, returns empty string. + string to_lfqn_or_empty(const string& device_name) const; + + // Map based on the cluster info: cluster device name -> device properties. + std::unordered_map devices_; + + // Maps LFQN to original device name as it was declared in cluster. 
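Referring back to CombineCostsAndUpdateExecutionTime in utils.h above (and the compute_memory_overlap_ flag in the cost estimator), here is a minimal sketch of the combination rule with durations simplified to microsecond integers. It illustrates the stated semantics, not the vendored function.

// Sketch: with overlap, the slower of compute and memory dominates; without
// overlap, the two times add.
#include <algorithm>
#include <cstdint>

int64_t CombineExecutionTime(bool compute_memory_overlap, int64_t compute_us,
                             int64_t memory_us) {
  return compute_memory_overlap ? std::max(compute_us, memory_us)
                                : compute_us + memory_us;
}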
+ std::unordered_map lfqn_map_; + + string default_device_name_; + string default_job_name_lowercase_; +}; + +} // namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_scheduler.h new file mode 100644 index 00000000..f574832b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -0,0 +1,543 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_ +#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/costs/op_context.h" +#include "tensorflow/core/grappler/costs/virtual_placer.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { +namespace grappler { + +ABSL_CONST_INIT extern const char kAttrInputSrc[]; +ABSL_CONST_INIT extern const char kAttrSrcDevice[]; +ABSL_CONST_INIT extern const char kAttrDstDevice[]; +ABSL_CONST_INIT extern const char kAttrTensorName[]; +ABSL_CONST_INIT extern const char kChannelDevice[]; +ABSL_CONST_INIT extern const char kStreaming[]; + +struct NodeState { + // A node (i.e., an op) takes a set of input:port pairs and produces + // a set of output ports. + + // Cross references to input and output nodes from graphdef. + std::vector> inputs; // Input, port pairs. + // List of output nodes (a list of nodes that takes this output port as input) + // keyed by port_num. Note that port_num -1 is used for control dependency. + std::unordered_map> outputs; + + // Info from GraphProperties. + std::vector input_properties; + std::vector output_properties; + + // Canonical device name used within VirtualScheduler. + string device_name; + + // States updated as scheduling nodes. + int num_inputs_ready; + std::unordered_map num_outputs_executed; + Costs::Duration time_ready; + Costs::Duration time_scheduled; + Costs::Duration time_finished; + // Time that all the consumers are executed (hence, no need to keep this + // output in memory), keyed by port_num. + std::unordered_map time_no_references; + + // Note that a node may have multiple output ports. The length of outputs, + // num_outputs_executed, and time_no_references should be + // identical when a NodeState is fully initialized. + // They should be 1 + output_properties.size() as we add [-1] for control + // dependency. 
+ + // Node will be ready to be executed at time_ready, scheduled at + // time_scheduled, and finishes execution at time_finished. + // Each output port uses up memory space from time_scheduled to its + // time_no_references. + + Costs node_costs; // Node costs per execution + Costs TotalNodeCosts() const { + return MultiplyCosts(node_costs, execution_count); + } + // How many times this node has been executed, e.g. in a while loop. + int execution_count; + + // Output shape incompatible between shape annotation and shape inference. + bool shape_incompatible; + + NodeState() { + num_inputs_ready = 0; + time_ready = Costs::Duration::max(); + time_scheduled = Costs::Duration::max(); + time_finished = Costs::Duration::max(); + execution_count = 0; + shape_incompatible = false; + // Note that num_outputs_executed and time_no_references are not initialized + // here, since we don't know the size (i.e., # outputs for this node). + } +}; + +struct DeviceState { + // Nodes executed on this device in execution order. + std::vector nodes_executed; + + struct NodePairHash { + public: + const std::size_t operator()( + const std::pair& element) const { + return std::hash()(element.first); + } + }; + + // Nodes currently allocated in memory: set of NodeDef* and port_num pairs + // so that we can track which output of the node is in memory. + std::unordered_set, NodePairHash> + nodes_in_memory; + + // Nodes allocated in memory persistently: e.g., Variables. + std::unordered_set, NodePairHash> + persistent_nodes; + + // Snapshot of nodes_in_memory, when memory usage is at peak. + // Same to nodes_in_memory, it's a set of NodeDef* and port_num pairs. + std::unordered_set, NodePairHash> + mem_usage_snapshot_at_peak; + + // Vector of temporary memory usage trace in execution order. + // Each pair represents the current node name and current (accumulated) + // temporary memory usage of the device when the node is scheduled. + // Only enabled when mem_usage_tracking is enabled. + // Note: CPU uses an inter-op threadpool, so the execution order on CPU may + // not be deterministic. + std::vector> temporary_memory_usage_trace; + + Costs device_costs; + std::map op_to_cost; // Per-op cost. + + int64_t memory_usage; // Current temporary memory usage + int64_t max_memory_usage; // Max temporary memory usage + + // Shape annotation statistics. + struct ShapeAnnotationStats { + // Number of ops with shape annotated. + int64_t num_ops_annotated = 0; + // Number of ops executed multiple times (e.g. in a loop). + int64_t num_ops_executed_more_than_once = 0; + // Number of ops executed: account for execution count. + int64_t num_ops_executed = 0; + // Number of ops with dynamic shapes (e.g. shape changes in a loop). + int64_t num_ops_with_dynamic_shapes = 0; + // Number of ops with incompatible shapes between annotation and shape + // inference. + int64_t num_ops_with_incompatible_shapes = 0; + } shape_annotation_stats; + + DeviceState() { + device_costs = Costs::ZeroCosts(); + device_costs.num_ops_total = 0; + memory_usage = 0; + max_memory_usage = 0; + } + + Costs::Duration GetCurrTime() const { return device_costs.execution_time; } +}; + +// ReadyNodeManager (abstract class): +// Keeps ready nodes and picks the best one to be scheduled. 
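A standalone sketch of the temporary-memory bookkeeping that DeviceState's memory_usage and max_memory_usage fields above describe: an allocation grows the current usage and updates the peak, and a release shrinks it once the last consumer has run. The type here is a simplified stand-in.

// Sketch of current vs. peak temporary memory tracking.
#include <algorithm>
#include <cstdint>

struct MemoryTracker {
  int64_t memory_usage = 0;      // current temporary memory
  int64_t max_memory_usage = 0;  // peak temporary memory

  void Allocate(int64_t bytes) {
    memory_usage += bytes;
    max_memory_usage = std::max(max_memory_usage, memory_usage);
  }
  void Release(int64_t bytes) { memory_usage -= bytes; }
};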
+class ReadyNodeManager { + public: + ReadyNodeManager() {} + virtual ~ReadyNodeManager() {} + virtual absl::Status Init( + const std::unordered_map* node_map) { + return absl::OkStatus(); + } + virtual void AddNode(const NodeDef* node) = 0; + virtual const NodeDef* GetCurrNode() = 0; + virtual void RemoveCurrNode() = 0; + virtual bool Empty() const = 0; +}; + +class FIFOManager : public ReadyNodeManager { + public: + FIFOManager() : ReadyNodeManager() {} + ~FIFOManager() override {} + void AddNode(const NodeDef* node) override { nodes_.push_back(node); } + const NodeDef* GetCurrNode() override { + CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node"; + return nodes_.front(); + } + void RemoveCurrNode() override { nodes_.pop_front(); } + bool Empty() const override { return nodes_.empty(); } + + private: + std::list nodes_; +}; + +// The LIFOManager schedules nodes by returning the last one added to the +// scheduler. A node is executed and then its ready outputs are newly added to +// the scheduler, so the LIFOManager will return outputs to a node following +// that node's execution. +class LIFOManager : public ReadyNodeManager { + public: + LIFOManager() : ReadyNodeManager() {} + ~LIFOManager() override {} + void AddNode(const NodeDef* node) override; + const NodeDef* GetCurrNode() override; + void RemoveCurrNode() override; + bool Empty() const override { return nodes_.empty(); } + + private: + std::list nodes_; + // Keep track of the current node being executed by saving its position. + // Necessary because nodes may be added to the end of the list while a node is + // executing, and we want to remove the correct node (the one that is + // executing) rather than the new ones being added. + std::list::iterator curr_pos_ = nodes_.end(); +}; + +// Abstract class that maintains a heap/priority queue for scheduling ready +// nodes. Derived class needs to implement the Greater() function which returns +// the comparator for the heap. +class HeapReadyManager : public ReadyNodeManager { + public: + HeapReadyManager(); + absl::Status Init( + const std::unordered_map* node_map) override; + ~HeapReadyManager() override {} + void AddNode(const NodeDef* node) override; + const NodeDef* GetCurrNode() override; + void RemoveCurrNode() override; + bool Empty() const override; + + protected: + virtual std::function Greater() = 0; + + // nodes_ is the main queue, where we construct heap, and the front is the + // current node. + std::vector nodes_; + + // Comparator functor for heap; stl heap is max heap, so we use "greater than" + // functor for keeping the smallest time_ready node at the front of heap. + std::function greater_; + + // NodeState structure from SchedulerState to get time_ready of ready nodes. + // Not owned by FirstReadyManager. + const std::unordered_map* node_map_; + + // Cached curr node. Set back to nullptr from RemoveCurrNode(). + const NodeDef* curr_node_; +}; + +// FirstReadyManager picks a node with the minimum time_ready value. +// Behavior is deterministic when there are more than one nodes with the minimum +// time_ready value with unique node names as the tie-breaker. +class FirstReadyManager : public HeapReadyManager { + public: + FirstReadyManager() : HeapReadyManager() {} + ~FirstReadyManager() override {} + + protected: + std::function Greater() override; +}; + +// PriorityReadyManager uses the given node priorities when picking up next node +// from all the ready nodes. 
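A standalone sketch of the heap pattern HeapReadyManager documents above: std::push_heap and std::pop_heap build a max-heap, so a greater-than comparator keeps the node with the smallest ready time at the front, with the node name as a deterministic tie-breaker. Node here is a stand-in type, not the TensorFlow NodeDef/NodeState pair.

// Sketch of a min-front heap built from the standard max-heap algorithms.
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct Node {
  std::string name;
  int64_t time_ready_us;
};

// "Greater than" comparator; ties broken by name for a deterministic order.
bool Greater(const Node* a, const Node* b) {
  if (a->time_ready_us != b->time_ready_us)
    return a->time_ready_us > b->time_ready_us;
  return a->name > b->name;
}

void AddNode(std::vector<const Node*>& heap, const Node* n) {
  heap.push_back(n);
  std::push_heap(heap.begin(), heap.end(), Greater);
}

const Node* PopEarliestReady(std::vector<const Node*>& heap) {
  std::pop_heap(heap.begin(), heap.end(), Greater);  // moves the min to the back
  const Node* n = heap.back();
  heap.pop_back();
  return n;
}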
+class PriorityReadyManager : public HeapReadyManager { + public: + PriorityReadyManager() : HeapReadyManager() {} + ~PriorityReadyManager() override {} + void AddNode(const NodeDef* node) override; + + // Note this should be called after Init(). + absl::Status SetPriority( + const std::unordered_map& node_priority); + + protected: + std::function Greater() override; + + private: + // A map from unique node name to priority. Lower number means higher + // priority. + std::unordered_map node_priority_; +}; + +// CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal +// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv +// ops, and then it chooses FirstReady among the ops chosen from each +// internal NodeManagers. The objective is to maximize producer-consumer +// locality within device, while processing nodes across devices, including +// _Send and _Recv, fairly, in terms of their time_ready. +class CompositeNodeManager : public ReadyNodeManager { + public: + CompositeNodeManager(); + ~CompositeNodeManager() override {} + + absl::Status Init( + const std::unordered_map* node_map) override; + void AddNode(const NodeDef* node) override; + const NodeDef* GetCurrNode() override; + void RemoveCurrNode() override; + bool Empty() const override; + + private: + // Internal ready node managers: + // LIFO for normal ops to maximize producer consumer locality. + // One LIFO per device. + std::unordered_map ops_lifo_map_; + // FirstReady for send and recv. Handle send and recv separately ensures that + // send and recv do not block previously read ops with LIFO schedule. + FirstReadyManager send_manager_; + FirstReadyManager recv_manager_; + + // NodeState structure from SchedulerState to get time_ready of ready nodes. + // Not owned by CompositeReadyManager. + const std::unordered_map* node_map_; + + // Cached curr node. Set back to nullptr from RemoveCurrNode(). + const NodeDef* curr_node_; +}; + +// Constructs a ready node manager from the given string. +std::unique_ptr ReadyNodeManagerFactory( + const string& ready_node_manager); + +// Encapsulates all of the various pieces uses to track state of a scheduler; +// enables reuse of all scheduler state-related utilities across different +// scheduler implementations. +class SchedulerState { + public: + SchedulerState(const bool use_static_shapes, + const bool use_aggressive_shape_inference, Cluster* cluster, + std::unique_ptr placer); + // Move constructor. Explicitly defined because it otherwise gets implicitly + // deleted. SchedulerState is a move-only class, as we have a + // for it in VirtualScheduler. A derivative of VirtualScheduler can move a + // SchedulerState to VirtualScheduler when it is constructed, + // which is where this move constructor is needed. + SchedulerState(SchedulerState&& arg) = default; + // We explicitly delete assinment and copy operators, this is done implicitly, + // but we state it here explicitly for clarity. + SchedulerState& operator=(SchedulerState&& arg) = delete; + SchedulerState(const SchedulerState&) = delete; + SchedulerState& operator=(const SchedulerState&) = delete; + // Destructor. Must be defined such that a derivative class can override it + // and allow proper desctruction of the derivative class. If this is not done + // properly, memory leaks can occur. 
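A minimal illustration of the destructor comment above: deleting a derived object through a base pointer only runs the derived destructor when the base destructor is virtual; otherwise whatever the derived part owns leaks. The types are illustrative, not TensorFlow classes.

// Sketch: virtual base destructor so derived state is released through a
// base pointer.
#include <memory>

struct BaseState {
  virtual ~BaseState() = default;  // virtual: derived destructors will run
};

struct DerivedState : BaseState {
  std::unique_ptr<int[]> big_buffer = std::make_unique<int[]>(1 << 20);
  // ~DerivedState() releases big_buffer, but it is only reached through a
  // BaseState* because ~BaseState() is virtual.
};

void Demo() {
  std::unique_ptr<BaseState> s = std::make_unique<DerivedState>();
  // When s goes out of scope, ~BaseState() dispatches to ~DerivedState().
}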
+ virtual ~SchedulerState(); + // Sets up the graph while also performing some necessary transformations + // initial_nodes is the set of nodes (primary inputs) discovered by Init() + // which may be added by a ReadyNodeManager (or related/derivative scheduler) + // to begin node schedule and graph simulation. + absl::Status Init(const GrapplerItem* item, + std::vector* initial_nodes, + bool create_explicit_channel_device = true); + + virtual Costs Summary() const; + // Like the above, but writes detailed stats to RunMetadata. + // If metadata is nullptr, then just calls and return Summary(). + virtual Costs Summary(RunMetadata* metadata); + // Generates RunMetadata's step_stats and partition_graphs fields from results + // of the virtual execution of the graph. + // TODO(rdegruijl) See if we can make this function and caller Summary() + // const. + void GenerateRunMetadata(RunMetadata* metadata); + + // Returns per device memory usage. + const std::unordered_map GetPeakMemoryUsage() const; + const std::unordered_map GetPersistentMemoryUsage() const; + void enable_mem_usage_tracking() { track_mem_usage_snapshot_ = true; } + // Returns (read only) device and node states. + const std::unordered_map* GetDeviceStates() const { + return &device_; + } + + const std::unordered_map* GetNodeStates() const { + return &node_map_; + } + + virtual OpContext CreateOpContext(const NodeDef* node) const; + std::vector MarkNodeExecuted( + const NodeDef* node, const Costs& node_costs, const OpContext& op_context, + bool extract_execution_count_attr = true, + const std::string& override_device_name = ""); + + // Some getter functions. + const GrapplerItem* GetGrapplerItem() { return grappler_item_; } + Costs GetGraphCost() { return graph_costs_; } + Cluster* GetCluster() { return cluster_; } + bool GetUseStaticShape() { return use_static_shapes_; } + bool GetUseAggressiveShapeInference() { + return use_aggressive_shape_inference_; + } + const std::unordered_map& GetNodeMap() { + return node_map_; + } + + protected: + // Assigns the time_scheduled in the NodeState of node to the current + // execution_time of the device executing this node. + void SetNodeStateTimeScheduled(const NodeDef* node); + + // This method can be used by a class derived from SchedulerState to + // access the device state map. + std::unordered_map* GetMutableDeviceState() { + return &device_; + } + + private: + // Methods called from Init(). Fails if initialize_ is set. + + void MaybeUpdateInputOutput(const NodeDef* node); + NodeState& GetNodeStateOrCreateIt(const NodeDef* node); + // Creates a Send_ and Recv_ pair between from and to. The argument + // create_channel_device tells the function to create an explicit device for + // the channel. + std::pair CreateSendRecv( + const NodeDef* from, const NodeDef* to, const NodeDef* input_node, + const string& input_name, bool create_channel_device); + string DeviceName(const NodeDef* node) const; + string SanitizedDeviceName(const NodeDef* node) const; + string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const; + + // Helper methods. + void GetOutputNodes(const NodeDef* node, const Costs::Duration& curr_time, + std::vector* output_nodes); + // Retrieves output size from node_cost at a port_num. If the output size has + // not been set, defaults back to CalculateOutputSize. 
+ int64_t GetOrCalculateOutputSize(const NodeState& node_state, + int port_num) const; + + std::unordered_map node_map_; + std::unordered_map device_; + + // Pool of NodeDefs for SendRecv and Identity ops created. + std::vector> additional_nodes_; + + // Stats: + // Op counts with key with input shape. + // Example key: "[Op=AssignSub, input_shapes=[[7,1,160,160][7,1,160,160]]" + std::map op_counts_; + // Individual op costs with key with input shape. + // Integer field for execution time in micro seconds. + // Boolean field for whether the cost is accurate. + std::map> op_costs_; + + Costs graph_costs_; // Graph cost. + std::map op_to_cost_; // Per-op cost. + + // Auxiliary data structures for constructing NodeState and DeviceState. + std::unique_ptr graph_properties_; // Initialized in Init(). + Cluster* cluster_; // Not owned. + const GrapplerItem* grappler_item_; // Not owned. + bool use_static_shapes_; + bool initialized_; + bool track_mem_usage_snapshot_; + const bool use_aggressive_shape_inference_; + std::unique_ptr placer_; +}; + +// The virtual scheduler emulates execution of nodes in a graph, considering +// dependencies, device, etc. +class VirtualScheduler { + public: + // Does not take ownership of cluster or ready_nodes. + VirtualScheduler(const bool use_static_shapes, + const bool use_aggressive_shape_inference, Cluster* cluster, + ReadyNodeManager* ready_nodes, + std::unique_ptr placer); + // This constructor can be called by a derivative of VirtualScheduler to + // construct the base class. It lets VirtualScheduler take ownership of + // a new SchedulerState or a derivative thereof. + // Note that this constructor does not set a VirtualPlacer, in this + // constructor the VirtialPlacer is passed as a member of the SchedulerState + // that is passed as an argument. + VirtualScheduler(ReadyNodeManager* ready_nodes, + std::unique_ptr scheduler_state); + virtual ~VirtualScheduler(); + + // Initializes the scheduler for the specific grappler item. + // Should be called immediately after the c'tor or when the scheduler will be + // reused for a new grappler item. All internal states of the scheduler + // related to the previous grappler item will be reset/cleared. + // + // This function should be called at least once after the scheduler is + // constructed. An uninitialized or failed-to-initialize scheduler will cause + // undefined behavior. + virtual absl::Status Init(const GrapplerItem* item); + + // Gets the current scheduled node for execution; the caller of this function + // can accordingly simulate the execution of the current scheduled node. + virtual OpContext GetCurrNode(); + // Marks the current scheduled node as executed. Note that we should call this + // function only after the execution of the node has been simulated; + // node_costs_ capture the simulated costs of the node. + // Returns true if there is any node to be scheduled. + virtual bool MarkCurrNodeExecuted(const Costs& node_costs); + + // Prints out summary of execution (timing, memory usage, etc.) + Costs Summary() const { return scheduler_state_->Summary(); } + // Like the above, but writes detailed stats to RunMetadata. + // If metadata is nullptr, then just calls and return Summary(). + Costs Summary(RunMetadata* metadata) { + return scheduler_state_->Summary(metadata); + } + // Generates RunMetadata's step_stats and partition_graphs fields from results + // of the virtual execution of the graph. 
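A hedged usage sketch grounded in the declarations above: initialize the scheduler with a GrapplerItem, then repeatedly fetch the current node, cost it with an OpLevelCostEstimator, and mark it executed until MarkCurrNodeExecuted reports no more nodes. Construction of the scheduler (cluster, ready-node manager, placer) is assumed to have happened elsewhere, and the exact call sequence should be checked against the implementation.

// Usage sketch only; not part of the vendored headers.
#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
#include "tensorflow/core/grappler/costs/virtual_scheduler.h"

namespace tensorflow {
namespace grappler {

absl::Status SimulateGraph(const GrapplerItem& item,
                           const OpLevelCostEstimator& estimator,
                           VirtualScheduler* scheduler, Costs* total) {
  absl::Status status = scheduler->Init(&item);
  if (!status.ok()) return status;
  bool more_nodes = true;
  while (more_nodes) {
    OpContext op_context = scheduler->GetCurrNode();
    Costs node_costs = estimator.PredictCosts(op_context);
    more_nodes = scheduler->MarkCurrNodeExecuted(node_costs);
  }
  *total = scheduler->Summary();
  return absl::OkStatus();
}

}  // namespace grappler
}  // namespace tensorflow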
+ void GenerateRunMetadata(RunMetadata* metadata) { + scheduler_state_->GenerateRunMetadata(metadata); + } + // Returns per device memory usage. + const std::unordered_map GetPeakMemoryUsage() const { + return scheduler_state_->GetPeakMemoryUsage(); + } + const std::unordered_map GetPersistentMemoryUsage() const { + return scheduler_state_->GetPersistentMemoryUsage(); + } + // Returns VirtualScheduler (read only) device and node states. + const std::unordered_map* GetDeviceStates() const { + return scheduler_state_->GetDeviceStates(); + } + const std::unordered_map* GetNodeStates() const { + return scheduler_state_->GetNodeStates(); + } + void enable_mem_usage_tracking() { + scheduler_state_->enable_mem_usage_tracking(); + } + + protected: + // The state of the scheduler and the execution of the graph is encapsulated + // by the scheduler_state_ object. + std::unique_ptr scheduler_state_; + // ready_nodes_ is responsible for ordering the traversal of the graph. + ReadyNodeManager* ready_nodes_; // Not owned. +}; + +} // namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/devices.h b/third_party/tflite-hdrs/tensorflow/core/grappler/devices.h new file mode 100644 index 00000000..8a27bfac --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/devices.h @@ -0,0 +1,46 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_DEVICES_H_ +#define TENSORFLOW_CORE_GRAPPLER_DEVICES_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +// Get the number of available GPUs whose number of multiprocessors is no less +// than 8 and whose CUDA compute capability is no less than +// min_cuda_compute_capability. +int GetNumAvailableGPUs( + const std::pair& min_cuda_compute_capability = {0, 0}); + +// Maximum amount of gpu memory available per gpu. gpu_id must be in the range +// [0, num_available_gpu) +int64_t AvailableGPUMemory(int gpu_id); + +// Get the number of logical CPU cores (aka hyperthreads) available. +int GetNumAvailableLogicalCPUCores(); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_DEVICES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/gen_node.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/gen_node.h new file mode 100644 index 00000000..e47e2d94 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/gen_node.h @@ -0,0 +1,168 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +class GenNode; + +// To find nodes by name. +using GenNodeMap = std::unordered_map>; + +// One node in the graph, in the form convenient for traversal and generation of +// subgraphs. It refers to the original NodeDef protobuf for most information +// and adds the extra enrichment. +// +// The graph building is 2-stage: first match a GenNode with each NodeDef and +// collect them into a map that finds them by name, then process the map, +// deep-parse the underlying NodeDefs and connect the GenNodes together. +class GenNode { + public: + // Will keep the pointer, so the underlying object must not be deleted while + // GenNode is alive. + explicit GenNode(const NodeDef* node); + + // Access wrappers. + const string& name() const { return node_->name(); } + const string& opcode() const { return node_->op(); } + const NodeDef* node_def() const { return node_; } + + // Parse the inputs of this node and update the map accordingly, creating the + // links (i.e. edges, connections between nodes) in itself and in the nodes + // it's linked to (the map itself is unchanged, only the nodes in it are + // updated). + absl::Status ParseInputs(const GenNodeMap* map); + + // Does the full 2-stage build of the graph. The map should be initially + // empty. The map keeps pointers to the nodes in source, so the source must + // not be destroyed before the map. + static absl::Status BuildGraphInMap(const GraphDef& source, GenNodeMap* map); + + // The enrichment that constitutes the point of this class. + + // Representation of a connection on a node. + class Port { + public: + // A port may be inbound or outbound. + // Negative ids (canonically -1) mean a control port. + Port(bool inbound, int32_t id) : value_(id << 1) { + if (inbound) { + value_ |= 1; + } + } + Port(const Port&) = default; + Port& operator=(const Port&) = default; + + bool IsInbound() const { return (value_ & 0x1); } + + bool IsControl() const { return (value_ < 0); } + + int32_t Id() const { + // Arithmetic shift preserves the sign. + return (value_ >> 1); + } + + // Integer type used to represent the encoded port value. + using IntPort = int32_t; + + // Returns the encoded form of this port, so that it can be used + // as various map indexes. 
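A standalone sketch of the port encoding shown above: the id is shifted left one bit, the low bit stores the inbound flag, and arithmetic right shift restores negative (control) ids on decode. Like the Port class itself, it relies on two's-complement shift behavior.

// Sketch of the GenNode::Port bit encoding and its round trip.
#include <cassert>
#include <cstdint>

int32_t EncodePort(bool inbound, int32_t id) {
  int32_t value = id << 1;   // id in the high bits
  if (inbound) value |= 1;   // inbound flag in the low bit
  return value;
}

int main() {
  int32_t control_in = EncodePort(/*inbound=*/true, /*id=*/-1);
  assert((control_in & 1) == 1);    // inbound bit set
  assert(control_in < 0);           // control ports stay negative
  assert((control_in >> 1) == -1);  // arithmetic shift restores the id
  int32_t data_out = EncodePort(/*inbound=*/false, /*id=*/3);
  assert((data_out >> 1) == 3 && (data_out & 1) == 0);
  return 0;
}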
+ IntPort Encoded() const { return value_; } + + static Port Decode(IntPort encoded) { return Port(encoded); } + + bool operator==(const Port& other) const { return value_ == other.value_; } + bool operator<(const Port& other) const { return value_ < other.value_; } + + struct Hasher { + size_t operator()(const Port& port) const noexcept { + return hasher(port.Encoded()); + } + std::hash hasher; + }; + + // Convenient for printing. I've really wanted it to be implicit but + // ClangTidy insists on making it explicit. + explicit operator string() const; + + private: + explicit Port(IntPort value) : value_(value) {} + + IntPort value_; + }; + + struct LinkTarget { + GenNode* node; // Node where this link points. + Port port; // Port on the remote side of this link. + + LinkTarget(GenNode* a_node, Port a_port) : node(a_node), port(a_port) {} + }; + // All the links that are connected to the same port of this node + // are collected in one vector. A link is an edge of the graph that connects + // 2 nodes. Each of the connected nodes has its own perspective on the link, + // seeing its local port, remote port and the remote node. The direction of + // the link is encoded in the ports, one port is always incoming and another + // one outgoing. + using LinkTargetVector = std::vector; + // Both inputs and outputs are stored in the same map. + using LinkMap = std::unordered_map; + + // Access to the link map. + const LinkMap& links() const { return links_; } + + // Check whether the port is an input (including the controls) with multiple + // connections. Such inputs get handled in a special way when building the + // subgraphs, in an "all or nothing" fashion. + bool IsMultiInput(Port port) const; + + // When building the subgraphs, must include either all non-control inputs of + // this node into the subgraph or none of them. This happens when at least one + // of the inputs is a multi-input (or if the opcode is commutative, thus + // treating all the inputs as one multi-input). + bool AllInputsOrNone() const { return all_inputs_or_none_; } + + private: + const NodeDef* node_; + // Becomes valid only after ParseInputs(). + const OpDef* op_; + + // The opcode has a complicated structure of input args, with multi-input args + // that are not commutative. This means that to make sense, the subgraphs that + // include this node must also include either all its inputs or none of them. + bool all_inputs_or_none_ = false; + + LinkMap links_; +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h new file mode 100644 index 00000000..56828ee1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h @@ -0,0 +1,154 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_ + +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/graph_analyzer/map_tools.h" +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" +#include "tensorflow/core/grappler/graph_analyzer/subgraph.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +namespace test { +class GraphAnalyzerTest; +} // end namespace test + +// Finds all the subgraphs of a given size and groups them by equivalence. +class GraphAnalyzer { + public: + // Makes a copy of the graph. + GraphAnalyzer(const GraphDef& graph, int subgraph_size); + + virtual ~GraphAnalyzer(); + + // Performs the analysis and collects the subgraphs. + absl::Status Run(); + + // Returns the subgraphs found in Run() printed to text. + std::vector DumpSubgraphs(); + + // Prints the subgraphs found in Run() to stdout. + absl::Status OutputSubgraphs(); + + // TODO(babkin): add a way to extract the subgraphs as direct data + // structures and as protobufs, and to write protobufs to a RecordIO. + + private: + GraphAnalyzer() = delete; + GraphAnalyzer(const GraphAnalyzer&) = delete; + void operator=(const GraphAnalyzer&) = delete; + + friend class tensorflow::grappler::graph_analyzer::test::GraphAnalyzerTest; + + // Builds the map of nodes from the original graph definition. + absl::Status BuildMap(); + + // Using nodes_, finds all the subgraphs of size subgraph_size_ and places + // them into result_. + void FindSubgraphs(); + + // Deletes from result_ the unacceptable subgraphs. Those include the + // subgraphs where not all the inputs at a multi-input port are included (this + // could happen if some of these inputs were reached and included through + // different paths). + void DropInvalidSubgraphs(); + + // Deletes from result_ duplicate entries of equivalent topology. + absl::Status CollateResult(); + + // Returns the raw subgraphs found in FindSubgraphs() printed to text. + std::vector DumpRawSubgraphs(); + + // Finds and adds appropriately to either partial_ or result_ all the + // subgraphs that can be created by extending the parent subgraph by one node. + // Ignores the duplicates. + void ExtendSubgraph(Subgraph* parent); + + // Extends the parent subgraph by adding another node (if it wasn't already + // added) and all its non-control inputs in the link map range at once. + // If the subgraph would grow over subgraph_size_, it gets ignored. + void ExtendSubgraphAllOrNone(Subgraph* parent, const GenNode* node); + // Same but adds one specific inbound port (even control) all-or-none. + void ExtendSubgraphPortAllOrNone(Subgraph* parent, const GenNode* node, + GenNode::Port port); + // The common final step called by ExtendSubgraph*AllOrNone() methods. + void AddExtendedSubgraph(Subgraph* parent, const Subgraph::Identity& id); + + // Returns true if this subgraph has any multi-inputs that aren't all-in or + // all-out. + bool HasInvalidMultiInputs(Subgraph* sg); + + // Graph to run the analysis on. + GraphDef graph_; + int subgraph_size_; + + // The enriched graph of parsed nodes and connections. + GenNodeMap nodes_; + // The resulting set of subgraphs. 
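A usage sketch grounded in the public interface above: analyze all subgraphs of a fixed size in a GraphDef and print the collated result. Error handling is simplified, and the subgraph size of 3 is only an example.

// Usage sketch only; not part of the vendored headers.
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer.h"

namespace tensorflow {
namespace grappler {
namespace graph_analyzer {

absl::Status AnalyzeSubgraphsOfSize3(const GraphDef& graph) {
  GraphAnalyzer analyzer(graph, /*subgraph_size=*/3);
  absl::Status status = analyzer.Run();
  if (!status.ok()) return status;
  return analyzer.OutputSubgraphs();  // prints collated subgraphs to stdout
}

}  // namespace graph_analyzer
}  // namespace grappler
}  // namespace tensorflow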
+ SubgraphPtrSet result_; + // The subgraphs of partial size, stored while finding the result. + SubgraphPtrSet partial_; + // The subgraphs of partial size (stored in partial_) that are still waiting + // to be extended. + // + // TODO(babkin): This is rather simple-minded, each subgraph is examined from + // scratch, which means that all its internal links get iterated too. But it's + // OK for the small subgraphs. This can be improved by keeping not just + // subgraphs but iterators on the list, each of them having the list not-yet + // examined nodes (and the link position of the next link to be examined for + // the first node). This would add extra constant overhead, so the break-even + // subgraph size is not clear yet. + std::deque todo_; + + // The collation map by signature is designed to allow the removal of entries + // and moving of the signature references from the keys of this map to the + // outside world. Must be careful at inserting and removal: make sure that + // when a new entry is inserted, its signature reference gets populated with + // the same data as the key of the map, and that if a reference is moved out, + // the map entry gets removed before that reference gets destroyed. + struct CollationEntry { + std::shared_ptr sig; + size_t count = 0; + }; + using CollationMap = + std::unordered_map, + EqAtPtr >; + CollationMap collation_map_; + + // The entries are owned by collation_map_, so must be removed from + // ordered_collation_ before removing them from collation_map_. + struct ReverseLessByCount { + bool operator()(CollationEntry* left, CollationEntry* right) const { + return left->count > right->count; // Reverse order. + } + }; + using CollationOrderByCount = + std::multiset; + CollationOrderByCount ordered_collation_; +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h new file mode 100644 index 00000000..5a91fe7d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_ + +#include "tensorflow/core/lib/strings/str_util.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +void GraphAnalyzerTool(const string& file_name, int n); + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/hash_tools.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/hash_tools.h new file mode 100644 index 00000000..b0e79f9a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/hash_tools.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_ + +#include + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +// Unfortunately, std::hash provides no way to combine hashes, so everyone +// is copying boost::hash_combine. This is a version that follows Google's +// guidelines on the arguments, and contains only the combination, without +// hashing. +inline void CombineHash(size_t from, size_t* to) { + *to ^= from + 0x9e3779b9 + (*to << 6) + (*to >> 2); +} + +// Combine two hashes in such a way that the order of combination doesn't matter +// (so it's really both commutative and associative). The result is not a very +// high-quality hash but can be used in case if the order of sub-elements must +// not matter in the following comparison. An alternative would be to sort the +// hashes of the sub-elements and then combine them normally in the sorted +// order. +inline void CombineHashCommutative(size_t from, size_t* to) { + *to = *to + from + 0x9e3779b9; +} + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/map_tools.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/map_tools.h new file mode 100644 index 00000000..f380504a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/map_tools.h @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
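A standalone sketch using the two combiners from hash_tools.h above, restated locally so the example compiles on its own: CombineHash is order-sensitive, CombineHashCommutative is not.

// Sketch of order-sensitive vs. order-insensitive hash combination.
#include <cstddef>
#include <functional>

inline void CombineHash(size_t from, size_t* to) {
  *to ^= from + 0x9e3779b9 + (*to << 6) + (*to >> 2);
}
inline void CombineHashCommutative(size_t from, size_t* to) {
  *to = *to + from + 0x9e3779b9;
}

size_t HashPair(int local, int remote) {
  std::hash<int> h;
  size_t hval = h(local);
  CombineHash(h(remote), &hval);  // order matters: (a, b) != (b, a)
  return hval;
}

size_t HashUnorderedTriple(int a, int b, int c) {
  std::hash<int> h;
  size_t hval = 0;
  int vals[] = {a, b, c};
  for (int v : vals) CombineHashCommutative(h(v), &hval);
  return hval;  // same value for any permutation of a, b, c
}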
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_ + +#include + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +// Helpers for building maps of pointers. + +template +struct LessAtPtr : std::function { + bool operator()(const Ptr& x, const Ptr& y) const { return *x < *y; } +}; + +template +struct EqAtPtr : std::function { + bool operator()(const Ptr& x, const Ptr& y) const { return *x == *y; } +}; + +template +struct HashAtPtr : std::function { + size_t operator()(const Ptr& x) const { return x->Hash(); } +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/sig_node.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/sig_node.h new file mode 100644 index 00000000..6e6749b4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/sig_node.h @@ -0,0 +1,304 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/hash_tools.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +namespace test { +class SigBaseTest; +} // end namespace test + +class SigNode; + +// To find nodes by name. Having the map ordered makes the tests easier, +// and it isn't used in production code often enough to get any win from +// using an unordered map. +using SigNodeMap = std::map>; + +// One node in the graph, in the form convenient for generation of the signature +// of the graph, and comparison of two (sub)graphs for equivalence. It refers to +// the original NodeDef protobuf for most information and adds the extra +// enrichment. 
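A standalone sketch of the compare-through-the-pointer helpers in map_tools.h above (their template parameters appear clipped in the diff text): containers keyed by smart pointers normally order by pointer value, while these functors order by the pointee. LessAtPtr is restated locally so the example is self-contained.

// Sketch: ordering a set of unique_ptr by the pointed-to values.
#include <memory>
#include <set>
#include <string>

template <typename Ptr>
struct LessAtPtr {
  bool operator()(const Ptr& x, const Ptr& y) const { return *x < *y; }
};

int main() {
  // Ordered by string contents, not by heap addresses.
  std::set<std::unique_ptr<std::string>,
           LessAtPtr<std::unique_ptr<std::string>>>
      names;
  names.insert(std::make_unique<std::string>("relu"));
  names.insert(std::make_unique<std::string>("conv2d"));
  // Iteration order: "conv2d", then "relu".
  return 0;
}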
+// +// The graph building is 2-stage: first match a SigNode with each NodeDef and +// collect them into a map that finds them by name, then process the map, +// deep-parse the underlying NodeDefs and connect the SigNodes together. +class SigNode { + public: + friend struct Signature; + + // Will keep the pointer to the underlying NodeDef, so that + // underlying object must not be deleted while SigNode is alive. + explicit SigNode(const NodeDef* node); + + // Access wrappers. + const string& name() const { return node_->name(); } + const string& opcode() const { return node_->op(); } + const NodeDef* node_def() const { return node_; } + + // For extraction of subgraphs into a separate SigNodeMap, copies the links + // that point inside the subgraph from a full-graph SigNode to a subgraph + // SigNode. The translation map defines the subgraph and gives the mapping + // from the nodes in the full graph to the matching nodes in subgraph. + using TranslationMap = + std::unordered_map; + void CopyLinks(const GenNode& from, const TranslationMap& map); + + // A link is an edge of the graph that connects 2 nodes. Each of the connected + // nodes has its own perspective on the link, seeing its local port, remote + // port and the remote node. The direction of the link is encoded in the + // ports, one port is always incoming and another one outgoing. + // + // The link tag here contains both ports of the link viewed from the + // perspective of this node; consisting of both the local port (i.e. at this + // node) and remote port (i.e. on the other node), the local one going first. + struct LinkTag { + struct Hasher { + size_t operator()(const LinkTag& tag) const noexcept { + size_t hval = port_hasher(tag.local); + CombineHash(port_hasher(tag.remote), &hval); + return hval; + } + GenNode::Port::Hasher port_hasher; + }; + + LinkTag(GenNode::Port a_local, GenNode::Port a_remote) + : local(a_local), remote(a_remote) {} + + // The default constructor is used for the default values in maps. + // (false, 99) is an arbitrary value that makes the uninitialized + // links easy to tell when debugging (they should never happen). + LinkTag() : local(false, 99), remote(false, 99) {} + + // Port of the link on the local node. + GenNode::Port local; + // Port of the link on the remote node. + GenNode::Port remote; + + bool operator==(const LinkTag& other) const { + return local == other.local && remote == other.remote; + } + bool operator<(const LinkTag& other) const { + return local < other.local || + (local == other.local && remote < other.remote); + } + }; + + // Since the signature logic doesn't differentiate between the links + // with the same tag (other than by the "peer" nodes on their other ends), + // all the links with the same tag are grouped into a single structure. + struct Link { + LinkTag tag; + size_t unique_hash; // Hash of the tag after conflict resolution. + // The remote node(s) on the other side on the link(s). + using PeerVector = std::vector; + PeerVector peers; + }; + + // A way to look up the link description by its hash. + using LinkHashMap = std::map; + const LinkHashMap& hash_to_link() const { return hash_to_link_; } + + // The enumeration of all the peer nodes in a predictable order. + // Before the signature generation, only the link values determine the + // order, after the signature generation the entries at the same + // links get further sorted by their peer node ranks. 
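+  // For example (illustrative sketch; `node` is a hypothetical SigNode from a
+  // graph whose signature has already been computed):
+  //
+  //   for (const SigNode::HashedPeer& hp : node.hashed_peers()) {
+  //     const SigNode::Link& link = node.hash_to_link().at(hp.link_hash);
+  //     // link.tag holds the local/remote ports; hp.peer is the neighbor.
+  //   }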
+ struct HashedPeer { + HashedPeer(size_t l, SigNode* p) : link_hash(l), peer(p) {} + + struct LessByRank { + bool operator()(const SigNode::HashedPeer& left, + const SigNode::HashedPeer& right) { + return left.peer->unique_rank_ < right.peer->unique_rank_; + } + }; + + size_t link_hash; + SigNode* peer; + }; + using HashedPeerVector = std::vector; + const HashedPeerVector& hashed_peers() const { return hashed_peers_; } + + // Compares two nodes in two different graphs for equivalence (two nodes in + // the same graph would never be equivalent). Expects that the signatures of + // the graphs have already been computed, so unique_rank_ is filled in and + // the hashed_peers_ properly ordered. + bool operator==(const SigNode& other) const; + + bool operator!=(const SigNode& other) const { return !(*this == other); } + + private: + friend class test::SigBaseTest; + + // The CopyLinks code is split into 2 parts for testability. + // The first pass builds a map ordered by LinkTag for predictability. + void CopyLinksPass1(const GenNode& from, const TranslationMap& map, + std::map* link_map); + // The second pass converts to the map by hash value, + // resolves any hash conflicts, and builds the hashed peer vector. + void CopyLinksPass2(std::map* link_map); + + // Computes the topological hash at distance 0. Resets the topo_hash_ vector + // and hashed_nodes_; + void ComputeTopoHash0(); + + // Compute the topological has at the given distance. The hashes for all the + // lower distances must be already computed for all the nodes in the graph. + // Also computes next_hashed_nodes_ from last_hashed_nodes_. + void ComputeTopoHash(int distance); + + // Get the hash value for a particular distance. It must be previously + // computed. + size_t GetTopoHash(int distance) const; + + // The hash value for the highest computed distance. It must be previously + // computed. + size_t GetHighTopoHash() const { + CHECK(!topo_hash_.empty()); + return topo_hash_.back(); + } + + // Rehash the topmost hash, to avoid conflicts. + void ReHighTopoHash() { + CHECK(!topo_hash_.empty()); + CombineHash(1, &topo_hash_.back()); + } + + // Ordering by node order and highest available hash (it must be + // previously computed). + struct NodeOrderLess { + bool operator()(const SigNode* left, const SigNode* right) { + return left->topo_hash_.back() < right->topo_hash_.back(); + } + }; + + private: + const NodeDef* node_; + + // The bitmap mask with 1 bit set that represents this node in the set + // during the computation of the signature. + uint64_t node_mask_ = 0; + + // The code that populates this map makes sure that there are no hash + // conflicts, rehashing if necessary. + LinkHashMap hash_to_link_; + + // The enumeration of all the direct peers in the predictable order (which + // happens to be the order ot their link tags, but the order of the hashes + // would do too). It is used for the quick enumeration during the signature + // computation. After the signature building is completed, the entries that + // have the same link tag get further sorted in the order of the ranks of + // their nodes. + HashedPeerVector hashed_peers_; + + // The unique rank represents the order in which the node will be included + // into the signature. 
It gets assigned in order either when the topo_hash_ of + // this node becomes unique in the graph, or when the nodes are completely + // equivalent, one of them is picked at random to assign the next rank, and + // then the rest of the nodes attempt to disambiguate based on that + // information. + size_t unique_rank_ = ~0; + // When hash_is_final_ is set, the topo_has_ vector stops growing, and the + // last value from it is used for all the further hashes. + bool hash_is_final_ = false; + // The hashes that include the topology of the nodes up to the distance N. The + // hash for distance 0 is produced from the attributes of this node itself and + // its general connectivity properties but no information about the + // neighboring nodes. The hash for distance D+1 is build from hashes at level + // D of this node and of all its immediate neighbors. The neighbors that are + // connected by equivalent links are included in a commutative way. + std::vector topo_hash_; + // The set of nodes that got included into the computation of the + // last topo_hash_ entry. + uint64_t last_hashed_nodes_ = 0; + // The next set of nodes that gets used for the current topo_hash entry. + uint64_t next_hashed_nodes_ = 0; +}; + +// Signature of a graph. The computation is intertwined with the private methods +// of SigNode, so keeping both in the same file looks more convenient. +struct Signature { + friend class test::SigBaseTest; + + // Maximal size of the graphs for which the signature can be computed. + // Changing this constant won't magically add the support for a larger size, + // the rest of implementation would have to be extended. The value of 64 is + // driven by the size of a bitset in an uint64_t, and should be enough for our + // purposes, while having a high efficiency of implementation. + static constexpr int kMaxGraphSize = 64; + + // Using the map, computes the rest of the fields of a signature. + // Returns an error is the graph is too big. + absl::Status Compute(); + + // Convert the computed signature to a string representation. + string ToString() const; + + SigNodeMap map; // The nodes in the graph, accessible by name. + size_t sig_short = 0; // Hash of the signature, for the quick equality check. + // The full signature: hashes of the nodes in a predictable order. + std::vector sig_full; + // The nodes in the same order as they go in the signature. + std::vector nodes; + + // For building the unordered maps. + size_t Hash() const { return sig_short; } + + // Returns true if the graphs are equivalent. The signature must be already + // computed. + bool operator==(const Signature& other) const; + + private: + // Populates the nodes vector from the map and initializes the state of the + // nodes for the signature computation. + void PrepareNodes(); + + // Finds the nodes with the hashes that are unique and assigns the unique ids + // to them. If there are nodes with non-unique hashes, exactly one node from + // the first such sequence (in the order of hash values) will be picked and + // assigned a unique id. Assumes that the nodes[0...(next_node_id-1)] have + // been already assigned the unique ids. Advances next_node_id by at least 1. + void FindUniqueHashes(size_t* next_node_id_p); + + // One round of the signature computation. Assumes that the + // nodes[0...(next_node_id-1)] have been already assigned the fixed + // positions, and thus computes the hashes only for the remaining nodes. 
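+  // For context, an illustrative sketch of the public API that drives these
+  // rounds (`subgraph` is a hypothetical, already built Subgraph):
+  //
+  //   Signature sig;
+  //   subgraph.ExtractForSignature(&sig.map);
+  //   absl::Status status = sig.Compute();  // fails if > kMaxGraphSize nodes
+  //   if (status.ok()) LOG(INFO) << sig.ToString();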
+ void ComputeOneRound(size_t next_node_id); + + // Additional ordering of the hashed_peers_ links in the nodes, so that they + // can be compared and printed in a predictable order. + void OrderLinks(); +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/subgraph.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/subgraph.h new file mode 100644 index 00000000..7d3494cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/subgraph.h @@ -0,0 +1,190 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ + +#include +#include +#include + +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/map_tools.h" +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +// The description of a single subgraph for processing. +class Subgraph { + public: + // Identity of a single subgraph as a set of nodes. + class Identity : public gtl::FlatSet { + public: + using InitializerList = std::initializer_list; + + Identity() = default; + Identity(InitializerList init); + bool operator<(const Identity& other) const; + bool operator==(const Identity& other) const; + + // Compute the hash. + size_t Hash() const; + }; + + explicit Subgraph(Identity id) : id_(std::move(id)), hash_(id_.Hash()) {} + + // Construct by extending the parent identity with an extra node. + Subgraph(const Identity& parent_id, GenNode* add_node); + + Subgraph() = delete; + Subgraph(const Subgraph& other) = delete; + void operator=(const Subgraph& other) = delete; + + // Order for building sets of subgraphs. + bool operator<(const Subgraph& other) const { return this->id_ < other.id_; } + // Support for hashed sets. + bool operator==(const Subgraph& other) const { + return this->id_ == other.id_; + } + size_t Hash() const { return hash_; } + + // Dump the subgraph information to a string. + string Dump(); + + // Extract this subgraph into a separate graph representation for signature + // building, that includes only the links between the nodes in the subgraph + // and drops all the external links. The result map should be clear before the + // call. 
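+  // Illustrative sketch (`a` and `b` are hypothetical GenNode pointers owned
+  // by the enclosing GenNodeMap):
+  //
+  //   Subgraph sg(Subgraph::Identity({a, b}));
+  //   SigNodeMap sig_nodes;               // must start out empty
+  //   sg.ExtractForSignature(&sig_nodes);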
+ void ExtractForSignature(SigNodeMap* result); + + const Identity& id() const { return id_; } + bool specific() const { return specific_; } + void SetSpecific(bool value) { specific_ = value; } + int32_t collation_count() const { return collation_count_; } + void AddCollation(int32_t n = 1) { collation_count_ += n; } + void ResetCollation() { collation_count_ = 1; } + void MergeCollation(const Subgraph& other) { + collation_count_ += other.collation_count_; + } + + private: + // Identity also serves as the list of nodes. It never changes throughout the + // life of subgraph. + Identity id_; + size_t hash_; // Cached from the identity. + // Whether the dump should include the specific names of the nodes. The + // non-specific (i.e. generic) subgraphs represent a collation of multiple + // subgraphs. + bool specific_ = true; + // How many collated subgraphs are represented by this subgraph. + int32_t collation_count_ = 1; +}; + +// Iteration of all links in a subgraph. This is more like Java iterators than +// the normal C++ iterators. It's simpler this way and there seems to be no +// major reason to make it a proper C++ iterator. +class SubgraphIterator { + public: + // Obviously an iterator is valid only until the original object + // gets destroyed. + explicit SubgraphIterator(const Subgraph::Identity* id); + explicit SubgraphIterator(const Subgraph* sg) : SubgraphIterator(&sg->id()) {} + + // Check whether the built-in iterator is at the end. + bool AtEnd() const { return id_it_ == id_->end(); } + + // Get the neighbor at the current iterator. + // MUST NOT be called when AtEnd(); + const GenNode::LinkTarget& GetNeighbor() const { + return link_map_it_->second[link_idx_]; + } + + // Get the node at the current iterator. + // MUST NOT be called when AtEnd(); + const GenNode* GetNode() const { return *id_it_; } + + // Get the port leading to the neighbor at the current iterator. + // MUST NOT be called when AtEnd(); + GenNode::Port GetPort() const { return link_map_it_->first; } + + // Increases the iterator. + // Returns true if NOT AtEnd() after increasing the iterator. + // Safe to call if already AtEnd(). + bool Next(); + + // If there are more links at the same port, increases the iterator and + // returns true. Otherwise leaves the iterator unchanged and returns false. + bool NextIfSamePort(); + + // Increases the iterator directly to the last position on the current port + // (or if already there then doesn't increase). Equivalent to calling + // NextIfSamePort() while it returns true, but faster. + // Safe to call if already AtEnd(). + void SkipPort(); + + // Increases the iterator directly to the last position on the current node. + // Safe to call if already AtEnd(). + void SkipNode(); + + // Returns true if the iterators are exactly the same. + bool operator==(const SubgraphIterator& other) const; + bool operator!=(const SubgraphIterator& other) const { + return !(*this == other); + } + + private: + // After link_idx_ has been increased, make sure that it points to the + // next valid element (or end) by increasing the higher levels of iteration if + // needed. + // Returns true if NOT AtEnd() after increasing the iterator. + // NOT safe to call if already AtEnd(). + bool PropagateNext(); + + // Identity of the subgraph being iterated over. + const Subgraph::Identity* id_; + + // The current position, allowing to iterate through the links (see the + // reasoning for it in the public section). + // + // (1) Iterator of the nodes in the subgraph. 
+ Subgraph::Identity::const_iterator id_it_; + // (2) Iterator in the link map of the node. + GenNode::LinkMap::const_iterator link_map_it_; + // (3) Index in the vector of the links. + int32_t link_idx_; +}; + +// A convenient way to store subgraphs: in a set of unique_ptrs. This way the +// addresses of subgraph objects will stay stable, and the objects themselves +// won't be copied. +class SubgraphPtrSet + : public std::unordered_set, + HashAtPtr>, + EqAtPtr>> { + public: + // Attempts to extend the set by adding a new subgraph that gets created by + // adding one node to the parent subgraph. If such a subgraph already exists, + // returns nullptr, otherwise returns the pointer to the new subgraph. + Subgraph* ExtendParent(const Subgraph::Identity& parent_id, GenNode* node); +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/test_tools.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/test_tools.h new file mode 100644 index 00000000..98e269d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_analyzer/test_tools.h @@ -0,0 +1,120 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ + +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" +#include "tensorflow/core/grappler/op_types.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { +namespace test { + +//=== Helper methods to construct the nodes. + +NodeDef MakeNodeConst(const string& name); + +NodeDef MakeNode2Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2); + +NodeDef MakeNode4Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2, const string& arg3, + const string& arg4); + +inline NodeDef MakeNodeMul(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "Mul", arg1, arg2); +} + +// Not really a 2-argument but convenient to construct. +inline NodeDef MakeNodeAddN(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "AddN", arg1, arg2); +} + +inline NodeDef MakeNodeSub(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "Sub", arg1, arg2); +} + +// Has 2 honest outputs. 
+inline NodeDef MakeNodeBroadcastGradientArgs(const string& name, + const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "BroadcastGradientArgs", arg1, arg2); +} + +NodeDef MakeNodeShapeN(const string& name, const string& arg1, + const string& arg2); + +NodeDef MakeNodeIdentityN(const string& name, const string& arg1, + const string& arg2); + +NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1, + const string& arg2, const string& arg3, + const string& arg4); + +//=== A container of pre-constructed graphs. + +class TestGraphs { + public: + TestGraphs(); + + // Graph with 3 nodes and a control link to self (which is not valid in + // reality but adds excitement to the tests). + GraphDef graph_3n_self_control_; + // Graph that has the multi-input links. + GraphDef graph_multi_input_; + // Graph that has the all-or-none nodes. + GraphDef graph_all_or_none_; + // All the nodes are connected in a circle that goes in one direction. + GraphDef graph_circular_onedir_; + // All the nodes are connected in a circle that goes in both directions. + GraphDef graph_circular_bidir_; + // The nodes are connected in a line. + GraphDef graph_linear_; + // The nodes are connected in a cross shape. + GraphDef graph_cross_; + GraphDef graph_small_cross_; + // For testing the ordering of links at the end of signature generation, + // a variation of a cross. + GraphDef graph_for_link_order_; + // Sun-shaped, a ring with "rays". + GraphDef graph_sun_; +}; + +//=== Helper methods for analysing the structures. + +std::vector DumpLinkMap(const GenNode::LinkMap& link_map); + +// Also checks for the consistency of hash values. +std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map); + +std::vector DumpHashedPeerVector( + const SigNode::HashedPeerVector& hashed_peers); + +} // end namespace test +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_topology_view.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_topology_view.h new file mode 100644 index 00000000..91cbfa2a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_topology_view.h @@ -0,0 +1,116 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/grappler/graph_view.h" + +namespace tensorflow { +namespace grappler { + +// GraphTopologyView is a helper class to simplify `node-to-node` connectivity +// traversals. 
Regular `GraphView` simplifies `tensor-to-tensor` traversals: +// connections between output tensors and inputs of a consumer nodes. For the +// topology view we are focused on nodes connected to nodes, and it's irrelevant +// if this connection is formed by one or multiple individual tensors. +// +// Example: +// a = Placeholder(..) +// b = Placeholder(..) +// c = AddN([a, a, b]) +// +// GraphView edges: [a:0 -> c:0, a:0 -> c:1, b:0 -> c:2] +// GraphTopologyView edges: [a -> c, b -> c] +// +// GraphView is used for exploring single node fanins and fanouts, and +// GraphTopologyView is focused on efficient full graph traversals (computing +// graph node properties from transitive fanouts, etc...). +class GraphTopologyView { + public: + GraphTopologyView() = default; + explicit GraphTopologyView(bool skip_invalid_edges) + : skip_invalid_edges_(skip_invalid_edges) {} + + // Initialize graph topology view from the graph. It's possible to pass + // additional edges that do not exist in a graph, but must be respected when + // computing graph topology. Example: Tensorflow runtime allows concurrent + // execution of dequeue/enqueue ops from the same queue resource, but we might + // want to enforce ordering between them for the purpose of graph analysis. + absl::Status InitializeFromGraph( + const GraphDef& graph, absl::Span ephemeral_edges, + bool ignore_control_edges); + absl::Status InitializeFromGraph( + const GraphDef& graph, absl::Span ephemeral_edges); + absl::Status InitializeFromGraph(const GraphDef& graph, + bool ignore_control_edges); + absl::Status InitializeFromGraph(const GraphDef& graph); + + bool is_initialized() const { return graph_ != nullptr; } + int num_nodes() const { return num_nodes_; } + const GraphDef* graph() const { return graph_; } + + // Returns true iff the node exists in the underlying graph. + bool HasNode(absl::string_view node_name) const; + + // Finds a node by name or returns `nullptr` if it's not in the graph. + const NodeDef* GetNode(absl::string_view node_name) const; + // Returns a node corresponding to the given node index. + const NodeDef* GetNode(int node_idx) const; + + // Returns a node index for the given node name, if the name exists in the + // underlying graph. Otherwise returns empty optional. + const absl::optional GetNodeIndex(absl::string_view node_name) const; + // Returns a node index for the given node, if the node belongs to the + // underlying graph. Otherwise returns empty optional. + const absl::optional GetNodeIndex(const NodeDef& node) const; + + // Returns all the node indexes that are in the direct fanin of the given + // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector. + const absl::InlinedVector& GetFanin(int node_idx) const; + // Returns all the node indexes that are in the direct fanout of the given + // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector. + const absl::InlinedVector& GetFanout(int node_idx) const; + + private: + // If true, all invalid edges and inputs (srd, dst or input node not found in + // a graph) will be skipped, otherwise initialization will fail with error. + bool skip_invalid_edges_ = false; + + // WARN: `graph_` must outlive this object and graph nodes must not be + // destructed, because node names captured with absl::string_view. 
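+  // Illustrative usage sketch (`graph` is a hypothetical GraphDef that must
+  // outlive `topology`, per the warning above; "my_node" is an arbitrary
+  // node name):
+  //
+  //   GraphTopologyView topology;
+  //   absl::Status s = topology.InitializeFromGraph(graph);
+  //   const auto idx = topology.GetNodeIndex("my_node");
+  //   if (s.ok() && idx.has_value()) {
+  //     for (int fanout : topology.GetFanout(*idx)) {
+  //       const NodeDef* consumer = topology.GetNode(fanout);
+  //     }
+  //   }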
+ const GraphDef* graph_ = nullptr; // do not own + int num_nodes_ = 0; + std::vector index_to_node_name_; + absl::flat_hash_map node_name_to_index_; + std::vector> fanins_; // node_idx->input nodes + std::vector> fanouts_; // node_idx->output nodes + + // We need a valid reference to return from GetFanin/GetFanout if the + // `node_idx` argument is outside of the [0, num_nodes_) range. + absl::InlinedVector empty_fanin_; + absl::InlinedVector empty_fanout_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/graph_view.h b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_view.h new file mode 100644 index 00000000..4b7e8cfe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/graph_view.h @@ -0,0 +1,428 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/hash/hash.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +// Map a node/op's input/output port_id to arg_id. +// +// The port_id refers to the n-th tensor of the node, while the arg_id refers to +// the n-th arg of the op. These two can be different if an op's arg is a list +// of tensors. +// +// We return -1 for any invalid port_id (i.e., no corresponding arg_id). +int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id); +int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id); + +namespace internal { + +// GraphViewInternal is a helper class to simplify graph traversal. It creates +// an immutable view of the nodes and edges represented by a GraphDef protocol +// buffer. +// +// There are two public classes implementing GraphViewInternal: +// +// - GraphView: constructed from the `const GraphDef` and doesn't allow +// to mutate underlying graph via input/output ports lookup functions (ports +// have const pointers to nodes). +// +// - MutableGraphView: constructed from the 'GraphDef` and allows to mutate +// the graph via input/output ports lookup functions (ports have non-const +// pointers to nodes), and also have couple additional functions to +// add/remove/replace nodes in the graph. +// +// --------------------------- !!! WARNING !!! 
--------------------------------- +// Removing nodes from the graph outside of MutableGraphView will +// lead to segfaults! Guaranteed by absl::string_view! +// ----------------------------------------------------------------------------- +// +template +class GraphViewInternal { + public: + struct Port { + Port() : node(nullptr), port_id(0) {} + Port(NodeDefT* n, int port) : node(n), port_id(port) {} + + bool operator==(const Port& other) const { + return node == other.node && port_id == other.port_id; + } + + template + friend H AbslHashValue(H h, const Port& p) { + return H::combine(std::move(h), p.node, p.port_id); + } + + NodeDefT* node; + int port_id; + }; + + struct InputPort : public Port { + using Port::Port; + }; + + struct OutputPort : public Port { + using Port::Port; + }; + + struct Edge { + Edge(OutputPort s, InputPort d) : src(s), dst(d) {} + + bool operator==(const Edge& other) const { + return src == other.src && dst == other.dst; + } + + template + friend H AbslHashValue(H h, const Edge& e) { + return H::combine(std::move(h), e.src, e.dst); + } + + OutputPort src; + InputPort dst; + }; + + GraphDefT* graph() const { return graph_; } + + // Finds a node by name or return `nullptr` if it's not in the graph view. + NodeDefT* GetNode(absl::string_view node_name) const { + return gtl::FindWithDefault(nodes_, node_name, nullptr); + } + + // Checks if a node by name is in the graph view. + bool HasNode(absl::string_view node_name) const { + return GetNode(node_name) != nullptr; + } + + // Gets the specified input port. Note that the special '-1' port_id can be + // used to access the controlling nodes (i.e. the nodes connected to node_name + // through an incoming control dependency). + InputPort GetInputPort(absl::string_view node_name, int port_id) const { + return InputPort(GetNode(node_name), port_id); + } + + // Gets the specified output port. Note that the special '-1' port_id can be + // used to access the controlled nodes (i.e. the nodes connected to node_name + // through an outgoing control dependency). + OutputPort GetOutputPort(absl::string_view node_name, int port_id) const { + return OutputPort(GetNode(node_name), port_id); + } + + // Gets the input port(s) in the immediate fanout of an output port. + const absl::flat_hash_set& GetFanout( + const OutputPort& port) const { + return gtl::FindWithDefault(fanouts_, port, fanout_not_found_value_); + } + + // Gets the output port(s) in the immediate fanin of an input port. + absl::flat_hash_set GetFanin(const InputPort& port) const { + if (port.port_id >= 0) { + OutputPort regular_fanin = GetRegularFanin(port); + if (regular_fanin.node == nullptr) { + return {}; + } + return {regular_fanin}; + } + + // Collect fanin for the control input. + absl::flat_hash_set result; + const int first_control_port = + gtl::FindWithDefault(max_regular_input_port_, port.node, -1) + 1; + for (int i = first_control_port; i < port.node->input_size(); ++i) { + TensorId tensor_id = ParseTensorName(port.node->input(i)); + + auto it = nodes_.find(tensor_id.node()); + if (it != nodes_.end()) result.emplace(it->second, tensor_id.index()); + } + return result; + } + + // Special case: regular (i.e. non-control) input ports can only have one + // fanin. If port.port_id is out of range or is a control dependency, then an + // empty OutputPort is returned. 
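+  // Illustrative sketch (`view` is a hypothetical GraphView over a graph that
+  // contains a node named "add"):
+  //
+  //   GraphView::InputPort in = view.GetInputPort("add", /*port_id=*/0);
+  //   GraphView::OutputPort src = view.GetRegularFanin(in);
+  //   if (src.node != nullptr) {
+  //     // `src.node` produces the tensor consumed at add:0 via its output
+  //     // port `src.port_id`.
+  //   }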
+ const OutputPort GetRegularFanin(const InputPort& port) const { + if (port.port_id < 0 || + port.port_id > + gtl::FindWithDefault(max_regular_input_port_, port.node, -1)) { + return OutputPort(); + } + + TensorId tensor_id = ParseTensorName(port.node->input(port.port_id)); + return GetOutputPort(tensor_id.node(), tensor_id.index()); + } + + // Checks if a tensor id is a fanin of the node. + bool HasFanin(const NodeDefT& node, const TensorId& fanin) const { + int end = node.input_size(); + if (end == 0 || fanin.index() < -1) { + return false; + } + + const int num_regular_fanins = + gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1; + int start = 0; + if (fanin.index() > -1) { + end = num_regular_fanins; + } else { + start = num_regular_fanins; + } + for (int i = start; i < end; ++i) { + if (ParseTensorName(node.input(i)) == fanin) { + return true; + } + } + return false; + } + + // Gets all the input ports in the immediate fanout of a node. Include the + // controlled nodes iff include_controlled_nodes is true. + absl::flat_hash_set GetFanouts( + const NodeDefT& node, bool include_controlled_nodes) const { + absl::flat_hash_set result; + + OutputPort port; + port.node = const_cast(&node); + const int first_port_id = include_controlled_nodes ? -1 : 0; + const int last_port_id = + gtl::FindWithDefault(max_regular_output_port_, &node, -1); + + for (int i = first_port_id; i <= last_port_id; ++i) { + port.port_id = i; + auto it = fanouts_.find(port); + if (it != fanouts_.end()) { + result.insert(it->second.begin(), it->second.end()); + } + } + return result; + } + + // Gets all the output ports in the immediate fanin of a node. Include the + // controlling nodes iff include_controlling_nodes is true. + absl::flat_hash_set GetFanins( + const NodeDefT& node, bool include_controlling_nodes) const { + absl::flat_hash_set result; + const int max_input_port = + include_controlling_nodes + ? node.input_size() - 1 + : gtl::FindWithDefault(max_regular_input_port_, &node, -1); + for (int i = 0; i <= max_input_port; ++i) { + TensorId tensor_id = ParseTensorName(node.input(i)); + + auto it = nodes_.find(tensor_id.node()); + if (it != nodes_.end()) result.emplace(it->second, tensor_id.index()); + } + return result; + } + + // Gets the number of ports in the immediate fanin of a node. Count the + // controlling nodes iff include_controlling_nodes is true. + int NumFanins(const NodeDefT& node, bool include_controlling_nodes) const { + if (include_controlling_nodes) { + return node.input_size(); + } + return gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1; + } + + // Gets the number of ports in the immediate fanout of a node. Count the + // controlled nodes iff include_controlled_nodes is true. + int NumFanouts(const NodeDefT& node, bool include_controlled_nodes) const { + int count = 0; + + OutputPort port; + port.node = const_cast(&node); + const int first_port_id = include_controlled_nodes ? -1 : 0; + const int last_port_id = + gtl::FindWithDefault(max_regular_output_port_, &node, -1); + + for (int i = first_port_id; i <= last_port_id; ++i) { + port.port_id = i; + auto it = fanouts_.find(port); + if (it != fanouts_.end()) count += it->second.size(); + } + + return count; + } + + // Gets all the edges in the immediate fanout of a node. Include the + // controlled edges iff include_controlled_edges is true. 
+ absl::flat_hash_set GetFanoutEdges( + const NodeDefT& node, bool include_controlled_edges) const { + absl::flat_hash_set result; + + OutputPort port; + port.node = const_cast(&node); + const int first_port_id = include_controlled_edges ? -1 : 0; + const int last_port_id = + gtl::FindWithDefault(max_regular_output_port_, &node, -1); + + for (int i = first_port_id; i <= last_port_id; ++i) { + port.port_id = i; + auto it = fanouts_.find(port); + if (it != fanouts_.end()) { + for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) { + result.emplace(/*src=*/port, /*dst=*/*itr); + } + } + } + return result; + } + + // Gets all the edges in the immediate fanin of a node. Include the + // controlling edges iff include_controlling_edges is true. + absl::flat_hash_set GetFaninEdges( + const NodeDefT& node, bool include_controlling_edges) const { + absl::flat_hash_set result; + const int max_input_port = + include_controlling_edges + ? node.input_size() - 1 + : gtl::FindWithDefault(max_regular_input_port_, &node, -1); + for (int i = 0; i <= max_input_port; ++i) { + TensorId tensor_id = ParseTensorName(node.input(i)); + + auto it = nodes_.find(tensor_id.node()); + if (it != nodes_.end()) { + result.emplace(/*src=*/OutputPort(it->second, tensor_id.index()), + /*dst=*/InputPort(const_cast(&node), i)); + } + } + return result; + } + + protected: + explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {} + + absl::Status AddUniqueNode(NodeDefT* node) { + auto inserted = nodes_.emplace(node->name(), node); + return inserted.second + ? absl::OkStatus() + : absl::InvalidArgumentError(absl::StrCat( + "Non unique node name detected: ", node->name())); + } + + // TODO(ezhulenev): Remove this function. + void AddUniqueNodeOrDie(NodeDefT* node) { + absl::Status st = AddUniqueNode(node); + CHECK(st.ok()) << st.message(); + } + + // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins + // exist, and all regular fanins come before controlling fanins. + void AddFanouts(NodeDefT* node) { + int max_input_port = -1; + for (int i = 0; i < node->input_size(); ++i) { + TensorId tensor_id = ParseTensorName(node->input(i)); + OutputPort output(nodes_[tensor_id.node()], tensor_id.index()); + + if (output.port_id < 0) { + fanouts_[output].emplace(node, -1); + } else { + max_input_port = i; + int& max_regular_output_port = max_regular_output_port_[output.node]; + max_regular_output_port = + std::max(max_regular_output_port, output.port_id); + fanouts_[output].emplace(node, i); + } + } + if (max_input_port > -1) { + max_regular_input_port_[node] = max_input_port; + } + } + + // Access to the mutable internal state for MutableGraphView. + absl::flat_hash_map& nodes() { return nodes_; } + + absl::flat_hash_map>& fanouts() { + return fanouts_; + } + + absl::flat_hash_map& max_regular_input_port() { + return max_regular_input_port_; + } + + absl::flat_hash_map& max_regular_output_port() { + return max_regular_output_port_; + } + + private: + GraphDefT* graph_; // must outlive the graph view + + // A mapping from the node name to the node itself. + absl::flat_hash_map nodes_; + + // A mapping from the output port to all inputs that read from it. + absl::flat_hash_map> fanouts_; + + // Keep a maximum index of input tensors of the node. + absl::flat_hash_map max_regular_input_port_; + + // Keep a maximum index of tensor fetched from the node. It doesn't guarantee + // that all tensors in the [0, max_regular_output_port] range are actually + // fetched by other nodes. 
+ absl::flat_hash_map max_regular_output_port_; + + // If the node has no fanouts at given output port (output tensor consumers) + // we return a reference to this set from `GetFanout` (we can't construct new + // empty set every time, because we need a non-dangling reference). + absl::flat_hash_set fanout_not_found_value_; +}; + +} // namespace internal + +// Immutable GraphView that keeps the constness of the GraphDef. If you need to +// mutate the graph or the nodes via the graph view lookup functions, see +// MutableGraphView. +class GraphView + : public internal::GraphViewInternal { + public: + explicit GraphView(const GraphDef* graph) : GraphViewInternal(graph) { + for (const NodeDef& node : graph->node()) AddUniqueNodeOrDie(&node); + for (const NodeDef& node : graph->node()) AddFanouts(&node); + } +}; + +// Returns true if node has one (or zero) fanout nodes at given output port. +bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node, + int port = 0); + +// Returns true if node has at least one fanout node at given output port. +bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0); +// Returns true if the node has at least one input control dependency. +bool HasControlFanin(const GraphView& graph_view, const NodeDef* node); +// Returns true if the node has at least one output control dependency. +bool HasControlFanout(const GraphView& graph_view, const NodeDef* node); +// Returns true if the node has at least one input or output control dependency. +bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item.h b/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item.h new file mode 100644 index 00000000..36bc4f15 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item.h @@ -0,0 +1,145 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" +#include "tsl/platform/cpu_info.h" + +namespace tensorflow { +namespace grappler { + +// A TensorFlow model to optimize. +// Models are represented by the combination of a graph, one of more fetch +// nodes, and potentially a set of nodes to feed. 
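+// Illustrative sketch (the graph, feed tensor and node names are
+// hypothetical; assumes `feed` holds (name, Tensor) pairs and `fetch` holds
+// node names, as in upstream Grappler):
+//
+//   GrapplerItem item;
+//   item.id = "example";
+//   item.graph = graph_def;                  // the GraphDef to optimize
+//   item.feed.emplace_back("input", input_tensor);
+//   item.fetch.push_back("softmax");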
+struct GrapplerItem { + GrapplerItem() = default; + GrapplerItem(const GrapplerItem& other) = default; + GrapplerItem(GrapplerItem&& other) = default; + GrapplerItem& operator=(const GrapplerItem& other) = default; + GrapplerItem& operator=(GrapplerItem&& other) = default; + virtual ~GrapplerItem() = default; + + // Create a copy of this GrapplerItem with graph swapped with the argument. + GrapplerItem WithGraph(GraphDef&& graph) const; + + string id; // A unique id for this item + + // Inputs + GraphDef graph; + std::vector> feed; + std::vector fetch; + + // Initialization op(s). + std::vector init_ops; + // Expected initialization time in seconds, or 0 if unknown + int64_t expected_init_time = 0; + + // Save/restore ops (if any) + string save_op; + string restore_op; + string save_restore_loc_tensor; + + // Queue runner(s) required to run the queue(s) of this model. + std::vector queue_runners; + + // List of op names to keep in the graph. This includes nodes that are + // referenced in various collections, and therefore must be preserved to + // ensure that the optimized metagraph can still be loaded. + std::vector keep_ops; + + // Return the set of node evaluated during a regular train/inference step. + std::vector MainOpsFanin() const; + // Return the set of node run to populate the queues (if any). + std::vector EnqueueOpsFanin() const; + // Return the set nodes used by TensorFlow to initialize the graph. + std::vector InitOpsFanin() const; + // Return the set of variables accessed during a regular train/inference step. + std::vector MainVariables() const; + // Return a set of node names that must be preserved. This includes feed and + // fetch nodes, keep_ops, init_ops. + std::unordered_set NodesToPreserve() const; + + struct OptimizationOptions { + // Is it allowed to add nodes to the graph that do not have registered + // gradient function. + bool allow_non_differentiable_rewrites = true; + + // Tensorflow function execution semantics is slightly different from the + // main Tensorflow graph, and we need to make sure that we do not change it + // by running Grappler optimizer passes. One main difference is that + // functions do not prune ops with side-effects and dataset-output ops (see + // PruneFunctionBody in common_runtime/function.cc). + bool allow_pruning_stateful_and_dataset_ops = true; + + // If true Grappler will optimize the main graph, and also all functions in + // the graph function library (function can't be polymorphic, it can't have + // undefined type parameters in the function signature, or placeholder + // attributes in the function body). + bool optimize_function_library = true; + + // Mark the grapper optimization run in eager mode or not. + bool is_eager_mode = false; + + // Number of intra threads used to run operation. + int intra_op_parallelism_threads = tsl::port::MaxParallelism(); + }; + + const std::unordered_set& devices() const; + // Adds a device to a set of available devices, only if it's a valid fully + // defined device name. Returns `OkStatus()` if successfully added a device, + // and an error otherwise. + absl::Status AddDevice(const string& device); + // Adds all valid devices from the other Grappler item to the device set. + absl::Status AddDevices(const GrapplerItem& other); + // Adds all valid devices from the nodes of the graph to the device set. + // Returns `OkStatus()` if all device annotations found in a graph are valid + // fully defined device names, and an error otherwise. 
+ absl::Status InferDevicesFromGraph(); + // Clears a set of available devices. + void ClearDevices(); + + const OptimizationOptions& optimization_options() const; + OptimizationOptions& optimization_options(); + + private: + // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members. + // TODO(ezhulenev): Migrate all unordered collections to absl. + + // A set of fully defined device names that can be used to place the nodes of + // the `graph`. + // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0" + std::unordered_set devices_; + + OptimizationOptions optimization_options_; +}; + +GrapplerItem::OptimizationOptions CreateOptOptionsForEager(); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item_builder.h b/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item_builder.h new file mode 100644 index 00000000..00661da0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/grappler_item_builder.h @@ -0,0 +1,85 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { + +class MetaGraphDef; + +namespace grappler { + +struct ItemConfig { + ItemConfig() {} + + // If true, ignore all user specified node placement. + bool ignore_user_placement = true; + // If true, ignore all user specified colocation attributes. + bool ignore_colocation = true; + // Dimension to use if a placeholder node has an _output_shapes attribute with + // a dimension of -1. + int placeholder_unknown_output_shape_dim = -1; + // If true, erases all "_noinline" attributes from user-defined functions. + // Has no effect if "inline_functions" is disabled. + bool erase_noinline_attributes = false; + // If non-empty, override the directory of asset paths. + string assets_directory_override; + // If true, runs ModelPruner on the graph. + bool prune_graph = false; + // Override feed nodes list. + std::set feed_nodes; + // Override fetch nodes list. + std::set fetch_nodes; + + // Configs for graph optimizations from common_runtime. This is NOT Grappler + // function optimizer. When Grappler is invoked at runtime, it is typically + // running after common_runtime pass. + // + // If true, does L1 optimizations. + bool apply_optimizations = false; + // If true, does function inlining. + bool inline_functions = false; +}; + +// Method for optimizing the graph def (including function inlining and other +// optimizations). This is optimizations from common_runtime, NOT Grappler +// function optimizer. 
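+// Illustrative sketch (`input_graph` is a hypothetical GraphDef):
+//
+//   ItemConfig cfg;
+//   cfg.inline_functions = true;
+//   GraphDef optimized_graph;
+//   absl::Status status =
+//       RuntimeGraphOptimizer(input_graph, &optimized_graph, cfg);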
+absl::Status RuntimeGraphOptimizer(const GraphDef& graph_def_arg, + GraphDef* output_graph_def, + const ItemConfig& cfg); + +// Factory method for creating a GrapplerItem from a MetaGraphDef. +// Returns nullptr if the given meta_graph cannot be converted. +std::unique_ptr GrapplerItemFromMetaGraphDef( + const string& id, const MetaGraphDef& meta_graph, const ItemConfig& cfg); + +// Factory method for creating a GrapplerItem from a file +// containing a MetaGraphDef in either binary or text format. +// Returns nullptr if the given meta_graph cannot be converted. +std::unique_ptr GrapplerItemFromMetaGraphDefFile( + const string& id, const string& meta_graph_file, const ItemConfig& cfg); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/file_input_yielder.h b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/file_input_yielder.h new file mode 100644 index 00000000..f3e9ecb6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/file_input_yielder.h @@ -0,0 +1,56 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// The file input provides a mechanism to feed grappler with existing TensorFlow +// graphs stored in TensorFlow checkpoints. Note that at this point the weights +// that may be stored in the checkpoint are not restored in order to speedup the +// initialization. + +#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#define TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ + +#include +#include +#include +#include "tensorflow/core/grappler/inputs/input_yielder.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +class GrapplerItem; + +class FileInputYielder : public InputYielder { + public: + // Iterates over the files specified in the list of 'filename' up to + // 'max_iterations' times. + explicit FileInputYielder( + const std::vector& filenames, + size_t max_iterations = std::numeric_limits::max()); + bool NextItem(GrapplerItem* item) override; + + private: + const std::vector filenames_; + size_t current_file_; + size_t current_iteration_; + size_t max_iterations_; + + size_t bad_inputs_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/input_yielder.h b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/input_yielder.h new file mode 100644 index 00000000..06f642c5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/input_yielder.h @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_ +#define TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_ + +namespace tensorflow { +namespace grappler { + +struct GrapplerItem; + +// Abstract interface for yielding graphs that we want to optimize. +class InputYielder { + public: + virtual ~InputYielder() {} + + virtual bool NextItem(GrapplerItem* item) = 0; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h new file mode 100644 index 00000000..bf776bcd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_ +#define TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_ + +#include +#include +#include "tensorflow/core/grappler/inputs/input_yielder.h" + +namespace tensorflow { +namespace grappler { + +class Cluster; +struct GrapplerItem; + +class TrivialTestGraphInputYielder : public InputYielder { + public: + TrivialTestGraphInputYielder(int num_stages, int width, int tensor_size, + bool insert_queue, + const std::vector& device_names); + bool NextItem(GrapplerItem* item) override; + + private: + const int num_stages_; + const int width_; + const int tensor_size_; + const bool insert_queue_; + std::vector device_names_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/utils.h new file mode 100644 index 00000000..9caefcd8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/inputs/utils.h @@ -0,0 +1,49 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
+
+#include <set>
+#include <vector>
+
+#include "absl/status/status.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+bool FilesExist(const std::vector<string>& files,
+                std::vector<absl::Status>* status = nullptr);
+bool FilesExist(const std::set<string>& files);
+
+bool FileExists(const string& file, absl::Status* status);
+
+// Reads GraphDef from file in either text or raw serialized format.
+absl::Status ReadGraphDefFromFile(const string& graph_def_path,
+                                  GraphDef* result);
+
+// Reads MetaGraphDef from file in either text or raw serialized format.
+absl::Status ReadMetaGraphDefFromFile(const string& meta_graph_def_path,
+                                      MetaGraphDef* result);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/mutable_graph_view.h b/third_party/tflite-hdrs/tensorflow/core/grappler/mutable_graph_view.h
new file mode 100644
index 00000000..fdd4fa32
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/grappler/mutable_graph_view.h
@@ -0,0 +1,336 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+
+#include <set>
+#include <string>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace grappler {
+
+const char kMutableGraphViewCtrl[] = "ConstantFoldingCtrl";
+
+// A utility class to simplify the traversal of a GraphDef that, unlike
+// GraphView, supports updating the graph. Note that you should not modify the
+// graph separately, because the view will get out of sync.
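+//
+// Illustrative usage sketch (not part of the upstream TensorFlow header; it
+// assumes a GraphDef `graph` that already contains nodes named "bar" and
+// "new_bar"):
+//
+//   MutableGraphView view(&graph);
+//   // Point every consumer of "bar" (including control dependencies) at
+//   // "new_bar".
+//   TF_RETURN_IF_ERROR(view.UpdateFanouts("bar", "new_bar"));
+//   // "bar" no longer has any fanouts, so it can now be deleted safely.
+//   TF_RETURN_IF_ERROR(view.DeleteNodes({"bar"}));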
+
+class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
+ public:
+  explicit MutableGraphView(GraphDef* graph) : GraphViewInternal(graph) {
+    for (NodeDef& node : *graph->mutable_node()) AddUniqueNodeOrDie(&node);
+    for (NodeDef& node : *graph->mutable_node()) AddAndDedupFanouts(&node);
+  }
+
+  // Lookup fanouts/fanins using immutable ports.
+  using GraphViewInternal::GetFanout;
+  const absl::flat_hash_set<InputPort>& GetFanout(
+      const GraphView::OutputPort& port) const;
+
+  using GraphViewInternal::GetFanin;
+  absl::flat_hash_set<OutputPort> GetFanin(
+      const GraphView::InputPort& port) const;
+
+  using GraphViewInternal::GetRegularFanin;
+  const OutputPort GetRegularFanin(const GraphView::InputPort& port) const;
+
+  // Adds a new node to graph and updates the view. Returns a pointer to the
+  // node in graph.
+  NodeDef* AddNode(NodeDef&& node);
+
+  // Adds all nodes from the `subgraph` to the underlying graph and updates the
+  // view. `subgraph` doesn't have to be a valid graph definition on its own:
+  // it can have edges to nodes that are not in it; however, after adding it to
+  // the underlying graph, the final graph must be valid.
+  //
+  // If the subgraph function library is not empty, all new functions will be
+  // added to the graph. Functions that appear with the same name in both the
+  // subgraph and the graph represented by *this must have identical function
+  // definitions.
+  //
+  // IMPORTANT: All nodes and functions of the given subgraph are moved into
+  // the underlying graph, which leaves the subgraph in a valid but undefined
+  // state.
+  absl::Status AddSubgraph(GraphDef&& subgraph);
+
+  // Updates node `node_name` op, device, and attributes. This will clear any
+  // existing attributes. If it is not possible to update the node or if the
+  // node does not exist, an error will be returned and nothing will be
+  // modified in the graph.
+  absl::Status UpdateNode(absl::string_view node_name, absl::string_view op,
+                          absl::string_view device,
+                          absl::Span<const std::pair<string, AttrValue>> attrs);
+
+  // Updates node `from_node_name` name to `to_node_name`. If `to_node_name` is
+  // in use, node `from_node_name` does not exist, or node `from_node_name` has
+  // fanouts and `update_fanouts` is set to false, an error will be returned
+  // and nothing will be modified in the graph.
+  absl::Status UpdateNodeName(absl::string_view from_node_name,
+                              absl::string_view to_node_name,
+                              bool update_fanouts);
+
+  // Swaps node names `from_node_name` and `to_node_name`. Self loops of one
+  // node are removed by updating the inputs introducing self loops to use the
+  // other node's name. Setting `update_fanouts` to false will exclude other
+  // fanouts from having their inputs updated, but inputs introducing self
+  // loops will always be updated regardless of `update_fanouts`.
+  //
+  // Example:
+  //   1. foo(other:3, bar:2, ^bar)
+  //   2. bar(foo:3, other:1, foo:1, ^foo)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", false):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", true):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(bar:5, foo:6)
+  //
+  // If it is not possible to swap node names (i.e. nodes do not exist or a
+  // Switch control dependency may be introduced), an error will be returned
+  // and nothing will be modified in the graph.
+ absl::Status SwapNodeNames(absl::string_view from_node_name, + absl::string_view to_node_name, + bool update_fanouts); + + // Updates all fanouts (input ports fetching output tensors) from + // `from_node_name` to the `to_node_name`, including control dependencies. + // + // Example: We have 3 nodes that use `bar` node output tensors as inputs: + // 1. foo1(bar:0, bar:1, other:0) + // 2. foo2(bar:1, other:1) + // 3. foo3(other:2, ^bar) + // + // After calling UpdateFanouts(bar, new_bar): + // 1. foo1(new_bar:0, new_bar:1, other:0) + // 2. foo2(new_bar:1, other:1) + // 3. foo3(other:2, ^new_bar) + absl::Status UpdateFanouts(absl::string_view from_node_name, + absl::string_view to_node_name); + + // Adds regular fanin `fanin` to node `node_name`. If the node or fanin do not + // exist in the graph, nothing will be modified in the graph. Otherwise fanin + // will be added after existing non control dependency fanins. Control + // dependencies will be deduped. To add control dependencies, use + // AddControllingFanin. + absl::Status AddRegularFanin(absl::string_view node_name, + const TensorId& fanin); + + // Adds regular fanin `fanin` to node `node_name` at port `port`. If the node + // or fanin do not exist in the graph, nothing will be modified in the graph. + // Otherwise fanin will be inserted at port `port`. Control dependencies will + // be deduped. To add control dependencies, use AddControllingFanin. + // + // If the port is not a valid port (less than 0 or greater than the number of + // regular fanins), this will result in an error and the node will not be + // modified. + absl::Status AddRegularFaninByPort(absl::string_view node_name, int port, + const TensorId& fanin); + + // Adds control dependency `fanin` to the target node named `node_name`. To + // add regular fanins, use AddRegularFanin. + // + // Case 1: If the fanin is not a Switch node, the control dependency is simply + // added to the target node: + // + // fanin -^> target node. + // + // Case 2: If the fanin is a Switch node, we cannot anchor a control + // dependency on it, because unlike other nodes, only one of its outputs will + // be generated when the node is activated. In this case, we try to find an + // Identity/IdentityN node in the fanout of the relevant port of the Switch + // and add it as a fanin to the target node. If no such Identity/IdentityN + // node can be found, a new Identity node will be created. In both cases, we + // end up with: + // + // fanin -> Identity{N} -^> target node. + // + // If the control dependency being added is redundant (control dependency + // already exists or control dependency can be deduped from regular fanins), + // this will not result in an error and the node will not be modified. + absl::Status AddControllingFanin(absl::string_view node_name, + const TensorId& fanin); + + // Removes regular fanin `fanin` from node `node_name`. If the node or fanin + // do not exist in the graph, nothing will be modified in the graph. If there + // are multiple inputs that match the fanin, all of them will be removed. To + // remove controlling fanins, use RemoveControllingFanin. + // + // If the fanin being removed doesn't exist in the node's inputs, this will + // not result in an error and the node will not be modified. + absl::Status RemoveRegularFanin(absl::string_view node_name, + const TensorId& fanin); + + // Removes regular fanin at port `port` from node `node_name`. If the node + // does not exist in the graph, nothing will be modified in the graph. 
+ // To remove controlling fanins, use RemoveControllingFanin. + // + // If the port is not a valid port (less than 0 or greater than the last index + // of the regular fanins), this will result in an error and the node will not + // be modified. + absl::Status RemoveRegularFaninByPort(absl::string_view node_name, int port); + + // Removes control dependency `fanin_node_name` from the target node named + // `node_name`. If the node or fanin do not exist in the graph, nothing will + // be modified in the graph. To remove regular fanins, use RemoveRegularFanin. + // + // If the fanin being removed doesn't exist in the node's inputs, this will + // not result in an error and the node will not be modified. + absl::Status RemoveControllingFanin(absl::string_view node_name, + absl::string_view fanin_node_name); + + // Removes all fanins from node `node_name`. Control dependencies will be + // retained if keep_controlling_fanins is true. + // + // If no fanins are removed, this will not result in an error and the node + // will not be modified. + absl::Status RemoveAllFanins(absl::string_view node_name, + bool keep_controlling_fanins); + + // Replaces all fanins `from_fanin` with `to_fanin` in node `node_name`. If + // the fanins or node do not exist, nothing will be modified in the graph. + // Control dependencies will be deduped. + // + // If the fanin being updated doesn't exist in the node's inputs, this will + // not result in an error and the node will not be modified. + absl::Status UpdateFanin(absl::string_view node_name, + const TensorId& from_fanin, + const TensorId& to_fanin); + + // Replaces fanin at port `port` in node `node_name` with fanin `fanin`. If + // the fanins or node do not exist, nothing will be modified in the graph. + // Control dependencies will be deduped. + // + // If the port is not a valid port (less than 0 or greater than the last index + // of the regular fanins), this will result in an error and the node will not + // be modified. + absl::Status UpdateRegularFaninByPort(absl::string_view node_name, int port, + const TensorId& fanin); + + // Swaps fanins at ports `from_port` and `to_port` in node `node_name`. If the + // node does not exist, nothing will be modified in the graph. + // + // If the ports are not a valid port (less than 0 or greater than the last + // index of the regular fanins), this will result in an error and the node + // will not be modified. + absl::Status SwapRegularFaninsByPorts(absl::string_view node_name, + int from_port, int to_port); + + // Updates all regular fanins to equivalent controlling fanins. If it is not + // possible, an error will be returned and nothing will be modified in the + // graph. + absl::Status UpdateAllRegularFaninsToControlling(absl::string_view node_name); + + // Deletes nodes from the graph. If a node can't be safely removed, + // specifically if a node still has fanouts, an error will be returned. Nodes + // that can't be found are ignored. + absl::Status DeleteNodes(const absl::flat_hash_set& nodes_to_delete); + + private: + // Adds fanouts for fanins of node to graph, while deduping control + // dependencies from existing control dependencies and regular fanins. Note, + // node inputs will be mutated if control dependencies can be deduped. + void AddAndDedupFanouts(NodeDef* node); + + // Finds next output port smaller than fanin.port_id and update. The + // max_regular_output_port is only updated if fanin.port_id is the same as the + // current max_regular_output_port and if the fanouts set is empty. 
If there + // are no regular outputs, max_regular_output_port will be erased. + void UpdateMaxRegularOutputPortForRemovedFanin( + const OutputPort& fanin, + const absl::flat_hash_set& fanin_fanouts); + + // Updates max regular output port for newly added fanin by checking the + // current max and updating if the newly added fanin is of a larger port. + void UpdateMaxRegularOutputPortForAddedFanin(const OutputPort& fanin); + + // Updates all fanouts (input ports fetching output tensors) from `from_node` + // to the `to_node`, including control dependencies. + // + // Example: We have 3 nodes that use `bar` node output tensors as inputs: + // 1. foo1(bar:0, bar:1, other:0) + // 2. foo2(bar:1, other:1) + // 3. foo3(other:2, ^bar) + // + // After calling UpdateFanouts(bar, new_bar): + // 1. foo1(new_bar:0, new_bar:1, other:0) + // 2. foo2(new_bar:1, other:1) + // 3. foo3(other:2, ^new_bar) + // + // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the + // behavior is undefined. + absl::Status UpdateFanoutsInternal(NodeDef* from_node, NodeDef* to_node); + + // Adds fanin to node. If fanin is a control dependency, existing control + // dependencies will be checked first before adding. Otherwise fanin will be + // added after existing non control dependency inputs. + bool AddFaninInternal(NodeDef* node, const OutputPort& fanin); + + // Finds control dependency node to be used based on fanin. If fanin is not a + // Switch node, fanin.node is simply returned. Otherwise this will try to find + // a candidate Identity node consuming fanin, as the control dependency. If it + // is not possible or will introduce a self loop, an error message will be + // set. If nullptr is returned with no error + // GetOrCreateIdentityConsumingSwitch should be called to generate the new + // Identity node. + NodeDef* GetControllingFaninToAdd(absl::string_view node_name, + const OutputPort& fanin, string* error_msg); + + // Finds a generated Identity node consuming Switch node `fanin.node` at port + // `fanin.port_id`. If such a node does not exist, a new Identity node will be + // created. + NodeDef* GetOrCreateIdentityConsumingSwitch(const OutputPort& fanin); + + // Removes all instances of regular fanin `fanin` from node `node`. + bool RemoveRegularFaninInternal(NodeDef* node, const OutputPort& fanin); + + // Removes controlling fanin `fanin_node` from node if such controlling fanin + // exists. + bool RemoveControllingFaninInternal(NodeDef* node, NodeDef* fanin_node); + + // Checks if nodes to be deleted are missing or have any fanouts that will + // remain in the graph. If node is removed in either case, the graph will + // enter an invalid state. + absl::Status CheckNodesCanBeDeleted( + const absl::flat_hash_set& nodes_to_delete); + + // Removes fanins of the deleted node from internal state. Control + // dependencies are retained iff keep_controlling_fanins is true. + void RemoveFaninsInternal(NodeDef* deleted_node, + bool keep_controlling_fanins); + + // Removes fanouts of the deleted node from internal state. 
+ void RemoveFanoutsInternal(NodeDef* deleted_node); +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/op_types.h b/third_party/tflite-hdrs/tensorflow/core/grappler/op_types.h new file mode 100644 index 00000000..719f12fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/op_types.h @@ -0,0 +1,284 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_ +#define TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { +bool IsAdd(const NodeDef& node); +bool IsAddN(const NodeDef& node); +bool IsAll(const NodeDef& node); +bool IsAngle(const NodeDef& node); +bool IsAny(const NodeDef& node); +bool IsAnyDiv(const NodeDef& node); +bool IsAnyBatchMatMul(const NodeDef& node); +bool IsAnyMatMul(const NodeDef& node); +bool IsAnyMax(const NodeDef& node); +bool IsAnyMaxPool(const NodeDef& node); +bool IsAnyMin(const NodeDef& node); +bool IsAnyMul(const NodeDef& node); +bool IsAnySparseSegmentReduction(const NodeDef& node); +bool IsApproximateEqual(const NodeDef& node); +bool IsArg(const NodeDef& node); +bool IsArgMax(const NodeDef& node); +bool IsArgMin(const NodeDef& node); +bool IsAssert(const NodeDef& node); +bool IsAssign(const NodeDef& node); +bool IsAsString(const NodeDef& node); +bool IsAtan2(const NodeDef& node); +bool IsAvgPoolGrad(const NodeDef& node); +bool IsBetainc(const NodeDef& node); +bool IsBiasAdd(const NodeDef& node); +bool IsBiasAddV2(const NodeDef& node); +bool IsBiasAddGrad(const NodeDef& node); +bool IsBitcast(const NodeDef& node); +bool IsBroadcastTo(const NodeDef& node); +bool IsCast(const NodeDef& node); +bool IsCheckNumerics(const NodeDef& node); +bool IsCollective(const NodeDef& node); +bool IsComplex(const NodeDef& node); +bool IsComplexAbs(const NodeDef& node); +bool IsConcat(const NodeDef& node); +bool IsConcatOffset(const NodeDef& node); +bool IsConj(const NodeDef& node); +bool IsConjugateTranspose(const NodeDef& node); +bool IsConstant(const NodeDef& node); +bool IsControlFlow(const NodeDef& node); +bool IsConv2D(const NodeDef& node); +bool IsConv2DBackpropFilter(const NodeDef& node); +bool IsConv2DBackpropInput(const NodeDef& node); +bool IsConv3D(const NodeDef& node); +bool IsConv3DBackpropFilterV2(const NodeDef& node); +bool IsConv3DBackpropInputV2(const NodeDef& node); +bool IsDepthwiseConv2dNative(const NodeDef& node); +bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node); +bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node); +bool IsDequeueOp(const NodeDef& node); +bool IsDiv(const NodeDef& node); +bool IsDivNoNan(const NodeDef& node); +bool 
IsElementWiseMonotonic(const NodeDef& node, bool* is_non_decreasing); +bool IsElu(const NodeDef& node); +bool IsEluGrad(const NodeDef& node); +bool IsQuantizationEmulation(const NodeDef& node); +bool IsEnter(const NodeDef& node); +bool IsEqual(const NodeDef& node); +bool IsExit(const NodeDef& node); +bool IsExp(const NodeDef& node); +bool IsFakeParam(const NodeDef& node); +bool IsFill(const NodeDef& node); +bool IsFloorDiv(const NodeDef& node); +bool IsFloorMod(const NodeDef& node); +bool IsFusedBatchNorm(const NodeDef& node); +bool IsFusedBatchNormEx(const NodeDef& node); +bool IsFusedBatchNormGrad(const NodeDef& node); +bool IsGather(const NodeDef& node); +bool IsGreater(const NodeDef& node); +bool IsGreaterEqual(const NodeDef& node); +bool IsHistogramSummary(const NodeDef& node); +bool IsHostConstant(const NodeDef& node); +bool IsIdentity(const NodeDef& node); +bool IsIdentityN(const NodeDef& node); +bool IsIdentityNSingleInput(const NodeDef& node); +bool IsIf(const NodeDef& node); +bool IsIgamma(const NodeDef& node); +bool IsIgammac(const NodeDef& node); +bool IsImag(const NodeDef& node); +bool IsImmutableConst(const NodeDef& node); +bool IsInvGrad(const NodeDef& node); +bool IsLeakyRelu(const NodeDef& node); +bool IsLeakyReluGrad(const NodeDef& node); +bool IsLess(const NodeDef& node); +bool IsLessEqual(const NodeDef& node); +bool IsLog(const NodeDef& node); +bool IsLogicalAnd(const NodeDef& node); +bool IsLogicalNot(const NodeDef& node); +bool IsLogicalOr(const NodeDef& node); +bool IsLoopCond(const NodeDef& node); +bool IsMatMul(const NodeDef& node); +bool IsMax(const NodeDef& node); +bool IsMaxPoolGrad(const NodeDef& node); +bool IsMaximum(const NodeDef& node); +bool IsMean(const NodeDef& node); +bool IsMerge(const NodeDef& node); +bool IsMin(const NodeDef& node); +bool IsMinimum(const NodeDef& node); +bool IsMirrorPad(const NodeDef& node); +bool IsMirrorPadGrad(const NodeDef& node); +bool IsMklFusedMish(const NodeDef& node); +bool IsMod(const NodeDef& node); +bool IsMul(const NodeDef& node); +bool IsMulNoNan(const NodeDef& node); +bool IsNeg(const NodeDef& node); +bool IsNextIteration(const NodeDef& node); +bool IsNoOp(const NodeDef& node); +bool IsNotEqual(const NodeDef& node); +bool IsOnesLike(const NodeDef& node); +bool IsPack(const NodeDef& node); +bool IsPack(const NodeDef& node); +bool IsPad(const NodeDef& node); +bool IsPartitionedCall(const NodeDef& node); +bool IsPlaceholder(const NodeDef& node); +bool IsPolygamma(const NodeDef& node); +bool IsPow(const NodeDef& node); +bool IsPrint(const NodeDef& node); +bool IsProd(const NodeDef& node); +bool IsQuantizedMatMul(const NodeDef& node); +bool IsQueue(const NodeDef& node); +bool IsRandomShuffle(const NodeDef& node); +bool IsRank(const NodeDef& node); +bool IsReadVariableOp(const NodeDef& node); +bool IsReadVariablesOp(const NodeDef& node); +bool IsReal(const NodeDef& node); +bool IsRealDiv(const NodeDef& node); +bool IsReciprocalGrad(const NodeDef& node); +bool IsRecv(const NodeDef& node); +bool IsReduction(const NodeDef& node); +bool IsRelu(const NodeDef& node); +bool IsRelu6(const NodeDef& node); +bool IsRelu6Grad(const NodeDef& node); +bool IsReluGrad(const NodeDef& node); +bool IsReshape(const NodeDef& node); +bool IsRestore(const NodeDef& node); +bool IsRetval(const NodeDef& node); +bool IsReverse(const NodeDef& node); +bool IsReverseV2(const NodeDef& node); +bool IsRsqrt(const NodeDef& node); +bool IsRsqrtGrad(const NodeDef& node); +bool IsSelect(const NodeDef& node); +bool IsSeluGrad(const NodeDef& node); +bool 
IsSend(const NodeDef& node); +bool IsShape(const NodeDef& node); +bool IsShapeN(const NodeDef& node); +bool IsShuffle(const NodeDef& node); +bool IsSigmoid(const NodeDef& node); +bool IsSigmoidGrad(const NodeDef& node); +bool IsSize(const NodeDef& node); +bool IsSlice(const NodeDef& node); +bool IsSnapshot(const NodeDef& node); +bool IsSoftmax(const NodeDef& node); +bool IsSoftplusGrad(const NodeDef& node); +bool IsSoftsignGrad(const NodeDef& node); +bool IsSplit(const NodeDef& node); +bool IsSplitV(const NodeDef& node); +bool IsSqrt(const NodeDef& node); +bool IsSqrtGrad(const NodeDef& node); +bool IsSquare(const NodeDef& node); +bool IsSquaredDifference(const NodeDef& node); +bool IsSqueeze(const NodeDef& node); +bool IsStackCloseOp(const NodeDef& node); +bool IsStackOp(const NodeDef& node); +bool IsStackPopOp(const NodeDef& node); +bool IsStackPushOp(const NodeDef& node); +bool IsStatefulPartitionedCall(const NodeDef& node); +bool IsStopGradient(const NodeDef& node); +bool IsStridedSlice(const NodeDef& node); +bool IsStridedSliceGrad(const NodeDef& node); +bool IsStringToHashBucketFast(const NodeDef& node); +bool IsSub(const NodeDef& node); +bool IsSum(const NodeDef& node); +bool IsSwitch(const NodeDef& node); +bool IsSymbolicGradient(const NodeDef& node); +bool IsTanh(const NodeDef& node); +bool IsTanhGrad(const NodeDef& node); +bool IsTensorArray(const NodeDef& node); +bool IsTile(const NodeDef& node); +bool IsTranspose(const NodeDef& node); +bool IsTruncateDiv(const NodeDef& node); +bool IsTruncateMod(const NodeDef& node); +bool IsUnique(const NodeDef& node); +bool IsUnpack(const NodeDef& node); +bool IsVariable(const NodeDef& node); +bool IsWhile(const NodeDef& node); +bool IsXdivy(const NodeDef& node); +bool IsXlaLaunch(const NodeDef& node); +bool IsZerosLike(const NodeDef& node); +bool IsZeta(const NodeDef& node); + +// Return true if the op is an aggregation (e.g. Add, AddN). +// Returns false if it could not be determined to be so. +bool IsAggregate(const NodeDef& node); + +// Return true if the op is commutative (e.g. Mul, Add). +// Returns false if it could not be determined to be so. +bool IsCommutative(const NodeDef& node); + +// Returns true if the node is known to use persistent memory to store its +// value. +bool IsPersistent(const NodeDef& node); + +// Returns true if the node belongs to the NC_DATASET class (see graph/graph.h). +bool IsDataset(const NodeDef& node); + +// Returns true if the node op is marked as stateful, or if it was not found in +// op_registry. +bool IsStateful(const NodeDef& node, const OpRegistryInterface* op_registry); +bool IsStateful(const NodeDef& node); // use OpRegistry::Global() + +bool IsFreeOfSideEffect(const NodeDef& node, + const OpRegistryInterface* op_registry); +bool IsFreeOfSideEffect(const NodeDef& node); // use OpRegistry::Global() + +// Returns true if the takes a tensor reference as input. +// Returns false if the op type is unknown. +bool HasRefInput(const NodeDef& node); + +bool ModifiesFrameInfo(const NodeDef& node); + +// Returns true if the op is known to write to one or more of its inputs. +bool ModifiesInputsInPlace(const NodeDef& node); + +// Returns true if the op is an element-wise involution, i.e. if it is its +// own inverse such that f(f(x)) == x. +bool IsInvolution(const NodeDef& node); + +// Returns true if the op preserves the order and value of elements +// and shape of its first input tensor. 
+bool IsValueAndOrderAndShapePreserving(const NodeDef& node); + +// Returns true if the op preserves the order and value of elements in its +// first input tensor and possible changes its shape. +bool IsValueAndOrderPreserving(const NodeDef& node); + +// Returns true if the op in node only rearranges the order of elements in its +// first input tensor and possible changes its shape. More precisely, this +// function returns true if the op commutes with all element-wise operations. +bool IsValuePreserving(const NodeDef& node); + +// Returns true if node is idempotent w.r.t. its first input, i.e. if +// Op(Op(x, y, z), y, z) = Op(x, y, z). +bool IsIdempotent(const NodeDef& node); + +bool IsUnaryElementWise(const NodeDef& node); + +// Returns true if we can find an opdef corresponding to the op of the node. +bool HasOpDef(const NodeDef& node); + +// Returns true if the op changes the scalar type of its first input elements +// and preserves the number of elements. +bool IsCastLike(const NodeDef& node); + +// Returns true if this op never forwards any of its inputs, i.e. always +// allocates buffers for its inputs. +bool NeverForwardsInputs(const NodeDef& node); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h new file mode 100644 index 00000000..2d079a5c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h @@ -0,0 +1,146 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_ + +#include + +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kArithmeticOptimizer[] = "ArithmeticOptimizer"; + +// Optimize TF computations by reducing the arithmetic complexity required to +// run a model. 
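+//
+// Illustrative usage sketch (not part of the upstream TensorFlow header; it
+// assumes a populated GrapplerItem `item`, and passes a null Cluster as the
+// test utilities further down in this patch do):
+//
+//   GraphDef optimized_graph;
+//   ArithmeticOptimizer optimizer(RewriterConfig::ON);
+//   TF_RETURN_IF_ERROR(
+//       optimizer.Optimize(/*cluster=*/nullptr, item, &optimized_graph));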
+class ArithmeticOptimizer : public GraphOptimizer { + public: + ArithmeticOptimizer() + : opt_level_(RewriterConfig::ON), + options_(ArithmeticOptimizerOptions::Default(RewriterConfig::ON)) {} + + explicit ArithmeticOptimizer(RewriterConfig::Toggle opt_level) + : opt_level_(opt_level), + options_(ArithmeticOptimizerOptions::Default(opt_level)) {} + + ~ArithmeticOptimizer() override {} + + string name() const override { return "arithmetic_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + friend class ArithmeticOptimizerTest; + + // Granular control for arithmetic optimizer stages + struct ArithmeticOptimizerOptions { + bool combine_add_to_addn = true; + bool convert_sqrt_div_to_rsqrt_mul = true; + bool dedup_computations = true; + bool fold_conjugate_into_transpose = true; + bool fold_multiply_into_conv = true; + bool fold_transpose_into_matmul = true; + bool fuse_squared_diff = true; + bool hoist_common_factor_out_of_aggregation = true; + bool hoist_cwise_unary_chains = true; + bool minimize_broadcasts = true; + bool optimize_max_or_min_of_monotonic = true; + bool remove_idempotent = true; + bool remove_identity_transpose = true; + bool remove_involution = true; + bool remove_logical_not = true; + bool remove_negation = true; + bool remove_redundant_bitcast = true; + bool remove_redundant_cast = true; + bool remove_redundant_reshape = true; + bool reduce_upsampling_dims = true; + bool reorder_cast_like_and_value_preserving = true; + bool replace_mul_with_tile = true; + bool replace_mul_with_square = true; + bool replace_pack_with_tile_reshape = true; + bool convert_pow = true; + bool convert_log1p = true; + bool convert_log_softmax = true; + bool convert_expm1 = true; + bool unary_ops_composition = true; + bool remove_stack_slice_same_axis = true; + bool simplify_aggregation = true; + bool simplify_embedding_lookup = true; + bool remove_cast_into_segment_reduction = true; + + // Choose which arithmetic optimizer stages will be enabled for a given + // optimization level by default. + static ArithmeticOptimizerOptions Default( + RewriterConfig::Toggle opt_level) { + ArithmeticOptimizerOptions options; + return options; + } + }; + + // Returns true if it is safe to dedup node from the graph. + bool CanDedup(const NodeDef& node) const; + + // Dedup redundant nodes in the graph. + void DedupComputations(); + + // Forward the control dependencies anchored on src_nodes to the target_nodes. + void ForwardControlDependencies(NodeDef* target_node, + const std::vector& src_nodes); + + // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse + // transposes. + absl::Status SimplifyArithmeticOps(bool can_use_shapes); + // Tries to simplify the expression that roots at `node` and replaces the uses + // of `node` to the simplified expression. Returns the name of the simplified + // tensor (e.g. "split:1") or an empty string if no simplification is + // performed. + // + // `node_map` stores the mapping from node names to NodeDef*, and will be + // updated according to the rewrite. + // + // `new_nodes` will be populated with the new nodes this function creates and + // updates. The caller can push these nodes into the simplification queue to + // optimize them further. + // + // TODO(jingyue): This interface is not suitable for optimizing nodes with + // multiple output tensors. 
We should pass in a tensor name instead of a
+  // NodeDef.
+  string TrySimplifyAndReplaceUses(const NodeDef* node,
+                                   SetVector<NodeDef*>* nodes_to_simplify);
+
+  RewriterConfig::Toggle opt_level_;
+  ArithmeticOptimizerOptions options_;
+
+  bool fetch_nodes_known_ = false;
+  std::unordered_set<string> nodes_to_preserve_;
+  std::unique_ptr<NodeMap> node_map_;
+  std::unique_ptr<GraphProperties> graph_properties_;
+  GraphDef* optimized_graph_ = nullptr;  // Not owned.
+  gtl::FlatSet<string> feed_nodes_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
new file mode 100644
index 00000000..7955db31
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
@@ -0,0 +1,289 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/common_subgraph_elimination.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ArithmeticOptimizerTest : public GrapplerTest {
+ protected:
+  // Optimize a graph using optimizer and prune all the nodes that no
+  // longer have any output consumers.
+  void OptimizeAndPrune(GraphOptimizer* optimizer, GrapplerItem* item,
+                        GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run optimizer twice to make sure the rewrite is idempotent.
+  void DedupAndOptimizeTwiceAndPrune(GraphOptimizer* optimizer,
+                                     GrapplerItem* item, GraphDef* output) {
+    TF_EXPECT_OK(CommonSubgraphElimination().Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run optimizer twice to make sure the rewrite is idempotent.
+ void OptimizeTwice(GraphOptimizer* optimizer, GrapplerItem* item, + GraphDef* output) { + TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output)); + item->graph.Swap(output); + output->Clear(); + TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output)); + } + + // Run optimizer twice to make sure the rewrite is idempotent. + // Optionally run a constant folding pass before pruning. + void OptimizeTwiceAndPrune(GraphOptimizer* optimizer, GrapplerItem* item, + GraphDef* output, bool const_folding = false) { + TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output)); + + item->graph.Swap(output); + output->Clear(); + TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output)); + + if (const_folding) { + item->graph.Swap(output); + output->Clear(); + TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr) + .Optimize(nullptr, *item, output)); + } + + item->graph.Swap(output); + output->Clear(); + TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output)); + } + + void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) { + optimizer->options_.combine_add_to_addn = false; + } + + void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.combine_add_to_addn = true; + } + + void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.fold_conjugate_into_transpose = true; + } + + void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.fold_multiply_into_conv = true; + } + + void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.fold_transpose_into_matmul = true; + } + + void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.hoist_common_factor_out_of_aggregation = true; + } + + void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.minimize_broadcasts = true; + } + + void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_identity_transpose = true; + } + + void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_involution = true; + } + + void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_redundant_bitcast = true; + } + + void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_redundant_cast = true; + } + + void EnableOnlyReduceUpsamplingDims(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.reduce_upsampling_dims = true; + } + + void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_redundant_reshape = true; + } + + void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_negation = true; + } + + void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.reorder_cast_like_and_value_preserving = true; + } + + void EnableOnlyReplaceMulWithBroadcastByTile(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + 
optimizer->options_.replace_mul_with_tile = true; + } + + void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.replace_mul_with_square = true; + } + + void EnableOnlyReplacePackWithTileReshape(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.replace_pack_with_tile_reshape = true; + } + + void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.hoist_cwise_unary_chains = true; + } + + void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true; + } + + void EnableOnlyLogSoftmax(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.convert_log_softmax = true; + } + + void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.convert_pow = true; + } + + void EnableOnlyFuseSquaredDiff(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.fuse_squared_diff = true; + } + + void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_idempotent = true; + } + + void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_logical_not = true; + } + + void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.simplify_aggregation = true; + } + + void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.convert_log1p = true; + } + + void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.optimize_max_or_min_of_monotonic = true; + } + + void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.convert_expm1 = true; + } + + void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.unary_ops_composition = true; + } + + void EnableOnlyRemoveStackSliceSameAxis(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_stack_slice_same_axis = true; + } + + void EnableOnlySimplifyEmbeddingLookup(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.simplify_embedding_lookup = true; + } + + void EnableOnlyRemoveCastIntoSegmentReduction( + ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.remove_cast_into_segment_reduction = true; + } + + private: + void DisableAllStages(ArithmeticOptimizer* optimizer) { + ArithmeticOptimizer::ArithmeticOptimizerOptions options; + options.dedup_computations = false; + options.combine_add_to_addn = false; + options.convert_sqrt_div_to_rsqrt_mul = false; + options.convert_pow = false; + options.convert_log1p = false; + options.optimize_max_or_min_of_monotonic = false; + options.fold_conjugate_into_transpose = false; + options.fold_multiply_into_conv = false; + options.fold_transpose_into_matmul = false; + options.hoist_common_factor_out_of_aggregation = false; + options.hoist_cwise_unary_chains = false; + options.minimize_broadcasts = false; + options.remove_identity_transpose = false; + options.remove_involution = false; + options.remove_idempotent = false; + 
options.remove_redundant_bitcast = false; + options.remove_redundant_cast = false; + options.remove_redundant_reshape = false; + options.remove_negation = false; + options.remove_logical_not = false; + options.reorder_cast_like_and_value_preserving = false; + options.replace_mul_with_tile = false; + options.replace_mul_with_square = false; + options.simplify_aggregation = false; + options.unary_ops_composition = false; + options.simplify_embedding_lookup = false; + options.remove_cast_into_segment_reduction = false; + optimizer->options_ = options; + } +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision.h new file mode 100644 index 00000000..d4be8476 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision.h @@ -0,0 +1,74 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// CUDA: convert to float16 on GPU +// BF16: convert to bfloat16 on CPU +// CPU: emulate float16 on CPU without changing operator kernel +// FP16_CPU : convert to float16 on CPU +enum class AutoMixedPrecisionMode { CUDA, BF16, CPU, FP16_CPU }; + +// Convert data types to float16 or bfloat16 where appropriate to improve +// performance on GPUs or CPUs. +class AutoMixedPrecision : public GraphOptimizer { + public: + // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If BF16 or + // FP16_CPU, converts nodes to bfloat16/fp16 on CPUs in order to take + // advantage of oneDNN performance improvements with bfloat16/fp16. + explicit AutoMixedPrecision( + AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA) + : mode_(mode) {} + + ~AutoMixedPrecision() override {} + + string name() const override { + switch (mode_) { + case AutoMixedPrecisionMode::CUDA: + return "auto_mixed_precision"; + case AutoMixedPrecisionMode::BF16: + return "auto_mixed_precision_onednn_bfloat16"; + case AutoMixedPrecisionMode::CPU: + return "auto_mixed_precision_cpu"; + case AutoMixedPrecisionMode::FP16_CPU: + // Note: using different name than GPU for ease of debugging. 
+ return "auto_mixed_precision_onednn_float16"; + default: + LOG(FATAL) << "Invalid value for AutoMixedPrecisionMode: " // Crash Ok + << static_cast(mode_); + } + }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + private: + const AutoMixedPrecisionMode mode_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h new file mode 100644 index 00000000..37f3714c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -0,0 +1,600 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_ + +#include + +#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace grappler { + +// Represents the four lists of ops: the allow list, infer list, deny list, and +// clear list. These lists determine which ops are converted to fp16/bf16 +// (referred to as 'f16' for short) and which ops stay as fp32. +class AutoMixedPrecisionLists { + public: + virtual ~AutoMixedPrecisionLists() {} + + // Returns the set of ops that are considered numerically-safe (for execution + // in f16), performance-critical, and can run in f16. These ops are always + // converted to f16. + virtual gtl::FlatSet AllowList() = 0; + // Returns the set of ops that can run in f16 and are considered numerically- + // safe (for execution in f16), but which may be made unsafe by an upstream + // denylist op. + virtual gtl::FlatSet InferList() = 0; + // Returns the set of ops that are considered numerically-dangerous (i.e., + // unsafe for execution in f16) and whose effects may also be observed in + // downstream nodes (e.g. for f16, in Exp -> Add, the Add is unsafe due to + // the Exp). + virtual gtl::FlatSet DenyList() = 0; + // Returns the set of ops that do not have numerically-significant effects + // (i.e., they are always considered safe for execution in f16 precision), and + // can run in f16. + virtual gtl::FlatSet ClearList() = 0; + + protected: + // Adds or removes ops from list if certain environmental variables are set. + static void UpdateList(const string& list_name, gtl::FlatSet* list) { + CHECK(list_name == "ALLOWLIST" || list_name == "INFERLIST" || // Crash OK. 
+ list_name == "DENYLIST" || list_name == "CLEARLIST" || + // TODO(reedwm): for bkwds compat; remove when no longer necessary: + list_name == "WHITELIST" || list_name == "GRAYLIST" || + list_name == "BLACKLIST"); + string add_env_var = + "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_ADD"; + string remove_env_var = + "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_REMOVE"; + string to_add, to_remove; + TF_CHECK_OK(ReadStringFromEnvVar(add_env_var, "", &to_add)); + TF_CHECK_OK(ReadStringFromEnvVar(remove_env_var, "", &to_remove)); + for (const auto& x : str_util::Split(to_add, ",")) { + list->insert(x); + } + for (const auto& x : str_util::Split(to_remove, ",")) { + list->erase(x); + } + } + + // Subclasses should include these on the ClearList. + static void AddTensorListOps(gtl::FlatSet* list) { + // Note: if a data structure op (such as TensorListPopBack) is added here, + // IsTensorListReaderOp or IsTensorListWriterOp may need to be modified + // LINT.IfChange + constexpr const char* tensor_list_ops[] = { + "TensorListConcat", "TensorListConcatLists", + "TensorListConcatV2", "TensorListGather", + "TensorListGetItem", "TensorListPopBack", + "TensorListPushBack", "TensorListPushBackBatch", + "TensorListFromTensor", "TensorListScatter", + "TensorListScatterV2", "TensorListScatterIntoExistingList", + "TensorListSetItem", "TensorListSplit", + "TensorListStack"}; + // LINT.ThenChange(//tensorflow/core/grappler/optimizers/auto_mixed_precision.cc) + for (auto op : tensor_list_ops) { + list->insert(op); + } + } +}; + +class AutoMixedPrecisionListsFp16 : public AutoMixedPrecisionLists { + private: + static bool IsPseudoFastMath() { + string optimization_level; + TF_CHECK_OK( + ReadStringFromEnvVar("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "", + &optimization_level)); + optimization_level = absl::AsciiStrToUpper(optimization_level); + return optimization_level == "TENSOR_CORES_ONLY"; + } + + public: + AutoMixedPrecisionListsFp16( + int cuda_version, int cudnn_version, + AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA) + : cuda_version_(cuda_version), cudnn_version_(cudnn_version) { + if (mode == AutoMixedPrecisionMode::CUDA || + mode == AutoMixedPrecisionMode::CPU) { + // Note: this is not a typo here. use_cuda_ is set to true for the CPU + // intentionally to make CPU and GPU have the same fp16 ops. 
+ use_cuda_ = true; + use_onednn_ = false; + } else if (mode == AutoMixedPrecisionMode::FP16_CPU) { + use_onednn_ = true; + use_cuda_ = false; + } + } + + gtl::FlatSet AllowList() override { + auto list = gtl::FlatSet{ + "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Einsum", + "MatMul", + }; + if (use_cuda_) { + list.insert("BlockLSTM"); + list.insert("BlockLSTMV2"); + list.insert("BlockLSTMGrad"); + list.insert("BlockLSTMGradV2"); + list.insert("CudnnRNN"); + list.insert("CudnnRNNBackprop"); + list.insert("CudnnRNNBackpropV2"); + list.insert("CudnnRNNBackpropV3"); + list.insert("CudnnRNNV2"); + list.insert("CudnnRNNV3"); + list.insert("FusedConv2DBiasActivation"); + list.insert("FusedSparseConvGpuV2"); + list.insert("GRUBlockCell"); + list.insert("GRUBlockCellGrad"); + list.insert("LSTMBlockCell"); + list.insert("LSTMBlockCellGrad"); + list.insert("Mha"); + list.insert("MhaV2"); + list.insert("Tmlp"); + list.insert("TmlpV2"); + list.insert("TmlpV3"); + list.insert("Pmlp"); + list.insert("FastUnsortedSegmentMax"); + list.insert("VoxelMax"); + } +#if TENSORFLOW_USE_ROCM + if (true) { +#else + if ((use_cuda_ && cuda_version_ >= 9010) || use_onednn_) { + // Fp16 BatchMatMul is slow before CUDA 9.1. +#endif + list.insert("BatchMatMul"); + list.insert("BatchMatMulV2"); + } + if ((use_cuda_ && cudnn_version_ >= 7602) || use_onednn_) { + // Fp16 3D conv is slow before CUDNN 7.6.2. + list.insert("Conv3D"); + list.insert("Conv3DBackpropFilter"); + list.insert("Conv3DBackpropFilterV2"); + list.insert("Conv3DBackpropInput"); + list.insert("Conv3DBackpropInputV2"); + } + if ((use_cuda_ && cudnn_version_ >= 8000) || use_onednn_) { + list.insert("DepthwiseConv2dNative"); + list.insert("DepthwiseConv2dNativeBackpropFilter"); + list.insert("DepthwiseConv2dNativeBackpropInput"); + } + UpdateList("ALLOWLIST", &list); + // For backwards compatibility, keeping the original env variable here. + // TODO(reedwm): This should be removed if we don't have active users. + UpdateList("WHITELIST", &list); + + return list; + } + + gtl::FlatSet InferList() override { + if (IsPseudoFastMath() && use_cuda_) { + return gtl::FlatSet{}; + } + + auto list = gtl::FlatSet{ + "Add", + "AddN", + "AddV2", + "AvgPool", + "AvgPool3D", + "AvgPool3DGrad", + "AvgPoolGrad", + "BiasAdd", + "BiasAddGrad", + "BiasAddV1", + "Elu", + "EluGrad", + "Erf", + "Erfc", + "FloorDiv", + "FusedBatchNormV2", + "FusedBatchNormGradV2", + "FusedBatchNormV3", + "FusedBatchNormGradV3", + "_FusedBatchNormEx", + "Inv", + "LeakyRelu", + "LeakyReluGrad", + "Log", + "Log1p", + "LogSoftmax", + "Mul", + "Prod", + "RealDiv", + "Reciprocal", + "Selu", + "SeluGrad", + "Sigmoid", + "SigmoidGrad", + "Softmax", + "Softplus", + "SoftplusGrad", + "Softsign", + "SoftsignGrad", + "Sqrt", + "Sub", + "Tanh", + "TanhGrad", + }; + if (use_onednn_) { + list.insert("Rsqrt"); + list.insert("Square"); + list.insert("SquaredDifference"); + } + UpdateList("INFERLIST", &list); + // For backwards compatibility, keeping the original env variable here. + // TODO(reedwm): This should be removed if we don't have active users. + UpdateList("GRAYLIST", &list); + return list; + } + + gtl::FlatSet DenyList() override { + if (IsPseudoFastMath() && use_cuda_) { + return gtl::FlatSet{}; + } + + auto list = gtl::FlatSet{ + "Exp", + "Expm1", + "L2Loss", + "Mean", + "Pow", + "SaveV2", + "SoftmaxCrossEntropyWithLogits", + "SparseSoftmaxCrossEntropyWithLogits", + "Sum", + }; + UpdateList("DENYLIST", &list); + // For backwards compatibility, keeping the original env variable here. 
+ // TODO(reedwm): This should be removed if we don't have active users. + UpdateList("BLACKLIST", &list); + return list; + } + + gtl::FlatSet ClearList() override { + if (IsPseudoFastMath() && use_cuda_) { + return gtl::FlatSet{}; + } + + auto list = gtl::FlatSet{ + "Abs", + "ArgMax", + "ArgMin", + "BatchToSpace", + "BatchToSpaceND", + "BroadcastTo", + "Ceil", + "CheckNumerics", + "ClipByValue", + "Concat", + "ConcatV2", + "DepthToSpace", + "DynamicPartition", + "DynamicStitch", + "Enter", + "EnsureShape", + "Equal", + "Exit", + "ExpandDims", + "Fill", + "Floor", + "Gather", + "GatherNd", + "GatherV2", + "Greater", + "GreaterEqual", + "Identity", + "IdentityN", + "IsFinite", + "IsInf", + "IsNan", + "Less", + "LessEqual", + "Max", + "MaxPool", + "MaxPool3D", + "MaxPool3DGrad", + "MaxPool3DGradGrad", + "MaxPoolGrad", + "MaxPoolGradGrad", + "MaxPoolGradGradV2", + "MaxPoolGradV2", + "MaxPoolV2", + "Maximum", + "Merge", + "Min", + "Minimum", + "MirrorPad", + "MirrorPadGrad", + "Neg", + "NextIteration", + "NotEqual", + "OneHot", + "OnesLike", + "Pack", + "Pad", + "PadV2", + "PreventGradient", + "Rank", + "Relu", + "Relu6", + "Relu6Grad", + "ReluGrad", + "Reshape", + "ResizeNearestNeighbor", + "ResizeNearestNeighborGrad", + "Reverse", + "ReverseSequence", + "ReverseV2", + "Round", + "Select", + "SelectV2", + "Shape", + "ShapeN", + "Sign", + "Size", + "Slice", + "Snapshot", + "SpaceToBatch", + "SpaceToBatchND", + "SpaceToDepth", + "Split", + "SplitV", + "Squeeze", + "StopGradient", + "StridedSlice", + "StridedSliceGrad", + "Switch", + "Tile", + "TopK", + "TopKV2", + "Transpose", + "Unpack", + "Where", + "ZerosLike", + }; + AddTensorListOps(&list); + UpdateList("CLEARLIST", &list); + return list; + } + + private: + int cuda_version_; + int cudnn_version_; + bool use_cuda_; + bool use_onednn_; +}; + +// TODO(reedwm): Remove this alias. Some Google-internal code still uses the +// AutoMixedPrecisionListsCuda name. +using AutoMixedPrecisionListsCuda = AutoMixedPrecisionListsFp16; + +class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { + public: + AutoMixedPrecisionListsMkl() {} + + // Only ops which are supported by MKL in bfloat16 should be added to the + // allow list, infer list, or clear list. + gtl::FlatSet AllowList() override { + auto list = gtl::FlatSet{"Conv2D", + "Conv2DBackpropFilter", + "Conv2DBackpropInput", + "Conv3D", + "Conv3DBackpropFilterV2", + "Conv3DBackpropInputV2", + "DepthwiseConv2dNative", + "DepthwiseConv2dNativeBackpropFilter", + "DepthwiseConv2dNativeBackpropInput", + "MatMul", + "FusedPadConv2D", + "BatchMatMul", + "BatchMatMulV2", + "Einsum"}; + + UpdateList("ALLOWLIST", &list); + // For backwards compatibility, keeping the original env variable here. + // TODO(reedwm): This should be removed if we don't have active users. 
+ UpdateList("WHITELIST", &list); + return list; + } + + gtl::FlatSet InferList() override { + auto list = gtl::FlatSet{"Add", + "AddN", + "AddV2", + "AvgPool", + "AvgPool3D", + "AvgPool3DGrad", + "AvgPoolGrad", + "BiasAdd", + "BiasAddGrad", + "BiasAddV1", + "Erf", + "Erfc", + "FusedBatchNormV2", + "FusedBatchNormGradV2", + "FusedBatchNormV3", + "FusedBatchNormGradV3", + "LeakyRelu", + "LeakyReluGrad", + "Mul", + "Sub", + "Elu", + "EluGrad", + "FloorDiv", + "_FusedBatchNormEx", + "Inv", + "Log", + "Log1p", + "LogSoftmax", + "Mean", + "Prod", + "RealDiv", + "Reciprocal", + "Rsqrt", + "Selu", + "SeluGrad", + "Sigmoid", + "SigmoidGrad", + "Softmax", + "Softplus", + "SoftplusGrad", + "Softsign", + "SoftsignGrad", + "Sqrt", + "Square", + "SquaredDifference", + "Sum", + "Tanh", + "TanhGrad"}; + UpdateList("INFERLIST", &list); + // For backwards compatibility, keeping the original env variable here. + // TODO(reedwm): This should be removed if we don't have active users. + UpdateList("GRAYLIST", &list); + return list; + } + + gtl::FlatSet DenyList() override { + auto list = gtl::FlatSet{ + "Exp", + "Expm1", + "L2Loss", + "Pow", + "SaveV2", + "SoftmaxCrossEntropyWithLogits", + "SparseSoftmaxCrossEntropyWithLogits", + }; + UpdateList("DENYLIST", &list); + // For backwards compatibility, keeping the original env variable here. + // TODO(reedwm): This should be removed if we don't have active users. + UpdateList("BLACKLIST", &list); + return list; + } + + gtl::FlatSet ClearList() override { + auto list = gtl::FlatSet{ + "Abs", + "ArgMax", + "ArgMin", + "BatchToSpace", + "BatchToSpaceND", + "BroadcastTo", + "Ceil", + "CheckNumerics", + "ClipByValue", + "Concat", + "ConcatV2", + "DepthToSpace", + "DynamicPartition", + "DynamicStitch", + "EnsureShape", + "Enter", + "Equal", + "Exit", + "ExpandDims", + "Fill", + "Floor", + "Gather", + "GatherNd", + "GatherV2", + "Greater", + "GreaterEqual", + "Identity", + "IdentityN", + "IsFinite", + "IsInf", + "IsNan", + "Less", + "LessEqual", + "Max", + "Maximum", + "MaxPool", + "MaxPool3D", + "MaxPool3DGrad", + "MaxPoolGrad", + "MaxPoolGradGrad", + "MaxPoolGradGradV2", + "MaxPoolGradV2", + "MaxPoolV2", + "Merge", + "Min", + "Minimum", + "MirrorPad", + "MirrorPadGrad", + "Neg", + "NextIteration", + "NotEqual", + "OnesLike", + "Pack", + "Pad", + "PadV2", + "PreventGradient", + "Rank", + "Relu", + "Relu6", + "Relu6Grad", + "ReluGrad", + "Reshape", + "ResizeNearestNeighbor", + "ResizeNearestNeighborGrad", + "ResizeBilinear", + "Reverse", + "ReverseSequence", + "ReverseV2", + "Round", + "ScatterNd", + "Select", + "SelectV2", + "Shape", + "ShapeN", + "Sign", + "Slice", + "Snapshot", + "SpaceToBatch", + "SpaceToBatchND", + "SpaceToDepth", + "Split", + "SplitV", + "Squeeze", + "StatelessWhile", + "StopGradient", + "StridedSlice", + "StridedSliceGrad", + "Switch", + "Tile", + "TopK", + "TopKV2", + "Transpose", + "Where", + "While", + "Unpack", + "ZerosLike", + }; + AddTensorListOps(&list); + UpdateList("CLEARLIST", &list); + return list; + } +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_parallel.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_parallel.h new file mode 100644 index 00000000..ae063864 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/auto_parallel.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ + +#include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// Automatically parallelize a graph by splitting in the batch dimension. +class AutoParallel : public GraphOptimizer { + public: + AutoParallel(int num_replicas) : num_replicas_(num_replicas) { + CHECK(num_replicas_ >= 2); + } + ~AutoParallel() override {} + + string name() const override { return "autoparallel"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + private: + GraphDef graph_; + std::map all_nodes_; + std::set apply_gradients_nodes_; + std::set replica_nodes_; + std::set shared_nodes_; + const GrapplerItem* item_; + int num_replicas_; + int num_gpus_; + absl::Status Initialize(const GrapplerItem& item); + NodeDef* AddNodeDivConst(); + NodeDef* AddNodeDiv(const string& name, const string& input_a, + const string& input_b); + NodeDef* AddNodeControl(const string& name, const std::set& deps, + GraphDef* graph); + bool NotSharedNode(const string& name); + void AddSharedNodes(GraphDef* graph); + void AddOneReplica(GraphDef* graph, int number); + void BuildGraph(GraphDef* graph); +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/common_subgraph_elimination.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/common_subgraph_elimination.h new file mode 100644 index 00000000..2ec80e88 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/common_subgraph_elimination.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_COMMON_SUBGRAPH_ELIMINATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_COMMON_SUBGRAPH_ELIMINATION_H_ + +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/hash.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Optimize TF computations by deduping equivalent subgraphs. +class Cluster; +struct GrapplerItem; + +class CommonSubgraphElimination : public GraphOptimizer { + public: + CommonSubgraphElimination() {} + + explicit CommonSubgraphElimination(RewriterConfig::Toggle opt_level) + : opt_level_(opt_level) {} + + ~CommonSubgraphElimination() override {} + + string name() const override { return "common_subgraph_elimination"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + friend class CommonSubgraphEliminationTest; + + // Returns true if it is safe to dedup node from the graph. + bool CanDedup(const NodeDef& node) const; + + // Dedup redundant nodes in the graph. + absl::Status DedupComputations(GraphDef* optimized_graph); + + RewriterConfig::Toggle opt_level_; + + bool fetch_nodes_known_ = false; + std::unordered_set nodes_to_preserve_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_COMMON_SUBGRAPH_ELIMINATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/constant_folding.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/constant_folding.h new file mode 100644 index 00000000..9c58f81e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/constant_folding.h @@ -0,0 +1,360 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
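// Illustrative sketch, not part of the vendored header above: the core idea of
// CommonSubgraphElimination is to collapse nodes that have the same op and the
// same inputs onto one canonical node and rewire every duplicate. ToyNode and
// DedupNodes are invented names; the real pass works on NodeDef/GraphDef,
// hashes attributes and control inputs as well, and respects CanDedup().
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyNode {
  std::string name;
  std::string op;
  std::vector<std::string> inputs;
};

// Maps each duplicate node name to the canonical node that replaces it.
static std::map<std::string, std::string> DedupNodes(
    const std::vector<ToyNode>& nodes) {
  std::map<std::pair<std::string, std::vector<std::string>>, std::string> seen;
  std::map<std::string, std::string> replacements;
  for (const ToyNode& node : nodes) {
    auto key = std::make_pair(node.op, node.inputs);
    auto [it, inserted] = seen.emplace(key, node.name);
    if (!inserted) replacements[node.name] = it->second;
  }
  return replacements;
}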
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +const char kConstantFoldingConst[] = "ConstantFolding"; +const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl"; +extern const int64_t kMaxConstantSize; + +// Constant folding optimization for a graph. +class ConstantFolding : public GraphOptimizer { + public: + // The size limit will only be considered if the newly created node is greater + // than original_size (optional). + static absl::Status CreateNodeDef(const string& name, + const TensorValue& tensor, NodeDef* node, + size_t original_size = 0); + static string AddControlDependency(const string& input_name, GraphDef* graph, + NodeMap* node_map); + + explicit ConstantFolding(DeviceBase* cpu_device, + bool disable_compressed_tensor_optimization = false, + bool fold_quantization_emulation = true); + ConstantFolding(RewriterConfig::Toggle opt_level, DeviceBase* cpu_device, + bool disable_compressed_tensor_optimization = false, + bool fold_quantization_emulation = true); + + ~ConstantFolding() override {} + + string name() const override { return "constant_folding"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + private: + bool ForwardInputs(NodeDef* node, absl::Span inputs_to_forward); + string OptimizedNodeName(const NodeDef& node, absl::string_view suffix) const; + bool OptimizedNodeExists(const NodeDef& node, absl::string_view suffix) const; + + bool IsReallyConstant(const NodeDef& node) const; + + bool GetTensorFromConstNode(const string& node_name_or_input, Tensor* tensor); + + absl::Status MaterializeShapes(const GraphProperties& properties); + + absl::Status MaterializeBroadcastGradientArgs( + const NodeDef& node, const GraphProperties& properties); + absl::Status MaterializeReductionIndices(NodeDef* node, + const GraphProperties& properties); + absl::Status MaterializeConstantValuedNode(NodeDef* node, + const GraphProperties& properties); + absl::Status MaterializeOutputValues(NodeDef* node, + const GraphProperties& properties); + absl::Status MaterializeConstants(const GraphProperties& properties); + + bool IsFoldable(const NodeDef& node, const GraphProperties* properties); + bool IsFoldableUncached(const NodeDef& node, + const GraphProperties* properties) const; + bool MaybeFoldable(const NodeDef& node, + const GraphProperties* properties) const; + + absl::Status EvaluateNode( + const NodeDef& node, const absl::InlinedVector& inputs, + absl::InlinedVector* output) const; + + absl::Status EvaluateOneFoldable(const NodeDef& node, + std::vector* outputs, + bool* result_too_large); + + absl::Status FoldMergeNode(NodeDef* node, GraphDef* output_graph); + absl::Status FoldNode(NodeDef* node, 
GraphDef* output_graph, + bool* result_too_large); + + bool IsOnes(const NodeDef& node) const; + bool IsZeros(const NodeDef& node) const; + bool ReplaceOperationWithBroadcastTo(int input_to_broadcast, + const GraphProperties& properties, + NodeDef* node, GraphDef* graph); + void ReplaceOperationWithIdentity(int input_to_forward, + const GraphProperties& properties, + NodeDef* node, GraphDef* graph); + void ReplaceOperationWithSnapshot(int input_to_forward, + const GraphProperties& properties, + NodeDef* node, GraphDef* graph); + void ReplaceOperationWithNoOp(NodeDef* node, GraphProperties* properties, + GraphDef* graph); + void ReplaceBinaryOperationWithBroadcastTo(int input_to_broadcast, + const GraphProperties& properties, + NodeDef* node, GraphDef* graph); + void ReplaceSubtractionFromZeroByNegation(NodeDef* node, GraphDef* graph); + absl::Status ReplaceOperationWithConstant(double value, + const GraphProperties& properties, + const TensorShapeProto& shape, + NodeDef* node, GraphDef* graph); + + // Notice: Destroys *value. + absl::Status ReplaceOperationWithConstantTensor(DataType dtype, + TensorProto* value, + NodeDef* node, + GraphDef* graph); + + void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph); + absl::Status FoldGraph(const GraphProperties& properties, GraphDef* output, + absl::flat_hash_set* nodes_to_not_simplify); + + absl::Status IsSimplifiableReshape(const NodeDef& node, + const GraphProperties& properties) const; + absl::Status SimplifyGraph( + GraphDef* optimized_graph, GraphProperties* properties, + absl::flat_hash_set* nodes_to_not_simplify); + absl::Status SimplifyNode(NodeDef* node, GraphDef* optimized_graph, + GraphProperties* properties); + + absl::Status RunOptimizationPass(Cluster* cluster, GrapplerItem* item, + GraphProperties* properties, + GraphDef* optimized_graph); + + // Applies partial constant folding for Concat which is not commutative. + // Returns true if the transformation applied successfully. + bool PartialConcatConstFolding(GraphDef* optimized_graph, + GraphProperties* properties, NodeDef* node); + + // Applies partial constant folding for associative operators AddN and + // AccumulateNV2. Returns true if the transformation applied successfully. + bool PartialAssocOpConstFolding(GraphDef* optimized_graph, + GraphProperties* properties, NodeDef* node); + + // Applies partial constant propagation through IdentityN operator. + // Returns true if the transformation applied successfully. + bool PartialConstPropThroughIdentityN(NodeDef* node); + + struct ConstantPushDownContext { + NodeDef* op_child; + NodeDef* const_child; + bool left_child_is_const; + bool right_child_is_const; + NodeDef* left_leaf; + NodeDef* right_leaf; + bool left_leaf_is_const; + bool right_leaf_is_const; + + // Shape & type information. + const std::vector* parent_input_props; + const std::vector* op_child_input_props; + }; + + // Populates ctx with pointers to the nodes in expression tree for which + // constant pushdown optimization is being considered, corresponding to one of + // the following configurations: + // + // parent parent + // / \ / \ + // op_child const_child const_child op_child + // / \ / \ + // left_leaf right_leaf left_leaf right_leaf + // + // Returns true if the expression is possible amenable for optimization. + // Returns false if must_have_properties is true and input properties for + // parent and op_child are not known. 
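// Illustrative sketch, not part of the vendored header above: a toy version of
// the constant push-down rearrangement documented in the comment above. For an
// associative op, (x + c1) + c2 is rebuilt as x + (c1 + c2) so the constants
// end up adjacent and can later be folded into a single constant. Expr and
// PushDownConstant are invented names; the real pass operates on NodeDef trees
// and also handles '-', '*' and '/' with the appropriate sign and reciprocal
// bookkeeping.
#include <memory>
#include <optional>
#include <utility>

struct Expr {
  std::optional<double> constant;  // engaged when this node is a literal
  char op = 0;                     // '+' is the only op in this toy example
  std::unique_ptr<Expr> lhs, rhs;
};

// Rewrites parent = ((x + c1) + c2) into x + (c1 + c2); returns true if the
// pattern matched and the tree was modified.
static bool PushDownConstant(Expr* parent) {
  if (parent->op != '+' || !parent->rhs || !parent->rhs->constant) return false;
  Expr* op_child = parent->lhs.get();
  if (op_child == nullptr || op_child->op != '+' || !op_child->rhs ||
      !op_child->rhs->constant) {
    return false;
  }
  const double merged = *op_child->rhs->constant + *parent->rhs->constant;
  std::unique_ptr<Expr> x = std::move(op_child->lhs);  // detach the variable leaf
  parent->lhs = std::move(x);      // releases the old op_child subtree (and c1)
  parent->rhs->constant = merged;  // the former c2 node now holds c1 + c2
  return true;
}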
+ bool PrepareConstantPushDown(const NodeDef& parent, + const GraphProperties& properties, + bool must_have_properties, + ConstantPushDownContext* ctx) const; + + // Pushes down constants on '+', '-', '*', and '/' operators if applicable. + // Returns true if the transformation applied successfully. + bool ConstantPushDown(GraphProperties* properties, GraphDef* optimized_graph, + NodeDef* node); + + // Pushes down constants on '+' and 'BiasAdd' operators if applicable. + // Returns true if the graph was modified. + bool ConstantPushDownBiasAdd(GraphProperties* properties, + GraphDef* optimized_graph, NodeDef* node); + + // Aggregate constants present around a conv operator. Returns true if the + // transformation was applied successfully. + bool MulConvPushDown(GraphDef* optimized_graph, NodeDef* node, + const GraphProperties& properties); + + // Strength reduces floating point division by a constant Div(x, const) to + // multiplication by the reciprocal Mul(x, Reciprocal(const)). + bool ReduceDivToReciprocalMul(GraphDef* optimized_graph, NodeDef* node); + + // Simplifies arithmetic operations with ones or zeros. Returns the status, + // and updates the success input argument that denotes if any simplification + // was applied. + absl::Status SimplifyArithmeticOperations(const GraphProperties& properties, + bool use_shape_info, + GraphDef* optimized_graph, + NodeDef* node); + + // Simplifies a Reshape operation to an Identity operation if applicable. + bool SimplifyReshape(const GraphProperties& properties, bool use_shape_info, + NodeDef* node); + + // Returns true iff the node is a reduction and its reduction indices are + // constant. Sets *indices_is_empty to true if the set of dimensions to reduce + // along is empty (this happens often in the gradient graphs). + bool IsReductionWithConstantIndices(const NodeDef& node, + bool* indices_is_empty) const; + // Returns true if theres a possibility that a Reduce node could be simplified + // to an Identity/Reshape. + bool IsReductionCandidateForSimplification( + const NodeDef& node, const GraphProperties& properties, + TensorShapeProto* input_tensor_shape, + TensorShapeProto* output_tensor_shape, bool* is_single_element_op) const; + // Returns true iff this reduction can be reduced to an identity (i.e if the + // input dimensions to reduce along are all of size 1 and keep_dims is true). + bool IsReductionSimplifiableToIdentity( + const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims, + const absl::InlinedVector& reduction_indices_vector) + const; + // Changes a reduction into an Identity op, returning true on success. + bool ReplaceReductionWithIdentity(NodeDef* node) const; + + // Simplifies a Reduction operation to an Identity/Reshape operation if + // applicable. + bool SimplifyReduction(GraphDef* optimized_graph, + const GraphProperties& properties, NodeDef* node); + + // Switch(x, x) will always feed false to its false branch and true to + // its true branch. By rewriting the graph a bit, we can propagate these + // constants down the two output branches, and just use control dependencies + // to trigger the selected one at runtime. For example, + // + // +------+ + // x-->|Switch|-->a (in practice there may be multiple consumers of each + // x-->| |-->b output branch.) 
+ // +------+ + // + // Is rewritten as + // + // +------+ + // x-->|Switch|-->Identity--^>Const(false)-->a + // x-->| |-->Identity--^>Const(true)-->b + // +------+ + bool SimplifySwitch(GraphDef* optimized_graph, NodeDef* node); + + // Moves constants past Enter node if applicable. + bool MoveConstantsPastEnter(GraphDef* optimized_graph, NodeDef* node); + + // Simplifies Pack operation if applicable. + bool SimplifyPack(GraphDef* optimized_graph, NodeDef* node); + + // Simplifies a Squeeze operation to an Identity operation if applicable. + void SimplifySqueeze(const GraphProperties& properties, bool use_shape_info, + GraphDef* optimized_graph, NodeDef* node); + + // Simplifies a Pad operation to an Identity operation if applicable. + absl::Status SimplifyPad(const GraphProperties& properties, + bool use_shape_info, GraphDef* optimized_graph, + NodeDef* node); + + // Simplifies a Tile operation to an Identity operation if applicable. + absl::Status SimplifyTile(const GraphProperties& properties, + bool use_shape_info, GraphDef* optimized_graph, + NodeDef* node); + + // Simplifies a StridedSlice operation to an Identity operation if applicable. + absl::Status SimplifyStridedSlice(const GraphProperties& properties, + bool use_shape_info, + GraphDef* optimized_graph, NodeDef* node); + + // Simplifies a Slice operation to an Identity operation if applicable. + absl::Status SimplifySlice(const GraphProperties& properties, + bool use_shape_info, GraphDef* optimized_graph, + NodeDef* node); + + // Simplify a Case operation where the output_idx is known. + bool SimplifyCase(GraphDef* optimized_graph, NodeDef* node); + + // Simplify a Select operation where the predicates are all true or all false. + bool SimplifySelect(const GraphProperties& properties, + GraphDef* optimized_graph, NodeDef* node); + + // Replaces variable updates that are effectively no-ops with NoOp nodes. + void RemoveRedundantVariableUpdates(GraphProperties* properties, + GraphDef* optimized_graph, NodeDef* node); + + // Removes Reverse op over dimensions with size 1. + absl::Status RemoveReverse(const GraphProperties& properties, + bool use_shape_info, GraphDef* optimized_graph, + NodeDef* node); + + // Removes RandomShuffle op if it is scalar or first dimension is of size 1. + void RemoveRandomShuffle(const GraphProperties& properties, + bool use_shape_info, GraphDef* optimized_graph, + NodeDef* node); + + // Removes Shuffle or Transpose op over dimensions of size 1. + absl::Status RemoveShuffleOrTranspose(const GraphProperties& properties, + bool use_shape_info, + GraphDef* optimized_graph, + NodeDef* node); + + // Removes Split or SplitV node if possible. + void RemoveSplitOrSplitV(const GraphProperties& properties, + GraphDef* optimized_graph, NodeDef* node); + + bool GetConcatAxis(const NodeDef& node, int* axis); + bool MergeConcat(bool use_shape_info, GraphProperties* properties, + GraphDef* optimized_graph, NodeDef* node); + + absl::Status AddQuantizedMatMulMinMaxOutConstNodes(NodeDef* node, + GraphDef* optimized_graph); + + // Points to an externally provided device or to owned_device_; + RewriterConfig::Toggle opt_level_; + DeviceBase* cpu_device_; + std::unique_ptr owned_device_; + + std::unique_ptr resource_mgr_; + GraphDef* graph_; + std::unique_ptr node_map_; + std::unordered_set nodes_to_preserve_; + // TODO(rmlarsen): Could these be keyed on absl::string_view? 
+ absl::flat_hash_set nodes_allowlist_; + absl::flat_hash_set feed_nodes_; + absl::flat_hash_map maybe_foldable_nodes_; + bool has_fetch_; + bool graph_modified_; + bool graph_contains_assign_or_inplace_op_; + bool disable_compressed_tensor_optimization_; + bool fold_quantization_emulation_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h new file mode 100644 index 00000000..beb6bd09 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// A custom optimizer that can be registered. +class CustomGraphOptimizer : public GraphOptimizer { + public: + virtual ~CustomGraphOptimizer() {} + virtual absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config = + nullptr) = 0; + // Populates ConfigProto on which the Session is run prior to running Init. + absl::Status InitWithConfig( + const ConfigProto& config_proto, + const tensorflow::RewriterConfig_CustomGraphOptimizer* config = nullptr) { + config_proto_ = config_proto; + return this->Init(config); + } + + ConfigProto config_proto_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h new file mode 100644 index 00000000..67dff162 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h @@ -0,0 +1,116 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// Contains plugin's configurations for each Grappler optimizer (on/off). +// See tensorflow/core/protobuf/rewriter_config.proto for optimizer description. +struct ConfigList { + ConfigList() {} + ConfigList(bool disable_model_pruning, + std::unordered_map config) + : disable_model_pruning(disable_model_pruning), + toggle_config(std::move(config)) {} + + bool operator==(const ConfigList& other) const { + return (disable_model_pruning == other.disable_model_pruning) && + (toggle_config == other.toggle_config); + } + bool disable_model_pruning; // Don't remove unnecessary ops from the graph. + std::unordered_map toggle_config; +}; + +class CustomGraphOptimizerRegistry { + public: + static std::unique_ptr CreateByNameOrNull( + const string& name); + + static std::vector GetRegisteredOptimizers(); + + typedef std::function Creator; + // Register graph optimizer which can be called during program initialization. + // This class is not thread-safe. + static void RegisterOptimizerOrDie(const Creator& optimizer_creator, + const string& name); +}; + +class CustomGraphOptimizerRegistrar { + public: + explicit CustomGraphOptimizerRegistrar( + const CustomGraphOptimizerRegistry::Creator& creator, + const string& name) { + CustomGraphOptimizerRegistry::RegisterOptimizerOrDie(creator, name); + } +}; + +#define REGISTER_GRAPH_OPTIMIZER_AS(MyCustomGraphOptimizerClass, name) \ + namespace { \ + static ::tensorflow::grappler::CustomGraphOptimizerRegistrar \ + MyCustomGraphOptimizerClass##_registrar( \ + []() { return new MyCustomGraphOptimizerClass; }, (name)); \ + } // namespace + +#define REGISTER_GRAPH_OPTIMIZER(MyCustomGraphOptimizerClass) \ + REGISTER_GRAPH_OPTIMIZER_AS(MyCustomGraphOptimizerClass, \ + #MyCustomGraphOptimizerClass) + +// A separate registry to register all plug-in CustomGraphOptimizers. +class PluginGraphOptimizerRegistry { + public: + // Constructs a list of plug-in CustomGraphOptimizers from the global map + // `registered_plugin_optimizers`. + static std::vector> CreateOptimizers( + const std::set& device_types); + + typedef std::function Creator; + + // Returns plugin's config. If any of the config is turned off, the returned + // config will be turned off. + static ConfigList GetPluginConfigs(bool use_plugin_optimizers, + const std::set& device_types); + + // Registers plugin graph optimizer which can be called during program + // initialization. Dies if multiple plugins with the same `device_type` are + // registered. This class is not thread-safe. + static void RegisterPluginOptimizerOrDie(const Creator& optimizer_creator, + const std::string& device_type, + ConfigList& configs); + + // Prints plugin's configs if there are some conflicts. + static void PrintPluginConfigsIfConflict( + const std::set& device_types); + + // Returns true when `plugin_config` conflicts with `user_config`: + // - Plugin's `disable_model_pruning` is not equal to `user_config`'s, or + // - At least one of plugin's `toggle_config`s is on when it is set to off in + // `user_config`'s. 
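// Illustrative sketch, not part of the vendored header above: how a client
// would plug into the registry declared here. MyNoOpOptimizer is an invented
// example class; the overridden signatures are taken from the headers in this
// diff, and the registration name is the stringified class name per the
// REGISTER_GRAPH_OPTIMIZER macro above.
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"

namespace tensorflow {
namespace grappler {

class MyNoOpOptimizer : public CustomGraphOptimizer {
 public:
  string name() const override { return "my_noop_optimizer"; }
  bool UsesFunctionLibrary() const override { return false; }
  absl::Status Init(
      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
    return absl::OkStatus();
  }
  absl::Status Optimize(Cluster* cluster, const GrapplerItem& item,
                        GraphDef* output) override {
    *output = item.graph;  // pass the graph through unchanged
    return absl::OkStatus();
  }
};

// Afterwards CreateByNameOrNull("MyNoOpOptimizer") returns a fresh instance.
REGISTER_GRAPH_OPTIMIZER(MyNoOpOptimizer);

}  // end namespace grappler
}  // end namespace tensorflow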
+ static bool IsConfigsConflict(ConfigList& user_config, + ConfigList& plugin_config); +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/auto_shard.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/auto_shard.h new file mode 100644 index 00000000..400ace5f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/auto_shard.h @@ -0,0 +1,64 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_ + +#include +#include + +#include "tensorflow/core/framework/dataset_options.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/mutable_graph_view.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class AutoShard : public TFDataOptimizerBase { + public: + AutoShard() = default; + ~AutoShard() override = default; + + string name() const override { return "tf_auto_shard"; } + + bool UsesFunctionLibrary() const override { return true; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override; + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + int64_t num_workers_; + int64_t num_replicas_; + int64_t index_; + tensorflow::data::AutoShardPolicy auto_shard_policy_; +}; + +// For testing only +namespace internal { +bool IsEligibleRewriteBatchSize(const NodeDef& sink_node, + const MutableGraphView& graph, + std::vector* ineligible_reason); +} + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/autotune_buffer_sizes.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/autotune_buffer_sizes.h new file mode 100644 index 00000000..0860ba50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/autotune_buffer_sizes.h @@ -0,0 +1,75 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
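// Illustrative sketch, not part of the vendored header above: the element-level
// effect that the AutoShard rewrite arranges at graph level under data-based
// sharding. Worker `index` out of `num_workers` keeps every element whose
// position is congruent to `index` modulo `num_workers`. ShardPositions is an
// invented helper for illustration only.
#include <cstdint>
#include <vector>

static std::vector<int64_t> ShardPositions(int64_t num_elements,
                                           int64_t num_workers, int64_t index) {
  std::vector<int64_t> kept;
  for (int64_t i = 0; i < num_elements; ++i) {
    if (i % num_workers == index) kept.push_back(i);
  }
  return kept;
}
// Example: ShardPositions(10, 4, 1) keeps positions {1, 5, 9}.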
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTOTUNE_BUFFER_SIZES_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTOTUNE_BUFFER_SIZES_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization does the following: +// +// 1. Adds `prefetch(AUTOTUNE)` after all asynchronous tf.data transformations +// (e.g. parallel batch, parallel map, parallel interleave, and map + batch) if +// they are not followed by a `prefetch` yet. +// +// 2. If there exists any `prefetch(buffer_size=N)` for `N>=0`, it will replace +// the transformation with autotunable version of `prefetch` which uses N as +// the minimum size of the buffer. +class AutotuneBufferSizes : public TFDataOptimizerBase { + public: + AutotuneBufferSizes() = default; + ~AutotuneBufferSizes() override = default; + + string name() const override { return "autotune_buffer_sizes"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return absl::InvalidArgumentError( + absl::StrCat("Received an invalid value for parameter ", kAutotune, + ": ", autotune)); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTOTUNE_BUFFER_SIZES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/batch_parallelization.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/batch_parallelization.h new file mode 100644 index 00000000..2e77dea0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/batch_parallelization.h @@ -0,0 +1,65 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
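// Illustrative sketch, not part of the vendored header above: how the
// "autotune" parameter read by Init() is typically supplied through the
// rewriter config proto. MakeAutotuneConfig is an invented helper; the
// accessors are the standard generated ones for
// RewriterConfig.CustomGraphOptimizer. Since Init() calls
// parameter_map().at(kAutotune), a config that is passed in is expected to
// carry the "autotune" key with the string "true" or "false".
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/protobuf/rewriter_config.pb.h"

tensorflow::RewriterConfig_CustomGraphOptimizer MakeAutotuneConfig(bool on) {
  tensorflow::RewriterConfig_CustomGraphOptimizer config;
  config.set_name("autotune_buffer_sizes");
  (*config.mutable_parameter_map())["autotune"].set_s(on ? "true" : "false");
  return config;
}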
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_BATCH_PARALLELIZATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_BATCH_PARALLELIZATION_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization parallelizes BatchDataset. +class BatchParallelization : public TFDataOptimizerBase { + public: + BatchParallelization() = default; + ~BatchParallelization() override = default; + + string name() const override { return "batch_parallelization"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_BATCH_PARALLELIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_intra_op_parallelism.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_intra_op_parallelism.h new file mode 100644 index 00000000..977b0c5d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_intra_op_parallelism.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_INTRA_OP_PARALLELISM_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_INTRA_OP_PARALLELISM_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization sets intra-op parallelism to be 1. 
+class DisableIntraOpParallelism : public TFDataOptimizerBase { + public: + DisableIntraOpParallelism() = default; + ~DisableIntraOpParallelism() override = default; + + string name() const override { return "disable_intra_op_parallelism"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_INTRA_OP_PARALLELISM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_prefetch_legacy_autotune.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_prefetch_legacy_autotune.h new file mode 100644 index 00000000..3aded258 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/disable_prefetch_legacy_autotune.h @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_PREFETCH_LEGACY_AUTOTUNE_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_PREFETCH_LEGACY_AUTOTUNE_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization disables the lagacy autotune option for PrefetchDataset. 
+class DisablePrefetchLegacyAutotune : public TFDataOptimizerBase { + public: + DisablePrefetchLegacyAutotune() = default; + ~DisablePrefetchLegacyAutotune() override = default; + + string name() const override { return "disable_prefetch_legacy_autotune"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_DISABLE_PREFETCH_LEGACY_AUTOTUNE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/enable_gradient_descent.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/enable_gradient_descent.h new file mode 100644 index 00000000..35c333c0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/enable_gradient_descent.h @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_ENABLE_GRADIENT_DESCENT_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_ENABLE_GRADIENT_DESCENT_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization enables Gradient Descent Optimization in `ModelDataset`. 
+class EnableGradientDescent : public TFDataOptimizerBase { + public: + EnableGradientDescent() = default; + ~EnableGradientDescent() override = default; + + string name() const override { return "enable_gradient_descent"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_ENABLE_GRADIENT_DESCENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_fusion.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_fusion.h new file mode 100644 index 00000000..757f7557 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_fusion.h @@ -0,0 +1,48 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization fuses filter transformations. +class FilterFusion : public TFDataOptimizerBase { + public: + FilterFusion() = default; + ~FilterFusion() override = default; + + string name() const override { return "filter_fusion"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_parallelization.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_parallelization.h new file mode 100644 index 00000000..63f75907 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/filter_parallelization.h @@ -0,0 +1,65 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_PARALLELIZATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_PARALLELIZATION_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization parallelizes FilterDataset when function is stateless. +class FilterParallelization : public TFDataOptimizerBase { + public: + FilterParallelization() = default; + ~FilterParallelization() override = default; + + string name() const override { return "filter_parallelization"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_PARALLELIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/function_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/function_utils.h new file mode 100644 index 00000000..06034636 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/function_utils.h @@ -0,0 +1,132 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_UTILS_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/grappler/mutable_graph_view.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace grappler { +namespace function_utils { +// This namespace contains utility functions for querying and modifying +// FunctionDefs. + +// Describes a FunctionDef input tensor. In FunctionDefs, input tensor strings +// have the format node_name:node_output:position (if they derive from nodes), +// or input_name (if they derive from an argument). +struct FunctionDefTensorDesc { + FunctionDefTensorDesc() = default; + + FunctionDefTensorDesc(const string& node_name, const string& output, + int position); + + // Parses node_name:node_output:position string into its components. + explicit FunctionDefTensorDesc(const string& input); + + // TODO(rachelim): Add provisions to deal with special formats, like how + // GrapplerFunctionItem expands node output range if position is not defined + string full_str; + string node_name; + string node_output; + int position = -1; +}; + +// Replaces all references to `from` tensor in func's nodes' inputs and retvals +// to `to` tensor. This is similar to `MutableGraphView::ReplaceInputs`. +void ReplaceReferences(const string& from, const string& to, FunctionDef* func); + +// Adds a function output to the function def, ensuring that the output key +// is unique, and maps to output_tensor_name in the ret dict. +void AddFunctionOutputWithUniqueName(absl::string_view prefix, + absl::string_view output_tensor_name, + FunctionDef* fdef, DataType dtype); + +// Adds an input to a FunctionDef. +OpDef_ArgDef* AddFunctionInput(const string& name, FunctionDef* fdef, + DataType dtype); + +// Adds a node to a FunctionDef. +NodeDef* AddNode(absl::string_view name, absl::string_view op, + const std::vector& inputs, + const std::vector>& attributes, + FunctionDef* fd); + +// Checks whether the function contains a node with the given name. +bool ContainsFunctionNodeWithName(absl::string_view name, + const FunctionDef& function); + +// Checks whether the function contains a node with the given op. +bool ContainsFunctionNodeWithOp(absl::string_view op, + const FunctionDef& function); + +// Checks whether the function contains an output with the given name. +bool ContainsFunctionOutputWithName(absl::string_view name, + const FunctionDef& function); + +// Returns the index of the function input with the given name or -1 if the +// function node does not exist. +int FindFunctionInputWithName(absl::string_view name, + const FunctionDef& function); + +// Returns the index of the function output with the given name or -1 if the +// function node does not exist. +int FindFunctionOutputWithName(absl::string_view name, + const FunctionDef& function); + +// Returns the index of the function node with the given name or -1 if the +// function node does not exist. 
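// Illustrative sketch, not part of the vendored header above: parsing the
// "node_name:node_output:position" convention that FunctionDefTensorDesc
// describes near the top of this header. ParsedTensorDesc and
// ParseTensorString are invented names; the real struct also keeps the full
// string and is more defensive than std::stoi about malformed positions.
#include <string>

struct ParsedTensorDesc {
  std::string node_name;
  std::string node_output;
  int position = -1;  // -1 when the string is a bare function argument
};

static ParsedTensorDesc ParseTensorString(const std::string& input) {
  ParsedTensorDesc desc;
  const size_t first = input.find(':');
  if (first == std::string::npos) {
    desc.node_name = input;  // plain argument reference, e.g. "x"
    return desc;
  }
  desc.node_name = input.substr(0, first);
  const size_t second = input.find(':', first + 1);
  if (second == std::string::npos) {
    desc.node_output = input.substr(first + 1);
  } else {
    desc.node_output = input.substr(first + 1, second - first - 1);
    desc.position = std::stoi(input.substr(second + 1));
  }
  return desc;
}
// Example: "map_fn:output:0" parses to node_name="map_fn",
// node_output="output", position=0.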
+int FindFunctionNodeWithName(absl::string_view name,
+                             const FunctionDef& function);
+
+// Returns the index of the function node with the given op, or -1 if the
+// function node does not exist.
+int FindFunctionNodeWithOp(absl::string_view op, const FunctionDef& function);
+
+// Sets the function node name using `prefix` as a prefix while guaranteeing
+// the name is unique across the function's nodes.
+void SetUniqueFunctionNodeName(absl::string_view prefix, FunctionDef* function,
+                               NodeDef* node);
+
+// Checks if the function is stateful by checking the function graph for
+// stateful ops. Because the "If" and "While" ops are conservatively marked as
+// stateful, the check recurses into their graph to determine whether they are
+// actually stateful. The `skip_assert` argument determines whether the "Assert"
+// op should be treated as stateful or not.
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def,
+                        bool skip_assert = false);
+
+// Checks if the node is stateful. Because the "If" or "While" ops are
+// conservatively marked as stateful, the check recurses into their graph to
+// determine whether they are actually stateful. The `skip_assert` argument
+// determines whether the "Assert" op should be treated as stateful or not.
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert = false);
+
+}  // end namespace function_utils
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_UTILS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/fusion_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/fusion_utils.h
new file mode 100644
index 00000000..d0b7ed7c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/fusion_utils.h
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUSION_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUSION_UTILS_H_
+
+#include <functional>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace fusion_utils {
+
+// These functions are invoked with the first and second function signatures
+// and should set the signature of the fused second_function.
+using SetFunctionSignatureFn = std::function<void(
+    const OpDef& first_signature, const OpDef& second_signature,
+    OpDef* fused_signature)>;
+
+using StringCollection = absl::InlinedVector<string, 2>;
+
+// These functions are invoked with nodes from the second function that were
+// previously taking arguments as input. The `arg_num` tells which
+// function argument the node was using as an input, e.g.:
+//   node(arg_1, other_node, arg_4)
+// would be called on the first and third input with arg_num equal to 1 and 4.
+// It should set up inputs based on the first function's inputs or outputs or
+// the second function's inputs.
+using SetInputFn =
+    std::function<string(const StringCollection& first_inputs,
+                         const StringCollection& second_inputs,
+                         const StringCollection& first_outputs, int arg_num)>;
+
+// This function is invoked with the first and second function rets. It is used
+// to set up the returns of the fused function.
+using SetOutputFn =
+    std::function<void(const protobuf::Map<string, string>& parent_ret,
+                       const protobuf::Map<string, string>& second_function_ret,
+                       protobuf::Map<string, string>* fused_ret)>;
+
+using SetNodesFn = std::function<void(
+    const FunctionDef& first_function, const FunctionDef& second_function,
+    FunctionDef* fused_function, FunctionDefLibrary* library)>;
+
+void MergeNodes(const FunctionDef& first_function,
+                const FunctionDef& second_function, FunctionDef* fused_function,
+                FunctionDefLibrary* library);
+
+// Returns true if functions can be composed.
+bool CanCompose(const OpDef& first_signature, const OpDef& second_signature);
+
+void ComposeSignature(const OpDef& first_signature,
+                      const OpDef& second_signature, OpDef* fused_signature);
+
+string ComposeInput(const StringCollection& first_inputs,
+                    const StringCollection& second_inputs,
+                    const StringCollection& first_outputs, int arg_num);
+
+// Sets output to the composition of the first and second function:
+// second_function(first_function(args...)).
+void ComposeOutput(const protobuf::Map<string, string>& first_ret,
+                   const protobuf::Map<string, string>& second_ret,
+                   protobuf::Map<string, string>* fused_ret);
+
+// Sets the input signature to `first_function_signature` and the output
+// signature to `first_function_signature` + `second_function_signature`.
+void CombineSignature(const OpDef& first_signature,
+                      const OpDef& second_signature, OpDef* fused_signature);
+
+// In addition to the first function's returns, returns values from the second
+// function as extra returns, like:
+//   return *first_function(...), *second_function(...)
+void CombineOutput(const protobuf::Map<string, string>& first_ret,
+                   const protobuf::Map<string, string>& second_ret,
+                   protobuf::Map<string, string>* fused_ret);
+
+// Returns true if both signatures have the same number of input and output
+// args.
+bool HasSameSignature(const OpDef& first_signature,
+                      const OpDef& second_signature);
+
+// Checks if both signatures are the same and copies it from `first_signature`.
+void SameSignature(const OpDef& first_signature, const OpDef& second_signature,
+                   OpDef* fused_signature);
+
+// Takes the same input as the first function.
+string SameInput(const StringCollection& first_inputs,
+                 const StringCollection& second_inputs,
+                 const StringCollection& first_outputs, int arg_num);
+
+// Creates a fused function that computes the short-circuit logical AND of the
+// result of the first function and the result of the second function.
+void LazyConjunctionOutput(const protobuf::Map<string, string>& first_ret,
+                           const protobuf::Map<string, string>& second_ret,
+                           protobuf::Map<string, string>* fused_ret);
+
+void LazyConjunctionNodes(const FunctionDef& first_function,
+                          const FunctionDef& second_function,
+                          FunctionDef* fused_function,
+                          FunctionDefLibrary* library);
+
+// Fuses `first_function` with `second_function`, setting `fused_name_prefix`
+// as a name prefix. The nodes from `first_function` are copied unmodified. All
+// of the setup functions are called with a copy of the second function whose
+// names do not conflict with those of the first function. This means that
+// copied nodes from the second function can end up having different names. For
+// an explanation of the setup functions, see the documentation of the function
+// types above.
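+//
+// Illustrative usage sketch (names such as `first_fn`, `second_fn`, and
+// `graph_def` are assumed to come from the surrounding optimizer pass): fusing
+// two map functions into one that computes second_fn(first_fn(x)) can be
+// expressed with the composition helpers declared above, e.g.
+//   FunctionDef* fused = FuseFunctions(
+//       first_fn, second_fn, "fused_map", ComposeSignature, ComposeInput,
+//       ComposeOutput, MergeNodes, graph_def.mutable_library());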
+FunctionDef* FuseFunctions(const FunctionDef& first_function, + const FunctionDef& second_function, + absl::string_view fused_name_prefix, + const SetFunctionSignatureFn& set_signature, + const SetInputFn& set_input, + const SetOutputFn& set_output, + const SetNodesFn& set_nodes, + FunctionDefLibrary* library); + +} // namespace fusion_utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUSION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_test_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_test_utils.h new file mode 100644 index 00000000..2b09eafc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_test_utils.h @@ -0,0 +1,141 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_TEST_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_TEST_UTILS_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { +namespace grappler { +namespace graph_tests_utils { + +// Creates a test NodeDef for BatchDatasetV2. +NodeDef MakeBatchV2Node(absl::string_view name, + absl::string_view input_node_name, + absl::string_view batch_size_node_name, + absl::string_view drop_remainder_node_name, + bool parallel_copy); + +// Creates a test NodeDef for ParallelBatchDataset. +NodeDef MakeParallelBatchNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view batch_size_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view drop_remainder_node_name, + absl::string_view deterministic); + +// Creates a test NodeDef for ShuffleDatasetV2. +NodeDef MakeCacheV2Node(absl::string_view name, + absl::string_view input_node_name, + absl::string_view filename_node_name, + absl::string_view cache_node_name); + +// Creates a test NodeDef for FilterDataset. +NodeDef MakeFilterNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view function_name = "IsZero"); + +// Creates a test NodeDef for MapDataset. +NodeDef MakeMapNode(absl::string_view name, absl::string_view input_node_name, + absl::string_view function_name = "XTimesTwo"); + +// Creates a test NodeDef for MapAndBatchDataset. +NodeDef MakeMapAndBatchNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view batch_size_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view drop_remainder_node_name, + absl::string_view function_name = "XTimesTwo"); + +// Creates a test NodeDef for ParallelInterleaveDatasetV2. 
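+// For example (illustrative sketch; the input and Const node names are
+// hypothetical test fixtures), a test can build the node as:
+//   NodeDef interleave = graph_tests_utils::MakeParallelInterleaveV2Node(
+//       "interleave", /*input_node_name=*/"range",
+//       /*cycle_length_node_name=*/"cycle",
+//       /*block_length_node_name=*/"block",
+//       /*num_parallel_calls_node_name=*/"num_calls",
+//       /*function_name=*/"XTimesTwo", /*sloppy=*/false);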
+NodeDef MakeParallelInterleaveV2Node( + absl::string_view name, absl::string_view input_node_name, + absl::string_view cycle_length_node_name, + absl::string_view block_length_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view function_name, bool sloppy); + +// Creates a test NodeDef for ParallelInterleaveDatasetV4. +NodeDef MakeParallelInterleaveV4Node( + absl::string_view name, absl::string_view input_node_name, + absl::string_view cycle_length_node_name, + absl::string_view block_length_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view function_name, absl::string_view deterministic); + +// Creates a test NodeDef for InterleaveDataset. +NodeDef MakeInterleaveNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view cycle_length_node_name, + absl::string_view block_length_node_name, + absl::string_view function_name, + absl::string_view deterministic); + +// Creates a test NodeDef for ParallelMapDataset. +NodeDef MakeParallelMapNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view function_name, bool sloppy); + +// Creates a test NodeDef for ParallelMapDatasetV2. +NodeDef MakeParallelMapV2Node(absl::string_view name, + absl::string_view input_node_name, + absl::string_view num_parallel_calls_node_name, + absl::string_view function_name, + absl::string_view deterministic, + bool use_unbounded_threadpool); + +// Creates a test NodeDef for ParseExampleDataset. +NodeDef MakeParseExampleNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view num_parallel_calls_node_name, + bool sloppy); + +// Creates a test NodeDef for ShuffleDatasetV2. +NodeDef MakeShuffleV2Node(absl::string_view name, + absl::string_view input_node_name, + absl::string_view buffer_size_node_name, + absl::string_view seed_generator_node_name); + +// Creates a test NodeDef for TakeDataset. +NodeDef MakeTakeNode(absl::string_view name, absl::string_view input_node_name, + absl::string_view count_node_name); + +// Creates a test NodeDef for TensorSliceDataset. +NodeDef MakeTensorSliceNode(absl::string_view name, + absl::string_view tensor_node_name, + bool replicate_on_split); + +// Creates a test NodeDef for SkipDataset. +NodeDef MakeSkipNode(absl::string_view name, absl::string_view input_node_name, + absl::string_view count_node_name); + +// Creates a test NodeDef for ShardDataset. +NodeDef MakeShardNode(absl::string_view name, absl::string_view input_node_name, + absl::string_view num_shards_node_name, + absl::string_view index_node_name); + +// Creates a test NodeDef for PrefetchDataset. +NodeDef MakePrefetchNode(absl::string_view name, + absl::string_view input_node_name, + absl::string_view buffer_size); + +} // namespace graph_tests_utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_utils.h new file mode 100644 index 00000000..70d0c480 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/graph_utils.h @@ -0,0 +1,214 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_utils {
+
+// Returns the index of the first element in collection that fulfills predicate.
+// If no such element exists, returns -1.
+template <typename Predicate, typename Collection>
+int GetFirstElementIndexWithPredicate(const Predicate& predicate,
+                                      const Collection& collection) {
+  unsigned idx = 0;
+  for (auto&& element : collection) {
+    if (predicate(element)) {
+      return idx;
+    }
+    idx++;
+  }
+  return -1;
+}
+
+// Adds a node to the graph.
+NodeDef* AddNode(absl::string_view name, absl::string_view op,
+                 const std::vector<string>& inputs,
+                 const std::vector<std::pair<string, AttrValue>>& attributes,
+                 MutableGraphView* graph);
+
+// Adds a Placeholder node for the given type.
+NodeDef* AddScalarPlaceholder(DataType dtype, MutableGraphView* graph);
+
+// Adds a Const node with the given value to the graph.
+template <typename T>
+NodeDef* AddScalarConstNode(T v, MutableGraphView* graph) {
+  // is_same is an idiomatic hack for making it compile if not instantiated.
+  // Replacing with false will result in a compile-time error.
+  static_assert(!std::is_same<T, T>::value,
+                "Invalid specialization of this method for type T.");
+  return {};
+}
+
+template <>
+NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph);
+template <>
+NodeDef* AddScalarConstNode(double v, MutableGraphView* graph);
+template <>
+NodeDef* AddScalarConstNode(float v, MutableGraphView* graph);
+template <>
+NodeDef* AddScalarConstNode(int v, MutableGraphView* graph);
+template <>
+NodeDef* AddScalarConstNode(int64_t v, MutableGraphView* graph);
+template <>
+NodeDef* AddScalarConstNode(absl::string_view v, MutableGraphView* graph);
+
+// Retrieves the value of a const node. Returns an error
+// if the node is not const, or its value is of a different type.
+template <typename T>
+absl::Status GetScalarConstNodeValue(const NodeDef& node, T* value) {
+  // is_same is an idiomatic hack for making it compile if not instantiated.
+  // Replacing with false will result in a compile-time error.
+  static_assert(!std::is_same<T, T>::value,
+                "Invalid specialization of this method for type T.");
+}
+
+template <>
+absl::Status GetScalarConstNodeValue(const NodeDef& node, int64_t* value);
+template <>
+absl::Status GetScalarConstNodeValue(const NodeDef& node, bool* value);
+
+// Checks whether the two graphs are the same.
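+// For example (illustrative sketch; `def`, `expected`, and `range_node` are
+// hypothetical values from a test), the helpers in this namespace compose as:
+//   MutableGraphView view(&def);
+//   NodeDef* count = AddScalarConstNode<int64_t>(10, &view);
+//   AddNode("take", "TakeDataset", {range_node->name(), count->name()},
+//           /*attributes=*/{}, &view);
+//   EXPECT_TRUE(Compare(def, expected));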
+bool Compare(const GraphDef& g1, const GraphDef& g2); + +// Checks whether the graph contains a node with the given name. +bool ContainsGraphNodeWithName(absl::string_view name, const GraphDef& graph); + +// Checks whether the library contains a function with the given name. +bool ContainsGraphFunctionWithName(absl::string_view name, + const FunctionDefLibrary& library); + +// Checks whether the graph contains a node with the given op. +bool ContainsNodeWithOp(absl::string_view op, const GraphDef& graph); + +// Returns the index of the node with the given name or -1 if the node does +// not exist. +int FindGraphNodeWithName(absl::string_view name, const GraphDef& graph); + +// Returns the index of the function with the given name or -1 if the function +// does not exist. +int FindGraphFunctionWithName(absl::string_view name, + const FunctionDefLibrary& library); + +// Returns the index of the first node with the given op or -1 if no such node +// exists. +int FindGraphNodeWithOp(absl::string_view op, const GraphDef& graph); + +// Gets the 0th input to a node in the graph. +NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph); + +// Gets the ith input to a node in the graph. +NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph, + int64_t i); + +// Gets the attr corresponding to a dataset node's output types, if it exists. +absl::Status GetDatasetOutputTypesAttr(const NodeDef& node, + DataTypeVector* output_types); + +// Returns the list of indices of all nodes with the given op or empty list if +// no such node exists. +std::vector FindAllGraphNodesWithOp(const string& op, + const GraphDef& graph); + +// Sets the node name using `prefix` as a prefix while guaranteeing the name +// is unique across the graph. +void SetUniqueGraphNodeName(absl::string_view prefix, GraphDef* graph, + NodeDef* node); + +// Sets the function name using the `prefix` name as a prefix while guaranteeing +// the name is unique across the function library. +void SetUniqueGraphFunctionName(absl::string_view prefix, + const FunctionDefLibrary* library, + FunctionDef* function); + +// Copies attribute having name `attribute_name` from node `from` to node +// `to_node`. +void CopyAttribute(const string& attribute_name, const NodeDef& from, + NodeDef* to_node); + +// Concatenates list attribute having name `attribute_name` from `first` and +// `second` node, setting it to `to_node`. +void ConcatAttributeList(const string& attribute_name, const NodeDef& first, + const NodeDef& second, NodeDef* to_node); + +// Checks that all nodes in the graphs have unique names, and sets their names +// to be unique if they are not already. This is necessary as Graph does not +// have the provisions to deduplicate names, and name deduplication elsewhere +// in tensorflow happens in other layers (for example, in the Scope class of the +// C++ API). Note that the nodes in the graph are identified by their id, +// and renaming nodes does not mutate any edges. +absl::Status EnsureNodeNamesUnique(Graph* g); + +// Returns the item's fetch node, if there is exactly one. Otherwise, returns an +// error. +absl::Status GetFetchNode(const MutableGraphView& graph, + const GrapplerItem& item, NodeDef** fetch_node); + +// Returns true if `item` is derived from a `FunctionDef`, false otherwise. +// Currently, we determine this heuristically: If we don't have any fetch nodes +// or all fetch nodes are `Retval` ops, then we consider this item as derived +// from a `FunctionDef`. 
+bool IsItemDerivedFromFunctionDef(const GrapplerItem& item, + const MutableGraphView& graph_view); + +// If both input nodes have the "metadata" attribute set, it populates the +// "metadata" attribute for the fused node. +void MaybeSetFusedMetadata(const NodeDef& node1, const NodeDef& node2, + NodeDef* fused_node); + +// Copies the attributes `output_shapes`, `output_types` from node `from` to +// node `to_node` if they exist. The method will return `true` if attributes +// copied successfully, otherwise it will return `false`. +// +// Some tf.data transformations set `Toutput_types` instead of `output_types` +// when the attribute describes type of tensor inputs (e.g. TensorDataset, +// TensorSliceDataset, and PaddedBatchDataset). In this case the method copies +// the attribute `Toutput_types` of node `from` to the attribute `output_types` +// of node `to_node`. +bool CopyShapesAndTypesAttrs(const NodeDef& from, NodeDef* to_node); + +// Checks whether the op has a "sloppy" attribute. +bool HasSloppyAttr(const string& op); + +// Checks whether the op has a "replicate_on_split" attribute. +bool HasReplicateOnSplitAttr(const string& op); + +// Checks whether the op has a "deterministic" attribute. +bool HasDeterministicAttr(const string& op); + +// Sets the `name` as the metadata name of the `node`. It returns an error if +// the `node` already has a metadata name. +absl::Status SetMetadataName(const std::string& name, NodeDef* node); + +} // namespace graph_utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_io_prefetch.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_io_prefetch.h new file mode 100644 index 00000000..444d49e7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_io_prefetch.h @@ -0,0 +1,64 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_IO_PREFETCH_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_IO_PREFETCH_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +class InjectIoPrefetch : public TFDataOptimizerBase { + public: + InjectIoPrefetch() = default; + ~InjectIoPrefetch() override = default; + + std::string name() const override { return "inject_io_prefetch"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override; + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + protected: + bool autotune_ = true; +}; + +class InjectIoPrefetchEligible : public InjectIoPrefetch { + public: + std::string name() const override { return "inject_io_prefetch_eligible"; }; + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_IO_PREFETCH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_prefetch.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_prefetch.h new file mode 100644 index 00000000..f2ffda83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/inject_prefetch.h @@ -0,0 +1,66 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_PREFETCH_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_PREFETCH_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// If autotune is ON and the last transformation in the input pipeline is not +// `prefetch()`, this optimization adds `prefetch(AUTOTUNE)` after it. 
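+//
+// The optimizer is configured through the custom-optimizer parameter map; an
+// illustrative sketch of enabling it with autotuning (where and how the config
+// is constructed is assumed to be handled by the caller):
+//   tensorflow::RewriterConfig_CustomGraphOptimizer config;
+//   config.set_name("inject_prefetch");
+//   (*config.mutable_parameter_map())[kAutotune].set_s("true");
+//   InjectPrefetch optimizer;
+//   TF_CHECK_OK(optimizer.Init(&config));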
+class InjectPrefetch : public TFDataOptimizerBase { + public: + InjectPrefetch() = default; + ~InjectPrefetch() override = default; + + std::string name() const override { return "inject_prefetch"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const std::string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + protected: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_INJECT_PREFETCH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_deterministic.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_deterministic.h new file mode 100644 index 00000000..30659c43 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_deterministic.h @@ -0,0 +1,77 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_DETERMINISTIC_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_DETERMINISTIC_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// Removes sources on nondeterminism from dataset ops. Nondeterminism can occur +// in the follow ways, each which this pass addresses: +// +// 1. The datasets ParallelInterleave, ParallelMap, and MapAndBatch can +// introduce nondeterminism by running a function multiple times in parallel. +// Specifically, if the function can mutate state, it is potentially +// nondeterministic. In such cases, this pass converts such dataset ops to a +// non-parallel version. As a performance optimization, in certain cases this +// pass will instead move nondeterministic ops to a separate non-parallel Map +// op, so that most of the ops can still run in parallel. +// +// 2. Certain datasets, such as Prefetch, can introduce asynchrony by running a +// dataset iterator in a background thread while ops outside the dataset are +// also running. This can introduce nondeterminism if the input pipeline has +// certain stateful ops. Other than Prefetch, datasets with a +// `num_parallel_calls` argument also introduce asynchrony, which includes +// the parallel datasets mentioned in (1) above. 
+// +// This pass modifies nodes to remove asynchrony when there are any datasets +// in the graph with problematic stateful ops. This is done by converting +// parallel ops into non-parallel versions, as in (1), and by removing +// Prefetch nodes. Unlike (1), legacy random ops such as RandomUniform are +// not problematic despite being stateful, as if the op is within a dataset's +// function, ops outside the dataset cannot access the state. Also unlike +// (1), nondeterministic ops are never moved to a separate Map op, since +// doing so would not remove asynchrony. +// +// 3. Nondeterminism occurs if an op has a "deterministic" attribute that is +// false or a "sloppy" attribute that is true. This pass changes such +// attributes to be deterministic. +class MakeDeterministic : public TFDataOptimizerBase { + public: + MakeDeterministic() = default; + ~MakeDeterministic() override = default; + + string name() const override { return "make_deterministic"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_DETERMINISTIC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_sloppy.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_sloppy.h new file mode 100644 index 00000000..b1046809 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/make_sloppy.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class MakeSloppy : public TFDataOptimizerBase { + public: + MakeSloppy() = default; + ~MakeSloppy() override = default; + + string name() const override { return "make_sloppy"; } + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h new file mode 100644 index 00000000..7e7e002b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class MapAndBatchFusion : public TFDataOptimizerBase { + public: + MapAndBatchFusion() = default; + ~MapAndBatchFusion() override = default; + + string name() const override { return "map_and_batch_fusion"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h new file mode 100644 index 00000000..018a8751 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h @@ -0,0 +1,56 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This transformation fuses map and filter operations by moving computation of +// filter predicate to MapDataset, which as a result produces an extra boolean +// component. We filter by the boolean component, then project it away. +// +// In symbols, we transform map(x -> f(x)).filter(f(x) -> p(f(x))) into +// map(x -> f(x), p(f(x))).filter(f(x), p(f(x)) -> p(f(x))).map(f(x), p(f(x)) +// -> f(x)). This is more efficient because the latter filter and map operations +// can be performed short-circuit, so only the first map requires an executor +// invocation. +class MapAndFilterFusion : public TFDataOptimizerBase { + public: + MapAndFilterFusion() = default; + ~MapAndFilterFusion() override = default; + + string name() const override { return "map_and_filter_fusion"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_fusion.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_fusion.h new file mode 100644 index 00000000..2512fc88 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_fusion.h @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization fuses map transformations by merging their map functions. 
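+// For example (conceptual sketch), a pipeline fragment MapDataset(f) ->
+// MapDataset(g) is rewritten into a single MapDataset whose function computes
+// g(f(x)); the merged function is built with the fusion_utils helpers such as
+// fusion_utils::FuseFunctions.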
+class MapFusion : public TFDataOptimizerBase { + public: + MapFusion() = default; + ~MapFusion() override = default; + + string name() const override { return "map_fusion"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_parallelization.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_parallelization.h new file mode 100644 index 00000000..6ed70034 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/map_parallelization.h @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAutotune[] = "autotune"; + +// This optimization parallelizes MapDataset when function is stateless. 
+class MapParallelization : public TFDataOptimizerBase { + public: + MapParallelization() = default; + ~MapParallelization() override = default; + + string name() const override { return "map_parallelization"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return absl::OkStatus(); + + const string& autotune = config->parameter_map().at(kAutotune).s(); + if (autotune == "true") { + autotune_ = true; + } else if (autotune == "false") { + autotune_ = false; + } else { + return errors::InvalidArgument("Received an invalid value for parameter ", + kAutotune, ": ", autotune); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/meta_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/meta_optimizer.h new file mode 100644 index 00000000..e839389d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/meta_optimizer.h @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// This optimizer performs tf.data-specific optimizations by invoking +// other optimizers. 
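+//
+// Illustrative sketch of invoking the meta-optimizer directly (the
+// GrapplerItem `item` and the custom-optimizer `config` are assumed to be
+// prepared by the caller):
+//   TFDataMetaOptimizer meta;
+//   TF_CHECK_OK(meta.Init(&config));
+//   GraphDef optimized_graph;
+//   TF_CHECK_OK(meta.Optimize(/*cluster=*/nullptr, item, &optimized_graph));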
+class TFDataMetaOptimizer : public CustomGraphOptimizer { + public: + TFDataMetaOptimizer() = default; + ~TFDataMetaOptimizer() override = default; + + string name() const override { return "tf_data_meta_optimizer"; }; + + bool UsesFunctionLibrary() const override { return true; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override; + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + private: + absl::flat_hash_map> + enabled_optimizers_; + + // Applies an optimization with the specified name on `item`, and stores + // the result in `item.graph` + absl::Status ApplyOptimization(const string& name, Cluster* cluster, + GrapplerItem* item) const; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/noop_elimination.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/noop_elimination.h new file mode 100644 index 00000000..389b112e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/noop_elimination.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This class eliminates tf.data transformations such as `take(n)` (for n < 0), +// `skip(0)`, `repeat(1)`, or `prefetch(0)`. +class NoOpElimination : public TFDataOptimizerBase { + public: + NoOpElimination() = default; + ~NoOpElimination() override = default; + + string name() const override { return "noop_elimination"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/optimizer_base.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/optimizer_base.h new file mode 100644 index 00000000..7cd16fba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/optimizer_base.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_ + +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// A base class for tf.data optimizers. +class TFDataOptimizerBase : public CustomGraphOptimizer { + public: + struct OptimizationStats { + // Identifies the number of independent graph changes for an optimization. + int64_t num_changes = 0; + }; + + TFDataOptimizerBase() = default; + ~TFDataOptimizerBase() override = default; + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) final; + + virtual absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) = 0; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/parallel_batch.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/parallel_batch.h new file mode 100644 index 00000000..46b5ff9c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/parallel_batch.h @@ -0,0 +1,47 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class ParallelBatch : public TFDataOptimizerBase { + public: + ParallelBatch() = default; + ~ParallelBatch() override = default; + + string name() const override { return "parallel_batch"; } + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_PARALLEL_BATCH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/remove_compression_map.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/remove_compression_map.h new file mode 100644 index 00000000..550436f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/remove_compression_map.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REMOVE_COMPRESSION_MAP_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REMOVE_COMPRESSION_MAP_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class RemoveCompressionMap : public TFDataOptimizerBase { + public: + RemoveCompressionMap() = default; + ~RemoveCompressionMap() override = default; + + string name() const override { return "remove_compression_map"; } + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REMOVE_COMPRESSION_MAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/replicate_on_split.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/replicate_on_split.h new file mode 100644 index 00000000..cffcbd18 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/replicate_on_split.h @@ -0,0 +1,47 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REPLICATE_ON_SPLIT_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REPLICATE_ON_SPLIT_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class ReplicateOnSplit : public TFDataOptimizerBase { + public: + ReplicateOnSplit() = default; + ~ReplicateOnSplit() override = default; + + string name() const override { return "replicate_on_split"; } + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REPLICATE_ON_SPLIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/seq_interleave_prefetch.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/seq_interleave_prefetch.h new file mode 100644 index 00000000..c881d9aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/seq_interleave_prefetch.h @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SEQ_INTERLEAVE_PREFETCH_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SEQ_INTERLEAVE_PREFETCH_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization replaces parallel interleave with sequential interleave and +// adds `prefetch(AUTOTUNE)` after the user defined map function in interleave. +class SeqInterleavePrefetch : public TFDataOptimizerBase { + public: + SeqInterleavePrefetch() = default; + ~SeqInterleavePrefetch() override = default; + + std::string name() const override { return "seq_interleave_prefetch"; }; + + // The SeqInterleavePrefetch optimizer requires access to the function + // library. 
+ bool UsesFunctionLibrary() const override { return true; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + protected: + bool autotune_ = true; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SEQ_INTERLEAVE_PREFETCH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h new file mode 100644 index 00000000..ba30ca63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +class ShuffleAndRepeatFusion : public TFDataOptimizerBase { + public: + ShuffleAndRepeatFusion() = default; + ~ShuffleAndRepeatFusion() override = default; + + string name() const override { return "shuffle_and_repeat_fusion"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/slack.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/slack.h new file mode 100644 index 00000000..af70d314 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/slack.h @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ + +#include "absl/strings/numbers.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/mutable_graph_view.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization sets the slack attr of the terminal PrefetchDataset node in +// an input pipeline. +class Slack : public TFDataOptimizerBase { + public: + Slack() = default; + ~Slack() override = default; + + string name() const override { return "slack"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return errors::InvalidArgument("Config parameter required."); + + const string& slack_period_param = + config->parameter_map().at("slack_period").s(); + if (!absl::SimpleAtoi(slack_period_param, &slack_period_)) { + return errors::InvalidArgument("Invalid `slack_period` parameter: ", + slack_period_param); + } + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + private: + int64_t slack_period_ = -1; + + absl::Status RecursivelyHandleOp(const MutableGraphView& graph, + NodeDef* dataset_node); +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/split_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/split_utils.h new file mode 100644 index 00000000..df4c52b2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/split_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SPLIT_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SPLIT_UTILS_H_ + +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { +namespace grappler { +namespace split_utils { + +// Return value of `SplitFunction`, which is described below. +struct SplitResults { + FunctionDef first_function; + FunctionDef second_function; + std::vector first_function_output_types; +}; + +// Splits a FunctionDef into two FunctionDefs, called `first` and `second`, such +// that calling `function(*args)` is equivalent to calling +// `second(first(*args))`. The set `nodes_in_first_function` specifies nodes +// that are copied to `first`, and the other nodes are copied to `second`. 
Any +// edges from `first` to `second` will be represented by an output of `first` +// and a corresponding input of `second`. The caller must pass +// `nodes_in_first_function` such that there will not be any edges from `second` +// to `first`. +// +// For example, if you have the following function (using Python syntax): +// +// def f(x): +// y = tf.math.add(x, 1., name='add') +// return tf.multiply(y, 2, name='mul') +// +// Calling SplitFunction(f, {'add'}) results in: +// +// def first_function(x): +// return tf.math.add(x, 1., name='add') +// def second_function(y): +// return tf.multiply(y, 2, name='mul') +// +// The `num_captured_inputs` argument controls which arguments of `function` +// will be arguments of `second`. If it is zero, the only arguments of `second` +// are the outputs of `first`. If it is above zero, the last +// `num_caputured_inputs` arguments of `function` will also be arguments of +// `second`. +// +// Splitting functions in certain cases is unimplemented, in which case an +// Unimplemented status will be returned. Grappler passes must gracefully handle +// Unimplemented statuses without returning the error to its caller. +absl::StatusOr SplitFunction( + const FunctionDef& function, + const absl::flat_hash_set& nodes_in_first_function, + int64_t num_captured_inputs, const FunctionLibraryDefinition& library); + +} // namespace split_utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SPLIT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/use_private_thread_pool.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/use_private_thread_pool.h new file mode 100644 index 00000000..b886d36a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/data/use_private_thread_pool.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_USE_PRIVATE_THREAD_POOL_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_USE_PRIVATE_THREAD_POOL_H_ + +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization creates private thread pool for the input pipeline. 
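// Editorial sketch (not part of the vendored headers): the UsePrivateThreadPool
// class declared just below, like every tf.data optimizer in this directory,
// follows the same TFDataOptimizerBase shape, so a hypothetical pass would look
// roughly like the class here. The class name and the match condition are
// illustrative only, and `stats->num_changes` reflects my recollection of the
// OptimizationStats field used to report rewrites.
class HypotheticalTfDataPass : public TFDataOptimizerBase {
 public:
  string name() const override { return "hypothetical_tf_data_pass"; }

  bool UsesFunctionLibrary() const override { return false; }

  absl::Status Init(
      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
    return absl::OkStatus();  // no configuration needed for this sketch
  }

  absl::Status OptimizeAndCollectStats(Cluster* cluster,
                                       const GrapplerItem& item,
                                       GraphDef* output,
                                       OptimizationStats* stats) override {
    *output = item.graph;  // start from a copy of the original graph
    for (NodeDef& node : *output->mutable_node()) {
      if (node.op() != "HypotheticalOp") continue;  // illustrative match condition
      // ... rewrite `node` in place; each successful rewrite would then do:
      //   ++stats->num_changes;
    }
    return absl::OkStatus();
  }
};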
+class UsePrivateThreadPool : public TFDataOptimizerBase { + public: + UsePrivateThreadPool() = default; + ~UsePrivateThreadPool() override = default; + + string name() const override { return "use_private_thread_pool"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + + absl::Status OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_USE_PRIVATE_THREAD_POOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/debug_stripper.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/debug_stripper.h new file mode 100644 index 00000000..c94257f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/debug_stripper.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// DebugStripper strips off debug-related nodes (e.g. +// Assert, CheckNumerics, Print) from the graph. +class DebugStripper : public GraphOptimizer { + public: + DebugStripper() {} + ~DebugStripper() override {} + + string name() const override { return "debug_stripper"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/dependency_optimizer.h new file mode 100644 index 00000000..cc8d7043 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/dependency_optimizer.h @@ -0,0 +1,85 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_ + +#include +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Optimize TF computations by removing control dependencies or re-arranging +// them to shorten the critical path for a model step or enable other +// optimizations, such as removing nodes that are effectively noops. +class DependencyOptimizer : public GraphOptimizer { + public: + DependencyOptimizer() {} + explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {} + ~DependencyOptimizer() override {} + + string name() const override { return "dependency_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + // Returns true if bypassing node does not increase the number of edges or + // number of edges crossing a device boundary. + bool BypassingNodeIsBeneficial( + const NodeDef& node, const std::vector& input_nodes, + const std::vector& output_nodes) const; + int NumEdgesIfBypassed(const NodeDef& node, + const std::vector& output_nodes) const; + // Returns true if node is not an Identity node or if it is an Identity + // that is safe to remove. + bool SafeToRemoveIdentity(const NodeDef& node) const; + // Returns true if it is safe to convert node to NoOp. + bool SafeToConvertToNoOp(const NodeDef& node) const; + // Removes all duplicate control dependencies. + void CleanControlInputs(); + // Builds a map from the &optimized_graph_->node(i) to i. + void BuildNodeToIdx(); + // Tries to optimize the node with the given index, possibly additional + // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by + // inserting them in nodes_to_delete. + void OptimizeNode(int node_idx, SetVector* nodes_to_simplify, + std::set* nodes_to_delete); + // Eliminates redundant control dependencies by computing the transitive + // reduction of the graph. + absl::Status TransitiveReduction(); + // Main driver of dependency optimizations. + absl::Status OptimizeDependencies(); + // Replaces multiple cross-device control edges from the same device with a + // single control edge. If `host_granularity` is true then group control + // edges from all devices on the same host. + void GroupCrossDeviceControlEdges(bool host_granularity); + + bool fetch_nodes_known_; + std::unordered_set nodes_to_preserve_; + std::unique_ptr node_map_; + std::unordered_map node_to_idx_; + GraphDef* optimized_graph_; // Not owned. +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/evaluation_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/evaluation_utils.h new file mode 100644 index 00000000..9ae5cb22 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/evaluation_utils.h @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EVALUATION_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EVALUATION_UTILS_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace Eigen { +class ThreadPoolInterface; +class ThreadPoolWrapper; +} // namespace Eigen + +namespace tensorflow { +namespace grappler { + +class DeviceSimple : public DeviceBase { + public: + DeviceSimple(); + ~DeviceSimple(); + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + + Allocator* GetAllocator(AllocatorAttributes attr) override { + return cpu_allocator(); + } + + const std::string& device_type() const override { return device_type_; } + + private: + DeviceBase::CpuWorkerThreads eigen_worker_threads_; + std::unique_ptr eigen_device_; + const std::string device_type_ = DEVICE_CPU; +}; + +absl::Status EvaluateNode(const NodeDef& node, + const absl::InlinedVector& inputs, + DeviceBase* cpu_device, ResourceMgr* resource_mgr, + absl::InlinedVector* output); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EVALUATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_api_info.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_api_info.h new file mode 100644 index 00000000..e2ae234f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_api_info.h @@ -0,0 +1,106 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_API_INFO_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_API_INFO_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { +class FunctionApiInfo { + public: + FunctionApiInfo(); + virtual ~FunctionApiInfo(); + + enum FunctionType { + INFERENCE, // Default type. + FORWARD, + BACKWARD, + }; + + absl::Status Init(const FunctionDef& function_def); + + const string& interface_name() const; + const string& preferred_device() const; + const FunctionType function_type() const; + const string& pairing_function_name() const; + const DataTypeVector& input_arg_dtypes() const; + const DataTypeVector& output_arg_dtypes() const; + + private: + string interface_name_; + string preferred_device_; + FunctionType function_type_; + // The pairing function is used to pair between forward and backward function, + // which will be useful during function swapping. Inference function won't + // have pairing function. + string pairing_function_name_; + // The following two attributes are useful for forward and backward functions. + DataTypeVector input_arg_dtypes_; + DataTypeVector output_arg_dtypes_; + + FunctionApiInfo(const FunctionApiInfo&) = delete; + void operator=(const FunctionApiInfo&) = delete; +}; + +// A collection of information for function and the interface it implements. +// A interface is a well defined math operation, eg I1 = 2 * x + y. Multiple +// functions could implement the same interface with different behavior based on +// different hardware condition and limits, +// eg F1 = math_ops.add(math_ops.add(x, x), y), or +// F2 = math_ops.add(math_ops.matmul(x, 2), y). +class FunctionLibraryApiInfo { + public: + FunctionLibraryApiInfo(); + virtual ~FunctionLibraryApiInfo(); + // Populate the internal field for the functions within the function_library. + absl::Status Init(const FunctionDefLibrary& function_library); + + absl::Status GetEquivalentImplementations( + const string& function_name, std::vector* other_functions) const; + + const FunctionApiInfo* GetApiInfo(const string& function_name) const; + bool empty() const { return func_info_.empty(); } + std::size_t size() const { return func_info_.size(); } + + private: + // Map between function name to function details. + std::unordered_map> func_info_; + + // Map between interface name to function names. + // Forward/backward function pair usually have different signatures between + // each other since forward function could produce extra internal state as + // output, and backward will take those extra state as inputs. 
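  // Editorial usage sketch (not part of the vendored header), using only the
  // public API declared above; `library` is assumed to be a FunctionDefLibrary
  // whose functions carry the interface annotations this class parses, and
  // "MatMulFn" is a hypothetical function name.
  //
  //   FunctionLibraryApiInfo api_info;
  //   TF_RETURN_IF_ERROR(api_info.Init(library));
  //   std::vector<string> alternatives;
  //   TF_RETURN_IF_ERROR(
  //       api_info.GetEquivalentImplementations("MatMulFn", &alternatives));
  //   for (const string& name : alternatives) {
  //     const FunctionApiInfo* info = api_info.GetApiInfo(name);
  //     // e.g. prefer an implementation whose preferred_device() matches the
  //     // node's assigned device.
  //   }
  //
  // The interface-to-implementation maps declared next are what back these
  // lookups.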
+ absl::flat_hash_map> intf_to_inference_funcs_; + absl::flat_hash_map> intf_to_forward_funcs_; + absl::flat_hash_map> intf_to_backward_funcs_; + + FunctionLibraryApiInfo(const FunctionLibraryApiInfo&) = delete; + void operator=(const FunctionLibraryApiInfo&) = delete; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_API_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_optimizer.h new file mode 100644 index 00000000..8f8eb732 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/function_optimizer.h @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Remap TensorFlow subgraphs onto alternative operations or collection of +// operations to make the overall graph more efficient. +class FunctionOptimizer : public GraphOptimizer { + public: + explicit FunctionOptimizer(RewriterConfig::Toggle opt_level, + bool lower_control_flow) + : opt_level_(opt_level), lower_control_flow_(lower_control_flow) {} + ~FunctionOptimizer() override = default; + + string name() const override { return "function_optimizer"; }; + + bool UsesFunctionLibrary() const override { return true; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + friend class FunctionOptimizerTest; + + // Runs a single function optimizer pass over the `graph`. All nodes that are + // not function calls will be copied from the `graph` to the + // `optimized_graph`. Function call nodes inlined or specialized, and + // instantiated function body or specialized function call nodes will be added + // to the `optimized_graph`. + absl::Status RunFunctionOptimizerPass(const GrapplerItem& item, + GraphDef* optimized_graph) const; + + RewriterConfig::Toggle opt_level_; + bool lower_control_flow_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h new file mode 100644 index 00000000..61a578fa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h @@ -0,0 +1,62 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_H_ + +#include + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Optimize the data layout for convolutional models. +class GenericLayoutOptimizer : public GraphOptimizer { + public: + explicit GenericLayoutOptimizer(string enforced_layout = "") + : GenericLayoutOptimizer(RewriterConfig::DEFAULT, + RewriterConfig::NO_CONVERSION_ON_CPU, + enforced_layout) {} + explicit GenericLayoutOptimizer(RewriterConfig::Toggle opt_level, + string enforced_layout = "") + : GenericLayoutOptimizer(opt_level, RewriterConfig::NO_CONVERSION_ON_CPU, + enforced_layout) {} + explicit GenericLayoutOptimizer(RewriterConfig::Toggle opt_level, + RewriterConfig::CpuLayout layout_conversion, + string enforced_layout = "") + : opt_level_(opt_level), + cpu_layout_conversion_(layout_conversion), + enforced_layout_(enforced_layout) {} + ~GenericLayoutOptimizer() override = default; + + string name() const override { return "layout"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + private: + RewriterConfig::Toggle opt_level_; + RewriterConfig::CpuLayout cpu_layout_conversion_; + const string enforced_layout_; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h new file mode 100644 index 00000000..1c0c0134 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h @@ -0,0 +1,676 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/frame.h" +#include "tensorflow/core/grappler/utils/graph_view.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kAttrSrcFormat[] = "src_format"; +constexpr char kAttrDstFormat[] = "dst_format"; +constexpr char kAttrOutputShape[] = "_output_shapes"; +constexpr char kGPU[] = "GPU"; +constexpr char kCPU[] = "CPU"; + +// TransposeContext owns all data members. Must initialize GraphProperties, +// FrameView, GraphDef and MutableGraphView with the same graph. NodeDef +// pointers in FrameView, GraphDef and MutableGraphView must point to nodes in +// the same GraphDef instance. +struct TransposeContext { + // Initializes TransposeContext with given GrapplerItem. Because initializing + // FrameMap and GraphProperties may return error, we initialize + // TransposeContext outside constructor. + static absl::Status InitializeTransposeContext(bool assume_valid_feeds, + const GrapplerItem& item, + const Cluster* cluster, + TransposeContext* context); + + static absl::Status InitializeTransposeContext(const GrapplerItem& item, + const Cluster* cluster, + TransposeContext* context) { + return InitializeTransposeContext(false, item, cluster, context); + } + + // Sets data formats to convert from and to for specified device type. + void AssignDeviceAndDataFormats(absl::string_view target_device, + absl::string_view src_format, + absl::string_view dst_format); + + FrameView frames; + GraphDef graph; + // Number of nodes in the original graph. As new nodes are appended to the end + // of the graph, all new nodes should have a node index greater than or equal + // to this. + int num_nodes; + absl::flat_hash_set nodes_to_preserve; + std::unique_ptr graph_properties; + std::unique_ptr graph_view; + + string target_device; + string src_format; + string dst_format; + absl::flat_hash_map src_dim_indices; + absl::flat_hash_map dst_dim_indices; + std::vector src_to_dst; + std::vector dst_to_src; + + string enforced_layout; +}; + +class Transposer { + public: + explicit Transposer() {} + + Transposer(const Transposer&) = delete; + Transposer& operator=(const Transposer&) = delete; + + virtual ~Transposer() {} + + // Returns true iff the node should be processed by this transposer. + // NodeProcessors may perform additional oprand specific checks before + // processing if necessary. + // Following common conditions are checked: + // * node's device matches target device + // * node's source format matches config's source format + // * node has output + bool ShouldProcess(const TransposeContext& context, + const utils::MutableNodeView& node) const; + + // Transposes given node from src format to dst format. Also perform other + // necessary operations to guarantee the graph produce the same result. + // Eg. 
Add Transpose node sets before fanin ports and after fanout ports. + virtual absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) = 0; + + // Creates a Const node for permutation. If node with node_name already exits, + // return and reuse it. + absl::Status CreateConstPermNode(TransposeContext* context, + absl::string_view node_name, + absl::string_view device, + absl::Span permutation, + absl::string_view control_node_name, + utils::MutationNewNode* added_node); + + // Creates a TransposeNode with given properties. If node with node_name + // already exits, return and reuse it. + // A const perm node is also created and connected to the 2nd fanin. + // control_node_name is ignored if it is empty. + absl::Status CreateTransposeNode( + TransposeContext* context, absl::string_view name_format, + const DataType& data_type, absl::string_view device, + TensorShapeProto fanin_shape, absl::Span permutation, + absl::string_view control_node_name, utils::MutationNewNode* added_node, + string* transpose_node_name); + + // Update all edges between dst_node->fanin[dst_ports] and dst_node by + // inserting an op node. + absl::Status UpdateFaninEdgesWithOp(TransposeContext* context, + absl::Span dst_ports, + utils::MutableNodeView* dst_node, + absl::string_view op); + + // Update all edges between src_node:src_ports and nodes take + // src_node:src_ports as fanin. Also update attr _output_shape of src_node. + absl::Status UpdateFanoutEdgesWithOp(TransposeContext* context, + absl::Span src_ports, + utils::MutableNodeView* src_node, + absl::string_view op); + + // Creates a DataFromat node with given properties. + // DataFromat op is either DataFormatVecPermute or DataFormatDimMap. + absl::Status CreateDataFormatNode( + TransposeContext* context, absl::string_view node_name, + absl::string_view op, absl::string_view device, const DataType& data_type, + bool is_fanin_on_host, bool is_src_format_to_dst_format, + utils::MutationNewNode* added_node); + + protected: + int GetFanoutPortRank(const utils::MutableNodeView& node, int port) const; + bool IsFanoutPortRankN(const utils::MutableNodeView& node, int port, + int n) const; + bool IsFanoutPortsRankN(const utils::MutableNodeView& node, + absl::Span ports, int n) const; + int GetFaninPortRank(const utils::MutableNodeView& node, int port) const; + bool IsFaninPortRankN(const utils::MutableNodeView& node, int port, + int n) const; + + // Checks if fanin at specified port(s) has dimensions `dims` iff fanin is a + // Const. If fanin is not a Const, no dimensions will be checked and this will + // return true. + bool IsFaninPortDimsNIfConst(const utils::MutableNodeView& node, int port, + absl::Span dims) const; + bool IsFaninPortsDimsNIfConst(const utils::MutableNodeView& node, + absl::Span ports, + absl::Span dims) const; + bool CanProcessNode(const TransposeContext& context, + const utils::MutableNodeView& node) const; + // Update all edges between dst_node->fanin[dst_ports] and dst_node. + // A node with op is created and inserted between all edges. + // op is one of Transpose, DataFormatVecPermute or DataFormatDimMap. 
+ absl::Status UpdateEdge(TransposeContext* context, + absl::string_view name_format, absl::string_view op, + const AttrValue* input_shape, bool is_in_frame, + bool is_src_format_to_dst_format, const int src_port, + const int dst_port, utils::MutableNodeView* src_node, + utils::MutableNodeView* dst_node); + string GetFaninNameFormat(absl::string_view node_name, int port, + absl::string_view src_format, + absl::string_view dst_format); + string GetFanoutNameFormat(absl::string_view node_name, int port, int index, + absl::string_view src_format, + absl::string_view dst_format); + string LayoutOptimizerNode(absl::string_view node_name); + string GetReshapeNodeNameFormat(absl::string_view node_name, int index, + absl::string_view src_format, + absl::string_view dst_format); + string GetShapeConstNodeNameFormat(absl::string_view node_name, int index); +}; + +class LayoutSensitiveOpTransposer : public Transposer { + public: + explicit LayoutSensitiveOpTransposer() : Transposer() {} + + // Updates attrs data_format, ksize, strides of the given node to dst_format. + // _output_shape is updated during UpdateOutputEdges. + absl::Status UpdateNode(TransposeContext* context, + utils::MutableNodeView* node); +}; + +// Layout sensitive op transposers. + +class DefaultLayoutSensitiveOpTransposer : public LayoutSensitiveOpTransposer { + public: + explicit DefaultLayoutSensitiveOpTransposer() + : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class BiasAddTransposer : public LayoutSensitiveOpTransposer { + public: + explicit BiasAddTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class AvgPoolGradTransposer : public LayoutSensitiveOpTransposer { + public: + explicit AvgPoolGradTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class BiasAddGradTransposer : public LayoutSensitiveOpTransposer { + public: + explicit BiasAddGradTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class Conv2DBackpropFilterTransposer : public LayoutSensitiveOpTransposer { + public: + explicit Conv2DBackpropFilterTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class Conv2DBackpropInputTransposer : public LayoutSensitiveOpTransposer { + public: + explicit Conv2DBackpropInputTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class Conv3DTransposer : public LayoutSensitiveOpTransposer { + public: + explicit Conv3DTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class Conv3DBackpropFilterTransposer : public LayoutSensitiveOpTransposer { + public: + explicit Conv3DBackpropFilterTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class Conv3DBackpropInputTransposer : public LayoutSensitiveOpTransposer { + public: + explicit Conv3DBackpropInputTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status 
TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class FusedBatchNormExTransposer : public LayoutSensitiveOpTransposer { + public: + explicit FusedBatchNormExTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class FusedBatchNormGradTransposer : public LayoutSensitiveOpTransposer { + public: + explicit FusedBatchNormGradTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool IsTraining(const utils::MutableNodeView& node) const; +}; + +class MaxPoolV2Transposer : public LayoutSensitiveOpTransposer { + public: + explicit MaxPoolV2Transposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class MaxPool3DTransposer : public LayoutSensitiveOpTransposer { + public: + explicit MaxPool3DTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class MaxPoolGradTransposer : public LayoutSensitiveOpTransposer { + public: + explicit MaxPoolGradTransposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class MaxPoolGradV2Transposer : public LayoutSensitiveOpTransposer { + public: + explicit MaxPoolGradV2Transposer() : LayoutSensitiveOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +// Layout agnostic op transposers. + +class LayoutAgnosticOpTransposer : public Transposer { + public: + explicit LayoutAgnosticOpTransposer() : Transposer() {} + + protected: + bool IsAfterDstToSrcTransform(const TransposeContext& context, + const utils::MutableNodeView& node) const; + + std::vector GetVariadicNDFaninPorts(const TransposeContext& context, + const utils::MutableNodeView& node, + int rank) const; +}; + +class DefaultLayoutAgnosticOpTransposer : public LayoutAgnosticOpTransposer { + public: + explicit DefaultLayoutAgnosticOpTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class AddNTransposer : public LayoutAgnosticOpTransposer { + public: + explicit AddNTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class BinaryOpTransposer : public LayoutAgnosticOpTransposer { + public: + explicit BinaryOpTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool IsNDOperateWithMD(const utils::MutableNodeView& node, int n, int m); + bool IsFaninShapeSupported(const utils::MutableNodeView& node, int rank); + std::vector GetNDDataFaninPorts(const utils::MutableNodeView& node, + int rank); + absl::Status AddNodeShapeConst(utils::Mutation* mutation, + absl::string_view node_name, + absl::string_view node_device, + bool node_in_frame, int num_channels, + absl::string_view depended_node, int rank); + absl::Status AddNodeReshape(utils::Mutation* mutation, + absl::string_view node_name, + absl::string_view node_device, + absl::string_view input_name, + absl::string_view shape_const_node_name, + const 
DataType& data_type); + absl::Status MaybeReshapeVectorFanin(TransposeContext* context, + utils::MutableNodeView* node, int rank); +}; + +class ConcatOpTransposer : public LayoutAgnosticOpTransposer { + public: + explicit ConcatOpTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class FillOpTransposer : public LayoutAgnosticOpTransposer { + public: + explicit FillOpTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class IdentityNTransposer : public LayoutAgnosticOpTransposer { + public: + explicit IdentityNTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class MergeTransposer : public LayoutAgnosticOpTransposer { + public: + explicit MergeTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool IsEveryFaninAfterDstToSrcTransform( + const TransposeContext& context, + const utils::MutableNodeView& node) const; +}; + +class PadTransposer : public LayoutAgnosticOpTransposer { + public: + explicit PadTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class ReduceTransposer : public LayoutAgnosticOpTransposer { + public: + explicit ReduceTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool KeepDims(const utils::MutableNodeView& node); + bool IsAlongAxis(const Tensor& tensor, absl::Span axis, int rank); + bool IsReduceAxisSupported(const TransposeContext& context, + const utils::MutableNodeView& node, int rank); +}; + +class ReverseV2Transposer : public LayoutAgnosticOpTransposer { + public: + explicit ReverseV2Transposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class SelectTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SelectTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + protected: + bool IsFaninScalarVector4D(const utils::MutableNodeView& fanin, int port); + std::vector GetFaninPorts(const utils::MutableNodeView& fanin, int port); +}; + +class ShapeTransposer : public LayoutAgnosticOpTransposer { + public: + explicit ShapeTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class ShapeNTransposer : public LayoutAgnosticOpTransposer { + public: + explicit ShapeNTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class SliceTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SliceTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class SplitTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SplitTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + 
utils::MutableNodeView* node) override; +}; + +class SplitVTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SplitVTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class SqueezeTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SqueezeTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool IsInputConvertible(const TransposeContext& context, + const utils::MutableNodeView& node) const; + bool IsAlongAxis(const AttrValue& attr, absl::Span axis, + int rank) const; + bool IsDimsSupported(const TransposeContext& context, + const utils::MutableNodeView& node) const; + absl::Status UpdateSqueezeDims(TransposeContext* context, + utils::MutableNodeView* node); +}; + +class StridedSliceTransposer : public LayoutAgnosticOpTransposer { + public: + explicit StridedSliceTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; + + private: + bool IsMaskZero(const utils::MutableNodeView& node, absl::string_view mask); + bool HasOnlyBeginEndMask(const utils::MutableNodeView& node); + absl::Status PermuteMask(TransposeContext* context, + utils::MutableNodeView* node, + absl::string_view mask); +}; + +class SwitchTransposer : public LayoutAgnosticOpTransposer { + public: + explicit SwitchTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class TernaryOpTransposer : public LayoutAgnosticOpTransposer { + public: + explicit TernaryOpTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class TileTransposer : public LayoutAgnosticOpTransposer { + public: + explicit TileTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +class UnaryGradTransposer : public LayoutAgnosticOpTransposer { + public: + explicit UnaryGradTransposer() : LayoutAgnosticOpTransposer() {} + + absl::Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + +// Utils. + +// Permutes elements according to permutation and replaces the original values. +// Permutation and values must have same size. +template +absl::Status PermuteSingle(absl::string_view location, + absl::Span permutation, T* values) { + DCHECK(values != nullptr); + int permutation_size = permutation.size(); + if (values->size() != permutation_size) { + return absl::Status(absl::StatusCode::kInvalidArgument, + absl::StrCat("Size of values ", values->size(), + " does not match size of permutation ", + permutation_size, " @ ", location)); + } + typedef typename T::value_type V; + std::vector elements(values->begin(), values->end()); + int index = 0; + for (V& element : *values) { + element = elements[permutation[index++]]; + } + return absl::OkStatus(); +} + +// Permutes two elements at a time according to permutation and replaces the +// original values. Values must be twice the size of permutation. 
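// Editorial example (not part of the vendored header): a minimal usage sketch
// of PermuteSingle above. The strides values and the NHWC -> NCHW permutation
// {0, 3, 1, 2} are illustrative; PermuteDouble, defined next, applies the same
// idea to values stored as consecutive pairs (e.g. paddings).
inline absl::Status PermuteStridesExample() {
  std::vector<int> strides = {1, 2, 2, 1};     // in N, H, W, C order
  const std::vector<int> perm = {0, 3, 1, 2};  // destination i reads source perm[i]
  absl::Status status = PermuteSingle("strides example", perm, &strides);
  // On success, strides == {1, 1, 2, 2}, i.e. the same values in N, C, H, W order.
  return status;
}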
+template +absl::Status PermuteDouble(absl::string_view location, + absl::Span permutation, T* values) { + DCHECK(values != nullptr); + int permutation_size = permutation.size(); + if (values->size() != permutation_size * 2) { + return absl::Status( + absl::StatusCode::kInvalidArgument, + absl::StrCat("Size of values ", values->size(), + " does not match twice the size of permutation ", + permutation_size, " @ ", location)); + } + typedef typename T::value_type V; + std::vector elements(values->begin(), values->end()); + for (int i = 0; i < values->size(); i = i + 2) { + const int permutation_index = permutation[i / 2]; + (*values)[i] = elements[permutation_index * 2]; + (*values)[i + 1] = elements[permutation_index * 2 + 1]; + } + return absl::OkStatus(); +} + +string GetDeviceName(const NodeDef& node); + +bool IsDefaultLayoutSensitiveOp(const NodeDef& node); + +bool IsLayoutSensitiveOp(const NodeDef& node); + +bool IsDefaultLayoutAgnosticOp(const NodeDef& node); + +bool IsLayoutAgnosticOp(const NodeDef& node); + +bool IsTernaryOp(const NodeDef& node); + +bool IsUnaryGrad(const NodeDef& node); + +bool IsMaxPoolV2(const NodeDef& node); + +bool IsMaxPool3D(const NodeDef& node); + +bool IsMaxPoolGradV2(const NodeDef& node); + +bool IsMaxPoolGradGradV1(const NodeDef& node); + +bool IsMaxPoolGradGradV2(const NodeDef& node); + +bool IsBinaryOp(const NodeDef& node); + +bool IsReduceOp(const NodeDef& node); + +std::vector GetDataFaninPorts(const utils::MutableNodeView& node); + +std::vector GetDataFanoutPorts(const utils::MutableNodeView& node); + +// Returns a value of constant input to the `node` at `index`, iff `predicate` +// evaluated to true. Returns true if `tensor` was populated with data. +bool GetValueAttrFromConstInputNode( + const utils::MutableNodeView& node, + const std::function& predicate, int index, + Tensor* tensor); + +bool IsDataFormatOp(const utils::MutableNodeView& node); + +absl::flat_hash_map GetDimensionIndices( + absl::string_view data_format); + +std::vector GetPermutation( + const absl::flat_hash_map& src_dim_indices, + absl::string_view dst_format); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.h new file mode 100644 index 00000000..a31b1ca6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_FACTORY_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_FACTORY_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h" + +namespace tensorflow { +namespace grappler { + +class TransposerFactory { + public: + explicit TransposerFactory() {} + + std::shared_ptr GetTransposer(const NodeDef& node); + + protected: + template + std::shared_ptr GetOrCreateIfNotFound(const string& key) { + auto& transposer = transposer_map_[key]; + if (transposer == nullptr) { + transposer = std::make_shared(); + } + return transposer; + } + + absl::flat_hash_map> transposer_map_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_TRANSPOSER_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer.h new file mode 100644 index 00000000..6b7ba893 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer.h @@ -0,0 +1,93 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +class Cluster; +struct GrapplerItem; + +// An abstract interface for an algorithm for generating a candidate +// optimization of a GrapplerItem for running on a cluster. +class GraphOptimizer { + public: + GraphOptimizer() : deadline_usec_(0) {} + virtual ~GraphOptimizer() {} + + virtual string name() const = 0; + + // Returns true if the optimizer requires a valid function library to perform + // graph optimization. If false, optimized GrapplerItem will have a stub + // instead of real function library (all function signatures and attributes + // will be valid, but function body will be empty). Most of the optimizers + // that do not instantiate functions should return true. + virtual bool UsesFunctionLibrary() const = 0; + + // Routine called to allow an algorithm to propose a rewritten graph + // for the graph, feeds and fetches in "item" to run more efficiently + // on "cluster". If the returned status is OkStatus() then + // *optimized_graph contains the rewritten graph. + // Returns an error status if it failed to generate a solution. 
+ // + // A return value of error::Aborted() can be used signal early termination of + // the optimizer, e.g. if the optimization turned out to be a no-op. In this + // case the content of *optimized_graph is undefined. + virtual absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) = 0; + + // Subclasses may define a version of Optimize that consumes item. + virtual absl::Status Optimize(Cluster* cluster, GrapplerItem&& item, + GraphDef* optimized_graph) { + return Optimize(cluster, item, optimized_graph); + } + + // Set deadline in microseconds since epoch. A value of zero means no + // deadline. + void set_deadline_usec(uint64 deadline_usec) { + deadline_usec_ = deadline_usec; + } + uint64 deadline_usec() const { return deadline_usec_; } + bool DeadlineExceeded() const { + return deadline_usec_ > 0 && Env::Default()->NowMicros() > deadline_usec_; + } + + private: + uint64 deadline_usec_; +}; + +#define GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED() \ + do { \ + if (this->DeadlineExceeded()) { \ + return absl::DeadlineExceededError( \ + absl::StrCat(this->name(), " exceeded deadline.")); \ + } \ + } while (0) + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h new file mode 100644 index 00000000..ed5549ab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h @@ -0,0 +1,315 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_ + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +struct NodeScopeAndName { + string scope; + string name; +}; + +// Parse scope and name: "a/b/c/Add_1" -> {"a/b/c", "Add_1"} +const NodeScopeAndName ParseNodeScopeAndName(const string& node_name); + +// Context owned by GraphOptimizer, and passed to every stage at construction +// time. Each optimizer stage is responsible for updating it according to the +// changes it made to the graph. +// +// If an optimizer needs access to some helper class that is not present in this +// context, consider creating an extension context, specific to that +// optimizer (see example of ArithmeticOptimizerContext). GraphOptimizerContext +// should only have members that are useful to almost all optimizers. 
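// Illustrative sketch (not part of the upstream TensorFlow header): an
// optimizer-specific extension context in the spirit of the
// ArithmeticOptimizerContext mentioned above. The name and members are
// hypothetical and only show the pattern of keeping optimizer-private state
// out of GraphOptimizerContext.
struct ExampleOptimizerContext {
  explicit ExampleOptimizerContext(std::vector<NodeDef*>* nodes_to_simplify)
      : nodes_to_simplify(nodes_to_simplify) {}
  // Worklist that only this particular optimizer cares about.
  std::vector<NodeDef*>* nodes_to_simplify;
};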
+struct GraphOptimizerContext { + GraphOptimizerContext(const std::unordered_set* nodes_to_preserve, + GraphDef* optimized_graph, + GraphProperties* graph_properties, NodeMap* node_map, + gtl::FlatSet* feed_nodes, + RewriterConfig::Toggle opt_level) + : nodes_to_preserve(nodes_to_preserve), + optimized_graph(optimized_graph), + graph_properties(graph_properties), + node_map(node_map), + feed_nodes(feed_nodes), + opt_level(opt_level) {} + + const std::unordered_set* nodes_to_preserve; + GraphDef* optimized_graph; + GraphProperties* graph_properties; + NodeMap* node_map; + gtl::FlatSet* feed_nodes; + RewriterConfig::Toggle opt_level; +}; + +absl::Status GetInputNode(const GraphOptimizerContext& ctx, const string& input, + NodeDef** node); +absl::Status GetTensorProperties(const GraphOptimizerContext& ctx, + const string& tensor, + const OpInfo::TensorProperties** properties); + +NodeDef* AddCopyNode(const GraphOptimizerContext& ctx, const string& name, + const NodeDef* node_to_copy); +NodeDef* AddEmptyNode(const GraphOptimizerContext& ctx, const string& name); + +// WARNING: +// Optimizer stage must try to re-use original nodes of a graph and +// make all updates in place. This helps to make robust node placement +// decisions. Create new nodes only if there is a reason for that. + +// Make a name for a new node obtained by optimizing a single node of the +// original graph. The optimized node is placed under the original node scope. +// +// Node name uniqueness is guaranteed by unique name of an original node in +// a same scope. +// +// Empty sub_scope or prefix ignored. At least one of them must be non-empty. +// +// Example: a/b/c/Add -> a/b/c/${sub_scope}/${prefix}_Add. +const string MakeOptimizedNodeName(const NodeScopeAndName& node, + const string& sub_scope, + const string& prefix); +// Make a name for a new node obtained by optimizing multiple nodes of the +// original graph, starting from "root". The optimized node is placed under +// the original scope of a "root" node. +// +// Example: [a/b/c/Add, x/y/z/Mul] -> a/b/c/${sub_scope}/${prefix}_Add_Mul +const string MakeOptimizedNodeName(const NodeScopeAndName& root, + const std::vector node_names, + const string& sub_scope, + const string& prefix); + +// Base class for multi-stage GraphOptimizers (ArithmeticOptimizer, etc...). +// +// If a graph optimizer consists of large number of small independent +// rewrites, each of them should be implemented as a separate stage. +// +// * Result: +// Each graph optimizer choose what result is reported by each stage +// (e.g. each stage can fill in the name of optimized nodes, or have more +// complex result). +template +class GraphOptimizerStage { + public: + explicit GraphOptimizerStage(const string& optimizer_name, + const string& stage_name, + const GraphOptimizerContext& ctx) + : optimizer_name_(optimizer_name), stage_name_(stage_name), ctx_(ctx) {} + virtual ~GraphOptimizerStage() = default; + + const string& stage_name() const { return stage_name_; } + const string& optimizer_name() const { return optimizer_name_; } + + // Check if we should try to simplify node. Returning true doesn't + // guarantee that node will be simplified. + // + // Should implement just a basic sanity check, without any expensive graph + // traversals. + virtual bool IsSupported(const NodeDef* node) const = 0; + + // Try to simplify the given node. + // + // Return error status only if some precondition is failed, or got an + // incorrect graph. 
In every other case return OkStatus(), even if it didn't + // simplify anything. + // + // Report result using output argument. Each GraphOptimizer can choose its + // own Result type. + // TODO(ezhulenev): if it will appear that Result output parameter is not + // sufficiently useful (used with a reason by most optimizers), get rid of it, + // and remove template parameter. + virtual absl::Status TrySimplify(NodeDef* node, Result* result) = 0; + + // Return InvalidArgumentError if node is not supported by the optimizer + // stage. + // TODO(ezhulenev): make this check part of non-virtual public API + // (TrySimplify), and make virtual implementation protected. + absl::Status EnsureNodeIsSupported(const NodeDef* node) const { + return IsSupported(node) + ? absl::OkStatus() + : errors::InvalidArgument( + "Node ", node->name(), " is not supported by optimizer ", + optimizer_name_, " and stage ", stage_name_); + } + + // Get a name for a new node, created by this stage, based on one or multiple + // nodes of an original graph. + const string OptimizedNodeName(const NodeScopeAndName& node) const { + return MakeOptimizedNodeName(node, optimizer_name_, stage_name_); + } + const string OptimizedNodeName(const NodeScopeAndName& root, + const std::vector& nodes) const { + return MakeOptimizedNodeName(root, nodes, optimizer_name_, stage_name_); + } + const string OptimizedNodeName(const NodeScopeAndName& node, + const string& rewrite_rule) const { + const string prefix = strings::StrCat(stage_name_, "_", rewrite_rule); + return MakeOptimizedNodeName(node, optimizer_name_, prefix); + } + + const string UniqueOptimizedNodeName(const NodeScopeAndName& node) { + const string node_name = OptimizedNodeName(node); + return UniqueNodeName(node_name); + } + const string UniqueOptimizedNodeName(const NodeScopeAndName& node, + const string& rewrite_rule) { + const string node_name = OptimizedNodeName(node, rewrite_rule); + return UniqueNodeName(node_name); + } + + // Get a node by input name from a node map. Return an error if node was not + // found. + absl::Status GetInputNode(const string& input, NodeDef** node) const { + return ::tensorflow::grappler::GetInputNode(ctx_, input, node); + } + // Lookup tensor properties by name. Tensor name might have non-zero port + // number. Return an error if the tensor node doesn't exist in the graph, or it + // doesn't have properties defined for the requested port.
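// Illustrative usage (not part of the upstream header): a concrete stage's
// TrySimplify typically validates the node and resolves its fanin through the
// helpers declared here, e.g.:
//   TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
//   NodeDef* fanin;
//   TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &fanin));
//   NodeDef* copy = AddCopyNode(
//       UniqueOptimizedNodeName(ParseNodeScopeAndName(node->name())), fanin);
//   (void)copy;  // rewire inputs/outputs of `copy` as needed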
+ absl::Status GetTensorProperties( + const string& tensor, const OpInfo::TensorProperties** properties) const { + return ::tensorflow::grappler::GetTensorProperties(ctx_, tensor, + properties); + } + + NodeDef* AddCopyNode(const string& name, const NodeDef* node_to_copy) { + return ::tensorflow::grappler::AddCopyNode(ctx_, name, node_to_copy); + } + NodeDef* AddEmptyNode(const string& name) { + return ::tensorflow::grappler::AddEmptyNode(ctx_, name); + } + + protected: + const GraphOptimizerContext& ctx() const { return ctx_; } + + private: + const string UniqueNodeName(absl::string_view name) { + string node_name = string(name); + while (ctx_.node_map->NodeExists(node_name)) { + node_name = absl::StrCat(name, "_unique", + optimized_node_name_counter_.fetch_add(1)); + } + + return node_name; + } + + const string optimizer_name_; + const string stage_name_; + const GraphOptimizerContext ctx_; + std::atomic optimized_node_name_counter_ = {0}; +}; + +template +class GraphOptimizerStagePipeline { + public: + // Break predicate specifies if a pipeline should stop early, and not pass + // a node to the next registered optimizer stage, typically that should be the + // case when a stage successfully optimized a node, and it wants to yield + // control to the optimizer. + explicit GraphOptimizerStagePipeline( + const std::function break_predicate) + : break_predicate_(break_predicate) {} + + // Add a stage to the pipeline. It should be called with the arguments for the + // stage constructor: + // + // pipeline.AddStage(constructor_arg1, constructor_arg2); + // + // Returns a reference to the added stage. + template + T& AddStage(Args&&... args) { + auto stage = new T(std::forward(args)...); + stages_.push_back(std::unique_ptr(stage)); + return *stage; + } + + // Pass a node through all registered optimizer stages, until break predicate + // is true. + // + // Return true, if pipeline exited after a break predicate was evaluated as + // 'true', which typically means that a node was optimized by one of the + // registered stages. + // + // Return false, if node was not optimized by any of registered stages. + bool PassThroughAllStages(NodeDef* node, Result* result) { + for (auto& stage : stages_) { + if (stage->IsSupported(node)) { + const absl::Status stage_status = stage->TrySimplify(node, result); + // Each stage must be "error safe" (just like exception safe). In + // case of any error it must leave optimized graph unmodified. + if (!stage_status.ok()) { + VLOG(2) << "Failed to run optimizer " << stage->optimizer_name() + << ", stage " << stage->stage_name() << " node " + << node->name() << ". Error: " << stage_status.message(); + } + if (break_predicate_(*result)) return true; + } + } + return false; + } + + // Pass a node through all registered optimizer stages, until break predicate + // is true or a stage fails. + // + // Returns any stage failure status, or else OkStatus(). 
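// Illustrative sketch (not part of the upstream header) of how a stage and a
// pipeline are typically wired together; `ExampleResult` and `NoOpStage` are
// hypothetical names:
//   struct ExampleResult { std::vector<string> simplified_nodes; };
//
//   class NoOpStage : public GraphOptimizerStage<ExampleResult> {
//    public:
//     explicit NoOpStage(const GraphOptimizerContext& ctx)
//         : GraphOptimizerStage("ExampleOptimizer", "NoOpStage", ctx) {}
//     bool IsSupported(const NodeDef* node) const override { return true; }
//     absl::Status TrySimplify(NodeDef* node, ExampleResult* result) override {
//       // Record the node but leave the graph untouched.
//       result->simplified_nodes.push_back(node->name());
//       return absl::OkStatus();
//     }
//   };
//
//   // Stop after the first stage that reports a simplification.
//   GraphOptimizerStagePipeline<ExampleResult> pipeline(
//       [](const ExampleResult& r) { return !r.simplified_nodes.empty(); });
//   pipeline.AddStage<NoOpStage>(ctx);  // `ctx` is a GraphOptimizerContext
//   ExampleResult result;
//   pipeline.PassThroughAllStages(node, &result);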
+ absl::Status PassThroughAllStagesWithStatus(NodeDef* node, Result* result) { + for (auto& stage : stages_) { + if (!stage->IsSupported(node)) { + continue; + } + const absl::Status stage_status = stage->TrySimplify(node, result); + if (!stage_status.ok()) { + return stage_status; + } else if (break_predicate_(*result)) { + break; + } + } + return absl::OkStatus(); + } + + std::size_t NumStages() { return stages_.size(); } + + std::vector StageNames() { + std::vector names; + names.reserve(stages_.size()); + for (const auto& stage : stages_) { + names.push_back(stage->stage_name()); + } + return names; + } + + private: + std::vector>> stages_; + std::function break_predicate_; + + GraphOptimizerStagePipeline(const GraphOptimizerStagePipeline&) = delete; + void operator=(const GraphOptimizerStagePipeline&) = delete; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/implementation_selector.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/implementation_selector.h new file mode 100644 index 00000000..dc804fdc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/implementation_selector.h @@ -0,0 +1,203 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_ + +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/optimizers/function_api_info.h" +#include "tensorflow/core/grappler/utils/graph_view.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace grappler { + +static constexpr const char* const kNoImplSelectionAttr = "_noimpl_selection"; + +// Motivation: To achieve the same high level functionality, the underlying +// implementations sometimes are different for various devices where the +// function runs. In order to achieve the correct result and best performance, +// the proper implementation needs to be picked dynamically. +// +// Currently there are two approaches to do this. +// (1) Utilize case op and dynamacically change the branch index. +// (2) Swap function implementation, it will be deprecated. +// +// Idea for approach 1. 
+// This transformation rewrites the DeviceIndex op with a Const op with value +// of the index of the device the associcated Case op runs. +// Example: +// def plus_one_gpu(x): return x + 1.0 +// def plus_one_reference_implementation(x): return x + 1.0 +// input = tf.constant(2.0, dtype=tf.float32) +// cpu_fn = lambda:plus_one_reference_implementation(input) +// gpu_fn = lambda:plus_one_gpu(input) +// control_flow_switch_case.execute_fn_for_device( +// {"CPU": cpu_fn, "GPU":gpu_fn)}, default_fn=cpu_fn) +// +// Idea for approach 2. +// This transformation replaces function calls by the appropriate function +// definition based on properties of the runtime system. For instance, +// we may choose one implementation over another if we have a GPU with +// enough memory available. +// +// It is a way for the programmer to specify alternative implementations +// of the same functionality in the graph, and let TensorFlow pick the +// most appropriate one at runtime. +// +// For instance, the python code might specify: +// @Defun(tf.float32, +// api_implements='plus_one', +// api_preferred_device='GPU') +// def plus_one_gpu(x): return x + 1.0 +// +// @Defun(tf.float32, +// api_implements='plus_one') +// def plus_one_reference_implementation(x): return x + 1.0 +// input = tf.constant(2.0, dtype=tf.float32) +// +// z = plus_one_reference_implementation(input) +// z = plus_one_gpu(input) +// print(sess.run(z)) +// + +// At runtime, we will select either `plus_one_gpu` or +// `plus_one_reference_implementation` based on the availability of the GPU. +// +// Available annotations: +// - api_implements(string): all functions mapping to the same +// string can be interchanged. For now, all functions must have the same +// signature and overloads are not allowed. Defuns within defuns are +// allowed. +// - api_preferred_device(string): sets which device is preferred. +class ImplementationSelector : public CustomGraphOptimizer { + public: + ImplementationSelector() = default; + ~ImplementationSelector() override = default; + absl::Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return absl::OkStatus(); + } + string name() const override { + return "implementation_selector"; + } + + bool UsesFunctionLibrary() const override { return false; } + + // This call is not thread-safe. + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + absl::Status LoadFunctions(const GraphDef& graph); + absl::Status MaybeOptimizeFunctionCall( + const Cluster* cluster, utils::MutableNodeView* node_view) const; + + // Finds all call sites for functions, then replace with the appropriate + // implementation. + // There are two ways of calling functions: + // 1. By specifying an op name as a function name, and + // 2. Via the functional interface, where the function name appears as an + // Attr. + // + // There may be multiple call sites for a given function. The function body + // may call into another function, so a function might have to be duplicated. + // For simplicity, we do not change function bodies. Also, we do not change + // gradients. + absl::Status SelectImplementation(const Cluster* cluster, + GraphDef* graph) const; + + // Rewrites the DeviceIndex op with a Const op with value of the index of the + // device the associcated Case op runs. + + // This function first looks up all the DeviceIndex ops. 
+ // Then for each of these ops, it finds the device of the + // associated Case op that takes the DeviceIndex op as the input, and + // caculates the index of the device in the device list of DeviceIndex op. + // Lastly, it rewrites the DeviceIndex op with a Const op and sets the value + // to be the index. + // + // Example input nodes: + // node { + // name: "x" + // op: "DeviceIndex" + // device: "/device:CPU:0" + // attr { + // key: "device_names" + // value { + // list { + // s: "CPU" + // s: "TPU_REPLICATED_CORE" + // s: "GPU" + // } + // } + // } + // } + // node { + // name: "case" + // op: "Case" + // input: "x" + // device: "/device:GPU:0" + // ... + // } + // Example output nodes: + // + // name: "x" + // op: "Const" + // device: "/device:CPU:0" + // attr { + // key: "dtype" + // value { + // type: DT_INT32 + // } + // } + // attr { + // key: "value" + // value { + // tensor { + // dtype: DT_INT32 + // int_val: 2 + // } + // } + // } + // node { + // name: "case" + // op: "Case" + // input: "x" + // device: "/device:GPU:0" + // ... + // } + absl::Status SelectDeviceIndex(GraphDef* graph) const; + + std::unique_ptr lib_info_; + + ImplementationSelector(const ImplementationSelector&) = delete; + void operator=(const ImplementationSelector&) = delete; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/inference/batch_op_rewriter.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/inference/batch_op_rewriter.h new file mode 100644 index 00000000..d15ff68b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/inference/batch_op_rewriter.h @@ -0,0 +1,66 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_INFERENCE_BATCH_OP_REWRITER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_INFERENCE_BATCH_OP_REWRITER_H_ + +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/grappler/optimizers/inference/batch_op_rewriter.pb.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kEnableAdaptiveSchedulerAttr[] = "_enable_adaptive_scheduler"; +constexpr char kMinInflightBatchesAttr[] = "_min_inflight_batches"; +constexpr char kInitialInflightBatchesAttr[] = "_initial_inflight_batches"; +constexpr char kMaxInflightBatchesAttr[] = "_max_inflight_batches"; +constexpr char kBatchesToAverageOverAttr[] = "_batches_to_average_over"; +constexpr char kFullBatchSchedulingBoostMicros[] = + "_full_batch_scheduling_boost_micros"; // NOLINT(whitespace/line_length) + +constexpr int64_t kMinInflightBatches = 16; +constexpr int64_t kInitialInflightBatches = 16; +constexpr int64_t kBatchesToAverageOver = 10; +constexpr int64_t kMaxInflightBatches = 64; + +using ::tensorflow::serving::BatchOpRewriteConfig; + +// This optimization does the following: +// +// Rewrite `num_batch_threads` to zero in batch-op. In this way, graphs with +// batch op will use a shared thread pool to schedule batches, as opposed to +// allocating batch threads per batch-op. +class BatchOpRewriter : public ::tensorflow::grappler::CustomGraphOptimizer { + public: + absl::Status Init( + const ::tensorflow::RewriterConfig_CustomGraphOptimizer* config) override; + + std::string name() const override { return "batch_op_rewriter"; } + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(::tensorflow::grappler::Cluster* cluster, + const ::tensorflow::grappler::GrapplerItem& item, + ::tensorflow::GraphDef* optimized_graph) override; + + private: + BatchOpRewriteConfig config_; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_INFERENCE_BATCH_OP_REWRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/loop_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/loop_optimizer.h new file mode 100644 index 00000000..0b561876 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/loop_optimizer.h @@ -0,0 +1,76 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_ + +#include + +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/frame.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +constexpr char kLoopOptimizer[] = "LoopOptimizer"; + +class LoopOptimizer : public GraphOptimizer { + public: + LoopOptimizer(); + + explicit LoopOptimizer(RewriterConfig::Toggle opt_level, + DeviceBase* cpu_device); + + ~LoopOptimizer() override {} + + string name() const override { return "loop_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + friend class LoopOptimizerTest; + + // Granular control for loop optimizer stages. + struct LoopOptimizerOptions { + bool enable_loop_invariant_node_motion = false; + bool enable_stack_push_removal = true; + bool enable_dead_branch_removal = true; + + static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) { + LoopOptimizerOptions options; + return options; + } + }; + + absl::Status RemoveDeadBranches( + const std::unordered_set& nodes_to_preserve, NodeMap& node_map, + const absl::flat_hash_set& feed_nodes, GraphDef* optimized_graph); + + RewriterConfig::Toggle opt_level_; + DeviceBase* cpu_device_; + LoopOptimizerOptions options_; + std::unique_ptr resource_mgr_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/memory_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/memory_optimizer.h new file mode 100644 index 00000000..e1274d93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/memory_optimizer.h @@ -0,0 +1,58 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_ + +#include +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Swap tensors in and out of device memory. +class MemoryOptimizer : public GraphOptimizer { + public: + // optimization_level: Controls the level of autonomy for the memory + // optimizer. See RewriterConfig::memory_optimization. 
+ // recomputation_targets_name_scope: Name scope for potential outputs of + // recomputations. See + // RewriterConfig::memory_optimizer_target_node_name_scope. + explicit MemoryOptimizer( + RewriterConfig::MemOptType optimization_level, + const string& recomputation_targets_name_scope = "gradients/") + : optimization_level_(optimization_level), + recomputation_targets_name_scope_(recomputation_targets_name_scope) {} + ~MemoryOptimizer() override {} + + string name() const override { return "memory_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* pruned_graph) override; + + private: + RewriterConfig::MemOptType optimization_level_; + string recomputation_targets_name_scope_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/meta_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/meta_optimizer.h new file mode 100644 index 00000000..74756553 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -0,0 +1,166 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/verifiers/graph_verifier.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" +#include "tensorflow/core/protobuf/verifier_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Run the other grappler optimizers based on the specified rewriter config. 
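// Illustrative sketch (not part of the upstream header): the rewriter config
// referred to above normally arrives inside the session ConfigProto, under
// ConfigProto.graph_options.rewrite_options. A hypothetical helper that builds
// one explicitly:
inline RewriterConfig ExampleRewriterConfig() {
  RewriterConfig rewriter;
  rewriter.set_constant_folding(RewriterConfig::ON);
  rewriter.set_arithmetic_optimization(RewriterConfig::ON);
  rewriter.set_memory_optimization(RewriterConfig::NO_MEM_OPT);
  return rewriter;
}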
+class MetaOptimizer : public GraphOptimizer { + public: + MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg); + ~MetaOptimizer() override = default; + + string name() const override { return "meta_optimizer"; }; + + bool UsesFunctionLibrary() const override { return true; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override { + GrapplerItem copy(item); + return OptimizeConsumeItem(cluster, std::move(copy), optimized_graph); + } + + absl::Status OptimizeConsumeItem(Cluster* cluster, GrapplerItem&& item, + GraphDef* optimized_graph); + + string GetResultString() const; + + void PrintResult(); + + private: + std::unique_ptr MakeNewOptimizer( + const string& optimizer, const std::set& device_types) const; + + // When grappler should lower control flow to V1 switch/merge style nodes. + bool LowerControlFlow() const; + + // Initialize active optimizers from RewriterConfig toggles. + absl::Status InitializeOptimizers( + const std::set& device_types, + std::vector>* optimizers) const; + // Initialize active optimizers from RewriterConfig optimizer names. + absl::Status InitializeOptimizersByName( + const std::set& device_types, + std::vector>* optimizers) const; + // Initialize active optimizers from RewriterConfig.custom_optimizers. + absl::Status InitializeCustomGraphOptimizers( + const std::set& device_types, + const std::set& pre_initialized_optimizers, + std::vector>* optimizers) const; + absl::Status InitializePluginGraphOptimizers( + const std::set& device_types, + std::vector>* optimizers) const; + // Returns the config for a custom graph optimizer. Null if none was found. + const RewriterConfig::CustomGraphOptimizer* GetCustomGraphOptimizerConfig( + const string& name) const; + + // Initialize active verifiers from the RewriterConfig toggles. + void InitializeVerifiers( + std::vector>* inter_optimizer_verifiers, + std::vector>* post_optimization_verifiers) + const; + + void PrintUserAndPluginConfigs(const std::set& device_types) const; + + // Run optimization pass over a single GrapplerItem. Meta optimizer might run + // multiple such passes: 1) for the main graph 2) for the function library + absl::Status OptimizeGraph( + const std::vector>& optimizers, + Cluster* cluster, GrapplerItem&& item, GraphDef* optimized_graph); + absl::Status OptimizeGraph(Cluster* cluster, GrapplerItem&& item, + GraphDef* optimized_graph); + + DeviceBase* const cpu_device_; // may be NULL + ConfigProto config_proto_; + RewriterConfig& cfg_; + bool xla_auto_clustering_on_; + + struct OptimizerResult { + string optimizer_name; + string message; + absl::Status status; + }; + + struct GraphOptimizationResult { + explicit GraphOptimizationResult(const string& id) : id(id) {} + string id; + std::vector results; + }; + + absl::Status RunOptimizer(GraphOptimizer* optimizer, Cluster* cluster, + GrapplerItem* optimized_item, + GraphDef* optimized_graph, + GraphOptimizationResult* optimization_result); + + std::vector optimization_results_; +}; + +bool MetaOptimizerEnabled(const ConfigProto& cfg); + +// Run the meta optimizer. +// +// If is non-null, it is the device to be used for executing ops +// during constant folding; if NULL, a new device is created for doing constant +// folding. For performance, it is recommended to pass in an existing cpu_device +// when possible. 
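// Illustrative call (not part of the upstream header); `item` is a populated
// GrapplerItem, and the null cluster/cpu_device arguments are only for brevity
// in this sketch:
//   ConfigProto config;
//   config.mutable_graph_options()->mutable_rewrite_options()
//       ->set_constant_folding(RewriterConfig::ON);
//   GraphDef optimized;
//   absl::Status status = RunMetaOptimizer(
//       std::move(item), config, /*cpu_device=*/nullptr, /*cluster=*/nullptr,
//       &optimized);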
+absl::Status RunMetaOptimizer(GrapplerItem&& item, const ConfigProto& cfg, + DeviceBase* cpu_device, Cluster* cluster, + GraphDef* optimized_graph); + +// Convenience wrapper around RunMetaOptimizer for optimizing +// function graphs. +// +// Runs grappler optimizations on `g` based on `config_proto`. +// `ret_node_names`: a vector of node names whose outputs are returned, +// aka fetches. When `g` represents a function, these are _Retval nodes. +// `lib`: function library to use with `g`. +// `device_set`: the set of devices that graph can refer to. +// `cpu_device`: the CPU device. +// `config_proto`: Grappler configuration. +// `grappler_item_id`: Grappler item id (e.g. optimized function name). +// `optimization_options`: Grappler optimization constraints that are known only +// at runtime. +// +// **g is a graph constructed based on the runtime library 'lib'. +// OptimizeGraph mutates **g extensively and replaces '*g' with a +// complete copy. Therefore, the caller should not keep any references +// to nodes in *g. +absl::Status OptimizeGraph( + std::vector ret_node_names, std::vector keep_node_names, + FunctionLibraryDefinition* lib, const DeviceSet& device_set, + Device* cpu_device, const ConfigProto& config_proto, + const string& grappler_item_id, + const GrapplerItem::OptimizationOptions& optimization_options, + std::unique_ptr* g); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/model_pruner.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/model_pruner.h new file mode 100644 index 00000000..668bb442 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/model_pruner.h @@ -0,0 +1,43 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// Prune a model to make it more efficient: +// * Remove unnecessary operations. +// * Optimize gradient computations.
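// Illustrative usage (not part of the upstream header); like every
// GraphOptimizer, the pruner is applied to a GrapplerItem and writes the
// result into a separate GraphDef. Passing a null cluster here is only for
// illustration:
//   ModelPruner pruner;
//   GraphDef pruned_graph;
//   absl::Status status =
//       pruner.Optimize(/*cluster=*/nullptr, item, &pruned_graph);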
+class ModelPruner : public GraphOptimizer { + public: + ModelPruner() {} + ~ModelPruner() override {} + + string name() const override { return "model_pruner"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h new file mode 100644 index 00000000..3cd1db08 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_PIN_TO_HOST_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_PIN_TO_HOST_OPTIMIZER_H_ + +#include +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { +namespace internal { +// Try and find an appropriate Host device in `devices` given `device`. +string TryFindHostDevice(const gtl::FlatSet& devices, + bool has_device_cpu, const string& device); +} // end namespace internal + +// Optimize TensorFlow ops that should be swapped into the CPU to avoid +// excessive cpu<->gpu memcpy/sync. +// +// TODO(williamchan): The current heuristic will swap any small integer Const to +// CPU. This may cause a problem cpu->cpu->gpu wherein the original behaviour of +// gpu->gpu->gpu may have been better/faster. We should probably fix this. +class PinToHostOptimizer : public GraphOptimizer { + public: + PinToHostOptimizer() {} + explicit PinToHostOptimizer(RewriterConfig::Toggle opt_level) {} + + ~PinToHostOptimizer() override {} + + string name() const override { return "pin_to_host_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_PIN_TO_HOST_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/remapper.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/remapper.h new file mode 100644 index 00000000..51332eeb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/remapper.h @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Optimize TF computations by remapping subgraphs/nodes onto other subgraphs or +// nodes to decrease the amount of operations needed to perform a computation. +class Remapper : public GraphOptimizer { + public: + explicit Remapper(RewriterConfig::Toggle opt_level, + RewriterConfig::CpuLayout cpu_layout_conversion = + RewriterConfig::NO_CONVERSION_ON_CPU, + bool xla_auto_clustering_on = false) + : opt_level_(opt_level), + cpu_layout_conversion_(cpu_layout_conversion), + xla_auto_clustering_on_(xla_auto_clustering_on) {} + + ~Remapper() override {} + + string name() const override { return "remapper"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + private: + RewriterConfig::Toggle opt_level_; + RewriterConfig::CpuLayout cpu_layout_conversion_; + bool xla_auto_clustering_on_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h new file mode 100644 index 00000000..1b50f148 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h @@ -0,0 +1,127 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +class Graph; + +namespace grappler { +class GraphProperties; +class NodeMap; +class ScopedAllocatorOptimizer; + +// An Optimizer that introduces ScopedAllocators in order to reduce data +// movement and consolidate some kinds of Ops. +class ScopedAllocatorOptimizer : public GraphOptimizer { + public: + ScopedAllocatorOptimizer(RewriterConfig::Toggle opt_level, + const ScopedAllocatorOptions& opts); + ~ScopedAllocatorOptimizer() override; + + string name() const override { return "scoped_allocator_optimizer"; } + + bool UsesFunctionLibrary() const override { return true; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + // Map from an Op name to a vector of Nodes with that Op. + typedef absl::flat_hash_map> DevOpOccurrences; + // Map from a device name to a DevOpOccurrences map. + typedef absl::flat_hash_map GraphOpOccurrences; + typedef absl::flat_hash_set OpNameSet; + + absl::Status ProcessGraphDef(GraphDef* graph, + const GraphProperties& graph_properties); + + // Populates *occs by grouping Nodes with common Ops, according to + // their assigned devices. + void FindOpOccurrences(GraphDef* graph, const OpNameSet& op_names, + GraphOpOccurrences* occs); + + // Returns a new, unused scope_id to be assigned to a ScopedAllocator that + // will allocate num_fields (> 0) separate tensors. + int NewScopedAllocatorId(int num_fields); + + // Returns a new, unused id to be assigned to an IdentityOp used in this graph + // rewrite. + absl::Status NewIdentityId(int* id); + + NodeMap* node_map() { return node_map_.get(); } + + const absl::flat_hash_set& repeated_outputs() { + return repeated_outputs_; + } + + // Appends values to the attr value under name in node_def, if present. + // If not present does an assignment. + static void ExtendNodeAttr(absl::string_view name, + const std::vector& values, + NodeDef* node_def); + + // Class that knows how to do graph rewriting for a particular kind of Op in + // order to take advantage of a ScopedAllocator. + class Rewriter { + public: + virtual ~Rewriter() {} + + virtual absl::Status Rewrite(ScopedAllocatorOptimizer* paopti, + int64_t invocation_count, GraphDef* graph, + const string& op_name, + const std::vector& nodes, + bool* applied) = 0; + + void SetGraphProperties(const GraphProperties& graph_properties) { + graph_properties_ = &graph_properties; + CHECK(graph_properties_); + } + + protected: + const GraphProperties* graph_properties_; + }; + + private: + Rewriter* GetRewriter(const string& op_name); + + absl::Status OrderNodeSet(std::vector* nodes) const; + + RewriterConfig::Toggle opt_level_; + std::unordered_set nodes_to_preserve_; + OpNameSet op_name_set_; + absl::flat_hash_map rewriters_; + std::vector to_delete_; + int next_sa_id_ = 1; + int next_identity_id_ = 1; + std::unique_ptr node_map_; + // Keeps track of outputs, i.e. 
a node and an output index, that are inputs to + // more than one op groups that are candidates for scoped allocator + // optimization. + absl::flat_hash_set repeated_outputs_; +}; + +} // namespace grappler +} // namespace tensorflow +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/shape_optimizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/shape_optimizer.h new file mode 100644 index 00000000..00679ca8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/shape_optimizer.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_ + +#include +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/frame.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +// Optimize TensorFlow subgraphs that operate on shape and shape related +// information. +class ShapeOptimizer : public GraphOptimizer { + public: + ShapeOptimizer() {} + explicit ShapeOptimizer(RewriterConfig::Toggle opt_level) {} + + ~ShapeOptimizer() override {} + + string name() const override { return "shape_optimizer"; }; + + bool UsesFunctionLibrary() const override { return false; } + + absl::Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/static_schedule.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/static_schedule.h new file mode 100644 index 00000000..b26ce381 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/static_schedule.h @@ -0,0 +1,50 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_ + +#include + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { +namespace grappler { + +// Compute the earliest time at which the execution of each node in the graph +// can complete. +// In our estimation, we ensure that each node takes at least one nanosecond to +// execute: therefore the execution times can be used to derive a topological +// ordering of the graph (at least as long as there is no loop in the graph). +absl::Status EstimateEarliestExecutionTimes( + const GrapplerItem& item, const Cluster* cluster, + std::unordered_map* execution_times); + +// Compute the time by which the execution of each node must complete to ensure +// the subsequent nodes can still be executed by the times predicted by the +// EstimateEarliestExecutionTimes function. +absl::Status EstimateRequiredTimes( + const GrapplerItem& item, const Cluster* cluster, + const std::unordered_map& + execution_times, + std::unordered_map* required_times); + +} // namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_optimizer_hook.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_optimizer_hook.h new file mode 100644 index 00000000..58872497 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_optimizer_hook.h @@ -0,0 +1,65 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_OPTIMIZER_HOOK_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_OPTIMIZER_HOOK_H_ + +#include +#include + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" + +namespace mlir { +class PassManager; + +namespace tfg { + +// A function that builds the TFG pass pipeline. +using TFGPassPipelineBuilder = std::function; + +// This class implements a Grappler optimizer wrapping a pipeline of passes +// implemented with TFG. +class TFGGrapplerOptimizer : public tensorflow::grappler::GraphOptimizer { + public: + // Constructs a TFG optimizer using the provided pipeline builder. By default, + // the optimizer will not use multi-threading. If `num_tfg_threads` is + // non-zero, then TFG will use threading with the specified number of threads. + explicit TFGGrapplerOptimizer(TFGPassPipelineBuilder builder, + unsigned num_tfg_threads = 0); + // Explicit destructor to defer instantiation of Impl. 
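// Illustrative construction (not part of the upstream header), pairing the
// hook with the default pipeline declared in tfg_passes_builder.h and keeping
// TFG single-threaded; the variable name is hypothetical:
//   TFGGrapplerOptimizer tfg_optimizer(
//       [](mlir::PassManager& manager) { DefaultGrapplerPipeline(manager); },
//       /*num_tfg_threads=*/0);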
+ ~TFGGrapplerOptimizer() override; + + // Constructs a name for the optimizer using the registered passes. + std::string name() const override; + // The TFG optimizer requires access to the function library. + bool UsesFunctionLibrary() const override { return true; } + + // Runs the optimizer on the GraphDef. The optimizer converts the GraphDef to + // TFG using the importer, runs the passes on the MLIR, and exports back to + // GraphDef. The result is stored in `optimized_graph`. + absl::Status Optimize(tensorflow::grappler::Cluster* cluster, + const tensorflow::grappler::GrapplerItem& item, + tensorflow::GraphDef* optimized_graph) override; + + private: + // Hide the implementation details. + class Impl; + std::unique_ptr impl_; +}; + +} // end namespace tfg +} // end namespace mlir + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_OPTIMIZER_HOOK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_passes_builder.h b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_passes_builder.h new file mode 100644 index 00000000..4aee20b7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/optimizers/tfg_passes_builder.h @@ -0,0 +1,38 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_PASSES_BUILDER_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_PASSES_BUILDER_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace mlir { +namespace tfg { + +// Constructs the default graph/function-level TFG pass pipeline. +void DefaultGrapplerPipeline(PassManager& manager); + +// Constructs the default module-level TFG pass pipeline. +void DefaultModuleGrapplerPipeline(PassManager& manager, + const tensorflow::RewriterConfig& config); + +// Constructs the Remapper pass pipeline. +void RemapperPassBuilder(PassManager& manager); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_TFG_PASSES_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils.h new file mode 100644 index 00000000..e437ebe0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils.h @@ -0,0 +1,440 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/container/node_hash_map.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +// Utilities for manipulating node name and input strings. + +// Returns the trailing position number (or zero if no number is present) if +// NodeName(input_name) is equal to node_name. Returns -1 for control inputs. +// Returns -2 if input_name is empty or NodeName(input_name) is not equal to +// node_name. +inline int NodePositionIfSameNode(absl::string_view input_name, + absl::string_view node_name) { + bool is_control = absl::StartsWith(input_name, "^"); + if (is_control) input_name.remove_prefix(1); + if (input_name.empty() || node_name.empty() || + input_name.size() < node_name.size()) { + return -2; + } + TensorId id = ParseTensorName(input_name); + if (id.first != node_name) return -2; + if (is_control) return -1; + return id.second; +} + +// Returns the node name and position in a single call. +inline absl::string_view ParseNodeNameAsStringPiece(absl::string_view name, + int* position) { + const bool is_control = absl::StartsWith(name, "^"); + TensorId id = ParseTensorName(name); + if (position) { + *position = is_control ? -1 : id.second; + } + if (is_control && id.second >= 0) { + id.first.remove_prefix(1); + } + return id.first; +} + +// Returns the node name and position in a single call. +inline string ParseNodeName(const string& name, int* position) { + return string(ParseNodeNameAsStringPiece(name, position)); +} + +// Return the node name corresponding to 'name' if name is valid, or the empty +// string otherwise. +inline absl::string_view NodeNameAsStringPiece(const string& name) { + return ParseNodeNameAsStringPiece(name, nullptr); +} + +// Return the node name corresponding to 'name' if name is valid, or the empty +// string otherwise. +inline string NodeName(const string& name) { + return string(NodeNameAsStringPiece(name)); +} + +inline int NodePosition(const string& name) { + int position; + ParseNodeNameAsStringPiece(name, &position); + return position; +} + +namespace internal { +// Base template class for NodeMap and ImmutableNodeMap. +template +class NodeMapInternal { + public: + // Note: The NodeMap will store pointers to nodes in graph, which may become + // invalid if graph is changed. 
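The name-parsing helpers above encode Grappler's input-string conventions ("node", "node:port", "^node" for control inputs). A small test-style sketch of the expected results, not part of the header:

// Illustrative expectations for the parsing helpers.
#include <string>

#include "tensorflow/core/grappler/utils.h"

namespace example {

void NameParsingExamples() {
  using tensorflow::grappler::NodeName;
  using tensorflow::grappler::NodePosition;
  using tensorflow::grappler::NodePositionIfSameNode;

  std::string node = NodeName("add:2");                // "add"
  int port = NodePosition("add:2");                    // 2
  int ctrl = NodePosition("^add");                     // -1 (control input)
  int mismatch = NodePositionIfSameNode("mul:0", "add");  // -2 (other node)
  (void)node; (void)port; (void)ctrl; (void)mismatch;
}

}  // namespace example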
+ explicit NodeMapInternal(GraphDefT* graph) { + if (graph == nullptr) { + LOG(WARNING) << "NodeMapInternal constructor is called with a nullptr!"; + return; + } + nodes_.reserve(graph->node_size()); + outputs_.reserve(graph->node_size()); + for (int i = 0; i < graph->node_size(); i++) { + NodeDefT* node = GetNodeDefFromGraph(graph, i); + const string& node_name = node->name(); + auto rslt = nodes_.emplace(node_name, node); + // Check that the graph doesn't contain multiple nodes with the same name. + if (!rslt.second) { + // The first node found with a given name becomes the canonical. + LOG(WARNING) << "Duplicated node in the graph: " << node_name; + } + NodeDefT* canonical = rslt.second ? node : rslt.first->second; + for (const auto& input : node->input()) { + outputs_[NodeName(input)].insert(canonical); + } + } + } + + // Get unordered list of fanouts from node. Notice, that the order is + // non-deterministic. + const absl::flat_hash_set& GetOutputs( + const string& node_name) const { + auto it = outputs_.find(node_name); + if (it == outputs_.end()) { + return empty_set_; + } + return it->second; + } + + // Get fanouts ordered by name. + std::vector GetOutputsOrderedByNodeName( + const string& node_name) const { + std::vector result; + auto it = outputs_.find(node_name); + if (it != outputs_.end()) { + const absl::flat_hash_set& outputs = it->second; + result.reserve(outputs.size()); + result.assign(outputs.begin(), outputs.end()); + std::sort(result.begin(), result.end(), + [](const NodeDef* n1, const NodeDef* n2) { + return n1->name() < n2->name(); + }); + } + return result; + } + + // This method doesn't record the outputs of the added node; the outputs need + // to be explicitly added by the AddOutput method. + void AddNode(const string& node_name, NodeDefT* node) { + DCHECK(node != nullptr); + auto ret = nodes_.emplace(node_name, node); + DCHECK(ret.second) + << "Pair (" << node_name << "," << node + << ") is not inserted because the same key already exists."; + } + + void RemoveNode(const string& name) { + nodes_.erase(NodeName(name)); + outputs_.erase(NodeName(name)); + } + + NodeDefT* GetNode(const string& name) const { + const string node_name = NodeName(name); + auto it = nodes_.find(node_name); + if (it == nodes_.end()) { + VLOG(1) << "Node could not be found: " << name; + return nullptr; + } + return it->second; + } + + bool NodeExists(const string& name) const { + const string node_name = NodeName(name); + return nodes_.find(node_name) != nodes_.end(); + } + + void AddOutput(const string& node_name, const string& output_name) { + auto output_node = nodes_[NodeName(output_name)]; + DCHECK(output_node) << "Output node " << output_name + << " is missing in NodeMap."; + outputs_[node_name].insert(output_node); + } + + void RemoveOutput(const string& node_name, const string& output_name) { + outputs_[node_name].erase(nodes_[NodeName(output_name)]); + } + + void UpdateInput(const string& node_name, const string& old_input_name, + const string& new_input_name) { + RemoveOutput(NodeName(old_input_name), node_name); + AddOutput(NodeName(new_input_name), node_name); + } + + void RemoveInputs(const string& node_name) { + auto node = nodes_[node_name]; + for (const auto& input : node->input()) { + RemoveOutput(NodeName(input), node->name()); + } + } + + void RemoveOutputs(const string& node_name) { outputs_.erase(node_name); } + + void UpdateOutput(const string& node_name, const string& old_output_name, + const string& new_output_name) { + absl::flat_hash_set& outputs = 
outputs_[node_name]; + outputs.erase(nodes_[NodeName(old_output_name)]); + outputs.insert(nodes_[NodeName(new_output_name)]); + } + + private: + // Helper method to get the NodeDef pointer of i-th node in a graph. + inline NodeDefT* GetNodeDefFromGraph(GraphDefT* graph, int64_t i) const; + + const absl::flat_hash_set empty_set_; + absl::node_hash_map nodes_; + absl::node_hash_map> outputs_; +}; + +// Specialized template class method GetNodeDefFromGraph. +template <> +inline NodeDef* NodeMapInternal::GetNodeDefFromGraph( + GraphDef* graph, int64_t i) const { + return graph->mutable_node(i); +} + +template <> +inline const NodeDef* +NodeMapInternal::GetNodeDefFromGraph( + const GraphDef* graph, int64_t i) const { + return &graph->node(i); +} +} // namespace internal + +// A utility class to lookup a node and its outputs by node name. +class NodeMap : public internal::NodeMapInternal { + public: + explicit NodeMap(GraphDef* graph) : NodeMapInternal(graph) {} +}; + +// Same to NodeMap, but uses const GraphDef. +class ImmutableNodeMap + : public internal::NodeMapInternal { + public: + explicit ImmutableNodeMap(const GraphDef* graph) : NodeMapInternal(graph) {} +}; + +// A vector with a set. The set stores the same elements as the vector, and +// quickly answers whether a value is in the vector. Duplicated elements are not +// allowed for now. +template > +class SetVector { + public: + // Returns false if value already existed in the set, true otherwise. + bool PushBack(const T& value) { + if (!set_.insert(value).second) { + return false; + } + vector_.push_back(value); + return true; + } + + T PopBack() { + T back = vector_.back(); + set_.erase(back); + vector_.pop_back(); + return back; + } + + bool Exists(const T& value) const { return set_.find(value) != set_.end(); } + + bool Empty() const { return vector_.empty(); } + + void Reserve(int64_t size) { vector_.reserve(size); } + + private: + gtl::FlatSet set_; + std::vector vector_; +}; + +// Returns formatted string from TensorId specific to grappler. Specifically, +// for the 0 port (first output), only the node name is returned. +string TensorIdToString(const TensorId& tensor_id); + +// Returns formatted string from SafeTensorId specific to grappler. +// Specifically, for the 0 port (first output), only the node name is returned. +string SafeTensorIdToString(const SafeTensorId& tensor_id); + +// True iff 'name' refers to a control inputs, i.e. a node name prefixed with +// the ^ character. +bool IsControlInput(absl::string_view name); + +// True iff tensor index refers to a control input. +bool IsControlInput(const TensorId& tensor_id); + +// True iff 'name1' and 'name2' refer to the same input. +bool IsSameInput(const string& name1, const string& name2); + + +// Add a prefix to a node name with a custom delimiter. +string AddPrefixToNodeName(const string& name, const string& prefix, + const string& delimiter); + +// Add a prefix to a node name. +string AddPrefixToNodeName(const string& name, const string& prefix); + +// Executes a 'fn' in the 'thread_pool'. The method waits for the configured +// timeout (in milliseconds) for 'fn' to complete, before returning false. +// +// If returning false, the 'fn' may still continue to execute in the +// thread-pool. It is the responsibility of the caller to reset the thread-pool +// as appropriate. 
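A brief usage sketch for NodeMap and SetVector declared above (not part of the header). It assumes `graph` is a live GraphDef owned by the caller; note that NodeMap stores raw NodeDef pointers that are invalidated if the graph is mutated.

// Illustrative sketch: look up a node, enumerate its consumers, and use
// SetVector as a duplicate-rejecting work list.
#include <string>

#include "absl/log/log.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/grappler/utils.h"

namespace example {

void InspectNode(tensorflow::GraphDef* graph, const std::string& name) {
  tensorflow::grappler::NodeMap node_map(graph);
  tensorflow::NodeDef* node = node_map.GetNode(name);  // nullptr if absent.
  if (node == nullptr) return;

  // Fanouts are reported per consuming node (deduplicated), not per edge.
  for (tensorflow::NodeDef* fanout :
       node_map.GetOutputsOrderedByNodeName(node->name())) {
    LOG(INFO) << node->name() << " feeds " << fanout->name();
  }

  tensorflow::grappler::SetVector<tensorflow::NodeDef*> work_list;
  work_list.PushBack(node);  // true: newly inserted.
  work_list.PushBack(node);  // false: duplicates are rejected while queued.
}

}  // namespace example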
+bool ExecuteWithTimeout(std::function fn, int64_t timeout_in_ms, + thread::ThreadPool* thread_pool); + +// Returns the node name prefixed with conventional symbol '^' +// for control dependency, given a NodeDef. +string AsControlDependency(const NodeDef& node); + +// Returns the node name prefixed with conventional symbol '^' +// for control dependency, given a node name +string AsControlDependency(const string& node); + +// Returns true if the node is assigned to run on CPU device. +bool NodeIsOnCpu(const NodeDef* node); + +// Returns true if the node is assigned to run on GPU device. +bool NodeIsOnGpu(const NodeDef* node); + +// Returns the number of outputs of a node according to its OpDef. Note that +// some of the outputs may be unconnected. +int NumOutputs(const NodeDef& node, GraphDef* graph); + +// Returns true iff the node has at least one control input. +bool HasControlInputs(const NodeDef& node); + +// Returns true iff the node has at least one regular input. +bool HasRegularInputs(const NodeDef& node); + +// Returns true iff the node has at least one regular output. +bool HasRegularOutputs(const NodeDef& node, const NodeMap& node_map); + +// Returns true iff the node has at least one control output. +bool HasControlOutputs(const NodeDef& node, const NodeMap& node_map); + +// Number of connected control inputs. +int NumControlInputs(const NodeDef& node); + +// Number of connected non-control inputs. +int NumNonControlInputs(const NodeDef& node); + +// Number of connected control outputs. +int NumControlOutputs(const NodeDef& node, const NodeMap& node_map); + +// Number of connected non-control outputs. +int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map); + +// Number of connected non-control data outputs (Ops that consume output tensor +// data, not just it's shape). +int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map); + +// Removes redundant control inputs from node. +void DedupControlInputs(NodeDef* node); + +// Returns an error if an attribute with the given key does not exist in node. +absl::Status CheckAttrExists(const NodeDef& node, const string& key); + +// Returns an error if attributes with the given keys do not exist in node. +absl::Status CheckAttrsExist(const NodeDef& node, + absl::Span keys); + +// Returns the data type in attribute `attr_name` of `node`. If that attribute +// doesn't exist, returns DT_INVALID. +DataType GetDataTypeFromAttr(const NodeDef& node, const string& type_attr); + +// Returns the last node in the simple chain starting at source and traversing +// through the input(0) edge from each node as long as the next node satisfies +// the predicate given in pred_fn. If no nodes satisfy the predicate, &source +// will be returned. Example: For the chain +// source <- a <- b <- ... <- y <- z +// where +// pred_fn(a) = pred_fn(b) = ... = pred_fn(y) = true, +// pred_fn(z) = false, +// the return value will be a pointer to y. +NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map, + bool follow_control_input, + const std::function& pred_fn); + +// Permute the nodes of graph in place according to the permutation. +void PermuteNodesInPlace(GraphDef* graph, std::vector* permutation, + bool invert_permutation); + +// Returns OkStatus() if a kernel is registered for node.op() on the device +// type corresponding to node.device(). 
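GetTailOfChain, declared above, is the usual way optimizers skip over trivial producer chains. A hedged sketch (not part of the header); the predicate type is assumed to be std::function<bool(const NodeDef&)> as in upstream TensorFlow:

// Illustrative sketch: walk input(0) edges past consecutive Identity nodes.
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/grappler/utils.h"

namespace example {

tensorflow::NodeDef* SkipIdentityChain(
    const tensorflow::NodeDef& start,
    const tensorflow::grappler::NodeMap& node_map) {
  // Follows input(0) while the predicate holds; if no producer satisfies it,
  // the address of `start` is returned (see the comment on GetTailOfChain).
  return tensorflow::grappler::GetTailOfChain(
      start, node_map, /*follow_control_input=*/false,
      [](const tensorflow::NodeDef& node) { return node.op() == "Identity"; });
}

}  // namespace example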
+absl::Status IsKernelRegisteredForNode( + absl::string_view node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + absl::string_view node_op, absl::string_view node_device, + AttrSlice node_attrs); +absl::Status IsKernelRegisteredForNode(const NodeDef& node); + +absl::Status SetTensorValue(DataType dtype, int value, Tensor* tensor); + +void EraseNodesFromGraph(const std::set& nodes_to_delete, GraphDef* graph); + +void EraseNodesFromGraph(std::vector&& nodes_to_delete, GraphDef* graph); + +void EraseNodesFromGraph(const std::set& nodes_to_delete, + GraphDef* graph); + +// Erase all attributes without leading underscore. Returns the number of +// attributes erased. +int EraseRegularNodeAttributes(NodeDef* node); + +// Erase attribute "_xla_inferred_shapes" as well as all attributes starting in +// "_output_". +int EraseNodeOutputAttributes(NodeDef* node); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/canonicalizer.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/canonicalizer.h new file mode 100644 index 00000000..a913fc25 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/canonicalizer.h @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// Canonicalizes node by performing the following steps +// - sorting control inputs, +// - sorting data inputs if the node represents a commutative op. +void CanonicalizeNode(NodeDef* node); + +// Canonicalizes all nodes in graph. +void CanonicalizeGraph(GraphDef* graph); + +// Compresses Const and HostConstant nodes in the graph to the smallest +// representation possible, either +// a) truncated repeated field representation, or +// b) raw serialized byte format. +// Each node is only modified if it is larger than 64 bytes and compression +// reduces its size by more than 50%. +void CompressConstants(GraphDef* graph); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/colocation.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/colocation.h new file mode 100644 index 00000000..6062db61 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/colocation.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
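The canonicalizer above is typically run before structural comparison or hashing of graphs. A minimal sketch, not part of the header; `graph` is a placeholder:

// Illustrative sketch: normalize input ordering and shrink large constants.
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/utils/canonicalizer.h"

namespace example {

void PrepareForComparison(tensorflow::GraphDef* graph) {
  // Sorts control inputs (and data inputs of commutative ops) on every node.
  tensorflow::grappler::CanonicalizeGraph(graph);
  // Re-encodes Const/HostConst nodes >64 bytes when it saves more than 50%.
  tensorflow::grappler::CompressConstants(graph);
}

}  // namespace example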
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_ + +#include +#include "tensorflow/core/framework/graph.pb.h" + +namespace tensorflow { +namespace grappler { + +// Evaluates the colocation relation in the graph and rewrites the new +// colocation relation in the graph. We scan the graph nodes sequentially, and +// builds a disjoint-sets of nodes (within each disjoint-set the nodes are +// colocated with each other). We then select the root node of each set as a +// representative node, and then colocate each node within the set (should also +// exist in graph) with the representative node. +// Note that there is current one situation this function can't handle: +// Node A colocates with X, node B colocates with Y, X colocates with Y but +// X, Y are removed from graph. In this case we can't know A colocates with B. +void ReassignColocation(GraphDef* graph); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/frame.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/frame.h new file mode 100644 index 00000000..d66cfb58 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/frame.h @@ -0,0 +1,75 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/utils/graph_view.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// FrameView is a helper class that allows to find in what execution frames (if +// any) the given node can be running in. It's constructed from an immutable +// GraphView, and any modification of the underlying graph might invalidate it. +// +// All execution frames assigned a unique integer id, but they do not have any +// meaning whatsoever, it's just a sequence number. +// +// See the paper "Dynamic Control Flow in Large-Scale Machine Learning" for +// detailed explanation of execution frames (https://arxiv.org/abs/1805.01772). 
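ReassignColocation, declared above, collapses colocation classes onto a single surviving representative, which matters after passes delete nodes that other nodes were colocated with. A one-call sketch, not part of the header:

// Illustrative sketch: repair colocation attributes after pruning.
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/utils/colocation.h"

namespace example {

void FixUpColocationAfterPruning(tensorflow::GraphDef* optimized_graph) {
  // Builds disjoint sets of colocated nodes and re-points each member at a
  // representative that still exists in the graph.
  tensorflow::grappler::ReassignColocation(optimized_graph);
}

}  // namespace example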
+class FrameView { + public: + FrameView() : is_inferred_(false), num_frames_(0) {} + + // Infers nodes execution frames from the GraphView. Returns an error if + // called multiple times. + absl::Status InferFromGraphView(const utils::GraphView& graph_view); + // Infers nodes execution frames from the MutableGraphView. Returns an error + // if called multiple times. + absl::Status InferFromGraphView(const utils::MutableGraphView& graph_view); + // Infers nodes execution by constructing temporary GraphView and passing it + // to InferFromGraphView. + absl::Status InferFromGraph(const GraphDef& graph); + + // Returns all frames of the given node (denoted by their frame ids) in + // outermost-to-innermost order. + const std::vector& Frames(const NodeDef& node) const; + + // Returns true iff the node is at least in one execution frame. + bool IsInFrame(const NodeDef& node) const; + + int num_frames() const { return num_frames_; } + bool is_inferred() const { return is_inferred_; } + + private: + template + inline absl::Status InferFromGraphViewT(const GraphViewT& graph_view); + + bool is_inferred_; // true if it was inferred from the graph + int num_frames_; // number of frames present in a graph + absl::flat_hash_map> node_to_frames_; + + // We return a reference to this vector if node has no frames. + const std::vector node_has_no_frames_; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/functions.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/functions.h new file mode 100644 index 00000000..0006a260 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/functions.h @@ -0,0 +1,190 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/container/inlined_vector.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace tensorflow { +namespace grappler { + +// Function input argument instantiated into an '_Arg' node in the function body +// graph, with an 'index' attribute corresponding to the input position. 
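A short usage sketch for FrameView (not part of the header), showing the common pattern of inferring frames once and then querying per node:

// Illustrative sketch: count nodes that execute inside control-flow frames.
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/utils/frame.h"

namespace example {

int CountNodesInsideFrames(const tensorflow::GraphDef& graph) {
  tensorflow::grappler::FrameView frames;
  if (!frames.InferFromGraph(graph).ok()) return -1;
  int count = 0;
  for (const tensorflow::NodeDef& node : graph.node()) {
    if (frames.IsInFrame(node)) ++count;  // node is in at least one frame
  }
  return count;
}

}  // namespace example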
+struct InputArgInstantiation { + InputArgInstantiation(string node_name, DataType data_type) + : node_name(std::move(node_name)), data_type(data_type) {} + string node_name; + DataType data_type; +}; + +// Function output instantiated into a '_Retval' node in the function body +// graph, with an 'index' attribute corresponding to the output position. +struct OutputArgInstantiation { + OutputArgInstantiation(string node_name, DataType data_type) + : node_name(std::move(node_name)), data_type(data_type) {} + string node_name; + DataType data_type; +}; + +// A mapping from control output name to node name in function body graph. +struct ControlOutput { + string output_name; + string node_name; + bool operator<(const ControlOutput& a) const { + return output_name < a.output_name; + } +}; + +// A special case of GrapplerItem, constructed from a TensorFlow Function. +class GrapplerFunctionItem : public GrapplerItem { + public: + GrapplerFunctionItem() = default; + + const string& description() const; + + const std::vector& inputs() const; + const InputArgInstantiation& input(int i) const; + const std::size_t input_size() const; + + const std::vector& outputs() const; + const OutputArgInstantiation& output(int i) const; + const std::size_t output_size() const; + + const std::vector& control_outputs() const; + const std::size_t control_output_size() const; + + const AttrSlice& func_attr() const; + const std::vector& arg_attr() const; + const GraphDef& function_body() const; + GraphDef& mutable_function_body(); + + bool is_stateful() const; + + GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other); + + private: + friend absl::Status MakeGrapplerFunctionItem(const FunctionDef&, + const AttrSlice&, + const FunctionLibraryDefinition&, + int, GrapplerFunctionItem*); + friend absl::Status ReplaceInputWithConst(const NodeDef&, int, + GrapplerFunctionItem*); + friend absl::Status RemoveFunctionOutputs(const absl::flat_hash_set&, + GrapplerFunctionItem*, + std::vector>*); + + GrapplerFunctionItem(string func_name, string description, + AttrSlice func_attr, + std::vector arg_attr, + std::vector input_args, + std::vector output_args, + std::vector control_outputs, + int graph_def_version, bool is_stateful, + GraphDef&& function_body); + + string description_; + AttrSlice func_attr_; // Attributes specific to function definition that + // produced this item (FuncDef.attr field). + + // Attributes of function arguments + std::vector arg_attr_; + + std::vector input_args_; + std::vector output_args_; + std::vector control_outputs_; + + bool is_stateful_ = false; +}; + +// Check if function input/output types are fully defined only at instantiation +// time (parametrized by its instantiation node). +bool HasParametrizedType(const FunctionDef& func); + +// Check if a function body is parametrized by its instantiation node. Function +// body is parametrized, if it has at least one node with a 'placeholder' +// attribute. +bool HasParametrizedBody(const FunctionDef& func); + +// Check if function has parametrized type or body. +bool IsParametrized(const FunctionDef& func); + +// Resolve function instantiation type parameters from the attributes of the +// caller node. Return error if type can't be resolved. +absl::Status InstantiationTypeParameters( + const FunctionDef& func, const AttrSlice& func_instantiation_attr, + absl::flat_hash_map* type_parameters); + +// Resolve function instantiation body parameters (values for the function body +// attr placeholders) from the attributes of the caller node. 
Return error if +// type can't be resolved. +absl::Status InstantiationBodyParameters( + const FunctionDef& func, const AttrSlice& func_instantiation_attr, + absl::flat_hash_map* body_parameters); + +// Replace one of the function inputs with a constant. +absl::Status ReplaceInputWithConst(const NodeDef& input_const, int input_index, + GrapplerFunctionItem* item); + +// Removes outputs from instantiated grappler function item. For all active +// function outputs that changed its output index, this function adds an output +// mapping (std::pair). +absl::Status RemoveFunctionOutputs( + const absl::flat_hash_set& remove_outputs, GrapplerFunctionItem* item, + std::vector>* output_mapping); + +// TODO(ezhulenev, b/120103818): Add RemoveFunctionInputs. + +// Make a GrapplerFunctionItem from the function definition and function +// instantiation attributes (caller node attributes). Returns error if the given +// function def cannot be converted (e.g. not all attributes are defined). +absl::Status MakeGrapplerFunctionItem(const FunctionDef& func, + const AttrSlice& func_instantiation_attr, + const FunctionLibraryDefinition& flib, + int graph_def_version, + GrapplerFunctionItem* item); + +// Make a GrapplerFunction item from the function definition. Function must be +// fully defined (no type or body parametrization). +// TODO(ezhulenev): Support parametrized functions without fully defined +// instantiation attributes? Do we ever want to optimize parametrized function +// without specializing it to its instantiation attributes (at least types)? +absl::Status MakeGrapplerFunctionItem(const FunctionDef& func, + const FunctionLibraryDefinition& flib, + int graph_def_version, + GrapplerFunctionItem* item); + +// Make a FunctionDef from the GrapplerFunctionItem. Use function library +// definition to lookup function body nodes output names and ranges. +absl::Status MakeFunctionDef(const GrapplerFunctionItem& item, + const FunctionLibraryDefinition& flib, + FunctionDef* func); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view.h new file mode 100644 index 00000000..3398e338 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view.h @@ -0,0 +1,541 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
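The GrapplerFunctionItem machinery above is what lets function bodies be optimized as ordinary GraphDefs. A hedged round-trip sketch (not part of the header); error handling is deliberately minimal:

// Illustrative sketch: FunctionDef -> GrapplerFunctionItem -> FunctionDef.
#include "absl/status/status.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/grappler/utils/functions.h"

namespace example {

absl::Status RoundTripFunction(
    const tensorflow::FunctionDef& func,
    const tensorflow::FunctionLibraryDefinition& flib, int graph_def_version,
    tensorflow::FunctionDef* specialized) {
  // Only fully instantiated functions can use the flib-only overload.
  if (tensorflow::grappler::IsParametrized(func)) {
    return absl::InvalidArgumentError("function must be fully instantiated");
  }
  tensorflow::grappler::GrapplerFunctionItem item;
  absl::Status status = tensorflow::grappler::MakeGrapplerFunctionItem(
      func, flib, graph_def_version, &item);
  if (!status.ok()) return status;
  // item.mutable_function_body() is a plain GraphDef; a Grappler pass could
  // be run on it at this point before converting back.
  return tensorflow::grappler::MakeFunctionDef(item, flib, specialized);
}

}  // namespace example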
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/grappler/utils/graph_view_internal.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { +namespace utils { + +class NodeView; + +class GraphView; + +// FaninView is a helper class to represent fanouts of a node. This holds a +// pointer to GraphView, the index of the node being represented from GraphView, +// and the input index (hence is labeled as Fanin). +class FaninView : public internal::NodeIndexAndPortIndex { + public: + FaninView() : NodeIndexAndPortIndex() {} + + FaninView(GraphView* graph_view, int node_index, int port_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index) {} + + FaninView(NodeView* node_view, int index); + + private: + friend class NodeView; + friend class GraphView; +}; + +// FanoutView is a helper class to represent fanins of a node. This holds a +// pointer to GraphView, the index of the node being represented from GraphView, +// and the output index (hence is labeled as Fanout). +class FanoutView : public internal::NodeIndexAndPortIndex { + public: + FanoutView() : NodeIndexAndPortIndex() {} + + FanoutView(GraphView* graph_view, int node_index, int port_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index) {} + + FanoutView(NodeView* node_view, int index); + + private: + friend class NodeView; + friend class GraphView; +}; + +// Immutable NodeView that keeps the constness of the NodeDef. This allows for +// lookups of fanins and fanouts, and traversals of the graph, but no mutations. +// No dedupping of fanins will be performed on the node to preserve it's +// constness. +class NodeView : public internal::NodeViewInternal { + public: + explicit NodeView(GraphView* graph_view, int node_index) + : NodeViewInternal(graph_view, node_index) {} + + NodeView() : NodeViewInternal() {} + + ~NodeView() override = default; + + NodeView(NodeView&&) = default; + NodeView& operator=(NodeView&&) = default; + + const NodeDef* node() const override; + + // Checks if a fanin exists for the node. + bool HasFanin(const FanoutView& fanin) const override; + + // Checks if a fanout exists for the node. + bool HasFanout(const FaninView& fanout) const override; + + private: + inline const FanoutView& GetMissingFanin() const override; + + inline const std::vector& GetMissingFanout() const override; + + absl::flat_hash_set fanins_set_; + + friend class FaninView; + friend class FanoutView; + friend class GraphView; +}; + +// Immutable GraphView that keeps the constness of the GraphDef. This allows +// for lookups and traversals of the graph, but no mutations. 
+class GraphView : public internal::GraphViewInternal { + public: + explicit GraphView(const GraphDef* graph, absl::Status* status); + ~GraphView() override = default; + + private: + bool AddUniqueNodeInternal(const NodeDef* node); + + absl::Status CheckAndAddFaninsInternal(NodeView* node_view); + + friend class NodeView; +}; + +class MutableNodeView; + +class MutableGraphView; + +class Mutation; + +// MutableFaninView is a helper class to represent fanouts of a node. This holds +// a pointer to MutableGraphView, the index of the node from MutableGraphView +// being mutated, and the input index (hence is labeled as Fanin). +class MutableFaninView + : public internal::NodeIndexAndPortIndex { + public: + MutableFaninView() : NodeIndexAndPortIndex() {} + + MutableFaninView(MutableGraphView* graph_view, int node_index, int port_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index) {} + + explicit MutableFaninView(MutableGraphView* graph_view, int node_index, + int port_index, int fanin_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index), + fanin_index_(fanin_index) { + // TODO(lyandy): Remove once constructor is not public. + DCHECK(port_index < 0 || port_index == fanin_index); + } + + MutableFaninView(MutableNodeView* node_view, int index); + + private: + // Index of associated fanin in fanout's underlying MutableNodeView. For + // regular fanouts, this will be the same as port_index (index of the + // associated fanin in MutableNodeView::regular_fanins_). For controlled + // fanouts, this will be the index of the associated fanin in + // MutableNodeView::controlling_fanins_. + int fanin_index_ = internal::kMissingIndex; + + friend class MutableNodeView; + friend class MutableGraphView; + friend class Mutation; +}; + +// MutableFanoutView is a helper class to represent fanins of a node. This holds +// a pointer to MutableGraphView, the index of the node from MutableGraphView +// being mutated, and the output index (hence is labeled as Fanout). +class MutableFanoutView + : public internal::NodeIndexAndPortIndex { + public: + MutableFanoutView() : NodeIndexAndPortIndex() {} + + MutableFanoutView(MutableGraphView* graph_view, int node_index, + int port_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index) {} + + explicit MutableFanoutView(MutableGraphView* graph_view, int node_index, + int port_index, int fanout_index) + : NodeIndexAndPortIndex(graph_view, node_index, port_index), + fanout_index_(fanout_index) {} + + MutableFanoutView(MutableNodeView* node_view, int index); + + private: + // Index of associated fanout in fanin's underlying MutableNodeView. For + // regular fanins, this will be the index of the associated fanout in + // MutableNodeView::regular_fanouts_by_port_[port_index]. For controlled + // fanins, this will be the index of the associated fanout in + // MutableNodeView::controlled_fanouts_. + int fanout_index_ = internal::kMissingIndex; + + friend class MutableNodeView; + friend class MutableGraphView; + friend class Mutation; +}; + +// Mutable NodeView that holds a mutable NodeDef. This allows for lookups of +// fanins and fanouts, and traversals of the graph. Control dependencies will be +// dedupped among other control dependencies on initialization via +// MutableGraphView. Mutations should be handled via MutableGraphView and not +// directly on the mutable NodeDef. 
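A read-only traversal sketch for utils::GraphView (not part of the header). It assumes the caller keeps the GraphDef alive and unmodified for the lifetime of the view:

// Illustrative sketch: construct an immutable view and query a node's fanouts.
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/utils/graph_view.h"

namespace example {

int CountRegularFanoutsOf(const tensorflow::GraphDef& graph,
                          absl::string_view node_name) {
  absl::Status status;
  tensorflow::grappler::utils::GraphView view(&graph, &status);
  if (!status.ok()) return -1;  // e.g. malformed fanins in the graph
  const tensorflow::grappler::utils::NodeView* node = view.GetNode(node_name);
  if (node == nullptr) return -1;
  return node->NumRegularFanouts();
}

}  // namespace example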
+class MutableNodeView + : public internal::NodeViewInternal { + public: + explicit MutableNodeView(MutableGraphView* graph_view, int node_index) + : NodeViewInternal(graph_view, node_index) {} + + MutableNodeView() : NodeViewInternal() {} + + ~MutableNodeView() override = default; + + MutableNodeView(MutableNodeView&&) = default; + MutableNodeView& operator=(MutableNodeView&&) = default; + + NodeDef* node() const override; + + // Checks if a fanin exists for the node. + bool HasFanin(const MutableFanoutView& fanin) const override; + + // Checks if a fanout exists for the node. + bool HasFanout(const MutableFaninView& fanout) const override; + + private: + inline const MutableFanoutView& GetMissingFanin() const override; + + inline const std::vector& GetMissingFanout() const override; + + absl::flat_hash_map fanins_count_; + absl::flat_hash_map controlling_fanins_index_; + // Index of associated MutableNodeViewDiff in Mutation::updated_nodes_. + // If this is -1, there exists no MutableNodeViewDiff for this node. + int update_index_ = internal::kMissingIndex; + + friend class MutableFaninView; + friend class MutableFanoutView; + friend class MutableGraphView; + friend class Mutation; +}; + +class MutationNewNode { + public: + MutationNewNode() {} + + private: + explicit MutationNewNode(Mutation* mutation, int mutation_counter, int index) + : mutation_(mutation), + mutation_counter_(mutation_counter), + index_(index) {} + + Mutation* mutation_ = nullptr; + int mutation_counter_ = internal::kMissingSlot; + int index_ = internal::kMissingIndex; + + friend class Mutation; +}; + +// Mutation is a helper class that allows rewrites of MutableGraphView. This +// should not be initialized or be used directly. +// Note, if a node is renamed to another node, or a new node is created with the +// same name as an existing node, the node with the same name originally in the +// graph will be overwritten. +class Mutation { + public: + // Create a new node to be added to the graph. If the node's fanins are not + // well formed (self loops, control dependencies between regular fanins), the + // `status` will be set. + MutationNewNode AddNode(NodeDef&& node, absl::Status* status); + + // Remove an existing node in the graph. + void RemoveNode(MutableNodeView* node); + + // Update the name of an existing node. + void UpdateNodeName(MutableNodeView* node, absl::string_view name); + + // Update the name of a new node. + void UpdateNodeName(const MutationNewNode& node, absl::string_view name); + + // Update the op of an existing node. + void UpdateNodeOp(MutableNodeView* node, absl::string_view op); + + // Update the op of a new node. + void UpdateNodeOp(const MutationNewNode& node, absl::string_view op); + + // Update the device of an existing node. + void UpdateNodeDevice(MutableNodeView* node, absl::string_view device); + + // Update the device of a new node. + void UpdateNodeDevice(const MutationNewNode& node, absl::string_view device); + + // Add or replace regular fanin `fanin` at `index` for an existing node. + void AddOrUpdateRegularFanin(MutableNodeView* node, int index, + const TensorId& fanin); + + // Add or replace regular fanin `fanin` at `index` for a new node. + void AddOrUpdateRegularFanin(const MutationNewNode& node, int index, + const TensorId& fanin); + + // Remove regular fanin at `index` for an existing node. + void RemoveRegularFanin(MutableNodeView* node, int index); + + // Remove regular fanin at `index` for a new node. 
+ void RemoveRegularFanin(const MutationNewNode& node, int index); + + // Add controlling fanin `fanin_node_name` for an existing node. + void AddControllingFanin(MutableNodeView* node, + absl::string_view fanin_node_name); + + // Add controlling fanin `fanin_node_name` for a new node. + void AddControllingFanin(const MutationNewNode& node, + absl::string_view fanin_node_name); + + // Remove controlling fanin `fanin_node_name` for an existing node. + void RemoveControllingFanin(MutableNodeView* node, + absl::string_view fanin_node_name); + + // Remove controlling fanin `fanin_node_name` for a new node. + void RemoveControllingFanin(const MutationNewNode& node, + absl::string_view fanin_node_name); + + // Add or replace attribute `attr_name` with `attr_value` for an existing + // node. + void AddOrUpdateNodeAttr(MutableNodeView* node, absl::string_view attr_name, + const AttrValue& attr_value); + + // Add or replace attribute `attr_name` with `attr_value` for a new node. + void AddOrUpdateNodeAttr(const MutationNewNode& node, + absl::string_view attr_name, + const AttrValue& attr_value); + + // Remove attribute `attr_name` for an existing node. + void RemoveNodeAttr(MutableNodeView* node, absl::string_view attr_name); + + // Remove attribute `attr_name` for a new node. + void RemoveNodeAttr(const MutationNewNode& node, absl::string_view attr_name); + + // Reset and clear mutation. + void Reset(); + + // Applies the Mutation to the graph. If the mutation is valid, the graph will + // be modified. Otherwise an error status will be returned and the graph will + // not be modified. + absl::Status Apply(); + + private: + explicit Mutation(MutableGraphView* graph_view); + + void ResetInternal(); + + using MutableNodeViewDiff = internal::NodeViewDiff; + + // Adds a mutation to the `node`. Mutation function `mutate_fn` must return + // `true` if it actually does any mutations. If it returns `false` mutation + // will be ignored. + void AddMutation(MutableNodeView* node, + std::function mutate_fn); + + MutableGraphView* graph_view_ = nullptr; + int mutation_counter_ = 0; + std::vector updated_nodes_; + absl::flat_hash_set removed_nodes_; + + using MutationNewNodeHolder = internal::NewNode; + std::vector new_nodes_; + + friend class MutableGraphView; +}; + +// Mutable GraphView that holds a mutable GraphDef. This allows for lookups and +// traversals of the graph. Control dependencies will be dedupped among other +// control dependencies on initialization. Mutations should be handled using +// this API instead of directly on the GraphDef/NodeDef. +// Note, after a mutation, pointers of MutableNodeView's from MutableGraphView +// may be invalidated. +class MutableGraphView + : public internal::GraphViewInternal { + public: + explicit MutableGraphView(GraphDef* graph, absl::Status* status); + ~MutableGraphView() override = default; + + // Returns a Mutation (builder) that can be used to modify MutableGraphView. + Mutation* GetMutationBuilder(); + + // Helper class representing an extra dependency for topological sorting. 
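The Mutation builder above batches edits and applies them atomically: nothing changes until Apply(), and a failed Apply() leaves the graph untouched. A hedged sketch (not part of the header); `graph`, `consumer`, and `new_producer` are placeholder names:

// Illustrative sketch: rewire a node's first regular fanin via a Mutation.
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/utils/graph_view.h"

namespace example {

absl::Status RewireFirstInput(tensorflow::GraphDef* graph,
                              absl::string_view consumer,
                              absl::string_view new_producer) {
  absl::Status status;
  tensorflow::grappler::utils::MutableGraphView view(graph, &status);
  if (!status.ok()) return status;
  tensorflow::grappler::utils::MutableNodeView* node = view.GetNode(consumer);
  if (node == nullptr) return absl::NotFoundError("consumer not found");

  tensorflow::grappler::utils::Mutation* mutation = view.GetMutationBuilder();
  // Replace regular fanin 0 with output 0 of `new_producer`.
  mutation->AddOrUpdateRegularFanin(node, /*index=*/0,
                                    tensorflow::TensorId(new_producer, 0));
  // Apply validates the whole batch; MutableNodeView pointers may be
  // invalidated afterwards (see the MutableGraphView class comment).
  return mutation->Apply();
}

}  // namespace example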
+ class TopologicalDependency { + public: + TopologicalDependency(const MutableNodeView* from_node, + const MutableNodeView* to_node) { + if (from_node->graph_view_ == to_node->graph_view_) { + graph_view_ = from_node->graph_view_; + from_ = from_node->node_index_; + to_ = to_node->node_index_; + } + } + + private: + MutableGraphView* graph_view_ = nullptr; + int from_ = internal::kMissingIndex; + int to_ = internal::kMissingIndex; + + friend class MutableGraphView; + }; + + // Sorts graph topologically in-place. If `ignore_cycles` is set, a + // topological like sorting will be performed when there are cycles. Otherwise + // if a cycle is detected or if the graph cannot be sorted, an error will be + // returned. + absl::Status SortTopologically( + bool ignore_cycles, + absl::Span extra_dependencies); + + private: + bool AddUniqueNodeInternal(NodeDef* node); + + absl::Status CheckFaninsInternal(std::vector>* fanins); + + void AddFaninsInternal(std::vector>* fanins); + + // RenamedOrOverwrittenNode holds a index to Mutation::updated_nodes_ for a + // renamed node, alongside a potential overwritten node index in the actual + // graph. If the renamed node is not overwriting any existing nodes, + // `overwritten_node_index_` will be set to `internal::kMissingIndex`. + class RenamedOrOverwrittenNode { + public: + RenamedOrOverwrittenNode(int renamed_update_index, + int overwritten_node_index) + : renamed_update_index_(renamed_update_index), + overwritten_node_index_(overwritten_node_index) {} + + private: + int renamed_update_index_; + int overwritten_node_index_; + + friend class MutableGraphView; + }; + + absl::Status GetNodeNamesAndPartitionUpdatedNodes( + absl::flat_hash_map* node_names, + std::vector* renamed_nodes, + std::vector* inplace_nodes, + std::vector* empty_diff_node_indices); + + absl::Status RemovedOrMissingNodeFanoutsWellFormed( + const absl::flat_hash_map& node_names, + const std::vector& renamed_nodes); + + absl::Status CheckNodeNamesAndFanins( + const absl::flat_hash_map& node_names, + const std::vector& renamed_nodes, + const std::vector& inplace_nodes); + + absl::Status CheckKernelRegisteredForNodes(); + + // Helper class to move fanouts around. 
+ class NodeViewFanouts { + public: + NodeViewFanouts( + std::vector>&& regular_fanouts_by_port, + int num_regular_fanouts, + std::vector controlled_fanouts) + : regular_fanouts_by_port_(std::move(regular_fanouts_by_port)), + num_regular_fanouts_(num_regular_fanouts), + controlled_fanouts_(std::move(controlled_fanouts)) {} + + private: + std::vector> regular_fanouts_by_port_; + int num_regular_fanouts_ = 0; + std::vector controlled_fanouts_; + + friend class MutableGraphView; + }; + + template + void ReplaceNodeFanouts(MutableNodeView* node, T* fanouts); + + void FixRenamedNodes( + std::vector* renamed_nodes, + absl::flat_hash_map* renamed_fanouts, + std::vector* overwritten_name_removed_nodes); + + void AddNewNodes( + absl::flat_hash_map* renamed_fanouts, + std::vector* new_node_indices); + + void FixRenamedFanouts( + const absl::flat_hash_map& renamed_fanouts); + + inline void RemoveRegularFaninFanoutInternal(MutableNodeView* node_view, + int i); + + inline void AddRegularFaninInternal(MutableNodeView* node_view, + const SafeTensorId& fanin_id); + + inline void UpdateRegularFaninInternal(MutableNodeView* node_view, + const int i, + const SafeTensorId& fanin_id); + + inline void RemoveControllingFaninFanoutInternal(MutableNodeView* node_view, + int i); + + inline void RemoveControllingFaninInternal( + MutableNodeView* node_view, const std::set& indices_to_remove); + + inline void AddControllingFaninInternal(MutableNodeView* node_view, + absl::string_view fanin_node_name); + + void ApplyNodeUpdates(); + + void SetNewNodesFanins(const std::vector& new_node_indices); + + inline void RemoveAllFaninFanoutInternal(MutableNodeView* node_view); + + void RemoveNodesInternal( + const std::vector& renamed_nodes, + const std::vector& overwritten_name_removed_nodes); + + inline absl::Status ValidateInternal( + absl::flat_hash_map* node_names, + std::vector* renamed_nodes, + std::vector* inplace_nodes, + std::vector* empty_diff_node_indices); + + absl::Status ApplyMutationInternal(); + + Mutation mutation_; + + friend class MutableNodeView; + friend class Mutation; +}; + +} // namespace utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view_internal.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view_internal.h new file mode 100644 index 00000000..d66b1ca0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/graph_view_internal.h @@ -0,0 +1,920 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_INTERNAL_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_INTERNAL_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/hash/hash.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/map_util.h" + +namespace tensorflow { +namespace grappler { +namespace utils { +namespace internal { + +constexpr int kMissingSlot = -2; +constexpr int kMissingIndex = -1; +constexpr int kNodeNamePresent = -1; + +// NodeIndexAndPortIndex is a helper class that represents fanins and fanouts +// of a node. +template +class NodeIndexAndPortIndex { + public: + NodeIndexAndPortIndex() + : graph_view_(nullptr), + node_index_(kMissingIndex), + port_index_(kMissingSlot) {} + NodeIndexAndPortIndex(GraphViewT* graph_view, int node_index, int port_index) + : graph_view_(graph_view), + node_index_(node_index), + port_index_(port_index) {} + + bool operator==(const NodeIndexAndPortIndex& other) const { + return port_index_ == other.port_index_ && + node_index_ == other.node_index_ && graph_view_ == other.graph_view_; + } + + template + friend Hash AbslHashValue(Hash h, const NodeIndexAndPortIndex& n) { + return Hash::combine(std::move(h), n.node_index_, n.port_index_); + } + + // Returns NodeView from `graph_view_` at `node_index_`. + NodeViewT* node_view() const { + if (graph_view_ == nullptr) { + return nullptr; + } + return graph_view_->GetNode(node_index_); + } + + // Returns node index in graph. + int node_index() const { return node_index_; } + + // Returns input/output port index. + int index() const { return port_index_; } + + protected: + GraphViewT* graph_view_; + int node_index_; + int port_index_; +}; + +// NodeDefAndPortIndex is a helper class that represents fanins hashed with +// pointer stability using the fanin's NodeDef. +class NodeDefAndPortIndex { + public: + NodeDefAndPortIndex(const NodeDef* node_def, int port_index) + : node_def_(node_def), port_index_(port_index) {} + + bool operator==(const NodeDefAndPortIndex& other) const { + return node_def_ == other.node_def_ && port_index_ == other.port_index_; + } + + template + friend Hash AbslHashValue(Hash h, const NodeDefAndPortIndex& n) { + return Hash::combine(std::move(h), n.node_def_, n.port_index_); + } + + private: + const NodeDef* node_def_; + int port_index_; +}; + +// NodeViewInternal is a helper class to simplify graph traversal. It creates +// a view of a node and associated fanins and fanouts from the NodeDef +// protocol buffer. +// +// There are two public classes implementing NodeViewInternal: +// +// - NodeView: constructed from `const NodeDef` and doesn't allow mutating the +// underlying node. +// - MutableNodeView: constructed from `NodeDef` and allows mutating the +// underlying node. +// +// --------------------------- !!! WARNING !!! --------------------------------- +// Modifying the node outside of implementations of NodeViewInternal +// (i.e. modifying inputs of the NodeDef directly) may leave the NodeView +// in an inconsistent/invalid state. 
+// ----------------------------------------------------------------------------- +// +template +class NodeViewInternal { + private: + using NodeDefT = + typename std::conditional::type; + + public: + explicit NodeViewInternal(GraphViewT* graph_view, int node_index) + : graph_view_(graph_view), + node_index_(node_index), + attrs_(AttrSlice(graph_view->graph()->node(node_index))) {} + + NodeViewInternal() + : graph_view_(nullptr), node_index_(kMissingIndex), attrs_(AttrSlice()) {} + + virtual ~NodeViewInternal() {} + + NodeViewInternal(NodeViewInternal&&) = default; + NodeViewInternal& operator=(NodeViewInternal&&) = default; + + bool operator==(const NodeViewInternal& other) const { + return node_index_ == other.node_index_ && graph_view_ == other.graph_view_; + } + + template + friend Hash AbslHashValue(Hash h, const NodeViewInternal& n) { + return Hash::combine(std::move(h), n.node_index_); + } + + // Returns NodeDef of view. + virtual NodeDefT* node() const = 0; + + // Returns index of node in GraphDef/GraphView. + int node_index() const { return node_index_; } + + // Returns the name of the node. + const string& GetName() const { return node()->name(); } + + // Returns the op of the node. + const string& GetOp() const { return node()->op(); } + + // Returns the device set for the node. + const string& GetDevice() const { return node()->device(); } + + // Returns all regular fanins, based on ordering in the node. + const std::vector& GetRegularFanins() const { + return regular_fanins_; + } + + // Returns a regular fanin based on input index. If no such fanin exist, a + // missing fanin is returned, with no NodeView set and an index of -2. + const FanoutViewT& GetRegularFanin(int i) const { + int regular_fanins_size = regular_fanins_.size(); + if (i < 0 || i >= regular_fanins_size) { + return GetMissingFanin(); + } + return regular_fanins_[i]; + } + + // Returns all controlling fanins, based on ordering in the node. + const std::vector& GetControllingFanins() const { + return controlling_fanins_; + } + + // Returns all regular fanouts. + const std::vector>& GetRegularFanouts() const { + return regular_fanouts_by_port_; + } + + // Returns a regular fanout(s) based on output index. If no such output index + // exists, no fanouts will be returned. + const std::vector& GetRegularFanout(int i) const { + int regular_fanouts_by_port_size = regular_fanouts_by_port_.size(); + if (i < 0 || i >= regular_fanouts_by_port_size) { + return GetMissingFanout(); + } + return regular_fanouts_by_port_[i]; + } + + // Returns all controlled fanouts. + const std::vector& GetControlledFanouts() const { + return controlled_fanouts_; + } + + // Returns the number of regular fanins. + int NumRegularFanins() const { return regular_fanins_.size(); } + + // Returns the number of controlling fanins. + int NumControllingFanins() const { return controlling_fanins_.size(); } + + // Returns the number of regular fanouts. + int NumRegularFanouts() const { return num_regular_fanouts_; } + + // Returns the number of controlled fanouts. + int NumControlledFanouts() const { return controlled_fanouts_.size(); } + + // Checks if a fanin exists for the node. + virtual bool HasFanin(const FanoutViewT& fanin) const = 0; + + // Checks if a fanout exists for the node. + virtual bool HasFanout(const FaninViewT& fanout) const = 0; + + // Returns an attribute of the node by key. If no attribute for such key + // exists, a `nullptr` is returned. 
+ const AttrValue* GetAttr(absl::string_view attr_name) const { + return attrs_.Find(attr_name); + } + + // Returns all attributes of the node. + const AttrSlice& GetAttrs() const { return attrs_; } + + // Returns the number of attributes in the node. + int NumAttrs() const { return attrs_.size(); } + + // Checks if an attribute exist in the node. + bool HasAttr(absl::string_view attr_name) const { + return attrs_.Find(attr_name) != nullptr; + } + + protected: + virtual inline const FanoutViewT& GetMissingFanin() const = 0; + virtual inline const std::vector& GetMissingFanout() const = 0; + + std::vector regular_fanins_; + std::vector controlling_fanins_; + std::vector> regular_fanouts_by_port_; + int num_regular_fanouts_ = 0; + std::vector controlled_fanouts_; + + GraphViewT* graph_view_; + int node_index_; + AttrSlice attrs_; +}; + +// GraphViewInternal is a helper class to simplify graph traversal. It creates +// a view of the nodes and associated fanins and fanouts from the GraphDef +// protocol buffer. +// +// There are two public classes implementing GraphViewInternal: +// +// - GraphView: constructed from `const GraphDef` and doesn't allow mutating +// the underlying graph and its nodes. +// - MutableGraphView: constructed from `GraphDef` and allows mutating the +// underlying graph and its nodes. +// +// --------------------------- !!! WARNING !!! --------------------------------- +// Modifying the graph outside of implementations of GraphViewInternal +// (i.e. removing nodes from the GraphDef directly) may lead to +// segfaults! Guaranteed by absl::string_view! +// ----------------------------------------------------------------------------- +// +template +class GraphViewInternal { + private: + using GraphDefT = + typename std::conditional::type; + + public: + explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {} + virtual ~GraphViewInternal() {} + + bool operator==(const GraphViewInternal& other) const { + return graph_ == other.graph_; + } + + GraphDefT* graph() const { return graph_; } + + // Finds node by index in the graph. If no such node exists in the graph, a + // `nullptr` is returned. + const NodeViewT* GetNode(int node_index) const { + int nodes_size = nodes_.size(); + if (node_index < 0 || node_index >= nodes_size) { + return nullptr; + } + return &nodes_[node_index]; + } + + NodeViewT* GetNode(int node_index) { + int nodes_size = nodes_.size(); + if (node_index < 0 || node_index >= nodes_size) { + return nullptr; + } + return &nodes_[node_index]; + } + + // Finds node by name. If no such node exists in the graph, a `nullptr` is + // returned. + const NodeViewT* GetNode(absl::string_view node_name) const { + auto it = node_index_by_name_.find(node_name); + if (it == node_index_by_name_.end()) { + return nullptr; + } + return &nodes_[it->second]; + } + + NodeViewT* GetNode(absl::string_view node_name) { + auto it = node_index_by_name_.find(node_name); + if (it == node_index_by_name_.end()) { + return nullptr; + } + return &nodes_[it->second]; + } + + // Returns all nodes (as NodeView) in the graph. + const std::vector& GetNodes() const { return nodes_; } + + // Checks if a node by name exists in the graph. + bool HasNode(absl::string_view node_name) const { + return node_index_by_name_.contains(node_name); + } + + // Returns the number of nodes in the graph. + int NumNodes() const { return nodes_.size(); } + + protected: + // Reset allocated node vector and node map in case of failure. 
+ void Reset() { + std::vector().swap(nodes_); + absl::flat_hash_map().swap(node_index_by_name_); + } + + // nodes_[i] is a view of graph_.{mutable_}node(i). + std::vector nodes_; + absl::flat_hash_map node_index_by_name_; + GraphDefT* graph_; + const FanoutViewT missing_fanin_; + const std::vector missing_fanout_; +}; + +inline SafeTensorId EmptyTensorId() { + return SafeTensorId("", internal::kMissingSlot); +} + +inline bool IsEmptyTensorId(const TensorId tensor_id) { + return tensor_id.node().empty() && + tensor_id.index() == internal::kMissingSlot; +} + +// NodeViewDiff is a helper struct holding changes to be made to an existing +// node in GraphViewT. This should not be initialized or be used directly. +template +struct NodeViewDiff { + explicit NodeViewDiff(GraphViewT* graph_view, int node_index) + : graph_view(graph_view), node_index(node_index) {} + + GraphViewT* graph_view; + int node_index; + string name; + bool update_name = false; + string op; + bool update_op = false; + string device; + bool update_device = false; + // Fanins to append after existing regular fanins. + std::vector regular_inputs_to_add; + // Number of fanins to be appended. This is used for a quick comparison with + // `regular_inputs_to_add` for if there will be any missing inputs in the + // updated node. + int num_regular_inputs_to_add = 0; + // Fanins to update inplace. + std::map regular_inputs_to_update; + // Fanins from end of regular fanins to remove. This keeps track of existing + // regular fanins in the original node to remove. + std::vector regular_inputs_to_remove; + // Number of fanins marked for removal. This is used for a quick comparison + // with `regular_inputs_to_remove` for if there will be any missing inputs + // in the updated node. + int num_regular_inputs_to_remove = 0; + absl::flat_hash_set controlling_inputs_to_add; + std::set controlling_inputs_to_remove; + absl::flat_hash_map attrs_to_add; + absl::flat_hash_set attrs_to_remove; + // AttrValueMap constructor and destructor are very expensive, we will + // initialize it lazily only if needed. + absl::optional processed_attrs; +}; + +// Updates node name. If `name` is the same as the name in the original node, +// the field will be cleared in the diff. +template +inline bool UpdateName(NodeViewDiff* diff, absl::string_view name) { + if (diff->graph_view->GetNode(diff->node_index)->GetName() == name) { + diff->name.clear(); + diff->update_name = false; + } else { + diff->name = string(name); + diff->update_name = true; + } + return true; +} + +// Updates node op. If `op` is the same as the op in the original node, the +// field will be cleared in the diff. +template +inline bool UpdateOp(NodeViewDiff* diff, absl::string_view op) { + if (diff->graph_view->GetNode(diff->node_index)->GetOp() == op) { + diff->op.clear(); + diff->update_op = false; + } else { + diff->op = string(op); + diff->update_op = true; + } + return true; +} + +// Updates node device. If `device` is the same as the device in the original +// node, the field will be cleared in the diff. +template +inline bool UpdateDevice(NodeViewDiff* diff, + absl::string_view device) { + if (diff->graph_view->GetNode(diff->node_index)->GetDevice() == device) { + diff->device.clear(); + diff->update_device = false; + } else { + diff->device = string(device); + diff->update_device = true; + } + return true; +} + +// Adds or updates value in vector `v` at index `i`. This will also resize the +// vector if index `i` is out of bounds, padding the vector with +// `default_value`. 
Returns true if a new value was appended or if an update +// occurred where an existing value was changed from `default_value`. +template +inline bool AddOrUpdateAtIndex(std::vector* v, int i, const U& value, + const T& default_value) { + int v_size = v->size(); + if (i > v_size) { + // Resize to include `value`, filling the newly introduced gap with + // `default_value` for later checks of validity (gaps in vector). + v->reserve(i + 1); + v->resize(i, default_value); + v->push_back({value}); + } else if (i == v_size) { + // Vector is large enough, simply append `value` to the end. + v->push_back({value}); + } else { + // Update existing value. + bool updated = (*v)[i] == default_value; + (*v)[i] = {value}; + return updated; + } + return true; +} + +// Checks if a node with name `node_name` will exist in the final mutated graph. +template +inline bool CheckNodeNameExists( + absl::string_view node_name, + const absl::flat_hash_map& updated_node_names, + const GraphViewT* graph_view) { + auto it = updated_node_names.find(node_name); + if (it != updated_node_names.end()) { + return it->second == kNodeNamePresent; + } + return graph_view->HasNode(node_name); +} + +// Adds or updates regular fanin at `index` of regular fanins. If `index` is +// less than the number of regular fanins in the original node, the fanin at +// `index` in the original node will be updated with `fanin` if the fanin +// differs. If `index` is greater than or equal to the number of regular fanins, +// `fanin` will be added beyond the end of regular fanins at `index`. +template +inline bool AddOrUpdateRegularFanin(NodeViewDiff* diff, int index, + const TensorId& fanin) { + if (index < 0) { + // Not a valid index for regular fanins. + return false; + } + auto* node_view = diff->graph_view->GetNode(diff->node_index); + const int num_regular_fanins = node_view->NumRegularFanins(); + if (index < num_regular_fanins) { // Updating existing fanins. + // Calculate (relative) index from end of regular fanins, from absolute + // index from beginning of regular fanins. + const int relative_removal_index = num_regular_fanins - index - 1; + // Check if at relative index fanin was already marked for removal. + int diff_regular_inputs_to_remove_size = + diff->regular_inputs_to_remove.size(); + if (relative_removal_index < diff_regular_inputs_to_remove_size && + diff->regular_inputs_to_remove[relative_removal_index]) { + // Unmark fanin for removal. + diff->regular_inputs_to_remove[relative_removal_index] = false; + --diff->num_regular_inputs_to_remove; + } + const auto& existing_fanin = node_view->GetRegularFanin(index); + if (existing_fanin.index() != fanin.index() || + existing_fanin.node_view()->GetName() != fanin.node()) { + // Update fanin if it is different from original fanin in node. + gtl::InsertOrUpdate(&diff->regular_inputs_to_update, index, + SafeTensorId(fanin)); + } + } else { + // Add fanin beyond current fanin range. + const int relative_add_index = index - num_regular_fanins; + if (AddOrUpdateAtIndex(&diff->regular_inputs_to_add, relative_add_index, + fanin, EmptyTensorId())) { + // New fanin was added. + ++diff->num_regular_inputs_to_add; + } + } + return true; +} + +// Remove regular fanin at `index` of regular fanins. This can remove existing +// fanins and updated/added fanins via AddOrUpdateRegularFanins. +template +inline bool RemoveRegularFanin(NodeViewDiff* diff, int index) { + if (index < 0) { + // Not a valid index for regular fanins. 
+ return false; + } + auto* node_view = diff->graph_view->GetNode(diff->node_index); + const int num_regular_fanins = node_view->NumRegularFanins(); + if (index < num_regular_fanins) { // Removing existing fanins. + // Remove updated fanin if it exists. + diff->regular_inputs_to_update.erase(index); + // Calculate (relative) index from end of regular fanins, from absolute + // index from beginning of regular fanins. + const int relative_removal_index = num_regular_fanins - index - 1; + if (AddOrUpdateAtIndex(&diff->regular_inputs_to_remove, + relative_removal_index, + /*value=*/true, /*default_value=*/false)) { + ++diff->num_regular_inputs_to_remove; + } + } else { + // Relative index from end of regular fanins. + const int relative_add_index = index - num_regular_fanins; + int diff_regular_inputs_to_add_size = diff->regular_inputs_to_add.size(); + if (relative_add_index >= diff_regular_inputs_to_add_size || + IsEmptyTensorId(diff->regular_inputs_to_add[relative_add_index])) { + // At relative index, appended regular fanin was already marked for + // removal. + return false; + } + // Remove added fanin. + diff->regular_inputs_to_add[relative_add_index] = EmptyTensorId(); + --diff->num_regular_inputs_to_add; + } + return true; +} + +// Adds controlling fanin. If the controlling fanin already exists in the +// original node, it will be dedupped. If the controlling fanin is marked for +// removal, this will reverse it. +template +inline bool AddControllingFanin(NodeViewDiff* diff, + int control_index, + absl::string_view fanin_node_name) { + if (control_index == kMissingIndex) { + diff->controlling_inputs_to_add.emplace(fanin_node_name); + } else { + diff->controlling_inputs_to_remove.erase(control_index); + } + return true; +} + +// Remove controlling fanin. If the controlling fanin does not exist in the +// original node and diff, nothing will happen. If the controlling fanin exists +// in the diff, it will be removed. Otherwise the controlling fanin will be +// marked for removal from the original node. +template +inline bool RemoveControllingFanin(NodeViewDiff* diff, + int control_index, + absl::string_view fanin_node_name) { + if (control_index == kMissingIndex) { + diff->controlling_inputs_to_add.erase(fanin_node_name); + } else { + diff->controlling_inputs_to_remove.emplace(control_index); + } + return true; +} + +// Adds or updates an attribute by name. If an attribute exist in the original +// node or diff (including those marked for removal), this will overwrite it. +template +inline bool AddOrUpdateAttribute(NodeViewDiff* diff, + absl::string_view attr_name, + const AttrValue& attr_value) { + diff->attrs_to_add.empty() ? 0 : diff->attrs_to_remove.erase(attr_name); + gtl::InsertOrUpdate(&diff->attrs_to_add, string(attr_name), attr_value); + return true; +} + +// Removes an attribute by name. If an attribute exist in the original node or +// diff, this will remove it. +template +inline bool RemoveAttribute(NodeViewDiff* diff, + absl::string_view attr_name) { + const size_t num_erased = + diff->attrs_to_add.empty() ? 0 : diff->attrs_to_add.erase(attr_name); + auto* node_view = diff->graph_view->GetNode(diff->node_index); + if (node_view->HasAttr(attr_name)) { + diff->attrs_to_remove.emplace(attr_name); + return true; + } + return num_erased > 0; +} + +// Removes trailing values in vector `v` for values equal to `value`. 
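Editorial aside, not part of the vendored header: the removal bookkeeping above records removed regular fanins by their index relative to the end of the fanin list, so a node stays well formed only when removals form a trailing suffix (no gaps among the remaining inputs). A small standalone sketch of that arithmetic, assuming a node with four regular inputs:

#include <cassert>
#include <initializer_list>
#include <vector>

// Standalone illustration (not the vendored code) of the mapping used by
// RemoveRegularFanin above: absolute input index -> index relative to the end.
int RelativeRemovalIndex(int num_regular_fanins, int absolute_index) {
  return num_regular_fanins - absolute_index - 1;
}

int main() {
  const int num_regular_fanins = 4;  // inputs 0..3
  std::vector<bool> to_remove;       // indexed from the end of the fanin list

  // Removing the last two inputs (indices 3 and 2) marks relative slots 0 and 1.
  for (int absolute : {3, 2}) {
    int rel = RelativeRemovalIndex(num_regular_fanins, absolute);
    if (rel >= static_cast<int>(to_remove.size())) to_remove.resize(rel + 1, false);
    to_remove[rel] = true;
  }
  assert(to_remove == std::vector<bool>({true, true}));  // a trailing suffix

  // Removing input 0 instead would leave inputs 1..3 dangling behind a gap,
  // which is exactly the situation the diff's IsWellFormed check rejects.
  assert(RelativeRemovalIndex(num_regular_fanins, 0) == 3);
  return 0;
}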
+template +inline void ResizeByTrimmingEndForValue(std::vector* v, const T& value) { + int curr_index = v->size(); + const int last_index = v->size() - 1; + for (int i = last_index; i >= 0; --i) { + if ((*v)[i] == value) { + curr_index = i; + } else { + break; + } + } + if (curr_index <= last_index) { + v->resize(curr_index); + } +} + +// Checks if any changes are set in the diff. +template +inline bool IsEmpty(NodeViewDiff* diff) { + ResizeByTrimmingEndForValue(&diff->regular_inputs_to_remove, false); + ResizeByTrimmingEndForValue(&diff->regular_inputs_to_add, EmptyTensorId()); + return !diff->update_name && !diff->update_op && !diff->update_device && + diff->regular_inputs_to_add.empty() && + diff->regular_inputs_to_update.empty() && + diff->regular_inputs_to_remove.empty() && + diff->controlling_inputs_to_add.empty() && + diff->controlling_inputs_to_remove.empty() && + diff->attrs_to_add.empty() && diff->attrs_to_remove.empty(); +} + +// Resets and clears existing diff. +template +inline void Reset(NodeViewDiff* diff) { + diff->name.clear(); + diff->update_name = false; + diff->op.clear(); + diff->update_op = false; + diff->device.clear(); + diff->update_device = false; + std::vector().swap(diff->regular_inputs_to_add); + diff->num_regular_inputs_to_add = false; + std::map().swap(diff->regular_inputs_to_update); + std::vector().swap(diff->regular_inputs_to_remove); + diff->num_regular_inputs_to_remove = 0; + absl::flat_hash_set().swap(diff->controlling_inputs_to_add); + std::set().swap(diff->controlling_inputs_to_remove); + absl::flat_hash_map().swap(diff->attrs_to_add); + absl::flat_hash_set().swap(diff->attrs_to_remove); +} + +// Checks if changes to node will result in a valid node. +template +inline bool IsWellFormed( + NodeViewDiff* diff, + const absl::flat_hash_map& updated_node_names) { + ResizeByTrimmingEndForValue(&diff->regular_inputs_to_remove, false); + ResizeByTrimmingEndForValue(&diff->regular_inputs_to_add, EmptyTensorId()); + int diff_regular_inputs_to_add_size = diff->regular_inputs_to_add.size(); + if (diff_regular_inputs_to_add_size != diff->num_regular_inputs_to_add) { + // Missing regular fanins in between appended fanins. + return false; + } else if (diff->num_regular_inputs_to_add > 0 && + !diff->regular_inputs_to_remove.empty()) { + // Appending new fanins while removing existing fanins, resulting in missing + // regular fanins in between. + return false; + } else if (static_cast(diff->regular_inputs_to_remove.size()) != + diff->num_regular_inputs_to_remove) { + // Regular fanins exist in between removed fanins. + return false; + } + auto* node_view = diff->graph_view->GetNode(diff->node_index); + const string& node_name = + diff->update_name ? diff->name : node_view->GetName(); + auto invalid_node_name = [&](absl::string_view fanin_node_name) -> bool { + return fanin_node_name == node_name || + !CheckNodeNameExists(fanin_node_name, updated_node_names, + diff->graph_view); + }; + + // Check if nodes of all updated and new fanins exist (from name) and if such + // fanins do not introduce self loops. Note, this will not check for if + // unmodified fanins exist. + if (diff->update_name) { + // If name of node was changed in node, check all fanins. Updated fanins are + // checked for existence and self loops. Unmodified fanins are checked for + // self loops. + // `regular_inputs_to_update`, `controlling_inputs_to_remove` are sorted, + // so iterators from these maps/sets can be incremented alongside iteration + // and be used for comparisons. 
+ const int last_index = + node_view->NumRegularFanins() - diff->num_regular_inputs_to_remove - 1; + auto regular_to_update_it = diff->regular_inputs_to_update.begin(); + for (int i = 0; i <= last_index; ++i) { + if (regular_to_update_it != diff->regular_inputs_to_update.end() && + regular_to_update_it->first < i) { + ++regular_to_update_it; + } + if (regular_to_update_it != diff->regular_inputs_to_update.end() && + regular_to_update_it->first == i) { + if (invalid_node_name(regular_to_update_it->second.node())) { + return false; + } + } else { + const string& regular_name = + node_view->GetRegularFanin(i).node_view()->GetName(); + if (regular_name == node_name) { + return false; + } + } + } + + auto& controls = node_view->GetControllingFanins(); + const int num_controls = controls.size(); + auto control_to_remove_it = diff->controlling_inputs_to_remove.begin(); + for (int i = 0; i < num_controls; ++i) { + if (control_to_remove_it != diff->controlling_inputs_to_remove.end() && + *control_to_remove_it < i) { + ++control_to_remove_it; + } + if (control_to_remove_it != diff->controlling_inputs_to_remove.end() && + *control_to_remove_it == i) { + // Control dependency marked for removal, can be ignored. + continue; + } else if (controls[i].node_view()->GetName() == node_name) { + return false; + } + } + } else { + // Name of node was not changed, check only updated fanins under the + // assumption prior fanins were valid. + for (const auto& updated : diff->regular_inputs_to_update) { + const string& fanin_name = updated.second.node(); + if (invalid_node_name(fanin_name)) { + return false; + } + } + } + // Check appended regular fanins. + for (const auto& regular : diff->regular_inputs_to_add) { + if (invalid_node_name(regular.node())) { + return false; + } + } + // Check new controlling fanins. + for (const auto& control : diff->controlling_inputs_to_add) { + if (invalid_node_name(control)) { + return false; + } + } + + return true; +} + +// NewNode is a helper struct holding a new node to be added to a GraphViewT. +// This should not be initialized or be used directly. +template +struct NewNode { + explicit NewNode(GraphViewT* graph_view, NodeDef&& node) + : graph_view(graph_view), node(std::move(node)) {} + + GraphViewT* graph_view; + NodeDef node; + std::vector regular_fanins; + int num_regular_fanins = 0; + absl::flat_hash_set controlling_fanins; +}; + +// Updates new node name. +template +inline void UpdateName(NewNode* new_node, absl::string_view name) { + if (name.empty()) { + new_node->node.clear_name(); + } else { + new_node->node.set_name(string(name)); + } +} + +// Updates new node op. +template +inline void UpdateOp(NewNode* new_node, absl::string_view op) { + if (op.empty()) { + new_node->node.clear_op(); + } else { + new_node->node.set_op(string(op)); + } +} + +// Updates new node device. +template +inline void UpdateDevice(NewNode* new_node, + absl::string_view device) { + if (device.empty()) { + new_node->node.clear_device(); + } else { + new_node->node.set_device(string(device)); + } +} + +// Adds or updates regular fanin at `index` of regular fanins in the new node. +// If another fanin already exists at `index`, it will be replaced with `fanin`. +template +inline void AddOrUpdateRegularFanin(NewNode* new_node, int index, + const TensorId& fanin) { + if (index < 0) { + // Not a valid index for regular fanins. 
+ return; + } else if (AddOrUpdateAtIndex(&new_node->regular_fanins, index, fanin, + EmptyTensorId())) { + ++new_node->num_regular_fanins; + } +} + +// Remove regular fanin at `index` of regular fanins in the new node. This can +// remove existing fanins and updated/added fanins via AddOrUpdateRegularFanins. +template +inline void RemoveRegularFanin(NewNode* new_node, int index) { + int new_node_regular_fanins_size = new_node->regular_fanins.size(); + if (index < 0 || index >= new_node_regular_fanins_size || + IsEmptyTensorId(new_node->regular_fanins[index])) { + return; + } + new_node->regular_fanins[index] = EmptyTensorId(); + --new_node->num_regular_fanins; +} + +// Adds controlling fanin to new node. +template +inline void AddControllingFanin(NewNode* new_node, + absl::string_view fanin_node_name) { + new_node->controlling_fanins.emplace(fanin_node_name); +} + +// Removes controlling fanin to new node. +template +inline void RemoveControllingFanin(NewNode* new_node, + absl::string_view fanin_node_name) { + new_node->controlling_fanins.erase(fanin_node_name); +} + +// Adds or updates an attribute by name to a new node. +template +inline void AddOrUpdateAttribute(NewNode* new_node, + absl::string_view attr_name, + const AttrValue& attr_value) { + gtl::InsertOrUpdate(new_node->node.mutable_attr(), string(attr_name), + attr_value); +} + +// Removes an attribute by name to a new node. +template +inline void RemoveAttribute(NewNode* new_node, + absl::string_view attr_name) { + new_node->node.mutable_attr()->erase(string(attr_name)); +} + +// Checks if current state of new node is a valid node. +template +inline bool IsWellFormed( + NewNode* new_node, + const absl::flat_hash_map& updated_node_names) { + ResizeByTrimmingEndForValue(&new_node->regular_fanins, EmptyTensorId()); + int new_node_regular_fanins_size = new_node->regular_fanins.size(); + if (new_node_regular_fanins_size != new_node->num_regular_fanins) { + return false; + } + + const string& node_name = new_node->node.name(); + auto invalid_node_name = [new_node, updated_node_names, + node_name](absl::string_view fanin_node_name) { + return fanin_node_name == node_name || + !CheckNodeNameExists(fanin_node_name, updated_node_names, + new_node->graph_view); + }; + // Check if nodes of all fanins exist (from name) and if fanins do not + // introduce self loops. + for (const auto& regular : new_node->regular_fanins) { + if (invalid_node_name(regular.node())) { + return false; + } + } + for (const auto& control : new_node->controlling_fanins) { + if (invalid_node_name(control)) { + return false; + } + } + + return true; +} + +} // namespace internal +} // namespace utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPH_VIEW_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/grappler_test.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/grappler_test.h new file mode 100644 index 00000000..967cff28 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/grappler_test.h @@ -0,0 +1,128 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace grappler { + +class GrapplerTest : public ::testing::Test { + public: + GrapplerTest(); + + protected: + void DisableAllOptimizers(); + void EnableAllOptimizers(); + + std::vector EvaluateNodes( + const GraphDef& graph, const std::vector& node_names) const; + + std::vector EvaluateNodes( + const GraphDef& graph, const std::vector& node_names, + const std::vector>& inputs) const; + + std::vector EvaluateFetchNodes(const GrapplerItem& item) const; + + NodeDef* AddNode(const string& name, const string& op, + const std::vector& inputs, + const std::vector>& attributes, + GraphDef* graph) const; + + void DisableAllOptimizers(RewriterConfig* cfg); + + // Checks if two graphs are equal. Both graphs must have the same set of nodes + // with the same inputs and attributes. Nodes can be in different order. + // + // NOTE: This function uses EXPECT/ASSERT macros to check node properties + // equality, and adds all failures to the current test. + void CompareGraphs(GraphDef want, GraphDef got) const; + + // Checks if two nodes have the same name, op, inputs and attributes. + // + // NOTE: This function uses EXPECT/ASSERT macros to check node properties + // equality, and adds all failures to the current test. + void CompareNodes(const NodeDef& want, const NodeDef& got) const; + + // Checks if two functions are equal. Both functions must have the same set of + // nodes with the same inputs and attributes. Nodes can be in different order. + // + // NOTE: This function uses EXPECT/ASSERT macros to check node properties + // equality, and adds all failures to the current test. + void CompareFunctions(FunctionDef want, FunctionDef got) const; + + // Checks if node 'src' is directly connected to the input($position) of + // 'dst'. + bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src, + const string& dst, int position = 0); + + // Counts nodes of the given op-type in a graph. + int CountOpNodes(const GraphDef& graph, const string& op); + + // Get a random tensor with given shape. + template + Tensor GenerateRandomTensor(const TensorShape& shape) const { + typedef typename EnumToDataType::Type T; + Tensor tensor(DTYPE, shape); + for (auto i = 0; i < tensor.NumElements(); i++) + tensor.flat()(i) = i + random::New64() % 10; + return tensor; + } + + // Creates a random tensor with given shape using `setRandom`. 
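Editorial aside, not part of the vendored header: a hypothetical sketch of how the GrapplerTest fixture above is meant to be used, assuming the TensorFlow test targets from this diff are linked in. The fixture name, node names, and the omitted Const attributes (dtype/value) are made up for illustration and the graphs are never evaluated here.

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/utils/grappler_test.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
namespace grappler {

class ExampleGrapplerTest : public GrapplerTest {};

TEST_F(ExampleGrapplerTest, CompareGraphsIgnoresNodeOrder) {
  GraphDef want;
  // AddNode(name, op, inputs, attributes, graph) builds nodes directly into
  // the GraphDef; real Const nodes would also need dtype/value attributes,
  // which are skipped because nothing is evaluated in this sketch.
  AddNode("a", "Const", {}, {}, &want);
  AddNode("b", "Const", {}, {}, &want);
  AddNode("add", "AddV2", {"a", "b"}, {}, &want);

  GraphDef got = want;  // In a real test, `got` would be an optimizer's output.
  CompareGraphs(want, got);
  EXPECT_EQ(CountOpNodes(got, "AddV2"), 1);
}

}  // namespace grappler
}  // namespace tensorflow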
+ template + Tensor GenerateTensorWithSetRandom(const TensorShape& shape) const { + typedef typename EnumToDataType::Type T; + Tensor tensor(DTYPE, shape); + tensor.flat().setRandom(); + return tensor; + } + + // Get a constant tensor with given shape. + template + Tensor GenerateConstantTensor( + const TensorShape& shape, + typename EnumToDataType::Type value) const { + typedef typename EnumToDataType::Type T; + Tensor tensor(DTYPE, shape); + for (auto i = 0; i < tensor.NumElements(); i++) tensor.flat()(i) = value; + return tensor; + } + + inline tensorflow::Scope CreateScopeWithDevice(absl::string_view device) { + return tensorflow::Scope::NewRootScope().WithDevice(string(device)); + } + + private: + SessionOptions options_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/pattern_utils.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/pattern_utils.h new file mode 100644 index 00000000..de4eecb8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/pattern_utils.h @@ -0,0 +1,245 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_PATTERN_UTILS_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_PATTERN_UTILS_H_ + +#include "tensorflow/core/grappler/utils/graph_view.h" + +namespace tensorflow { +namespace grappler { +namespace utils { + +//------------------------------------------------------------------------------ +// A pattern can be defined by the following grammar. Here, op_type is any valid +// op name in the TensorFlow. +// +// leaf_pattern ::= `{` op_type `}` +// pattern ::= leaf_pattern | +// `{` op_type `,` `{` pattern `,` ... `,` pattern `}` `}` +// +// (1) For example, the following pattern syntax describes a pattern for +// _FusedConv2D (Conv2D + BiasAdd + Relu). Note that "*" means any type of op. +// +// {"Relu", +// { +// "BiasAdd", +// { +// {"Conv2D"}, +// {"*"} +// } +// } +// } +// +// The syntax above has a root ("Relu") and children (inputs), where each child +// is a sub-pattern. Graph pattern matcher finds a match for the given pattern +// syntax in a graph and returns a set of matched nodes. +// +// (2) In order to match a DAG with a given root, we extend pattern syntax with +// labels. For example, a frequently found pattern in Deep Learning models is a +// residual block like below. +// +// Placeholder Const +// | | +// +-----+-----+ | +// | | | +// | v v +// | Conv2D Const +// | | | +// | v v-----+ +// | BiasAdd +// | | +// v v----------+ +// AddV2 +// +// As shown above, it is the same input node (Placeholder) consumed by both +// AddV2 and and Conv2D. This constrained can be put as labels in the following +// augmented pattern syntax. 
+// +// {"AddV2", "my_add", +// { +// {"*", "my_residual_input"}, +// {"BiasAdd", "my_bias_add", +// { +// {"Conv2D", "my_conv", +// { +// {"*", "my_residual_input"}, +// {"*", "my_filter"} +// } +// }, +// {"*", my_bias"} +// } +// } +// } +// } +// +// Note that the same label "my_residual_input" is used to tell that it is a +// child of both "AddV2" and "Conv2D". Labels are arbitrary strings to associate +// with the nodes to be matched as well as to uniquely identify those nodes. +// +// (3) The motivatation for a grammar based pattern matching in grappler is to +// make easy for finding fusion pattern in the remapper. A subgraph that +// matches a given pattern, however, is not fusable if any of the matched node, +// that will be removed as a part of fusion, has a consumer outside the matched +// subgraph. In order to check for such type of external dependencies, we +// further extend pattern syntax by prospective action (NodeStatus) on the +// matched nodes as shown below. This helps cross checking the nodes to be +// removed with the nodes matched intially. +// +// {"AddV2", "my_add", NodeStatus::kReplace, +// { +// {"*", "my_residual_input", NodeStatus::kRemain}, +// {"BiasAdd", "my_bias_add", NodeStatus::kRemove, +// { +// {"Conv2D", "my_conv", NodeStatus::kRemove, +// { +// {"*", "my_residual_input", NodeStatus::kRemain}, +// {"*", "my_filter", NodeStatus::Remain} +// } +// }, +// {"*", my_bias", NodeStatus::kRemain} +// } +// } +// } +// } +//------------------------------------------------------------------------------ + +// Pattern matcher recursively matches child subpatterns. The direction +// for children could be toward node's input (fanins) or outputs (fanouts). +enum class MatchingDirection { kFollowInputs, kFollowOutputs }; + +// Action for each node in the set of matched nodes for a given pattern. +enum class NodeStatus { kRemain, kRemove, kReplace }; + +// TODO (intel-tf): Support multiple roots by making them children of a single +// virtual root. +struct OpTypePattern { + string op; + string label; + NodeStatus node_status; + std::vector children; + + string DebugString() const { + string result = "{(op: " + op + ", " + "label: " + label + "), {"; + for (const OpTypePattern& child : children) { + result += child.DebugString() + ","; + } + result += "}}"; + return result; + } +}; + +// This is a helpful recursive structure that keeps one-to-one mapping of +// pattern syntax to the matched nodes. User can call DebugString to see what +// has been matched so far and where is the failing point. +struct NodeViewMatch { + MutableNodeView* node_view = nullptr; + std::vector children; + + string DebugString() const { + string result = "{"; + if (node_view == nullptr) { + result += "Non-Matched-Node}"; + return result; + } else { + result += node_view->node()->DebugString(); + result += ", {"; + for (const NodeViewMatch& child : children) { + result += child.DebugString() + ","; + } + result += "}}"; + return result; + } + } + + void Clear() { + for (NodeViewMatch& child : children) { + child.Clear(); // child is an object. + } + children.clear(); // children is a vector. + if (node_view != nullptr) { + node_view = nullptr; + } + } +}; + +template +class SubGraphMatcher { + public: + SubGraphMatcher(MutableGraphView* graph_view) : graph_view_(graph_view){}; + + // If a given pattern is matched, this function returns true as well as the + // matched node and remove node info is populated. 
+ bool GetMatchedNodes(const OpTypePattern& pattern, + const std::unordered_set& nodes_to_preserve, + MutableNodeView* node_view, + std::map* matched_nodes_map, + std::set* remove_node_indices); + + private: + MutableGraphView* graph_view_; + std::map node_label_to_index_; + std::set matched_node_indices_; + std::set remove_node_indices_; + std::unique_ptr match_ = nullptr; + + bool DoesOpTypePatternMatch(const OpTypePattern& pattern, + MutableNodeView* node_view, NodeViewMatch* match); + + // This function should be called after the pattern matcher has found + // potential matched nodes (i.e. when DoesOpTypePatternMatch returns "true"). + // It performs a sanity check if the candidate nodes for removal in subgraph + // fusion is indeed safe to remove. + bool IsSafeNodesToRemove( + const std::unordered_set& nodes_to_preserve) { + for (const auto& node_idx : remove_node_indices_) { + auto node_view = graph_view_->GetNode(node_idx); + // Check if the node to be removed is in the nodes to be preserved. + string node_name = node_view->GetName(); + if (nodes_to_preserve.count(node_name) > 0) return false; + // Traverse all the Regular Fanouts. Fanouts are stored as vector of + // vector, std::vector>. Note that + // a MutableNodeView's fanouts are stored in a nested vector of + // MutableFaninView type. + auto fanouts_by_ports = node_view->GetRegularFanouts(); + for (const auto& fanouts : fanouts_by_ports) { + for (const auto& fanout : fanouts) { + if (!matched_node_indices_.count(fanout.node_index())) { + return false; + } + } + } + } + return true; + } +}; + +template <> +bool SubGraphMatcher::DoesOpTypePatternMatch( + const OpTypePattern& pattern, MutableNodeView* node_view, + NodeViewMatch* match); + +template <> +bool SubGraphMatcher::GetMatchedNodes( + const OpTypePattern& pattern, + const std::unordered_set& nodes_to_preserve, + MutableNodeView* node_view, std::map* matched_nodes_map, + std::set* remove_node_indices); + +} // namespace utils +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_PATTERN_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/scc.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/scc.h new file mode 100644 index 00000000..ceb9f5db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/scc.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_ + +#include +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/inputs/utils.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace grappler { + +// Computes modified strongly connected components: +// All nodes that are not part of a loop are assigned the special -1 id. +// All nodes that are part of at least one loop are assigned a positive +// component id: if 2 nodes v and w are reachable from one another (i.e. if they +// belong to the same scc), they'll be assigned the same id, otherwise they'll +// be assigned distinct ids. *num_components is set to the number of distinct +// ids. +void StronglyConnectedComponents( + const GraphDef& graph, std::unordered_map* components, + int* num_components); + +// Returns the number of individual loops present in the graph, and populates +// the 'loops' argument with the collection of loops (denoted by their loop ids) +// a node is part of. Loop ids are arbitrary. +int IdentifyLoops(const GraphDef& graph, + std::unordered_map>* loops); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/symbolic_shapes.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/symbolic_shapes.h new file mode 100644 index 00000000..9da374ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/symbolic_shapes.h @@ -0,0 +1,77 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_SYMBOLIC_SHAPES_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_SYMBOLIC_SHAPES_H_ + +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +bool IsKnown(const TensorShapeProto::Dim& dim); +bool IsKnownSymbolically(const TensorShapeProto::Dim& dim); +bool IsUnknown(const TensorShapeProto::Dim& dim); + +// A shape is symbolically defined if it has a known rank, and each dimension is +// known (dim_size >= 0) or is a symbolic dimension size (dim_size <= -2). +bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape); +bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties); + +// Returns the rank of the shape or -1 if unknown. +int Rank(const TensorShapeProto& shape); + +// Returns the number of coefficients in the shape or -1 if unknown. +// TODO(bsteiner) Add a function that computes the minimum size of the tensor, +// i.e. the size assuming all the symbolic dimensions take the value 1.
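Editorial aside, not part of the vendored header: a standalone restatement of the dimension convention documented above (known when >= 0, symbolic when <= -2, unknown when exactly -1). This is not the vendored implementation, only the convention spelled out as runnable checks.

#include <cassert>
#include <cstdint>

// Dimension convention from the comments above, as plain predicates.
bool DimIsKnown(int64_t dim_size) { return dim_size >= 0; }
bool DimIsKnownSymbolically(int64_t dim_size) { return dim_size <= -2; }
bool DimIsUnknown(int64_t dim_size) { return dim_size == -1; }

int main() {
  assert(DimIsKnown(32));
  assert(DimIsKnownSymbolically(-2));  // e.g. a symbolic batch dimension
  assert(DimIsUnknown(-1));
  // A shape is "symbolically defined" when it has a known rank and every
  // dimension is either known or symbolic, i.e. never -1.
  const int64_t dims[] = {-2, 224, 224, 3};
  for (int64_t d : dims) assert(!DimIsUnknown(d));
  return 0;
}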
+int64_t NumCoefficients(const TensorShapeProto& shape); + +// Shapes are symbolically equal, if they have the same rank, they are known or +// symbolically defined, and have matching dimensions. +bool ShapesSymbolicallyEqual(const TensorShapeProto& left, + const TensorShapeProto& right); +bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left, + const OpInfo::TensorProperties& right); + +// Check if two shapes can be broadcasted to each other. Both shapes must be at +// least symbolically defined, and the have valid BCast instance. +bool ShapesBroadcastable(const TensorShapeProto& left, + const TensorShapeProto& right); +bool ShapesBroadcastable(const OpInfo::TensorProperties& left, + const OpInfo::TensorProperties& right); +bool ShapeAfterBroadcast(const TensorShapeProto& left, + const TensorShapeProto& right, + TensorShapeProto* output_shape); + +// Return true if can prove, that tensor of size 'left' is smaller than tensor +// of size 'right'. Return false if it's larger or equal, or it's impossible to +// compare because of unknown dimensions, or mismatch in symbolic dimensions. +bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left, + const TensorShapeProto& right); +bool CompareSymbolicallyShapedTensorSizes( + const OpInfo::TensorProperties& left, + const OpInfo::TensorProperties& right); + +// Returns the ratio of the sizes of the 2 shapes if known statically, or -1 +// otherwise. +int64_t ComputeSizeRatio(const TensorShapeProto& numerator, + const TensorShapeProto& denominator); + +} // namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_SYMBOLIC_SHAPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/topological_sort.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/topological_sort.h new file mode 100644 index 00000000..59ea41af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/topological_sort.h @@ -0,0 +1,58 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_ + +#include "absl/types/span.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// TODO(ezhulenev, b/121379902): We should be consistent with GraphTopologyView +// and use `GraphView::Edge` to pass extra dependencies. +struct TopologicalDependency { + TopologicalDependency(const NodeDef* from, const NodeDef* to) + : from(from), to(to) {} + const NodeDef* from; + const NodeDef* to; +}; + +// Computes a topological ordering for the graph nodes and outputs nodes in the +// topological order to the `topo_order` output argument. 
+// +// It's possible to pass additional edges that do not exists in a graph, but +// must be respected when computing graph topological order. Example: Tensorflow +// runtime allows concurrent execution of dequeue/enqueue ops from the same +// queue resource, but we might want to enforce ordering between them. +absl::Status ComputeTopologicalOrder( + const GraphDef& graph, + absl::Span extra_dependencies, + std::vector* topo_order); +absl::Status ComputeTopologicalOrder(const GraphDef& graph, + std::vector* topo_order); + +// Sorts a graph in topological order. +absl::Status TopologicalSort(GraphDef* graph); + +// Sorts a graph in topological order and reverse it. +absl::Status ReversedTopologicalSort(GraphDef* graph); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/tpu.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/tpu.h new file mode 100644 index 00000000..f81ab93f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/tpu.h @@ -0,0 +1,31 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TPU_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TPU_H_ + +#include "tensorflow/core/framework/graph.pb.h" + +namespace tensorflow { +namespace grappler { + +// Check if the graphdef contains nodes that indicate a graph processed by the +// legacy TPU bridge. +bool IsLegacyTPUBridgeGraphDef(const GraphDef& def); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/transitive_fanin.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/transitive_fanin.h new file mode 100644 index 00000000..dd9b0c46 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/transitive_fanin.h @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRANSITIVE_FANIN_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRANSITIVE_FANIN_H_ + +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// Computes the transitive fanin of the graph based on reachability from the +// specified terminal nodes. Returns the set of nodes comprising the +// transitive fanin into fanin_nodes. Optionally returns a map of name->node +// for that graph into name_to_fanin_node if that is not set to nullptr. +absl::Status ComputeTransitiveFanin( + const GraphDef& graph, const std::vector& terminal_nodes, + std::unordered_map* name_to_fanin_node, + std::vector* fanin_nodes); + +absl::Status ComputeTransitiveFanin(const GraphDef& graph, + const std::vector& terminal_nodes, + std::vector* fanin_nodes); + +// Creates output_graph from input_graph using the transitive fanin from the +// specified terminal nodes. Returns error if the input_graph is deemed +// structurally invalid. +absl::Status SetTransitiveFaninGraph(const GraphDef& input_graph, + GraphDef* output_graph, + const std::vector& terminal_nodes); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TRANSITIVE_FANIN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/utils/traversal.h b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/traversal.h new file mode 100644 index 00000000..5c9dada4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/utils/traversal.h @@ -0,0 +1,103 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ + +#include + +#include "tensorflow/core/grappler/graph_topology_view.h" + +namespace tensorflow { +namespace grappler { + +enum class TraversalDirection { kFollowInputs, kFollowOutputs }; + +// Encapsulate DFS callbacks that will be called during the graph traversal. +// +// If non-empty, the `pre_order` and `post_order` functors will be called on +// each reachable node (including the `from` nodes) in pre and post order. If +// loops are found, the `on_back_edge` functor will be called on the +// corresponding back edges. Moreover, the pre and post order will assume that +// these back edges will be cut. 
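Editorial aside, not part of the vendored header: a self-contained sketch of the DFS contract described above, with pre-order, post-order, and back-edge callbacks, and back edges treated as cut. A plain adjacency list stands in for GraphTopologyView, and all names are illustrative.

#include <functional>
#include <iostream>
#include <vector>

enum class Color { kWhite, kGray, kBlack };

// Calls pre_order/post_order on each reachable node and on_back_edge on
// edges that close a cycle (which the traversal then ignores).
void Dfs(const std::vector<std::vector<int>>& adj, int root,
         const std::function<void(int)>& pre_order,
         const std::function<void(int)>& post_order,
         const std::function<void(int, int)>& on_back_edge) {
  std::vector<Color> color(adj.size(), Color::kWhite);
  std::function<void(int)> visit = [&](int node) {
    color[node] = Color::kGray;
    if (pre_order) pre_order(node);
    for (int next : adj[node]) {
      if (color[next] == Color::kGray) {
        // A gray successor means we found a loop; report it and cut the edge.
        if (on_back_edge) on_back_edge(node, next);
      } else if (color[next] == Color::kWhite) {
        visit(next);
      }
    }
    color[node] = Color::kBlack;
    if (post_order) post_order(node);
  };
  visit(root);
}

int main() {
  // 0 -> 1 -> 2 -> 0 forms a loop; 2 -> 3 leaves it.
  std::vector<std::vector<int>> adj = {{1}, {2}, {0, 3}, {}};
  Dfs(adj, /*root=*/0,
      [](int n) { std::cout << "pre " << n << "\n"; },
      [](int n) { std::cout << "post " << n << "\n"; },
      [](int from, int to) {
        std::cout << "back edge " << from << " -> " << to << "\n";
      });
  return 0;
}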
+struct DfsCallbacks { + DfsCallbacks() = default; + DfsCallbacks(std::function pre, + std::function post, + std::function back_edge) + : pre_order(std::move(pre)), + post_order(std::move(post)), + on_back_edge(std::move(back_edge)) {} + + static DfsCallbacks PreOrder(std::function pre) { + return DfsCallbacks(std::move(pre), nullptr, nullptr); + } + + static DfsCallbacks PostOrder(std::function post) { + return DfsCallbacks(nullptr, std::move(post), nullptr); + } + + std::function pre_order; + std::function post_order; + std::function on_back_edge; +}; + +// Encapsulate DFS predicates for traversing the graph. +// +// The `enter` predicate decides if traversal should enter the node, and the +// `advance` predicate decides if the traversal should follow inputs/outputs +// from the node. +// +// If predicates are empty (default initialized), it's assumed that we can enter +// into any node and advance from any node respectively. +struct DfsPredicates { + DfsPredicates() = default; + DfsPredicates(std::function enter, + std::function advance) + : enter(std::move(enter)), advance(std::move(advance)) {} + + static DfsPredicates Enter(std::function enter) { + return DfsPredicates(std::move(enter), nullptr); + } + + static DfsPredicates Advance(std::function advance) { + return DfsPredicates(nullptr, std::move(advance)); + } + + std::function enter; + std::function advance; +}; + +// Traverse the graph in DFS order in the given direction, starting from the +// list of nodes specified in the `from` argument. Use `predicates` to decide if +// traversal should enter/advance to/from the graph node. These predicates also +// applied to the `from` nodes. Call corresponding callbacks for each visited +// node. +void DfsTraversal(const GraphTopologyView& graph_view, + absl::Span from, + TraversalDirection direction, const DfsPredicates& predicates, + const DfsCallbacks& callbacks); + +// Traverse the graph in DFS order in the given direction, starting from the +// list of nodes specified in the `from` argument. Call corresponding callbacks +// for each visited node. +void DfsTraversal(const GraphTopologyView& graph_view, + absl::Span from, + TraversalDirection direction, const DfsCallbacks& callbacks); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/graph_verifier.h b/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/graph_verifier.h new file mode 100644 index 00000000..53d62e4c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/graph_verifier.h @@ -0,0 +1,55 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_ +#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_ + +#include +#include +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// An abstract interface for verifying a graph. +// This will be used to implement specific verifiers to verify that a grappler +// transformed graph is valid. +// Some examples of specific verifiers are: +// 1. A general structural verifier that verifies that the specified graph has +// a valid structure that meets the specification of what it means to be +// a valid TensorFlow graph. +// 2. A backend specific verifier that verifies that the specified graph, +// generated after a grappler transformation to convert the input TensorFlow +// graph to a corresponding backend graph, is a valid graph in the +// specification of the backend. +class GraphVerifier { + public: + GraphVerifier() {} + virtual ~GraphVerifier() {} + + // A name for the verifier. + virtual string name() const = 0; + + // Implement an algorithm to verify the specified graph. + // The return value is a Status that represents a concatenation of Status of + // each verification step. + virtual absl::Status Verify(const GraphDef& graph) = 0; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/structure_verifier.h b/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/structure_verifier.h new file mode 100644 index 00000000..de77933f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/grappler/verifiers/structure_verifier.h @@ -0,0 +1,43 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_ +#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_ + +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/verifiers/graph_verifier.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// Verifies the structure of a graph to ensure it is valid. 
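Editorial aside, not part of the vendored header: a hypothetical implementation of the GraphVerifier interface declared above, assuming the TensorFlow headers added by this diff are available. It checks only that node names are unique, as one example of a structural check; the class name is made up.

#include <string>
#include <unordered_set>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {
namespace grappler {

class UniqueNodeNameVerifier : public GraphVerifier {
 public:
  string name() const override { return "unique_node_name_verifier"; }

  absl::Status Verify(const GraphDef& graph) override {
    std::unordered_set<string> seen;
    for (const NodeDef& node : graph.node()) {
      // Duplicate names make the graph ambiguous, so reject them.
      if (!seen.insert(node.name()).second) {
        return errors::InvalidArgument("Duplicate node name: ", node.name());
      }
    }
    return absl::OkStatus();
  }
};

}  // namespace grappler
}  // namespace tensorflow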
+class StructureVerifier : public GraphVerifier { + public: + StructureVerifier() {} + ~StructureVerifier() override {} + + string name() const override { return "structure_verifier"; }; + + absl::Status Verify(const GraphDef& graph) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/dialect.h b/third_party/tflite-hdrs/tensorflow/core/ir/dialect.h new file mode 100644 index 00000000..cba40b38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/dialect.h @@ -0,0 +1,82 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_DIALECT_H_ +#define TENSORFLOW_CORE_IR_DIALECT_H_ + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "tensorflow/core/ir/types/dialect.h" + +namespace mlir { +namespace tfg { +// Include the relevant TensorFlow attrs/types directly in the TFG namespace. 
+using mlir::tf_type::Bfloat16RefType; // NOLINT +using mlir::tf_type::BoolRefType; // NOLINT +using mlir::tf_type::Complex128RefType; // NOLINT +using mlir::tf_type::Complex64RefType; // NOLINT +using mlir::tf_type::ControlType; // NOLINT +using mlir::tf_type::DoubleRefType; // NOLINT +using mlir::tf_type::Float8E4M3FNRefType; // NOLINT +using mlir::tf_type::Float8E5M2RefType; // NOLINT +using mlir::tf_type::FloatRefType; // NOLINT +using mlir::tf_type::FuncAttr; // NOLINT +using mlir::tf_type::HalfRefType; // NOLINT +using mlir::tf_type::Int16RefType; // NOLINT +using mlir::tf_type::Int32RefType; // NOLINT +using mlir::tf_type::Int4RefType; // NOLINT +using mlir::tf_type::Int64RefType; // NOLINT +using mlir::tf_type::Int8RefType; // NOLINT +using mlir::tf_type::OpaqueTensorType; // NOLINT +using mlir::tf_type::PlaceholderAttr; // NOLINT +using mlir::tf_type::Qint16RefType; // NOLINT +using mlir::tf_type::Qint16Type; // NOLINT +using mlir::tf_type::Qint32RefType; // NOLINT +using mlir::tf_type::Qint32Type; // NOLINT +using mlir::tf_type::Qint8RefType; // NOLINT +using mlir::tf_type::Qint8Type; // NOLINT +using mlir::tf_type::Quint16RefType; // NOLINT +using mlir::tf_type::Quint16Type; // NOLINT +using mlir::tf_type::Quint8RefType; // NOLINT +using mlir::tf_type::Quint8Type; // NOLINT +using mlir::tf_type::ResourceRefType; // NOLINT +using mlir::tf_type::ResourceType; // NOLINT +using mlir::tf_type::ShapeAttr; // NOLINT +using mlir::tf_type::StringRefType; // NOLINT +using mlir::tf_type::StringType; // NOLINT +using mlir::tf_type::Uint16RefType; // NOLINT +using mlir::tf_type::Uint32RefType; // NOLINT +using mlir::tf_type::Uint4RefType; // NOLINT +using mlir::tf_type::Uint64RefType; // NOLINT +using mlir::tf_type::Uint8RefType; // NOLINT +using mlir::tf_type::VariantRefType; // NOLINT +using mlir::tf_type::VariantType; // NOLINT +using mlir::tf_type::VersionAttr; // NOLINT + +class TFGraphOpAsmInterface; +class TFOp; +} // namespace tfg +} // namespace mlir + +// Dialect main class is defined in ODS, we include it here. +#include "tensorflow/core/ir/dialect.h.inc" // IWYU pragma: export +// ODS-generated attribute classes. +#define GET_ATTRDEF_CLASSES +#include "tensorflow/core/ir/attributes.h.inc" + +#endif // TENSORFLOW_CORE_IR_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_attributes.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_attributes.h new file mode 100644 index 00000000..250a32e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_attributes.h @@ -0,0 +1,86 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_ATTRIBUTES_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_ATTRIBUTES_H_ + +#include + +#include "absl/strings/string_view.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/resource_handle.pb.h" +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { + +// Convert the list of MLIR Attributes `attrs` to the `tensorflow::AttrValueMap` +// `values`. +absl::Status ConvertAttributes(ArrayRef<NamedAttribute> attrs, + ArrayRef<StringRef> attrs_to_ignore, + bool remove_ref_type, + tensorflow::AttrValueMap* values); + +// Convert the MLIR attribute `attr` and return a `tensorflow::AttrValue`. +absl::StatusOr<tensorflow::AttrValue> ConvertAttribute(Attribute attr); + +absl::Status SetShapeAttribute(absl::string_view name, ShapedType shaped_type, + tensorflow::AttrValueMap* values); + +// Converts an MLIR shaped type to a TensorFlow shape attribute. +ShapeAttr ConvertTypeToTensorShapeAttr(const Type& type); + +/// Import from TensorFlow to MLIR + +// Converts non func AttrValue proto into an MLIR attribute. Func attribute is +// excluded in this function because the function might be renamed when the +// function definition is imported. +absl::StatusOr<Attribute> ConvertNonFuncAttributeValue( + const tensorflow::AttrValue& value, Builder& builder); + +// Converts all kinds of AttrValue proto into an MLIR attribute. +absl::StatusOr<Attribute> ConvertAttributeValue( + const tensorflow::AttrValue& value, Builder& builder); + +// Convert the MLIR FullType attribute `attr` and return a +// `tensorflow::FullTypeDef`. +absl::StatusOr<tensorflow::FullTypeDef> ConvertAttribute( + tf_type::FullTypeAttr full_type); + +// Converts fulltype proto to attribute. +absl::StatusOr< ::mlir::tf_type::FullTypeAttr> ConvertAttribute( + const tensorflow::FullTypeDef& full_type, Builder& builder); + +// Convert an array of handle data (pairs of data types and shapes) to an array +// attribute of tensor types. +absl::StatusOr<ArrayAttr> ConvertHandleData( + Builder builder, + const tensorflow::protobuf::RepeatedPtrField< + tensorflow::ResourceHandleProto_DtypeAndShape>& handle_data); + +// Convert an array of handle data into the `handle_data` field of the provided +// ArgDef. Each entry of the array is expected to be a TensorType. +absl::Status ConvertHandleData(ArrayAttr handle_data_arr, + tensorflow::OpDef::ArgDef* arg); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_ATTRIBUTES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_tensor.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_tensor.h new file mode 100644 index 00000000..15bbe282 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_tensor.h @@ -0,0 +1,93 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
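For orientation, a minimal usage sketch (not part of the vendored header) showing how the attribute-export entry point above is typically driven. The helper name `ExportAttrs` and the ignored attribute name are hypothetical, and the template arguments follow the reconstruction above.

// Sketch only, assuming the signatures declared in convert_attributes.h above.
#include "mlir/IR/Operation.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/ir/importexport/convert_attributes.h"

// Export all attributes of a TFG operation into an AttrValueMap, skipping a
// (hypothetical) internal bookkeeping attribute.
absl::Status ExportAttrs(mlir::Operation *op, tensorflow::AttrValueMap *values) {
  llvm::SmallVector<llvm::StringRef, 1> ignore = {"_internal_only"};
  return mlir::tfg::ConvertAttributes(op->getAttrs(), ignore,
                                      /*remove_ref_type=*/false, values);
}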
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TENSOR_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TENSOR_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/types/dialect.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { + +// Converts an TensorFlow tensor proto into an MLIR elements attribute. +absl::StatusOr ConvertTensorProto( + const tensorflow::TensorProto& input_tensor, Builder builder); + +// Converts an TensorFlow tensor into an MLIR elements attribute. +absl::StatusOr ConvertTensor( + const tensorflow::Tensor& input_tensor, Builder builder); + +// Converts a shape from MLIR to a TensorFlow tensor shape proto. +void ConvertToTensorShapeProto(ArrayRef shape, + tensorflow::TensorShapeProto* output_shape); + +// Converts an MLIR type to a TensorFlow tensor shape. +tensorflow::PartialTensorShape ConvertTypeToTensorShape(const Type& type); + +// Converts a TensorFlow shape attribute to an MLIR shape attribute. +absl::StatusOr ConvertTensorShapeProto( + const tensorflow::TensorShapeProto& shape, MLIRContext* context); + +// Fill in the contents of TensorShapeProto for the given shape. +// ShapeContainerT is any type with the following methods: +// bool hasRank() +// ArrayRef getShape() +// This includes TF::ShapeAttr and ShapedType. +template +void SetTensorShapeProto(ShapeContainerT shape, + tensorflow::TensorShapeProto* proto) { + if (shape.hasRank()) { + for (int64_t dim : shape.getShape()) { + // TODO(hinsu): Use tensorflow::kTFDynamicSize instead of -1 without + // depending on tensorflow/compiler + proto->add_dim()->set_size(mlir::ShapedType::isDynamic(dim) ? -1 : dim); + } + } else { + proto->set_unknown_rank(true); + } +} + +// Converts an MLIR elements attribute to a TensorFlow tensor proto. +absl::Status ConvertToTensorProto(ElementsAttr attr, + tensorflow::TensorProto* output_tensor); + +// Converts an MLIR elements attribute to a TensorFlow tensor. +absl::Status ConvertToTensor(ElementsAttr attr, + tensorflow::Tensor* output_tensor); + +// Converts a TF shape to MLIR shape, i.e. -1 becomes kDynamicSize. +llvm::SmallVector ConvertTFShapeToMlir(llvm::ArrayRef shape); + +// Converts an MLIR shape to TF shape, i.e. kDynamicSize becomes -1. +llvm::SmallVector ConvertMlirShapeToTF(llvm::ArrayRef shape); + +// Creates a TF TensorShape using MLIR shape, element type and encoding. 
+mlir::RankedTensorType GetTypeFromTFTensorShape(llvm::ArrayRef shape, + mlir::Type elementType, + mlir::Attribute encoding = {}); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_types.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_types.h new file mode 100644 index 00000000..d3f1756c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/convert_types.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TYPES_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TYPES_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { +// Converts the TensorFlow DataType 'dtype' into an MLIR (scalar) type. +absl::Status ConvertDataType(tensorflow::DataType dtype, Builder& builder, + Type* type); + +// Converts a scalar MLIR type to a TensorFlow Datatype. +absl::Status ConvertScalarTypeToDataType(Type type, + tensorflow::DataType* dtype); + +// Converts an MLIR type to TensorFlow DataType. If 'type' is a scalar type, it +// is converted directly. If it is a shaped type, the element type is converted. +absl::Status ConvertToDataType(Type type, tensorflow::DataType* dtype); + +// Converts an TensorFlow shape to the one used in MLIR. +void ConvertToMlirShape(const tensorflow::TensorShape& input_shape, + SmallVectorImpl* shape); + +// Converts an TensorFlow shape proto to the one used in MLIR. +absl::Status ConvertToMlirShape(const tensorflow::TensorShapeProto& input_shape, + SmallVectorImpl* shape); + +// Given a tensor shape and dtype, get the corresponding MLIR tensor type. +absl::StatusOr ConvertToMlirTensorType( + const tensorflow::TensorShapeProto& shape, tensorflow::DataType dtype, + Builder* builder); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_CONVERT_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_export.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_export.h new file mode 100644 index 00000000..1eec4282 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_export.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
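A small sketch (not part of the vendored header) of driving the type-conversion helpers declared in convert_types.h above. The `absl::StatusOr<mlir::TensorType>` return type and the helper name are assumptions; the template argument is not visible in the header as vendored here.

// Sketch only, under the assumptions stated above.
#include "mlir/IR/Builders.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/ir/importexport/convert_types.h"

// Build the MLIR tensor type for a DT_FLOAT tensor of shape [2, ?]. A size of
// -1 in TensorShapeProto denotes an unknown dimension.
absl::StatusOr<mlir::TensorType> MakeFloat2xUnknownType(mlir::Builder &builder) {
  tensorflow::TensorShapeProto shape;
  shape.add_dim()->set_size(2);
  shape.add_dim()->set_size(-1);
  return mlir::tfg::ConvertToMlirTensorType(shape, tensorflow::DT_FLOAT,
                                            &builder);
}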
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_EXPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_EXPORT_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { + +// Export a generic GraphFuncOp into a FunctionDef. This is intended to be a +// straight serialization, an error is returned in case of failure. +absl::StatusOr ConvertGenericFunctionToFunctionDef( + GraphFuncOp func); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_import.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_import.h new file mode 100644 index 00000000..7e9aba69 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/functiondef_import.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_IMPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_IMPORT_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace tfg { + +// Import the FunctionDef `func` as a TFG generic function (see GraphFuncOp +// documentation). The function will be inserted using the provided `builder`. +absl::Status ConvertGenericFunction(GraphFuncOp func_op, + const tensorflow::FunctionDef& func, + OpBuilder& builder); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_FUNCTIONDEF_IMPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_export.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_export.h new file mode 100644 index 00000000..74af12fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_export.h @@ -0,0 +1,56 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
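As a rough illustration of how the two FunctionDef halves above fit together, a sketch (not part of the vendored headers) that imports a FunctionDef into an already-created generic `GraphFuncOp` and immediately re-exports it; the helper name `RoundTripFunctionDef` is hypothetical.

// Sketch only, using the import/export entry points declared above.
#include "mlir/IR/Builders.h"
#include "tensorflow/core/framework/function.pb.h"
#include "tensorflow/core/ir/importexport/functiondef_export.h"
#include "tensorflow/core/ir/importexport/functiondef_import.h"
#include "tensorflow/core/ir/ops.h"

// Import `fdef` into `func_op`, then serialize it back to a FunctionDef.
absl::StatusOr<tensorflow::FunctionDef> RoundTripFunctionDef(
    mlir::tfg::GraphFuncOp func_op, const tensorflow::FunctionDef &fdef,
    mlir::OpBuilder &builder) {
  absl::Status status = mlir::tfg::ConvertGenericFunction(func_op, fdef, builder);
  if (!status.ok()) return status;
  return mlir::tfg::ConvertGenericFunctionToFunctionDef(func_op);
}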
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_EXPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_EXPORT_H_ + +#include <string> + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { + +// Get the name of a value as if it were an edge in a graph. +absl::StatusOr<std::string> GetValueName(Value value, TFGraphDialect *dialect); + +// Convert a TFG graph directly to GraphDef. Graph functions in the module are +// added to the GraphDef's function library. +absl::Status ConvertToGraphDef(ModuleOp module, tensorflow::GraphDef *graph); + +// Convert a single TFG op to NodeDef. This utility function requires a callback +// `get_value_name` that returns the edge name of the given operand. +absl::Status ConvertToNodeDef( + Operation *op, tensorflow::NodeDef *node, TFGraphDialect *dialect, + function_ref<absl::StatusOr<std::string>(Value)> get_value_name); + +// Convert a single TFG function to a FunctionDef and add it to the function +// library. If a function with the same name already exists, replace it. +absl::Status ConvertToFunctionDef( + GraphFuncOp func, tensorflow::FunctionLibraryDefinition &library); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_import.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_import.h new file mode 100644 index 00000000..cda3a989 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/graphdef_import.h @@ -0,0 +1,45 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
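A minimal sketch (not part of the vendored header) of the typical export path through `ConvertToGraphDef` declared above; `ExportToGraphDef` is a hypothetical wrapper.

// Sketch only.
#include "mlir/IR/BuiltinOps.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/ir/importexport/graphdef_export.h"

// Serialize a TFG module to a GraphDef. Graph functions in the module end up
// in the GraphDef's function library, as documented above.
absl::StatusOr<tensorflow::GraphDef> ExportToGraphDef(mlir::ModuleOp module) {
  tensorflow::GraphDef graph;
  absl::Status status = mlir::tfg::ConvertToGraphDef(module, &graph);
  if (!status.ok()) return status;
  return graph;
}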
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_IMPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_IMPORT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/statusor.h" + +namespace mlir { +namespace tfg { + +// Convert a GraphDef directly to TFG. +absl::StatusOr> ImportGraphDef( + MLIRContext *context, const tensorflow::GraphDebugInfo &debug_info, + const tensorflow::GraphDef &graph_def); + +// Converts a graph and function library to a TFG module. +absl::StatusOr> ImportGraphAndFunctionsToMlir( + MLIRContext *context, const tensorflow::GraphDebugInfo &debug_info, + const tensorflow::Graph &graph, + const tensorflow::FunctionLibraryDefinition &flib_def); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_GRAPHDEF_IMPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/load_proto.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/load_proto.h new file mode 100644 index 00000000..9644411c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/load_proto.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_LOAD_PROTO_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_LOAD_PROTO_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +// Reads text (.pbtext) or binary (.pb) format of a proto message from the given +// buffer. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. +absl::Status LoadProtoFromBuffer(absl::string_view input, + protobuf::Message* proto); +absl::Status LoadProtoFromBuffer(absl::string_view input, + protobuf::MessageLite* proto); + +// Reads text (.pbtext) or binary (.pb) format of a proto message from the given +// file path. Returns error status of the file is not found or malformed proto. +// Note that text protos can only be parsed when full protobuf::Message protos +// are used, and will fail for protobuf::MessageLite protos. 
+absl::Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::Message* proto); +absl::Status LoadProtoFromFile(absl::string_view input_filename, + protobuf::MessageLite* proto); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_LOAD_PROTO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/mangling.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/mangling.h new file mode 100644 index 00000000..a85be927 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/mangling.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_MANGLING_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_MANGLING_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace mlir { +namespace tfg { +namespace mangling_util { +// The type of a mangled string. +enum class MangledKind { kUnknown, kDataType, kTensorShape, kTensor }; + +// Print proto in TextFormat in the single-line mode. +std::string PrintShortTextProto(const ::tensorflow::protobuf::Message& message); +// The MessageLite interface does not support reflection so this API +// will only print a summary of the proto. This API is needed for code +// that may work with both Message and MessageLite. +std::string PrintShortTextProto( + const ::tensorflow::protobuf::MessageLite& message); + +// Mangles an attribute name, marking the attribute as a TensorFlow attribute. +std::string MangleAttributeName(absl::string_view str); + +// Returns true if 'str' was mangled with MangleAttributeName. +bool IsMangledAttributeName(absl::string_view str); + +// Demangles an attribute name that was manged with MangleAttributeName. +// REQUIRES: IsMangledAttributeName returns true. +absl::string_view DemangleAttributeName(absl::string_view str); + +// Returns the type of a mangled string, or kUnknown. +MangledKind GetMangledKind(absl::string_view str); + +// Return a TensorShapeProto mangled as a string. +std::string MangleShape(const tensorflow::TensorShapeProto& shape); +// Demangle a string mangled with MangleShape. +absl::Status DemangleShape(absl::string_view str, + tensorflow::TensorShapeProto* proto); + +// Return a TensorProto mangled as a string. +std::string MangleTensor(const tensorflow::TensorProto& tensor); +// Demangle a string mangled with MangleTensor. +absl::Status DemangleTensor(absl::string_view str, + tensorflow::TensorProto* proto); + +// Return a DataType mangled as a string. +std::string MangleDataType(const tensorflow::DataType& dtype); +// Demangle a string mangled with MangleDataType. 
+absl::Status DemangleDataType(absl::string_view str, + tensorflow::DataType* proto); + +} // namespace mangling_util +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_MANGLING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/parse_text_proto.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/parse_text_proto.h new file mode 100644 index 00000000..00a7d83e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/parse_text_proto.h @@ -0,0 +1,46 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_PARSE_TEXT_PROTO_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_PARSE_TEXT_PROTO_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace mlir { +namespace tfg { + +// Sets output to the given input with `prefix` stripped, or returns an error if +// the prefix doesn't exist. +absl::Status ConsumePrefix(absl::string_view str, absl::string_view prefix, + absl::string_view* output); + +// Strips `prefix_to_strip` from `text_proto`, parses, and returns the parsed +// proto. +absl::Status ParseTextProto(absl::string_view text_proto, + absl::string_view prefix_to_strip, + tensorflow::protobuf::Message* parsed_proto); +inline absl::Status ParseTextProto( + absl::string_view /* text_proto */, absl::string_view /* prefix_to_strip */, + tensorflow::protobuf::MessageLite* /* parsed_proto */) { + return tensorflow::errors::Unavailable("Cannot parse text protos on mobile."); +} + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_PARSE_TEXT_PROTO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_export.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_export.h new file mode 100644 index 00000000..b270ce9c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_export.h @@ -0,0 +1,39 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
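A short sketch (not part of the vendored headers) exercising the shape mangling pair declared above; `RoundTripShape` is a hypothetical helper.

// Sketch only.
#include <string>

#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/ir/importexport/mangling.h"

// Mangle a TensorShapeProto into its string form and demangle it back,
// returning an error if the round trip fails to parse.
absl::Status RoundTripShape(const tensorflow::TensorShapeProto &shape) {
  std::string mangled = mlir::tfg::mangling_util::MangleShape(shape);
  tensorflow::TensorShapeProto recovered;
  return mlir::tfg::mangling_util::DemangleShape(mangled, &recovered);
}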
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_EXPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_EXPORT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace mlir { +namespace tfg { + +// Given an MLIR module, returns a `output_saved_model` SavedModel. +// The module must contain at most a single Graph operation and zero or more +// TFFunc operations. `original_saved_model` is used as only a GraphDef portion +// of a saved model represented in the MLIR module. +absl::Status ExportMlirToSavedModel( + mlir::ModuleOp module, const tensorflow::SavedModel &original_saved_model, + tensorflow::SavedModel *output_saved_model); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_EXPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_import.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_import.h new file mode 100644 index 00000000..787f2ae5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/savedmodel_import.h @@ -0,0 +1,40 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_IMPORT_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_IMPORT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace mlir { +namespace tfg { + +// Converts a saved model to a MLIR module expressed in TFG dialect. +// Only the root graph and function library of the saved model gets imported +// into MLIR TFG dialect. +// TODO(b/218882780): Consider importing SignatureDefs from the SavedModel. +absl::StatusOr> ImportSavedModelToMlir( + mlir::MLIRContext* context, const tensorflow::GraphDebugInfo& debug_info, + const tensorflow::SavedModel& saved_model); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_SAVEDMODEL_IMPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/importexport/tests/roundtrip/roundtrip.h b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/tests/roundtrip/roundtrip.h new file mode 100644 index 00000000..516ede67 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/importexport/tests/roundtrip/roundtrip.h @@ -0,0 +1,25 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
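A sketch (not part of the vendored headers) of the intended SavedModel round trip using the import/export entry points above; error handling is abbreviated and the helper name is hypothetical.

// Sketch only.
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/core/framework/graph_debug_info.pb.h"
#include "tensorflow/core/ir/importexport/savedmodel_export.h"
#include "tensorflow/core/ir/importexport/savedmodel_import.h"
#include "tensorflow/core/protobuf/saved_model.pb.h"

// Import the GraphDef portion of `saved_model` into TFG, then write it back
// into `output`, reusing `saved_model` for everything outside the GraphDef.
absl::Status RoundTripSavedModel(mlir::MLIRContext *context,
                                 const tensorflow::SavedModel &saved_model,
                                 tensorflow::SavedModel *output) {
  tensorflow::GraphDebugInfo debug_info;  // No debug info available here.
  auto module = mlir::tfg::ImportSavedModelToMlir(context, debug_info,
                                                  saved_model);
  if (!module.ok()) return module.status();
  return mlir::tfg::ExportMlirToSavedModel(**module, saved_model, output);
}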
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_IMPORTEXPORT_TESTS_ROUNDTRIP_ROUNDTRIP_H_ +#define TENSORFLOW_CORE_IR_IMPORTEXPORT_TESTS_ROUNDTRIP_ROUNDTRIP_H_ + +#include "tensorflow/core/framework/graph.pb.h" + +namespace tensorflow { +void NormalizeTensorData(GraphDef& graphdef, bool add_fulltype); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_IR_IMPORTEXPORT_TESTS_ROUNDTRIP_ROUNDTRIP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/interfaces.h b/third_party/tflite-hdrs/tensorflow/core/ir/interfaces.h new file mode 100644 index 00000000..c6b07034 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/interfaces.h @@ -0,0 +1,75 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_INTERFACES_H_ +#define TENSORFLOW_CORE_IR_INTERFACES_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/DialectInterface.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" + +// Include generated declarations. +#include "tensorflow/core/ir/interfaces.h.inc" + +namespace mlir { +namespace tfg { +// The dialect fallback model for the TensorFlow registry interface. +class TensorFlowRegistryInterfaceBase + : public TensorFlowRegistryInterface::FallbackModel< + TensorFlowRegistryInterfaceBase>, + public DialectInterface::Base { + public: + explicit TensorFlowRegistryInterfaceBase(Dialect *dialect) + : DialectInterface::Base(dialect) {} + + // Returns whether the operation is stateful. + virtual bool isStateful(Operation *op) const = 0; +}; + +// This dialect fallback model implements memory effects for TensorFlow +// operations. +class StatefulMemoryEffectInterface + : public MemoryEffectOpInterface::FallbackModel< + StatefulMemoryEffectInterface>, + public DialectInterface::Base { + public: + explicit StatefulMemoryEffectInterface(Dialect *dialect) + : DialectInterface::Base(dialect) {} + + // Get the memory effects of a TensorFlow operation. If the operation is known + // to be stateless, then it has no memory effects. 
Otherwise, statefulness is + // modelled as `MemoryWrite`. + void getEffects( + Operation *op, + SmallVectorImpl> + &effects) const; +}; +} // namespace tfg + +namespace OpTrait { +// This trait marks intrinsic TFG operations, e.g. terminators, functions, +// and region control-flow operations. Any TFG operation that has this trait +// exists only in MLIR. +template +class IntrinsicOperation + : public mlir::OpTrait::TraitBase {}; +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_INTERFACES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/ops.h b/third_party/tflite-hdrs/tensorflow/core/ir/ops.h new file mode 100644 index 00000000..08e20991 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/ops.h @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_OPS_H_ +#define TENSORFLOW_CORE_IR_OPS_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/RegionKindInterface.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FunctionInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/interfaces.h" +#include "tensorflow/core/ir/tf_op_wrapper.h" + +// Get the C++ declaration for all the ops defined in ODS for the dialect. + +#define GET_OP_CLASSES +#include "tensorflow/core/ir/ops.h.inc" + +namespace mlir { +namespace tfg { + +// Analysis that keeps track of all function names in a module. +struct FunctionTable { + explicit FunctionTable(ModuleOp module); + + // Returns whether there are no functions. + bool empty() const { return functions.empty(); } + + // Returns whether `op` may be a function call. + bool MayBeCall(Operation* op) const; + + // Returns whether `op` is a legacy function call. A "legacy" function call + // is when the operation name is the name of a function in the library. + bool IsLegacyCall(Operation* op) const; + + private: + // All the functions in the graph. 
+ DenseSet functions; +}; + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_registry.h b/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_registry.h new file mode 100644 index 00000000..fe0d82e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_registry.h @@ -0,0 +1,52 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_TF_OP_REGISTRY_H_ +#define TENSORFLOW_CORE_IR_TF_OP_REGISTRY_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/core/ir/interfaces.h" + +// Forward declaration of TensorFlow types. +namespace tensorflow { +class OpRegistry; +} // namespace tensorflow + +namespace mlir { +namespace tfg { +class TensorFlowOpRegistryInterface : public TensorFlowRegistryInterfaceBase { + public: + // Create the interface model with a provided registry. + TensorFlowOpRegistryInterface(Dialect *dialect, + const tensorflow::OpRegistry *registry) + : TensorFlowRegistryInterfaceBase(dialect), registry_(registry) {} + // Create the interface model with the global registry. + explicit TensorFlowOpRegistryInterface(Dialect *dialect); + + // Returns true if the operation is stateful. + bool isStateful(Operation *op) const override; + + // Returns the current TensorFlow op registry. + const tensorflow::OpRegistry *GetRegistry() const { return registry_; } + + private: + // The TensorFlow op registry instance. + const tensorflow::OpRegistry *registry_; +}; +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_TF_OP_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_wrapper.h b/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_wrapper.h new file mode 100644 index 00000000..1c8183f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/tf_op_wrapper.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
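To illustrate the `FunctionTable` analysis declared in ops.h above, a small sketch (not part of the vendored headers) that counts legacy calls in a module; the walker and the counter are illustrative only.

// Sketch only.
#include "mlir/IR/BuiltinOps.h"
#include "tensorflow/core/ir/ops.h"

// Count operations whose name matches a function in the module's library,
// i.e. "legacy" function calls as defined by FunctionTable::IsLegacyCall.
int CountLegacyCalls(mlir::ModuleOp module) {
  mlir::tfg::FunctionTable table(module);
  int count = 0;
  module.walk([&](mlir::Operation *op) {
    if (table.IsLegacyCall(op)) ++count;
  });
  return count;
}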
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_TF_OP_WRAPPER_H_ +#define TENSORFLOW_CORE_IR_TF_OP_WRAPPER_H_ + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/iterator_range.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/types/dialect.h" +#include "tensorflow/core/ir/utility.h" + +namespace mlir { +namespace detail { +// This class iterates over the control dependencies of the values. +template +class ControlRetIterator final + : public llvm::mapped_iterator_base, + ValueIteratorT, Value> { + public: + using llvm::mapped_iterator_base, + ValueIteratorT, Value>::mapped_iterator_base; + + Value mapElement(Value value) const { + return mlir::isa(value.getType()) + ? value + : tfg::LookupControlDependency(value); + } +}; +} // namespace detail + +namespace tfg { + +// Wrapper class exposing convenience methods to manipulate TensorFlow graph +// nodes uniformly. +class TFOp { + public: + // Wrap an operation. The operation can be null. The constructor must be + // marked as implicit to support `llvm::dyn_cast`. + TFOp(Operation *op = nullptr); // NOLINT + + explicit TFOp(Operation &op) : TFOp(&op) {} + + // Support LLVM-style RTTI. + static bool classof(Operation *op) { + return isa(op->getDialect()); + } + + // Get the wrapped operation. + Operation *getOperation() { return op_; } + + // Returns a pointer to the TensorFlow Graph Dialect. It nevers returns + // nullptr. + TFGraphDialect *getDialect() { + return cast(op_->getDialect()); + } + + // Split the operands into data and control operands. + std::pair splitOperands() { + ControlType ctl_type = getDialect()->getControlType(); + return SplitDataAndControlValues(op_->getOperands(), ctl_type); + } + + // Returns the regular operands, the control operands will be excluded. + OperandRange getNonControlOperands() { return splitOperands().first; } + + // The control operands are always after the regular inputs. + OperandRange getControlOperands() { return splitOperands().second; } + + // Returns the control token produced by this operation. + Value controlRet() { return op_->getResult(op_->getNumResults() - 1); } + + // Returns the non-control results produced by this operation. + ResultRange getNonControlResults() { + return op_->getResults().slice(0, op_->getNumResults() - 1); + } + + // Returns the node name for this operation. + StringAttr nameAttr(); + StringRef name(); + // Set a new node name for this operation. + void setName(const Twine &name); + void setName(StringAttr name); + + // Returns the requested device, which is also the "device" field in a + // GraphDef. + StringAttr requestedDeviceAttr(); + StringRef requestedDevice(); + // Set a new requested device for this operation. + void setRequestedDevice(const Twine &requested_device); + void setRequestedDevice(StringAttr requested_device); + + // Returns the assigned device, this field is set by placer in general. + StringAttr assignedDeviceAttr(); + StringRef assignedDevice(); + // Set a new assigned device for this operation. 
+ void setAssignedDevice(const Twine &assigned_device); + void setAssignedDevice(StringAttr assigned_device); + + // Returns the assigned TPU cluster name. + StringAttr tpuReplicate(); + // Set the assigned TPU cluster name. + void setTpuReplicate(StringAttr tpu_replicate); + + // Returns the device, preferring the assigned device if set, and the + // requested device otherwise. + StringAttr deviceAttr() { + StringAttr device = assignedDeviceAttr(); + if (device) { + assert(!device.getValue().empty()); + return device; + } + return requestedDeviceAttr(); + } + StringRef device() { + StringAttr device_attr = deviceAttr(); + if (device_attr) return device_attr.getValue(); + return ""; + } + + // Forward `->` to the underlying operation, exposing the `Operation` methods. + Operation *operator->() { return op_; } + Operation &operator*() { return *op_; } + + // Converts to true if there is a wrapped operation. + explicit operator bool() const { return op_; } + + private: + // The wrapped operation. + Operation *op_; +}; + +// A range iterator to get the control tokens associated with a value range. +// This range allows to wrap a ValueRange (or an OperandRange) and iterates on +// the control token associated to the producer of each value. For example, if +// you wrap the operands of an operation: +// OperandControlRetRange range = op->getOperands(); +// iterating this range will yield the control edges from each of the operations +// (or block arguments) producing these operands. +template +class ControlRetRange final + : public llvm::iterator_range< + ::mlir::detail::ControlRetIterator> { + public: + using Base = llvm::iterator_range< + ::mlir::detail::ControlRetIterator>; + explicit ControlRetRange(ValueRangeT c) : Base(c.begin(), c.end()) {} + + /// Return the value at the given index. + Value operator[](size_t index) const { + assert(index < size() && "invalid index into value range"); + return *(this->begin() + index); + } + + // Return the size of this range. + size_t size() const { return llvm::size(*this); } + + // Return first value in the range. + Value front() { return (*this)[0]; } + + // Compare this range with another. + template + bool operator==(const OtherT &other) const { + return llvm::size(*this) == llvm::size(other) && + std::equal(this->begin(), this->end(), other.begin()); + } + template + bool operator!=(const OtherT &other) const { + return !(*this == other); + } +}; + +using OperandControlRetRange = ControlRetRange; +using ValueControlRetRange = ControlRetRange; + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_TF_OP_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/types/dialect.h b/third_party/tflite-hdrs/tensorflow/core/ir/types/dialect.h new file mode 100644 index 00000000..b0b601e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/types/dialect.h @@ -0,0 +1,359 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
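A brief sketch (not part of the vendored headers) of the `TFOp` wrapper above in use: walking a graph and printing each node's name and effective device; the function name is hypothetical.

// Sketch only.
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Operation.h"
#include "tensorflow/core/ir/tf_op_wrapper.h"

// Print "<node name> -> <device>" for every TFG node nested under `root`,
// preferring the assigned device over the requested one (see deviceAttr()).
void DumpNodePlacement(mlir::Operation *root) {
  root->walk([](mlir::Operation *op) {
    if (!mlir::tfg::TFOp::classof(op)) return;
    mlir::tfg::TFOp node(op);
    llvm::errs() << node.name() << " -> " << node.device() << "\n";
  });
}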
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_TYPES_DIALECT_H_ +#define TENSORFLOW_CORE_IR_TYPES_DIALECT_H_ + +#include +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project + +// Include the dialect class generated from dialect.td. +// The constructor and the printing/parsing of dialect types are manually +// implemented (see ops.cpp). +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/ir/types/dialect.h.inc" + +// Include the Type classes declaration generated from types.td +#define GET_TYPEDEF_CLASSES +#include "tensorflow/core/ir/types/types.h.inc" + +namespace mlir { +namespace tf_type { + +//===----------------------------------------------------------------------===// +// TensorFlow types +//===----------------------------------------------------------------------===// + +// The base class in the TensorFlow type hierarchy. +class TensorFlowType : public Type { + public: + using Type::Type; + + // Support method to enable LLVM-style type casting. + static bool classof(Type type); +}; + +// Returns true if the specified type is a valid TensorFlow element type. +inline bool IsValidTFElementType(Type type) { + return mlir::isa(type); +} + +// Returns true if this is a valid TensorFlow tensor type. +inline bool IsValidTFTensorType(Type type) { + // TensorFlow types should be tensors of one of the valid TensorFlow element + // types. + if (auto tensor_ty = mlir::dyn_cast(type)) + return IsValidTFElementType(tensor_ty.getElementType()); + return false; +} + +namespace detail { +// Common implementation of TensorFlow types. The template argument indicates +// the concrete derived class per CRTP. +template +class TensorFlowTypeImpl + : public Type::TypeBase { + public: + using Base = typename Type::TypeBase; + using TFBase = TensorFlowTypeImpl; + using Base::Base; +}; +} // namespace detail + +// TensorFlowRefType class supports all the ref types in TensorFlow dialect. +class TensorFlowRefType : public TensorFlowType { + public: + using TensorFlowType::TensorFlowType; + + // Checks if a type is TensorFlow Ref type. + static bool classof(Type type); + + // Converts a type to the corresponding TensorFlowRef type. + static TensorFlowType get(Type type); + static TensorFlowType getChecked(Type type, MLIRContext* context, + Location loc) { + if (failed(verify(loc, type))) { + return TensorFlowRefType(); + } + return get(type); + } + + static LogicalResult verify(Location loc, Type type) { + // type should be a valid TensorFlow type. + if (!IsValidTFTensorType(type)) { + return emitError(loc) << "invalid TensorFlow type: " << type; + } + return success(); + } + + // Converts a TensorFlowRef type to the corresponding TensorFlow or standard + // type. + Type RemoveRef(); +}; + +// Define a class for each individual TensorFlow type (dtype), see types.def +// for the list. 
+#define HANDLE_TF_TYPE(tftype, enumerant, name_marg) \ + class tftype##Type : public detail::TensorFlowTypeImpl { \ + public: \ + using TFBase::TFBase; \ + static constexpr StringLiteral name = #name_marg; \ + }; +#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name_marg) +#include "tensorflow/core/ir/types/types.def" + +namespace detail { +// Storage type contains inferred subtypes for TypeWithSubtype. +class TypeWithSubtypeStorage : public TypeStorage { + public: + using KeyTy = ArrayRef; + + // NOLINTNEXTLINE + static TypeWithSubtypeStorage* construct(TypeStorageAllocator& allocator, + const KeyTy& key) { + ArrayRef subtypes = allocator.copyInto(key); + return new (allocator.allocate()) + TypeWithSubtypeStorage(subtypes); + } + + explicit TypeWithSubtypeStorage(const KeyTy& key) : subtypes_(key) {} + + bool operator==(const KeyTy& key) const { return key == subtypes_; } + + static llvm::hash_code hashKey(const KeyTy& key) { + return llvm::hash_combine_range(key.begin(), key.end()); + } + + KeyTy subtypes_; +}; + +// Common implementation of TensorFlow types with subtypes. These subtypes are +// opaque and their interpretation depends on the actual underlying type. +// The template argument indicates the concrete derived class per CRTP. Concrete +// classes must implement the following: +// - `static std::string getTypeName()` that returns the name of the type for +// verification logging. +template +class TypeWithSubtypeImpl + : public Type::TypeBase { + public: + using Base = Type::TypeBase; + using TFBase = TypeWithSubtypeImpl; + using Base::Base; + + static Derived get(ArrayRef subtypes, MLIRContext* context) { + return Base::get(context, subtypes); + } + + static Derived getChecked(ArrayRef subtypes, MLIRContext* context, + Location loc) { + return Base::getChecked(loc, subtypes); + } + static Derived getChecked(function_ref emitError, + MLIRContext* context, + ArrayRef subtypes) { + return Base::getChecked(emitError, context, subtypes); + } + + static Derived get(MLIRContext* context) { return get({}, context); } + + static LogicalResult verify(function_ref emitError, + ArrayRef subtypes) { + // Each of the subtypes should be a valid TensorFlow type. + for (TensorType subtype : subtypes) { + if (!IsValidTFTensorType(subtype)) { + return emitError() << "invalid " << Derived::getTypeName() + << " subtype: " << subtype; + } + } + return success(); + } + + ArrayRef getSubtypes() { return Base::getImpl()->subtypes_; } +}; +} // namespace detail + +// TensorFlowTypeWithSubtype class supports all the types with subtypes in +// TensorFlow dialect. +class TensorFlowTypeWithSubtype : public TensorFlowType { + public: + using TensorFlowType::TensorFlowType; + + // Checks if a type is TensorFlow type with subtypes. + static bool classof(Type type); + + // Converts a TypeWithSubtype type to the same type but without its subtypes. + Type RemoveSubtypes(); + + // Clone the current Type with new subtypes. + TensorFlowTypeWithSubtype clone(ArrayRef new_subtypes); + + // Returns the subtypes. + ArrayRef GetSubtypes(); +}; + +// Returns the corresponding TensorFlow type with subtypes but without its +// subtypes. +inline Type GetDefaultTypeOf(TensorFlowTypeWithSubtype type) { + return type.RemoveSubtypes(); +} + +// TensorFlow resource type is used to support TensorFlow resource variables, +// which represent shared, persistent state manipulated by a TensorFlow program. 
+// ResourceType stores shape and datatype for subtypes unlike most other data +// types that don't have any associated information. +class ResourceType : public detail::TypeWithSubtypeImpl { + public: + using TFBase::TFBase; + static constexpr ::mlir::StringLiteral name = "tf_type.resource"; + static std::string getTypeName() { return "ResourceType"; } +}; + +// TensorFlow variant type is used to support arbitrary custom C++ data types. +// VariantType stores inferred shape and datatype for subtypes unlike most other +// data types that don't have any associated information. For example, variants +// encoding TensorList type stores the common shape and dtype of the list +// elements as the only subtype. +class VariantType : public detail::TypeWithSubtypeImpl { + public: + using TFBase::TFBase; + static constexpr ::mlir::StringLiteral name = "tf_type.variant"; + static std::string getTypeName() { return "VariantType"; } +}; + +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +Type GetCastCompatibleType(Type a, Type b, bool may_ignore_ref_type_a = false); + +// Returns whether two arrays of Type are broadcast compatible. +bool BroadcastCompatible(TypeRange lhs, TypeRange rhs); + +// Returns whether the two elemental types are compatible. Shapes are compatible +// if: +// - the types are statically equal +// - could be dynamically equal +// - considering dynamic shapes equal unless contradictory info known; +// - element types are equivalent, modulo subtypes possible be less exact +// (e.g., a resource type without subtype is considered compatible with +// resource type with known subtype). +// Provide option to ignore ref types on 'lhs'. +bool HasCompatibleElementTypes(Type lhs, Type rhs, + bool may_ignore_ref_type_lhs = false); + +// Returns true if all TensorFlow types can be cast to one +// another. In other words, a single run-time value is legal for all the types. +// For example, tensor<*xf32>, tensor and tensor<3xf32> are cast +// compatible. +bool AreCastCompatible(TypeRange types); + +// Returns true if corresponding elements of lhs and rhs AreCastCompatible and +// lhs and rhs are the same length. +bool ArraysAreCastCompatible(TypeRange lhs, TypeRange rhs); + +// If `ty` is a tensor type and its element type has subtypes, then returns a +// new type of same shape but dropped subtypes for the element type. +// Otherwise, if `ty` has subtypes, then returns corresponding type with dropped +// subtypes. +// Otherwise, returns the original type `ty`. +Type DropSubTypes(Type ty); + +// If `ty` is a tensor type and has elements of a ref type, then returns a new +// type of same shape but corresponding non-ref type as element type. +// Otherwise, if `ty` is a ref type, then returns corresponding non-ref type. +// Otherwise, returns the original type `ty`. +Type DropRefType(Type ty); + +// Convenience call for executing both `DropRefType` and `DropSubTypes`. 
+Type DropRefAndSubTypes(Type ty); + +//===----------------------------------------------------------------------===// +// Utility iterators +//===----------------------------------------------------------------------===// + +// An iterator for the tensor shapes of an op's operands of shaped types. +// Returns std::nullopt if a operand is unranked; returns ArrayRef as +// the shape otherwise. +class OperandShapeIterator final + : public llvm::mapped_iterator> (*)( + Value)> { + public: + using reference = std::optional>; + + /// Initializes the operand shape iterator to the specified operand iterator. + explicit OperandShapeIterator(Operation::operand_iterator it); +}; + +using OperandShapeRange = iterator_range; + +// An iterator for the tensor shapes of an op's results of shaped types. +// Returns std::nullopt if a result is unranked; returns ArrayRef as +// the shape otherwise. +class ResultShapeIterator final + : public llvm::mapped_iterator> (*)( + Value)> { + public: + using reference = std::optional>; + + /// Initializes the result shape iterator to the specified result iterator. + explicit ResultShapeIterator(Operation::result_iterator it); +}; + +using ResultShapeRange = iterator_range; + +// Returns a range with just resource type values from the input range +// preserved. +template +auto filter_resources(RangeT&& range) { + return llvm::make_filter_range(std::forward(range), [](Value val) { + return mlir::isa(getElementTypeOrSelf(val.getType())); + }); +} + +// Returns the element type if `type` is a `ShapedType` and the type itself +// otherwise, converting `TensorFlowRef` type to corresponding `TensorFlow` or +// standard type if necessary. +inline Type GetElementTypeOrSelfResolveRef(Type type) { + Type element_type = getElementTypeOrSelf(type); + if (auto ref_type = mlir::dyn_cast(element_type)) { + element_type = ref_type.RemoveRef(); + } + return element_type; +} + +} // namespace tf_type +} // namespace mlir + +//===----------------------------------------------------------------------===// +// Tablegen Attribute Declarations +//===----------------------------------------------------------------------===// + +#define GET_ATTRDEF_CLASSES +#include "tensorflow/core/ir/types/attributes.h.inc" +#include "tensorflow/core/ir/types/attributes_enum.h.inc" + +#endif // TENSORFLOW_CORE_IR_TYPES_DIALECT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/utility.h b/third_party/tflite-hdrs/tensorflow/core/ir/utility.h new file mode 100644 index 00000000..e234751e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/utility.h @@ -0,0 +1,87 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
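Two tiny sketches (not part of the vendored header) of the type utilities above: normalizing a type before comparison and checking cast compatibility. Both helper names are illustrative.

// Sketch only.
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Types.h"
#include "tensorflow/core/ir/types/dialect.h"

// Strip ref element types and any subtypes so that two types describing the
// same underlying tensor compare equal.
mlir::Type NormalizeForComparison(mlir::Type ty) {
  return mlir::tf_type::DropRefAndSubTypes(ty);
}

// Check whether a single runtime value could be legal for both types.
bool CouldAliasAtRuntime(mlir::Type a, mlir::Type b) {
  llvm::SmallVector<mlir::Type, 2> types = {a, b};
  return mlir::tf_type::AreCastCompatible(types);
}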
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_UTILITY_H_ +#define TENSORFLOW_CORE_IR_UTILITY_H_ + +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" + +namespace mlir { +namespace tfg { + +// Region-based loop ops store control tokens all after the data values, unlike +// functions which store them as pairs. This is required by +// RegionBranchOpInterface's API which requires MutableOperandRange, i.e. the +// data operands need to be stored contiguously. + +// TODO(jeffniu): These functions aren't just for "loop regions" any more, but +// any region-based ops (if/case have explicit capture forms). + +// Given a region belonging to a region-based loop operation (e.g. a while +// loop), return the subrange of block arguments that are data values. +Block::BlockArgListType GetLoopRegionDataArgs(Region ®ion); +// Given a region belonging to a region-based loop operation (e.g. a while +// loop), return the subrange of block arguments that are control tokens. +Block::BlockArgListType GetLoopRegionControlTokens(Region ®ion); +// Given a data value block argument of a region belonging to a region-based +// loop operation (e.g. a while loop), return the block argument that +// corresponds to the control token. +BlockArgument GetLoopRegionControlOf(BlockArgument data); +// Given a control token block argument of a region belonging to a region-based +// loop operation (e.g. a while loop), return the block argument that +// corresponds to the data value. +BlockArgument GetLoopRegionDataOf(BlockArgument ctl); + +// Given a TFG value, lookup the associated control token. For op results, the +// token will be the last result of the op. For block arguments, the token will +// be the subsequent argument. A data value always has an associated control +// token. +Value LookupControlDependency(Value data); + +// Given a TFG control token, lookup the associated data value. Block arguments +// will always have an associated data value: the previous argument. For ops, +// if the only result is a control token, return None. Otherwise, returns the +// first result. +std::optional LookupDataValue(Value ctl); + +// Given a range of values, operands, or results, that contains data and control +// values, where all control tokens come after the data values, split the range +// between the two. +template +std::pair SplitDataAndControlValues(RangeT values, + ControlType ctl_type) { + unsigned num_ctl = 0; + for (Value value : llvm::reverse(values)) { + if (value.getType() == ctl_type) + ++num_ctl; + else + break; + } + unsigned split_idx = llvm::size(values) - num_ctl; + return std::make_pair(values.slice(0, split_idx), + values.slice(split_idx, num_ctl)); +} + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_UTILITY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ir/utils/shape_inference_utils.h b/third_party/tflite-hdrs/tensorflow/core/ir/utils/shape_inference_utils.h new file mode 100644 index 00000000..273f4cee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ir/utils/shape_inference_utils.h @@ -0,0 +1,94 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_IR_UTILS_SHAPE_INFERENCE_UTILS_H_ +#define TENSORFLOW_CORE_IR_UTILS_SHAPE_INFERENCE_UTILS_H_ + +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +struct OpRegistrationData; +} // namespace tensorflow + +namespace mlir { +namespace tfg { + +// Function that takes in a value and extracts a constant from it, if available. +// If the value cannot be resolved as a constant, a nullptr will be returned. +// Certain shape functions require constant values as arguments. +using OperandAsConstantFn = llvm::function_ref; + +// Function that takes in an operation result and computes a shape (can be +// partial) value. Certain shape functions require shape values as arguments. +using OpResultAsShapeFn = + llvm::function_ref; + +// Function that takes a result index and returns the element type. Element +// types are necessary for handle types (resource, variant). +using ResultElementTypeFn = llvm::function_ref; + +// Extracts the attributes of a MLIR operation and populates the converted +// attributes in a proto map. This is used by operation +// defined in TF dialect which has different attributes format than TFG dialect. +using GetAttrValuesFn = llvm::function_ref; + +// Runs TensorFlow shape inference associated to the op type registered in the +// TensorFlow op registry based on the Graph version, operands, and attributes. +// Invoking this shape function will create conversions of parameters to the +// TensorFlow Graph equivalent data structures and back to MLIR equivalent data +// structures. This does not use a natively implemented shape inference in MLIR, +// and instead is temporary until shape functions are reimplemented/migrated to +// being in MLIR instead of the TensorFlow op registry. +// Note that the default way to get the attrs in the operation is using the API +// in TFG importer. For operations that has different format of attributes, they +// should give the `get_attr_values_fn` to read the attributes correctly. 
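For intuition about the SplitDataAndControlValues template declared in utility.h above, here is a self-contained analogue over plain integers; SplitDataAndControl and the use of -1 as a stand-in for the control-token type are assumptions of this sketch. It relies on the same invariant: control values always trail the data values, so they can be counted from the back.

#include <cstddef>
#include <utility>
#include <vector>

std::pair<std::vector<int>, std::vector<int>> SplitDataAndControl(
    const std::vector<int>& values) {
  std::size_t num_ctl = 0;
  for (auto it = values.rbegin(); it != values.rend() && *it == -1; ++it)
    ++num_ctl;                                   // trailing "control tokens"
  const std::size_t split = values.size() - num_ctl;
  return {std::vector<int>(values.begin(), values.begin() + split),
          std::vector<int>(values.begin() + split, values.end())};
}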
+LogicalResult InferReturnTypeComponentsForTFOp( + std::optional location, Operation* op, ValueRange operands, + int64_t graph_version, OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + GetAttrValuesFn get_attr_values_fn, + SmallVectorImpl& inferred_return_shapes); + +// This one is almost the same as the above one, the difference is that we use +// ConvertOperationToNode to convert the operation to NodeDef to get the attr +// values. +LogicalResult InferReturnTypeComponentsForTFOp( + std::optional location, Operation* op, ValueRange operands, + int64_t graph_version, OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_IR_UTILS_SHAPE_INFERENCE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops.h new file mode 100644 index 00000000..7f56e994 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops.h @@ -0,0 +1,226 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor definitions for Aggregate ops, must be compilable by nvcc. 
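The fixed-arity functors that follow (Add2 through Add9, plus the accumulating Add8p) are typically combined to implement an AddN-style reduction: the first group of inputs assigns the output and later groups accumulate into it. The sketch below restates that combination pattern with plain vectors; AddN here is an illustrative helper, not the TensorFlow kernel.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> AddN(const std::vector<std::vector<float>>& inputs) {
  if (inputs.empty()) return {};
  std::vector<float> out(inputs[0].size(), 0.0f);
  bool first_pass = true;
  for (std::size_t start = 0; start < inputs.size(); start += 8) {
    const std::size_t end = std::min(inputs.size(), start + 8);
    for (std::size_t j = 0; j < out.size(); ++j) {
      float partial = 0.0f;
      for (std::size_t i = start; i < end; ++i) partial += inputs[i][j];
      // First group assigns (Add2..Add8 style); later groups accumulate into
      // the existing output (Add8p style).
      out[j] = first_pass ? partial : out[j] + partial;
    }
    first_pass = false;
  }
  return out;
}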
+template +struct Add2Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2); +}; + +template +struct Add2EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2) { + out.device(d) = in1 + in2; + } +}; + +template +struct Add3Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3); +}; + +template +struct Add3EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3) { + out.device(d) = in1 + in2 + in3; + } +}; + +template +struct Add4Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4); +}; + +template +struct Add4EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4) { + out.device(d) = in1 + in2 + in3 + in4; + } +}; + +template +struct Add5Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5); +}; + +template +struct Add5EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5) { + out.device(d) = in1 + in2 + in3 + in4 + in5; + } +}; + +template +struct Add6Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6); +}; + +template +struct Add6EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6; + } +}; + +template +struct Add7Functor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7); +}; + +template +struct Add7EigenImpl { + static void Compute(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7; + } +}; + +template +struct Add8Functor { + void operator()( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename 
TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8); +}; + +template +struct Add8EigenImpl { + static void Compute( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8; + } +}; + +// Add8p is like Add8 except the underlying implementation should += +// rather than assign to the output. +template +struct Add8pFunctor { + void operator()( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8); +}; + +template +struct Add8pEigenImpl { + static void Compute( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { + out.device(d) += in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8; + } +}; + +template +struct Add9Functor { + void operator()( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, + typename TTypes::ConstFlat in9); +}; + +template +struct Add9EigenImpl { + static void Compute( + const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, + typename TTypes::ConstFlat in9) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9; + } +}; +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops_cpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops_cpu.h new file mode 100644 index 00000000..f205d8d1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/aggregate_ops_cpu.h @@ -0,0 +1,142 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_ +#define TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +#include "tensorflow/core/kernels/aggregate_ops.h" + +typedef Eigen::ThreadPoolDevice CPUDevice; + + +namespace tensorflow { + +// Partial specializations for a CPUDevice, that uses the Eigen implementation +// from AddNEigenImpl. +namespace functor { +template +struct Add2Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2) { + Add2EigenImpl::Compute(d, out, in1, in2); + } +}; +template +struct Add3Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3) { + Add3EigenImpl::Compute(d, out, in1, in2, in3); + } +}; +template +struct Add4Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4) { + Add4EigenImpl::Compute(d, out, in1, in2, in3, in4); + } +}; +template +struct Add5Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5) { + Add5EigenImpl::Compute(d, out, in1, in2, in3, in4, in5); + } +}; +template +struct Add6Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6) { + Add6EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6); + } +}; +template +struct Add7Functor { + void operator()(const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, + typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, + typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, + typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7) { + Add7EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7); + } +}; + +template +struct Add8Functor { + void operator()( + const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { + Add8EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template +struct Add8pFunctor { + void operator()( + const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { + Add8pEigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template +struct Add9Functor { + void operator()( + const CPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstFlat in1, typename TTypes::ConstFlat in2, + typename 
TTypes::ConstFlat in3, typename TTypes::ConstFlat in4, + typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, + typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, + typename TTypes::ConstFlat in9) { + Add9EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8, in9); + } +}; + + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/argmax_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/argmax_op.h new file mode 100644 index 00000000..9b2325c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/argmax_op.h @@ -0,0 +1,72 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_ +#define TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_ +// Generator definition for ArgMaxOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct ArgMax { +#define DECLARE_COMPUTE_SPEC(Dims) \ + EIGEN_ALWAYS_INLINE static void Reduce##Dims( \ + const Device& d, typename TTypes::ConstTensor input, \ + const int32 dimension, typename TTypes::Tensor output) { \ + output.device(d) = input.argmax(dimension).template cast(); \ + } + + DECLARE_COMPUTE_SPEC(1); + DECLARE_COMPUTE_SPEC(2); + DECLARE_COMPUTE_SPEC(3); + DECLARE_COMPUTE_SPEC(4); + DECLARE_COMPUTE_SPEC(5); + DECLARE_COMPUTE_SPEC(6); + DECLARE_COMPUTE_SPEC(7); + +#undef DECLARE_COMPUTE_SPEC +}; + +template +struct ArgMin { +#define DECLARE_COMPUTE_SPEC(Dims) \ + EIGEN_ALWAYS_INLINE static void Reduce##Dims( \ + const Device& d, typename TTypes::ConstTensor input, \ + const int32 dimension, typename TTypes::Tensor output) { \ + output.device(d) = input.argmin(dimension).template cast(); \ + } + + DECLARE_COMPUTE_SPEC(1); + DECLARE_COMPUTE_SPEC(2); + DECLARE_COMPUTE_SPEC(3); + DECLARE_COMPUTE_SPEC(4); + DECLARE_COMPUTE_SPEC(5); + DECLARE_COMPUTE_SPEC(6); + DECLARE_COMPUTE_SPEC(7); + +#undef DECLARE_COMPUTE_SPEC +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/assign_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/assign_op.h new file mode 100644 index 00000000..063be3e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/assign_op.h @@ -0,0 +1,71 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_ +#define TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_ + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/ref_var.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +// TODO(jeff): Get rid of use_exclusive_lock_ option + +// Computes *input[0] = input[1] +class AssignOp : public OpKernel { + public: + explicit AssignOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(context, + context->GetAttr("validate_shape", &validate_shape_)); + OP_REQUIRES(context, IsRefType(context->input_type(0)), + errors::InvalidArgument("lhs input needs to be a ref type")); + if (!context + ->GetAttr("_grappler_relax_allocator_constraints", + &relax_constraints_) + .ok()) { + relax_constraints_ = false; + } + } + + void Compute(OpKernelContext* context) override { + constexpr int input_ref_index = 0; + constexpr int output_ref_index = 0; + constexpr int value_index = 1; + + auto copy = [this](OpKernelContext* cc_ctx, Tensor* lhs, + const Tensor& rhs) { Copy(cc_ctx, lhs, rhs); }; + + AssignRefVariable(context, input_ref_index, output_ref_index, value_index, + use_exclusive_lock_, validate_shape_, relax_constraints_, + copy); + } + + virtual void Copy(OpKernelContext* context, Tensor* lhs, + const Tensor& rhs) = 0; + + bool use_exclusive_lock_; + bool validate_shape_; + bool relax_constraints_; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/autotune_conv_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/autotune_conv_impl.h new file mode 100644 index 00000000..63c6a64d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/autotune_conv_impl.h @@ -0,0 +1,97 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+------------------------------------------------------------------------------*/ + +#ifndef TENSORFLOW_CORE_KERNELS_AUTOTUNE_CONV_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_AUTOTUNE_CONV_IMPL_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_THREADS + +#include "xla/stream_executor/gpu/redzone_allocator.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/util/proto/proto_utils.h" + +namespace tensorflow::internal { + +template +StatusOr> AutotuneConvImpl( + OpKernelContext* ctx, + std::vector>>& runners, + bool actually_do_autotune, const LaunchFunc& launch_func, + size_t scratch_size_limit, const se::RedzoneAllocator& rz_allocator) { + auto* stream = ctx->op_device_context()->stream(); + + se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}), + stream); + + std::vector results; + // TODO(reedwm): Warn if determinism is enabled after autotune is run + for (auto& runner : runners) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. + se::RedzoneAllocator rz_scratch_allocator( + stream, &tf_allocator_adapter, + /*memory_limit=*/scratch_size_limit); + DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx); + se::ScratchAllocator* allocator_used = + !RedzoneCheckDisabled() + ? static_cast(&rz_scratch_allocator) + : static_cast(&scratch_allocator); + + TF_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc()); + se::dnn::ProfileResult profile_result; + Status cudnn_launch_status = + actually_do_autotune + ? launch_func(allocator_used, runner, &profile_result) + : OkStatus(); + if (!actually_do_autotune) { + // Make the result valid according to `is_valid`. + profile_result.set_algorithm(desc); + profile_result.set_elapsed_time_in_ms(0); + } + + // We need to make sure the profiling results are one-to-one with the + // "runners". So, we insert dummy results when the execution fails. + results.emplace_back(); + auto& result = results.back(); + *result.mutable_algorithm() = desc.ToProto(); + if (cudnn_launch_status.ok() && profile_result.is_valid()) { + result.set_scratch_bytes( + !RedzoneCheckDisabled() + ? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones() + : scratch_allocator.TotalByteSize()); + *result.mutable_run_time() = proto_utils::ToDurationProto( + absl::Milliseconds(profile_result.elapsed_time_in_ms())); + + CheckRedzones(rz_scratch_allocator, &result); + CheckRedzones(rz_allocator, &result); + } else { + result.mutable_failure()->set_kind(xla::AutotuneResult::UNKNOWN); + result.mutable_failure()->set_msg( + absl::StrCat("Profiling failure on CUDNN engine ", desc.ToString(), + ": ", cudnn_launch_status.ToString())); + } + } + + return results; +} + +} // namespace tensorflow::internal + +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CORE_KERNELS_AUTOTUNE_CONV_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/avgpooling_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/avgpooling_op.h new file mode 100644 index 00000000..8008c3c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/avgpooling_op.h @@ -0,0 +1,76 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_ +#define TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_ +// Functor definition for AvgPoolingOp, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_pooling.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct SpatialAvgPooling { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, int window_rows, + int window_cols, int row_stride, int col_stride, + const Eigen::PaddingType& padding) { + MaybeWith32BitIndexing( + [&](auto output32, auto input32) { + // Because we swap the layout, we swap the row/cols as well. + output32.swap_layout().device(d) = Eigen::SpatialAvgPooling( + input32.swap_layout(), window_cols, window_rows, col_stride, + row_stride, padding); + }, + output, input); + } +}; + +} // namespace functor + +typedef Eigen::GpuDevice GPUDevice; + +// Launch a custom GPU kernels from Yanqing for the avgpooling backward +// operation that works NHWC data formats. Arguments: +// top_diff: backprop to the output of the pooling layer +// num: number of input batches +// height: input height +// width: input width +// channels: number of input channels +// pooled_height: the height of the output to the pooling layer +// pooled_width: the width of the output to the pooling layer +// kernel_h: the height of the pooling kernel +// kernel_w: the width of the pooling kernel +// stride_h: the height of the vertical stride +// stride_w: the width of the horizontal stride +// pad_t: padding size to the top side +// pad_l: padding size to the left side +// bottom_diff: backprop to the input of the pooling layer. +template +bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, + const int pad_l, T* const bottom_diff, + const GPUDevice& d); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernel_test_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernel_test_util.h new file mode 100644 index 00000000..2495580a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernel_test_util.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ + +#include +#include "tensorflow/core/kernels/batch_kernels.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace test_util { + +// A test util for accessing private members of `BatchFunctionKernel`. +class BatchFunctionKernelTestAccess { + public: + explicit BatchFunctionKernelTestAccess(const BatchFunctionKernel* kernel); + + bool enable_adaptive_batch_threads() const; + + private: + const BatchFunctionKernel* const kernel_; +}; + +class BatchFunctionKernelTestBase : public OpsTestBase, + public ::testing::WithParamInterface { + public: + // Init test fixture with a batch kernel instance. + absl::Status Init(bool enable_adaptive_scheduler); +}; + +} // namespace test_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernels.h new file mode 100644 index 00000000..73baea3a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_kernels.h @@ -0,0 +1,139 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCH_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_BATCH_KERNELS_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/types.h" + +namespace tensorflow { + +// Per-model inflight batches parameters. +ABSL_CONST_INIT extern const int64_t kMinInflightBatches; +ABSL_CONST_INIT extern const int64_t kInitialInflightBatches; +ABSL_CONST_INIT extern const int64_t kBatchesToAverageOver; +ABSL_CONST_INIT extern const int64_t kMaxInflightBatches; + +namespace test_util { +class BatchFunctionKernelTestAccess; +} // namespace test_util + +// Records the usage of attribute `enable_large_batch_splitting`. +void RecordBatchSplitUsage( + std::optional maybe_enable_large_batch_splitting, + absl::string_view model_name); + +// Records the number of batch threads of a model. +void RecordBatchParamNumBatchThreads(int64_t num_batch_threads, + absl::string_view model_name); + +// Returns the model name from the context. +absl::string_view GetModelName(OpKernelContext* ctx); + +// `BatchFunctionKernel` is the implementation of op `BatchFunction`. 
+// +// `BatchFunctionKernel` will batch (tensor) inputs by concatenating them +// along the 0-th dimension, schedule a user-defined computation, and then +// splits the returned tensors as batch output. +// +// In particular, an instance of `BatchFunctionKernel` creates or re-uses a +// a batch scheduler instance based on op attributes, pre-processes and enqueues +// concatenated inputs to the scheduler which invokes user-defined function, +// and then splits function output as op output. +// +// User defined function is named by attribute `f` and defined in the graph. +class BatchFunctionKernel : public AsyncOpKernel { + public: + explicit BatchFunctionKernel(OpKernelConstruction* c); + + bool IsExpensive() override; + + void ComputeAsync(OpKernelContext* c, DoneCallback done) final; + + private: + friend class test_util::BatchFunctionKernelTestAccess; + + // Validates 'allowed_batch_sizes_'. The entries must increase monotonically. + // If large batch split is not enabled, the last one must equal + // `max_batch_size_`. otherwise the last element must be smaller than or equal + // to `max_batch_size_`. + absl::Status ValidateAllowedBatchSizes() const; + + // Creates the function handle if it isn't initialized yet; and re-use it + // afterwards. + absl::Status GetOrCreateFunctionHandle( + OpKernelContext* c, FunctionLibraryRuntime::Handle* handle); + + // Instantiate the user-defined function and emits `handle`. + absl::Status InstantiateFunction( + OpKernelContext* c, FunctionLibraryRuntime::Handle* handle) const; + + // Initialize vars by reading from op-kernel-construction. + // Vars + // - enable_adaptive_batch_threads_ + // true if value of attribute `kEnableAdaptiveSchedulerAttr` is true, or + // if `num_batch_threads` is not positive. + // - adaptive_batch_scheduler_options_ + // Read from corresponding attributes as long as they are set. + void SetAdaptiveBatchSchedulerOptions(OpKernelConstruction* c, + int32_t num_batch_threads); + string container_; + string shared_name_; + string batcher_queue_; + int32 num_batch_threads_; + int32 max_batch_size_; + int32 batch_timeout_micros_; + int32 max_enqueued_batches_; + std::vector allowed_batch_sizes_; + int32 low_priority_max_batch_size_; + int32 low_priority_batch_timeout_micros_; + int32 low_priority_max_enqueued_batches_; + std::vector low_priority_allowed_batch_sizes_; + std::string mixed_priority_policy_; + std::string batch_padding_policy_; + NameAttrList func_; + absl::optional fhandle_ TF_GUARDED_BY(mu_); + bool enable_large_batch_splitting_ = false; + bool has_attribute_enable_large_batch_splitting_ = false; + bool enable_adaptive_batch_threads_ = false; + + mutex mu_; + + // Parameters for adaptive batch scheduler only. + // Note 'num_batch_threads_' above is shared by two implementations of batch + // scheduler. 
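The rule documented for ValidateAllowedBatchSizes() above fits in a few lines; the helper below is a hypothetical restatement of that check for illustration, not the kernel's implementation.

#include <cstddef>
#include <vector>

bool AllowedBatchSizesValid(const std::vector<int>& sizes, int max_batch_size,
                            bool enable_large_batch_splitting) {
  for (std::size_t i = 1; i < sizes.size(); ++i) {
    if (sizes[i] <= sizes[i - 1]) return false;  // must increase monotonically
  }
  if (sizes.empty()) return true;
  // With large-batch splitting the last entry only has to bound the batch
  // size; without it, it must equal max_batch_size exactly.
  return enable_large_batch_splitting ? sizes.back() <= max_batch_size
                                      : sizes.back() == max_batch_size;
}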
+ struct AdaptiveBatchSchedulerOptions { + int32 min_in_flight_batches_limit = kMinInflightBatches; + int32 initial_in_flight_batches_limit = kInitialInflightBatches; + int32 max_in_flight_batches_limit = kMaxInflightBatches; + int32 batches_to_average_over = kBatchesToAverageOver; + int64 full_batch_scheduling_boost_micros = -1; + }; + absl::optional + adaptive_batch_scheduler_options_ = absl::nullopt; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCH_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batch_norm_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_norm_op.h new file mode 100644 index 00000000..7341833e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_norm_op.h @@ -0,0 +1,143 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_ +// Functor definition for BatchNormOp, must be compilable by nvcc. +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by BatchNormOp to do the computations. 
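Per element, the BatchNorm functor below evaluates the usual inference-time normalization, with gamma applied only when scale_after_normalization is true. A scalar restatement for reference (BatchNormScalar is an illustrative name, not part of the kernel):

#include <cmath>

float BatchNormScalar(float x, float mean, float var, float beta, float gamma,
                      float variance_epsilon, bool scale_after_normalization) {
  // y = (x - mean) * rsqrt(var + epsilon) [* gamma] + beta
  float inv_stddev = 1.0f / std::sqrt(var + variance_epsilon);
  if (scale_after_normalization) inv_stddev *= gamma;
  return (x - mean) * inv_stddev + beta;
}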
+template +struct BatchNorm { + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstVec mean, + typename TTypes::ConstVec var, + typename TTypes::ConstVec beta, + typename TTypes::ConstVec gamma, T variance_epsilon, + bool scale_after_normalization, + typename TTypes::Tensor output) { + const int depth = mean.dimension(0); + const int rest_size = input.size() / depth; + + Eigen::DSizes rest_by_depth(rest_size, depth); + Eigen::IndexList > rest_by_one; + rest_by_one.set(0, rest_size); + Eigen::IndexList, int> one_by_depth; + one_by_depth.set(1, depth); + Eigen::IndexList > depth_by_one; + depth_by_one.set(0, depth); + if (scale_after_normalization) { + output.reshape(rest_by_depth).device(d) = + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one)) * + ((var + var.constant(variance_epsilon)).rsqrt() * gamma) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one) + + beta.reshape(one_by_depth).broadcast(rest_by_one); + } else { + output.reshape(rest_by_depth).device(d) = + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one)) * + ((var + var.constant(variance_epsilon)).rsqrt()) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one) + + beta.reshape(one_by_depth).broadcast(rest_by_one); + } + } +}; + +template +struct BatchNormGrad { + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstVec mean, + typename TTypes::ConstVec var, + typename TTypes::ConstVec gamma, + typename TTypes::ConstTensor out_backprop, + T variance_epsilon, bool scale_after_normalization, + typename TTypes::Tensor dx, typename TTypes::Vec dm, + typename TTypes::Vec dv, typename TTypes::Vec db, + typename TTypes::Vec dg, typename TTypes::Vec scratch1, + typename TTypes::Vec scratch2) { + const int depth = mean.dimension(0); + const int rest_size = input.size() / depth; + + typedef typename TTypes::ConstVec::Index Index; + + Eigen::DSizes rest_by_depth(rest_size, depth); + Eigen::IndexList > rest_by_one; + rest_by_one.set(0, rest_size); + Eigen::IndexList, Index> one_by_depth; + one_by_depth.set(1, depth); + Eigen::IndexList > reduction_axis; + + // db = out_backprop + // + // dg = out_backprop * ((x - m) * rsqrt(v + epsilon)) + // + // dv = sum_over_rest(out_backprop * gamma * (x - m)) * + // (-1/2) * (v + epsilon) ^ (-3/2) + // + // dm = sum_over_rest(out_backprop * gamma) * (-1 / rsqrt(v + epsilon)) + // + // dx = out_backprop * (gamma * rsqrt(v + epsilon)) + db.device(d) = out_backprop.reshape(rest_by_depth).sum(reduction_axis); + + // scratch1 = rsqrt(v + epsilon) + scratch1.device(d) = (var + var.constant(variance_epsilon)).rsqrt(); + + // scratch2 = sum_over_rest(out_backprop * (x - m)) + scratch2.device(d) = (out_backprop.reshape(rest_by_depth) * + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one))) + .sum(reduction_axis); + + if (scale_after_normalization) { + dx.reshape(rest_by_depth).device(d) = + out_backprop.reshape(rest_by_depth) * ((scratch1 * gamma) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one)); + dm.device(d) = -db * (scratch1 * gamma).eval(); + dg.device(d) = scratch2 * scratch1; + } else { + dx.reshape(rest_by_depth).device(d) = + out_backprop.reshape(rest_by_depth) * + scratch1.reshape(one_by_depth).broadcast(rest_by_one); + dm.device(d) = -db * scratch1; + dg.device(d) = dg.constant(static_cast(0.0)); // Gamma is not learned. 
+ } + + // scratch1 = - 1/2 * (var + epsilon) ^ (-3/2) + scratch1.device(d) = scratch1 * scratch1.constant(static_cast(-0.5f)) / + (var + var.constant(variance_epsilon)); + + if (scale_after_normalization) { + dv.device(d) = scratch2 * (scratch1 * gamma).eval(); + } else { + dv.device(d) = scratch2 * scratch1; + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batch_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_util.h new file mode 100644 index 00000000..dad2ec4e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batch_util.h @@ -0,0 +1,23 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// NOTE(lespeholt): This file is deprecated. Use +// "tensorflow/core/util/batch_util.h" instead. + +#ifndef TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_ + +#include "tensorflow/core/util/batch_util.h" + +#endif // TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h new file mode 100644 index 00000000..8be441b2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h @@ -0,0 +1,871 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/periodic_function.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" + +namespace tensorflow { +namespace serving { +namespace internal { +template +class ASBSBatch; + +template +class ASBSQueue; +} // namespace internal + +// Shared batch scheduler designed to minimize latency. The scheduler keeps +// track of a number of queues (one per model or model version) which are +// continuously enqueuing requests. The scheduler groups the requests into +// batches which it periodically sends off for processing (see +// shared_batch_scheduler.h for more details). AdaptiveSharedBatchScheduler +// (ASBS) prioritizes batches primarily by age (i.e. the batch's oldest request) +// along with a configurable preference for scheduling larger batches first. +// +// +// ASBS tries to keep the system busy by maintaining an adjustable number of +// concurrently processed batches. If a new batch is created, and the number of +// in flight batches is below the target, the next (i.e. oldest) batch is +// immediately scheduled. Similarly, when a batch finishes processing, the +// target is rechecked, and another batch may be scheduled. To avoid the need +// to carefully tune the target for workload, model type, platform, etc, it is +// dynamically adjusted in order to provide the lowest average latency. +// +// Some potential use cases: +// Hardware Accelerators (GPUs & TPUs) - If some phase of batch processing +// involves serial processing by a device, from a latency perspective it is +// desirable to keep the device evenly loaded, avoiding the need to wait for +// the device to process prior batches. +// CPU utilization - If the batch processing is cpu dominated, you can reap +// latency gains when underutilized by increasing the processing rate, but +// back the rate off when the load increases to avoid overload. + +template +class AdaptiveSharedBatchScheduler + : public std::enable_shared_from_this< + AdaptiveSharedBatchScheduler> { + public: + ~AdaptiveSharedBatchScheduler() { + // Finish processing batches before destroying other class members. + if (owned_batch_thread_pool_) { + delete batch_thread_pool_; + } + } + + struct Options { + // The name to use for the pool of batch threads. + string thread_pool_name = {"batch_threads"}; + // Number of batch processing threads - the maximum value of + // in_flight_batches_limit_. It is recommended that this value be set by + // running the system under load, observing the learned value for + // in_flight_batches_limit_, and setting this maximum to ~ 2x the value. 
+ // Under low load, in_flight_batches_limit_ has no substantial effect on + // latency and therefore undergoes a random walk. Unreasonably large values + // for num_batch_threads allows for large in_flight_batches_limit_, which + // will harm latency for some time once load increases again. + int64_t num_batch_threads = port::MaxParallelism(); + // You can pass a ThreadPool directly rather than the above two + // parameters. If given, the above two parameers are ignored. Ownership of + // the threadpool is not transferred. + thread::ThreadPool* thread_pool = nullptr; + + // Lower bound for in_flight_batches_limit_. As discussed above, can be used + // to minimize the damage caused by the random walk under low load. + int64_t min_in_flight_batches_limit = 1; + // Although batch selection is primarily based on age, this parameter + // specifies a preference for larger batches. A full batch will be + // scheduled before an older, nearly empty batch as long as the age gap is + // less than full_batch_scheduling_boost_micros. The optimal value for this + // parameter should be of order the batch processing latency, but must be + // chosen carefully, as too large a value will harm tail latency. + int64_t full_batch_scheduling_boost_micros = 0; + // The environment to use (typically only overridden by test code). + Env* env = Env::Default(); + // Initial limit for number of batches being concurrently processed. + // Non-integer values correspond to probabilistic limits - i.e. a value of + // 3.2 results in an actual cap of 3 80% of the time, and 4 20% of the time. + double initial_in_flight_batches_limit = 3; + // Number of batches between adjustments of in_flight_batches_limit. Larger + // numbers will give less noisy latency measurements, but will be less + // responsive to changes in workload. + int64_t batches_to_average_over = 1000; + + // If true, schedule batches using FIFO policy. + // Requires that `full_batch_scheduling_boost_micros` is zero. + // NOTE: + // A new parameter is introduced (not re-using + // full_batch_scheduling_boost_micros==zero) for backward compatibility of + // API. + bool fifo_scheduling = false; + }; + + // Ownership is shared between the caller of Create() and any queues created + // via AddQueue(). + static absl::Status Create( + const Options& options, + std::shared_ptr>* scheduler); + + struct QueueOptions { + // Maximum size of a batch that's formed within + // `ASBSQueue::Schedule`. + int max_batch_size = 1000; + // Maximum size of input task, which is submitted to the queue by + // calling `ASBSQueue::Schedule` and used to form batches. + // + // If specified, it should be larger than or equal to 'max_batch_size'. + absl::optional max_input_task_size = absl::nullopt; + // Maximum number of tasks to add to a specific batch. + absl::optional max_tasks_per_batch = absl::nullopt; + // Maximum number of enqueued (i.e. non-scheduled) batches. + int max_enqueued_batches = 10; + // Amount of time non-full batches must wait before becoming schedulable. + // A non-zero value can improve performance by limiting the scheduling of + // nearly empty batches. + int64_t batch_timeout_micros = 0; + // If non nullptr, split_input_task_func should split input_task into + // multiple tasks, the first of which has size first_size and the remaining + // not exceeding max_size. This function may acquire ownership of input_task + // and should return a status indicating if the split was successful. Upon + // success, the caller can assume that all output_tasks will be scheduled. 
+ // Including this option allows the scheduler to pack batches better and + // should usually improve overall throughput. + std::function* input_task, int first_size, + int max_batch_size, + std::vector>* output_tasks)> + split_input_task_func; + + // If true, the padding will not be appended. + bool disable_padding = false; + }; + + using BatchProcessor = std::function>)>; + + // Adds queue (and its callback) to be managed by this scheduler. + absl::Status AddQueue(const QueueOptions& options, + BatchProcessor process_batch_callback, + std::unique_ptr>* queue); + + double in_flight_batches_limit() { + mutex_lock l(mu_); + return in_flight_batches_limit_; + } + + private: + // access to AddBatch, MaybeScheduleClosedBatches, RemoveQueue, GetEnv. + friend class internal::ASBSQueue; + + explicit AdaptiveSharedBatchScheduler(const Options& options); + + // Tracks processing latency and adjusts in_flight_batches_limit to minimize. + void CallbackWrapper(const internal::ASBSBatch* batch, + BatchProcessor callback, bool is_express); + + // Schedules batch if in_flight_batches_limit_ is not met. + void MaybeScheduleNextBatch() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Schedules batch using FIFO policy if in_flight_batches_limit_ is not met. + void MaybeScheduleNextBatchFIFO() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Schedules all closed batches in batches_ for which an idle thread is + // available in batch_thread_pool_. + // Batches scheduled this way are called express batches. + // Express batches are not limited by in_flight_batches_limit_, and + // their latencies will not affect in_flight_batches_limit_. + void MaybeScheduleClosedBatches(); + + void MaybeScheduleClosedBatchesLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void MaybeScheduleClosedBatchesLockedFIFO() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void MaybeAdjustInflightLimit() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Notifies scheduler of non-empty batch which is eligible for processing. + void AddBatch(const internal::ASBSBatch* batch); + + // Removes queue from scheduler. + void RemoveQueue(const internal::ASBSQueue* queue); + + Env* GetEnv() const { return options_.env; } + + const Options options_; + + // Collection of batches added by AddBatch, ordered by age. Owned by scheduler + // until they are released for processing. + std::vector*> batches_ TF_GUARDED_BY(mu_); + + // Collection of batches added by AddBatch, ordered by age. Owned by + // scheduler until they are released for processing. + std::deque*> fifo_batches_ + TF_GUARDED_BY(mu_); + + // Unowned queues and callbacks added by AddQueue. + std::unordered_map*, BatchProcessor> + queues_and_callbacks_ TF_GUARDED_BY(mu_); + + mutex mu_; + + // Responsible for running the batch processing callbacks. + thread::ThreadPool* batch_thread_pool_; + + bool owned_batch_thread_pool_ = false; + + // Limit on number of batches which can be concurrently processed. + // Non-integer values correspond to probabilistic limits - i.e. a value of 3.2 + // results in an actual cap of 3 80% of the time, and 4 20% of the time. + double in_flight_batches_limit_ TF_GUARDED_BY(mu_); + + // Number of regular batches currently being processed. + int64_t in_flight_batches_ TF_GUARDED_BY(mu_) = 0; + // Number of express batches currently being processed. + int64_t in_flight_express_batches_ TF_GUARDED_BY(mu_) = 0; + + // RNG engine and distribution. 
+ std::default_random_engine rand_engine_; + std::uniform_real_distribution rand_double_; + + // Fields controlling the dynamic adjustment of in_flight_batches_limit_. + // Number of batches since the last in_flight_batches_limit_ adjustment. + int64_t batch_count_ TF_GUARDED_BY(mu_) = 0; + + struct DelayStats { + // Sum of processing latency for batches counted by batch_count_. + int64_t batch_latency_sum = 0; + // Average batch latency for previous value of in_flight_batches_limit_. + double last_avg_latency_ms = 0; + // Did last_avg_latency_ms decrease from the previous last_avg_latency_ms? + bool last_latency_decreased = false; + // Current direction (+-) to adjust in_flight_batches_limit_ + int step_direction = 1; + }; + + // Delay stats between the creation of a batch and the completion of a + // batch. + DelayStats batch_delay_stats_ TF_GUARDED_BY(mu_); + + // Max adjustment size (as a fraction of in_flight_batches_limit_). + constexpr static double kMaxStepSizeMultiplier = 0.125; // 1/8; + // Min adjustment size (as a fraction of in_flight_batches_limit_). + constexpr static double kMinStepSizeMultiplier = 0.0078125; // 1/128 + // Current adjustment size (as a fraction of in_flight_batches_limit_). + double step_size_multiplier_ TF_GUARDED_BY(mu_) = kMaxStepSizeMultiplier; + + AdaptiveSharedBatchScheduler(const AdaptiveSharedBatchScheduler&) = delete; + void operator=(const AdaptiveSharedBatchScheduler&) = delete; +}; + +////////////////////////////////////////////////////////// +// Implementation details follow. API users need not read. + +namespace internal { +// Consolidates tasks into batches, passing them off to the +// AdaptiveSharedBatchScheduler for processing. +template +class ASBSQueue : public BatchScheduler { + public: + using QueueOptions = + typename AdaptiveSharedBatchScheduler::QueueOptions; + + ASBSQueue(std::shared_ptr> scheduler, + const QueueOptions& options); + + ~ASBSQueue() override; + + // Adds task to current batch. Fails if the task size is larger than the batch + // size or if the current batch is full and this queue's number of outstanding + // batches is at its maximum. + absl::Status Schedule(std::unique_ptr* task) override; + + // Number of tasks waiting to be scheduled. + size_t NumEnqueuedTasks() const override; + + // Number of size 1 tasks which could currently be scheduled without failing. + size_t SchedulingCapacity() const override; + + // Notifies queue that a batch is about to be scheduled; the queue should not + // place any more tasks in this batch. + void ReleaseBatch(const ASBSBatch* batch); + + size_t max_task_size() const override { return options_.max_batch_size; } + + private: + // Number of size 1 tasks which could currently be scheduled without failing. + size_t SchedulingCapacityLocked() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns uint64 one greater than was returned by the previous call. + // Context id is reused after std::numeric_limits::max is exhausted. + static uint64 NewTraceMeContextIdForBatch(); + + std::shared_ptr> scheduler_; + const QueueOptions options_; + // Owned by scheduler_. + ASBSBatch* current_batch_ TF_GUARDED_BY(mu_) = nullptr; + int64_t num_enqueued_batches_ TF_GUARDED_BY(mu_) = 0; + int64_t num_enqueued_tasks_ TF_GUARDED_BY(mu_) = 0; + mutable mutex mu_; + ASBSQueue(const ASBSQueue&) = delete; + void operator=(const ASBSQueue&) = delete; +}; + +// Batch which remembers when and by whom it was created. 
+template +class ASBSBatch : public Batch { + public: + ASBSBatch(ASBSQueue* queue, int64_t creation_time_micros, + int64_t batch_timeout_micros, uint64 traceme_context_id) + : queue_(queue), + creation_time_micros_(creation_time_micros), + schedulable_time_micros_(creation_time_micros + batch_timeout_micros), + traceme_context_id_(traceme_context_id) {} + + ~ASBSBatch() override {} + + ASBSQueue* queue() const { return queue_; } + + int64_t creation_time_micros() const { return creation_time_micros_; } + + int64_t schedulable_time_micros() const { return schedulable_time_micros_; } + + uint64 traceme_context_id() const { return traceme_context_id_; } + + private: + ASBSQueue* queue_; + const int64_t creation_time_micros_; + const int64_t schedulable_time_micros_; + const uint64 traceme_context_id_; + ASBSBatch(const ASBSBatch&) = delete; + void operator=(const ASBSBatch&) = delete; +}; +} // namespace internal + +// ---------------- AdaptiveSharedBatchScheduler ---------------- + +template +constexpr double AdaptiveSharedBatchScheduler::kMaxStepSizeMultiplier; + +template +constexpr double AdaptiveSharedBatchScheduler::kMinStepSizeMultiplier; + +template +absl::Status AdaptiveSharedBatchScheduler::Create( + const Options& options, + std::shared_ptr>* scheduler) { + if (options.num_batch_threads < 1) { + return errors::InvalidArgument("num_batch_threads must be positive; was ", + options.num_batch_threads); + } + if (options.min_in_flight_batches_limit < 1) { + return errors::InvalidArgument( + "min_in_flight_batches_limit must be >= 1; was ", + options.min_in_flight_batches_limit); + } + if (options.min_in_flight_batches_limit > options.num_batch_threads) { + return errors::InvalidArgument( + "min_in_flight_batches_limit (", options.min_in_flight_batches_limit, + ") must be <= num_batch_threads (", options.num_batch_threads, ")"); + } + if (options.full_batch_scheduling_boost_micros < 0) { + return errors::InvalidArgument( + "full_batch_scheduling_boost_micros can't be negative; was ", + options.full_batch_scheduling_boost_micros); + } + if (options.initial_in_flight_batches_limit > options.num_batch_threads) { + return errors::InvalidArgument( + "initial_in_flight_batches_limit (", + options.initial_in_flight_batches_limit, + ") should not be larger than num_batch_threads (", + options.num_batch_threads, ")"); + } + if (options.initial_in_flight_batches_limit < + options.min_in_flight_batches_limit) { + return errors::InvalidArgument("initial_in_flight_batches_limit (", + options.initial_in_flight_batches_limit, + "must be >= min_in_flight_batches_limit (", + options.min_in_flight_batches_limit, ")"); + } + if (options.batches_to_average_over < 1) { + return errors::InvalidArgument( + "batches_to_average_over should be " + "greater than or equal to 1; was ", + options.batches_to_average_over); + } + scheduler->reset(new AdaptiveSharedBatchScheduler(options)); + return absl::OkStatus(); +} + +template +AdaptiveSharedBatchScheduler::AdaptiveSharedBatchScheduler( + const Options& options) + : options_(options), + in_flight_batches_limit_(options.initial_in_flight_batches_limit), + rand_double_(0.0, 1.0) { + std::random_device device; + rand_engine_.seed(device()); + if (options.thread_pool == nullptr) { + owned_batch_thread_pool_ = true; + batch_thread_pool_ = new thread::ThreadPool( + GetEnv(), options.thread_pool_name, options.num_batch_threads); + } else { + owned_batch_thread_pool_ = false; + batch_thread_pool_ = options.thread_pool; + } +} + +template +absl::Status 
AdaptiveSharedBatchScheduler::AddQueue( + const QueueOptions& options, BatchProcessor process_batch_callback, + std::unique_ptr>* queue) { + if (options.max_batch_size <= 0) { + return errors::InvalidArgument("max_batch_size must be positive; was ", + options.max_batch_size); + } + if (options.max_enqueued_batches <= 0) { + return errors::InvalidArgument( + "max_enqueued_batches must be positive; was ", + options.max_enqueued_batches); + } + if (options.max_input_task_size.has_value()) { + if (options.max_input_task_size.value() < options.max_batch_size) { + return errors::InvalidArgument( + "max_input_task_size must be larger than or equal to max_batch_size;" + "got max_input_task_size as ", + options.max_input_task_size.value(), " and max_batch_size as ", + options.max_batch_size); + } + } + internal::ASBSQueue* asbs_queue_raw; + queue->reset(asbs_queue_raw = new internal::ASBSQueue( + this->shared_from_this(), options)); + mutex_lock l(mu_); + queues_and_callbacks_[asbs_queue_raw] = process_batch_callback; + return absl::OkStatus(); +} + +template +void AdaptiveSharedBatchScheduler::AddBatch( + const internal::ASBSBatch* batch) { + mutex_lock l(mu_); + if (options_.fifo_scheduling) { + fifo_batches_.push_back(batch); + } else { + batches_.push_back(batch); + } + int64_t delay_micros = + batch->schedulable_time_micros() - GetEnv()->NowMicros(); + if (delay_micros <= 0) { + MaybeScheduleNextBatch(); + return; + } + // Try to schedule batch once it becomes schedulable. Although scheduler waits + // for all batches to finish processing before allowing itself to be deleted, + // MaybeScheduleNextBatch() is called in other places, and therefore it's + // possible the scheduler could be deleted by the time this closure runs. + // Grab a shared_ptr reference to prevent this from happening. + GetEnv()->SchedClosureAfter( + delay_micros, [this, lifetime_preserver = this->shared_from_this()] { + mutex_lock l(mu_); + MaybeScheduleNextBatch(); + }); +} + +template +void AdaptiveSharedBatchScheduler::RemoveQueue( + const internal::ASBSQueue* queue) { + mutex_lock l(mu_); + queues_and_callbacks_.erase(queue); +} + +template +void AdaptiveSharedBatchScheduler::MaybeScheduleNextBatchFIFO() { + const internal::ASBSBatch* batch = *fifo_batches_.begin(); + if (batch->schedulable_time_micros() > GetEnv()->NowMicros()) { + return; + } + fifo_batches_.pop_front(); + // Queue may destroy itself after ReleaseBatch is called. + batch->queue()->ReleaseBatch(batch); + batch_thread_pool_->Schedule(std::bind( + &AdaptiveSharedBatchScheduler::CallbackWrapper, this, batch, + queues_and_callbacks_[batch->queue()], false /* is express */)); + in_flight_batches_++; +} + +template +void AdaptiveSharedBatchScheduler< + TaskType>::MaybeScheduleClosedBatchesLockedFIFO() { + // Only schedule closed batches if we have spare capacity. + int available_threads = + static_cast(options_.num_batch_threads - in_flight_batches_ - + in_flight_express_batches_); + for (auto it = fifo_batches_.begin(); + it != fifo_batches_.end() && available_threads > 0; + it = fifo_batches_.begin()) { + if ((*it)->IsClosed()) { + const internal::ASBSBatch* batch = *it; + fifo_batches_.pop_front(); + batch->queue()->ReleaseBatch(batch); + batch_thread_pool_->Schedule( + std::bind(&AdaptiveSharedBatchScheduler::CallbackWrapper, + this, batch, queues_and_callbacks_[batch->queue()], true)); + in_flight_express_batches_++; + available_threads--; + } else { + // Batches are FIFO, so stop iteration after finding the first non-closed + // batches. 
+ break; + } + } +} + +template +void AdaptiveSharedBatchScheduler::MaybeScheduleNextBatch() { + bool batch_empty = + options_.fifo_scheduling ? fifo_batches_.empty() : batches_.empty(); + if (batch_empty || in_flight_batches_ >= in_flight_batches_limit_) return; + // Non-integer limit handled probabilistically. + if (in_flight_batches_limit_ - in_flight_batches_ < 1 && + rand_double_(rand_engine_) > + in_flight_batches_limit_ - in_flight_batches_) { + return; + } + + if (options_.fifo_scheduling) { + MaybeScheduleNextBatchFIFO(); + return; + } + + auto best_it = batches_.end(); + double best_score = (std::numeric_limits::max)(); + int64_t now_micros = GetEnv()->NowMicros(); + for (auto it = batches_.begin(); it != batches_.end(); it++) { + if ((*it)->schedulable_time_micros() > now_micros) continue; + const double score = + (*it)->creation_time_micros() - + options_.full_batch_scheduling_boost_micros * (*it)->size() / + static_cast((*it)->queue()->max_task_size()); + if (best_it == batches_.end() || score < best_score) { + best_score = score; + best_it = it; + } + } + // No schedulable batches. + if (best_it == batches_.end()) return; + const internal::ASBSBatch* batch = *best_it; + batches_.erase(best_it); + // Queue may destroy itself after ReleaseBatch is called. + batch->queue()->ReleaseBatch(batch); + batch_thread_pool_->Schedule( + std::bind(&AdaptiveSharedBatchScheduler::CallbackWrapper, this, + batch, queues_and_callbacks_[batch->queue()], false)); + in_flight_batches_++; +} + +template +void AdaptiveSharedBatchScheduler::MaybeScheduleClosedBatches() { + mutex_lock l(mu_); + MaybeScheduleClosedBatchesLocked(); +} + +template +void AdaptiveSharedBatchScheduler< + TaskType>::MaybeScheduleClosedBatchesLocked() { + if (options_.fifo_scheduling) { + MaybeScheduleClosedBatchesLockedFIFO(); + return; + } + // Only schedule closed batches if we have spare capacity. + int available_threads = + static_cast(options_.num_batch_threads - in_flight_batches_ - + in_flight_express_batches_); + for (auto it = batches_.begin(); + it != batches_.end() && available_threads > 0;) { + if ((*it)->IsClosed()) { + const internal::ASBSBatch* batch = *it; + it = batches_.erase(it); + batch->queue()->ReleaseBatch(batch); + batch_thread_pool_->Schedule( + std::bind(&AdaptiveSharedBatchScheduler::CallbackWrapper, + this, batch, queues_and_callbacks_[batch->queue()], true)); + in_flight_express_batches_++; + available_threads--; + } else { + ++it; + } + } +} + +template +void AdaptiveSharedBatchScheduler::CallbackWrapper( + const internal::ASBSBatch* batch, + AdaptiveSharedBatchScheduler::BatchProcessor callback, + bool is_express) { + tsl::profiler::TraceMeConsumer trace_me( + [&] { + return profiler::TraceMeEncode( + "ProcessBatch", {{"batch_size_before_padding", batch->size()}, + {"_r", 2} /*root_event*/}); + }, + tsl::profiler::ContextType::kAdaptiveSharedBatchScheduler, + batch->traceme_context_id()); + const int64_t start_time = batch->creation_time_micros(); + callback(std::unique_ptr>( + const_cast*>(batch))); + int64_t end_time = GetEnv()->NowMicros(); + mutex_lock l(mu_); + if (is_express) { + in_flight_express_batches_--; + MaybeScheduleClosedBatchesLocked(); + return; + } + in_flight_batches_--; + batch_count_++; + batch_delay_stats_.batch_latency_sum += end_time - start_time; + + MaybeAdjustInflightLimit(); + + MaybeScheduleNextBatch(); +} + +template +void AdaptiveSharedBatchScheduler::MaybeAdjustInflightLimit() { + // Occasionally adjust in_flight_batches_limit_ to minimize average latency. 
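+  // Numeric intuition (illustrative values): with an in_flight_batches_limit_
+  // of 4.0 and the initial step_size_multiplier_ of 1/8, each adjustment moves
+  // the limit by +/- 0.5. The multiplier doubles (capped at 1/8) when latency
+  // keeps improving while stepping in the same direction, halves (floored at
+  // 1/128) when the improvement only came after reversing direction, and the
+  // direction flips whenever the measured latency got worse.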
+ // Although the optimal value may depend on the workload, the latency should + // be a simple convex function of in_flight_batches_limit_, allowing us to + // locate the global minimum relatively quickly. + if (batch_count_ == options_.batches_to_average_over) { + double current_avg_latency_ms = + (batch_delay_stats_.batch_latency_sum / 1000.) / batch_count_; + bool current_latency_decreased = + current_avg_latency_ms < batch_delay_stats_.last_avg_latency_ms; + if (current_latency_decreased) { + // If latency improvement was because we're moving in the correct + // direction, increase step_size so that we can get to the minimum faster. + // If latency improvement was due to backtracking from a previous failure, + // decrease step_size in order to refine our location. + step_size_multiplier_ *= + (batch_delay_stats_.last_latency_decreased ? 2 : 0.5); + step_size_multiplier_ = + std::min(step_size_multiplier_, kMaxStepSizeMultiplier); + step_size_multiplier_ = + std::max(step_size_multiplier_, kMinStepSizeMultiplier); + } else { + // Return (nearly) to previous position and confirm that latency is better + // there before decreasing step size. + batch_delay_stats_.step_direction = -batch_delay_stats_.step_direction; + } + in_flight_batches_limit_ += batch_delay_stats_.step_direction * + in_flight_batches_limit_ * + step_size_multiplier_; + in_flight_batches_limit_ = + std::min(in_flight_batches_limit_, + static_cast(options_.num_batch_threads)); + in_flight_batches_limit_ = + std::max(in_flight_batches_limit_, + static_cast(options_.min_in_flight_batches_limit)); + batch_delay_stats_.last_avg_latency_ms = current_avg_latency_ms; + batch_delay_stats_.last_latency_decreased = current_latency_decreased; + batch_count_ = 0; + batch_delay_stats_.batch_latency_sum = 0; + } +} + +// ---------------- ASBSQueue ---------------- + +namespace internal { +template +ASBSQueue::ASBSQueue( + std::shared_ptr> scheduler, + const QueueOptions& options) + : scheduler_(scheduler), options_(options) {} + +template +ASBSQueue::~ASBSQueue() { + // Wait until last batch has been scheduled. + const int kSleepMicros = 1000; + for (;;) { + { + mutex_lock l(mu_); + if (num_enqueued_batches_ == 0) { + break; + } + } + scheduler_->GetEnv()->SleepForMicroseconds(kSleepMicros); + } + scheduler_->RemoveQueue(this); +} + +template +absl::Status ASBSQueue::Schedule(std::unique_ptr* task) { + size_t size = (*task)->size(); + if (options_.split_input_task_func == nullptr && + size > options_.max_batch_size) { + return errors::InvalidArgument("Task size ", size, + " is larger than maximum batch size ", + options_.max_batch_size); + } + if (options_.max_input_task_size.has_value() && + (size > options_.max_input_task_size.value())) { + return errors::InvalidArgument("Task size ", size, + " is larger than max input task size ", + options_.max_input_task_size.value()); + } + + std::vector> tasks_to_schedule; + std::vector*> new_batches; + bool closed_batch = false; + { + mutex_lock l(mu_); + if (size > SchedulingCapacityLocked()) { + return errors::Unavailable("The batch scheduling queue is full"); + } + + int remaining_batch_size = + current_batch_ == nullptr + ? options_.max_batch_size + : options_.max_batch_size - current_batch_->size(); + if (options_.split_input_task_func == nullptr || + size <= remaining_batch_size) { + // Either we don't allow task splitting or task fits within the current + // batch. + tasks_to_schedule.push_back(std::move(*task)); + } else { + // Split task in order to completely fill the current batch. 
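+      // For example (illustrative numbers): with max_batch_size = 100, a
+      // current batch holding 70, and an incoming task of size 50,
+      // split_input_task_func is asked for a first piece of size 30, which
+      // tops off (and thereby closes) the current batch, while the remaining
+      // 20 starts a new batch.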
+ // Beyond this point Schedule should not fail, as the caller has been + // promised that all of the split tasks will be scheduled. + TF_RETURN_IF_ERROR(options_.split_input_task_func( + task, remaining_batch_size, options_.max_batch_size, + &tasks_to_schedule)); + } + for (auto& task : tasks_to_schedule) { + // Can't fit within current batch, close it off and try to create another. + if (current_batch_ && + current_batch_->size() + task->size() > options_.max_batch_size) { + current_batch_->Close(); + closed_batch = true; + current_batch_ = nullptr; + } + if (!current_batch_) { + num_enqueued_batches_++; + // batch.traceme_context_id connects TraceMeProducer and + // TraceMeConsumer. + // When multiple calls to "ASBS::Schedule" accumulate to one batch, they + // are processed in the same batch and should share traceme_context_id. + current_batch_ = new ASBSBatch( + this, scheduler_->GetEnv()->NowMicros(), + options_.batch_timeout_micros, NewTraceMeContextIdForBatch()); + new_batches.push_back(current_batch_); + } + + // Annotate each task (corresponds to one call of schedule) with a + // TraceMeProducer. + tsl::profiler::TraceMeProducer trace_me( + [task_size = task->size()] { + return profiler::TraceMeEncode( + "ASBSQueue::Schedule", + {{"batching_input_task_size", task_size}}); + }, + tsl::profiler::ContextType::kAdaptiveSharedBatchScheduler, + this->current_batch_->traceme_context_id()); + current_batch_->AddTask(std::move(task)); + num_enqueued_tasks_++; + // If current_batch_ is now full, allow it to be processed immediately. + bool reached_max_tasks = + (options_.max_tasks_per_batch.has_value() && + current_batch_->num_tasks() >= options_.max_tasks_per_batch.value()); + if (current_batch_->size() == options_.max_batch_size || + reached_max_tasks) { + current_batch_->Close(); + closed_batch = true; + current_batch_ = nullptr; + } + } + } + // Scheduler functions must be called outside of lock, since they may call + // ReleaseBatch. + for (auto* batch : new_batches) { + scheduler_->AddBatch(batch); + } + if (closed_batch) { + scheduler_->MaybeScheduleClosedBatches(); + } + return absl::OkStatus(); +} + +template +void ASBSQueue::ReleaseBatch(const ASBSBatch* batch) { + mutex_lock l(mu_); + num_enqueued_batches_--; + num_enqueued_tasks_ -= batch->num_tasks(); + if (batch == current_batch_) { + current_batch_->Close(); + current_batch_ = nullptr; + } +} + +template +size_t ASBSQueue::NumEnqueuedTasks() const { + mutex_lock l(mu_); + return num_enqueued_tasks_; +} + +template +size_t ASBSQueue::SchedulingCapacity() const { + mutex_lock l(mu_); + return SchedulingCapacityLocked(); +} + +template +size_t ASBSQueue::SchedulingCapacityLocked() const { + const int current_batch_capacity = + current_batch_ ? 
options_.max_batch_size - current_batch_->size() : 0; + const int spare_batches = + options_.max_enqueued_batches - num_enqueued_batches_; + return spare_batches * options_.max_batch_size + current_batch_capacity; +} + +template +// static +uint64 ASBSQueue::NewTraceMeContextIdForBatch() { + static std::atomic traceme_context_id(0); + return traceme_context_id.fetch_add(1, std::memory_order_relaxed); +} +} // namespace internal +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h new file mode 100644 index 00000000..a0665503 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h @@ -0,0 +1,366 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_ + +#include + +#include +#include +#include +#include + +#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" + +namespace tensorflow { +namespace serving { + +// A BatchScheduler implementation geared toward handling a single request type +// running on a specific set of hardware resources. A typical scenario is one in +// which all requests invoke the same machine-learned model on one GPU. +// +// If there are, say, two GPUs and two models each bound to one of the GPUs, one +// could use two BasicBatchScheduler instances to schedule the two model/GPU +// combinations independently. If multiple models must share a given GPU or +// other hardware resource, consider using SharedBatchScheduler instead. +// +// +// PARAMETERS AND BEHAVIOR: +// +// BasicBatchScheduler runs a fixed pool of threads, which it uses to process +// batches of tasks. It enforces a maximum batch size, and enqueues a bounded +// number of tasks. If the queue is nearly empty, such that a full batch cannot +// be formed, when a thread becomes free, it anyway schedules a batch +// immediately if a task has been in the queue for longer than a given timeout +// parameter. If the timeout parameter is set to 0, then the batch threads will +// always be kept busy (unless there are zero tasks waiting to be processed). +// +// For online serving, it is recommended to set the maximum number of enqueued +// batches worth of tasks equal to the number of batch threads, which allows +// enqueuing of enough tasks s.t. if every thread becomes available it can be +// kept busy, but no more. For bulk processing jobs and throughput-oriented +// benchmarks, you may want to set it much higher. 
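+//
+// A minimal usage sketch (illustrative; `CountTask` stands for any BatchTask
+// subclass whose size() reports an item count):
+//
+//   BasicBatchScheduler<CountTask>::Options options;
+//   options.max_batch_size = 32;
+//   options.max_execution_batch_size = 32;  // equal, since splitting is off
+//   options.batch_timeout_micros = 2000;
+//   options.num_batch_threads = 4;
+//   options.max_enqueued_batches = 4;
+//   std::unique_ptr<BasicBatchScheduler<CountTask>> scheduler;
+//   TF_CHECK_OK(BasicBatchScheduler<CountTask>::Create(
+//       options,
+//       [](std::unique_ptr<Batch<CountTask>> batch) {
+//         // `batch` arrives closed; process all of its tasks here.
+//       },
+//       &scheduler));
+//   auto task = std::make_unique<CountTask>(/*n=*/1);
+//   TF_CHECK_OK(scheduler->Schedule(&task));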
+// +// When Schedule() is called, if the queue is full the call will fail with an +// UNAVAILABLE error (after which the client may retry again later). If the call +// succeeds, the maximum time the task will spend in the queue before being +// placed in a batch and assigned to a thread for processing, is the greater of: +// - the maximum time to process ceil(max_enqueued_batches/num_batch_threads) +// (1 in the recommended configuration) batches of previously-submitted tasks +// - the configured timeout parameter (which can be 0, as mentioned above) +// +// Unlike StreamingBatchScheduler, when BasicBatchScheduler assigns a batch to a +// thread, it closes the batch. The process-batch callback may assume that every +// batch it receives is closed at the outset. +// +// +// RECOMMENDED USE-CASES: +// +// BasicBatchScheduler is suitable for use-cases that feature a single kind of +// request (e.g. a server performing inference with a single machine-learned +// model, possibly evolving over time), with loose versioning semantics. +// Concretely, the following conditions should hold: +// +// A. All requests batched onto a given resource (e.g. a hardware accelerator, +// or a pool accelerators) are of the same type. For example, they all +// invoke the same machine-learned model. +// +// These variations are permitted: +// - The model may reside in a single servable, or it may be spread across +// multiple servables that are used in unison (e.g. a vocabulary lookup +// table servable and a tensorflow session servable). +// - The model's servable(s) may be static, or they may evolve over time +// (successive servable versions). +// - Zero or more of the servables are used in the request thread; the rest +// are used in the batch thread. In our running example, the vocabulary +// lookups and tensorflow runs may both be performed in the batch thread, +// or alternatively the vocabulary lookup may occur in the request thread +// with only the tensorflow run performed in the batch thread. +// +// In contrast, BasicBatchScheduler is not a good fit if the server +// hosts multiple distinct models running on a pool accelerators, with each +// request specifying which model it wants to use. BasicBatchScheduler +// has no facility to time-multiplex the batch threads across multiple +// models in a principled way. More basically, it cannot ensure that a given +// batch doesn't contain a mixture of requests for different models. +// +// B. Requests do not specify a particular version of the servable(s) that must +// be used. Instead, each request is content to use the "latest" version. +// +// BasicBatchScheduler does not constrain which requests get grouped +// together into a batch, so using this scheduler there is no way to achieve +// cohesion of versioned requests to version-specific batches. +// +// C. No servable version coordination needs to be performed between the +// request threads and the batch threads. Often, servables are only used in +// the batch threads, in which case this condition trivially holds. If +// servables are used in both threads, then the use-case must tolerate +// version skew across the servables used in the two kinds of threads. +// +// +// EXAMPLE USE-CASE FLOW: +// +// For such use-cases, request processing via BasicBatchScheduler generally +// follows this flow (given for illustration; variations are possible): +// 1. Optionally perform some pre-processing on each request in the request +// threads. +// 2. 
Route the requests to the batch scheduler, as batching::Task objects. +// (Since all requests are of the same type and are not versioned, the +// scheduler is free to group them into batches arbitrarily.) +// 3. Merge the requests into a single batched representation B. +// 4. Obtain handles to the servable(s) needed to process B. The simplest +// approach is to obtain the latest version of each servable. Alternatively, +// if cross-servable consistency is required (e.g. the vocabulary lookup +// table's version number must match that of the tensorflow session), +// identify an appropriate version number and obtain the servable handles +// accordingly. +// 5. Process B using the obtained servable handles, and split the result into +// individual per-request units. +// 6. Perform any post-processing in the batch thread and/or request thread. +// +// +// PERFORMANCE TUNING: See README.md. +// +template +class BasicBatchScheduler : public BatchScheduler { + public: + // TODO(b/25089730): Tune defaults based on best practices as they develop. + // (Keep them mirrored to the ones in SharedBatchScheduler::QueueOptions and + // SharedBatchScheduler::Options.) + struct Options { + // Options related with (underlying) shared batch scheduler. + // 'thread_pool_name' and 'num_batch_threads' are used to initialize + // a shared batch scheduler underlyingly iff 'shared_batch_scheduler' is + // nullptr. + // + // There are two ways to specify threading: + // 1) Have each session create its own pool. + // 2) Have multiple sessions share the same pool. + // + // In general, the number of threads should be tied to roughly the number of + // compute resources (CPU cores or accelerator cores) backing the threads. + // Sharing a thread pool helps alleviate potential over allocation of + // threads to limited compute resources. + + // To have each session create its own thread pool (1) set + // thread_pool_name/num_batch_threads. + + // To share a thread pool (2) create a scheduler and pass it in. + + // The name to use for the pool of batch threads. + string thread_pool_name = {"batch_threads"}; + + // The number of threads to use to process batches. + // Must be >= 1, and should be tuned carefully. + int num_batch_threads = port::MaxParallelism(); + + // If specified, this scheduler will be used underlyingly to schedule + // batches. Note setting this means `thread_pool_name` and + // `num_batch_threads` are ignored. + std::shared_ptr> shared_batch_scheduler = + nullptr; + + // Options for queue. + // The maximum size of each batch. + // + // The scheduler may form batches of any size between 1 and this number + // (inclusive). If there is a need to quantize the batch sizes, i.e. only + // submit batches whose size is in a small set of allowed sizes, that can be + // done by adding padding in the process-batch callback. + int max_batch_size = 1000; + + // If a task has been enqueued for this amount of time (in microseconds), + // and a thread is available, the scheduler will immediately form a batch + // from enqueued tasks and assign the batch to the thread for processing, + // even if the batch's size is below 'max_batch_size'. + // + // This parameter offers a way to bound queue latency, so that a task isn't + // stuck in the queue indefinitely waiting for enough tasks to arrive to + // make a full batch. (The latency bound is given in the class documentation + // above.) + // + // The goal is to smooth out batch sizes under low request rates, and thus + // avoid latency spikes. 
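+    //
+    // For example (illustrative value): with batch_timeout_micros = 2000, a
+    // lone task waits at most roughly 2 ms for companions before being
+    // processed in a smaller (possibly size-1) batch, provided a batch thread
+    // is free.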
+ int64_t batch_timeout_micros = 0; + + // The maximum allowable number of enqueued (accepted by Schedule() but + // not yet being processed on a batch thread) tasks in terms of batches. + // If this limit is reached, Schedule() will return an UNAVAILABLE error. + // See the class documentation above for guidelines on how to tune this + // parameter. + int max_enqueued_batches = 10; + + // If true, an input task (i.e., input of `BasicBatchScheduler::Schedule`) + // with a large size (i.e., larger than the largest value of + // `allowed_batch_sizes`) will be split into multiple smaller batch tasks + // and possibly put into different batches for processing. If false, each + // input task is put into one batch as a whole for processing. + // + // API note: + // The value of this option doesn't affect processing output given the same + // input; it affects implementation details as stated below: + // 1. Improve batching efficiency by eliminating unnecessary padding in the + // following scenario: when an open batch has M slots while an input of size + // N is scheduled (M < N), the input can be split to fill remaining slots + // of an open batch as opposed to padding. + // 2.`max_batch_size` specifies the limit of input and + // `max_execution_batch_size` specifies the limit of a task to be processed. + // API user can give an input of size 128 when 'max_execution_batch_size' + // is 32 -> implementation can split input of 128 into 4 x 32, schedule + // concurrent processing, and then return concatenated results corresponding + // to 128. + bool enable_large_batch_splitting = false; + + // `split_input_task_func` specifies how to split `input_task` into + // `output_tasks`. + // + // `input_task`: a unit of task to be split. + // `first_output_task_size`: task size of first output. + // `max_batch_size`: Maximum size of each batch. + // `output_tasks`: A list of output tasks after split. + // + // REQUIRED: + // 1) All `output_tasks` should be non-empty tasks. + // 2) Sizes of `output_tasks` add up to size of `input_task`. + // + // NOTE: + // Instantiations of `TaskType` may vary, so it's up to caller to define + // how (e.g., which members to access) to split input tasks. + std::function* input_task, int first_output_task_size, + int input_batch_size_limit, + std::vector>* output_tasks)> + split_input_task_func; + + // The maximum size of each enqueued batch (i.e., in `batches_`). + // + // The scheduler may form batches of any size between 1 and this number + // (inclusive). If there is a need to quantize the batch sizes, i.e. only + // submit batches whose size is in a small set of allowed sizes, that can be + // done by adding padding in the process-batch callback. + // + // REQUIRES: + // - If enable_large_batch_splitting is true, `max_execution_batch_size` is + // less than or equal to `max_batch_size`. + // - If enable_large_batch_splitting is false, `max_execution_batch_size` is + // equal to `max_batch_size`. + int max_execution_batch_size = 10; + + // The following options are typically only overridden by test code. + + // The environment to use. 
+ Env* env = Env::Default(); + }; + static absl::Status Create( + const Options& options, + std::function>)> + process_batch_callback, + std::unique_ptr* scheduler); + + ~BasicBatchScheduler() override = default; + + absl::Status Schedule(std::unique_ptr* task) override; + size_t NumEnqueuedTasks() const override; + size_t SchedulingCapacity() const override; + + size_t max_task_size() const override { + return shared_scheduler_queue_->max_task_size(); + } + + private: + explicit BasicBatchScheduler( + std::unique_ptr> shared_scheduler_queue); + + // This class is merely a thin wrapper around a SharedBatchScheduler with a + // single queue. + std::unique_ptr> shared_scheduler_queue_; + + BasicBatchScheduler(const BasicBatchScheduler&) = delete; + void operator=(const BasicBatchScheduler&) = delete; +}; + +////////// +// Implementation details follow. API users need not read. + +template +absl::Status BasicBatchScheduler::Create( + const Options& options, + std::function>)> + process_batch_callback, + std::unique_ptr* scheduler) { + std::shared_ptr> shared_scheduler; + + if (options.shared_batch_scheduler == nullptr) { + typename SharedBatchScheduler::Options shared_scheduler_options; + shared_scheduler_options.thread_pool_name = options.thread_pool_name; + shared_scheduler_options.num_batch_threads = options.num_batch_threads; + shared_scheduler_options.env = options.env; + + TF_RETURN_IF_ERROR(SharedBatchScheduler::Create( + shared_scheduler_options, &shared_scheduler)); + } else { + shared_scheduler = options.shared_batch_scheduler; + } + + typename SharedBatchScheduler::QueueOptions + shared_scheduler_queue_options; + shared_scheduler_queue_options.input_batch_size_limit = + options.max_batch_size; + shared_scheduler_queue_options.batch_timeout_micros = + options.batch_timeout_micros; + shared_scheduler_queue_options.max_enqueued_batches = + options.max_enqueued_batches; + shared_scheduler_queue_options.enable_large_batch_splitting = + options.enable_large_batch_splitting; + shared_scheduler_queue_options.split_input_task_func = + options.split_input_task_func; + shared_scheduler_queue_options.max_execution_batch_size = + options.max_execution_batch_size; + std::unique_ptr> shared_scheduler_queue; + TF_RETURN_IF_ERROR(shared_scheduler->AddQueue(shared_scheduler_queue_options, + process_batch_callback, + &shared_scheduler_queue)); + + scheduler->reset( + new BasicBatchScheduler(std::move(shared_scheduler_queue))); + return absl::OkStatus(); +} + +template +absl::Status BasicBatchScheduler::Schedule( + std::unique_ptr* task) { + return shared_scheduler_queue_->Schedule(task); +} + +template +size_t BasicBatchScheduler::NumEnqueuedTasks() const { + return shared_scheduler_queue_->NumEnqueuedTasks(); +} + +template +size_t BasicBatchScheduler::SchedulingCapacity() const { + return shared_scheduler_queue_->SchedulingCapacity(); +} + +template +BasicBatchScheduler::BasicBatchScheduler( + std::unique_ptr> shared_scheduler_queue) + : shared_scheduler_queue_(std::move(shared_scheduler_queue)) {} + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_input_task.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_input_task.h new file mode 100644 index 00000000..4f50f1da --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_input_task.h @@ -0,0 +1,267 @@ +/* Copyright 2021 The TensorFlow 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_INPUT_TASK_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_INPUT_TASK_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/call_once.h" +#include "absl/container/fixed_array.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/concat_split_util.h" +#include "tensorflow/core/kernels/batching_util/input_split_metadata.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/util/incremental_barrier.h" + +namespace tensorflow { +namespace serving { + +namespace internal { +template +class BatchInputTaskHandleTestAccess; + +template +class BatchInputTaskTestAccess; + +template +class BatchInputTask; + +// A RAII-style object that holds a ref-counted batch-input-task, and +// represents a slice of batch-input-task. + +// To be handed out to callers of `BatchInputTask::ToTaskHandles` quickly +// (i.e. not necessarily waiting for input split) +// +// `BatchInputTaskHandle::GetSplitTask` evaluates to the slice of task. +template +class BatchInputTaskHandle : public BatchTask { + public: + BatchInputTaskHandle( + std::shared_ptr> batch_input_task, int split_id, + size_t task_size); + + // Should be called once. Returns nullptr on subsequent calls. + std::unique_ptr GetSplitTask(); + + // Returns the size of this task. + size_t size() const override { return task_size_; } + + private: + template + friend class internal::BatchInputTaskHandleTestAccess; + + int split_id() const { return split_id_; } + + std::shared_ptr> batch_input_task_; + + // The handle evaluates to the N-th slice of original task, and + // N is `split_id_`. + const int split_id_; + + const size_t task_size_; + + std::atomic once_{false}; +}; + +// BatchInputTask encapsulates a input (`input_task`) to be batched and the +// information to get task splits after it's enqueued, so as to support lazy +// split of a task. +// +// Input split could reduce excessive padding for efficiency; lazy split +// moves task-split out of the critical path of enqueue and dequeue and reduces +// contention. +// +// BatchInputTask is thread safe. +// +// Usage +// +// ... a deque with frequent enqueue and dequeue operations ... +// ... Note, a deque of Batch of BatchInputTaskHandle is used to form batches +// at enqueue time (split is lazy at deque time); +// ... For use cases to form batches at dequeue time, we can use a deque of +// BatchInputTaskHandle directly, and "peek" metadata to form a batch by +// then. +// std::deque>>> deque_ +// TF_GUARDED_BY(mu_); +// +// std::unique_ptr input_task; +// +// ... Enqueue path ... 
+// +// { +// mutex_lock l(mu_); +// std::shared_ptr> batch_input_task = +// ConstructLazyBatchWithoutSplit(input_task); +// +// std::vector>> task_handles; +// input_batch->ToTaskHandles(&task_handles); +// for (int i = 0; i < task_handles.size(); ++i) { +// EnqueueTaskHandleIntoDeque(deque_); +// } +// +// ... Dequeue path ... +// std::unique_ptr>> handles_to_schedule; +// { +// mutex_lock l(mu_); +// ... HasBatchToSchedule could be customized or specialized +// ... (e.g., readiness depending on enqueue time) +// if (HasBatchToSchedule(deque_)) { +// handles_to_schedule = std::move(deque_.front()); +// deque_.pop_front(); +// } +// } +// ...... `mu_` is released ...... +// +// std::vector>> tasks_in_batch = +// RemoveAllTasksFromBatch(handles_to_schedule); +// +// std::unique_ptr> batch_to_schedule; +// for (int i = 0; i < tasks_in_batch.size(); i++) { +// batch_to_schedule->AddTask(std::move(tasks_in_batch[i]->GetSplitTask())); +// } +// batch_to_schedule->Close(); +// +// `batch_to_schedule` is ready for schedule. +template +class BatchInputTask + : public std::enable_shared_from_this> { + public: + using SplitInputFunc = std::function* input_task, int first_output_task_size, + int input_batch_size_limit, + std::vector>* output_tasks)>; + + BatchInputTask(std::unique_ptr input_task, + int open_batch_remaining_slot, int batch_size_limit, + SplitInputFunc split_input_func); + + // Outputs the task handles for the input task. + // Each task handle represents a slice of task after input task is split, and + // could evaluate to that slice. + // + // NOTE: + // Each task handle in `output_task_handles` takes ownership of a reference of + // this BatchInputTask. + void ToTaskHandles( + std::vector>>* + output_task_handles); + + private: + friend class BatchInputTaskHandle; + template + friend class internal::BatchInputTaskTestAccess; + + std::unique_ptr GetSplitTask(int split_id); + + absl::Status SplitBatches( + std::vector>* output_tasks); + + std::unique_ptr input_task_; + + const int input_task_size_ = 0; + const int open_batch_remaining_slot_; + + const int batch_size_limit_; + const SplitInputFunc split_func_; + + const InputSplitMetadata input_split_metadata_; + + mutable absl::once_flag once_; + + std::vector> task_splits_; + absl::Status split_status_; +}; + +// +// Implementation details. API readers may skip. 
+// + +template +BatchInputTaskHandle::BatchInputTaskHandle( + std::shared_ptr> batch_input_task, int split_id, + size_t task_size) + : batch_input_task_(batch_input_task), + split_id_(split_id), + task_size_(task_size) {} + +template +std::unique_ptr BatchInputTaskHandle::GetSplitTask() { + if (once_.load(std::memory_order_acquire)) { + return nullptr; + } + once_.store(true, std::memory_order_release); + return batch_input_task_->GetSplitTask(split_id_); +} + +template +BatchInputTask::BatchInputTask(std::unique_ptr input_task, + int open_batch_remaining_slot, + int batch_size_limit, + SplitInputFunc split_input_func) + : input_task_(std::move(input_task)), + input_task_size_(input_task_->size()), + open_batch_remaining_slot_(open_batch_remaining_slot), + batch_size_limit_(batch_size_limit), + split_func_(split_input_func), + input_split_metadata_(input_task_size_, open_batch_remaining_slot, + batch_size_limit) {} + +template +void BatchInputTask::ToTaskHandles( + std::vector>>* + task_handles) { + const absl::FixedArray& task_sizes = input_split_metadata_.task_sizes(); + task_handles->resize(task_sizes.size()); + for (int i = 0; i < task_handles->size(); i++) { + (*task_handles)[i] = std::make_unique>( + this->shared_from_this(), i, task_sizes[i]); + } +} + +template +std::unique_ptr BatchInputTask::GetSplitTask(int split_id) { + absl::call_once(once_, + [this]() { split_status_ = SplitBatches(&task_splits_); }); + if (!split_status_.ok()) { + LOG_EVERY_N_SEC(WARNING, 60 /* seconds */) + << "Split task with error: " << split_status_ << " split metadata is " + << input_split_metadata_.DebugString(); + return nullptr; + } + if (split_id >= 0 && split_id < task_splits_.size()) { + return std::move(task_splits_[split_id]); + } + return nullptr; +} + +template +absl::Status BatchInputTask::SplitBatches( + std::vector>* output_tasks) { + return split_func_(&input_task_, open_batch_remaining_slot_, + batch_size_limit_, output_tasks); +} + +} // namespace internal +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_INPUT_TASK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_resource_base.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_resource_base.h new file mode 100644 index 00000000..e853fc48 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_resource_base.h @@ -0,0 +1,380 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_RESOURCE_BASE_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_RESOURCE_BASE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/blocking_counter.h" +#include "tensorflow/core/common_runtime/cost_measurement_registry.h" +#include "tensorflow/core/common_runtime/request_cost.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler_utils.h" +#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/threadsafe_status.h" +#include "tensorflow/core/platform/context.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tsl/platform/criticality.h" + +namespace tensorflow { +namespace serving { + +// Options used to create a batch resource. +struct BatchResourceOptions { + int32_t num_batch_threads; + int32_t max_batch_size; + int32_t batch_timeout_micros; + int32_t max_enqueued_batches; + std::vector allowed_batch_sizes; + std::string batch_padding_policy{kPadUpPolicy}; + int32_t low_priority_max_batch_size; + int32_t low_priority_batch_timeout_micros; + int32_t low_priority_max_enqueued_batches; + std::vector low_priority_allowed_batch_sizes; + MixedPriorityBatchingPolicy mixed_priority_batching_policy; +}; + +// Base class for resource that encapsulating the state and logic for batching +// tensors. +class BatchResourceBase : public ResourceBase { + public: + // Given a BatchTask (from one op invocation) with 'num_outputs'== M and + // split into N sub tasks, TensorMatrix is a N X M matrix. + // Namely, TensorMatrix[i][j] indicates the i-th split tensor of j-th output; + // concatenating tensors along the 2nd dimension gives a output tensor. + typedef std::vector> TensorMatrix; + + // One task to be batched, corresponds to a `slice` of input from one batch-op + // invocation. + // + // Given input from one batch-op invocation, a `slice` of this input is: + // 1) Split each Tensor in `BatchTask::inputs` along the 0th dimension. + // 2) 'split_index' is calculated along the 0-th dimension. + // + // Note input from one batch-op invocation is valid and considered a + // specialized `slice`. + struct BatchTask : public tensorflow::serving::BatchTask { + BatchTask() : criticality_val(tsl::criticality::GetCriticality()){}; + + // A unique ID to identify this invocation of Batch. + int64_t guid; + + Context propagated_context; + + std::vector inputs; + std::vector captured_inputs; + OpKernelContext* context; + AsyncOpKernel::DoneCallback done_callback; + + // The index of this split, along the 0-th dimension of input from op + // invocation. 
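+    //
+    // For example (illustrative sizes): an op invocation whose inputs have a
+    // leading dimension of 10, split into tasks of sizes [4, 4, 2], yields
+    // three BatchTasks with split_index 0, 1 and 2 whose size() values are
+    // 4, 4 and 2 respectively.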
+ int split_index = 0; + + // Two-dimensional tensor matrix, ownership shared by: + // 1) each split of task (to fill one row in this matrix) + // and + // 2) callback that runs to merge output of individual splits for an op + // invocation, after all splits complete. + std::shared_ptr output; + + // 'status' records error (could be from any split) if at least one split + // returns error, OK otherwise. + // Ownership is shared by individual splits and callback. + std::shared_ptr status; + + bool is_partial = false; + + uint64 start_time; + + size_t size() const override { return inputs[0].shape().dim_size(0); } + + // Create a split task from this one. The caller needs to setup the inputs + // of the new task + std::unique_ptr CreateSplitTask( + int split_index, AsyncOpKernel::DoneCallback done_callback); + + // RequestCost is for collecting the cost and must outlive the batching + // processing. + // + // For example, to collect cost in rpc processing, `request_cost` is owned + // by rpc handler and points to the RequestCost of an rpc which provides + // the inputs to this BatchTask. + // + // After the batch processing, the request cost will be incremented with + // this task's processing costs. + RequestCost* request_cost = nullptr; + + // Returns the criticality associated with the task. + tsl::criticality::Criticality criticality() const override { + return criticality_val; + }; + + // If nonzero, make a batch of this size entirely out of padding. This + // batch is processed, but is not propagated to the kernel outputs. + int forced_warmup_batch_size = 0; + + protected: + virtual std::unique_ptr CreateDerivedTask() { + return std::make_unique(); + } + + private: + // Criticality associated with the task. + ::tsl::criticality::Criticality criticality_val; + }; + + // Appending a T suffix to make the type alias different to those in + // tensorflow::serving namespace, because some versions of compiler complain + // about changing meaning of the symbols. + using BatcherT = SharedBatchScheduler; + using AdaptiveBatcherT = + AdaptiveSharedBatchScheduler; + using BatcherQueueT = BatchScheduler; + using BatchT = Batch; + + BatchResourceBase(bool has_process_batch_function, + std::shared_ptr batcher, + const BatcherT::QueueOptions& batcher_queue_options, + std::vector allowed_batch_sizes) + : has_process_batch_function_(has_process_batch_function), + batcher_(std::move(batcher)), + batcher_queue_options_(batcher_queue_options), + allowed_batch_sizes_(std::move(allowed_batch_sizes)), + allowed_batch_sizes_str_(absl::StrJoin(allowed_batch_sizes_, ",")) {} + + BatchResourceBase(bool has_process_batch_function, + std::shared_ptr batcher, + const AdaptiveBatcherT::QueueOptions& batcher_queue_options, + std::vector allowed_batch_sizes) + : has_process_batch_function_(has_process_batch_function), + adaptive_batcher_(std::move(batcher)), + adaptive_batcher_queue_options_(batcher_queue_options), + allowed_batch_sizes_(std::move(allowed_batch_sizes)), + allowed_batch_sizes_str_(absl::StrJoin(allowed_batch_sizes_, ",")) {} + + void set_session_metadata(tensorflow::SessionMetadata session_metadata) { + session_metadata_ = std::move(session_metadata); + } + + const SessionMetadata& session_metadata() const { return session_metadata_; } + + using CreateBatchTaskFn = + std::function>()>; + + // Like `RegisterInput`, but extra "dummy" batches are processed for each + // batch size. Only the real request's outputs are propagated to the caller. 
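+  // For example (illustrative configuration): with allowed_batch_sizes =
+  // {2, 4, 8}, this would also run padding-only "dummy" batches of sizes 2, 4
+  // and 8 alongside the real request, and their outputs are discarded.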
+ Status RegisterWarmupInputs(int64_t guid, OpKernelContext* context, + const string& batcher_queue_name, + const CreateBatchTaskFn& create_batch_task_fn, + AsyncOpKernel::DoneCallback done); + // Ingests data from one invocation of the batch op. The data is enqueued to + // be combined with others into a batch, asynchronously. + // `CreateBatchTaskFn` should be used to instantiate fields added to a + // child class of `BatchTask` by the caller. + Status RegisterInput(int64_t guid, OpKernelContext* context, + const string& batcher_queue_name, + const CreateBatchTaskFn& create_batch_task_fn, + AsyncOpKernel::DoneCallback done_callback, + int forced_warmup_batch_size = 0); + + static BatcherT::QueueOptions GetBatcherQueueOptions( + int32_t num_batch_threads, int32_t max_batch_size, + int32_t batch_timeout_micros, int32_t max_enqueued_batches, + const std::vector& allowed_batch_sizes, + bool enable_large_batch_splitting, bool disable_padding); + + static BatcherT::QueueOptions GetBatcherQueueOptions( + int32_t num_batch_threads, int32_t max_batch_size, + int32_t batch_timeout_micros, int32_t max_enqueued_batches, + const std::vector& allowed_batch_sizes, + bool enable_large_batch_splitting, bool disable_padding, + absl::string_view batch_padding_policy, + int32_t low_priority_max_batch_size, + int32_t low_priority_batch_timeout_micros, + int32_t low_priority_max_enqueued_batches, + const std::vector& low_priority_allowed_batch_sizes, + MixedPriorityBatchingPolicy mixed_priority_batching_policy); + + static AdaptiveBatcherT::QueueOptions GetAdaptiveBatcherQueueOptions( + int32_t max_batch_size, int32_t batch_timeout_micros, + int32_t max_enqueued_batches, bool enable_large_batch_splitting, + const std::vector& allowed_batch_sizes, bool disable_padding); + + // Split 'input' of 'input_task_ptr' along 0th dimension, into a list of + // 'output_tasks'. + // Task sizes are determined by + // 1) open_batch_remaining_slot + // 2) max_batch_size + // 3) size-of-input-task + // in a way that + // 1) Task sizes add up to `size-of-input-task`. + // 2) Task sizes from left to right are like + // [open_batch_remaining_slot, max_batch_size, max_batch_size, ..., + // `size-of-input-task` - `sum-of-previous-elements`]. + // + // REQUIRES: + // Caller should make sure size-of-input-task is greater than + // open_batch_remaining_slot. + static Status SplitInputTask( + std::unique_ptr* input_task_ptr, int open_batch_remaining_slot, + int max_batch_size, + std::vector>* output_tasks); + + // Splits the batch costs to each task. + // + // Inputs: + // 1) batch_cost_measurements, which provides the total cost of each type; + // 2) processed_size, it's the batch size plus the padding amount; + // 3) batch, provides the batch size and input sizes. + // + // Outputs: + // The request_cost in each batch task will be updated. + // - This function will use two approaches to split the batch cost (if it's + // non-zero), thus two costs will be output. + // 1) smeared cost: batch cost is split proportionally to each task's size, + // and paddings do not share any cost; + // 2) non-smeared cost: batch cost is split proportionally to each task or + // padding's size. Here padding's cost is not assigned to any tasks. + // - This function will also record the metrics of this batch in each task, + // including: + // 1) the batch size; + // 2) the input size from this task; + // 3) the padding amount. 
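+  // Worked example (illustrative numbers): a batch holds two tasks of sizes
+  // 6 and 2 and is padded to processed_size 10. A cost type totalling 10 ms
+  // is then recorded as 7.5 ms / 2.5 ms per task under the smeared split
+  // (padding's share is redistributed to the tasks), and as 6 ms / 2 ms under
+  // the non-smeared split (the 2 ms attributable to padding is left
+  // unassigned).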
+ static void SplitBatchCostsAndRecordMetrics( + const std::string& model_name, const std::string& op_name, + const std::vector>& + batch_cost_measurements, + int64_t processed_size, BatchT& batch); + + private: + // Implementation of calling the process batch function. + virtual void ProcessFuncBatchImpl( + const BatchResourceBase::BatchTask& last_task, + absl::Span inputs, std::vector* combined_outputs, + std::function done) const = 0; + + // Validates that it's legal to combine the tasks in 'batch' into a batch. + // Assumes the batch is non-empty. + static Status ValidateBatch(const BatchT& batch); + + // Returns a boolean indicating whether a batch is formed from low priority + // tasks only or not. + bool IsLowPriorityBatch(const BatchT& batch) const; + + // Returns the smallest entry in 'allowed_batch_sizes_' that is greater than + // or equal to 'batch_size'. If 'allowed_batch_sizes_' is empty, simply + // returns 'batch_size'. + int RoundToLowestAllowedBatchSize(int batch_size, + bool is_low_priority_batch = false) const; + + // Helper function to propagate the status to the task's context and call the + // done callback on the task. + void CleanUpFunctionHelper(BatchTask& task, const Status& status) const; + + // Concatenates the input tensors of the tasks from the batch and the + // unbatched task vector. When padding is enabled in the batcher queue, they + // are padded with garbage value up to the nearest allowed batch size. + Status ConcatInputTensors( + const BatchT& batch, + const std::vector>& unbatched_tasks, + OpKernelContext* context, + std::vector* concatenated_tensors) const; + + Status SplitOutputTensors( + const std::vector& combined_outputs, BatchT* batch, + std::vector>& unbatched_tasks) const; + + void ProcessFuncBatch( + std::unique_ptr batch, + std::vector> unbatched_tasks = {}) const; + + // Processes a batch of one or more BatchTask entries. + void ProcessBatch(std::unique_ptr batch) const; + + // Callback function that wraps the Process*Batch functions above. The caller + // of the callback must guarantee that the unique pointers passed as argument + // are not null. + void ProcessBatchCallBack( + std::unique_ptr> batch, + std::vector> unbatched_tasks); + + // Emits an index tensor, which the Unbatch op will use to un-concatenate + // the tensor and attribute the pieces to the right batch keys. The index + // tensor contains, for each input: [batch_key, start_offset, end_offset] + // where start_offset and end_offset represent the range of entries in the + // concatenated tensors that belong to that input. + // + // Emits the result to the output at 'output_index' using 'context'. + static Status EmitIndexTensor(OpKernelContext* context, const BatchT& batch, + int output_index); + + // Looks up the batcher queue for 'queue_name'. If it didn't previously exist, + // creates it. + // + // The model_name and op_name are the names of the current model and + // operation, respectively. + Status LookupOrCreateBatcherQueue(const string& queue_name, + const string& model_name, + const string& op_name, + BatcherQueueT** queue); + + SessionMetadata session_metadata_; + + absl::Mutex outstanding_batch_mu_; + int num_outstanding_batched_items_ TF_GUARDED_BY(outstanding_batch_mu_) = 0; + + // True if user specified a batch processing function for this resource. + const bool has_process_batch_function_; + // A batch scheduler, and options for creating queues. 
+ std::shared_ptr batcher_; + BatcherT::QueueOptions batcher_queue_options_; + + // A batch scheduler, and options for creating queues. + std::shared_ptr adaptive_batcher_; + AdaptiveBatcherT::QueueOptions adaptive_batcher_queue_options_; + + // A collection of batcher queues, keyed on queue name. + // TODO(olston): Garbage-collect unused queues (perhaps simply remove empty + // ones (with a time delay?); it's okay if they get recreated later). + mutable mutex batcher_queues_mu_; + std::map> batcher_queues_ + TF_GUARDED_BY(batcher_queues_mu_); + + std::vector allowed_batch_sizes_; + // A concatenated string of , separated by ",". This is + // used to record batching parameter. + string allowed_batch_sizes_str_; +}; + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_RESOURCE_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler.h new file mode 100644 index 00000000..ccb34412 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler.h @@ -0,0 +1,601 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Abstractions for processing small tasks in a batched fashion, to reduce +// processing times and costs that can be amortized across multiple tasks. +// +// The core class is BatchScheduler, which groups tasks into batches. +// +// BatchScheduler encapsulates logic for aggregating multiple tasks into a +// batch, and kicking off processing of a batch on a thread pool it manages. +// +// This file defines an abstract BatchScheduler class. 
+ +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/criticality.h" +#include "tsl/profiler/lib/traceme.h" + +namespace tensorflow { +namespace serving { + +const absl::string_view kLowPriorityPaddingWithMaxBatchSizeAttrValue = + "low_priority_padding_with_max_batch_size"; +const absl::string_view kLowPriorityPaddingWithNextAllowedBatchSizeAttrValue = + "low_priority_padding_with_next_allowed_batch_size"; +const absl::string_view kPriorityIsolationAttrValue = "priority_isolation"; +const absl::string_view kPriorityMergeAttrValue = "priority_merge"; + +enum class MixedPriorityBatchingPolicy { + kLowPriorityPaddingWithMaxBatchSize, + kLowPriorityPaddingWithNextAllowedBatchSize, + kPriorityIsolation, + kPriorityMerge, +}; + +absl::StatusOr GetMixedPriorityBatchingPolicy( + absl::string_view attr_value); + +// The abstract superclass for a unit of work to be done as part of a batch. +// +// An implementing subclass typically contains (or points to): +// (a) input data; +// (b) a thread-safe completion signal (e.g. a Notification); +// (c) a place to store the outcome (success, or some error), upon completion; +// (d) a place to store the output data, upon success. +// +// Items (b), (c) and (d) are typically non-owned pointers to data homed +// elsewhere, because a task's ownership gets transferred to a BatchScheduler +// (see below) and it may be deleted as soon as it is done executing. +class BatchTask { + public: + virtual ~BatchTask() = default; + + // Returns the size of the task, in terms of how much it contributes to the + // size of a batch. (A batch's size is the sum of its task sizes.) + virtual size_t size() const = 0; + + // Returns the criticality of associated with the task. It defaults to + // kCritical. + virtual tsl::criticality::Criticality criticality() const { + return tsl::criticality::Criticality::kCritical; + } +}; + +// A thread-safe collection of BatchTasks. Tasks can be either added or removed +// from the TaskQueue. It is mainly used to hold the registered tasks without +// forming batches, so that the batches can be formed more flexibly right before +// they get scheduled for execution. +// +// Type parameter TaskType must be a subclass of BatchTask. +template +class TaskQueue { + public: + TaskQueue() = default; + + struct TaskWrapper { + std::unique_ptr task; + uint64 start_time_micros; + + TaskWrapper(std::unique_ptr task, uint64 start_time_micros) + : task(std::move(task)), start_time_micros(start_time_micros) {} + }; + + // Appends a task to the end of the queue with the given start time. + void AddTask(std::unique_ptr task, uint64 start_time_micros); + + // Adds a task to the front of the queue with the given start time. + void PrependTask(std::unique_ptr task, uint64 start_time_micros); + + // Removes a task from the front of the queue, i.e., the oldest task in the + // queue. 
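Editor's note: every scheduler in this directory operates on BatchTask subclasses like the one declared above, so a minimal concrete task may help. This is an illustrative sketch only; the class name and namespace are hypothetical.

#include <cstddef>

#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"

namespace example {

// A task wrapping a fixed number of input rows. Completion signalling and
// output storage (items (b)-(d) in the comment above) would live in
// caller-owned state referenced by the task.
class RowsTask : public tensorflow::serving::BatchTask {
 public:
  explicit RowsTask(size_t num_rows) : num_rows_(num_rows) {}
  size_t size() const override { return num_rows_; }
  // criticality() is left at its default (kCritical).

 private:
  const size_t num_rows_;
};

}  // namespace example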
+ std::unique_ptr RemoveTask(); + + // Removes tasks from the front of the queue as many as possible as long as + // the sum of sizes of the removed tasks don't exceed the 'size' given as the + // argument. + std::vector> RemoveTask(int size); + + // Returns the start time of the earliest task in the queue. If the queue is + // empty, return the null value. + std::optional EarliestTaskStartTime() const; + + // Returns true iff the queue contains 0 tasks. + bool empty() const; + + // Returns the number of tasks in the queue. + int num_tasks() const; + + // Returns the sum of the task sizes. + int size() const; + + private: + mutable mutex mu_; + + // Tasks in the queue. + std::deque tasks_ TF_GUARDED_BY(mu_); + + // The sum of the sizes of the tasks in 'tasks_'. + int size_ TF_GUARDED_BY(mu_) = 0; + + // Whether the queue is empty. + std::atomic empty_ TF_GUARDED_BY(mu_){true}; + + // The copy constructor and the assign op are deleted. + TaskQueue(const TaskQueue&) = delete; + void operator=(const TaskQueue&) = delete; +}; + +template +void TaskQueue::AddTask(std::unique_ptr task, + uint64 start_time_micros) { + { + mutex_lock l(mu_); + size_ += task->size(); + tasks_.emplace_back(std::move(task), start_time_micros); + empty_.store(false); + } +} + +template +void TaskQueue::PrependTask(std::unique_ptr task, + uint64 start_time_micros) { + { + mutex_lock l(mu_); + size_ += task->size(); + tasks_.emplace_front(std::move(task), start_time_micros); + empty_.store(false); + } +} + +template +std::unique_ptr TaskQueue::RemoveTask() { + { + mutex_lock l(mu_); + if (tasks_.empty()) { + return nullptr; + } + std::unique_ptr task = std::move(tasks_.front().task); + size_ -= task->size(); + tasks_.pop_front(); + if (tasks_.empty()) { + empty_.store(true); + } + return task; + } +} + +template +std::vector> TaskQueue::RemoveTask( + int size) { + { + mutex_lock l(mu_); + if (tasks_.empty()) { + return {}; + } + + int size_lower_bound = size_ - size; + std::vector> remove_tasks; + while (!tasks_.empty() && + size_ - static_cast(tasks_.front().task->size()) >= + size_lower_bound) { + size_ -= static_cast(tasks_.front().task->size()); + remove_tasks.push_back(std::move(tasks_.front().task)); + tasks_.pop_front(); + if (tasks_.empty()) { + empty_.store(true); + } + } + return remove_tasks; + } +} + +template +bool TaskQueue::empty() const { + { + mutex_lock l(mu_); + return empty_.load(); + } +} + +template +std::optional TaskQueue::EarliestTaskStartTime() const { + { + mutex_lock l(mu_); + + if (tasks_.empty()) { + return std::nullopt; + } + + return tasks_.front().start_time_micros; + } +} + +template +int TaskQueue::num_tasks() const { + { + mutex_lock l(mu_); + return tasks_.size(); + } +} + +template +int TaskQueue::size() const { + { + mutex_lock l(mu_); + return size_; + } +} + +// A thread-safe collection of BatchTasks, to be executed together in some +// fashion. +// +// At a given time, a batch is either "open" or "closed": an open batch can +// accept new tasks; a closed one cannot. A batch is monotonic: initially it is +// open and tasks can be added to it; then it is closed and its set of tasks +// remains fixed for the remainder of its life. A closed batch cannot be re- +// opened. +// +// Type parameter TaskType must be a subclass of BatchTask. +template +class Batch { + public: + Batch(); + explicit Batch(uint64 traceme_context_id); + virtual ~Batch(); // Blocks until the batch is closed. + + // Appends 'task' to the batch. 
After calling AddTask(), the newly-added task + // can be accessed via task(num_tasks()-1) or mutable_task(num_tasks()-1). + // Dies if the batch is closed. + void AddTask(std::unique_ptr task, uint64 start_time_micros = 0); + + // Removes the most recently added task. Returns nullptr if the batch is + // empty. + std::unique_ptr RemoveTask(); + + // Caller takes ownership of returned tasks. + // Must be called after a batch is closed. + std::vector> RemoveAllTasks(); + + // Returns the number of tasks in the batch. + int num_tasks() const; + + // Returns true iff the batch contains 0 tasks. + bool empty() const; + + // Returns a reference to the ith task (in terms of insertion order). + const TaskType& task(int i) const; + + // Returns a pointer to the ith task (in terms of insertion order). + // + // Caller doesn't take ownership. + TaskType* mutable_task(int i); + + // Returns the sum of the task sizes. + size_t size() const; + + // Returns true iff the batch is currently closed. + bool IsClosed() const; + + // Blocks until the batch is closed. + void WaitUntilClosed() const; + + // Marks the batch as closed. Dies if called more than once. + void Close(); + + // Returns the TraceMe context id of this batch. + uint64 traceme_context_id() const; + + // Attempts to trim this batch to a new, smaller size (not to be confused with + // the number of tasks in the batch). On success, the trimmed tasks go into + // 'out_trimmed_tasks' in the same order the tasks were in this batch. + // + // The method might not succeed if it needs to split a large task to hit the + // correct size. + void TryTrimToNewSize( + int new_size, std::vector>& out_trimmed_tasks); + + // Returns the start time of the earliest task in the queue. If the queue is + // empty, return the null value. + std::optional EarliestTaskStartTime() const; + + private: + mutable mutex mu_; + + // The tasks in the batch. + std::vector> tasks_ TF_GUARDED_BY(mu_); + + // The sum of the sizes of the tasks in 'tasks_'. + size_t size_ TF_GUARDED_BY(mu_) = 0; + + std::atomic empty_ TF_GUARDED_BY(mu_){true}; + + // Whether the batch has been closed. + Notification closed_; + + // The TracMe context id. + const uint64 traceme_context_id_; + + // The minimum start time of all tasks in the batch. + // If the batch is empty, the value is undefined. + uint64 earliest_task_start_time_micros_ TF_GUARDED_BY(mu_); + + Batch(const Batch&) = delete; + void operator=(const Batch&) = delete; +}; + +// An abstract batch scheduler class. Collects individual tasks into batches, +// and processes each batch on a pool of "batch threads" that it manages. The +// actual logic for processing a batch is accomplished via a callback. +// +// Type parameter TaskType must be a subclass of BatchTask. +template +class BatchScheduler { + public: + virtual ~BatchScheduler() = default; + + // Submits a task to be processed as part of a batch. + // + // Ownership of '*task' is transferred to the callee iff the method returns + // Status::OK. In that case, '*task' is left as nullptr. Otherwise, '*task' is + // left as-is. + // + // If no batch processing capacity is available to process this task at the + // present time, and any task queue maintained by the implementing subclass is + // full, this method returns an UNAVAILABLE error code. The client may retry + // later. + // + // Other problems, such as the task size being larger than the maximum batch + // size, yield other, permanent error types. 
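Editor's note: a short sketch of the Batch lifecycle described above (add, close, drain); the function name is hypothetical and the task type is left as a template parameter so the sketch stays independent of any concrete BatchTask subclass.

#include <memory>
#include <utility>
#include <vector>

#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"

template <typename TaskType>
std::vector<std::unique_ptr<TaskType>> FillAndDrain(std::unique_ptr<TaskType> t1,
                                                    std::unique_ptr<TaskType> t2) {
  tensorflow::serving::Batch<TaskType> batch;
  batch.AddTask(std::move(t1));
  batch.AddTask(std::move(t2));
  batch.Close();                  // a batch must be closed before RemoveAllTasks(),
  return batch.RemoveAllTasks();  // and before destruction, which blocks until closed
}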
+ // + // In all cases, this method returns "quickly" without blocking for any + // substantial amount of time. If the method returns Status::OK, the task is + // processed asynchronously, and any errors that occur during the processing + // of the batch that includes the task can be reported to 'task'. + virtual absl::Status Schedule(std::unique_ptr* task) = 0; + + // Returns the number of tasks that have been scheduled (i.e. accepted by + // Schedule()), but have yet to be handed to a thread for execution as part of + // a batch. Note that this returns the number of tasks, not the aggregate task + // size (so if there is one task of size 3 and one task of size 5, this method + // returns 2 rather than 8). + virtual size_t NumEnqueuedTasks() const = 0; + + // Returns a guaranteed number of size 1 tasks that can be Schedule()d without + // getting an UNAVAILABLE error. In a typical implementation, returns the + // available space on a queue. + // + // There are two important caveats: + // 1. The guarantee does not extend to varying-size tasks due to possible + // internal fragmentation of batches. + // 2. The guarantee only holds in a single-thread environment or critical + // section, i.e. if an intervening thread cannot call Schedule(). + // + // This method is useful for monitoring, or for guaranteeing a future slot in + // the schedule (but being mindful about the caveats listed above). + virtual size_t SchedulingCapacity() const = 0; + + // Returns the maximum allowed size of tasks submitted to the scheduler. (This + // is typically equal to a configured maximum batch size.) + virtual size_t max_task_size() const = 0; +}; + +////////// +// Implementation details follow. API users need not read. + +template +Batch::Batch() : Batch(0) {} + +template +Batch::Batch(uint64 traceme_context_id) + : traceme_context_id_(traceme_context_id) {} + +template +Batch::~Batch() { + WaitUntilClosed(); +} + +template +void Batch::AddTask(std::unique_ptr task, + uint64 start_time_micros) { + DCHECK(!IsClosed()); + { + mutex_lock l(mu_); + size_ += task->size(); + tasks_.push_back(std::move(task)); + empty_.store(false); + if (tasks_.size() == 1) { + earliest_task_start_time_micros_ = start_time_micros; + } else { + earliest_task_start_time_micros_ = + std::min(earliest_task_start_time_micros_, start_time_micros); + } + } +} + +template +std::optional Batch::EarliestTaskStartTime() const { + { + mutex_lock l(mu_); + if (tasks_.empty()) { + return std::nullopt; + } + return earliest_task_start_time_micros_; + } +} + +template +std::vector> Batch::RemoveAllTasks() { + DCHECK(IsClosed()); + { + mutex_lock l(mu_); + size_ = 0; + empty_.store(true); + std::vector> tasks_to_return; + + // Swapping vector takes constant time. + tasks_to_return.swap(tasks_); + return std::move(tasks_to_return); + } +} + +template +std::unique_ptr Batch::RemoveTask() { + { + mutex_lock l(mu_); + if (tasks_.empty()) { + return nullptr; + } + std::unique_ptr task = std::move(tasks_.back()); + size_ -= task->size(); + tasks_.pop_back(); + if (tasks_.empty()) { + empty_.store(true); + } + return task; + } +} + +template +int Batch::num_tasks() const { + { + mutex_lock l(mu_); + return tasks_.size(); + } +} + +template +bool Batch::empty() const TF_NO_THREAD_SAFETY_ANALYSIS { + // tracer is added to zoom in about this method. + // TODO(b/160249203): Remove tracer after evaluating a change to reduce + // lock contention and cpu usage (which is observed in profiler and + // very data-driven). 
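Editor's note: a caller-side sketch of the Schedule() ownership contract documented above. `SubmitOrReport` is a hypothetical wrapper, not part of this header; `scheduler` stands in for any concrete BatchScheduler implementation.

#include <memory>

#include "absl/status/status.h"
#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"

template <typename TaskType>
absl::Status SubmitOrReport(tensorflow::serving::BatchScheduler<TaskType>& scheduler,
                            std::unique_ptr<TaskType> task) {
  absl::Status status = scheduler.Schedule(&task);
  if (status.ok()) {
    // Ownership moved into the scheduler; `task` is now nullptr and the batch
    // containing it will be processed asynchronously.
    return status;
  }
  // On UNAVAILABLE the queue was full and `task` is still owned here, so the
  // caller may retry later or fail the request.
  return status;
}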
+ tsl::profiler::TraceMe tracer("BatchTask::empty"); + return empty_.load(); +} + +template +const TaskType& Batch::task(int i) const { + DCHECK_GE(i, 0); + { + mutex_lock l(mu_); + DCHECK_LT(i, tasks_.size()); + return *tasks_[i].get(); + } +} + +template +TaskType* Batch::mutable_task(int i) { + DCHECK_GE(i, 0); + { + mutex_lock l(mu_); + DCHECK_LT(i, tasks_.size()); + return tasks_[i].get(); + } +} + +template +size_t Batch::size() const { + { + mutex_lock l(mu_); + return size_; + } +} + +template +bool Batch::IsClosed() const { + return const_cast(&closed_)->HasBeenNotified(); +} + +template +void Batch::WaitUntilClosed() const { + const_cast(&closed_)->WaitForNotification(); +} + +template +void Batch::Close() { + closed_.Notify(); +} + +template +uint64 Batch::traceme_context_id() const { + return traceme_context_id_; +} + +template +void Batch::TryTrimToNewSize( + int new_size, std::vector>& out_trimmed_tasks) { + mutex_lock l(mu_); + DCHECK_GT(new_size, 0); + DCHECK_LT(new_size, size_); + DCHECK(out_trimmed_tasks.empty()); + + // Index of the first task to trim away. It is possible that it is the index + // of a task of size larger than 1 that will have to be split in order to get + // to the target new_size. + int32 first_task_to_move = 0; + // The sum of sizes of tasks i, where i < first_task_to_move. + int32 size_of_previous_tasks = 0; + while (size_of_previous_tasks + tasks_[first_task_to_move]->size() <= + new_size) { + size_of_previous_tasks += tasks_[first_task_to_move]->size(); + first_task_to_move++; + // The loop must always stop before this check is tripped because new_size + // must never be larger than the size of the batch. + DCHECK_LT(first_task_to_move, tasks_.size()); + } + + // Check whether task 'first_task_to_move' will have to be split. + if (size_of_previous_tasks < new_size) { + // TODO: b/325954758 - Consider supporting splitting large tasks and then + // drop 'Try' from the method name. + return; + } + DCHECK_EQ(size_of_previous_tasks, new_size); + + // Actually trim. + out_trimmed_tasks.reserve(tasks_.size() - first_task_to_move); + std::move(tasks_.begin() + first_task_to_move, tasks_.end(), + std::back_inserter(out_trimmed_tasks)); + tasks_.resize(first_task_to_move); + size_ = new_size; +} + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler_utils.h new file mode 100644 index 00000000..9a6deb1a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_scheduler_utils.h @@ -0,0 +1,157 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_UTILS_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/time/time.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_stats.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace serving { + +// Returns the next allowed batch size, which is the smallest allowed batch size +// greater than or equal to the given batch size. If allowed_batch_sizes, +// returns batch_size as is. +int GetNextAllowedBatchSize(int batch_size, + const std::vector& allowed_batch_sizes, + bool disable_padding); + +// Returns the largest allowed batch size that is smaller than or equal to +// batch_size. Returns batch_size if no such size exists. +int GetPrevAllowedBatchSize(int batch_size, + const std::vector& allowed_batch_sizes, + bool disable_padding); + +// Constants containing possible values for the batch_padding_policy argument +// of MaybeBatchDown. This argument specifies the policy that a batch scheduler +// is using when deciding what to do when, say, 18 requests need to be batched, +// but only 16 and 32 batch sizes are allowed. The following options are +// available. +// +// - PAD_UP: pad to size 32. +// - BATCH_DOWN: schedule a batch of size 16 and leave 2 requests in the +// batch buffer. +// - MINIMIZE_TPU_COST_PER_REQUEST: a smarter greedy policy that chooses +// to either PAD_UP or BATCH_DOWN so as to minimize the TPU costs per +// real request. In this case, it would compare (batch_16_cost / 16) and +// (batch_32_cost / 18). +// +inline constexpr absl::string_view kBatchDownPolicy = "BATCH_DOWN"; +inline constexpr absl::string_view kPadUpPolicy = "PAD_UP"; +inline constexpr absl::string_view kMinimizeTpuCostPerRequestPolicy = + "MINIMIZE_TPU_COST_PER_REQUEST"; + +// Trims the batch to the next allowed batch size when possible and when +// configured by batch_padding_policy. +// +// When trimming, this function puts the trimmed tasks go into the +// out_trimmed_tasks vector in the same order as they were in the batch. +template +void MaybeBatchDown(Batch& batch, + const std::vector& allowed_batch_sizes, + bool disable_padding, + absl::string_view batch_padding_policy, + ModelBatchStats* model_batch_stats, + std::vector>& out_trimmed_tasks) { + if (batch_padding_policy == kPadUpPolicy) { + // This is the default behavior of batch resource when it is given a batch + // size that doesn't match any of the allowed batch sizes. 
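Editor's note: an illustrative use of the two lookup helpers declared above, on the 18-requests / {16, 32} example from the comment. The expected results follow from the documented contract; the function name is hypothetical.

#include <vector>

#include "tensorflow/core/kernels/batching_util/batch_scheduler_utils.h"
#include "tensorflow/core/platform/types.h"

void AllowedSizeLookupExample() {
  const std::vector<tensorflow::int32> allowed = {16, 32};
  int pad_up = tensorflow::serving::GetNextAllowedBatchSize(
      /*batch_size=*/18, allowed, /*disable_padding=*/false);  // -> 32
  int batch_down = tensorflow::serving::GetPrevAllowedBatchSize(
      /*batch_size=*/18, allowed, /*disable_padding=*/false);  // -> 16
  (void)pad_up;
  (void)batch_down;
}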
+ return; + } + bool minimize_tpu_cost_per_request; + if (batch_padding_policy == kBatchDownPolicy) { + minimize_tpu_cost_per_request = false; + } else if (batch_padding_policy == kMinimizeTpuCostPerRequestPolicy) { + if (model_batch_stats == nullptr) { + LOG_FIRST_N(ERROR, 1) + << kMinimizeTpuCostPerRequestPolicy + << " batch padding policy has been chosen " + "but no ModelBatchStats passed to the batch scheduler; will " + "fall back on the " + << kPadUpPolicy << " policy."; + return; + } + minimize_tpu_cost_per_request = true; + } else { + LOG_FIRST_N(ERROR, 1) << "Unsupported batch_padding_policy: " + << batch_padding_policy << ", falling back on the " + << kPadUpPolicy << " policy."; + return; + } + + int32 batch_size = batch.size(); + + int32 pad_up_size = + GetNextAllowedBatchSize(batch_size, allowed_batch_sizes, disable_padding); + if (pad_up_size == batch_size) { + return; // Good, no padding is necessary. + } + + int32 batch_down_size = + GetPrevAllowedBatchSize(batch_size, allowed_batch_sizes, disable_padding); + if (batch_down_size == batch_size) { + return; // Can't batch down (e.g. no smaller batch size available). + } + + if (minimize_tpu_cost_per_request) { + // TODO: b/325954758 - Consider logging a warning here or elsewhere if + // a larger batch doesn't cost meaningfully cheaper than a smaller batch. + // TODO: b/325954758 - Consider logging a warning here or elsewhere if a + // smaller batch costs unreasonably cheaper than a larger one (assuming + // a batch cost model = constant_cost + batch_size * per_element_cost). + // TODO: b/325954758 - Consider occasionally picking either batch size so + // that we learn fresh costs of each batch size. For this code, it is not a + // large priority though because if we are in between two allowed batch + // sizes (say, 16 and 32), chances are that will occasionally organically + // get batches of exact sizes 16 and 32 (and then we pick those + // unconditionally). But if we explicitly occasionally explored other batch + // sizes, we wouldn't have to rely on this "chances are". For other + // applications of batch costs, we might also want to occasionally explore + // all allowed batch sizes and not just 16 and 32 from this example. + std::optional down_batch_cost = + model_batch_stats->batch_size(batch_down_size).tpu_cost().mean(); + std::optional up_batch_cost = + model_batch_stats->batch_size(pad_up_size).tpu_cost().mean(); + if (!down_batch_cost.has_value() || !up_batch_cost.has_value()) { + // We have no data about batch costs, let's just do nothing. + return; + } + + auto batch_down_cost_per_request = *down_batch_cost / batch_down_size; + auto pad_up_cost_per_request = *up_batch_cost / batch_size; + + if (pad_up_cost_per_request < batch_down_cost_per_request) { + // Abort batching down because it's cheaper to pad up. + return; + } + } + + // Batch down. + batch.TryTrimToNewSize(batch_down_size, out_trimmed_tasks); +} + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_stats.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_stats.h new file mode 100644 index 00000000..87c36fca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/batch_stats.h @@ -0,0 +1,274 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
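Editor's note: the per-request comparison performed above, worked through with made-up mean costs for the 18-requests / {16, 32} example. The function and the cost values are purely illustrative.

#include "absl/time/time.h"

inline bool ShouldBatchDownExample() {
  const absl::Duration batch_16_cost = absl::Milliseconds(10);
  const absl::Duration batch_32_cost = absl::Milliseconds(16);
  const double down_per_request =
      absl::ToDoubleMilliseconds(batch_16_cost) / 16;  // 0.625 ms per request
  const double up_per_request =
      absl::ToDoubleMilliseconds(batch_32_cost) / 18;  // ~0.889 ms per real request
  // Mirrors the comparison in MaybeBatchDown: pad up only if it is cheaper per
  // real request than batching down; otherwise trim the batch to 16. If
  // batch_32_cost were 11 ms (~0.611 ms/request), padding up would win instead.
  return !(up_per_request < down_per_request);
}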
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// The API for reporting and querying batch statistics such as the average batch +// costs for in-process use. +// +// All these statistics can also be retrieved from metrics reported by various +// modules (e.g., batch_resource_base), but it would be slow. This API, on the +// other hand, was designed to be queried on every request. +// +// The classes defined here are not supposed to be instantiated by the user. +// Instead, this file provides a single entry point: +// +// BatchStatsRegistry& GlobalBatchStatsRegistry(); +// +// For example, to register batch cost, do: +// +// GlobalBatchStatsRegistry() +// .model(/* model_name= */ "m", /* op_name= */ "o") +// .batch_size(4) +// .tpu_cost +// .Register(cost); +// +// To get the mean cost later, do: +// +// std::optional cost = +// .GlobalBatchStatsRegistry() +// .model(/* model_name= */ "m", /* op_name= */ "o") +// .batch_size(4) +// .tpu_cost +// .mean(); +// +// It is allowed and safe to store references to intermediate objects here +// because all intermediate objects are guaranteed to never be destroyed. +// +// All operations supported by this API are thread-safe. + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_STATS_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_STATS_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/node_hash_map.h" +#include "absl/time/time.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow::serving { + +// Default values for when there is no recorded statistic in ModelBatchStats. +constexpr int64_t kNumBatchThreadsUnknown = -1; +constexpr int64_t kBatchTimeoutMicrosUnknown = -1; + +// Tracks the average cost of registered samples. +// +// Thread-safe. +class CostTracker { + public: + // Registers a cost sample. + void Register(absl::Duration cost) { + DCHECK_GT(cost, absl::ZeroDuration()); + + mutex_lock l(mu_); + sample_count_++; + sample_sum_ += cost; + }; + + // Returns the average cost of all registered samples, giving each sample + // the same weight. + // + // Returns std::nullopt if no samples have been registered. + // + // TODO: b/325954758 - Switch this to an exponentially-decaying average. It's + // likely enough to set the half-life to the last 100-1000 samples. + std::optional mean() const { + int64_t count; + absl::Duration sum; + + { + // We only hold the lock to read the values and release it before later + // performing a relatively slow division operation. 
+ mutex_lock l(mu_); + count = sample_count_; + sum = sample_sum_; + } + + if (count == 0) return std::nullopt; + + return sum / count; + }; + + private: + mutable mutex mu_; + + int64_t sample_count_ TF_GUARDED_BY(mu_) = 0; + absl::Duration sample_sum_ TF_GUARDED_BY(mu_); +}; + +// Tracks statistics for a particular model and batch size. +// +// Thread-safe. +class BatchSizeStats { + public: + CostTracker& tpu_cost() { return tpu_cost_; }; + + private: + CostTracker tpu_cost_; +}; + +// Tracks statistics for a particular model. +// +// Here, "model" means a specific version of a model (we assume that version is +// encoded in the op_name). In rare cases, when a model version has multiple +// BatchFunction operation, we also treat each such operation as a separate +// model in this context (they should also have different op_names). +// +// Thread-safe. +class ModelBatchStats { + public: + // Returns a reference to the BatchSizeStats instance for the given batch + // size. + // + // The returned reference persist for as long as 'this' is alive. + BatchSizeStats& batch_size(int32 batch_size) { + mutex_lock l(mu_); + return batch_size_stats_by_batch_size_[batch_size]; + } + + // Registers that the model server has processed a batch of size `size` + // non-padding tasks for this model, updating the current cumulative + // processed size. + void RegisterProcessedSize(int64_t size) { + cumulative_processed_size_.fetch_add(size, std::memory_order_relaxed); + } + + // Returns the cumulative size processed by this model (the total + // count of individual unit-sized queries processed by the model). + int64_t cumulative_processed_size() const { + return cumulative_processed_size_.load(std::memory_order_relaxed); + } + + // Returns the list of batch sizes for which this model has statistics. + // + // The returned list is not guaranteed to be sorted. + std::vector BatchSizes() const { + std::vector result; + mutex_lock l(mu_); + result.reserve(batch_size_stats_by_batch_size_.size()); + for (const auto& [key, value] : batch_size_stats_by_batch_size_) { + result.push_back(key); + } + return result; + } + + void SetNumBatchThreads(int64_t num_batch_threads) { + num_batch_threads_.store(num_batch_threads, std::memory_order_relaxed); + } + + int64_t num_batch_threads() const { + return num_batch_threads_.load(std::memory_order_relaxed); + } + + void SetBatchTimeoutMicros(int64_t batch_timeout_micros) { + batch_timeout_micros_.store(batch_timeout_micros, + std::memory_order_relaxed); + } + + int64_t batch_timeout_micros() const { + return batch_timeout_micros_.load(std::memory_order_relaxed); + } + + private: + mutable mutex mu_; + + // The storage of all BatchSizeStats instances. + // + // The mutex only protects adding/finding element in the map. Access to + // elements themselves (after they were created) is not protected here. No + // element deletion is possible because we return references to items in this + // map and don't track their lifetime. We are using the node hash map so that + // elements, once created, are fixed in memory. + absl::node_hash_map batch_size_stats_by_batch_size_ + TF_GUARDED_BY(mu_); + + // The total count of individual unit-sized queries processed by this model. + // Can be used to generate an internal load metric per model. See + // RegisterQuerySize for more details. + std::atomic cumulative_processed_size_ = 0; + + // The number of batch threads assigned to this model. 
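Editor's note: a small read-side sketch combining BatchSizes(), batch_size() and CostTracker::mean() declared in this header. The model and op names are placeholders; the function itself is hypothetical.

#include <optional>

#include "absl/time/time.h"
#include "tensorflow/core/kernels/batching_util/batch_stats.h"
#include "tensorflow/core/platform/logging.h"

inline void LogMeanTpuCosts() {
  auto& model_stats = tensorflow::serving::GlobalBatchStatsRegistry().model(
      /* model_name= */ "m", /* op_name= */ "o");
  for (int batch_size : model_stats.BatchSizes()) {
    std::optional<absl::Duration> mean =
        model_stats.batch_size(batch_size).tpu_cost().mean();
    if (mean.has_value()) {
      LOG(INFO) << "batch_size=" << batch_size << " mean_tpu_cost=" << *mean;
    }
  }
}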
+ std::atomic num_batch_threads_ = kNumBatchThreadsUnknown; + + // The timeout in microseconds for this model (after which the current batch + // is sent to be processed by the TPU). + std::atomic batch_timeout_micros_ = kBatchTimeoutMicrosUnknown; +}; + +// Tracks batch statistics for all models. +// +// Thread-safe. +class BatchStatsRegistry { + public: + // Returns a reference to ModelBatchStats for the provided model_name and + // op_name. + // + // Upon invocation with a not-yet-seen arguments, creates an empty + // ModelBatchStats instance. + // + // The returned reference persist for as long as 'this' is alive. + ModelBatchStats& model(const std::string& model_name, + const std::string& op_name) { + std::tuple key(model_name, op_name); + mutex_lock l(mu_); + return model_batch_stats_by_model_and_op_names_[key]; + } + + // Returns a list of all model and op names. + // + // This is the set of model/op names tracked by this BatchStats instance. + // Note that the returned list is not guaranteed to be sorted. + std::vector> ModelAndOpNames() const { + std::vector> result; + mutex_lock l(mu_); + result.reserve(model_batch_stats_by_model_and_op_names_.size()); + for (const auto& [key, value] : model_batch_stats_by_model_and_op_names_) { + result.push_back(key); + } + return result; + } + + private: + mutable mutex mu_; + + // The storage of all ModelBatchStats instances. + // + // The mutex only protects adding/finding element in the map. Access to + // elements themselves (after they were created) is not protected here. No + // element deletion is possible because we return references to items in this + // map and don't track their lifetime. We are using the node hash map for + // element pointer stability. + absl::node_hash_map, ModelBatchStats> + model_batch_stats_by_model_and_op_names_ TF_GUARDED_BY(mu_); +}; + +// Returns the global instance of BatchStats, to use used for all production +// purposes (one should only instantiate individual classes from this file to +// test them). +inline BatchStatsRegistry& GlobalBatchStatsRegistry() { + static BatchStatsRegistry* instance = new BatchStatsRegistry(); + return *instance; +} + +} // namespace tensorflow::serving + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/bounded_executor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/bounded_executor.h new file mode 100644 index 00000000..804a3790 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/bounded_executor.h @@ -0,0 +1,80 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BOUNDED_EXECUTOR_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BOUNDED_EXECUTOR_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/threadpool_interface.h" + +namespace tensorflow { +namespace serving { +// BoundedExecutor has a bounded number of threads and unlimited queue length, +// scheduled tasks are executed in a FIFO way. +class BoundedExecutor : public thread::ThreadPoolInterface { + public: + struct Options { + Env* env = Env::Default(); + ThreadOptions thread_options; + std::string thread_name; + int num_threads = -1; + }; + + static absl::StatusOr> Create( + const Options& options); + + // Destructor. All threads will be joined. + ~BoundedExecutor() override; + + // Enqueue a function to be executed. + // + // Callers are responsible to guarantee `func` is not nullptr. + void Schedule(std::function func) override; + + // Returns the number of threads. + int NumThreads() const override; + + int CurrentThreadId() const override; + + private: + explicit BoundedExecutor(const Options& options); + + // Starts N workers (N == num_threads), polling tasks from `work_queue_`. + void InitWorker(); + + // A loop to fetch task from `work_queue_` and execute task. + void Run(); + + const Options& options_; + + mutex work_queue_mu_; + std::deque> work_queue_ TF_GUARDED_BY(work_queue_mu_); + condition_variable work_queue_cv_ TF_GUARDED_BY(work_queue_mu_); + + // A fixed number of threads. + std::vector> threads_; + BoundedExecutor(const BoundedExecutor&) = delete; + void operator=(const BoundedExecutor&) = delete; +}; + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BOUNDED_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/concat_split_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/concat_split_util.h new file mode 100644 index 00000000..b5354be3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/concat_split_util.h @@ -0,0 +1,253 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
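Editor's note: a minimal sketch of creating the executor declared above and scheduling one task; the thread name and thread count are arbitrary example values, and the wrapper function is hypothetical.

#include <memory>
#include <utility>

#include "tensorflow/core/kernels/batching_util/bounded_executor.h"
#include "tensorflow/core/platform/logging.h"

inline void BoundedExecutorExample() {
  tensorflow::serving::BoundedExecutor::Options options;
  options.thread_name = "example_pool";
  options.num_threads = 4;
  auto executor_or = tensorflow::serving::BoundedExecutor::Create(options);
  if (!executor_or.ok()) {
    LOG(ERROR) << executor_or.status();
    return;
  }
  std::unique_ptr<tensorflow::serving::BoundedExecutor> executor =
      std::move(executor_or).value();
  executor->Schedule([] { LOG(INFO) << "ran on a bounded executor thread"; });
  // Destruction joins all worker threads.
}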
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_CONCAT_SPLIT_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_CONCAT_SPLIT_UTIL_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/ops_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/kernels/split_lib.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace concat_split_util { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// Concatenates 'inputs' into a single tensor along the zeroth dimension. +// Requires that all elements of 'inputs' have element type T. Writes to +// 'output' using 'context' for the allocation to ensure proper device +// placement. +template +absl::Status Concat(OpKernelContext* context, + const absl::Span inputs, Tensor* output) { + const int input_dims = inputs[0].dims(); + const TensorShape& input_shape = inputs[0].shape(); + + // Note that we reduce the concat of k-dimensional tensors into a two + // dimensional concat. Assuming the dimensions of any input tensor are + // {y0, y1,...,ym-1}, we flatten it to {1, y}, where y = Prod_i(yi). + std::vector::ConstMatrix>> inputs_flat; + inputs_flat.reserve(inputs.size()); + int64_t output_dim0 = 0; + for (size_t i = 0; i < inputs.size(); ++i) { + const Tensor& input = inputs[i]; + if (input.dims() != input_dims) { + return errors::InvalidArgument( + "Ranks of all input tensors should match: shape[0] = ", + input_shape.DebugString(), " vs. shape[", i, + "] = ", input.shape().DebugString()); + } + for (int j = 1; j < input_dims; ++j) { + if (input.dim_size(j) != input_shape.dim_size(j)) { + return errors::InvalidArgument( + "Dimensions of inputs should match: shape[0] = ", + input_shape.DebugString(), " vs. shape[", i, + "] = ", input.shape().DebugString()); + } + } + if (input.NumElements() > 0) { + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + input.shaped({1, input.NumElements()}))); + } + output_dim0 += input.dim_size(0); + } + + TensorShape output_shape(input_shape); + output_shape.set_dim(0, output_dim0); + AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum::value, + output_shape, output, attr)); + if (output->NumElements() > 0) { + auto output_flat = output->shaped({1, output->NumElements()}); +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) + if (std::is_same::value) { + ConcatGPU(context, inputs_flat, output, &output_flat); + return OkStatus(); + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + ConcatCPU(context->device(), inputs_flat, &output_flat); + } + + return absl::OkStatus(); +} + +// Same as 'Concat' above, but handles Tensor dtype deduction automatically. 
+inline absl::Status Concat(OpKernelContext* context, + const absl::Span inputs, + Tensor* output) { + const DataType type = inputs[0].dtype(); + absl::Status concat_status; + switch (type) { +#define CASE(type) \ + case DataTypeToEnum::value: \ + concat_status = Concat(context, inputs, output); \ + break; + TF_CALL_ALL_TYPES(CASE); +#undef CASE + default: + concat_status = errors::InvalidArgument("Unsupported data type: ", type); + break; + } + return concat_status; +} + +// The Split*() functions split 'input' with element type T into 'sizes.size()' +// tensors along the zeroth dimension, with the ith split having zeroth- +// dimension size 'sizes[i]'. They allocate the output tensors using 'context', +// for proper device placement. + +// Handles special cases that are cheap. Sets 'done==true' iff it found an +// applicable special case and wrote to the outputs. Otherwise acts as a no-op. +template +absl::Status SplitEasyCases(OpKernelContext* context, const Tensor& input, + const absl::Span sizes, + std::vector* outputs, bool* done) { + *done = false; + + int64_t total_size = 0; + for (const int64_t size : sizes) { + total_size += size; + } + if (total_size > input.shape().dim_size(0)) { + return errors::InvalidArgument( + "Sum of split sizes must not exceed dim0-size of input tensor"); + } + + // Special case 0: trivial 1-way split. + if (sizes.size() == 1 && sizes.at(0) == input.shape().dim_size(0)) { + outputs->push_back(input); + *done = true; + return absl::OkStatus(); + } + + // Special case 1: input is aligned. + if (IsInnerDimsSizeAligned(input.shape())) { + int64_t position = 0; + for (const int64_t size : sizes) { + outputs->emplace_back(input.Slice(position, position + size)); + position += size; + } + *done = true; + return absl::OkStatus(); + } + + return absl::OkStatus(); +} + +// Handles the general case, on CPU. +template +absl::Status SplitCPU(OpKernelContext* context, const Tensor& input, + const absl::Span sizes, + std::vector* outputs) { + int64_t suffix_dim_size = 1; + for (int i = 1; i < input.shape().dims(); ++i) { + suffix_dim_size *= input.shape().dim_size(i); + } + auto input_reshaped = + input.shaped({input.shape().dim_size(0), suffix_dim_size}); + + int64_t position = 0; + for (const int64_t size : sizes) { + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, size); + Tensor output; + AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR( + context->allocate_temp(input.dtype(), output_shape, &output, attr)); + auto output_shaped = output.shaped({size, suffix_dim_size}); + + Eigen::DSizes slice_indices{ + static_cast(position), 0}; + Eigen::DSizes slice_sizes{ + static_cast(size), + static_cast(suffix_dim_size)}; + functor::Split()(context->eigen_device(), + output_shaped, input_reshaped, + slice_indices, slice_sizes); + + outputs->emplace_back(output); + + position += size; + } + + return absl::OkStatus(); +} + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) + +// Handles the general case, on GPU. +template +Status SplitGPU(OpKernelContext* context, const Tensor& input, + const gtl::ArraySlice& sizes, + std::vector* outputs) { + // TODO(olston, apassos): Implement this. + LOG(FATAL) << "Not yet implemented"; // Crash ok +} + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// The outer function that dispatches to the various Split*() functions above. 
+template +absl::Status Split(OpKernelContext* context, const Tensor& input, + const absl::Span sizes, + std::vector* outputs) { + bool easy_cases_done; + TF_RETURN_IF_ERROR( + SplitEasyCases(context, input, sizes, outputs, &easy_cases_done)); + if (easy_cases_done) { + return absl::OkStatus(); + } + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +// TODO(olston, apassos): Handle non-CPU cases. +// return SplitGPU(context, input, sizes, outputs); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + return SplitCPU(context, input, sizes, outputs); +} + +// Same as 'Split' above, but handles Tensor dtype automatically. +inline absl::Status Split(OpKernelContext* context, const Tensor& input, + const absl::Span sizes, + std::vector* outputs) { + const DataType type = input.dtype(); + absl::Status split_status; + switch (type) { +#define CASE(type) \ + case DataTypeToEnum::value: \ + split_status = Split(context, input, sizes, outputs); \ + break; + TF_CALL_ALL_TYPES(CASE); +#undef CASE + default: + split_status = errors::InvalidArgument("Unsupported data type: ", type); + break; + } + return split_status; +} + +} // namespace concat_split_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_CONCAT_SPLIT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/fake_clock_env.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/fake_clock_env.h new file mode 100644 index 00000000..6fc8d9e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/fake_clock_env.h @@ -0,0 +1,77 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace serving { +namespace test_util { + +// An Env implementation with a fake clock for NowMicros() and +// SleepForMicroseconds(). The clock doesn't advance on its own; it advances via +// an explicit Advance() method. +// All other Env virtual methods pass through to a wrapped Env. +class FakeClockEnv : public EnvWrapper { + public: + explicit FakeClockEnv(Env* wrapped); + ~FakeClockEnv() override = default; + + // Advance the clock by a certain number of microseconds. + void AdvanceByMicroseconds(int micros); + + // Blocks until there is a sleeping thread that is scheduled to wake up at + // the given (absolute) time. 
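Editor's note: a sketch of splitting a batched output back into per-task slices along dimension 0 via the dtype-dispatching Split() overload above. The caller is assumed to hold a live OpKernelContext; the wrapper and the example sizes are illustrative.

#include <cstdint>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/batching_util/concat_split_util.h"
#include "tensorflow/core/platform/status.h"

inline absl::Status SplitIntoTwo(tensorflow::OpKernelContext* context,
                                 const tensorflow::Tensor& batched,
                                 std::vector<tensorflow::Tensor>* per_task) {
  // E.g. a batch that concatenated tasks of sizes 6 and 10 along dim 0.
  const std::vector<int64_t> sizes = {6, 10};
  return tensorflow::concat_split_util::Split(context, batched, sizes, per_task);
}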
+ void BlockUntilSleepingThread(uint64 wake_time); + + // Blocks until there are at least num_threads sleeping. + void BlockUntilThreadsAsleep(int num_threads); + + // Methods that this class implements. + uint64 NowMicros() const override; + void SleepForMicroseconds(int64_t micros) override; + + private: + mutable mutex mu_; + + uint64 current_time_ TF_GUARDED_BY(mu_) = 0; + + struct SleepingThread { + uint64 wake_time; + Notification* wake_notification; + }; + std::vector sleeping_threads_ TF_GUARDED_BY(mu_); + + FakeClockEnv(const FakeClockEnv&) = delete; + void operator=(const FakeClockEnv&) = delete; +}; + +} // namespace test_util +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/input_split_metadata.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/input_split_metadata.h new file mode 100644 index 00000000..429858d3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/input_split_metadata.h @@ -0,0 +1,55 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INPUT_SPLIT_METADATA_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INPUT_SPLIT_METADATA_H_ + +#include + +#include "absl/container/fixed_array.h" + +namespace tensorflow { +namespace serving { +namespace internal { +// InputSplitMetadata represents the task sizes of an batch-task after it's +// tailored according to queue status (`open_batch_remaining_slot` and +// `batch_size_limit`). +// +// This is an internal helper class, and the implementation is shared +// shared across different instantiations of internal::Queue +// in input-split mode (QueueOptions.enable_large_batch_splitting is true). +class InputSplitMetadata { + public: + InputSplitMetadata(int input_task_size, int open_batch_remaining_slot, + int batch_size_limit); + + // Returns underlying task sizes. + const absl::FixedArray& task_sizes() const; + + // Serializes task split metadata into a string for debugging. + std::string DebugString() const; + + private: + absl::FixedArray generate_task_sizes(int input_task_size, + int open_batch_remaining_slot, + int batch_size_limit) const; + + const absl::FixedArray task_sizes_; +}; +} // namespace internal +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INPUT_SPLIT_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/periodic_function.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/periodic_function.h new file mode 100644 index 00000000..278cfac2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/periodic_function.h @@ -0,0 +1,130 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
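Editor's note: the intended test pattern for the fake clock declared above, as a short sketch; the function name is hypothetical and the code under test is elided.

#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
#include "tensorflow/core/platform/env.h"

inline void FakeClockExample() {
  tensorflow::serving::test_util::FakeClockEnv fake_env(tensorflow::Env::Default());
  // ... start the code under test with &fake_env as its Env ...
  fake_env.BlockUntilThreadsAsleep(1);        // wait for it to call SleepForMicroseconds()
  fake_env.AdvanceByMicroseconds(10 * 1000);  // advance the fake clock by 10 ms
}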
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// PeriodicFunction will periodically call the given function with a specified +// period in a background thread. After Start() returns, the thread is +// guaranteed to have started. The destruction of the class causes the +// background thread to be destroyed as well. Start() should not be called more +// than once. +// +// PeriodicFunction runs the function as soon as any previous run both is +// complete and was started more than "interval_micros" earlier. Thus, runs are +// both serialized, and normally have a period of "interval_micros" if no run +// exceeds the time. +// +// Note that, if the function takes longer than two interval_micross to finish, +// then PeriodicFunction will "skip" at least one call to the function. For +// instance, if the period is 50ms and the function starts runs at time 0 for +// 150ms, then the function will immediately start executing again at time 150, +// but there will be no function runs corresponding to times 50 or 100. This is +// especially important to remember when using an environment with a simulated +// clock: advancing simulated time atomically over N interval_micross will not +// cause the function to be called N times. +// +// This object is thread-safe. +// +// Example: +// +// class Foo { +// public: +// Foo() : periodic_function_([this]() { Bar(); }, +// 1000 /* 1000us == 1ms*/) { +// } +// +// private: +// void Bar() { ... } +// +// PeriodicFunction periodic_function_; +// }; + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_ + +#include +#include +#include + +#include "absl/functional/any_invocable.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace serving { + +namespace internal { +class PeriodicFunctionTestAccess; +} + +class PeriodicFunction { + public: + // Provides the ability to customize several aspects of the PeriodicFunction. + // Passed to constructor of PeriodicFunction. + struct Options { + Options() {} + + // Any standard thread options, such as stack size, should + // be passed via "thread_options". + ThreadOptions thread_options; + + // Specifies the thread name prefix (see the description in class + // Thread). + string thread_name_prefix = "periodic_function"; + + // The environment to use. Does not take ownership, but must remain alive + // for as long as the PeriodicFunction exists. + Env* env = Env::Default(); + + // Specifies the length of sleep before the first invocation of the + // function. + // This can be used for adding a random jitter to avoid synchronous behavior + // across multiple periodic functions. + int64_t startup_delay_micros = 0; + }; + + // Also starts the background thread which will be calling the function. 
+ PeriodicFunction(absl::AnyInvocable function, int64_t interval_micros, + const Options& options = Options()); + + ~PeriodicFunction(); + + private: + friend class internal::PeriodicFunctionTestAccess; + + // Notifies the background thread to stop. + void NotifyStop(); + + // (Blocking.) Loops forever calling "function_" every "interval_micros_". + void RunLoop(int64_t start); + + absl::AnyInvocable function_; // Actual client function + const int64_t interval_micros_; // Interval between calls. + const Options options_; + + // Used to notify the thread to stop. + Notification stop_thread_; + + // Thread for running "function_" + std::unique_ptr thread_ = nullptr; + + PeriodicFunction(const PeriodicFunction&) = delete; + void operator=(const PeriodicFunction&) = delete; +}; + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h new file mode 100644 index 00000000..a7285077 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h @@ -0,0 +1,552 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace serving { +namespace internal { +template +class SDBSBatch; + +template +class SDBSQueue; +} // namespace internal + +// EXPERIMENTAL: API MAY BE SUBJECTED TO SUDDEN CHANGES. +// +// Shared batch scheduler designed for batches which are processed by a serial +// device (e.g. GPU, TPU). When batch processing involves a mix of +// parallelizable cpu work and non-parallelizable on-device work, overall +// latency can be minimized by producing batches at a (load dependent) rate +// which keeps the serial device uniformly busy. +// +// SerialDeviceBatchScheduler (SDBS) controls the batching rate by limiting the +// allowed number of concurrently processed batches. Too large a limit causes +// batches to pile up behind the serial device, adding to the overall batch +// latency. Too small a limit underutilizes the serial device and harms latency +// by forcing batches to wait longer to be processed. 
Feedback from the device +// (i.e. avg number of batches directly pending on the device) is used to set +// the correct limit. +// +// SDBS groups requests into per model batches which are processed when a batch +// processing thread becomes available. SDBS prioritizes batches primarily by +// age (i.e. the batch's oldest request) along with a configurable preference +// for scheduling larger batches first. + + +template +class SerialDeviceBatchScheduler : public std::enable_shared_from_this< + SerialDeviceBatchScheduler> { + public: + ~SerialDeviceBatchScheduler(); + + struct Options { + // The name to use for the pool of batch threads. + string thread_pool_name = {"batch_threads"}; + // Maximum number of batch processing threads. + int64_t num_batch_threads = port::NumSchedulableCPUs(); + // Although batch selection is primarily based on age, this parameter + // specifies a preference for larger batches. A full batch will be + // scheduled before an older, nearly empty batch as long as the age gap is + // less than full_batch_scheduling_boost_micros. The optimal value for this + // parameter should be of order the batch processing latency, but must be + // chosen carefully, as too large a value will harm tail latency. + int64_t full_batch_scheduling_boost_micros = 0; + // The environment to use (typically only overridden by test code). + Env* env = Env::Default(); + // Initial limit for number of batches being concurrently processed. + int64_t initial_in_flight_batches_limit = 3; + // Returns the current number of batches directly waiting to be processed + // by the serial device (i.e. GPU, TPU). + std::function get_pending_on_serial_device; + // Desired average number of batches directly waiting to be processed by the + // serial device. Small numbers of O(1) should deliver the best latency. + double target_pending = 2; + // Number of batches between potential adjustments of + // in_flight_batches_limit. Larger numbers will reduce noise, but will be + // less responsive to sudden changes in workload. + int64_t batches_to_average_over = 1000; + }; + + // Ownership is shared between the caller of Create() and any queues created + // via AddQueue(). + static absl::Status Create( + const Options& options, + std::shared_ptr>* scheduler); + + struct QueueOptions { + // Maximum size of each batch. + int max_batch_size = 1000; + // Maximum number of enqueued (i.e. non-scheduled) batches. + int max_enqueued_batches = 10; + }; + + using BatchProcessor = std::function>)>; + + // Adds queue (and its callback) to be managed by this scheduler. + absl::Status AddQueue(const QueueOptions& options, + BatchProcessor process_batch_callback, + std::unique_ptr>* queue); + + double in_flight_batches_limit() { + mutex_lock l(mu_); + return in_flight_batches_limit_; + } + + double recent_low_traffic_ratio() { + mutex_lock l(mu_); + return recent_low_traffic_ratio_; + } + + private: + // access to AddBatch(), RemoveQueue(), env(). + friend class internal::SDBSQueue; + + explicit SerialDeviceBatchScheduler(const Options& options); + + // Continuously retrieves and processes batches. + void ProcessBatches(); + + // Notifies scheduler of non-empty batch which is eligible for processing. + void AddBatch(const internal::SDBSBatch* batch); + + // Removes queue from scheduler. + void RemoveQueue(const internal::SDBSQueue* queue); + + Env* env() const { return options_.env; } + + const Options options_; + + // Collection of batches added by AddBatch. Owned by scheduler until they are + // released for processing. 
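+  // For illustration, a sketch of wiring up the public API declared above.
+  // MyTask, PendingOnDevice and RunOnDevice are hypothetical names, and the
+  // template arguments shown are restored by assumption:
+  //
+  //   SerialDeviceBatchScheduler<MyTask>::Options opt;
+  //   opt.num_batch_threads = 4;
+  //   opt.initial_in_flight_batches_limit = 2;
+  //   opt.get_pending_on_serial_device = [] { return PendingOnDevice(); };
+  //   std::shared_ptr<SerialDeviceBatchScheduler<MyTask>> scheduler;
+  //   TF_RETURN_IF_ERROR(
+  //       SerialDeviceBatchScheduler<MyTask>::Create(opt, &scheduler));
+  //
+  //   std::unique_ptr<BatchScheduler<MyTask>> queue;
+  //   TF_RETURN_IF_ERROR(scheduler->AddQueue(
+  //       /*options=*/{},
+  //       /*process_batch_callback=*/
+  //       [](std::unique_ptr<Batch<MyTask>> batch) {
+  //         RunOnDevice(std::move(batch));
+  //       },
+  //       &queue));
+  //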
+ std::vector*> batches_ TF_GUARDED_BY(mu_); + + // Unowned queues and callbacks added by AddQueue. + std::unordered_map*, BatchProcessor> + queues_and_callbacks_ TF_GUARDED_BY(mu_); + + // Responsible for running the batch processing callbacks. + std::unique_ptr batch_thread_pool_; + + // Limit on number of batches which can be concurrently processed. + int64_t in_flight_batches_limit_ TF_GUARDED_BY(mu_); + + // Number of batch processing threads. + int64_t processing_threads_ TF_GUARDED_BY(mu_) = 0; + + // Number of batches processed since the last in_flight_batches_limit_ + // adjustment. + int64_t batch_count_ TF_GUARDED_BY(mu_) = 0; + + // Number of times since the last in_flight_batches_limit_ adjustment when a + // processing thread was available but there were no batches to process. + int64_t no_batch_count_ TF_GUARDED_BY(mu_) = 0; + + // Sum of batches pending on the serial device since the last + // in_flight_batches_limit_ adjustment. + int64_t pending_sum_ = 0; + + // Sum of batch latencies since the last in_flight_batches_limit_ adjustment. + int64_t batch_latency_sum_ = 0; + + // Average period between which two consecutive batches begin processing. + int64_t batch_period_micros_ = 0; + + // Moving average tracking the fraction of recent in_flight_batches_limit_ + // adjustments where the external traffic was not high enough to provide + // useful feedback for an adjustment. + double recent_low_traffic_ratio_ = 0; + + mutex mu_; + + SerialDeviceBatchScheduler(const SerialDeviceBatchScheduler&) = delete; + void operator=(const SerialDeviceBatchScheduler&) = delete; +}; + +////////////////////////////////////////////////////////// +// Implementation details follow. API users need not read. + +namespace internal { +// Consolidates tasks into batches, passing them off to the +// SerialDeviceBatchScheduler for processing. +template +class SDBSQueue : public BatchScheduler { + public: + using QueueOptions = + typename SerialDeviceBatchScheduler::QueueOptions; + + SDBSQueue(std::shared_ptr> scheduler, + const QueueOptions& options); + + ~SDBSQueue() override; + + // Adds task to current batch. Fails if the task size is larger than the batch + // size or if the current batch is full and this queue's number of outstanding + // batches is at its maximum. + absl::Status Schedule(std::unique_ptr* task) override; + + // Number of tasks waiting to be scheduled. + size_t NumEnqueuedTasks() const override; + + // Number of size 1 tasks which could currently be scheduled without failing. + size_t SchedulingCapacity() const override; + + // Notifies queue that a batch is about to be scheduled; the queue should not + // place any more tasks in this batch. + void ReleaseBatch(const SDBSBatch* batch); + + size_t max_task_size() const override { return options_.max_batch_size; } + + private: + std::shared_ptr> scheduler_; + const QueueOptions options_; + // Owned by scheduler_. + SDBSBatch* current_batch_ TF_GUARDED_BY(mu_) = nullptr; + int64_t num_enqueued_batches_ TF_GUARDED_BY(mu_) = 0; + int64_t num_enqueued_tasks_ TF_GUARDED_BY(mu_) = 0; + mutable mutex mu_; + SDBSQueue(const SDBSQueue&) = delete; + void operator=(const SDBSQueue&) = delete; +}; + +// Batch which remembers when and by whom it was created. 
+template +class SDBSBatch : public Batch { + public: + SDBSBatch(SDBSQueue* queue, int64_t creation_time_micros) + : queue_(queue), creation_time_micros_(creation_time_micros) {} + + ~SDBSBatch() override {} + + SDBSQueue* queue() const { return queue_; } + + int64_t creation_time_micros() const { return creation_time_micros_; } + + private: + SDBSQueue* queue_; + const int64_t creation_time_micros_; + SDBSBatch(const SDBSBatch&) = delete; + void operator=(const SDBSBatch&) = delete; +}; +} // namespace internal + +// ---------------- SerialDeviceBatchScheduler ---------------- + +template +absl::Status SerialDeviceBatchScheduler::Create( + const Options& options, + std::shared_ptr>* scheduler) { + if (options.num_batch_threads < 1) { + return errors::InvalidArgument("num_batch_threads must be positive; was ", + options.num_batch_threads); + } + if (options.initial_in_flight_batches_limit < 1) { + return errors::InvalidArgument( + "initial_in_flight_batches_limit must be positive; was ", + options.initial_in_flight_batches_limit); + } + if (options.initial_in_flight_batches_limit > options.num_batch_threads) { + return errors::InvalidArgument( + "initial_in_flight_batches_limit (", + options.initial_in_flight_batches_limit, + ") should not be larger than num_batch_threads (", + options.num_batch_threads, ")"); + } + if (options.full_batch_scheduling_boost_micros < 0) { + return errors::InvalidArgument( + "full_batch_scheduling_boost_micros can't be negative; was ", + options.full_batch_scheduling_boost_micros); + } + if (options.batches_to_average_over < 1) { + return errors::InvalidArgument( + "batches_to_average_over should be " + "greater than or equal to 1; was ", + options.batches_to_average_over); + } + if (options.target_pending <= 0) { + return errors::InvalidArgument( + "target_pending should be larger than zero; was ", + options.target_pending); + } + if (!options.get_pending_on_serial_device) { + return errors::InvalidArgument( + "get_pending_on_serial_device must be " + "specified"); + } + scheduler->reset(new SerialDeviceBatchScheduler(options)); + return absl::OkStatus(); +} + +template +SerialDeviceBatchScheduler::SerialDeviceBatchScheduler( + const Options& options) + : options_(options), + in_flight_batches_limit_(options.initial_in_flight_batches_limit), + processing_threads_(options.initial_in_flight_batches_limit) { + batch_thread_pool_.reset(new thread::ThreadPool( + env(), options.thread_pool_name, options.num_batch_threads)); + for (int i = 0; i < processing_threads_; i++) { + batch_thread_pool_->Schedule( + std::bind(&SerialDeviceBatchScheduler::ProcessBatches, this)); + } +} + +template +SerialDeviceBatchScheduler::~SerialDeviceBatchScheduler() { + // Signal processing threads to exit. + { + mutex_lock l(mu_); + processing_threads_ = 0; + } + // Hangs until all threads finish. 
+ batch_thread_pool_.reset(); +} + +template +absl::Status SerialDeviceBatchScheduler::AddQueue( + const QueueOptions& options, BatchProcessor process_batch_callback, + std::unique_ptr>* queue) { + if (options.max_batch_size <= 0) { + return errors::InvalidArgument("max_batch_size must be positive; was ", + options.max_batch_size); + } + if (options.max_enqueued_batches <= 0) { + return errors::InvalidArgument( + "max_enqueued_batches must be positive; was ", + options.max_enqueued_batches); + } + internal::SDBSQueue* SDBS_queue_raw; + queue->reset(SDBS_queue_raw = new internal::SDBSQueue( + this->shared_from_this(), options)); + mutex_lock l(mu_); + queues_and_callbacks_[SDBS_queue_raw] = process_batch_callback; + return absl::OkStatus(); +} + +template +void SerialDeviceBatchScheduler::AddBatch( + const internal::SDBSBatch* batch) { + mutex_lock l(mu_); + batches_.push_back(batch); +} + +template +void SerialDeviceBatchScheduler::RemoveQueue( + const internal::SDBSQueue* queue) { + mutex_lock l(mu_); + queues_and_callbacks_.erase(queue); +} + +template +void SerialDeviceBatchScheduler::ProcessBatches() { + const int64_t kIdleThreadSleepTimeMicros = 1000; + const double kMaxNoBatchRatio = .1; + const double kLowTrafficMovingAverageFactor = .1; + for (;;) { + mu_.lock(); + if (processing_threads_ < 1 || + processing_threads_ > in_flight_batches_limit_) { + processing_threads_--; + mu_.unlock(); + break; + } + if (batches_.empty()) { + no_batch_count_++; + int64_t sleep_time = batch_period_micros_ ? batch_period_micros_ + : kIdleThreadSleepTimeMicros; + mu_.unlock(); + env()->SleepForMicroseconds(sleep_time); + continue; + } + auto best_it = batches_.begin(); + double best_score = + (*best_it)->creation_time_micros() - + options_.full_batch_scheduling_boost_micros * (*best_it)->size() / + static_cast((*best_it)->queue()->max_task_size()); + for (auto it = batches_.begin() + 1; it != batches_.end(); it++) { + const double score = + (*it)->creation_time_micros() - + options_.full_batch_scheduling_boost_micros * (*it)->size() / + static_cast((*it)->queue()->max_task_size()); + if (score < best_score) { + best_score = score; + best_it = it; + } + } + const internal::SDBSBatch* batch = *best_it; + batches_.erase(best_it); + // Queue may destroy itself after ReleaseBatch is called. + batch->queue()->ReleaseBatch(batch); + auto callback = queues_and_callbacks_[batch->queue()]; + mu_.unlock(); + int64_t start_time = env()->NowMicros(); + callback(std::unique_ptr>( + const_cast*>(batch))); + int64_t end_time = env()->NowMicros(); + mu_.lock(); + batch_count_++; + batch_latency_sum_ += end_time - start_time; + pending_sum_ += options_.get_pending_on_serial_device(); + if (batch_count_ == options_.batches_to_average_over) { + recent_low_traffic_ratio_ *= (1 - kLowTrafficMovingAverageFactor); + // Only adjust in_flight_batches_limit_ if external load is large enough + // to consistently provide batches. Otherwise we would (mistakenly) assume + // that the device is underutilized because in_flight_batches_limit_ is + // too small. + if (no_batch_count_ < kMaxNoBatchRatio * batch_count_) { + double avg_pending = pending_sum_ / static_cast(batch_count_); + // Avg processing time / # of concurrent batches gives the avg period + // between which two consecutive batches begin processing. Used to set a + // reasonable sleep time for idle batch processing threads. 
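+        // Worked example (illustrative numbers): with
+        // batches_to_average_over = 1000, batch_latency_sum_ / batch_count_ =
+        // 8000us and in_flight_batches_limit_ = 4, batch_period_micros_
+        // becomes 2000us. If avg_pending = 3.4 and target_pending = 2, the
+        // limit is adjusted by round(2 - 3.4) = -1 and then clamped to
+        // [1, num_batch_threads] below.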
+ batch_period_micros_ = + batch_latency_sum_ / batch_count_ / in_flight_batches_limit_; + // When the processing pipeline is consistently busy, the average number + // of pending batches differs from in_flight_batches_limit_ by a + // load-dependent offset. Adjust in_flight_batches_limit_to maintain + // the desired target pending. + in_flight_batches_limit_ += + std::round(options_.target_pending - avg_pending); + in_flight_batches_limit_ = + std::max(in_flight_batches_limit_, int64_t{1}); + in_flight_batches_limit_ = + std::min(in_flight_batches_limit_, options_.num_batch_threads); + // Add extra processing threads if necessary. + if (processing_threads_ > 0 && + processing_threads_ < in_flight_batches_limit_) { + int extra_threads = in_flight_batches_limit_ - processing_threads_; + for (int i = 0; i < extra_threads; i++) { + batch_thread_pool_->Schedule(std::bind( + &SerialDeviceBatchScheduler::ProcessBatches, this)); + } + processing_threads_ = in_flight_batches_limit_; + } + } else { + recent_low_traffic_ratio_ += kLowTrafficMovingAverageFactor; + } + batch_count_ = 0; + no_batch_count_ = 0; + pending_sum_ = 0; + batch_latency_sum_ = 0; + } + mu_.unlock(); + } +} + +// ---------------- SDBSQueue ---------------- + +namespace internal { +template +SDBSQueue::SDBSQueue( + std::shared_ptr> scheduler, + const QueueOptions& options) + : scheduler_(scheduler), options_(options) {} + +template +SDBSQueue::~SDBSQueue() { + // Wait until last batch has been scheduled. + const int kSleepMicros = 1000; + for (;;) { + { + mutex_lock l(mu_); + if (num_enqueued_batches_ == 0) { + break; + } + } + scheduler_->env()->SleepForMicroseconds(kSleepMicros); + } + scheduler_->RemoveQueue(this); +} + +template +absl::Status SDBSQueue::Schedule(std::unique_ptr* task) { + SDBSBatch* new_batch = nullptr; + size_t size = (*task)->size(); + if (size > options_.max_batch_size) { + return errors::InvalidArgument("Task size ", size, + " is larger than maximum batch size ", + options_.max_batch_size); + } + { + mutex_lock l(mu_); + // Current batch is full, create another if allowed. + if (current_batch_ && + current_batch_->size() + size > options_.max_batch_size) { + if (num_enqueued_batches_ >= options_.max_enqueued_batches) { + return errors::Unavailable("The batch scheduling queue is full"); + } + current_batch_->Close(); + current_batch_ = nullptr; + } + if (!current_batch_) { + num_enqueued_batches_++; + current_batch_ = new_batch = + new SDBSBatch(this, scheduler_->env()->NowMicros()); + } + current_batch_->AddTask(std::move(*task)); + num_enqueued_tasks_++; + } + // AddBatch must be called outside of lock, since it may call ReleaseBatch. + if (new_batch != nullptr) scheduler_->AddBatch(new_batch); + return absl::OkStatus(); +} + +template +void SDBSQueue::ReleaseBatch(const SDBSBatch* batch) { + mutex_lock l(mu_); + num_enqueued_batches_--; + num_enqueued_tasks_ -= batch->num_tasks(); + if (batch == current_batch_) { + current_batch_->Close(); + current_batch_ = nullptr; + } +} + +template +size_t SDBSQueue::NumEnqueuedTasks() const { + mutex_lock l(mu_); + return num_enqueued_tasks_; +} + +template +size_t SDBSQueue::SchedulingCapacity() const { + mutex_lock l(mu_); + const int current_batch_capacity = + current_batch_ ? 
options_.max_batch_size - current_batch_->size() : 0; + const int spare_batches = + options_.max_enqueued_batches - num_enqueued_batches_; + return spare_batches * options_.max_batch_size + current_batch_capacity; +} +} // namespace internal +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h new file mode 100644 index 00000000..347f3008 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -0,0 +1,1548 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "tensorflow/core/kernels/batching_util/batch_input_task.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler_utils.h" +#include "tensorflow/core/kernels/batching_util/batch_stats.h" +#include "tensorflow/core/kernels/batching_util/periodic_function.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" +#include "tsl/platform/criticality.h" +#include "tsl/platform/errors.h" +#include "tsl/profiler/lib/connected_traceme.h" +#include "tsl/profiler/lib/context_types.h" +#include "tsl/profiler/lib/traceme.h" + +namespace tensorflow { +namespace serving { +namespace internal { +template +class Queue; +} // namespace internal +} // namespace serving +} // namespace tensorflow + +namespace tensorflow { +namespace serving { + +// A batch scheduler for server instances that service multiple request types +// (e.g. multiple machine-learned models, or multiple versions of a model served +// concurrently), or even multiple distinct tasks for a given request. 
The +// scheduler multiplexes batches of different kinds of tasks onto a fixed-size +// thread pool (each batch contains tasks of a single type), in a carefully +// controlled manner. A common configuration is to set the number of threads +// equal to the number of hardware accelerator units, in which case the +// scheduler takes care of multiplexing the task types onto the shared hardware, +// in a manner that is both fair and efficient. +// +// Semantically, SharedBatchScheduler behaves like having N instances of +// BasicBatchScheduler (see basic_batch_scheduler.h), one per task type. The +// difference is that under the covers there is a single shared thread pool, +// instead of N independent ones, with their sharing deliberately coordinated. +// +// SharedBatchScheduler does not implement the BatchScheduler API; rather, it +// presents an abstraction of "queues", where each queue corresponds to one type +// of task. Tasks submitted to a given queue are placed in their own batches, +// and cannot be mixed with other tasks. Queues can be added and deleted +// dynamically, to accommodate e.g. versions of a model being brought up and +// down over the lifetime of a server. +// +// The batch thread pool round-robins through the queues, running one batch +// from a queue and then moving to the next queue. Each queue behaves like a +// BasicBatchScheduler instance, in the sense that it has maximum batch size and +// timeout parameters, which govern when a batch is eligible to be processed. +// +// Each queue is independently configured with a maximum size (in terms of the +// maximum number of batches worth of enqueued tasks). For online serving, it is +// recommended that the queue sizes be configured such that the sum of the sizes +// of the active queues roughly equal the number of batch threads. (The idea is +// that if all threads become available at roughly the same time, there will be +// enough enqueued work for them to take on, but no more.) +// +// If queue sizes are configured in the manner suggested above, the maximum time +// a task can spend in a queue before being placed in a batch and assigned to a +// thread for processing, is the greater of: +// - the maximum time to process one batch of tasks from any active queue +// - the configured timeout parameter for the task's queue (which can be 0) +// +// For bulk processing jobs and throughput-oriented benchmarks, you may want to +// set the maximum queue size to a large value. +// +// TODO(b/26539183): Support queue servicing policies other than round-robin. +// E.g. let each queue specify a "share" (an int >= 1), so e.g. with queues A +// and B having shares 1 and 2 respectively, the servicing pattern is ABBABB... +// +// +// PERFORMANCE TUNING: See README.md. +// +template +class SharedBatchScheduler + : public std::enable_shared_from_this> { + public: + using BatchTaskUniquePtr = std::unique_ptr>; + + using ProcessBatchCallback = + std::variant, + std::function>)>>; + // TODO(b/25089730): Tune defaults based on best practices as they develop. + struct Options { + // The name to use for the pool of batch threads. + string thread_pool_name = {"batch_threads"}; + + // The number of threads to use to process batches. + // Must be >= 1, and should be tuned carefully. + int num_batch_threads = port::MaxParallelism(); + + // The environment to use. + // (Typically only overridden by test code.) 
+ Env* env = Env::Default(); + + // If true, when multiple queues have available batches to process, they + // will be prioritized based on a (priority, arrival_time) key. + bool rank_queues = false; + + // If true, Create() will return a global instance of the scheduler. Only + // the options provided in the first Create() call will be used to + // initialize the global scheduler. + bool use_global_scheduler = false; + }; + // Ownership is shared between the caller of Create() and any queues created + // via AddQueue(). + static absl::Status Create( + const Options& options, + std::shared_ptr>* scheduler); + + virtual ~SharedBatchScheduler(); + + // Adds a queue to which tasks may be submitted. The returned queue implements + // the BatchScheduler API. Each queue has its own set of scheduling options, + // and its own callback to process batches of tasks submitted to the queue. + // + // The returned queue's destructor blocks until all tasks submitted to it have + // been processed. + struct QueueOptions { + // The size limit of an input batch to the queue. + // + // If `enable_large_batch_splitting` is True, 'input_batch_size_limit' + // should be greater or equal than `max_execution_batch_size`; otherwise + // `input_batch_size_limit` should be equal to `max_execution_batch_size`. + size_t input_batch_size_limit = 1000; + + // If a task has been enqueued for this amount of time (in microseconds), + // and a thread is available, the scheduler will immediately form a batch + // from enqueued tasks and assign the batch to the thread for processing, + // even if the batch's size is below 'input_batch_size_limit'. + // + // This parameter offers a way to bound queue latency, so that a task isn't + // stuck in the queue indefinitely waiting for enough tasks to arrive to + // make a full batch. (The latency bound is given in the class documentation + // above.) + // + // The goal is to smooth out batch sizes under low request rates, and thus + // avoid latency spikes. + int64_t batch_timeout_micros = 0; + + // The maximum allowable number of enqueued (accepted by Schedule() but + // not yet being processed on a batch thread) tasks in terms of batches. + // If this limit is reached, Schedule() will return an UNAVAILABLE error. + // See the class documentation above for guidelines on how to tune this + // parameter. + // + // Must be positive, or else invalid argument error will be returned at + // queue creation time. + size_t max_enqueued_batches = 10; + + // If true, queue implementation would split one input batch task into + // subtasks (as specified by `split_input_task_func` below) and fit subtasks + // into different batches. + // + // For usage of `split_input_task_func`, please see its comment. + bool enable_large_batch_splitting = false; + + // `input_task`: a unit of task to be split. + // `first_output_task_size`: task size of first output. + // `max_execution_batch_size`: Maximum size of each batch. + // `output_tasks`: A list of output tasks after split. + // + // REQUIRED: + // 1) All `output_tasks` should be non-empty tasks. + // 2) Sizes of `output_tasks` add up to size of `input_task`. + // + // NOTE: + // Instantiations of `TaskType` may vary, so it's up to caller to define + // how (e.g., which members to access) to split input tasks. + std::function* input_task, int first_output_task_size, + int input_batch_size_limit, + std::vector>* output_tasks)> + split_input_task_func; + + // The maximum size of each enqueued batch (i.e., in + // `high_priority_batches_`). 
+ // + // The scheduler may form batches of any size between 1 and this number + // (inclusive). If there is a need to quantize the batch sizes, i.e. only + // submit batches whose size is in a small set of allowed sizes, that can be + // done by adding padding in the process-batch callback. + size_t max_execution_batch_size = 1000; + + // If non-empty, contains configured batch sizes. + std::vector allowed_batch_sizes; + + // If true, the padding will not be appended. + bool disable_padding = false; + + // The padding policy to use. + // + // See the documentation for kPadUpPolicy for details. + string batch_padding_policy = string(kPadUpPolicy); + + // A pointer to a ModelBatchStats instance for this model. To be used for + // cost-based padding policy selection. + // + // If null, some other padding policy will be used if a cost-based one is + // requested. + ModelBatchStats* model_batch_stats = nullptr; + + // If true, queue implementation would split high priority and low priority + // inputs into two sub queues. + bool enable_priority_queue = false; + + // A separate set of queue options for different priority inputs. + // Use iff `enable_priority_queue` is true. + struct PriorityQueueOptions { + // See QueueOptions.max_execution_batch_size + size_t max_execution_batch_size = 0; + // See QueueOptions.batch_timeout_micros + int64_t batch_timeout_micros = 0; + // See QueueOptions.input_batch_size_limit + size_t input_batch_size_limit = 0; + // See QueueOptions.max_enqueued_batches + size_t max_enqueued_batches = 0; + // See QueueOptions.allowed_batch_sizes + std::vector allowed_batch_sizes; + }; + // A subset of queue options for high priority input. These options are + // currently not being used in favor of the equivalents options at the + // QueueOptions level. + PriorityQueueOptions high_priority_queue_options; + // A subset of queue options for low priority input. + PriorityQueueOptions low_priority_queue_options; + + // A policy that determines the mixed priority batching behavior. It is + // effective only when enable_priority_queue is true. + MixedPriorityBatchingPolicy mixed_priority_batching_policy = + MixedPriorityBatchingPolicy::kLowPriorityPaddingWithMaxBatchSize; + }; + // This method is marked virtual for testing purposes only. + virtual absl::Status AddQueue( + const QueueOptions& options, ProcessBatchCallback process_batch_callback, + std::unique_ptr>* queue); + + protected: + explicit SharedBatchScheduler(const Options& options); + + private: + void GetNextWorkItem_Locked(internal::Queue** queue_for_batch_out, + BatchTaskUniquePtr* batch_to_process_out) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // The code executed in 'batch_threads_'. Obtains a batch to process from the + // queue pointed to by 'next_queue_to_schedule_', and processes it. If that + // queue declines to provide a batch to process, moves onto the next queue. If + // no queues provide a batch to process, just sleeps briefly and exits. + void ThreadLogic(); + + // Called by `AddQueue`. + absl::Status AddQueueAfterRewritingOptions( + const QueueOptions& options, ProcessBatchCallback process_batch_callback, + std::unique_ptr>* queue); + + static bool BatchExists(const BatchTaskUniquePtr& batch_to_process); + + const Options options_; + + mutex mu_; + + // A list of queues. (We use std::list instead of std::vector to ensure that + // iterators are not invalidated by adding/removing elements. It also offers + // efficient removal of elements from the middle.) 
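+  // For illustration, a sketch of the intended call pattern for the API
+  // declared above. MyTask, ProcessBatch and MakeTask are hypothetical, and
+  // the template arguments are restored by assumption:
+  //
+  //   std::shared_ptr<SharedBatchScheduler<MyTask>> scheduler;
+  //   SharedBatchScheduler<MyTask>::Options opt;
+  //   opt.num_batch_threads = 4;  // e.g. one per accelerator
+  //   TF_RETURN_IF_ERROR(SharedBatchScheduler<MyTask>::Create(opt, &scheduler));
+  //
+  //   SharedBatchScheduler<MyTask>::QueueOptions qopt;
+  //   qopt.input_batch_size_limit = 64;
+  //   qopt.batch_timeout_micros = 1000;  // allow a partial batch after 1ms
+  //   qopt.max_enqueued_batches = 4;
+  //   std::unique_ptr<BatchScheduler<MyTask>> queue;
+  //   TF_RETURN_IF_ERROR(scheduler->AddQueue(
+  //       qopt,
+  //       [](std::unique_ptr<Batch<MyTask>> batch) {
+  //         ProcessBatch(std::move(batch));
+  //       },
+  //       &queue));
+  //
+  //   // Tasks submitted to `queue` are batched and handed to the callback on
+  //   // one of the shared batch threads.
+  //   std::unique_ptr<MyTask> task = MakeTask();  // hypothetical
+  //   TF_RETURN_IF_ERROR(queue->Schedule(&task));
+  //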
+ using QueueList = std::list>>; + + // All "active" queues, i.e. ones that either: + // - have not been removed, or + // - have been removed but are not yet empty. + QueueList queues_ TF_GUARDED_BY(mu_); + + // An iterator over 'queues_', pointing to the queue from which the next + // available batch thread should grab work. + typename QueueList::iterator next_queue_to_schedule_ TF_GUARDED_BY(mu_); + + // Used by idle batch threads to wait for work to enter the system. Notified + // whenever a batch becomes schedulable. + condition_variable schedulable_batch_cv_; + + // Threads that process batches obtained from the queues. + std::vector> batch_threads_; + + SharedBatchScheduler(const SharedBatchScheduler&) = delete; + void operator=(const SharedBatchScheduler&) = delete; +}; + +////////// +// Implementation details follow. API users need not read. + +namespace internal { + +// A task queue for SharedBatchScheduler. Accepts tasks and accumulates them +// into batches, and dispenses those batches to be processed via a "pull" +// interface. The queue's behavior is governed by maximum batch size, timeout +// and maximum queue length parameters; see their documentation in +// SharedBatchScheduler. +// +// The queue is implemented as a deque of batches, with these invariants: +// - The number of batches is between 1 and 'options_.max_enqueued_batches'. +// - The back-most batch is open; the rest are closed. +// +// Submitted tasks are added to the open batch. If that batch doesn't have room +// but the queue isn't full, then that batch is closed and a new open batch is +// started. +// +// Batch pull requests are handled by dequeuing the front-most batch if it is +// closed. If the front-most batch is open (i.e. the queue contains only one +// batch) and has reached the timeout, it is immediately closed and returned; +// otherwise no batch is returned for the request. +template +class Queue { + public: + using ProcessBatchCallbackWithoutPaddingTasks = + std::function>)>; + using ProcessBatchCallbackWithPaddingTasks = + std::function>, + std::vector>)>; + using ProcessBatchCallback = + std::variant; + + using SchedulableBatchCallback = std::function; + using SplitInputTaskIntoSubtasksCallback = std::function* input_task, int open_batch_remaining_slot, + int max_execution_batch_size, + std::vector>* output_tasks)>; + // Orderable key representing the priority of a batch. Higher priority + // batches will be prioritized for execution first (when using + // rank_queues=true). + // - A smaller key value is higher priority than a larger one. + // - This is a pair formed from . The exact values + // used are an implementation detail of PeekBatchPriority(). + using BatchPriorityKey = std::pair; + + Queue(const typename SharedBatchScheduler::QueueOptions& options, + Env* env, ProcessBatchCallback process_batch_callback, + SchedulableBatchCallback schedulable_batch_callback); + + // Illegal to destruct unless the queue is empty. + ~Queue(); + + // Submits a task to the queue, with the same semantics as + // BatchScheduler::Schedule(). + absl::Status Schedule(std::unique_ptr* task); + + // Returns the number of enqueued tasks, with the same semantics as + // BatchScheduler::NumEnqueuedTasks(). + size_t NumEnqueuedTasks() const; + + // Returns the queue capacity, with the same semantics as + // BatchScheduler::SchedulingCapacity(). + size_t SchedulingCapacity() const; + + // Returns the maximum allowed size of tasks submitted to the queue. 
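+  // Worked example of the deque invariants described above (no splitting,
+  // max_execution_batch_size = 4, illustrative): scheduling tasks of sizes
+  // 3, 3, 2 leaves the deque as [closed:{3}], [closed:{3}], [open:{2}]. A
+  // batch thread then pulls the front-most closed batch; the trailing open
+  // batch of size 2 is returned only once it is closed, either because a
+  // later task does not fit and starts a new batch behind it, or because
+  // batch_timeout_micros elapses while it is the only batch in the queue.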
+ size_t max_task_size() const { return options_.input_batch_size_limit; } + + // Returns the maximum allowed size of tasks to be executed. + // Returned value would be less than or equal to the maximum allowed input + // size that's provided by caller of batch scheduler. + size_t max_execution_batch_size() const { return max_execution_batch_size_; } + + // Called by a thread that is ready to process a batch, to request one from + // this queue. Either returns a batch that is ready to be processed, or + // nullptr if the queue declines to schedule a batch at this time. If it + // returns a batch, the batch is guaranteed to be closed. + typename SharedBatchScheduler::BatchTaskUniquePtr ScheduleBatch(); + + // Without mutating the queue, checks if ScheduleBatch() will return a valid + // batch and if so will return the priority of that batch. + std::optional PeekBatchPriority() const; + + // Retrieves the low priority tasks that can be padded to a high priority + // batch of the specified size. + std::vector> GetLowPriorityTasksForPadding( + size_t batch_size); + + // Processes a batch that has been returned earlier by ScheduleBatch(). + void ProcessBatch(std::unique_ptr> batch, + std::vector> padding_task); + + // Determines whether the queue is empty, i.e. has no tasks waiting or being + // processed. + bool IsEmpty() const; + + // Marks the queue closed, and waits until it is empty. + void CloseAndWaitUntilEmpty(); + + bool closed() const TF_NO_THREAD_SAFETY_ANALYSIS { return closed_.load(); } + + private: + // Computes the max_execution_batch_size of the queue based on queue options. + static size_t GetMaxExecutionBatchSize( + const typename SharedBatchScheduler::QueueOptions& options) { + // If `enable_large_batch_splitting`, returns `max_execution_batch_size` + // configured by user options directly; returns `input_batch_size_limit` + // otherwise. + // + // Note `input_batch_size_limit` is used for backward compatibitliy -> + // users may not specify `max_execution_batch_size` explicitly. + if (options.enable_large_batch_splitting) { + return options.max_execution_batch_size; + } else { + return options.input_batch_size_limit; + } + } + + // Same as IsEmpty(), but assumes the caller already holds a lock on 'mu_'. + bool IsEmptyInternal() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns true iff the task is a low priority task based on the queue option. + bool IsLowPriorityTask(std::unique_ptr* task); + + // Implementation of Schedule above. Enqueues `task` as it + // is or split it inline (eagerly) to form batches to be processed by + // `Queue::ProcessBatch` + absl::Status ScheduleWithoutOrEagerSplitImpl(std::unique_ptr* task) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Pads the open batch until it is full with low priority tasks. + void PadOpenBatchWithLowPriorityTasks() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Closes the open batch residing at the back of std::deque, and inserts a + // fresh open batch behind it. + void StartNewBatch() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Split `input task` into `output_tasks` according to 'task_sizes'. + absl::Status SplitInputBatchIntoSubtasks( + std::unique_ptr* input_task, + std::vector>* output_tasks) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Determines whether the open batch residing at the back of + // 'high_priority_batches_' is currently schedulable. 
+ bool IsOpenBatchSchedulable() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + std::optional PeekBatchPriorityImpl() const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Determines whether the low priority tasks in `low_priority_tasks_` can form + // a batch on their own. If yes, returns a batch that is ready to be + // processed. Otherwise, returns an empty unique_ptr. + std::unique_ptr> ScheduleLowPriorityBatch() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Same as SchedulingCapacity(), but assumes the caller already holds a + // lock on 'mu_'. + size_t SchedulingCapacityInternal() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns an error if queue doesn't have capacity for this task. + // + // `task` must outlive this method. + absl::Status ValidateBatchTaskQueueCapacity(TaskType* task) const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns an error if the low priority task queue doesn't have capacity for + // this task using the low priority batch options. Since the low priority + // tasks are not batched until they get scheduled, it only checks that a + // single task does not it exceed input batch size limit and the total size of + // the tasks in the queue does not exceed the max batch size * max enqueued + // batch sizes. + absl::Status ValidateLowPriorityTaskQueueCapacity(const TaskType& task) const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // The task size of the last batch in the queue. + size_t tail_batch_task_size() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns the number of enqueued batches. + int64 num_enqueued_batches() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Gets the appropriate batches. + std::deque>>& GetBatches() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Gets the appropriate batches (const version). + const std::deque>>& GetBatches() const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Gets the low priority task queue. + TaskQueue& GetLowPriorityTaskQueue() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Retrieves the tasks up to the specified size from the low priority task + // queue. It will immediately return an empty vector when + // enable_priority_queue is false. + std::vector> GetLowPriorityTasks(size_t size); + + const typename SharedBatchScheduler::QueueOptions options_; + + // The environment to use. + Env* env_; + + // The maximum batch size to be executed by `Queue::ProcessBatch`. + // See the comment of QueueOptions and helper function + // `GetMaxExecutionBatchSize` for more details on what it means. + const size_t max_execution_batch_size_; + + // A callback invoked to processes a batch of work units. Always invoked + // from a batch thread. + ProcessBatchCallback process_batch_callback_; + + // A callback invoked to notify the scheduler that a new batch has become + // schedulable. + SchedulableBatchCallback schedulable_batch_callback_; + + mutable mutex mu_; + + // Whether this queue can accept new tasks. This variable is monotonic: it + // starts as false, and then at some point gets set to true and remains true + // for the duration of this object's life. + std::atomic closed_ TF_GUARDED_BY(mu_){false}; + + // The enqueued tasks for low priority inputs. + // Each element corresponds to a task to be dequeued. These tasks to be + // consumed by `Queue::ProcessBatch` to either pad the high priority + // batches below or form their own batch to be executed. + TaskQueue low_priority_tasks_ TF_GUARDED_BY(mu_); + + // The enqueued batches for high priority input. 
+ // Each element corresponds to a task to be dequeued and processed by + // `Queue::ProcessBatch`. + std::deque>> high_priority_batches_ + TF_GUARDED_BY(mu_); + + // The counter of the TraceMe context ids. + uint64 traceme_context_id_counter_ TF_GUARDED_BY(mu_) = 0; + + // The time at which the first task was added to the open (back-most) batch + // in 'high_priority_batches_'. Valid iff that batch contains at least one + // task. + // + // Note that when using a batch padding policy other than PAD_UP, this field + // might contain an approximate value. + uint64 open_batch_start_time_micros_ TF_GUARDED_BY(mu_); + + // Whether this queue contains a batch that is eligible to be scheduled. + // Used to keep track of when to call 'schedulable_batch_callback_'. + bool schedulable_batch_ TF_GUARDED_BY(mu_) = false; + + // The number of batches currently being processed by batch threads. + // Incremented in ScheduleBatch() and decremented in ProcessBatch(). + int num_batches_being_processed_ TF_GUARDED_BY(mu_) = 0; + + // Used by CloseAndWaitUntilEmpty() to wait until the queue is empty, for + // the case in which the queue is not empty when CloseAndWaitUntilEmpty() + // starts. When ProcessBatch() dequeues the last batch and makes the queue + // empty, if 'empty_notification_' is non-null it calls + // 'empty_notification_->Notify()'. + Notification* empty_notification_ TF_GUARDED_BY(mu_) = nullptr; + + Queue(const Queue&) = delete; + void operator=(const Queue&) = delete; +}; + +// A RAII-style object that points to a Queue and implements +// the BatchScheduler API. To be handed out to clients who call AddQueue(). +template +class QueueHandle : public BatchScheduler { + public: + QueueHandle(std::shared_ptr> scheduler, + Queue* queue); + ~QueueHandle() override; + + absl::Status Schedule(std::unique_ptr* task) override; + size_t NumEnqueuedTasks() const override; + size_t SchedulingCapacity() const override; + + size_t max_task_size() const override { return queue_->max_task_size(); } + + private: + // The scheduler that owns 'queue_'. + std::shared_ptr> scheduler_; + + // The queue this handle wraps. Owned by 'scheduler_', which keeps it alive at + // least until this class's destructor closes it. + Queue* queue_; + + QueueHandle(const QueueHandle&) = delete; + void operator=(const QueueHandle&) = delete; +}; + +} // namespace internal + +template +absl::Status SharedBatchScheduler::Create( + const Options& options, + std::shared_ptr>* scheduler) { + if (options.num_batch_threads < 1) { + return errors::InvalidArgument("num_batch_threads must be positive; was ", + options.num_batch_threads); + } + + if (options.use_global_scheduler) { + static std::shared_ptr>* global_scheduler = + [&]() { + return new std::shared_ptr>( + new SharedBatchScheduler(options)); + }(); + *scheduler = *global_scheduler; + return absl::OkStatus(); + } + + scheduler->reset(new SharedBatchScheduler(options)); + return absl::OkStatus(); +} + +template +SharedBatchScheduler::~SharedBatchScheduler() { + // Wait until the batch threads finish clearing out and deleting the closed + // queues. + for (;;) { + { + mutex_lock l(mu_); + if (queues_.empty()) { + break; + } + } + const int64_t kSleepTimeMicros = 100; + options_.env->SleepForMicroseconds(kSleepTimeMicros); + } + // Delete the batch threads before allowing state the threads may access (e.g. + // 'mu_') to be deleted. 
+ batch_threads_.clear(); +} + +template +absl::Status SharedBatchScheduler::AddQueue( + const QueueOptions& options, ProcessBatchCallback process_batch_callback, + std::unique_ptr>* queue) { + QueueOptions rewrite_options = options; + if ((!rewrite_options.enable_large_batch_splitting) && + rewrite_options.max_enqueued_batches == 0) { + // Many existing models (with very low QPS) rely on this option to be >0. + // Rewrite and set this to one and retain old behavior to allow such models + // to continue to work. + // + // Note, technically an invalid-argument error should be returned, but + // that may break such models. + rewrite_options.max_enqueued_batches = 1; + } + return AddQueueAfterRewritingOptions(rewrite_options, process_batch_callback, + queue); +} + +template +absl::Status SharedBatchScheduler::AddQueueAfterRewritingOptions( + const QueueOptions& options, ProcessBatchCallback process_batch_callback, + std::unique_ptr>* queue) { + if (options.input_batch_size_limit == 0) { + return errors::InvalidArgument( + "input_batch_size_limit must be positive; was ", + options.input_batch_size_limit); + } + if (options.batch_timeout_micros < 0) { + return errors::InvalidArgument( + "batch_timeout_micros must be non-negative; was ", + options.batch_timeout_micros); + } + if (options.max_enqueued_batches == 0) { + return errors::InvalidArgument( + "max_enqueued_batches must be positive; was ", + options.max_enqueued_batches); + } + + if (options.enable_large_batch_splitting && + options.split_input_task_func == nullptr) { + return errors::InvalidArgument( + "split_input_task_func must be specified when split_input_task is " + "true: ", + options.enable_large_batch_splitting); + } + + if (options.enable_large_batch_splitting && + (options.input_batch_size_limit < options.max_execution_batch_size)) { + return errors::InvalidArgument( + "When enable_large_batch_splitting is true, input_batch_size_limit " + "must be " + "greater than or equal to max_execution_batch_size.", + options.enable_large_batch_splitting, options.input_batch_size_limit, + options.max_execution_batch_size); + } + + auto schedulable_batch_callback = [this] { + mutex_lock l(mu_); + schedulable_batch_cv_.notify_one(); + }; + auto internal_queue = + std::unique_ptr>(new internal::Queue( + options, options_.env, process_batch_callback, + schedulable_batch_callback)); + auto handle = std::unique_ptr>( + new internal::QueueHandle(this->shared_from_this(), + internal_queue.get())); + { + mutex_lock l(mu_); + queues_.push_back(std::move(internal_queue)); + if (next_queue_to_schedule_ == queues_.end()) { + next_queue_to_schedule_ = queues_.begin(); + } + } + *queue = std::move(handle); + return absl::OkStatus(); +} + +template +SharedBatchScheduler::SharedBatchScheduler(const Options& options) + : options_(options), next_queue_to_schedule_(queues_.end()) { + // Kick off the batch threads. 
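+  // Each batch thread is a PeriodicFunction with a zero interval, so
+  // ThreadLogic() is re-invoked as soon as the previous invocation returns;
+  // idle waiting happens inside ThreadLogic() (on schedulable_batch_cv_, with
+  // a short timeout) rather than in PeriodicFunction itself.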
+ PeriodicFunction::Options periodic_fn_options; + periodic_fn_options.thread_name_prefix = + strings::StrCat(options.thread_pool_name, "_"); + for (int i = 0; i < options.num_batch_threads; ++i) { + std::unique_ptr thread(new PeriodicFunction( + [this] { this->ThreadLogic(); }, + 0 /* function invocation interval time */, periodic_fn_options)); + batch_threads_.push_back(std::move(thread)); + } +} + +template +bool SharedBatchScheduler::BatchExists( + const BatchTaskUniquePtr& batch_to_process) { + return batch_to_process != nullptr; +} + +template +void SharedBatchScheduler::GetNextWorkItem_Locked( + internal::Queue** queue_for_batch_out, + BatchTaskUniquePtr* batch_to_process_out) { + BatchTaskUniquePtr batch_to_process; + internal::Queue* queue_for_batch = nullptr; + std::optional::BatchPriorityKey> + batch_priority_key; + const int num_queues = queues_.size(); + for (int num_queues_tried = 0; + !BatchExists(batch_to_process) && num_queues_tried < num_queues; + ++num_queues_tried) { + DCHECK(next_queue_to_schedule_ != queues_.end()); + + // If a closed queue responds to ScheduleBatch() with nullptr, the queue + // will never yield any further batches so we can drop it. To avoid a + // race, we take a snapshot of the queue's closedness state *before* + // calling ScheduleBatch(). + const bool queue_closed = (*next_queue_to_schedule_)->closed(); + + bool queue_has_work = false; + + if (options_.rank_queues) { + auto key = (*next_queue_to_schedule_)->PeekBatchPriority(); + queue_has_work = key.has_value(); + if (key.has_value() && (!batch_priority_key.has_value() || + key.value() < batch_priority_key.value())) { + batch_priority_key = key; + queue_for_batch = next_queue_to_schedule_->get(); + } + } else { + // Ask '*next_queue_to_schedule_' if it wants us to process a batch. + batch_to_process = (*next_queue_to_schedule_)->ScheduleBatch(); + queue_has_work = BatchExists(batch_to_process); + + if (queue_has_work) { + queue_for_batch = next_queue_to_schedule_->get(); + } + } + + // Advance 'next_queue_to_schedule_'. + if (queue_closed && (*next_queue_to_schedule_)->IsEmpty() && + !queue_has_work) { + // We've encountered a closed queue with no work to do. Drop it. + DCHECK_NE(queue_for_batch, next_queue_to_schedule_->get()); + next_queue_to_schedule_ = queues_.erase(next_queue_to_schedule_); + } else { + ++next_queue_to_schedule_; + } + if (next_queue_to_schedule_ == queues_.end() && !queues_.empty()) { + // We've hit the end. Wrap to the first queue. + next_queue_to_schedule_ = queues_.begin(); + } + } + + if (options_.rank_queues && batch_priority_key.has_value()) { + batch_to_process = queue_for_batch->ScheduleBatch(); + } + + *queue_for_batch_out = queue_for_batch; + *batch_to_process_out = std::move(batch_to_process); +} + +template +void SharedBatchScheduler::ThreadLogic() { + // A batch to process next (or nullptr if no work to do). + BatchTaskUniquePtr batch_to_process; + // The queue with which 'batch_to_process' is associated. + internal::Queue* queue_for_batch = nullptr; + { + mutex_lock l(mu_); + while (true) { + GetNextWorkItem_Locked(&queue_for_batch, &batch_to_process); + if (BatchExists(batch_to_process)) break; + // We couldn't find any work to do. Wait until a new batch becomes + // schedulable, or some time has elapsed, before checking again. + const int64_t kTimeoutMillis = + 1; // The smallest accepted granule of time. 
+ WaitForMilliseconds(&l, &schedulable_batch_cv_, kTimeoutMillis); + if (queues_.empty()) return; + } + } + + size_t batch_size_to_schedule = batch_to_process->size(); + queue_for_batch->ProcessBatch( + std::move(batch_to_process), + queue_for_batch->GetLowPriorityTasksForPadding(batch_size_to_schedule)); +} + +namespace internal { + +template +Queue::Queue( + const typename SharedBatchScheduler::QueueOptions& options, + Env* env, ProcessBatchCallback process_batch_callback, + SchedulableBatchCallback schedulable_batch_callback) + : options_(options), + env_(env), + max_execution_batch_size_(GetMaxExecutionBatchSize(options_)), + process_batch_callback_(process_batch_callback), + schedulable_batch_callback_(schedulable_batch_callback) { + // Set the higher 32 bits of traceme_context_id_counter_ to be the creation + // time of the queue. This prevents the batches in different queues to have + // the same traceme_context_id_counter_. + traceme_context_id_counter_ = (absl::GetCurrentTimeNanos() & 0xFFFFFFFF) + << 32; + GetBatches().emplace_back(new Batch); +} + +template +Queue::~Queue() { + mutex_lock l(mu_); + DCHECK(IsEmptyInternal()); + GetBatches().back()->Close(); +} + +template +bool Queue::IsLowPriorityTask(std::unique_ptr* task) { + if (!options_.enable_priority_queue) { + return false; + } + + // The criticality is defined only when the task is a derived class of + // BatchTask. + if constexpr (std::is_base_of_v) { + // TODO(b/316379576): Make the criticality and priority configurable. + return ((*task)->criticality() == + tsl::criticality::Criticality::kSheddablePlus || + (*task)->criticality() == + tsl::criticality::Criticality::kSheddable); + } + + // Otherwise, consider it a high priority task and return false. + return false; +} + +template +absl::Status Queue::ScheduleWithoutOrEagerSplitImpl( + std::unique_ptr* task) { + // TODO(b/161857471): + // Add test coverage when when concurrent incoming batches arrives and + // use up all queue capacity. + TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get())); + + std::deque>>& batches = GetBatches(); + + const int64_t open_batch_remaining_slot = + max_execution_batch_size() - batches.back()->size(); + + const int64_t input_task_size = (*task)->size(); + + std::vector> output_tasks; + + if (input_task_size <= open_batch_remaining_slot || + !options_.enable_large_batch_splitting) { + // This is the fast path when input doesn't need to be split. 
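+    // Worked example (illustrative): with max_execution_batch_size() = 8 and
+    // an open batch already holding 6, open_batch_remaining_slot is 2. A task
+    // of size 2 takes this fast path unchanged; with splitting enabled, a
+    // task of size 5 is instead handed to split_input_task_func with
+    // first_output_task_size = 2, typically yielding subtasks of sizes
+    // {2, 3}: the 2 completes the open batch and the 3 starts a new one.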
+ output_tasks.push_back(std::move(*task)); + } else { + TF_RETURN_IF_ERROR(SplitInputBatchIntoSubtasks(task, &output_tasks)); + } + + for (int i = 0; i < output_tasks.size(); ++i) { + if (batches.back()->size() + output_tasks[i]->size() > + max_execution_batch_size()) { + StartNewBatch(); + } + if (batches.back()->empty()) { + open_batch_start_time_micros_ = env_->NowMicros(); + } + tsl::profiler::TraceMeProducer trace_me( + [&output_tasks, i] { + return profiler::TraceMeEncode("ScheduleOutputTask", + {{"size", output_tasks[i]->size()}}); + }, + tsl::profiler::ContextType::kSharedBatchScheduler, + batches.back()->traceme_context_id()); + batches.back()->AddTask(std::move(output_tasks[i]), env_->NowMicros()); + } + + return absl::OkStatus(); +} + +template +void Queue::PadOpenBatchWithLowPriorityTasks() { + std::deque>>& batches = GetBatches(); + + const bool should_pad = options_.enable_priority_queue && + options_.mixed_priority_batching_policy == + MixedPriorityBatchingPolicy::kPriorityMerge && + batches.size() == 1 && IsOpenBatchSchedulable(); + if (!should_pad) { + return; + } + + // If true, the next low priority task couldn't fit in the remaining space of + // the open batch. + bool out_of_space = false; + + while (!low_priority_tasks_.empty() && !out_of_space) { + const int64_t open_batch_remaining_slot = + max_execution_batch_size() - batches.back()->size(); + if (open_batch_remaining_slot <= 0) { + // Terminate early if the open batch is full. Remaining low priority tasks + // will be re-checked during the next batch formation opportunity. + return; + } + + uint64 task_time = low_priority_tasks_.EarliestTaskStartTime().value(); + std::unique_ptr task = low_priority_tasks_.RemoveTask(); + + const int64_t input_task_size = task->size(); + + std::vector> output_tasks; + + if (input_task_size <= open_batch_remaining_slot || + !options_.enable_large_batch_splitting) { + // This is the fast path when input doesn't need to be split. + output_tasks.push_back(std::move(task)); + } else { + absl::Status status = SplitInputBatchIntoSubtasks(&task, &output_tasks); + if (!status.ok()) { + LOG(ERROR) << "Failed to split low priority task: " << status; + continue; + } + } + + for (int i = 0; i < output_tasks.size(); ++i) { + if (batches.back()->size() + output_tasks[i]->size() > + max_execution_batch_size()) { + low_priority_tasks_.PrependTask(std::move(output_tasks[i]), task_time); + out_of_space = true; + // NOTE: Future iterations of this loop will also hit this case but are + // needed to re-add all the unused tasks to the low priority queue. + continue; + } + + if (batches.back()->empty()) { + open_batch_start_time_micros_ = task_time; + } else { + open_batch_start_time_micros_ = + std::min(open_batch_start_time_micros_, task_time); + } + + tsl::profiler::TraceMeProducer trace_me( + [&output_tasks, i] { + return profiler::TraceMeEncode("ScheduleOutputTask", + {{"size", output_tasks[i]->size()}}); + }, + tsl::profiler::ContextType::kSharedBatchScheduler, + batches.back()->traceme_context_id()); + + batches.back()->AddTask(std::move(output_tasks[i])); + } + } +} + +template +absl::Status Queue::Schedule(std::unique_ptr* task) { + const bool large_batch_splitting = options_.enable_large_batch_splitting; + tsl::profiler::TraceMe trace_me([task, large_batch_splitting] { + return profiler::TraceMeEncode( + large_batch_splitting ? 
"ScheduleWithEagerSplit" + : "ScheduleWithoutSplit", + {{"batching_input_task_size", (*task)->size()}}); + }); + + bool notify_of_schedulable_batch = false; + { + mutex_lock l(mu_); + + DCHECK(!closed_); + + if (IsLowPriorityTask(task)) { + // Insert the task to the low priority task queue instead of the high + // priority batch queue below. + TF_RETURN_IF_ERROR(ValidateLowPriorityTaskQueueCapacity(**task)); + low_priority_tasks_.AddTask(std::move(*task), env_->NowMicros()); + } else { + TF_RETURN_IF_ERROR(ScheduleWithoutOrEagerSplitImpl(task)); + } + + // Check if the batch queue has a schedulable batch and mark it schedulable + // if it not already marked. + if (!schedulable_batch_) { + if (GetBatches().size() > 1 || IsOpenBatchSchedulable()) { + schedulable_batch_ = true; + notify_of_schedulable_batch = true; + } + } + } + + if (notify_of_schedulable_batch) { + schedulable_batch_callback_(); + } + + return absl::OkStatus(); +} + +template +size_t Queue::NumEnqueuedTasks() const { + size_t num_enqueued_tasks = 0; + mutex_lock l(mu_); + for (const auto& batch : GetBatches()) { + num_enqueued_tasks += batch->num_tasks(); + } + return num_enqueued_tasks + low_priority_tasks_.num_tasks(); +} + +template +size_t Queue::SchedulingCapacity() const { + mutex_lock l(mu_); + return SchedulingCapacityInternal(); +} + +template +size_t Queue::SchedulingCapacityInternal() const { + const int64 num_new_batches_schedulable = + static_cast(options_.max_enqueued_batches) - + this->num_enqueued_batches(); + const int64 execution_batch_size_limit = max_execution_batch_size(); + const int64 open_batch_capacity = + execution_batch_size_limit - this->tail_batch_task_size(); + // Note the returned value is guaranteed to be not negative, since + // enqueue operation could only happen if queue has enough capacity. + return (num_new_batches_schedulable * execution_batch_size_limit) + + open_batch_capacity; +} + +template +absl::Status Queue::ValidateBatchTaskQueueCapacity( + TaskType* task) const { + // Check if the task size is larger than the batch size limit, regardless of + // the batch capacity. + if (task->size() > options_.input_batch_size_limit) { + return absl::InvalidArgumentError(absl::StrFormat( + "Task size %d is larger than maximum input batch size %d", task->size(), + options_.input_batch_size_limit)); + } + + if (options_.enable_large_batch_splitting) { + if (task->size() > SchedulingCapacityInternal()) { + return errors::Unavailable( + "The batch scheduling queue to which this task was submitted is " + "full; task size is ", + task->size(), " but scheduling capacity is only ", + SchedulingCapacityInternal(), + " (num_enqueued_batches=", num_enqueued_batches(), + ", max_enqueued_batches=", options_.max_enqueued_batches, + ", open_batch_size=", tail_batch_task_size(), + ", max_execution_batch_size=", max_execution_batch_size(), ")"); + } + return absl::OkStatus(); + } + + // NOTE, the capacity checking below is loose and is retained + // for backward compatibility that was broken due to the merge of no-split + // and eager split. + // There are existing clients/models that rely on the loose check + // and can get errors after the merge. Retaining the old behavior + // allows such models to continue to work. + // + // We need to revisit/remove this check after we fix model configs. 
+ const std::deque>>& batches = GetBatches(); + if (batches.back()->size() + task->size() > options_.input_batch_size_limit) { + if (batches.size() >= options_.max_enqueued_batches) { + return errors::Unavailable( + "The batch scheduling queue to which this task was submitted is " + "full; currently ", + batches.size(), " batches enqueued and max_enqueued_batches is ", + options_.max_enqueued_batches); + } + } + return absl::OkStatus(); +} + +template +absl::Status Queue::ValidateLowPriorityTaskQueueCapacity( + const TaskType& task) const { + // Unlike the high priority batch capacity validation where having only + // input_batch_size_limit without max_execution_batch_size is allowed, it + // doesn't have the backward compatibility check and always assume that + // max_execution_batch_size is present. + if (task.size() > + options_.low_priority_queue_options.max_execution_batch_size) { + return absl::UnavailableError(absl::StrFormat( + "The low priority task queue to which this task was submitted has " + "max_execution_batch_size=%d and the task size is %d", + options_.low_priority_queue_options.max_execution_batch_size, + task.size())); + } + if (low_priority_tasks_.size() + task.size() > + options_.low_priority_queue_options.max_enqueued_batches * + options_.low_priority_queue_options.max_execution_batch_size) { + return absl::UnavailableError(absl::StrFormat( + "The low priority task queue to which this task was submitted does not " + "have the capacity to handle this task; currently the low priority " + "queue has %d tasks enqueued and the submitted task size is %d while " + "max_enqueued_batches=%d and max_execution_batch_size=%d", + low_priority_tasks_.size(), task.size(), + options_.low_priority_queue_options.max_enqueued_batches, + options_.low_priority_queue_options.max_execution_batch_size)); + } + return absl::OkStatus(); +} + +template +typename SharedBatchScheduler::BatchTaskUniquePtr +Queue::ScheduleBatch() { + // The batch to schedule, which we may populate below. (If left as nullptr, + // that means we are electing not to schedule a batch at this time.) + std::unique_ptr> batch_to_schedule; + + { + mutex_lock l(mu_); + + std::deque>>& batches = GetBatches(); + + // Just in time merging of low priority tasks into the open batch. + PadOpenBatchWithLowPriorityTasks(); + + // Consider closing the open batch at this time, to schedule it. + if (batches.size() == 1 && IsOpenBatchSchedulable()) { + // Support BatchPaddingPolicy::kBatchDown and + // BatchPaddingPolicy::kMinimizeTpuCostPerRequest. We do this before + // starting a new batch because starting a new batch will close the old + // batch, making it read-only. + Batch& old_batch = *batches[0]; + uint64 old_batch_time = old_batch.EarliestTaskStartTime().value(); + std::vector> trimmed_tasks; + MaybeBatchDown( + /* batch= */ old_batch, + /* allowed_batch_sizes= */ options_.allowed_batch_sizes, + /* disable_padding= */ options_.disable_padding, + /* batch_padding_policy= */ options_.batch_padding_policy, + /* model_batch_stats= */ options_.model_batch_stats, + /* out_trimmed_tasks= */ trimmed_tasks); + + StartNewBatch(); + + // Move the trimmed tasks, if any, into the new batch. + Batch& new_batch = *batches[1]; + for (std::unique_ptr& task : trimmed_tasks) { + new_batch.AddTask(std::move(task), old_batch_time); + } + if (!new_batch.empty()) { + // TODO - b/325954758: Reconsider the starting time of a trimmed batch. 
+ // + // Ideally, we'd set open_batch_start_time_micros_ to time we received + // the first task in the open batch, but we don't have this information + // here. For now, we're trying as alternative solution that doesn't + // require adding time to each task: assume that requests arrived at a + // steady rate and therefore use a point between the old value of + // open_batch_start_time_micros_ and NOW. + // + // Let's say that originally, the batch had 10 requests, and we want to + // schedule a batch of size 8 and leave 2 requests in the open batch + // (new_batch). Then, variable `position` is 0.8, which means we have to + // set open_batch_start_time_micros_ to be at a position of 80% between + // open_batch_start_time_micros_ and now. + double position = static_cast(old_batch.size()) / + (old_batch.size() + new_batch.size()); + open_batch_start_time_micros_ += + (env_->NowMicros() - open_batch_start_time_micros_) * position; + } + } + + if (batches.size() >= 2) { + // There is at least one closed batch that is ready to be scheduled. + batch_to_schedule = std::move(batches.front()); + batches.pop_front(); + } + + if (batch_to_schedule == nullptr) { + // If there was no schedulable batch in the batch queue, try to schedule + // from the low priority task queue. + batch_to_schedule = ScheduleLowPriorityBatch(); + } + + if (batch_to_schedule == nullptr) { + // There is neither high nor low priority batch that can be scheduled, + // mark the condition false and return the nullptr. + schedulable_batch_ = false; + return batch_to_schedule; + } + + // Otherwise, increment the counter and return the batch. + ++num_batches_being_processed_; + } + return batch_to_schedule; +} + +template +std::vector> Queue::GetLowPriorityTasks( + size_t size) { + std::vector> low_priority_tasks_to_pad; + // If priority queue is not enabled, immediately return instead of attempting + // to acquire a lock. 
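// Self-contained sketch (illustrative only) of the start-time interpolation
// described in the TODO above: after trimming a batch, the new open batch is
// assumed to have started part-way between the old start time and "now".
#include <cstdint>
#include <iostream>

uint64_t InterpolatedOpenBatchStartTime(uint64_t old_start_time_micros,
                                        uint64_t now_micros,
                                        int64_t scheduled_batch_size,
                                        int64_t remaining_batch_size) {
  const double position =
      static_cast<double>(scheduled_batch_size) /
      static_cast<double>(scheduled_batch_size + remaining_batch_size);
  return old_start_time_micros +
         static_cast<uint64_t>((now_micros - old_start_time_micros) * position);
}

int main() {
  // Old start at t=1000us, now t=2000us, 8 of 10 tasks scheduled, 2 kept:
  // position = 0.8, so the new start time is 1000 + 1000 * 0.8 = 1800us.
  std::cout << InterpolatedOpenBatchStartTime(1000, 2000, 8, 2) << std::endl;
}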
+ if (!options_.enable_priority_queue || size == 0) + return low_priority_tasks_to_pad; + { + mutex_lock l(mu_); + low_priority_tasks_to_pad = GetLowPriorityTaskQueue().RemoveTask(size); + } + return low_priority_tasks_to_pad; +} + +template +std::vector> +Queue::GetLowPriorityTasksForPadding(size_t batch_size) { + size_t target_batch_size; + switch (options_.mixed_priority_batching_policy) { + case MixedPriorityBatchingPolicy::kLowPriorityPaddingWithMaxBatchSize: + target_batch_size = max_execution_batch_size(); + break; + case MixedPriorityBatchingPolicy:: + kLowPriorityPaddingWithNextAllowedBatchSize: + target_batch_size = GetNextAllowedBatchSize( + batch_size, options_.allowed_batch_sizes, options_.disable_padding); + break; + default: + target_batch_size = 0; + break; + } + + if (target_batch_size <= batch_size) { + return {}; + } + return GetLowPriorityTasks(target_batch_size - batch_size); +} + +template +void Queue::ProcessBatch( + std::unique_ptr> batch, + std::vector> padding_task) { + tsl::profiler::TraceMeConsumer trace_me( + [&] { + return profiler::TraceMeEncode( + "ProcessBatch", {{"batch_size_before_padding", batch->size()}, + {"_r", 2} /*root_event*/}); + }, + tsl::profiler::ContextType::kSharedBatchScheduler, + batch->traceme_context_id()); + + if (std::holds_alternative( + process_batch_callback_)) { + std::get(process_batch_callback_)( + std::move(batch)); + } else { + std::get(process_batch_callback_)( + std::move(batch), std::move(padding_task)); + } + + { + mutex_lock l(mu_); + --num_batches_being_processed_; + if (empty_notification_ != nullptr && IsEmptyInternal()) { + empty_notification_->Notify(); + } + } +} + +template +bool Queue::IsEmpty() const { + mutex_lock l(mu_); + return IsEmptyInternal(); +} + +template +void Queue::CloseAndWaitUntilEmpty() { + Notification empty; + { + mutex_lock l(mu_); + closed_ = true; + if (IsEmptyInternal()) { + empty.Notify(); + } else { + // Arrange for ProcessBatch() to notify when the queue becomes empty. 
+ empty_notification_ = ∅ + } + } + empty.WaitForNotification(); +} + +template +bool Queue::IsEmptyInternal() const { + const std::deque>>& batches = GetBatches(); + return num_batches_being_processed_ == 0 && batches.size() == 1 && + batches.back()->empty() && low_priority_tasks_.empty(); +} + +template +void Queue::StartNewBatch() { + std::deque>>& batches = GetBatches(); + batches.back()->Close(); + batches.emplace_back(new Batch(++traceme_context_id_counter_)); +} + +template +absl::Status Queue::SplitInputBatchIntoSubtasks( + std::unique_ptr* input_task, + std::vector>* output_tasks) { + const int open_batch_remaining_slot = + max_execution_batch_size() - this->tail_batch_task_size(); + return options_.split_input_task_func( + std::move(input_task), open_batch_remaining_slot, + max_execution_batch_size(), std::move(output_tasks)); +} + +template +bool Queue::IsOpenBatchSchedulable() const { + return PeekBatchPriorityImpl().has_value(); +} + +template +std::optional::BatchPriorityKey> +Queue::PeekBatchPriority() const { + { + mutex_lock l(mu_); + return PeekBatchPriorityImpl(); + } +} + +template +std::optional::BatchPriorityKey> +Queue::PeekBatchPriorityImpl() const { + const int kHighPriority = 1; + const int kLowPriority = 2; + + const std::deque>>& batches = GetBatches(); + + if (batches.size() >= 2) { + Batch* batch = batches.front().get(); + return std::make_pair(kHighPriority, + batch->EarliestTaskStartTime().value()); + } + + Batch* open_batch = batches.back().get(); + + size_t effective_batch_size = open_batch->size(); + uint64 effective_start_time_micros = open_batch_start_time_micros_; + int64_t effective_batch_timeout_micros = options_.batch_timeout_micros; + if (effective_batch_size == 0) { + // open_batch_start_time_micros_ is not valid for an empty batch. + effective_start_time_micros = env_->NowMicros(); + } + + if (options_.enable_priority_queue && + options_.mixed_priority_batching_policy == + MixedPriorityBatchingPolicy::kPriorityMerge) { + if (effective_batch_size == 0) { + effective_batch_timeout_micros = + options_.low_priority_queue_options.batch_timeout_micros; + } + + effective_batch_size += low_priority_tasks_.size(); + + auto low_priority_earliest_start_time = + low_priority_tasks_.EarliestTaskStartTime(); + if (low_priority_earliest_start_time.has_value()) { + effective_start_time_micros = std::min(effective_start_time_micros, + *low_priority_earliest_start_time); + } + } + + if (effective_batch_size == 0) { + return std::nullopt; + } + + bool schedulable = closed_ || + effective_batch_size >= max_execution_batch_size() || + env_->NowMicros() >= effective_start_time_micros + + effective_batch_timeout_micros; + + if (!schedulable) { + return std::nullopt; + } + + int priority = open_batch->empty() ? kLowPriority : kHighPriority; + return std::make_pair(priority, effective_start_time_micros); +} + +template +std::unique_ptr> Queue::ScheduleLowPriorityBatch() { + std::unique_ptr> batch_to_schedule; + if (!options_.enable_priority_queue || low_priority_tasks_.empty() || + options_.mixed_priority_batching_policy == + MixedPriorityBatchingPolicy::kPriorityMerge) { + // Return early if priority queue is disabled or there is no low priority + // task. Note that the priority_merge policy does all scheduling in + // ScheduleBatch(). 
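// Minimal, standalone restatement (with assumed parameter names) of the
// schedulability test computed in PeekBatchPriorityImpl() above: a non-empty
// open batch becomes schedulable once the queue closes, the batch is full,
// or its timeout has elapsed.
#include <cassert>
#include <cstdint>

bool IsBatchSchedulable(bool queue_closed, int64_t effective_batch_size,
                        int64_t max_execution_batch_size, uint64_t now_micros,
                        uint64_t effective_start_time_micros,
                        int64_t batch_timeout_micros) {
  if (effective_batch_size == 0) return false;  // nothing to schedule yet
  return queue_closed ||
         effective_batch_size >= max_execution_batch_size ||
         now_micros >= effective_start_time_micros + batch_timeout_micros;
}

int main() {
  // A half-full batch is not schedulable before its timeout elapses...
  assert(!IsBatchSchedulable(false, 8, 16, /*now_micros=*/1500,
                             /*effective_start_time_micros=*/1000,
                             /*batch_timeout_micros=*/1000));
  // ...but becomes schedulable afterwards.
  assert(IsBatchSchedulable(false, 8, 16, /*now_micros=*/2000,
                            /*effective_start_time_micros=*/1000,
                            /*batch_timeout_micros=*/1000));
}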
+ return batch_to_schedule; + } + if (env_->NowMicros() < + *low_priority_tasks_.EarliestTaskStartTime() + + options_.low_priority_queue_options.batch_timeout_micros && + low_priority_tasks_.size() < + options_.low_priority_queue_options.max_execution_batch_size) { + // Return early if the low priority tasks can't fill up the max batch size + // and the earliest task didn't time out. + return batch_to_schedule; + } + if (!GetBatches().empty() && !GetBatches().front()->empty()) { + // Return early if there is a non-empty high priority batch in the queue. + return batch_to_schedule; + } + + batch_to_schedule = std::make_unique>(); + for (std::unique_ptr& task : low_priority_tasks_.RemoveTask( + options_.low_priority_queue_options.max_execution_batch_size)) { + batch_to_schedule->AddTask(std::move(task), env_->NowMicros()); + } + batch_to_schedule->Close(); + + return batch_to_schedule; +} + +template +size_t Queue::tail_batch_task_size() const { + return GetBatches().back()->size(); +} + +template +int64 Queue::num_enqueued_batches() const { + return GetBatches().size(); +} + +template +std::deque>>& Queue::GetBatches() { + return high_priority_batches_; +} + +template +const std::deque>>& +Queue::GetBatches() const { + return high_priority_batches_; +} + +template +TaskQueue& Queue::GetLowPriorityTaskQueue() { + return low_priority_tasks_; +} + +template +QueueHandle::QueueHandle( + std::shared_ptr> scheduler, + Queue* queue) + : scheduler_(scheduler), queue_(queue) {} + +template +QueueHandle::~QueueHandle() { + queue_->CloseAndWaitUntilEmpty(); +} + +template +absl::Status QueueHandle::Schedule(std::unique_ptr* task) { + return queue_->Schedule(task); +} + +template +size_t QueueHandle::NumEnqueuedTasks() const { + return queue_->NumEnqueuedTasks(); +} + +template +size_t QueueHandle::SchedulingCapacity() const { + return queue_->SchedulingCapacity(); +} + +} // namespace internal + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/threadsafe_status.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/threadsafe_status.h new file mode 100644 index 00000000..68e94f70 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/threadsafe_status.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_THREADSAFE_STATUS_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_THREADSAFE_STATUS_H_ + +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +// Wrapper class to allow both lock-free construction and concurrent updates on +// a 'status'. 
+// +// Example Usage: +// std::thread threads[2]; +// ThreadSafeStatus thread_safe_status; +// threads[0] = std::thread([&]() { +// status.Update(errors::Internal("internal error")); +// }); +// threads[1] = std::thread([&]() { +// status.Update(errors::InvalidArgument("invalid argument")); +// }); +// threads[0].Join(); +// threads[1].Join(); +// +// NOTE: +// When updated in a multi-threading setup, only the first error is retained. +class ThreadSafeStatus { + public: + const absl::Status& status() const& TF_LOCKS_EXCLUDED(mutex_); + absl::Status status() && TF_LOCKS_EXCLUDED(mutex_); + + // Retains the first error status: replaces the current status with + // `new_status` if `new_status` is not OK and the previous status is OK. + void Update(const absl::Status& new_status) TF_LOCKS_EXCLUDED(mutex_); + void Update(absl::Status&& new_status) TF_LOCKS_EXCLUDED(mutex_); + + private: + mutable mutex mutex_; + absl::Status status_ TF_GUARDED_BY(mutex_); +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_THREADSAFE_STATUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/warmup.h b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/warmup.h new file mode 100644 index 00000000..30e64795 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/batching_util/warmup.h @@ -0,0 +1,132 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_WARMUP_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_WARMUP_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/hash/hash.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tsl/platform/logging.h" + +namespace tensorflow { +namespace serving { + +// Global registry for model's warm-up states. Before a model executes warm-up +// requests, it is registered here so that the runtime can distinguish demand +// requests vs. warm-up requests and apply warm-up specific optimizations. +class WarmupStateRegistry { + public: + struct Key { + std::string name; + int64_t version; + + Key(std::string name, int64_t version) + : name(std::move(name)), version(version) {} + + template + friend H AbslHashValue(H state, const Key& key) { + return H::combine(std::move(state), key.name, key.version); + } + + friend bool operator==(const Key& x, const Key& y) { + return x.name == y.name && x.version == y.version; + } + }; + // Data stored per key. + struct PerModelData { + // If true, supported batch ops will execute the model on dummy batches + // for all `allowed_batch_sizes` of that batch op. This removes the + // need to issue separate warmup requests for each batch size. 
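// Corrected, self-contained version of the usage example in the comment
// above (the original snippet updates a variable named `status` that was
// never declared and calls Join() instead of join()). The include path is
// assumed from this header's location.
#include <thread>

#include "absl/status/status.h"
#include "tensorflow/core/kernels/batching_util/threadsafe_status.h"

void ThreadSafeStatusExample() {
  tensorflow::ThreadSafeStatus thread_safe_status;
  std::thread t0([&]() {
    thread_safe_status.Update(absl::InternalError("internal error"));
  });
  std::thread t1([&]() {
    thread_safe_status.Update(absl::InvalidArgumentError("invalid argument"));
  });
  t0.join();
  t1.join();
  // Whichever Update() ran first wins; the later non-OK status is dropped.
  const absl::Status& first_error = thread_safe_status.status();
  (void)first_error;
}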
+ bool warmup_all_batch_sizes = false; + }; + + // RAII handle for registered models. + class Handle { + public: + Handle() = default; + + Handle(const Handle& other) = delete; + Handle& operator=(const Handle& other) = delete; + Handle(Handle&& other) + : key_(std::move(other.key_)), registry_(other.registry_) { + other.key_.reset(); + } + Handle& operator=(Handle&& other) { + if (key_.has_value()) { + Release(); + } + + key_ = std::move(other.key_); + other.key_.reset(); + registry_ = other.registry_; + return *this; + } + + ~Handle() { Release(); } + + void Release(); + + private: + friend class WarmupStateRegistry; + + // Can only be constructed by `WarmupStateRegistry::Register()`. + Handle(const Key& key, WarmupStateRegistry* registry) + : key_(key), registry_(registry) { + DCHECK(registry_); + } + + std::optional key_; + WarmupStateRegistry* registry_ = nullptr; + }; + + // Registers the given model to be in a warm-up state and associates the given + // metadata with the model. Returns an RAII handle that unregisters the model + // at its destruction. + absl::StatusOr Register(const Key& model_key, + std::unique_ptr per_model_data); + + // Return model data. A nullptr indicates the key was not present. + const PerModelData* Lookup(const Key& model_key); + + private: + friend class Handle; + + void Unregister(const Key& model_key); + + absl::Mutex mu_; + // Map of model names/versions to miscellaneous data. + absl::flat_hash_map> states_ + ABSL_GUARDED_BY(&mu_); +}; + +WarmupStateRegistry& GetGlobalWarmupStateRegistry(); + +// Utility function that returns whether or not to warmup all batch sizes, +// based on the state of WarmupStateRegistry. +bool ShouldWarmupAllBatchSizes(const OpKernelContext* c); + +} // namespace serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_WARMUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/betainc_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/betainc_op.h new file mode 100644 index 00000000..c808e688 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/betainc_op.h @@ -0,0 +1,51 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_ +// Functor definition for BetaincOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by BetaincOp to do the computations. 
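// Hypothetical usage sketch for the WarmupStateRegistry declared above:
// register a model while its warm-up requests run and let the RAII Handle
// unregister it on scope exit. The model name/version are invented, and the
// include path is assumed from this header's location.
#include <memory>
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "tensorflow/core/kernels/batching_util/warmup.h"

absl::Status RunWarmup() {
  using tensorflow::serving::GetGlobalWarmupStateRegistry;
  using tensorflow::serving::WarmupStateRegistry;

  WarmupStateRegistry::Key key("my_model", /*version=*/1);
  auto data = std::make_unique<WarmupStateRegistry::PerModelData>();
  data->warmup_all_batch_sizes = true;

  absl::StatusOr<WarmupStateRegistry::Handle> handle =
      GetGlobalWarmupStateRegistry().Register(key, std::move(data));
  if (!handle.ok()) return handle.status();

  // ... issue warm-up requests here; batch ops can Lookup(key) to see that
  // "my_model" v1 is warming up and pad all allowed batch sizes ...

  return absl::OkStatus();  // `handle` unregisters the model on destruction
}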
+template +struct Betainc { + void operator()(const Device& d, typename TTypes::ConstTensor a, + typename TTypes::ConstTensor b, + typename TTypes::ConstTensor x, + typename TTypes::Tensor output) { + output.device(d) = Eigen::betainc(a, b, x); + } + + void BCast(const Device& d, typename TTypes::ConstTensor a, + const typename Eigen::array& bcast_a, + typename TTypes::ConstTensor b, + const typename Eigen::array& bcast_b, + typename TTypes::ConstTensor x, + const typename Eigen::array& bcast_x, + typename TTypes::Tensor output) { + output.device(d) = Eigen::betainc( + a.broadcast(bcast_a), b.broadcast(bcast_b), x.broadcast(bcast_x)); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op.h new file mode 100644 index 00000000..d4a78804 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op.h @@ -0,0 +1,60 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BIAS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BIAS_OP_H_ +// Functor definition for BiasOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by BiasOp to do the computations. +template +struct Bias { + // Add "bias" to "input", repeating "bias". + void operator()(const Device& d, typename TTypes::ConstFlat input, + typename TTypes::ConstVec bias, + typename TTypes::Flat output) { + const Eigen::Index rest_size = input.size() / bias.dimension(0); + Eigen::DSizes bcast(rest_size); + MaybeWith32BitIndexing( + [&](auto input32, auto bias32, auto output32, const auto& bcast32) { + output32.device(d) = input32 + bias32.broadcast(bcast32); + }, + input, bias, output, bcast); + } + + // NCHW layout, repeating on the first dimension, broadcasting on the last + // dimension. + void operator()(const Device& d, typename TTypes::ConstMatrix input, + typename TTypes::ConstMatrix bias1, // shape [C, 1]. 
+ typename TTypes::Matrix output) { + const Eigen::Index rest_size = input.dimension(0) / bias1.dimension(0); + Eigen::DSizes bcast(rest_size, input.dimension(1)); + MaybeWith32BitIndexing( + [&](auto input32, auto bias32, auto output32, const auto& bcast32) { + output32.device(d) = input32 + bias32.broadcast(bcast32); + }, + input, bias1, output, bcast); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BIAS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op_gpu.h new file mode 100644 index 00000000..0ece14a9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/bias_op_gpu.h @@ -0,0 +1,81 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BIAS_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_BIAS_OP_GPU_H_ + +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/gpu_utils.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template +struct BiasGPU { + static void compute(const GPUDevice& d, const T* input, const T* bias, + T* output, int32_t batch, int32_t height, int32_t width, + int32_t depth, int32_t channel, TensorFormat data_format); +}; + +template +struct BiasGradGPU { + static void compute(const GPUDevice& device, const T* output_backprop, + T* bias_backprop, int32_t batch, int32_t height, + int32_t width, int32_t depth, int32_t channel, + TensorFormat data_format); + + static void DoRowReduction(OpKernelContext* context, T* output, + const T* input, int rows, int cols); + + static void DoColReduction(OpKernelContext* context, T* output, + const T* input, int rows, int cols); +}; + +enum class BiasAddGradGPUMode { + kInvalid = 0, + kNative = 1, + kReduction = 2, +}; + +// Describe the BiasGradGPU result from a perf experiment. +// +// Arguments: +// algorithm: returns the method to use for bias add grad. +// elapsed_time; returns the measured elapsed time in microseconds. 
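// Plain-C++ sketch (no Eigen) of what the Bias functor above computes in the
// NHWC case: a bias vector of length C is broadcast across every other
// dimension and added element-wise.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<float> BiasAddNHWC(const std::vector<float>& input,
                               const std::vector<float>& bias) {
  assert(!bias.empty() && input.size() % bias.size() == 0);
  std::vector<float> output(input.size());
  const std::size_t channels = bias.size();
  for (std::size_t i = 0; i < input.size(); ++i) {
    // The channel dimension is innermost, so it cycles fastest.
    output[i] = input[i] + bias[i % channels];
  }
  return output;
}

int main() {
  // Two "pixels" with three channels each, bias = {10, 20, 30}.
  const std::vector<float> out = BiasAddNHWC({1, 2, 3, 4, 5, 6}, {10, 20, 30});
  assert(out[0] == 11 && out[4] == 25 && out[5] == 36);
}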
+class BiasGradGPUProfileResult { + public: + bool is_valid() const { + return (algorithm_ != BiasAddGradGPUMode::kInvalid && + elapsed_time_ != std::numeric_limits::max()); + } + BiasAddGradGPUMode algorithm() const { return algorithm_; } + void set_algorithm(BiasAddGradGPUMode val) { algorithm_ = val; } + uint64 elapsed_time() const { return elapsed_time_; } + void set_elapsed_time(uint64 val) { elapsed_time_ = val; } + + private: + BiasAddGradGPUMode algorithm_ = BiasAddGradGPUMode::kInvalid; + uint64 elapsed_time_ = std::numeric_limits::max(); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BIAS_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/bincount_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/bincount_op.h new file mode 100644 index 00000000..48847617 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/bincount_op.h @@ -0,0 +1,51 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +namespace functor { + +template +struct BincountFunctor { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& arr, + const typename TTypes::ConstTensor& weights, + typename TTypes::Tensor& output, + const Tidx num_bins); +}; + +template +struct BincountReduceFunctor { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& in, + const typename TTypes::ConstTensor& weights, + typename TTypes::Tensor& out, + const Tidx num_bins); +}; + +} // end namespace functor + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/broadcast_to_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/broadcast_to_op.h new file mode 100644 index 00000000..083723e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/broadcast_to_op.h @@ -0,0 +1,91 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
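// Self-contained sketch of the computation behind the BincountFunctor
// declared above: output[v] accumulates the weight of every input element
// equal to v (or 1 when no weights are supplied). In this sketch, values
// outside [0, num_bins) are simply skipped.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<float> Bincount(const std::vector<int32_t>& arr,
                            const std::vector<float>& weights,
                            int32_t num_bins) {
  std::vector<float> output(num_bins, 0.0f);
  for (size_t i = 0; i < arr.size(); ++i) {
    const int32_t v = arr[i];
    if (v < 0 || v >= num_bins) continue;
    output[v] += weights.empty() ? 1.0f : weights[i];
  }
  return output;
}

int main() {
  // arr = {1, 1, 2}, 4 bins, no weights -> counts {0, 2, 1, 0}.
  const std::vector<float> counts = Bincount({1, 1, 2}, {}, 4);
  assert(counts[0] == 0 && counts[1] == 2 && counts[2] == 1 && counts[3] == 0);
}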
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +namespace functor { + +template +struct BroadcastTo { + template + void DoBCast( + const Device &device, typename TTypes::Tensor out, + typename TTypes::ConstTensor in, + const typename Eigen::array &bcast) const { + MaybeWith32BitIndexing( + [&](auto out32, auto in32, const auto &bcast32) { + out32.device(device) = in32.broadcast(bcast32); + }, + out, in, bcast); + } + + template + void ReshapeAndBCast(const Device &device, Tensor &output_tensor, + const Tensor &input_tensor, const BCast &bcast) const { + DoBCast( + device, output_tensor.template shaped(bcast.result_shape()), + input_tensor.template shaped(bcast.x_reshape()), + BCast::ToIndexArrayType(bcast.x_bcast())); + } + + // PRECONDITION: rank(input_shape) > 0 && + // rank(input_shape) <= rank(output_shape) && + // output_shape.num_elements() > 0. + void operator()(const Device &device, OpKernelContext *ctx, + Tensor &output_tensor, const TensorShape &output_shape, + const Tensor &input_tensor, const TensorShape &input_shape, + const BCast &bcast) const { + const int ndims = bcast.y_reshape().size(); + switch (ndims) { + case 1: + ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast); + break; + case 2: + ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast); + break; + case 3: + ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast); + break; + case 4: + ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast); + break; + case 5: + ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast); + break; + default: + ctx->SetStatus(errors::Unimplemented( + "Broadcast between ", input_shape.DebugString(), " and ", + output_shape.DebugString(), " is not supported yet.")); + break; + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/bucketize_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/bucketize_op.h new file mode 100644 index 00000000..9fb59c77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/bucketize_op.h @@ -0,0 +1,41 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
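// Plain sketch of the broadcasting performed by the BroadcastTo functor
// above: an input of logical shape [3, 1] is tiled along its size-1 axis to
// produce an output of shape [3, 4] (bcast = {1, 4}).
#include <array>
#include <cassert>

int main() {
  const std::array<int, 3> input = {10, 20, 30};  // logical shape [3, 1]
  int output[3][4];                               // logical shape [3, 4]
  for (int r = 0; r < 3; ++r) {
    for (int c = 0; c < 4; ++c) {
      output[r][c] = input[r];  // repeat the size-1 column dimension
    }
  }
  assert(output[0][3] == 10 && output[2][0] == 30);
}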
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_ + +#include +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace functor { + +template +struct BucketizeFunctor { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& input, + const std::vector& boundaries_vector, + typename TTypes::Tensor& output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op.h new file mode 100644 index 00000000..0c955651 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op.h @@ -0,0 +1,351 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_H_ +#define TENSORFLOW_CORE_KERNELS_CAST_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/types.h" + +// Note that the GPU cast functor templates need to be instantiated unlike the +// CPU ones, and hence their specializations are different than that for CPUs. 
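// Self-contained sketch of the bucketization rule behind the BucketizeFunctor
// declared above, mirroring the documented behavior of the Bucketize op:
// element i is assigned the number of boundaries that are <= input[i],
// i.e. an upper_bound search over the sorted boundaries.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int32_t> Bucketize(const std::vector<float>& input,
                               const std::vector<float>& boundaries) {
  std::vector<int32_t> output(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    output[i] = static_cast<int32_t>(
        std::upper_bound(boundaries.begin(), boundaries.end(), input[i]) -
        boundaries.begin());
  }
  return output;
}

int main() {
  // Boundaries {0, 10, 100}: -5 -> bucket 0, 7 -> 1, 10 -> 2, 250 -> 3.
  const std::vector<int32_t> out = Bucketize({-5, 7, 10, 250}, {0, 10, 100});
  assert(out[0] == 0 && out[1] == 1 && out[2] == 2 && out[3] == 3);
}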
+#ifdef SPECIALIZE_FOR_GPUS +#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_TYPE) \ + template \ + struct CastFunctor { \ + void operator()(const Device& d, \ + typename TTypes::Flat out_tensor, \ + typename TTypes::ConstFlat in_tensor, \ + bool truncate = false) { \ + if (truncate) { \ + out_tensor.device(d) = \ + in_tensor.unaryExpr(LSBZeroSetter()) \ + .template cast(); \ + } else { \ + out_tensor.device(d) = in_tensor.template cast(); \ + } \ + } \ + }; \ + template struct CastFunctor; +#else +#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_TYPE) \ + template <> \ + struct CastFunctor { \ + void operator()(const DEVICE& d, \ + typename TTypes::Flat out_tensor, \ + typename TTypes::ConstFlat in_tensor, \ + bool truncate = false) { \ + if (truncate) { \ + out_tensor.device(d) = \ + in_tensor.unaryExpr(LSBZeroSetter()) \ + .template cast(); \ + } else { \ + out_tensor.device(d) = in_tensor.template cast(); \ + } \ + } \ + }; +#endif + +#define CAST_FUNCTORS(devname) \ + SPECIALIZE_CAST(devname, float, double) \ + SPECIALIZE_CAST(devname, float, std::complex) \ + SPECIALIZE_CAST(devname, std::complex, std::complex) \ + SPECIALIZE_CAST(devname, std::complex, double) \ + SPECIALIZE_CAST(devname, Eigen::half, double) \ + SPECIALIZE_CAST(devname, Eigen::half, float) \ + SPECIALIZE_CAST(devname, Eigen::half, std::complex) \ + SPECIALIZE_CAST(devname, Eigen::half, std::complex) \ + SPECIALIZE_CAST(devname, bfloat16, float) \ + SPECIALIZE_CAST(devname, float8_e5m2, double) \ + SPECIALIZE_CAST(devname, float8_e5m2, float) \ + SPECIALIZE_CAST(devname, float8_e5m2, bfloat16) \ + SPECIALIZE_CAST(devname, float8_e5m2, Eigen::half) \ + SPECIALIZE_CAST(devname, float8_e5m2, float8_e4m3fn) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, double) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, float) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, bfloat16) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, Eigen::half) \ + template \ + struct CastFunctor { \ + void operator()(const devname& d, \ + typename TTypes::Flat out_tensor, \ + typename TTypes::ConstFlat in_tensor, \ + bool truncate = false) { \ + out_tensor.device(d) = in_tensor.template cast(); \ + } \ + }; + +#if defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) +// If MLIR kernels are enabled, we don't need the specialized cast from float to +// double or from Eigen::half to double. We still need the specialized cast from +// Eigen::half to float, because it is used in depthwise_conv_grad_op.cc. We +// still need the specialized cast from float to double because it is used in +// resize_bilinear_op.cc. 
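// Standalone illustration of the "truncate" path in the cast functors above:
// LSBZeroSetter zeroes the low-order mantissa bits of the wider type before
// casting. For float -> bfloat16 that means keeping only the top 16 bits of
// the float representation (the real helper also leaves NaNs untouched).
#include <cassert>
#include <cstdint>
#include <cstring>

float ZeroLow16MantissaBits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits &= 0xFFFF0000u;  // keep sign, exponent, and the top 7 mantissa bits
  std::memcpy(&f, &bits, sizeof(bits));
  return f;
}

int main() {
  const float exact = 1.0078125f;     // 1 + 2^-7, representable in bfloat16
  const float inexact = 1.00390625f;  // 1 + 2^-8, needs an extra mantissa bit
  assert(ZeroLow16MantissaBits(exact) == exact);   // unchanged
  assert(ZeroLow16MantissaBits(inexact) == 1.0f);  // the 2^-8 bit is dropped
}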
+#define CAST_FUNCTORS_SUBSET(devname) \ + SPECIALIZE_CAST(devname, float, double) \ + SPECIALIZE_CAST(devname, Eigen::half, float) \ + SPECIALIZE_CAST(devname, bfloat16, float) \ + SPECIALIZE_CAST(devname, float8_e5m2, double) \ + SPECIALIZE_CAST(devname, float8_e5m2, float) \ + SPECIALIZE_CAST(devname, float8_e5m2, bfloat16) \ + SPECIALIZE_CAST(devname, float8_e5m2, Eigen::half) \ + SPECIALIZE_CAST(devname, float8_e5m2, float8_e4m3fn) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, double) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, float) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, bfloat16) \ + SPECIALIZE_CAST(devname, float8_e4m3fn, Eigen::half) \ + template \ + struct CastFunctor { \ + void operator()(const devname& d, \ + typename TTypes::Flat out_tensor, \ + typename TTypes::ConstFlat in_tensor, \ + bool truncate = false) { \ + out_tensor.device(d) = in_tensor.template cast(); \ + } \ + }; +#endif + +namespace tensorflow { + +typedef std::function + CastFunctorType; + +// Common base class of Cast kernels +class CastOpBase : public OpKernel { + public: + explicit CastOpBase(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + protected: + DataType src_dtype_; + DataType dst_dtype_; + DataType external_src_dtype_; + DataType external_dst_dtype_; + bool use_truncation_; + CastFunctorType work_ = nullptr; + absl::Status Unimplemented(); + + CastOpBase(const CastOpBase&) = delete; + void operator=(const CastOpBase&) = delete; +}; + +// CPU implementation of Cast +class CpuCastOp : public CastOpBase { + public: + explicit CpuCastOp(OpKernelConstruction* ctx); + + private: + absl::Status Prepare(); +}; + +namespace functor { + +template +constexpr int MantissaWidth() { + return std::numeric_limits::digits; +} + +template <> +constexpr int MantissaWidth() { + // Remember, there's 1 hidden bit + return 10 + 1; +} + +template <> +constexpr int MantissaWidth() { + // Remember, there's 1 hidden bit + return 7 + 1; +} + +template +void Cast(const Device& d, typename TTypes::Flat o, + typename TTypes::ConstFlat i) { + o.device(d) = i.template cast(); +} + +template +struct CastFunctor { + void operator()(const Device& d, typename TTypes::Flat o, + typename TTypes::ConstFlat i, bool truncate = false); +}; + +template +typename std::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) { + // Only zero the bits for non-NaNs. + // For NaNs, let the non-truncation version handle it. + if (!Eigen::numext::isnan(t)) { + uint64_t* p = reinterpret_cast(&t); + *p &= (0xFFFFFFFFFFFFFFFF << n); + } +} + +template +typename std::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) { + // Only zero the bits for non-NaNs. + // For NaNs, let the non-truncation version handle it. + if (!Eigen::numext::isnan(t)) { + uint32_t* p = reinterpret_cast(&t); + *p &= (0xFFFFFFFF << n); + } +} + +template +typename std::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) { + // Only zero the bits for non-NaNs. + // For NaNs, let the non-truncation version handle it. + if (!Eigen::numext::isnan(t)) { + uint16_t* p = reinterpret_cast(&t); + *p &= (0xFFFF << n); + } +} + +template +typename std::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) { + // Only zero the bits for non-NaNs. + // For NaNs, let the non-truncation version handle it. 
+ if (!Eigen::numext::isnan(t)) { + uint8_t* p = reinterpret_cast(&t); + *p &= (0xFF << n); + } +} + +// Set n least significant bits to 0 +template +struct LSBZeroSetter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE I operator()(const I& a) const { + constexpr int bits = MantissaWidth() - MantissaWidth(); + static_assert( + bits > 0, + "The output type must have fewer mantissa bits than the input type\n"); + I t = a; + LSBZeroSetterHelper(t, bits); + return t; + } +}; + +template +struct LSBZeroSetter, std::complex> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator()( + const std::complex& a) const { + constexpr int bits = MantissaWidth() - MantissaWidth(); + static_assert( + bits > 0, + "The output type must have fewer mantissa bits than the input type\n"); + I re = Eigen::numext::real(a); + I img = Eigen::numext::imag(a); + LSBZeroSetterHelper(re, bits); + LSBZeroSetterHelper(img, bits); + std::complex toReturn(re, img); + return toReturn; + } +}; + +template +struct LSBZeroSetter, O> { + // Sets the 16 LSBits of the float to 0 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator()( + const std::complex& a) const { + constexpr int bits = MantissaWidth() - MantissaWidth(); + static_assert( + bits > 0, + "The output type must have fewer mantissa bits than the input type\n"); + I re = Eigen::numext::real(a); + I img = Eigen::numext::imag(a); + LSBZeroSetterHelper(re, bits); + LSBZeroSetterHelper(img, bits); + std::complex toReturn(re, img); + return toReturn; + } +}; + +} // end namespace functor +} // end namespace tensorflow + +namespace Eigen { +namespace internal { + +// Eigen can't convert to/from complex numbers, because it is limited to cases +// that can be static_casted. But numpy is able to cast to/from complex, which +// we want to replicate. So we add specializations for complex here. 
+template +struct scalar_cast_op, To> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE To + operator()(const std::complex& a) const { + // Replicate numpy behavior of returning just the real part + return static_cast(a.real()); + } +}; + +template +struct scalar_cast_op, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()( + const std::complex& a) const { + return static_cast(a.real()); + } +}; + +template +struct scalar_cast_op> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator()( + const From& a) const { + // Replicate numpy behavior of setting the imaginary part to 0 + return std::complex(static_cast(a), To(0)); + } +}; + +template +struct scalar_cast_op, std::complex> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator()( + const std::complex& a) const { + return std::complex(static_cast(a.real()), + static_cast(a.imag())); + } +}; + +template +struct functor_traits_complex_impl { + enum { Cost = NumTraits::AddCost, PacketAccess = false }; +}; + +template +struct functor_traits, bool>> + : functor_traits_complex_impl, bool> {}; + +template +struct functor_traits, To>> + : functor_traits_complex_impl, To> {}; +template +struct functor_traits>> + : functor_traits_complex_impl> {}; +// Needed to avoid ambiguous partial specialization +template +struct functor_traits, std::complex>> + : functor_traits_complex_impl, std::complex> {}; + +} // namespace internal +} // namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_CAST_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op_impl.h new file mode 100644 index 00000000..6f0fe7eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cast_op_impl.h @@ -0,0 +1,189 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_ + +#include +#include + +#define EIGEN_USE_THREADS + +#include "absl/status/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tsl/platform/status.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/cast_op.h" + +namespace tensorflow { + +namespace functor { + +template +struct OutOfRange { + bool operator()(const F f) const { + return f < std::numeric_limits::min() || + f > std::numeric_limits::max(); + } +}; + +#define VALIDATE_CAST(I, F) \ + template <> \ + struct CastFunctor { \ + void operator()(const Eigen::ThreadPoolDevice& d, \ + typename TTypes::Flat out_tensor, \ + typename TTypes::ConstFlat in_tensor, \ + bool truncate = false) const { \ + Eigen::Tensor out_of_range = \ + in_tensor.unaryExpr(OutOfRange{}).any(); \ + if (out_of_range()) { \ + LOG(ERROR) \ + << "IMPORTANT! The input tensor to Cast contains values out of " \ + "range for the target type. This is undefined behavior and " \ + "likely a bug in your model. 
A crash immediately after this " \ + "under ubsan is expected."; \ + } \ + out_tensor.device(d) = in_tensor.template cast(); \ + } \ + }; + +// Add additional logging for out of range inputs when running in debug mode. +#ifndef NDEBUG +VALIDATE_CAST(int32, float); +VALIDATE_CAST(int64, float); +VALIDATE_CAST(int32, double); +VALIDATE_CAST(int64, double); +#endif + +CAST_FUNCTORS(Eigen::ThreadPoolDevice); + + +} // namespace functor + +#define CURRY_TYPES3(FN, arg0, arg1) \ + FN(arg0, arg1, bool); \ + FN(arg0, arg1, uint8); \ + FN(arg0, arg1, uint16); \ + FN(arg0, arg1, uint32); \ + FN(arg0, arg1, uint64); \ + FN(arg0, arg1, int8); \ + FN(arg0, arg1, int16); \ + FN(arg0, arg1, int32); \ + FN(arg0, arg1, int64_t); \ + FN(arg0, arg1, float); \ + FN(arg0, arg1, double); \ + FN(arg0, arg1, std::complex); \ + FN(arg0, arg1, std::complex) \ + FN(arg0, arg1, Eigen::half); \ + FN(arg0, arg1, bfloat16); + +#define CAST_CASE(DEVICE, IN, OUT) \ + if (DataTypeToEnum::value == dst_dtype) { \ + return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out, \ + bool truncate) { \ + functor::CastFunctor func; \ + func(ctx->eigen_device(), out->flat(), inp.flat(), \ + truncate); \ + }; \ + } + +// The functions below are implemented in the cast_op_impl_*.cc files. +CastFunctorType GetCpuCastFromBool(DataType dst_dtype); + +CastFunctorType GetCpuCastFromUint8(DataType dst_dtype); + +CastFunctorType GetCpuCastFromUint16(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt8(DataType dst_dtype); + +CastFunctorType GetCpuCastFromUint32(DataType dst_dtype); + +CastFunctorType GetCpuCastFromUint64(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt8(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt16(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt32(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt64(DataType dst_dtype); + +CastFunctorType GetCpuCastFromHalf(DataType dst_dtype); + +CastFunctorType GetCpuCastFromFloat(DataType dst_dtype); + +CastFunctorType GetCpuCastFromDouble(DataType dst_dtype); + +CastFunctorType GetCpuCastFromComplex64(DataType dst_dtype); + +CastFunctorType GetCpuCastFromComplex128(DataType dst_dtype); + +CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype); + +CastFunctorType GetCpuCastFromFloat8e5m2(DataType dst_dtype); + +CastFunctorType GetCpuCastFromFloat8e4m3fn(DataType dst_dtype); + +CastFunctorType GetCpuCastFromInt4(DataType dst_dtype); + +CastFunctorType GetCpuCastFromUint4(DataType dst_dtype); + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +// Same, for GPU. 
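// Standalone sketch of the range check that the VALIDATE_CAST
// specializations above add in debug builds: a float -> int32 cast is
// undefined behavior when the value cannot be represented, so the input is
// tested against the target type's limits first.
#include <cassert>
#include <cstdint>
#include <limits>

bool OutOfInt32Range(float f) {
  return f < static_cast<float>(std::numeric_limits<int32_t>::min()) ||
         f > static_cast<float>(std::numeric_limits<int32_t>::max());
}

int main() {
  assert(!OutOfInt32Range(123.0f));
  assert(OutOfInt32Range(3.0e9f));   // above INT32_MAX
  assert(OutOfInt32Range(-3.0e9f));  // below INT32_MIN
}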
+CastFunctorType GetGpuCastFromBool(DataType dst_dtype); + +CastFunctorType GetGpuCastFromUint8(DataType dst_dtype); + +CastFunctorType GetGpuCastFromUint16(DataType dst_dtype); + +CastFunctorType GetGpuCastFromInt8(DataType dst_dtype); + +CastFunctorType GetGpuCastFromUint32(DataType dst_dtype); + +CastFunctorType GetGpuCastFromUint64(DataType dst_dtype); + +CastFunctorType GetGpuCastFromInt16(DataType dst_dtype); + +CastFunctorType GetGpuCastFromInt32(DataType dst_dtype); + +CastFunctorType GetGpuCastFromInt64(DataType dst_dtype); + +CastFunctorType GetGpuCastFromHalf(DataType dst_dtype); + +CastFunctorType GetGpuCastFromFloat(DataType dst_dtype); + +CastFunctorType GetGpuCastFromDouble(DataType dst_dtype); + +CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype); + +CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype); + +CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype); + +CastFunctorType GetGpuCastFromFloat8e5m2(DataType dst_dtype); + +CastFunctorType GetGpuCastFromFloat8e4m3fn(DataType dst_dtype); + +CastFunctorType GetGpuCastFromInt4(DataType dst_dtype); + +CastFunctorType GetGpuCastFromUint4(DataType dst_dtype); + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/checkpoint_callback_manager.h b/third_party/tflite-hdrs/tensorflow/core/kernels/checkpoint_callback_manager.h new file mode 100644 index 00000000..7e0d9d8f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/checkpoint_callback_manager.h @@ -0,0 +1,113 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0(the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ +#define TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ + +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace checkpoint { + +ABSL_CONST_INIT extern const absl::string_view + kCheckpointCallbackManagerResourceName; + +// StatusOr save_callback(absl::string_view checkpoint_id); +using SaveCallback = + std::function(absl::string_view)>; + +// Status restore_callback(absl::string_view checkpoint_id, +// absl::string_view content_from_checkpoint); +using RestoreCallback = + std::function; + +// A class to save and restore additional information for checkpointing. 
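// Hypothetical usage sketch for the CheckpointCallbackManager declared just
// below: register a save callback whose returned string is written alongside
// the checkpoint under the registered file extension. The extension name and
// callback body are invented for illustration.
#include <string>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/kernels/checkpoint_callback_manager.h"

absl::Status RegisterMetricsCallback(
    tensorflow::checkpoint::CheckpointCallbackManager& manager) {
  return manager.RegisterSaveCallback(
      /*file_extension=*/"metrics",
      [](absl::string_view checkpoint_id) -> absl::StatusOr<std::string> {
        // The returned content is stored as a small side file associated with
        // the checkpoint identified by `checkpoint_id`.
        return std::string("checkpoint=") + std::string(checkpoint_id);
      });
}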
+class CheckpointCallbackManager : public ResourceBase { + public: + CheckpointCallbackManager() = default; + + // Not copyable or movable + CheckpointCallbackManager(const CheckpointCallbackManager&) = delete; + CheckpointCallbackManager& operator=(const CheckpointCallbackManager&) = + delete; + + std::string DebugString() const override { + return "CheckpointCallbackManager"; + } + + // Infers a checkpoint id and directory from a prefix + // passed to SaveV2 / RestoreV2 Ops + static absl::StatusOr> + GetCheckpointIdAndPathFromPrefix(absl::string_view prefix); + + // Register a save callback. + // The passed callback will be triggered with an identified checkpoint id. + // The callback should return a string content needs to be stored + // as a part of a checkpoint, and then the content is stored as a file + // with the registered the file_extension. + absl::Status RegisterSaveCallback(absl::string_view file_extension, + SaveCallback callback); + + // Checks if a registered save callback exists for an extension. + bool DoesSaveCallbackExist(absl::string_view file_extension); + + // Register a restore callback. + // The passed file_extension is used to generate a file name together with + // an identified checkpoint_id. If the file exists, the registered callback + // is triggered with the content of the file. + absl::Status RegisterRestoreCallback(absl::string_view file_extension, + RestoreCallback callback); + + // Checks if a registered restore callback exists for an extension. + bool DoesRestoreCallbackExist(absl::string_view file_extension); + + // Should be triggered from SaveV2()::Compute(). + void Save(absl::string_view prefix); + + // Should be triggered from RestoreV2()::Compute(). + void Restore(absl::string_view prefix); + + private: + mutable mutex mu_; + + absl::flat_hash_map save_callbacks_ + TF_GUARDED_BY(mu_); + absl::flat_hash_map restore_callbacks_ + TF_GUARDED_BY(mu_); + + // Checkpoint save and restore could happen before save / restore callbacks + // are registered. The last checkpoint information is kept in these variables + // to trigger the registered callback lazily. + std::pair last_restored_checkpoint_id_and_dir_ + TF_GUARDED_BY(mu_); + + std::pair last_saved_checkpoint_id_and_dir_ + TF_GUARDED_BY(mu_); +}; + +} // namespace checkpoint +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl.h new file mode 100644 index 00000000..4fc4bebb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl.h @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_ +#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_ + +#include "tensorflow/core/framework/collective.h" + +namespace tensorflow { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class NcclBase : public CollectiveImplementationInterface { + public: + explicit NcclBase(CollectiveType type, const string& name); + ~NcclBase() override = default; + + // No-op for this collective implementation. + Status InitializeCollectiveParams(CollectiveParams* col_params) override; + + // Initializes the device objects and device localities. + Status InitializeCollectiveContext( + std::shared_ptr col_ctx) override; + + protected: + const CollectiveType type_; + const string name_; + std::shared_ptr col_ctx_; + const CollectiveParams* col_params_; // Not owned +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_all_to_all.h b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_all_to_all.h new file mode 100644 index 00000000..4ba624c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_all_to_all.h @@ -0,0 +1,35 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_ALL_TO_ALL_H_ +#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_ALL_TO_ALL_H_ + +#include "tensorflow/core/kernels/collective_nccl.h" + +namespace tensorflow { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class NcclAllToAll : public NcclBase { + public: + NcclAllToAll() : NcclBase(ALL_TO_ALL_COLLECTIVE, "NcclAllToAll") {} + ~NcclAllToAll() override = default; + + // Hands off all-to-all to NcclManager. + void Run(StatusCallback done) override; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_ALL_TO_ALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_broadcaster.h b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_broadcaster.h new file mode 100644 index 00000000..9c1f6f4a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_broadcaster.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_ +#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_ + +#include "tensorflow/core/kernels/collective_nccl.h" + +namespace tensorflow { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class NcclBroadcaster : public NcclBase { + public: + NcclBroadcaster() : NcclBase(BROADCAST_COLLECTIVE, "NcclBroadcast") {} + ~NcclBroadcaster() override = default; + + // Hands off broadcast to NcclManager. + void Run(StatusCallback done) override; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_gatherer.h b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_gatherer.h new file mode 100644 index 00000000..97d41f77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_gatherer.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_ +#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_ + +#include "tensorflow/core/kernels/collective_nccl.h" + +namespace tensorflow { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class NcclGatherer : public NcclBase { + public: + NcclGatherer() : NcclBase(GATHER_COLLECTIVE, "NcclGather") {} + ~NcclGatherer() override = default; + + // Hands off all-gather to NcclManager. + void Run(StatusCallback done) override; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_reducer.h b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_reducer.h new file mode 100644 index 00000000..b95d5720 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/collective_nccl_reducer.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_ +#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_ + +#include "tensorflow/core/kernels/collective_nccl.h" + +namespace tensorflow { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class NcclReducer : public NcclBase { + public: + NcclReducer() : NcclBase(REDUCTION_COLLECTIVE, "NcclReduce") {} + NcclReducer(CollectiveType type, const string& name) : NcclBase(type, name) {} + ~NcclReducer() override = default; + + // Hands off all reduce to NcclManager. + void Run(StatusCallback done) override; +}; + +class NcclReduceScatterer : public NcclReducer { + public: + NcclReduceScatterer() + : NcclReducer(REDUCE_SCATTER_COLLECTIVE, "NcclReduceScatter") {} + ~NcclReduceScatterer() override = default; + // Uses same Run() as NcclReducer. +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/composite_tensor_variant.h b/third_party/tflite-hdrs/tensorflow/core/kernels/composite_tensor_variant.h new file mode 100644 index 00000000..fa98f795 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/composite_tensor_variant.h @@ -0,0 +1,96 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_COMPOSITE_TENSOR_VARIANT_H_ +#define TENSORFLOW_CORE_KERNELS_COMPOSITE_TENSOR_VARIANT_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant_tensor_data.h" + +namespace tensorflow { + +class CompositeTensorVariantMetadata; + +// Encoding for a `tf.ExtensionType` value, that can be saved as a Variant. +// +// `tf.ExtensionType` (also known as `CompositeTensor`) is a Python base class +// used to Python types that are supported by TensorFlow APIs. Example +// ExtensionTypes include `tf.RaggedTensor` and `tf.SparseTensor`. +// +// `CompositeTensorVariant` decomposes the `ExtensionType` value into two +// parts: +// +// * `components`: A list of Tensors, which encodes the value's dynamic +// data -- i.e., data that may change for different executions of a graph. +// * `metadata`: A serialized TypeSpec, which encodes the value's +// static data -- i.e., data that is the same for all executions of a graph. +// +// CompositeTensorVariant can be stored in a Tensor with dtype=DT_VARIANT. +// Typically, extension type values are encoded with a scalar tensor containing +// a single CompositeTensorVariant value. 
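Editor's illustration: a minimal sketch of the encoding described above, wrapping an extension-type value's components and metadata into a scalar DT_VARIANT tensor via the class declared just below. Constructing CompositeTensorVariantMetadata is omitted (the type is only forward-declared here), the function name is hypothetical, and includes (e.g. the Variant header) are omitted.

// Illustration only, not part of the vendored header.
tensorflow::Tensor WrapAsCompositeVariant(
    const tensorflow::CompositeTensorVariantMetadata& metadata,
    absl::Span<tensorflow::Tensor> flat_components) {
  tensorflow::CompositeTensorVariant value(metadata, flat_components);
  tensorflow::Tensor result(tensorflow::DT_VARIANT,
                            tensorflow::TensorShape({}));
  result.scalar<tensorflow::Variant>()() = value;  // copies the wrapper
  return result;
}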
+class CompositeTensorVariant { + public: + CompositeTensorVariant(const CompositeTensorVariantMetadata& metadata, + absl::Span flat_components); + + CompositeTensorVariant(); + CompositeTensorVariant(const CompositeTensorVariant& other); + CompositeTensorVariant& operator=(CompositeTensorVariant&& other) = default; + CompositeTensorVariant& operator=(const CompositeTensorVariant& other) = + delete; + + // Returns the list of Tensor components that encode this value's dynamic + // data. + absl::Span flat_components() const { + return absl::MakeConstSpan(flat_components_); + } + + // Returns the serialized TypeSpec that encodes the value's static data. + const CompositeTensorVariantMetadata& metadata() const { return *metadata_; } + + // Variant methods. + string TypeName() const { return kTypeName; } + + // Updates `VariantTensorData` with an encoding for this value. + void Encode(VariantTensorData* data) const; + + // Updates this value to match the encoding in a given `VariantTensorData`. + bool Decode(const VariantTensorData& data); + + // Returns a string summary for this value. + string DebugString() const; + + // Name of this type (used for variant serialization). + static constexpr const char kTypeName[] = "CompositeTensorVariant"; + + private: + // Tensor components for this value. + std::vector flat_components_; + + // TypeSpec for this value. CompositeTensorVariantMetadata is a thin wrapper + // around a TypeSpecProto, which is used to retain flexibility to change the + // variant encoding. + // + // Note: we use a unique_ptr, because header files in the kernels/ directory + // are not allowed to import .pb.h files. + std::unique_ptr metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_COMPOSITE_TENSOR_VARIANT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib.h b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib.h new file mode 100644 index 00000000..ca30908c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib.h @@ -0,0 +1,75 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_ +#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +// Functors to concatenate tensors. These always take a rank-2 tensor (i.e a +// matrix) and concatenate it along the axis 1 ("putting them next to each +// other" as opposed to "putting them on top of one another"). +// +// Any concatenation of n-dimensional tensors across any axis can be reduced to +// a concatenation of two-dimensional tensors across the axis 1 by first +// partitioning the axes of the original tensors into those less than the axis +// to be concatenated across and the rest. 
Then reshape the tensors into a +// two-dimensional tensor by collapsing these two sets of axes and concatenate +// the resulting matrices across the axis 1, finally reshaping the result to +// have the proper shape. +// +// So, for example, when stacking N tensors, reshape each to have shape +// {1, Numelements} and reshape the result matrix to have shape +// {1, N * NumElements} before passing it to this functor. + +// Assumes all elements of inputs are nonempty. +// Assumes output is nonempty. +template +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +template +void ConcatGPU( + OpKernelContext* c, + const std::vector::ConstMatrix>>& + inputs_flat, + Tensor* output, typename TTypes::Tensor* output_flat); + +// Explicit instantiations in concat_lib_gpu.cc. +#define REGISTER(T) \ + extern template void ConcatGPU( \ + OpKernelContext * c, \ + const std::vector::ConstMatrix>>& \ + inputs_flat, \ + Tensor* output, typename TTypes::Tensor* output_flat); + +TF_CALL_INTEGRAL_TYPES(REGISTER); // int32 Needed for TensorLists. +TF_CALL_GPU_ALL_TYPES(REGISTER); +#undef REGISTER +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_cpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_cpu.h new file mode 100644 index 00000000..45960772 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_cpu.h @@ -0,0 +1,135 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_ +#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_ + +#define EIGEN_USE_THREADS + +#include +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +// ElementCopier must be a struct with a single Copy function, which is passed +// the output pointer, input pointer, input index, and number of elements to +// copy from input to output. +template +void ConcatCPUImpl( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + int64_t cost_per_unit, ElementCopier copier, + typename TTypes::Matrix* output) { + size_t num_inputs = inputs.size(); + + std::vector sizes; + sizes.reserve(num_inputs); + int64_t row_size = 0; + for (const auto& input : inputs) { + sizes.push_back(input->dimension(1)); + row_size += sizes.back(); + } + + // cost_per_unit is estimated bytes to copy per output array element (for + // strings this includes an estimate of the number of bytes of the actual + // string data, as well). 
+ const int64_t estimated_total_cost = output->size() * cost_per_unit; + + auto worker_threads = d->tensorflow_cpu_worker_threads(); + int num_threads = std::min(4, worker_threads->num_threads); + num_threads = static_cast( + std::min(num_threads, estimated_total_cost / 16384)); + // Single threaded mode. + // TODO(dga): Deduplicate this code w.r.t. sharded code below. + if (num_threads == 0) { + T* out = &(*output)(0, 0); + std::vector inp; + inp.reserve(num_inputs); + for (const auto& input : inputs) { + inp.push_back(&(*input)(0, 0)); + } + const int64_t dim0 = output->dimension(0); + for (int64_t i = 0; i < dim0; ++i) { + for (int64_t j = 0; j < num_inputs; ++j) { + auto size = sizes[j]; + copier.Copy(out, inp[j], j, size); + out += size; + inp[j] += size; + } + } + return; + } + + // Sharded mode. + auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( + int64_t start, int64_t end) { + int64_t skipped_rows = start / row_size; + T* out = output->data() + skipped_rows * row_size; + T* out_start = output->data() + start; + T* out_end = output->data() + end; + + // Handle partial row at start + if (out < out_start) { + for (size_t j = 0; j < num_inputs; ++j) { + ptrdiff_t size = sizes[j]; + ptrdiff_t offset = out_start - out; + if (size <= offset) { + out += size; + continue; + } + const T* inp = &(*inputs[j])(skipped_rows, 0); + if (offset > 0) { + out += offset; + inp += offset; + size -= offset; + } + size = std::min(size, out_end - out); + if (size <= 0) break; + copier.Copy(out, inp, j, size); + out += size; + } + ++skipped_rows; + } + if (out == out_end) return; + CHECK(out >= out_start); + CHECK(out < out_end); + + // Copy remaining data. + std::vector inp; + inp.reserve(num_inputs); + for (const auto& input : inputs) { + inp.push_back(&(*input)(skipped_rows, 0)); + } + const int64_t dim0 = output->dimension(0); + for (int64_t i = skipped_rows; i < dim0; ++i) { + for (int64_t j = 0; j < num_inputs; ++j) { + ptrdiff_t size = std::min(sizes[j], out_end - out); + copier.Copy(out, inp[j], j, size); + out += size; + inp[j] += size; + if (out == out_end) return; + } + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, output->size(), + cost_per_unit, work); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_gpu.h new file mode 100644 index 00000000..8e42cc1c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/concat_lib_gpu.h @@ -0,0 +1,75 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
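Editor's illustration: ConcatCPUImpl above documents the ElementCopier contract only in a comment (a single Copy function taking the output pointer, input pointer, input index, and element count). A sketch of a copier satisfying that contract for trivially copyable element types follows; it is not TensorFlow code.

#include <cstring>

// Illustration only: matches the Copy(output pointer, input pointer,
// input index, element count) shape described above.
struct TrivialCopier {
  template <typename T>
  void Copy(T* dst, const T* src, int input_index, size_t n) {
    (void)input_index;  // unused here; a string copier might need it
    std::memcpy(dst, src, n * sizeof(T));
  }
};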
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_ + +#define EIGEN_USE_THREADS +#define EIGEN_USE_GPU + +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/kernels/gpu_device_array_gpu.h" + +namespace tensorflow { + +template +void ConcatGPUSlice( + const Eigen::GpuDevice& gpu_device, + const std::vector::ConstMatrix>>& + inputs_flat, + typename TTypes::Matrix* output); + +template +void ConcatGPUImpl(const Eigen::GpuDevice& d, + const GpuDeviceArrayStruct& input_ptrs, + const GpuDeviceArrayStruct& ptr_offsets, + bool same_size, int slice_size, + typename TTypes::Matrix* output); + +// Explicit instantiations in concat_lib_gpu_impl.cu.cc. +#define REGISTER(T) \ + extern template void ConcatGPUSlice( \ + const Eigen::GpuDevice& gpu_device, \ + const std::vector::ConstMatrix>>& \ + inputs_flat, \ + typename TTypes::Matrix* output); \ + extern template void ConcatGPUSlice( \ + const Eigen::GpuDevice& gpu_device, \ + const std::vector::ConstMatrix>>& \ + inputs_flat, \ + typename TTypes::Matrix* output); \ + extern template void ConcatGPUImpl( \ + const Eigen::GpuDevice& d, \ + const GpuDeviceArrayStruct& input_ptrs, \ + const GpuDeviceArrayStruct& ptr_offsets, bool fixed_size, \ + int split_size, typename TTypes::Matrix* output); \ + extern template void ConcatGPUImpl( \ + const Eigen::GpuDevice& d, \ + const GpuDeviceArrayStruct& input_ptrs, \ + const GpuDeviceArrayStruct& ptr_offsets, bool fixed_size, \ + int split_size, typename TTypes::Matrix* output); + +TF_CALL_INTEGRAL_TYPES(REGISTER); // int32 Needed for TensorLists. +TF_CALL_GPU_ALL_TYPES(REGISTER); +#undef REGISTER + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator.h new file mode 100644 index 00000000..d2578a55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator.h @@ -0,0 +1,136 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_ +#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_ + +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/typed_conditional_accumulator_base.h" + +namespace tensorflow { + +/** + * An aggregation object for adding dense gradients. + * + * The two main methods of this class are TryApplyGrad and TryTakeGrad. + * + * TryApplyGrad tries add a gradient to the accumulator. 
The attempt is + * successful if local_step >= global_step, i.e., if the gradient is not stale, + * having been computed using up-to-date information. Otherwise, the gradient is + * silently dropped. + * + * TryTakeGrad logs an attempt to read the average gradient. The attempt is + * blocked until the number of gradients accumulated (via TryApplyGrad) is equal + * or exceeds the number requested by TryTakeGrad. + * Once this condition is satisfied, the following actions are taken: + * (1) the value of the average gradient is returned + * (2) the count of accumulated gradients is reset to 0 + * (3) the internal global_step value (current_global_step_) is incremented by 1 + * + * ConditionalAccumulator is the datatype-dependent templated sub-class of + * ConditionalAccumulatorBase. It implements the virtual arithmetic methods that + * are used by for aggregating, averaging, allocating, returning dense Tensors. + */ +template +class ConditionalAccumulator + : public TypedConditionalAccumulatorBase { + public: + // Args: + // dtype: The datatype of the gradients to be accumulated. + // shape: The shape of the accumulated gradients. + // name: A name to use for the ConditionalAccumulator. + // reduction_type: The reduction type, i.e., MEAN or SUM + ConditionalAccumulator(const DataType& dtype, const PartialTensorShape& shape, + const string& name, const string& reduction_type) + : TypedConditionalAccumulatorBase(dtype, shape, name, + reduction_type) {} + ~ConditionalAccumulator() override{}; + + protected: + // accum_grad is the tensor that holds the aggregate gradient. + // It is initialized the first time ApplyGrad is called. + Tensor accum_grad_; + + functor::SetZeroFunctor set_zero_functor_; + + absl::Status ValidateShape(const Tensor* tensor) + TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + // Must be compatible with accumulated gradient if available + if (counter_ > 0) { + if (!accum_grad_.shape().IsSameSize(tensor->shape())) { + return errors::InvalidArgument("Shape mismatch: expected ", + accum_grad_.shape().DebugString(), + ", got ", tensor->shape().DebugString()); + } + } + // Must also be compatible with given shape + if (!shape_.IsCompatibleWith(tensor->shape())) { + return errors::InvalidArgument("Shape mismatch: expected ", + shape_.DebugString(), ", got ", + tensor->shape().DebugString()); + } + return absl::OkStatus(); + } + + void AllocateAndAssignToAccumGradFunction(OpKernelContext* ctx, + const Tensor* grad) override { + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! 
+ ctx->allocate_temp(dtype_, grad->shape(), &accum_grad_).IgnoreError(); + accum_grad_.flat().device(ctx->template eigen_device()) = + grad->flat(); + } + + void AddToAccumGradFunction(OpKernelContext* ctx, + const Tensor* grad) override { + accum_grad_.flat().device(ctx->template eigen_device()) += + grad->flat(); + } + + void DivideAccumGradByCounter(OpKernelContext* ctx) override + TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + Tensor c(DataTypeToEnum::value, {}); + c.scalar()() = TypeConverter::ConvertUToT(this->counter_); + this->accum_grad_.template flat().device( + ctx->template eigen_device()) = + this->accum_grad_.template flat() / c.scalar()(); + } + + bool SetOutput(OpKernelContext* ctx) override { + ctx->set_output(0, accum_grad_); + return true; + } + + bool GetAndValidateTensorInputForApplyGrad(OpKernelContext* ctx, + const Tensor** tensor) override + TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + // Get input gradient tensor + const Tensor* grad_tensor; + OP_REQUIRES_OK_BOOLEAN(ctx, ctx->input("gradient", &grad_tensor)); + *tensor = grad_tensor; + OP_REQUIRES_OK_BOOLEAN(ctx, this->ValidateShape(*tensor)); + return true; + } + + void CleanUpGradTensor(const Tensor* tensor) override { + // do nothing + } + + ConditionalAccumulator(const ConditionalAccumulator&) = delete; + void operator=(const ConditionalAccumulator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base.h new file mode 100644 index 00000000..683e667e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base.h @@ -0,0 +1,201 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_ +#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_op.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +/** + * ConditionalAccumulator/ConditionalAccumulatorBase implements an aggregation + * object for adding gradients. + * The two main methods of this class are TryApplyGrad and TryTakeGrad. + * + * TryApplyGrad tries add a gradient to the accumulator. The attempt is + * successful if local_step >= global_step, i.e., if the gradient is not stale, + * having been computed using up-to-date information. Otherwise, the gradient is + * silently dropped. + * + * TryTakeGrad logs an attempt to read the average gradient. The attempt is + * blocked until the number of gradients accumulated (via TryApplyGrad) is equal + * or exceeds the number requested by TryTakeGrad. 
+ * Once this condition is satisfied, the following actions are taken: + * (1) the value of the average gradient is returned + * (2) the count of accumulated gradients is reset to 0 + * (3) the internal global_step value (current_global_step_) is incremented by 1 + */ +class ConditionalAccumulatorBase : public ResourceBase { + public: + // Args: + // dtype: The datatype of the gradients to be accumulated. + // shape: The shape of the accumulated gradients. + // name: A name to use for the ConditionalAccumulator. + ConditionalAccumulatorBase(const DataType& dtype, + const PartialTensorShape& shape, + const string& name, const string& reduction_type); + + typedef AsyncOpKernel::DoneCallback DoneCallback; + + virtual void TryApplyGrad(int64_t local_step, OpKernelContext* ctx) = 0; + void TryTakeGrad(int num_required, OpKernelContext* ctx, + DoneCallback callback); + + // Accessor methods + uint32 num_accumulated() { + mutex_lock lock(mu_); + return counter_; + } + + const DataType& dtype() const { return dtype_; } + + string DebugString() const override { return "A conditional accumulator"; } + + // SetGlobalStep is a modifier method for current_global_step. + // It returns an InvalidArgument error if the new_global_step is less than + // current_global_step. + absl::Status SetGlobalStep(int64_t new_global_step); + + absl::Status MatchesNodeDef(const NodeDef& node_def); + + protected: + // Virtual methods to be implemented by sub-classes for different datatypes. + // Implements arithmetic operations specific to datatype. + virtual void DivideAccumGradByCounter(OpKernelContext* ctx) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) = 0; + virtual bool SetOutput(OpKernelContext* ctx) = 0; + + enum RunResult { kNoProgress, kComplete }; + + // Helper struct holding information about a TakeGrad attempt + struct Attempt; + typedef std::function RunCallback; + struct Attempt { + int elements_requested; + DoneCallback done_callback; // must be run outside mu_ + OpKernelContext* context; + CancellationManager* cancellation_manager; // not owned + CancellationToken cancellation_token; + RunCallback run_callback; // must be run while holding mu_ + bool is_cancelled; + + Attempt(int elements_requested, DoneCallback done_callback, + OpKernelContext* context, CancellationManager* cancellation_manager, + CancellationToken cancellation_token, RunCallback run_callback) + : elements_requested(elements_requested), + done_callback(std::move(done_callback)), + context(context), + cancellation_manager(cancellation_manager), + cancellation_token(cancellation_token), + run_callback(std::move(run_callback)), + is_cancelled(false) {} + }; + + // Helper struct for deregistration of a cancellation token and executing a + // DoneCallback after a TakeGrad attempt is complete. + struct CleanUp { + CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm) + : finished(f), to_deregister(ct), cm(cm) {} + DoneCallback finished; + CancellationToken to_deregister; + CancellationManager* cm; + }; + + // Fields + + const DataType dtype_; + const PartialTensorShape shape_; + const string name_; + const string reduction_type_; + mutex mu_; + int counter_ TF_GUARDED_BY(mu_); + int64_t current_global_step_ TF_GUARDED_BY(mu_); + + std::deque takegrad_attempts_ TF_GUARDED_BY(mu_); + + // Methods + + // Helper function for creating cancellation callback + void Cancel(CancellationManager* cancellation_manager, + CancellationToken token); + + // Helper functions to process TakeGrad attempts. 
+  // FlushUnlocked is called at the end of each TryApplyGrad and TryTakeGrad
+  // call to try to clear the TakeGrad attempts. This in turn calls
+  // TryAttemptLocked, which then executes the RunCallback of the logged
+  // attempts.
+  // Both functions are modeled after core/kernels/queue_base.
+  // Note: ApplyGrad attempts never block -- unlike in a queue with limited
+  // capacity, we can always add the newest gradient to our accumulator
+  // (if it is not stale) or drop it silently (if it is stale).
+  void FlushUnlocked();
+  bool TryAttemptLocked(std::vector<CleanUp>* clean_up)
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Helper methods
+  // void DeepCopy(Tensor* dst);
+  bool TakeGradLockedHelper(OpKernelContext* ctx, DoneCallback callback)
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+};
+
+/*
+ * Modifications to convenience macros defined in core/framework/op_kernel.h.
+ * The below macros return a boolean if the test fails, so that the calling
+ * function can get an indication that a failure has occurred.
+ */
+#define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS)          \
+  do {                                                 \
+    if (!TF_PREDICT_TRUE(EXP)) {                       \
+      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
+      return false;                                    \
+    }                                                  \
+  } while (0)
+
+#define OP_REQUIRES_OK_BOOLEAN(CTX, STATUS)                 \
+  do {                                                      \
+    ::tensorflow::Status _s(STATUS);                        \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return false;                                         \
+    }                                                       \
+  } while (0)
+
+/*
+ * Convenience classes for helping to convert between numeric types.
+ * The specialization for Eigen::half here simplifies specialization of
+ * ConditionalAccumulator classes later.
+ */
+template <typename T, typename U>
+class TypeConverter {
+ public:
+  static T ConvertUToT(U c) { return c; /* implicit conversion */ }
+};
+
+template <typename U>
+class TypeConverter<Eigen::half, U> {
+ public:
+  static Eigen::half ConvertUToT(U c) { return static_cast<Eigen::half>(c); }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base_op.h
new file mode 100644
index 00000000..c0d1c9a6
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conditional_accumulator_base_op.h
@@ -0,0 +1,262 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
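Editor's illustration: to make the accumulate/take protocol described in the ConditionalAccumulatorBase comments concrete, here is a deliberately simplified, single-threaded sketch. It is not TensorFlow code; the real implementation is asynchronous, lock-protected, and operates on Tensors rather than doubles.

#include <cstdint>

// Illustration only: the staleness rule and the three actions taken when a
// TakeGrad request is satisfied (return average, reset count, bump step).
struct ToyAccumulator {
  int64_t global_step = 0;
  int count = 0;
  double sum = 0.0;

  void ApplyGrad(int64_t local_step, double grad) {
    if (local_step >= global_step) {  // stale gradients are dropped silently
      sum += grad;
      ++count;
    }
  }

  // Returns true and outputs the average once num_required gradients have
  // been accumulated; the real op blocks asynchronously instead of polling.
  bool TryTakeGrad(int num_required, double* avg) {
    if (count < num_required) return false;
    *avg = sum / count;  // (1) return the average gradient
    count = 0;           // (2) reset the count of accumulated gradients
    sum = 0.0;
    ++global_step;       // (3) increment the internal global step
    return true;
  }
};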
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/conditional_accumulator_base.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +typedef Eigen::ThreadPoolDevice CPUDevice; + +typedef std::function DoneCallback; + +namespace tensorflow { + +/** + * Defines a ConditionalAccumulatorBaseOp, which constructs a + * ConditionalAccumulatorBase (via sub-class's Creator) and returns its handle. + */ +class ConditionalAccumulatorBaseOp : public OpKernel { + public: + explicit ConditionalAccumulatorBaseOp(OpKernelConstruction* context) + : OpKernel(context), accumulator_set_(false) { + OP_REQUIRES_OK(context, context->allocate_temp(DT_STRING, TensorShape({2}), + &accumulator_)); + OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_)); + OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(context, + context->GetAttr("reduction_type", &reduction_type_)); + } + + void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + if (!accumulator_set_) { + OP_REQUIRES_OK(ctx, SetAccumulatorHandle(ctx)); + } + SetHandleToOutput(ctx); + } + + protected: + ~ConditionalAccumulatorBaseOp() override { + // If the accumulator object was not shared, delete it. + if (accumulator_set_ && cinfo_.resource_is_private_to_kernel()) { + TF_CHECK_OK((cinfo_.resource_manager() + ->template Delete( + cinfo_.container(), cinfo_.name()))); + } + } + + protected: + virtual void SetHandleToOutput(OpKernelContext* ctx) + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + + virtual absl::Status CheckSignature(OpKernelContext* ctx) = 0; + + protected: + typedef std::function Creator; + + // Subclasses must override this + virtual Creator GetCreator() const = 0; + + // Variables required to construct ConditionalAccumulator + DataType dtype_; + PartialTensorShape shape_; + ContainerInfo cinfo_; + string reduction_type_; + mutex mu_; + Tensor accumulator_ TF_GUARDED_BY(mu_); + bool accumulator_set_ TF_GUARDED_BY(mu_); + + private: + absl::Status SetAccumulatorHandle(OpKernelContext* ctx) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def())); + + // Check input signature + TF_RETURN_IF_ERROR(CheckSignature(ctx)); + + Creator creator = GetCreator(); + ConditionalAccumulatorBase* accumulator; + TF_RETURN_IF_ERROR( + (cinfo_.resource_manager() + ->template LookupOrCreate( + cinfo_.container(), cinfo_.name(), &accumulator, creator))); + core::ScopedUnref unref_me(accumulator); + + // Verify that the shared accumulator is compatible + // with the requested arguments. 
+ TF_RETURN_IF_ERROR(accumulator->MatchesNodeDef(def())); + auto h = accumulator_.template flat(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + accumulator_set_ = true; + return absl::OkStatus(); + } +}; + +// ------------------Sync kernels ------------------------------------------ + +/** + * General OpKernel for ConditionalAccumulatorBase-related ops. + */ +class ConditionalAccumulatorBaseSyncOpKernel : public OpKernel { + public: + explicit ConditionalAccumulatorBaseSyncOpKernel(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) final { + ConditionalAccumulatorBase* accumulator; + OP_REQUIRES_OK(ctx, GetResourceFromContext(ctx, "handle", &accumulator)); + Compute(ctx, accumulator); + accumulator->Unref(); + } + + protected: + virtual void Compute(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator) = 0; + + virtual DataTypeVector GetExpectedInputs( + ConditionalAccumulatorBase* accumulator) = 0; + + virtual void CheckSignature(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator) { + // Check input signature + DataTypeVector expected_inputs = GetExpectedInputs(accumulator); + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + } +}; + +/** + * Defines a AccumulateGradientOp, the execution of which adds a gradient to the + * given ConditionalAccumulator. + */ +class ConditionalAccumulatorBaseApplyGradientOp + : public ConditionalAccumulatorBaseSyncOpKernel { + public: + explicit ConditionalAccumulatorBaseApplyGradientOp( + OpKernelConstruction* context) + : ConditionalAccumulatorBaseSyncOpKernel(context) {} + + protected: + void Compute(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator) override { + // Check input signature + CheckSignature(ctx, accumulator); + + // Get input local_step + const Tensor* local_step_tensor; + OP_REQUIRES_OK(ctx, ctx->input("local_step", &local_step_tensor)); + if (!TensorShapeUtils::IsScalar(local_step_tensor->shape())) { + ctx->CtxFailureWithWarning(errors::InvalidArgument( + "Argument local_step must be scalar, but had bad shape ", + local_step_tensor->shape().DebugString())); + } + + // Actually try to apply gradient now + accumulator->TryApplyGrad(local_step_tensor->scalar()(), ctx); + } +}; + +// -------------------- Async kernels -------------------------------------- +/** + * General OpKernel for ConditionalAccumulatorBase-related ops. 
+ */ +class ConditionalAccumulatorBaseAsyncOpKernel : public AsyncOpKernel { + public: + explicit ConditionalAccumulatorBaseAsyncOpKernel( + OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback callback) final { + ConditionalAccumulatorBase* accumulator; + OP_REQUIRES_OK_ASYNC( + ctx, GetResourceFromContext(ctx, "handle", &accumulator), callback); + ComputeAsync(ctx, accumulator, [callback, accumulator]() { + accumulator->Unref(); + callback(); + }); + } + + protected: + virtual void ComputeAsync(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator, + DoneCallback callback) = 0; + + virtual DataTypeVector GetExpectedInputs( + ConditionalAccumulatorBase* accumulator) = 0; + + virtual void CheckSignature(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator, + DoneCallback callback) { + // Check input signature + OP_REQUIRES_OK_ASYNC(ctx, + ctx->MatchSignature(GetExpectedInputs(accumulator), + {accumulator->dtype()}), + callback); + } +}; + +/** + * Defines a TakeAccumulatedGradientOp, the execution of which adds a gradient + * to the given ConditionalAccumulator. + */ +class ConditionalAccumulatorBaseTakeGradientOp + : public ConditionalAccumulatorBaseAsyncOpKernel { + public: + explicit ConditionalAccumulatorBaseTakeGradientOp( + OpKernelConstruction* context) + : ConditionalAccumulatorBaseAsyncOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, + ConditionalAccumulatorBase* accumulator, + DoneCallback callback) override { + // Check signature + CheckSignature(ctx, accumulator, callback); + + // Get input num_required + const Tensor* num_required_tensor; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input("num_required", &num_required_tensor), + callback); + if (!TensorShapeUtils::IsScalar(num_required_tensor->shape())) { + ctx->CtxFailureWithWarning(errors::InvalidArgument( + "Argument num_required must be scalar, but had bad shape ", + num_required_tensor->shape().DebugString())); + callback(); + } + + // Actually try to take gradient now + accumulator->TryTakeGrad(num_required_tensor->scalar()(), ctx, + callback); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/constant_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/constant_op.h new file mode 100644 index 00000000..32f1ddb7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/constant_op.h @@ -0,0 +1,52 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// ConstantOp returns a tensor specified by ConstantOpDef. +class ConstantOp : public OpKernel { + public: + explicit ConstantOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override { return false; } + const Tensor* const_tensor() const override { return &tensor_; }; + ~ConstantOp() override; + + private: + Tensor tensor_; + ConstantOp(const ConstantOp&) = delete; + void operator=(const ConstantOp&) = delete; +}; + +class PlaceholderOp : public OpKernel { + public: + explicit PlaceholderOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + PartialTensorShape expected_shape_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/control_flow_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/control_flow_ops.h new file mode 100644 index 00000000..13869317 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/control_flow_ops.h @@ -0,0 +1,140 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +// A ControlTriggerOp is similar to a NoOp. However, it always treats the input +// control edges as Live edges. Its primary use so far is in the scheduling of +// recvs, where we add ControlTrigger nodes and use them to trigger recvs. We +// allow ControlTrigger nodes to be enabled by dead nodes. +class ControlTriggerOp : public OpKernel { + public: + explicit ControlTriggerOp(OpKernelConstruction* context) + : OpKernel(context) {} + void Compute(OpKernelContext* context) override {} + bool IsExpensive() override { return false; } +}; + +// A switch op has two inputs and two outputs. It forwards the value of +// Input:0 to the output specified by input:1. Input:1 is a boolean tensor. +// Input:0 is forwarded to output:0 if input:1 is false, otherwise to +// output:1. +class SwitchOp : public OpKernel { + public: + explicit SwitchOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~SwitchOp() override {} + + SwitchOp(const SwitchOp&) = delete; + void operator=(const SwitchOp&) = delete; +}; + +// An n-way switch op has two inputs and N outputs. 
It forwards the value of +// Input:0 to the output specified by Input:1. Input:1 is an integer tensor. +// Input:0 is forwarded to output:0 if Input:1 is 0, to output:1 if 1, and so +// forth. If Input:1 is <0 or >=num_outputs(), Input:0 is forwarded to +// output:num_outputs()-1. +class SwitchNOp : public OpKernel { + public: + explicit SwitchNOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~SwitchNOp() override {} + + SwitchNOp(const SwitchNOp&) = delete; + void operator=(const SwitchNOp&) = delete; +}; + +// A merge op has n inputs and two outputs. It forwards the value of the +// first input that becomes available to its first output, and the +// index of the first input to its second output. +class MergeOp : public OpKernel { + public: + explicit MergeOp(OpKernelConstruction* context); + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~MergeOp() override {} + + MergeOp(const MergeOp&) = delete; + void operator=(const MergeOp&) = delete; +}; + +// An enter op has one input and one output. It creates or finds +// the child frame that is uniquely identified by the frame_name, +// and makes its input available to the child frame. +class EnterOp : public OpKernel { + public: + explicit EnterOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~EnterOp() override {} + + EnterOp(const EnterOp&) = delete; + void operator=(const EnterOp&) = delete; +}; + +// An exit op has one input and one output. It exits the current +// frame to its parent frame, and makes its input available to the +// parent frame. +class ExitOp : public OpKernel { + public: + explicit ExitOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~ExitOp() override {} + + ExitOp(const ExitOp&) = delete; + void operator=(const ExitOp&) = delete; +}; + +// A next_iteration op has one input and one output. It makes its input +// available to the next iteration. +class NextIterationOp : public OpKernel { + public: + explicit NextIterationOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~NextIterationOp() override {} + + NextIterationOp(const NextIterationOp&) = delete; + void operator=(const NextIterationOp&) = delete; +}; + +// A LoopCond op has one input and one output. The input is a boolean +// scalar representing the taken branches of the "pivot" Switch that +// determines loop termination. As a contract, any high-level front-end +// should always use port '0' of the "pivot" switches for loop exit. 
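Editor's illustration: the SwitchN forwarding rule described earlier in this header (out-of-range selector values are routed to the last output) summarized as a small helper. This is not part of the TensorFlow sources.

// Illustration only: which output port receives Input:0 for a given value of
// the integer selector Input:1, per the SwitchNOp comment above.
int SwitchNOutputPort(int selector, int num_outputs) {
  if (selector < 0 || selector >= num_outputs) return num_outputs - 1;
  return selector;
}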
+class LoopCondOp : public OpKernel { + public: + explicit LoopCondOp(OpKernelConstruction* context); + ~LoopCondOp() override; + + void Compute(OpKernelContext* context) override; + + bool IsExpensive() override; + + LoopCondOp(const LoopCondOp&) = delete; + void operator=(const LoopCondOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d.h new file mode 100644 index 00000000..1ddeec23 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d.h @@ -0,0 +1,585 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_2D_H_ + +#include "absl/strings/string_view.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h" +#include "tensorflow/core/util/tensor_format.h" + +// Returns true if TF_CONV2D_USE_FP16_ACCUMULATE == 1, false otherwise. +static bool Conv2dUseFp16Accumulate() { + static bool use_fp16_accumulate = []() { + const char* env = std::getenv("TF_CONV2D_USE_FP16_ACCUMULATE"); + return (env != nullptr) && (absl::string_view(env) == "1"); + }(); + return use_fp16_accumulate; +} + +namespace tensorflow { +namespace functor { + +template +void SpatialConvolutionFunc(const Device& d, Output output, Input input, + Filter filter, int row_stride, int col_stride, + int row_dilation, int col_dilation, + const Eigen::PaddingType& padding, + const OutputKernel& output_kernel, + int padding_top = 0, int padding_bottom = 0, + int padding_left = 0, int padding_right = 0) { + // Need to swap row/col, padding_top/padding_left, and + // padding_bottom/padding_right when calling Eigen. Eigen expects the tensor + // in NWHC format, but the tensor given is in NHWC. + output.device(d) = Eigen::SpatialConvolution( + input, filter, col_stride, row_stride, padding, col_dilation, + row_dilation, output_kernel, padding_left, padding_right, padding_top, + padding_bottom); +} + +// TODO(ezhulenev): Non-templated `operator()` are required by explicit template +// instantiations for the GPU device. However they are almost certainly not used +// in any of the kernel implementation. Check if they can be removed. 
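Editor's illustration: Conv2dUseFp16Accumulate() above reads TF_CONV2D_USE_FP16_ACCUMULATE once and caches it in a function-local static, so opting in to fp16 accumulation has to happen before the first half-precision convolution runs. A minimal sketch, assuming a POSIX environment and an otherwise unspecified TensorFlow program:

#include <cstdlib>

int main() {
  // Must be set before the first Conv2D with Eigen::half executes, because
  // the flag is read once and cached in a static initializer (see above).
  setenv("TF_CONV2D_USE_FP16_ACCUMULATE", "1", /*overwrite=*/1);
  // ... construct and run the TensorFlow program as usual ...
  return 0;
}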
+template +struct SpatialConvolution { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, int row_stride, + int col_stride, int row_dilation, int col_dilation, + const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride, + row_dilation, col_dilation, padding, output_kernel); + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride, + row_dilation, col_dilation, padding, output_kernel); + } + + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, int row_stride, + int col_stride, int row_dilation, int col_dilation, + int padding_top, int padding_bottom, int padding_left, + int padding_right, + const OutputKernel& output_kernel = OutputKernel()) { + SpatialConvolutionFunc( + d, output, input, filter, row_stride, col_stride, row_dilation, + col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel, + padding_top, padding_bottom, padding_left, padding_right); + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, int padding_top, int padding_bottom, + int padding_left, int padding_right, + const OutputKernel& output_kernel = OutputKernel()) { + SpatialConvolutionFunc( + d, output, input, filter, row_stride, col_stride, row_dilation, + col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel, + padding_top, padding_bottom, padding_left, padding_right); + } +}; + +template +struct SpatialConvolution { + void operator()(const Device& d, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + if (Conv2dUseFp16Accumulate()) { + output.device(d) = Eigen::SpatialConvolution( + input, filter, col_stride, row_stride, padding, col_dilation, + row_dilation, output_kernel); + } else { + output.device(d) = + Eigen::SpatialConvolution(input.cast(), filter.cast(), + col_stride, row_stride, padding, + col_dilation, row_dilation, output_kernel) + .template cast(); + } + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + if (Conv2dUseFp16Accumulate()) { + output.device(d) = Eigen::SpatialConvolution( + input, filter, col_stride, row_stride, padding, col_dilation, + row_dilation, output_kernel); + } else { + output.device(d) = + Eigen::SpatialConvolution(input.template cast(), + filter.template cast(), col_stride, + row_stride, padding, col_dilation, + row_dilation, output_kernel) + .template cast(); + } + } + + void operator()(const Device& d, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + int row_stride, int col_stride, int row_dilation, + int 
col_dilation, int padding_top, int padding_bottom, + int padding_left, int padding_right, + const OutputKernel& output_kernel = OutputKernel()) { + if (Conv2dUseFp16Accumulate()) { + output.device(d) = Eigen::SpatialConvolution( + input, filter, col_stride, row_stride, + Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation, + output_kernel, padding_left, padding_right, padding_top, + padding_bottom); + } else { + output.device(d) = + Eigen::SpatialConvolution( + input.cast(), filter.cast(), col_stride, row_stride, + Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation, + output_kernel, padding_left, padding_right, padding_top, + padding_bottom) + .template cast(); + } + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, int padding_top, int padding_bottom, + int padding_left, int padding_right, + const OutputKernel& output_kernel = OutputKernel()) { + if (Conv2dUseFp16Accumulate()) { + output.device(d) = Eigen::SpatialConvolution( + input, filter, col_stride, row_stride, + Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation, + output_kernel, padding_left, padding_right, padding_top, + padding_bottom); + } else { + output.device(d) = + Eigen::SpatialConvolution( + input.template cast(), filter.template cast(), + col_stride, row_stride, Eigen::PaddingType::PADDING_VALID, + col_dilation, row_dilation, output_kernel, padding_left, + padding_right, padding_top, padding_bottom) + .template cast(); + } + } +}; + +// Use float32 accumulation for bfloat16 to deal with precision accumulation +// issues. +template +struct SpatialConvolution { + void operator()(const Device& d, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + output.device(d) = + Eigen::SpatialConvolution(input.cast(), filter.cast(), + col_stride, row_stride, padding, col_dilation, + row_dilation, output_kernel) + .template cast(); + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, const Eigen::PaddingType& padding, + const OutputKernel& output_kernel = OutputKernel()) { + output.device(d) = + Eigen::SpatialConvolution(input.template cast(), + filter.template cast(), col_stride, + row_stride, padding, col_dilation, + row_dilation, output_kernel) + .template cast(); + } + + void operator()(const Device& d, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, int padding_top, int padding_bottom, + int padding_left, int padding_right, + const OutputKernel& output_kernel = OutputKernel()) { + output.device(d) = + Eigen::SpatialConvolution( + input.cast(), filter.cast(), col_stride, row_stride, + Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation, + output_kernel, padding_left, padding_right, padding_top, + padding_bottom) + .template cast(); + } + + template + void operator()(const Device& d, Output output, Input input, Filter filter, + int row_stride, int col_stride, int row_dilation, + int col_dilation, int padding_top, int padding_bottom, + int padding_left, int padding_right, + const OutputKernel& output_kernel = 
OutputKernel()) { + output.device(d) = + Eigen::SpatialConvolution( + input.template cast(), filter.template cast(), + col_stride, row_stride, Eigen::PaddingType::PADDING_VALID, + col_dilation, row_dilation, output_kernel, padding_left, + padding_right, padding_top, padding_bottom) + .template cast(); + } +}; + +template +struct SpatialConvolutionBackwardInputFunc { + void operator()(const Device& d, typename TTypes::Tensor input_backward, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor output_backward, + Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride, + Eigen::DenseIndex col_dilation, + Eigen::DenseIndex row_dilation) { + input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput( + filter, output_backward, input_backward.dimension(2), + input_backward.dimension(1), col_stride, row_stride, col_dilation, + row_dilation); + } +}; + +// GPU version requires all tensors to be indexable by int32. +template +struct SpatialConvolutionBackwardInputFunc { + void operator()(const Eigen::GpuDevice& d, + typename TTypes::Tensor input_backward, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor output_backward, + Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride, + Eigen::DenseIndex col_dilation, + Eigen::DenseIndex row_dilation) { + To32Bit(input_backward).device(d) = Eigen::SpatialConvolutionBackwardInput( + To32Bit(filter), To32Bit(output_backward), input_backward.dimension(2), + input_backward.dimension(1), col_stride, row_stride, col_dilation, + row_dilation); + } +}; + +template +struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc { + void operator()(const Device& d, typename TTypes::Tensor input_backward, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor output_backward, + Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows, + Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride, + Eigen::DenseIndex col_dilation, + Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left, + Eigen::DenseIndex pad_top) { + // We have to slice the result of a spatial convolution backward + // input, before assigning it to the `input_backward` to remove padding. + // + // TODO(ezhulenev): Pass explicit paddings to Eigen and do not materialize + // intermediate result in memory before slicing. + input_backward.device(d) = + Eigen::SpatialConvolutionBackwardInput( + filter, output_backward, padded_cols, padded_rows, col_stride, + row_stride, col_dilation, row_dilation) + .eval() + .slice(Eigen::DSizes{0, pad_left, pad_top, 0}, + input_backward.dimensions()); + } +}; + +// GPU version requires all tensors to be indexable by int32. +template +struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc { + void operator()(const Eigen::GpuDevice& d, + typename TTypes::Tensor input_backward, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor output_backward, + Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows, + Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride, + Eigen::DenseIndex col_dilation, + Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left, + Eigen::DenseIndex pad_top) { + To32Bit(input_backward).device(d) = + Eigen::SpatialConvolutionBackwardInput( + To32Bit(filter), To32Bit(output_backward), padded_cols, padded_rows, + col_stride, row_stride, col_dilation, row_dilation) + .eval() + .slice(Eigen::DSizes{0, pad_left, pad_top, 0}, + input_backward.dimensions()); + } +}; + +// TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h. 
+// My initial attempt to do this compiled but failed in the pytest +// due to a swigdeps error. +template +struct MatMulConvFunctor { + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. + void operator()( + const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename TTypes::ConstTensor in1, + const Eigen::array, 1>& dim_pair, + const OutputKernel& output_kernel = OutputKernel()) { + out.device(d) = in0.contract(in1, dim_pair, output_kernel); + } +}; + +// Use float32 accumulation for float16 by default to deal with precision +// accumulation issues. To enable float16 accumulation, set the environment +// variable TF_CONV2D_USE_FP16_ACCUMULATE. +template +struct MatMulConvFunctor { + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. + void operator()( + const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename TTypes::ConstTensor in1, + const Eigen::array, 1>& dim_pair, + const OutputKernel& output_kernel = OutputKernel()) { + if (Conv2dUseFp16Accumulate()) { + out.device(d) = in0.contract(in1, dim_pair, output_kernel); + } else { + out.device(d) = + in0.cast() + .contract(in1.template cast(), dim_pair, output_kernel) + .template cast(); + } + } +}; + +// Use float32 accumulation for bfloat16 to deal with precision accumulation +// issues. +template +struct MatMulConvFunctor { + void operator()( + const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename TTypes::ConstTensor in1, + const Eigen::array, 1>& dim_pair, + const OutputKernel& output_kernel = OutputKernel()) { + out.device(d) = in0.cast() + .contract(in1.cast(), dim_pair, output_kernel) + .template cast(); + } +}; + +// Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format. +// +// Note: Currently supports OIHW and OHWI destination formats. +template +struct TransformFilter { + void operator()(const Device& d, FilterTensorFormat dst_filter_format, + typename TTypes::ConstTensor in, + typename TTypes::Tensor out) { + // NOTE: Source filter format is always HWIO. + Eigen::DSizes spatial_dims; + for (int i = 0; i < spatial_dims.rank(); ++i) { + spatial_dims[i] = in.dimension(i); + } + + // Merge the spatial dimensions together to speed up the shuffle operation. + Eigen::DSizes merged_dims; + merged_dims[0] = spatial_dims.TotalSize(); // product of spatial dims [H*W] + merged_dims[1] = in.dimension(NDIMS - 2); // input filters [I] + merged_dims[2] = in.dimension(NDIMS - 1); // output filters [O] + + // Shuffle tensor with merged spatial dimensions. + Eigen::DSizes shuffling_perm; + // Expand shuffled tensor into final dimensions. 
+ Eigen::DSizes expanded_dims; + + if (dst_filter_format == FORMAT_OIHW) { + shuffling_perm = Eigen::DSizes(2, 1, 0); + + expanded_dims[0] = merged_dims[2]; // [O] + expanded_dims[1] = merged_dims[1]; // [I] + for (int i = 0; i < spatial_dims.rank(); ++i) { + expanded_dims[2 + i] = spatial_dims[i]; + } + + } else if (dst_filter_format == FORMAT_OHWI) { + shuffling_perm = Eigen::DSizes(2, 0, 1); + + expanded_dims[0] = merged_dims[2]; // [O] + expanded_dims[NDIMS - 1] = merged_dims[1]; // [I] + for (int i = 0; i < spatial_dims.rank(); ++i) { + expanded_dims[1 + i] = spatial_dims[i]; + } + + } else { + DCHECK(false) << "Unsupported destination filter format: " + << ToString(dst_filter_format); + } + + out.device(d) = + in.reshape(merged_dims).shuffle(shuffling_perm).reshape(expanded_dims); + } +}; + +// TODO This functor is not used anywhere and should be removed, +// but it defines some eigen templates that are referenced in other kernels. +template +struct TransformDepth { + void operator()(const Device& d, + typename TTypes::ConstTensor in, + const Eigen::DSizes& shuffle, + typename TTypes::Tensor out) { + Eigen::DSizes merged_dims; + Eigen::DSizes expanded_dims; + Eigen::DSizes new_shuffle; + + // Merge dimensions that won't be shuffled together to speed things up. + if (shuffle[1] == 2 && shuffle[2] == 3) { + merged_dims[0] = in.dimension(0); + merged_dims[1] = in.dimension(1); + merged_dims[2] = in.dimension(2) * in.dimension(3); + new_shuffle[0] = shuffle[0]; + new_shuffle[1] = 2; + new_shuffle[2] = shuffle[3]; + expanded_dims[0] = in.dimension(shuffle[0]); + expanded_dims[1] = in.dimension(2); + expanded_dims[2] = in.dimension(3); + expanded_dims[3] = in.dimension(shuffle[3]); + } else if (shuffle[0] == 2 && shuffle[1] == 3) { + merged_dims[0] = in.dimension(0); + merged_dims[1] = in.dimension(1); + merged_dims[2] = in.dimension(2) * in.dimension(3); + new_shuffle[0] = 2; + new_shuffle[1] = shuffle[2]; + new_shuffle[2] = shuffle[3]; + expanded_dims[0] = in.dimension(2); + expanded_dims[1] = in.dimension(3); + expanded_dims[2] = in.dimension(shuffle[2]); + expanded_dims[3] = in.dimension(shuffle[3]); + } else if (shuffle[0] == 0 && shuffle[1] == 3 && shuffle[2] == 1 && + shuffle[3] == 2) { + merged_dims[0] = in.dimension(0); + merged_dims[1] = in.dimension(1) * in.dimension(2); + merged_dims[2] = in.dimension(3); + new_shuffle[0] = 0; + new_shuffle[1] = 2; + new_shuffle[2] = 1; + expanded_dims[0] = in.dimension(0); + expanded_dims[1] = in.dimension(3); + expanded_dims[2] = in.dimension(1); + expanded_dims[3] = in.dimension(2); + } else { + assert(false && "unexpected shuffle"); + } + + out.device(d) = + in.reshape(merged_dims).shuffle(new_shuffle).reshape(expanded_dims); + } +}; + +template +struct PadInput { + void operator()(const Device& d, + typename TTypes::ConstTensor in, + const std::array& padding_left, + const std::array& padding_right, + typename TTypes::Tensor out, + TensorFormat format, const T& padding_value) { + Eigen::array, NDIMS> padding; + padding[GetTensorDimIndex(format, 'N')] = {0, 0}; + for (int i = 0; i < NDIMS - 2; ++i) { + padding[GetTensorDimIndex(format, '0' + i)] = { + padding_left[i], padding_right[i]}; + } + padding[GetTensorDimIndex(format, 'C')] = {0, 0}; + out.device(d) = in.pad(padding, padding_value); + } +}; + +// Converts a tensor from: +// [batch, , filters] +// to: +// [batch, filters, ] +template +struct NHWCToNCHW { + void operator()(const Device& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out); +}; + +// Converts a tensor 
from: +// [batch, filters, ] +// to: +// [batch, , filters] +template +struct NCHWToNHWC { + void operator()(const Device& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out); +}; + +// Converts a tensor from: +// [dim0, dim1, dim2] +// to: +// [dim0, dim2, dim1] +template +struct SwapDimension1And2InTensor3 { + void operator()(const Device& d, const T* in, + const absl::Span& input_dims, T* out); +}; + +// Converts a tensor from: +// [dim0, dim1, dim2] +// to: +// [dim2, dim1, dim0] +template +struct SwapDimension0And2InTensor3 { + void operator()(const Device& d, const T* in, + const absl::Span& input_dims, T* out); +}; + +// Transforms back filter from OIHW or OHWI to HWOI format to reverse effect of +// TransformFilter above. +template +struct ReverseTransformFilter { + void operator()(const Device& d, FilterTensorFormat src_filter_format, + typename TTypes::ConstTensor in, + typename TTypes::Tensor out); +}; + +} // namespace functor + +template +class ConvAlgorithmMap; + +template <> +class ConvAlgorithmMap {}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_2D_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d_gpu.h new file mode 100644 index 00000000..60d2e831 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_2d_gpu.h @@ -0,0 +1,1147 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include +#include +#include +#include + +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" +#endif +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/lib/math/math_util.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +struct maybe_conj { + __device__ static __inline__ T run(T x) { + if (conjugate) { + return Eigen::numext::conj(x); + } else { + return x; + } + } +}; + +// Partial specializations for Gpu types used to store complex numbers. +template +struct maybe_conj { + __device__ static __inline__ float2 run(float2 c) { + if (conjugate) { + float2 c_conj; + c_conj.x = c.x; + c_conj.y = -c.y; + return c_conj; + } else { + return c; + } + } +}; + +template +struct maybe_conj { + __device__ static __inline__ double2 run(double2 c) { + if (conjugate) { + double2 c_conj; + c_conj.x = c.x; + c_conj.y = -c.y; + return c_conj; + } else { + return c; + } + } +}; + +// TODO(mjanusz): Move this to a shared util file. +// A simple array that contains data that can be passed between CPU and GPU. 
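Closing out conv_2d.h above: the Eigen::half and bfloat16 specializations all share one trick, cast the operands to float, convolve or contract in float, and cast the result back to the narrow type. A minimal Eigen sketch of that cast-contract-cast pattern (illustrative shapes and function name; assumes the bundled Eigen provides Eigen::bfloat16; not code from the patch):

// Sketch only: float32 accumulation for a bfloat16 matrix product, mirroring
// the cast-contract-cast pattern used by the bfloat16 MatMulConvFunctor and
// SpatialConvolution specializations above. Shapes and names are illustrative.
#include <unsupported/Eigen/CXX11/Tensor>

Eigen::Tensor<Eigen::bfloat16, 2> MatMulWithFp32Accumulation(
    const Eigen::Tensor<Eigen::bfloat16, 2>& lhs,
    const Eigen::Tensor<Eigen::bfloat16, 2>& rhs) {
  // Contract lhs's second dimension against rhs's first dimension (lhs * rhs).
  const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims = {
      Eigen::IndexPair<Eigen::DenseIndex>(1, 0)};
  // Accumulate in float, then narrow the result back to bfloat16 for storage.
  const Eigen::Tensor<float, 2> acc =
      lhs.cast<float>().contract(rhs.cast<float>(), dims);
  return acc.cast<Eigen::bfloat16>();
}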
+template +struct Array { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](int index) const { + return data[index]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](int index) { + return data[index]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array() { + for (int i = 0; i < IndexCount; i++) { + data[i] = DefaultValue; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0) { + data[0] = a0; + for (int i = 1; i < IndexCount; i++) { + data[i] = DefaultValue; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1) { + data[0] = a0; + data[1] = a1; + for (int i = 2; i < IndexCount; i++) { + data[i] = DefaultValue; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1, T a2) { + data[0] = a0; + data[1] = a1; + data[2] = a2; + for (int i = 3; i < IndexCount; i++) { + data[i] = DefaultValue; + } + } + EIGEN_STRONG_INLINE Array(const std::array& array) { + for (int i = 0; i < IndexCount; i++) { + data[i] = array[i]; + } + } + T data[IndexCount]; +}; + +// A dimension type with compile-time known size. +template +struct Dimension : Array { + typedef Array Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension() : Base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0) : Base(a0) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1) + : Base(a0, a1) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1, int a2) + : Base(a0, a1, a2) {} + EIGEN_STRONG_INLINE Dimension(const std::array& array) + : Base(array) {} +}; + +// An index type with compile-time known size. +template +struct Index : Array { + typedef Array Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index() : Base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0) : Base(a0) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1) : Base(a0, a1) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1, int a2) + : Base(a0, a1, a2) {} +}; + +// A helper function that converts a tensor index into a flat array index. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int TensorIndexToFlat( + const Index& index, const Dimension& dims) { + int flat_index = index[0]; + for (int i = 1; i < IndexCount; i++) { + flat_index = flat_index * dims[i] + index[i]; + } + return flat_index; +} + +// A helper function that converts a flat array index into a tensor index. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index FlatToTensorIndex( + int index, const Dimension& dims) { + Index tensor_index; + for (int i = IndexCount - 1; i >= 0; i--) { + int new_index = index / dims[i]; + tensor_index[i] = index - dims[i] * new_index; + index = new_index; + } + return tensor_index; +} + +// A simple CUDA custom kernel to shuffle dimensions of a 3D tensor according to +// the given shuffle permutation in template parameters. Shuffle permutation +// shuffles dimensions such that input dimension 0 goes to sp0, +// 1 goes to sp1 and 2 goes to sp2. For example, shuffle permutation <2, 0, 1> +// will populate output so that input[x][y][z] is equal to (*output)[y][z][x]. +// +// Requires that nthreads is equal to the total number of elements in the input +// tensor. +template +__global__ void ShuffleInTensor3Simple(int nthreads, + const T* __restrict__ input, + Dimension<3> input_dims, + T* __restrict__ output) { + Dimension<3> output_dims; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + // Iterate over output as opposed to iterating over input for better + // performance. 
Iterating over output will generate sequential writes and + // random reads that performs better compared to sequential reads and random + // writes. + GPU_1D_KERNEL_LOOP(output_index, nthreads) { + Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); + + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + + output[output_index] = + maybe_conj::run(ldg(input + input_index)); + } +} + +static constexpr int kUnroll = 4; + +template +__global__ void ShuffleInTensor3SimpleVector(int nthreads, + const T* __restrict__ input, + Dimension<3> input_dims, + T* __restrict__ output) { + Dimension<3> output_dims; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + const int stride = blockDim.x * gridDim.x * kUnroll; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + T buf[kUnroll]; + + int output_index; + for (output_index = tid * kUnroll; output_index + kUnroll - 1 < nthreads; + output_index += stride) { +#pragma unroll + for (int i = 0; i < kUnroll; i++) { + int output_index_i = output_index + i; + Index<3> output_tensor_index = + FlatToTensorIndex(output_index_i, output_dims); + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index_i = TensorIndexToFlat(input_tensor_index, input_dims); + buf[i] = maybe_conj::run(ldg(input + input_index_i)); + } + float2* out = reinterpret_cast(output + output_index); + *out = *reinterpret_cast(buf); + } + + for (; output_index < nthreads; ++output_index) { + Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); + + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + + output[output_index] = + maybe_conj::run(ldg(input + input_index)); + } +} + +// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor, +// where dimensions are zero-based: output[i][j][k] = input[i][k][j]. +// +// Each thread block operates on a single tile, a rectangle of dimensions +// TileSizeI x TileSizeJ. +// +// In general, for best performance, you should probably set TileSizeI, +// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs). +// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get +// the best performance on K40 GPUs. +template +__global__ void SwapDimension1And2InTensor3UsingTiles( + const T* __restrict__ input, Dimension<3> input_dims, + T* __restrict__ output) { + eigen_assert(blockDim.x == NumThreads); + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + constexpr int ReadRowPerPass = NumThreads / TileSizeJ; + constexpr int WriteRowPerPass = NumThreads / TileSizeI; + // One extra line in the inner dimension to avoid share memory bank conflict. + // This is to mimic the following, but no constructor of T can be invoked. 
+ // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1]; +#if GOOGLE_CUDA + __shared__ __align__( + alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)]; + typedef T(*SharedMemoryTile)[TileSizeJ + 1]; + SharedMemoryTile shared_memory_tile = + reinterpret_cast(shared_mem_raw); +#elif TENSORFLOW_USE_ROCM + __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1]; +#endif + + int x = threadIdx.x; + + Dimension<3> output_dims = { + input_dims[0], + input_dims[2], + input_dims[1], + }; + + Dimension<3> input_dims_in_tiles = { + input_dims[0], + (input_dims[1] + TileSizeI - 1) / TileSizeI, + (input_dims[2] + TileSizeJ - 1) / TileSizeJ, + }; + + Index<3> input_tile_index = + FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); + + Index<3> input_tile_origin = { + input_tile_index[0], + input_tile_index[1] * TileSizeI, + input_tile_index[2] * TileSizeJ, + }; + + int input_origin_flat_index = + TensorIndexToFlat(input_tile_origin, input_dims); + + bool full_tile = true; + int tile_width = TileSizeJ; + + // Only the last row or column may not have the full size. + if (input_tile_index[2] == input_dims_in_tiles[2] - 1) { + tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ; + full_tile &= false; + } + + int tile_height = TileSizeI; + + if (input_tile_index[1] == input_dims_in_tiles[1] - 1) { + tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI; + full_tile &= false; + } + + // Calculate effective thread number. This ensures that we use the largest + // number of threads available to form a regular thread block with no + // trailing incomplete lines. + constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ; + + if (x < in_effective_thread_num) { + // Orient the logical thread block with respect to the input array. + // ie. align the contiguous dimension of thread blocks with the contiguous + // dimension of the input array. + int ti = x / TileSizeJ; + int tj = x % TileSizeJ; + int input_index = input_origin_flat_index + ti * input_dims[2] + tj; + int input_increment = ReadRowPerPass * input_dims[2]; + + if (full_tile) { +#pragma unroll + for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) { + shared_memory_tile[i_loc][tj] = + maybe_conj::run(input[input_index]); + input_index += input_increment; + } + } else { + if (tj < tile_width) { + for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) { + shared_memory_tile[i_loc][tj] = + maybe_conj::run(input[input_index]); + input_index += input_increment; + } + } + } + } + + __syncthreads(); + + Index<3> output_tile_index = { + input_tile_index[0], + input_tile_index[2], + input_tile_index[1], + }; + + Index<3> output_tile_origin = { + output_tile_index[0], + output_tile_index[1] * TileSizeJ, + output_tile_index[2] * TileSizeI, + }; + + int output_origin_flat_index = + TensorIndexToFlat(output_tile_origin, output_dims); + + constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI; + + if (x < out_effective_thread_num) { + // Re-orient the logical thread block with respect to the output array. + // ie. align the contiguous dimension of thread blocks with contiguous + // dimension of the output array. 
+ int ti = x / TileSizeI; + int tj = x % TileSizeI; + int output_index = output_origin_flat_index + ti * output_dims[2] + tj; + int output_increment = WriteRowPerPass * output_dims[2]; + + if (full_tile) { +#pragma unroll + for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) { + output[output_index] = shared_memory_tile[tj][i_loc]; + output_index += output_increment; + } + } else { + if (tj < tile_height) { + for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) { + output[output_index] = shared_memory_tile[tj][i_loc]; + output_index += output_increment; + } + } + } + } +} + +// A Gpu custom kernel that convert input to output, given proper padding on +// the left and the top. +template +__global__ void PadInputCustomKernelNHWC( + int nthreads, const T* __restrict__ input, Dimension input_dims, + T* __restrict__ output, Dimension output_dims, + Dimension padding_left, T padding_value) { + GPU_1D_KERNEL_LOOP(index, nthreads) { + int output_index = index; + Index output_tensor_index = + FlatToTensorIndex(output_index, output_dims); + + Index input_tensor_index; + input_tensor_index[0] = output_tensor_index[0]; // batch + bool ok = true; + for (int i = 1; i < NDIMS - 1; i++) { + input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 1]; + ok &= + (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]); + } + input_tensor_index[NDIMS - 1] = output_tensor_index[NDIMS - 1]; // channels + + if (ok) { + const int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + output[output_index] = input[input_index]; + } else { + output[output_index] = padding_value; + } + } +} + +template +__global__ void PadInputCustomKernelNCHW( + int nthreads, const T* __restrict__ input, Dimension input_dims, + T* __restrict__ output, Dimension output_dims, + Dimension padding_left, T padding_value) { + GPU_1D_KERNEL_LOOP(index, nthreads) { + int output_index = index; + Index output_tensor_index = + FlatToTensorIndex(output_index, output_dims); + + Index input_tensor_index; + input_tensor_index[0] = output_tensor_index[0]; // batch + input_tensor_index[1] = output_tensor_index[1]; // channels + bool ok = true; + for (int i = 2; i < NDIMS; i++) { + input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 2]; + ok &= + (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]); + } + + if (ok) { + const int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + output[output_index] = input[input_index]; + } else { + output[output_index] = padding_value; + } + } +} + +// A GPU helper function that converts TensorFlow filter format to Cudnn filter +// format. 
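In scalar terms, the GPU TransformFilter that follows (like its CPU counterpart earlier in conv_2d.h) collapses the spatial dimensions and permutes the resulting 3-D tensor: HWIO stored as [H*W, I, O] becomes OIHW stored as [O, I, H*W]. A minimal reference loop for that permutation (illustrative helper, not part of the patch):

// Sketch only: the HWIO -> OIHW transform expressed as a plain 3-D permutation
// after collapsing the spatial dimensions, matching the <2, 1, 0> shuffle used
// by ShuffleInTensor3Simple below. Not code from the patch.
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> HwioToOihw(const std::vector<T>& hwio, int64_t spatial /* H*W */,
                          int64_t in_depth /* I */, int64_t out_depth /* O */) {
  // Input is laid out as [spatial, I, O]; output as [O, I, spatial].
  std::vector<T> oihw(hwio.size());
  for (int64_t s = 0; s < spatial; ++s) {
    for (int64_t i = 0; i < in_depth; ++i) {
      for (int64_t o = 0; o < out_depth; ++o) {
        const int64_t src = (s * in_depth + i) * out_depth + o;
        const int64_t dst = (o * in_depth + i) * spatial + s;
        oihw[dst] = hwio[src];
      }
    }
  }
  return oihw;
}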
+template +struct TransformFilter { + typedef GPUDevice Device; + void operator()(const Device& d, FilterTensorFormat dst_filter_format, + typename TTypes::ConstTensor in, + typename TTypes::Tensor out) { + Dimension<3> combined_dims; + combined_dims[0] = in.dimension(0); // spatial dimensions + for (int i = 1; i < NDIMS - 2; i++) { + combined_dims[0] *= in.dimension(i); + } + combined_dims[1] = in.dimension(NDIMS - 2); // input filters + combined_dims[2] = in.dimension(NDIMS - 1); // output filters + GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d); + + if (dst_filter_format == FORMAT_OIHW) { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); + + } else if (dst_filter_format == FORMAT_OHWI) { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); + + } else { + LOG(ERROR) << "Unsupported filter format: " + << ToString(dst_filter_format); + } + } +}; + +// Converts Cudnn filter format OIHW or OHWI back to TensorFlow filter format +// HWIO. +template +struct ReverseTransformFilter { + typedef GPUDevice Device; + void operator()(const Device& d, FilterTensorFormat src_filter_format, + typename TTypes::ConstTensor in, + typename TTypes::Tensor out) { + Dimension<3> combined_dims; + + if (src_filter_format == FORMAT_OIHW) { + combined_dims[0] = in.dimension(0); // output filters + combined_dims[1] = in.dimension(1); // input filters + combined_dims[2] = in.dimension(2); // spatial dimensions + for (int i = 3; i < NDIMS; ++i) { + combined_dims[2] *= in.dimension(i); + } + + GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d); + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); + + } else if (src_filter_format == FORMAT_OHWI) { + combined_dims[0] = in.dimension(0); // output filters + combined_dims[1] = in.dimension(1); // spatial dimensions + for (int i = 2; i < NDIMS - 1; i++) { + combined_dims[1] *= in.dimension(i); + } + combined_dims[2] = in.dimension(NDIMS - 1); // input filters + + GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d); + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); + + } else { + // TODO(ezhulenev): Set error status in OpKernelContext instead. + LOG(FATAL) << "Unsupported filter format: " + << ToString(src_filter_format); + } + } +}; + +// A GPU helper function that converts input tensor to a larger output tensor, +// given proper padding values. The padded value is zero. 
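The PadInput functor below only dispatches to PadInputCustomKernelNHWC/NCHW above; the heart of those kernels is the per-element index mapping, sketched here as a host-side loop for the NHWC case (illustrative helper and names, not part of the patch):

// CPU reference sketch of the index mapping used by PadInputCustomKernelNHWC:
// every flat output index is decomposed into an NHWC coordinate, shifted back
// by the left/top padding, and either copied from the input or filled with
// padding_value when it falls outside. Not code from the patch.
#include <array>
#include <cstdint>
#include <vector>

template <typename T>
void PadNHWCReference(const std::vector<T>& in,
                      const std::array<int64_t, 4>& in_dims,
                      std::vector<T>& out,
                      const std::array<int64_t, 4>& out_dims,
                      const std::array<int64_t, 2>& padding_left,
                      T padding_value) {
  const int64_t total = out_dims[0] * out_dims[1] * out_dims[2] * out_dims[3];
  for (int64_t out_index = 0; out_index < total; ++out_index) {
    // Flat index -> NHWC coordinate (row-major, innermost dimension last).
    int64_t rem = out_index;
    std::array<int64_t, 4> coord;
    for (int d = 3; d >= 0; --d) {
      coord[d] = rem % out_dims[d];
      rem /= out_dims[d];
    }
    // Shift the spatial coordinates (H, W) back by the top/left padding.
    std::array<int64_t, 4> in_coord = coord;
    bool inside = true;
    for (int d = 1; d <= 2; ++d) {
      in_coord[d] = coord[d] - padding_left[d - 1];
      inside &= in_coord[d] >= 0 && in_coord[d] < in_dims[d];
    }
    if (inside) {
      const int64_t in_index =
          ((in_coord[0] * in_dims[1] + in_coord[1]) * in_dims[2] +
           in_coord[2]) * in_dims[3] + in_coord[3];
      out[out_index] = in[in_index];
    } else {
      out[out_index] = padding_value;
    }
  }
}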
+template +struct PadInput { + typedef GPUDevice Device; + void operator()(const Device& d, + typename TTypes::ConstTensor in, + const std::array& padding_left, + const std::array& padding_right, + typename TTypes::Tensor out, + TensorFormat format, const T& padding_value) { + GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d); + Dimension input_dims; + for (int i = 0; i < NDIMS; ++i) { + input_dims[i] = in.dimension(i); + } + Dimension output_dims; + for (int i = 0; i < NDIMS; ++i) { + output_dims[i] = out.dimension(i); + } + + const Dimension padding_left_dim(padding_left); + + if (format == FORMAT_NHWC) { + TF_CHECK_OK(GpuLaunchKernel( + PadInputCustomKernelNHWC, config.block_count, + config.thread_per_block, 0, d.stream(), config.virtual_thread_count, + in.data(), input_dims, out.data(), output_dims, padding_left_dim, + padding_value)); + } else if (format == FORMAT_NCHW) { + TF_CHECK_OK(GpuLaunchKernel( + PadInputCustomKernelNCHW, config.block_count, + config.thread_per_block, 0, d.stream(), config.virtual_thread_count, + in.data(), input_dims, out.data(), output_dims, padding_left_dim, + padding_value)); + } else { + LOG(FATAL) << "Invalid data format: " << format; + } + } +}; + +// We want std::equal_to and std::greater, but they're not constexpr until +// C++14. +struct EqualTo { + constexpr bool operator()(int a, int b) const { return a == b; } +}; + +struct GreaterThan { + constexpr bool operator()(int a, int b) const { return a > b; } +}; + +// For each data type, the tile size possibility frontier denotes the tile size +// combinations that consume the most computational resources constrained by +// - number of threads per SM limit, +// - limit on size of the short dimension (<=15) due to the definition of +// narrow matrix, +// - shared memory limit and +// - some experimentally determined, type-specific constraint on the product of +// two side lengths to increase grid-level parallelism. +// +// A tile size combination lies on the frontier if and only if one or more +// constraint mentioned above is hit. Tile size combinations lying outside this +// frontier are either not possible, or are slower than the alternatives. +// +// It is instrumental to consider, for each data type, two subsets of the +// corresponding frontier: +// - long side frontier: the union of the biggest tile size combination for +// each legal long side len. +// - non long side frontier: the frontier set minus the long side frontier. +// +// TileSizePossibilityFrontierCheck defines the frontier using only the long +// side frontier tile size combinations (since one can easily extrapolate +// the entire frontier from this subset). It serves as a utility function +// to help us determine where a tile size combination of interest lies with +// resepect to the frontier. 
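EqualTo and GreaterThan exist because std::equal_to and std::greater are not constexpr until C++14; feeding one or the other into the same constexpr table lets TileSizePossibilityFrontierCheck below answer both "on the frontier" and "beyond the frontier". A reduced, self-contained illustration of the pattern (the two-entry table is made up, not the real frontier):

// Illustrative only: a tiny "frontier" evaluated at compile time with the same
// predicate-functor trick used by TileSizePossibilityFrontierCheck below.
struct EqualToOp {
  constexpr bool operator()(int a, int b) const { return a == b; }
};
struct GreaterThanOp {
  constexpr bool operator()(int a, int b) const { return a > b; }
};

template <typename Op>
constexpr bool FrontierCheck(int long_side, int short_side, Op op) {
  return (long_side == 32 && op(short_side, 4)) ||
         (long_side == 64 && op(short_side, 2));
}

static_assert(FrontierCheck(32, 4, EqualToOp()), "(32, 4) lies on the frontier");
static_assert(FrontierCheck(64, 3, GreaterThanOp()), "(64, 3) lies beyond it");
static_assert(!FrontierCheck(32, 3, GreaterThanOp()), "(32, 3) is strictly inside");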
+template +constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide, + int TileShortSide, + int size_of_t, Op op) { + // clang-format off + + return (size_of_t == 16 && ((TileLongSide == 32 && op(TileShortSide, 4)) || + (TileLongSide == 64 && op(TileShortSide, 4)) || + (TileLongSide == 128 && op(TileShortSide, 4)) || + (TileLongSide == 256 && op(TileShortSide, 2)))) || + (size_of_t == 8 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 8)) || + (TileLongSide == 256 && op(TileShortSide, 4)) || + (TileLongSide == 512 && op(TileShortSide, 2)))) || + (size_of_t == 4 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))) || + (size_of_t == 2 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))) || + (size_of_t == 1 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))); + + // clang-format on +} + +constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide, + int size_of_t) { + return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide, + size_of_t, EqualTo()); +} +constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide, + int size_of_t) { + return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide, + size_of_t, GreaterThan()); +} +constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide, + int TileShortSide, int size_of_t) { + // For a tile size combination (longside, shortside), lying on the frontier + // implies that (longside, shortside) is on or within the frontier but + // (longside*2, shortside) or (longside, shortside+1) is not. With the above + // criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that + // it is not on the long side frontier. + return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) && + (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) || + TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1, + size_of_t)) && + !TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t); +} + +// Helper function to launch a batch narrow matirx transpose kernel. 
+template +void LaunchBatchNarrowMatrixTransposeKernel( + const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count, + const T* input, const Dimension<3>& input_dims, T* output) { + constexpr int NumThreads = TileLongSide; + if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) { + TF_CHECK_OK(GpuLaunchKernel( + SwapDimension1And2InTensor3UsingTiles, + total_tiles_count, NumThreads, 0, d.stream(), input, input_dims, + output)); + } else { + TF_CHECK_OK(GpuLaunchKernel( + SwapDimension1And2InTensor3UsingTiles, + total_tiles_count, NumThreads, 0, d.stream(), input, input_dims, + output)); + } +} + +// Recursive template function to search, in a trial-and-error manner, for the +// minimum tile size configuration satisfying the requested tile side lengths. +// An important invariant of this search procedure is that for an unsatisfied +// request, we always try doubling the long side len first, and only after +// the request is satisfied for the long side len do we begin incrementing +// the short side len. +// +// We have three specializations of this search function depending on where the +// current tile size combination lies with respect to the frontier. +// - It lies within the frontier. If request is not satisfied, for the next tile +// size combination, we first try doubling the long side len and if that does +// not work, we then increment the short side len. +// - It lies on the non long side frontier. If the request is not satisfied, we +// can only increment the short side len. +// - It lies on the long side frontier. We launch the kernel without checking if +// the request is satisfied or not. +template +struct BatchNarrowMatrixTransposeDispatcher { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; + + if (request_satisfied) { + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + return; + } + + // If the execution reaches here, then the kernel was not launched; we then + // determine whether it is the long side or the short side that falls short + // of the request and increase that parameter accordingly. 
+ const bool long_side_request_not_satisfied = + std::max(tile_size_i, tile_size_j) > TileLongSide; + + if (long_side_request_not_satisfied) { + BatchNarrowMatrixTransposeDispatcher::DoIt(d, tile_size_i, + tile_size_j, + total_tiles_count, + input, input_dims, + output); + } else { + BatchNarrowMatrixTransposeDispatcher::DoIt(d, tile_size_i, + tile_size_j, + total_tiles_count, + input, input_dims, + output); + } + } +}; + +template +struct BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide, conjugate, + typename std::enable_if::type> { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; + + if (request_satisfied) { + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + return; + } + + // If the execution reaches here, then the kernel was not launched; since + // we are on the non long side frontier, we increment the short dimension + // and try again. + BatchNarrowMatrixTransposeDispatcher::DoIt(d, tile_size_i, + tile_size_j, + total_tiles_count, + input, input_dims, + output); + } +}; + +template +struct BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide, conjugate, + typename std::enable_if::type> { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + } +}; + +// This function tries to recover, in a brute force way, the frontier defined in +// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying +// on the long side frontier. This vector is sufficient to determine the entire +// frontier. +// +// Note that if one changes the frontier definition in +// TileSizePossibilityFrontierCheck and forgets to set the largest short +// side len of the largest legal long side len to 2, this function will fail +// and crash the program. +template +const std::vector>& GetTileSizesFrontier() { + static_assert( + SizeOfT <= 16, + "Currently, only data types of sizes 16 bytes or less are supported."); + static_assert((SizeOfT & (SizeOfT - 1)) == 0, + "Data types must have sizes that are powers of 2."); + + // Expensive work to populate sizes, lazily run in a thread-safe + // manner the first time GetTileSizesFrontier is called. + static auto* frontier = [] { + auto* frontier = new std::vector>(); + const int kMaxLongSideLen = 1024; + const int kMaxShortSideLen = 15; + for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) { + for (int short_side = 2; short_side <= kMaxShortSideLen; + short_side += 1) { + if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) { + // The current combination lies on the frontier, thus we + // add it to the frontier definition. 
+ frontier->push_back(std::make_pair(long_side, short_side)); + + // The long side length is the largest one allowed iff its + // corresponding short side length is 2. + if (short_side == 2) return frontier; + + // We have exhausted all the possibilities in the frontier + // with the given long side length. + break; + } + } + } + LOG(FATAL) + << "The corresponding short side length of the largest long side " + "length has to be 2."; + }(); + return *frontier; +} + +// Helper structs to help determine which data type to use given the size of +// the matrix data type. A transpose of elements of size N will use a kernel +// which operates on an array of TransposeElemType::type. +template +struct TransposeElemType; +template <> +struct TransposeElemType<1> { + using type = uint8; +}; +template <> +struct TransposeElemType<2> { + using type = uint16; +}; +template <> +struct TransposeElemType<4> { + using type = uint32; +}; +template <> +struct TransposeElemType<8> { + using type = float2; +}; +template <> +struct TransposeElemType<16> { + using type = double2; +}; + +// A helper function to make RunSwapDimension1And2InTensor3 concise. This +// helper function looks at the data type and input matrix sizes and decides +// the thread numbers and tile sizes to use. +template +void SwapDimension1And2InTensor3WithNarrowMatrices( + const GPUDevice& d, const T* input, const Dimension<3>& input_dims, + T* output, const int kMinDimensionToUseTiles) { + // Get available tile sizes here for the data type requested: + const auto& tile_spec = GetTileSizesFrontier(); + + int tile_long_side_len = 0; + int tile_short_side_len = 0; + float lowest_cost = std::numeric_limits::max(); + int data_long_side = std::max(input_dims[1], input_dims[2]); + + for (auto tile_size_pair : tile_spec) { + int proposed_tile_long_side_len = tile_size_pair.first; + + // Number of threads that will not be doing anything useful when reading + // the matrix because the thread block size is bigger than the data block + // size. + int num_wasted_threads = + data_long_side - MathUtil::FloorOfRatio( + data_long_side, proposed_tile_long_side_len) * + proposed_tile_long_side_len; + + int num_full_tiles = MathUtil::FloorOfRatio( + data_long_side, proposed_tile_long_side_len); + + float cost = 0; + + // However, if we can execute two or more full tiles, then we gladly + // accept any number of wasted threads and ignore its cost. + if (num_full_tiles <= 1) cost = num_wasted_threads; + + // Using less than or equal to here because given the same cost, we + // would like to launch as many threads as possible. + if (cost <= lowest_cost) { + tile_long_side_len = proposed_tile_long_side_len; + tile_short_side_len = tile_size_pair.second; + lowest_cost = cost; + } + } + + // Request tile sizes such that the longer side of threadblock aligns with + // the longer side of input data block to maximize read throughput. + // The ideal tile shape is one where the length of the shorter side of the + // tile is equal to the length of the shorter side of the input matrix. + int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles + ? tile_long_side_len + : input_dims[1]; + int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles + ? input_dims[2] + : tile_long_side_len; + + // Truncate the shorter size requested according to the manual limit set in + // tile_spec to make sure that we do not launch configurations violating + // hardware limits. + requested_tile_size_i = + requested_tile_size_i == tile_long_side_len + ? 
tile_long_side_len + : std::min(requested_tile_size_i, tile_short_side_len); + requested_tile_size_j = + requested_tile_size_j == tile_long_side_len + ? tile_long_side_len + : std::min(requested_tile_size_j, tile_short_side_len); + + Dimension<3> input_dims_in_tiles = { + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], requested_tile_size_i), + MathUtil::CeilOfRatio(input_dims[2], requested_tile_size_j), + }; + + int total_tiles_count = + input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2]; + + using ElemType = typename TransposeElemType::type; + static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment."); + BatchNarrowMatrixTransposeDispatcher::DoIt( + d, requested_tile_size_i, requested_tile_size_j, total_tiles_count, + reinterpret_cast(input), input_dims, + reinterpret_cast(output)); +} + +// Launch the GPU kernel that would swap dimension-1 and dimension-2 in a +// 3D tensor. It looks at the shape of the incoming data, and decides the best +// strategy to launch. +template +void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, + const Dimension<3>& input_dims, T* output) { + // If both dimensions are not trivial, use tiles for the actual swapping. + // If one dimension is trivial, use SmallDim kernel for swapping. + // Otherwise, the trivial swapping relying on the ldg cache is more efficient. + static const int kMinDimensionToUseTiles = 16; + static const int kMinDimensionToUseRectTiles = 96; + + bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles && + input_dims[2] >= kMinDimensionToUseTiles; + bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles || + input_dims[2] >= kMinDimensionToUseRectTiles; + if (large_matrix) { + // We get best performance when kTileSize is the number of threads in a warp + // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256 + // threads. + constexpr int kTileSize = 32; + constexpr int kNumThreads = 256; + + Dimension<3> input_dims_in_tiles = { + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], kTileSize), + MathUtil::CeilOfRatio(input_dims[2], kTileSize), + }; + + int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * + input_dims_in_tiles[2]; + TF_CHECK_OK(GpuLaunchKernel( + SwapDimension1And2InTensor3UsingTiles, + total_tiles_count, kNumThreads, 0, d.stream(), input, input_dims, + output)); + + } else if (narrow_matrix) { + SwapDimension1And2InTensor3WithNarrowMatrices( + d, input, input_dims, output, kMinDimensionToUseTiles); + } else { + int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; + GpuLaunchConfig config = GetGpuLaunchConfig(total_element_count, d); + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, 0, + d.stream(), config.virtual_thread_count, input, + input_dims, output)); + } +} + +// A GPU helper functor that does general dimension 1 and 2 switch for 3D +// tensor. +template +struct SwapDimension1And2InTensor3 { + typedef GPUDevice Device; + void operator()(const Device& d, const T* in, + const gtl::ArraySlice& combined_dims, T* out) { + Dimension<3> input_dims = {static_cast(combined_dims[0]), + static_cast(combined_dims[1]), + static_cast(combined_dims[2])}; + RunSwapDimension1And2InTensor3(d, in, input_dims, out); + } +}; + +// A GPU helper functor that does general dimension 0 and 2 switch for 3D +// tensor. 
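Scalar reference semantics for the two swap functors, the dimension 1-and-2 version above and the dimension 0-and-2 variant that follows (sketch only, not part of the patch):

// Reference semantics for the dimension-swap functors over row-major 3-D data.
// The GPU kernels above are optimized equivalents of these loops.
#include <cstdint>

// SwapDimension1And2InTensor3: out[i][k][j] = in[i][j][k].
// NHWC -> NCHW uses this with d1 = H*W, d2 = C; NCHW -> NHWC with d1 = C, d2 = H*W.
template <typename T>
void SwapDim1And2Reference(const T* in, int64_t d0, int64_t d1, int64_t d2, T* out) {
  for (int64_t i = 0; i < d0; ++i)
    for (int64_t j = 0; j < d1; ++j)
      for (int64_t k = 0; k < d2; ++k)
        out[(i * d2 + k) * d1 + j] = in[(i * d1 + j) * d2 + k];
}

// SwapDimension0And2InTensor3: out[k][j][i] = in[i][j][k].
template <typename T>
void SwapDim0And2Reference(const T* in, int64_t d0, int64_t d1, int64_t d2, T* out) {
  for (int64_t i = 0; i < d0; ++i)
    for (int64_t j = 0; j < d1; ++j)
      for (int64_t k = 0; k < d2; ++k)
        out[(k * d1 + j) * d0 + i] = in[(i * d1 + j) * d2 + k];
}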
+template +struct SwapDimension0And2InTensor3 { + typedef GPUDevice Device; + void operator()(const Device& d, const T* in, + const gtl::ArraySlice& combined_dims, T* out) { + Dimension<3> input_dims = {static_cast(combined_dims[0]), + static_cast(combined_dims[1]), + static_cast(combined_dims[2])}; + size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; + GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d); + + auto out_ptr = reinterpret_cast(out); + bool aligned = out_ptr % 16 == 0; + + bool use_vector = false; + bool use_custom_config = false; + if ((input_dims[0] <= 128 && input_dims[2] <= 128) || + input_dims[0] * input_dims[1] <= 128 || + input_dims[1] * input_dims[2] <= 8) { + use_vector = true; + use_custom_config = true; + } else if (input_dims[1] * input_dims[2] <= 16384) { + use_vector = true; + } + + if (sizeof(T) == 2 && aligned && use_vector) { + int block_count; + if (use_custom_config) { + block_count = (total_size + config.thread_per_block - 1) / + config.thread_per_block; + } else { + block_count = config.block_count; + } + + TF_CHECK_OK( + GpuLaunchKernel(ShuffleInTensor3SimpleVector, + block_count, config.thread_per_block / kUnroll, 0, + d.stream(), total_size, in, input_dims, out)); + } else { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in, input_dims, out)); + } + } +}; + +// A GPU helper functor that converts NHWC TensorFlow data format to +// NCHW format that is accepted by Cudnn. +template +struct NHWCToNCHW { + typedef GPUDevice Device; + void operator()(const Device& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out) { + Dimension<3> combined_dims; + combined_dims[0] = in.dimension(0); // N (batch) + combined_dims[1] = in.dimension(1); // spatial dimensions (HW) + for (int i = 2; i < NDIMS - 1; ++i) { + combined_dims[1] *= in.dimension(i); + } + combined_dims[2] = in.dimension(NDIMS - 1); // C (channels) + RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data()); + } +}; + +// A GPU helper functor that converts NCHW Cudnn data format to NHWC TensorFlow +// Format. +template +struct NCHWToNHWC { + typedef GPUDevice Device; + void operator()(const Device& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out) { + Dimension<3> combined_dims; + combined_dims[0] = in.dimension(0); // N (batch) + combined_dims[1] = in.dimension(1); // C (channel) + combined_dims[2] = in.dimension(2); // spatial dimensions (HW) + for (int i = 3; i < NDIMS; ++i) { + combined_dims[2] *= in.dimension(i); + } + RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data()); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_3d.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_3d.h new file mode 100644 index 00000000..b4cdbd5b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_3d.h @@ -0,0 +1,128 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Functors for 3d convolution. + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_3D_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_3D_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/ops_util.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h" +#include "tensorflow/core/kernels/eigen_cuboid_convolution.h" + +namespace tensorflow { +namespace functor { + +// Applies a 3D convolution to a batch of multi-channel volumes. +template +struct CuboidConvolution; + +// Backward input pass for the cuboid convolution. +template +struct CuboidConvolutionBackwardInput; + +// Backward filter pass for the cuboid convolution. +template +struct CuboidConvolutionBackwardFilter; + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +struct CuboidConvolution { + void operator()(const CPUDevice& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, int stride_planes, + int stride_rows, int stride_cols, + const Eigen::PaddingType& padding) { + output.device(d) = Eigen::CuboidConvolution( + input, filter, stride_planes, stride_rows, stride_cols, padding); + } +}; + +template +struct CuboidConvolutionBackwardInput { + void operator()(const CPUDevice& d, + typename TTypes::Tensor input_backward, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor output_backward, + int stride_planes, int stride_rows, int stride_cols) { + // Need to swap the order of plane/row/col strides when calling Eigen. + input_backward.device(d) = Eigen::CuboidConvolutionBackwardInput( + filter, output_backward, + input_backward.dimension(3), // input_planes + input_backward.dimension(2), // input_rows + input_backward.dimension(1), // input_cols + stride_cols, stride_rows, stride_planes); + } +}; + +template +struct CuboidConvolutionBackwardFilter { + void operator()(const CPUDevice& d, + typename TTypes::Tensor filter_backward, + typename TTypes::ConstTensor input, + typename TTypes::ConstTensor output_backward, + int stride_planes, int stride_rows, int stride_cols) { + // Need to swap the order of plane/row/col strides when calling Eigen. 
+ filter_backward.device(d) = Eigen::CuboidConvolutionBackwardKernel( + input, output_backward, + filter_backward.dimension(2), // kernel_planes + filter_backward.dimension(1), // kernel_rows + filter_backward.dimension(0), // kernel_cols + stride_cols, stride_rows, stride_planes); + } +}; + +} // namespace functor + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +struct LaunchConv3DOp; + +template +struct LaunchConv3DOp { + static void launch(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, + const std::array& dilations, + const std::array& strides, const Padding padding, + TensorFormat data_format, Tensor* output) { + OP_REQUIRES(context, data_format == FORMAT_NHWC, + absl::InvalidArgumentError("CPU implementation of Conv3D " + "currently only supports the NHWC " + "tensor format.")); + OP_REQUIRES( + context, dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1, + absl::InvalidArgumentError("CPU implementation of Conv3D " + "currently only supports dilated rates " + "of 1.")); + OP_REQUIRES(context, filter.dim_size(3) == input.dim_size(input.dims() - 1), + absl::InvalidArgumentError(absl::StrCat( + "Number of channels in filter (", filter.dim_size(3), + ") must match last dimension of input (", + input.dim_size(input.dims() - 1), ")"))); + functor::CuboidConvolution()( + context->eigen_device(), output->tensor(), + input.tensor(), filter.tensor(), strides[2], strides[1], + strides[0], BrainPadding2EigenPadding(padding)); + } +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_3D_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_input_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_input_ops.h new file mode 100644 index 00000000..3dbecd51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_input_ops.h @@ -0,0 +1,718 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/nn_ops.cc. 
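The backprop-input file below reconstructs the forward padding with GetWindowedOutputSizeVerbose; the arithmetic follows the usual TensorFlow conventions, sketched here for reference (standard formulas, not code from the patch): with k_eff = (filter - 1) * dilation + 1, VALID gives out = ceil((in - k_eff + 1) / stride) and SAME gives out = ceil(in / stride) plus whatever total padding makes that window placement possible.

// Standard TF-style output-size arithmetic (sketch, not from the patch).
#include <algorithm>
#include <cstdint>

struct WindowedOutputSize {
  int64_t output;     // number of output positions along this dimension
  int64_t pad_total;  // total padding along this dimension (SAME only)
};

inline WindowedOutputSize ComputeWindowedOutputSize(int64_t in, int64_t filter,
                                                    int64_t dilation,
                                                    int64_t stride, bool same) {
  const int64_t k_eff = (filter - 1) * dilation + 1;
  if (!same) {
    // VALID: the effective window must fit entirely inside the input.
    return {(in - k_eff + stride) / stride, 0};
  }
  // SAME: one output per stride-th input position; pad the shortfall.
  const int64_t out = (in + stride - 1) / stride;
  const int64_t pad_total =
      std::max<int64_t>(0, (out - 1) * stride + k_eff - in);
  return {out, pad_total};
}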
+ +#ifndef TENSORFLOW_CORE_KERNELS_CONV_GRAD_INPUT_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_GRAD_INPUT_OPS_H_ + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include +#include +#include + +#include "absl/base/dynamic_annotations.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/conv_grad_shape_utils.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/work_sharder.h" + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/proto/proto_utils.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA +#include "xla/stream_executor/gpu/gpu_asm_opts.h" +#include "xla/stream_executor/gpu/redzone_allocator.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// Returns in 'im_data' (assumes to be zero-initialized) image patch in storage +// order (height, width, depth), constructed from patches in 'col_data', which +// is required to be in storage order (out_height * out_width, filter_height, +// filter_width, in_depth). Implementation by Yangqing Jia (jiayq). +template +void Col2im(const T* col_data, const int depth, const int height, + const int width, const int filter_h, const int filter_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, T* __restrict im_data) { + int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + T* im_patch_data = im_data + (h_pad * width + w_pad) * depth; + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ih >= 0 && ih < height && iw >= 0 && iw < width) { + for (int i = 0; i < depth; ++i) { + im_patch_data[i] += col_data[i]; + } + } + im_patch_data += depth; + col_data += depth; + } + // Jump over remaining number of depth. + im_patch_data += depth * (width - filter_w); + } + w_pad += stride_w; + } + h_pad += stride_h; + } +} + +// Computes backprop input using Eigen::SpatialConvolutionBackwardInput on CPU +// and GPU (for int32 only). 
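Col2im above is the scatter-add inverse of im2col: each column-buffer element is added back into the zero-initialized image location it was gathered from, so overlapping windows accumulate. For reference, a matching im2col gather over the same (out_height * out_width, filter_h, filter_w, depth) layout (sketch only, not part of the patch):

// im2col gather matching Col2im's layout above: patches are emitted in
// (out_row * out_col, filter_h, filter_w, depth) order, reading zeros where
// the window hangs over the padded border. Sketch only, not from the patch.
template <typename T>
void Im2colReference(const T* im_data, int depth, int height, int width,
                     int filter_h, int filter_w, int pad_t, int pad_l,
                     int pad_b, int pad_r, int stride_h, int stride_w,
                     T* col_data) {
  const int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
  const int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
  for (int h = 0; h < height_col; ++h) {
    for (int w = 0; w < width_col; ++w) {
      const int h_pad = h * stride_h - pad_t;
      const int w_pad = w * stride_w - pad_l;
      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
          for (int i = 0; i < depth; ++i) {
            const bool inside = ih >= 0 && ih < height && iw >= 0 && iw < width;
            *col_data++ = inside ? im_data[(ih * width + iw) * depth + i] : T(0);
          }
        }
      }
    }
  }
}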
+template +struct LaunchConv2DBackpropInputOpImpl { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_dilation, int col_dilation, int row_stride, + int col_stride, const Padding& padding, + const std::vector& explicit_paddings, + Tensor* in_backprop, TensorFormat data_format) { + std::vector strides(4, 1); + std::vector dilations(4, 1); + + auto input_h = GetTensorDimIndex(data_format, 'H'); + auto input_w = GetTensorDimIndex(data_format, 'W'); + strides[input_h] = row_stride; + strides[input_w] = col_stride; + dilations[input_h] = row_dilation; + dilations[input_w] = col_dilation; + + const TensorShape& input_shape = in_backprop->shape(); + const TensorShape& filter_shape = filter.shape(); + + ConvBackpropDimensions dims; + OP_REQUIRES_OK( + ctx, ConvBackpropComputeDimensionsV2( + "Conv2DBackpropInput", /*num_spatial_dims=*/2, input_shape, + filter_shape, out_backprop.shape(), dilations, strides, + padding, explicit_paddings, data_format, &dims)); + + int64_t padding_top = -1, padding_bottom = -1; + int64_t padding_left = -1, padding_right = -1; + if (padding == EXPLICIT) { + GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', + &padding_top, &padding_bottom); + GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', + &padding_left, &padding_right); + } + + int64_t expected_out_rows, expected_out_cols; + // The function is guaranteed to succeed because we checked the output and + // padding was valid earlier. + TF_CHECK_OK(GetWindowedOutputSizeVerbose( + dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, + row_dilation, row_stride, padding, &expected_out_rows, &padding_top, + &padding_bottom)); + DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows); + + TF_CHECK_OK(GetWindowedOutputSizeVerbose( + dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, + col_dilation, col_stride, padding, &expected_out_cols, &padding_left, + &padding_right)); + DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols); + + if (std::is_same::value) { + int64_t size = 1; +#define REQUIRES_32BIT(x) \ + size *= x; \ + OP_REQUIRES(ctx, \ + FastBoundsCheck(x, std::numeric_limits::max()) && \ + FastBoundsCheck(size, std::numeric_limits::max()), \ + errors::InvalidArgument("Tensor too large")) + + REQUIRES_32BIT(in_backprop->dim_size(0)); + REQUIRES_32BIT(in_backprop->dim_size(1) + padding_top + padding_bottom); + REQUIRES_32BIT(in_backprop->dim_size(2) + padding_left + padding_right); + REQUIRES_32BIT(in_backprop->dim_size(3)); +#undef REQUIRES_32BIT + } + + auto in_backprop_t = in_backprop->tensor(); + auto out_backprop_t = out_backprop.tensor(); + auto filter_t = filter.tensor(); + + // WARNING: Need to swap row/col, padding_top/padding_left, and + // padding_bottom/padding_right when calling Eigen. Eigen expects tensors + // in NWHC format, but Tensorflow uses NHWC. + + if (padding != EXPLICIT) { + // If padding was not explicitly defined, Eigen spatial convolution + // backward input will infer correct forward paddings from input tensors. 
+ functor::SpatialConvolutionBackwardInputFunc()( + ctx->eigen_device(), in_backprop_t, filter_t, out_backprop_t, + col_stride, row_stride, col_dilation, row_dilation); + } else { + functor::SpatialConvolutionBackwardInputWithExplicitPaddingFunc()( + ctx->eigen_device(), in_backprop_t, filter_t, out_backprop_t, + in_backprop_t.dimension(2) + (padding_left + padding_right), + in_backprop_t.dimension(1) + (padding_top + padding_bottom), + col_stride, row_stride, col_dilation, row_dilation, padding_top, + padding_left); + } + } +}; + +// Computes backprop input using Eigen::SpatialConvolutionBackwardInput on CPU. +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_dilation, int col_dilation, int row_stride, + int col_stride, const Padding& padding, + const std::vector& explicit_paddings, + Tensor* in_backprop, TensorFormat data_format) { + LaunchConv2DBackpropInputOpImpl launcher; + launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter, + row_dilation, col_dilation, row_stride, col_stride, padding, + explicit_paddings, in_backprop, data_format); + } +}; + +template +struct Conv2DCustomBackpropInputMatMulFunctor { + using MatrixMap = Eigen::Map< + Eigen::Matrix>; + using ConstMatrixMap = Eigen::Map< + const Eigen::Matrix>; + + void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data, + const int filter_total_size, const int output_image_size, + const int dims_out_depth, T* im2col_buf) { + // Compute gradient into 'im2col_buf'. + MatrixMap C(im2col_buf, output_image_size, filter_total_size); + + ConstMatrixMap A(out_data, output_image_size, dims_out_depth); + ConstMatrixMap B(filter_data, filter_total_size, dims_out_depth); + + C.noalias() = A * B.transpose(); + } +}; + +#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL) +template <> +struct Conv2DCustomBackpropInputMatMulFunctor { + using T = float; + + void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data, + const int filter_total_size, const int output_image_size, + const int dims_out_depth, T* im2col_buf) { + // Inputs are in RowMajor order. + // im2col = out_data * filter_data^T + // [ois x fts] = [ois x dod] * [fts x dod]^T + // + // Dimension names: + // out_image_size -> ois + // filter_total_size -> fts + // dims_out_depth -> dod + + const int m = output_image_size; + const int n = filter_total_size; + const int k = dims_out_depth; // contraction dim + + const char transposeA = 'N'; // sgemm(A) == filter_data + const char transposeB = 'T'; // sgemm(B) == out_data + + const int ldA = dims_out_depth; + const int ldB = dims_out_depth; + const int ldC = filter_total_size; + + const float alpha = 1.0; + const float beta = 0.0; + + // dnnl_sgemm code can't be instrumented with msan. + ANNOTATE_MEMORY_IS_INITIALIZED( + im2col_buf, filter_total_size * output_image_size * sizeof(T)); + + dnnl_status_t st = + dnnl_sgemm(transposeA, transposeB, m, n, k, alpha, out_data, ldA, + filter_data, ldB, beta, im2col_buf, ldC); + + OP_REQUIRES( + ctx, st == 0, + errors::Internal("Failed to call dnnl_sgemm. 
Error code: ", st)); + } +}; +#endif + +template +class Conv2DBackpropInputOp : public OpKernel { + public: + explicit Conv2DBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + int stride_n = GetTensorDim(strides_, data_format_, 'N'); + int stride_c = GetTensorDim(strides_, data_format_, 'C'); + int stride_h = GetTensorDim(strides_, data_format_, 'H'); + int stride_w = GetTensorDim(strides_, data_format_, 'W'); + OP_REQUIRES( + context, (stride_n == 1 && stride_c == 1), + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES(context, stride_h > 0 && stride_w > 0, + errors::InvalidArgument( + "Row and column strides should be larger than 0.")); + + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); + OP_REQUIRES(context, dilations_.size() == 4, + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); + int dilation_n = GetTensorDim(dilations_, data_format_, 'N'); + int dilation_c = GetTensorDim(dilations_, data_format_, 'C'); + int dilation_h = GetTensorDim(dilations_, data_format_, 'H'); + int dilation_w = GetTensorDim(dilations_, data_format_, 'W'); + OP_REQUIRES( + context, (dilation_n == 1 && dilation_c == 1), + errors::Unimplemented("Current implementation does not yet support " + "dilations in the batch and depth dimensions.")); + OP_REQUIRES( + context, dilation_h > 0 && dilation_w > 0, + errors::InvalidArgument("Dilated rates should be larger than 0.")); + + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES_OK(context, + context->GetAttr("explicit_paddings", &explicit_paddings_)); + OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, + /*num_dims=*/4, data_format_)); + + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + cudnn_use_autotune_ = CudnnUseAutotune(); + + if (std::is_same::value || + std::is_same::value) { + OP_REQUIRES( + context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument("Conv2DBackpropInputOp [CPU or GPU(int32)] " + "only supports NHWC data format.")); + + // TODO(yangzihao): Add a CPU implementation for dilated convolution. + OP_REQUIRES( + context, (dilation_h == 1 && dilation_w == 1), + errors::InvalidArgument( + "Conv2DBackpropInputOp [CPU or GPU(int32)] not yet support " + "dilation rates larger than 1.")); + } + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + const Tensor& out_backprop = context->input(2); + + OP_REQUIRES( + context, out_backprop.dims() == 4, + errors::InvalidArgument("input_sizes must be 4-dimensional, got: ", + out_backprop.dims())); + + TensorShape input_shape; + OP_REQUIRES_OK(context, + Conv2DBackpropComputeInputShape(input_sizes, filter.shape(), + out_backprop.shape(), + data_format_, &input_shape)); + + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + // If there is nothing to compute, return. 
+ if (input_shape.num_elements() == 0) { + return; + } + + // If shapes are valid but `out_backprop` is empty, in_backprop should be + // set to all zeros. Otherwise, cudnn/dnnl fail with an empty input. + if (out_backprop.NumElements() == 0) { + functor::SetZeroFunctor set_zero; + set_zero(context->eigen_device(), + in_backprop->template flat()); + return; + } + + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). + const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); + const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H'); + const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W'); + + VLOG(2) << "Conv2DBackpropInput:" + << " input: " << input_shape.DebugString() + << " filter:" << filter.shape().DebugString() + << " out_backprop: " << out_backprop.shape().DebugString() + << " strides: [" << stride_rows << ", " << stride_cols << "]" + << " dilations: [" << dilation_rows << ", " << dilation_cols << "]"; + + LaunchConv2DBackpropInputOp launch; + launch(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter, + dilation_rows, dilation_cols, stride_rows, stride_cols, padding_, + explicit_paddings_, in_backprop, data_format_); + } + + private: + std::vector dilations_; + std::vector strides_; + TensorFormat data_format_; + Padding padding_; + std::vector explicit_paddings_; + + bool use_cudnn_ = false; + bool cudnn_use_autotune_ = false; + + Conv2DBackpropInputOp(const Conv2DBackpropInputOp&) = delete; + void operator=(const Conv2DBackpropInputOp&) = delete; +}; + +// Based on implementation written by Yangqing Jia (jiayq). +template +class Conv2DCustomBackpropInputOp : public OpKernel { + public: + explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES(context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument( + "Conv2DCustomBackpropInputOp only supports NHWC.")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES( + context, (strides_[0] == 1 && strides_[3] == 1), + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0, + errors::InvalidArgument( + "Row and column strides should be larger than 0.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); + OP_REQUIRES(context, dilations_.size() == 4, + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); + OP_REQUIRES( + context, (dilations_[0] == 1 && dilations_[3] == 1), + errors::Unimplemented("Current implementation does not yet support " + "dilations in the batch and depth dimensions.")); + // TODO(yangzihao): Add a CPU implementation for dilated convolution. 
+ OP_REQUIRES( + context, (dilations_[1] == 1 && dilations_[2] == 1), + errors::InvalidArgument("Current CPU implementations do not yet " + "support dilation rates larger than 1.")); + OP_REQUIRES_OK(context, + context->GetAttr("explicit_paddings", &explicit_paddings_)); + OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, + /*num_dims=*/4, data_format_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + const Tensor& out_backprop = context->input(2); + OP_REQUIRES( + context, out_backprop.dims() == 4, + errors::InvalidArgument("input_sizes must be 4-dimensional, got: ", + out_backprop.dims())); + + TensorShape input_shape; + OP_REQUIRES_OK(context, + Conv2DBackpropComputeInputShape(input_sizes, filter.shape(), + out_backprop.shape(), + data_format_, &input_shape)); + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(context, + ConvBackpropComputeDimensionsV2( + "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2, + input_shape, filter.shape(), out_backprop.shape(), + /*dilations=*/{1, 1, 1, 1}, strides_, padding_, + explicit_paddings_, data_format_, &dims)); + + OP_REQUIRES(context, dims.in_depth == filter.shape().dim_size(2), + errors::InvalidArgument( + "Gradients for grouped convolutions are not " + "supported on CPU. Please file a feature request if you " + "run into this issue. Computed input depth ", + dims.in_depth, " doesn't match filter input depth ", + filter.shape().dim_size(2))); + OP_REQUIRES( + context, dims.out_depth == filter.shape().dim_size(3), + errors::InvalidArgument("Computed output depth ", dims.out_depth, + " doesn't match filter output depth ", + filter.shape().dim_size(3))); + + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + // If there is nothing to compute, return. + if (input_shape.num_elements() == 0) { + return; + } + + // If shapes are valid but `out_backprop` is empty, in_backprop should be + // set to all zeros. Otherwise, cudnn/dnnl fail with an empty input. + if (out_backprop.NumElements() == 0) { + functor::SetZeroFunctor set_zero; + set_zero(context->eigen_device(), + in_backprop->template flat()); + return; + } + + int64_t pad_top, pad_bottom; + int64_t pad_left, pad_right; + + if (padding_ == Padding::EXPLICIT) { + pad_top = explicit_paddings_[2]; + pad_bottom = explicit_paddings_[3]; + pad_left = explicit_paddings_[4]; + pad_right = explicit_paddings_[5]; + } + OP_REQUIRES_OK( + context, + GetWindowedOutputSizeVerbose( + dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size, + /*dilation_rate=*/1, dims.spatial_dims[0].stride, padding_, + &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom)); + OP_REQUIRES_OK( + context, + GetWindowedOutputSizeVerbose( + dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size, + /*dilation_rate=*/1, dims.spatial_dims[1].stride, padding_, + &dims.spatial_dims[1].output_size, &pad_left, &pad_right)); + + // The total dimension size of each kernel. + const int filter_total_size = dims.spatial_dims[0].filter_size * + dims.spatial_dims[1].filter_size * + dims.in_depth; + // The output image size is the spatial size of the output. + const int output_image_size = + dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size; + + // TODO(andydavis) Get L2/L3 cache sizes from device. 
+ const size_t l2_cache_size = 256LL << 10; + const size_t l3_cache_size = 30LL << 20; + + // Use L3 cache size as target working set size. + const size_t target_working_set_size = l3_cache_size / sizeof(T); + + // Calculate size of matrices involved in MatMul: C = A x B. + const size_t size_A = output_image_size * dims.out_depth; + + const size_t size_B = filter_total_size * dims.out_depth; + + const size_t size_C = output_image_size * filter_total_size; + + const size_t work_unit_size = size_A + size_B + size_C; + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + // Calculate per-thread work unit size. + const size_t thread_work_unit_size = + work_unit_size / worker_threads.num_threads; + + // Set minimum per-thread work unit size to size of L2 cache. + const size_t min_thread_work_unit_size = l2_cache_size / sizeof(T); + + // Use parallel tensor contractions if there is no batching, or if the + // minimum per-thread work unit size threshold has been exceeded. + // Otherwise, revert to multiple single-threaded matmul ops running in + // parallel to keep all threads busy. + // TODO(andydavis) Explore alternatives to branching the code in this way + // (i.e. run multiple, parallel tensor contractions in another thread pool). + const bool use_parallel_contraction = + dims.batch_size == 1 || + thread_work_unit_size >= min_thread_work_unit_size; + + OP_REQUIRES( + context, work_unit_size > 0, + errors::InvalidArgument("input, filter_sizes and out_backprop tensors " + "must all have at least 1 element")); + + const size_t shard_size = + use_parallel_contraction + ? 1 + : (target_working_set_size + work_unit_size - 1) / work_unit_size; + + Tensor col_buffer; + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum::value, + TensorShape({static_cast(shard_size), + static_cast(output_image_size), + static_cast(filter_total_size)}), + &col_buffer)); + + // The input offset corresponding to a single input image. + const int input_offset = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; + // The output offset corresponding to a single output image. + const int output_offset = dims.spatial_dims[0].output_size * + dims.spatial_dims[1].output_size * dims.out_depth; + + const T* filter_data = filter.template flat().data(); + T* col_buffer_data = col_buffer.template flat().data(); + const T* out_backprop_data = out_backprop.template flat().data(); + + auto in_backprop_flat = in_backprop->template flat(); + T* input_backprop_data = in_backprop_flat.data(); + in_backprop_flat.device(context->eigen_device()) = + in_backprop_flat.constant(T(0)); + + if (use_parallel_contraction) { + typedef Eigen::TensorMap, + Eigen::Unaligned> + TensorMap; + typedef Eigen::TensorMap, + Eigen::Unaligned> + ConstTensorMap; + + // Initialize contraction dims (we need to transpose 'B' below). + Eigen::array, 1> contract_dims; + contract_dims[0].first = 1; + contract_dims[0].second = 1; + + for (int image_id = 0; image_id < dims.batch_size; ++image_id) { + // Compute gradient into col_buffer. 
+ TensorMap C(col_buffer_data, output_image_size, filter_total_size); + + ConstTensorMap A(out_backprop_data + output_offset * image_id, + output_image_size, dims.out_depth); + ConstTensorMap B(filter_data, filter_total_size, dims.out_depth); + + C.device(context->eigen_cpu_device()) = A.contract(B, contract_dims); + + Col2im( + col_buffer_data, dims.in_depth, dims.spatial_dims[0].input_size, + dims.spatial_dims[1].input_size, dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size, pad_top, pad_left, pad_bottom, + pad_right, dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, + input_backprop_data); + + input_backprop_data += input_offset; + } + } else { + for (int image_id = 0; image_id < dims.batch_size; + image_id += shard_size) { + const int shard_limit = + std::min(static_cast(shard_size), + static_cast(dims.batch_size) - image_id); + + auto shard = [&context, &dims, &pad_top, &pad_left, &pad_bottom, + &pad_right, &output_image_size, &filter_total_size, + &input_backprop_data, &col_buffer_data, + &out_backprop_data, &filter_data, &input_offset, + &output_offset, &size_C](int64_t start, int64_t limit) { + for (int shard_id = start; shard_id < limit; ++shard_id) { + T* im2col_buf = col_buffer_data + shard_id * size_C; + T* input_data = input_backprop_data + shard_id * input_offset; + const T* out_data = out_backprop_data + shard_id * output_offset; + + Conv2DCustomBackpropInputMatMulFunctor()( + context, out_data, filter_data, filter_total_size, + output_image_size, dims.out_depth, im2col_buf); + + Col2im(im2col_buf, dims.in_depth, + dims.spatial_dims[0].input_size, + dims.spatial_dims[1].input_size, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size, pad_top, pad_left, + pad_bottom, pad_right, dims.spatial_dims[0].stride, + dims.spatial_dims[1].stride, input_data); + } + }; + Shard(worker_threads.num_threads, worker_threads.workers, shard_limit, + work_unit_size, shard); + + input_backprop_data += input_offset * shard_limit; + out_backprop_data += output_offset * shard_limit; + } + } + } + + private: + std::vector dilations_; + std::vector strides_; + Padding padding_; + std::vector explicit_paddings_; + TensorFormat data_format_; + + Conv2DCustomBackpropInputOp(const Conv2DCustomBackpropInputOp&) = delete; + void operator=(const Conv2DCustomBackpropInputOp&) = delete; +}; + +// TODO(ezhulenev): Add a cost model to switch between custom/Eigen ops. +#define DEFAULT_CONV_2D_BACKPROP_CPU_OP Conv2DCustomBackpropInputOp + +#define REGISTER_CONV_2D_BACKPROP_CPU_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint("T"), \ + DEFAULT_CONV_2D_BACKPROP_CPU_OP); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("custom") \ + .TypeConstraint("T"), \ + Conv2DCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .Label("eigen_tensor") \ + .TypeConstraint("T"), \ + Conv2DBackpropInputOp); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_GRAD_INPUT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_ops.h new file mode 100644 index 00000000..40e03b2b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_ops.h @@ -0,0 +1,215 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is the common header for the input and filter backprop kernels.
+//
+// The operation to compute Conv2D gradients.
+//
+// To compute the gradients for Conv2D, we need three input tensors:
+//    input, filter, and backprop for output.
+// And we need to compute two backprops: one for input and one for filter. We
+// compute them in two different kernels.
+//
+// Both backprops can be computed as straightforward conv2d.
+//
+// Consider a case where the input is 3x3 and the filter is 2x1:
+//
+// INPUT = [ A B C ]
+//         [ D E F ]
+//         [ G H I ]
+//
+// where each "A", "B", etc is batch x in_depth
+//
+// FILTER = [ X Y ]
+//
+// where both "X" and "Y" are in_depth x out_depth
+//
+// With VALID padding, the output is 3x2:
+//
+// OUTPUT = [ a b ]
+//          [ c d ]
+//          [ e f ]
+//
+// where each "a", "b", etc is batch x out_depth
+//
+// So we have:
+//
+//   a = A * X + B * Y
+//   b = B * X + C * Y
+//   c = D * X + E * Y
+//   d = E * X + F * Y
+//   e = G * X + H * Y
+//   f = H * X + I * Y
+//
+// So when we have backprops for the outputs (we denote them by
+// a', b', ... ):
+//
+// The backprops for the input are:
+//
+//   A' = a' * X^t
+//   B' = a' * Y^t + b' * X^t
+//   C' = b' * Y^t
+//   ...
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ 0 a' b' 0 ]
+//         [ 0 c' d' 0 ]
+//         [ 0 e' f' 0 ]
+// and
+//
+// FILTER = [ Y^t X^t ]
+//
+// The backprops for the filter are:
+//
+//   X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f'
+//   Y' = B^t * a' + C^t * b' + E^t * c' + F^t * d' + H^t * e' + I^t * f'
+//
+// This is essentially computing a 2d conv of
+//
+// INPUT = [ A^t B^t C^t ]
+//         [ D^t E^t F^t ]
+//         [ G^t H^t I^t ]
+//
+// and
+//
+// FILTER = [ a' b' ]
+//          [ c' d' ]
+//          [ e' f' ]
+//
+//
+//////////////////////////////////////////////////////////
+//
+// With stride more than one, it's a bit more complicated (we will need to
+// create holes to the backprop).
+//
+// Consider the case where
+//
+// INPUT = [ A B C D E ]
+//         [ F G H I J ]
+//         [ K L M N O ]
+// and
+//
+// FILTER = [ X Y Z ]
+//
+// with stride 2.
+//
+// The output will be
+//
+// OUTPUT = [ a b ]
+//          [ c d ]
+//
+// where:
+//
+//   a = A * X + B * Y + C * Z
+//   b = C * X + D * Y + E * Z
+//   c = K * X + L * Y + M * Z
+//   d = M * X + N * Y + O * Z
+//
+//
+// To compute the backprop for INPUT, we need to convolve
+//
+// INPUT = [ 0 0 a' 0 b' 0 0 ]
+//         [ 0 0 0  0 0  0 0 ]
+//         [ 0 0 c' 0 d' 0 0 ]
+//
+// (notice the holes in INPUT)
+//
+// and
+//
+// FILTER = [ Z^t Y^t X^t ]
+//
+// with stride 1.
+// +// To compute the backprop for FILTER, we need to convolve + +// +// INPUT = [ A^t B^t C^t D^t E^t ] +// [ F^t G^t H^t I^t J^t ] +// [ K^t L^t M^t N^t O^t ] +// and +// +// FILTER = [ a' 0 b' ] +// [ 0 0 0 ] +// [ c' 0 d' ] +// +// (notice the holes in FILTER) +// +// +// with stride 1 +// +////////////////////////////////////////////////////////// +// +// +// The case for SAME padding is in fact very similar to VALID -- we just +// need to pad the input tensor a bit when computing the filter_backprop. + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_ + +#include + +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +// Forward declaration. +class OpKernelContext; + +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_dilation, int col_dilation, int row_stride, + int col_stride, const Padding& padding, + const std::vector& explicit_paddings, + Tensor* in_backprop, TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_dilation, int col_dilation, int row_stride, + int col_stride, const Padding& padding, + const std::vector& explicit_paddings, + Tensor* filter_backprop, TensorFormat data_format); +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_dilation, + int col_dilation, int row_stride, int col_stride, + const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_dilation, int col_dilation, int row_stride, + int col_stride, const Padding& padding, + const std::vector& explicit_paddings, + Tensor* filter_backprop, TensorFormat data_format); +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_shape_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_shape_utils.h new file mode 100644 index 00000000..d83c1bb2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_grad_shape_utils.h @@ -0,0 +1,93 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_GRAD_SHAPE_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_GRAD_SHAPE_UTILS_H_ + +#include + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { +// Information about a single spatial dimension for a convolution +// backpropagation. +struct ConvBackpropSpatialDimension { + int64_t input_size; + int64_t filter_size; + int64_t output_size; + int64_t stride; + int64_t dilation; + + // Output size after scaling by the stride. + int64_t expanded_output_size; + + // Number of padding elements to be added before/after this dimension of + // the input when computing Conv?DBackpropInput. + int64_t pad_before, pad_after; +}; + +// Computed dimensions for a backwards convolution. +struct ConvBackpropDimensions { + // Information about each spatial dimension. + absl::InlinedVector spatial_dims; + + // Batch size. + int64_t batch_size; + + // Input and output feature depth. + int64_t in_depth, out_depth; + + // Convenience access methods for spatial dimensions properties. + int64_t input_size(int dim) const { return spatial_dims[dim].input_size; } + int64_t filter_size(int dim) const { return spatial_dims[dim].filter_size; } + int64_t output_size(int dim) const { return spatial_dims[dim].output_size; } + int64_t stride(int dim) const { return spatial_dims[dim].stride; } + int64_t dilation(int dim) const { return spatial_dims[dim].dilation; } + + // Compute padding for the given spatial dimension. + int SpatialPadding(const Padding& padding, int dim) const; +}; + +// Common code between implementations of Conv?DBackpropInput and +// Conv?DBackpropFilter. Verifies that the dimensions all match, and computes +// sizes/padding for the spatial dimensions. Does not support explicit padding. +absl::Status ConvBackpropComputeDimensions( + absl::string_view label, int num_spatial_dims, + const TensorShape& input_shape, const TensorShape& filter_shape, + const TensorShape& out_backprop_shape, const std::vector& strides, + Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims); + +// The V2 version computes the same outputs with arbitrary dilation rate and +// supports explicit padding. +// TODO(b/67112639): Merge V2 versions and the original versions eventually. +absl::Status ConvBackpropComputeDimensionsV2( + absl::string_view label, int num_spatial_dims, + const TensorShape& input_shape, const TensorShape& filter_shape, + const TensorShape& out_backprop_shape, absl::Span dilations, + const std::vector& strides, Padding padding, + absl::Span explicit_paddings, TensorFormat data_format, + ConvBackpropDimensions* dims); + +// Computes the shape of the in_backprop. +absl::Status Conv2DBackpropComputeInputShape( + const Tensor& input_sizes, const TensorShape& filter_shape, + const TensorShape& out_backprop_shape, const TensorFormat& data_format, + TensorShape* input_shape); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_GRAD_SHAPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops.h new file mode 100644 index 00000000..65c63fec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops.h @@ -0,0 +1,140 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/util/tensor_format.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { + +// Forward declaration. +class OpKernelContext; + +template +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_dilation, + int col_dilation, int row_stride, int col_stride, + const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format); +}; + +template +struct LaunchConvOp { + void operator()(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, + const std::vector& dilations, + const std::vector& strides, Padding padding, + const std::vector& explicit_paddings, + TensorFormat data_format, Tensor* output); +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_dilation, + int col_dilation, int row_stride, int col_stride, + const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format); +}; + +template +struct LaunchConvOp { + void operator()(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, + const std::vector& dilations, + const std::vector& strides, const Padding padding, + const std::vector& explicit_paddings, + TensorFormat data_format, Tensor* output); +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Used to keep track of persistent memory buffers used within the op. +// It uses malloc and free to avoid the time cost of initializing the memory. +template +struct Im2ColBufferResource : public ResourceBase { + Im2ColBufferResource() { + data = static_cast(port::Malloc(size * sizeof(T))); + } + ~Im2ColBufferResource() { port::Free(data); } + // This mutex ensures that only a single operation at a time is able to use + // the buffer memory held by this resource. + mutex mu; + T* data; + string DebugString() const { return "Im2ColBufferResource"; } +}; + +// Convolution parameters specified by Op attributes. +struct Conv2DParameters { + std::vector dilations; + std::vector strides; + Padding padding; + TensorFormat data_format; + std::vector explicit_paddings; +}; + +// Convolution dimensions inferred from parameters, input and filter tensors. 
+struct Conv2DDimensions { + int batch; + int input_rows; + int input_cols; + int in_depth; + + int filter_rows; + int filter_cols; + int patch_depth; + int out_depth; + + int stride_rows; + int stride_cols; + + int dilation_rows; + int dilation_cols; + + int64_t out_rows; + int64_t out_cols; + int64_t pad_rows_before; + int64_t pad_rows_after; + int64_t pad_cols_before; + int64_t pad_cols_after; +}; + +// Initializes and validates Conv2D parameters configured by OpKernel +// attributes. +absl::Status InitConv2DParameters(const OpKernelConstruction* context, + Conv2DParameters* params); + +// Computes and validates convolutions dimensions from Conv2D parameters. If +// parameters are valid, dimensions will be updated with derived convolution +// dimensions, otherwise an error will be returned. +absl::Status ComputeConv2DDimension(const Conv2DParameters& params, + const Tensor& input, const Tensor& filter, + Conv2DDimensions* dimensions); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_fused_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_fused_impl.h new file mode 100644 index 00000000..5e35562b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_fused_impl.h @@ -0,0 +1,848 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implements convolution operations with other kernels baked into the +// processing, to optimize latency and memory usage: +// - Conv2D + BiasAdd + +// - Conv2D + FusedBatchNorm + +// +// Activation: Relu, Relu6, Elu, etc... +// +// Kernels for convolutions fused with image transformations (resize and mirror +// padding) defined in `conv_ops_fused_image_transform.cc`. +// +// For the CPU device we implement fusion with an Eigen tensor contraction +// output kernel. For the GPU device we rely on CuDNN primitives. +// +// NOTE: GPU only supports fusion of Conv2D + BiasAdd + . 
+ +#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_ + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include +#include +#include +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/fused_eigen_output_kernels.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/profiler/lib/scoped_annotation.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/util/use_cudnn.h" + +#if GOOGLE_CUDA +#include "third_party/gpus/cudnn/cudnn.h" +#include "xla/stream_executor/gpu/gpu_asm_opts.h" +#include "xla/stream_executor/gpu/redzone_allocator.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h" +#include "tensorflow/core/util/autotune_maps/conv_parameters.h" +#include "tensorflow/core/util/proto/proto_utils.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +struct LaunchFusedConv2DOp { + void operator()(OpKernelContext* context, bool use_cudnn, + bool cudnn_use_autotune, const Tensor& input, + const Tensor& filter, FusedComputationType fusion, + const FusedComputationArgs& fusion_args, + const Conv2DParameters& params, + const Conv2DDimensions& dimensions, Tensor* output); +}; + +// This is CPU-only implementation that uses Eigen contraction output kernels. +// +// Dispatch 2D convolution to the appropriate primitive operation: +// (1) MatMul for the case of 1x1 convolution. +// (2) MatMul for the case when filter size equals to the input size. +// (3) General spatial 2D convolution for all other cases. +template +class LaunchFusedConv2DWithOutputKernel { + public: + LaunchFusedConv2DWithOutputKernel( + int row_stride, int col_stride, // + int row_dilation, int col_dilation, // + Padding padding, const std::vector& explicit_paddings) + : row_stride_(row_stride), + col_stride_(col_stride), + row_dilation_(row_dilation), + col_dilation_(col_dilation), + padding_(padding), + explicit_paddings_(explicit_paddings) {} + + template + void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx, + const Tensor& input, const Tensor& filter, Tensor* output) { + // Wrap output_kernel into type erased wrapper to reduce the number of + // unique template instantiations for Eigen Tensor contraction expressions. + OutputKernelWrapper output_kernel_wrapper( + [&output_kernel]( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, Eigen::Index i, + Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) { + output_kernel(output_mapper, params, i, j, num_rows, num_cols); + }); + + if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && + row_stride_ == 1 && col_stride_ == 1 && padding_ != EXPLICIT) { + int conv_width = 1; // Width for the convolution step. 
+ for (int i = 0; i < 3; ++i) { + conv_width *= output->dim_size(i); + } + + Eigen::array, 1> dim_pair; + dim_pair[0] = Eigen::IndexPair(1, 0); + functor::MatMulConvFunctor()( + ctx->eigen_device(), + output->shaped({conv_width, filter.dim_size(3)}), + input.shaped({conv_width, filter.dim_size(2)}), + filter.shaped({filter.dim_size(2), filter.dim_size(3)}), + dim_pair, std::move(output_kernel_wrapper)); + + } else if (filter.dim_size(0) == input.dim_size(1) && + filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 && + col_dilation_ == 1 && padding_ == VALID) { + // If the input data and filter have the same height/width, + // reduce the 2D convolution to matrix multiplication. + const auto k = // Length of reduction dimension. + filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2); + + Eigen::array, 1> dim_pair; + dim_pair[0] = Eigen::IndexPair(1, 0); + functor::MatMulConvFunctor()( + ctx->eigen_device(), + output->shaped({input.dim_size(0), filter.dim_size(3)}), + input.shaped({input.dim_size(0), k}), + filter.shaped({k, filter.dim_size(3)}), dim_pair, + std::move(output_kernel_wrapper)); + + } else { + if (padding_ == EXPLICIT) { + functor::SpatialConvolution()( + ctx->eigen_device(), output->tensor(), + input.tensor(), filter.tensor(), row_stride_, + col_stride_, row_dilation_, col_dilation_, + static_cast(explicit_paddings_[2]), + static_cast(explicit_paddings_[3]), + static_cast(explicit_paddings_[4]), + static_cast(explicit_paddings_[5]), + std::move(output_kernel_wrapper)); + } else { + functor::SpatialConvolution()( + ctx->eigen_device(), output->tensor(), + input.tensor(), filter.tensor(), row_stride_, + col_stride_, row_dilation_, col_dilation_, + BrainPadding2EigenPadding(padding_), + std::move(output_kernel_wrapper)); + } + } + } + + private: + // Wrap output_kernel into type erased struct to reduce the number of unique + // template instantiations for Eigen Tensor contraction expressions. + // + // We do not pass std::function directly as an output kernel because it blows + // up the binary size in debug mode with super long symbol names. 
+ struct OutputKernelWrapper { + using OutputKernelFn = + std::function&, + const Eigen::TensorContractionParams&, Eigen::Index, + Eigen::Index, Eigen::Index, Eigen::Index)>; + + explicit OutputKernelWrapper(OutputKernelFn fn) + : output_kernel_fn(std::move(fn)) {} + + void operator()( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, Eigen::Index i, + Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const { + output_kernel_fn(output_mapper, params, i, j, num_rows, num_cols); + } + + OutputKernelFn output_kernel_fn; + }; + + int row_stride_; + int col_stride_; + int row_dilation_; + int col_dilation_; + const Padding padding_; + const std::vector& explicit_paddings_; +}; + +template +struct LaunchFusedConv2DOp { + void operator()(OpKernelContext* context, bool use_cudnn, + bool cudnn_use_autotune, const Tensor& input, + const Tensor& filter, const FusedComputationType fusion, + const FusedComputationArgs& fusion_args, + const Conv2DParameters& params, + const Conv2DDimensions& dimensions, Tensor* output) { + OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2), + errors::Unimplemented("Fused conv implementation does not " + "support grouped convolutions for now.")); + OP_REQUIRES(context, params.data_format == FORMAT_NHWC, + errors::Unimplemented("Fused conv implementation only supports " + "NHWC tensor format for now.")); + OP_REQUIRES(context, DataTypeToEnum::value != DT_HALF, + errors::Unimplemented("Fused conv implementation with half " + "precision is not supported on CPU.")); + + BiasAddArgs bias_add_args; + if (BiasAddArgs::IsSupported(fusion)) { + if (fusion == FusedComputationType::kBiasAddWithLeakyRelu) { + OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add_args, + &fusion_args.leakyrelu_alpha)); + } else { + OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add_args)); + } + } + + FusedBatchNormArgs fused_batch_norm_args; + if (FusedBatchNormArgs::IsSupported(fusion)) { + if (fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu) { + OP_REQUIRES_OK(context, + InitFusedBatchNormArgs(context, fusion_args.epsilon, + &fused_batch_norm_args, + &fusion_args.leakyrelu_alpha)); + } else { + OP_REQUIRES_OK(context, + InitFusedBatchNormArgs(context, fusion_args.epsilon, + &fused_batch_norm_args)); + } + } + + LaunchFusedConv2DWithOutputKernel conv2d( + dimensions.stride_rows, dimensions.stride_cols, + dimensions.dilation_rows, dimensions.dilation_cols, params.padding, + params.explicit_paddings); + + switch (fusion) { + case FusedComputationType::kUndefined: + OP_REQUIRES_OK(context, errors::Internal("Fusion type is undefined")); + break; + case FusedComputationType::kBiasAdd: + conv2d(WithBiasAdd(bias_add_args), context, input, filter, output); + break; + case FusedComputationType::kBiasAddWithRelu: + conv2d(WithBiasAddAndRelu(bias_add_args), context, input, filter, + output); + break; + case FusedComputationType::kBiasAddWithRelu6: + conv2d(WithBiasAddAndRelu6(bias_add_args), context, input, filter, + output); + break; + case FusedComputationType::kBiasAddWithLeakyRelu: + conv2d(WithBiasAddAndLeakyRelu(bias_add_args), context, input, + filter, output); + break; + case FusedComputationType::kBiasAddWithElu: + conv2d(WithBiasAddAndElu(bias_add_args), context, input, filter, + output); + break; + case FusedComputationType::kFusedBatchNorm: + conv2d( + WithFusedBatchNorm(fusion_args.epsilon, fused_batch_norm_args), + context, input, filter, output); + break; + case 
FusedComputationType::kFusedBatchNormWithRelu: + conv2d(WithFusedBatchNormAndRelu(fusion_args.epsilon, + fused_batch_norm_args), + context, input, filter, output); + break; + case FusedComputationType::kFusedBatchNormWithRelu6: + conv2d(WithFusedBatchNormAndRelu6(fusion_args.epsilon, + fused_batch_norm_args), + context, input, filter, output); + break; + case FusedComputationType::kFusedBatchNormWithLeakyRelu: + conv2d(WithFusedBatchNormAndLeakyRelu(fusion_args.epsilon, + fused_batch_norm_args), + context, input, filter, output); + break; + case FusedComputationType::kFusedBatchNormWithElu: + conv2d(WithFusedBatchNormAndElu(fusion_args.epsilon, + fused_batch_norm_args), + context, input, filter, output); + break; + default: + OP_REQUIRES_OK(context, errors::Internal("Fusion type is unsupported")); + break; + } + } +}; + +template <> +struct LaunchFusedConv2DOp; + +template <> +struct LaunchFusedConv2DOp; + +#if GOOGLE_CUDA + +inline int64_t ConvolveScratchSize() { + static int64_t convolve_scratch_size = GetDnnWorkspaceLimit( + // default value is in bytes despite the name of the environment variable + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB + ); + return convolve_scratch_size; +} + +template +struct LaunchFusedConv2DOp { + void operator()(OpKernelContext* context, bool use_cudnn, + bool cudnn_use_autotune, const Tensor& input_param, + const Tensor& filter, FusedComputationType fusion, + const FusedComputationArgs& fusion_args, + const Conv2DParameters& params, + const Conv2DDimensions& dimensions, Tensor* output) { + OP_REQUIRES( + context, + params.data_format == FORMAT_NHWC || params.data_format == FORMAT_NCHW, + errors::Unimplemented("Fused conv implementation only supports " + "NHWC and HCHW tensor formats for now.")); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + OP_REQUIRES( + context, use_cudnn, + errors::Unimplemented("FusedConv2D for GPU is not currently supported " + "without cudnn")); + + bool is_supported_activation = + fusion == FusedComputationType::kBiasAddWithRelu || + fusion == FusedComputationType::kBiasAddWithRelu6 || + fusion == FusedComputationType::kBiasAddWithElu || + fusion == FusedComputationType::kBiasAddWithLeakyRelu; + OP_REQUIRES( + context, is_supported_activation, + errors::Unimplemented("FusedConv2D implementation only supports " + "fusing with `BiasAdd + Relu|Relu6|Elu|LeakyRlue`" + " for now.")); + + Tensor input = input_param; + + const int64_t in_batch = GetTensorDim(input, params.data_format, 'N'); + int64_t in_rows = GetTensorDim(input, params.data_format, 'H'); + int64_t in_cols = GetTensorDim(input, params.data_format, 'W'); + const int64_t in_depths = GetTensorDim(input, params.data_format, 'C'); + + const int64_t patch_rows = filter.dim_size(0); + const int64_t patch_cols = filter.dim_size(1); + const int64_t patch_depths = filter.dim_size(2); + + const int64_t out_batch = GetTensorDim(*output, params.data_format, 'N'); + const int64_t out_rows = GetTensorDim(*output, params.data_format, 'H'); + const int64_t out_cols = GetTensorDim(*output, params.data_format, 'W'); + const int64_t out_depths = GetTensorDim(*output, params.data_format, 'C'); + + // Bias of the following dimensions: [ output_depth ] + const Tensor& bias = context->input(2); + OP_REQUIRES(context, bias.dims() == 1, + errors::InvalidArgument("bias must be 1-dimensional", + bias.shape().DebugString())); + OP_REQUIRES(context, bias.dim_size(0) == out_depths, + 
errors::InvalidArgument("bias depth must be equal to out depth", + bias.shape().DebugString())); + + const int64_t common_padding_rows = + std::min(dimensions.pad_rows_before, dimensions.pad_rows_after); + const int64_t common_padding_cols = + std::min(dimensions.pad_cols_before, dimensions.pad_cols_after); + if (dimensions.pad_rows_before != dimensions.pad_rows_after || + dimensions.pad_cols_before != dimensions.pad_cols_after) { + // cuDNN only supports padding the same amount on the left and right + // sides, and on the top and bottom sides. So we manually create a new + // padded input tensor such that we can pass it to cuDNN. + + // TODO(reedwm): In some cases, we can avoid an allocation even if the two + // padding sides are different. For example, if the input is 2x2, the + // filter is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the + // result is equivalent to as if the padding is (1, 1, 1, 1). Changing the + // padding in such a way would allow us to avoid the allocation. + Tensor transformed_input; + const int64_t padding_rows_diff = + std::abs(dimensions.pad_rows_after - dimensions.pad_rows_before); + const int64_t padding_cols_diff = + std::abs(dimensions.pad_cols_after - dimensions.pad_cols_before); + const int64_t new_in_rows = in_rows + padding_rows_diff; + const int64_t new_in_cols = in_cols + padding_cols_diff; + TensorShape transformed_input_shape; + OP_REQUIRES_OK(context, + ShapeFromFormatWithStatus( + params.data_format, in_batch, new_in_rows, new_in_cols, + in_depths, &transformed_input_shape)); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + transformed_input_shape, + &transformed_input)); + const int64_t input_pad_top = + dimensions.pad_rows_before - common_padding_rows; + const int64_t input_pad_bottom = + dimensions.pad_rows_after - common_padding_rows; + const int64_t input_pad_left = + dimensions.pad_cols_before - common_padding_cols; + const int64_t input_pad_right = + dimensions.pad_cols_after - common_padding_cols; + bool in_bounds = + FastBoundsCheck(input_pad_top, std::numeric_limits::max()) && + FastBoundsCheck(input_pad_bottom, std::numeric_limits::max()) && + FastBoundsCheck(input_pad_left, std::numeric_limits::max()) && + FastBoundsCheck(input_pad_right, std::numeric_limits::max()); + if (!in_bounds) { + context->SetStatus(errors::InvalidArgument("Padding is too large.")); + return; + } + functor::PadInput()( + context->eigen_device(), + To32Bit(input_param.tensor()), + {{static_cast(input_pad_top), static_cast(input_pad_left)}}, + {{static_cast(input_pad_bottom), + static_cast(input_pad_right)}}, + To32Bit(transformed_input.tensor()), params.data_format, T{}); + input = transformed_input; + in_rows = new_in_rows; + in_cols = new_in_cols; + } + + const bool compute_in_nhwc = DataTypeToEnum::value == DT_HALF && + stream->GetCudaComputeCapability().IsAtLeast( + se::CudaComputeCapability::VOLTA); + if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) { + // Convert the input tensor from NHWC to NCHW. + TensorShape nchw_shape; + OP_REQUIRES_OK( + context, ShapeFromFormatWithStatus(FORMAT_NCHW, in_batch, in_rows, + in_cols, in_depths, &nchw_shape)); + if (in_depths > 1) { + Tensor transformed_input; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + nchw_shape, &transformed_input)); + functor::NHWCToNCHW()( + context->eigen_device(), + const_cast(input).tensor(), + transformed_input.tensor()); + input = transformed_input; + } else { + // If depth <= 1, then just reshape. 
+ CHECK(input.CopyFrom(input, nchw_shape)); // Crash OK + } + } + + CHECK(common_padding_rows >= 0) << "Negative padding rows"; // Crash OK + CHECK(common_padding_rows >= 0) << "Negative padding cols"; // Crash OK + + se::dnn::ActivationMode dnn_activation_mode; + switch (fusion) { + case FusedComputationType::kBiasAddWithRelu: + dnn_activation_mode = se::dnn::ActivationMode::kRelu; + break; + case FusedComputationType::kBiasAddWithRelu6: + dnn_activation_mode = se::dnn::ActivationMode::kRelu6; + break; + case FusedComputationType::kBiasAddWithElu: + dnn_activation_mode = se::dnn::ActivationMode::kElu; + break; + case FusedComputationType::kBiasAddWithLeakyRelu: + dnn_activation_mode = se::dnn::ActivationMode::kLeakyRelu; + break; + default: + LOG(FATAL) << "Unsupported fusion type"; // Crash OK + } + + const TensorFormat compute_data_format = + compute_in_nhwc ? FORMAT_NHWC : FORMAT_NCHW; + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + std::tie(compute_data_layout, filter_layout) = + compute_in_nhwc ? kComputeInNHWC : kComputeInNCHW; + + se::dnn::BatchDescriptor input_desc; + input_desc.set_count(in_batch) + .set_feature_map_count(in_depths) + .set_height(in_rows) + .set_width(in_cols) + .set_layout(compute_data_layout); + se::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(patch_rows) + .set_input_filter_width(patch_cols) + .set_input_feature_map_count(patch_depths) + .set_output_feature_map_count(filter.dim_size(3)) + .set_layout(filter_layout); + se::dnn::BatchDescriptor bias_desc; + bias_desc.set_count(1) + .set_height(1) + .set_width(1) + .set_feature_map_count(out_depths) + .set_layout(compute_data_layout); + se::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_dilation_rate(dimensions.dilation_rows) + .set_horizontal_dilation_rate(dimensions.dilation_cols) + .set_vertical_filter_stride(dimensions.stride_rows) + .set_horizontal_filter_stride(dimensions.stride_cols) + .set_zero_padding_height(common_padding_rows) + .set_zero_padding_width(common_padding_cols) + .set_group_count(in_depths / patch_depths); + se::dnn::BatchDescriptor output_desc; + output_desc.set_count(out_batch) + .set_height(out_rows) + .set_width(out_cols) + .set_feature_map_count(out_depths) + .set_layout(compute_data_layout); + + Tensor transformed_filter; + const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status { + VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) + << " to " << ToString(dst_format); + + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? 
TensorShape({filter.dim_size(3), filter.dim_size(2), + filter.dim_size(0), filter.dim_size(1)}) + : TensorShape({filter.dim_size(3), filter.dim_size(0), + filter.dim_size(1), filter.dim_size(2)}); + + TF_RETURN_IF_ERROR(context->allocate_temp( + DataTypeToEnum::value, dst_shape, &transformed_filter)); + functor::TransformFilter()( + context->eigen_device(), dst_format, + To32Bit(filter.tensor()), + To32Bit(transformed_filter.tensor())); + + return OkStatus(); + }; + + if (compute_in_nhwc) { + OP_REQUIRES_OK(context, transform_filter(FORMAT_OHWI)); + } else { + OP_REQUIRES_OK(context, transform_filter(FORMAT_OIHW)); + } + + Tensor transformed_output; + if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) { + // Only allocate temporary memory when a layout transformation is needed. + TensorShape transformed_output_shape; + OP_REQUIRES_OK(context, ShapeFromFormatWithStatus( + FORMAT_NCHW, out_batch, out_rows, out_cols, + out_depths, &transformed_output_shape)); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + transformed_output_shape, + &transformed_output)); + } else { + transformed_output = *output; + } + + const auto tensor_on_device = [](const Tensor& t) -> se::DeviceMemory { + return AsDeviceMemory(t.template flat().data(), + t.template flat().size()); + }; + + se::DeviceMemory input_ptr = tensor_on_device(input); + se::DeviceMemory filter_ptr = tensor_on_device(transformed_filter); + se::DeviceMemory bias_ptr = tensor_on_device(bias); + se::DeviceMemory output_ptr = tensor_on_device(transformed_output); + + // We do not use side inputs, so we can safely pass nullptr. + se::DeviceMemory side_input_ptr = + AsDeviceMemory(static_cast(nullptr), 0); + + constexpr double kConvScale = 1.0; + constexpr double kSideInputScale = 0.0; + double leakyrelu_alpha = fusion_args.leakyrelu_alpha; + + DataType dtype = input.dtype(); + ConvParameters conv_parameters = { + stream->parent(), + in_batch, // batch + in_depths, // in_depths + {{in_rows, // in_rows + in_cols}}, // in_cols + compute_data_format, // compute_data_format + out_depths, // out_depths + {{patch_rows, // filter_rows + patch_cols, // filter_cols + patch_depths}}, // filter_depths + {{dimensions.dilation_rows, // dilation_rows + dimensions.dilation_cols}}, // dilation_cols + {{dimensions.stride_rows, // stride_rows + dimensions.stride_cols}}, // stride_cols + {{common_padding_rows, // padding_rows + common_padding_cols}}, // padding_cols + dtype, // tensor datatype + conv_desc.group_count(), + ConvParameters::FusionInfo{kConvScale, kSideInputScale, leakyrelu_alpha, + dnn_activation_mode, // activation_mode + /*is_contrib=*/false}}; + + se::dnn::DataType element_type = se::dnn::ToDataType::value; + + auto entry_or = AutotuneFusedConv( + cudnn_use_autotune, FusedConvAutotuneMap::GetInstance(), + conv_parameters, context, input_desc, filter_desc, bias_desc, + output_desc, conv_desc, dnn_activation_mode, kConvScale, + kSideInputScale, leakyrelu_alpha, input_ptr, filter_ptr, output_ptr, + bias_ptr, side_input_ptr, ConvolveScratchSize()); + OP_REQUIRES_OK(context, entry_or.status()); + auto autotune_entry = std::move(entry_or).value(); + + DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context); + Status cudnn_launch_status; + if (!autotune_entry.is_algorithm_config()) { + auto& runners = autotune_entry.GetOpRunners(); + se::dnn::FusedConvOp::Config config{se::dnn::ConvolutionKind::FORWARD, + element_type, + element_type, + element_type, + kConvScale, + kSideInputScale, + leakyrelu_alpha, + 
input_desc, + filter_desc, + bias_desc, + output_desc, + conv_desc, + dnn_activation_mode}; + auto primary_or = runners.primary->GetOrCreateRunner(config, stream); + OP_REQUIRES_OK(context, primary_or.status()); + auto* primary = primary_or.value(); + + const se::dnn::FusedConvRunner* no_scratch_fallback = nullptr; + if (runners.no_scratch_fallback) { + auto no_scratch_fallback_or = + runners.no_scratch_fallback->GetOrCreateRunner(config, stream); + OP_REQUIRES_OK(context, no_scratch_fallback_or.status()); + no_scratch_fallback = no_scratch_fallback_or.value(); + } + + auto runner_and_scratch_or = + AllocateScratchOrFallback( + &scratch_allocator, primary, no_scratch_fallback); + OP_REQUIRES_OK(context, runner_and_scratch_or.status()); + auto runner_and_scratch = std::move(runner_and_scratch_or).value(); + auto& runner = + *std::get(runner_and_scratch); + cudnn_launch_status = runner( + stream, nullptr, std::get(runner_and_scratch), + input_ptr, filter_ptr, side_input_ptr, bias_ptr, output_ptr); + } else { + auto dnn = stream->parent()->AsDnn(); + OP_REQUIRES(context, dnn != nullptr, + absl::InternalError("No DNN for stream.")); + cudnn_launch_status = dnn->FusedConvolveWithAlgorithm( + stream, input_desc, input_ptr, // input + kConvScale, // input_scale + filter_desc, filter_ptr, // filter + conv_desc, // conv + side_input_ptr, kSideInputScale, // side_input + bias_desc, bias_ptr, // bias + dnn_activation_mode, // activation + output_desc, &output_ptr, // output + &scratch_allocator, autotune_entry.GetAlgorithmConfig(), nullptr); + } + + OP_REQUIRES_OK(context, cudnn_launch_status); + + // Convert the output tensor back from NCHW to NHWC. + if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) { + functor::NCHWToNHWC()( + context->eigen_device(), + const_cast(transformed_output).tensor(), + output->tensor()); + } + } +}; + +template <> +struct LaunchFusedConv2DOp; + +template <> +struct LaunchFusedConv2DOp; + +#endif // GOOGLE_CUDA + +template +class FusedConv2DOp : public OpKernel { + public: + explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, InitConv2DParameters(context, ¶ms_)); + + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + cudnn_use_autotune_ = CudnnUseAutotune(); + + using FCT = FusedComputationType; + + std::vector patterns; + if (std::is_same::value) { + patterns = { + {FCT::kBiasAdd, {"BiasAdd"}}, + {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}, + {FCT::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}}, + {FCT::kBiasAddWithElu, {"BiasAdd", "Elu"}}, + {FCT::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}}, + {FCT::kFusedBatchNorm, {"FusedBatchNorm"}}, + {FCT::kFusedBatchNormWithRelu, {"FusedBatchNorm", "Relu"}}, + {FCT::kFusedBatchNormWithRelu6, {"FusedBatchNorm", "Relu6"}}, + {FCT::kFusedBatchNormWithElu, {"FusedBatchNorm", "Elu"}}, + {FCT::kFusedBatchNormWithLeakyRelu, {"FusedBatchNorm", "LeakyRelu"}}, + }; + } + + // NOTE(ezhulenev): CuDNN `cudnnConvolutionBiasActivationForward` supports + // identity activation function, it in theory should allow to fuse + // convolution with BiasAdd, but in practice it doesn't work, cuDNN ignores + // this parameter and always does Relu activation. 
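To make the fusion pattern tables above and below easier to follow, here is a minimal standalone sketch of the matching idea: a list of fused op names is compared against a fixed set of supported sequences and mapped to an enum, and (as the note above explains) a bare BiasAdd has no usable cuDNN fusion on the GPU path. The names FusionKind and MatchFusion are illustrative stand-ins, not the TensorFlow API.

#include <optional>
#include <string>
#include <utility>
#include <vector>

enum class FusionKind {
  kBiasAddWithRelu,
  kBiasAddWithRelu6,
  kBiasAddWithElu,
  kBiasAddWithLeakyRelu,
};

// Returns the fusion whose op-name sequence exactly matches `fused_ops`,
// or std::nullopt when the sequence is not supported (e.g. {"BiasAdd"} alone).
std::optional<FusionKind> MatchFusion(const std::vector<std::string>& fused_ops) {
  static const std::vector<std::pair<FusionKind, std::vector<std::string>>>
      kPatterns = {
          {FusionKind::kBiasAddWithRelu, {"BiasAdd", "Relu"}},
          {FusionKind::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}},
          {FusionKind::kBiasAddWithElu, {"BiasAdd", "Elu"}},
          {FusionKind::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}},
      };
  for (const auto& [kind, pattern] : kPatterns) {
    if (pattern == fused_ops) return kind;
  }
  return std::nullopt;
}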
+ if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { + patterns = {{FCT::kBiasAdd, {"BiasAdd"}}, + {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}}; + } else { + patterns = { + {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}, + {FCT::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}}, + {FCT::kBiasAddWithElu, {"BiasAdd", "Elu"}}, + {FCT::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}}, + }; + } + } + + OP_REQUIRES_OK(context, InitializeFusedComputation( + context, "Conv2D", patterns, + &fused_computation_, &fused_computation_args_)); + } + + void Compute(OpKernelContext* context) override { + // Input tensor is of the following dimensions: + // [ batch, in_rows, in_cols, in_depth ] + const Tensor& input = context->input(0); + + // Input filter is of the following dimensions: + // [ filter_rows, filter_cols, in_depth, out_depth] + const Tensor& filter = context->input(1); + + Conv2DDimensions dimensions; + OP_REQUIRES_OK(context, + ComputeConv2DDimension(params_, input, filter, &dimensions)); + + TensorShape out_shape; + OP_REQUIRES_OK( + context, ShapeFromFormatWithStatus( + params_.data_format, dimensions.batch, dimensions.out_rows, + dimensions.out_cols, dimensions.out_depth, &out_shape)); + + // Output tensor is of the following dimensions: + // [ in_batch, out_rows, out_cols, out_depth ] + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + VLOG(2) << "FusedConv2D: in_depth = " << dimensions.in_depth + << ", patch_depth = " << dimensions.patch_depth + << ", input_cols = " << dimensions.input_cols + << ", filter_cols = " << dimensions.filter_cols + << ", input_rows = " << dimensions.input_rows + << ", filter_rows = " << dimensions.filter_rows + << ", stride_rows = " << dimensions.stride_rows + << ", stride_cols = " << dimensions.stride_cols + << ", dilation_rows = " << dimensions.dilation_rows + << ", dilation_cols = " << dimensions.dilation_cols + << ", out_depth = " << dimensions.out_depth; + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + return; + } + + LaunchFusedConv2DOp()(context, use_cudnn_, cudnn_use_autotune_, + input, filter, fused_computation_, + fused_computation_args_, params_, + dimensions, output); + } + + private: + Conv2DParameters params_; + bool use_cudnn_; + bool cudnn_use_autotune_; + + FusedComputationType fused_computation_ = FusedComputationType::kUndefined; + FusedComputationArgs fused_computation_args_; + + FusedConv2DOp(const FusedConv2DOp&) = delete; + void operator=(const FusedConv2DOp&) = delete; +}; + +// Registration of the CPU implementations. +#define REGISTER_FUSED_CPU_CONV2D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedConv2DOp); + +#if GOOGLE_CUDA + +#define DECLARE_FUNCTOR_GPU_SPEC(T) \ + template <> \ + void TransformFilter::operator()( \ + const GPUDevice& d, FilterTensorFormat dst_filter_format, \ + typename TTypes::ConstTensor in, \ + typename TTypes::Tensor out); \ + extern template struct TransformFilter; \ + template <> \ + void PadInput::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor in, \ + const std::array& padding_left, \ + const std::array& padding_right, \ + typename TTypes::Tensor out, TensorFormat data_format, \ + const T& padding_value); \ + extern template struct PadInput + +// Registration of the GPU implementations. 
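The DECLARE_FUNCTOR_GPU_SPEC block above leans on extern/explicit template instantiation so the functor definitions built in the CUDA translation unit are reused rather than re-instantiated by every including file. A minimal self-contained illustration of that standard C++ mechanism follows; PadFunctorSketch is a made-up type, not one of the functors declared above.

// Header side (sketch): declare the template and suppress implicit
// instantiation in translation units that only include the declaration.
template <typename T>
struct PadFunctorSketch {
  void operator()(const T* in, T* out, int n, T pad_value);
};
extern template struct PadFunctorSketch<float>;  // definition lives in one TU

// Source side (sketch): the one translation unit that defines and explicitly
// instantiates the functor, analogous to the *.cu.cc files in this header set.
template <typename T>
void PadFunctorSketch<T>::operator()(const T* in, T* out, int n, T pad_value) {
  for (int i = 0; i < n; ++i) out[i] = (in != nullptr) ? in[i] : pad_value;
}
template struct PadFunctorSketch<float>;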
+#define REGISTER_FUSED_GPU_CONV2D(T) \ + REGISTER_KERNEL_BUILDER(Name("_FusedConv2D") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("host_args"), \ + FusedConv2DOp); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_gpu.h new file mode 100644 index 00000000..627450ef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_gpu.h @@ -0,0 +1,213 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/gpu_utils.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/util/autotune_maps/conv_parameters.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +bool ComputeInNhwcEnabled(DataType data_type, se::Stream* stream, + bool use_4d_tensor = true); + +// Get the Dnn workspace limit from the environment variable, which is in MB. +// Return the workspace memory limit in bytes. If no value is set, return the +// default value. +int64 GetDnnWorkspaceLimit(const string& envvar_in_mb, + int64_t default_value_in_bytes); + +// Call the Dnn workspace limit from TF_CUDNN_WORKSPACE_LIMIT_IN_MB or default. +int64 GetDnnWorkspaceLimitOrDefault(); + +// A class to provide scratch-space allocator for Stream-Executor Cudnn +// callback. TensorFlow is responsible for releasing the temporary buffers after +// the kernel finishes. 
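A simplified, standard-library-only sketch of the allocator contract described in the comment above: allocations are capped by a byte budget, and the buffers stay alive for the lifetime of the pool so the kernel can keep using them. BoundedScratchPool is a hypothetical name used only for illustration, not the class that follows.

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

class BoundedScratchPool {
 public:
  explicit BoundedScratchPool(std::size_t limit_bytes) : limit_(limit_bytes) {}

  // Returns a pointer to `n` bytes, or std::nullopt when the budget would be
  // exceeded. The backing buffer is owned by the pool until destruction.
  std::optional<std::uint8_t*> Allocate(std::size_t n) {
    if (n > limit_ - used_) return std::nullopt;
    buffers_.emplace_back(n);
    used_ += n;
    return buffers_.back().data();
  }

  std::size_t TotalBytes() const { return used_; }

 private:
  std::size_t limit_;
  std::size_t used_ = 0;
  std::vector<std::vector<std::uint8_t>> buffers_;  // owns all scratch memory
};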
+class DnnScratchAllocator : public se::ScratchAllocator { + public: + virtual ~DnnScratchAllocator() {} + DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context) + : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} + int64 GetMemoryLimitInBytes() override { return memory_limit_; } + tsl::StatusOr> AllocateBytes( + int64_t byte_size) override { + Tensor temporary_memory; + if (byte_size < 0) { + return tsl::Status{absl::StatusCode::kInvalidArgument, + "Requested negative byte size!"}; + } + if (byte_size > memory_limit_) { + return tsl::Status{absl::StatusCode::kUnavailable, + absl::StrCat("Requested memory size (", byte_size, + ") exceeds the max memory limit (", + memory_limit_, ").")}; + } + AllocationAttributes allocation_attr; + allocation_attr.retry_on_failure = false; + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory, + AllocatorAttributes(), allocation_attr)); + if (!allocation_status.ok()) { + return tsl::Status{ + absl::StatusCode::kUnavailable, + absl::StrCat("Failed to allocate the requested memory size (", + byte_size, ").")}; + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + total_byte_size_ += byte_size; + return tsl::StatusOr>( + AsDeviceMemory(temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + int64 TotalByteSize() { return total_byte_size_; } + + private: + int64 memory_limit_; + int64 total_byte_size_; + OpKernelContext* context_; + std::vector allocated_tensors_; +}; + +typedef Eigen::GpuDevice GPUDevice; + +// Select an algorithm for the given convolution, either by running actual +// autotuning with a cache, or by falling back to a default if +// 'cudnn_use_autotune' is true and cuDNN is the statically-chosen DNN backend. +template +StatusOr> AutotuneFusedConv( + bool cudnn_use_autotune, + AutotuneMap>* + autotune_map, + const ConvParameters& params, OpKernelContext* ctx, + const se::dnn::BatchDescriptor& input_desc, + const se::dnn::FilterDescriptor& filter_desc, + const se::dnn::BatchDescriptor& bias_desc, + const se::dnn::BatchDescriptor& output_desc, + const se::dnn::ConvolutionDescriptor& conv_desc, + const se::dnn::ActivationMode activation_mode, double conv_input_scale, + double side_input_scale, double leakyrelu_alpha, + se::DeviceMemory input_ptr, se::DeviceMemory filter_ptr, + se::DeviceMemory output_ptr, se::DeviceMemory bias_ptr, + se::DeviceMemory side_input_ptr, int64_t scratch_size); + +template +StatusOr> AutotuneUnfusedConv( + bool cudnn_use_autotune, + AutotuneMap>* autotune_map, + const ConvParameters& conv_parameters, OpKernelContext* ctx, + se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc, + se::DeviceMemory input_ptr, const se::dnn::FilterDescriptor& filter_desc, + se::DeviceMemory filter_ptr, + const se::dnn::ConvolutionDescriptor& conv_desc, + const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory output_ptr, + int64_t scratch_size_limit); + +// Returns a pointer to the primary 'OpRunner' of 'runners' and allocated +// scratch memory if allocatable; else a pointer to its fallback +// no-scratch-space runner, and a null 'DeviceMemoryBase'. 
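The selection logic described in the comment above can be summarized in a standalone sketch: try to allocate the primary runner's workspace, and only if that fails fall back to a runner that needs no scratch at all. RunnerSketch, ScratchSketch, and PickRunner are toy stand-ins, not StreamExecutor types.

#include <cstddef>
#include <functional>
#include <optional>
#include <stdexcept>
#include <utility>

struct RunnerSketch {
  std::size_t workspace_bytes;  // scratch space this runner requires
};
struct ScratchSketch {
  void* ptr = nullptr;
  std::size_t size = 0;
};

// Prefer the primary runner; if its workspace cannot be allocated, use the
// no-scratch fallback (which must truly need zero bytes), else report failure.
std::pair<const RunnerSketch*, ScratchSketch> PickRunner(
    const RunnerSketch* primary, const RunnerSketch* no_scratch_fallback,
    const std::function<std::optional<ScratchSketch>(std::size_t)>& allocate) {
  if (primary->workspace_bytes == 0) return {primary, {}};
  if (auto scratch = allocate(primary->workspace_bytes)) {
    return {primary, *scratch};
  }
  if (no_scratch_fallback != nullptr &&
      no_scratch_fallback->workspace_bytes == 0) {
    return {no_scratch_fallback, {}};
  }
  throw std::runtime_error("no usable runner: scratch allocation failed");
}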
+template +StatusOr*, se::DeviceMemoryBase>> +AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator, + const se::dnn::OpRunner* primary, + const se::dnn::OpRunner* no_scratch_fallback) { + const se::dnn::OpRunner* selected_runner = primary; + + auto workspace_size = selected_runner->GetWorkspaceSize(); + + se::DeviceMemoryBase scratch_memory; + if (workspace_size > 0) { + auto scratch_or = scratch_allocator->AllocateBytes(workspace_size); + if (scratch_or.ok()) { + scratch_memory = scratch_or.value(); + } else if ((selected_runner = no_scratch_fallback)) { + if (selected_runner->GetWorkspaceSize() > 0) { + return errors::Internal( + "No-scratch fallback runner requires nonzero scratch space"); + } + } else { + return errors::Unknown( + "CUDNN failed to allocate the scratch space for the runner or to " + "find a working no-scratch runner."); + } + } + + return std::make_tuple(selected_runner, scratch_memory); +} + +template +Status LaunchAutotunedConv(const AutotuneEntry& autotune_entry, + DnnScratchAllocator* scratch_allocator, + se::dnn::ConvolutionKind kind, se::Stream* stream, + const se::dnn::BatchDescriptor& input_desc, + se::DeviceMemory in_ptr, + const se::dnn::FilterDescriptor& filter_desc, + se::DeviceMemory filter_ptr, + const se::dnn::ConvolutionDescriptor& conv_desc, + const se::dnn::BatchDescriptor& output_desc, + se::DeviceMemory out_ptr) { + if (!autotune_entry.is_algorithm_config()) { + const auto& runners = autotune_entry.GetOpRunners(); + se::dnn::DataType element_type = se::dnn::ToDataType::value; + se::dnn::ConvOp::Config config{kind, element_type, element_type, + input_desc, filter_desc, output_desc, + conv_desc}; + TF_ASSIGN_OR_RETURN(auto* primary, + runners.primary->GetOrCreateRunner(config, stream)); + + const se::dnn::ConvRunner* no_scratch_fallback = nullptr; + if (runners.no_scratch_fallback) { + TF_ASSIGN_OR_RETURN( + no_scratch_fallback, + runners.no_scratch_fallback->GetOrCreateRunner(config, stream)); + } + + TF_ASSIGN_OR_RETURN(auto runner_and_scratch, + AllocateScratchOrFallback( + scratch_allocator, primary, no_scratch_fallback)); + auto& runner = *std::get(runner_and_scratch); + return runner(stream, nullptr, + std::get(runner_and_scratch), in_ptr, + filter_ptr, out_ptr); + } else { + auto dnn = stream->parent()->AsDnn(); + if (dnn == nullptr) { + return absl::InternalError("No DNN for stream."); + } + return dnn->ConvolveWithAlgorithm( + stream, kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, + out_ptr, conv_desc, scratch_allocator, + autotune_entry.GetAlgorithmConfig(), nullptr); + } +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_impl.h new file mode 100644 index 00000000..0d3fc798 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/conv_ops_impl.h @@ -0,0 +1,1284 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/nn_ops.cc. + +#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_CONV_OPS_IMPL_H_ + +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/op_requires.h" + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/synchronization/blocking_counter.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/conv_3d.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/deep_conv2d.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/profiler/lib/scoped_annotation.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/util/use_cudnn.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/cast_op.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/numeric_options_utils.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h" +#include "tensorflow/core/util/autotune_maps/conv_parameters.h" +#include "tensorflow/core/util/proto/proto_utils.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA +#include "xla/stream_executor/gpu/gpu_asm_opts.h" +#include "xla/stream_executor/gpu/redzone_allocator.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +struct LaunchGeneric { + void operator()(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int row_stride, int col_stride, + int row_dilation, int col_dilation, const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format) { + DCHECK(data_format == FORMAT_NHWC) + << "Generic conv implementation only " + "supports NHWC tensor format for now."; + if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 && + col_stride == 1 && (padding == SAME || padding == VALID)) { + // For 1x1 kernel, the 2D convolution is reduced to matrix + // multiplication. 
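The 1x1 special case mentioned in the comment above amounts to a reshape plus a matrix multiply: the NHWC input [N, H, W, Cin] is viewed as an (N*H*W) x Cin matrix, the 1x1 filter as Cin x Cout, and their product is the output viewed as (N*H*W) x Cout. A naive standalone sketch of that equivalence, not the Eigen-based path used below:

#include <cstddef>
#include <vector>

// input is N*H*W*Cin floats in NHWC order, filter is Cin*Cout floats in HWIO
// order (with H = W = 1); the result has N*H*W*Cout floats in NHWC order.
std::vector<float> Conv1x1AsMatmul(const std::vector<float>& input,
                                   const std::vector<float>& filter,
                                   int nhw, int cin, int cout) {
  std::vector<float> output(static_cast<std::size_t>(nhw) * cout, 0.f);
  for (int row = 0; row < nhw; ++row) {
    for (int k = 0; k < cin; ++k) {
      const float x = input[row * cin + k];
      for (int col = 0; col < cout; ++col) {
        output[row * cout + col] += x * filter[k * cout + col];
      }
    }
  }
  return output;
}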
+ // + // TODO(vrv): We should be able to call SpatialConvolution + // and it will produce the same result, but doing so + // led to NaNs during training. Using matmul instead for now. + int conv_width = 1; // Width for the convolution step. + for (int i = 0; i < 3; ++i) { + conv_width *= output->dim_size(i); + } + + Eigen::array, 1> dim_pair; + dim_pair[0] = Eigen::IndexPair(1, 0); + functor::MatMulConvFunctor()( + ctx->eigen_device(), + output->shaped({conv_width, filter.dim_size(3)}), + input.shaped({conv_width, filter.dim_size(2)}), + filter.shaped({filter.dim_size(2), filter.dim_size(3)}), + dim_pair); + } else if (filter.dim_size(0) == input.dim_size(1) && + filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 && + col_dilation == 1 && padding == VALID) { + // If the input data and filter have the same height/width, + // the 2D convolution is reduced to matrix multiplication. + const int k = // Length of reduction dimension. + filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2); + + Eigen::array, 1> dim_pair; + dim_pair[0] = Eigen::IndexPair(1, 0); + functor::MatMulConvFunctor()( + ctx->eigen_device(), + output->shaped({input.dim_size(0), filter.dim_size(3)}), + input.shaped({input.dim_size(0), k}), + filter.shaped({k, filter.dim_size(3)}), dim_pair); + } else { + if (padding == EXPLICIT) { + functor::SpatialConvolution()( + ctx->eigen_device(), output->tensor(), + input.tensor(), filter.tensor(), row_stride, col_stride, + row_dilation, col_dilation, static_cast(explicit_paddings[2]), + static_cast(explicit_paddings[3]), + static_cast(explicit_paddings[4]), + static_cast(explicit_paddings[5])); + } else { + functor::SpatialConvolution()( + ctx->eigen_device(), output->tensor(), + input.tensor(), filter.tensor(), row_stride, col_stride, + row_dilation, col_dilation, BrainPadding2EigenPadding(padding)); + } + } + } +}; + +// Compute grouped 2D convolutions on CPU. Unlike grouped convolution +// implementation in cuDNN this is faaaaaar from optimal and needs more work +// to deliver competitive performance. Currently it exists to close the feature +// parity gap between convolution operations on different devices. +template +struct LaunchGrouped { + void operator()(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int row_stride, int col_stride, + int row_dilation, int col_dilation, const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format) { + DCHECK(data_format == FORMAT_NHWC) + << "Grouped conv implementation only " + "supports NHWC tensor format for now."; + + const int64_t in_depth = input.dim_size(3); + const int64_t patch_depth = filter.dim_size(2); + const int64_t num_groups = in_depth / patch_depth; + + // Shuffle input/filter tensors to have group as a leading dimension. + std::array shuffle({3, 0, 1, 2, 4}); + + // Compute pre shuffle dimemnsions. + auto pre_shuffle = [&](const Tensor& tensor) -> std::array { + return {tensor.dim_size(0), tensor.dim_size(1), tensor.dim_size(2), + num_groups, tensor.dim_size(3) / num_groups}; + }; + + // Compute post shuffle dimemnsions. + auto post_shuffle = [&](const Tensor& tensor) -> std::array { + return {num_groups, tensor.dim_size(0), tensor.dim_size(1), + tensor.dim_size(2), tensor.dim_size(3) / num_groups}; + }; + + auto& device = ctx->eigen_device(); + + absl::BlockingCounter shuffles_completed(2); + auto on_shuffled = [&]() { shuffles_completed.DecrementCount(); }; + + // Shuffle input into temporary tensor. 
+ Tensor input_shuffled; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(input.dtype(), TensorShape(post_shuffle(input)), + &input_shuffled)); + input_shuffled.tensor().device(device, on_shuffled) = + input.shaped(pre_shuffle(input)).shuffle(shuffle); + + // Shuffle filter into temporary tensor. + Tensor filter_shuffled; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(filter.dtype(), + TensorShape(post_shuffle(filter)), + &filter_shuffled)); + filter_shuffled.tensor().device(device, on_shuffled) = + filter.shaped(pre_shuffle(filter)).shuffle(shuffle); + + // Wait for the completion of input/filter shuffles. + shuffles_completed.Wait(); + + // Write group convolution results into temporary output tensor. + Tensor output_shuffled; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(output->dtype(), + TensorShape(post_shuffle(*output)), + &output_shuffled)); + + for (int64_t i = 0; i < num_groups; ++i) { + // TODO(ezhulenev): Run this loop using `parallelFor` (regular parallelFor + // will lead to deadlock, SpatialConvolution has to use async Eigen + // assignment). This requires small changes to Eigen to support async + // exeuction for tensor chipping operation. + + // TODO(ezhulenev): Grouped convolution should also support 1x1 filter + // optimization. + + auto input_slice = input_shuffled.tensor().template chip<0>(i); + auto filter_slice = filter_shuffled.tensor().template chip<0>(i); + auto output_slice = output_shuffled.tensor().template chip<0>(i); + + if (padding == EXPLICIT) { + functor::SpatialConvolution()( + ctx->eigen_device(), output_slice, input_slice, + filter_slice, row_stride, col_stride, row_dilation, col_dilation, + static_cast(explicit_paddings[2]), + static_cast(explicit_paddings[3]), + static_cast(explicit_paddings[4]), + static_cast(explicit_paddings[5])); + } else { + functor::SpatialConvolution()( + ctx->eigen_device(), output_slice, input_slice, + filter_slice, row_stride, col_stride, row_dilation, col_dilation, + BrainPadding2EigenPadding(padding)); + } + } + + // Shuffle temporary output back into pre-shuffled shape. + std::array rev_shuffle({1, 2, 3, 0, 4}); + output->shaped(pre_shuffle(*output)).device(device) = + output_shuffled.tensor().shuffle(rev_shuffle); + } +}; + +template +struct LaunchConvOp; + +template +struct LaunchConvOp { + void operator()(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, + const std::vector& dilations, + const std::vector& strides, const Padding padding, + const std::vector& explicit_paddings, + TensorFormat data_format, Tensor* output) { + // For now just calling existing launchers based on spatial dimensions. + int spatial_dims = input.dims() - 2; + + if (spatial_dims == 2) { + LaunchConv2DOp()(context, true, cudnn_use_autotune, input, + filter, dilations[1], dilations[2], + strides[1], strides[2], padding, + explicit_paddings, output, data_format); + } else { + LaunchConv3DOp().launch( + context, cudnn_use_autotune, input, filter, + {dilations[1], dilations[2], dilations[3]}, + {strides[1], strides[2], strides[3]}, padding, data_format, output); + } + } +}; + +template +class ConvOp : public BinaryOp { + public: + explicit ConvOp(OpKernelConstruction* context) : BinaryOp(context) { + // TODO(b/290223810) Add support for grouped and depthwise convolutions. 
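The constructor below maps the op's CHANNELS_LAST / CHANNELS_FIRST attribute onto a tensor layout and then reads strides, dilations, and padding against that layout. A tiny standalone sketch of that mapping, with hypothetical names (LayoutSketch, ParseLayout, ChannelDimIndex) rather than the TensorFormat utilities used in the real code:

#include <stdexcept>
#include <string>

enum class LayoutSketch { kChannelsLast, kChannelsFirst };

LayoutSketch ParseLayout(const std::string& s) {
  if (s == "CHANNELS_LAST") return LayoutSketch::kChannelsLast;
  if (s == "CHANNELS_FIRST") return LayoutSketch::kChannelsFirst;
  throw std::invalid_argument("Unknown data format: " + s);
}

// For a rank-`rank` tensor with batch at dimension 0, the channel dimension is
// the last one for channels-last layouts and dimension 1 for channels-first.
int ChannelDimIndex(LayoutSketch layout, int rank) {
  return layout == LayoutSketch::kChannelsLast ? rank - 1 : 1;
}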
+ OP_REQUIRES_OK(context, context->GetAttr("groups", &groups_)); + OP_REQUIRES(context, groups_ == 1, + absl::UnimplementedError( + "Grouped/Depthwise Convolutions are not supported yet.")); + string data_format_str; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(context, + data_format_str == "CHANNELS_LAST" || + data_format_str == "CHANNELS_FIRST", + absl::InvalidArgumentError( + absl::StrCat("Unknown data format: ", data_format_str))); + data_format_ = + data_format_str == "CHANNELS_LAST" ? FORMAT_NHWC : FORMAT_NCHW; + + // Always assume filter_format is HWIO / DHWIO. + filter_format_ = FilterTensorFormat::FORMAT_HWIO; + + // These parameters are checked against spatial dimensions on compute. + OP_REQUIRES_OK(context, context->GetAttr("batch_dims", &batch_dims_)); + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); + if (context->HasAttr("explicit_paddings")) { + OP_REQUIRES_OK( + context, context->GetAttr("explicit_paddings", &explicit_paddings_)); + } + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + cudnn_use_autotune_ = CudnnUseAutotune(); + } + + void Compute(OpKernelContext* context) override { + // Input tensor is of the following dimensions: + // [ batch, [spatial_dims], in_depth ]. + const Tensor& input = context->input(0); + size_t original_input_dims = context->input(0).dims(); + const TensorShape original_input_shape = context->input(0).shape(); + int spatial_dims = original_input_dims - 1 - batch_dims_; + + // Input filter is of the following dimensions: + // [ batch, [spatial dims], in_depth ]. + const Tensor& filter = context->input(1); + + OP_REQUIRES(context, (spatial_dims == 2 || spatial_dims == 3), + absl::InvalidArgumentError(absl::StrCat( + "The input must have 2 or 3 spatial dimensions but got ", + spatial_dims))); + + OP_REQUIRES( + context, filter.NumElements() > 0, + absl::InvalidArgumentError("filter must not have zero elements " + "(i.e. all dimensions must be non-zero)")); + + // Flatten tensor for computation. + Tensor input_flat; + if (batch_dims_ == 1) { + input_flat = input; + } else { + std::vector in_flat_shape_vec(1, 1); + for (int i = 0; i < batch_dims_; ++i) { + in_flat_shape_vec[0] *= original_input_shape.dim_size(i); + } + for (int i = batch_dims_; i < original_input_shape.dims(); ++i) { + in_flat_shape_vec.push_back(original_input_shape.dim_size(i)); + } + TensorShape in_flat_shape(in_flat_shape_vec); + if (!input_flat.CopyFrom(input, in_flat_shape)) { + // This should never happen, since the output sizes should always be the + // same after expanding batches. + context->SetStatus(absl::InternalError(absl::StrCat( + "Could not flatten input shape ", + original_input_shape.DebugString(), " and flat input shape ", + in_flat_shape.DebugString()))); + } + } + + OP_REQUIRES(context, filter.dims() == 4 || filter.dims() == 5, + absl::InvalidArgumentError(absl::StrCat( + "The filter must be rank 4 or 5 but got ", filter.dims()))); + for (int i = 0; i < spatial_dims; i++) { + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + absl::InvalidArgumentError("filter too large")); + } + + // Validate operation parameters based on inferred spatial dims. 
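The flattening step just above collapses the leading batch_dims dimensions into a single batch dimension so the 2-D/3-D launchers see a rank they understand. A minimal sketch of that shape arithmetic, assuming a plain vector of dimension sizes rather than TensorShape:

#include <cstddef>
#include <cstdint>
#include <vector>

// Collapses the first `batch_dims` entries of `shape` into one batch entry.
// e.g. {2, 3, 32, 32, 16} with batch_dims = 2 becomes {6, 32, 32, 16}.
std::vector<std::int64_t> FlattenBatchDims(
    const std::vector<std::int64_t>& shape, int batch_dims) {
  std::vector<std::int64_t> flat(1, 1);
  for (int i = 0; i < batch_dims; ++i) flat[0] *= shape[i];
  for (std::size_t i = batch_dims; i < shape.size(); ++i) {
    flat.push_back(shape[i]);
  }
  return flat;
}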
+ OP_REQUIRES(context, strides_.size() == spatial_dims + 2, + absl::InvalidArgumentError( + absl::StrCat("Sliding window strides field must specify ", + spatial_dims + 2, " dimensions"))); + + OP_REQUIRES(context, + (GetTensorDim(strides_, data_format_, 'C') == 1 && + GetTensorDim(strides_, data_format_, 'N') == 1), + absl::InvalidArgumentError( + "Current implementation does not support " + "strides in the batch and depth dimensions.")); + bool stride_valid = true; + for (int i = 0; i < spatial_dims; ++i) { + stride_valid = + stride_valid && (GetTensorDim(strides_, data_format_, + static_cast(i + '0')) > 0); + } + OP_REQUIRES( + context, stride_valid, + absl::InvalidArgumentError("Spatial strides should be larger than 0.")); + if (dilations_.empty()) { + dilations_ = std::vector(spatial_dims + 2, 1); + } else { + OP_REQUIRES(context, dilations_.size() == spatial_dims + 2, + absl::InvalidArgumentError( + absl::StrCat("Dilation rates field must specify", + spatial_dims + 2, "dimensions"))); + OP_REQUIRES(context, + (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), + absl::InvalidArgumentError( + "Current implementation does not support " + "dilation rates in the batch and depth dimensions.")); + bool dilation_valid = true; + for (int i = 0; i < spatial_dims; ++i) { + dilation_valid = + dilation_valid && (GetTensorDim(dilations_, data_format_, + static_cast(i + '0')) > 0); + } + OP_REQUIRES( + context, dilation_valid, + absl::InvalidArgumentError("Dilated rates should be larger than 0.")); + } + OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, + spatial_dims + 2, data_format_)); + + const int64_t in_depth_raw = GetTensorDim(input_flat, data_format_, 'C'); + const int64_t patch_depth_raw = GetFilterDim(filter, filter_format_, 'I'); + OP_REQUIRES(context, + FastBoundsCheck(in_depth_raw, std::numeric_limits::max()), + absl::InvalidArgumentError("Input depth too large")); + OP_REQUIRES( + context, + FastBoundsCheck(patch_depth_raw, std::numeric_limits::max()), + absl::InvalidArgumentError("Patch depth too large")); + const int in_depth = static_cast(in_depth_raw); + const int patch_depth = static_cast(patch_depth_raw); + OP_REQUIRES( + context, patch_depth > 0, + absl::InvalidArgumentError(absl::StrCat( + "filter depth must be stricly positive, got ", patch_depth))); + OP_REQUIRES(context, in_depth == patch_depth, + absl::InvalidArgumentError(absl::StrCat( + "Input depth must be equal to filter depth: ", in_depth, + " vs ", patch_depth))); + + const int out_depth = + static_cast(GetFilterDim(filter, filter_format_, 'O')); + + std::vector input_dims_raw(spatial_dims); + std::vector input_dims(spatial_dims); + std::vector filter_dims(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + input_dims_raw[i] = + GetTensorDim(input_flat, data_format_, static_cast(i + '0')); + OP_REQUIRES( + context, + FastBoundsCheck(input_dims_raw[i], std::numeric_limits::max()), + absl::InvalidArgumentError( + absl::StrCat("Input spatial dimension ", i, " too large"))); + input_dims[i] = static_cast(input_dims_raw[i]); + filter_dims[i] = static_cast( + GetFilterDim(filter, filter_format_, static_cast(i + '0'))); + } + // The first dimension for input is batch. 
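The checks above enforce the same rule for both strides and dilations: the attribute must list spatial_dims + 2 entries, the batch and channel entries must be 1, and every spatial entry must be positive. A compact standalone sketch of that validation for a channels-last layout; ValidateWindowAttr is a hypothetical helper, not part of the op:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Returns an error message, or an empty string when the attribute is valid.
std::string ValidateWindowAttr(const std::vector<std::int64_t>& attr,
                               int spatial_dims, const std::string& name) {
  if (attr.size() != static_cast<std::size_t>(spatial_dims) + 2) {
    return name + " must specify " + std::to_string(spatial_dims + 2) +
           " dimensions";
  }
  if (attr.front() != 1 || attr.back() != 1) {
    return name + " must be 1 in the batch and depth dimensions";
  }
  for (int i = 1; i <= spatial_dims; ++i) {
    if (attr[i] <= 0) return name + " spatial entries must be larger than 0";
  }
  return "";
}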
+ const int64_t batch_raw = GetTensorDim(input_flat, data_format_, 'N'); + OP_REQUIRES(context, + FastBoundsCheck(batch_raw, std::numeric_limits::max()), + absl::InvalidArgumentError("Batch is too large")); + const int batch = static_cast(batch_raw); + + // Take the stride and dilation from the spatial dimensions only (we + // do not support striding or dilation on the batch or depth dimension). + std::vector stride_dims(spatial_dims); + std::vector dilation_dims(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + stride_dims[i] = + GetTensorDim(strides_, data_format_, static_cast(i + '0')); + dilation_dims[i] = + GetTensorDim(dilations_, data_format_, static_cast(i + '0')); + } + std::vector pad_before(spatial_dims, -1); + std::vector pad_after(spatial_dims, -1); + if (padding_ == Padding::EXPLICIT) { + GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', + &pad_before[0], &pad_after[0]); + GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', + &pad_before[1], &pad_after[1]); + } + + // Compute windowed output sizes for spatial dimensions. + std::vector out_dims(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + input_dims[i], filter_dims[i], + dilation_dims[i], stride_dims[i], padding_, + &out_dims[i], &pad_before[i], &pad_after[i])); + } + TensorShape out_shape; + OP_REQUIRES_OK(context, + ShapeFromFormatWithStatus(data_format_, batch, out_dims, + out_depth, &out_shape)); + + Tensor* output; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + return; + } + + // If the input is empty, result can only be due to padding. + if (input_flat.NumElements() == 0) { + // Zero-out output and return. + functor::SetZeroFunctor()(context->eigen_device(), + output->template flat()); + + return; + } + + launcher_(context, cudnn_use_autotune_, input_flat, filter, dilations_, + strides_, padding_, explicit_paddings_, data_format_, output); + + // Reshape the output to preserve original batch dimensions. + if (batch_dims_ != 1) { + std::vector reshape_vect(batch_dims_); + for (int i = 0; i < batch_dims_; ++i) { + reshape_vect[i] = original_input_shape.dim_size(i); + } + for (int i = 1; i < out_shape.dims(); ++i) { + reshape_vect.push_back(out_shape.dim_size(i)); + } + TensorShape expanded_out_shape(reshape_vect); + if (!output->CopyFrom(*output, expanded_out_shape)) { + // This should never happen, since the output sizes should always be the + // same after expanding batches. 
+ context->SetStatus(absl::InternalError( + absl::StrCat("Could not expand dimension with flat output shape ", + out_shape.DebugString(), " and expanded output shape ", + expanded_out_shape.DebugString()))); + } + } + } + + private: + std::vector strides_; + Padding padding_; + std::vector explicit_paddings_; + TensorFormat data_format_; + FilterTensorFormat filter_format_; + std::vector dilations_; + int batch_dims_; + int groups_; + bool cudnn_use_autotune_; + + LaunchConvOp launcher_; + + ConvOp(const ConvOp&) = delete; + void operator=(const ConvOp&) = delete; +}; + +template +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_dilation, + int col_dilation, int row_stride, int col_stride, + const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format) { + if (data_format != FORMAT_NHWC) { + ctx->SetStatus(errors::Unimplemented( + "The Conv2D op currently only supports the NHWC tensor format on the " + "CPU. The op was given the format: ", + ToString(data_format))); + return; + } + + for (int64_t explicit_padding : explicit_paddings) { + if (!FastBoundsCheck(explicit_padding, std::numeric_limits::max())) { + ctx->SetStatus(errors::InvalidArgument("filter too large")); + return; + } + } + + const int64_t in_depth = input.dim_size(3); + const int64_t out_depth = output->dim_size(3); + const int64_t patch_depth = filter.dim_size(2); + + if (patch_depth <= 0) { + ctx->SetStatus(errors::InvalidArgument( + "filter depth must be stricly positive, got ", patch_depth)); + return; + } + if (in_depth % patch_depth != 0) { + ctx->SetStatus(errors::InvalidArgument( + "input depth must be evenly divisible by filter depth: ", in_depth, + " vs ", patch_depth)); + return; + } + if (filter.NumElements() <= 0) { + ctx->SetStatus( + errors::InvalidArgument("filter must not have zero elements " + "(i.e. 
all dimensions must be non-zero)")); + return; + } + + const int64_t num_groups = in_depth / patch_depth; + if (num_groups <= 0) { + ctx->SetStatus(errors::InvalidArgument( + "number of groups must be stricly positive, got ", num_groups)); + return; + } + if (out_depth % num_groups != 0 || out_depth < num_groups) { + ctx->SetStatus(errors::InvalidArgument( + "output depth must be evenly divisible by number of groups: ", + out_depth, " vs ", num_groups)); + return; + } + + if (in_depth != patch_depth) { + LaunchGrouped()(ctx, input, filter, row_stride, col_stride, + row_dilation, col_dilation, padding, explicit_paddings, + output, data_format); + } else { + LaunchGeneric()(ctx, input, filter, row_stride, col_stride, + row_dilation, col_dilation, padding, + explicit_paddings, output, data_format); + } + } +}; +extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; + +template +class LaunchDeepConvOp { + public: + static bool Run(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int batch, int input_rows, + int input_cols, int in_depth, int filter_rows, + int filter_cols, int pad_rows, int pad_cols, int out_rows, + int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/, + int /*dilation_cols*/, int /*stride_rows*/, + int /*stride_cols*/, Tensor* /*output*/, + TensorFormat /*data_format*/) { + return false; + } +}; + +template +class Conv2DOp : public BinaryOp { + public: + explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp(context) { + OP_REQUIRES_OK(context, InitConv2DParameters(context, ¶ms_)); + + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + cudnn_use_autotune_ = CudnnUseAutotune(); + } + + void Compute(OpKernelContext* context) override { + // Input tensor is of the following dimensions: + // [ batch, in_rows, in_cols, in_depth ] + const Tensor& input = context->input(0); + + // Input filter is of the following dimensions: + // [ filter_rows, filter_cols, in_depth, out_depth] + const Tensor& filter = context->input(1); + + Conv2DDimensions dimensions; + OP_REQUIRES_OK(context, + ComputeConv2DDimension(params_, input, filter, &dimensions)); + + TensorShape out_shape; + OP_REQUIRES_OK( + context, ShapeFromFormatWithStatus( + params_.data_format, dimensions.batch, dimensions.out_rows, + dimensions.out_cols, dimensions.out_depth, &out_shape)); + + // Output tensor is of the following dimensions: + // [ in_batch, out_rows, out_cols, out_depth ] + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + VLOG(2) << "Conv2D: in_depth = " << dimensions.in_depth + << ", patch_depth = " << dimensions.patch_depth + << ", input_cols = " << dimensions.input_cols + << ", filter_cols = " << dimensions.filter_cols + << ", input_rows = " << dimensions.input_rows + << ", filter_rows = " << dimensions.filter_rows + << ", stride_rows = " << dimensions.stride_rows + << ", stride_cols = " << dimensions.stride_cols + << ", dilation_rows = " << dimensions.dilation_rows + << ", dilation_cols = " << dimensions.dilation_cols + << ", out_depth = " << dimensions.out_depth; + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + return; + } + + // If the input is empty, result can only be due to padding. + if (input.NumElements() == 0) { + // Zero-out output and return. 
+ functor::SetZeroFunctor()(context->eigen_device(), + output->template flat()); + + return; + } + + if (params_.padding != EXPLICIT && + LaunchDeepConvOp::Run( + context, input, filter, dimensions.batch, dimensions.input_rows, + dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows, + dimensions.filter_cols, dimensions.pad_rows_before, + dimensions.pad_cols_before, dimensions.out_rows, + dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows, + dimensions.dilation_cols, dimensions.stride_rows, + dimensions.stride_cols, output, params_.data_format)) { + return; + } + + launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, + dimensions.dilation_rows, dimensions.dilation_cols, + dimensions.stride_rows, dimensions.stride_cols, params_.padding, + params_.explicit_paddings, output, params_.data_format); + } + + private: + Conv2DParameters params_; + bool use_cudnn_; + bool cudnn_use_autotune_; + + LaunchConv2DOp launcher_; + + Conv2DOp(const Conv2DOp&) = delete; + void operator=(const Conv2DOp&) = delete; +}; +extern template struct Conv2DOp; +extern template struct Conv2DOp; +extern template struct Conv2DOp; +extern template struct Conv2DOp; +extern template struct Conv2DOp; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +void LaunchConvOpImpl(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input_param, const Tensor& filter, + const gtl::InlinedVector& dilations, + const gtl::InlinedVector& strides, + const Padding& padding, + const std::vector& explicit_paddings, + TensorFormat data_format, Tensor* output) { + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES(context, stream, absl::InternalError("No GPU stream available.")); + + Tensor input = input_param; + + int spatial_dims = input.dims() - 2; + std::vector in_dims(spatial_dims); + + const int64_t in_batch = GetTensorDim(input, data_format, 'N'); + for (int i = 0; i < spatial_dims; ++i) { + in_dims[i] = GetTensorDim(input, data_format, static_cast('0' + i)); + } + const int64_t in_depth = GetTensorDim(input, data_format, 'C'); + + std::vector filter_dims(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + filter_dims[i] = filter.dim_size(i); + } + const int64_t filter_depth = filter.dim_size(spatial_dims); + const int64_t out_depth = filter.dim_size(spatial_dims + 1); + + OP_REQUIRES( + context, filter.NumElements() > 0, + absl::InvalidArgumentError("filter must not have zero elements " + "(i.e. all dimensions must be non-zero)")); + + bool is_grouped_convolution = filter_depth != in_depth; + // check if filter is 1x1 and stride/dilation are all ones + bool one_filter = true; + bool one_dilations = true; + bool one_stride = true; + for (int i = 0; i < spatial_dims; ++i) { + one_filter = one_filter && (filter_dims[i] == 1); + one_dilations = one_dilations && (dilations[i] == 1); + one_stride = one_stride && (strides[i] == 1); + } + // check if filter is same spatial shape as input + bool filter_same_dims = true; + for (int i = 0; i < spatial_dims; ++i) { + if (filter_dims[i] != in_dims[i]) filter_same_dims = false; + } + + auto* blas = stream->parent()->AsBlas(); + OP_REQUIRES(context, blas != nullptr, + absl::InternalError("No BLAS for stream.")); + if (!is_grouped_convolution && one_filter && one_dilations && one_stride && + data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) { + // 1x1 filter, so call cublas directly. 
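Both direct-GEMM fast paths below boil down to choosing the matrix dimensions m, k, n handed to the BLAS call. A standalone sketch of that arithmetic under the same assumptions (NHWC layout, no grouping); GemmDims and the two helpers are illustrative names only:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct GemmDims { std::uint64_t m, k, n; };

// 1x1 filter with unit strides and dilations: every spatial position is an
// independent row, so m = batch * prod(spatial), k = in_depth, n = out_depth.
GemmDims GemmDimsFor1x1(std::int64_t batch,
                        const std::vector<std::int64_t>& in_dims,
                        std::int64_t in_depth, std::int64_t out_depth) {
  const std::int64_t spatial = std::accumulate(
      in_dims.begin(), in_dims.end(), std::int64_t{1}, std::multiplies<>{});
  return {static_cast<std::uint64_t>(batch * spatial),
          static_cast<std::uint64_t>(in_depth),
          static_cast<std::uint64_t>(out_depth)};
}

// Filter spatially equal to the input under VALID padding: each image collapses
// to a single row, so m = batch, k = prod(spatial) * in_depth, n = out_depth.
GemmDims GemmDimsForFullSpatialFilter(std::int64_t batch,
                                      const std::vector<std::int64_t>& in_dims,
                                      std::int64_t in_depth,
                                      std::int64_t out_depth) {
  const std::int64_t spatial = std::accumulate(
      in_dims.begin(), in_dims.end(), std::int64_t{1}, std::multiplies<>{});
  return {static_cast<std::uint64_t>(batch),
          static_cast<std::uint64_t>(spatial * in_depth),
          static_cast<std::uint64_t>(out_depth)};
}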
+ const uint64 m = in_batch * std::accumulate(in_dims.begin(), in_dims.end(), + 1, std::multiplies<>{}); + const uint64 k = in_depth; + const uint64 n = out_depth; + + auto a_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(output->template flat().data(), + output->template flat().size()); + + auto no_transpose = se::blas::Transpose::kNoTranspose; + OP_REQUIRES_OK(context, blas->BlasGemm(stream, no_transpose, no_transpose, + n, m, k, b_ptr, n, a_ptr, k, &c_ptr, + n, GetNumericOptions(), + se::blas::CallContext::kNone)); + return; + } else if (!is_grouped_convolution && filter_same_dims && padding == VALID && + data_format == FORMAT_NHWC) { + // The input data and filter have the same spatial dimensions, so call + // cublas directly. + const uint64 m = in_batch; + const uint64 k = in_depth * std::accumulate(in_dims.begin(), in_dims.end(), + 1, std::multiplies<>{}); + const uint64 n = out_depth; + + auto a_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(output->template flat().data(), + output->template flat().size()); + + auto no_transpose = se::blas::Transpose::kNoTranspose; + OP_REQUIRES_OK(context, blas->BlasGemm(stream, no_transpose, no_transpose, + n, m, k, b_ptr, n, a_ptr, k, &c_ptr, + n, GetNumericOptions(), + se::blas::CallContext::kNone)); + return; + } + + const bool compute_in_nhwc = ComputeInNhwcEnabled( + DataTypeToEnum::value, stream, /*use_4d_tensor=*/(spatial_dims == 2)); + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv with cuDNN:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); + + std::vector out_dims(output->dims()); + for (int i = 0; i < output->dims(); ++i) { + out_dims[i] = output->dim_size(i); + } + std::vector> paddings(spatial_dims, {-1, -1}); + // Explicit only on 2D case. + if (padding == EXPLICIT) { + GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', + &paddings[0].first, &paddings[0].second); + GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', + &paddings[1].first, &paddings[1].second); + } + + // Get padding values, output should be valid, since it was checked before. + std::vector out_dims_check(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( + in_dims[i], filter_dims[i], dilations[i], + strides[i], padding, &out_dims_check[i], + &paddings[i].first, &paddings[i].second)); + OP_REQUIRES(context, + (out_dims_check[i] == GetTensorDim(*output, data_format, + static_cast('0' + i))), + absl::InternalError("Output dimension doesn't match yo")); + } + + bool assymmetric_padding = false; + std::vector common_padding(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + common_padding[i] = std::min(paddings[i].first, paddings[i].second); + assymmetric_padding = + assymmetric_padding || (paddings[i].first != paddings[i].second); + } + + if (assymmetric_padding) { + // cuDNN only supports padding the same amount on either side. So we + // manually create a new padded input tensor. 
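The asymmetric-padding workaround below splits each (before, after) pair into a symmetric part that cuDNN can handle plus a residual that is materialized by explicitly padding the input. A standalone sketch of that split, with PaddingSplit and SplitAsymmetricPadding as illustrative names:

#include <algorithm>
#include <cstdint>
#include <cstdlib>

struct PaddingSplit {
  std::int64_t common;        // symmetric padding handed to cuDNN
  std::int64_t extra_before;  // applied by explicitly padding the input
  std::int64_t extra_after;
  std::int64_t padded_dim;    // spatial size after the explicit padding
};

// cuDNN gets min(before, after) on both sides; the remaining |before - after|
// is baked into a padded copy of the input along this dimension.
PaddingSplit SplitAsymmetricPadding(std::int64_t dim, std::int64_t before,
                                    std::int64_t after) {
  const std::int64_t common = std::min(before, after);
  return {common, before - common, after - common,
          dim + std::abs(before - after)};
}
// e.g. dim = 32 with padding (2, 1): the input is first padded by 1 on the
// "before" side (padded_dim = 33), and cuDNN then pads 1 on each side.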
+ Tensor transformed_input; + std::vector new_in_dims(input.dims()); + new_in_dims[0] = in_batch; + for (int i = 0; i < spatial_dims; ++i) { + int index = GetTensorSpatialDimIndex(input.dims(), data_format, i); + new_in_dims[index] = + in_dims[i] + std::abs(paddings[i].first - paddings[i].second); + } + new_in_dims[GetTensorDimIndex(data_format, 'C', input.dims())] = in_depth; + TensorShape transformed_input_shape(new_in_dims); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + transformed_input_shape, + &transformed_input)); + + // Padding to add on transformed input. + std::vector> transformed_input_padding( + paddings); + for (int i = 0; i < spatial_dims; ++i) { + transformed_input_padding[i].first -= common_padding[i]; + transformed_input_padding[i].second -= common_padding[i]; + } + + // Check padding size. + bool padding_bounds_valid = true; + for (int i = 0; i < spatial_dims; ++i) { + padding_bounds_valid = + padding_bounds_valid && + FastBoundsCheck(transformed_input_padding[i].first, + std::numeric_limits::max()) && + FastBoundsCheck(transformed_input_padding[i].second, + std::numeric_limits::max()); + } + OP_REQUIRES(context, padding_bounds_valid, + absl::InvalidArgumentError("Padding is too large.")); + + // Pad new input. + if (input.dims() == 4) { + std::array pad_left{ + static_cast(transformed_input_padding[0].first), + static_cast(transformed_input_padding[1].first)}; + std::array pad_right{ + static_cast(transformed_input_padding[0].second), + static_cast(transformed_input_padding[1].second)}; + functor::PadInput()( + context->eigen_device(), + To32Bit(static_cast(input).tensor()), pad_left, + pad_right, To32Bit(transformed_input.tensor()), data_format, + T{}); + } else if (input.dims() == 5) { + std::array pad_left{ + static_cast(transformed_input_padding[0].first), + static_cast(transformed_input_padding[1].first), + static_cast(transformed_input_padding[2].first)}; + std::array pad_right{ + static_cast(transformed_input_padding[0].second), + static_cast(transformed_input_padding[1].second), + static_cast(transformed_input_padding[2].second)}; + functor::PadInput()( + context->eigen_device(), + To32Bit(static_cast(input).tensor()), pad_left, + pad_right, To32Bit(transformed_input.tensor()), data_format, + T{}); + } else { + context->SetStatus( + absl::InternalError("Failed to pad input, invalid dimensions.")); + } + + input = transformed_input; + for (int i = 0; i < spatial_dims; ++i) { + in_dims[i] = new_in_dims[GetTensorDimIndex( + data_format, static_cast('0' + i), input.dims())]; + } + } + + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the input tensor from NHWC to NCHW."; + + TensorShape channels_first_shape; + OP_REQUIRES_OK(context, + ShapeFromFormatWithStatus(FORMAT_NCHW, in_batch, in_dims, + in_depth, &channels_first_shape)); + + if (in_depth > 1) { + Tensor transformed_input; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + channels_first_shape, + &transformed_input)); + if (input.dims() == 4) { + functor::NHWCToNCHW()( + context->eigen_device(), + const_cast(input).tensor(), + transformed_input.tensor()); + } else if (input.dims() == 5) { + functor::NHWCToNCHW()( + context->eigen_device(), + const_cast(input).tensor(), + transformed_input.tensor()); + } else { + context->SetStatus( + absl::InternalError("Failed to reshape input to channels first " + "format, invalid dimensions.")); + } + input = transformed_input; + } else { + // Depth = 1, reshape. 
+ if (!input.CopyFrom(input, channels_first_shape)) { + context->SetStatus(absl::InternalError( + "Failed to reshape input to channels first format.")); + } + } + } else { + DCHECK(data_format == compute_data_format) // Crash OK. + << "Illegal data and compute format pair:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); + } + + // Check paddings are not negative. + bool non_negative_paddings = true; + for (int i = 0; i < spatial_dims; ++i) { + non_negative_paddings = non_negative_paddings && common_padding[i] >= 0; + } + OP_REQUIRES(context, non_negative_paddings, + absl::InvalidArgumentError("Padding is negative.")); + + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; + + se::dnn::BatchDescriptor input_desc(spatial_dims); + input_desc.set_count(in_batch).set_feature_map_count(in_depth).set_layout( + compute_data_layout); + if (spatial_dims == 2) { + input_desc.set_spatial_dim(stream_executor::dnn::DimIndex::X, in_dims[1]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Y, in_dims[0]); + } else if (spatial_dims == 3) { + input_desc.set_spatial_dim(stream_executor::dnn::DimIndex::X, in_dims[2]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Y, in_dims[1]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Z, in_dims[0]); + } else { + context->SetStatus( + absl::InternalError("Failed to set Input Descripitor:" + " invalid number of spatial dimensions")); + } + + se::dnn::BatchDescriptor output_desc(spatial_dims); + output_desc.set_count(GetTensorDim(*output, data_format, 'N')) + .set_feature_map_count(GetTensorDim(*output, data_format, 'C')) + .set_layout(compute_data_layout); + if (spatial_dims == 2) { + output_desc + .set_spatial_dim( + stream_executor::dnn::DimIndex::X, + GetTensorDim(*output, data_format, static_cast('1'))) + .set_spatial_dim( + stream_executor::dnn::DimIndex::Y, + GetTensorDim(*output, data_format, static_cast('0'))); + } else if (spatial_dims == 3) { + output_desc + .set_spatial_dim( + stream_executor::dnn::DimIndex::X, + GetTensorDim(*output, data_format, static_cast('2'))) + .set_spatial_dim( + stream_executor::dnn::DimIndex::Y, + GetTensorDim(*output, data_format, static_cast('1'))) + .set_spatial_dim( + stream_executor::dnn::DimIndex::Z, + GetTensorDim(*output, data_format, static_cast('0'))); + } else { + context->SetStatus( + absl::InternalError("Failed to set Output Descripitor: invalid " + "number of spatial dimensions")); + } + + se::dnn::FilterDescriptor filter_desc(spatial_dims); + filter_desc.set_input_feature_map_count(filter_depth) + .set_output_feature_map_count(out_depth) + .set_layout(filter_layout); + if (spatial_dims == 2) { + filter_desc + .set_spatial_dim(stream_executor::dnn::DimIndex::X, filter_dims[1]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Y, filter_dims[0]); + } else if (spatial_dims == 3) { + filter_desc + .set_spatial_dim(stream_executor::dnn::DimIndex::X, filter_dims[2]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Y, filter_dims[1]) + .set_spatial_dim(stream_executor::dnn::DimIndex::Z, filter_dims[0]); + } else { + context->SetStatus( + 
absl::InternalError("Failed to set Filter Descripitor: invalid " + "number of spatial dimensions")); + } + + se::dnn::ConvolutionDescriptor conv_desc(spatial_dims); + if (spatial_dims == 2) { + conv_desc.set_dilation_rate(stream_executor::dnn::DimIndex::X, dilations[1]) + .set_dilation_rate(stream_executor::dnn::DimIndex::Y, dilations[0]) + .set_filter_stride(stream_executor::dnn::DimIndex::X, strides[1]) + .set_filter_stride(stream_executor::dnn::DimIndex::Y, strides[0]) + .set_zero_padding(stream_executor::dnn::DimIndex::X, common_padding[1]) + .set_zero_padding(stream_executor::dnn::DimIndex::Y, common_padding[0]); + } else if (spatial_dims == 3) { + conv_desc.set_dilation_rate(stream_executor::dnn::DimIndex::X, dilations[2]) + .set_dilation_rate(stream_executor::dnn::DimIndex::Y, dilations[1]) + .set_dilation_rate(stream_executor::dnn::DimIndex::Z, dilations[0]) + .set_filter_stride(stream_executor::dnn::DimIndex::X, strides[2]) + .set_filter_stride(stream_executor::dnn::DimIndex::Y, strides[1]) + .set_filter_stride(stream_executor::dnn::DimIndex::Z, strides[0]) + .set_zero_padding(stream_executor::dnn::DimIndex::X, common_padding[2]) + .set_zero_padding(stream_executor::dnn::DimIndex::Y, common_padding[1]) + .set_zero_padding(stream_executor::dnn::DimIndex::Z, common_padding[0]); + } else { + context->SetStatus( + absl::InternalError("Failed to set Convolution Descripitor: invalid " + "number of spatial dimensions")); + } + conv_desc.set_group_count(1); + // TODO(b/290223810) Change group count when implementing group/depthwise. + Tensor transformed_filter; + auto dst_format = + compute_data_format == FORMAT_NCHW ? FORMAT_OIHW : FORMAT_OHWI; + VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) << " to " + << ToString(dst_format); + std::vector dst_shape_vec(spatial_dims + 2); + dst_shape_vec[0] = out_depth; + if (dst_format == FORMAT_OIHW) { + dst_shape_vec[1] = filter_depth; + for (int i = 2; i < filter.dims(); ++i) { + dst_shape_vec[i] = filter_dims[i - 2]; + } + } else { + // Format OHWI + dst_shape_vec[filter.dims() - 1] = filter_depth; + for (int i = 1; i < filter.dims() - 1; ++i) { + dst_shape_vec[i] = filter_dims[i - 1]; + } + } + TensorShape dst_shape(dst_shape_vec); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, dst_shape, + &transformed_filter)); + + // Filter: [(spatial_dims), in, out] (HWIO) + // T_filter: [out, in, (spatial_dims)] (OIHW) or + // T_filter: [out, (spatial_dims), in] (OHWI) + if (spatial_dims == 2) { + functor::TransformFilter()( + context->eigen_device(), dst_format, + To32Bit(filter.tensor()), + To32Bit(transformed_filter.tensor())); + } else if (spatial_dims == 3) { + functor::TransformFilter()( + context->eigen_device(), dst_format, + To32Bit(filter.tensor()), + To32Bit(transformed_filter.tensor())); + } else { + context->SetStatus(absl::InternalError( + "Failed to reshape filter, invalid spatial dimensions.")); + } + + Tensor transformed_output; + if (data_format != compute_data_format) { + VLOG(4) << "Allocate temporary memory for output in compute data format"; + TensorShape transformed_output_shape; + OP_REQUIRES_OK(context, ShapeFromFormatWithStatus( + FORMAT_NCHW, in_batch, out_dims_check, + out_depth, &transformed_output_shape)); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + transformed_output_shape, + &transformed_output)); + } else { + transformed_output = *output; + } + + auto input_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + auto 
filter_ptr = + AsDeviceMemory(transformed_filter.template flat().data(), + transformed_filter.template flat().size()); + auto output_ptr = + AsDeviceMemory(transformed_output.template flat().data(), + transformed_output.template flat().size()); + + static int64_t ConvolveScratchSize = GetDnnWorkspaceLimitOrDefault(); + + if (spatial_dims == 2) { + filter_dims.push_back(filter_depth); + } + ConvParameters conv_parameters = { + stream->parent(), + in_batch, // batch + in_depth, // in_depths + in_dims, // input spatial dims + compute_data_format, // compute_data_format + out_depth, // out_depths + filter_dims, // filter spatial dims + dilations, // dilations + strides, // strides + common_padding, // paddings (symmetrical) + input.dtype(), // tensor datatype + conv_desc.group_count(), + }; + + auto entry_or = AutotuneUnfusedConv( + cudnn_use_autotune, ConvAutotuneMap::GetInstance(), conv_parameters, + context, se::dnn::ConvolutionKind::FORWARD, input_desc, input_ptr, + filter_desc, filter_ptr, conv_desc, output_desc, output_ptr, + ConvolveScratchSize); + OP_REQUIRES_OK(context, entry_or.status()); + auto autotune_entry = std::move(entry_or).value(); + + DnnScratchAllocator scratch_allocator(ConvolveScratchSize, context); + Status cudnn_launch_status = LaunchAutotunedConv( + autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD, + stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, + output_desc, output_ptr); + if (!cudnn_launch_status.ok()) { + context->SetStatus(cudnn_launch_status); + return; + } + + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the output tensor back from NCHW to NHWC."; + if (spatial_dims == 2) { + functor::NCHWToNHWC()( + context->eigen_device(), + const_cast(transformed_output).tensor(), + output->tensor()); + } else if (spatial_dims == 3) { + functor::NCHWToNHWC()( + context->eigen_device(), + const_cast(transformed_output).tensor(), + output->tensor()); + } else { + context->SetStatus(absl::InternalError( + "Failed to convert output data foramt, invalid spatial dimensions.")); + } + } +} + +template +void LaunchConvOp::operator()( + OpKernelContext* context, bool cudnn_use_autotune, const Tensor& input, + const Tensor& filter, const std::vector& dilations, + const std::vector& strides, const Padding padding, + const std::vector& explicit_paddings, TensorFormat data_format, + Tensor* output) { + // Get spatial dims for dilations and strides. + int spatial_dims = input.dims() - 2; + gtl::InlinedVector strides_spatial(spatial_dims); + gtl::InlinedVector dilations_spatial(spatial_dims); + for (int i = 0; i < spatial_dims; ++i) { + strides_spatial[i] = + GetTensorDim(strides, data_format, static_cast(i + '0')); + dilations_spatial[i] = + GetTensorDim(dilations, data_format, static_cast(i + '0')); + } + LaunchConvOpImpl(context, cudnn_use_autotune, input, filter, + dilations_spatial, strides_spatial, padding, + explicit_paddings, data_format, output); +} + +template +void LaunchConv2DOp::operator()( + OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input_param, const Tensor& filter, int row_dilation, + int col_dilation, int row_stride, int col_stride, const Padding& padding, + const std::vector& explicit_paddings, Tensor* output, + TensorFormat data_format) { + // Cast strides and dilations. 
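// This 2-D entry point only widens its scalar row/col stride and dilation
// arguments into the {row, col} vectors expected by the rank-generic
// LaunchConvOpImpl; the real work (padding, layout conversion, autotuning,
// launch) all happens there.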
+ gtl::InlinedVector casted_strides = {row_stride, col_stride}; + gtl::InlinedVector casted_dilations = {row_dilation, + col_dilation}; + LaunchConvOpImpl(ctx, cudnn_use_autotune, input_param, filter, + casted_dilations, casted_strides, padding, + explicit_paddings, data_format, output); +} + +// To be used inside depthwise_conv_op.cc. +extern template struct LaunchConv2DOp; +// extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; +extern template struct LaunchConv2DOp; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CONV_OPS_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cross_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cross_op.h new file mode 100644 index 00000000..cf5956ac --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cross_op.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CROSS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_CROSS_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +namespace functor { + +template +struct Cross { + void operator()(const Device &d, + typename TTypes::ConstTensor in0_data, + typename TTypes::ConstTensor in1_data, + typename TTypes::Tensor output_data) { + auto s1 = output_data.template chip<1>(0); + auto s2 = output_data.template chip<1>(1); + auto s3 = output_data.template chip<1>(2); + + auto u1 = in0_data.template chip<1>(0); + auto u2 = in0_data.template chip<1>(1); + auto u3 = in0_data.template chip<1>(2); + + auto v1 = in1_data.template chip<1>(0); + auto v2 = in1_data.template chip<1>(1); + auto v3 = in1_data.template chip<1>(2); + + s1.device(d) = u2 * v3 - u3 * v2; + s2.device(d) = u3 * v1 - u1 * v3; + s3.device(d) = u1 * v2 - u2 * v1; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CROSS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cudnn_pooling_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cudnn_pooling_gpu.h new file mode 100644 index 00000000..970eb533 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cudnn_pooling_gpu.h @@ -0,0 +1,70 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper functions to run 3d pooling on GPU using CuDNN. + +#ifndef TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/stream_executor.h" +#endif + +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Runs (avg/max)pooling on GPU. +// Dimension order for all array arguments is: x, y, z. +template +class DnnPooling3dOp { + public: + static void Compute(OpKernelContext* context, + se::dnn::PoolingMode pooling_mode, + const std::array& size, + const std::array& stride, + const std::array& padding, + TensorFormat data_format, const Tensor& tensor_in, + Tensor* output); +}; + +// Computes the gradient of (avg/max)pooling on GPU. +// Dimension order for all array arguments is: x, y, z. +template +class DnnPooling3dGradOp { + public: + static void Compute(OpKernelContext* context, + se::dnn::PoolingMode pooling_mode, + const std::array& window, + const std::array& stride, + const std::array& padding, + const std::array& output_size, + TensorFormat data_format, const Tensor& out_backprop, + const TensorShape& tensor_in_shape, + const Tensor* tensor_in, const Tensor* tensor_out, + Tensor* input_backprop); +}; + +#endif + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_op_clip.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_op_clip.h new file mode 100644 index 00000000..171b6932 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_op_clip.h @@ -0,0 +1,61 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_ + +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +namespace functor { +// Unary functor for clip [Tensor, Scalar, Scalar] +template +struct UnaryClipOp { + void operator()(const Device &d, typename TTypes::ConstFlat &in0_flat, + typename TTypes::ConstFlat &in1_flat, + typename TTypes::ConstFlat &in2_flat, + typename TTypes::Flat &out_flat) const; +}; + +// Binary functor for clip [Tensor, Scalar, Tensor] +template +struct BinaryRightClipOp { + void operator()(const Device &d, typename TTypes::ConstFlat &in0_flat, + typename TTypes::ConstFlat &in1_flat, + typename TTypes::ConstFlat &in2_flat, + typename TTypes::Flat &out_flat) const; +}; + +// Binary functor for clip [Tensor, Tensor, Scalar] +template +struct BinaryLeftClipOp { + void operator()(const Device &d, typename TTypes::ConstFlat &in0_flat, + typename TTypes::ConstFlat &in1_flat, + typename TTypes::ConstFlat &in2_flat, + typename TTypes::Flat &out_flat) const; +}; + +// Ternary functor for clip [Tensor, Tensor, Tensor] +template +struct TernaryClipOp { + void operator()(const Device &d, typename TTypes::ConstFlat &in0_flat, + typename TTypes::ConstFlat &in1_flat, + typename TTypes::ConstFlat &in2_flat, + typename TTypes::Flat &out_flat) const; +}; +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops.h new file mode 100644 index 00000000..06d75372 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops.h @@ -0,0 +1,1340 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_ + +#define _USE_MATH_DEFINES +#include +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace Eigen { +namespace internal { + +#if GOOGLE_CUDA +template <> +struct scalar_arg_op> { + typedef typename Eigen::NumTraits>::Real result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()( + const std::complex& a) const { + return ::atan2f(a.imag(), a.real()); + } +}; + +template <> +struct scalar_arg_op> { + typedef typename Eigen::NumTraits>::Real result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double operator()( + const std::complex& a) const { + return ::atan2(a.imag(), a.real()); + } +}; +#endif + +template +struct safe_scalar_binary_pow_op { + static_assert(std::is_integral::value, "Integer type expected"); + static_assert(std::is_integral::value && + std::is_signed::value, + "Signed integer type expected"); + + bool* const error; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_scalar_binary_pow_op(bool* error) + : error(error) {} + + EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a, + const Exponent& b) const { + const Exponent safe_b = tensorflow::internal::SubtleMustCopy(b); + if (TF_PREDICT_TRUE(safe_b >= 0)) { + return numext::pow(a, safe_b); + } else { + *error = true; + return 0; + } + } +}; + +template +struct functor_traits> { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; + +template +struct safe_div_or_mod_op { + static_assert(std::is_integral::value, "Integer type expected"); + + bool* const error; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_div_or_mod_op(bool* error) + : error(error) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + const T safe_b = tensorflow::internal::SubtleMustCopy(b); + if (TF_PREDICT_TRUE(safe_b != 0)) { + // Avoid FPE for INT_MIN/-1. + const T safe_a = tensorflow::internal::SubtleMustCopy(a); + if (TF_PREDICT_FALSE(std::is_signed::value && + safe_a == std::numeric_limits::min() && + safe_b == T(-1))) { + // Prefer to overflow 'a' instead of crashing. + return DivOrMod()(-safe_a, 1); + } + return DivOrMod()(safe_a, safe_b); + } else { + *error = true; + return 0; + } + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits::Cost + NumTraits::AddCost, + PacketAccess = false, + }; +}; + +template +struct no_nan_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + if (b != T(0)) { + return Binary()(a, b); + } else { + return T(0); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, + const Packet& b) const { + const Packet mask = pcmp_eq(b, pzero(b)); + const Packet quotient = Binary().packetOp(a, b); + return pandnot(quotient, mask); + } +}; + +template ::IsComplex> +struct div_no_nan_op; + +template +struct div_no_nan_op + : public no_nan_op> { +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + NumTraits::AddCost, + PacketAccess = true, + }; +}; + +// Whether or not complex division produces a NaN depends on the underlying +// implementation. Some compilers (e.g. 
gcc) use a simple method that divides +// by |b|^2, which may underflow to 0 for b != 0. +template +struct div_no_nan_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + if (b == T(0)) { + return T(0); + } else { + // If the numerator is zero, then the result must be zero even if |b|^2 + // underflows to zero. + const T numerator = + scalar_product_op()(a, scalar_conjugate_op()(b)); + if (numerator == T(0)) { + return T(0); + } + } + return scalar_quotient_op()(a, b); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, + const Packet& b) const { + const Packet numerator = pmul(a, pconj(b)); + const Packet mask = por(pcmp_eq(b, pzero(a)), pcmp_eq(numerator, pzero(a))); + const Packet quotient = pdiv(a, b); + return pandnot(quotient, mask); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + NumTraits::MulCost, + PacketAccess = packet_traits::HasMul && packet_traits::HasDiv && + packet_traits::HasConj, + }; +}; + +template +struct mul_no_nan_op : public no_nan_op> { +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + NumTraits::AddCost, + PacketAccess = true, + }; +}; + +// scalar_left and scalar_right are template helpers to partially +// apply a binary function. +// +// Suppose Binary is a binary functor f(x, y), scalar_left<> is a +// unary functor g_x(y) = f(x, y), where x is provided via the +// constructor. Similarly, scalar_right<> is a unary functor g_y(x) = +// f(x, y). + +template +struct scalar_left : private Binary { + using result_type = Tout; + + const Tin* left; + + inline scalar_left(const scalar_left& other) = default; + + template + EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c, Args... args) + : Binary(args...), left(c) {} + + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const { + return Binary::operator()(*left, right); + } + + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& right_packet) const { + return Binary::packetOp(Eigen::internal::pset1(*left), + right_packet); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits::Cost, + PacketAccess = functor_traits::PacketAccess, + }; +}; + +template +struct scalar_right : private Binary { + using result_type = Tout; + + const Tin* right; + + inline scalar_right(const scalar_right& other) = default; + + template + EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c, Args... 
args) + : Binary(args...), right(c) {} + + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const { + return Binary::operator()(left, *right); + } + + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& left_packet) const { + return Binary::packetOp(left_packet, + Eigen::internal::pset1(*right)); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits::Cost, + PacketAccess = functor_traits::PacketAccess, + }; +}; + +// similar to std::equal_to, but with the DEVICE_FUNC qualifier +template +struct equal_to : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x == y; + } +}; + +// similar to std::not_equal_to, but with the DEVICE_FUNC qualifier +template +struct not_equal_to : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x != y; + } +}; + +// similar to std::greater, but with the DEVICE_FUNC qualifier +template +struct greater : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x > y; + } +}; + +// similar to std::less, but with the DEVICE_FUNC qualifier +template +struct less : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x < y; + } +}; + +// similar to std::greater_equal, but with the DEVICE_FUNC qualifier +template +struct greater_equal : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x >= y; + } +}; + +// similar to std::less_equal, but with the DEVICE_FUNC qualifier +template +struct less_equal : std::function { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x, + const T& y) const { + return x <= y; + } +}; + +// Functor that enables squared difference functor. +template +struct scalar_squared_difference_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& a, const Scalar& b) const { + const Scalar v = scalar_difference_op()(a, b); + return scalar_product_op()(v, scalar_conjugate_op()(v)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, + const Packet& b) const { + const Packet v = scalar_difference_op().packetOp(a, b); + return scalar_product_op().packetOp( + v, scalar_conjugate_op().packetOp(v)); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + + functor_traits>::Cost + + functor_traits>::Cost, + PacketAccess = functor_traits>::PacketAccess && + functor_traits>::PacketAccess && + functor_traits>::PacketAccess + }; +}; + +// TODO(b/32239616): This kernel should be moved into Eigen and vectorized. +template +struct google_floor_div { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + const T z = x / y; + // Subtract one if there is a remainder and if the inputs have opposite + // signs. This approach avoids unnecessary overflows. + return z * y != x && (x < T(0) != y < T(0)) ? 
z - T(1) : z; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + Packet zeros = pzero(x); + Packet x_mask = pcmp_lt(x, zeros); + Packet y_mask = pcmp_lt(y, zeros); + Packet x_div_y = pdiv(x, y); + Packet x_div_y_times_y = pmul(x_div_y, y); + return pselect(por(peq(x_div_y_times_y, x), peq(x_mask, y_mask)), x_div_y, + psub(x_div_y, pones(x))); + } +}; + +template +struct google_floor_div< + T, typename std::enable_if::value>::type> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + return x / y; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + return pdiv(x, y); + } +}; + +template +struct functor_traits> { + enum { + Cost = 2 * Eigen::internal::scalar_div_cost< + Scalar, packet_traits::HasDiv>::value + + NumTraits::AddCost, + PacketAccess = packet_traits::HasDiv + }; +}; + +template +struct google_floor_div_real { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + return Eigen::numext::floor(x / y); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + return pfloor(pdiv(x, y)); + } +}; + +template +struct functor_traits> { + enum { + Cost = 2 * Eigen::internal::scalar_div_cost< + Scalar, packet_traits::HasDiv>::value + + 2 * NumTraits::AddCost, + PacketAccess = + packet_traits::HasDiv && packet_traits::HasRound + }; +}; + +// TODO(rmlarsen): Add vectorized mod & fmod in Eigen and use it here. +template +struct google_floor_fmod { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + // EIGEN_STATIC_ASSERT(NUMERIC_TYPE_MUST_BE_REAL); + T trunc_mod = scalar_fmod_op()(x, y); + return trunc_mod != T(0) && (y < T(0) != trunc_mod < T(0)) ? trunc_mod + y + : trunc_mod; + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + + NumTraits::AddCost, + PacketAccess = false + }; +}; + +// TODO(rmlarsen): Add vectorized mod & fmod in Eigen and use it here. +template +struct google_floor_mod { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + // EIGEN_STATIC_ASSERT(!NUMERIC_TYPE_MUST_BE_REAL); + T trunc_mod = Eigen::internal::scalar_mod2_op()(x, y); + return trunc_mod != T(0) && (y < T(0) != trunc_mod < T(0)) ? 
trunc_mod + y + : trunc_mod; + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + + NumTraits::AddCost, + PacketAccess = false + }; +}; + +template +struct google_truncate_div_real { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + EIGEN_USING_STD(trunc) + return static_cast(trunc(x / y)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + const Packet z = pdiv(x, y); + return pselect(pcmp_lt(z, pzero(z)), pceil(z), pfloor(z)); + } +}; + +template +struct functor_traits> { + enum { + Cost = 2 * Eigen::internal::scalar_div_cost< + Scalar, packet_traits::HasDiv>::value + + 3 * NumTraits::AddCost, + PacketAccess = packet_traits::HasDiv && + packet_traits::HasRound && + packet_traits::HasCmp + }; +}; + +#if EIGEN_COMP_GNUC && __cplusplus > 199711L +#define DISABLE_FLOAT_EQUALITY_WARNING \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop") +#else +#define DISABLE_FLOAT_EQUALITY_WARNING +#define ENABLE_FLOAT_EQUALITY_WARNING +#endif + +template ::IsInteger, + bool HasRint = packet_traits::HasRound> +struct scalar_round_half_to_even_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), + NUMERIC_TYPE_MUST_BE_REAL) + + const Scalar round_val = Eigen::numext::floor(x + Scalar(0.5)); + const Scalar fraction = round_val - x; + if (TF_PREDICT_FALSE(fraction == Scalar(.5))) { + return Scalar(2) * Eigen::numext::floor(Scalar(.5) * x + Scalar(0.5)); + } else { + return round_val; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + Packet half = pset1(Scalar(0.5)); + Packet round_val = pfloor(padd(x, half)); + Packet fraction = psub(round_val, x); + Packet half_mask = pcmp_eq(fraction, half); + bool any_halves = predux_any(half_mask); + if (TF_PREDICT_FALSE(any_halves)) { + Packet two = pset1(Scalar(2)); + Packet nearest_even = pmul(two, pfloor(pmadd(half, x, half))); + return pselect(half_mask, nearest_even, round_val); + } else { + return round_val; + } + } +}; + +template +struct scalar_round_half_to_even_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + return x; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return x; + } +}; + +template +struct scalar_round_half_to_even_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + return Eigen::numext::rint(x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return print(x); + } +}; + +template +struct functor_traits> { + enum { + Cost = Eigen::NumTraits::IsInteger ? 
0 + : 4 * NumTraits::AddCost, + PacketAccess = packet_traits::HasRound && + packet_traits::HasAdd && + packet_traits::HasMul, + }; +}; + +template ::IsInteger> +struct scalar_round_up_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), + NUMERIC_TYPE_MUST_BE_REAL) + return Eigen::numext::floor(x + Scalar(0.5)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return pfloor(padd(x, pset1(0.5))); + } +}; + +template +struct scalar_round_up_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + return x; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return x; + } +}; + +template +struct functor_traits> { + enum { + Cost = IsInteger ? 0 : 4 * NumTraits::AddCost, + PacketAccess = IsInteger || packet_traits::HasRound + }; +}; + +#undef ENABLE_FLOAT_EQUALITY_WARNING +#undef DISABLE_FLOAT_EQUALITY_WARNING + +template +struct bitwise_xor_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x, const Scalar& y) const { + return x ^ y; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, + const Packet& b) const { + return Eigen::internal::pxor(a, b); + } +}; + +template +struct functor_traits> { + enum { Cost = Eigen::NumTraits::AddCost, PacketAccess = true }; +}; + +template +struct xlogy_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x, const Scalar& y) const { + if (x == Scalar(0.)) { + return Scalar(0.); + } + return x * numext::log(y); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + Packet zeros = pzero(x); + Packet mask = pcmp_eq(x, zeros); + scalar_log_op log_op; + Packet log_y = log_op.packetOp(y); + Packet x_log_y = pmul(x, log_y); + return pselect(mask, x, x_log_y); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + + Eigen::NumTraits::MulCost, + PacketAccess = functor_traits>::PacketAccess + }; +}; + +template +struct xlog1py_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x, const Scalar& y) const { + if (x == Scalar(0.)) { + return Scalar(0.); + } + return x * numext::log1p(y); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + Packet zeros = pzero(x); + Packet mask = pcmp_eq(x, zeros); + scalar_log1p_op log1p_op; + Packet log1p_y = log1p_op.packetOp(y); + Packet x_log1p_y = pmul(x, log1p_y); + return pselect(mask, x, x_log1p_y); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + + Eigen::NumTraits::MulCost, +#if TENSORFLOW_USE_ROCM + PacketAccess = false, +#else + PacketAccess = functor_traits>::PacketAccess +#endif + }; +}; + +template +struct xdivy_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x, const Scalar& y) const { + if (x == Scalar(0.)) { + return Scalar(0.); + } + return x / y; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, + const Packet& y) const { + Packet zeros = pzero(x); + Packet mask = pcmp_eq(x, zeros); + Packet x_div_y = pdiv(x, y); + return pselect(mask, x, x_div_y); + } +}; + +template +struct functor_traits> { + enum { + Cost = + Eigen::NumTraits::AddCost + + Eigen::internal::scalar_div_cost::HasDiv>::value, + PacketAccess = 
packet_traits::HasDiv + }; +}; + +template +struct scalar_erfinv_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + constexpr T half = T(0.5); + T y = numext::ndtri(half * x + half); + constexpr T half_sqrt = T(M_SQRT1_2); + return y * half_sqrt; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + Packet half = pset1(T(0.5)); + Packet y = pndtri(pmadd(half, x, half)); + Packet half_sqrt = pset1(T(M_SQRT1_2)); + return pmul(y, half_sqrt); + } +}; + +template +struct functor_traits> { + enum { + Cost = functor_traits>::Cost + NumTraits::AddCost, + PacketAccess = packet_traits::HasNdtri, + }; +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { +namespace functor { + +//////////////////////////////////////////////////////////////////////////////// +// Helpers +//////////////////////////////////////////////////////////////////////////////// + +// Base template for functors whose input scalar type is T and +// output scalar type is R. +template +struct base { + // func defines operator() and its vectorized version packetOp(). + typedef F func; + + // If true, the functor's corresponding binary op will instantiate + // specialized kernels to perform an optimized broadcast + // operation. Each functor for which this is enabled increases the + // code size, so by default this is disabled for binary functors and + // is enabled on a per-op basis as needed. + static constexpr bool use_bcast_optimization = false; + + // operator() has the signature: + // out_type operator()(in_type in0, in_type in1 ...) + typedef R out_type; + typedef T in_type; + + // TensorFlow provides tensor-ized version of "func". Roughly + // speaking, the tensorflow operation has the signature: + // tout_type op(tin_type in0) + // tout_type op(tin_type in0, tin_type in1) + // tout_type op(tin_type in0, in_type scalar) + typedef typename TTypes::Flat tout_type; + typedef typename TTypes::ConstFlat tin_type; + typedef typename TTypes::ConstScalar tscalar_type; + + // Whether the functor can error out. Currently applies only to integer + // div and mod. + static constexpr bool has_errors = false; +}; + +// For now, we only apply certain speed optimization for +// float/double's broadcast binary op. +template +struct use_bcast_optimization { + static constexpr bool value = false; +}; + +template <> +struct use_bcast_optimization { + static constexpr bool value = true; +}; + +template <> +struct use_bcast_optimization { + static constexpr bool value = true; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Unary functors +//////////////////////////////////////////////////////////////////////////////// + +// abs(x) = |x| +// neg(x) = - x +// inverse(x) = 1 / x +// square(x) = x^2 +// sqrt(x) = x^(1/2) +// rsqrt(x) = x^(-1/2) +// exp(x) = e^x +// expm1(x) = e^x - 1 +// log(x) = natural logarithm of x +// log1p(x) = natural logarithm of 1 + x +// tanh = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +// sigmoid = 1 / (1 + exp(-x)) // a.k.a, logistic +// +// NOTE: We may eventually implement common functions used in NN +// here. E.g., rectifier, softplus, derivatives of tanh, sigmod, etc. +// For reference, see speech/lstm/eigen_functors.h. 
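// Illustrative, self-contained sketch (not taken from this header) of the
// pattern the wrappers below build on: a scalar functor is lifted over a whole
// tensor with Eigen's unaryExpr, the same coefficient-wise application used by
// the Assign(..., in.unaryExpr(...)) calls in cwise_ops_common.h. SquareOp is
// a made-up stand-in for an Eigen::internal scalar functor.
#include <iostream>
#include "unsupported/Eigen/CXX11/Tensor"

struct SquareOp {
  float operator()(const float& x) const { return x * x; }
};

int main() {
  Eigen::Tensor<float, 1> in(4);
  in.setValues({1.f, 2.f, 3.f, 4.f});
  // Coefficient-wise application; evaluation happens on assignment.
  Eigen::Tensor<float, 1> out = in.unaryExpr(SquareOp());
  for (int i = 0; i < out.dimension(0); ++i) std::cout << out(i) << ' ';
  std::cout << '\n';  // prints: 1 4 9 16
  return 0;
}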
+ +template +struct abs : base, + typename Eigen::internal::scalar_abs_op::result_type> {}; + +template +struct neg : base> {}; + +template +struct inverse : base> {}; + +template +struct square : base> {}; + +template +struct sqrt : base> {}; + +template +struct rsqrt : base> {}; + +template +struct exp : base> {}; + +template +struct expm1 : base> {}; + +template +struct log : base> {}; + +template +struct log1p : base> {}; + +template +struct sign : base> {}; + +template +struct sinh : base> {}; + +template +struct cosh : base> {}; + +template +struct tanh : base> {}; + +template +struct asinh : base> {}; + +template +struct acosh : base> {}; + +template +struct atanh : base> {}; + +template +struct lgamma : base> {}; + +template +struct digamma : base> {}; + +template +struct erf : base> {}; + +template +struct erfc : base> {}; + +template +struct ndtri : base> {}; + +template +struct erfinv : base> {}; + +template +struct sigmoid : base> {}; + +template +struct sin : base> {}; + +template +struct cos : base> {}; + +template +struct tan : base> {}; + +template +struct asin : base> {}; + +template +struct acos : base> {}; + +template +struct atan : base> {}; + +struct logical_not : base> { +}; + +// Flip all bits. Named invert to be consistent with numpy. +template +struct invert_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +template +struct invert : base> {}; + +// NOTE: std::isinf, std::isnan, std::isfinite are plain function. +// Therefore we need to wrap them in functors to be used with Eigen's +// type system. +template +struct isinf : base, bool> {}; + +template +struct isnan : base, bool> {}; + +template +struct isfinite : base, bool> {}; + +template +struct floor : base> {}; + +template +struct round : base> {}; + +template +struct ceil : base> {}; + +// Note: rint rounds half values to even, just like round_half_to_even_op. +template +struct rint : base> {}; + +//////////////////////////////////////////////////////////////////////////////// +// Binary functors +//////////////////////////////////////////////////////////////////////////////// + +// Binary functors: +// +// add(x, y) = x + y +// sub(x, y) = x - y +// mul(x, y) = x * y +// div(x, y) = x / y +// mod(x, y) = x % y (int32 and int64 only) +// fmod(x, y) = fmod(x, y) (float and double only) +// pow(x, y) = x ^ y +// maximum(x, y) = x > y ? x : y +// minimum(x, y) = x < y ? 
x : y +// squared_difference(x, y) = conj(x - y) * (x - y) + +template +struct add : base> { + static constexpr bool use_bcast_optimization = true; +}; + +template +struct sub : base> { + static constexpr bool use_bcast_optimization = true; +}; + +template +struct mul : base> { + static constexpr bool use_bcast_optimization = true; +}; + +template +struct mul_no_nan : base> {}; + +template +struct div : base> {}; + +template +struct safe_div : base>> { + static constexpr bool has_errors = true; +}; + +template +struct div_no_nan : base> {}; + +template +struct fmod : base> {}; + +template +struct mod : base> {}; + +template +struct safe_mod : base>> { + static constexpr bool has_errors = true; +}; + +template +struct floor_fmod : base> {}; + +template +struct safe_floor_mod : base>> { + static constexpr bool has_errors = true; +}; + +template +struct floor_div : base> {}; + +template +struct safe_floor_div : base>> { + static constexpr bool has_errors = true; +}; + +template +struct floor_div_real : base> {}; + +template +struct truncate_div_real + : base> {}; + +template +struct pow : base> {}; + +template +struct safe_pow : base> { + static constexpr bool has_errors = true; +}; + +// Version of safe_pow for integers which returns 0 if RHS is negative and LHS +// is not 1 or -1. For use on GPUs, where we cannot raise an error. +template +struct safe_pow_ignore_error_op { + static_assert(std::is_integral::value, "Integer type expected"); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + if (TF_PREDICT_FALSE(y < 0)) { + if (x == T(-1)) { + T trunc_mod = Eigen::internal::scalar_mod2_op()(y, T(2)); + return trunc_mod == T(-1) ? T(-1) : T(1); + } + return x == T(1) ? T(1) : T(0); + } + return Eigen::internal::scalar_pow_op{}(x, y); + } +}; + +template +struct safe_pow_ignore_error : base> {}; + +template +struct maximum + : base> {}; + +template +struct minimum + : base> {}; + +template +struct igamma : base> {}; + +template +struct random_gamma_grad + : base> {}; + +template +struct igammac : base> {}; + +template +struct zeta : base> {}; + +template +struct polygamma : base> {}; + +template +struct atan2 : base> {}; + +template +struct squared_difference + : base> {}; + +template +struct xdivy : base> {}; + +template +struct xlogy : base> {}; + +template +struct xlog1py : base> {}; + +template +struct less : base, bool> {}; + +template +struct less_equal : base, bool> {}; + +template +struct greater : base, bool> {}; + +template +struct greater_equal : base, bool> {}; + +template +struct equal_to : base, bool> {}; + +template +struct not_equal_to : base, bool> {}; + +struct logical_and : base> { +}; + +struct logical_or : base> {}; + +template +struct bitwise_and_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + return x & y; + } +}; + +template +struct bitwise_or_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + return x | y; + } +}; + +template +struct bitwise_and : base> {}; + +template +struct bitwise_or : base> {}; + +template +struct bitwise_xor : base> {}; + +template +struct left_shift_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + // Avoids UB: don't shift by larger than the bitwidth of T, and + // performs left shifts as unsigned shifts. 
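// Background for the clamp that follows: in C++, shifting by a negative count
// or by a count >= the bit width of the type is undefined behaviour, and
// left-shifting a negative signed value is undefined before C++20. Clamping y
// to [0, bitwidth - 1] and doing the shift in the unsigned domain sidesteps
// both issues.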
+ T y_clamped = y; + if (y_clamped < 0) { + y_clamped = 0; + } else if (y_clamped > sizeof(T) * CHAR_BIT - 1) { + y_clamped = sizeof(T) * CHAR_BIT - 1; + } + using U = typename std::make_unsigned::type; + return static_cast(static_cast(x) << static_cast(y_clamped)); + } +}; + +template +struct right_shift_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x, + const T& y) const { + // Avoids UB: don't shift by larger than the bitwidth of T. + T y_clamped = y; + if (y_clamped < 0) { + y_clamped = 0; + } else if (y_clamped > sizeof(T) * CHAR_BIT - 1) { + y_clamped = sizeof(T) * CHAR_BIT - 1; + } + // Technically right shifts of signed integers are not necessarily + // arithmetic shifts according to the C++ standard. However in practice most + // implementations are arithmetic shifts. If this proves to be a problem in + // practice, we may need to use an alternative implementation. + return x >> y_clamped; + } +}; + +template +struct left_shift : base> {}; + +template +struct right_shift : base> {}; + +template +struct make_complex_func { + typedef std::complex result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(T real, + T imag) const { + return std::complex(real, imag); + } +}; + +template +struct make_complex : base, std::complex> {}; + +template +struct get_real + : base, typename T::value_type> {}; + +template +struct get_imag + : base, typename T::value_type> {}; + +template +struct get_angle + : base, typename T::value_type> {}; + +template +struct conj : base> {}; + +//////////////////////////////////////////////////////////////////////////////// +// Functors takes 1 or 2 tensors, computes the base functor on +// coefficient of the input tensors and puts the results in the output +// tensor. +//////////////////////////////////////////////////////////////////////////////// +template +struct UnaryFunctor { + // Computes on device "d": out[i] = Functor(in[i]) + void operator()(const Device& d, typename Functor::tout_type out, + typename Functor::tin_type in); +}; + +template +struct UnaryFunctorWithArg { + // Computes on device "d": out[i] = Functor(in[i]) + void operator()(const Device& d, typename Functor::tout_type out, + typename Functor::tin_type in, Targ val); +}; + +template +struct BinaryFunctor { + // Computes on device "d": out[i] = Functor(in0[i], in1[i]) + void operator()(const Device& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error); + + // Computes on device "d": out[i] = Functor(scalar[0], in[i]) + void Left(const Device& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error); + + // Computes on device "d": out[i] = Functor(in[i], scalar[0]) + void Right(const Device& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error); + + // Computes on device "d": + // out = Functor(in0.broadcast(bcast0), in1.broadcast(bcast1)) + // + // TODO(zhifengc): makes BCast a template member function on NDIMS + // instead making BinaryFunctor templates on NDIMS. 
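// Self-contained sketch (not part of the header) of what the BCast declaration
// below describes: each operand is tiled with Eigen's broadcast() and the
// binary functor is then applied coefficient-wise, as in
// Assign(dev, out, lhs.binaryExpr(rhs, func)) in cwise_ops_common.h. The
// shapes and values here are illustrative only.
#include <iostream>
#include "unsupported/Eigen/CXX11/Tensor"

int main() {
  Eigen::Tensor<float, 2> row(1, 3), col(2, 1);
  row.setValues({{1.f, 2.f, 3.f}});
  col.setValues({{10.f}, {20.f}});
  // Broadcast factors: repeat the single row twice, the single column 3 times.
  Eigen::array<Eigen::Index, 2> bcast_row{2, 1};
  Eigen::array<Eigen::Index, 2> bcast_col{1, 3};
  Eigen::Tensor<float, 2> sum =
      row.broadcast(bcast_row) + col.broadcast(bcast_col);
  std::cout << sum << "\n";  // 2x3 result: 11 12 13 / 21 22 23
  return 0;
}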
+ void BCast(const Device& d, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error); +}; + +template +struct ApproximateEqual { + void operator()(const Device& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z); +}; + +template +bool AllOne(const typename Eigen::array& a) { + for (size_t i = 0; i < a.size(); ++i) { + if (a[i] != 1) return false; + } + return true; +} + +template +struct SelectFunctor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstFlat cond_flat, + typename TTypes::ConstFlat then_flat, + typename TTypes::ConstFlat else_flat); +}; + +template +struct SelectScalarFunctor { + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstScalar cond, + typename TTypes::ConstFlat then_flat, + typename TTypes::ConstFlat else_flat); +}; + +template +struct BatchSelectFunctor { + void operator()(const Device& d, + typename TTypes::Matrix output_flat_outer_dims, + TTypes::ConstVec cond_vec, + typename TTypes::ConstMatrix then_flat_outer_dims, + typename TTypes::ConstMatrix else_flat_outer_dims); +}; + +template +struct BCastSelectFunctor { + void operator()(const Device& d, + typename TTypes::Tensor output_tensor, + typename TTypes::ConstTensor cond_tensor, + typename TTypes::ConstTensor then_tensor, + typename TTypes::ConstTensor else_tensor, + typename Eigen::array cond_bcast, + typename Eigen::array then_bcast, + typename Eigen::array else_bcast); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_common.h new file mode 100644 index 00000000..fd7ee451 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_common.h @@ -0,0 +1,683 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_ + +// See docs in ../ops/math_ops.cc. 
+#define _USE_MATH_DEFINES +#include + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/platform/bfloat16.h" + + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/kernels/cwise_ops_gradients.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +class BinaryOpShared : public OpKernel { + public: + explicit BinaryOpShared(OpKernelConstruction* ctx, DataType out, DataType in); + + protected: + struct BinaryOpState { + // Sets up bcast with the shape of in0 and in1, ensures that the bcast + // is valid, and if so, set out, either by allocating a new buffer using + // ctx->output(...) or by creating an alias for an owned input buffer for + // in-place computation. + // Caller must check ctx->status() upon return for non-ok status. + // If ctx->status().ok() is true, then out is guaranteed to be allocated. + explicit BinaryOpState(OpKernelContext* ctx); + + const Tensor& in0; + const Tensor& in1; + + BCast bcast; + Tensor* out = nullptr; + int64_t out_num_elements; + + int64_t in0_num_elements; + int64_t in1_num_elements; + + int ndims; + bool result; + }; + + void SetUnimplementedError(OpKernelContext* ctx); + void SetComputeError(OpKernelContext* ctx); +}; + +// Coefficient-wise binary operations: +// Device: E.g., CPUDevice, GPUDevice. +// Functor: defined in cwise_ops.h. E.g., functor::add. +template +class BinaryOp : public BinaryOpShared { + public: + typedef typename Functor::in_type Tin; // Input scalar data type. + typedef typename Functor::out_type Tout; // Output scalar data type. + + explicit BinaryOp(OpKernelConstruction* ctx) + : BinaryOpShared(ctx, DataTypeToEnum::v(), + DataTypeToEnum::v()) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& input_0 = ctx->input(0); + OP_REQUIRES(ctx, input_0.dtype() == DataTypeToEnum::v(), + errors::InvalidArgument( + "Expected tensor of type ", + DataTypeString(DataTypeToEnum::v()), " but got type ", + DataTypeString(input_0.dtype()))); + const Tensor& input_1 = ctx->input(1); + OP_REQUIRES(ctx, input_1.dtype() == DataTypeToEnum::v(), + errors::InvalidArgument( + "Expected tensor of type ", + DataTypeString(DataTypeToEnum::v()), " but got type ", + DataTypeString(input_1.dtype()))); + const Device& eigen_device = ctx->eigen_device(); + bool error = false; + bool* const error_ptr = Functor::has_errors ? &error : nullptr; + + // NOTE: Handle three simple cases before building the BinaryOpState, which + // is relatively expensive for small operations. + if (input_0.shape() == input_1.shape()) { + // tensor op tensor with no broadcasting. + Tensor* out; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0, 1}, 0, input_0.shape(), &out)); + functor::BinaryFunctor()( + eigen_device, out->template flat(), + input_0.template flat(), input_1.template flat(), + error_ptr); + if (Functor::has_errors && error) { + SetComputeError(ctx); + } + return; + } else if (input_0.shape().dims() == 0) { + // scalar op tensor. 
+ Tensor* out; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {1}, 0, input_1.shape(), &out)); + + functor::BinaryFunctor().Left( + eigen_device, out->template flat(), + input_0.template scalar(), input_1.template flat(), + error_ptr); + if (Functor::has_errors && error) { + SetComputeError(ctx); + } + return; + } else if (input_1.shape().dims() == 0) { + // tensor op scalar. + Tensor* out; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0}, 0, input_0.shape(), &out)); + functor::BinaryFunctor().Right( + eigen_device, out->template flat(), + input_0.template flat(), input_1.template scalar(), + error_ptr); + if (Functor::has_errors && error) { + SetComputeError(ctx); + } + return; + } + + // 'state': Shared helper not dependent on T to reduce code size + BinaryOpState state(ctx); + if (ctx->status().code() == error::RESOURCE_EXHAUSTED) { + // Stop when BinaryOpState's constructor failed due to OOM. + return; + } + auto& bcast = state.bcast; + Tensor* out = state.out; + if (!bcast.IsValid()) { + if (ctx->status().ok()) { + if (state.result) { + functor::SetOneFunctor()(eigen_device, + out->flat()); + } else { + functor::SetZeroFunctor()(eigen_device, + out->flat()); + } + } + return; + } + + auto& in0 = state.in0; + auto& in1 = state.in1; + if (state.out_num_elements == 0) { + return; + } + + const int ndims = state.ndims; + if (ndims <= 1) { + auto out_flat = out->flat(); + if (state.in1_num_elements == 1) { + // tensor op scalar + functor::BinaryFunctor().Right( + eigen_device, out_flat, in0.template flat(), + in1.template scalar(), error_ptr); + } else if (state.in0_num_elements == 1) { + // scalar op tensor + functor::BinaryFunctor().Left( + eigen_device, out_flat, in0.template scalar(), + in1.template flat(), error_ptr); + } else { + functor::BinaryFunctor()( + eigen_device, out_flat, in0.template flat(), + in1.template flat(), error_ptr); + } + } else if (ndims == 2) { + functor::BinaryFunctor().BCast( + eigen_device, out->shaped(bcast.result_shape()), + in0.template shaped(bcast.x_reshape()), + BCast::ToIndexArray<2>(bcast.x_bcast()), + in1.template shaped(bcast.y_reshape()), + BCast::ToIndexArray<2>(bcast.y_bcast()), error_ptr); + } else if (ndims == 3) { + functor::BinaryFunctor().BCast( + eigen_device, out->shaped(bcast.result_shape()), + in0.template shaped(bcast.x_reshape()), + BCast::ToIndexArray<3>(bcast.x_bcast()), + in1.template shaped(bcast.y_reshape()), + BCast::ToIndexArray<3>(bcast.y_bcast()), error_ptr); + } else if (ndims == 4) { + functor::BinaryFunctor().BCast( + eigen_device, out->shaped(bcast.result_shape()), + in0.template shaped(bcast.x_reshape()), + BCast::ToIndexArray<4>(bcast.x_bcast()), + in1.template shaped(bcast.y_reshape()), + BCast::ToIndexArray<4>(bcast.y_bcast()), error_ptr); + } else if (ndims == 5) { + functor::BinaryFunctor().BCast( + eigen_device, out->shaped(bcast.result_shape()), + in0.template shaped(bcast.x_reshape()), + BCast::ToIndexArray<5>(bcast.x_bcast()), + in1.template shaped(bcast.y_reshape()), + BCast::ToIndexArray<5>(bcast.y_bcast()), error_ptr); + } else { + SetUnimplementedError(ctx); + } + if (Functor::has_errors && error) { + SetComputeError(ctx); + } + } +}; + +template +class ApproximateEqualOp : public OpKernel { + public: + explicit ApproximateEqualOp(OpKernelConstruction* context) + : OpKernel(context) { + float tolerance; + OP_REQUIRES_OK(context, context->GetAttr("tolerance", &tolerance)); + tolerance_ = T(tolerance); + } + void Compute(OpKernelContext* context) override { + const Tensor& 
x_input = context->input(0); + const Tensor& y_input = context->input(1); + OP_REQUIRES( + context, x_input.shape() == y_input.shape(), + errors::InvalidArgument("x and y must be of the same shape. ", + "x shape: ", x_input.shape().DebugString(), + ". y shape: ", y_input.shape().DebugString())); + Tensor* z_output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, x_input.shape(), &z_output)); + const Device& d = context->eigen_device(); + typename TTypes::ConstFlat x(x_input.flat()); + typename TTypes::ConstFlat y(y_input.flat()); + typename TTypes::Flat z(z_output->flat()); + functor::ApproximateEqual()(d, x, y, tolerance_, z); + } + + private: + T tolerance_; +}; + +// Basic coefficient-wise binary operations that are known to not require +// any broadcasting. This is the case for example of the gradients of +// unary operations. +// Device: E.g., CPUDevice, GPUDevice. +// Functor: defined above. E.g., functor::tanh_grad. +template +class SimpleBinaryOp : public OpKernel { + public: + typedef typename Functor::in_type Tin; // Input scalar data type. + typedef typename Functor::out_type Tout; // Output scalar data type. + + explicit SimpleBinaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + OP_REQUIRES( + ctx, in0.NumElements() == in1.NumElements(), + errors::InvalidArgument("The two arguments to a cwise op must have " + "same number of elements, got ", + in0.NumElements(), " and ", in1.NumElements())); + auto in0_flat = in0.flat(); + auto in1_flat = in1.flat(); + const Device& eigen_device = ctx->eigen_device(); + + Tensor* out = nullptr; + if (std::is_same::value) { + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0, 1}, 0, in0.shape(), &out)); + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out)); + } + auto out_flat = out->flat(); + functor::SimpleBinaryFunctor()(eigen_device, out_flat, + in0_flat, in1_flat); + } +}; + +// Coefficient-wise unary operations: +// Device: E.g., CPUDevice, GPUDevice. +// Functor: defined in cwise_ops.h. E.g., functor::sqrt. +template +class UnaryOp : public OpKernel { + public: + typedef typename Functor::in_type Tin; // Input scalar data type. + typedef typename Functor::out_type Tout; // Output scalar data type. + // Tin may be different from Tout. 
E.g., abs: complex64 -> float + + explicit UnaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + auto in = DataTypeToEnum::v(); + auto out = DataTypeToEnum::v(); + OP_REQUIRES_OK(ctx, ctx->MatchSignature({in}, {out})); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + Tensor* out = nullptr; + if (std::is_same::value) { + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0}, 0, inp.shape(), &out)); + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out)); + } + functor::UnaryFunctor()( + ctx->eigen_device(), out->flat(), inp.flat()); + } +}; + +template +class UnaryVariantOp : public OpKernel { + public: + explicit UnaryVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(inp.shape()), + errors::InvalidArgument("Non-scalar variants are not supported.")); + const Variant& v = inp.scalar()(); + Variant v_out; + OP_REQUIRES_OK(ctx, UnaryOpVariant(ctx, OpEnum, v, &v_out)); + int numa_node = ctx->device()->NumaNode(); + Tensor out(cpu_allocator(numa_node), DT_VARIANT, TensorShape()); + out.scalar()() = std::move(v_out); + ctx->set_output(0, std::move(out)); + } +}; + +namespace functor { + +template +void Assign(const D& d, Out out, Rhs rhs) { + out.device(d) = rhs; +} + +// Partial specialization of BinaryFunctor +// for functors with no error checking. +template +struct BinaryFunctor { + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error) { + Assign(d, out, in0.binaryExpr(in1, typename Functor::func())); + } + + void Left(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_left Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + } + + void Right(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + } + + void BCast(const CPUDevice& dev, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error) { + typename Functor::func func; + if (AllOne(bcast0) && AllOne(bcast1)) { + Assign(dev, out, in0.binaryExpr(in1, func)); + } else if (AllOne(bcast0)) { + auto rhs = in1.broadcast(bcast1); + Assign(dev, out, in0.binaryExpr(rhs, func)); + } else if (AllOne(bcast1)) { + auto lhs = in0.broadcast(bcast0); + Assign(dev, out, lhs.binaryExpr(in1, func)); + } else { + auto lhs = in0.broadcast(bcast0); + auto rhs = in1.broadcast(bcast1); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + } + } +}; + +// Partial specialization of BinaryFunctor +// for functors with no error checking. 
+template +struct BinaryFunctor { + enum { NDIMS = 2 }; + + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error) { + Assign(d, out, in0.binaryExpr(in1, typename Functor::func())); + } + + void Left(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_left Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + } + + void Right(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data()))); + } + + inline Eigen::IndexList> NByOne( + Eigen::DenseIndex n) { + Eigen::IndexList> ret; + ret.set(0, n); + return ret; + } + inline Eigen::IndexList, Eigen::DenseIndex> OneByM( + Eigen::DenseIndex m) { + Eigen::IndexList, Eigen::DenseIndex> ret; + ret.set(1, m); + return ret; + } + + void BCast(const CPUDevice& dev, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error) { + typedef typename Functor::in_type T; + typename Functor::func func; + if (Functor::use_bcast_optimization && use_bcast_optimization::value) { + // Optimize for speed by using Eigen::type2index and avoid + // .broadcast() when we know it's a no-op. + // + // Here, we need to handle 6 cases depending on how many "1" + // exist in in0 and in1's shapes (4 numbers in total). It's not + // possible that two shapes have more than 2 1s because those + // are simplified to NDIMS==1 case. + // + // Because this optimization increases the binary size for each + // Functor (+, -, *, /, <, <=, etc.), type and ndim combination. + // we only apply such optimization for selected ops/types/ndims. + // + // Because NDIMS, Functor::use_broadcast_optimization and + // use_broadcast_optimization are compile-time constant, gcc + // does a decent job avoiding generating code when conditions + // are not met. 
+ const Eigen::DenseIndex a = in0.dimension(0); // in0 is shape [a, b] + const Eigen::DenseIndex b = in0.dimension(1); + const Eigen::DenseIndex c = in1.dimension(0); // in1 is shape [c, d] + const Eigen::DenseIndex d = in1.dimension(1); + if ((a == 1) && (d == 1)) { + auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c)); + auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b)); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + if ((b == 1) && (c == 1)) { + auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d)); + auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a)); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + if (a == 1) { + auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c)); + auto rhs = in1; + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + if (b == 1) { + auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d)); + auto rhs = in1; + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + if (c == 1) { + auto lhs = in0; + auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a)); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + if (d == 1) { + auto lhs = in0; + auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b)); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + + const bool bcast0_all_one = AllOne(bcast0); + const bool bcast1_all_one = AllOne(bcast1); + if (bcast0_all_one && !bcast1_all_one) { + auto lhs = in0; // No need to do broadcast for in0 + auto rhs = in1.broadcast(bcast1); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + + if (!bcast0_all_one && bcast1_all_one) { + auto lhs = in0.broadcast(bcast0); + auto rhs = in1; // No need to do broadcast for in1 + Assign(dev, out, lhs.binaryExpr(rhs, func)); + return; + } + } + + // Fallback path. Always works and probably slower. + auto lhs = in0.broadcast(bcast0); + auto rhs = in1.broadcast(bcast1); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + } +}; + +// Version of BinaryFunctor with error handling. +template +struct BinaryFunctor { + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error) { + Assign(d, out, in0.binaryExpr(in1, typename Functor::func(error))); + } + + void Left(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_left Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data(), error))); + } + + void Right(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right Unary; + Assign(d, out, in.unaryExpr(Unary(scalar.data(), error))); + } + + void BCast(const CPUDevice& dev, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error) { + typename Functor::func func(error); + auto lhs = in0.broadcast(bcast0); + auto rhs = in1.broadcast(bcast1); + Assign(dev, out, lhs.binaryExpr(rhs, func)); + } +}; + +// Partial specialization of UnaryFunctor. 
+template +struct UnaryFunctor { + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in) { + Assign(d, out, in.unaryExpr(typename Functor::func())); + } +}; + +template +struct UnaryFunctorWithArg { + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, Targ val) { + Assign(d, out, in.unaryExpr(typename Functor::func(val))); + } +}; + +// Partial specialization of ApproximateEqual. +template +struct ApproximateEqual { + void operator()(const CPUDevice& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z) { + auto diff = x - y; + z.device(d) = diff.abs() <= tolerance; + } +}; + +} // end namespace functor + +#define REGISTER(OP, D, N, F, T) \ + REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint("T"), \ + OP>); + +#define REGISTER_VARIANT(OP, D, N, ENUM) \ + REGISTER_KERNEL_BUILDER( \ + Name(N).Device(DEVICE_##D).TypeConstraint("T"), \ + OP); + +// Macros to register kernels for multiple types (T0, T1, etc.) on +// device type "D" (CPU or GPU) for operation "N" (e.g., sqrt) using +// the functor "F" (e.g., functor::sqrt). + +#if defined(__ANDROID_TYPES_SLIM__) +// Note that __ANDROID_TYPES_SLIM__ is also checked in the cwise_ops*.cc files. +// Normally Android TensorFlow is built with a reduced number of types (float). +// Override on the command-line using "--copt=-D__ANDROID_TYPES_FULL__" +// to generate a library with full type support with a consequent increase in +// code size. +#define REGISTER2(OP, D, N, F, T0, T1) REGISTER(OP, D, N, F, T0) +#define REGISTER3(OP, D, N, F, T0, T1, T2) REGISTER(OP, D, N, F, T0) +#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) REGISTER(OP, D, N, F, T0) +#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) REGISTER(OP, D, N, F, T0) +#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) REGISTER(OP, D, N, F, T0) +#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ + REGISTER(OP, D, N, F, T0) +#define REGISTER8(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7) \ + REGISTER(OP, D, N, F, T0) +#define REGISTER9(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \ + REGISTER(OP, D, N, F, T0) +#else // !defined(__ANDROID_TYPES_SLIM__) +#define REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER(OP, D, N, F, T0) \ + REGISTER(OP, D, N, F, T1) +#define REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER(OP, D, N, F, T2) +#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER2(OP, D, N, F, T2, T3) +#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) \ + REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER2(OP, D, N, F, T3, T4) +#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) \ + REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER3(OP, D, N, F, T3, T4, T5) +#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ + REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER3(OP, D, N, F, T4, T5, T6) +#define REGISTER8(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7) \ + REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER4(OP, D, N, F, T4, T5, T6, T7) +#define REGISTER9(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \ + REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) \ + REGISTER4(OP, D, N, F, T5, T6, T7, T8) + +// Instead of adding REGISTER10, etc., shard the .cc files - see +// cwise_op_equal_to_*.cc for an example. 
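For orientation, a hedged sketch of how this registration macro family is typically consumed by the per-op shards mentioned in the comment above; the "Add"/functor::add pairing is an illustrative assumption rather than part of this header, and the expansion simply follows the REGISTER macro defined earlier in the file.

// Illustrative usage only (not part of this header). A shard such as a
// cwise_op_add_*.cc file registers several types at once:
//
//   REGISTER2(BinaryOp, CPU, "Add", functor::add, float, double);
//
// With full type support this expands to one kernel registration per type:
//
//   REGISTER_KERNEL_BUILDER(
//       Name("Add").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       BinaryOp<CPUDevice, functor::add<float>>);
//   REGISTER_KERNEL_BUILDER(
//       Name("Add").Device(DEVICE_CPU).TypeConstraint<double>("T"),
//       BinaryOp<CPUDevice, functor::add<double>>);
//
// Under __ANDROID_TYPES_SLIM__ the same invocation registers only the first
// listed type.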
+ +#endif // defined(__ANDROID_TYPES_SLIM__) + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h new file mode 100644 index 00000000..fdd61d03 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h @@ -0,0 +1,218 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ + +#define _USE_MATH_DEFINES +#include +#include + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; +typedef std::complex complex64; +typedef std::complex complex128; + +// Partial specialization of UnaryFunctor. +template +struct UnaryFunctor { + void operator()(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in) { + MaybeWith32BitIndexing( + [&](auto out32, auto in32) { + out32.device(d) = in32.unaryExpr(typename Functor::func()); + }, + out, in); + } +}; + +// Partial specialization of BinaryFunctor. 
+template +struct BinaryFunctor { + void operator()(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1, bool* error) { + MaybeWith32BitIndexing( + [&](auto out32, auto in0_32, auto in1_32) { + out32.device(d) = in0_32.binaryExpr(in1_32, typename Functor::func()); + }, + out, in0, in1); + } + + void Left(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_left Unary; + MaybeWith32BitIndexing( + [&](auto out32, auto in32) { + out32.device(d) = in32.unaryExpr(Unary(scalar.data())); + }, + out, in); + } + + void Right(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar, bool* error) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right Unary; + MaybeWith32BitIndexing( + [&](auto out32, auto in32) { + out32.device(d) = in32.unaryExpr(Unary(scalar.data())); + }, + out, in); + } + + void BCast(const GPUDevice& d, + typename TTypes::Tensor out, + typename TTypes::ConstTensor in0, + typename Eigen::array bcast0, + typename TTypes::ConstTensor in1, + typename Eigen::array bcast1, + bool* error) { + typedef typename Functor::in_type T; + typename Functor::func func; + if ((NDIMS == 2) && Functor::use_bcast_optimization && + use_bcast_optimization::value) { + const bool bcast0_all_one = AllOne(bcast0); + const bool bcast1_all_one = AllOne(bcast1); + if (bcast0_all_one && !bcast1_all_one) { + MaybeWith32BitIndexing( + [&](auto out32, auto in0_32, auto in1_32) { + out32.device(d) = + in0_32.binaryExpr(in1_32.broadcast(bcast1), func); + }, + out, in0, in1); + return; + } + if (!bcast0_all_one && bcast1_all_one) { + MaybeWith32BitIndexing( + [&](auto out32, auto in0_32, auto in1_32) { + out32.device(d) = + in0_32.broadcast(bcast0).binaryExpr(in1_32, func); + }, + out, in0, in1); + return; + } + } + MaybeWith32BitIndexing( + [&](auto out32, auto in0_32, auto in1_32) { + out32.device(d) = in0_32.broadcast(bcast0).binaryExpr( + in1_32.broadcast(bcast1), func); + }, + out, in0, in1); + } +}; + +// Partial specialization of ApproximateEqual. +template +struct ApproximateEqual { + void operator()(const GPUDevice& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z) { + auto diff = x - y; + z.device(d) = diff.abs() <= tolerance; + } +}; + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt). 
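Before the macro definitions that follow, a hedged usage sketch: a GPU shard (for example, an assumed cwise_op_gpu_sqrt.cu.cc) would include this header and request explicit instantiations through the DEFINE_* macros defined below; the concrete functor/type pairings shown here are illustrative assumptions, not part of this header.

// Illustrative usage only (not part of this header):
//
//   DEFINE_UNARY3(sqrt, Eigen::half, float, double);
//
// expands to explicit instantiations of the form
//
//   template struct UnaryFunctor<GPUDevice, functor::sqrt<float>>;
//
// while the DEFINE_BINARYn macros additionally instantiate the broadcasting
// variants BinaryFunctor<GPUDevice, F<T>, NDIMS> for NDIMS = 1 through 5.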
+#define DEFINE_UNARY1(F, T) template struct UnaryFunctor > +#define DEFINE_UNARY2(F, T0, T1) \ + DEFINE_UNARY1(F, T0); \ + DEFINE_UNARY1(F, T1) +#define DEFINE_UNARY3(F, T0, T1, T2) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY1(F, T2) +#define DEFINE_UNARY4(F, T0, T1, T2, T3) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY2(F, T2, T3) +#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY3(F, T2, T3, T4) +#define DEFINE_UNARY6(F, T0, T1, T2, T3, T4, T5) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY4(F, T2, T3, T4, T5) +#define DEFINE_UNARY7(F, T0, T1, T2, T3, T4, T5, T6) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY5(F, T2, T3, T4, T5, T6) +#define DEFINE_UNARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \ + DEFINE_UNARY4(F, T0, T1, T2, T3); \ + DEFINE_UNARY4(F, T4, T5, T6, T7) + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for BinaryFunctor. +#define DEFINE_BINARY1(F, T) \ + template struct BinaryFunctor, 1>; \ + template struct BinaryFunctor, 2>; \ + template struct BinaryFunctor, 3>; \ + template struct BinaryFunctor, 4>; \ + template struct BinaryFunctor, 5> +#define DEFINE_BINARY2(F, T0, T1) \ + DEFINE_BINARY1(F, T0); \ + DEFINE_BINARY1(F, T1) +#define DEFINE_BINARY3(F, T0, T1, T2) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY1(F, T2) +#define DEFINE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY2(F, T2, T3) +#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY3(F, T2, T3, T4) +#define DEFINE_BINARY6(F, T0, T1, T2, T3, T4, T5) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY3(F, T3, T4, T5) +#define DEFINE_BINARY7(F, T0, T1, T2, T3, T4, T5, T6) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY4(F, T3, T4, T5, T6) +#define DEFINE_BINARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \ + DEFINE_BINARY4(F, T0, T1, T2, T3); \ + DEFINE_BINARY4(F, T4, T5, T6, T7) +#define DEFINE_BINARY9(F, T0, T1, T2, T3, T4, T5, T6, T7, T8) \ + DEFINE_BINARY4(F, T0, T1, T2, T3); \ + DEFINE_BINARY5(F, T4, T5, T6, T7, T8) +#define DEFINE_BINARY10(F, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \ + DEFINE_BINARY5(F, T0, T1, T2, T3, T4); \ + DEFINE_BINARY5(F, T5, T6, T7, T8, T9) +#define DEFINE_BINARY11(F, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ + DEFINE_BINARY5(F, T0, T1, T2, T3, T4); \ + DEFINE_BINARY6(F, T5, T6, T7, T8, T9, T10) + +#define DEFINE_APPROXIMATE_EQUAL1(T) \ + template struct ApproximateEqual; +#define DEFINE_APPROXIMATE_EQUAL2(T0, T1) \ + DEFINE_APPROXIMATE_EQUAL1(T0); \ + DEFINE_APPROXIMATE_EQUAL1(T1); + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h new file mode 100644 index 00000000..dddce612 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h @@ -0,0 +1,74 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_ + +#define EIGEN_USE_GPU + +#include + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/kernels/cwise_ops_gradients.h" +#include "tensorflow/core/platform/types.h" + +#include "tensorflow/core/platform/logging.h" +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; +typedef std::complex complex64; +typedef std::complex complex128; + +// Partial specialization of SimpleBinaryFunctor. +template +struct SimpleBinaryFunctor { + void operator()(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in1, + typename Functor::tin_type in2) { + MaybeWith32BitIndexing( + [&](auto out32, auto in1_32) { + out32.device(d) = in1_32.binaryExpr(in2, typename Functor::func()); + }, + out, in1); + } +}; + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for SimpleBinaryFunctor (e.g., functor::tanh_grad). +#define DEFINE_SIMPLE_BINARY1(F, T) \ + template struct SimpleBinaryFunctor > +#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ + DEFINE_SIMPLE_BINARY1(F, T0); \ + DEFINE_SIMPLE_BINARY1(F, T1) +#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ + DEFINE_SIMPLE_BINARY1(F, T2) +#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ + DEFINE_SIMPLE_BINARY2(F, T2, T3) +#define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ + DEFINE_SIMPLE_BINARY3(F, T2, T3, T4) + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gradients.h b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gradients.h new file mode 100644 index 00000000..0be3f788 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/cwise_ops_gradients.h @@ -0,0 +1,210 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_ +#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_ + +#define EIGEN_USE_THREADS +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/cwise_ops.h" + +namespace Eigen { +namespace internal { + +// Gradient for the tanh function +template +struct scalar_tanh_gradient_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& output, const T& output_gradient) const { + return output_gradient * (T(1) - output * output); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& output, const Packet& output_gradient) const { + return pmul(output_gradient, + psub(pset1(T(1)), pmul(output, output))); + } +}; +template +struct functor_traits> { + enum { + Cost = NumTraits::AddCost + 2 * NumTraits::MulCost, + PacketAccess = packet_traits::HasSub && packet_traits::HasMul, + }; +}; + +// Gradient for the sigmoid function +template +struct scalar_sigmoid_gradient_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& output, const T& output_gradient) const { + return output_gradient * output * (T(1) - output); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& output, const Packet& output_gradient) const { + return pmul(output_gradient, + pmul(output, psub(pset1(T(1)), output))); + } +}; +template +struct functor_traits> { + enum { + Cost = NumTraits::AddCost + 2 * NumTraits::MulCost, + PacketAccess = packet_traits::HasSub && packet_traits::HasMul, + }; +}; + +// Gradient for the inverse function +template +struct scalar_inverse_gradient_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& output, const T& output_gradient) const { + if (output_gradient == T(0)) { + return T(0); + } else { + const T out_conj = numext::conj(output); + return -out_conj * out_conj * output_gradient; + } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& output, const Packet& output_gradient) const { + const Packet out_conj = pconj(output); + return mul_no_nan_op().packetOp(pnegate(pmul(out_conj, out_conj)), + output_gradient); + } +}; +template +struct functor_traits> { + enum { + Cost = NumTraits::AddCost + 2 * NumTraits::MulCost, + PacketAccess = packet_traits::HasMul, + }; +}; + +// Gradient for the sqrt function +template +struct scalar_sqrt_gradient_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& output, const T& output_gradient) const { + if (output_gradient == T(0)) { + return T(0); + } else { + const T out_conj = numext::conj(output); + return (static_cast(0.5) * output_gradient) / out_conj; + } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& output, const Packet& output_gradient) const { + const Packet const_half = pset1(static_cast(0.5)); + const Packet out_conj = pconj(output); + return mul_no_nan_op().packetOp(pdiv(const_half, out_conj), + output_gradient); + } +}; +template +struct functor_traits> { + enum { + PacketAccess = packet_traits::HasMul & packet_traits::HasDiv, + Cost = NumTraits::MulCost + scalar_div_cost::value, + }; +}; + +// Gradient for the rsqrt function +template +struct scalar_rsqrt_gradient_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& output, const T& output_gradient) const { + if (output_gradient == T(0)) { + return T(0); + } 
else { + const T out_conj = numext::conj(output); + return static_cast(-0.5) * (output_gradient * out_conj) * + (out_conj * out_conj); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& output, const Packet& output_gradient) const { + const Packet const_half = pset1(static_cast(-0.5)); + const Packet out_conj = pconj(output); + auto safe_pmul = [](const Packet& a, const Packet& b) { + return mul_no_nan_op().packetOp(a, b); + }; + return safe_pmul(pmul(const_half, pmul(out_conj, out_conj)), + safe_pmul(out_conj, output_gradient)); + } +}; +template +struct functor_traits> { + enum { + Cost = 4 * NumTraits::MulCost, + PacketAccess = packet_traits::HasMul, + }; +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { + +namespace functor { + +template +struct SimpleBinaryFunctor { + void operator()(const Device& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1); +}; + +// Partial specialization of BinaryFunctor for CPU devices +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +struct SimpleBinaryFunctor { + void operator()(const CPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1) { + out.device(d) = in0.binaryExpr(in1, typename Functor::func()); + } +}; + + +template +struct tanh_grad : base> {}; + +template +struct sigmoid_grad : base> { +}; + +template +struct inverse_grad : base> { +}; + +template +struct sqrt_grad : base> {}; + +template +struct rsqrt_grad : base> {}; + +template +struct igamma_grad_a : base> {}; + +} // end namespace functor + +} // end namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/batch_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/batch_dataset_op.h new file mode 100644 index 00000000..4be07eff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/batch_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_BATCH_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_BATCH_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class BatchDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Batch"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kBatchSize = "batch_size"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kParallelCopy = "parallel_copy"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit BatchDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; + bool parallel_copy_ = false; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_BATCH_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_dataset_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_dataset_ops.h new file mode 100644 index 00000000..e0ceee2a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_dataset_ops.h @@ -0,0 +1,52 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_CACHE_DATASET_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_CACHE_DATASET_OPS_H_
+
+#include "tensorflow/core/framework/dataset.h"
+
+namespace tensorflow {
+namespace data {
+
+class CacheDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  class FileDatasetBase;
+  class MemoryDatasetBase;
+
+  static constexpr const char* const kDatasetType = "Cache";
+  static constexpr const char* const kInputDataset = "input_dataset";
+  static constexpr const char* const kFileName = "filename";
+  static constexpr const char* const kOutputTypes = "output_types";
+  static constexpr const char* const kOutputShapes = "output_shapes";
+
+  explicit CacheDatasetOp(OpKernelConstruction* ctx);
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override;
+
+ private:
+  class FileDataset;
+  class FileDatasetV2;
+  class MemoryDataset;
+  class MemoryDatasetV2;
+
+  const int op_version_;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_CACHE_DATASET_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_ops.h
new file mode 100644
index 00000000..e1e58ae9
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/cache_ops.h
@@ -0,0 +1,98 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_CACHE_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_CACHE_OPS_H_
+
+#include "tensorflow/core/data/dataset_utils.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+
+namespace tensorflow {
+namespace data {
+
+// A thread-safe data structure for caching dataset elements.
+//
+// The expected use is that a single `MemoryWriterIterator` populates the
+// cache with dataset elements. Once all elements are cached, the cache can
+// be used by one or more `MemoryReaderIterator`s.
+class MemoryCache {
+ public:
+  MemoryCache() = default;
+
+  // Marks the cache as completed.
+  void Complete(std::vector<std::vector<Tensor>>&& cache);
+
+  // Returns whether the cache is completed.
+  bool IsCompleted();
+
+  // Resets the cache.
+  void Reset();
+
+  // Returns the element at the given index.
+  const std::vector<Tensor>& at(int64_t index);
+
+  // Returns the size of the cache.
+  size_t size();
+
+  // Returns a reference to the cache's data. The returned reference will be
+  // invalidated by any call to Reset().
+  const std::vector<std::vector<Tensor>>& data();
+
+ private:
+  mutex mu_;
+  // Determines whether all elements of the dataset have been cached.
+  bool completed_ TF_GUARDED_BY(mu_) = false;
+  std::vector<std::vector<Tensor>> cache_ TF_GUARDED_BY(mu_);
+};
+
+// A resource wrapping a shared instance of a memory cache.
+class MemoryCacheManager : public ResourceBase {
+ public:
+  MemoryCacheManager() : cache_(std::make_shared<MemoryCache>()) {}
+
+  string DebugString() const override;
+
+  std::shared_ptr<MemoryCache> get() { return cache_; }
+
+ private:
+  std::shared_ptr<MemoryCache> cache_;
+};
+
+// Creates an instance of cache resource and transfers ownership to the caller.
+class AnonymousMemoryCacheHandleOp
+    : public AnonymousResourceOp<MemoryCacheManager> {
+ public:
+  explicit AnonymousMemoryCacheHandleOp(OpKernelConstruction* ctx);
+
+ private:
+  string name() override;
+  absl::Status CreateResource(
+      OpKernelContext* ctx, std::unique_ptr<FunctionLibraryDefinition> flib_def,
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+      FunctionLibraryRuntime* lib, MemoryCacheManager** manager) override;
+};
+
+// Deletes an instance of cache resource.
+class DeleteMemoryCacheOp : public OpKernel {
+ public:
+  explicit DeleteMemoryCacheOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_CACHE_OPS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/concatenate_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/concatenate_dataset_op.h
new file mode 100644
index 00000000..a40e71fc
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/concatenate_dataset_op.h
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_CONCATENATE_DATASET_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_CONCATENATE_DATASET_OP_H_
+
+#include "tensorflow/core/framework/dataset.h"
+
+namespace tensorflow {
+namespace data {
+
+class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
+ public:
+  static constexpr const char* const kDatasetType = "Concatenate";
+  static constexpr const char* const kInputDataset = "input_dataset";
+  static constexpr const char* const kAnotherDataset = "another_dataset";
+  static constexpr const char* const kOutputTypes = "output_types";
+  static constexpr const char* const kOutputShapes = "output_shapes";
+
+  explicit ConcatenateDatasetOp(OpKernelConstruction* ctx);
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase* to_concatenate, DatasetBase** output) override;
+
+ private:
+  class Dataset;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_CONCATENATE_DATASET_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/dataset_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/dataset_ops.h
new file mode 100644
index 00000000..fbbfb514
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/dataset_ops.h
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_OPS_H_ + +#include + +#include "tensorflow/core/platform/platform.h" + +// On mobile we do not provide this functionality because not all of its +// dependencies are available there. +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +class DatasetToGraphOp : public OpKernel { + public: + static constexpr const char* const kAllowStateful = "allow_stateful"; + static constexpr const char* const kStripDeviceAssignment = + "strip_device_assignment"; + static constexpr const char* const kExternalStatePolicy = + "external_state_policy"; + static constexpr const char* const kDatasetToGraph = "DatasetToGraph"; + + explicit DatasetToGraphOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + const int op_version_; + ExternalStatePolicy external_state_policy_ = ExternalStatePolicy::POLICY_WARN; + bool strip_device_assignment_ = false; +}; + +class DatasetCardinalityOp : public OpKernel { + public: + explicit DatasetCardinalityOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + std::unique_ptr cardinality_options_; +}; + +// An OpKernel that computes the fingerprint of a dataset. +class DatasetFingerprintOp : public OpKernel { + public: + explicit DatasetFingerprintOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; +}; + +class DatasetFromGraphOp : public OpKernel { + public: + static constexpr const char* const kGraphDef = "graph_def"; + static constexpr const char* const kHandle = "handle"; + + explicit DatasetFromGraphOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +} // namespace data +} // namespace tensorflow +#endif // !IS_MOBILE_PLATFORM + +#endif // TENSORFLOW_CORE_KERNELS_DATA_DATASET_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.h new file mode 100644 index 00000000..098206a6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_CARDINALITY_DATASET_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_CARDINALITY_DATASET_OP_H_
+
+#include "tensorflow/core/framework/dataset.h"
+
+namespace tensorflow {
+namespace data {
+namespace experimental {
+
+class AssertCardinalityDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  static constexpr const char* const kDatasetType = "AssertCardinality";
+  static constexpr const char* const kInputDataset = "input_dataset";
+  static constexpr const char* const kCardinality = "cardinality";
+  static constexpr const char* const kOutputTypes = "output_types";
+  static constexpr const char* const kOutputShapes = "output_shapes";
+
+  explicit AssertCardinalityDatasetOp(OpKernelConstruction* ctx);
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override;
+
+ private:
+  class Dataset;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+}  // namespace experimental
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_CARDINALITY_DATASET_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h
new file mode 100644
index 00000000..6e86b5d8
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class AssertNextDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "AssertNext"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kTransformations = "transformations"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit AssertNextDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_prev_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_prev_dataset_op.h new file mode 100644 index 00000000..ed42b0c8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/assert_prev_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_PREV_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_PREV_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class AssertPrevDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr char kDatasetType[] = "AssertPrev"; + static constexpr char kInputDataset[] = "input_dataset"; + static constexpr char kTransformations[] = "transformations"; + static constexpr char kOutputTypes[] = "output_types"; + static constexpr char kOutputShapes[] = "output_shapes"; + + explicit AssertPrevDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_PREV_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h new file mode 100644 index 00000000..c1f71bd6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class AutoShardDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "AutoShard"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kNumWorkers = "num_workers"; + static constexpr const char* const kIndex = "index"; + static constexpr const char* const kAutoShardPolicy = "auto_shard_policy"; + static constexpr const char* const kNumReplicas = "num_replicas"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit AutoShardDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + static RewriterConfig CreateConfig(int64_t num_workers, int64_t index, + int64_t auto_shard_policy, + int64_t num_replicas); + int64_t auto_shard_policy_; + int64_t num_replicas_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/compression_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/compression_ops.h new file mode 100644 index 00000000..6dd89ea4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/compression_ops.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class CompressElementOp : public OpKernel { + public: + explicit CompressElementOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; +}; + +class UncompressElementOp : public OpKernel { + public: + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit UncompressElementOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h new file mode 100644 index 00000000..5f23123a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h @@ -0,0 +1,107 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_DATASET_OP_H_ + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/time/time.h" +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/service/common.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace data { + +// A resource which counts how many iterators have been created. This is used +// by the DataServiceDataset to coordinate jobs across multiple iterations. +class IterationCounter : public ResourceBase { + public: + IterationCounter() : counter_(0) {} + + std::string DebugString() const override { + mutex_lock l(mu_); + return absl::StrCat(counter_); + } + + int64_t GetAndIncrement() { + mutex_lock l(mu_); + return ++counter_; + } + + private: + mutable mutex mu_; + int64_t counter_ TF_GUARDED_BY(mu_) = 0; +}; + +// Creates a dataset for reading from the tf.data service. 
+class DataServiceDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "DataService"; + static constexpr const char* const kDatasetId = "dataset_id"; + static constexpr const char* const kProcessingMode = "processing_mode"; + static constexpr const char* const kAddress = "address"; + static constexpr const char* const kProtocol = "protocol"; + static constexpr const char* const kDataTransferProtocol = + "data_transfer_protocol"; + static constexpr const char* const kJobName = "job_name"; + static constexpr const char* const kConsumerIndex = "consumer_index"; + static constexpr const char* const kNumConsumers = "num_consumers"; + static constexpr const char* const kMaxOutstandingRequests = + "max_outstanding_requests"; + static constexpr const char* const kTaskRefreshIntervalHintMs = + "task_refresh_interval_hint_ms"; + static constexpr const char* const kTargetWorkers = "target_workers"; + static constexpr const char* const kIterationCounter = "iteration_counter"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kUncompress = "uncompress"; + static constexpr const char* const kUncompressFn = "uncompress_fn"; + static constexpr const char* const kCrossTrainerCacheOptions = + "cross_trainer_cache_options"; + + // Note: If a new constant is declared here, it *must* be defined in + // data_service_dataset_op.cc, otherwise it will not compile in debug mode. + + explicit DataServiceDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + int op_version_; + absl::Duration task_refresh_interval_hint_; + DataTypeVector output_types_; + std::vector output_shapes_; + std::string data_transfer_protocol_; + TargetWorkers target_workers_ = TARGET_WORKERS_AUTO; + bool uncompress_; + std::shared_ptr uncompress_fn_ = nullptr; + std::string seriazlied_cross_trainer_cache_options_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_ops.h new file mode 100644 index 00000000..b21a353d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/data_service_ops.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_OPS_H_ + +#include + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +// Registers a dataset with the tf.data service. +// +// The address and protocol inputs are used to connect to the dispatcher. +// The external state policy attribute determines whether to ignore, warn, or +// error out when the dataset contains external state. +// The op produces a dataset id for identifying the registered dataset. +class RegisterDatasetOp : public OpKernel { + public: + static constexpr const char* const kAddress = "address"; + static constexpr const char* const kProtocol = "protocol"; + static constexpr const char* const kExternalStatePolicy = + "external_state_policy"; + static constexpr const char* const kElementSpec = "element_spec"; + static constexpr const char* const kMetadata = "metadata"; + static constexpr const char* const kRequestedDatasetId = + "requested_dataset_id"; + static constexpr const char* const kTimeoutMs = "timeout_ms"; + + explicit RegisterDatasetOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + int op_version_; + ExternalStatePolicy external_state_policy_; + std::string element_spec_; + std::string serialized_metadata_; + std::string requested_dataset_id_; +}; + +} // namespace data +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DATA_SERVICE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.h new file mode 100644 index 00000000..25c0ef7a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DIRECTED_INTERLEAVE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DIRECTED_INTERLEAVE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class DirectedInterleaveDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "DirectedInterleave"; + static constexpr const char* const kSelectorInputDataset = + "selector_input_dataset"; + static constexpr const char* const kDataInputDatasets = "data_input_datasets"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kNumInputDatasets = "N"; + static constexpr const char* const kStopOnEmptyDataset = + "stop_on_empty_dataset"; + + explicit DirectedInterleaveDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + bool stop_on_empty_dataset_ = false; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DIRECTED_INTERLEAVE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/distributed_save_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/distributed_save_op.h new file mode 100644 index 00000000..d88642f2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/distributed_save_op.h @@ -0,0 +1,46 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DISTRIBUTED_SAVE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DISTRIBUTED_SAVE_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// Initiates the process of distributedly saving a dataset to disk. 
+class DistributedSaveOp : public OpKernel { + public: + static constexpr const char* const kDirectory = "directory"; + static constexpr const char* const kAddress = "address"; + static constexpr const char* const kMetadata = "metadata"; + + explicit DistributedSaveOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + std::string serialized_metadata_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_DISTRIBUTED_SAVE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/list_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/list_dataset_op.h new file mode 100644 index 00000000..ef921042 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/list_dataset_op.h @@ -0,0 +1,46 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LIST_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LIST_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ListDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "List"; + static constexpr const char* const kTensors = "tensors"; + static constexpr const char* const kTinputTypes = "Tinput_types"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit ListDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector input_types_; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LIST_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.h new file mode 100644 index 00000000..f58473a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LMDB_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LMDB_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class LMDBDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "LMDB"; + static constexpr const char* const kFileNames = "filenames"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + using DatasetOpKernel::DatasetOpKernel; + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LMDB_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/load_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/load_dataset_op.h new file mode 100644 index 00000000..4a27d6aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/load_dataset_op.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LOAD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LOAD_DATASET_OP_H_ + +#include +#include +#include + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// An operation that can load a dataset from one or more files. 
+class LoadDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kCompression = "compression"; + static constexpr const char* const kDatasetType = "Load"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kPath = "path"; + static constexpr const char* const kReaderFunc = "reader_func"; + static constexpr const char* const kReaderFuncOtherArgs = + "reader_func_other_args"; + static constexpr const char* const kReaderFuncTarguments = + "Treader_func_args"; + + explicit LoadDatasetOp(OpKernelConstruction* ctx); + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + + std::string compression_; + DataTypeVector output_types_; + std::vector output_shapes_; + std::shared_ptr reader_func_metadata_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_LOAD_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.h new file mode 100644 index 00000000..b3fec152 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.h @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_MAP_AND_BATCH_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_MAP_AND_BATCH_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// See documentation in ../../ops/experimental_dataset_ops.cc for a high-level +// description of the following op. 
+ +class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "MapAndBatch"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kBatchSize = "batch_size"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kPreserveCardinality = + "preserve_cardinality"; + + explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; + bool preserve_cardinality_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_MAP_AND_BATCH_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.h new file mode 100644 index 00000000..fc59b599 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.h @@ -0,0 +1,67 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_PARALLEL_INTERLEAVE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_PARALLEL_INTERLEAVE_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// See documentation in ../../ops/experimental_dataset_ops.cc for a high-level +// description of the following op. 
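MapAndBatchDatasetOp above fuses two transformations: apply the function f to every input element and gather the results into batch_size-sized batches, optionally dropping a short final batch (drop_remainder). The real kernel does this in parallel over tensors (num_parallel_calls); the sequential sketch below only illustrates the shape of the fusion, with plain ints and invented names:

```cpp
// map_and_batch_sketch.cc -- sequential illustration of the map+batch fusion
// performed by MapAndBatchDatasetOp above (cf. batch_size / drop_remainder / f).
#include <functional>
#include <iostream>
#include <vector>

std::vector<std::vector<int>> MapAndBatch(const std::vector<int>& input,
                                          const std::function<int(int)>& fn,
                                          int batch_size, bool drop_remainder) {
  std::vector<std::vector<int>> batches;
  std::vector<int> current;
  for (int v : input) {
    current.push_back(fn(v));             // "map" step
    if (static_cast<int>(current.size()) == batch_size) {
      batches.push_back(current);         // "batch" step
      current.clear();
    }
  }
  if (!current.empty() && !drop_remainder) batches.push_back(current);
  return batches;
}

int main() {
  auto batches = MapAndBatch({1, 2, 3, 4, 5}, [](int v) { return v * v; },
                             /*batch_size=*/2, /*drop_remainder=*/true);
  for (const auto& batch : batches) {
    for (int v : batch) std::cout << v << " ";  // "1 4" then "9 16"; 25 dropped
    std::cout << "\n";
  }
  return 0;
}
```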
+ +class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "LegacyParallelInterleave"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kCycleLength = "cycle_length"; + static constexpr const char* const kBlockLength = "block_length"; + static constexpr const char* const kDeterministic = "deterministic"; + static constexpr const char* const kSloppy = "sloppy"; + static constexpr const char* const kBufferOutputElements = + "buffer_output_elements"; + static constexpr const char* const kPrefetchInputElements = + "prefetch_input_elements"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; + + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; + DeterminismPolicy deterministic_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_PARALLEL_INTERLEAVE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_access_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_access_ops.h new file mode 100644 index 00000000..293cb99c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_access_ops.h @@ -0,0 +1,64 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_ACCESS_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_ACCESS_OPS_H_ + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/platform/platform.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// An operation that can get an element at a specified index in a dataset. 
+class GetElementAtIndexOp : public AsyncOpKernel { + public: + explicit GetElementAtIndexOp(OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), + unbounded_threadpool_(ctx->env(), "tf_data_get_element_at_index") { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + } + + ~GetElementAtIndexOp() override {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + unbounded_threadpool_.Schedule([this, ctx, done = std::move(done)]() { + ctx->SetStatus(DoCompute(ctx)); + done(); + }); + } + + void Compute(OpKernelContext* ctx) override { + ctx->SetStatus(DoCompute(ctx)); + } + + protected: + absl::Status DoCompute(OpKernelContext* ctx); + + private: + UnboundedThreadPool unbounded_threadpool_; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_ACCESS_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_dataset_op.h new file mode 100644 index 00000000..2b3624fe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/random_dataset_op.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// See tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt for the +// API definition that corresponds to this kernel. +class RandomDatasetOp : public DatasetOpKernel { + public: + // Names of op parameters, public so that they can be accessed by test cases. 
+ // Make sure that these are kept in sync with the REGISTER_OP call in + // tensorflow/core/ops/experimental_dataset_ops.cc + static constexpr const char* const kDatasetType = "Random"; + static constexpr const char* const kSeed = "seed"; + static constexpr const char* const kSeed2 = "seed2"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kRerandomizeEachIteration = + "rerandomize_each_iteration"; + + explicit RandomDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + int32_t op_version_; + bool rerandomize_each_iteration_ = false; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_RANDOM_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sampling_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sampling_dataset_op.h new file mode 100644 index 00000000..9223c0e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sampling_dataset_op.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAMPLING_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAMPLING_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// See tensorflow/core/api_def/base_api/api_def_SamplingDataset.pbtxt for the +// API definition that corresponds to this kernel. +class SamplingDatasetOp : public UnaryDatasetOpKernel { + public: + // Names of op parameters, public so that they can be accessed by test cases. 
+ // Make sure that these are kept in sync with the REGISTER_OP call in + // tensorflow/core/ops/experimental_dataset_ops.cc + static constexpr const char* const kDatasetType = "Sampling"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kRate = "rate"; + static constexpr const char* const kSeed = "seed"; + static constexpr const char* const kSeed2 = "seed2"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit SamplingDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAMPLING_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/save_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/save_dataset_op.h new file mode 100644 index 00000000..77478d4e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/save_dataset_op.h @@ -0,0 +1,114 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAVE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAVE_DATASET_OP_H_ + +#include +#include +#include + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +// An operation that can save a dataset to one or more files. 
+class SaveDatasetOp : public HybridAsyncOpKernel { + public: + static constexpr const char* const kCompression = "compression"; + static constexpr const char* const kPath = "path"; + static constexpr const char* const kShardFunc = "shard_func"; + static constexpr const char* const kShardFuncOtherArgs = + "shard_func_other_args"; + static constexpr const char* const kUseShardFunc = "use_shard_func"; + + explicit SaveDatasetOp(OpKernelConstruction* ctx); + + absl::Status DoCompute(OpKernelContext* ctx) override; + + private: + static constexpr const int kFileFormatVersion = 2; + + absl::Status ConsumeElement(); + + absl::Status GetShardIndex(IteratorContext* ctx, + InstantiatedCapturedFunction* function, + const std::vector& element, + int64_t* shard_index); + + absl::Status WriteData(OpKernelContext* ctx, DatasetBase* dataset, + std::unique_ptr captured_func, + const std::string& run_dir, uint64* num_elements); + + absl::Status WriteMetadataFile(Env* env, const std::string& path, + uint64 run_id, + const DataTypeVector& output_dtypes, + uint64 num_elements, bool finalized); + + bool use_shard_func_; + std::string compression_; + std::shared_ptr func_metadata_; +}; + +// An operation that can save a dataset to one or more files. This +// version of the implementation subclasses from UnaryDatasetOpKernel to align +// the implementation of save with that of the other tf.data transformations. +class SaveDatasetV2Op : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kPath = "path"; + static constexpr const char* const kCompression = "compression"; + + static constexpr const char* const kDatasetType = "SaveV2"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + static constexpr const char* const kShardFunc = "shard_func"; + static constexpr const char* const kShardFuncOtherArgs = + "shard_func_other_args"; + static constexpr const char* const kUseShardFunc = "use_shard_func"; + static constexpr const char* const kShardFuncTarguments = "Tshard_func_args"; + + explicit SaveDatasetV2Op(OpKernelConstruction* ctx); + + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + + static constexpr const int kFileFormatVersion = 2; + + tstring path_; + std::string compression_; + std::unique_ptr shard_func_; + bool use_shard_func_; + DataTypeVector output_types_; + std::vector output_shapes_; + std::shared_ptr func_metadata_; + std::string writer_prefix_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SAVE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h new file mode 100644 index 00000000..fb1fa875 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/data/snapshot_utils.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class SnapshotDatasetV2Op : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Snapshot"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kCompression = "compression"; + static constexpr const char* const kReaderPrefix = "reader_prefix"; + static constexpr const char* const kWriterPrefix = "writer_prefix"; + static constexpr const char* const kHashValid = "hash_valid"; + static constexpr const char* const kHash = "hash"; + static constexpr const char* const kCompressionAuto = "AUTO"; + static constexpr const char* const kReaderFunc = "reader_func"; + static constexpr const char* const kShardFunc = "shard_func"; + static constexpr const char* const kReaderFuncOtherArgs = + "reader_func_other_args"; + static constexpr const char* const kShardFuncOtherArgs = + "shard_func_other_args"; + static constexpr const char* const kReaderFuncTarguments = + "Treader_func_args"; + static constexpr const char* const kShardFuncTarguments = "Tshard_func_args"; + // Note: If a new constant is declared here, it *must* be defined in + // snapshot_dataset_op.cc, otherwise it will not compile in debug mode. 
+
+  explicit SnapshotDatasetV2Op(OpKernelConstruction* ctx);
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override;
+
+ private:
+  static constexpr const int kFileFormatVersion = 2;
+
+  class Dataset;
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+
+  std::string compression_;
+  std::string reader_prefix_;
+  std::string writer_prefix_;
+  bool hash_valid_;
+  uint64 hash_;
+
+  std::shared_ptr<FunctionMetadata> reader_func_metadata_;
+  std::shared_ptr<FunctionMetadata> shard_func_metadata_;
+};
+
+}  // namespace experimental
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/driver_manager.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
new file mode 100644
index 00000000..7aa307e2
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
+
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
+
+namespace tensorflow {
+namespace data {
+namespace experimental {
+namespace sql {
+
+// A factory class for creating `QueryConnection` instances.
+class DriverManager {
+ public:
+  // A factory method for creating `QueryConnection` instances.
+  //
+  // `driver_name` is the database type (e.g. 'sqlite'). `driver_name`
+  // corresponds to a `QueryConnection` subclass. For example, if `driver_name`
+  // == `sqlite`, then `CreateQueryConnection` will create a
+  // `SqliteQueryConnection` instance.
+  static std::unique_ptr<QueryConnection> CreateQueryConnection(
+      const string& driver_name);
+};
+
+}  // namespace sql
+}  // namespace experimental
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/query_connection.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/query_connection.h
new file mode 100644
index 00000000..031a8725
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/query_connection.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
+
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace data {
+
+class IteratorContext;
+
+namespace experimental {
+
+namespace sql {
+// This interface allows a user to connect to a database, execute a query, and
+// iterate over the result set, putting the results into an output tensor.
+// A subclass implementation is required for each type of database
+// (e.g. sqlite3, mysql, etc.)
+//
+// Presently, a `QueryConnection` instance can only handle one query at a time.
+// In a future extension, this class may be refactored so that it creates
+// instances of a new class (named, say, `Statement`) which could have a
+// one-to-one correspondence with queries. This would make `QueryConnection`
+// more consistent with `Connection` classes of other database APIs.
+// `QueryConnection` would then be renamed simply `Connection`.
+//
+// This class is not thread safe. Access to it is guarded by a mutex in
+// `SqlDatasetOp::Dataset::Iterator`.
+class QueryConnection {
+ public:
+  virtual ~QueryConnection() {}
+  // Opens a connection to the database named by `data_source_name`. Prepares to
+  // execute `query` against the database.
+  //
+  // The client must call `Close()` to release the connection resources, even
+  // if `Open()` fails. `Close()` must be called before making another call
+  // to `Open()`.
+  virtual absl::Status Open(const string& data_source_name, const string& query,
+                            const DataTypeVector& output_types) = 0;
+  // Closes an opened connection.
+  virtual absl::Status Close() = 0;
+  // Retrieves the next row of the result set of the query from the most recent
+  // call to `Open()`.
+  //
+  // If such a row exists, then the row will be stored in `*out_tensors`, and
+  // `false` will be stored in `*end_of_sequence`.
+  //
+  // If there are no more rows in the result set, then instead `true` will be
+  // stored in `*end_of_sequence`, and the content of `*out_tensors` will be
+  // undefined.
+  virtual absl::Status GetNext(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) = 0;
+};
+
+}  // namespace sql
+}  // namespace experimental
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
new file mode 100644
index 00000000..4cf2608c
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_ + +#include + +#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h" +#include "tensorflow/core/lib/db/sqlite.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace data { +namespace experimental { +namespace sql { + +class SqliteQueryConnection : public QueryConnection { + public: + SqliteQueryConnection(); + ~SqliteQueryConnection() override; + absl::Status Open(const string& data_source_name, const string& query, + const DataTypeVector& output_types) override; + absl::Status Close() override; + absl::Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) override; + + private: + // Prepares the query string `query_`. + absl::Status PrepareQuery(); + // Fills `tensor` with the column_index_th element of the current row of + // `stmt_`. + void FillTensorWithResultSetEntry(const DataType& data_type, int column_index, + Tensor* tensor); + Sqlite* db_ = nullptr; + SqliteStatement stmt_; + int column_count_ = 0; + string query_; + DataTypeVector output_types_; +}; + +} // namespace sql +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.h new file mode 100644 index 00000000..1255365d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
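Taken together, DriverManager, the QueryConnection interface, and its SqliteQueryConnection implementation above define the contract the Sql dataset's iterator drives: create a connection for a driver name, Open it with a query, call GetNext until *end_of_sequence is set, and Close it even if Open failed. The standalone analogue below mirrors that call sequence with an in-memory fake; Row, FakeQueryConnection and the sample query are inventions for illustration, and std::vector<std::string> stands in for std::vector<Tensor>:

```cpp
// query_connection_sketch.cc -- standalone analogue of the Open/GetNext/Close
// contract documented above. There is no real database behind it.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

using Row = std::vector<std::string>;

class FakeQueryConnection {
 public:
  bool Open(const std::string& query) {
    // Pretend the query produced two rows.
    rows_ = {{"1", "alice"}, {"2", "bob"}};
    next_ = 0;
    return true;
  }
  bool Close() {
    rows_.clear();
    return true;
  }
  // Mirrors GetNext(): fills *out_row, or sets *end_of_sequence once the
  // result set is exhausted (contents of *out_row are then unspecified).
  bool GetNext(Row* out_row, bool* end_of_sequence) {
    if (next_ >= rows_.size()) {
      *end_of_sequence = true;
      return true;
    }
    *out_row = rows_[next_++];
    *end_of_sequence = false;
    return true;
  }

 private:
  std::vector<Row> rows_;
  size_t next_ = 0;
};

int main() {
  // cf. DriverManager::CreateQueryConnection("sqlite") in the header above.
  auto conn = std::make_unique<FakeQueryConnection>();
  conn->Open("SELECT id, name FROM users");
  bool end_of_sequence = false;
  while (true) {
    Row row;
    conn->GetNext(&row, &end_of_sequence);
    if (end_of_sequence) break;          // iterator-style termination
    std::cout << row[0] << ", " << row[1] << "\n";
  }
  conn->Close();                         // must be called even if Open() failed
  return 0;
}
```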
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_THREADPOOL_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_THREADPOOL_DATASET_OP_H_ + +#include + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/platform/platform.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = + "MaxIntraOpParallelismDataset"; + static constexpr const char* const kDatasetOp = + "MaxIntraOpParallelismDatasetOp"; + + // Executes the logic of the MaxIntraOpParallelismDatasetOp directly (as + // opposed to through executing the MaxIntraOpParallelismDatasetOp op kernel). + static void MakeDatasetFromOptions(OpKernelContext* ctx, DatasetBase* input, + int32_t max_intra_op_parallelism, + DatasetBase** output); + + explicit MaxIntraOpParallelismDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "PrivateThreadPoolDataset"; + static constexpr const char* const kDatasetOp = "PrivateThreadPoolDatasetOp"; + + // Executes the logic of the PrivateThreadpoolDatasetOp directly (as + // opposed to through executing the PrivateThreadpoolDatasetOp op kernel). + static void MakeDatasetFromOptions(OpKernelContext* ctx, DatasetBase* input, + int32_t num_threads, DatasetBase** output); + + explicit PrivateThreadPoolDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_THREADPOOL_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/unique_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/unique_dataset_op.h new file mode 100644 index 00000000..2d415816 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/experimental/unique_dataset_op.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_UNIQUE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_UNIQUE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class UniqueDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Unique"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit UniqueDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_UNIQUE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/filter_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/filter_dataset_op.h new file mode 100644 index 00000000..59c5bcc1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/filter_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
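UniqueDatasetOp above is the leanest instance of the structure that nearly every header in this patch repeats: attribute-name constants, a constructor taking OpKernelConstruction*, a MakeDataset override that turns the input DatasetBase into a new one, and a privately nested Dataset class. The framework-free sketch below mirrors that shape with invented types (the real base classes live in tensorflow/core/framework/dataset.h); only the structure, not the kernel machinery, is reproduced:

```cpp
// dataset_op_shape_sketch.cc -- framework-free rendering of the recurring
// UnaryDatasetOpKernel pattern. All names here are invented for illustration.
#include <iostream>
#include <memory>
#include <unordered_set>
#include <vector>

struct DatasetBase {                      // stand-in for tensorflow::data::DatasetBase
  virtual ~DatasetBase() = default;
  virtual std::vector<int> Elements() const = 0;
};

class UniqueLikeOp {                      // stand-in for UniqueDatasetOp
 public:
  static constexpr const char* const kDatasetType = "Unique";

  // Mirrors MakeDataset(ctx, input, output): wraps the input dataset in a
  // new dataset that yields only values not seen before.
  void MakeDataset(const DatasetBase* input,
                   std::unique_ptr<DatasetBase>* output) const {
    output->reset(new Dataset(input));
  }

 private:
  class Dataset : public DatasetBase {    // the privately nested Dataset class
   public:
    explicit Dataset(const DatasetBase* input) : input_(input) {}
    std::vector<int> Elements() const override {
      std::vector<int> out;
      std::unordered_set<int> seen;
      for (int v : input_->Elements())
        if (seen.insert(v).second) out.push_back(v);
      return out;
    }
   private:
    const DatasetBase* input_;
  };
};

struct SampleDataset : DatasetBase {
  std::vector<int> Elements() const override { return {1, 1, 2, 2, 3, 1}; }
};

int main() {
  SampleDataset input;
  std::unique_ptr<DatasetBase> output;
  UniqueLikeOp().MakeDataset(&input, &output);
  for (int v : output->Elements()) std::cout << v << " ";  // prints: 1 2 3
  std::cout << "\n";
  return 0;
}
```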
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_FILTER_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_FILTER_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class FilterDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Filter"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kPredicate = "predicate"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit FilterDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + std::shared_ptr func_metadata_ = nullptr; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_FILTER_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/finalize_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/finalize_dataset_op.h new file mode 100644 index 00000000..4b2ef22b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/finalize_dataset_op.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_FINALIZE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_FINALIZE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +class FinalizeDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Finalize"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kHasCapturedRef = "has_captured_ref"; + + explicit FinalizeDatasetOp(OpKernelConstruction* ctx); + + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + bool has_captured_ref_; +}; + +class FinalizeDatasetNoopOp : public UnaryDatasetOpKernel { + public: + explicit FinalizeDatasetNoopOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override { + LOG(WARNING) << "FinalizeDataset is only supported on CPU. 
Using it on " + "devices other than CPU has no effect."; + input->Ref(); + *output = input; + } +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_FINALIZE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/fixed_length_record_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/fixed_length_record_dataset_op.h new file mode 100644 index 00000000..30b62031 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/fixed_length_record_dataset_op.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_FIXED_LENGTH_RECORD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_FIXED_LENGTH_RECORD_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class FixedLengthRecordDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "FixedLengthRecord"; + static constexpr const char* const kFileNames = "filenames"; + static constexpr const char* const kHeaderBytes = "header_bytes"; + static constexpr const char* const kRecordBytes = "record_bytes"; + static constexpr const char* const kFooterBytes = "footer_bytes"; + static constexpr const char* const kBufferSize = "buffer_size"; + static constexpr const char* const kCompressionType = "compression_type"; + + explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_FIXED_LENGTH_RECORD_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/flat_map_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/flat_map_dataset_op.h new file mode 100644 index 00000000..6b370757 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/flat_map_dataset_op.h @@ -0,0 +1,51 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
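FinalizeDatasetNoopOp above shows the reference-counted pass-through used when FinalizeDataset runs on a non-CPU device: it logs the warning, takes a reference on the input (input->Ref()) and returns the same object as the output. The standalone analogue below reproduces that ownership handshake with a simplified RefCounted stand-in; none of it is TensorFlow API:

```cpp
// passthrough_sketch.cc -- standalone analogue of the no-op MakeDataset above
// (input->Ref(); *output = input;). RefCounted is a simplified stand-in for
// TensorFlow's reference-counted base class.
#include <cassert>
#include <iostream>

class RefCounted {
 public:
  virtual ~RefCounted() = default;
  void Ref() { ++refs_; }
  void Unref() {
    if (--refs_ == 0) delete this;
  }

 private:
  int refs_ = 1;  // the creator starts with one reference
};

class Dataset : public RefCounted {
 public:
  ~Dataset() override { std::cout << "dataset destroyed\n"; }
};

// The no-op kernel's MakeDataset: the output aliases the input, so a
// reference must be taken on the caller's behalf before handing it back.
void MakeDatasetNoop(Dataset* input, Dataset** output) {
  input->Ref();
  *output = input;
}

int main() {
  Dataset* input = new Dataset;      // refcount = 1
  Dataset* output = nullptr;
  MakeDatasetNoop(input, &output);   // refcount = 2; output aliases input
  assert(output == input);
  output->Unref();                   // consumer releases its view
  input->Unref();                    // original owner releases; object destroyed
  return 0;
}
```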
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_FLAT_MAP_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_FLAT_MAP_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class FlatMapDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "FlatMap"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit FlatMapDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int graph_def_version_; + DataTypeVector output_types_; + std::vector output_shapes_; + std::shared_ptr func_metadata_ = nullptr; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_FLAT_MAP_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/generator_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/generator_dataset_op.h new file mode 100644 index 00000000..b734e9a6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/generator_dataset_op.h @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class GeneratorDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Generator"; + static constexpr const char* const kInitFuncOtherArgs = + "init_func_other_args"; + static constexpr const char* const kNextFuncOtherArgs = + "next_func_other_args"; + static constexpr const char* const kFinalizeFuncOtherArgs = + "finalize_func_other_args"; + static constexpr const char* const kInitFunc = "init_func"; + static constexpr const char* const kNextFunc = "next_func"; + static constexpr const char* const kFinalizeFunc = "finalize_func"; + static constexpr const char* const kTinitFuncArgs = "Tinit_func_args"; + static constexpr const char* const kTnextFuncArgs = "Tnext_func_args"; + static constexpr const char* const kTfinalizeFuncArgs = "Tfinalize_func_args"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit GeneratorDatasetOp(OpKernelConstruction* ctx); + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + + DataTypeVector output_types_; + std::vector<PartialTensorShape> output_shapes_; + std::shared_ptr<FunctionMetadata> init_func_metadata_ = nullptr; + std::shared_ptr<FunctionMetadata> next_func_metadata_ = nullptr; + std::shared_ptr<FunctionMetadata> finalize_func_metadata_ = nullptr; +}; + +} // namespace data +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/get_options_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/get_options_op.h new file mode 100644 index 00000000..3e6611cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/get_options_op.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_GET_OPTIONS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_GET_OPTIONS_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +// TODO(jsimsa): Provide class-level documentation for this and the other ops.
+class GetOptionsOp : public OpKernel { + public: + explicit GetOptionsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) final; + + string TraceString(const OpKernelContext& ctx, bool verbose) const override; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_GET_OPTIONS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/interleave_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/interleave_dataset_op.h new file mode 100644 index 00000000..a1300ddd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/interleave_dataset_op.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_INTERLEAVE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_INTERLEAVE_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class InterleaveDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Interleave"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kCycleLength = "cycle_length"; + static constexpr const char* const kBlockLength = "block_length"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit InterleaveDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int graph_def_version_; + DataTypeVector output_types_; + std::vector output_shapes_; + std::shared_ptr func_metadata_ = nullptr; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_INTERLEAVE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/iterator_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/iterator_ops.h new file mode 100644 index 00000000..a2b13411 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/iterator_ops.h @@ -0,0 +1,356 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_ + +#include +#include +#include + +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/data/metric_utils.h" +#include "tensorflow/core/data/tfdataz_metrics.h" +#include "tensorflow/core/data/unbounded_thread_pool.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/refcount.h" + +namespace tensorflow { +namespace data { + +class IteratorResource : public ResourceBase { + public: + IteratorResource(Env* env, const DataTypeVector& output_dtypes, + const std::vector& output_shapes, + std::unique_ptr device_mgr, + std::unique_ptr flib_def, + std::unique_ptr pflr, + FunctionLibraryRuntime* flr); + + ~IteratorResource() override; + + // Gets the next output from the iterator managed by this iterator resource. + // + // If at least one output remains, that output will be stored in + // `*out_tensors` and `false` will be stored in `*end_of_sequence`. + // + // If no more outputs remain, `true` will be stored in `*end_of_sequence`, and + // the content of `*out_tensors` will be undefined. + absl::Status GetNext(OpKernelContext* ctx, std::vector* out_tensors, + bool* end_of_sequence); + + absl::Status GetModelProto(std::string& model_proto); + + // Saves a checkpoint of the state of the iterator through the given `writer`. + absl::Status Save(OpKernelContext* ctx, + ExternalStatePolicy external_state_policy, + IteratorStateWriter* writer); + + // Restores the state of the iterator from a checkpoint created by `Save`. + absl::Status Restore(OpKernelContext* ctx, IteratorStateReader* reader); + + // Creates an iterator for `dataset`, and associates the iterator with this + // iterator resource. + // + // `SetIteratorFromDataset` should be called before calling `GetNext`, `Save`, + // or `Restore`. 
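The comments above spell out the IteratorResource contract: bind an iterator with SetIteratorFromDataset() before calling GetNext(), Save(), or Restore(), and treat `*out_tensors` as undefined once `*end_of_sequence` is true. A minimal sketch of that call order follows; DrainIterator, `resource`, and `dataset` are hypothetical names used only for illustration, not part of iterator_ops.h.

// Sketch only: assumes a ready IteratorResource* created elsewhere and the
// vendored TensorFlow headers above.
absl::Status DrainIterator(OpKernelContext* ctx, IteratorResource* resource,
                           const DatasetBase* dataset) {
  // The iterator must be bound to a dataset before GetNext/Save/Restore.
  TF_RETURN_IF_ERROR(resource->SetIteratorFromDataset(ctx, dataset));
  std::vector<Tensor> out_tensors;
  bool end_of_sequence = false;
  while (!end_of_sequence) {
    out_tensors.clear();
    TF_RETURN_IF_ERROR(resource->GetNext(ctx, &out_tensors, &end_of_sequence));
    // When end_of_sequence is true, out_tensors is undefined and must not be
    // used; otherwise it holds the next element of the dataset.
  }
  return absl::OkStatus();
}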
+ absl::Status SetIteratorFromDataset(OpKernelContext* ctx, + const DatasetBase* dataset); + + string DebugString() const override { return "Iterator resource"; } + + const DataTypeVector& output_dtypes() const { return output_dtypes_; } + + const std::vector& output_shapes() const { + return output_shapes_; + } + + private: + class State { + public: + State(std::shared_ptr flib_def, + std::shared_ptr pflr, + FunctionLibraryRuntime* flr, + std::unique_ptr iterator) + : flib_def_(std::move(flib_def)), + flr_(flr), + pflr_(std::move(pflr)), + function_handle_cache_(std::make_unique(flr)), + iterator_(std::move(iterator)), + + id_registry_(std::make_shared()), + checkpoint_(MemoryCheckpoint::CreateRootCheckpoint(id_registry_)) {} + + ~State() { cancellation_manager_.StartCancel(); } + + std::shared_ptr flib_def() { return flib_def_; } + + FunctionLibraryRuntime* flr() { return flr_; } + + std::shared_ptr pflr() { return pflr_; } + + FunctionHandleCache* function_handle_cache() { + return function_handle_cache_.get(); + } + + ResourceMgr* resource_mgr() { return &resource_mgr_; } + + CancellationManager* cancellation_manager() { + return &cancellation_manager_; + } + + DatasetBaseIterator* iterator() { return iterator_.get(); } + + std::shared_ptr model() { return model_; } + + const MemoryCheckpoint& checkpoint() const { return checkpoint_; } + + DatasetBase* dataset() { return dataset_.get(); } + + // Downcasts the given `IteratorBase` to a `DatasetBaseIterator`, and uses + // it to set the `iterator` and the `dataset` field. + void DowncastAndSetIteratorAndDataset(std::unique_ptr it, + const DatasetBase* dataset); + + // Merges the given checkpoint with the checkpoint of this state. + void MergeCheckpoint(MemoryCheckpoint* other); + + void SetModel(std::shared_ptr model); + + std::shared_ptr id_registry() { + return id_registry_; + } + + private: + std::shared_ptr flib_def_; + FunctionLibraryRuntime* flr_ = nullptr; // not owned + std::shared_ptr pflr_; + std::unique_ptr function_handle_cache_; + ResourceMgr resource_mgr_; + CancellationManager cancellation_manager_; + std::unique_ptr iterator_; + core::RefCountPtr dataset_; + std::shared_ptr id_registry_; + MemoryCheckpoint checkpoint_; + std::shared_ptr model_; + }; + + IteratorMetricsCollector metrics_collector_; + std::shared_ptr tf_dataz_metrics_collector_; + UnboundedThreadPool unbounded_thread_pool_; + + mutex mu_; + const Env& env_; + const std::unique_ptr device_mgr_ TF_GUARDED_BY(mu_); + std::shared_ptr iterator_state_ TF_GUARDED_BY(mu_); + const DataTypeVector output_dtypes_; + const std::vector output_shapes_; +}; + +class IteratorHandleOp : public OpKernel { + public: + explicit IteratorHandleOp(OpKernelConstruction* ctx); + + // The resource is deleted from the resource manager only when it is private + // to kernel. Ideally the resource should be deleted when it is no longer held + // by anyone, but it would break backward compatibility. + ~IteratorHandleOp() override; + + void Compute(OpKernelContext* context) override TF_LOCKS_EXCLUDED(mu_); + + private: + // During the first Compute(), resource is either created or looked up using + // shared_name. In the latter case, the resource found should be verified if + // it is compatible with this op's configuration. The verification may fail in + // cases such as two graphs asking queues of the same shared name to have + // inconsistent capacities. 
+ absl::Status VerifyResource(IteratorResource* resource); + + FunctionLibraryRuntime* CreatePrivateFLR( + OpKernelContext* ctx, std::unique_ptr* device_mgr, + std::unique_ptr* flib_def, + std::unique_ptr* pflr); + + mutex mu_; + ContainerInfo cinfo_; // Written once under mu_ then constant afterwards. + IteratorResource* resource_ TF_GUARDED_BY(mu_) = nullptr; + DataTypeVector output_dtypes_; + std::vector output_shapes_; + const int graph_def_version_; + string name_; +}; + +// Like IteratorHandleOp, but creates handles which are never shared, and does +// not hold a reference to these handles. The latter is important for eager +// execution, since OpKernel instances generally live as long as the program +// running them. +class AnonymousIteratorHandleOp : public AnonymousResourceOp { + public: + explicit AnonymousIteratorHandleOp(OpKernelConstruction* context); + + private: + string name() override; + + absl::Status CreateResource( + OpKernelContext* ctx, std::unique_ptr flib_def, + std::unique_ptr pflr, + FunctionLibraryRuntime* lib, IteratorResource** resource) override; + + DataTypeVector output_dtypes_; + std::vector output_shapes_; + const int graph_def_version_; +}; + +// A hybrid asynchronous-and-synchronous OpKernel with efficient support for +// both modes. +// +// Inherit from this class when the application logic of the kernel (i) is +// implemented synchronously, (ii) must run on a background thread when the +// kernel executes in the inter-op threadpool (typically because it depends on +// inter-op threadpool threads, e.g. for function execution), and (iii) can run +// synchronously on the calling thread when the caller donates a thread +// (typically in eager execution). The implementation avoids a thread-hop in +// case (iii). +// +// NOTE: Unlike typical OpKernel subclasses, the application logic is +// implemented in a method (DoCompute()) that returns Status. Use +// TF_RETURN_IF_ERROR for error-related control flow rather than +// OP_REQUIRES_OK(). 
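As a purely illustrative reading of the note above, a subclass of the HybridAsyncOpKernel class declared just below keeps its logic in DoCompute() and propagates failures by returning a non-OK Status; the class name and background worker name here are invented for the sketch.

// Sketch only: a hypothetical subclass following the DoCompute() contract
// described above; not part of iterator_ops.h.
class ExampleHybridOp : public HybridAsyncOpKernel {
 public:
  explicit ExampleHybridOp(OpKernelConstruction* ctx)
      : HybridAsyncOpKernel(ctx, /*background_worker_name=*/"tf_data_example") {}

 protected:
  absl::Status DoCompute(OpKernelContext* ctx) override {
    // Errors are returned (explicit returns or TF_RETURN_IF_ERROR) rather than
    // reported through OP_REQUIRES_OK(), as the note above prescribes.
    if (ctx->num_inputs() < 1) {
      return errors::InvalidArgument("ExampleHybridOp expects at least one input");
    }
    return absl::OkStatus();
  }
};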
+class HybridAsyncOpKernel : public AsyncOpKernel { + public: + HybridAsyncOpKernel(OpKernelConstruction* ctx, + const char* background_worker_name); + + void Compute(OpKernelContext* ctx) final; + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) final; + + protected: + virtual absl::Status DoCompute(OpKernelContext* ctx) = 0; + + private: + BackgroundWorker background_worker_; +}; + +class MakeIteratorOp : public HybridAsyncOpKernel { + public: + explicit MakeIteratorOp(OpKernelConstruction* ctx) + : HybridAsyncOpKernel(ctx, "tf_data_make_iterator") {} + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; +}; + +class IteratorGetNextOp : public HybridAsyncOpKernel { + public: + explicit IteratorGetNextOp(OpKernelConstruction* ctx) + : HybridAsyncOpKernel(ctx, "tf_data_iterator_get_next") { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + } + + AsyncOpKernel* AsAsync() override; + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +class IteratorGetModelProtoOp : public HybridAsyncOpKernel { + public: + explicit IteratorGetModelProtoOp(OpKernelConstruction* ctx) + : HybridAsyncOpKernel( + ctx, + /*background_worker_name=*/"tf_data_iterator_get_model_proto") {} + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; +}; + +class DeleteIteratorOp : public HybridAsyncOpKernel { + public: + explicit DeleteIteratorOp(OpKernelConstruction* ctx) + : HybridAsyncOpKernel(ctx, "tf_data_delete_iterator") {} + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; +}; + +class IteratorGetNextAsOptionalOp : public HybridAsyncOpKernel { + public: + explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx) + : HybridAsyncOpKernel(ctx, "tf_data_iterator_get_next_as_optional") { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + } + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +class IteratorToStringHandleOp : public OpKernel { + public: + explicit IteratorToStringHandleOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +class IteratorFromStringHandleOp : public OpKernel { + public: + explicit IteratorFromStringHandleOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_dtypes_; + std::vector output_shapes_; +}; + +class SerializeIteratorOp : public OpKernel { + public: + static constexpr const char* const kExternalStatePolicy = + "external_state_policy"; + + explicit SerializeIteratorOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + ExternalStatePolicy external_state_policy_ = ExternalStatePolicy::POLICY_WARN; +}; + +class DeserializeIteratorOp : public OpKernel { + public: + explicit DeserializeIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_dataset_op.h new file mode 100644 index 
00000000..dff288d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_dataset_op.h @@ -0,0 +1,57 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_MAP_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_MAP_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class MapDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Map"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kUseInterOpParallelism = + "use_inter_op_parallelism"; + static constexpr const char* const kPreserveCardinality = + "preserve_cardinality"; + static constexpr const char* const kForceSynchronous = "force_synchronous"; + + explicit MapDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; + bool preserve_cardinality_; + bool force_synchronous_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_MAP_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_defun_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_defun_op.h new file mode 100644 index 00000000..fc4adde9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/map_defun_op.h @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_MAP_DEFUN_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_MAP_DEFUN_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +// This op runs a given defun on slices of the input arguments. The function +// given by "f" is assumed to be stateless, and is executed concurrently +// on all the slices; up to batch_size (i.e. the 0th dimension of each argument) +// functions will be scheduled at once. +// +// The "max_intra_op_parallelism" attr, which defaults to 1, can be used to +// limit the intra op parallelism. To limit inter-op parallelism, a user +// can set a private threadpool on the dataset using `tf.data.Options`'s +// `ThreadingOptions`. +// +// Note that this op is not exposed to users directly, but is invoked in +// tf.data rewrites. +class MapDefunOp : public AsyncOpKernel { + public: + static constexpr const char* const kArguments = "arguments"; + static constexpr const char* const kCapturedInputs = "captured_inputs"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kTcaptured = "Tcaptured"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kMaxIntraOpParallelism = + "max_intra_op_parallelism"; + + explicit MapDefunOp(OpKernelConstruction* ctx); + + ~MapDefunOp() override = default; + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + private: + struct ComputeOptions; + class MapFunctionCallFrame; + + void SetRunOptions(OpKernelContext* ctx, + FunctionLibraryRuntime::Options* opts, + ComputeOptions* compute_opts, bool always_collect_stats); + + // Get inputs to Compute and check that they are valid. + absl::Status SetupArgs(OpKernelContext* ctx, ComputeOptions** compute_opts); + + absl::Status SetupOutputs(OpKernelContext* ctx, ComputeOptions* opts); + + FunctionLibraryRuntime::Handle func_handle_; + std::vector output_shapes_; + // If this value is positive, limit the max intra op parallelism when the + // function is run on slices of the input. + int max_intra_op_parallelism_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_MAP_DEFUN_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/model_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/model_dataset_op.h new file mode 100644 index 00000000..a6198414 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/model_dataset_op.h @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
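The MapDefunOp comment above describes the op's semantics: the function f is applied independently to each slice along dimension 0 of the arguments, with up to batch_size invocations scheduled at once and intra-op parallelism per invocation capped by max_intra_op_parallelism. As a rough picture only, here is a plain-C++ analogy of "run f on slices of the arguments"; it is not TensorFlow API and the names are made up.

// Sketch only: each argument's outer vector plays the role of dimension 0.
#include <functional>
#include <vector>

template <typename In, typename Out>
std::vector<Out> MapOverDim0(const std::vector<In>& arguments,
                             const std::function<Out(const In&)>& f) {
  std::vector<Out> outputs;
  outputs.reserve(arguments.size());  // one output per slice (batch_size slices)
  for (const In& slice : arguments) {
    // MapDefunOp would schedule up to batch_size of these calls concurrently.
    outputs.push_back(f(slice));
  }
  return outputs;
}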
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_MODEL_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_MODEL_DATASET_OP_H_ + +#include "tensorflow/core/platform/platform.h" + +// On mobile we do not provide model dataset op because not all of its +// dependencies are available there. The op is replaced with a no-op. +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/model.h" + +namespace tensorflow { +namespace data { + +class ModelDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ModelDataset"; + static constexpr const char* const kDatasetOp = "ModelDatasetOp"; + static constexpr const char* const kAlgorithm = "algorithm"; + static constexpr const char* const kCpuBudget = "cpu_budget"; + static constexpr const char* const kRamBudget = "ram_budget"; + + // Executes the logic of the ModelDatasetOp directly (as opposed to through + // executing the ModelDatasetOp op kernel). + static void MakeDatasetFromOptions(OpKernelContext* ctx, DatasetBase* input, + model::AutotuneAlgorithm algorithm, + int64_t cpu_budget, int64_t ram_budget, + DatasetBase** output); + + explicit ModelDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + + model::AutotuneAlgorithm algorithm_; + int64_t cpu_budget_; + int64_t ram_budget_; +}; + +} // namespace data +} // namespace tensorflow +#else // !IS_MOBILE_PLATFORM +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ModelDatasetOp : public UnaryDatasetOpKernel { + public: + // Creates and returns a ModelDatasetOp::Dataset in output, given the + // input, algorithm, cpu_budget and ram_budget parameters. This method is used + // to create the dataset without explicitly using the ModelDatasetOp. + static void MakeDatasetFromOptions(OpKernelContext* ctx, DatasetBase* input, + model::AutotuneAlgorithm algorithm, + bool cpu_budget, bool ram_budget, + DatasetBase** output); + + explicit ModelDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; +}; + +} // namespace data +} // namespace tensorflow +#endif // !IS_MOBILE_PLATFORM + +#endif // TENSORFLOW_CORE_KERNELS_DATA_MODEL_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/optimize_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optimize_dataset_op.h new file mode 100644 index 00000000..1824fc5a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optimize_dataset_op.h @@ -0,0 +1,97 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_OPTIMIZE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_OPTIMIZE_DATASET_OP_H_ + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/platform/platform.h" + +// On mobile we do not provide optimize dataset op because not all of its +// dependencies are available there. The op is replaced with a no-op. +#if !defined(IS_MOBILE_PLATFORM) +namespace tensorflow { +namespace data { + +class OptimizeDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Optimize"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOptimizations = "optimizations"; + static constexpr const char* const kOptimizationsEnabled = + "optimizations_enabled"; + static constexpr const char* const kOptimizationsDisabled = + "optimizations_disabled"; + static constexpr const char* const kOptimizationsDefault = + "optimizations_default"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kOptimizationConfigs = + "optimization_configs"; + static constexpr const char* const kOptimizeDatasetV1 = "OptimizeDataset"; + static constexpr const char* const kOptimizeDatasetV2 = "OptimizeDatasetV2"; + + // Creates and returns a OptimizeDatasetOp::Dataset in output, given the + // default optimizations and those that are enabled, disabled. This method is + // used to create the dataset without explicitly using the OptimizeDatasetOp. + static void MakeDatasetFromOptions( + OpKernelContext* ctx, DatasetBase* input, + const absl::flat_hash_set& optimizations_enabled, + const absl::flat_hash_set& optimizations_disabled, + const absl::flat_hash_set& optimizations_default, + const absl::flat_hash_set& optimization_configs, + DatasetBase** output); + + explicit OptimizeDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + absl::flat_hash_set optimization_configs_; + int op_version_ = 0; +}; + +} // namespace data +} // namespace tensorflow +#else // !IS_MOBILE_PLATFORM +namespace tensorflow { +namespace data { + +class OptimizeDatasetOp : public UnaryDatasetOpKernel { + public: + // Executes the logic of the OptimizeDatasetOp directly (as opposed to through + // executing the OptimizeDatasetOp op kernel). 
+ static void MakeDatasetFromOptions( + OpKernelContext* ctx, DatasetBase* input, + const absl::flat_hash_set& optimizations_enabled, + const absl::flat_hash_set& optimizations_disabled, + const absl::flat_hash_set& optimizations_default, + const absl::flat_hash_set& optimization_configs, + DatasetBase** output); + + explicit OptimizeDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; +}; + +} // namespace data +} // namespace tensorflow +#endif // !IS_MOBILE_PLATFORM + +#endif // TENSORFLOW_CORE_KERNELS_DATA_OPTIMIZE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops.h new file mode 100644 index 00000000..8006b00b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops.h @@ -0,0 +1,94 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/kernels/data/optional_ops_util.h" +#include "tensorflow/core/util/tensor_ops_util.h" + +namespace tensorflow { +namespace data { + +// Stores a DT_VARIANT value representing an Optional with the given value +// in the `output_index`^th output of the given kernel execution context. +absl::Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, + int output_index, + std::vector value); + +// Stores a DT_VARIANT value representing an Optional with no value +// in the `output_index`^th output of the given kernel execution context. 
+absl::Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index); + +template +absl::Status OptionalZerosLike(OpKernelContext* ctx, const OptionalVariant& x, + OptionalVariant* y) { + return OptionalZerosLike(ctx, x, y, ZerosLikeTensor); +} + +template +absl::Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a, + const OptionalVariant& b, OptionalVariant* out) { + return OptionalBinaryAdd(ctx, a, b, out, BinaryAddTensors); +} + +class OptionalNoneOp : public OpKernel { + public: + explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +class OptionalFromValueOp : public OpKernel { + public: + explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +class OptionalHasValueOp : public OpKernel { + public: + explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +class OptionalGetValueOp : public OpKernel { + public: + explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES( + ctx, output_shapes_.size() == output_types_.size(), + errors::InvalidArgument( + "output_types and output_shapes must be same length, got:\n", + "output_types: ", output_types_.size(), "\n", + "output_shapes: ", output_shapes_.size())); + } + + void Compute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops_util.h new file mode 100644 index 00000000..3ee3742f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/optional_ops_util.h @@ -0,0 +1,117 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_UTIL_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/util/tensor_ops_util.h" + +namespace tensorflow { +namespace data { + +const char kOptionalVariantTypeName[] = "tensorflow::data::Optional"; + +// An `OptionalVariant` can represent either an "actual value" (a tuple of +// tensors) or "none", and may be stored in a DT_VARIANT tensor. +class OptionalVariant { + public: + // Create an `OptionalVariant` with no actual value. 
+ OptionalVariant() : values_(nullptr) {} + + // Create an `OptionalVariant` with the actual value given by the tuple of + // tensors in `values`. + explicit OptionalVariant(std::vector values) { + values_ = std::make_shared>(std::move(values)); + } + + OptionalVariant(const OptionalVariant& other) : values_(other.values_) {} + + // Returns true if `this` represents an actual value. + bool has_value() const { return values_ != nullptr; } + + // REQUIRES: `this->has_value()` must be true. + const std::vector& get_values() const { + DCHECK(values_) << "Tried to get values from an empty OptionalVariant"; + return *values_; + } + + // Implementations of the necessary methods for using `OptionalVariant` + // objects in DT_VARIANT tensors. + string TypeName() const { return kOptionalVariantTypeName; } + void Encode(VariantTensorData* data) const { + data->set_metadata(values_ != nullptr); + if (values_ != nullptr) { + for (const auto& t : *values_) { + *(data->add_tensors()) = t; + } + } + } + + bool Decode(const VariantTensorData& data) { + if (data.type_name() != TypeName()) { + return false; + } + bool has_value = false; + if (!data.get_metadata(&has_value)) { + return false; + } + if (has_value) { + values_ = std::make_shared>(data.tensors()); + } else { + values_.reset(); + } + return true; + } + + string DebugString() const { + if (values_) { + return strings::StrCat("OptionalVariant<", "values: (", + absl::StrJoin(*values_, ", ", + [](string* s, const Tensor& elem) { + *s = elem.DebugString(); + }), + ")>"); + } else { + return strings::StrCat("OptionalVariant"); + } + } + + private: + std::shared_ptr> values_; +}; + +absl::Status OptionalZerosLike( + OpKernelContext* ctx, const OptionalVariant& x, OptionalVariant* y, + std::function + zeros_like_func); + +absl::Status OptionalBinaryAdd( + OpKernelContext* ctx, const OptionalVariant& a, const OptionalVariant& b, + OptionalVariant* out, + std::function + binary_add_func); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_OPTIONAL_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/options_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/options_dataset_op.h new file mode 100644 index 00000000..024ae757 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/options_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_OPTIONS_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_OPTIONS_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +// TODO(jsimsa): Provide class-level documentation for this and the other ops. 
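OptionalVariant, defined above, is built to live inside DT_VARIANT tensors via its TypeName()/Encode()/Decode() members. The following round-trip is a sketch under that assumption; the function name is invented and it presumes the enclosing tensorflow::data namespace plus the vendored headers above.

// Sketch only: store an OptionalVariant in a DT_VARIANT scalar tensor and
// read it back through the Variant accessor.
void OptionalVariantRoundTrip() {
  Tensor value(DT_FLOAT, TensorShape({}));
  value.scalar<float>()() = 1.0f;

  OptionalVariant with_value(std::vector<Tensor>{value});  // has_value() == true

  Tensor variant_tensor(DT_VARIANT, TensorShape({}));
  variant_tensor.scalar<Variant>()() = with_value;

  const OptionalVariant* read =
      variant_tensor.scalar<Variant>()().get<OptionalVariant>();
  if (read != nullptr && read->has_value()) {
    const Tensor& restored = read->get_values()[0];  // the float scalar above
    (void)restored;
  }
}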
+class OptionsDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Options"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kSerializedOptions = "serialized_options"; + + explicit OptionsDatasetOp(OpKernelConstruction* ctx); + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + tstring serialized_options_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_OPTIONS_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/padded_batch_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/padded_batch_dataset_op.h new file mode 100644 index 00000000..474587db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/padded_batch_dataset_op.h @@ -0,0 +1,51 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PADDED_BATCH_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PADDED_BATCH_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "PaddedBatch"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kBatchSize = "batch_size"; + static constexpr const char* const kPaddedShapes = "padded_shapes"; + static constexpr const char* const kPaddingValues = "padding_values"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kParallelCopy = "parallel_copy"; + static constexpr const char* const kToutputTypes = "Toutput_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kNumPaddedShapes = "N"; + + explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; + bool parallel_copy_ = false; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PADDED_BATCH_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_batch_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_batch_dataset_op.h new file mode 100644 index 00000000..219dc73c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_batch_dataset_op.h @@ -0,0 +1,51 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_BATCH_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_BATCH_DATASET_OP_H_ + +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ParallelBatchDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ParallelBatch"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kBatchSize = "batch_size"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kParallelCopy = "parallel_copy"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kDeterministic = "deterministic"; + + explicit ParallelBatchDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + DeterminismPolicy deterministic_; + bool parallel_copy_ = false; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_BATCH_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_filter_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_filter_dataset_op.h new file mode 100644 index 00000000..48b1bda1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_filter_dataset_op.h @@ -0,0 +1,52 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_FILTER_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_FILTER_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ParallelFilterDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ParallelFilter"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kPredicate = "predicate"; + static constexpr const char* const kDeterministic = "deterministic"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit ParallelFilterDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + DeterminismPolicy deterministic_; + std::shared_ptr func_metadata_ = nullptr; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_FILTER_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_interleave_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_interleave_dataset_op.h new file mode 100644 index 00000000..be46a360 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_interleave_dataset_op.h @@ -0,0 +1,62 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_INTERLEAVE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_INTERLEAVE_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ParallelInterleave"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kCycleLength = "cycle_length"; + static constexpr const char* const kBlockLength = "block_length"; + static constexpr const char* const kBufferOutputElements = + "buffer_output_elements"; + static constexpr const char* const kPrefetchInputElements = + "prefetch_input_elements"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kDeterministic = "deterministic"; + static constexpr const char* const kSloppy = "sloppy"; + + explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; + DeterminismPolicy deterministic_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_INTERLEAVE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_map_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_map_dataset_op.h new file mode 100644 index 00000000..efdf6339 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/parallel_map_dataset_op.h @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_MAP_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_MAP_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ParallelMapDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ParallelMap"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kOtherArguments = "other_arguments"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kFunc = "f"; + static constexpr const char* const kTarguments = "Targuments"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kUseInterOpParallelism = + "use_inter_op_parallelism"; + static constexpr const char* const kDeterministic = "deterministic"; + static constexpr const char* const kSloppy = "sloppy"; + static constexpr const char* const kPreserveCardinality = + "preserve_cardinality"; + static constexpr const char* const kUseUnboundedThreadpool = + "use_unbounded_threadpool"; + + explicit ParallelMapDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + const int op_version_; + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; + bool sloppy_; + bool preserve_cardinality_; + DeterminismPolicy deterministic_; + bool use_unbounded_threadpool_; + + friend std::unique_ptr MakeDataServiceUncompressDataset( + DatasetBase* input, std::unique_ptr captured_function, + const DataTypeVector& output_types, + const std::vector& output_shapes); +}; + +// Used by tf.data service to create a map dataset for uncompression. +std::unique_ptr MakeDataServiceUncompressDataset( + DatasetBase* input, std::unique_ptr captured_function, + const DataTypeVector& output_types, + const std::vector& output_shapes); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PARALLEL_MAP_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_autotuner.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_autotuner.h new file mode 100644 index 00000000..a06eb60f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_autotuner.h @@ -0,0 +1,86 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace data { + +// PrefetchAutotuner dynamically adjusts the buffer size of a prefetch iterator. +// +// PrefetchAutotuner attempts to find the minimum buffer size such that there is +// always at least 1 element in the prefetch queue every time the downstream +// iterator calls GetNext(). +// +// One common failure mode of input pipelines is being throughput bound. No +// amount of prefetching can address that performance mode. In order to guard +// against this condition, PrefetchAutotuner will only increase the buffer_limit +// if the prefetching thread is able to successfully fill the buffer at its +// current size. +// +// Note: in the current implementation, we never decrease the buffer_limit(). +// This should change in the future! +// +// PrefetchAutotuner is NOT thread safe. +class PrefetchAutotuner { + public: + explicit PrefetchAutotuner( + int64_t initial_buffer_size, int64_t buffer_size_min, + std::shared_ptr ram_budget_manager); + + int64_t buffer_limit() const { return buffer_limit_; } + + // Reports whether the element size has been set. + bool HasElementSize() const { return element_size_bytes_.has_value(); } + // Sets the element size to use for predicting memory usage. Element size must + // be set before the autotuner can increase the buffer size. + void SetElementSize(int64_t element_size_bytes); + void RecordConsumption(size_t current_buffer_size); + void RecordEmpty() { RecordConsumption(0); } + + private: + // PrefetchAutotuner operates as a state machine. + enum class Mode { + // Disables the autotuning. + kDisabled, + + // We have increased the size of the buffer, and will transition to + // kDownswing if we successfully fill the buffer. + kUpswing, + + // We have successfully filled a buffer of this size. If we ever block the + // downstream iterator, we should increase the buffer size. + kDownswing, + }; + + int64_t buffer_limit_; + // Estimated per-element size. + std::optional element_size_bytes_; + Mode mode_ = Mode::kDisabled; + std::shared_ptr ram_budget_manager_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_dataset_op.h new file mode 100644 index 00000000..e193e75e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/prefetch_dataset_op.h @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
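The comment block above frames PrefetchAutotuner as a state machine: kUpswing waits for the prefetch thread to fill the buffer at the current limit, and kDownswing grows the limit only once the consumer actually finds the buffer empty. The standalone caricature below is one plausible reading of that policy, not the real implementation; the growth factor and the omission of the RAM budget manager and element size are assumptions of the sketch.

// Sketch only: simplified upswing/downswing policy.
#include <cstddef>
#include <cstdint>

namespace sketch {

enum class Mode { kDisabled, kUpswing, kDownswing };

struct Autotuner {
  int64_t buffer_limit = 1;
  Mode mode = Mode::kUpswing;

  void RecordConsumption(size_t current_buffer_size) {
    switch (mode) {
      case Mode::kDisabled:
        return;
      case Mode::kUpswing:
        // The buffer reached the current limit: prefetching keeps up, so wait
        // for evidence that the consumer is being starved.
        if (static_cast<int64_t>(current_buffer_size) == buffer_limit) {
          mode = Mode::kDownswing;
        }
        return;
      case Mode::kDownswing:
        // The consumer saw an empty buffer: raise the limit and require the
        // larger buffer to be filled before raising it again.
        if (current_buffer_size == 0) {
          buffer_limit *= 2;  // growth policy is an assumption of this sketch
          mode = Mode::kUpswing;
        }
        return;
    }
  }
};

}  // namespace sketch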
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/model.h" +#include "tensorflow/core/kernels/data/prefetch_autotuner.h" + +namespace tensorflow { +namespace data { + +class PrefetchDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Prefetch"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kBufferSize = model::kBufferSize; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kSlackPeriod = "slack_period"; + static constexpr const char* const kLegacyAutotune = "legacy_autotune"; + static constexpr const char* const kBufferSizeMin = "buffer_size_min"; + + explicit PrefetchDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + int64_t slack_period_ = 0; + bool legacy_autotune_ = true; + int64_t buffer_size_min_ = 0; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/random_seed_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/random_seed_ops.h new file mode 100644 index 00000000..f0afa739 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/random_seed_ops.h @@ -0,0 +1,160 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_RANDOM_SEED_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_RANDOM_SEED_OPS_H_ + +#include "tensorflow/core/data/dataset_utils.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" + +namespace tensorflow { +namespace data { + +// Represents a pair of random seeds. By TensorFlow convention, if both seeds +// are 0, then pseudo-random values are used instead. +class RandomSeeds { + public: + RandomSeeds(int64_t seed, int64_t seed2) + : input_seed_(seed), + input_seed2_(seed2), + seed_((seed | seed2) == 0 ? random::New64() : seed), + seed2_((seed | seed2) == 0 ? 
random::New64() : seed2) {} + + int64_t input_seed() const { return input_seed_; } + int64_t input_seed2() const { return input_seed2_; } + int64_t seed() const { return seed_; } + int64_t seed2() const { return seed2_; } + + private: + const int64_t input_seed_; + const int64_t input_seed2_; + const int64_t seed_; + const int64_t seed2_; +}; + +// Base class for seed generator resources. Subclasses customize how seeds are +// generated. +class SeedGenerator { + public: + virtual ~SeedGenerator() {} + + virtual int64_t seed() const = 0; + virtual int64_t seed2() const = 0; + virtual bool reshuffle_each_iteration() const = 0; + + virtual void GenerateSeeds(int64_t* seed1, int64_t* seed2) = 0; + virtual void Reset() = 0; + + virtual int64_t num_random_samples() const { + tf_shared_lock l(mu_); + return num_random_samples_; + } + virtual void set_num_random_samples(int64_t num_random_samples) { + mutex_lock l(mu_); + num_random_samples_ = num_random_samples; + } + + protected: + mutable mutex mu_; + int64_t num_random_samples_ TF_GUARDED_BY(mu_) = 0; +}; + +// A resource wrapping a shared instance of a seed generator. +class SeedGeneratorManager : public ResourceBase { + public: + explicit SeedGeneratorManager(SeedGenerator* seed_generator) + : seed_generator_(seed_generator) {} + + std::string DebugString() const override; + + std::shared_ptr get() { return seed_generator_; } + + private: + std::shared_ptr seed_generator_; +}; + +// Always generates the specified seed values. +class FixedSeedGenerator : public SeedGenerator { + public: + explicit FixedSeedGenerator(RandomSeeds seeds) : seeds_(std::move(seeds)) {} + + int64_t seed() const override { return seeds_.seed(); } + int64_t seed2() const override { return seeds_.seed(); } + bool reshuffle_each_iteration() const override { return false; } + + void GenerateSeeds(int64_t* seed1, int64_t* seed2) override; + void Reset() override {} + + private: + const RandomSeeds seeds_; +}; + +// Generates different (but deterministically chosen) seed values. +class RandomSeedGenerator : public SeedGenerator { + public: + explicit RandomSeedGenerator(RandomSeeds seeds) + : seeds_(std::move(seeds)), + parent_generator_(seeds_.seed(), seeds_.seed2()), + generator_(&parent_generator_) {} + + int64_t seed() const override { return seeds_.seed(); } + int64_t seed2() const override { return seeds_.seed2(); } + bool reshuffle_each_iteration() const override { return true; } + + void GenerateSeeds(int64_t* seed1, int64_t* seed2) override; + void Reset() override; + + private: + const RandomSeeds seeds_; + random::PhiloxRandom parent_generator_ TF_GUARDED_BY(mu_); + random::SingleSampleAdapter generator_ + TF_GUARDED_BY(mu_); +}; + +// Creates an instance of seed generator resource and transfers ownership +// to the caller. +class AnonymousSeedGeneratorHandleOp + : public AnonymousResourceOp { + public: + explicit AnonymousSeedGeneratorHandleOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + string name() override; + absl::Status CreateResource( + OpKernelContext* ctx, std::unique_ptr flib_def, + std::unique_ptr pflr, + FunctionLibraryRuntime* lib, SeedGeneratorManager** manager) override; + + mutex mu_; + std::unique_ptr seeds_ TF_GUARDED_BY(mu_); + bool reshuffle_; +}; + +// Deletes an instance of seed generator resource. 
+class DeleteSeedGeneratorOp : public OpKernel { + public: + explicit DeleteSeedGeneratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_RANDOM_SEED_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/range_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/range_dataset_op.h new file mode 100644 index 00000000..687f2eb6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/range_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_RANGE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_RANGE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class RangeDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Range"; + static constexpr const char* const kStart = "start"; + static constexpr const char* const kStop = "stop"; + static constexpr const char* const kStep = "step"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kReplicateOnSplit = "replicate_on_split"; + + explicit RangeDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + class RangeSplitProvider; + DataTypeVector output_types_; + bool replicate_on_split_ = false; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_RANGE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/reduce_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/reduce_dataset_op.h new file mode 100644 index 00000000..73e18144 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/reduce_dataset_op.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_REDUCE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_REDUCE_DATASET_OP_H_ + +#include "tensorflow/core/data/captured_function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" + +namespace tensorflow { +namespace data { + +class ReduceDatasetOp : public HybridAsyncOpKernel { + public: + explicit ReduceDatasetOp(OpKernelConstruction* ctx); + + protected: + absl::Status DoCompute(OpKernelContext* ctx) override; + + std::shared_ptr func_metadata_ = nullptr; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_REDUCE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/repeat_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/repeat_dataset_op.h new file mode 100644 index 00000000..81d534f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/repeat_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_REPEAT_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_REPEAT_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class RepeatDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Repeat"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kCount = "count"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit RepeatDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_REPEAT_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/rewrite_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/rewrite_dataset_op.h new file mode 100644 index 00000000..cd9b34b4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/rewrite_dataset_op.h @@ -0,0 +1,41 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_REWRITE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_REWRITE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class RewriteDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Rewrite"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kRewriteName = "rewrite_name"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit RewriteDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_REWRITE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/shard_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/shard_dataset_op.h new file mode 100644 index 00000000..acdf171a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/shard_dataset_op.h @@ -0,0 +1,47 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_SHARD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_SHARD_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ShardDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Shard"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kNumShards = "num_shards"; + static constexpr const char* const kIndex = "index"; + static constexpr const char* const kRequireNonEmpty = "require_non_empty"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit ShardDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + bool require_non_empty_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_SHARD_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/shuffle_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/shuffle_dataset_op.h new file mode 100644 index 00000000..f33f75c8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/shuffle_dataset_op.h @@ -0,0 +1,79 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_SHUFFLE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_SHUFFLE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ShuffleDatasetOpBase : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kBufferSize = "buffer_size"; + static constexpr const char* const kSeed = "seed"; + static constexpr const char* const kSeed2 = "seed2"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kReshuffleEachIteration = + "reshuffle_each_iteration"; + + explicit ShuffleDatasetOpBase(OpKernelConstruction* ctx); + + protected: + class ShuffleDatasetBase; +}; + +class ShuffleDatasetOp : public ShuffleDatasetOpBase { + public: + static constexpr const char* const kDatasetType = "Shuffle"; + + explicit ShuffleDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + class DatasetV2; + class DatasetV3; + int op_version_ = 0; + bool reshuffle_each_iteration_ = true; +}; + +class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase { + public: + static constexpr const char* const kDatasetType = "ShuffleAndRepeat"; + static constexpr const char* const kCount = "count"; + + explicit ShuffleAndRepeatDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + class DatasetV2; + int op_version_ = 0; + bool reshuffle_each_iteration_ = true; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_SHUFFLE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/skip_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/skip_dataset_op.h new file mode 100644 index 00000000..6e22d7af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/skip_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_SKIP_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_SKIP_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class SkipDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Skip"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kCount = "count"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit SkipDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_SKIP_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/take_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/take_dataset_op.h new file mode 100644 index 00000000..de51d6a4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/take_dataset_op.h @@ -0,0 +1,90 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace data { + +class TakeDataset : public DatasetBase { + public: + TakeDataset(OpKernelContext* ctx, int64_t count, const DatasetBase* input); + + TakeDataset(DatasetContext::Params params, int64_t count, + const DatasetBase* input); + + ~TakeDataset() override; + + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override; + + const DataTypeVector& output_dtypes() const override; + + const std::vector& output_shapes() const override; + + string DebugString() const override; + + int64_t CardinalityInternal(CardinalityOptions options) const override; + + absl::Status InputDatasets( + std::vector* inputs) const override; + + absl::Status Get(OpKernelContext* ctx, int64 index, + std::vector* out_tensors) const override; + + absl::Status CheckExternalState() const override; + + absl::Status RandomIndexingCompatible() const override; + + protected: + absl::Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override; + + private: + class EmptyIterator; + class FiniteIterator; + const int64_t count_; + const DatasetBase* const input_; + absl::Status random_indexing_compatible_; +}; + +class TakeDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Take"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kCount = "count"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit TakeDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_dataset_op.h new file mode 100644 index 00000000..dcd738e9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_TENSOR_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_TENSOR_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class TensorDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Tensor"; + static constexpr const char* const kComponents = "components"; + static constexpr const char* const kToutput_types = "Toutput_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit TensorDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_TENSOR_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_slice_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_slice_dataset_op.h new file mode 100644 index 00000000..c2ddbaf1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tensor_slice_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_TENSOR_SLICE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_TENSOR_SLICE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class TensorSliceDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "TensorSlice"; + static constexpr const char* const kComponents = "components"; + static constexpr const char* const kToutputTypes = "Toutput_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kIsFiles = "is_files"; + static constexpr const char* const kReplicateOnSplit = "replicate_on_split"; + + explicit TensorSliceDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector output_types_; + std::vector output_shapes_; + bool is_files_ = false; + bool replicate_on_split_ = false; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_TENSOR_SLICE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/text_line_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/text_line_dataset_op.h new file mode 100644 index 00000000..3621b57a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/text_line_dataset_op.h @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_TEXT_LINE_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_TEXT_LINE_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class TextLineDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "TextLine"; + static constexpr const char* const kFileNames = "filenames"; + static constexpr const char* const kCompressionType = "compression_type"; + static constexpr const char* const kBufferSize = "buffer_size"; + + explicit TextLineDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_TEXT_LINE_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/tf_record_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tf_record_dataset_op.h new file mode 100644 index 00000000..0cfbc667 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/tf_record_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_TF_RECORD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_TF_RECORD_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class TFRecordDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "TFRecord"; + static constexpr const char* const kFileNames = "filenames"; + static constexpr const char* const kCompressionType = "compression_type"; + static constexpr const char* const kBufferSize = "buffer_size"; + static constexpr const char* const kByteOffsets = "byte_offsets"; + + explicit TFRecordDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + int op_version_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_TF_RECORD_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset.h new file mode 100644 index 00000000..17e5b6b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset.h @@ -0,0 +1,52 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_ + +#include + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace data { + +// Creates a dataset representing an eagerly-collected window of elements. +// +// The `elements` argument defines the elements of the resulting +// dataset, which is stored in `out_dataset`. +// +// This dataset is constructed internally for use in datasets that +// build nested dataset expressions (e.g. the reducer function for +// GroupByWindowDataset). It efficiently supports multiple iterators on +// the same window without recomputation. +// +// REQUIRES: `output_types` must match the types of the respective +// element components in `elements`. 
+// REQUIRES: `output_shapes` must be compatible with the shapes of the +// respective element components in `elements`. +absl::Status NewWindow(std::vector<std::vector<Tensor>> elements, + DataTypeVector output_types, + std::vector<PartialTensorShape> output_shapes, + DatasetBase** out_dataset); + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset_op.h new file mode 100644 index 00000000..241e0f51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/window_dataset_op.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_OP_H_ + +#include <vector> + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace data { + +class WindowDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Window"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kSize = "size"; + static constexpr const char* const kShift = "shift"; + static constexpr const char* const kStride = "stride"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit WindowDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data/zip_dataset_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data/zip_dataset_op.h new file mode 100644 index 00000000..1e6b294b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data/zip_dataset_op.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
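Since window_dataset.h above only declares NewWindow(), a hypothetical call site may help make its contract concrete. The sketch below is not code from this patch: the MakeTwoElementWindow helper and the single-scalar-int32 element layout are assumptions chosen so that the output_types and output_shapes arguments line up with the REQUIRES comments above.

// Hypothetical usage sketch of NewWindow(); not part of the vendored headers.
#include <utility>
#include <vector>

#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/data/window_dataset.h"

namespace tensorflow {
namespace data {

absl::Status MakeTwoElementWindow(DatasetBase** out_window) {
  // Each inner vector is one element of the window; here every element has a
  // single scalar int32 component, matching output_types/output_shapes below.
  std::vector<std::vector<Tensor>> elements;
  for (int32_t v : {1, 2}) {
    Tensor t(DT_INT32, TensorShape({}));
    t.scalar<int32_t>()() = v;
    std::vector<Tensor> element;
    element.push_back(std::move(t));
    elements.push_back(std::move(element));
  }
  return NewWindow(std::move(elements),
                   /*output_types=*/{DT_INT32},
                   /*output_shapes=*/{PartialTensorShape({})},
                   out_window);
}

}  // namespace data
}  // namespace tensorflow

Because the resulting dataset holds its elements eagerly in memory, multiple iterators over the same window avoid recomputation, which is the property the header comment calls out.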
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_ZIP_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_ZIP_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace data { + +class ZipDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Zip"; + static constexpr const char* const kInputDatasets = "input_datasets"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kNumInputDatasets = "N"; + + explicit ZipDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_ZIP_DATASET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/data_format_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/data_format_ops.h new file mode 100644 index 00000000..3d4568d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/data_format_ops.h @@ -0,0 +1,113 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_ +// Functor definition for data format dim mapping ops, must be compilable +// by nvcc. +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by DataFormatDimMapOP to do the computations. 
+template +struct DataFormatDimMap { + void operator()(const Device& d, typename TTypes::ConstFlat x, + typename TTypes::Flat y, const TTypes::Vec dst) { + if (dst.size() == 4) { + auto zero = x.constant(0); + auto one = x.constant(1); + auto two = x.constant(2); + + auto f_zero = x.constant(dst(0)); + auto f_one = x.constant(dst(1)); + auto f_two = x.constant(dst(2)); + auto f_three = x.constant(dst(3)); + + auto four = x.constant(4); + auto x_mod = (x + four) % 4; + + auto is_zero = (x_mod == zero); + auto is_one = (x_mod == one); + auto is_two = (x_mod == two); + + y.device(d) = is_zero.select( + f_zero, is_one.select(f_one, is_two.select(f_two, f_three))); + } else { + auto zero = x.constant(0); + auto one = x.constant(1); + auto two = x.constant(2); + auto three = x.constant(3); + + auto f_zero = x.constant(dst(0)); + auto f_one = x.constant(dst(1)); + auto f_two = x.constant(dst(2)); + auto f_three = x.constant(dst(3)); + auto f_four = x.constant(dst(4)); + + auto five = x.constant(5); + auto x_mod = (x + five) % 5; + + auto is_zero = (x_mod == zero); + auto is_one = (x_mod == one); + auto is_two = (x_mod == two); + auto is_three = (x_mod == three); + + y.device(d) = is_zero.select( + f_zero, + is_one.select( + f_one, is_two.select(f_two, is_three.select(f_three, f_four)))); + } + } +}; + +template +struct VecPermute { + explicit VecPermute(const Eigen::DSizes& dst) + : dst(dst) {} + Eigen::DSizes dimensions( + typename TTypes::ConstFlat input) const { + Eigen::DSizes result; + result[0] = input.dimension(0); + return result; + } + template + void eval(typename TTypes::ConstFlat input, Output& output, + const Device& d) const { + for (int i = 0; i < input.size(); ++i) { + output.template chip<0>(dst[i]).device(d) = input.template chip<0>(i); + } + } + + private: + Eigen::DSizes dst; +}; + +// Functor used by DataFormatVecPermuteOp to do the computations. +template +struct DataFormatVecPermute { + void operator()(const Device& d, typename TTypes::ConstFlat x, + typename TTypes::Flat y, + const Eigen::DSizes& dst) { + y.device(d) = x.customOp(VecPermute(dst)); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/debug_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/debug_ops.h new file mode 100644 index 00000000..f417caf2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/debug_ops.h @@ -0,0 +1,959 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
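Element-wise, the 4-D branch of DataFormatDimMap above is a wrap-around table lookup: negative axis indices wrap modulo 4, then the dst table gives the axis position in the destination format. A plain-C++ restatement under stated assumptions (the MapDim4 name is illustrative, and the real functor operates on whole Eigen tensors rather than single indices), using the NHWC-to-NCHW permutation {0, 2, 3, 1} as an example:

// Illustrative only; not part of the vendored header above.
#include <array>
#include <cstdint>
#include <iostream>

int32_t MapDim4(int32_t axis, const std::array<int32_t, 4>& dst) {
  // Same wrap-around as the functor's (x + 4) % 4, followed by the lookup.
  return dst[(axis + 4) % 4];
}

int main() {
  // For src_format "NHWC" and dst_format "NCHW": N->0, H->2, W->3, C->1.
  const std::array<int32_t, 4> nhwc_to_nchw = {0, 2, 3, 1};
  std::cout << MapDim4(1, nhwc_to_nchw) << "\n";   // H in NHWC -> axis 2 in NCHW
  std::cout << MapDim4(-1, nhwc_to_nchw) << "\n";  // C (axis -1) -> axis 1 in NCHW
  return 0;
}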
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/bfloat16.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#include "tensorflow/core/util/determinism.h" +#endif + +#if GOOGLE_CUDA +#include "tensorflow/core/platform/cuda.h" +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +#endif + +#include "tensorflow/core/debug/debug_io_utils.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/util/debug_events_writer.h" + +namespace tensorflow { + +// Copy op for debugging. +// Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the +// device on which the tensor is allocated. +class CopyOp : public OpKernel { + public: + explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); + + std::vector debug_ops_spec; + OP_REQUIRES_OK(context, + context->GetAttr("debug_ops_spec", &debug_ops_spec)); + for (const string& debug_op_spec : debug_ops_spec) { + // Assume debug_op_spec has the format + // ;;, e.g., + // DebugIdentity;grpc://localhost:3333;1 + const std::vector items = str_util::Split(debug_op_spec, ";"); + OP_REQUIRES( + context, items.size() == 3, + errors::Internal( + "Unexpected number of semicolons in debug_ops_spec element: ", + debug_op_spec)); + debug_op_and_url_specs_.push_back( + DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]), + items[1], items[2] == "1")); + } + } + + void Compute(OpKernelContext* context) override { + const Tensor& src_tensor = context->input(0); + + if (src_tensor.IsInitialized() && + DataTypeCanUseMemcpy(src_tensor.dtype()) && + DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) { + // Source tensor is initialized and is mem-copyable. Make a copy. + Tensor* copied_tensor; + OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(), + &copied_tensor)); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + Device* device = static_cast(context->device()); + // Determine if the input tensor is not on CPU (e.g., on GPU). + bool off_host_input = device->device_type() == DEVICE_GPU && + !context->input_alloc_attr(0).on_host(); + + if (off_host_input) { + DeviceContext* device_ctxt = context->op_device_context(); + // Input is not on host: deep-copy it from GPU to the same GPU. + Notification done_copy; + GPUUtil::CopyGPUTensorToSameGPU( + device, device_ctxt, &src_tensor, copied_tensor, + [&done_copy](const Status& s) { done_copy.Notify(); }); + done_copy.WaitForNotification(); + } else { + // The input tensor is on the host (CPU): deep-copy from CPU to CPU. + *copied_tensor = tensor::DeepCopy(src_tensor); + } +#else + *copied_tensor = tensor::DeepCopy(src_tensor); +#endif + } else { + // Source tensor is NOT initialized and/or is not mem-copyable: Forward + // the Tensor object. + context->set_output(0, src_tensor); + } + } + + bool IsExpensive() override { return false; } + + private: + string tensor_name_; + std::vector debug_op_and_url_specs_; +}; + +// Base class of all debug ops. 
+class BaseDebugOp : public OpKernel { + public: + explicit BaseDebugOp(const string& debug_op_name, + OpKernelConstruction* context) + : OpKernel(context), debug_op_name_(debug_op_name) { + OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_)); + OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_)); + + string device_name; + string tensor_name; + OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name)); + OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name)); + + std::vector name_items = str_util::Split(tensor_name, ':'); + string node_name; + int32_t output_slot = 0; + OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2, + errors::InvalidArgument("Failed to parse tensor name: \"", + tensor_name, "\"")); + if (name_items.size() == 2) { + node_name = name_items[0]; + OP_REQUIRES( + context, absl::SimpleAtoi(name_items[1], &output_slot), + errors::InvalidArgument("Invalid string value for output_slot: \"", + name_items[1], "\"")); + } else if (name_items.size() == 1) { + node_name = name_items[0]; + } + + debug_watch_key_.reset( + new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_)); + } + + bool IsExpensive() override { return false; } + + protected: + // Apply gRPC gating (if gated_grpc_ attribute is true). + // + // Returns false if and only if all grpc:// debug URLs of the debug op are + // disabled currently (i.e., gated off), in which case the debug op will emit + // an empty (size {0}) tensor of undefined data type. + bool ApplyGrpcGating(OpKernelContext* context) { + if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen( + debug_watch_key_->debug_node_name, debug_urls_)) { + // The entire node is gated off: Output an empty tensor and avoid + // expensive computation. + Tensor* output_tensor; + TensorShape shape({0}); + if (!context->allocate_output(0, shape, &output_tensor).ok()) { + LOG(ERROR) << "Debug node of watch key " + << debug_watch_key_->debug_node_name + << " failed to allocate empty tensor under gated-off state."; + } + return false; + } else { + return true; + } + } + + // Publish a tensor to all debug URLs of the debug op. + // Log an error if the publishing failed. + absl::Status PublishTensor(const Tensor& tensor, int64_t step_id = -1) { + if (debug_urls_.empty()) { + return absl::OkStatus(); + } else { + absl::Status status = DebugIO::PublishDebugTensor( + *debug_watch_key_, tensor, Env::Default()->NowMicros(), debug_urls_, + gated_grpc_, step_id); + if (!status.ok()) { + LOG(ERROR) << "Debug node of watch key " + << debug_watch_key_->debug_node_name + << " failed to publish debug tensor data to all URLs " + << absl::StrJoin(debug_urls_, ", ") + << ", due to: " << status.message(); + } + return status; + } + } + + void CompleteDebugNodeKey(const string& io_of_node, bool is_input, + int io_index) { + debug_watch_key_ = std::make_unique( + debug_watch_key_->device_name, debug_watch_key_->node_name, + debug_watch_key_->output_slot, debug_op_name_, io_of_node, is_input, + io_index); + } + + private: + const string debug_op_name_; + std::unique_ptr debug_watch_key_; + std::vector debug_urls_; + bool gated_grpc_; +}; + +// Identity op for debugging. +// Output slot 0 carries the debug signal and is always allocated on the +// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp, +// the debug signal is equal to the input tensor. 
+class DebugIdentityOp : public BaseDebugOp { + public: + explicit DebugIdentityOp(OpKernelConstruction* context) + : BaseDebugOp("DebugIdentity", context) {} + + void Compute(OpKernelContext* context) override { + if (!ApplyGrpcGating(context)) { + return; + } + + OP_REQUIRES_OK(context, PublishTensor(context->input(0))); + context->set_output(0, context->input(0)); + } +}; + +// Identity op for debugging. +// Output slot 0 carries the debug signal and is always allocated on the +// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp, +// the debug signal is equal to the input tensor. +class DebugIdentityV3Op : public BaseDebugOp { + public: + explicit DebugIdentityV3Op(OpKernelConstruction* context) + : BaseDebugOp("DebugIdentityV3", context) { + string io_of_node; + bool is_input; + int io_index; + OP_REQUIRES_OK(context, context->GetAttr("io_of_node", &io_of_node)); + OP_REQUIRES_OK(context, context->GetAttr("is_input", &is_input)); + OP_REQUIRES_OK(context, context->GetAttr("io_index", &io_index)); + if (!io_of_node.empty()) { + CompleteDebugNodeKey(io_of_node, is_input, io_index); + } + } + + void Compute(OpKernelContext* context) override { + if (!ApplyGrpcGating(context)) { + return; + } + + OP_REQUIRES_OK(context, + PublishTensor(context->input(0), context->step_id())); + context->set_output(0, context->input(0)); + } +}; + +// NaN-counter op for debugging. +template +class DebugNanCountOp : public BaseDebugOp { + public: + explicit DebugNanCountOp(OpKernelConstruction* context) + : BaseDebugOp("DebugNanCount", context) {} + + void Compute(OpKernelContext* context) override { + if (!ApplyGrpcGating(context)) { + return; + } + + Tensor* output_tensor; + const Tensor& input = context->input(0); + + // Use DT_INT64/int64 to be consistent with TensorShape::num_elements(). + int64_t nan_count = 0; + + // If the input is an uninitialized tensor, let nan_count be 0. + if (input.IsInitialized()) { + // Count NaNs. + const TensorShape& input_shape = input.shape(); + const T* input_flat = input.template flat().data(); + + for (int64_t i = 0; i < input_shape.num_elements(); ++i) { + if (Eigen::numext::isnan(static_cast(input_flat[i]))) { + nan_count++; + } + } + } + + TensorShape shape({1}); + OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); + output_tensor->vec()(0) = nan_count; + OP_REQUIRES_OK(context, PublishTensor(*output_tensor)); + } +}; + +// Numeric summary op for debugging. 
+template +class DebugNumericSummaryOp : public BaseDebugOp { + public: + explicit DebugNumericSummaryOp(OpKernelConstruction* context) + : BaseDebugOp("DebugNumericSummary", context) { + OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_)); + OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_)); + OP_REQUIRES_OK(context, + context->GetAttr("mute_if_healthy", &mute_if_healthy_)); + } + + void Compute(OpKernelContext* context) override { + if (!ApplyGrpcGating(context)) { + return; + } + + Tensor* output_tensor; + const Tensor& input = context->input(0); + + int64_t is_initialized = 0; + int64_t element_count = 0; + int64_t negative_inf_count = 0; + int64_t negative_count = 0; + int64_t zero_count = 0; + int64_t positive_count = 0; + int64_t positive_inf_count = 0; + int64_t nan_count = 0; + double min = std::numeric_limits::infinity(); + double max = -std::numeric_limits::infinity(); + double sum = 0.0; + double mean = std::numeric_limits::quiet_NaN(); + double variance = std::numeric_limits::quiet_NaN(); + + // Equal to negative_count + zero_count + positive_count. + int64_t non_inf_nan_count = 0; + + const TensorShape& input_shape = input.shape(); + if (input.IsInitialized()) { + is_initialized = 1; + const T* input_flat = input.template flat().data(); + + element_count = input_shape.num_elements(); + const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_); + const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_); + + for (int64_t i = 0; i < element_count; ++i) { + const double x = static_cast(input_flat[i]); + if (Eigen::numext::isnan(x)) { + nan_count++; + } else if (Eigen::numext::isinf(x)) { + if (x < 0.0) { + negative_inf_count++; + } else { + positive_inf_count++; + } + } else { + if (is_lower_bound_custom && x <= lower_bound_) { + negative_inf_count++; + } else if (is_upper_bound_custom && x >= upper_bound_) { + positive_inf_count++; + } else if (x < 0.0) { + negative_count++; + } else if (x > 0.0) { + positive_count++; + } else { + zero_count++; + } + + if (x < min) { + min = x; + } + if (x > max) { + max = x; + } + + non_inf_nan_count++; + sum += x; + } + } + + if (non_inf_nan_count > 0) { + mean = sum / non_inf_nan_count; + + // Do a second pass to compute variance. 
+ variance = 0.0; + for (int64_t i = 0; i < element_count; ++i) { + const double x = static_cast(input_flat[i]); + if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) { + variance += (x - mean) * (x - mean); + } + } + variance /= non_inf_nan_count; + } + } + + TensorShape shape({14 + input_shape.dims()}); + OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); + output_tensor->vec()(0) = static_cast(is_initialized); + output_tensor->vec()(1) = static_cast(element_count); + output_tensor->vec()(2) = static_cast(nan_count); + output_tensor->vec()(3) = static_cast(negative_inf_count); + output_tensor->vec()(4) = static_cast(negative_count); + output_tensor->vec()(5) = static_cast(zero_count); + output_tensor->vec()(6) = static_cast(positive_count); + output_tensor->vec()(7) = static_cast(positive_inf_count); + output_tensor->vec()(8) = min; + output_tensor->vec()(9) = max; + output_tensor->vec()(10) = mean; + output_tensor->vec()(11) = variance; + + output_tensor->vec()(12) = static_cast(input.dtype()); + output_tensor->vec()(13) = static_cast(input_shape.dims()); + for (size_t d = 0; d < input_shape.dims(); ++d) { + output_tensor->vec()(14 + d) = + static_cast(input_shape.dim_sizes()[d]); + } + + bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 && + positive_inf_count == 0; + if (!mute) { + OP_REQUIRES_OK(context, PublishTensor(*output_tensor)); + } + } + + private: + float lower_bound_; + float upper_bound_; + bool mute_if_healthy_; +}; + +// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter. +class DebugIdentityV2Op : public OpKernel { + public: + explicit DebugIdentityV2Op(OpKernelConstruction* context) + : OpKernel(context), + device_name_(context->device()->name()), + output_slot_(-1), + tensor_debug_mode_(0), + tfdbg_run_id_() { + std::vector debug_urls; + OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls)); + for (const string& debug_url : debug_urls) { + if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) { + dump_roots_.emplace_back( + debug_url.substr(strlen(DebugIO::kFileURLScheme))); + } else { + context->SetStatus( + errors::Internal("Unsupported debug URL schema in: ", debug_url)); + } + } + OP_REQUIRES_OK(context, + context->GetAttr("tfdbg_context_id", &tfdbg_context_id_)); + OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_)); + OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_)); + OP_REQUIRES_OK(context, + context->GetAttr("tensor_debug_mode", &tensor_debug_mode_)); + if (context->HasAttr("circular_buffer_size")) { + OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size", + &circular_buffer_size_)); + } else { + circular_buffer_size_ = + tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize; + } + if (context->HasAttr("tfdbg_run_id")) { + OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_)); + } + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor = context->input(0); + for (const string& dump_root : dump_roots_) { + tfdbg::DebugEventsWriter* debug_events_writer = + tfdbg::DebugEventsWriter::GetDebugEventsWriter( + dump_root, tfdbg_run_id_, circular_buffer_size_); + OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( + tfdbg_context_id_, device_name_, op_name_, + output_slot_, tensor_debug_mode_, tensor)); + } + context->set_output(0, tensor); + } + + private: + std::vector dump_roots_; + string tfdbg_context_id_; + string device_name_; + string op_name_; + int32 output_slot_; 
+ int32 tensor_debug_mode_; + int64_t circular_buffer_size_; + string tfdbg_run_id_; +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +struct CurtHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]); +}; + +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; + +template +struct ConciseHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); +}; + +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; + +template +struct FullHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]); +}; + +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; + +template +struct ReduceInfNanThreeSlotsLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); +}; + +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; + +#endif + +template +class DebugNumericSummaryV2Op; + +// Numeric summary op for tfdbg v2: CPU Kernel. +template +class DebugNumericSummaryV2Op : public OpKernel { + public: + explicit DebugNumericSummaryV2Op(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("tensor_debug_mode", &tensor_debug_mode_)); + OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor = context->input(0); + auto in = tensor.flat(); + const Tin* data = in.data(); + const int64_t size = in.size(); + Tensor* output_tensor; + Tout tensor_id = static_cast(tensor_id_); + const Tout num_elem = static_cast(context->input(0).NumElements()); + // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because + // that mode does not make use of tensor_id. + if (tensor_debug_mode_ != 8) { + OP_REQUIRES( + context, tensor_id_ <= kMaxTensorId, + errors::InvalidArgument("DebugNumericSummaryV2Op requires " + "tensor_id to be less than or equal to " + "(2^", + std::numeric_limits::digits, + "). Given tensor_id:", tensor_id_)); + } + + if (tensor_debug_mode_ == 2) { // CURT_HEALTH + TensorShape shape({2}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + output_tensor->flat()(0) = tensor_id; // Slot tensor id + output_tensor->flat()(1) = 0.0; // Has inf or nan + int fp_props = + std::accumulate(data, data + size, 0, [](const int x, const Tin& y) { + return Eigen::numext::isfinite(y) ? 
x : 1; + }); + if (fp_props) { + output_tensor->flat()(1) = 1.0; + } + } else if (tensor_debug_mode_ == 3) { // CONCISE_HEALTH + TensorShape shape({5}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = num_elem; + + // Accumulator value [neg_inf_count, pos_inf_count, nan_count] + Tout fp_props[3] = {0.0, 0.0, 0.0}; + std::for_each(data, data + size, [&fp_props](const Tin& y) { + if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { + // Do nothing: common case. + } else if (Eigen::numext::isinf(y)) { + if (y < static_cast(0.f)) { + ++fp_props[0]; + } else { + ++fp_props[1]; + } + } else if (Eigen::numext::isnan(y)) { + ++fp_props[2]; + } + }); + output_tensor->flat()(2) = fp_props[0]; // Slot for -inf count + output_tensor->flat()(3) = fp_props[1]; // Slot for inf count + output_tensor->flat()(4) = fp_props[2]; // Slot for nan count + } else if (tensor_debug_mode_ == 4) { // FULL HEALTH + TensorShape shape({11}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + int num_dims = tensor.dims(); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = -1.0; // TODO(144919262): Device ID + output_tensor->flat()(2) = static_cast(tensor.dtype()); + output_tensor->flat()(3) = static_cast(num_dims); + output_tensor->flat()(4) = num_elem; + + // Accumulator value [neg_inf_count, pos_inf_count, nan_count, neg_count, + // zero_count, pos_count] + Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::for_each(data, data + size, [&fp_props](const Tin& y) { + if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { + if (y < static_cast(0.f)) { + ++fp_props[3]; + } else if (y == static_cast(0.f)) { + ++fp_props[4]; + } else { + ++fp_props[5]; + } + } else if (Eigen::numext::isinf(y)) { + if (y < static_cast(0.f)) { + ++fp_props[0]; + } else { + ++fp_props[1]; + } + } else if (Eigen::numext::isnan(y)) { + ++fp_props[2]; + } + }); + output_tensor->flat()(5) = fp_props[0]; // Slot for -inf count + output_tensor->flat()(6) = fp_props[1]; // Slot for inf count + output_tensor->flat()(7) = fp_props[2]; // Slot for nan count. + output_tensor->flat()(8) = fp_props[3]; // Slot for neg count. + output_tensor->flat()(9) = fp_props[4]; // Slot for zero count. + output_tensor->flat()(10) = fp_props[5]; // Slot for pos count. + } else if (tensor_debug_mode_ == 5) { // SHAPE + TensorShape shape({10}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + int num_dims = tensor.dims(); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = static_cast(tensor.dtype()); + output_tensor->flat()(2) = static_cast(num_dims); + output_tensor->flat()(3) = num_elem; + + // Tensor shape - stored as (6 columns) + // if num_dim is less than 6, we right pad the shape with zeros + // if num_dim is greater than 6, we truncate the head (left most) of the + // dimensions as they are more predictable than the last few (e.g. batch + // size as first dimension) + int dim_idx = 4; + for (int i = std::max(0, num_dims - kShapeDims); + i < std::max(6, num_dims); ++i) { + if (i < num_dims) { + output_tensor->flat()(dim_idx++) = + static_cast(tensor.dim_size(i)); + } else { + output_tensor->flat()(dim_idx++) = 0.0; + } + } + } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. + TensorShape shape({3}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + output_tensor->flat()(0) = 0.0; // Slot for -inf. 
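The CONCISE_HEALTH and FULL_HEALTH branches above classify every element in a single pass. Below is a standalone restatement of that pass over a plain float buffer, using the <cmath> predicates instead of Eigen::numext (ClassifyElements is a hypothetical helper, not part of this header):

#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Returns counts in the FULL_HEALTH slot order used above:
// [-inf, +inf, nan, negative, zero, positive].
inline std::array<int64_t, 6> ClassifyElements(const float* data, size_t size) {
  std::array<int64_t, 6> counts{};  // value-initialized to zero
  for (size_t i = 0; i < size; ++i) {
    const float y = data[i];
    if (std::isfinite(y)) {
      if (y < 0.f) {
        ++counts[3];
      } else if (y == 0.f) {
        ++counts[4];
      } else {
        ++counts[5];
      }
    } else if (std::isinf(y)) {
      ++counts[y < 0.f ? 0 : 1];
    } else {  // NaN
      ++counts[2];
    }
  }
  return counts;
}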
+ output_tensor->flat()(1) = 0.0; // Slot for inf. + output_tensor->flat()(2) = 0.0; // Slot for nan. + + int fp_props = + std::accumulate(data, data + size, 0, [](const int x, const Tin& y) { + int result = x; + if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { + // Do nothing: common case. + } else if (Eigen::numext::isinf(y)) { + result |= y < static_cast(0.f) ? kNegInfBit : kPosInfBit; + } else if (Eigen::numext::isnan(y)) { + result |= kNaNBit; + } + return result; + }); + + if (fp_props & kNegInfBit) { + output_tensor->flat()(0) = -std::numeric_limits::infinity(); + } + if (fp_props & kPosInfBit) { + output_tensor->flat()(1) = std::numeric_limits::infinity(); + } + if (fp_props & kNaNBit) { + output_tensor->flat()(2) = std::numeric_limits::quiet_NaN(); + } + } else { + // TODO(cais): Implement other tensor debug modes in debug_event.proto. + context->SetStatus(errors::Unimplemented( + "Unimplemented tensor debug mode: ", tensor_debug_mode_)); + } + } + + private: + int tensor_debug_mode_; + int64_t tensor_id_; + static constexpr int kShapeDims = 6; + static constexpr int kNegInfBit = 0x01; + static constexpr int kPosInfBit = 0x02; + static constexpr int kNaNBit = 0x04; + static constexpr int64_t kMaxTensorId = 1LL + << std::numeric_limits::digits; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +class DebugNumericSummaryV2Op : public AsyncOpKernel { + public: + typedef GPUDevice Device; + + explicit DebugNumericSummaryV2Op(OpKernelConstruction* context) + : AsyncOpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("tensor_debug_mode", &tensor_debug_mode_)); + OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_)); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + Tensor* output_tensor; + Tout tensor_id = static_cast(tensor_id_); + const Tensor& tensor = context->input(0); + const Tout num_elem = static_cast(tensor.NumElements()); + const Device& d = context->eigen_device(); + auto input = tensor.flat(); + auto check_cb = [this, done]() { done(); }; + // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because + // that mode does not make use of tensor_id. + if (tensor_debug_mode_ != 8) { + OP_REQUIRES_ASYNC( + context, tensor_id_ <= kMaxTensorId, + errors::InvalidArgument("DebugNumericSummaryV2Op requires " + "tensor_id to be less than or equal to " + "(2^", + std::numeric_limits::digits, + "). Given tensor_id:", tensor_id_), + done); + } + + if (tensor_debug_mode_ == 2) { // CURT_HEALTH. + TensorShape shape({2}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + OP_REQUIRES_OK(context, + stream->MemZero(&output_tensor_ptr, 2 * sizeof(Tout))); + // Copy tensor_id to slot zero + OP_REQUIRES_OK(context, stream->Memcpy(&output_tensor_ptr, &tensor_id, + sizeof(Tout))); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks. + auto input = context->input(0).flat(); + CurtHealthLaunch().Run(d, input.data(), input.size(), + output_tensor->flat().data() + 1); + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 3) { // CONCISE_HEALTH. 
+ TensorShape shape({5}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(), + errors::Unimplemented( + "Determinism is not yet supported for " + "DebugNumericSummaryV2 when tensor_debug_mode is " + "CONCISE_HEALTH."), + done); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + OP_REQUIRES_OK(context, + stream->Memset32(&output_tensor_ptr, 0, 5 * sizeof(Tout))); + const Tout static_output[] = {tensor_id, num_elem}; + OP_REQUIRES_OK(context, stream->Memcpy(&output_tensor_ptr, &static_output, + 2 * sizeof(Tout))); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks. + ConciseHealthLaunch().Run( + d, input.data(), input.size(), + output_tensor->flat().data() + 2); + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 4) { // FULL HEALTH + TensorShape shape({11}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(), + errors::Unimplemented( + "Determinism is not yet supported for " + "DebugNumericSummaryV2 when tensor_debug_mode is " + "FULL_HEALTH."), + done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + OP_REQUIRES_OK( + context, stream->Memset32(&output_tensor_ptr, 0, 11 * sizeof(Tout))); + + int num_dims = tensor.dims(); + const Tout static_output[] = {tensor_id, + -1.0, // TODO(144919262): Device ID + static_cast(tensor.dtype()), + static_cast(num_dims), num_elem}; + OP_REQUIRES_OK(context, stream->Memcpy(&output_tensor_ptr, &static_output, + 5 * sizeof(Tout))); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks and + // pos/neg/zero counts. 
+ FullHealthLaunch().Run(d, input.data(), input.size(), + output_tensor->flat().data() + 5); + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 5) { // SHAPE + TensorShape shape({10}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + + int num_dims = tensor.dims(); + Tout static_output[10] = {tensor_id, + static_cast(tensor.dtype()), + static_cast(num_dims), + num_elem, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0}; + // Tensor shape: right pad zeros, truncate head + int dim_idx = 4; + for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) { + static_output[dim_idx++] = static_cast(tensor.dim_size(i)); + } + // Write to device stream + OP_REQUIRES_OK(context, stream->Memcpy(&output_tensor_ptr, &static_output, + sizeof(Tout) * 10)); + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. + TensorShape shape({3}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + OP_REQUIRES_OK( + context, + stream->Memset32(&output_tensor_ptr, 0, + output_tensor->flat().size() * sizeof(Tout))); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks. + auto input = context->input(0).flat(); + ReduceInfNanThreeSlotsLaunch().Run( + d, input.data(), input.size(), output_tensor->flat().data()); + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, std::move(check_cb)); + } else { + // TODO(cais): Implement other tensor debug modes in debug_event.proto. + context->SetStatus(errors::Unimplemented( + "Unimplemented tensor debug mode: ", tensor_debug_mode_)); + done(); + } + } + + private: + int tensor_debug_mode_; + int64_t tensor_id_; + static constexpr int64_t kMaxTensorId = 1L + << std::numeric_limits::digits; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/deep_conv2d.h b/third_party/tflite-hdrs/tensorflow/core/kernels/deep_conv2d.h new file mode 100644 index 00000000..c484db38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/deep_conv2d.h @@ -0,0 +1,117 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_ +#define TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_ + +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +class OpKernelContext; + +// DeepConv2D is a Conv2D implementation specialized for deep (i.e. large +// in_depth * out_depth product) convolutions (see deep_conv2d.cc for details). + +// DeepConv2DTransform is an interface for implementing transforms for +// DeepConv2D. Implementations must specify transform matrices and +// input/output/filter shapes. DeepConv2d computes: +// +// y = C[Ad * Bg] +// +// C: output transform matrix +// A: input data transform matrix +// B: filter transform matrix +// d: vectorized 2D data tile +// g: vectorized 2D filter tile +// y: vectorized 2D output tile + +template +class DeepConv2DTransform { + public: + virtual ~DeepConv2DTransform() {} + + virtual void GetFilterTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const = 0; + + virtual void GetInputTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const = 0; + + virtual void GetOutputTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const = 0; + + struct Shape { + Shape(int64_t r, int64_t c) : rows(r), cols(c) {} + int64_t rows; + int64_t cols; + }; + + virtual const Shape& filter_shape() const = 0; + virtual const Shape& input_shape() const = 0; + virtual const Shape& output_shape() const = 0; +}; + +// Conv2D arguments used by DeepConv2D implementation. +struct Conv2DArgs { + // Input layer dimensions + int batch; + int in_rows; + int in_cols; + int in_depth; + int filter_rows; + int filter_cols; + int pad_rows; + int pad_cols; + + // Output layer dimensions + int out_rows; + int out_cols; + int out_depth; + + Conv2DArgs() + : batch(0), + in_rows(0), + in_cols(0), + in_depth(0), + filter_rows(0), + filter_cols(0), + pad_rows(0), + pad_cols(0), + out_rows(0), + out_cols(0), + out_depth(0) {} +}; + +// Returns true if convolution operation specified by function arguments +// can use DeepConv2D implementation, and false otherwise. +// May return false based on parameters, cost, or whether feature is disabled. +bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows, + int filter_cols, int in_depth, int out_depth, + int out_rows, int out_cols); + +namespace functor { + +// Calls DeepConv2D implementation (see deep_conv2d.cc for details). +template +struct DeepConv2D { + void operator()(OpKernelContext* ctx, const Conv2DArgs& args, const T* input, + const T* filter, T* output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/dense_update_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/dense_update_functor.h new file mode 100644 index 00000000..c16db936 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/dense_update_functor.h @@ -0,0 +1,81 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_ + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + + +enum DenseUpdateType { ADD, SUB, ASSIGN }; + +namespace functor { + +template +struct DenseUpdate { + void operator()(const Device& d, typename TTypes::Flat params, + typename TTypes::ConstFlat update); +}; + +template +struct DenseUpdate { + void operator()(const CPUDevice& d, typename TTypes::Flat params, + typename TTypes::ConstFlat update) { + params.device(d) += update; + } +}; + +template +struct DenseUpdate { + void operator()(const CPUDevice& d, typename TTypes::Flat params, + typename TTypes::ConstFlat update) { + params.device(d) -= update; + } +}; + +template +struct DenseUpdate { + void operator()(const CPUDevice& d, typename TTypes::Flat params, + typename TTypes::ConstFlat update) { + params.device(d) = update; + } +}; + + +} // end namespace functor + +template +absl::Status VariantCopyFn(OpKernelContext* context, const Tensor& from, + Tensor* to); + +template <> +absl::Status VariantCopyFn(OpKernelContext* context, + const Tensor& from, Tensor* to); +template <> +absl::Status VariantCopyFn(OpKernelContext* context, + const Tensor& from, Tensor* to); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/depthtospace_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/depthtospace_op.h new file mode 100644 index 00000000..63dba5d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/depthtospace_op.h @@ -0,0 +1,56 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { +namespace functor { + +// Functor used by DepthToSpaceOp to do the computations. 
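Stepping back to the DenseUpdate specializations above: each update type reduces to a single Eigen device expression over the flattened tensors. The sketch below reproduces the same pattern with plain Eigen tensor maps and a thread-pool device; it is an illustration only, not TensorFlow code, and the buffer sizes and thread counts are arbitrary.

#define EIGEN_USE_THREADS
#include "unsupported/Eigen/CXX11/Tensor"
#include "unsupported/Eigen/CXX11/ThreadPool"
#include <vector>

int main() {
  Eigen::ThreadPool pool(/*num_threads=*/4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  std::vector<float> params_buf(8, 1.f), update_buf(8, 0.5f);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> params(params_buf.data(), 8);
  Eigen::TensorMap<Eigen::Tensor<const float, 1>> update(update_buf.data(), 8);

  // Equivalent of DenseUpdate<CPUDevice, T, ADD>: params += update, evaluated
  // on the thread-pool device. SUB and ASSIGN use -= and = respectively.
  params.device(device) += update;
  return 0;
}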
+// Implements a family of Depth to Space transforms for a 4D 'input' tensor +// to a 4D 'output' tensor, both tensors use type 'T' and layout 'data_format'. +// These transforms multiply the vertical and horizontal image sizes by +// 'block_size', and divide the depth dimension by (block_size * block_size) +// which must divide evenly. +// Each pixel in the input image is converted to a square block of pixels in +// the output image. The Y, X coordinates within each block comes from the +// high component of the input depth (channel) index. +// e.g. for data_format = NHWC: +// Each element in the input tensor can be specified via 6 coordinates, +// ordered by decreasing memory layout significance as: +// n,iY,iX,bY,bX,oC (where n=batch index, iX, iY means X or Y coordinates +// within the input image, bX, bY means coordinates +// within the output block, oC means output channel). +// The output would be a transpose to the following layout: +// n,iY,bY,iX,bX,oC +template +struct DepthToSpaceOpFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor input, + int block_size, typename TTypes::Tensor output); + + // This 5-D version is to support NCHW_VECT_C. + void operator()(const Device& d, typename TTypes::ConstTensor input, + int block_size, typename TTypes::Tensor output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op.h new file mode 100644 index 00000000..1114caab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op.h @@ -0,0 +1,352 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/util/tensor_format.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { + +struct DepthwiseArgs { + // Input layer dimensions + int batch; + int in_rows; + int in_cols; + int in_depth; + int filter_rows; + int filter_cols; + int depth_multiplier; + int stride; + int pad_rows; // Amount of padding to the top of the input + int pad_cols; // Amount of padding to the left of the input + + // Output layer dimensions + int out_rows; + int out_cols; + int out_depth; + + DepthwiseArgs() + : batch(0), + in_rows(0), + in_cols(0), + in_depth(0), + filter_rows(0), + filter_cols(0), + depth_multiplier(0), + stride(0), + pad_rows(0), + pad_cols(0), + out_rows(0), + out_cols(0), + out_depth(0) {} +}; + +// Forward declaration. 
+class OpKernelContext; + +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; + +bool UseCudnnWith16BitFloat(OpKernelContext* ctx, DataType dtype); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +namespace tensorflow { +namespace functor { + +// Pads 'filter' to vector-register boundary along its inner dimension: +// filter_inner_dim_size = in_depth * depth_multiplier +// Requires 'filter' to have the following storage order: +// [filter_rows, filter_cols, in_depth, depth_multiplier] +// Returns zero-padded filter in 'padded_filter'. +// +// EX: +// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 +// So we have a total of 3 * 2 = 6 filters, each of spatial size 2 x 2. +// +// filter [rows, cols, in_depth, depth_multiplier] +// [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1] +// [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3] +// +// padded_filter [rows, cols, in_depth, depth_multiplier] +// [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0] +// [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0] + +template +struct DepthwiseFilterPadOp { + void operator()(const DepthwiseArgs& args, const T* filter, + T* padded_filter) { + typedef typename Eigen::internal::packet_traits::type Packet; + static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); + + // Calculate vectorized and scalar lengths of filter's inner dimension. + const int64_t filter_inner_dim_size = args.out_depth; + const int64_t vectorized_size = + (filter_inner_dim_size / kPacketSize) * kPacketSize; + const int64_t scalar_size = filter_inner_dim_size - vectorized_size; + // Calculate required padding and padded output buffer stride. + const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0; + const int64_t padded_filter_stride = vectorized_size + kPacketSize; + + const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; + for (int64_t i = 0; i < filter_spatial_size; ++i) { + const int64_t input_base = i * filter_inner_dim_size; + const int64_t output_base = i * padded_filter_stride; + // Write vectorized length of filter's inner dimension to output. 
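The DepthwiseFilterPadOp above pads each filter pixel's inner dimension up to the vector-register width. A standalone restatement of just that sizing arithmetic, with the worked example from the comment (ComputeFilterPadLayout is a hypothetical helper, not part of this header):

#include <cstdint>

// With in_depth = 3, depth_multiplier = 2 and a 4-wide packet, each filter
// pixel's 6 coefficients occupy an 8-element stride: 4 vectorized, 2 scalar,
// 2 zero-padding.
struct FilterPadLayout {
  int64_t vectorized_size, scalar_size, pad_size, padded_stride;
};

inline FilterPadLayout ComputeFilterPadLayout(int64_t filter_inner_dim_size,
                                              int64_t packet_size) {
  FilterPadLayout l;
  l.vectorized_size = (filter_inner_dim_size / packet_size) * packet_size;
  l.scalar_size = filter_inner_dim_size - l.vectorized_size;
  l.pad_size = l.scalar_size > 0 ? packet_size - l.scalar_size : 0;
  l.padded_stride = l.vectorized_size + packet_size;
  return l;
}
// ComputeFilterPadLayout(6, 4) -> {4, 2, 2, 8}, matching the EX comment above.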
+ for (int64_t j = 0; j < vectorized_size; j += kPacketSize) { + const auto v = Eigen::internal::ploadu(filter + input_base + j); + Eigen::internal::pstoreu(padded_filter + output_base + j, v); + } + // Write scalar length of filter's inner dimension to output. + for (int64_t j = 0; j < scalar_size; ++j) { + padded_filter[output_base + vectorized_size + j] = + filter[input_base + vectorized_size + j]; + } + // Pad the remainder of output to vector-register boundary. + for (int64_t j = 0; j < pad_size; ++j) { + padded_filter[output_base + vectorized_size + scalar_size + j] = + static_cast(0); + } + } + } +}; + +// Copies data from local region in 'input' specified by 'out_r' and 'out_'c' +// to 'input_buffer'. The copied data is replicated by factor +// 'args.depth_multiplier', and padded to vector register-width boundaries so +// that it is aligned for efficient traversal and vector multiply-add by the +// depthwise kernel. +// +// EX: +// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 +// +// input: [batch, in_rows, in_cols, in_depth] +// +// [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...] +// +// input_buffer (register boundaries shown): +// [a0, a0, a1, a1] [a2, a2, 0, 0] in_row = 0, in_col = 0 +// [b0, b0, b1, b1] [b2, b2, 0, 0] in_row = 0, in_col = 1 +// [e0, e0, e1, e1] [e2, e2, 0, 0] in_row = 1, in_col = 0 +// [f0, f0, f1, f1] [f2, f2, 0, 0] in_row = 1, in_col = 1 +// +// Returns replicated and padded data from specified input region in +// 'input_buffer'. + +template +struct DepthwiseInputCopyOp { + void operator()(const DepthwiseArgs& args, + const int64_t padded_filter_inner_dim_size, + const int64_t out_r, const int64_t out_c, const T* input, + T* input_buffer) { + typedef typename Eigen::internal::packet_traits::type Packet; + static const int64_t kPacketSize = Eigen::internal::packet_traits::size; + + const int64_t kDepth = args.depth_multiplier; + // Calculate vectorized and scalar (residual) lengths for 'in_depth'. + const int64_t input_vectorized_size = + (args.in_depth / kPacketSize) * kPacketSize; + const int64_t input_scalar_size = args.in_depth - input_vectorized_size; + + // Calculate output padding length. + const int64_t output_scalar_size = args.out_depth % kPacketSize; + const int64_t output_pad_size = + output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0; + + // Iterate through all rows x cols reading 'in_depth' from 'input' and + // replicating by 'depth_multiplier' into 'input_buffer' (otherwise + // zero-padding input buffer as needed). + auto* in_buf = input_buffer; + const int64_t in_r_start = out_r * args.stride - args.pad_rows; + const int64_t in_c_start = out_c * args.stride - args.pad_cols; + + // TODO: add a ploaddup variant for depth == 2 if needed. + if (kDepth > 1 && kDepth <= kPacketSize) { + for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { + const int64_t in_r = in_r_start + f_r; + + for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { + const int64_t in_c = in_c_start + f_c; + + if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && + in_c < args.in_cols) { + const auto* in = + input + (in_r * args.in_cols + in_c) * args.in_depth; + int64_t limit = args.in_depth; + // This will overwrite up to kPacketSize next elements, + // this is ok on all iterations except the last one, since + // we will write correct values on a next iteration. 
+ if (f_c == args.filter_cols - 1) { + limit -= (kPacketSize - kDepth) / kDepth + 1; + if (limit < 0) { + limit = 0; + } + } + // Copy vectorized portion of inner dimension. + for (int64_t d = 0; d < limit; d++) { + const auto p = Eigen::internal::pset1(in[d]); + Eigen::internal::pstoreu(in_buf, p); + in_buf += kDepth; + } + + // Copy the scalar portion. + for (int64_t d = limit; d < args.in_depth; d++) { + const auto value = in[d]; + for (int64_t dm = 0; dm < kDepth; dm++) { + in_buf[dm] = value; + } + in_buf += kDepth; + } + + // Pad the remainder of the output to vector register boundary. + for (int64_t d = 0; d < output_pad_size; ++d) { + in_buf[d] = static_cast(0); + } + in_buf += output_pad_size; + } else { + // Zero pad. + memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); + in_buf += padded_filter_inner_dim_size; + } + } + } + } else if (kDepth > kPacketSize) { + // Calculate vectorized and scalar (residual) lengths for + // 'depth_multiplier'. This is used to efficiently replicate data for + // when 'depth_multiplier' > kPacketSize. + const int64_t dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize; + + for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { + const int64_t in_r = in_r_start + f_r; + + for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { + const int64_t in_c = in_c_start + f_c; + + if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && + in_c < args.in_cols) { + const auto* in = + input + (in_r * args.in_cols + in_c) * args.in_depth; + // Copy vectorized portion of inner dimension. + for (int64_t d = 0; d < args.in_depth; d++) { + const auto p = Eigen::internal::pset1(in[d]); + for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) { + Eigen::internal::pstoreu(in_buf + dm, p); + } + // Overlapping store for the remainder. + Eigen::internal::pstoreu(in_buf + kDepth - kPacketSize, p); + in_buf += kDepth; + } + // Pad the remainder of the output to vector register boundary. + for (int64_t d = 0; d < output_pad_size; ++d) { + in_buf[d] = static_cast(0); + } + in_buf += output_pad_size; + } else { + // Zero pad. + memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); + in_buf += padded_filter_inner_dim_size; + } + } + } + } else if (kDepth == 1) { + for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { + const int64_t in_r = in_r_start + f_r; + + for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { + const int64_t in_c = in_c_start + f_c; + + if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && + in_c < args.in_cols) { + const auto* in = + input + (in_r * args.in_cols + in_c) * args.in_depth; + for (int64_t d = 0; d < input_vectorized_size; d += kPacketSize) { + const auto p = Eigen::internal::ploadu(in + d); + Eigen::internal::pstoreu(in_buf, p); + in_buf += kPacketSize; + } + for (int64_t d = 0; d < input_scalar_size; ++d) { + T v = in[input_vectorized_size + d]; + in_buf[d] = v; + } + in_buf += input_scalar_size; + + // Pad the remainder of the output to vector register boundary. + for (int64_t d = 0; d < output_pad_size; ++d) { + in_buf[d] = static_cast(0); + } + in_buf += output_pad_size; + } else { + // Zero pad. 
+ memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); + in_buf += padded_filter_inner_dim_size; + } + } + } + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op_gpu.h new file mode 100644 index 00000000..b058ef26 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/depthwise_conv_op_gpu.h @@ -0,0 +1,1759 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/depthwise_conv_op.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/determinism.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/tensor_format.h" + +#if defined(_MSC_VER) && !defined(__clang__) +#define UNROLL +#define NOUNROLL +#else +#define UNROLL _Pragma("unroll") +#define NOUNROLL _Pragma("nounroll") +#endif + +namespace tensorflow { + +namespace detail { +template +struct PseudoHalfType { + using Type = T; +}; +template <> +struct PseudoHalfType { + using Type = float; +}; +template <> +struct PseudoHalfType { + using Type = float; +}; +} // namespace detail + +using Eigen::GpuDevice; + +// Returns whether depthwise convolution forward or backward input pass can be +// performed using the faster ('Small') variant of the kernel. +inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall( + const DepthwiseArgs& args) { + return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 && + args.in_cols <= 32 && args.in_rows == args.out_rows && + args.in_cols == args.out_cols && args.pad_rows >= 0 && + args.pad_rows < args.filter_rows && args.pad_cols >= 0 && + args.pad_cols < args.filter_cols && + args.filter_rows * args.filter_cols <= + (args.in_rows + 1) / 2 * args.in_cols; +} + +// Returns whether depthwise convolution backward filter pass can be performed +// using the faster ('Small') variant of the kernel. 
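As a usage illustration of the forward/backprop-input eligibility check above, assuming a GPU build in which this header is compiled (ExampleFitsSmallKernel is a hypothetical helper and the sizes are arbitrary): a SAME-padded 3x3 depthwise convolution over a 32x32 input with stride 1 and depth_multiplier 1 satisfies every clause, including 3 * 3 <= (32 + 1) / 2 * 32.

#include "tensorflow/core/kernels/depthwise_conv_op.h"      // DepthwiseArgs
#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"  // CanLaunchDepthwiseConv2dGPUSmall

inline bool ExampleFitsSmallKernel() {
  tensorflow::DepthwiseArgs args;
  args.batch = 1;
  args.in_rows = 32;
  args.in_cols = 32;
  args.in_depth = 64;
  args.filter_rows = 3;
  args.filter_cols = 3;
  args.depth_multiplier = 1;
  args.stride = 1;
  args.pad_rows = 1;
  args.pad_cols = 1;
  args.out_rows = 32;
  args.out_cols = 32;
  args.out_depth = 64;
  // Expected to return true for this configuration.
  return tensorflow::CanLaunchDepthwiseConv2dGPUSmall(args);
}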
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall( + const DepthwiseArgs& args, const int block_height) { + return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 && + args.in_cols <= 32 && args.in_rows == args.out_rows && + args.in_cols == args.out_cols && args.pad_rows >= 0 && + args.pad_rows < args.filter_rows && args.pad_cols >= 0 && + args.pad_cols < args.filter_cols && block_height <= args.in_rows && + args.filter_rows * args.filter_cols <= args.in_cols * block_height; +} + +// The DepthwiseConv2dGPUKernels perform either forward or backprop input +// convolution depending on a template argument of this enum. +enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; + +// A GPU kernel to compute the depthwise convolution forward pass +// in NHWC format. +template +__global__ void __launch_bounds__(1024, 2) + DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, + const T* __restrict__ input, + const T* __restrict__ filter, + T* __restrict__ output, int num_outputs) { + typedef typename detail::PseudoHalfType::Type S; + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; + const int stride = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_height = args.out_rows; + const int out_width = args.out_cols; + const int out_depth = args.out_depth; + + GPU_1D_KERNEL_LOOP(thread_id, num_outputs) { + // Compute the indexes of this thread in the output. + const int out_channel = thread_id % out_depth; + const int out_col = (thread_id / out_depth) % out_width; + const int out_row = (thread_id / out_depth / out_width) % out_height; + const int batch = thread_id / out_depth / out_width / out_height; + // Compute the input depth and the index of depth multiplier. + const int in_channel = out_channel / depth_multiplier; + const int multiplier = out_channel % depth_multiplier; + + // Decide if all input is valid, if yes, we can skip the boundary checks + // for each input. 
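The forward NHWC kernel above unflattens each thread's output index with the channel innermost, then width, height, and batch outermost. The same decomposition restated on the host, for reference (NhwcCoords and DecomposeNhwcIndex are hypothetical names, not TensorFlow APIs):

#include <cstdint>

struct NhwcCoords {
  int batch, row, col, channel;
};

inline NhwcCoords DecomposeNhwcIndex(int64_t thread_id, int out_height,
                                     int out_width, int out_depth) {
  NhwcCoords c;
  c.channel = static_cast<int>(thread_id % out_depth);
  c.col = static_cast<int>((thread_id / out_depth) % out_width);
  c.row = static_cast<int>((thread_id / out_depth / out_width) % out_height);
  c.batch = static_cast<int>(thread_id / out_depth / out_width / out_height);
  return c;
}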
+ const int input_row_start = out_row * stride - pad_height; + const int input_col_start = out_col * stride - pad_width; + const int input_row_end = input_row_start + filter_height; + const int input_col_end = input_col_start + filter_width; + + S sum = static_cast(0); + + const int input_offset_temp = in_height * batch; + if (input_row_start >= 0 && input_col_start >= 0 && + input_row_end < in_height && input_col_end < in_width) { + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = input_row_start + filter_row; + const int filter_offset_temp = filter_width * filter_row; + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = input_col_start + filter_col; + + const int input_offset = + in_channel + + in_depth * (in_col + in_width * (in_row + input_offset_temp)); + const int filter_offset = + multiplier + + depth_multiplier * + (in_channel + in_depth * (filter_col + filter_offset_temp)); + sum += static_cast(ldg(input + input_offset)) * + static_cast(ldg(filter + filter_offset)); + } + } + } else { + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = input_row_start + filter_row; + const int filter_offset_temp = filter_width * filter_row; + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = input_col_start + filter_col; + if (in_row >= 0 && in_row < in_height && in_col >= 0 && + in_col < in_width) { + const int in_col = input_col_start + filter_col; + + const int input_offset = + in_channel + + in_depth * (in_col + in_width * (in_row + input_offset_temp)); + const int filter_offset = + multiplier + + depth_multiplier * + (in_channel + in_depth * (filter_col + filter_offset_temp)); + sum += static_cast(ldg(input + input_offset)) * + static_cast(ldg(filter + filter_offset)); + } + } + } + } + output[thread_id] = static_cast(sum); + } +} + +// CUDA kernel to compute the depthwise convolution forward pass in NHWC format, +// tailored for small images up to 32x32. Stride and depth multiplier must be 1. +// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input and filter tensors are loaded into shared memory before +// performing the convolution. Each thread handles two elements per iteration, +// one each in the lower and upper half of a tile. +// Backprop input direction is the same as forward direction with the filter +// rotated by 180°. +// T is the tensors' data type. S is the math type the kernel uses. This is the +// same as T for all cases but pseudo half (which has T=Eigen::half, S=float). +template +__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( + const DepthwiseArgs args, const T* __restrict__ input, + const T* __restrict__ filter, T* __restrict__ output) { + typedef typename detail::PseudoHalfType::Type S; + assert(CanLaunchDepthwiseConv2dGPUSmall(args)); + // Holds block plus halo and filter data for blockDim.x depths. + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + S* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = args.batch; + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? 
args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + + assert(blockDim.x == kBlockDepth); + assert(blockDim.y == args.in_cols); + const int block_height = blockDim.z; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_size = block_height * in_width * kBlockDepth; + const int in_row_size = in_width * in_depth; + const int in_size = in_height * in_row_size; + const int in_increment = (in_width - 1) * kBlockDepth; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int even_height = kKnownEvenHeight || (1 & ~in_height); + const int tile_height = in_height + filter_height - even_height; + const int tile_row_size = tile_width * kBlockDepth; + const int tile_size = tile_height * tile_row_size; + const int tile_offset = block_height * tile_row_size; + const int pad_offset = pad_height * tile_width + pad_width; + const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth; + const int in_blocks = batch_blocks * num_batches; + const int tensor_offset = + kKnownEvenHeight ? in_size / 2 : block_height * in_row_size; + + const int thread_depth = threadIdx.x; + const int thread_col = threadIdx.y; + const int thread_row = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_pix * kBlockDepth + thread_depth; + + // Initialize tile, in particular the padding. + for (int i = thread_idx; i < tile_size; i += block_size) { + shared_data[i] = S(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_pix * in_depth + thread_depth; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = data_pix * kBlockDepth + thread_depth; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_pix = data_pix + pad_offset; + const int tile_idx = tile_pix * kBlockDepth + thread_depth; + + const int max_channel = in_depth - thread_depth; + const int filter_write_offset = + thread_pix < filter_pixels ? tile_size + thread_idx : 0; + const int filter_read_offset = + tile_size + thread_depth + + (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth); + const bool skip_second = + !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int batch = b / batch_blocks; + const int block = b - batch * batch_blocks; + + const int start_channel = block * kBlockDepth; + const int filter_offset = tensor_idx + start_channel; + const int inout_offset = batch * in_size + filter_offset; + const bool channel_in_range = start_channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + S* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = static_cast(ldg(in_ptr)); + if (!skip_second) { + tile_ptr[tile_offset] = static_cast(ldg(tensor_offset + in_ptr)); + } + + if (filter_write_offset != 0) { + shared_data[filter_write_offset] = + static_cast(ldg(filter_offset + filter)); + } + } + + // Note: the condition to reach this is uniform across the entire block. 
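In the NHWC small kernel above, the shared-memory tile holds the input block plus its halo, offset by the padding, so tile_idx works out to ((row + pad_height) * tile_width + (col + pad_width)) * kBlockDepth + depth for a block-local element at (row, col). A host-side restatement of that indexing (NhwcSmallTileIndex is a hypothetical helper):

// Shared-memory slot of a block-local input element, mirroring
// tile_idx = (data_pix + pad_offset) * kBlockDepth + thread_depth above.
inline int NhwcSmallTileIndex(int row, int col, int depth, int pad_height,
                              int pad_width, int tile_width, int block_depth) {
  const int tile_pix = (row + pad_height) * tile_width + (col + pad_width);
  return tile_pix * block_depth + depth;
}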
+ __syncthreads(); + + if (channel_in_range) { + S sum1 = S(); + S sum2 = S(); + int shared_offset = data_idx; + const S* filter_ptr = filter_read_offset + shared_data; + UNROLL for (int r = 0; r < filter_height; ++r) { + UNROLL for (int c = 0; c < filter_width; ++c) { + if (kDirection == DIRECTION_BACKWARD) { + filter_ptr -= kBlockDepth; + } + const S filter_value = *filter_ptr; + const S* const tile_ptr = shared_offset + shared_data; + sum1 += filter_value * tile_ptr[0]; + sum2 += filter_value * tile_ptr[tile_offset]; + shared_offset += kBlockDepth; + if (kDirection == DIRECTION_FORWARD) { + filter_ptr += kBlockDepth; + } + } + shared_offset += in_increment; + } + T* const out_ptr = inout_offset + output; + out_ptr[0] = static_cast(sum1); + if (!skip_second) { + out_ptr[tensor_offset] = static_cast(sum2); + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + } +} + +// A GPU kernel to compute the depthwise convolution forward pass +// in NCHW format. +template +__global__ void __launch_bounds__(1024, 2) + DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, + const T* __restrict__ input, + const T* __restrict__ filter, + T* __restrict__ output, int num_outputs) { + typedef typename detail::PseudoHalfType::Type S; + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const FastDividerUint32 depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; + const int stride = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_width = args.out_cols; + const FastDividerUint32 out_height = args.out_rows; + const FastDividerUint32 out_depth = args.out_depth; + + GPU_1D_KERNEL_LOOP(thread_id, num_outputs) { + // Compute the indexes of this thread in the output. + // + // We want coalesced reads so we make sure that each warp reads + // a contiguous chunk of memory. + // + // THIS IS PROBABLY WRONG, we are not doing coalesced reads + // into the input, because of the depth multiplier division... + const int out_col = thread_id % out_width; + const int out_row = (thread_id / out_width) % out_height; + const int out_channel = (thread_id / out_width / out_height) % out_depth; + const int batch = thread_id / out_width / out_height / out_depth; + + // Compute the input depth and the index of depth multiplier + // based off the output depth index that this thread is + // computing n. + const int in_channel = out_channel / depth_multiplier; + const int multiplier = out_channel % depth_multiplier; + + // Data is stored in the following format (let's assume we + // flatten the height and width into one contiguous dimension + // called "P". + // + // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 .... + // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 .... + // + // Each row contains in_depth * in_height * in_width values + // for each sample in the batch. + // + // We can further flatten it into: + // + // B1C1P1 B1C1P2 ..... + // B1C2P1 B1C2P2 .... + // B2C1P1 B2C1P2 ..... + // B2C2P1 B2C2P2 .... + // + // where each row is a contiguous array of all of the spatial + // pixels for a given batch and input depth. 
The following + // loop unrolls across the filter dimensions for a given thread, + // indexing into the filter value and the corresponding input + // patch. + // + // We can compute the index into the patch once right here. + const int input_offset_temp = + (batch * in_depth + in_channel) * (in_height * in_width); + + // Finally, we can iterate over the spatial dimensions and perform the + // convolution, writing into the output at the end. + // + // We perform an additional optimization, where we can determine + // whether the patch fits within the image indices statically, and + // avoid boundary checking within the loop. + const int input_row_start = out_row * stride - pad_height; + const int input_col_start = out_col * stride - pad_width; + const int input_row_end = input_row_start + filter_height; + const int input_col_end = input_col_start + filter_width; + + S sum = static_cast(0); + if (input_row_start >= 0 && input_col_start >= 0 && + input_row_end < in_height && input_col_end < in_width) { + // Loop that doesn't need to check for boundary conditions. + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = input_row_start + filter_row; + const int filter_offset_temp = filter_width * filter_row; + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = input_col_start + filter_col; + + const int input_offset = + (input_offset_temp) + (in_row * in_width) + in_col; + const int filter_offset = + multiplier + + depth_multiplier * + (in_channel + in_depth * (filter_col + filter_offset_temp)); + sum += static_cast(ldg(input + input_offset)) * + static_cast(ldg(filter + filter_offset)); + } + } + } else { + // Loop that needs to check for boundary conditions. + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = input_row_start + filter_row; + const int filter_offset_temp = filter_width * filter_row; + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = input_col_start + filter_col; + // TODO(vrv): the in_row check can be done outside of this loop; + // benchmark both methods to determine the better decision. + if (in_row >= 0 && in_row < in_height && in_col >= 0 && + in_col < in_width) { + const int in_col = input_col_start + filter_col; + + // input_offset_temp indexes into the start of memory + // where the spatial data starts. + const int input_offset = + (input_offset_temp) + (in_row * in_width) + in_col; + + const int filter_offset = + multiplier + + depth_multiplier * + (in_channel + in_depth * (filter_col + filter_offset_temp)); + sum += static_cast(ldg(input + input_offset)) * + static_cast(ldg(filter + filter_offset)); + } + } + } + } + + output[thread_id] = static_cast(sum); + } +} + +// CUDA kernel to compute the depthwise convolution forward pass in NCHW format, +// tailored for small images up to 32x32. Stride and depth multiplier must be 1. +// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input and filter tensors are loaded into shared memory before +// performing the convolution. Each thread handles two elements per iteration, +// one each in the lower and upper half of a tile. +// Backprop input direction is the same as forward direction with the filter +// rotated by 180°. +// T is the tensors' data type. S is the math type the kernel uses. 
This is the +// same as T for all cases but pseudo half (which has T=Eigen::half, S=float). +template +__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( + const DepthwiseArgs args, const T* __restrict__ input, + const T* __restrict__ filter, T* __restrict__ output) { + typedef typename detail::PseudoHalfType::Type S; + assert(CanLaunchDepthwiseConv2dGPUSmall(args)); + // Holds block plus halo and filter data for blockDim.z depths. + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + S* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = args.batch; + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + + // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16. + assert(blockDim.x == args.in_cols); + assert(blockDim.z == kBlockDepth); + const int block_height = blockDim.y; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_pixels = in_width * block_height; + const int block_size = block_pixels * kBlockDepth; + const int in_pixels = in_width * in_height; + const int in_increment = in_width - 1; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int even_height = kKnownEvenHeight || (1 & ~in_height); + const int tile_height = in_height + filter_height - even_height; + const int tile_pixels = tile_width * tile_height; + const int tile_size = tile_pixels * kBlockDepth; + const int tile_offset = block_height * tile_width; + const int pad_offset = pad_height * tile_width + pad_width; + const int in_total_depth = in_depth * num_batches; + const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; + + const int thread_col = threadIdx.x; + const int thread_row = threadIdx.y; + const int thread_depth = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_depth * block_pixels + thread_pix; + + // Initialize tile, in particular the padding. + for (int i = thread_idx; i < tile_size; i += block_size) { + shared_data[i] = S(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_depth * in_pixels + thread_pix; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = thread_depth * tile_pixels + data_pix; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_idx = data_idx + pad_offset; + + // Filter is always in HWCK format, irrespective of the input/output format. + const int filter_pix = thread_idx / kBlockDepth; + const int filter_channel = thread_idx % kBlockDepth; + const int filter_idx = filter_pix * in_depth; + + const int max_channel = in_total_depth - thread_depth; + const int filter_write_offset = + filter_pix < filter_pixels ? tile_size + thread_idx : 0; + const int filter_read_offset = + tile_size + thread_depth + + (kDirection == DIRECTION_FORWARD ? 
0 : filter_pixels * kBlockDepth); + const bool skip_second = + !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int channel = b * kBlockDepth; + + const int inout_offset = channel * in_pixels + tensor_idx; + const bool channel_in_range = channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + S* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = static_cast(ldg(in_ptr)); + if (!skip_second) { + tile_ptr[tile_offset] = static_cast(ldg(block_pixels + in_ptr)); + } + } + + if (filter_write_offset != 0) { + const int filter_offset = + filter_idx + (channel + filter_channel) % in_depth; + shared_data[filter_write_offset] = + static_cast(ldg(filter_offset + filter)); + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + + if (channel_in_range) { + S sum1 = S(); + S sum2 = S(); + int shared_offset = data_idx; + const S* filter_ptr = filter_read_offset + shared_data; + UNROLL for (int r = 0; r < filter_height; ++r) { + UNROLL for (int c = 0; c < filter_width; ++c) { + if (kDirection == DIRECTION_BACKWARD) { + filter_ptr -= kBlockDepth; + } + const S filter_value = *filter_ptr; + const S* const tile_ptr = shared_offset + shared_data; + sum1 += filter_value * tile_ptr[0]; + sum2 += filter_value * tile_ptr[tile_offset]; + ++shared_offset; + if (kDirection == DIRECTION_FORWARD) { + filter_ptr += kBlockDepth; + } + } + shared_offset += in_increment; + } + T* const out_ptr = inout_offset + output; + out_ptr[0] = static_cast(sum1); + if (!skip_second) { + out_ptr[block_pixels] = static_cast(sum2); + } + } + + // Note: the condition to reach this is uniform across the entire block. 
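+ // (All threads reach the barrier below the same number of times because the
+ // enclosing loop bound, in_blocks, depends only on blockIdx; the
+ // channel_in_range guards above affect only the loads and stores, never the
+ // barrier itself.)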
+ __syncthreads(); + } +} + +template +Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx, + const DepthwiseArgs& args, const T* input, + const T* filter, T* output, + TensorFormat data_format) { + typedef typename detail::PseudoHalfType::Type S; + const int block_height = (args.in_rows + 1) / 2; + dim3 block_dim; + int block_count; + void (*kernel)(const DepthwiseArgs, const T*, const T*, T*); + switch (data_format) { + case FORMAT_NHWC: + block_dim = dim3(kBlockDepth, args.in_cols, block_height); + block_count = + args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth; + kernel = + DepthwiseConv2dGPUKernelNHWCSmall; + break; + case FORMAT_NCHW: + block_dim = dim3(args.in_cols, block_height, kBlockDepth); + block_count = + DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth; + kernel = + DepthwiseConv2dGPUKernelNCHWSmall; + break; + default: + return errors::InvalidArgument("FORMAT_", ToString(data_format), + " is not supported"); + } + const int tile_width = args.in_cols + args.filter_cols - 1; + const int tile_height = block_height * 2 + args.filter_rows - 1; + const int tile_pixels = tile_height * tile_width; + const int filter_pixels = args.filter_rows * args.filter_cols; + const int shared_memory_size = + kBlockDepth * (tile_pixels + filter_pixels) * sizeof(S); + const int num_outputs = args.out_rows * args.out_cols * block_count; + auto device = ctx->eigen_gpu_device(); + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + num_outputs, device, kernel, shared_memory_size, + block_dim.x * block_dim.y * block_dim.z); + TF_CHECK_OK(GpuLaunchKernel(kernel, config.block_count, block_dim, + shared_memory_size, device.stream(), args, input, + filter, output)); + return OkStatus(); +} + +// Returns whether the context's GPU supports efficient fp16 math. +inline bool HasFastHalfMath(OpKernelContext* ctx) { + se::CudaComputeCapability compute_capability = + ctx->op_device_context()->stream()->GetCudaComputeCapability(); + // GPUs before sm_53 don't support fp16 math, and sm_61's fp16 math is slow. + return compute_capability.IsAtLeast(5, 3) && + compute_capability != se::CudaComputeCapability{6, 1}; +} + +template +Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx, + const DepthwiseArgs& args, const T* input, + const T* filter, T* output, + TensorFormat data_format) { + if (args.in_rows & 1) { + return LaunchDepthwiseConv2dGPUSmall(ctx, args, input, filter, + output, data_format); + } else { + return LaunchDepthwiseConv2dGPUSmall( + ctx, args, input, filter, output, data_format); + } +} + +template +Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx, + const DepthwiseArgs& args, const T* input, + const T* filter, T* output, + TensorFormat data_format) { + // Maximize (power of two) kBlockDepth while keeping a block within 1024 + // threads (2 pixels per thread). 
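+ // With two output pixels per thread, block_pixels counts the threads needed
+ // per depth slice, so the block uses block_pixels * kBlockDepth threads in
+ // total. A block of more than 256 pixels cannot hold four depth slices
+ // within 1024 threads, and one of more than 128 cannot hold eight, which is
+ // exactly what the two thresholds below test.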
+ const int block_pixels = (args.in_rows + 1) / 2 * args.in_cols; + if (block_pixels > 256) { + return LaunchDepthwiseConv2dGPUSmall( + ctx, args, input, filter, output, data_format); + } else if (block_pixels > 128) { + return LaunchDepthwiseConv2dGPUSmall( + ctx, args, input, filter, output, data_format); + } else { + return LaunchDepthwiseConv2dGPUSmall( + ctx, args, input, filter, output, data_format); + } +} + +template +Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format) { + void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int); + switch (data_format) { + case FORMAT_NHWC: + kernel = + DepthwiseConv2dGPUKernelNHWC; + break; + case FORMAT_NCHW: + kernel = + DepthwiseConv2dGPUKernelNCHW; + break; + default: + return errors::InvalidArgument("FORMAT_", ToString(data_format), + " is not supported"); + } + const int num_outputs = + args.batch * args.out_rows * args.out_cols * args.out_depth; + auto device = ctx->eigen_gpu_device(); + GpuLaunchConfig config = + GetGpuLaunchConfig(num_outputs, device, kernel, 0, 0); + // The compile-time constant version runs faster with a single block. + const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 || + kKnownDepthMultiplier < 0 + ? std::numeric_limits::max() + : device.getNumGpuMultiProcessors(); + TF_CHECK_OK(GpuLaunchKernel(kernel, + std::min(max_block_count, config.block_count), + config.thread_per_block, 0, device.stream(), args, + input, filter, output, num_outputs)); + return OkStatus(); +} + +template +Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format) { + if (args.depth_multiplier == 1) { + if (CanLaunchDepthwiseConv2dGPUSmall(args)) { + return LaunchDepthwiseConv2dGPUSmall< + T, DIRECTION_FORWARD, kKnownFilterWidth, kKnownFilterHeight>( + ctx, args, input, filter, output, data_format); + } + + return LaunchDepthwiseConv2dGPU(ctx, args, input, filter, output, + data_format); + } else { + return LaunchDepthwiseConv2dGPU(ctx, args, input, filter, output, + data_format); + } +} + +// A simple launch pad to launch the GPU kernel for depthwise convolution. +template +void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, + const DepthwiseArgs& args, + const T* input, + const T* filter, T* output, + TensorFormat data_format) { + if (args.filter_rows == 3 && args.filter_cols == 3) { + OP_REQUIRES_OK(ctx, LaunchDepthwiseConv2dGPU( + ctx, args, input, filter, output, data_format)); + } else { + OP_REQUIRES_OK(ctx, LaunchDepthwiseConv2dGPU( + ctx, args, input, filter, output, data_format)); + } +} + +// A GPU kernel to compute the depthwise convolution backprop w.r.t. input. +template +__global__ void __launch_bounds__(640, 2) + DepthwiseConv2dBackpropInputGPUKernelNHWC( + const DepthwiseArgs args, const T* __restrict__ out_backprop, + const T* __restrict__ filter, T* __restrict__ in_backprop, + int num_in_backprop) { + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? 
args.depth_multiplier : kKnownDepthMultiplier; + const int stride = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_height = args.out_rows; + const int out_width = args.out_cols; + const int out_depth = args.out_depth; + + GPU_1D_KERNEL_LOOP(thread_id, num_in_backprop) { + // Compute the indexes of this thread in the output. + const int in_channel = thread_id % in_depth; + const int in_col = (thread_id / in_depth) % in_width; + const int in_row = (thread_id / in_depth / in_width) % in_height; + const int batch = thread_id / in_depth / in_width / in_height; + + T sum = static_cast(0); + + const int out_row_start = + tf_max(0, (in_row - filter_height + pad_height + stride) / stride); + const int out_row_end = + tf_min(out_height - 1, (in_row + pad_height) / stride); + const int out_col_start = + tf_max(0, (in_col - filter_width + pad_width + stride) / stride); + const int out_col_end = + tf_min(out_width - 1, (in_col + pad_width) / stride); + + NOUNROLL for (int out_row = out_row_start; out_row <= out_row_end; + ++out_row) { + const int filter_row = in_row + pad_height - out_row * stride; + const int temp_out_backprop_offset = + out_depth * out_width * (out_row + out_height * batch); + const int temp_filter_offset = filter_width * filter_row; + NOUNROLL for (int out_col = out_col_start; out_col <= out_col_end; + ++out_col) { + const int filter_col = in_col + pad_width - out_col * stride; + int filter_offset = + depth_multiplier * + (in_channel + in_depth * (filter_col + temp_filter_offset)); + const int out_backprop_offset = + out_depth * out_col + temp_out_backprop_offset; +#pragma unroll 6 + for (int i = 0; i < depth_multiplier; ++i) { + sum += ldg(out_backprop + out_backprop_offset + + in_channel * depth_multiplier + i) * + ldg(filter + filter_offset + i); + } + } + } + const int in_backprop_offset = + in_channel + + in_depth * (in_col + in_width * (in_row + in_height * batch)); + in_backprop[in_backprop_offset] = sum; + } +} + +template +__global__ void __launch_bounds__(640, 2) + DepthwiseConv2dBackpropInputGPUKernelNCHW( + const DepthwiseArgs args, const T* __restrict__ out_backprop, + const T* __restrict__ filter, T* __restrict__ in_backprop, + int num_in_backprop) { + const FastDividerUint32 in_height = args.in_rows; + const FastDividerUint32 in_width = args.in_cols; + const FastDividerUint32 in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; + const int stride = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_height = args.out_rows; + const int out_width = args.out_cols; + const int out_depth = args.out_depth; + + // TODO(vrv): Consider assigning threads to output and using + // atomics for accumulation, similar to the filter case. + GPU_1D_KERNEL_LOOP(thread_id, num_in_backprop) { + // Compute the indexes of this thread in the input. 
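+ // thread_id enumerates in_backprop in NCHW order, i.e.
+ // ((batch * in_depth + in_channel) * in_height + in_row) * in_width + in_col,
+ // so the mod/div chain below peels off column, row, channel and batch in
+ // turn. in_width, in_height and in_depth are FastDividerUint32 values, so
+ // the repeated / and % by these runtime-constant sizes avoid slow hardware
+ // integer division.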
+ const int in_col = thread_id % in_width; + const int in_row = (thread_id / in_width) % in_height; + const int in_channel = (thread_id / in_width / in_height) % in_depth; + const int batch = thread_id / in_depth / in_width / in_height; + + T sum = static_cast(0); + const int out_channel_start = in_channel * depth_multiplier; + const int out_channel_end = out_channel_start + depth_multiplier; + + const int out_row_start = + tf_max(0, (in_row - filter_height + pad_height + stride) / stride); + const int out_row_end = + tf_min(out_height - 1, (in_row + pad_height) / stride); + const int out_col_start = + tf_max(0, (in_col - filter_width + pad_width + stride) / stride); + const int out_col_end = + tf_min(out_width - 1, (in_col + pad_width) / stride); + + UNROLL for (int out_channel = out_channel_start; + out_channel < out_channel_end; ++out_channel) { + UNROLL for (int out_row = out_row_start; out_row <= out_row_end; + ++out_row) { + const int filter_row = in_row + pad_height - out_row * stride; + const int filter_dm = out_channel - out_channel_start; + + const int temp_filter_offset = filter_width * filter_row; + for (int out_col = out_col_start; out_col <= out_col_end; ++out_col) { + const int filter_col = in_col + pad_width - out_col * stride; + const int filter_offset = + filter_dm + + args.depth_multiplier * + (in_channel + in_depth * (filter_col + temp_filter_offset)); + + const int out_backprop_offset = + (batch * out_depth * out_height * out_width) + + (out_channel * out_height * out_width) + (out_row * out_width) + + (out_col); + + sum += ldg(out_backprop + out_backprop_offset) * + ldg(filter + filter_offset); + } + } + } + const int in_backprop_offset = (batch * in_height * in_width * in_depth) + + (in_channel * in_height * in_width) + + (in_row * in_width) + (in_col); + in_backprop[in_backprop_offset] = sum; + } +} + +template +Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx, + const DepthwiseArgs& args, + const T* out_backprop, + const T* filter, T* in_backprop, + TensorFormat data_format) { + void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int); + switch (data_format) { + case FORMAT_NHWC: + kernel = DepthwiseConv2dBackpropInputGPUKernelNHWC< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; + break; + case FORMAT_NCHW: + kernel = DepthwiseConv2dBackpropInputGPUKernelNCHW< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; + break; + default: + return errors::InvalidArgument("FORMAT_", ToString(data_format), + " is not supported"); + } + const int num_in_backprop = + args.batch * args.in_rows * args.in_cols * args.in_depth; + auto device = ctx->eigen_gpu_device(); + int launch_bounds_value = 640; + GpuLaunchConfig config = GetGpuLaunchConfig(num_in_backprop, device, kernel, + 0, launch_bounds_value); + TF_CHECK_OK(GpuLaunchKernel( + kernel, config.block_count, config.thread_per_block, 0, device.stream(), + args, out_backprop, filter, in_backprop, num_in_backprop)); + return OkStatus(); +} + +template +Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx, + const DepthwiseArgs& args, + const T* out_backprop, + const T* filter, T* in_backprop, + TensorFormat data_format) { + if (args.depth_multiplier == 1) { + // This kernel doesn't currently work in all cases so it is disabled. + // TODO(b/150988950): Fix and reenable this kernel. 
+ if (/* CanLaunchDepthwiseConv2dGPUSmall(args) */ false) { + return LaunchDepthwiseConv2dGPUSmall< + T, DIRECTION_BACKWARD, kKnownFilterWidth, kKnownFilterHeight>( + ctx, args, out_backprop, filter, in_backprop, data_format); + } + + return LaunchDepthwiseConv2dBackpropInputGPU( + ctx, args, out_backprop, filter, in_backprop, data_format); + } else { + return LaunchDepthwiseConv2dBackpropInputGPU( + ctx, args, out_backprop, filter, in_backprop, data_format); + } +} + +// A simple launch pad to launch the GPU kernel for depthwise convolution. +template +void LaunchDepthwiseConvBackpropInputOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* filter, T* in_backprop, TensorFormat data_format) { + if (args.filter_rows == 3 && args.filter_cols == 3) { + OP_REQUIRES_OK( + ctx, LaunchDepthwiseConv2dBackpropInputGPU( + ctx, args, out_backprop, filter, in_backprop, data_format)); + } else { + OP_REQUIRES_OK( + ctx, LaunchDepthwiseConv2dBackpropInputGPU( + ctx, args, out_backprop, filter, in_backprop, data_format)); + } +} + +// A GPU kernel to compute the depthwise convolution backprop w.r.t. filter. +// TODO: Add fp32 accumulation to half calls of this function. This addition +// is non-trivial as the partial sums are added directly to the output +template +__global__ void __launch_bounds__(640, 2) + DepthwiseConv2dBackpropFilterGPUKernelNHWC( + const DepthwiseArgs args, const T* __restrict__ out_backprop, + const T* __restrict__ input, T* __restrict__ filter_backprop, + int num_out_backprop) { + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; + const int stride = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_height = args.out_rows; + const int out_width = args.out_cols; + const int out_depth = args.out_depth; + + GPU_1D_KERNEL_LOOP(thread_id, num_out_backprop) { + // Compute the indexes of this thread in the output. + const int out_channel = thread_id % out_depth; + const int out_col = (thread_id / out_depth) % out_width; + const int out_row = (thread_id / out_depth / out_width) % out_height; + const int batch = thread_id / out_depth / out_width / out_height; + // Compute the input depth and the index of depth multiplier. + const int in_channel = out_channel / depth_multiplier; + const int dm = out_channel % depth_multiplier; + + // Decide if all input is valid, if yes, we can skip the boundary checks + // for each input. + const int in_row_start = out_row * stride - pad_height; + const int in_col_start = out_col * stride - pad_width; + const int in_row_end = in_row_start + filter_height; + const int in_col_end = in_col_start + filter_width; + + const int out_backprop_offset = + out_channel + + out_depth * (out_col + out_width * (out_row + out_height * batch)); + const T out_bp = ldg(out_backprop + out_backprop_offset); + if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height && + in_col_end < in_width) { + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = in_row_start + filter_row; + // Avoid repeated computation. 
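+ // input_offset_temp folds the batch and row terms of the NHWC input index,
+ // leaving only the column and channel terms to be added per filter column.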
+ const int input_offset_temp = in_width * (in_row + in_height * batch); + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = in_col_start + filter_col; + + const int input_offset = + in_channel + in_depth * (in_col + input_offset_temp); + T partial_sum = ldg(input + input_offset) * out_bp; + T* addr = + filter_backprop + + (dm + depth_multiplier * + (in_channel + + in_depth * (filter_col + filter_width * filter_row))); + GpuAtomicAdd(addr, partial_sum); + } + } + } else { + UNROLL for (int filter_row = 0; filter_row < filter_height; + ++filter_row) { + const int in_row = in_row_start + filter_row; + // Avoid repeated computation. + const int input_offset_temp = in_width * (in_row + in_height * batch); + UNROLL for (int filter_col = 0; filter_col < filter_width; + ++filter_col) { + const int in_col = in_col_start + filter_col; + const int addr_temp = filter_width * filter_row; + + if (in_row >= 0 && in_row < in_height && in_col >= 0 && + in_col < in_width) { + const int input_offset = + in_channel + in_depth * (in_col + input_offset_temp); + T partial_sum = ldg(input + input_offset) * out_bp; + T* addr = + filter_backprop + + (dm + depth_multiplier * + (in_channel + in_depth * (filter_col + addr_temp))); + // Potentially many threads can add to the same address so we have + // to use atomic add here. + // TODO(jmchen): If atomic add turns out to be slow, we can: + // 1. allocate multiple buffers for the gradients (one for each + // example in a batch, for example). This can reduce the + // contention on the destination; 2. Have each thread compute one + // gradient for an element in the filters. This should work well + // when the input depth is big and filter size is not too small. + GpuAtomicAdd(addr, partial_sum); + } + } + } + } + } +} + +// Device function to compute sub-warp sum reduction for a power-of-two group of +// neighboring threads. +template +__device__ __forceinline__ T WarpSumReduce(T val) { + // support only power-of-two widths. + assert(__popc(kWidth) == 1); + int sub_warp = GpuLaneId() / kWidth; + int zeros = sub_warp * kWidth; + unsigned mask = ((1UL << kWidth) - 1) << zeros; + for (int delta = kWidth / 2; delta > 0; delta /= 2) { + val += GpuShuffleXorSync(mask, val, delta); + } + return val; +} + +// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in +// NHWC format, tailored for small images up to 32x32. Stride and depth +// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if +// CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input tensor are loaded into shared memory before performing the +// convolution. Per iteration and filter element, each thread first performs +// a partial convolution for two elements, one each in the lower and upper half +// of a tile. The intermediate result of all pixels of a warp are then +// accumulated and written to shared memory. Finally, the values in shared +// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed +// up in global memory using atomics. +// Requirements: threads per block must be multiple of 32 and <= launch_bounds, +// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth. +// T is the tensors' data type. S is the math type the kernel uses. This is the +// same as T for all cases but pseudo half (which has T=Eigen::half, S=float). 
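+// For illustration (values not taken from this file): a 16x16 input with
+// kBlockDepth = 2 gives in_rows * in_cols * kBlockDepth = 512, so the
+// requirement above forces kAccumPixels >= 512 / 64 = 8; the launcher further
+// below picks the smallest power of two that satisfies this bound.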
+template +__global__ +__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( + const DepthwiseArgs args, const T* __restrict__ output, + const T* __restrict__ input, T* __restrict__ filter) { + typedef typename detail::PseudoHalfType::Type S; + assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); + // Holds block plus halo and filter data for blockDim.x depths. + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + S* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = args.batch; + const int in_height = args.in_rows; + const int in_width = blockDim.y; // slower (see b/62280718): args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + + assert(blockDim.x == kBlockDepth); + assert(blockDim.y == args.in_cols); + const int block_height = blockDim.z; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_size = block_height * in_width * kBlockDepth; + assert((block_size & 31) == 0); + const int in_row_size = in_width * in_depth; + const int in_size = in_height * in_row_size; + const int in_increment = (in_width - 1) * kBlockDepth; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int tile_height = 2 * block_height + filter_height - 1; + const int tile_row_size = tile_width * kBlockDepth; + const int tile_size = tile_height * tile_row_size; + const int tile_offset = block_height * tile_row_size; + const int pad_offset = pad_height * tile_width + pad_width; + const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth; + const int in_blocks = batch_blocks * num_batches; + const int tensor_offset = block_height * in_row_size; + // The accumulator has a fixed number of pixels that can be reduced by one + // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written. + assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth); + const int accum_increment = kAccumPixels * kBlockDepth; + const int accum_size = filter_pixels * accum_increment; + + const int thread_depth = threadIdx.x; + const int thread_col = threadIdx.y; + const int thread_row = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_pix * kBlockDepth + thread_depth; + + // Initialize tile, in particular the padding and accumulator. + for (int i = thread_idx; i < tile_size + accum_size; i += block_size) { + shared_data[i] = S(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_pix * in_depth + thread_depth; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = data_pix * kBlockDepth + thread_depth; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_pix = data_pix + pad_offset; + const int tile_idx = tile_pix * kBlockDepth + thread_depth; + + // Position in accumulator (kBlockDepth per warp, depth major). 
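+ // Threads are laid out depth-innermost here
+ // (thread_idx = thread_pix * kBlockDepth + thread_depth), so one warp spans
+ // 32 / kBlockDepth consecutive pixels across all kBlockDepth depths.
+ // accum_pix names that per-warp pixel group, and accum_idx addresses the
+ // accumulator depth-major, giving each (depth, pixel group) pair one slot
+ // per filter element.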
+ const int accum_pix = thread_pix / (32 / kBlockDepth); + const int accum_idx = thread_depth * kAccumPixels + accum_pix; + + const int max_channel = in_depth - thread_depth; + const int accum_offset = tile_size + accum_idx; + const bool skip_second = block_height + thread_row >= in_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int batch = b / batch_blocks; + const int block = b - batch * batch_blocks; + + const int start_channel = block * kBlockDepth; + const int filter_offset = tensor_idx + start_channel; + const int inout_offset = batch * in_size + filter_offset; + const bool channel_in_range = start_channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + S* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = static_cast(ldg(in_ptr)); + if (!skip_second) { + tile_ptr[tile_offset] = static_cast(ldg(tensor_offset + in_ptr)); + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + unsigned active_threads = GpuBallotSync(kCudaWarpAll, channel_in_range); + + if (channel_in_range) { + const T* const out_ptr = inout_offset + output; + const S out1 = static_cast(ldg(out_ptr)); + const S out2 = + skip_second ? S() : static_cast(ldg(tensor_offset + out_ptr)); + int shared_offset = data_idx; + S* accum_ptr = accum_offset + shared_data; + UNROLL for (int r = 0; r < filter_height; ++r) { + UNROLL for (int c = 0; c < filter_width; ++c) { + const S* const tile_ptr = shared_offset + shared_data; + S val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; + // Warp-accumulate pixels of the same depth and write to accumulator. + for (int delta = 16; delta >= kBlockDepth; delta /= 2) { + val += GpuShuffleXorSync(active_threads, val, delta); + } + if (!(thread_idx & 32 - kBlockDepth) /* lane_idx < kBlockDepth */) { + *accum_ptr = val; + } + shared_offset += kBlockDepth; + accum_ptr += accum_increment; + } + shared_offset += in_increment; + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + + const S* const accum_data = tile_size + shared_data; + for (int i = thread_idx; i < accum_size; i += block_size) { + const int filter_idx = i / kAccumPixels; + const int filter_pix = filter_idx / kBlockDepth; + const int filter_channel = filter_idx % kBlockDepth + start_channel; + const int filter_offset = filter_pix * in_depth + filter_channel; + if (filter_channel < in_depth) { + S val = accum_data[i]; + // Warp-accumulate the pixels of the same depth from the accumulator. + val = WarpSumReduce(val); + if (!(thread_idx & kAccumPixels - 1)) { + GpuAtomicAdd(filter_offset + filter, static_cast(val)); + } + } + } + } +} + +// A GPU kernel to compute the depthwise convolution backprop w.r.t. filter. 
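+// This NCHW variant launches one 512-thread block per
+// (filter_col, filter_row, out_channel) triple (see the
+// dim3(filter_cols, filter_rows, out_depth) grid in the launcher below). The
+// block strides over every output position of every batch, accumulates a
+// per-thread partial sum, reduces it across the warp with gpuprim::WarpReduce,
+// and has lane 0 of each warp issue a single GpuAtomicAdd into
+// filter_backprop.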
+template +__global__ void __launch_bounds__(512, 2) + DepthwiseConv2dBackpropFilterGPUKernelNCHW( + const DepthwiseArgs args, const T* __restrict__ out_backprop, + const T* __restrict__ input, T* __restrict__ filter_backprop) { + const int batch_num = args.batch; + const int in_depth = args.in_depth; + const int in_height = args.in_rows; + const int in_width = args.in_cols; + const int filter_width = args.filter_cols; + const int stride_height = args.stride; + const int stride_width = args.stride; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + const int out_depth = args.out_depth; + const int out_height = args.out_rows; + const FastDividerUint32 out_width = args.out_cols; + const FastDividerUint32 depth_multiplier = args.depth_multiplier; + assert(gridDim.x == filter_width); + assert(gridDim.z == out_depth); + + typedef gpuprim::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + + const int filter_w = blockIdx.x; + const int filter_h = blockIdx.y; + const int out_c = blockIdx.z; + + const int in_c = out_c / depth_multiplier; + const int dm = out_c % depth_multiplier; + const int filter_backprop_offset = + (((filter_h * filter_width) + filter_w) * in_depth + in_c) * + depth_multiplier + + dm; + const int out_spatial_size = out_height * out_width; + + T partial_sum = static_cast(0.f); + for (int batch = 0; batch < batch_num; batch++) { + const int input_offset_temp = (batch * in_depth + in_c) * in_height; + const int output_backprop_offset_temp = + (batch * out_depth + out_c) * out_height; + for (int i = threadIdx.x; i < out_spatial_size; i += blockDim.x) { + const int out_col = i % out_width; + const int out_row = i / out_width; + // We use the formula: `(in_row - filter_w + pad_left ) / stride = + // out_row` to compute corresponding in_row and out_row positions. Similar + // for in_col and out_col. + const int in_row = out_row * stride_height + filter_h - pad_height; + const int in_col = out_col * stride_width + filter_w - pad_width; + + if (in_row < 0 || in_col < 0 || in_row >= in_height || + in_col >= in_width) { + continue; + } + + int input_offset = (input_offset_temp + in_row) * in_width + in_col; + int output_backprop_offset = + (output_backprop_offset_temp + out_row) * out_width + out_col; + partial_sum += out_backprop[output_backprop_offset] * input[input_offset]; + } + } + + T val = WarpReduce(temp_storage).Sum(partial_sum); + if (gpuprim::LaneId() == 0) { + T* addr = filter_backprop + filter_backprop_offset; + GpuAtomicAdd(addr, val); + } +} + +// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in +// NCHW format, tailored for small images up to 32x32. Stride and depth +// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if +// CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input tensor are loaded into shared memory before performing the +// convolution. Per iteration and filter element, each thread first performs +// a partial convolution for two elements, one each in the lower and upper half +// of a tile. The intermediate result of all pixels of a warp are then +// accumulated and written to shared memory. Finally, the values in shared +// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed +// up in global memory using atomics. +// Requirements: threads per block must be multiple of 32 and <= launch_bounds, +// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth. 
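+// Compared to the NHWC small kernel above, the block layout is transposed:
+// depth lives in threadIdx.z and the image column in threadIdx.x (the launcher
+// builds dim3(in_cols, block_height, kBlockDepth) rather than
+// dim3(kBlockDepth, in_cols, block_height)), and the shared-memory tile is one
+// contiguous plane per depth slice instead of interleaving the depths within
+// each pixel.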
+template +__global__ +__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( + const DepthwiseArgs args, const T* __restrict__ output, + const T* __restrict__ input, T* __restrict__ filter) { + typedef typename detail::PseudoHalfType::Type S; + assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x)); + // Holds block plus halo and filter data for blockDim.z depths. + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + S* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = args.batch; + const int in_height = args.in_rows; + const int in_width = blockDim.x; // slower (see b/62280718): args.in_cols; + const int in_depth = args.in_depth; + const int filter_height = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_width = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int pad_height = args.pad_rows; + const int pad_width = args.pad_cols; + + assert(blockDim.x == args.in_cols); + assert(blockDim.z == kBlockDepth); + const int block_height = blockDim.y; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_pixels = in_width * block_height; + const int block_size = block_pixels * kBlockDepth; + assert((block_size & 31) == 0); + const int in_pixels = in_width * in_height; + const int in_increment = in_width - 1; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int tile_height = 2 * block_height + filter_height - 1; + const int tile_pixels = tile_width * tile_height; + const int tile_size = tile_pixels * kBlockDepth; + const int tile_offset = block_height * tile_width; + const int pad_offset = pad_height * tile_width + pad_width; + const int in_total_depth = in_depth * num_batches; + const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; + // The accumulator has a fixed number of pixels that can be reduced by one + // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written. + assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth); + const int accum_increment = kAccumPixels * kBlockDepth; + const int accum_size = filter_pixels * accum_increment; + + const int thread_col = threadIdx.x; + const int thread_row = threadIdx.y; + const int thread_depth = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_depth * block_pixels + thread_pix; + + // Initialize tile, in particular the padding and accumulator. + for (int i = thread_idx; i < tile_size + accum_size; i += block_size) { + shared_data[i] = S(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_depth * in_pixels + thread_pix; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = thread_depth * tile_pixels + data_pix; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_idx = data_idx + pad_offset; + + // Position in accumulator (kBlockDepth per warp, depth major). 
+ const int accum_pix = thread_pix / (32 / kBlockDepth); + const int accum_idx = thread_depth * kAccumPixels + accum_pix; + + const int max_channel = in_total_depth - thread_depth; + const int accum_offset = tile_size + accum_idx; + const bool skip_second = block_height + thread_row >= in_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int channel = b * kBlockDepth; + + const int inout_offset = channel * in_pixels + tensor_idx; + const bool channel_in_range = channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + S* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = static_cast(ldg(in_ptr)); + if (!skip_second) { + tile_ptr[tile_offset] = static_cast(ldg(block_pixels + in_ptr)); + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + unsigned active_threads = GpuBallotSync(kCudaWarpAll, channel_in_range); + + if (channel_in_range) { + const T* const out_ptr = inout_offset + output; + const S out1 = static_cast(ldg(out_ptr)); + const S out2 = + skip_second ? S() : static_cast(ldg(block_pixels + out_ptr)); + int shared_offset = data_idx; + S* accum_ptr = accum_offset + shared_data; + UNROLL for (int r = 0; r < filter_height; ++r) { + UNROLL for (int c = 0; c < filter_width; ++c) { + const S* const tile_ptr = shared_offset + shared_data; + S val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; + // Warp-accumulate pixels of the same depth and write to accumulator. + for (int delta = 16 / kBlockDepth; delta > 0; delta /= 2) { + val += GpuShuffleXorSync(active_threads, val, delta); + } + if (!(thread_idx & 32 / kBlockDepth - 1)) { + *accum_ptr = val; // kBlockDepth threads per warp. + } + ++shared_offset; + accum_ptr += accum_increment; + } + shared_offset += in_increment; + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + + const S* const accum_data = tile_size + shared_data; + for (int i = thread_idx; i < accum_size; i += block_size) { + const int filter_idx = i / kAccumPixels; + const int filter_pix = filter_idx / kBlockDepth; + const int filter_channel = + (channel + filter_idx % kBlockDepth) % in_depth; + const int filter_offset = filter_pix * in_depth + filter_channel; + if (filter_channel < in_depth) { + S val = accum_data[i]; + // Warp-accumulate pixels of the same depth from the accumulator. 
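+ // WarpSumReduce performs a butterfly shuffle reduction over each aligned
+ // group of kAccumPixels lanes, so afterwards every lane of a group holds
+ // the group's total; the mask test below then lets only the first lane of
+ // each group perform the GpuAtomicAdd into the filter gradient.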
+ val = WarpSumReduce(val); + if (!(thread_idx & kAccumPixels - 1)) { + GpuAtomicAdd(filter_offset + filter, static_cast(val)); + } + } + } + } +} + +template +Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( + OpKernelContext* ctx, const DepthwiseArgs& args, const int block_height, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format) { + typedef typename detail::PseudoHalfType::Type S; + auto device = ctx->eigen_gpu_device(); + const int tile_width = args.in_cols + args.filter_cols - 1; + const int tile_height = block_height * 2 + args.filter_rows - 1; + const int tile_pixels = tile_height * tile_width; + const int filter_pixels = args.filter_rows * args.filter_cols; + const int shared_memory_size = + kBlockDepth * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(S); + if (shared_memory_size > device.sharedMemPerBlock()) { + return errors::FailedPrecondition("Not enough shared memory"); + } + + dim3 block_dim; + int block_count; + void (*kernel)(const DepthwiseArgs, const T*, const T*, T*); + switch (data_format) { + case FORMAT_NHWC: + block_dim = dim3(kBlockDepth, args.in_cols, block_height); + block_count = + args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth; + kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall< + T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>; + break; + case FORMAT_NCHW: + block_dim = dim3(args.in_cols, block_height, kBlockDepth); + block_count = + DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth; + kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall< + T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>; + break; + default: + return errors::InvalidArgument("FORMAT_", ToString(data_format), + " is not supported"); + } + const int num_out_backprop = args.out_rows * args.out_cols * block_count; + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + num_out_backprop, device, kernel, shared_memory_size, + block_dim.x * block_dim.y * block_dim.z); + TF_CHECK_OK(GpuLaunchKernel(kernel, config.block_count, block_dim, + shared_memory_size, device.stream(), args, + out_backprop, input, filter_backprop)); + return OkStatus(); +} + +template +Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( + OpKernelContext* ctx, const DepthwiseArgs& args, const int block_height, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format) { + // Minimize (power of two) kAccumPixels, while satisfying + // kAccumPixels * 32 >= block_height * in_width * kBlockDepth. 
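+ // A block of block_pixels threads contains block_pixels / 32 warps, and each
+ // warp deposits one partial sum per depth per filter element, so kAccumPixels
+ // (the number of pixel-group slots per depth) must be at least
+ // block_pixels / 32. The 512- and 256-pixel thresholds below therefore
+ // select kAccumPixels of 32, 16 and 8 respectively.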
+ const int block_pixels = block_height * args.in_cols * kBlockDepth; + if (block_pixels > 512) { + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 32>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + } else if (block_pixels > 256) { + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 16>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + } else { + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 8>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + } +} + +template +Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + // Maximize (power of two) kBlockDepth while keeping a block within 1024 + // threads (2 pixels per thread). + int block_depth = 8; + int block_height = (args.in_rows + 1) / 2; + int round_mask = 1; + for (; block_depth > 1; block_depth /= 2) { + // args.in_cols * block_height * kBlockDepth must be multiple of 32. + for (; block_height * args.in_cols * block_depth & 31; + round_mask = round_mask * 2 + 1) { + block_height = block_height + round_mask & ~round_mask; + } + int block_size = block_height * args.in_cols * block_depth; + if (block_size <= 1024) { + break; + } + } + + if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_height)) { + return errors::FailedPrecondition("Cannot launch this configuration"); + } + + switch (block_depth) { + case 8: + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, 8>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + case 4: + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, 4>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + case 2: + return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< + T, kKnownFilterWidth, kKnownFilterHeight, 2>( + ctx, args, block_height, out_backprop, input, filter_backprop, + data_format); + default: + return errors::InvalidArgument("Unexpected block depth"); + } +} + +template +Status LaunchDepthwiseConv2dBackpropFilterGPU( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + auto device = ctx->eigen_gpu_device(); + const int num_out_backprop = + args.batch * args.out_rows * args.out_cols * args.out_depth; + if (data_format == FORMAT_NHWC) { + auto kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWC< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; + + int launch_bounds_value = 640; + GpuLaunchConfig config = GetGpuLaunchConfig(num_out_backprop, device, + kernel, 0, launch_bounds_value); + TF_CHECK_OK(GpuLaunchKernel( + kernel, config.block_count, config.thread_per_block, 0, device.stream(), + args, out_backprop, input, filter_backprop, num_out_backprop)); + } else if (data_format == FORMAT_NCHW) { + auto kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHW; + dim3 blocks = dim3(args.filter_cols, args.filter_rows, args.out_depth); + dim3 threads = dim3(512, 1, 1); + + TF_CHECK_OK(GpuLaunchKernel(kernel, blocks, threads, 0, device.stream(), + args, out_backprop, input, filter_backprop)); + } else { + 
return errors::InvalidArgument("FORMAT_", ToString(data_format), + " is not supported"); + } + + return OkStatus(); +} + +template +Status LaunchDepthwiseConv2dBackpropFilterGPU( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + if (args.depth_multiplier == 1) { + if (TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( + ctx, args, out_backprop, input, filter_backprop, data_format) + .ok()) { + return OkStatus(); + } + + return LaunchDepthwiseConv2dBackpropFilterGPU( + ctx, args, out_backprop, input, filter_backprop, data_format); + } else { + return LaunchDepthwiseConv2dBackpropFilterGPU( + ctx, args, out_backprop, input, filter_backprop, data_format); + } +} + +// A simple launch pad to launch the GPU kernel for depthwise convolution. +template +void LaunchDepthwiseConvBackpropFilterOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + auto stream = ctx->op_device_context()->stream(); + + // It's simpler to catch this here than in + // DepthwiseConv2dNativeBackpropFilterOp + OP_REQUIRES( + ctx, !OpDeterminismRequired(), + errors::Unimplemented( + "A deterministic GPU implementation of DepthwiseConvBackpropFilter is" + " not available with this version of cuDNN. Please build with cuDNN" + " version 7.6.3 or later.")); + + // Initialize the results to 0. + int num_filter_backprop = + args.filter_rows * args.filter_cols * args.out_depth; + se::DeviceMemoryBase filter_bp_ptr(filter_backprop, num_filter_backprop); + OP_REQUIRES_OK( + ctx, stream->MemZero(&filter_bp_ptr, num_filter_backprop * sizeof(T))); + + if (args.filter_rows == 3 && args.filter_cols == 3) { + OP_REQUIRES_OK( + ctx, LaunchDepthwiseConv2dBackpropFilterGPU( + ctx, args, out_backprop, input, filter_backprop, data_format)); + } else { + OP_REQUIRES_OK( + ctx, LaunchDepthwiseConv2dBackpropFilterGPU( + ctx, args, out_backprop, input, filter_backprop, data_format)); + } +} +} // namespace tensorflow +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/diag_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/diag_op.h new file mode 100644 index 00000000..c41da62d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/diag_op.h @@ -0,0 +1,43 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DIAG_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DIAG_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct DiagFunctor { + absl::Status operator()(OpKernelContext* context, const int64_t size, + const T* in, T* out); +}; + +template +struct DiagPartFunctor { + absl::Status operator()(OpKernelContext* context, const int64_t size, + const T* in, T* out); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DIAG_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/dilation_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/dilation_ops.h new file mode 100644 index 00000000..4f0b944a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/dilation_ops.h @@ -0,0 +1,66 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DILATION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DILATION_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct Dilation { + // We assume that the tensor sizes are correct. + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, int stride_rows, + int stride_cols, int rate_rows, int rate_cols, int pad_top, + int pad_left, typename TTypes::Tensor output); +}; + +template +struct DilationBackpropInput { + // We assume that the tensor sizes are correct. + // To avoid storing the argmax values during forward computation, we recompute + // the argmax during backward computation, which is the reason why we provide + // filter as argument to the backward computation routine. + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor out_backprop, + int stride_rows, int stride_cols, int rate_rows, + int rate_cols, int pad_top, int pad_left, + typename TTypes::Tensor in_backprop); +}; + +template +struct DilationBackpropFilter { + // We assume that the tensor sizes are correct. + // To avoid storing the argmax values during forward computation, we recompute + // the argmax during backward computation, which is the reason why we provide + // filter as argument to the backward computation routine. 
+ void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstTensor filter, + typename TTypes::ConstTensor out_backprop, + int stride_rows, int stride_cols, int rate_rows, + int rate_cols, int pad_top, int pad_left, + typename TTypes::Tensor filter_backprop); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DILATION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_activations.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_activations.h new file mode 100644 index 00000000..8224627f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_activations.h @@ -0,0 +1,122 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +namespace Eigen { + +/** scalar_sigmoid_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ +template +struct scalar_sigmoid_fast_derivative_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return (one - y) * y; + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::pmul(internal::psub(one, y), y); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** scalar_tanh_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ +template +struct scalar_tanh_fast_derivative_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return one - (y * y); + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::psub(one, internal::pmul(y, y)); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 1, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. 
+ * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ +template +struct scalar_clip_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::mini(numext::maxi(a, -b), b); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& a, const Packet& b) const { + return internal::pmin(internal::pmax(a, internal::pnegate(b)), b); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 3, + PacketAccess = packet_traits::HasMax && + packet_traits::HasMin && + packet_traits::HasNegate + }; +}; +} // namespace internal + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_attention.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_attention.h new file mode 100644 index 00000000..7eec12bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_attention.h @@ -0,0 +1,300 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +namespace Eigen { + +// Noise mode used when padding. +enum ExtractGlimpsesNoiseMode { + UNIFORM = 0, + GAUSSIAN = 1, + ZERO = 2, +}; + +/** ExtractGlimpses + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 + * (depth, x, y, and batch). The width and height parameters specify the + * extension of the returned glimpses. The offsets parameter specifies the x, y + * locations of the center of the glimpses relative to the center of the input + * image. The vector is expected to contain one IndexPair for each image in the + * batch dimension. The normalized boolean indicates if incoming coordinates are + * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each + * height and width dimension. The centered boolean indicates if incoming + * coordinates are centered relative to the image, in which case -1.0 and 1.0 + * correspond to minimum and maximum of each dimension while 0.0 corresponds to + * the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. + * The result will be laid out in col-major order (depth, x, y, batch). The + * dimensions of the result will be equal to the dimensions of the input except + * for width and height which will be equal to the requested glimpse size. 
+ */ +namespace { + +template +struct GlimpseExtractionOp { + GlimpseExtractionOp(const Index width, const Index height, + const std::vector >& offsets, + const bool normalized, const bool centered, + const ExtractGlimpsesNoiseMode noise, const int version) + : width_(width), + height_(height), + offsets_(offsets), + normalized_(normalized), + centered_(centered), + noise_(noise), + version_(version) {} + + template + DSizes dimensions(const Input& input) const { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > + Ref; + Ref in(input); + + DSizes dims = in.dimensions(); + + dims[0] = in.dimension(0); + dims[1] = width_; + dims[2] = height_; + dims[3] = in.dimension(3); + return dims; + } + + template + EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output, + const Device& device) const { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > + Ref; + Ref in(input); + const Index num_channels = in.dimension(0); + const Index input_width = in.dimension(1); + const Index input_height = in.dimension(2); + const Index batch_size = in.dimension(3); + eigen_assert(input_width > 0); + eigen_assert(input_height > 0); + internal::NormalRandomGenerator gen; + internal::UniformRandomGenerator unigen; + + for (Index i = 0; i < batch_size; ++i) { + float x = offsets_[i].first, y = offsets_[i].second; + + if (version_ == 1) { + // Un-normalize coordinates back to pixel space if normalized. + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. + if (centered_) { + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + } + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; + } else { + if (normalized_) { + // Un-normalize coordinates back to pixel space if normalized. + x *= input_width; + y *= input_height; + if (centered_) { + // Un-center if coordinates are centered on the image center. + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + // Remove half of the glimpse window. 
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + } + } else { + if (centered_) { + x += input_width / 2.0f; + y += input_height / 2.0f; + } + } + } + + const Index offset_x = (Index)x; + const Index offset_y = (Index)y; + Index glimpse_width = width_; + Index glimpse_height = height_; + bool partial_overlap = false; + DSizes slice_offset(0, offset_x, offset_y); + DSizes slice_extent(num_channels, width_, height_); + DSizes base_offset(0, 0, 0); + + if (offset_x < 0) { + slice_offset[1] = 0; + glimpse_width = (std::max)(0, width_ + offset_x); + slice_extent[1] = glimpse_width; + base_offset[1] = width_ - glimpse_width; + partial_overlap = true; + } else if (offset_x + width_ >= input_width) { + glimpse_width = (std::max)(0, input_width - offset_x); + slice_extent[1] = glimpse_width; + partial_overlap = true; + } + if (offset_y < 0) { + slice_offset[2] = 0; + glimpse_height = (std::max)(0, height_ + offset_y); + slice_extent[2] = glimpse_height; + base_offset[2] = height_ - glimpse_height; + partial_overlap = true; + } else if (offset_y + height_ >= input_height) { + glimpse_height = (std::max)(0, input_height - offset_y); + slice_extent[2] = glimpse_height; + partial_overlap = true; + } + slice_extent[1] = std::min(input_width, slice_extent[1]); + slice_extent[2] = std::min(input_height, slice_extent[2]); + + if (partial_overlap) { + switch (noise_) { + case ZERO: { + // Initialize the glimpse with zero noise. + output.template chip<3>(i).device(device) = + output.template chip<3>(i).constant(0); + } break; + case UNIFORM: { + // Initialize the glimpse with uniform noise. + typedef std::remove_const_t< + typename internal::traits::Scalar> + Scalar; + TensorFixedSize > mini; + mini.device(device) = input.template chip<3>(i).minimum(); + TensorFixedSize > range; + range.device(device) = (input.template chip<3>(i).maximum() - mini) + .template cast(); + + DSizes glimpse_size(num_channels, width_, height_); + TensorMap > tmp(nullptr, glimpse_size); + output.template chip<3>(i).device(device) = + mini.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size) + + (tmp.random(unigen) * + range.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size)) + .template cast(); + } break; + case GAUSSIAN: { + // Initialize the glimpse with white noise: compute the mean and + // sigma + // of each channel, and use them to shape the gaussian. 
+ DSizes glimpse_size(width_, height_); + DSizes input_size(input_width, input_height); + typedef std::remove_const_t< + typename internal::traits::Scalar> + Scalar; + + for (int j = 0; j < num_channels; ++j) { + TensorFixedSize > mean; + mean.device(device) = input.template chip<3>(i) + .template chip<0>(j) + .template cast() + .mean(); + TensorFixedSize > sigma; + sigma.device(device) = + (input.template chip<3>(i) + .template chip<0>(j) + .template cast() - + mean.reshape(Sizes<1, 1>()).broadcast(input_size)) + .square() + .mean() + .sqrt(); + TensorFixedSize > mini; + mini.device(device) = + input.template chip<3>(i).template chip<0>(j).minimum(); + TensorFixedSize > maxi; + maxi.device(device) = + input.template chip<3>(i).template chip<0>(j).maximum(); + + TensorMap > tmp(nullptr, glimpse_size); + output.template chip<3>(i).template chip<0>(j).device(device) = + (mean.reshape(Sizes<1, 1>()).broadcast(glimpse_size) + + (tmp.random(gen) * + sigma.reshape(Sizes<1, 1>()).broadcast(glimpse_size)) + .template cast()) + .cwiseMin( + maxi.reshape(Sizes<1, 1>()).broadcast(glimpse_size)) + .cwiseMax( + mini.reshape(Sizes<1, 1>()).broadcast(glimpse_size)); + } + } break; + } + + // Copy the part of the glimpse that cover the input image if any. + if (glimpse_width == 0 || glimpse_height == 0) { + continue; + } + output.template chip<3>(i) + .slice(base_offset, slice_extent) + .device(device) = + input.template chip<3>(i).slice(slice_offset, slice_extent); + } else { + output.template chip<3>(i).device(device) = + input.template chip<3>(i).slice(slice_offset, slice_extent); + } + } + } + + private: + const Index width_; + const Index height_; + const std::vector > offsets_; + const bool normalized_; + const bool centered_; + const ExtractGlimpsesNoiseMode noise_; + const int version_; +}; +} // namespace + +template +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp< + const GlimpseExtractionOp::Index>, + const Input> +ExtractGlimpses( + const Input& input, const typename internal::traits::Index width, + const typename internal::traits::Index height, + const std::vector >& offsets, const bool normalized = true, + const bool centered = true, + const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM, + const int version = 2) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index Index; + const GlimpseExtractionOp op(width, height, offsets, normalized, + centered, noise, version); + return input.customOp(op); +} + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h new file mode 100644 index 00000000..4ef1b924 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h @@ -0,0 +1,610 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/eigen_cuboid_convolution.h" + +namespace Eigen { + +/** CuboidConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or + * more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 5D tensor (filters, channels, + * kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and + * others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions + * provided that the same order is used in the input, the kernel, and the + * output. + * + * All dimension orders above are given for col-major, and should be reversed + * for row-major. + */ + +template +EIGEN_ALWAYS_INLINE static const std::conditional_t< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const Eigen::TensorForcedEvalOp::Index, + 2>, + const TensorShufflingOp< + const array< + typename internal::traits::Index, 5>, + const TensorReverseOp, + const Kernel>>>>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorVolumePatchOp>>>, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorVolumePatchOp>, + const Eigen::TensorForcedEvalOp::Index, + 2>, + const TensorShufflingOp< + const array< + typename internal::traits::Index, 5>, + const TensorReverseOp, + const Kernel>>>>>>> +CuboidConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits::Index inputPlanes, + typename internal::traits::Index inputRows, + typename internal::traits::Index inputCols, + const DenseIndex plane_stride = 1, const DenseIndex row_stride = 1, + const DenseIndex col_stride = 1) { + typedef typename internal::traits::Index TensorIndex; + const TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + kern(kernel); + const TensorRef< + const Tensor::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == + internal::traits::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = + (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. 
This is the same as the output depth of the + // result + const TensorIndex kernelFilters = + isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = + isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + const TensorIndex kernelPlanes = + isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = + isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = + isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + const TensorIndex outputPlanes = + isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2]; + const TensorIndex outputRows = + isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3]; + const TensorIndex outputCols = + isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4]; + + // TODO(ezhulenev): Add support for inflated strides. Without inflated strides + // effective kernel planes/rows/cols are always the same as the kernel itself + // (see eigen_spatial_convolutions for details). + const TensorIndex kernelPlanesEff = kernelPlanes; + const TensorIndex kernelRowsEff = kernelRows; + const TensorIndex kernelColsEff = kernelCols; + + // Computing the forward padding. + const TensorIndex forward_pad_top_z = numext::maxi( + 0, + ((outputPlanes - 1) * plane_stride + kernelPlanesEff - inputPlanes) / 2); + const TensorIndex forward_pad_top = numext::maxi( + 0, ((outputRows - 1) * row_stride + kernelRowsEff - inputRows) / 2); + const TensorIndex forward_pad_left = numext::maxi( + 0, ((outputCols - 1) * col_stride + kernelColsEff - inputCols) / 2); + + const TensorIndex padding_top_z = kernelPlanesEff - 1 - forward_pad_top_z; + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + + const TensorIndex padding_bottom_z = inputPlanes - + (outputPlanes - 1) * plane_stride - 2 - + padding_top_z + kernelPlanesEff; + const TensorIndex padding_bottom = inputRows - (outputRows - 1) * row_stride - + 2 - padding_top + kernelRowsEff; + const TensorIndex padding_right = inputCols - (outputCols - 1) * col_stride - + 2 - padding_left + kernelColsEff; + + eigen_assert(padding_top_z >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom_z >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions : + // filters x channels x patch_planes x patch_rows x patch_cols. + // We need to reverse the kernel along the spatial dimensions. 
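// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: the col-major
// reverse -> shuffle -> reshape chain applied to the kernel below, replayed on
// a standalone Eigen tensor so the bookkeeping is easy to follow. The helper
// name is invented for the example.
#include "unsupported/Eigen/CXX11/Tensor"

// Kernel layout (col-major): [filters, channels, planes, rows, cols].
inline Eigen::Tensor<float, 2> FlattenKernelForBackwardInput(
    const Eigen::Tensor<float, 5>& kernel) {
  const Eigen::array<bool, 5> reverse{{false, false, true, true, true}};
  const Eigen::array<int, 5> shuffle{{0, 2, 3, 4, 1}};  // -> [F, P, R, Q, C]
  const Eigen::Index F = kernel.dimension(0), C = kernel.dimension(1);
  const Eigen::Index P = kernel.dimension(2), R = kernel.dimension(3),
                     Q = kernel.dimension(4);
  // Collapse to the 2-D matrix that is contracted against the extracted
  // volume patches of output_backward.
  return kernel.reverse(reverse)
      .shuffle(shuffle)
      .reshape(Eigen::array<Eigen::Index, 2>{{F * P * R * Q, C}});
}
// ---------------------------------------------------------------------------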
+ Eigen::array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + // Reorder the dimensions to: + // filters x patch_planes x patch_rows x patch_cols x channels + array kernel_shuffle; + if (isColMajor) { + // From: filters x channels x planes x rows x cols + // To: filters x planes x rows x cols x channels + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 2; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 4; + kernel_shuffle[4] = 1; + } else { + // From: cols x rows x planes x channels x filters + // To: channels x cols x rows x planes x filters + kernel_shuffle[0] = 3; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 1; + kernel_shuffle[3] = 2; + kernel_shuffle[4] = 4; + } + + // Collapse the dims + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters * kernelPlanes * kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + } else { + kernel_dims[1] = kernelFilters * kernelPlanes * kernelRows * kernelCols; + kernel_dims[0] = kernelChannels; + } + + // The output_backward has dimensions out_depth X out_planes X out_rows X + // out_cols X OTHERS + // When we extract the image patches from output_backward, it will have + // dimensions: + // out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * + // input_rows * input_cols * OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = + kernelFilters * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[1] = inputPlanes * inputRows * inputCols; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= out.dimension(i); + } + } else { + pre_contract_dims[1] = + kernelFilters * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[0] = inputPlanes * inputRows * inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along the collapsed dimension that contains the + // kernelFilters, kernelPlanes, kernelRows and kernelCols. 
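// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: what a single
// IndexPair contraction means, on a tiny standalone example. The contract_dims
// set up below do the same job, naming which axis of the flattened kernel and
// of the patch matrix is summed away.
#include "unsupported/Eigen/CXX11/Tensor"

inline Eigen::Tensor<float, 2> IndexPairContractionExample() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
  a.setRandom();
  b.setRandom();
  // Contract a's dimension 1 against b's dimension 0: an ordinary matrix
  // product with a [2, 4] result.
  const Eigen::array<Eigen::IndexPair<int>, 1> dims = {
      Eigen::IndexPair<int>(1, 0)};
  return a.contract(b, dims);
}
// ---------------------------------------------------------------------------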
+ array, 1> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 1); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_planes X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputPlanes; + post_contract_dims[2] = inputRows; + post_contract_dims[3] = inputCols; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputPlanes; + post_contract_dims[NumDims - 3] = inputRows; + post_contract_dims[NumDims - 4] = inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .shuffle(kernel_shuffle) + .reshape(kernel_dims) + .eval() + .contract(output_backward + .extract_volume_patches( + kernelPlanes, kernelRows, kernelCols, 1, 1, 1, + plane_stride, row_stride, col_stride, padding_top_z, + padding_bottom_z, padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward + .extract_volume_patches(kernelPlanes, kernelRows, kernelCols, 1, 1, 1, + plane_stride, row_stride, col_stride, + padding_top_z, padding_bottom_z, padding_top, + padding_bottom, padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse) + .shuffle(kernel_shuffle) + .reshape(kernel_dims) + .eval(), + contract_dims) + .reshape(post_contract_dims)); +} + +/** CuboidConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or + * more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, + * kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and + * others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions + * provided that the same order is used in the input, the kernel, and the + * output. + * + * All dimension orders above are given for col-major, and should be reversed + * for row-major. 
+ */ +template +EIGEN_ALWAYS_INLINE static const std::conditional_t< + internal::traits::Layout == ColMajor, + const TensorReverseOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorShufflingOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const Eigen::TensorForcedEvalOp::Index, + 2>, + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const OutputBackward>>>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorVolumePatchOp< + Dynamic, Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const Input>>>>>>>>, + const TensorReverseOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorShufflingOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorVolumePatchOp< + Dynamic, Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const Input>>>>, + const Eigen::TensorForcedEvalOp::Index, + 2>, + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const OutputBackward>>>>>>>> +CuboidConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits::Index kernelPlanes, + typename internal::traits::Index kernelRows, + typename internal::traits::Index kernelCols, + const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + in(input); + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == + internal::traits::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == + internal::traits::NumDimensions, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // We do not support higher dimensional backward convolutions, or convolutions + // without batch dimension. + // TODO(ezhulenev): Relax this constraint, and turn on tests without batch + // dimension in eigen_backward_cuboid_convolutions_test.cc. + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputPlanes = + isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = + isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = + isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + const TensorIndex outputPlanes = + isColMajor ? 
out.dimension(1) : out.dimension(NumDims - 2); + const TensorIndex outputRows = + isColMajor ? out.dimension(2) : out.dimension(NumDims - 3); + const TensorIndex outputCols = + isColMajor ? out.dimension(3) : out.dimension(NumDims - 4); + + // Number of filters. This is the same as the output depth. + const TensorIndex kernelFilters = + isColMajor ? out.dimension(0) : out.dimension(NumDims - 1); + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = + isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + + // Number of batches in the input tensor. + const TensorIndex batch = + isColMajor ? in.dimension(4) : in.dimension(NumDims - 5); + + // TODO(ezhulenev): Add support for inflated strides. Without inflated strides + // effective kernel planes/rows/cols are always the same as the kernel itself + // (see eigen_spatial_convolutions for details). + const TensorIndex kernelPlanesEff = kernelPlanes; + const TensorIndex kernelRowsEff = kernelRows; + const TensorIndex kernelColsEff = kernelCols; + + // Compute forward padding from input and output_backward dimensions. + const TensorIndex padPlanes = numext::maxi( + 0, (outputPlanes - 1) * stridePlanes + kernelPlanesEff - inputPlanes); + const TensorIndex padRows = numext::maxi( + 0, (outputRows - 1) * strideRows + kernelRowsEff - inputRows); + const TensorIndex padCols = numext::maxi( + 0, (outputCols - 1) * strideCols + kernelColsEff - inputCols); + + const TensorIndex padding_top_z = padPlanes / 2; + const TensorIndex padding_top = padRows / 2; + const TensorIndex padding_left = padCols / 2; + + // Compute paddings for output_backward before extracting patches. + const auto expanded_out_planes = (outputPlanes - 1) * stridePlanes + 1; + const auto expanded_out_rows = (outputRows - 1) * strideRows + 1; + const auto expanded_out_cols = (outputCols - 1) * strideCols + 1; + const auto padded_out_planes = inputPlanes + kernelPlanes - 1; + const auto padded_out_rows = inputRows + kernelRows - 1; + const auto padded_out_cols = inputCols + kernelCols - 1; + const auto top_pad_planes = kernelPlanes - 1 - padding_top_z; + const auto top_pad_rows = kernelRows - 1 - padding_top; + const auto left_pad_cols = kernelCols - 1 - padding_left; + const auto bottom_pad_planes = + padded_out_planes - expanded_out_planes - top_pad_planes; + const auto bottom_pad_rows = + padded_out_rows - expanded_out_rows - top_pad_rows; + const auto right_pad_cols = + padded_out_cols - expanded_out_cols - left_pad_cols; + + // Reorder output_backward dimensions. + array output_backward_shuffle; + if (isColMajor) { + // From: [out_depth, out_planes, out_rows, out_cols, batch] + // To: [batch, out_planes, out_rows, out_cols, out_depth] + output_backward_shuffle = {4, 1, 2, 3, 0}; + } else { + // From: [batch, out_cols, out_rows, out_planes, out_depth] + // To: [out_depth, out_cols, out_rows, out_planes, batch] + output_backward_shuffle = {4, 1, 2, 3, 0}; + } + + // Reorder input dimensions. + array input_shuffle; + if (isColMajor) { + // From: [in_depth, in_planes, in_rows, in_cols, batch] + // To: [in_depth, batch, in_planes, in_rows, in_cols] + input_shuffle = {0, 4, 1, 2, 3}; + } else { + // From: [batch, in_cols, in_rows, in_planes, in_depth] + // To: [in_cols, in_rows, in_planes, batch, in_depth] + input_shuffle = {1, 2, 3, 0, 4}; + } + + // Input is playing the role of a "kernel" in this convolution. 
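// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: the shape bookkeeping
// of this kernel backprop, traced with invented sizes. C/P/R/Q/N are the
// forward input's depth, planes, rows, cols and batch; kP/kR/kC are the kernel
// extents and F is the output depth.
namespace cuboid_backward_kernel_shape_check {
constexpr long C = 3, P = 4, R = 5, Q = 6, N = 2;
constexpr long kP = 2, kR = 3, kC = 3, F = 8;

// The input, acting as the contraction "kernel", collapses to [C, N*P*R*Q]
// (input_dims below); the padded output_backward patches collapse to
// [N*P*R*Q, kP*kR*kC*F] (pre_contract_dims below). Contracting away the shared
// N*P*R*Q axis leaves [C, kP*kR*kC*F], which the reshape and shuffle below
// turn into the filter gradient [F, C, kP, kR, kC] without changing the
// element count:
static_assert(C * (kP * kR * kC * F) == F * C * kP * kR * kC,
              "contraction output matches the filter-gradient size");
}  // namespace cuboid_backward_kernel_shape_check
// ---------------------------------------------------------------------------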
+ DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = batch * inputPlanes * inputRows * inputCols; + } else { + input_dims[1] = kernelChannels; + input_dims[0] = inputCols * inputRows * inputPlanes * batch; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the + // kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = batch * inputPlanes * inputRows * inputCols; + pre_contract_dims[1] = + kernelPlanes * kernelRows * kernelCols * kernelFilters; + } else { + pre_contract_dims[1] = inputCols * inputRows * inputPlanes * batch; + pre_contract_dims[0] = + kernelFilters * kernelCols * kernelRows * kernelPlanes; + } + + // We will contract along the collapsed dimension that contains the + // batch, inputPlanes, inputRows and inputCols. + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + // Dimensions after contraction. + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = kernelPlanes; + post_contract_dims[2] = kernelRows; + post_contract_dims[3] = kernelCols; + post_contract_dims[4] = kernelFilters; + } else { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = kernelCols; + post_contract_dims[2] = kernelRows; + post_contract_dims[3] = kernelPlanes; + post_contract_dims[4] = kernelChannels; + } + + // Reorder output of contraction to valid filter shape. + array kernel_shuffle; + if (isColMajor) { + // From: [in_depth, kernel_planes, kernel_rows, kernel_cols, out_depth] + // To: [out_depth, in_depth, kernel_planes, kernel_rows, kernel_cols] + kernel_shuffle = {4, 0, 1, 2, 3}; + } else { + // From: [out_depth, kernel_cols, kernel_rows, kernel_planes, in_depth] + // To: [kernel_cols, kernel_rows, kernel_planes, in_depth, out_depth] + kernel_shuffle = {1, 2, 3, 4, 0}; + } + + // Reverse kernel backprop dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse = {false, false, true, true, true}; + } else { + kernel_reverse = {true, true, true, false, false}; + } + + // Create convolution input (aka source of patches) from output backward + // tensor by shuffling dimensions. + const auto the_input = + output_backward.shuffle(output_backward_shuffle).eval(); + + // Create convolution kernel (aka filter) from input by shuffling and + // reshaping. 
+ const auto the_kernel = + input.shuffle(input_shuffle).reshape(input_dims).eval(); + + return choose(Cond::Layout == ColMajor>(), + the_kernel.contract( + the_input + .extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, + top_pad_planes, bottom_pad_planes, top_pad_rows, + bottom_pad_rows, left_pad_cols, right_pad_cols) + .reshape(pre_contract_dims), + contract_dims), + the_input + .extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, top_pad_planes, + bottom_pad_planes, top_pad_rows, bottom_pad_rows, + left_pad_cols, right_pad_cols) + .reshape(pre_contract_dims) + .contract(the_kernel, contract_dims)) + .reshape(post_contract_dims) + .shuffle(kernel_shuffle) + .reverse(kernel_reverse); +} + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h new file mode 100644 index 00000000..c21b6fe0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h @@ -0,0 +1,593 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h" + +namespace Eigen { + +/** SpatialConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or + * more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, + * kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The + * result will also be in col-major layout. + * + * If row_in_stride, col_in_stride > 1, then applies convolution with holes + * (aka atrous convolution), sampling every row_in_stride, col_in_stride input + * pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the + * output_backward. The dimensions of the result will be filters, height, width + * (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided + * that the same order is used in the input, the kernel, and the output. 
+ * + */ +typedef IndexList, type2index<0>, type2index<1>, type2index<1>> + ReverseColMajor; +typedef IndexList, type2index<1>, type2index<0>, type2index<0>> + ReverseRowMajor; + +template +EIGEN_ALWAYS_INLINE static const std::conditional_t< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const Eigen::TensorForcedEvalOp::Index, 4>, + const Eigen::TensorForcedEvalOp>>>>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorImagePatchOp>>>, + TensorReshapingOp< + + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorImagePatchOp>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const Eigen::TensorForcedEvalOp::Index, 4>, + const Eigen::TensorForcedEvalOp>>>>>>> +SpatialConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits::Index inputRows, + typename internal::traits::Index inputCols, + const DenseIndex row_stride = 1, const DenseIndex col_stride = 1, + const DenseIndex row_in_stride = 1, const DenseIndex col_in_stride = 1) { + typedef typename internal::traits::Index TensorIndex; + typedef typename internal::traits::Scalar OutScalar; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + kern(kernel); + TensorRef::NumDimensions, + internal::traits::Layout, TensorIndex>> + out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == + internal::traits::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = + (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the + // result + const TensorIndex kernelFilters = + isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = + isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = + isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = + isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + // This is the effective kernel size, taking into account the (*_in_stride - + // 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = + kernelRows + (kernelRows - 1) * (row_in_stride - 1); + const TensorIndex kernelColsEff = + kernelCols + (kernelCols - 1) * (col_in_stride - 1); + + const TensorIndex outputRows = isColMajor + ? output_backward.dimension(1) + : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor + ? 
output_backward.dimension(2) + : output_backward.dimension(NumDims - 3); + + // Computing the forward padding + const TensorIndex forward_pad_top = numext::maxi( + 0, ((outputRows - 1) * row_stride + kernelRowsEff - inputRows) / 2); + const TensorIndex forward_pad_left = numext::maxi( + 0, ((outputCols - 1) * col_stride + kernelColsEff - inputCols) / 2); + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + + const TensorIndex padding_bottom = inputRows - (outputRows - 1) * row_stride - + 2 - padding_top + kernelRowsEff; + const TensorIndex padding_right = inputCols - (outputCols - 1) * col_stride - + 2 - padding_left + kernelColsEff; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_rows X patch_cols + // We need to reverse the kernel along dimensions corresponding to rows and + // cols. + // TODO(yangke): we can make things slightly faster by collapsing the + // dimensions + // where we don't reverse. Try that once we have a faster compiler. + typedef std::conditional_t + Reverse; + Reverse kernel_reverse; + // Reorder the dimensions to: + // filters x patch_rows x patch_cols x channels + array kernel_shuffle; + if (isColMajor) { + // From: filters x channels x rows x cols + // To: filters x rows x cols x channels + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 2; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 1; + } else { + // From: cols x rows x channels x filters + // To: channels x cols x rows x filters + kernel_shuffle[0] = 2; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 1; + kernel_shuffle[3] = 3; + } + + // Collapse the dims + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters * kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + } else { + kernel_dims[1] = kernelFilters * kernelRows * kernelCols; + kernel_dims[0] = kernelChannels; + } + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have + // dimensions + // out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * + // OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters * kernelRows * kernelCols; + pre_contract_dims[1] = inputRows * inputCols; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[1] *= out.dimension(i); + } + } else { + pre_contract_dims[1] = kernelFilters * kernelRows * kernelCols; + pre_contract_dims[0] = inputRows * inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along the collapsed dimension that contains the + // kernelFilters, the kernelRows and the kernelCols. 
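// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: a standalone replay of
// the padding arithmetic computed earlier in this function, for one invented
// shape (rows only; the column case is identical).
#include <algorithm>
#include <cassert>

inline void BackwardInputPaddingExample() {
  const long inputRows = 5, outputRows = 3;   // forward conv: SAME, stride 2
  const long kernelRows = 3, row_stride = 2;  // row_in_stride == 1, so kernelRowsEff == kernelRows

  const long forward_pad_top = std::max<long>(
      0, ((outputRows - 1) * row_stride + kernelRows - inputRows) / 2);  // == 1
  const long padding_top = kernelRows - 1 - forward_pad_top;             // == 1
  const long padding_bottom = inputRows - (outputRows - 1) * row_stride -
                              2 - padding_top + kernelRows;              // == 1
  assert(padding_top >= 0 && padding_bottom >= 0);

  // output_backward is inflated by the forward stride and padded, after which
  // stride-1 patch extraction yields exactly one patch per input row.
  const long expanded_out_rows = (outputRows - 1) * row_stride + 1;           // == 5
  const long padded_rows = expanded_out_rows + padding_top + padding_bottom;  // == 7
  assert(padded_rows - kernelRows + 1 == inputRows);
}
// ---------------------------------------------------------------------------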
+ array, 1> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 1); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputRows; + post_contract_dims[2] = inputCols; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputRows; + post_contract_dims[NumDims - 3] = inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + // NOTE(ezhulenev): We do eval after reverse and shuffle, because tiled + // evaluation of these ops does not compose. Doing explicit eval is ~8x + // faster in micro benchmarks. + + return choose( + Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .eval() + .shuffle(kernel_shuffle) + .eval() + .reshape(kernel_dims) + .contract( + output_backward + .extract_image_patches( + kernelRows, kernelCols, 1, 1, row_in_stride, + col_in_stride, row_stride, col_stride, padding_top, + padding_bottom, padding_left, padding_right, OutScalar(0)) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward + .extract_image_patches(kernelRows, kernelCols, 1, 1, row_in_stride, + col_in_stride, row_stride, col_stride, + padding_top, padding_bottom, padding_left, + padding_right, OutScalar(0)) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse) + .eval() + .shuffle(kernel_shuffle) + .eval() + .reshape(kernel_dims), + contract_dims) + .reshape(post_contract_dims)); +} + +/** SpatialConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or + * more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, + * kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The + * result will also be in col-major layout. + * + * If row_in_stride, col_stride > 1, then applies convolution with holes (aka + * atrous convolution), sampling every row_in_stride, col_in_stride input + * pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the + * output_backward. The dimensions of the result will be filters, height, width + * (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided + * that the same order is used in the input, the kernel, and the output. 
+ * + */ + +template +EIGEN_ALWAYS_INLINE static const std::conditional_t< + internal::traits::Layout == ColMajor, + const TensorReverseOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorForcedEvalOp::Index, + internal::traits::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const Input>>>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorImagePatchOp< + Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const OutputBackward>>>>>>>>>, + const TensorReverseOp< + const Eigen::array::Index, + internal::traits::NumDimensions>, + const Eigen::TensorForcedEvalOp::Index, + internal::traits::NumDimensions>, + const Eigen::TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< + IndexPair::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const TensorImagePatchOp< + Dynamic, Dynamic, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const OutputBackward>>>>, + const TensorReshapingOp< + const DSizes::Index, + 2>, + const Eigen::TensorForcedEvalOp< + const Eigen::TensorShufflingOp< + const Eigen::array< + typename internal::traits::Index, + internal::traits::NumDimensions>, + const Input>>>>>>>>> +SpatialConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits::Index kernelRows, + typename internal::traits::Index kernelCols, + const DenseIndex row_stride = 1, const DenseIndex col_stride = 1, + const DenseIndex row_in_stride = 1, const DenseIndex col_in_stride = 1) { + typedef typename internal::traits::Index TensorIndex; + typedef typename internal::traits::Scalar OutScalar; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex>> + in(input); + TensorRef::NumDimensions, + internal::traits::Layout, TensorIndex>> + out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == + internal::traits::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // stride and in_stride cannot both be larger than 1 + eigen_assert(!(row_stride > 1 && row_in_stride > 1)); + eigen_assert(!(col_stride > 1 && col_in_stride > 1)); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == + internal::traits::NumDimensions, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputRows = + isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputCols = + isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + + const TensorIndex outputRows = isColMajor + ? output_backward.dimension(1) + : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor + ? 
output_backward.dimension(2) + : output_backward.dimension(NumDims - 3); + + // Number of filters to apply. This is the same as the output depth of the + // result + const TensorIndex kernelFilters = + isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1]; + + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = + isColMajor ? in.dimensions()[0] : in.dimensions()[NumDims - 1]; + + // This is the effective kernel size, taking into account the + // (*_in_stride - 1) zero-values inserted between consecutive kernel + // elements in atrous convolution + const TensorIndex kernelRowsEff = + kernelRows + (kernelRows - 1) * (row_in_stride - 1); + const TensorIndex kernelColsEff = + kernelCols + (kernelCols - 1) * (col_in_stride - 1); + + // Number of batches (and other dimensions) in the input tensor. + TensorIndex batch = 1; + for (int d = 3; d < NumDims; ++d) { + batch *= isColMajor ? in.dimension(d) : in.dimension(NumDims - d - 1); + } + + // Computing the forward padding + const TensorIndex padRows = numext::maxi( + 0, (outputRows - 1) * row_stride + kernelRowsEff - inputRows); + const TensorIndex padCols = numext::maxi( + 0, (outputCols - 1) * col_stride + kernelColsEff - inputCols); + + TensorIndex padding_top = padRows / 2; + TensorIndex padding_left = padCols / 2; + + // Compute paddings for output_backward before extracting patches. + const TensorIndex expanded_out_rows = (outputRows - 1) * row_stride + 1; + const TensorIndex expanded_out_cols = (outputCols - 1) * col_stride + 1; + + const TensorIndex padded_out_rows = inputRows + kernelRowsEff - 1; + const TensorIndex padded_out_cols = inputCols + kernelColsEff - 1; + + const TensorIndex top_pad_rows = kernelRowsEff - 1 - padding_top; + const TensorIndex left_pad_cols = kernelColsEff - 1 - padding_left; + + const TensorIndex bottom_pad_rows = + padded_out_rows - expanded_out_rows - top_pad_rows; + const TensorIndex right_pad_cols = + padded_out_cols - expanded_out_cols - left_pad_cols; + + // Reorder output_backward dimensions. + array output_backward_shuffle; + if (isColMajor) { + // From: [out_depth, out_rows, out_cols, batch] + // To: [batch, out_rows, out_cols, out_depth] + output_backward_shuffle = {3, 1, 2, 0}; + } else { + // From: [batch, out_cols, out_rows, out_depth] + // To: [out_depth, out_cols, out_rows, batch] + output_backward_shuffle = {3, 1, 2, 0}; + } + + // Reorder input dimensions. + array input_shuffle; + if (isColMajor) { + // From: [in_depth, in_rows, in_cols, batch] + // To: [in_depth, batch, in_rows, in_cols] + input_shuffle = {0, 3, 1, 2}; + } else { + // From: [batch, in_cols, in_rows, in_depth] + // To: [in_cols, in_rows, batch, in_depth] + input_shuffle = {1, 2, 0, 3}; + } + + // Input is playing the role of a "kernel" in this convolution. 
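// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: a typical col-major
// call of the SpatialConvolutionBackwardKernel entry point defined above, for
// a stride-1 SAME convolution. All shapes are invented for the example.
#include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"

inline Eigen::Tensor<float, 4> FilterGradientExample() {
  Eigen::Tensor<float, 4> input(3, 32, 32, 8);         // [in_depth, rows, cols, batch]
  Eigen::Tensor<float, 4> output_grad(16, 32, 32, 8);  // [out_depth, rows, cols, batch]
  input.setRandom();
  output_grad.setRandom();
  // The filter gradient comes back as [out_depth, in_depth, kernel_rows,
  // kernel_cols], i.e. [16, 3, 3, 3] here.
  Eigen::Tensor<float, 4> filter_grad = Eigen::SpatialConvolutionBackwardKernel(
      input, output_grad, /*kernelRows=*/3, /*kernelCols=*/3);
  return filter_grad;
}
// ---------------------------------------------------------------------------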
+ DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = batch * inputRows * inputCols; + } else { + input_dims[1] = kernelChannels; + input_dims[0] = inputCols * inputRows * batch; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the + // kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = batch * inputRows * inputCols; + pre_contract_dims[1] = kernelRows * kernelCols * kernelFilters; + } else { + pre_contract_dims[1] = inputCols * inputRows * batch; + pre_contract_dims[0] = kernelFilters * kernelCols * kernelRows; + } + + // We will contract along the collapsed dimension that contains the + // batch, inputRows and inputCols. + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + // Dimensions after contraction. + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = kernelRows; + post_contract_dims[2] = kernelCols; + post_contract_dims[3] = kernelFilters; + } else { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = kernelCols; + post_contract_dims[2] = kernelRows; + post_contract_dims[3] = kernelChannels; + } + + // Reorder output of contraction to a valid filter shape. + array kernel_shuffle; + if (isColMajor) { + // From: [in_depth, kernel_rows, kernel_cols, out_depth] + // To: [out_depth, in_depth, kernel_rows, kernel_cols] + kernel_shuffle = {3, 0, 1, 2}; + } else { + // From: [out_depth, kernel_cols, kernel_rows, in_depth] + // To: [kernel_cols, kernel_rows, in_depth, out_depth] + kernel_shuffle = {1, 2, 3, 0}; + } + + // Reverse kernel backprop dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse = {false, false, true, true}; + } else { + kernel_reverse = {true, true, false, false}; + } + + // Create convolution input (aka source of patches) from output backward + // tensor by shuffling dimensions. + const auto output_backward_shuffled = + output_backward.shuffle(output_backward_shuffle).eval(); + + // Create convolution kernel (aka filter) from input by shuffling and + // reshaping. + const auto input_shuffled = + input.shuffle(input_shuffle).eval().reshape(input_dims); + + return choose( + Cond::Layout == ColMajor>(), + input_shuffled.contract( + output_backward_shuffled + .extract_image_patches(inputRows, inputCols, row_in_stride, + col_in_stride, 1, 1, row_stride, + col_stride, top_pad_rows, + bottom_pad_rows, left_pad_cols, + right_pad_cols, OutScalar(0)) + .reshape(pre_contract_dims), + contract_dims), + output_backward_shuffled + .extract_image_patches( + inputRows, inputCols, row_in_stride, col_in_stride, 1, 1, + row_stride, col_stride, top_pad_rows, bottom_pad_rows, + left_pad_cols, right_pad_cols, OutScalar(0)) + .reshape(pre_contract_dims) + .contract(input_shuffled, contract_dims)) + .reshape(post_contract_dims) + .shuffle(kernel_shuffle) + .eval() + .reverse(kernel_reverse); +} + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_benchmark.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_benchmark.h new file mode 100644 index 00000000..e69a5976 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_benchmark.h @@ -0,0 +1,295 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BENCHMARK_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_BENCHMARK_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h" +#include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h" +#include "tensorflow/core/kernels/eigen_cuboid_convolution.h" +#include "tensorflow/core/platform/test_benchmark.h" + +using ::tensorflow::TTypes; + +template +class SpatialConvolutionBenchmarksSuite { + public: + using Input = TTypes::ConstTensor; + using Filter = TTypes::ConstTensor; + using Output = TTypes::Tensor; + + using Dimensions = Eigen::DSizes; + + SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state, + Device& device) + : state_(state), device_(device) {} + + Eigen::Index BufferSize(const Dimensions& dims) { + return dims.TotalSize() * sizeof(Scalar); + } + + void SpatialConvolution(Dimensions input_dims, Dimensions filter_dims) { + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + filter_dims[3]); // filter_count + + Scalar* input_data = + static_cast(device_.allocate(BufferSize(input_dims))); + Scalar* filter_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + Scalar* output_data = + static_cast(device_.allocate(BufferSize(output_dims))); + + device_.memset(input_data, 123, BufferSize(input_dims)); + device_.memset(filter_data, 123, BufferSize(filter_dims)); + + Input input(input_data, input_dims); + Filter filter(filter_data, filter_dims); + Output output(output_data, output_dims); + + for (auto s : state_) { + output.device(device_) = Eigen::SpatialConvolution(input, filter); + tensorflow::testing::DoNotOptimize(output); + } + + device_.deallocate(input_data); + device_.deallocate(filter_data); + device_.deallocate(output_data); + } + + void SpatialConvolutionBackwardInput(Dimensions input_dims, + Dimensions filter_dims) { + using OutputBackward = TTypes::ConstTensor; + using InputBackward = TTypes::Tensor; + + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + filter_dims[3]); // filter_count + + // Assuming that the convolution had SAME padding. 
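// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the vendored header: roughly how one of the
// suite's methods is driven from a benchmark body. The registration
// boilerplate (a BENCHMARK-style macro from the framework pulled in via
// test_benchmark.h) is assumed and omitted here; shapes are invented.
#include "tensorflow/core/kernels/eigen_benchmark.h"

void BM_SpatialConvolution(::testing::benchmark::State& state) {
  Eigen::DefaultDevice device;  // single-threaded CPU device
  SpatialConvolutionBenchmarksSuite<float, Eigen::DefaultDevice> suite(state,
                                                                       device);
  // input:  [batch, height, width, depth]
  // filter: [rows, cols, in_depth, filter_count]
  suite.SpatialConvolution(Eigen::DSizes<Eigen::Index, 4>(32, 64, 64, 16),
                           Eigen::DSizes<Eigen::Index, 4>(3, 3, 16, 32));
}
// ---------------------------------------------------------------------------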
+ Eigen::Index input_rows = input_dims[1]; + Eigen::Index input_cols = input_dims[2]; + + Scalar* filter_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + Scalar* output_backward_data = + static_cast(device_.allocate(BufferSize(output_dims))); + Scalar* input_backward_data = + static_cast(device_.allocate(BufferSize(input_dims))); + + device_.memset(filter_data, 123, BufferSize(filter_dims)); + device_.memset(output_backward_data, 123, BufferSize(output_dims)); + + Filter filter(filter_data, filter_dims); + OutputBackward output_backward(output_backward_data, output_dims); + InputBackward input_backward(input_backward_data, input_dims); + + for (auto s : state_) { + input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput( + filter, output_backward, input_rows, input_cols); + tensorflow::testing::DoNotOptimize(input_backward); + } + + device_.deallocate(filter_data); + device_.deallocate(output_backward_data); + device_.deallocate(input_backward_data); + } + + void SpatialConvolutionBackwardKernel(Dimensions input_dims, + Dimensions filter_dims) { + using OutputBackward = TTypes::ConstTensor; + using FilterBackward = TTypes::Tensor; + + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + filter_dims[3]); // filter_count + + // Assuming that the convolution had SAME padding. + Eigen::Index filter_rows = filter_dims[0]; + Eigen::Index filter_cols = filter_dims[1]; + + Scalar* input_data = + static_cast(device_.allocate(BufferSize(input_dims))); + Scalar* output_backward_data = + static_cast(device_.allocate(BufferSize(output_dims))); + Scalar* filter_backward_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + + device_.memset(input_data, 123, BufferSize(input_dims)); + device_.memset(output_backward_data, 123, BufferSize(output_dims)); + + Input input(input_data, input_dims); + OutputBackward output_backward(output_backward_data, input_dims); + FilterBackward filter_backward(filter_backward_data, filter_dims); + + for (auto s : state_) { + filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel( + input, output_backward, filter_rows, filter_cols); + tensorflow::testing::DoNotOptimize(filter_backward); + } + + device_.deallocate(input_data); + device_.deallocate(output_backward_data); + device_.deallocate(filter_backward_data); + } + + private: + ::testing::benchmark::State& state_; + + Device& device_; +}; + +template +class CuboidConvolutionBenchmarksSuite { + public: + using Input = TTypes::ConstTensor; + using Filter = TTypes::ConstTensor; + using Output = TTypes::Tensor; + + using Dimensions = Eigen::DSizes; + + CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state, + Device& device) + : state_(state), device_(device) {} + + Eigen::Index BufferSize(const Dimensions& dims) { + return dims.TotalSize() * sizeof(Scalar); + } + + void CuboidConvolution(Dimensions input_dims, Dimensions filter_dims) { + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + input_dims[3], // input_planes + filter_dims[4]); // filter_count + + Scalar* input_data = + static_cast(device_.allocate(BufferSize(input_dims))); + Scalar* filter_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + Scalar* output_data = + static_cast(device_.allocate(BufferSize(output_dims))); + + device_.memset(input_data, 123, BufferSize(input_dims)); + device_.memset(filter_data, 123, BufferSize(filter_dims)); + + Input 
input(input_data, input_dims); + Filter filter(filter_data, filter_dims); + Output output(output_data, output_dims); + + for (auto s : state_) { + output.device(device_) = Eigen::CuboidConvolution(input, filter); + tensorflow::testing::DoNotOptimize(output); + } + + device_.deallocate(input_data); + device_.deallocate(filter_data); + device_.deallocate(output_data); + } + + void CuboidConvolutionBackwardInput(Dimensions input_dims, + Dimensions filter_dims) { + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + input_dims[3], // input_planes + filter_dims[4]); // filter_count + + using OutputBackward = TTypes::ConstTensor; + using InputBackward = TTypes::Tensor; + + // Assuming that the convolution had SAME padding. + Eigen::Index input_rows = input_dims[1]; + Eigen::Index input_cols = input_dims[2]; + Eigen::Index input_planes = input_dims[3]; + + Scalar* filter_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + Scalar* output_backward_data = + static_cast(device_.allocate(BufferSize(output_dims))); + Scalar* input_backward_data = + static_cast(device_.allocate(BufferSize(input_dims))); + + device_.memset(filter_data, 123, BufferSize(filter_dims)); + device_.memset(output_backward_data, 123, BufferSize(output_dims)); + + Filter filter(filter_data, filter_dims); + OutputBackward output_backward(output_backward_data, output_dims); + InputBackward input_backward(input_backward_data, input_dims); + + for (auto s : state_) { + input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput( + filter, output_backward, input_planes, input_rows, input_cols); + tensorflow::testing::DoNotOptimize(input_backward); + } + + device_.deallocate(filter_data); + device_.deallocate(output_backward_data); + device_.deallocate(input_backward_data); + } + + void CuboidConvolutionBackwardKernel(Dimensions input_dims, + Dimensions filter_dims) { + using OutputBackward = TTypes::ConstTensor; + using FilterBackward = TTypes::Tensor; + + Dimensions output_dims(input_dims[0], // batch + input_dims[1], // input_height + input_dims[2], // input_width + input_dims[3], // input_planes + filter_dims[4]); // filter_count + + // Assuming that the convolution had SAME padding. 
+ Eigen::Index filter_rows = filter_dims[0]; + Eigen::Index filter_cols = filter_dims[1]; + Eigen::Index filter_planes = filter_dims[2]; + + Scalar* input_data = + static_cast(device_.allocate(BufferSize(input_dims))); + Scalar* output_backward_data = + static_cast(device_.allocate(BufferSize(output_dims))); + Scalar* filter_backward_data = + static_cast(device_.allocate(BufferSize(filter_dims))); + + device_.memset(input_data, 123, BufferSize(input_dims)); + device_.memset(output_backward_data, 123, BufferSize(output_dims)); + + Input input(input_data, input_dims); + OutputBackward output_backward(output_backward_data, output_dims); + FilterBackward filter_backward(filter_backward_data, filter_dims); + + for (auto s : state_) { + filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel( + input, output_backward, filter_planes, filter_rows, filter_cols); + tensorflow::testing::DoNotOptimize(filter_backward); + } + + device_.deallocate(input_data); + device_.deallocate(output_backward_data); + device_.deallocate(filter_backward_data); + } + + private: + ::testing::benchmark::State& state_; + Device& device_; +}; + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_BENCHMARK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_cuboid_convolution.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_cuboid_convolution.h new file mode 100644 index 00000000..156c557e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_cuboid_convolution.h @@ -0,0 +1,1995 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +#include "xla/tsl/framework/convolution/eigen_convolution_helpers.h" + +namespace Eigen { + +namespace internal { + +#if !EIGEN_ALTIVEC_USE_CUSTOM_PACK +// WARNING: Most of the code here implicitly assumes that the matrix is in +// ColMajor layout. This is guaranteed by the tensor contraction (see +// TensorContraction.h). +// +// Inside Eigen a tensor contraction is represented by a matrix multiplication. +// We don't want to actually extract volume patches and reshape the result into +// a matrix (this involves allocating huge extra memory), so the patch +// extraction and reshape operations are implicit. +// +// TensorContractionInputMapper takes a matrix index and returns the coefficient +// (or the packet) of the "virtual tensor", that would be at that index if we +// were to actually reshape the result of patch extraction. +// +// TensorContractionSubMapper provides a similar view into the "virtual matrix" +// at the given vertical and horizontal offsets. 
+// +// "Virtual matrix" dimensions: +// *0: kernelChannels * kernelPlanes * kernelRows * kernelCols +// 1: out_planes * out_height * out_width * OTHERS (e.g batches, etc...) +// +// *) extracted patches are continuous in memory (innermost dimension assuming +// col major layout) +// +// With this dimensions: +// row - offset within a single patch (in code: patchId) +// col - index of the extracted patch (in code: patchIndex) +// patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions) +// +template +class TensorContractionInputMapper< + Scalar_, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> { + public: + typedef Scalar_ Scalar; + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + Self; + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + typedef typename packet_traits::type Packet; + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper( + const TensorEvaluator< + const TensorReshapingOp< + NewDimension, + const TensorVolumePatchOp >, + Device>& tensor, + const nocontract_t&, const nocontract_t&, const contract_t&, + const contract_t&) + : m_impl(tensor.impl().impl()) { + if (internal::traits::Layout == ColMajor) { + m_patch_depth = tensor.impl().dimensions()[0]; + m_patch_planes = tensor.impl().dimensions()[1]; + m_patch_rows = tensor.impl().dimensions()[2]; + m_patch_cols = tensor.impl().dimensions()[3]; + m_num_patches = tensor.impl().dimensions()[4]; + } else { + const int NumDims = tensor.impl().dimensions().size(); + m_patch_depth = tensor.impl().dimensions()[NumDims - 1]; + m_patch_planes = tensor.impl().dimensions()[NumDims - 2]; + m_patch_rows = tensor.impl().dimensions()[NumDims - 3]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 4]; + m_num_patches = tensor.impl().dimensions()[NumDims - 5]; + } + + // Strides for navigating through the single patch. + m_patch_plane_stride = m_patch_depth; + m_patch_row_stride = m_patch_planes * m_patch_plane_stride; + m_patch_col_stride = m_patch_rows * m_patch_row_stride; + + // Strides for the output tensor. + // IMPORTANT: These strides are used to locate an element in a patch at a + // depth zero (channel), which is not quite the same as "traditional" + // stride. 
+ m_rowStride = m_patch_planes; + m_colStride = m_patch_rows * m_rowStride; + m_patchStride = m_colStride * m_patch_cols * m_patch_depth; + m_otherStride = m_patchStride * m_num_patches; + + m_outputPlanes = tensor.impl().outputPlanes(); + m_outputRows = tensor.impl().outputRows(); + m_outputCols = tensor.impl().outputCols(); + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + m_plane_strides = tensor.impl().userPlaneStride(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_plane_strides = tensor.impl().userInPlaneStride(); + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + m_patch_plane_inflate_strides = tensor.impl().planeInflateStride(); + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + if (internal::traits::Layout == ColMajor) { + m_inputDepth = tensor.impl().impl().dimensions()[0]; + m_inputPlanes = tensor.impl().impl().dimensions()[1]; + m_inputRows = tensor.impl().impl().dimensions()[2]; + m_inputCols = tensor.impl().impl().dimensions()[3]; + } else { + const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputDepth = tensor.impl().impl().dimensions()[NumDims - 1]; + m_inputPlanes = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 3]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 4]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_patchInputStride = + m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_planePaddingTop = tensor.impl().planePaddingTop(); + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastNumPatches = internal::TensorIntDivisor(m_num_patches); + + m_fastPatchPlaneStride = + internal::TensorIntDivisor(m_patch_plane_stride); + m_fastPatchRowStride = + internal::TensorIntDivisor(m_patch_row_stride); + m_fastPatchColStride = + internal::TensorIntDivisor(m_patch_col_stride); + + m_fastInputPlaneStride = + internal::TensorIntDivisor(m_patch_plane_inflate_strides); + m_fastInputRowStride = + internal::TensorIntDivisor(m_patch_row_inflate_strides); + m_fastInputColStride = + internal::TensorIntDivisor(m_patch_col_inflate_strides); + + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + + m_fastDimZero = internal::TensorIntDivisor(m_patch_depth); + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + m_fastOutputCols = internal::TensorIntDivisor(m_outputCols); + + m_fastOutputPlanesRows = + internal::TensorIntDivisor(m_outputPlanesRows); + } + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) + : m_impl(base_mapper.m_impl) { + m_patch_depth = base_mapper.m_patch_depth; + m_patch_planes = base_mapper.m_patch_planes; + m_patch_rows = base_mapper.m_patch_rows; + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + + m_patch_plane_stride = base_mapper.m_patch_plane_stride; + m_patch_row_stride = base_mapper.m_patch_row_stride; + m_patch_col_stride = 
base_mapper.m_patch_col_stride; + + m_rowStride = base_mapper.m_rowStride; + m_colStride = base_mapper.m_colStride; + m_patchStride = base_mapper.m_patchStride; + m_otherStride = base_mapper.m_otherStride; + + m_planeInputStride = base_mapper.m_planeInputStride; + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + m_otherInputStride = base_mapper.m_otherInputStride; + + m_inputDepth = base_mapper.m_inputDepth; + m_inputPlanes = base_mapper.m_inputPlanes; + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputPlanes = base_mapper.m_outputPlanes; + m_outputRows = base_mapper.m_outputRows; + m_outputCols = base_mapper.m_outputCols; + + m_plane_strides = base_mapper.m_plane_strides; + m_row_strides = base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_plane_strides = base_mapper.m_in_plane_strides; + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_patch_plane_inflate_strides = base_mapper.m_patch_plane_inflate_strides; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_planePaddingTop = base_mapper.m_planePaddingTop; + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_outputPlanesRows = base_mapper.m_outputPlanesRows; + + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastPatchPlaneStride = base_mapper.m_fastPatchPlaneStride; + m_fastPatchRowStride = base_mapper.m_fastPatchRowStride; + m_fastPatchColStride = base_mapper.m_fastPatchColStride; + m_fastInputPlaneStride = base_mapper.m_fastInputPlaneStride; + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastRowStride = base_mapper.m_fastRowStride; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputPlanes = base_mapper.m_fastOutputPlanes; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastOutputCols = base_mapper.m_fastOutputCols; + m_fastDimZero = base_mapper.m_fastDimZero; + m_fastOutputPlanesRows = base_mapper.m_fastOutputPlanesRows; + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_plane_strides != 1 || m_in_row_strides != 1 || + m_in_col_strides != 1 || m_patch_plane_inflate_strides != 1 || + m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(0, planeIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual + // m_rowIndex, m_colIndex, m_otherIndex. This is currently only used by the + // gpu code. 
+ EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, planeIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(0, planeIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + Index planeIndex, rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, planeIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, planeIndex, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator& impl() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_planeInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlanes() const { return m_rowStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + private: + friend class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment>; + + // Load coefficient from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = + (m_patch_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = + (m_patch_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex + planeOffset * m_in_plane_strides; + const Index origInputPlane = + (m_patch_plane_inflate_strides == 1) + ? inputPlane + : ((inputPlane >= 0) ? 
(inputPlane / m_fastInputPlaneStride) : 0); + + if (origInputCol < 0 || origInputRow < 0 || origInputPlane < 0 || + origInputCol >= m_inputCols || origInputRow >= m_inputRows || + origInputPlane >= m_inputPlanes || + (inputCol != origInputCol * m_patch_col_inflate_strides) || + (inputRow != origInputRow * m_patch_row_inflate_strides) || + (inputPlane != origInputPlane * m_patch_plane_inflate_strides)) { + return Scalar(0); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + origInputPlane * m_planeInputStride + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + otherIndex; + + return m_impl.coeff(inputIndex); + } + + // This is the same as loadCoeff(...), but optimized for all `inflate_strides` + // and `in_strides` equal to 1 (template specialization without templates). + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + eigen_assert(!nonStandardPatches()); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + const Index inputPlane = planeIndex + planeOffset; + + if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || + inputRow >= m_inputRows || inputPlane < 0 || + inputPlane >= m_inputPlanes) { + return Scalar(0); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputPlane * m_planeInputStride + + inputRow * m_rowInputStride + + inputCol * m_colInputStride + otherIndex; + + return m_impl.coeff(inputIndex); + } + + // Load packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + typedef decltype(m_impl) TensorEvaluatorT; + return loadPacketStandard( + patchId, planeIndex, rowIndex, colIndex, otherIndex); + } + + // Helper function to load a 'partial' packet - this is the single row part of + // a packet that is split across two rows (but single column). In the + // 'partial' packet, the elements corresponding to the row (specified through + // rowOffset) are loaded and the rest of the elements are zero-filled into the + // 'partial' packet. This function is called from + // loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised + // only when the packet type supports masked load and when the partial packet + // load is available in the TensorEvaluator. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard( + Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex, + Index patchId, const Index span[], const Index patchOffsets[], + Index colOffset, Index rowOffset) const { + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + const Index planeOffsets[2] = { + patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride, + patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride}; + const Index inputPlanes[2] = {planeIndex + planeOffsets[0], + planeIndex + planeOffsets[1]}; + + if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols || + inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) { + // Partial packet is all zeros + return internal::pset1(Scalar(0)); + } else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // From inputIndex-span[0], we need to load elements starting from index + // span[0] all the way upto (and including) span[1]. + const Index depth = patchId - patchOffsets[0] * patchDepth(); + const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride + + inputRow * m_rowInputStride + + inputCol * m_colInputStride + otherIndex; + return m_impl.template partialPacket( + inputIndex - span[0], mask(span[0], span[1] + 1)); + } else { + // Using slow path for this partial packet. + // We need to load elements starting from index span[0] all the way upto + // (and including) span[1]. We split this load into 3 parts: + // 0 : span[0]-1 - Zeros will be loaded for these indices + // span[0] : span[1] - Elements will be loaded here for these indices + // span[1]+1 : packetSize-1 - Zeross will be loaded for these indices + const Index packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX + std::remove_const_t values[packetSize]; + for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0); + for (int i = span[0]; i < span[1] + 1; ++i) + values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex, + colIndex, otherIndex); + for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0); + return internal::pload(values); + } + } + + // Helper function to load a packet that is split across two rows (but single + // column). If required, this function is called from loadPacketStandard() + // when the packet type supports masked load and when the partial packet load + // is available in the TensorEvaluator. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows( + Index patchId, Index planeIndex, Index rowIndex, Index colIndex, + Index otherIndex, const Index patchOffsets[], const Index colOffsets[], + const Index rowOffsets[]) const { + eigen_assert(colOffsets[1] == colOffsets[0] && + rowOffsets[1] == rowOffsets[0] + 1); + const Index packetSize = internal::unpacket_traits::size; + + // Packet to load will be split into 2 parts where each part spans a single + // row and both the parts span the same column. + // First determine where to split. 
+ const Index patchIdSplit = + (((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) * + m_patch_depth) - + 1; + const Index patchOffsetSplit = patchIdSplit / m_fastDimZero; + + // patchIds[i]: patchId corresponding to partial packet i + // spans[i]: Start and end indices corresponding to the elements + // to be loaded for partial packet i + // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i + const Index patchIds[2] = {patchId, patchIdSplit + 1}; + const Index spans[2][2] = {{0, patchIdSplit - patchId}, + {patchIdSplit - patchId + 1, packetSize - 1}}; + const Index patchOffsets2Cols[2][2] = { + {patchOffsets[0], patchOffsetSplit}, + {patchOffsetSplit + 1, patchOffsets[1]}}; + + // Load partial packets and do bit-wise OR to generate required packet + return internal::por( + loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex, + patchIds[0], spans[0], patchOffsets2Cols[0], + colOffsets[0], rowOffsets[0]), + loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex, + patchIds[1], spans[1], patchOffsets2Cols[1], + colOffsets[1], rowOffsets[1])); + } + + // Helper function to load a packet that is present in a single column and + // row. If required, this function is called from loadPacketStandard(). + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow( + Index patchId, Index planeIndex, Index rowIndex, Index colIndex, + Index otherIndex, const Index patchOffsets[], const Index colOffsets[], + const Index rowOffsets[], const Index inputCols[], + const Index inputRows[]) const { + eigen_assert(colOffsets[1] == colOffsets[0] && + rowOffsets[1] == rowOffsets[0]); + const Index planeOffsets[2] = { + patchOffsets[0] - colOffsets[0] * m_colStride - + rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - + rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = {planeIndex + planeOffsets[0], + planeIndex + planeOffsets[1]}; + + if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) { + return internal::pset1(Scalar(0)); + } + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + const Index depth = patchId - patchOffsets[0] * patchDepth(); + const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + + // Load standard packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + // This function will be called if partial packet loading is not available + // for the TensorEvaluator or if the packet type does not support masked + // load. 
+ template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< + !TensorEvaluatorHasPartialPacket::value, + PacketT>::type + loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex, + Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) { + return loadPacketFast(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } else { + // Offsets and input calculation here are identical to + // loadCoeffStandard(...), but repeated twice. + + const Index patchOffsets[2] = { + patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + eigen_assert(colOffsets[0] <= colOffsets[1]); + + const Index inputCols[2] = {colIndex + colOffsets[0], + colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols || inputCols[1] < 0) { + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], + rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows || inputRows[1] < 0) { + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] == inputRows[1]) { + return loadPacketStandardFromSingleColumnSingleRow( + patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets, + colOffsets, rowOffsets, inputCols, inputRows); + } + } + } + + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + + // Load standard packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + // This function will be called if partial packet loading is available for + // the TensorEvaluator and if the packet type supports masked load. + // The only difference between this and the other case is that if the packet + // to load is split across two rows (but in same column), then in this case + // instead of going to the slow (element-by-element) load, we load two packets + // - each containing elements from one of the rows (rest of the elements of + // the packets are zeroes), and then combine these two packets to generate the + // required packet. The idea is to enable fast load (if possible) of these + // 'partial' packets. + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< + TensorEvaluatorHasPartialPacket::value, + PacketT>::type + loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex, + Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) { + return loadPacketFast(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } else { + // Offsets and input calculation here are identical to + // loadCoeffStandard(...), but repeated twice. 
+ + const Index patchOffsets[2] = { + patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + eigen_assert(colOffsets[0] <= colOffsets[1]); + + const Index inputCols[2] = {colIndex + colOffsets[0], + colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols || inputCols[1] < 0) { + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], + rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows || inputRows[1] < 0) { + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] == inputRows[1]) { + return loadPacketStandardFromSingleColumnSingleRow( + patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets, + colOffsets, rowOffsets, inputCols, inputRows); + } + if (inputRows[0] + 1 == inputRows[1]) { + return loadPacketStandardFromSingleColumnTwoRows( + patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets, + colOffsets, rowOffsets); + } + } + } + + return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex, + otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index planeIndex, + Index rowIndex, Index colIndex, + Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < + patchDepth() * patchPlanes() * patchRows() * patchCols()); + + eigen_assert(!nonStandardPatches()); + eigen_assert((patchDepth() % packetSize) == 0); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index planeOffset = + patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + const Index inputPlane = planeIndex + planeOffset; + + if (inputCol < 0 || inputRow < 0 || inputPlane < 0 || + inputCol >= m_inputCols || inputRow >= m_inputRows || + inputPlane >= m_inputPlanes) { + return internal::pset1(Scalar(0)); + } + + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputPlane * m_planeInputStride + + inputRow * m_rowInputStride + + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet + packetWithPossibleZero(Index patchId, Index planeIndex, Index rowIndex, + Index colIndex, Index otherIndex) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX + std::remove_const_t values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = + loadCoeff(patchId + i, planeIndex, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload(values); + return rslt; + } + + // Precompute the indices (plane, row, col, other) of the first element of + // the given patch index, within the output tensor of the TensorVolumePatchOp. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices( + Index patchIndex, Index& planeIndex, Index& rowIndex, Index& colIndex, + Index& otherIndex) const { + const size_t NumInputDims = array_size< + typename TensorEvaluator::Dimensions>::value; + + // Check if patchIndex might contain batch and other dimensions. + otherIndex = (NumInputDims == 4) ? 0 : patchIndex / m_fastNumPatches; + + // Compute index of the patch within the batch (and other dimensions). + const Index patch3DIndex = (NumInputDims == 4) + ? patchIndex + : (patchIndex - otherIndex * m_num_patches); + + otherIndex *= m_patchInputStride; + + colIndex = patch3DIndex / m_fastOutputPlanesRows; + rowIndex = + (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + planeIndex = + patch3DIndex - (colIndex * m_outputRows + rowIndex) * m_outputPlanes; + + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + planeIndex = planeIndex * m_plane_strides - m_planePaddingTop; + } + + Index m_patch_depth; // number of channels in the patch + Index m_patch_planes; // number of planes in the patch + Index m_patch_rows; // number of rows in the patch + Index m_patch_cols; // number of columns in the patch + Index m_num_patches; // number of patches to extract + + // Strides for navigating through the single patch. + Index m_patch_plane_stride; + Index m_patch_row_stride; + Index m_patch_col_stride; + + // Strides for the output tensor (depth is not the part of the stride). + Index m_rowStride; + Index m_colStride; + Index m_patchStride; + Index m_otherStride; + + Index m_planeInputStride; // Plane stride in the input tensor + Index m_rowInputStride; // Row stride in the input tensor + Index m_colInputStride; // Col stride in the input tensor + Index m_patchInputStride; // Patch stride in the input tensor + Index m_otherInputStride; + + Index m_inputDepth; // Depth of the input tensor + Index m_inputPlanes; // Number of planes in the input tensor + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputPlanes; // Number of output planes + Index m_outputRows; // Number of output rows + Index m_outputCols; // Number of output cols + Index m_outputPlanesRows; // Cached outputPlanes * outputRows. + + Index m_plane_strides; // User specified plane stride + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + // User specified plane/row/col atrous convolution strides. + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + // User specified plane/row/col inflation strides in the image patch. + Index m_patch_plane_inflate_strides; + Index m_patch_row_inflate_strides; + Index m_patch_col_inflate_strides; + + Index m_planePaddingTop; // Plane padding + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + // Fast representation of various divisors. 
+ internal::TensorIntDivisor m_fastNumPatches; + + internal::TensorIntDivisor m_fastPatchPlaneStride; + internal::TensorIntDivisor m_fastPatchRowStride; + internal::TensorIntDivisor m_fastPatchColStride; + + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastColStride; + + internal::TensorIntDivisor m_fastDimZero; // aka output depth + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputCols; + internal::TensorIntDivisor m_fastOutputPlanesRows; + + const TensorEvaluator m_impl; +}; + +template +class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + ParentMapper; + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper( + const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), + m_depth_offset(vert_offset), + m_col_offset(horiz_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper( + const Self& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper.m_base_mapper), + m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper.loadCoeff(i + m_depth_offset, m_planeIndex, m_rowIndex, + m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, + Index j) const { + return m_base_mapper(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_depth_offset, m_planeIndex, + m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, + Index j) const { + return m_base_mapper.template loadPacket(i + m_depth_offset, + j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar + loadCoeffStandard(Index i) const { + return m_base_mapper.loadCoeffStandard( + i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const { + return m_base_mapper.loadPacketFast(i + m_depth_offset, m_planeIndex, + m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet + loadPacketStandard(Index i) const { + typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; + return m_base_mapper.template 
loadPacketStandard( + i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex); + } + template + EIGEN_DEVICE_FUNC bool aligned(Index) const { + return false; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_base_mapper.nonStandardPatches(); + } + + // Max(Col|Row|Plane|Depth): compute the upper limit for the column, row, + // plane and depth index respectively that fits into the peeled_k elements + // starting at m_depth_offset. + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { + const Index max_col = + fastPatchColStride().divide(m_depth_offset + peeled_k); + return std::min(1 + max_col, patchCols()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, + const Index col) const { + const Index max_row = fastPatchRowStride().divide( + m_depth_offset + peeled_k - col * patchColStride()); + return std::min(1 + max_row, patchRows()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxPlane(const Index peeled_k, const Index col, + const Index row) const { + const Index max_plane = fastPatchPlaneStride().divide( + m_depth_offset + peeled_k - col * patchColStride() - + row * patchRowStride()); + return std::min(1 + max_plane, patchPlanes()); + } + + // MaxDepth uses only the remaining number of elements in the peeled_k. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements, + const Index start_depth) const { + return std::min(start_depth + num_elements, patchDepth()); + } + + // Every register matters in this code, so sometimes to prevent register + // spilling, instead of the variable that you would expect to see, we use + // another one, that is guaranteed to have the same value. E.g. patch depth is + // always the same as input depth, and it's also the same as input plane + // stride. Bunch of other parameters have similar relations. 
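+  //
+  // A clarifying note on the offset helpers further below: the
+  // planeOffset()/rowOffset()/colOffset()/depthOffset() accessors decompose
+  // the linear within-patch offset k (m_depth_offset) as
+  //
+  //   k = ((colOffset * patchRows() + rowOffset) * patchPlanes() + planeOffset)
+  //       * patchDepth() + depthOffset,
+  //
+  // peeling off one factor at a time with the precomputed fast divisors.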
+ + typedef internal::TensorIntDivisor IndexDivisor; + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { + eigen_assert(m_base_mapper.m_patch_depth == + m_base_mapper.m_planeInputStride && + "Patch depth must be equal to plane input stride."); + return m_base_mapper.m_planeInputStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlanes() const { + eigen_assert(m_base_mapper.m_patch_planes == m_base_mapper.m_rowStride && + "Patch planes must be equal to row stride."); + return m_base_mapper.m_rowStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { + return m_base_mapper.m_patch_rows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { + return m_base_mapper.m_patch_cols; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchPlaneStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_plane_stride && + "Patch depth must be equal to patch plane stride."); + return patchDepth(); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRowStride() const { + return m_base_mapper.m_patch_row_stride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchColStride() const { + return m_base_mapper.m_patch_col_stride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchPlaneStride() const { + eigen_assert(patchDepth() == m_base_mapper.m_patch_plane_stride && + "Patch depth must be equal to patch plane stride."); + return m_base_mapper.m_fastDimZero; // patch_depth + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const { + return m_base_mapper.m_fastPatchRowStride; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const { + return m_base_mapper.m_fastPatchColStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, + const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template packet(inputIndex); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth, + const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padPlane(const Index plane) const { + const Index p = m_planeIndex + plane; + return p < 0 || p >= m_base_mapper.m_inputPlanes; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 || r >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 || c >= m_base_mapper.m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index plane, const Index row, + const Index col) const { + const Index p = m_planeIndex + plane; + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return p * m_base_mapper.m_planeInputStride + + r * m_base_mapper.m_rowInputStride + + c * m_base_mapper.m_colInputStride + m_otherIndex; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index planeOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_base_mapper.m_colStride) / + m_base_mapper.m_fastRowStride; + const Index planeOffset = patchOffset - + colOffset * m_base_mapper.m_colStride - + rowOffset * 
m_base_mapper.m_rowStride; + return planeOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + const Index rowOffset = + (patchOffset - colOffset * m_base_mapper.m_colStride) / + m_base_mapper.m_fastRowStride; + return rowOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return colOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + return m_depth_offset % patchDepth(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper + getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset); + } + + private: + const ParentMapper m_base_mapper; // Keeping a copy instead of a reference + // performs better in benchmarks. + + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base + // indices for the first element in a patch specified by col_offset + // (see computeBaseIndices(...) for details). + Index m_planeIndex; + Index m_rowIndex; + Index m_colIndex; + Index m_otherIndex; +}; + +// Arrange a block of the right input matrix (in our case it's always a "virtual +// matrix" constructed from extracted volume patches) in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... Z0 +// A1 B1 C1 D1 E1 F1 G1 H1 ... Z1 +// A2 B2 C2 D2 E2 F2 G2 H2 ... Z2 +// A3 B3 C3 D3 E3 F3 G3 H3 ... Z3 +// A4 B4 C4 D4 E4 F4 G4 H4 ... Z4 +// A5 B5 C5 D5 E5 F5 G5 H5 ... Z5 +// A6 B6 C6 D6 E6 F6 G6 H6 ... Z6 +// A7 B7 C7 D7 E7 F7 G7 H7 ... Z7 +// A8 ... +// ... +// +// *) A, B, C, ... - patches extracted from the original input. +// *) A0, A1, A2 ... - values from the same patch at different offsets. +// +// The traversal (packed rhs memory) order (B0 besides A0 in memory): +// A0 B0 C0 D0 A1 B1 C1 D1 ... +// E0 F0 G0 H0 E1 F1 G1 H1 ... +// ... +// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4) +// +// This traversal order must be the same as in default gemm_pack_rhs defined in +// GeneralBlockPanelKernel.h. +// +// *) nr - number of registers along the 'n' dimension. +// See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix +// Multiplication" paper. +// +// TODO(ezhulenev): Add support for squeezing reads along two innermost +// dimensions (see eigen_spatial_convolutions). 
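+//
+// Put differently: for each depth index k in the peeled region, the k-th
+// coefficients of the nr = 4 patches of the current column block are stored
+// contiguously (A0 B0 C0 D0, then A1 B1 C1 D1, ...); any columns left over
+// after the blocks of four are copied one at a time.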
+template +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + + typedef SubMapper DataMapper; + typedef typename packet_traits::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if ((packet_size % 4) == 0 && !non_standard_patches) { + // FAST PATH: + // Iterate over patch columns, rows and planes if we know that a single + // packet do not span across multiple planes, rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + + const Index start_plane = ((c == start_col) && (r == start_row)) + ? rhs.planeOffset() + : 0; + const Index max_plane = rhs.maxPlane(peeled_k, c, r); + + const bool pad_row0 = pad_col0 || dm0.padRow(r); + const bool pad_row1 = pad_col1 || dm1.padRow(r); + const bool pad_row2 = pad_col2 || dm2.padRow(r); + const bool pad_row3 = pad_col3 || dm3.padRow(r); + + for (Index p = start_plane; p < max_plane; ++p) { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_row0 || dm0.padPlane(p); + const bool pad1 = pad_row1 || dm1.padPlane(p); + const bool pad2 = pad_row2 || dm2.padPlane(p); + const bool pad3 = pad_row3 || dm3.padPlane(p); + + const Index idx0 = dm0.baseIndex(p, r, c); + const Index idx1 = dm1.baseIndex(p, r, c); + const Index idx2 = dm2.baseIndex(p, r, c); + const Index idx3 = dm3.baseIndex(p, r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row) && (p == start_plane)) + ? rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock kernel; + kernel.packet[0] = pad0 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? 
pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + k += packet_size; + } + } + } + } + + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + + } else { + // Packet can span multiple planes, rows or columns, so we have to go + // though the slower "standard" path. + for (; k < peeled_k; k += packet_size) { + PacketBlock kernel; + kernel.packet[0] = dm0.loadPacketStandard(k); + kernel.packet[1] = dm1.loadPacketStandard(k); + kernel.packet[2] = dm2.loadPacketStandard(k); + kernel.packet[3] = dm3.loadPacketStandard(k); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + } + } + } + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!non_standard_patches) { + for (; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; + +// Template specialization for packet_size = 2. We must special-case packet +// blocks with nr > packet_size, e.g. PacketBlock. +// +// TODO(ezhulenev): Add support for squeezing reads along two innermost +// dimensions (see eigen_spatial_convolutions). 
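+//
+// With a 2-wide packet there is no 4-wide ptranspose, so the fast path below
+// transposes two 2x2 packet blocks (dm0/dm1 and dm2/dm3) and stores them
+// interleaved (kernel0.packet[0], kernel1.packet[0], kernel0.packet[1],
+// kernel1.packet[1]), which reproduces the same A0 B0 C0 D0 A1 B1 C1 D1
+// ordering as the generic specialization above.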
+template +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 2, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 2, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper DataMapper; + typedef typename packet_traits::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const int packet_size = 2; + + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if (!non_standard_patches) { + // FAST PATH: + // Iterate over patch columns, rows and planes if we know that a single + // packet do not span across multiple planes, rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + + const Index start_plane = ((c == start_col) && (r == start_row)) + ? rhs.planeOffset() + : 0; + const Index max_plane = rhs.maxPlane(peeled_k, c, r); + + const bool pad_row0 = dm0.padRow(r); + const bool pad_row1 = dm1.padRow(r); + const bool pad_row2 = dm2.padRow(r); + const bool pad_row3 = dm3.padRow(r); + + for (Index p = start_plane; p < max_plane; ++p) { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_col0 || pad_row0 || dm0.padPlane(p); + const bool pad1 = pad_col1 || pad_row1 || dm1.padPlane(p); + const bool pad2 = pad_col2 || pad_row2 || dm2.padPlane(p); + const bool pad3 = pad_col3 || pad_row3 || dm3.padPlane(p); + + const Index idx0 = dm0.baseIndex(p, r, c); + const Index idx1 = dm1.baseIndex(p, r, c); + const Index idx2 = dm2.baseIndex(p, r, c); + const Index idx3 = dm3.baseIndex(p, r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row) && (p == start_plane)) + ? rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock kernel0; + PacketBlock kernel1; + kernel0.packet[0] = pad0 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx0); + kernel0.packet[1] = pad1 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx1); + kernel1.packet[0] = pad2 ? 
pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx2); + kernel1.packet[1] = pad3 ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, idx3); + ptranspose(kernel0); + ptranspose(kernel1); + pstoreu(block + 0 * packet_size, kernel0.packet[0]); + pstoreu(block + 1 * packet_size, kernel1.packet[0]); + pstoreu(block + 2 * packet_size, kernel0.packet[1]); + pstoreu(block + 3 * packet_size, kernel1.packet[1]); + block += 4 * packet_size; + k += packet_size; + } + } + } + } + + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + + } else { + for (; k < peeled_k; k += packet_size) { + PacketBlock kernel0; + PacketBlock kernel1; + kernel0.packet[0] = dm0.loadPacketStandard(k); + kernel0.packet[1] = dm1.loadPacketStandard(k); + kernel1.packet[0] = dm2.loadPacketStandard(k); + kernel1.packet[1] = dm3.loadPacketStandard(k); + ptranspose(kernel0); + ptranspose(kernel1); + pstoreu(block + 0 * packet_size, kernel0.packet[0]); + pstoreu(block + 1 * packet_size, kernel1.packet[0]); + pstoreu(block + 2 * packet_size, kernel0.packet[1]); + pstoreu(block + 3 * packet_size, kernel1.packet[1]); + block += 4 * packet_size; + } + } + } + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!rhs.nonStandardPatches()) { + for (; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; + +// Special case for non-vectorized types such as float16 (packet_size = 1). +template +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, /*packet_size*/ 1, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, + Alignment> + SubMapper; + typedef SubMapper DataMapper; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + const Index packet_cols4 = (cols / 4) * 4; + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + if (!rhs.nonStandardPatches()) { + for (Index k = 0; k < depth; k++) { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } else { + for (Index k = 0; k < depth; k++) { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). 
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + *block = dm0(k); + block += 1; + } + } + } +}; +#endif + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +// Pack a block of the right input matrix (in our case it's always a "virtual +// matrix" constructed from extracted image patches) in contiguous block in +// column-major storage order. Knowing the properties of the original patch op +// we can do it more efficient than the default gemm_pack_colmajor_block. +// +// TODO(ezhulenev): gemm_pack_colmajor_block for spatial convolutions supports +// squeezing reads along the 2 innermost dimensions, add it here if needed. +template +struct gemm_pack_colmajor_block< + Scalar, StorageIndex, + TensorContractionSubMapper< + Scalar, StorageIndex, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment>, + ColMajor> { + typedef TensorContractionSubMapper< + Scalar, StorageIndex, Rhs, + TensorEvaluator >, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, + inner_dim_reordered, Alignment> + SubMapper; + + typedef SubMapper DataMapper; + typedef typename packet_traits::type Packet; + + EIGEN_DONT_INLINE + void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows, + StorageIndex cols) { + const bool standard_patches = !rhs.nonStandardPatches(); + + if (standard_patches && rhs.patchDepth() % packet_size == 0) { + packStandardPatches(block, rhs, rows, cols); + + } else if (standard_patches) { + packStandardPatches(block, rhs, rows, cols); + + } else { + // With non-standard patches we don't do any vectorized loads. + // TODO(ezhulenev): It doesn't look like that we should completely give up + // on packets. Make this code path faster! + for (StorageIndex col = 0; col < cols; ++col) { + SubMapper lm = rhs.getLinearMapper(0, col); + for (StorageIndex i = 0; i < rows; ++i) { + *block = lm(i); + ++block; + } + } + } + } + + private: + // Pack standard volume patches: + // + // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have + // depth dimension size to be a multiple of packet size, so we can skip all + // non vectorized loads and checks. + // + template + EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block, + const DataMapper& rhs, + StorageIndex rows, + StorageIndex cols) { + eigen_assert(!rhs.nonStandardPatches()); + + // Give vectorized_rows the name used in all other gemm_pack_rhs above. + const Index peeled_k = (rows / packet_size) * packet_size; + + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (StorageIndex col = 0; col < cols; ++col) { + SubMapper lm = rhs.getLinearMapper(0, col); + + Index k = 0; + for (Index c = start_col; c < max_col; ++c) { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + const bool pad_col = lm.padCol(c); + + for (Index r = start_row; r < max_row; ++r) { + eigen_assert(k <= peeled_k); + + const Index start_plane = + ((c == start_col) && (r == start_row)) ? rhs.planeOffset() : 0; + const Index max_plane = rhs.maxPlane(peeled_k, c, r); + const bool pad_row = pad_col || lm.padRow(r); + + for (Index p = start_plane; p < max_plane; ++p) { + eigen_assert(k <= peeled_k); + + const Index start_depth = + ((c == start_col) && (r == start_row) && (p == start_plane)) + ? 
rhs.depthOffset() + : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + + const bool pad = pad_col || pad_row || lm.padPlane(p); + const Index base_idx = lm.baseIndex(p, r, c); + + if (patch_depth_is_multiple_of_packet_size) + eigen_assert((max_depth - start_depth) % packet_size == 0); + + // If patch depth is a multiple of packet size, it's guaranteed that + // we can process all values in depth dimension with packets. + const Index max_vectorized_depth = + patch_depth_is_multiple_of_packet_size + ? max_depth + : max_depth - packet_size; + + Index d = start_depth; + + // 1. Process depth dimension with vectorized instructions. + for (; d < max_vectorized_depth; d += packet_size) { + eigen_assert(k < peeled_k); + const Packet packet = pad ? pset1(Scalar(0)) + : rhs.packetNoPadding(d, base_idx); + internal::pstoreu(block, packet); + block += packet_size; + k += packet_size; + } + + // 2. Finish with coefficients. + if (!patch_depth_is_multiple_of_packet_size) { + for (; d < max_depth; d++) { + eigen_assert(k < peeled_k); + *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx); + ++block; + ++k; + } + } + } + } + } + + // The loop above should fill peeled_k elements. + eigen_assert(peeled_k == k); + + // Fill remaining elements using loadCoeffStandard. + for (; k < rows; ++k) { + *block = lm.loadCoeffStandard(k); + ++block; + } + } + } +}; +#endif // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) + +} // namespace internal + +/** CuboidConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 3D convolution over a multichannel input voxel block. + * + * The input parameter is expected to be a tensor with a rank of 4 or more + * (channels, depth, height, width, and optionally others). + * The kernel parameter is expected to be a 5D tensor (filters, channels, + * kernel_depth, kernel_height, kernel_width). + * The result can be assigned to a tensor of rank equal to the rank of the + * input. The dimensions of the result will be filters, depth, height, width + * (and others if applicable). + * + * The input and kernel have to be in the same layout, and both row-major and + * col-major are supported. The shapes given above are for col-major layout. + * For row-major, all dimensions should be reversed. + * + * It is possible to swap the order of the depth, width, and height dimensions + * provided that the same order is used in the input, the kernel, and the + * output. 
+ */ +template +EIGEN_ALWAYS_INLINE static const std::conditional_t< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp > > >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp >, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel> > > > +CuboidConvolution(const Input& input, const Kernel& kernel, + const Index stridePlanes = 1, const Index strideRows = 1, + const Index strideCols = 1, + const PaddingType padding_type = PADDING_SAME) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + in(input); + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + kern(kernel); + + EIGEN_STATIC_ASSERT( + internal::traits::Layout == internal::traits::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the + // result. + const TensorIndex kernelFilters = + isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + const TensorIndex kernelChannels = + isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + + // Spatial size of the kernel. + const TensorIndex kernelPlanes = + isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = + isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = + isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + if (isColMajor) { + eigen_assert(kernelChannels == in.dimension(0)); + } else { + eigen_assert(kernelChannels == in.dimension(NumDims - 1)); + } + + const TensorIndex inputPlanes = + isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = + isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = + isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); + + TensorIndex out_planes; + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_planes = Eigen::divup(inputPlanes - kernelPlanes + 1, + static_cast(stridePlanes)); + out_height = Eigen::divup(inputRows - kernelRows + 1, + static_cast(strideRows)); + out_width = Eigen::divup(inputCols - kernelCols + 1, + static_cast(strideCols)); + break; + case PADDING_SAME: + out_planes = + Eigen::divup(inputPlanes, static_cast(stridePlanes)); + out_height = + Eigen::divup(inputRows, static_cast(strideRows)); + out_width = Eigen::divup(inputCols, static_cast(strideCols)); + break; + default: + out_planes = 0; + out_height = 0; + out_width = 0; + eigen_assert(false && "unexpected padding"); + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelPlanes * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelPlanes * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the + // kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = + kernelChannels * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[1] = out_planes * out_height * out_width; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = + kernelChannels * kernelPlanes * kernelRows * kernelCols; + pre_contract_dims[0] = out_planes * out_height * out_width; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + // Molds the output of the contraction into the shape expected by the user + // (assuming ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output depth + // - 3nd dim: output height + // - 4rd dim: output width + // - 5th dim and beyond: everything else including batch size + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_planes; + post_contract_dims[2] = out_height; + post_contract_dims[3] = out_width; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_planes; + post_contract_dims[NumDims - 3] = out_height; + post_contract_dims[NumDims - 4] = out_width; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_volume_patches( + kernelPlanes, kernelRows, kernelCols, stridePlanes, + strideRows, strideCols, padding_type) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + input + .extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + stridePlanes, strideRows, strideCols, + padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_pooling.h b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_pooling.h new file mode 
100644 index 00000000..ac701df0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/eigen_pooling.h @@ -0,0 +1,546 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +namespace Eigen { + +/** SpatialMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input image. + * + * The input parameter is expected to be a with a rank of 4 (channels, height, + * width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the + * input. The dimensions of the result will be channels, height, width, and + * others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * + */ +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer< + std::remove_const_t::Scalar>>, + std::conditional_t< + internal::traits::Layout == ColMajor, + const Eigen::IndexList, Eigen::type2index<2>>, + const Eigen::IndexList, Eigen::type2index<3>>>, + const TensorImagePatchOp>> +SpatialMaxPooling(const Input& input, DenseIndex patchRows, + DenseIndex patchCols, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + in(input); + + const DenseIndex patchRowsEff = + patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = + patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. 
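Both the CuboidConvolution output extents above and the pooling post_reduce_dims below are sized with the same divup arithmetic: VALID padding uses divup(input - effective_patch + 1, stride) and SAME uses divup(input, stride), where the effective patch extent folds in the input (dilation) stride. A small self-contained check of that arithmetic follows; the helper names are illustrative, not part of these headers.

#include <cassert>
#include <cstdio>

// Same rounding-up division the headers use via Eigen::divup.
long DivUp(long num, long den) { return (num + den - 1) / den; }

// Effective patch extent with an input (dilation) stride, mirroring
// patchRowsEff above: patch + (patch - 1) * (in_stride - 1).
long EffectivePatch(long patch, long in_stride) {
  return patch + (patch - 1) * (in_stride - 1);
}

long OutputSizeValid(long in, long patch, long stride, long in_stride = 1) {
  return DivUp(in - EffectivePatch(patch, in_stride) + 1, stride);
}

long OutputSizeSame(long in, long stride) { return DivUp(in, stride); }

int main() {
  // 10-wide input, 3-wide patch, stride 2, no dilation.
  assert(OutputSizeValid(10, 3, 2) == 4);  // divup(8, 2)
  assert(OutputSizeSame(10, 2) == 5);      // divup(10, 2)
  // With an input stride of 2 the 3-wide patch effectively covers 5 elements.
  assert(OutputSizeValid(10, 3, 2, 2) == 3);  // divup(6, 2)
  std::printf("output size checks passed\n");
  return 0;
}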
+ // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> + post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)) - patchRowsEff + 1, + strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)) - patchColsEff + 1, + strideCols); + } else { + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)), strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)), strideCols); + } + post_reduce_dims[3] = in.dimension(3); + + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + std::conditional_t< + internal::traits::Layout == ColMajor, + const Eigen::IndexList, Eigen::type2index<2>>, + const Eigen::IndexList, Eigen::type2index<3>>> + reduction_dims; + + return input + .extract_image_patches( + patchRows, patchCols, strideRows, strideCols, in_strideRows, + in_strideCols, padding_type, + Eigen::NumTraits::Scalar>>::lowest()) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + +/** CuboidMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, + * depth, height, width, others in col-major, and the reverse of that in + * row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the + * input. The dimensions of the result will be channels, depth, height, width, + * and others (in col-major, and the reverse of that if the input was + * row-major). + * + * The order of the depth, width and height dimensions can be swapped if + * needed. + * + */ +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer< + std::remove_const_t::Scalar>>, + const Eigen::IndexList>, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp>>> +CuboidMaxPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, + YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 
3 : 1; + + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> + post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = Eigen::divup( + static_cast(in.dimension(idxPlanes)) - patchPlanes + 1, + stridePlanes); + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)) - patchRows + 1, + strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)) - patchCols + 1, + strideCols); + } else { + post_reduce_dims[idxPlanes] = Eigen::divup( + static_cast(in.dimension(idxPlanes)), stridePlanes); + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)), strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)), strideCols); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * + post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * + post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef std::remove_const_t::Scalar> + CoeffReturnType; + + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList > reduction_dims; + return input + .extract_volume_patches(patchPlanes, patchRows, patchCols, stridePlanes, + strideRows, strideCols, padding_type, + -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + +/** SpatialAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 4 (channels, + * height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the + * input. The dimensions of the result will be channels, height, width, and + * others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * + */ +namespace internal { + +template +struct AvgPoolMeanReducer { +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) && \ + !defined(__HIPCC__) + // We only support packet access for floats. 
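CuboidMaxPooling above avoids a multi-axis reduction by reshaping the extracted volume patches into a rank-3 tensor whose middle dimension holds all patch elements, and then reducing over that single dimension before reshaping to the final output. A rough sketch of how pre_reduce_dims and post_reduce_dims come out for a col-major input with VALID padding; the numbers are illustrative only.

#include <cstdio>

int main() {
  // Col-major 5D input: (channels, planes, rows, cols, batch).
  const long channels = 8, planes = 6, rows = 10, cols = 10, batch = 2;
  const long patchP = 2, patchR = 3, patchC = 3;
  const long strideP = 2, strideR = 2, strideC = 2;

  auto divup = [](long n, long d) { return (n + d - 1) / d; };

  // VALID-padding output extents, as in post_reduce_dims above.
  const long outP = divup(planes - patchP + 1, strideP);  // 3
  const long outR = divup(rows - patchR + 1, strideR);    // 4
  const long outC = divup(cols - patchC + 1, strideC);    // 4

  // pre_reduce_dims: (channels, patch volume, all output positions * batch).
  const long pre0 = channels;
  const long pre1 = patchP * patchR * patchC;  // the dimension being reduced
  const long pre2 = outP * outR * outC * batch;

  std::printf("pre_reduce_dims  = {%ld, %ld, %ld}\n", pre0, pre1, pre2);
  std::printf("post_reduce_dims = {%ld, %ld, %ld, %ld, %ld}\n",
              channels, outP, outR, outC, batch);
  return 0;
}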
+ static constexpr bool PacketAccess = internal::is_same::value; +#else + static const bool PacketAccess = false; +#endif + static constexpr bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { + typedef typename packet_traits::type Packet; +#if defined(__HIPCC__) + packetCount_ = 0; +#else + packetCount_ = pset1(T(0.0)); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + if (t != -Eigen::NumTraits::highest()) { + (*accum) = (*accum) + t; + scalarCount_++; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + eigen_assert(scalarCount_ > 0); + return accum / T(scalarCount_); + } + +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) && \ + !defined(__HIPCC__) +#ifdef EIGEN_VECTORIZE_AVX512 +#define pequal(a, b) \ + _mm512_castsi512_ps( \ + _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)) + + // The ternarylogic function immediate determines the values in the result + // In the case below, 0xd8 implies (false_mask) ? (b) : (a) + // For details, refer to the vpternlogd instruction table at + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + +#define psel(a, b, false_mask) \ + _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ + _mm512_castps_si512(a), _mm512_castps_si512(b), \ + _mm512_castps_si512(false_mask), 0xd8)) +#elif defined EIGEN_VECTORIZE_AVX +#define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ) +#define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask) +#else +#define pequal(a, b) _mm_cmpeq_ps(a, b) +#define psel(a, b, false_mask) \ + _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)) +#endif + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, + Packet* accum) { + reducePacketWithType(static_cast(0), p, accum); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacketWithType( + T, const Packet& p, Packet* accum) { + Packet skip_mask = + pequal(p, pset1(-Eigen::NumTraits::highest())); + (*accum) = padd(*accum, psel(p, pset1(0), skip_mask)); + packetCount_ = padd( + packetCount_, psel(pset1(1), pset1(0), skip_mask)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet + finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, packetCount_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T + finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_)); + } +#endif + + protected: + typedef typename packet_traits::type Packet; + int scalarCount_; +#if defined(__HIPCC__) + int packetCount_; +#else + Packet packetCount_; +#endif +}; + +template +struct reducer_traits, Device> { + enum { + Cost = 1, +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) && \ + !defined(__HIPCC__) + // We only support packet access for floats. 
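The AvgPoolMeanReducer divides by the number of values that were actually inside the image: padded positions are filled with -highest() by the patch extraction and are skipped both in the scalar reduce() and, via the pequal/psel masks, in the packet path. A scalar-only sketch of that behaviour, standalone and not using the Eigen reducer interface:

#include <cassert>
#include <cmath>
#include <limits>
#include <vector>

// Mean over a patch that ignores the padding sentinel, mirroring the
// scalar reduce()/finalize() pair of AvgPoolMeanReducer.
float MeanIgnoringPadding(const std::vector<float>& patch) {
  const float kPad = -std::numeric_limits<float>::max();  // -highest()
  float accum = 0.f;
  int count = 0;
  for (float v : patch) {
    if (v != kPad) {  // reduce(): padded entries are skipped entirely
      accum += v;
      ++count;
    }
  }
  assert(count > 0);     // finalize() asserts scalarCount_ > 0
  return accum / count;  // divide by valid values only, not the patch size
}

int main() {
  const float kPad = -std::numeric_limits<float>::max();
  // A 2x2 patch hanging over the image border: one padded entry.
  std::vector<float> patch = {1.f, 3.f, 5.f, kPad};
  assert(std::fabs(MeanIgnoringPadding(patch) - 3.f) < 1e-6f);  // (1+3+5)/3
  return 0;
}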
+ PacketAccess = true, +#else + PacketAccess = false, +#endif + IsStateful = true, + IsExactlyAssociative = false + }; +}; + +template <> +struct reducer_traits, GpuDevice> { + enum { + Cost = 1, + PacketAccess = false, + IsStateful = true, + IsExactlyAssociative = false + }; +}; + +} // namespace internal + +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::Index, + internal::traits::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer< + std::remove_const_t::Scalar>>, + std::conditional_t< + internal::traits::Layout == ColMajor, + const Eigen::IndexList, Eigen::type2index<2>>, + const Eigen::IndexList, Eigen::type2index<3>>>, + const TensorImagePatchOp>> +SpatialAvgPooling(const Input& input, DenseIndex patchRows, + DenseIndex patchCols, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + in(input); + + const DenseIndex patchRowsEff = + patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = + patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> + post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)) - patchRowsEff + 1, + strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)) - patchColsEff + 1, + strideCols); + } else { + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)), strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)), strideCols); + } + post_reduce_dims[3] = in.dimension(3); + + typedef std::remove_const_t::Scalar> + CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + std::conditional_t< + internal::traits::Layout == ColMajor, + const Eigen::IndexList, Eigen::type2index<2>>, + const Eigen::IndexList, Eigen::type2index<3>>> + reduction_dims; + return input + .extract_image_patches(patchRows, patchCols, strideRows, strideCols, + in_strideRows, in_strideCols, padding_type, + -Eigen::NumTraits::highest()) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +/** CuboidAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, + * depth, height, width, others, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the + * input. 
The dimensions of the result will be channels, depth, width, and + * others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if + * needed. + * + */ +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer< + std::remove_const_t::Scalar>>, + const Eigen::IndexList>, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp>>> +CuboidAvgPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, + YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, + internal::traits::NumDimensions, + internal::traits::Layout, TensorIndex> > + in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 3 : 1; + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: outupt depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> + post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = Eigen::divup( + static_cast(in.dimension(idxPlanes)) - patchPlanes + 1, + stridePlanes); + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)) - patchRows + 1, + strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)) - patchCols + 1, + strideCols); + } else { + post_reduce_dims[idxPlanes] = Eigen::divup( + static_cast(in.dimension(idxPlanes)), stridePlanes); + post_reduce_dims[idxRows] = Eigen::divup( + static_cast(in.dimension(idxRows)), strideRows); + post_reduce_dims[idxCols] = Eigen::divup( + static_cast(in.dimension(idxCols)), strideCols); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * + post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * + post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef std::remove_const_t::Scalar> + CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
+ Eigen::IndexList > reduction_dims; + return input + .extract_volume_patches(patchPlanes, patchRows, patchCols, stridePlanes, + strideRows, strideCols, padding_type, + -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +} // end namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fake_quant_ops_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fake_quant_ops_functor.h new file mode 100644 index 00000000..5053b5f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fake_quant_ops_functor.h @@ -0,0 +1,290 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_ + +#include + +#define EIGEN_STACK_ALLOCATION_LIMIT 0 +#define EIGEN_USE_THREADS +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float StdRound(float input) { +// On Android, std::round() isn't present, just round(). +#if defined(__ANDROID__) + return round(input); +#else + return std::round(input); +#endif +} + +namespace tensorflow { + +// Gymnastics with nudged zero point is to ensure that real zero maps to +// an integer, which is required for e.g. zero-padding in convolutional layers. +// Outputs nudged_min, nudged_max, nudged_scale. 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge( + const float min, const float max, const int quant_min, const int quant_max, + float* nudged_min, float* nudged_max, float* scale, float* inv_scale) { + const float quant_min_float = static_cast(quant_min); + const float quant_max_float = static_cast(quant_max); + *scale = (max - min) / (quant_max_float - quant_min_float); + // Re-calculate the inverse to avoid loss of precision which would result + // from simply taking the reciprocal of *scale + *inv_scale = (quant_max_float - quant_min_float) / (max - min); + const float zero_point_from_min = quant_min_float - min / *scale; + const uint16 nudged_zero_point = [zero_point_from_min, quant_min, + quant_min_float, quant_max, + quant_max_float] { + if (zero_point_from_min < quant_min_float) { + return static_cast(quant_min); + } + if (zero_point_from_min > quant_max_float) { + return static_cast(quant_max); + } + return static_cast(StdRound(zero_point_from_min)); + }(); + *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); + *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); +} + +template +using ConstScalar = typename tensorflow::TTypes::ConstScalar; +template +using Scalar = typename tensorflow::TTypes::Scalar; +template +using ConstVec = typename tensorflow::TTypes::ConstVec; +template +using Vec = typename tensorflow::TTypes::Vec; +template +using ConstFlat = typename tensorflow::TTypes::ConstFlat; +template +using Flat = typename tensorflow::TTypes::Flat; + +// Functor called by FakeQuantWithMinMaxArgsOp to do the work. Compiles both +// for CPU and GPU. +template +struct FakeQuantWithMinMaxArgsFunctor { + void operator()(const Device& d, ConstFlat inputs, const float min, + const float max, const int quant_min, const int quant_max, + Flat outputs) { + eigen_assert(min <= 0.0f && "min should be <= 0.0"); + eigen_assert(max >= 0.0f && "max should be >= 0.0"); + eigen_assert(min < max && "min should be < max"); + + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min, max, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f); + + auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); + auto clamped_shifted = clamped - nudged_min; + outputs.device(d) = + (clamped_shifted * inv_nudged_scale - quant_zero + 0.5f).floor() * + nudged_scale; + } +}; + +// Functor called by FakeQuantWithMinMaxArgsGradientOp to do the work. Compiles +// both for CPU and GPU. +template +struct FakeQuantWithMinMaxArgsGradientFunctor { + void operator()(const Device& d, ConstFlat gradients, + ConstFlat inputs, const float min, const float max, + const int quant_min, const int quant_max, + Flat backprops) { + eigen_assert(min <= 0.0f && "min should be <= 0.0"); + eigen_assert(max >= 0.0f && "max should be >= 0.0"); + eigen_assert(min < max && "min should be < max"); + + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min, max, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + auto between_nudged_min_max = + (inputs >= nudged_min && inputs <= nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprops.device(d) = gradients * between_nudged_min_max; + } +}; + +// Functor called by FakeQuantWithMinMaxVarsOp to do the work. Compiles both +// for CPU and GPU. 
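Putting Nudge() and the args functor together: with min = -1.0, max = 3.0 and an 8-bit range [0, 255], the scale is 4/255, the raw zero point 63.75 nudges to 64, and the nudged range becomes roughly [-1.0039, 2.9961], so real zero maps exactly onto quantized 64. The sketch below re-implements that arithmetic as plain scalar code for illustration; it is not the header's API, just the same formulas applied to one value.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Same formulas as Nudge() and FakeQuantWithMinMaxArgsFunctor, written as
// standalone scalar code.
float FakeQuant(float x, float min, float max, int qmin, int qmax) {
  const float scale = (max - min) / float(qmax - qmin);
  const float inv_scale = float(qmax - qmin) / (max - min);
  const float zero_from_min = float(qmin) - min / scale;
  const float zero_point =
      std::round(std::max(float(qmin), std::min(float(qmax), zero_from_min)));
  const float nudged_min = (float(qmin) - zero_point) * scale;
  const float nudged_max = (float(qmax) - zero_point) * scale;

  const float clamped = std::min(nudged_max, std::max(nudged_min, x));
  const float quant_zero = std::floor(-nudged_min * inv_scale + 0.5f);
  // Quantize then dequantize: floor(shifted * inv_scale - zero + 0.5) * scale.
  return std::floor((clamped - nudged_min) * inv_scale - quant_zero + 0.5f) *
         scale;
}

int main() {
  // 0.5 snaps to the nearest representable value 32 * (4/255) ~= 0.50196.
  std::printf("%f\n", FakeQuant(0.5f, -1.f, 3.f, 0, 255));
  return 0;
}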
+template +struct FakeQuantWithMinMaxVarsFunctor { + void operator()(const Device& d, ConstFlat inputs, + ConstScalar min, ConstScalar max, + const int quant_min, const int quant_max, + Flat outputs) { + const float min_val = min(); + const float max_val = max(); + // If min and max are both zero, we should just return zero. + if (min_val == 0.0f && max_val == 0.0f) { + outputs.device(d) = outputs.constant(0.0f); + return; + } + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f); + const auto nudged_scale_repl = inputs.constant(nudged_scale); + // const auto inv_nudged_scale_repl = inputs.constant(inv_nudged_scale); + + const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); + const auto clamped_shifted = clamped - nudged_min; + outputs.device(d) = + (clamped_shifted / nudged_scale_repl - quant_zero + 0.5f).floor() * + nudged_scale_repl; + } +}; + +// Functor called by FakeQuantWithMinMaxVarsGradientOp to do the work. Compiles +// both for CPU and GPU. +template +struct FakeQuantWithMinMaxVarsGradientFunctor { + void operator()(const Device& d, ConstFlat gradients, + ConstFlat inputs, ConstScalar min, + ConstScalar max, const int quant_min, + const int quant_max, Flat backprops_wrt_input, + Scalar backprop_wrt_min, + Scalar backprop_wrt_max) { + const float min_val = min(); + const float max_val = max(); + // If min and max are both zero, we propagate everything to inputs. + if (min_val == 0.0f && max_val == 0.0f) { + backprops_wrt_input.device(d) = gradients; + backprop_wrt_min.device(d) = backprop_wrt_min.constant(0.0f); + backprop_wrt_max.device(d) = backprop_wrt_max.constant(0.0f); + return; + } + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + const auto between_min_max = + (inputs >= nudged_min && inputs <= nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprops_wrt_input.device(d) = gradients * between_min_max; + + const auto below_min = + (inputs < nudged_min) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprop_wrt_min.device(d) = (gradients * below_min).sum(); + + const auto above_max = + (inputs > nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprop_wrt_max.device(d) = (gradients * above_max).sum(); + } +}; + +using Index = typename tensorflow::TTypes::ConstTensor::Index; + +// Functor called by FakeQuantWithMinMaxVarsPerChannelOp to do the work. +// Compiles both for CPU and GPU. +// +// Already verified: inputs, outputs are of shape [b, d], min, max are of shape +// [d]. +template +struct FakeQuantWithMinMaxVarsPerChannelFunctor { + void operator()(const Device& d, TTypes::ConstMatrix inputs, + ConstVec min, ConstVec max, const int quant_min, + const int quant_max, TTypes::Matrix outputs) { + for (Index i = 0; i < min.size(); ++i) { + const float min_val = min(i); + const float max_val = max(i); + // If min and max are both zero, we should just return zero. 
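The gradient functors above implement the usual straight-through estimator: gradients pass through unchanged wherever the input fell inside [nudged_min, nudged_max], and everything that was clipped is accumulated into the gradient of min (below the range) or max (above the range). A scalar sketch of that routing; the struct and function names are standalone illustrations, not the functor signature.

#include <cassert>
#include <vector>

struct FakeQuantGrads {
  std::vector<float> wrt_input;
  float wrt_min = 0.f;
  float wrt_max = 0.f;
};

// Route incoming gradients the same way FakeQuantWithMinMaxVarsGradientFunctor
// does, given already-nudged bounds.
FakeQuantGrads RouteGradients(const std::vector<float>& grads,
                              const std::vector<float>& inputs,
                              float nudged_min, float nudged_max) {
  FakeQuantGrads out;
  out.wrt_input.resize(inputs.size());
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i] < nudged_min) {
      out.wrt_min += grads[i];      // clipped from below -> d/dmin
    } else if (inputs[i] > nudged_max) {
      out.wrt_max += grads[i];      // clipped from above -> d/dmax
    } else {
      out.wrt_input[i] = grads[i];  // straight-through inside the range
    }
  }
  return out;
}

int main() {
  FakeQuantGrads g = RouteGradients({1.f, 1.f, 1.f, 1.f},
                                    {-2.f, 0.f, 1.f, 5.f},
                                    /*nudged_min=*/-1.f, /*nudged_max=*/3.f);
  assert(g.wrt_input[1] == 1.f && g.wrt_input[2] == 1.f);
  assert(g.wrt_min == 1.f && g.wrt_max == 1.f);
  return 0;
}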
+ if (min_val == 0.0f && max_val == 0.0f) { + auto chip = outputs.chip<1>(i); + chip.device(d) = chip.constant(0.0f); + continue; + } + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f); + + const auto clamped = + inputs.chip<1>(i).cwiseMin(nudged_max).cwiseMax(nudged_min); + const auto clamped_shifted = clamped - nudged_min; + + outputs.chip<1>(i).device(d) = + (clamped_shifted * inv_nudged_scale - quant_zero + 0.5f).floor() * + nudged_scale; + } + } +}; + +// Functor called by FakeQuantWithMinMaxVarsPerChannelGradientOp to do the work. +// Compiles both for CPU and GPU. +// +// Already verified: gradients, inputs, backprops_wrt_input are of shape [b, d], +// min, max, backprop_wrt_min, backprop_wrt_max are of shape [d]. +template +struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor { + void operator()(const Device& d, TTypes::ConstMatrix gradients, + TTypes::ConstMatrix inputs, ConstVec min, + ConstVec max, const int quant_min, const int quant_max, + TTypes::Matrix backprops_wrt_input, + Vec backprop_wrt_min, Vec backprop_wrt_max) { + for (Index i = 0; i < min.size(); ++i) { + const float min_val = min(i); + const float max_val = max(i); + const auto gradients_chip = gradients.chip<1>(i); + const auto inputs_chip = inputs.chip<1>(i); + // If min and max are both zero, we propagate everything to inputs. + if (min_val == 0.0f && max_val == 0.0f) { + backprops_wrt_input.chip<1>(i).device(d) = gradients_chip; + auto min_chip = backprop_wrt_min.chip<0>(i); + auto max_chip = backprop_wrt_max.chip<0>(i); + min_chip.device(d) = min_chip.constant(0.0f); + max_chip.device(d) = max_chip.constant(0.0f); + continue; + } + float nudged_min, nudged_max, nudged_scale, inv_nudged_scale; + Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max, + &nudged_scale, &inv_nudged_scale); + + const auto between_min_max = + (inputs_chip >= nudged_min && inputs_chip <= nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprops_wrt_input.chip<1>(i).device(d) = + gradients_chip * between_min_max; + + const auto below_min = + (inputs_chip < nudged_min) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + Eigen::DSizes reduce(0); + backprop_wrt_min.chip<0>(i).device(d) = + (gradients_chip * below_min).sum(reduce); + + const auto above_max = + (inputs_chip > nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprop_wrt_max.chip<0>(i).device(d) = + (gradients_chip * above_max).sum(reduce); + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fifo_queue.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fifo_queue.h new file mode 100644 index 00000000..6648fe27 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fifo_queue.h @@ -0,0 +1,93 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FIFO_QUEUE_H_ +#define TENSORFLOW_CORE_KERNELS_FIFO_QUEUE_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/queue_op.h" +#include "tensorflow/core/kernels/typed_queue.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class FIFOQueue : public TypedQueue > { + public: + FIFOQueue(int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, + const string& name); + + // Implementations of QueueInterface methods -------------------------------- + + void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override; + void TryDequeueMany(int num_elements, OpKernelContext* ctx, + bool allow_small_batch, + CallbackWithTuple callback) override; + absl::Status MatchesNodeDef(const NodeDef& node_def) override; + + int32 size() const override { + mutex_lock lock(mu_); + return queues_[0].size(); + } + + protected: + ~FIFOQueue() override {} + + // Helper for dequeuing a single element from queues_. + void DequeueLocked(OpKernelContext* ctx, Tuple* tuple) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + static absl::Status GetElementComponentFromBatch(const Tuple& tuple, + int64_t index, int component, + OpKernelContext* ctx, + Tensor* out_tensor); + + private: + FIFOQueue(const FIFOQueue&) = delete; + void operator=(const FIFOQueue&) = delete; +}; + +// Defines a FIFOQueueOp, which produces a Queue (specifically, one +// backed by FIFOQueue) that persists across different graph +// executions, and sessions. Running this op produces a single-element +// tensor of handles to Queues in the corresponding device. +class FIFOQueueOp : public TypedQueueOp { + public: + explicit FIFOQueueOp(OpKernelConstruction* context); + + private: + absl::Status CreateResource(QueueInterface** ret) override + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + std::vector component_shapes_; + FIFOQueueOp(const FIFOQueueOp&) = delete; + void operator=(const FIFOQueueOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FIFO_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fill_empty_rows_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fill_empty_rows_functor.h new file mode 100644 index 00000000..2298ed92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fill_empty_rows_functor.h @@ -0,0 +1,271 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FILL_EMPTY_ROWS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FILL_EMPTY_ROWS_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +namespace tensorflow { + +namespace functor { + +template +struct FillEmptyRows { + // Note that the done callback is only used by the GPU implementation. + absl::Status operator()(OpKernelContext* context, + const Tensor& default_value_t, + const Tensor& indices_t, const Tensor& values_t, + const Tensor& dense_shape_t, + typename AsyncOpKernel::DoneCallback done = nullptr); +}; + +template +struct FillEmptyRows { + static constexpr int IndicesRank = RaggedOperands ? 1 : 2; + absl::Status operator()(OpKernelContext* context, + const Tensor& default_value_t, + const Tensor& indices_t, const Tensor& values_t, + const Tensor& dense_shape_t, + typename AsyncOpKernel::DoneCallback done) { + (void)done; // Unused (only used in GPU implementation) + const int kOutputIndicesOutput = 0; + const int kOutputValuesOutput = 1; + const int kEmptyRowIndicatorOutput = 2; + const int kReverseIndexMapOutput = 3; + + const T& default_value = default_value_t.scalar()(); + const auto indices = indices_t.tensor(); + const auto values = values_t.vec(); + const auto dense_shape = dense_shape_t.tensor(); + + const Tindex N = indices_t.shape().dim_size(0); + const Tindex dense_rows = dense_shape(0); + + bool* empty_row_indicator = nullptr; + if (context->output_required(kEmptyRowIndicatorOutput)) { + Tensor* empty_row_indicator_t = nullptr; + TensorShape output_shape; + TF_RETURN_IF_ERROR( + TensorShape::BuildTensorShape({dense_rows}, &output_shape)); + TF_RETURN_IF_ERROR(context->allocate_output( + kEmptyRowIndicatorOutput, output_shape, &empty_row_indicator_t)); + empty_row_indicator = empty_row_indicator_t->vec().data(); + } + Tindex* reverse_index_map = nullptr; + if (context->output_required(kReverseIndexMapOutput)) { + Tensor* reverse_index_map_t = nullptr; + TensorShape output_shape; + TF_RETURN_IF_ERROR(TensorShape::BuildTensorShape({N}, &output_shape)); + TF_RETURN_IF_ERROR(context->allocate_output( + kReverseIndexMapOutput, output_shape, &reverse_index_map_t)); + reverse_index_map = reverse_index_map_t->vec().data(); + } + + const int rank = IndicesRank == 1 ? 
1 : indices_t.shape().dim_size(1); + + if (dense_rows == 0) { + if (N != 0) { + return errors::InvalidArgument( + "Received SparseTensor with dense_shape[0] = 0 but " + "indices.shape[0] = ", + N); + } + Tensor* output_indices_t; + TensorShape output_indices_shape; + TF_RETURN_IF_ERROR( + TensorShape::BuildTensorShape({0, rank}, &output_indices_shape)); + TF_RETURN_IF_ERROR(context->allocate_output( + kOutputIndicesOutput, output_indices_shape, &output_indices_t)); + Tensor* output_values_t; + TF_RETURN_IF_ERROR(context->allocate_output( + kOutputValuesOutput, TensorShape({0}), &output_values_t)); + + // Exit early, nothing more to do. + return absl::OkStatus(); + } + + auto vec_or_matrix = [](auto tensor, int index1, int index2) -> auto& { + std::array indices; + indices[0] = index1; + if (IndicesRank == 2) { + indices[1] = index2; + } + return std::apply(tensor, indices); + }; + + bool rows_are_ordered = true; + Tindex last_indices_row = 0; + std::vector csr_offset(dense_rows, 0); + for (int i = 0; i < N; ++i) { + const Tindex row = vec_or_matrix(indices, i, 0); + if (row < 0 || row >= dense_rows) { + return errors::InvalidArgument("indices(", i, ", 0) is invalid: ", row, + " >= ", dense_rows); + } + ++csr_offset[row]; + rows_are_ordered = rows_are_ordered & (row >= last_indices_row); + last_indices_row = row; + } + bool all_rows_full = true; + for (int row = 0; row < dense_rows; ++row) { + // csr_offset here describes the number of elements in this dense row + bool row_empty = (csr_offset[row] == 0); + if (empty_row_indicator) { + empty_row_indicator[row] = row_empty; + } + all_rows_full = all_rows_full & !row_empty; + // In filled version, each row has at least one element. + csr_offset[row] = std::max(csr_offset[row], Tindex{1}); + // Update csr_offset to represent the number of elements up to and + // including dense_row + 1: + // csr_offset(0) == #{elements of row 0} + // csr_offset(1) == #{elements of row 1} + #{elements of row 0} + // .. + // csr_offset(i) == starting index for elements in row i + 1. + if (row > 0) { + csr_offset[row] += csr_offset[row - 1]; + } + } + + if (all_rows_full && rows_are_ordered) { + context->set_output(kOutputIndicesOutput, indices_t); + context->set_output(kOutputValuesOutput, values_t); + if (reverse_index_map) { + for (Tindex i = 0; i < N; ++i) { + reverse_index_map[i] = i; + } + } + } else { + Tensor* output_indices_t; + const Tindex N_full = csr_offset[dense_rows - 1]; + TensorShape output_indices_shape; + if constexpr (RaggedOperands) { + TF_RETURN_IF_ERROR( + TensorShape::BuildTensorShape({N_full}, &output_indices_shape)); + } else { + TF_RETURN_IF_ERROR(TensorShape::BuildTensorShape( + {N_full, rank}, &output_indices_shape)); + } + TF_RETURN_IF_ERROR(context->allocate_output( + kOutputIndicesOutput, output_indices_shape, &output_indices_t)); + auto output_indices = output_indices_t->tensor(); + + Tensor* output_values_t; + TF_RETURN_IF_ERROR(context->allocate_output( + kOutputValuesOutput, TensorShape({N_full}), &output_values_t)); + auto output_values = output_values_t->vec(); + + std::vector filled_count(dense_rows, 0); + + // Fill in values for rows that are not missing + for (Tindex i = 0; i < N; ++i) { + const Tindex row = vec_or_matrix(indices, i, 0); + Tindex& offset = filled_count[row]; + const Tindex output_i = ((row == 0) ? 0 : csr_offset[row - 1]) + offset; + offset++; // Increment the filled count for this row. 
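The CPU path above first builds csr_offset as per-row counts, flags the empty rows, clamps every count to at least one (each empty row will receive one default-valued element), and then turns the counts into an inclusive prefix sum so csr_offset[row - 1] is the output position where row starts. A small standalone walk-through of that bookkeeping, with hypothetical data:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // First index column of the sparse input; dense_shape[0] = 4.
  const std::vector<long> rows = {0, 0, 2};
  const long dense_rows = 4;

  std::vector<long> csr_offset(dense_rows, 0);
  std::vector<bool> empty_row(dense_rows, false);

  for (long r : rows) ++csr_offset[r];  // per-row counts: {2, 0, 1, 0}

  for (long row = 0; row < dense_rows; ++row) {
    empty_row[row] = (csr_offset[row] == 0);          // {F, T, F, T}
    csr_offset[row] = std::max(csr_offset[row], 1L);  // every row gets >= 1 slot
    if (row > 0) csr_offset[row] += csr_offset[row - 1];
  }
  // Inclusive prefix sum: {2, 3, 4, 5}; the filled output has
  // N_full = csr_offset.back() = 5 elements (3 originals + 2 defaults).
  for (long row = 0; row < dense_rows; ++row) {
    std::printf("row %ld: empty=%d, offset=%ld\n", row, int(empty_row[row]),
                csr_offset[row]);
  }
  return 0;
}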
+ std::copy_n(&vec_or_matrix(indices, i, 0), rank, + &vec_or_matrix(output_indices, output_i, 0)); + output_values(output_i) = values(i); + // We'll need this reverse index map to backprop correctly. + if (reverse_index_map) { + reverse_index_map[i] = output_i; + } + } + + // Fill in values for rows that are missing + for (Tindex row = 0; row < dense_rows; ++row) { + const Tindex row_count = filled_count[row]; + if (row_count == 0) { // We haven't filled this row + const Tindex starting_index = (row == 0) ? 0 : csr_offset[row - 1]; + // Remaining index values were set to zero already. + // Just need to set the row index in the right location. + vec_or_matrix(output_indices, starting_index, 0) = row; + for (Tindex col = 1; col < rank; ++col) { + vec_or_matrix(output_indices, starting_index, col) = 0; + } + output_values(starting_index) = default_value; + } + } + } + + return absl::OkStatus(); + } +}; + +template +struct FillEmptyRowsGrad { + absl::Status operator()(OpKernelContext* context, + typename TTypes::ConstVec reverse_index_map, + typename TTypes::ConstVec grad_values, + typename TTypes::Vec d_values, + typename TTypes::Scalar d_default_value); +}; + +template +struct FillEmptyRowsGrad { + absl::Status operator()(OpKernelContext* context, + typename TTypes::ConstVec reverse_index_map, + typename TTypes::ConstVec grad_values, + typename TTypes::Vec d_values, + typename TTypes::Scalar d_default_value) { + const CPUDevice& device = context->eigen_device(); + const Tindex N = reverse_index_map.dimension(0); + const Tindex N_full = grad_values.dimension(0); + + T& d_default_value_scalar = d_default_value(); + d_default_value_scalar = T(); + + Tensor visited_t; + TF_RETURN_IF_ERROR( + context->allocate_temp(DT_BOOL, TensorShape({N_full}), &visited_t)); + auto visited = visited_t.vec(); + visited.device(device) = visited.constant(false); + + for (int i = 0; i < N; ++i) { + // Locate the index of the output of the forward prop associated + // with this location in the input of the forward prop. Copy + // the gradient into it. Mark it as visited. + int64_t reverse_index = reverse_index_map(i); + if (reverse_index < 0 || reverse_index >= N_full) { + return errors::InvalidArgument( + "Elements in reverse index must be in [0, ", N_full, ") but got ", + reverse_index); + } + d_values(i) = grad_values(reverse_index); + visited(reverse_index) = true; + } + for (int j = 0; j < N_full; ++j) { + // The default value gradient gets the accumulated remainder of + // the backprop values (since the default value was used to fill + // in these slots in the forward calculation). + if (!visited(j)) { + d_default_value_scalar += grad_values(j); + } + } + return absl::OkStatus(); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FILL_EMPTY_ROWS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fill_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fill_functor.h new file mode 100644 index 00000000..abdc10ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fill_functor.h @@ -0,0 +1,96 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
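FillEmptyRowsGrad inverts the fill: each original value's gradient is gathered from grad_values through reverse_index_map, and every output slot that no original value mapped to (the rows that were filled with the default) contributes to the default value's gradient instead. A compact standalone sketch of that pass in plain STL:

#include <cassert>
#include <vector>

// d_values[i] = grad_values[reverse_index_map[i]];
// d_default  = sum of the grad_values entries nothing mapped to.
void FillEmptyRowsGradSketch(const std::vector<long>& reverse_index_map,
                             const std::vector<float>& grad_values,
                             std::vector<float>* d_values, float* d_default) {
  d_values->assign(reverse_index_map.size(), 0.f);
  std::vector<bool> visited(grad_values.size(), false);
  for (size_t i = 0; i < reverse_index_map.size(); ++i) {
    const long j = reverse_index_map[i];
    (*d_values)[i] = grad_values[j];
    visited[j] = true;
  }
  *d_default = 0.f;
  for (size_t j = 0; j < grad_values.size(); ++j) {
    if (!visited[j]) *d_default += grad_values[j];  // filled-in slots
  }
}

int main() {
  // 3 original values mapped into a filled output of 5 (see the forward pass).
  std::vector<float> d_values;
  float d_default = 0.f;
  FillEmptyRowsGradSketch({0, 1, 3}, {10.f, 20.f, 30.f, 40.f, 50.f},
                          &d_values, &d_default);
  assert(d_values[0] == 10.f && d_values[1] == 20.f && d_values[2] == 40.f);
  assert(d_default == 30.f + 50.f);  // unvisited slots 2 and 4
  return 0;
}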
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_ + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace functor { + +template +struct FillFunctor { + // Computes on device "d": out = out.constant(in(0)), + void operator()(const Device& d, typename TTypes::Flat out, + typename TTypes::ConstScalar in); +}; + +template +struct SetZeroFunctor { + // Computes on device "d": out = out.setZero(), + void operator()(const Device& d, typename TTypes::Flat out); +}; + +// Partial specialization of SetZeroFunctor. +template +struct SetZeroFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + + +template <> +struct SetZeroFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + +template +struct SetOneFunctor { + // Computes on device "d": out = out.setOne(), + void operator()(const Device& d, typename TTypes::Flat out); +}; + +// Partial specialization of SetOneFunctor. +template +struct SetOneFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + + +template <> +struct SetOneFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + +template +struct SetNanFunctor { + void operator()(const Device& d, typename TTypes::Flat out); +}; + +// Partial specialization of SetNanFunctor. +template +struct SetNanFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + +template <> +struct SetNanFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fractional_pool_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fractional_pool_common.h new file mode 100644 index 00000000..0abb20d2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fractional_pool_common.h @@ -0,0 +1,79 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_ + +#include +#include + +#include "tensorflow/core/util/guarded_philox_random.h" + +namespace tensorflow { + +// Shuffle a container randomly, copied from random_shuffle_op.cc +template +static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) { + if (first == last) { + return; + } + const auto stop = last - 1; + for (auto i = first; i != stop; ++i) { + using std::iter_swap; + iter_swap(i, i + uniform(last - i)); + } +} + +// Generate pooling sequence for fractional pooling along one dimension. +// +// Regular max/avg pooling can be viewed as a special case, in which given the +// * input_length: e.g. 10 +// * output_length: e.g. 5 +// it will generate pooling sequence as +// diff sequence: [2, 2, 2, 2, 2] +// or as +// cumulative sequence: [0, 2, 4, 6, 8, 10] +// +// In the case of fractional pooling, input_length is not an integer multiple of +// output_length, randomness plays a role when generating pooling sequence. +// There are two type of randomness (random vs pseudo-random) defined in paper: +// http://arxiv.org/abs/1412.6071 +// You can check the paper for the difference between these two types. +// +// In summary, the generated diff sequence satisfy the following properties for +// both types of randomness: +// * length(generated_diff_pooling_sequence) = output_length +// * sum(generated_diff_pooling_sequence) = input_length +// * Let's define floor(input_length / output_length) = K, then +// K <= generated_diff_pooling_sequence[i] <= K+1 +// For example, when input_length = 10, output_length = 6, the following are +// valid pooling sequence: +// * [1, 2, 2, 1, 2, 2] +// * [1, 1, 2, 2, 2, 2] +// [1, 3, 2, 2, 2, 2] is not valid. +// +// Args: +// input_length: See above explanation +// output_length: See above explanation +// generator: Parallel version of random number generator +// pseudo_random: Whether or not use pseudo-random +// Returns: +// pooling_sequence: This is the cumulative pooling sequence. +std::vector GeneratePoolingSequence(int input_length, + int output_length, + GuardedPhiloxRandom* generator, + bool pseudo_random); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/function_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/function_ops.h new file mode 100644 index 00000000..552e1e6c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/function_ops.h @@ -0,0 +1,91 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_ + +#include "tensorflow/core/framework/full_type_util.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +static const char* const kArgOp = FunctionLibraryDefinition::kArgOp; +static const char* const kDeviceArgOp = FunctionLibraryDefinition::kDeviceArgOp; +static const char* const kRetOp = FunctionLibraryDefinition::kRetOp; +static const char* const kDeviceRetOp = FunctionLibraryDefinition::kDeviceRetOp; + +class ArgOp : public OpKernel { + public: + explicit ArgOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + bool IsExpensive() override { return false; } + + private: + int index_; + DataType dtype_; + + ArgOp(const ArgOp&) = delete; + void operator=(const ArgOp&) = delete; +}; + +class RetvalOp : public OpKernel { + public: + explicit RetvalOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + bool IsExpensive() override { return false; } + + private: + int index_; + DataType dtype_; + + RetvalOp(const RetvalOp&) = delete; + void operator=(const RetvalOp&) = delete; +}; + +class RemoteCallOp : public AsyncOpKernel { + public: + explicit RemoteCallOp(OpKernelConstruction* ctx); + + ~RemoteCallOp() override {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + string TraceString(const OpKernelContext& ctx, bool verbose) const override; + + private: + NameAttrList func_; + DataTypeVector input_dtypes_; + DataTypeVector output_dtypes_; + // Note that in the future if all RemoteCall ops have full type + // information, the kernel will not need access to the "Tout" Attr and + // return_type_ will replace output_dtypes_. + FullTypeDef return_type_; + + mutex mu_; + typedef std::pair FunctionTarget; + std::map handle_cache_ + TF_GUARDED_BY(mu_); + + RemoteCallOp(const RemoteCallOp&) = delete; + void operator=(const RemoteCallOp&) = delete; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fused_batch_norm_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fused_batch_norm_op.h new file mode 100644 index 00000000..e50d80ae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fused_batch_norm_op.h @@ -0,0 +1,72 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { +namespace functor { + +// FusedBatchNormEx op supports side inputs and activations: +// (1) batch_norm + activation +// (2) batch norm + side input + activation +enum class FusedBatchNormActivationMode { kIdentity, kRelu }; + +std::string ToString(FusedBatchNormActivationMode activation_mode); + +absl::Status ParseActivationMode(OpKernelConstruction* context, + FusedBatchNormActivationMode* activation_mode); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// This is a functor to launch custom CUDA kernel for FusedBatchNorm with side +// input and activation when 'is_training=False'. In training we rely on cuDNN. +template +struct FusedBatchNormInferenceFunctor { + void operator()(OpKernelContext* context, TensorFormat tensor_format, + typename TTypes::ConstTensor in, + typename TTypes::ConstVec scale, + typename TTypes::ConstVec offset, + typename TTypes::ConstVec estimated_mean, + typename TTypes::ConstVec estimated_variance, + typename TTypes::ConstTensor side_input, U epsilon, + FusedBatchNormActivationMode activation_mode, + typename TTypes::Tensor out); +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Functor used by FusedBatchNormGradOp to do the computations when +// is_training=False. +template +struct FusedBatchNormFreezeGrad { + void operator()(OpKernelContext* context, const Tensor& y_backprop_input, + const Tensor& x_input, const Tensor& scale_input, + const Tensor& pop_mean_input, + const Tensor& pop_variance_input, U epsilon, + Tensor* x_backprop_output, Tensor* scale_backprop_output, + Tensor* offset_backprop_output) {} +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fused_eigen_output_kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fused_eigen_output_kernels.h new file mode 100644 index 00000000..84a0d27b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fused_eigen_output_kernels.h @@ -0,0 +1,479 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Output kernels for fusing computation into Eigen Tensor contractions: +// (1) FusedConv2DOp +// (2) FusedMatMulOp +// +// Supported fused computations: +// (1) {Conv2D/MatMul} + BiasAdd + +// (2) {Conv2D/MatMul} + FusedBatchNorm + +// +// Activation: Relu, Relu6, Elu, etc... 
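+//
+// Illustrative sketch (not part of the upstream header): a fused Conv2D or
+// MatMul kernel that fuses {"BiasAdd", "Relu"} would be matched against a
+// pattern along the lines of
+//   FusedComputationPattern{FusedComputationType::kBiasAddWithRelu,
+//                           {"BiasAdd", "Relu"}}
+// by InitializeFusedComputation() declared below.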
+ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +enum class FusedComputationType { + kUndefined, + kBiasAdd, + kBiasAddWithRelu, + kBiasAddWithRelu6, + kBiasAddWithTanh, + kBiasAddWithSigmoid, + kBiasAddWithElu, + kBiasAddWithLeakyRelu, + kBiasAddWithGeluApproximate, + kBiasAddWithGeluExact, + kFusedBatchNorm, + kFusedBatchNormWithRelu, + kFusedBatchNormWithRelu6, + kFusedBatchNormWithElu, + kFusedBatchNormWithLeakyRelu +}; + +// We have to pass around additional arguments for all possible fusion types. +struct FusedComputationArgs { + float epsilon = 0.0; // Used by `FusedBatchNorm` fusion only + float leakyrelu_alpha = 0.0; // Used by `LeakyRelu` fusion only +}; + +struct FusedComputationPattern { + FusedComputationType fused_computation; + std::vector fused_ops; +}; + +// Parse attributes from the kernel construction context, and verifies that they +// specify valid fused computation pattern. +absl::Status InitializeFusedComputation( + OpKernelConstruction* context, const string& kernel_name, + const std::vector& patterns, + FusedComputationType* fused_computation, + FusedComputationArgs* fused_computation_args); + +// Type alias for the tensor contraction output mapper. +template +using ContractionOutputMapper = + Eigen::internal::blas_data_mapper; + +// Returns input expression without any transformations. +struct Identity { + template + static auto apply(XprType expr) -> XprType { + return expr; + }; +}; + +// Applies `Relu` to the passed input expression. +struct Relu { + template + static auto apply(XprType expr) + -> decltype(expr.cwiseMax(std::declval())) { + return expr.cwiseMax(static_cast(0)); + }; +}; + +// Applies `Relu6` to the passed input expression. +struct Relu6 { + template + static auto apply(XprType expr) + -> decltype(expr.cwiseMax(std::declval()) + .cwiseMin(std::declval())) { + return expr.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(6)); + }; +}; + +// Applies `Tanh` to the passed input expression. +struct Tanh { + template + static auto apply(XprType expr) -> decltype(expr.tanh()) { + return expr.tanh(); + }; +}; + +// Applies `Sigmoid` to the passed input expression. +struct Sigmoid { + template + static auto apply(XprType expr) -> decltype(expr.sigmoid()) { + return expr.sigmoid(); + }; +}; + +// Applies `Elu` to the passed input expression. +struct Elu { + template + static auto apply(XprType expr) -> decltype( + (expr < std::declval()) + .select(expr.exp() - + expr.constant(std::declval()), + expr)) { + return (expr < static_cast(0)) + .select(expr.exp() - + expr.constant(static_cast(1)), + expr); + }; +}; + +// Applies `LeakyRelu` to the passed input expression. 
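+// (Illustrative note: i.e. f(x) = x for x >= 0 and f(x) = leakyrelu_alpha * x
+// otherwise, as implemented by the select() expression below.)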
+struct LeakyRelu { + template + static auto apply(XprType expr, const float leakyrelu_alpha) -> decltype( + (expr < std::declval()) + .select(expr * + expr.constant(std::declval()), + expr)) { + return (expr < static_cast(0)) + .select(expr * expr.constant(static_cast( + leakyrelu_alpha)), + expr); + }; +}; + +template +struct BiasAddArgs { + const T* bias_add_data = nullptr; + float leakyrelu_alpha; + + static bool IsSupported(FusedComputationType fusion) { + return fusion == FusedComputationType::kBiasAdd || + fusion == FusedComputationType::kBiasAddWithRelu || + fusion == FusedComputationType::kBiasAddWithRelu6 || + fusion == FusedComputationType::kBiasAddWithTanh || + fusion == FusedComputationType::kBiasAddWithSigmoid || + fusion == FusedComputationType::kBiasAddWithElu || + fusion == FusedComputationType::kBiasAddWithLeakyRelu; + } +}; + +template +struct FusedBatchNormArgs { + const T* scale_data = nullptr; + const T* offset_data = nullptr; + const T* estimated_mean_data = nullptr; + const T* estimated_variance_data = nullptr; + + // Precomputed expression: + // scaling_factor = (estimated_variance + epsilon).rsqrt() * scale + Eigen::Tensor scaling_factor; + + float leakyrelu_alpha; + + static bool IsSupported(FusedComputationType fusion) { + return fusion == FusedComputationType::kFusedBatchNorm || + fusion == FusedComputationType::kFusedBatchNormWithRelu || + fusion == FusedComputationType::kFusedBatchNormWithRelu6 || + fusion == FusedComputationType::kFusedBatchNormWithElu || + fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu; + } +}; + +// TensorContraction swaps lhs with rhs, and changes layout from RowMajor +// (default in Tensorflow) to ColMajor (preferred in Eigen), and computes matmul +// using these tensors. +// +// (1) Spatial Convolution (see eigen_spatial_convolutions.h): +// +// TensorContraction output matrix (before reshape) has a ColMajor layout, and +// has dimensions: +// - rows: output_channels +// - cols: all other dimensions +// +// First element in every column is: +// [batch ??, height ??, width ??, out_channel = i] +// +// We do not know what are the values of the 'batch', 'height', and 'width' +// here (if we know original dimensions, they can be computed from 'j'). +// +// Each column of an output block is a continuous slice along the output +// channel dimension, so we can use it to efficiently compute any +// transformation that depends only on a channel value (e.g. add channel +// bias). +// +// (2) Matrix Multiplication (see matmul_op.cc): +// +// For the `MxK * KxN` matrix multiplication, output matrix has a `MxN` +// dimensions. Each column in output block is a slice of the innermost +// dimension of the output matrix starting at offset 'i'. +// +// Example: In Tensorflow MatMul [8x32] * [32x64], each output block column +// will correspond to MatMul output row of size 64 (because Tensorflow uses +// row major storage order). + +// Output kernel that fuses BiasAdd operation into the output of tensor +// contraction + activation function defined by Activation. 
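+// Roughly, for every column of the output block it computes (illustrative
+// sketch under the layout described above, not upstream code):
+//   output(0:num_rows, col) =
+//       Activation::apply(output(0:num_rows, col) + bias(i:i + num_rows));
+// where `i` is the first output-channel row covered by this block.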
+template +struct BiasAddOutputKernel { + explicit BiasAddOutputKernel(const BiasAddArgs& args) + : bias_data(args.bias_add_data) {} + + template + EIGEN_ALWAYS_INLINE void operator()( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, StorageIndex i, + StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { + DCHECK(params.swapped_arguments); + + const T* bias_base = bias_data + i; + typename TTypes::UnalignedConstTensor bias(bias_base, num_rows); + + for (int col = 0; col < num_cols; ++col) { + Scalar* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + if constexpr (std::is_same_v) { + const auto expr = output + bias; + output = Activation::template apply(expr); + } else { + const auto bias_expr = bias.template cast(); + const auto expr = output + bias_expr; + output = Activation::template apply(expr); + } + } + } + + private: + const T* bias_data; +}; + +template +struct BiasAddOutputKernel { + explicit BiasAddOutputKernel(const BiasAddArgs& args) + : bias_data(args.bias_add_data), leakyrelu_alpha(args.leakyrelu_alpha) {} + + template + EIGEN_ALWAYS_INLINE void operator()( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, StorageIndex i, + StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { + DCHECK(params.swapped_arguments); + + const T* bias_base = bias_data + i; + typename TTypes::UnalignedConstTensor bias(bias_base, num_rows); + + for (int col = 0; col < num_cols; ++col) { + Scalar* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + if constexpr (std::is_same_v) { + const auto expr = output + bias; + output = + LeakyRelu::template apply(expr, leakyrelu_alpha); + } else { + const auto bias_expr = bias.template cast(); + const auto expr = output + bias_expr; + output = + LeakyRelu::template apply(expr, leakyrelu_alpha); + } + } + } + + private: + const T* bias_data; + float leakyrelu_alpha; +}; + +// Output kernel that fuses FusedBatchNorm operation into the output of tensor +// contraction + activation function defined by Activation. 
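+// Roughly, for every column of the output block it computes (illustrative
+// sketch):
+//   output = Activation::apply((output - estimated_mean) * scaling_factor
+//                              + offset);
+// with scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
+// precomputed once in FusedBatchNormArgs above.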
+template +struct FusedBatchNormOutputKernel { + FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs& args) + : epsilon(epsilon), + scaling_factor_data(args.scaling_factor.data()), + offset_data(args.offset_data), + estimated_mean_data(args.estimated_mean_data) {} + + template + EIGEN_ALWAYS_INLINE void operator()( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, StorageIndex i, + StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { + DCHECK(params.swapped_arguments); + + const T* scaling_factor_base = scaling_factor_data + i; + const T* offset_base = offset_data + i; + const T* mean_base = estimated_mean_data + i; + + typename TTypes::UnalignedConstTensor scaling_factor(scaling_factor_base, + num_rows); + typename TTypes::UnalignedConstTensor offset(offset_base, num_rows); + typename TTypes::UnalignedConstTensor mean(mean_base, num_rows); + + for (int col = 0; col < num_cols; ++col) { + T* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + + auto scaled = (output - mean) * scaling_factor; + auto shifted = scaled + offset; + + output = Activation::template apply(shifted); + } + } + + private: + T epsilon; + const T* scaling_factor_data; + const T* offset_data; + const T* estimated_mean_data; +}; + +template +struct FusedBatchNormOutputKernel { + FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs& args) + : epsilon(epsilon), + scaling_factor_data(args.scaling_factor.data()), + offset_data(args.offset_data), + estimated_mean_data(args.estimated_mean_data), + leakyrelu_alpha(args.leakyrelu_alpha) {} + + template + EIGEN_ALWAYS_INLINE void operator()( + const ContractionOutputMapper& output_mapper, + const Eigen::TensorContractionParams& params, StorageIndex i, + StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { + DCHECK(params.swapped_arguments); + + const T* scaling_factor_base = scaling_factor_data + i; + const T* offset_base = offset_data + i; + const T* mean_base = estimated_mean_data + i; + + typename TTypes::UnalignedConstTensor scaling_factor(scaling_factor_base, + num_rows); + typename TTypes::UnalignedConstTensor offset(offset_base, num_rows); + typename TTypes::UnalignedConstTensor mean(mean_base, num_rows); + + for (int col = 0; col < num_cols; ++col) { + T* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + + auto scaled = (output - mean) * scaling_factor; + auto shifted = scaled + offset; + + output = LeakyRelu::template apply(shifted, + leakyrelu_alpha); + } + } + + private: + T epsilon; + const T* scaling_factor_data; + const T* offset_data; + const T* estimated_mean_data; + float leakyrelu_alpha; +}; + +// Type aliases for the output kernels, purely for the sake of better launch +// dispatching code readability. 
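+// (Illustrative example: a {Conv2D or MatMul} + BiasAdd + Relu fusion would be
+// launched with the WithBiasAddAndRelu<T> alias defined below.)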
+template +using WithBiasAdd = BiasAddOutputKernel; +template +using WithBiasAddAndRelu = BiasAddOutputKernel; +template +using WithBiasAddAndRelu6 = BiasAddOutputKernel; +template +using WithBiasAddAndTanh = BiasAddOutputKernel; +template +using WithBiasAddAndSigmoid = BiasAddOutputKernel; +template +using WithBiasAddAndElu = BiasAddOutputKernel; +template +using WithBiasAddAndLeakyRelu = BiasAddOutputKernel; +template +using WithFusedBatchNorm = FusedBatchNormOutputKernel; +template +using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel; +template +using WithFusedBatchNormAndRelu6 = FusedBatchNormOutputKernel; +template +using WithFusedBatchNormAndElu = FusedBatchNormOutputKernel; +template +using WithFusedBatchNormAndLeakyRelu = FusedBatchNormOutputKernel; + +template +absl::Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs* args, + const float* leakyrelu_alpha = nullptr) { + // Bias of the following dimensions: [ output_depth ] + const Tensor& bias = context->input(2); + + if (bias.dims() != 1) + return errors::InvalidArgument("bias must be 1-dimensional", + bias.shape().DebugString()); + + const auto data_ptr = [](const Tensor& tensor) -> const T* { + return reinterpret_cast(tensor.tensor_data().data()); + }; + + args->bias_add_data = data_ptr(bias); + + if (leakyrelu_alpha) { + args->leakyrelu_alpha = *leakyrelu_alpha; + } + + return absl::OkStatus(); +} + +template +absl::Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon, + FusedBatchNormArgs* args, + const float* leakyrelu_alpha = nullptr) { + const Tensor& scale = context->input(2); + const Tensor& offset = context->input(3); + const Tensor& estimated_mean = context->input(4); + const Tensor& estimated_variance = context->input(5); + + if (scale.dims() != 1) + return errors::InvalidArgument("scale must be 1-dimensional", + scale.shape().DebugString()); + if (offset.dims() != 1) + return errors::InvalidArgument("offset must be 1-dimensional", + offset.shape().DebugString()); + if (estimated_mean.dims() != 1) + return errors::InvalidArgument("estimated_mean must be 1-dimensional", + estimated_mean.shape().DebugString()); + if (estimated_variance.dims() != 1) + return errors::InvalidArgument("estimated_variance must be 1-dimensional", + estimated_variance.shape().DebugString()); + + const auto data_ptr = [](const Tensor& tensor) -> const T* { + return reinterpret_cast(tensor.tensor_data().data()); + }; + + args->scale_data = data_ptr(scale); + args->offset_data = data_ptr(offset); + args->estimated_mean_data = data_ptr(estimated_mean); + args->estimated_variance_data = data_ptr(estimated_variance); + + // Precompute scaling factor once for all output blocks (kernels). + args->scaling_factor = + (estimated_variance.flat() + static_cast(epsilon)).rsqrt() * + scale.flat(); + + if (leakyrelu_alpha) { + args->leakyrelu_alpha = *leakyrelu_alpha; + } + + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/fuzzing/fuzz_session.h b/third_party/tflite-hdrs/tensorflow/core/kernels/fuzzing/fuzz_session.h new file mode 100644 index 00000000..09c7563d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/fuzzing/fuzz_session.h @@ -0,0 +1,157 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ +#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ + +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/public/session.h" + +// Standard invoking function macro to dispatch to a fuzzer class. +#ifndef PLATFORM_WINDOWS +#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass) \ + extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { \ + static FuzzerClass* fuzzer = new FuzzerClass(); \ + return fuzzer->Fuzz(data, size); \ + } +#else +// We don't compile this for Windows, MSVC doesn't like it as pywrap in Windows +// links all the code into one big object file and there are conflicting +// function names. +#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass) +#endif + +// Standard builder for hooking one placeholder to one op. +#define SINGLE_INPUT_OP_BUILDER(dtype, opName) \ + void BuildGraph(const Scope& scope) override { \ + auto op_node = \ + tensorflow::ops::Placeholder(scope.WithOpName("input"), dtype); \ + (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node); \ + } + +namespace tensorflow { +namespace fuzzing { + +// Create a TensorFlow session using a specific GraphDef created +// by BuildGraph(), and make it available for fuzzing. +// Users must override BuildGraph and FuzzImpl to specify +// (1) which operations are being fuzzed; and +// (2) How to translate the uint8_t* buffer from the fuzzer +// to a Tensor or Tensors that are semantically appropriate +// for the op under test. +// For the simple cases of testing a single op that takes a single +// input Tensor, use the SINGLE_INPUT_OP_BUILDER(dtype, opName) macro in place +// of defining BuildGraphDef. +// +// Typical use: +// class FooFuzzer : public FuzzSession { +// SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity); +// void FuzzImpl(const uint8_t* data, size_t size) { +// ... convert data and size to a Tensor, pass it to: +// RunInputs({{"input", input_tensor}}); +// +class FuzzSession { + public: + FuzzSession() : initialized_(false) {} + virtual ~FuzzSession() {} + + // Constructs a Graph using the supplied Scope. + // By convention, the graph should have inputs named "input1", ... + // "inputN", and one output node, named "output". + // Users of FuzzSession should override this method to create their graph. + virtual void BuildGraph(const Scope& scope) = 0; + + // Implements the logic that converts an opaque byte buffer + // from the fuzzer to Tensor inputs to the graph. Users must override. + virtual void FuzzImpl(const uint8_t* data, size_t size) = 0; + + // Initializes the FuzzSession. Not safe for multithreading. + // Separate init function because the call to virtual BuildGraphDef + // can't be put into the constructor. 
+ Status InitIfNeeded() { + if (initialized_) { + return absl::OkStatus(); + } + initialized_ = true; + + Scope root = Scope::DisabledShapeInferenceScope().ExitOnError(); + SessionOptions options; + session_ = std::unique_ptr(NewSession(options)); + + BuildGraph(root); + + GraphDef graph_def; + TF_CHECK_OK(root.ToGraphDef(&graph_def)); + + Status status = session_->Create(graph_def); + if (!status.ok()) { + // This is FATAL, because this code is designed to fuzz an op + // within a session. Failure to create the session means we + // can't send any data to the op. + LOG(FATAL) << "Could not create session: " << status.message(); + } + return status; + } + + // Runs the TF session by pulling on the "output" node, attaching + // the supplied input_tensor to the input node(s), and discarding + // any returned output. + // Note: We are ignoring Status from Run here since fuzzers don't need to + // check it (as that will slow them down and printing/logging is useless). + void RunInputs(const std::vector >& inputs) { + RunInputsWithStatus(inputs).IgnoreError(); + } + + // Same as RunInputs but don't ignore status + Status RunInputsWithStatus( + const std::vector >& inputs) { + return session_->Run(inputs, {}, {"output"}, nullptr); + } + + // Dispatches to FuzzImpl; small amount of sugar to keep the code + // of the per-op fuzzers tiny. + int Fuzz(const uint8_t* data, size_t size) { + Status status = InitIfNeeded(); + TF_CHECK_OK(status) << "Fuzzer graph initialization failed: " + << status.message(); + // No return value from fuzzing: Success is defined as "did not + // crash". The actual application results are irrelevant. + FuzzImpl(data, size); + return 0; + } + + private: + bool initialized_; + std::unique_ptr session_; +}; + +// A specialized fuzz implementation for ops that take +// a single string. Caller must still define the op +// to plumb by overriding BuildGraph or using +// a plumbing macro. +class FuzzStringInputOp : public FuzzSession { + void FuzzImpl(const uint8_t* data, size_t size) final { + Tensor input_tensor(tensorflow::DT_STRING, TensorShape({})); + input_tensor.scalar()() = + string(reinterpret_cast(data), size); + RunInputs({{"input", input_tensor}}); + } +}; + +} // end namespace fuzzing +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor.h new file mode 100644 index 00000000..607f3c80 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor.h @@ -0,0 +1,183 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_ + +#include "absl/base/prefetch.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +// Helper method to copy using memcpy. +template +SliceIndex HandleCopies(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + SliceIndex slice_elems, + typename TTypes::Tensor out) { + const SliceIndex indices_size = static_cast(indices.dimension(0)); + const SliceIndex batch_size = static_cast(params.dimension(0)); + const Index limit = static_cast(params.dimension(1)); + T* out_base = out.data(); + const T* params_base = params.data(); + if (static_slice_elems >= 0) { + // Give compiler static knowledge of the number of elements/bytes + slice_elems = static_slice_elems; + } + // Compute slice_bytes here so that static knowledge is available + const size_t slice_bytes = slice_elems * sizeof(T); + auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + mutex mu; + // Store the value of invalidate index for printing error information, it's a + // shared variable. + SliceIndex result = -1; + auto work = [&](int64_t start, int64_t end) { + SliceIndex batch_idx = static_cast(start / indices_size); + SliceIndex indices_idx = static_cast(start % indices_size); + SliceIndex batch_idx_end = static_cast(end / indices_size); + SliceIndex indices_idx_end = static_cast(end % indices_size); + + while ((batch_idx < batch_idx_end) || + (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { + SliceIndex i_next = indices_idx + 1; + SliceIndex b_next = batch_idx + 1; + const Index index = internal::SubtleMustCopy(indices(indices_idx)); + if (!FastBoundsCheck(index, limit)) { + mutex_lock l(mu); + result = indices_idx; + return; + } + if ((batch_idx == batch_idx_end && i_next < indices_idx_end) || + (i_next < indices_size)) { + absl::PrefetchToLocalCache(¶ms(batch_idx, indices(i_next), 0)); + absl::PrefetchToLocalCache(&out(batch_idx, i_next, 0)); + b_next = batch_idx; + } else if (b_next <= batch_idx_end) { + absl::PrefetchToLocalCache(¶ms(b_next, indices(0), 0)); + absl::PrefetchToLocalCache(&out(b_next, 0, 0)); + i_next = 0; + } + // Copy using memcpy if possible, otherwise an Eigen loop + // TODO(cwhipkey): avoid linking to framework to get Allocator (to improve + // ahead-of-time compilation binary size). + if (is_simple_type::value) { + // Avoid auto-promotion to Index from SliceIndex by casting. + memcpy( + out_base + (batch_idx * indices_size + indices_idx) * slice_elems, + params_base + (batch_idx * static_cast(limit) + + static_cast(index)) * + slice_elems, + slice_bytes); + } else { + // For non-"simple" types (e.g. strings). 
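+        // (Illustrative note: chip<0>(k) selects the k-th slice along the
+        // leading dimension, so the assignment below copies the gathered
+        // params slice into the output slice element by element rather than
+        // with memcpy, which would not be safe for types such as strings.)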
+ out.template chip<0>(batch_idx).template chip<0>(indices_idx) = + params.template chip<0>(batch_idx).template chip<0>(index); + } + indices_idx = i_next; + batch_idx = b_next; + } + }; + + Shard(worker_threads->num_threads, worker_threads->workers, + batch_size * indices_size, slice_elems * sizeof(T), work); + return result; +} + +template +struct GatherFunctorCPU { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + const int64_t indices_size = indices.size(); + const int64_t slice_size = out.dimension(2); + int64_t bad_i; + + const int64_t batch_size = params.dimension(0); + + bool use_large = (slice_size > std::numeric_limits::max() || + params.size() > std::numeric_limits::max() || + indices_size > std::numeric_limits::max() || + batch_size * indices_size * slice_size > + std::numeric_limits::max()); +#define CALL(elems) \ + do { \ + if (use_large) { \ + bad_i = HandleCopies(ctx, params, indices, \ + slice_size, out); \ + } else { \ + const int32 small_slice = static_cast(slice_size); \ + bad_i = HandleCopies(ctx, params, indices, \ + small_slice, out); \ + } \ + } while (0) + + if (slice_size == 10) + CALL(10); + else if (slice_size == 20) + CALL(20); + else + CALL(-1); +#undef CALL + + return bad_i; + } +}; + +template +struct GatherFunctor { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out); +}; + +template +struct GatherFunctor { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + return GatherFunctorCPU()(ctx, params, indices, out); + } +}; + +template +struct GatherFunctor { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + return GatherFunctorCPU()(ctx, params, indices, out); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched.h new file mode 100644 index 00000000..41b809bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched.h @@ -0,0 +1,201 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_H_ + +#include "absl/base/prefetch.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +// Helper method to copy using memcpy. +template +SliceIndex HandleCopiesBatched(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + SliceIndex slice_elems, + typename TTypes::Tensor out) { + const SliceIndex batch_size = static_cast(params.dimension(0)); + const SliceIndex outer_size = static_cast(params.dimension(1)); + const SliceIndex indices_size = + static_cast(indices.dimension(0)) / batch_size; + + const Index limit = static_cast(params.dimension(2)); + if (static_slice_elems >= 0) { + // Give compiler static knowledge of the number of elements/bytes + slice_elems = static_slice_elems; + } + // Compute slice_bytes here so that static knowledge is available + const size_t slice_bytes = slice_elems * sizeof(T); + auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + mutex mu; + // Store the value of invalidate index for printing error information, it's a + // shared variable. + SliceIndex result = -1; + auto work = [&](int64_t start, int64_t end) { + const int64_t r_start = start % (outer_size * indices_size); + SliceIndex batch_idx = static_cast( + start / (outer_size * indices_size)); + SliceIndex outer_idx = static_cast(r_start / indices_size); + SliceIndex indices_idx = static_cast(r_start % indices_size); + + SliceIndex batch_offset = batch_idx * indices_size; + for (; start < end; ++start) { + SliceIndex i_next = indices_idx + 1; + SliceIndex o_next = outer_idx; + SliceIndex b_next = batch_idx; + SliceIndex b_offset_next = batch_offset; + + if (i_next >= indices_size) { + i_next = 0; + if (++o_next >= outer_size) { + o_next = 0; + ++b_next; + b_offset_next += indices_size; + } + } + if (start + 1 < end) { + absl::PrefetchToLocalCache( + ¶ms(b_next, o_next, indices(b_offset_next + i_next), 0)); + absl::PrefetchToLocalCache(&out(b_next, o_next, i_next, 0)); + } + const Index index = internal::SubtleMustCopy( + indices(batch_offset + indices_idx)); + if (!FastBoundsCheck(index, limit)) { + mutex_lock l(mu); + result = batch_offset + indices_idx; + return; + } + + // Copy using memcpy if possible, otherwise an Eigen loop + // TODO(cwhipkey): avoid linking to framework to get Allocator (to improve + // ahead-of-time compilation binary size). + if (is_simple_type::value) { + // Avoid auto-promotion to Index from SliceIndex by casting. + memcpy( + &out(batch_idx, outer_idx, indices_idx, 0), + ¶ms(batch_idx, outer_idx, static_cast(index), 0), + slice_bytes); + } else { + // For non-"simple" types (e.g. strings). 
+ out.template chip<0>(batch_idx) + .template chip<0>(outer_idx) + .template chip<0>(indices_idx) = + params.template chip<0>(batch_idx) + .template chip<0>(outer_idx) + .template chip<0>(static_cast(index)); + } + + indices_idx = i_next; + outer_idx = o_next; + batch_idx = b_next; + batch_offset = b_offset_next; + } + }; + + Shard(worker_threads->num_threads, worker_threads->workers, + batch_size * outer_size * indices_size, slice_elems * sizeof(T), work); + return result; +} + +template +struct GatherFunctorBatchedCPU { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + const int64_t indices_size = indices.size(); // Includes the batch_size. + const int64_t slice_size = out.dimension(3); + int64_t bad_i; + + const int64_t batch_size = params.dimension(0); + const int64_t outer_size = params.dimension(1); + + bool use_large = (slice_size > std::numeric_limits::max() || + params.size() > std::numeric_limits::max() || + indices_size > std::numeric_limits::max() || + batch_size * outer_size * indices_size * slice_size > + std::numeric_limits::max()); +#define CALL(elems) \ + do { \ + if (use_large) { \ + bad_i = HandleCopiesBatched( \ + ctx, params, indices, slice_size, out); \ + } else { \ + const int32 small_slice = static_cast(slice_size); \ + bad_i = HandleCopiesBatched( \ + ctx, params, indices, small_slice, out); \ + } \ + } while (0) + + // TODO(rmlarsen): Investigate whether these specializations are still + // needed and, if yes, whether the slice sizes are appropriate. + if (slice_size == 10) + CALL(10); + else if (slice_size == 20) + CALL(20); + else + CALL(-1); +#undef CALL + + return bad_i; + } +}; + +template +struct GatherFunctorBatched { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out); +}; + +template +struct GatherFunctorBatched { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + return GatherFunctorBatchedCPU()(ctx, params, indices, out); + } +}; + +template +struct GatherFunctorBatched { + int64_t operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + return GatherFunctorBatchedCPU()(ctx, params, indices, out); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched_gpu.cu.h new file mode 100644 index 00000000..e2cb7597 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_batched_gpu.cu.h @@ -0,0 +1,183 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/gather_functor_batched.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template +__global__ void GatherOpKernel(const ValueOrVec* __restrict__ params, + const Index* __restrict__ indices, + ValueOrVec* __restrict__ out, int64 outer_size, + int64 gather_dim_size, int64 indices_size, + int64 slice_size, int64 out_size) { + // params is a tensor of shape + // [batch_size, outer_size, gather_dim_size, slice_size]. + GPU_1D_KERNEL_LOOP(i, out_size) { + Index batch_i = 0; // The batch index into params to use for i. + Index outer_i = 0; // The outer index into params to use for i. + Index indices_i = 0; // The index into indices to use for i. + Index slice_i = 0; // Index into the current slice in params to use for i. + + const Index slices_count = i / slice_size; + if (is_batch_dims_zero) { + if (is_axis_zero) { + indices_i = slices_count; + } else { + outer_i = slices_count / indices_size; + indices_i = slices_count - outer_i * indices_size; + } + } else { + const Index entries_count = slices_count / indices_size; + if (is_axis_zero) { + batch_i = entries_count; + } else { + batch_i = entries_count / outer_size; + outer_i = entries_count - batch_i * outer_size; + } + indices_i = slices_count - entries_count * indices_size; + } + slice_i = i - slices_count * slice_size; + + // Index into the gather axis to use for i. + Index gather_i = ldg(indices + batch_i * indices_size + indices_i); + + // Check gather_i is in [0, gather_dim_size). + if (!FastBoundsCheck(gather_i, gather_dim_size)) { + // Set indices out of range to zero + // TODO(fpmc): Log an error for transfer back to host. + out[i] = ValueOrVec(0); + } else { + // Read params[batch_i, outer_i, gather_i, slice_i] and write it to the + // i'th position in out. 
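+      // (Illustrative example: params_i below is the row-major flattening of
+      // (batch_i, outer_i, gather_i, slice_i) over a
+      // [batch_size, outer_size, gather_dim_size, slice_size] layout; e.g.
+      // with outer_size = 2, gather_dim_size = 4 and slice_size = 3, the
+      // element (1, 0, 2, 1) lands at ((1 * 2 + 0) * 4 + 2) * 3 + 1 = 31.)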
+ Index params_i = ( + (batch_i * outer_size + outer_i) * gather_dim_size + gather_i + ) * slice_size + slice_i; + out[i] = params[params_i]; + } + } +} + +namespace detail { + +template +struct LaunchGatherKernelVectorized { + template + struct Impl { + template + Status operator()(const GPUDevice& d, const T* params, const Index* indices, + T* out, int64 outer_size, int64 gather_dim_size, + int64 indices_size, int64 slice_size, int64 out_size) { + DCHECK_EQ(slice_size % vec_size, 0); + DCHECK_EQ(out_size % vec_size, 0); + DCHECK_EQ(reinterpret_cast(params) % vec_size, 0); + DCHECK_EQ(reinterpret_cast(out) % vec_size, 0); + int64 out_size_vec = out_size / vec_size; + int64 slice_size_vec = slice_size / vec_size; + using Tvec = AlignedVector; + const Tvec* params_vec = reinterpret_cast(params); + Tvec* out_vec = reinterpret_cast(out); + + GpuLaunchConfig config = GetGpuLaunchConfig( + out_size_vec, d, + &GatherOpKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel( + GatherOpKernel, + config.block_count, config.thread_per_block, 0, d.stream(), + params_vec, indices, out_vec, outer_size, gather_dim_size, + indices_size, slice_size_vec, out_size_vec); + } + }; +}; + +} // namespace detail + +template +Status LaunchGatherKernel(const GPUDevice& d, const T* params, + const Index* indices, T* out, int64 outer_size, + int64 gather_dim_size, int64 indices_size, + int64 slice_size, int64 out_size) { + // Note that the GPU memory allocator always returns aligned buffers, so the + // alignment of data pointers is expected to be deterministic. + // There will be performance cliffs when slice_size is not aligned, but there + // is no easy way to handle the misalignment because each row will be aligned + // differently. + return DispatchToVectorized< + T, detail::LaunchGatherKernelVectorized< + is_axis_zero, is_batch_dims_zero>::template Impl>( + MinAlignmentOf(params, out, slice_size), d, params, indices, out, + outer_size, gather_dim_size, indices_size, slice_size, out_size); +} + +namespace functor { +template +struct GatherFunctorBatched { + int64 operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + const GPUDevice& d = ctx->eigen_gpu_device(); + const int64 out_size = out.size(); + if (out_size == 0) { + // We need a check here since the CPU version does useful error checking + // work if there are nonempty indices but empty slices, so the kernel is + // executed in that case. In the GPU case we don't know how to do error + // checking, so we skip the loop entirely. + return -1; + } + const bool is_batch_dims_zero = params.dimension(0) == 1; + const bool is_axis_zero = params.dimension(1) == 1; + const int64 outer_size = params.dimension(1); + const int64 gather_dim_size = params.dimension(2); + const int64 indices_size = indices.size() / params.dimension(0); + const int64 slice_size = params.dimension(3); + + const auto function = + is_axis_zero + ? (is_batch_dims_zero ? LaunchGatherKernel + : LaunchGatherKernel) + : (is_batch_dims_zero ? LaunchGatherKernel + : LaunchGatherKernel); + TF_CHECK_OK(function(d, params.data(), indices.data(), out.data(), + outer_size, gather_dim_size, indices_size, slice_size, + out_size)); + // TODO(fpmc): enable indices validation on GPU. + // Right now checking for indices out of bound in the kernel would + // require copying code between GPU/CPU, and thus slow. 
+ return -1; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_BATCHED_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_gpu.cu.h new file mode 100644 index 00000000..3ac0d912 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_functor_gpu.cu.h @@ -0,0 +1,165 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/gather_functor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template +__global__ void GatherOpKernel(const ValueOrVec* __restrict__ params, + const Index* __restrict__ indices, + ValueOrVec* __restrict__ out, + int64 gather_dim_size, int64 indices_size, + int64 slice_size, int64 out_size) { + GPU_1D_KERNEL_LOOP(i, out_size) { + Index batch_i = 0; + Index indices_i = 0; + Index slice_i = 0; + if (is_axis_zero) { + indices_i = i / slice_size; + slice_i = i - indices_i * slice_size; + } else { + Index batch_indices_i = i / slice_size; + // The batch index into params to use for i. + batch_i = batch_indices_i / indices_size; + // The index into indices to use for i. + indices_i = batch_indices_i - batch_i * indices_size; + // Index into the current slice in params to use for i. + slice_i = i - batch_indices_i * slice_size; + } + + // Index into the gather axis to use for i. + Index gather_i = ldg(indices + indices_i); + + // Check gather_i is in [0, gather_dim_size). + if (!FastBoundsCheck(gather_i, gather_dim_size)) { + // Set indices out of range to zero + // TODO(fpmc): Log an error for transfer back to host. + out[i] = ValueOrVec(0); + } else { + // params is a [batch_size, gather_dim_size, slice_size] tensor. Read + // params[batch_i, gather_i, slice_i] and write it to the i'th position in + // out. 
+ Index params_i = + (batch_i * gather_dim_size + gather_i) * slice_size + slice_i; + out[i] = params[params_i]; + } + } +} + +namespace detail { + +template +struct LaunchGatherKernelVectorized { + template + struct Impl { + template + Status operator()(const GPUDevice& d, const T* params, const Index* indices, + T* out, int64 gather_dim_size, int64 indices_size, + int64 slice_size, int64 out_size) { + DCHECK_EQ(slice_size % vec_size, 0); + DCHECK_EQ(out_size % vec_size, 0); + DCHECK_EQ(reinterpret_cast(params) % vec_size, 0); + DCHECK_EQ(reinterpret_cast(out) % vec_size, 0); + int64 out_size_vec = out_size / vec_size; + int64 slice_size_vec = slice_size / vec_size; + using Tvec = AlignedVector; + const Tvec* params_vec = reinterpret_cast(params); + Tvec* out_vec = reinterpret_cast(out); + + GpuLaunchConfig config = GetGpuLaunchConfig( + out_size_vec, d, &GatherOpKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel( + GatherOpKernel, config.block_count, + config.thread_per_block, 0, d.stream(), params_vec, indices, out_vec, + gather_dim_size, indices_size, slice_size_vec, out_size_vec); + } + }; +}; + +} // namespace detail + +template +Status LaunchGatherKernel(const GPUDevice& d, const T* params, + const Index* indices, T* out, int64 gather_dim_size, + int64 indices_size, int64 slice_size, + int64 out_size) { + // Note that the GPU memory allocator always returns aligned buffers, so the + // alignment of data pointers is expected to be deterministic. + // There will be performance cliffs when slice_size is not aligned, but there + // is no easy way to handle the misalignment because each row will be aligned + // differently. + return DispatchToVectorized< + T, detail::LaunchGatherKernelVectorized::template Impl>( + MinAlignmentOf(params, out, slice_size), d, params, indices, out, + gather_dim_size, indices_size, slice_size, out_size); +} + +namespace functor { +template +struct GatherFunctor { + int64 operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, + typename TTypes::ConstFlat indices, + typename TTypes::Tensor out) { + const GPUDevice& d = ctx->eigen_gpu_device(); + const int64 out_size = out.size(); + if (out_size == 0) { + // We need a check here since the CPU version does useful error checking + // work if there are nonempty indices but empty slices, so the kernel is + // executed in that case. In the GPU case we don't know how to do error + // checking, so we skip the loop entirely. + return -1; + } + const bool is_axis_zero = params.dimension(0) == 1; + const int64 gather_dim_size = params.dimension(1); + const int64 indices_size = indices.size(); + const int64 slice_size = params.dimension(2); + + if (is_axis_zero) { + TF_CHECK_OK(LaunchGatherKernel(d, params.data(), indices.data(), + out.data(), gather_dim_size, + indices_size, slice_size, out_size)); + } else { + TF_CHECK_OK(LaunchGatherKernel( + d, params.data(), indices.data(), out.data(), gather_dim_size, + indices_size, slice_size, out_size)); + } + // TODO(fpmc): enable indices validation on GPU. + // Right now checking for indices out of bound in the kernel would + // require copying code between GPU/CPU, and thus slow. 
+ return -1; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op.h new file mode 100644 index 00000000..b53e1348 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op.h @@ -0,0 +1,179 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_ +// Functor definition for GatherOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/bad_indices_policy.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { +class OpKernelContext; +class Tensor; + +namespace functor { + +template +struct GatherNdSlice { + // Performs a slice gather op on (Tparams, Tindices), writing to Tout. + // Returns an index to Tindices if the value at that index is out of range. + // Returns -1 if all values of Tindices are in range. + Index operator()(const Device& d, const Index slice_size, + typename TTypes::Scalar Tscratch, + typename TTypes::ConstTensor Tparams, + typename TTypes::ConstMatrix Tindices, + typename TTypes::Matrix Tout); +}; + +template +absl::Status DoGatherNd( + OpKernelContext* c, const Tensor& params, const Tensor& indices, + Tensor* out, + BadIndicesPolicy bad_indices_policy = BadIndicesPolicy::kDefault) { + if (!TensorShapeUtils::IsVectorOrHigher(params.shape())) { + return errors::InvalidArgument("params must be at least a vector"); + } + if (!TensorShapeUtils::IsVectorOrHigher(indices.shape())) { + return errors::InvalidArgument("indices must be at least a vector"); + } + if (indices.dim_size(indices.dims() - 1) > params.dims()) { + return errors::InvalidArgument( + "index innermost dimension length must be <= params rank; saw: ", + indices.dim_size(indices.dims() - 1), " vs. 
", params.dims()); + } + + const TensorShape& indices_shape(indices.shape()); + const int64_t indices_nd = indices_shape.dim_size(indices_shape.dims() - 1); + + // Check that we have enough index space + int64_t N_big = 1; + for (int i = 0; i < indices_shape.dims() - 1; ++i) { + N_big *= indices_shape.dim_size(i); + } + if (N_big > std::numeric_limits::max()) { + return errors::InvalidArgument( + "indices has too many elements for int indexing: ", N_big, " > ", + std::numeric_limits::max()); + } + if (params.NumElements() > std::numeric_limits::max()) { + return errors::InvalidArgument("params.NumElements() too large for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", params.NumElements(), " > ", + std::numeric_limits::max()); + } + + // The result shape is + // indices.shape[:-1] + params.shape[indices.shape[-1]:] + Index N_result = 1; + for (int i = 0; i < indices_shape.dims() - 1; ++i) { + N_result *= indices_shape.dim_size(i); + } + + const TensorShape& params_shape(params.shape()); + Index total_nd = params_shape.dims(); + + TensorShape result_shape(indices_shape); + result_shape.RemoveLastDims(1); + + int64_t slice_size_big = 1; + for (Index i = indices_nd; i < total_nd; ++i) { + slice_size_big *= params_shape.dim_size(i); + TF_RETURN_IF_ERROR(result_shape.AddDimWithStatus(params_shape.dim_size(i))); + } + + if (slice_size_big > std::numeric_limits::max()) { + return errors::InvalidArgument( + "slice size is too large for indexing: ", slice_size_big, " > ", + std::numeric_limits::max()); + } + + const Index slice_size = static_cast(slice_size_big); + + TF_RETURN_IF_ERROR( + c->allocate_temp(DataTypeToEnum::value, result_shape, out)); + + if (N_result > 0) { + if (params_shape.num_elements() == 0) { + return errors::InvalidArgument( + "Requested more than 0 entries, but " + "params is empty. Params shape: ", + params_shape.DebugString()); + } + + auto indices_mat = indices.flat_inner_dims(); + + Index bad_i = -1; + + // Request to copy slices / subtensors + // Make out a matrix with the slices the col size. + auto out_mat = out->shaped({N_result, slice_size}); + Tensor scratch; + TF_RETURN_IF_ERROR(c->allocate_temp(DT_INT32, TensorShape(), &scratch)); + auto scratch_scalar = scratch.scalar(); + + switch (indices_nd) { +#define PARAMS_CASE(IXDIM) \ + case IXDIM: { \ + functor::GatherNdSlice func; \ + auto params_flat = params.flat_outer_dims(); \ + bad_i = func(c->eigen_device(), slice_size, scratch_scalar, \ + params_flat, indices_mat, out_mat); \ + } break + PARAMS_CASE(0); + PARAMS_CASE(1); + PARAMS_CASE(2); + PARAMS_CASE(3); + PARAMS_CASE(4); + PARAMS_CASE(5); + PARAMS_CASE(6); + PARAMS_CASE(7); +#undef PARAMS_CASE + default: + return errors::InvalidArgument( + "Only indices.shape[-1] values between 1 and 7 " + "are currently supported. 
Requested rank: ", + indices_nd); + } + using CPUDevice = Eigen::ThreadPoolDevice; + + const bool check_bad_indices = + ((std::is_same::value && + bad_indices_policy == BadIndicesPolicy::kDefault) || + bad_indices_policy == BadIndicesPolicy::kError); + if (check_bad_indices && bad_i >= 0) { + auto shape = indices.shape(); + shape.RemoveLastDims(1); + return errors::InvalidArgument( + "indices", SliceDebugString(shape, bad_i), " = [", + str_util::Join( + gtl::ArraySlice(&indices_mat(bad_i, 0), indices_nd), ", "), + "] does not index into param shape ", params.shape().DebugString(), + ", node name: ", c->op_kernel().name()); + } + } + return absl::OkStatus(); +} + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op_cpu_impl.h new file mode 100644 index 00000000..524f303e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gather_nd_op_cpu_impl.h @@ -0,0 +1,149 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ + +// Specialization of GatherNdSlice to CPU + +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/gather_nd_op.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace generator { + +template +class GatherNdSliceGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE GatherNdSliceGenerator( + const Index slice_size, typename TTypes::ConstMatrix Tindices, + typename TTypes::ConstTensor Tparams, + typename TTypes::Matrix Tout, std::atomic* error_loc) + : slice_size_(slice_size), + Tindices_(Tindices), + Tparams_(Tparams), + Tout_(Tout), + error_loc_(error_loc) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool GenerateIndices( + const Index loc, Eigen::array* ix) const { + (*ix)[IXDIM] = 0; + bool out_of_bounds = false; + for (int i = 0; i < IXDIM; ++i) { + const Index ix_i = internal::SubtleMustCopy(Tindices_(loc, i)); + (*ix)[i] = ix_i; + out_of_bounds |= !FastBoundsCheck(ix_i, Tparams_.dimension(i)); + } + return out_of_bounds; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int32 + operator()(const Eigen::array& loc_array) const { + const Index loc = loc_array[0]; + Eigen::array ix; + Eigen::array ix_out; + ix_out[0] = loc; + ix_out[1] = 0; + const bool out_of_bounds = GenerateIndices(loc, &ix); + if 
(TF_PREDICT_FALSE(out_of_bounds)) { + error_loc_->store(loc); + std::fill_n(&Tout_(ix_out), slice_size_, T()); + } else { + std::copy_n(&Tparams_(ix), slice_size_, &Tout_(ix_out)); + } + + return static_cast(0); // Return something... + } + + private: + const Index slice_size_; + const typename TTypes::ConstMatrix Tindices_; + const typename TTypes::ConstTensor Tparams_; + mutable typename TTypes::Matrix Tout_; + std::atomic* error_loc_; +}; + +} // namespace generator + +namespace functor { + +template +struct GatherNdSlice { + Index operator()(const CPUDevice& d, const Index slice_size, + typename TTypes::Scalar Tscratch, + typename TTypes::ConstTensor Tparams, + typename TTypes::ConstMatrix Tindices, + typename TTypes::Matrix Tout) { + std::atomic error_loc(-1); + const Eigen::Index batch_size = Tindices.dimension(0); + generator::GatherNdSliceGenerator gather_nd_generator( + slice_size, Tindices, Tparams, Tout, &error_loc); + + auto compute_shard = [&](Eigen::Index begin, Eigen::Index end) { + for (Eigen::Index i = begin; i < end; ++i) { + const Eigen::array loc{i}; + gather_nd_generator(loc); + } + }; + Eigen::Index bytes_moved = sizeof(T) * (slice_size + IXDIM); + auto cost = Eigen::TensorOpCost(bytes_moved /* bytes loaded */, + bytes_moved /* bytes stored */, + slice_size + IXDIM /* compute cycles */); + d.parallelFor(batch_size, cost, compute_shard); + + // error_loc() returns -1 if there's no out-of-bounds index, + // otherwise it returns the location of an OOB index in Tindices. + return error_loc.load(); + } +}; + +#define REGISTER_GATHER_ND_FULL(T, Index) \ + template Index \ + GatherNdSlice::operator()( \ + const CPUDevice& d, const Index slice_size, \ + typename TTypes::Scalar Tscratch, \ + typename TTypes::ConstTensor Tparams, \ + typename TTypes::ConstMatrix Tindices, \ + typename TTypes::Matrix Tout); + +#define REGISTER_GATHER_ND_CPU(type) \ + REGISTER_GATHER_ND_FULL(type, int16); \ + REGISTER_GATHER_ND_FULL(type, int32); \ + REGISTER_GATHER_ND_FULL(type, int64) + +TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU); +TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_ND_CPU); +TF_CALL_float8_e5m2(REGISTER_GATHER_ND_CPU); +TF_CALL_float8_e4m3fn(REGISTER_GATHER_ND_CPU); + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gemm_functors.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gemm_functors.h new file mode 100644 index 00000000..8039353e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gemm_functors.h @@ -0,0 +1,153 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is a set of different implementations for the basic matrix by matrix +// multiply function, commonly known as GEMM after the BLAS library's naming. 
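GatherNdSlice above shards the batch dimension with Eigen's ThreadPoolDevice::parallelFor, handing it a per-item TensorOpCost so the pool can pick a sensible shard size. A self-contained sketch of that pattern, with an illustrative thread count and per-element workload:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <vector>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);
  std::vector<float> out(1 << 16, 0.f);
  // Per-item cost estimate (bytes loaded, bytes stored, compute cycles); the
  // device uses it to decide how finely to shard the loop.
  const Eigen::TensorOpCost cost(sizeof(float), sizeof(float), /*cycles=*/8);
  device.parallelFor(static_cast<Eigen::Index>(out.size()), cost,
                     [&out](Eigen::Index begin, Eigen::Index end) {
                       for (Eigen::Index i = begin; i < end; ++i) {
                         out[i] = static_cast<float>(i) * 0.5f;
                       }
                     });
  return 0;
}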
+// Having a standard interface enables us to swap out implementations on +// different platforms, to make sure we're using the optimal version. They are +// implemented as C++ template functors, so they're easy to swap into all of the +// different kernels that use them. + +#if !defined(EIGEN_USE_THREADS) +#error "EIGEN_USE_THREADS must be enabled by all .cc files including this." +#endif // EIGEN_USE_THREADS + +#ifndef TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_ +#define TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_ + +#include + +#include +#include + +#include "tensorflow/core/common_runtime/threadpool_device.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +// Apple provides an optimized BLAS library that is better than Eigen for their +// devices, so use that if possible. +#if defined(__APPLE__) && defined(USE_GEMM_FOR_CONV) +#include +#define USE_CBLAS_GEMM +#endif // __APPLE__ + +// Older Raspberry Pi systems don't have NEON SIMD acceleration, so Eigen falls +// back to scalar code, but OpenBLAS has much faster support so prefer that. +#if defined(RASPBERRY_PI) && defined(USE_GEMM_FOR_CONV) && defined(USE_OPENBLAS) +#include +#define USE_CBLAS_GEMM +#endif + +// A readable but slow implementation of matrix multiplication, useful for +// debugging and understanding the algorithm. Use instead of FastGemmFunctor in +// the Im2ColConvFunctor template definition inside the op registration to +// enable. Assumes row-major ordering of the values in memory. +template +class ReferenceGemmFunctor { + public: + void operator()(tensorflow::OpKernelContext* ctx, size_t m, size_t n, + size_t k, const T1* a, size_t lda, const T2* b, size_t ldb, + T3* c, size_t ldc) { + const size_t a_i_stride = lda; + const size_t a_l_stride = 1; + const size_t b_j_stride = 1; + const size_t b_l_stride = ldb; + const size_t c_i_stride = ldc; + const size_t c_j_stride = 1; + size_t i, j, l; + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + T3 total(0); + for (l = 0; l < k; l++) { + const size_t a_index = ((i * a_i_stride) + (l * a_l_stride)); + const T1 a_value = a[a_index]; + const size_t b_index = ((j * b_j_stride) + (l * b_l_stride)); + const T2 b_value = b[b_index]; + total += (a_value * b_value); + } + const size_t c_index = ((i * c_i_stride) + (j * c_j_stride)); + c[c_index] = total; + } + } + } +}; + +// Uses the optimized EigenTensor library to implement the matrix multiplication +// required by the Im2ColConvFunctor class. We supply the two input and one +// output types so that the accumulator can potentially be higher-precision than +// the inputs, even though we don't currently take advantage of this. +template +class FastGemmFunctor { + public: + void operator()(tensorflow::OpKernelContext* ctx, size_t m, size_t n, + size_t k, const T1* a, size_t lda, const T2* b, size_t ldb, + T3* c, size_t ldc) { + typename tensorflow::TTypes::Matrix a_matrix(a, m, k); + typename tensorflow::TTypes::Matrix b_matrix(b, k, n); + typename tensorflow::TTypes::Matrix c_matrix(c, m, n); + + Eigen::array, 1> dim_pair; + dim_pair[0].first = 1; + dim_pair[0].second = 0; + c_matrix.device(ctx->eigen_device()) = + a_matrix.contract(b_matrix, dim_pair); + } +}; + +// Use float32 accumulation for bfloat16 to deal with precision accumulation +// issues. 
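The bfloat16 specialization below exists because bfloat16 keeps only about 8 bits of significand (2 to 3 decimal digits), so accumulating a long dot product directly in bfloat16 stalls once the running sum outgrows the increments. A self-contained demonstration of the effect; the rounding helper truncates rather than rounds, purely for brevity:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Snap a float to a bfloat16-representable value by keeping the top 16 bits.
static float ToBF16(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;  // sign, 8 exponent bits, top 7 mantissa bits
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

int main() {
  float acc_bf16 = 0.f, acc_f32 = 0.f;
  for (int i = 0; i < 4096; ++i) {
    acc_bf16 = ToBF16(acc_bf16 + ToBF16(0.1f));  // accumulate in bfloat16
    acc_f32 += 0.1f;                             // accumulate in float32
  }
  // The bfloat16 accumulator stops growing long before the true sum of ~409.6,
  // while the float32 accumulator stays close to it.
  std::printf("bf16 acc = %.1f, f32 acc = %.1f\n", acc_bf16, acc_f32);
  return 0;
}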
+template <> +class FastGemmFunctor { + public: + void operator()(tensorflow::OpKernelContext* ctx, size_t m, size_t n, + size_t k, const Eigen::bfloat16* a, size_t lda, + const Eigen::bfloat16* b, size_t ldb, Eigen::bfloat16* c, + size_t ldc) { + using ConstMatrix = + typename tensorflow::TTypes::Matrix; + ConstMatrix a_matrix(a, m, k); + ConstMatrix b_matrix(b, k, n); + typename tensorflow::TTypes::Matrix c_matrix(c, m, n); + + Eigen::array, 1> dim_pair; + dim_pair[0].first = 1; + dim_pair[0].second = 0; + c_matrix.device(ctx->eigen_device()) = + a_matrix.cast() + .contract(b_matrix.cast(), dim_pair) + .template cast(); + } +}; + +// If we have a fast CBLAS library, use its implementation through a wrapper. +#if defined(USE_CBLAS_GEMM) +template <> +class FastGemmFunctor { + public: + void operator()(tensorflow::OpKernelContext* ctx, size_t m, size_t n, + size_t k, const float* a, size_t lda, const float* b, + size_t ldb, float* c, size_t ldc) { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a, + lda, b, ldb, 0.0f, c, ldc); + } +}; +#endif // USE_CBLAS_GEMM + +#endif // TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array.h new file mode 100644 index 00000000..be0bd0e8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array.h @@ -0,0 +1,125 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_H_ +#define TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_H_ + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) + +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_reference.h" +#include "tensorflow/core/kernels/gpu_device_array_gpu.h" + +namespace tensorflow { + +// Create an array of value on the host, to be sent to kernel using +// GpuDeviceArrayStruct. +// +// Usage: +// int size = ...; +// GpuDeviceArrayOnHost ptrs(context, size); +// OP_REQUIRES_OK(ptrs.Init()); +// for (int i = 0; i < size; ++i) { +// ptrs.Set(i, ...); +// } +// OP_REQUIRES_OK(ptrs.Finalize()); +// launchKernel(..., ptrs.data, ...); +// +// ValueType must be memcopyable. +template +class GpuDeviceArrayOnHost { + public: + GpuDeviceArrayOnHost(OpKernelContext* context, int32_t size) + : context_(context), + total_bytes_(static_cast(size) * sizeof(ValueType)) { + data_.size = size; + } + + Status Init() { + if (inlined()) { + values_ = data_.inline_values; + return OkStatus(); + } + + // Out-of-line: allocate data that will be memcopied. 
+ AllocatorAttributes attr; + attr.set_on_host(true); + attr.set_gpu_compatible(true); + TF_RETURN_IF_ERROR( + context_->allocate_temp(DT_INT8, TensorShape{total_bytes_}, + &out_of_line_values_on_host_, attr)); + values_ = reinterpret_cast( + out_of_line_values_on_host_.flat().data()); + return OkStatus(); + } + + void Set(int index, ValueType val) { + DCHECK(values_); // ensure Init was called. + DCHECK_LT(index, data_.size); + *(values_ + index) = val; + } + + Status Finalize() { + if (inlined()) { + return OkStatus(); + } + + // Out-of-line - copy pointers to device. + auto stream = context_->op_device_context()->stream(); + TensorReference tensor_ref(out_of_line_values_on_host_); + TF_RETURN_IF_ERROR(context_->allocate_temp( + DT_INT8, TensorShape{total_bytes_}, &out_of_line_values_on_gpu_)); + se::DeviceMemoryBase output_values_base{ + out_of_line_values_on_gpu_.flat().data(), + static_cast(total_bytes_)}; + TF_RETURN_IF_ERROR(stream->Memcpy( + &output_values_base, out_of_line_values_on_host_.flat().data(), + total_bytes_)); + context_->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, + [tensor_ref]() { tensor_ref.Unref(); }); + data_.out_of_line_values = reinterpret_cast( + out_of_line_values_on_gpu_.flat().data()); + return OkStatus(); + } + + const GpuDeviceArrayStruct& data() const { + // Ensure Finalize is called. + DCHECK(inlined() || out_of_line_values_on_gpu_.IsInitialized()); + return data_; + } + + private: + bool inlined() const { return data_.size <= MaxInlineValues; } + + OpKernelContext* const context_; + const int64_t total_bytes_; // total size of all pointers. + ValueType* values_ = nullptr; + GpuDeviceArrayStruct data_; + + Tensor out_of_line_values_on_host_; + Tensor out_of_line_values_on_gpu_; + + GpuDeviceArrayOnHost(const GpuDeviceArrayOnHost&) = delete; + void operator=(const GpuDeviceArrayOnHost&) = delete; +}; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array_gpu.h new file mode 100644 index 00000000..15a09e3d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_device_array_gpu.h @@ -0,0 +1,50 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains structs and functions to be included in device code. + +#ifndef TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_GPU_H_ + +#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ + (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) + +namespace tensorflow { + +// To decode on the device side, use GetGpuDeviceArrayOnDevice. +// To encode on the host side, use GpuDeviceArrayOnHost. 
+template <typename ValueType, int MaxInlineValues = 8> +struct GpuDeviceArrayStruct { + int32 size; + // used if size <= MaxInlineValues; + ValueType inline_values[MaxInlineValues]; + ValueType* out_of_line_values = nullptr; // used if size > MaxInlineValues; +}; + +template <typename ValueType, int MaxInlineValues> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetGpuDeviceArrayOnDevice( + GpuDeviceArrayStruct<ValueType, MaxInlineValues>* data) { + if (data->size <= MaxInlineValues) { + return data->inline_values; + } else { + return data->out_of_line_values; + } +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GPU_DEVICE_ARRAY_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim.h new file mode 100644 index 00000000..bef22b50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim.h @@ -0,0 +1,117 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_ +#define TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_ + +#include "tensorflow/core/platform/bfloat16.h" + +#if GOOGLE_CUDA +#include "cub/block/block_load.cuh" +#include "cub/block/block_scan.cuh" +#include "cub/block/block_store.cuh" +#include "cub/device/device_histogram.cuh" +#include "cub/device/device_radix_sort.cuh" +#include "cub/device/device_reduce.cuh" +#include "cub/device/device_scan.cuh" +#include "cub/device/device_segmented_radix_sort.cuh" +#include "cub/device/device_segmented_reduce.cuh" +#include "cub/device/device_select.cuh" +#include "cub/iterator/counting_input_iterator.cuh" +#include "cub/iterator/transform_input_iterator.cuh" +#include "cub/thread/thread_operators.cuh" +#include "cub/warp/warp_reduce.cuh" +#include "third_party/gpus/cuda/include/cusparse.h" + +namespace gpuprim = ::cub; + +// Required for sorting Eigen::half and bfloat16. 
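The specializations that follow only need to treat Eigen::half and Eigen::bfloat16 as raw 16-bit storage: half is 1 sign, 5 exponent and 10 mantissa bits (masks 0x8000, 0x7C00, 0x03FF), while bfloat16 is 1 sign, 8 exponent and 7 mantissa bits (0x8000, 0x7F80, 0x007F), i.e. the upper half of a float32. A small standalone illustration of the bfloat16 layout, using a truncating conversion for brevity:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float f = 3.14159f;
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));                     // type-pun safely
  const uint16_t bf16 = static_cast<uint16_t>(bits >> 16);  // keep top 16 bits
  std::printf("sign=%u exponent=0x%02x mantissa=0x%02x\n",
              (bf16 & 0x8000u) >> 15, (bf16 & 0x7F80u) >> 7, bf16 & 0x007Fu);
  return 0;
}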
+namespace cub { +template <> +__device__ __forceinline__ void ThreadStoreVolatilePtr( + Eigen::half *ptr, Eigen::half val, Int2Type /*is_primitive*/) { + *reinterpret_cast(ptr) = + Eigen::numext::bit_cast(val); +} + +template <> +__device__ __forceinline__ Eigen::half ThreadLoadVolatilePointer( + Eigen::half *ptr, Int2Type /*is_primitive*/) { + uint16_t result = *reinterpret_cast(ptr); + return Eigen::numext::bit_cast(result); +} + +template <> +__device__ __forceinline__ void ThreadStoreVolatilePtr( + Eigen::bfloat16 *ptr, Eigen::bfloat16 val, + Int2Type /*is_primitive*/) { + *reinterpret_cast(ptr) = + Eigen::numext::bit_cast(val); +} + +template <> +__device__ __forceinline__ Eigen::bfloat16 +ThreadLoadVolatilePointer(Eigen::bfloat16 *ptr, + Int2Type /*is_primitive*/) { + uint16_t result = *reinterpret_cast(ptr); + return Eigen::numext::bit_cast(result); +} + +template <> +struct NumericTraits + : BaseTraits {}; +template <> +struct NumericTraits + : BaseTraits {}; +} // namespace cub +#elif TENSORFLOW_USE_ROCM +#include "rocm/include/hipcub/hipcub.hpp" +#include "rocm/rocm_config.h" +namespace gpuprim = ::hipcub; + +// Required for sorting Eigen::half and bfloat16. +namespace rocprim { +namespace detail { +#if (TF_ROCM_VERSION >= 50200) +template <> +struct float_bit_mask { + static constexpr uint16_t sign_bit = 0x8000; + static constexpr uint16_t exponent = 0x7C00; + static constexpr uint16_t mantissa = 0x03FF; + using bit_type = uint16_t; +}; + +template <> +struct float_bit_mask { + static constexpr uint16_t sign_bit = 0x8000; + static constexpr uint16_t exponent = 0x7F80; + static constexpr uint16_t mantissa = 0x007F; + using bit_type = uint16_t; +}; +#endif +template <> +struct radix_key_codec_base + : radix_key_codec_floating {}; +template <> +struct radix_key_codec_base + : radix_key_codec_floating {}; +}; // namespace detail +}; // namespace rocprim + +#endif // TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim_helpers.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim_helpers.h new file mode 100644 index 00000000..52599890 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_prim_helpers.h @@ -0,0 +1,286 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_GPU_PRIM_HELPERS_H_ +#define TENSORFLOW_CORE_KERNELS_GPU_PRIM_HELPERS_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "xla/stream_executor/stream.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +namespace detail { + +template +__global__ void RangeInitKernel(const T start, const T delta, const T size, + T* out) { + GPU_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } +} + +// Initialize out with range start, start + delta, start + 2 * delta, ... +template +Status RangeInit(const Eigen::GpuDevice& d, const T start, const T delta, + const T size, T* out) { + if (size == 0) return OkStatus(); + GpuLaunchConfig config = GetGpuLaunchConfig(size, d); + return GpuLaunchKernel(RangeInitKernel, config.block_count, + config.thread_per_block, 0, d.stream(), start, delta, + size, out); +} + +// Computes keys_out = sorted(keys_in), and indices_out = argsort(keys_in). +// If keys_out is not required, it can be set to nullptr. +// If indices_in is nullptr, the range of input indices [0, size) will be used. +template +Status GpuRadixSortImpl(OpKernelContext* context, int size, const Tkey* keys_in, + Tkey* keys_out, // Optional + const Tindex* indices_in, // Optional + Tindex* indices_out, int num_bits = sizeof(Tkey) * 8) { + if (size == 0) return OkStatus(); + if (num_bits == 0) { + // Workaround for CUB failing when begin_bit = end_bit = 0 (e.g., when all + // keys are 0, so no sorting is needed). + se::Stream* stream = context->op_device_context()->stream(); + if (keys_out) { + // Copy keys_in to keys_out. + size_t num_bytes = size * sizeof(Tkey); + se::DeviceMemoryBase src(const_cast(keys_in), num_bytes); + se::DeviceMemoryBase dst(keys_out, num_bytes); + TF_RETURN_IF_ERROR(stream->Memcpy(&dst, src, num_bytes)); + } + if (indices_in) { + // Copy indices_in to indices_out. + size_t num_bytes = size * sizeof(Tindex); + se::DeviceMemoryBase src(const_cast(indices_in), num_bytes); + se::DeviceMemoryBase dst(indices_out, num_bytes); + TF_RETURN_IF_ERROR(stream->Memcpy(&dst, src, num_bytes)); + } else { + // Set output indices to range. + const Eigen::GpuDevice& device = + context->eigen_device(); + TF_RETURN_IF_ERROR(detail::RangeInit(device, Tindex(0), Tindex(1), + Tindex(size), indices_out)); + } + return OkStatus(); + } + // Allocate temporary inputs/outputs if necessary. + Tensor tmp_indices_in; + if (!indices_in) { + TF_RETURN_IF_ERROR(context->allocate_temp( + DataTypeToEnum::value, TensorShape({size}), &tmp_indices_in)); + Tindex* mutable_indices_in = tmp_indices_in.flat().data(); + indices_in = mutable_indices_in; + const Eigen::GpuDevice& device = context->eigen_device(); + // Initialize indices_in to the input index range. + TF_RETURN_IF_ERROR(detail::RangeInit(device, Tindex(0), Tindex(1), + Tindex(size), mutable_indices_in)); + } + Tensor tmp_keys_out; + if (!keys_out) { + TF_RETURN_IF_ERROR(context->allocate_temp( + DataTypeToEnum::value, TensorShape({size}), &tmp_keys_out)); + keys_out = tmp_keys_out.flat().data(); + } + // Determine temporary device storage requirements. 
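The calls that follow use the usual CUB two-pass workspace idiom: call the algorithm once with a null temporary-storage pointer so it only reports the byte count it needs, allocate that buffer, then call it again with the same arguments to do the real work. A minimal standalone sketch with cub::DeviceRadixSort::SortPairs; the raw cudaMalloc and buffer names are illustrative, whereas the code above allocates the workspace as a DT_INT8 Tensor:

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Sorts n (key, value) pairs that already live in device memory.
cudaError_t SortPairsExample(const float* d_keys_in, float* d_keys_out,
                             const int* d_vals_in, int* d_vals_out, int n) {
  size_t temp_bytes = 0;
  // Pass 1: null workspace pointer, so CUB only computes temp_bytes.
  cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, d_keys_in, d_keys_out,
                                  d_vals_in, d_vals_out, n);
  void* d_temp = nullptr;
  cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  // Pass 2: same arguments with a real workspace; this launches the sort.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out,
                                  d_vals_in, d_vals_out, n);
  return cudaFree(d_temp);
}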
+ Tensor temp_storage; + size_t temp_storage_bytes = 0; + const auto& cu_stream = GetGpuStream(context); + gpuError_t err; + if constexpr (Descending) { + err = gpuprim::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, keys_in, keys_out, indices_in, indices_out, + size, /*begin_bit=*/0, /*end_bit=*/num_bits, cu_stream); + } else { + err = gpuprim::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_in, keys_out, indices_in, indices_out, + size, /*begin_bit=*/0, /*end_bit=*/num_bits, cu_stream); + } + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceRadixSort::SortPairs to calculate " + "temp_storage_bytes, status: ", + cudaGetErrorString(err)); + } + // Allocate temporary storage. + TF_RETURN_IF_ERROR(context->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + // Sort indices by keys. + if constexpr (Descending) { + err = gpuprim::DeviceRadixSort::SortPairsDescending( + temp_storage.flat().data(), temp_storage_bytes, keys_in, keys_out, + indices_in, indices_out, size, /*begin_bit=*/0, /*end_bit=*/num_bits, + cu_stream); + } else { + err = gpuprim::DeviceRadixSort::SortPairs( + temp_storage.flat().data(), temp_storage_bytes, keys_in, keys_out, + indices_in, indices_out, size, /*begin_bit=*/0, /*end_bit=*/num_bits, + cu_stream); + } + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceRadixSort::SortPairs, " + "temp_storage_bytes: ", + temp_storage_bytes, "status: ", cudaGetErrorString(err)); + } + return OkStatus(); +} + +} // namespace detail + +template +Status GpuRadixSort(OpKernelContext* context, int size, const Tkey* keys_in, + Tkey* keys_out, // Optional + const Tindex* indices_in, // Optional + Tindex* indices_out, int num_bits = sizeof(Tkey) * 8) { + return detail::GpuRadixSortImpl( + context, size, keys_in, keys_out, indices_in, indices_out, num_bits); +} + +template +Status GpuRadixSortDescending(OpKernelContext* context, int size, + const Tkey* keys_in, + Tkey* keys_out, // Optional + const Tindex* indices_in, // Optional + Tindex* indices_out, + int num_bits = sizeof(Tkey) * 8) { + return detail::GpuRadixSortImpl( + context, size, keys_in, keys_out, indices_in, indices_out, num_bits); +} + +template +Status GpuInclusivePrefixSum(OpKernelContext* context, int size, + InputIteratorT input, OutputIteratorT output) { + static_assert( + !std::is_same::type, + bool>::value, + "GpuInclusivePrefixSum does not work correct with booleans, please use " + "TransformInputIterator to explicitly cast to an integer."); + if (size == 0) return OkStatus(); + const auto& cu_stream = GetGpuStream(context); + size_t temp_storage_bytes; + auto err = gpuprim::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, + input, output, size, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceScan::InclusiveSum to calculate " + "temp_storage_bytes, status: ", + cudaGetErrorString(err)); + } + Tensor temp_storage; + TF_RETURN_IF_ERROR(context->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + err = gpuprim::DeviceScan::InclusiveSum(temp_storage.flat().data(), + temp_storage_bytes, input, output, + size, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceScan::InclusiveSum, " + "temp_storage_bytes: ", + temp_storage_bytes, ", status: ", cudaGetErrorString(err)); + } + return OkStatus(); +} + +// Note that this behaves deterministically for repeat calls on 
the same device. +template +Status GpuSegmentedReduce( + OpKernelContext* context, int num_segments, ReduceOp reduce_op, + const T& initial_value, + InputIteratorT input, // [any] + OffsetIteratorT segment_offsets, // [num_segments + 1] + OutputIteratorT output) { // [num_segments] + if (num_segments == 0) return OkStatus(); + const auto& cu_stream = GetGpuStream(context); + size_t temp_storage_bytes; + auto err = gpuprim::DeviceSegmentedReduce::Reduce( + nullptr, temp_storage_bytes, input, output, num_segments, segment_offsets, + segment_offsets + 1, reduce_op, initial_value, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceSegmentedReduce::Reduce to calculate " + "temp_storage_bytes, status: ", + cudaGetErrorString(err)); + } + Tensor temp_storage; + TF_RETURN_IF_ERROR(context->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + err = gpuprim::DeviceSegmentedReduce::Reduce( + temp_storage.flat().data(), temp_storage_bytes, input, output, + num_segments, segment_offsets, segment_offsets + 1, reduce_op, + initial_value, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceSegmentedReduce::Reduce" + ", temp_storage_bytes: ", + temp_storage_bytes, ", status: ", cudaGetErrorString(err)); + } + return OkStatus(); +} + +template +Status GpuSelectFlagged(OpKernelContext* context, int size, + InputIteratorT input, FlagIteratorT flags, + OutputIteratorT output, + NumSelectedT* out_num_selected = nullptr) { + const auto& cu_stream = GetGpuStream(context); + Tensor out_num_selected_t; + if (!out_num_selected) { + TF_RETURN_IF_ERROR( + context->allocate_temp(DataTypeToEnum::value, + TensorShape({}), &out_num_selected_t)); + out_num_selected = out_num_selected_t.scalar().data(); + } + size_t temp_storage_bytes; + auto err = + gpuprim::DeviceSelect::Flagged(nullptr, temp_storage_bytes, input, flags, + output, out_num_selected, size, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceSelect::Flagged to calculate " + "temp_storage_bytes, status: ", + cudaGetErrorString(err)); + } + Tensor temp_storage; + TF_RETURN_IF_ERROR(context->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + err = gpuprim::DeviceSelect::Flagged(temp_storage.flat().data(), + temp_storage_bytes, input, flags, output, + out_num_selected, size, cu_stream); + if (err != 0) { + return errors::Internal( + "Failed to launch gpuprim::DeviceSelect::Flagged, temp_storage_bytes: ", + temp_storage_bytes, ", status: ", cudaGetErrorString(err)); + } + return OkStatus(); +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GPU_PRIM_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_utils.h new file mode 100644 index 00000000..8d511859 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/gpu_utils.h @@ -0,0 +1,448 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include + +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "xla/stream_executor/dnn.h" +#include "xla/stream_executor/lazy_op_runner.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace stream_executor { +class RedzoneAllocator; +} // namespace stream_executor + +namespace xla { +class AutotuneResult; +} // namespace xla + +namespace tensorflow { + +// Returns true if bfloat16 is directly supported in Ops and inputs shall not be +// casted to floats to perform the computations and then back. +bool IsBF16SupportedInOps(se::Stream* stream); + +class NodeDef; +using xla::AutotuneResult; + +template +se::DeviceMemory AsDeviceMemory(const T* gpu_memory) { + se::DeviceMemoryBase wrapped(const_cast(gpu_memory)); + se::DeviceMemory typed(wrapped); + return typed; +} + +// Return whether the redzone check is disabled. +// +// Controlled by the TF_DISABLE_RZ_CHECK environment variable. +bool RedzoneCheckDisabled(); + +// Return an allocated buffer with redzones the size of `buffer`. Does +// *not* copy the contents of the `buffer` into the newly allocated buffer: +// assumes that buffer is a pure out-parameter. +// +// Returns `buffer` if RedzoneCheckDisabled() is true. +// +// On error, return `buffer`, and log an error message (once). +se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator, + se::DeviceMemoryBase buffer); + +// Check the passed allocator for redzone violations. +// If violations have occurred, mark the corresponding autotune result +// as a failure. +void CheckRedzones(const se::RedzoneAllocator& rz_allocator, + AutotuneResult* autotune_result); + +template +inline se::DeviceMemory AsDeviceMemory(const T* cuda_memory, uint64 size) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory), size * sizeof(T)); + se::DeviceMemory typed(wrapped); + return typed; +} + +// Returns whether cuBLASLt is enabled. +// +// Controlled by the TF_USE_CUBLASLT environment variable. +bool EnableCublasLtGemm(); + +namespace internal { + +template +struct AutotuneMapHasher { + std::size_t operator()(const Parameters& parameter) const { + return parameter.hash(); + } +}; + +} // namespace internal + +// A helper class that looks up the best autotuned config from parameters. +// Due to the noisy nature of autotune, especially with multiple devices, it +// only accepts a config if its margin exceeds a threshold. +// For the same shape configs, if a new best config matches the previous best, +// they get promoted; otherwise, the winner gets demoted. This process stops +// when the winner's score exceeds the threshold. 
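A minimal sketch of that promote/demote bookkeeping in isolation; the string-keyed map, Entry type and threshold handling are illustrative stand-ins, and the real AutotuneMap below additionally tracks an observation count and a global iteration cap:

#include <string>
#include <unordered_map>

struct Entry {
  std::string config;
  int score;
};

// One observation of `config` winning autotuning for `key`.
void Observe(std::unordered_map<std::string, Entry>& best, int threshold,
             const std::string& key, const std::string& config) {
  auto it = best.find(key);
  if (it == best.end()) {
    best.emplace(key, Entry{config, 1});  // first winner starts with one point
  } else if (it->second.score >= threshold) {
    return;                               // winner already settled
  } else if (it->second.config == config) {
    ++it->second.score;                   // same winner again: promote
  } else if (--it->second.score <= 0) {
    best.erase(it);                       // a challenger demoted it to zero
  }
}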
+// In a bad case when two configs are very close to each other and flips +// back and forth randomly, the expected number of experiments before autotune +// settles is O(threshold ^ 2). So we recommend that number of warmup runs +// for any benchmarks. +template > +class AutotuneMap { + public: + bool Find(const Parameters& params, Config* config) const { + mutex_lock lock(mu_); + auto iter = params_config_map_.find(params); + if (iter == params_config_map_.end() || + (iter->second.score < min_score_threshold_ && + iter->second.count <= max_autotune_count_)) { + return false; + } + *config = iter->second.config; + return true; + } + void Insert(const Parameters& params, const Config& config) { + mutex_lock lock(mu_); + auto iter = params_config_map_.find(params); + int new_score = 0; + if (iter == params_config_map_.end()) { + // Create a new entry if params is new. + VLOG(1) << GetActionSummary("creates", params, config); + params_config_map_.insert( + std::make_pair(params, ValueType{config, 1, 1})); + new_score = 1; + } else if (iter->second.score < min_score_threshold_ && + iter->second.count <= max_autotune_count_) { + DCHECK_GT(iter->second.score, 0); + if (iter->second.config != config) { + // If it is different from the current winner, demotes the winner. + VLOG(1) << GetActionSummary("demotes", params, config); + new_score = --iter->second.score; + ++iter->second.count; + if (new_score <= 0) { + VLOG(1) << GetActionSummary("erases", params, config); + params_config_map_.erase(iter); + } + } else { + // If it is the same as the current winner, promotes the winner. + VLOG(1) << GetActionSummary("promotes", params, config); + new_score = ++iter->second.score; + ++iter->second.count; + } + } + if (new_score >= min_score_threshold_) { + VLOG(1) << GetActionSummary("accepts", params, config); + } else if (autotune_global_count_ >= max_autotune_global_count_) { + // The autotuning exceeds the max iteration threshold and we accept the + // the winner if it exists in the map, otherwise we accept the current + // winner. + auto winner = params_config_map_.find(params); + if (winner == params_config_map_.end()) { + VLOG(1) << GetActionSummary("creates", params, config); + for (int i = 0; i < min_score_threshold_; ++i) { + VLOG(1) << GetActionSummary("promotes", params, config); + } + params_config_map_.insert( + std::make_pair(params, ValueType{config, min_score_threshold_, 1})); + } else { + int promotes_times = min_score_threshold_ - winner->second.score; + for (int i = 0; i < promotes_times; ++i) { + VLOG(1) << GetActionSummary("promotes", params, config); + } + winner->second.score = min_score_threshold_; + } + VLOG(1) << GetActionSummary("accepts", params, config); + } + autotune_global_count_++; + } + + std::unordered_map GetMap() const { + mutex_lock lock(mu_); + std::unordered_map map; + for (const auto& entry : params_config_map_) { + map.insert(std::make_pair(entry.first, entry.second.config)); + } + return map; + } + + // Only for testing + void ClearMap() { + mutex_lock lock(mu_); + params_config_map_.clear(); + } + + private: + // Underlying data structure of values in the map. 
+ struct ValueType { + Config config; + int32 score; + int32 count; + }; + AutotuneMap(const std::string& name) : name_(name) { + min_score_threshold_ = 1; + int min_warmup_iterations = 10; + const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD"); + if (threshold_str != nullptr) { + VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str; + strings::safe_strto32(threshold_str, &min_score_threshold_); + } + const char* min_warmup_iteration_str = + getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS"); + if (min_warmup_iteration_str != nullptr) { + strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations); + } + min_score_threshold_ = std::max(min_score_threshold_, 1); + max_autotune_count_ = std::max( + 5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations); + max_autotune_global_count_ = 2 * max_autotune_count_; + autotune_global_count_ = 0; + } + + template + friend class AutotuneSingleton; + + std::string GetActionSummary(StringPiece action, const Parameters& params, + const Config& config) { + return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(), + string(action).c_str(), params.ToString().c_str(), + config.ToString().c_str()); + } + + mutable mutex mu_; + + std::unordered_map params_config_map_ + TF_GUARDED_BY(mu_); + std::string name_; + int32 min_score_threshold_; + int32 max_autotune_count_; + int32 max_autotune_global_count_; + int32 autotune_global_count_; + + AutotuneMap(const AutotuneMap&) = delete; + void operator=(const AutotuneMap&) = delete; +}; + +// A Singleton helper that manages the global autotune results by groups. +// The caller specified arbitrary Group type that can distinguish between +// different autotune results, even if their Parameters and Configs are the +// same. +template > +class AutotuneSingleton { + public: + typedef AutotuneMap AutotuneType; + static AutotuneType* GetInstance() { + static AutotuneType* instance = new AutotuneType(Group::name()); + return instance; + } +}; + +// Logs convolution results to customized back-storage. +void LogConvAutotuneResults(se::dnn::ConvolutionKind kind, + se::dnn::DataType element_type, + se::DeviceMemoryBase input_buffer, + se::DeviceMemoryBase filter_buffer, + se::DeviceMemoryBase output_buffer, + const se::dnn::BatchDescriptor& input_desc, + const se::dnn::FilterDescriptor& filter_desc, + const se::dnn::BatchDescriptor& output_desc, + const se::dnn::ConvolutionDescriptor& conv_desc, + se::StreamExecutor* stream_exec, + absl::Span results); + +// Logs fused convolution results to customized back-storage. +void LogFusedConvForwardAutotuneResults( + se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer, + se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer, + se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer, + const se::dnn::BatchDescriptor& input_desc, + const se::dnn::FilterDescriptor& filter_desc, + const se::dnn::BatchDescriptor& output_desc, + const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale, + double side_value_scale, se::dnn::ActivationMode activation_mode, + se::StreamExecutor* stream_exec, absl::Span results); + +// Logs fused matmul results to customized back-storage. 
+void LogFusedMatmulAutotuneResults( + se::dnn::DataType ab_dtype, se::dnn::DataType c_dtype, + se::DeviceMemoryBase a_buffer, se::DeviceMemoryBase b_buffer, + se::DeviceMemoryBase c_buffer, se::DeviceMemoryBase bias_buffer, + bool trans_a, bool trans_b, uint32_t m, uint32_t n, uint32_t k, int32_t lda, + int32_t ldb, int32_t ldc, se::dnn::ActivationMode activation_mode, + se::StreamExecutor* stream_exec, absl::Span results); + +// Autotuning map entry for cuDNN-frontend-capable APIs. +// +// The longer-term intent is to remove the AlgorithmConfig variant and make this +// contain only the two LazyOpRunners, but for the time being ROCm is stuck on +// the legacy API and requires an AlgorithmConfig. +template +class AutotuneEntry { + public: + AutotuneEntry() : is_algorithm_config_(true) {} + + // Initialize with legacy-API AlgorithmConfig; used for the ROCm backend only. + explicit AutotuneEntry(se::dnn::AlgorithmConfig config) + : is_algorithm_config_(true), algorithm_config_(std::move(config)) {} + + AutotuneEntry(std::shared_ptr> primary, + std::shared_ptr> no_scratch_fallback) + : is_algorithm_config_(false), + op_runners_{std::move(primary), std::move(no_scratch_fallback)} {} + + // Initialize from config data, without pre-cached runners, such as when + // loading AoT autotuning maps. + AutotuneEntry(se::dnn::AlgorithmDesc primary, + absl::optional no_scratch_fallback) + : AutotuneEntry(std::make_shared>(primary), + no_scratch_fallback + ? std::make_shared>( + *no_scratch_fallback) + : nullptr) {} + + // Initialize with pre-cached OpRunners, such as during autotuning. + static StatusOr FromOpRunners( + std::unique_ptr> primary, + std::unique_ptr> + no_cache_fallback) { + TF_ASSIGN_OR_RETURN( + auto primary_cache, + se::dnn::LazyOpRunner::FromOpRunner(std::move(primary))); + + if (no_cache_fallback) { + TF_ASSIGN_OR_RETURN(auto fallback_cache, + se::dnn::LazyOpRunner::FromOpRunner( + std::move(no_cache_fallback))); + return AutotuneEntry(std::move(primary_cache), std::move(fallback_cache)); + + } else { + return AutotuneEntry(std::move(primary_cache), nullptr); + } + } + + struct OpRunners { + OpRunners() = default; + + OpRunners(std::shared_ptr> primary_, + std::shared_ptr> no_scratch_fallback_) + : primary(std::move(primary_)), + no_scratch_fallback(std::move(no_scratch_fallback_)) {} + + // Null iff this 'OpRunners' is default-constructed as part of the + // fake-variant in AutotuneEntry; users outside gpu_utils.h itself should + // never see primary = nullptr. + std::shared_ptr> primary; + std::shared_ptr> no_scratch_fallback; // Nullable + + bool operator==(const OpRunners& other) const { + return *primary == *other.primary && + ((!no_scratch_fallback && !other.no_scratch_fallback) || + (no_scratch_fallback && other.no_scratch_fallback && + *no_scratch_fallback == *other.no_scratch_fallback)); + } + }; + + bool is_algorithm_config() const { return is_algorithm_config_; } + + const se::dnn::AlgorithmConfig& GetAlgorithmConfig() const { + DCHECK(is_algorithm_config_); + return algorithm_config_; + } + + const OpRunners& GetOpRunners() const { + DCHECK(!is_algorithm_config_); + return op_runners_; + } + + // AutotuneMap needs to test equality to keep track of the number of times an + // algorithm has won autotuning; for this purpose, we can use ToString to + // determine whether runners are equal. 
+ bool operator==(const AutotuneEntry& other) const { + if (is_algorithm_config_) { + return other.is_algorithm_config_ && + algorithm_config_ == other.algorithm_config_; + } + + return !other.is_algorithm_config_ && op_runners_ == other.op_runners_; + } + + bool operator!=(const AutotuneEntry& other) const { + return !(*this == other); + } + + std::string ToString() const { + if (is_algorithm_config_) { + return algorithm_config_.ToString(); + } + return absl::StrCat("{", op_runners_.primary->ToString(), ", ", + (op_runners_.no_scratch_fallback + ? op_runners_.no_scratch_fallback->ToString() + : "(op_runners have no fallback)"), + "}"); + } + + private: + // NVCC is broken, so we can't use absl::variant here. Just fake it with a + // bool and both fields. + bool is_algorithm_config_; + se::dnn::AlgorithmConfig algorithm_config_; + OpRunners op_runners_; +}; + +namespace internal { +StatusOr> BestCudnnConvAlgorithmIndices( + absl::Span results); +} // namespace internal + +// Returns the best algorithms for the config, one is the fastest, the other is +// other is fastest with 0 scratch space. Unsuccessful autotuning results are +// allowed and ignored. +StatusOr BestCudnnConvAlgorithm( + absl::Span results); + +// Explicitly-instantiated with ConvOp and FusedConvOp. +// +// The definition can't be in the header because including .pb.h files in +// headers is forbidden. +template +StatusOr> BestCudnnConvAlgorithm( + absl::Span results, + std::vector< + std::unique_ptr>> + runners); + +// Get the Dnn workspace limit from the environment variable, which is in MB. +// Return the workspace memory limit in bytes. If no value is set, return the +// default value. +int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb, + int64_t default_value_in_bytes); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/hinge-loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/hinge-loss.h new file mode 100644 index 00000000..51f11e04 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/hinge-loss.h @@ -0,0 +1,126 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_ + +#include +#include + +#include "tensorflow/core/kernels/loss.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class HingeLossUpdater : public DualLossUpdater { + public: + // Computes the updated dual variable (corresponding) to a single example. The + // updated dual value maximizes the objective function of the dual + // optimization problem associated with hinge loss (conditioned on keeping the + // rest of the dual variables intact). 
The method below finds an optimal delta + // (difference between updated and previous dual value) using the update rule + // within SDCA procedure (see http://arxiv.org/pdf/1209.1873v2.pdf, page 5) + // and the particular form of conjugate function for hinge loss. + // + // The CoCoA+ modification is detailed in readme.md. + // + // TODO(sibyl-vie3Poto): Write up a doc with concrete derivation and point to it from + // here. + double ComputeUpdatedDual(const int num_loss_partitions, const double label, + const double example_weight, + const double current_dual, const double wx, + const double weighted_example_norm) const final { + // Intuitively there are 3 cases: + // a. new optimal value of the dual variable falls within the admissible + // range [0, 1]. In this case we set new dual to this value. + // b. new optimal value is < 0. Then, because of convexity, the optimal + // valid value for new dual = 0 + // c. new optimal value > 1.0. Then new optimal value should be set to 1.0. + const double candidate_optimal_dual = + current_dual + (label - wx) / (num_loss_partitions * example_weight * + weighted_example_norm); + if (label * candidate_optimal_dual < 0) { + return 0.0; + } + if (label * candidate_optimal_dual > 1.0) { + return label; + } + return candidate_optimal_dual; + } + + // Conjugate of hinge loss. This is computed as: + // \phi*(z) = z if z \in [-1, 0] and +infinity everywhere else. See for + // instance http://www.eecs.berkeley.edu/~wainwrig/stat241b/lec10.pdf + // Here we want the weighted version of the conjugate loss. It turns out, that + // if w is the weight of an example, the conjugate of the weighted hinge loss + // is given by: + // \phi*(z) = z if z \in [-w, 0] and +infinity everywhere else. Here the + // conjugate function depends not only on the weight of the example but also + // on its label. In particular: + // \phi_y*(z) = y*z if y*z \in [-w, 0] and +infinity everywhere else where + // y \in {-1,1}. The following method implements \phi_y*(-\alpha/w). + double ComputeDualLoss(const double current_dual, const double example_label, + const double example_weight) const final { + // For binary classification, there are 2 conjugate functions, one per + // label value (-1 and 1). + const double y_alpha = current_dual * example_label; // y \alpha + if (y_alpha < 0 || y_alpha > 1.0) { + return std::numeric_limits::max(); + } + return -y_alpha * example_weight; + } + + // Hinge loss for binary classification for a single example. Hinge loss + // equals max(0, 1 - y * wx) (see https://en.wikipedia.org/wiki/Hinge_loss). + // For weighted instances loss should be multiplied by the instance weight. + double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const final { + const double y_wx = example_label * wx; + return std::max(0.0, 1 - y_wx) * example_weight; + } + + double PrimalLossDerivative(const double wx, const double label, + const double example_weight) const final { + if (label * wx < 1) { + return -label * example_weight; + } + return 0; + } + + // The smoothness constant is 0 since the derivative of the loss is not + // Lipschitz + double SmoothnessConstant() const final { return 0; } + + // Converts binary example labels from 0.0 or 1.0 to -1.0 or 1.0 respectively + // as expected by hinge loss. 
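As a worked check of ComputeUpdatedDual and ComputePrimalLoss above, take a single example with label y = 1, prediction wx = 0.2, example weight 1, one loss partition and weighted example norm 2: the primal hinge loss is max(0, 1 - 0.2) = 0.8 and the unclipped dual update is 0 + (1 - 0.2) / (1 * 1 * 2) = 0.4, which already lies in the admissible range. The same arithmetic as a standalone snippet:

#include <algorithm>
#include <cstdio>

int main() {
  const double y = 1.0, wx = 0.2, w = 1.0, norm = 2.0, current_dual = 0.0;
  const int num_partitions = 1;
  // Primal hinge loss: max(0, 1 - y * wx) * w.
  const double primal = std::max(0.0, 1.0 - y * wx) * w;
  // Unclipped dual update, then clipped to the admissible range.
  double dual = current_dual + (y - wx) / (num_partitions * w * norm);
  if (y * dual < 0.0) dual = 0.0;
  if (y * dual > 1.0) dual = y;
  std::printf("primal=%.3f dual=%.3f\n", primal, dual);  // primal=0.800 dual=0.400
  return 0;
}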
+ absl::Status ConvertLabel(float* const example_label) const final { + if (*example_label == 0.0) { + *example_label = -1; + return absl::OkStatus(); + } + if (*example_label == 1.0) { + return absl::OkStatus(); + } + return errors::InvalidArgument( + "Only labels of 0.0 or 1.0 are supported right now. " + "Found example with label: ", + *example_label); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/histogram_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/histogram_op.h new file mode 100644 index 00000000..cc6ea006 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/histogram_op.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_ +#define TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace functor { + +template +struct HistogramFixedWidthFunctor { + static absl::Status Compute( + OpKernelContext* context, + const typename TTypes::ConstTensor& values, + const typename TTypes::ConstTensor& value_range, int32_t nbins, + typename TTypes::Tensor& out); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/host_constant_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/host_constant_op.h new file mode 100644 index 00000000..9ba151ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/host_constant_op.h @@ -0,0 +1,44 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// HostConstantOp differs from ConstantOp in that its output is always +// in host memory. 
+class _HostConstantOp : public OpKernel { + public: + explicit _HostConstantOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override { return false; } + const Tensor* const_tensor() const override { return &tensor_; }; + ~_HostConstantOp() override {} + + private: + Tensor tensor_; + _HostConstantOp(const _HostConstantOp&) = delete; + void operator=(const _HostConstantOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/identity_n_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/identity_n_op.h new file mode 100644 index 00000000..7273731f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/identity_n_op.h @@ -0,0 +1,51 @@ +/* Copyright 2015-2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_ + +#include "tensorflow/core/framework/metrics.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class IdentityNOp : public OpKernel { + public: + explicit IdentityNOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OpInputList input; + OpOutputList output; + OP_REQUIRES_OK(context, context->input_list("input", &input)); + OP_REQUIRES_OK(context, context->output_list("output", &output)); + OP_REQUIRES(context, input.size() == output.size(), + errors::InvalidArgument("Input and output counts must match")); + if (absl::StrContains(name(), kTpuExecuteStagingNodeName)) { + // TPU staging node execution is used for measuring launch latency. + metrics::UpdateTpuVariableDistributionTime(EnvTime::NowMicros() - + context->start_time_usecs()); + } + for (int i = 0; i < input.size(); ++i) { + output.set(i, input[i]); + } + } + + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/identity_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/identity_op.h new file mode 100644 index 00000000..6b74868a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/identity_op.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class IdentityOp : public OpKernel { + public: + explicit IdentityOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } + } + + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_contrast_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_contrast_op.h new file mode 100644 index 00000000..9981275c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_contrast_op.h @@ -0,0 +1,127 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_CONTRAST_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_CONTRAST_OP_H_ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by AdjustContrastOp to do the computations. 
+template +struct AdjustContrast { + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstScalar contrast_factor, + typename TTypes::ConstScalar min_value, + typename TTypes::ConstScalar max_value, + typename TTypes::Tensor mean_values, + typename TTypes::Tensor output) { + const int batch = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + Eigen::array scalar_broadcast; + scalar_broadcast[0] = batch; + scalar_broadcast[1] = height; + scalar_broadcast[2] = width; + scalar_broadcast[3] = channels; + + Eigen::IndexList, Eigen::type2index<2> > + reduction_axis; + Eigen::IndexList, int, int, Eigen::type2index<1> > + broadcast_dims; + broadcast_dims.set(1, height); + broadcast_dims.set(2, width); + Eigen::IndexList, Eigen::type2index<1>, int> + reshape_dims; + reshape_dims.set(0, batch); + reshape_dims.set(3, channels); + + Eigen::Sizes<1, 1, 1, 1> scalar; + float num_reduced_coeffs = height * width; + mean_values.device(d) = + (input.template cast().sum(reduction_axis).eval() / + num_reduced_coeffs) + .reshape(reshape_dims) + .broadcast(broadcast_dims); + + auto contrast_factor_tensor = + contrast_factor.reshape(scalar).broadcast(scalar_broadcast); + auto adjusted = + (input.template cast() - mean_values) * contrast_factor_tensor + + mean_values; + auto min_bcast = min_value.reshape(scalar).broadcast(scalar_broadcast); + auto max_bcast = max_value.reshape(scalar).broadcast(scalar_broadcast); + // TODO(wicke): This is rather slow and should be re-written as pure cuda. + output.device(d) = adjusted.cwiseMin(max_bcast).cwiseMax(min_bcast); + } +}; + +// Functor used by AdjustContrastOpv2 to do the computations. +template +struct AdjustContrastv2 { + void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::ConstScalar contrast_factor, + typename TTypes::Tensor output) { + const int batch = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + Eigen::array scalar_broadcast; + scalar_broadcast[0] = batch; + scalar_broadcast[1] = height; + scalar_broadcast[2] = width; + scalar_broadcast[3] = channels; + + Eigen::IndexList, Eigen::type2index<1> > + reduction_axis; + Eigen::IndexList, int, int, Eigen::type2index<1> > + broadcast_dims; + broadcast_dims.set(1, height); + broadcast_dims.set(2, width); + Eigen::IndexList, Eigen::type2index<1>, int> + reshape_dims; + reshape_dims.set(0, batch); + reshape_dims.set(3, channels); + Eigen::IndexList, Eigen::type2index<2>, + Eigen::type2index<0>, Eigen::type2index<3> > + reduced_dims_first; + + Eigen::Sizes<1, 1, 1, 1> scalar; + float num_reduced_coeffs = height * width; + output.device(d) = (input.template cast() + .shuffle(reduced_dims_first) + .sum(reduction_axis) + .eval() / + num_reduced_coeffs) + .template cast() + .reshape(reshape_dims) + .broadcast(broadcast_dims); + auto contrast_factor_tensor = + contrast_factor.reshape(scalar).broadcast(scalar_broadcast); + auto adjusted = + (input - output).template cast() * contrast_factor_tensor; + output.device(d) += adjusted.template cast(); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_CONTRAST_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hsv_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hsv_gpu.cu.h new file mode 100644 index 
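Both contrast functors above interpolate each pixel toward the per-channel spatial mean, output = (input - mean) * contrast_factor + mean (the v1 functor additionally clamps to [min, max]). A scalar sketch of that expression for a single channel, purely illustrative:

#include <iostream>
#include <vector>

// Illustrative per-channel contrast adjustment for one channel of one image,
// mirroring the expression above: (x - mean) * factor + mean.
std::vector<float> AdjustContrastChannel(const std::vector<float>& pixels,
                                         float contrast_factor) {
  float mean = 0.f;
  for (float p : pixels) mean += p;
  mean /= pixels.size();
  std::vector<float> out(pixels.size());
  for (size_t i = 0; i < pixels.size(); ++i) {
    out[i] = (pixels[i] - mean) * contrast_factor + mean;
  }
  return out;
}

int main() {
  // Mean is 0.5; factor 2 doubles each pixel's distance from the mean.
  for (float v : AdjustContrastChannel({0.25f, 0.75f}, 2.0f)) {
    std::cout << v << " ";  // 0 1
  }
  std::cout << "\n";
}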
00000000..417ea652 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hsv_gpu.cu.h @@ -0,0 +1,145 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HSV_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HSV_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace internal { + +typedef struct RgbTuple { + float r; + float g; + float b; +} RgbTuple; + +typedef struct HsvTuple { + float h; + float s; + float v; +} HsvTuple; + +inline __device__ HsvTuple rgb2hsv_cuda(const float r, const float g, + const float b) { + HsvTuple tuple; + const float M = fmaxf(r, fmaxf(g, b)); + const float m = fminf(r, fminf(g, b)); + const float chroma = M - m; + float h = 0.0f, s = 0.0f; + // hue + if (chroma > 0.0f) { + if (M == r) { + const float num = (g - b) / chroma; + const float sign = copysignf(1.0f, num); + h = ((sign < 0.0f) * 6.0f + sign * fmodf(sign * num, 6.0f)) / 6.0f; + } else if (M == g) { + h = ((b - r) / chroma + 2.0f) / 6.0f; + } else { + h = ((r - g) / chroma + 4.0f) / 6.0f; + } + } else { + h = 0.0f; + } + // saturation + if (M > 0.0) { + s = chroma / M; + } else { + s = 0.0f; + } + tuple.h = h; + tuple.s = s; + tuple.v = M; + return tuple; +} + +inline __device__ RgbTuple hsv2rgb_cuda(const float h, const float s, + const float v) { + RgbTuple tuple; + const float new_h = h * 6.0f; + const float chroma = v * s; + const float x = chroma * (1.0f - fabsf(fmodf(new_h, 2.0f) - 1.0f)); + const float new_m = v - chroma; + const bool between_0_and_1 = new_h >= 0.0f && new_h < 1.0f; + const bool between_1_and_2 = new_h >= 1.0f && new_h < 2.0f; + const bool between_2_and_3 = new_h >= 2.0f && new_h < 3.0f; + const bool between_3_and_4 = new_h >= 3.0f && new_h < 4.0f; + const bool between_4_and_5 = new_h >= 4.0f && new_h < 5.0f; + const bool between_5_and_6 = new_h >= 5.0f && new_h < 6.0f; + tuple.r = chroma * (between_0_and_1 || between_5_and_6) + + x * (between_1_and_2 || between_4_and_5) + new_m; + tuple.g = chroma * (between_1_and_2 || between_2_and_3) + + x * (between_0_and_1 || between_3_and_4) + new_m; + tuple.b = chroma * (between_3_and_4 || between_4_and_5) + + x * (between_2_and_3 || between_5_and_6) + new_m; + return tuple; +} + +template +__global__ void adjust_hsv_nhwc( + const int64 number_elements, const T* const __restrict__ input, + T* const __restrict__ output, const float* const __restrict__ hue_delta, + const float* const __restrict__ saturation_scale, + const float* const __restrict__ value_scale) { + // multiply by 3 since we're dealing with contiguous RGB bytes for each pixel + // (NHWC) + for (int64 idx = (blockDim.x * blockIdx.x + threadIdx.x) * 3; + idx < number_elements; idx += blockDim.x * gridDim.x * 3) { + if 
(!AdjustHue && !AdjustSaturation && !AdjustV) { + output[idx] = input[idx]; + output[idx + 1] = input[idx + 1]; + output[idx + 2] = input[idx + 2]; + continue; + } + const HsvTuple hsv = rgb2hsv_cuda(static_cast(input[idx]), + static_cast(input[idx + 1]), + static_cast(input[idx + 2])); + float new_h = hsv.h; + float new_s = hsv.s; + float new_v = hsv.v; + // hue adjustment + if (AdjustHue) { + const float delta = *hue_delta; + new_h = fmodf(hsv.h + delta, 1.0f); + if (new_h < 0.0f) { + new_h = fmodf(1.0f + new_h, 1.0f); + } + } + // saturation adjustment + if (AdjustSaturation && saturation_scale != nullptr) { + const float scale = *saturation_scale; + new_s = fminf(1.0f, fmaxf(0.0f, hsv.s * scale)); + } + // value adjustment + if (AdjustV && value_scale != nullptr) { + const float scale = *value_scale; + new_v = hsv.v * scale; + } + const RgbTuple rgb = hsv2rgb_cuda(new_h, new_s, new_v); + output[idx] = static_cast(rgb.r); + output[idx + 1] = static_cast(rgb.g); + output[idx + 2] = static_cast(rgb.b); + } +} + +} // namespace internal +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HSV_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hue_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hue_op.h new file mode 100644 index 00000000..788b61bc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_hue_op.h @@ -0,0 +1,41 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HUE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HUE_OP_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +struct AdjustHueGPU { + void operator()(GPUDevice* device, const int64_t number_of_elements, + const T* const input, const float* const delta, + T* const output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_HUE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_saturation_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_saturation_op.h new file mode 100644 index 00000000..278161bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/adjust_saturation_op.h @@ -0,0 +1,41 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
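The device code above converts each pixel to HSV, shifts the hue, scales saturation and value, and converts back. A host-side sketch of just the per-component adjustments, assumed to mirror the kernel's per-pixel logic; the struct and function names are illustrative:

#include <algorithm>
#include <cmath>
#include <iostream>

struct Hsv { float h, s, v; };  // h in [0, 1), s in [0, 1], v >= 0

// Illustrative HSV-space adjustment: hue is shifted and wrapped into [0, 1),
// saturation is scaled and clamped to [0, 1], value is scaled.
Hsv AdjustHsv(Hsv in, float hue_delta, float sat_scale, float value_scale) {
  Hsv out = in;
  out.h = std::fmod(in.h + hue_delta, 1.0f);
  if (out.h < 0.0f) out.h = std::fmod(1.0f + out.h, 1.0f);
  out.s = std::min(1.0f, std::max(0.0f, in.s * sat_scale));
  out.v = in.v * value_scale;
  return out;
}

int main() {
  const Hsv adjusted = AdjustHsv({0.9f, 0.5f, 0.25f}, /*hue_delta=*/0.2f,
                                 /*sat_scale=*/3.0f, /*value_scale=*/2.0f);
  // Roughly 0.1 1 0.5: the hue wraps around, saturation saturates at 1.
  std::cout << adjusted.h << " " << adjusted.s << " " << adjusted.v << "\n";
}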
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_SATURATION_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_SATURATION_OP_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +struct AdjustSaturationGPU { + void operator()(GPUDevice* device, const int64_t number_of_elements, + const T* const input, const float* const scale, + T* const output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_ADJUST_SATURATION_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/colorspace_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/colorspace_op.h new file mode 100644 index 00000000..b71f058f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/colorspace_op.h @@ -0,0 +1,90 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_COLORSPACE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_COLORSPACE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +namespace functor { + +template +struct RGBToHSV { + void operator()(const Device &d, + typename TTypes::ConstTensor input_data, + typename TTypes::Tensor range, + typename TTypes::Tensor output_data) { + auto H = output_data.template chip<1>(0); + auto S = output_data.template chip<1>(1); + auto V = output_data.template chip<1>(2); + + auto R = input_data.template chip<1>(0); + auto G = input_data.template chip<1>(1); + auto B = input_data.template chip<1>(2); + + Eigen::IndexList > channel_axis; + + V.device(d) = input_data.maximum(channel_axis); + + range.device(d) = V - input_data.minimum(channel_axis); + + S.device(d) = (V > T(0)).select(range / V, V.constant(T(0))); + + auto norm = range.inverse() * (T(1) / T(6)); + // TODO(wicke): all these assignments are only necessary because a combined + // expression is larger than kernel parameter space. A custom kernel is + // probably in order. 
+ H.device(d) = (R == V).select( + norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6), + norm * (R - G) + T(4) / T(6))); + H.device(d) = (range > T(0)).select(H, H.constant(T(0))); + H.device(d) = (H < T(0)).select(H + T(1), H); + } +}; + +template +struct HSVToRGB { + void operator()(const Device &d, + typename TTypes::ConstTensor input_data, + typename TTypes::Tensor output_data) { + auto H = input_data.template chip<1>(0); + auto S = input_data.template chip<1>(1); + auto V = input_data.template chip<1>(2); + + // TODO(wicke): compute only the fractional part of H for robustness + auto dh = H * T(6); + auto dr = ((dh - T(3)).abs() - T(1)).cwiseMax(T(0)).cwiseMin(T(1)); + auto dg = (-(dh - T(2)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1)); + auto db = (-(dh - T(4)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1)); + auto one_s = -S + T(1); + + auto R = output_data.template chip<1>(0); + auto G = output_data.template chip<1>(1); + auto B = output_data.template chip<1>(2); + + R.device(d) = (one_s + S * dr) * V; + G.device(d) = (one_s + S * dg) * V; + B.device(d) = (one_s + S * db) * V; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_COLORSPACE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/crop_and_resize_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/crop_and_resize_op.h new file mode 100644 index 00000000..dd838ea5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/crop_and_resize_op.h @@ -0,0 +1,72 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_CROP_AND_RESIZE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_CROP_AND_RESIZE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct CropAndResize { + // We assume that the tensor sizes are correct. + bool operator()(const OpKernelContext* context, + typename TTypes::ConstTensor image, + typename TTypes::ConstTensor boxes, + typename TTypes::ConstTensor box_ind, + const std::string& method_name, float extrapolation_value, + typename TTypes::Tensor crops); +}; + +template +struct CropAndResizeBackpropImage { + // We assume that the tensor sizes are correct. + bool operator()(const OpKernelContext* context, + typename TTypes::ConstTensor grads, + typename TTypes::ConstTensor boxes, + typename TTypes::ConstTensor box_ind, + typename TTypes::Tensor grads_image, + const std::string& method_name); +}; + +template +struct CropAndResizeBackpropBoxes { + // We assume that the tensor sizes are correct. 
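HSVToRGB above builds the output from the clamped hue ramps dr, dg, db blended with saturation and value. A scalar sketch of the same formulas outside the Eigen expression machinery, for intuition only:

#include <algorithm>
#include <cmath>
#include <iostream>

struct Rgb { float r, g, b; };

// Scalar version of the HSVToRGB formulas above: dh = 6h and the three
// clamped ramps dr, dg, db pick out the red/green/blue contributions.
Rgb HsvToRgb(float h, float s, float v) {
  const float dh = h * 6.0f;
  auto ramp = [](float x) { return std::min(1.0f, std::max(0.0f, x)); };
  const float dr = ramp(std::abs(dh - 3.0f) - 1.0f);
  const float dg = ramp(-std::abs(dh - 2.0f) + 2.0f);
  const float db = ramp(-std::abs(dh - 4.0f) + 2.0f);
  const float one_s = 1.0f - s;
  return {(one_s + s * dr) * v, (one_s + s * dg) * v, (one_s + s * db) * v};
}

int main() {
  const Rgb red = HsvToRgb(0.0f, 1.0f, 1.0f);            // pure red
  const Rgb green = HsvToRgb(1.0f / 3.0f, 1.0f, 1.0f);   // pure green
  std::cout << red.r << " " << red.g << " " << red.b << "\n";        // 1 0 0
  std::cout << green.r << " " << green.g << " " << green.b << "\n";  // 0 1 0
}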
+ bool operator()(const Device& d, typename TTypes::ConstTensor grads, + typename TTypes::ConstTensor image, + typename TTypes::ConstTensor boxes, + typename TTypes::ConstTensor box_ind, + typename TTypes::Tensor grads_boxes); +}; + +template +struct CheckValidBoxIndexHelper { + // Checks if all values in box_index are in [0, batch). + void operator()(const Device& d, + typename TTypes::ConstTensor box_index, int batch, + typename TTypes::Tensor isvalid) { + isvalid.device(d) = ((box_index >= 0) && (box_index < batch)).all(); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_CROP_AND_RESIZE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_image_patches_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_image_patches_op.h new file mode 100644 index 00000000..3dc2f323 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_image_patches_op.h @@ -0,0 +1,51 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_IMAGE_PATCHES_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_IMAGE_PATCHES_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct ExtractImagePatchesForward { + void operator()(const Device& d, typename TTypes::ConstTensor input, + int patch_rows, int patch_cols, int stride_rows, + int stride_cols, int rate_rows, int rate_cols, + const Eigen::PaddingType& padding, + typename TTypes::Tensor output) { + // Need to swap row/col when calling Eigen, because our data is in + // NHWC format while Eigen assumes NWHC format. + MaybeWith32BitIndexing( + [&](auto input32, auto output32) { + output32.device(d) = + input32 + .extract_image_patches(patch_cols, patch_rows, stride_cols, + stride_rows, rate_cols, rate_rows, + padding) + .reshape(output32.dimensions()); + }, + input, output); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_IMAGE_PATCHES_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_volume_patches_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_volume_patches_op.h new file mode 100644 index 00000000..9e134818 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/extract_volume_patches_op.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_VOLUME_PATCHES_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_VOLUME_PATCHES_OP_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +namespace tensorflow { +namespace functor { + +template +struct ExtractVolumePatchesForward { + void operator()(const Device& d, typename TTypes::ConstTensor input, + int patch_planes, int patch_rows, int patch_cols, + int stride_planes, int stride_rows, int stride_cols, + /* int rate_planes, int rate_rows, int rate_cols, */ + const Eigen::PaddingType& padding, + typename TTypes::Tensor output) { + MaybeWith32BitIndexing( + [&](auto input32, auto output32) { + output32.device(d) = + input32 + .extract_volume_patches(patch_cols, patch_rows, patch_planes, + stride_cols, stride_rows, + stride_planes, padding) + .reshape(output32.dimensions()); + }, + input, output); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_EXTRACT_VOLUME_PATCHES_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/image_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/image_ops.h new file mode 100644 index 00000000..914cb528 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/image_ops.h @@ -0,0 +1,278 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_IMAGE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_IMAGE_OPS_H_ + +// See docs in ../ops/image_ops.cc. + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace generator { + +enum Interpolation { NEAREST, BILINEAR }; +enum Mode { FILL_REFLECT, FILL_WRAP, FILL_CONSTANT, FILL_NEAREST }; + +using Eigen::array; +using Eigen::DenseIndex; + +// Follow scipy's implementation +// https://github.com/scipy/scipy/blob/master/scipy/ndimage/src/ni_interpolation.c +template +struct MapCoordinate { + float operator()(const float out_coord, const DenseIndex len); +}; + +template +struct MapCoordinate { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float operator()(const float out_coord, + const DenseIndex len) { + // Reflect [abcd] to [dcba|abcd|dcba]. 
+ float in_coord = out_coord; + if (in_coord < 0) { + if (len <= 1) { + in_coord = 0; + } else { + const DenseIndex sz2 = 2 * len; + if (in_coord < sz2) { + in_coord = sz2 * static_cast(-in_coord / sz2) + in_coord; + } + in_coord = (in_coord < -len) ? in_coord + sz2 : -in_coord - 1; + } + } else if (in_coord > len - 1) { + if (len <= 1) { + in_coord = 0; + } else { + const DenseIndex sz2 = 2 * len; + in_coord -= sz2 * static_cast(in_coord / sz2); + if (in_coord >= len) { + in_coord = sz2 - in_coord - 1; + } + } + } + // clamp is necessary because when out_coord = 3.5 and len = 4, + // in_coord = 3.5 and will be rounded to 4 in nearest interpolation. + return Eigen::internal::scalar_clamp_op(0.0f, len - 1)(in_coord); + } +}; + +template +struct MapCoordinate { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float operator()(const float out_coord, + const DenseIndex len) { + // Wrap [abcd] to [abcd|abcd|abcd]. + float in_coord = out_coord; + if (in_coord < 0) { + if (len <= 1) { + in_coord = 0; + } else { + const DenseIndex sz = len - 1; + in_coord += len * (static_cast(-in_coord / sz) + 1); + } + } else if (in_coord > len - 1) { + if (len <= 1) { + in_coord = 0; + } else { + const DenseIndex sz = len - 1; + in_coord -= len * static_cast(in_coord / sz); + } + } + // clamp is necessary because when out_coord = -0.5 and len = 4, + // in_coord = 3.5 and will be rounded to 4 in nearest interpolation. + return Eigen::internal::scalar_clamp_op(0.0f, len - 1)(in_coord); + } +}; + +template +struct MapCoordinate { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float operator()(const float out_coord, + const DenseIndex len) { + return out_coord; + } +}; + +template +struct MapCoordinate { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float operator()(const float out_coord, + const DenseIndex len) { + return Eigen::internal::scalar_clamp_op(0.0f, len - 1)(out_coord); + } +}; + +template +class ProjectiveGenerator { + private: + typename TTypes::ConstTensor input_; + typename TTypes::ConstMatrix transforms_; + const Interpolation interpolation_; + const T fill_value_; + + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + ProjectiveGenerator(typename TTypes::ConstTensor input, + typename TTypes::ConstMatrix transforms, + const Interpolation interpolation, const T fill_value) + : input_(input), + transforms_(transforms), + interpolation_(interpolation), + fill_value_(fill_value) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const array& coords) const { + const int64_t output_y = coords[1]; + const int64_t output_x = coords[2]; + const float* transform = + transforms_.dimension(0) == 1 + ? transforms_.data() + : &transforms_.data()[transforms_.dimension(1) * coords[0]]; + float projection = transform[6] * output_x + transform[7] * output_y + 1.f; + if (projection == 0) { + // Return the fill value for infinite coordinates, + // which are outside the input image + return fill_value_; + } + const float input_x = + (transform[0] * output_x + transform[1] * output_y + transform[2]) / + projection; + const float input_y = + (transform[3] * output_x + transform[4] * output_y + transform[5]) / + projection; + + // Map out-of-boundary input coordinates to in-boundary based on fill_mode. 
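The MapCoordinate specializations above fold out-of-range sampling coordinates back into [0, len - 1] for the reflect ([dcba|abcd|dcba]) and wrap ([abcd|abcd|abcd]) fill modes. A simplified integer sketch of the same idea (the header works on float coordinates; this only illustrates the folding):

#include <iostream>

// Reflect an integer coordinate into [0, len - 1]: [abcd] -> ...dcba|abcd|dcba...
int ReflectIndex(int i, int len) {
  if (len <= 1) return 0;
  const int period = 2 * len;
  i %= period;
  if (i < 0) i += period;
  return (i < len) ? i : period - 1 - i;
}

// Wrap an integer coordinate into [0, len - 1]: [abcd] -> ...abcd|abcd|abcd...
int WrapIndex(int i, int len) {
  if (len <= 1) return 0;
  i %= len;
  return (i < 0) ? i + len : i;
}

int main() {
  // len = 4, so valid coordinates are 0..3.
  std::cout << ReflectIndex(-1, 4) << " " << ReflectIndex(5, 4) << "\n";  // 0 2
  std::cout << WrapIndex(-1, 4) << " " << WrapIndex(5, 4) << "\n";        // 3 1
}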
+ auto map_functor = MapCoordinate(); + const float x = map_functor(input_x, input_.dimension(2)); + const float y = map_functor(input_y, input_.dimension(1)); + + const DenseIndex batch = coords[0]; + const DenseIndex channels = coords[3]; + switch (interpolation_) { + case NEAREST: + return nearest_interpolation(batch, y, x, channels, fill_value_); + case BILINEAR: + return bilinear_interpolation(batch, y, x, channels, fill_value_); + } + // Unreachable; ImageProjectiveTransform only uses INTERPOLATION_NEAREST + // or INTERPOLATION_BILINEAR. + return fill_value_; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + nearest_interpolation(const DenseIndex batch, const float y, const float x, + const DenseIndex channel, const T fill_value) const { + return read_with_fill_value(batch, DenseIndex(std::round(y)), + DenseIndex(std::round(x)), channel, fill_value); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + bilinear_interpolation(const DenseIndex batch, const float y, const float x, + const DenseIndex channel, const T fill_value) const { + const float y_floor = std::floor(y); + const float x_floor = std::floor(x); + const float y_ceil = y_floor + 1; + const float x_ceil = x_floor + 1; + // f(x, y_floor) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_floor) + // + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_floor) + const float value_yfloor = + (x_ceil - x) * static_cast(read_with_fill_value( + batch, DenseIndex(y_floor), DenseIndex(x_floor), + channel, fill_value)) + + (x - x_floor) * static_cast(read_with_fill_value( + batch, DenseIndex(y_floor), DenseIndex(x_ceil), + channel, fill_value)); + // f(x, y_ceil) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_ceil) + // + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_ceil) + const float value_yceil = + (x_ceil - x) * static_cast(read_with_fill_value( + batch, DenseIndex(y_ceil), DenseIndex(x_floor), + channel, fill_value)) + + (x - x_floor) * static_cast(read_with_fill_value( + batch, DenseIndex(y_ceil), DenseIndex(x_ceil), + channel, fill_value)); + // f(x, y) = (y_ceil - y) / (y_ceil - y_floor) * f(x, y_floor) + // + (y - y_floor) / (y_ceil - y_floor) * f(x, y_ceil) + return T((y_ceil - y) * value_yfloor + (y - y_floor) * value_yceil); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T read_with_fill_value( + const DenseIndex batch, const DenseIndex y, const DenseIndex x, + const DenseIndex channel, const T fill_value) const { + // batch and channel must be correct, because they are passed unchanged from + // the input. + return (0 <= y && y < input_.dimension(1) && 0 <= x && + x < input_.dimension(2)) + ? 
input_(array{batch, y, x, channel}) + : fill_value; + } +}; + +} // end namespace generator + +namespace functor { + +using generator::Interpolation; +using generator::Mode; +using generator::ProjectiveGenerator; + +template +struct FillProjectiveTransform { + typedef typename TTypes::Tensor OutputType; + typedef typename TTypes::ConstTensor InputType; + typedef typename TTypes::ConstTensor TransformsType; + const Interpolation interpolation; + + explicit FillProjectiveTransform(Interpolation interpolation) + : interpolation(interpolation) {} + + EIGEN_ALWAYS_INLINE + void operator()(const Device& device, OutputType* output, + const InputType& images, const TransformsType& transform, + const Mode fill_mode, const T fill_value) const { + switch (fill_mode) { + case Mode::FILL_REFLECT: + output->device(device) = + output->generate(ProjectiveGenerator( + images, transform, interpolation, fill_value)); + break; + case Mode::FILL_WRAP: + output->device(device) = + output->generate(ProjectiveGenerator( + images, transform, interpolation, fill_value)); + break; + case Mode::FILL_CONSTANT: + output->device(device) = output->generate( + ProjectiveGenerator( + images, transform, interpolation, fill_value)); + break; + case Mode::FILL_NEAREST: + output->device(device) = + output->generate(ProjectiveGenerator( + images, transform, interpolation, fill_value)); + break; + } + } +}; + +} // end namespace functor + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_IMAGE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op.h new file mode 100644 index 00000000..7c3df978 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op.h @@ -0,0 +1,445 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
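ProjectiveGenerator above maps each output pixel through an 8-parameter transform in homogeneous coordinates before sampling the input. A sketch of just that coordinate mapping, using the same parameter layout as the generator; the example transform is illustrative:

#include <array>
#include <iostream>

// Maps an output pixel (x, y) to input coordinates using the 8-parameter
// projective transform [a0..a7], as in the generator above:
//   x_in = (a0*x + a1*y + a2) / k,  y_in = (a3*x + a4*y + a5) / k,
//   k    =  a6*x + a7*y + 1.
std::array<float, 2> ProjectOutputToInput(const std::array<float, 8>& t,
                                          float x, float y) {
  const float k = t[6] * x + t[7] * y + 1.0f;
  return {(t[0] * x + t[1] * y + t[2]) / k, (t[3] * x + t[4] * y + t[5]) / k};
}

int main() {
  // Transform that reads the input 2 pixels to the right and 3 pixels down
  // of each output pixel (identity scale, no projective terms).
  const std::array<float, 8> translate = {1, 0, 2, 0, 1, 3, 0, 0};
  const auto in = ProjectOutputToInput(translate, 5.0f, 7.0f);
  std::cout << in[0] << " " << in[1] << "\n";  // 7 10
}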
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace Eigen { +template +class TensorMirrorPadOp; + +namespace internal { +template +struct traits> + : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t _Nested; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorMirrorPadOp& type; +}; + +template +struct nested< + TensorMirrorPadOp, 1, + typename eval>::type> { + typedef TensorMirrorPadOp type; +}; +} // namespace internal + +template +class TensorMirrorPadOp + : public TensorBase, + ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind + StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMirrorPadOp( + const XprType& expr, const PaddingDimensions& padding_dims, Index offset) + : xpr_(expr), padding_dims_(padding_dims), offset_(offset) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return padding_dims_; } + + EIGEN_DEVICE_FUNC + Index offset() const { return offset_; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { + return xpr_; + } + + protected: + typename XprType::Nested xpr_; + const PaddingDimensions padding_dims_; + const Index offset_; +}; + +// Eval as rvalue +template +struct TensorEvaluator, + Device> { + typedef TensorMirrorPadOp XprType; + typedef typename XprType::Index Index; + static constexpr int Dims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + // Copied from Eigen3 Github version 0e806c1. + typedef typename PacketType::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + BlockAccessV2 = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : impl_(op.expression(), device), padding_(op.padding()) { + EIGEN_STATIC_ASSERT(Dims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + + // op.offset() == 0 if padding mode is symmetric. + // op.offset() == 1 if padding mode is reflect. 
+ eigen_assert(op.offset() == 0 || op.offset() == 1); + left_offset_ = -1 + op.offset(); + right_offset_ = -1 - op.offset(); + + // This should trigger compilation error if padding dimensions and + // expression dimensions do not match. + dimensions_ = impl_.dimensions(); + for (int dim = 0; dim < Dims; ++dim) { + eigen_assert(padding_[dim].first + op.offset() <= dimensions_[dim]); + eigen_assert(padding_[dim].second + op.offset() <= dimensions_[dim]); + dimensions_[dim] += padding_[dim].first + padding_[dim].second; + } + + const auto& input_dims = impl_.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides_[0] = 1; + output_strides_[0] = 1; + for (int i = 0; i < Dims - 1; ++i) { + input_strides_[i + 1] = input_strides_[i] * input_dims[i]; + output_strides_[i + 1] = output_strides_[i] * dimensions_[i]; + } + } else { + input_strides_[numext::maxi(0, Dims - 1)] = 1; + output_strides_[numext::maxi(0, Dims - 1)] = 1; + for (int i = Dims - 1; i > 0; --i) { + input_strides_[i - 1] = input_strides_[i] * input_dims[i]; + output_strides_[i - 1] = output_strides_[i] * dimensions_[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return dimensions_; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + impl_.evalSubExprsIfNeeded(nullptr); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { impl_.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType + coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + const Index input_index = ToInputIndex(index); + return impl_.coeff(input_index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType + coeff(array coords) const { + for (int dim = 0; dim < Dims; ++dim) { + coords[dim] = ToInputCoord(coords[dim], dim); + } + ReadInputHelper::CoordAccess> helper; + return helper(coords, input_strides_, impl_); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + constexpr int kPacketSize = + internal::unpacket_traits::size; + + EIGEN_STATIC_ASSERT(kPacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + kPacketSize <= dimensions().TotalSize()); + + // Find the effective inner-most dimension where padding actually happens. + // NOTE: This is independent of index argument, and can be done in the + // constructor to save computation. However, if packet access does not + // happen, then moving to constructor will incur needless overhead. + int dim = -1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int k = 0; k < Dims; ++k) { + if (padding_[k].first != 0 || padding_[k].second != 0) { + dim = k; + break; + } + } + } else { + for (int k = Dims - 1; k >= 0; --k) { + if (padding_[k].first != 0 || padding_[k].second != 0) { + dim = k; + break; + } + } + } + + const Index input_index = ToInputIndex(index); + + // If dim < 0, this means there is no padding at all. + if (dim < 0) { + return impl_.template packet(input_index); + } + + // Check if the way from the begin of the packet to the end of the packet + // is paved with contiguous road. That is, the indices must be between the + // padded region in the effective inner-most dimension. 
+ const Index left = padding_[dim].first * output_strides_[dim]; + const Index right = + (dimensions_[dim] - padding_[dim].second) * output_strides_[dim]; + + const Index index_mod = index % (dimensions_[dim] * output_strides_[dim]); + if (left <= index_mod && (index_mod + kPacketSize - 1) < right) { + return impl_.template packet(input_index); + } + + // If the road is not contiguous, then fall back to coeff(). + EIGEN_ALIGN_MAX std::remove_const_t values[kPacketSize]; + values[0] = impl_.coeff(input_index); + for (int i = 1; i < kPacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::pload(values); + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + constexpr int kPacketSize = + internal::unpacket_traits::size; + + const double compute_cost = Dims * (7 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return impl_.costPerCoeff(vectorized) + + TensorOpCost(1, 0, compute_cost, vectorized, kPacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return nullptr; } + + protected: + using Coords = array; + + // Full template specialization is not allowed within non-fully specialized + // template class. Adding a dummy parameter to make specializations partial. + template + struct ReadInputHelper; + + template + struct ReadInputHelper { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index + operator()(const Coords& coord, const Coords& strides, const Eval& eval) { + Index index = 0; + for (int k = 0; k < Dims; ++k) { + index += coord[k] * strides[k]; + } + return eval.coeff(index); + } + }; + + template + struct ReadInputHelper { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index + operator()(const Coords& coord, const Coords& strides, const Eval& eval) { + return eval.coeff(coord); + } + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index ToInputCoord(Index k, + int dim) const { + const Index m = impl_.dimensions()[dim]; + k -= padding_[dim].first; + if (k < 0) { + return -k + left_offset_; + } + if (k < m) { + return k; + } + return m - (k - m) + right_offset_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index + ToInputIndex(const Coords& coords) const { + Index input_index = 0; + for (int dim = 0; dim < Dims; ++dim) { + input_index += ToInputCoord(coords[dim], dim) * input_strides_[dim]; + } + return input_index; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index ToInputIndex(Index index) const { + Index input_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int dim = Dims - 1; dim > 0; --dim) { + const Index k = index / output_strides_[dim]; + index -= k * output_strides_[dim]; + input_index += ToInputCoord(k, dim) * input_strides_[dim]; + } + input_index += ToInputCoord(index, 0); + } else { + for (int dim = 0; dim < Dims - 1; ++dim) { + const Index k = index / output_strides_[dim]; + index -= k * output_strides_[dim]; + input_index += ToInputCoord(k, dim) * input_strides_[dim]; + } + input_index += ToInputCoord(index, Dims - 1); + } + + return input_index; + } + + TensorEvaluator impl_; + PaddingDimensions padding_; + Dimensions dimensions_; + array input_strides_; + array output_strides_; + + Index left_offset_; + Index right_offset_; +}; +} // namespace Eigen + +namespace tensorflow { +namespace functor { + +// offset argument must be either 0 or 1. This controls whether the boundary +// values are replicated (offset == 0) or not replicated (offset == 1). 
+template +struct MirrorPad { + void operator()(const Device& device, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstMatrix padding, int offset) { + Eigen::array, Dims> padding_dims; + + for (int i = 0; i < Dims; ++i) { + padding_dims[i] = Eigen::IndexPair(padding(i, 0), padding(i, 1)); + } + + output.device(device) = MirrorPadOp(input, padding_dims, offset); + } + + template + static const Eigen::TensorMirrorPadOp + MirrorPadOp( + const Eigen::TensorBase& tensor, + const PaddingDimensions& padding, int offset) { + return Eigen::TensorMirrorPadOp( + static_cast(tensor), padding, offset); + } +}; + +// offset argument must be either 0 or 1. This controls whether the boundary +// values are replicated (offset == 0) or not replicated (offset == 1). +template +struct MirrorPadGrad { + void operator()(const Device& device, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + typename TTypes::ConstMatrix paddings, int offset, + typename TTypes::Tensor scratch) { + // Copy the gradient input into the scratch buffer. + scratch.device(device) = input; + + Eigen::array lhs_offsets; + Eigen::array rhs_offsets; + Eigen::array extents; + Eigen::array reverses; + + for (int i = 0; i < Dims; ++i) { + lhs_offsets[i] = 0; + rhs_offsets[i] = 0; + extents[i] = scratch.dimension(i); + reverses[i] = false; + } + + // At this point, the central part (non-padded area) does not include the + // gradients back-propagated through padded areas. Those gradient components + // need be added to the central part. + // + // Note that a gradient input element falls into a padded area iff in at + // least one dimension i, the coordinate x(i) is in the range (python-style) + // [:paddings(i,0)] or [-paddings(i,1):]. + + for (int i = 0; i < Dims; ++i) { + reverses[i] = true; + + // This handles the case when coordinate in dimension i is in the range + // [:paddings(i,0)]. This portion is added to the range + // [paddings(i,0) + offset:2 * paddings(i,0) + offset]. + if (paddings(i, 0) > 0) { + rhs_offsets[i] = 0; + lhs_offsets[i] = paddings(i, 0) + offset; + extents[i] = paddings(i, 0); + + scratch.slice(lhs_offsets, extents).device(device) += + scratch.slice(rhs_offsets, extents).reverse(reverses); + } + + // This handles the case when coordinate in dimension i is in the range + // [-paddings(i,1):]. This portion is added to the range + // [-2 * paddings(i,1) - offset:-paddings(i,1) - offset]. + if (paddings(i, 1) > 0) { + rhs_offsets[i] = scratch.dimension(i) - paddings(i, 1); + lhs_offsets[i] = rhs_offsets[i] - paddings(i, 1) - offset; + extents[i] = paddings(i, 1); + + scratch.slice(lhs_offsets, extents).device(device) += + scratch.slice(rhs_offsets, extents).reverse(reverses); + } + + reverses[i] = false; + lhs_offsets[i] = paddings(i, 0); + rhs_offsets[i] = paddings(i, 0); + extents[i] = output.dimension(i); + + // At this point, scratch buffer contains gradient input as if paddings + // for dimension k = 0,...,i are zeros. Therefore after the loop + // termination, the central part of the scratch buffer contains the folded + // gradients. + } + + // Copy the central part of the scratch buffer to the output. 
+ output.device(device) = scratch.slice(rhs_offsets, extents); + } +}; +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op_cpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op_cpu_impl.h new file mode 100644 index 00000000..b138ae0c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/mirror_pad_op_cpu_impl.h @@ -0,0 +1,47 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_CPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_CPU_IMPL_H_ + +#if CPU_PROVIDED_IXDIM +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/image/mirror_pad_op.h" + +namespace tensorflow { + +using CpuDevice = Eigen::ThreadPoolDevice; + +#define DEFINE_CPU_SPECS(T) \ + template struct functor::MirrorPad; \ + template struct functor::MirrorPad; +TF_CALL_POD_TYPES(DEFINE_CPU_SPECS); +TF_CALL_QUANTIZED_TYPES(DEFINE_CPU_SPECS); +TF_CALL_tstring(DEFINE_CPU_SPECS); +#undef DEFINE_CPU_SPECS + +#define DEFINE_CPU_SPECS(T) \ + template struct functor::MirrorPadGrad; \ + template struct functor::MirrorPadGrad; +TF_CALL_NUMBER_TYPES(DEFINE_CPU_SPECS); +#undef DEFINE_CPU_SPECS +} // namespace tensorflow + +#endif // CPU_PROVIDED_IXDIM +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_MIRROR_PAD_OP_CPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/non_max_suppression_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/non_max_suppression_op.h new file mode 100644 index 00000000..04828b07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/non_max_suppression_op.h @@ -0,0 +1,50 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
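The MirrorPad/MirrorPadGrad functors above rely on the offset convention noted in their comments: offset 0 repeats the boundary value (SYMMETRIC), offset 1 does not (REFLECT). A 1-D sketch of that index mapping, written to match ToInputCoord earlier in the header; names are illustrative:

#include <iostream>
#include <vector>

// Maps an output coordinate of a 1-D mirror pad back to an input coordinate.
// offset == 0 -> SYMMETRIC (edge value repeated), offset == 1 -> REFLECT.
int MirrorPadInputCoord(int out_coord, int pad_before, int len, int offset) {
  const int k = out_coord - pad_before;
  if (k < 0) return -k - 1 + offset;         // left padded region
  if (k < len) return k;                     // interior
  return len - (k - len) - 1 - offset;       // right padded region
}

int main() {
  const std::vector<int> input = {1, 2, 3, 4};
  // Pad 2 on each side. SYMMETRIC prints 2 1 1 2 3 4 4 3,
  // REFLECT prints 3 2 1 2 3 4 3 2.
  for (int offset : {0, 1}) {
    for (int i = 0; i < 8; ++i) {
      std::cout << input[MirrorPadInputCoord(i, 2, 4, offset)] << " ";
    }
    std::cout << "\n";
  }
}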
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_NON_MAX_SUPPRESSION_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_NON_MAX_SUPPRESSION_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +extern const int kNmsBoxesPerTread; + +// Given descending sorted box list, apply non-maximal-suppression with given +// threshold and select boxes to keep. +// - d_sorted_boxes_float_ptr: a pointer to device memory float array +// containing the box corners for N boxes sorted in descending order of +// scores. +// - num_boxes: number of boxes. +// - iou_threshold: the intersection-over-union (iou) threshold for elimination. +// - d_selected_indices: is a device pointer to int array containing sorted +// indices of the boxes to keep. +// - h_num_boxes_to_keep: is a host pointer for returning number of items +// to keep. +// - flip_boxes: flag reorders the boxes use lower left and upper right +// corners if they are given in mixed format. +Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes, + const float iou_threshold, int* d_selected_indices, + int* h_num_boxes_to_keep, OpKernelContext* context, + const int max_boxes, bool flip_boxes = false); +#endif + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_NON_MAX_SUPPRESSION_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_bilinear_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_bilinear_op.h new file mode 100644 index 00000000..1a304c2c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_bilinear_op.h @@ -0,0 +1,46 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
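NmsGpu above only declares the GPU entry point. The algorithm it describes is the usual greedy pass over score-sorted boxes, discarding any box whose IoU with an already-kept box exceeds the threshold. A plain CPU sketch of that idea, with an assumed box layout and helper names:

#include <algorithm>
#include <iostream>
#include <vector>

struct Box { float y1, x1, y2, x2; };  // corners, y1 <= y2 and x1 <= x2

float Iou(const Box& a, const Box& b) {
  const float iy1 = std::max(a.y1, b.y1), ix1 = std::max(a.x1, b.x1);
  const float iy2 = std::min(a.y2, b.y2), ix2 = std::min(a.x2, b.x2);
  const float inter = std::max(0.f, iy2 - iy1) * std::max(0.f, ix2 - ix1);
  const float area_a = (a.y2 - a.y1) * (a.x2 - a.x1);
  const float area_b = (b.y2 - b.y1) * (b.x2 - b.x1);
  return inter / (area_a + area_b - inter);
}

// Greedy non-max suppression over boxes already sorted by descending score.
std::vector<int> GreedyNms(const std::vector<Box>& sorted_boxes,
                           float iou_threshold, int max_boxes) {
  std::vector<int> keep;
  for (int i = 0; i < static_cast<int>(sorted_boxes.size()) &&
                  static_cast<int>(keep.size()) < max_boxes; ++i) {
    bool suppressed = false;
    for (int j : keep) {
      if (Iou(sorted_boxes[i], sorted_boxes[j]) > iou_threshold) {
        suppressed = true;
        break;
      }
    }
    if (!suppressed) keep.push_back(i);
  }
  return keep;
}

int main() {
  const std::vector<Box> boxes = {{0, 0, 10, 10}, {1, 1, 11, 11}, {20, 20, 30, 30}};
  for (int i : GreedyNms(boxes, 0.5f, 10)) std::cout << i << " ";  // 0 2
  std::cout << "\n";
}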
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_BILINEAR_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_BILINEAR_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct ResizeBilinear { + void operator()(const Device& d, typename TTypes::ConstTensor images, + const float height_scale, const float width_scale, + const bool half_pixel_centers, + typename TTypes::Tensor resized_images); +}; + +template +struct ResizeBilinearGrad { + void operator()(const Device& d, + typename TTypes::ConstTensor input_grad, + const float height_scale, const float width_scale, + const bool half_pixel_centers, + typename TTypes::Tensor output_grad); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_BILINEAR_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_nearest_neighbor_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_nearest_neighbor_op.h new file mode 100644 index 00000000..e6797dfb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/resize_nearest_neighbor_op.h @@ -0,0 +1,45 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_NEAREST_NEIGHBOR_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_NEAREST_NEIGHBOR_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct ResizeNearestNeighbor { + bool operator()(const Device& d, typename TTypes::ConstTensor input, + const float height_scale, const float width_scale, + typename TTypes::Tensor output); +}; + +template +struct ResizeNearestNeighborGrad { + bool operator()(const Device& d, + typename TTypes::ConstTensor input_grad, + const float height_scale, const float width_scale, + typename TTypes::Tensor output_grad); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_RESIZE_NEAREST_NEIGHBOR_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/sampling_kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/sampling_kernels.h new file mode 100644 index 00000000..6f889add --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/sampling_kernels.h @@ -0,0 +1,192 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
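As a rough illustration of what the ResizeBilinear functor above computes per output pixel, the sketch below samples one channel of a row-major H x W image with bilinear weights. It is a standalone example, not the TensorFlow kernel: SampleBilinear is an invented name, and the half_pixel_centers mapping shown is the commonly used (out + 0.5) * scale - 0.5 convention, assumed here for illustration.

#include <algorithm>
#include <cmath>
#include <vector>

float SampleBilinear(const std::vector<float>& img, int height, int width,
                     float out_y, float out_x, float height_scale,
                     float width_scale, bool half_pixel_centers) {
  const float in_y = half_pixel_centers ? (out_y + 0.5f) * height_scale - 0.5f
                                        : out_y * height_scale;
  const float in_x = half_pixel_centers ? (out_x + 0.5f) * width_scale - 0.5f
                                        : out_x * width_scale;
  const int y0 = std::clamp(static_cast<int>(std::floor(in_y)), 0, height - 1);
  const int x0 = std::clamp(static_cast<int>(std::floor(in_x)), 0, width - 1);
  const int y1 = std::min(y0 + 1, height - 1);
  const int x1 = std::min(x0 + 1, width - 1);
  const float dy = std::clamp(in_y - y0, 0.0f, 1.0f);
  const float dx = std::clamp(in_x - x0, 0.0f, 1.0f);
  const float top = img[y0 * width + x0] * (1 - dx) + img[y0 * width + x1] * dx;
  const float bot = img[y1 * width + x0] * (1 - dx) + img[y1 * width + x1] * dx;
  return top * (1 - dy) + bot * dy;
}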
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_SAMPLING_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_SAMPLING_KERNELS_H_ + +#include + +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { +namespace functor { +// Defines functions for different types of sampling kernels. +enum SamplingKernelType { + // Lanczos kernel with radius 1. Aliases but does not ring. + Lanczos1Kernel, + + // Lanczos kernel with radius 3. High-quality practical filter but may have + // some ringing especially on synthetic images. + Lanczos3Kernel, + + // Lanczos kernel with radius 5. Very-high-quality filter but may have + // stronger ringing. + Lanczos5Kernel, + + // Gaussian kernel with radius 3, sigma = 1.5 / 3. Less commonly used. + GaussianKernel, + + // Rectangle function. Equivalent to "nearest" sampling when upscaling. + // Has value 1 in interval (-0.5, 0.5), value 0.5 on edge, and 0 elsewhere. + BoxKernel, + + // Hat/tent function with radius 1. Equivalent to "bilinear" reconstruction + // when upsampling. + // Has value zero at -1.0 and 1.0. + TriangleKernel, + + // Cubic interpolant of Keys. Equivalent to Catmull-Rom kernel. Reasonably + // good quality and faster than Lanczos3Kernel. + KeysCubicKernel, + + // Cubic non-interpolating scheme. For synthetic images (especially those + // lacking proper prefiltering), less ringing than Keys cubic kernel but less + // sharp. + MitchellCubicKernel, + + // Always insert new kernel types before this. + SamplingKernelTypeEnd +}; + +// Converts a string into the corresponding kernel type. +// Returns SamplingKernelTypeEnd if the string couldn't be converted. +SamplingKernelType SamplingKernelTypeFromString(const absl::string_view str); + +// A function object for a Lanczos kernel. +struct LanczosKernelFunc { + // Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc. + explicit LanczosKernelFunc(float _radius) : radius(_radius) {} + float operator()(float x) const { + constexpr float kPI = 3.14159265359; + x = std::abs(x); + if (x > radius) return 0.0; + // Need to special case the limit case of sin(x) / x when x is zero. + if (x <= 1e-3) { + return 1.0; + } + return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) / + (kPI * kPI * x * x); + } + float Radius() const { return radius; } + const float radius; +}; + +struct GaussianKernelFunc { + static constexpr float kRadiusMultiplier = 3.0f; + // https://en.wikipedia.org/wiki/Gaussian_function + // We use sigma = 0.5, as suggested on p. 
4 of Ken Turkowski's "Filters + // for Common Resampling Tasks" for kernels with a support of 3 pixels: + // www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf + // This implies a radius of 1.5, + explicit GaussianKernelFunc(float _radius = 1.5f) + : radius(_radius), sigma(_radius / kRadiusMultiplier) {} + float operator()(float x) const { + x = std::abs(x); + if (x >= radius) return 0.0; + return std::exp(-x * x / (2.0 * sigma * sigma)); + } + float Radius() const { return radius; } + const float radius; + const float sigma; // Gaussian standard deviation +}; + +struct BoxKernelFunc { + float operator()(float x) const { + x = std::abs(x); + return x < 0.5f ? 1. : x == 0.5f ? 0.5f : 0.0f; + } + float Radius() const { return 1.f; } +}; + +struct TriangleKernelFunc { + // https://en.wikipedia.org/wiki/Triangle_function + float operator()(float x) const { + x = std::abs(x); + return x < 1.0f ? 1.0f - x : 0.0f; + } + float Radius() const { return 1.f; } +}; + +struct KeysCubicKernelFunc { + // http://ieeexplore.ieee.org/document/1163711/ + // R. G. Keys. Cubic convolution interpolation for digital image + // processing. IEEE Transactions on Acoustics, Speech, and Signal + // Processing, 29(6):1153–1160, 1981. + float operator()(float x) const { + x = std::abs(x); + if (x >= 2.0f) { + return 0.0f; + } else if (x >= 1.0f) { + return ((-0.5f * x + 2.5f) * x - 4.0f) * x + 2.0f; + } else { + return ((1.5f * x - 2.5f) * x) * x + 1.0f; + } + } + float Radius() const { return 2.f; } +}; + +struct MitchellCubicKernelFunc { + // https://doi.org/10.1145/378456.378514 + // D. P. Mitchell and A. N. Netravali. Reconstruction filters in computer + // graphics. Computer Graphics (Proceedings of ACM SIGGRAPH 1988), + // 22(4):221–228, 1988. + float operator()(float x) const { + x = std::abs(x); + if (x >= 2.0f) { + return 0.0f; + } else if (x >= 1.0f) { + return (((-7.0f / 18.0f) * x + 2.0f) * x - 10.0f / 3.0f) * x + + 16.0f / 9.0f; + } else { + return (((7.0f / 6.0f) * x - 2.0f) * x) * x + 8.0f / 9.0f; + } + } + float Radius() const { return 2.f; } +}; + +inline LanczosKernelFunc CreateLanczos1Kernel() { + return LanczosKernelFunc(1.0); +} + +inline LanczosKernelFunc CreateLanczos3Kernel() { + return LanczosKernelFunc(3.0); +} + +inline LanczosKernelFunc CreateLanczos5Kernel() { + return LanczosKernelFunc(5.0); +} + +inline GaussianKernelFunc CreateGaussianKernel() { + return GaussianKernelFunc(1.5); +} + +inline BoxKernelFunc CreateBoxKernel() { return BoxKernelFunc(); } + +inline TriangleKernelFunc CreateTriangleKernel() { + return TriangleKernelFunc(); +} + +inline KeysCubicKernelFunc CreateKeysCubicKernel() { + return KeysCubicKernelFunc(); +} + +inline MitchellCubicKernelFunc CreateMitchellCubicKernel() { + return MitchellCubicKernelFunc(); +} + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_SAMPLING_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/image/scale_and_translate_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/image/scale_and_translate_op.h new file mode 100644 index 00000000..672cc2a8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/image/scale_and_translate_op.h @@ -0,0 +1,76 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
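The kernel functors above all expose the same tiny interface, a weight function operator()(x) and a Radius(); downstream code (for example the span precomputation in scale_and_translate_op.h, which follows) evaluates the kernel at every input tap within the radius of a sample point and normalizes the weights. The sketch below shows that evaluation for one sample point in isolation; KernelWeights and the 1-D setting are illustrative assumptions, not TensorFlow code.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// Returns the first contributing input index and the normalized weights for a
// sample centred at `center` (in input pixel coordinates, half-pixel origin).
template <typename Kernel>
std::pair<int64_t, std::vector<float>> KernelWeights(const Kernel& kernel,
                                                     int64_t input_size,
                                                     float center) {
  const float radius = kernel.Radius();
  const int64_t start = std::max<int64_t>(
      0, static_cast<int64_t>(std::ceil(center - radius - 0.5f)));
  const int64_t end = std::min<int64_t>(
      input_size - 1, static_cast<int64_t>(std::floor(center + radius - 0.5f)));
  std::vector<float> weights;
  float total = 0.0f;
  for (int64_t i = start; i <= end; ++i) {
    const float w = kernel(static_cast<float>(i) + 0.5f - center);
    weights.push_back(w);
    total += w;
  }
  if (total > 0.0f)
    for (float& w : weights) w /= total;
  return {start, weights};
}

// Example: KernelWeights(TriangleKernelFunc(), 10, 3.0f) yields start = 2 and
// weights {0.5, 0.5}, i.e. ordinary bilinear weights between pixels 2 and 3.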
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_SCALE_AND_TRANSLATE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMAGE_SCALE_AND_TRANSLATE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/image/sampling_kernels.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace functor { + +// The scale and translate op works by scaling and translating the row and +// column dimensions separately. +// When scaling and translating the rows the set of input pixels and kernel +// weights used to compute a given output pixel within a row is constant across +// rows and can thus be precomputed and reused for every row. Similarly for the +// columns. This precomputed data structure is called a 'span'. + +// To compute the gradient we use the spans computed on the forward pass and +// essentially reverse them: we record for each input pixel which output +// pixels it contributes to. This means that the forward and backward passes +// use the same core algorithm, only the spans are computed differently. + +// A pre-computed span of pixels along a single dimension. +// The output pixel will be the weighted sum of pixels starting from start. +struct Spans { + // The maximum span size of any output pixel. + int span_size; + // int32 tensor of size [output_dim]. + Tensor starts; + // float tensor of size [output_dim, span_size]. + // The output pixel at x is computed as: + // dot_product(input[starts[x]:starts[x]+span_size], weights[x]). + Tensor weights; +}; + +// Gather spans in both dimensions. +// row_span_size, row_starts and row_weights correspond to the variables in +// the row Spans data structure, similarly for col_span_size etc. +// intermediate_buffer is a Tensor used to store the result of the +// resize in the column dimension and is of size: +// [batch_size, input_height, output_width, channels] +template +struct GatherSpans { + void operator()(OpKernelContext* context, const Device& d, int row_span_size, + typename TTypes::ConstTensor row_starts, + typename TTypes::ConstTensor row_weights, + int col_span_size, + typename TTypes::ConstTensor col_starts, + typename TTypes::ConstTensor col_weights, + typename TTypes::ConstTensor input_images, + typename TTypes::Tensor intermediate_buffer, + typename TTypes::Tensor output_images); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMAGE_SCALE_AND_TRANSLATE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/immutable_constant_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/immutable_constant_op.h new file mode 100644 index 00000000..264abc84 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/immutable_constant_op.h @@ -0,0 +1,50 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
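Once starts and weights have been precomputed, applying a span is just a sliding dot product per output pixel, exactly as the Spans comment above describes. The sketch below applies precomputed spans to a 1-D signal; ApplySpans and the flattened std::vector representation are illustrative, not the GatherSpans functor.

#include <cstdint>
#include <vector>

// starts[x] is the first contributing input index for output x; weights is a
// flattened [output_size, span_size] array, as documented for Spans above.
std::vector<float> ApplySpans(const std::vector<float>& input,
                              const std::vector<int32_t>& starts,
                              const std::vector<float>& weights,
                              int64_t span_size) {
  const int64_t output_size = static_cast<int64_t>(starts.size());
  std::vector<float> output(output_size, 0.0f);
  for (int64_t x = 0; x < output_size; ++x) {
    for (int64_t k = 0; k < span_size; ++k) {
      const int64_t in_idx = starts[x] + k;
      if (in_idx < static_cast<int64_t>(input.size()))
        output[x] += weights[x * span_size + k] * input[in_idx];
    }
  }
  return output;
}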
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class ImmutableConstantOp : public OpKernel { + public: + explicit ImmutableConstantOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override { return false; } + ~ImmutableConstantOp() override; + + // Names of attributes that are used by this op + static constexpr char const* kDTypeAttr = "dtype"; + static constexpr char const* kShapeAttr = "shape"; + static constexpr char const* kMemoryRegionNameAttr = "memory_region_name"; + + private: + string region_name_; + DataType dtype_; + TensorShape shape_; + ImmutableConstantOp(const ImmutableConstantOp&) = delete; + void operator=(const ImmutableConstantOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/in_topk_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/in_topk_op.h new file mode 100644 index 00000000..87777764 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/in_topk_op.h @@ -0,0 +1,100 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_IN_TOPK_OP_H_ +#define TENSORFLOW_CORE_KERNELS_IN_TOPK_OP_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// InTopK argument can be passed either via mode attribute (InTopK op), or as an +// input tensor (InTopKV2 op). 
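Before the functor itself, it may help to see the predicate it evaluates written without the TensorFlow types. The sketch below is standalone and illustrative (InTopKRow is an invented name): a target is in the top k iff fewer than k classes score strictly higher than the target class, and any non-finite prediction in the row forces the answer to false.

#include <cmath>
#include <vector>

bool InTopKRow(const std::vector<float>& row, int target, int k) {
  if (target < 0 || target >= static_cast<int>(row.size())) return false;
  if (!std::isfinite(row[target])) return false;
  int more_probable = 0;
  for (float p : row) {
    if (!std::isfinite(p)) return false;  // "cannot say" resolves to false
    if (p > row[target]) ++more_probable;
  }
  return more_probable < k;
}

// Example: row = {0.1, 0.8, 0.05, 0.05}, target = 0, k = 2.
// Only class 1 scores higher than the target, 1 < 2, so the result is true.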
+struct TopKArg { + int64_t k_value = -1; + const Tensor* k_tensor = nullptr; +}; + +template +struct InTopKFunctor { + template + using Dims = Eigen::DSizes; + + void operator()(OpKernelContext* context, + typename TTypes::ConstTensor predictions, + typename TTypes::ConstVec targets, const TopKArg k, + typename TTypes::Vec output) {} +}; + +template +struct InTopKFunctor { + void operator()(OpKernelContext* context, + typename TTypes::ConstTensor predictions, + typename TTypes::ConstVec targets, const TopKArg k, + typename TTypes::Vec output) { + const Eigen::Index num_targets = predictions.dimension(0); + const Eigen::Index num_classes = predictions.dimension(1); + + int64_t k_val = k.k_value; + if (k.k_tensor != nullptr) { + if (k.k_tensor->dtype() == DT_INT32) { + k_val = k.k_tensor->scalar()(); + } else { + k_val = k.k_tensor->scalar()(); + } + } + + for (int batch_idx = 0; batch_idx < num_targets; batch_idx++) { + auto target = internal::SubtleMustCopy(targets(batch_idx)); + + bool cannot_say = !FastBoundsCheck(target, num_classes) || + !std::isfinite(predictions(batch_idx, target)); + + int more_probable_classes = 0; + if (!cannot_say) { + const T target_prediction = predictions(batch_idx, target); + + for (int class_idx = 0; class_idx < num_classes; ++class_idx) { + T pred = predictions(batch_idx, class_idx); + if (!std::isfinite(pred)) { + cannot_say = true; + break; + } else if (pred > target_prediction) { + ++more_probable_classes; + if (more_probable_classes > k_val) break; + } + } + } + output(batch_idx) = cannot_say ? false : (more_probable_classes < k_val); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_IN_TOPK_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/initializable_lookup_table.h b/third_party/tflite-hdrs/tensorflow/core/kernels/initializable_lookup_table.h new file mode 100644 index 00000000..c190fbd3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/initializable_lookup_table.h @@ -0,0 +1,271 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ +#define TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ + +#include + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace lookup { + +// Base class for lookup tables that require initialization. +class InitializableLookupTable : public LookupInterface { + public: + class InitTableIterator; + class InitializerSerializer; + + // Performs batch lookups, for every element in the key tensor, Find returns + // the corresponding value into the values tensor. + // If an element is not present in the table, the given default value is used. + // + // For tables that require initialization, `Find` is available once the table + // is marked as initialized. 
+ // + // Returns the following statuses: + // - OK: when the find finishes successfully. + // - FailedPrecondition: if the table is not initialized. + // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - In addition, other implementations may provide another non-OK status + // specific to their failure modes. + absl::Status Find(OpKernelContext* ctx, const Tensor& keys, Tensor* values, + const Tensor& default_value) final; + + // Returns errors::Unimplemented. + absl::Status Insert(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) final { + return errors::Unimplemented( + "Insert not supported by InitializableLookupTable implementations"); + } + + // Returns errors::Unimplemented. + absl::Status Remove(OpKernelContext* ctx, const Tensor& keys) final { + return errors::Unimplemented( + "Remove not supported by InitializableLookupTable implementations"); + } + + absl::Status ExportValues(OpKernelContext* context) override { + return errors::Unimplemented( + "ExportValues not supported by InitializableLookupTable " + "implementations"); + } + + absl::Status ImportValues(OpKernelContext* ctx, const Tensor& keys, + const Tensor& values) final; + + TensorShape key_shape() const final { return TensorShape(); } + + TensorShape value_shape() const final { return TensorShape(); } + + // Returns whether the table was initialized and is ready to serve lookups. + bool is_initialized() const { + return is_initialized_.load(std::memory_order_acquire); + } + + // Initializes the table from the given init table iterator. + // + // Atomically, this operation prepares the table, populates it with the given + // iterator, and marks the table as initialized. + // + // Returns the following statuses: + // - OK: when the initialization was successful. + // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - FailedPrecondition: if the table is already initialized and + // fail_if_initialized is set to true. + // - In addition, other implementations may provide another non-OK status + // specific to their failure modes. + absl::Status Initialize(InitTableIterator& iter); + + // Initializes the table from the given init table iterator. `serializer` may + // specify how to serialize the table initializer, so that the table can be + // serialized using its metadata (as opposed to serializing a handle to the + // table). + absl::Status Initialize(InitTableIterator& iter, + std::unique_ptr serializer); + + // Basic iterator to initialize lookup tables. + // It yields a sequence of pairs of `keys()` and `values()` Tensors, so that + // the consumer may insert key-value pairs in batches. + // + // Then the iterator is exhausted, valid returns false and status returns + // Status::OutOfRange. + // + // This class is Thread-unsafe. + class InitTableIterator { + public: + InitTableIterator() {} + + virtual ~InitTableIterator() {} + + // Prepares the next batch of key and value tensors. + virtual void Next() = 0; + + // Returns true if keys and values point to valid tensors. + virtual bool Valid() const = 0; + + // Returns a tensor that contains the current batch of 'key' values. + virtual const Tensor& keys() const = 0; + + // Returns a tensor that contains the current batch of 'value' values. + virtual const Tensor& values() const = 0; + + // Returns an error if one has occurred, otherwise returns Status::OK. 
+ virtual absl::Status status() const = 0; + + // Returns the total number of elements that the iterator will produce. + // It might return -1 in case of error. + virtual int64_t total_size() const = 0; + + private: + InitTableIterator(const InitTableIterator&) = delete; + void operator=(const InitTableIterator&) = delete; + }; + + InitializableLookupTable* GetInitializableLookupTable() override { + return this; + } + + // Logic specifying how to represent an initializer as a GraphDef, so that a + // lookup table can be serialized using its metadata (as opposed to + // serializing the content of the table, or a handle to the table). + class InitializerSerializer { + public: + // A function which builds a graph so that executing `*out` will initialize + // `table`. + using SerializeFn = std::function; + // A function which performs any necessary cleanup for the serializer. + using CleanupFn = std::function; + + // Wraps serialization logic that requires no cleanup. + explicit InitializerSerializer(SerializeFn serialize) + : serialize_(std::move(serialize)), cleanup_([] {}) {} + + // Wraps serialization logic along with a cleanup function. `cleanup` will + // be run when the serializer is destroyed. + explicit InitializerSerializer(SerializeFn serialize, CleanupFn cleanup) + : serialize_(std::move(serialize)), cleanup_(std::move(cleanup)) {} + + ~InitializerSerializer() { cleanup_(); } + + // Builds a graph so that executing `*out` will initialize `table`. + absl::Status AsGraphDef(GraphDefBuilder* builder, Node* table, Node** out) { + return serialize_(builder, table, out); + } + + private: + SerializeFn serialize_; + CleanupFn cleanup_; + }; + + protected: + // Prepares and allocates the underlying data structure to store the given + // number of expected elements. + virtual absl::Status DoPrepare(size_t expected_num_elements) = 0; + + // Same as DoPrepare() but derived implementations might choose to skip + // calling get_expected_num_elements if size is not needed for DoPrepare. + virtual absl::Status DoLazyPrepare( + std::function get_expected_num_elements) { + int64_t expected_num_elements = get_expected_num_elements(); + if (expected_num_elements < 0) { + return errors::FailedPrecondition("Got negative expected_num_elements."); + } + return DoPrepare(expected_num_elements); + } + + // Populates the table in batches given keys and values as tensors into the + // underlying data structure. + virtual absl::Status DoInsert(const Tensor& keys, const Tensor& values) = 0; + + // Performs the batch find operation on the underlying data structure. + virtual absl::Status DoFind(const Tensor& keys, Tensor* values, + const Tensor& default_value) = 0; + + virtual absl::Status AreEntriesSame(const InitTableIterator& iter, + bool* result); + + mutex mu_; + + protected: + // When set, provides a mechanism for serializing the table initializer as + // GraphDef. + std::unique_ptr initializer_serializer_; + + private: + std::atomic is_initialized_{false}; +}; + +// Iterator to initialize tables given 'keys' and 'values' tensors. +// +// The two tensors are returned in the first iteration. It doesn't loop +// over each element of the tensor since insertions in the lookup table can +// process batches. +class KeyValueTensorIterator + : public InitializableLookupTable::InitTableIterator { + public: + // keys and values are not owned by the iterator. 
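The Initialize/InitTableIterator contract above boils down to a simple driver loop: prepare for iter.total_size() elements, insert batches while Valid() is true, and treat OutOfRange as successful exhaustion. The sketch below shows that loop against a deliberately simplified iterator interface (BatchIterator, Batch and the bool-based error handling are assumptions made for this example); it is illustrative, not the TensorFlow implementation.

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct Batch { std::vector<std::string> keys; std::vector<int64_t> values; };

// Simplified stand-ins for InitTableIterator and the DoPrepare/DoInsert hooks.
struct BatchIterator {
  virtual ~BatchIterator() = default;
  virtual bool Valid() const = 0;
  virtual const Batch& batch() const = 0;
  virtual void Next() = 0;
  virtual bool out_of_range() const = 0;  // true once exhausted without error
  virtual int64_t total_size() const = 0;
};

bool InitializeFromIterator(BatchIterator& iter,
                            const std::function<void(int64_t)>& prepare,
                            const std::function<void(const Batch&)>& insert) {
  prepare(iter.total_size());   // mirrors DoPrepare / DoLazyPrepare
  while (iter.Valid()) {
    insert(iter.batch());       // batched insertion, mirrors DoInsert
    iter.Next();
  }
  // A genuine error and normal exhaustion are distinguished by the status:
  // only OutOfRange (modelled here as out_of_range()) counts as success.
  return iter.out_of_range();
}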
+ explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values) + : keys_(keys), values_(values), valid_(true), status_(absl::OkStatus()) { + TensorShape key_shape = keys_->shape(); + if (!key_shape.IsSameSize(values_->shape())) { + valid_ = false; + status_ = errors::InvalidArgument( + "keys and values should have the same dimension.", + key_shape.DebugString(), " vs ", values_->shape().DebugString()); + } + if (key_shape.num_elements() == 0) { + valid_ = false; + status_ = + errors::InvalidArgument("keys and values cannot be empty tensors."); + } + } + + bool Valid() const override { return valid_; } + + void Next() override { + valid_ = false; + status_ = errors::OutOfRange("No more data."); + } + + const Tensor& keys() const override { return *keys_; } + + const Tensor& values() const override { return *values_; } + + absl::Status status() const override { return status_; } + + int64_t total_size() const override { + return keys_ == nullptr ? -1 : keys_->NumElements(); + } + + private: + KeyValueTensorIterator(const KeyValueTensorIterator&) = delete; + void operator=(const KeyValueTensorIterator&) = delete; + + const Tensor* keys_; // Doesn't own it. + const Tensor* values_; // Doesn't own it. + bool valid_; // true if the iterator points to an existing range. + absl::Status status_; +}; + +} // namespace lookup +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/inplace_ops_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/inplace_ops_functor.h new file mode 100644 index 00000000..e1707824 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/inplace_ops_functor.h @@ -0,0 +1,49 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace functor { + +template +absl::Status DoParallelConcat(const Device& device, const Tensor& value, + int32_t loc, Tensor* output); + +// Inplace update/add/sub values in 'y'. It computes +// y[i, :] = v if op is I_UPDATE +// y[i, :] += v if op is I_ADD +// y[i, :] -= v if op is I_SUB +// Returns an error if the operation fails. +enum InplaceOpType { + I_UPDATE, // x = y + I_ADD, // x += y + I_SUB, // x -= y +}; +template +absl::Status DoInplace(const Device& device, InplaceOpType op, const Tensor& i, + const Tensor& v, Tensor* y); +// Copies x into y. 
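The row-update semantics documented above for DoInplace (y[i, :] = v, += v or -= v) reduce to a per-row loop over a dense buffer. The sketch below is plain C++, not the TensorFlow functor; ApplyInplace and the row-major std::vector layout are illustrative assumptions.

#include <cstddef>
#include <cstdint>
#include <vector>

enum class InplaceOp { kUpdate, kAdd, kSub };

// `rows[r]` selects the destination row of y; v's r-th row supplies the data.
// Both y and v are dense row-major matrices with `cols` columns.
void ApplyInplace(InplaceOp op, const std::vector<int64_t>& rows,
                  const std::vector<float>& v, std::vector<float>& y,
                  int64_t cols) {
  for (size_t r = 0; r < rows.size(); ++r) {
    float* dst = y.data() + rows[r] * cols;        // y[i, :]
    const float* src = v.data() + static_cast<int64_t>(r) * cols;  // v[r, :]
    for (int64_t c = 0; c < cols; ++c) {
      switch (op) {
        case InplaceOp::kUpdate: dst[c] = src[c]; break;
        case InplaceOp::kAdd:    dst[c] += src[c]; break;
        case InplaceOp::kSub:    dst[c] -= src[c]; break;
      }
    }
  }
}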
+template +absl::Status DoCopy(const Device& device, const Tensor& x, Tensor* y); + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/kernel_platform_strings.h b/third_party/tflite-hdrs/tensorflow/core/kernels/kernel_platform_strings.h new file mode 100644 index 00000000..9bf40c30 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/kernel_platform_strings.h @@ -0,0 +1,25 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Generate platform strings for libtfkernel-* + +#ifndef TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_ +#define TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_ + +#include "tensorflow/core/platform/platform_strings.h" + +TF_PLATFORM_STRINGS() + +#endif // TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/l2loss_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/l2loss_op.h new file mode 100644 index 00000000..2adaacbb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/l2loss_op.h @@ -0,0 +1,33 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +template +struct L2LossOp : public OpKernel { + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) {} +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/determinant_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/determinant_op.h new file mode 100644 index 00000000..6ace1bef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/determinant_op.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_DETERMINANT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_DETERMINANT_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Helper functor to compute Determinant from a partially pivoted LU +// factorization. +template +struct DeterminantFromPivotedLUFunctor { + void operator()(const Device& device, + typename TTypes::ConstTensor lu_factor, + const int* pivots, typename TTypes::Tensor output, + int* info); +}; + +// Helper functor to compute sign and log of the absolute value of the +// determinant from a partially pivoted LU factorization. +template +struct LogDeterminantFromPivotedLUFunctor { + void operator()(const Device& device, + typename TTypes::ConstTensor lu_factor, + const int* pivots, typename TTypes::Tensor sign, + typename TTypes::Tensor log_abs_det); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_DETERMINANT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eig_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eig_op_impl.h new file mode 100644 index 00000000..220e6db5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eig_op_impl.h @@ -0,0 +1,100 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_EIG_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_EIG_OP_IMPL_H_ + +// See docs in ../ops/linalg_ops.cc. 
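The computation behind DeterminantFromPivotedLUFunctor above is the classic identity: after a partially pivoted factorization P*A = L*U, det(A) equals the product of U's diagonal times (-1) raised to the number of row swaps. The standalone sketch below assumes LAPACK-style 1-based pivot indices and a row-major n*n buffer; it is not the TensorFlow functor.

#include <vector>

double DeterminantFromPivotedLU(const std::vector<double>& lu,  // row-major n*n
                                const std::vector<int>& pivots, int n) {
  double det = 1.0;
  for (int i = 0; i < n; ++i) {
    det *= lu[i * n + i];                 // diagonal of U
    if (pivots[i] != i + 1) det = -det;   // each off-diagonal pivot flips sign
  }
  return det;
}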
+ +#include "Eigen/Core" // from @eigen_archive +#include "Eigen/Eigenvalues" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/denormal.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class EigOp : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit EigOp(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_)); + } + + using TensorShapes = typename Base::TensorShapes; + using InputMatrix = typename Base::InputMatrix; + using InputMatrixMaps = typename Base::InputMatrixMaps; + using InputConstMatrixMap = typename Base::InputConstMatrixMap; + using InputConstMatrixMaps = typename Base::InputConstMatrixMaps; + + using OutputMatrix = typename Base::OutputMatrix; + using OutputMatrixMaps = typename Base::OutputMatrixMaps; + using OutputConstMatrixMap = typename Base::OutputConstMatrixMap; + using OutputConstMatrixMaps = typename Base::OutputConstMatrixMaps; + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64_t n = input_matrix_shapes[0].dim_size(0); + if (compute_v_) { + return TensorShapes({TensorShape({n}), TensorShape({n, n})}); + } else { + return TensorShapes({TensorShape({n})}); + } + } + + void ComputeMatrix(OpKernelContext* context, + const InputConstMatrixMaps& inputs, + OutputMatrixMaps* outputs) final { + const int64_t rows = inputs[0].rows(); + if (rows == 0) { + // If X is an empty matrix (0 rows, 0 col), X * X' == X. + // Therefore, we return X. + return; + } + + // This algorithm relies on denormals, so switch them back on locally. + port::ScopedDontFlushDenormal dont_flush_denormals; + + using EigenSolver = + std::conditional_t::IsComplex, + Eigen::ComplexEigenSolver, + Eigen::EigenSolver>; + EigenSolver eig(inputs[0], /*computeEigenvectors=*/compute_v_); + + OP_REQUIRES( + context, eig.info() == Eigen::Success, + errors::InvalidArgument("Eigen decomposition was not " + "successful. The input might not be valid.")); + + outputs->at(0) = eig.eigenvalues().template cast(); + if (compute_v_) { + outputs->at(1) = eig.eigenvectors(); + } + } + + private: + bool compute_v_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_EIG_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op.h new file mode 100644 index 00000000..26daed1e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_EINSUM_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_EINSUM_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { +namespace functor { + +template +struct StrideFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor input, + const Eigen::DSizes& strides, + typename TTypes::Tensor output) { + output.device(d) = input.stride(strides); + } +}; + +template +struct InflateFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor input, + const Eigen::DSizes& strides, + typename TTypes::Tensor output) { + output.device(d) = input.inflate(strides); + } +}; +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_EINSUM_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op_impl.h new file mode 100644 index 00000000..1d345be6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/einsum_op_impl.h @@ -0,0 +1,673 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_EINSUM_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_EINSUM_OP_IMPL_H_ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_split.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/linalg/einsum_op.h" +#include "tensorflow/core/kernels/matmul_op_impl.h" +#include "tensorflow/core/kernels/reduction_ops_common.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/math/math_util.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/einsum_op_util.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/reduction_ops_common_gpu.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +using ShapeVec = absl::InlinedVector; +using Labels = absl::InlinedVector; +using OperandLabels = absl::InlinedVector; +using LabelCounts = absl::InlinedVector; +using OperandLabelCounts = absl::InlinedVector; +using LabelToDimSizes = absl::InlinedVector; + +struct EinsumHelper { + // Insert new (unnamed) broadcasting labels at the location of ellipsis. + static void InsertBroadcastLabels(int num_bcast_dims, int num_named_labels, + int ellipsis_axis, Labels* labels, + LabelCounts* label_counts) { + labels->erase(labels->begin() + ellipsis_axis); + labels->insert(labels->begin() + ellipsis_axis, num_bcast_dims, 0); + std::iota(labels->begin() + ellipsis_axis, + labels->begin() + ellipsis_axis + num_bcast_dims, + num_named_labels); + // Increment label counts. Since these are new labels, the count is set + // to 1. + label_counts->resize(num_named_labels + num_bcast_dims, 1); + } + + // Record and validate the label to dimension mapping. Must be a named + // (non-broadcasting) label as broadcasting labels don't have a fixed + // dimension. + static absl::Status RecordLabelToDimension( + const int label, const int axis, const Tensor& input, + LabelToDimSizes* label_to_dim_sizes) { + const int64_t input_dim = input.dim_size(axis); + // We know that label_to_dim_sizes has the size to accommodate named labels. + if (label_to_dim_sizes->at(label) != 0 && + label_to_dim_sizes->at(label) != input_dim) { + return errors::InvalidArgument( + "Expected dimension ", label_to_dim_sizes->at(label), " at axis ", + axis, " of the input shaped ", input.shape().DebugString(), + " but got dimension ", input_dim); + } + (*label_to_dim_sizes)[label] = input_dim; + return absl::OkStatus(); + } + + // Validate input dimensions and populate unnamed labels and their label + // counts. 
+ static absl::Status ProcessDimensions( + const OpInputList& inputs, + const absl::InlinedVector& input_has_ellipsis, + const bool output_has_ellipsis, OperandLabels* input_labels, + Labels* output_labels, std::vector* label_types, + OperandLabelCounts* input_label_counts, LabelCounts* output_label_counts, + LabelToDimSizes* label_to_dim_sizes) { + if (inputs.size() != input_labels->size()) { + return errors::InvalidArgument("Expected ", input_labels->size(), + " inputs but got: ", inputs.size()); + } + const int num_inputs = inputs.size(); + + // We infer the number of broadcasting dimensions by taking the maximum rank + // among the broadcasting subshapes of the input. + int max_bcast_dims = 0; + const int num_named_labels = label_types->size(); + label_to_dim_sizes->resize(num_named_labels); + for (int i = 0; i < num_inputs; ++i) { + Labels* labels = &(*input_labels)[i]; + + if (!input_has_ellipsis[i]) { + if (inputs[i].dims() != labels->size()) { + return errors::InvalidArgument("Expected input ", i, " to have rank ", + labels->size(), + " but got: ", inputs[i].dims()); + } + for (int label_idx = 0; label_idx < labels->size(); ++label_idx) { + const int label = (*labels)[label_idx]; + TF_RETURN_IF_ERROR(RecordLabelToDimension(label, label_idx, inputs[i], + label_to_dim_sizes)); + } + continue; + } + + // Input has an ellipsis. + if (inputs[i].dims() + 1 < labels->size()) { + return errors::InvalidArgument( + "Expected input ", i, " to have rank at least ", labels->size() - 1, + " but got: ", inputs[i].dims()); + } + int ellipsis_axis = -1; + const int num_bcast_dims = inputs[i].dims() - labels->size() + 1; + for (int label_idx = 0; label_idx < labels->size(); ++label_idx) { + const int label = (*labels)[label_idx]; + if (label == kEllipsisLabel) { + ellipsis_axis = label_idx; + continue; + } + // Current label is not an ellipsis. + const int axis = + label_idx + (ellipsis_axis == -1 ? 0 : num_bcast_dims - 1); + TF_RETURN_IF_ERROR( + RecordLabelToDimension(label, axis, inputs[i], label_to_dim_sizes)); + } + // Found an ellipsis. Replace 'kEllipsisLabel' with broadcasting + // dimensions. + if (ellipsis_axis != -1) { + InsertBroadcastLabels(num_bcast_dims, num_named_labels, ellipsis_axis, + labels, &input_label_counts->at(i)); + max_bcast_dims = std::max(max_bcast_dims, num_bcast_dims); + } + } + if (!absl::c_linear_search(input_has_ellipsis, true) && + !output_has_ellipsis) { + return absl::OkStatus(); + } + // Insert broadcasting dimensions in the output labels. + auto it = + std::find(output_labels->begin(), output_labels->end(), kEllipsisLabel); + if (it != output_labels->end()) { + const int ellipsis_axis = it - output_labels->begin(); + InsertBroadcastLabels(max_bcast_dims, num_named_labels, ellipsis_axis, + output_labels, output_label_counts); + } else if (max_bcast_dims > 0) { + return errors::InvalidArgument( + "Output contains ", max_bcast_dims, + " broadcasting dimension(s) but no ellipsis " + "(...) was found in the output subscripts."); + } + // Populate EinsumDimensionType for the new broadcasting labels. + label_types->resize(num_named_labels + max_bcast_dims, + EinsumDimensionType::kBroadcasting); + return absl::OkStatus(); + } + + // Permutes the labels according to the given permutation. 
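The core consistency rule enforced by RecordLabelToDimension and ProcessDimensions above is that every named label must bind to a single dimension size across all operands. A toy version of that check, independent of the TensorFlow types (BindLabels and the std::map bookkeeping are invented for this sketch), looks like this:

#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include <vector>

// Returns an error message if `labels`/`dims` conflict with earlier bindings.
// Assumes labels.size() == dims.size().
std::optional<std::string> BindLabels(const std::string& labels,
                                      const std::vector<long long>& dims,
                                      std::map<char, long long>& label_to_dim) {
  for (size_t i = 0; i < labels.size(); ++i) {
    auto [it, inserted] = label_to_dim.emplace(labels[i], dims[i]);
    if (!inserted && it->second != dims[i]) {
      return "label '" + std::string(1, labels[i]) + "' bound to " +
             std::to_string(it->second) + " but operand has " +
             std::to_string(dims[i]);
    }
  }
  return std::nullopt;
}

// Example for "ij,jk->ik": BindLabels("ij", {2, 3}, m) succeeds, after which
// BindLabels("jk", {4, 5}, m) reports that 'j' is bound to 3 but got 4.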
+ static void PermuteLabels(const std::vector& permutation, + Labels* labels) { + Labels permuted_labels(labels->size()); + for (int i = 0; i < labels->size(); ++i) { + permuted_labels[i] = (*labels)[permutation[i]]; + } + labels->swap(permuted_labels); + } + + // Returns a reshaped input Tensor. The underlying buffer is not copied. + static absl::Status CopyFrom(const Tensor& input, const TensorShape& shape, + Tensor* output) { + if (output->CopyFrom(input, shape)) return absl::OkStatus(); + return errors::Internal( + "Encountered error while reshaping a Tensor of shape ", + input.shape().DebugString(), " to shape ", shape.DebugString()); + } + + // Returns whether transposing would be a no-op; whether input has rank < 2 or + // the permutation is the identity permutation. + static bool ShouldTranspose(const TensorShape& input_shape, + const std::vector& permutation) { + if (input_shape.dims() < 2) return false; + for (int i = 0; i < permutation.size(); ++i) { + if (permutation[i] != i) return true; + } + return false; + } + + // Transpose the input given a permutation. Returns a reference to the input + // if transposing is not necessary. + template + static absl::Status TransposeOperand(OpKernelContext* ctx, + const Tensor& input, + const std::vector& permutation, + Tensor* output) { + if (!ShouldTranspose(input.shape(), permutation)) { + return CopyFrom(input, input.shape(), output); + } + TensorShape transposed_shape; + for (int i = 0; i < input.dims(); ++i) { + TF_RETURN_IF_ERROR( + transposed_shape.AddDimWithStatus(input.dim_size(permutation[i]))); + } + // For empty Tensors, just change the shape. E.g. we may need to transpose + // from shape [1, 0, 5] to [5, 1, 0]. + if (input.NumElements() == 0) { + return CopyFrom(input, transposed_shape, output); + } + TF_RETURN_IF_ERROR( + ctx->allocate_temp(DataTypeToEnum::value, transposed_shape, output)); + const Device& device = ctx->eigen_device(); + TF_RETURN_IF_ERROR(DoTranspose(device, input, permutation, output)); + return absl::OkStatus(); + } + + // If there are repeated labels in either the input or output, then this + // strides the input (e.g. iii->i) or inflates it (e.g. i->iii), respectively. + template + static absl::Status StrideOrInflate(OpKernelContext* ctx, const Tensor& input, + const Labels& labels, + const LabelCounts& label_counts, + const bool should_inflate, + Tensor* output) { + // Return early if there are no repeated indices. + if (absl::c_all_of(label_counts, [](int c) { return c <= 1; })) { + return CopyFrom(input, input.shape(), output); + } + // We reshape so that each repeated label is compressed to one dimension. + // E.g. For iiij -> ij, The shape [3, 3, 3, 5] would be compressed to [27, + // 5]. Striding appropriately (in this case with strides 14 (=1+3+9) and 1) + // recovers the generalized diagonal of shape [3, 5]. + ShapeVec reshape; + ShapeVec strides; + // Strided and inflated shapes correspond to input and output shapes, + // respectively, should_inflate is true (vice-versa if should_inflate is + // false). E.g. they are [3, 5] and [3, 3, 3, 5] in the above example. + ShapeVec strided_shape; + ShapeVec inflated_shape; + for (int label : labels) { + const int count = label_counts[label]; + const int current_axis = + should_inflate ? 
strided_shape.size() : inflated_shape.size(); + const int64_t dim = input.dim_size(current_axis); + strided_shape.push_back(dim); + inflated_shape.insert(inflated_shape.end(), count, dim); + const int64_t reshape_dim = MathUtil::IPow(dim, count); + reshape.push_back(reshape_dim); + // While taking the d-diagonal in a rank k Tensor, we take d + // equally-spaced elements including the first and last element. Then, (k + // - 1) * stride = d^k - 1, or, stride = (d^k - 1)/(d - 1). + const int64_t stride = + (dim > 1 && count > 1) ? (reshape_dim - 1) / (dim - 1) : 1; + strides.push_back(stride); + } + + TensorShape output_shape = + TensorShape(should_inflate ? inflated_shape : strided_shape); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(DataTypeToEnum::value, output_shape, output)); + const Device& device = ctx->eigen_device(); + switch (reshape.size()) { +#define NDIMS_CASE(N) \ + case N: { \ + if (should_inflate) { \ + auto output_map = output->shaped(reshape); \ + auto input_map = input.shaped(strided_shape); \ + functor::InflateFunctor()( \ + device, input_map, TensorShape(strides).AsEigenDSizes(), \ + output_map); \ + } else { \ + auto input_map = input.shaped(reshape); \ + auto output_map = output->shaped(strided_shape); \ + functor::StrideFunctor()( \ + device, input_map, TensorShape(strides).AsEigenDSizes(), \ + output_map); \ + } \ + } break; + NDIMS_CASE(1); + NDIMS_CASE(2); + NDIMS_CASE(3); + NDIMS_CASE(4); + NDIMS_CASE(5); + NDIMS_CASE(6); + default: + return errors::Unimplemented( + "Unsupported rank: ", reshape.size(), + " while handling repeated indices. Up to rank 6 is supported."); +#undef NDIMS_CASE + } + return absl::OkStatus(); + } + + // Returns true if the input dimensions are already sorted in the order + // [batch, contract, free, reduce]. Used to implement an optimization to avoid + // an extra transpose and instead uses (adj_x and adj_y) in BatchMatMul. + static bool ShouldSwapFreeAndContract( + const Labels& labels, + const std::vector& label_types) { + // Check that ordering is according to dimension type, with the role of + // free and contract dimensions swapped. + absl::InlinedVector remap = {0, 1, 3, 2, 4}; + for (int i = 0; i + 1 < labels.size(); ++i) { + const int dimtype_a = remap[label_types[labels[i]]]; + const int dimtype_b = remap[label_types[labels[i + 1]]]; + if (dimtype_a > dimtype_b || + (dimtype_a == dimtype_b && labels[i] > labels[i + 1])) { + return false; + } + } + return true; + } + + template + static absl::Status ReduceOperand( + OpKernelContext* ctx, const Tensor& input, + const std::vector& label_types, + const LabelCounts& label_counts, Labels* labels, Labels* free_labels, + bool* swap_free_and_contract, Tensor* output) { + // Find the permutation to transpose the input dimensions in the order of + // EinsumDimensionType; i.e. batch, free, contract and reduce dimensions. + // This makes it more convenient to invoke Reduce/Contract operations. + std::vector permutation(input.dims()); + absl::c_iota(permutation, 0); + Tensor input_transposed; + // Check if we can avoid the transpose. We need to flip the adj_x (or adj_y) + // flag during BatchMatMul. This is an extra optimization not necessary for + // correctness. 
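Working the striding formula above through the iiij -> ij example: the three repeated axes of size d = 3 compress into reshape_dim = 3^3 = 27, the last of the d equally spaced diagonal elements sits at index (d - 1) * stride = d^k - 1, so stride = (27 - 1) / (3 - 1) = 13 = 1 + 3 + 9, and the generalized diagonal lives at offsets 0, 13, 26 of the compressed axis. The sketch below extracts such a diagonal from a flat buffer; it is a standalone illustration, not the Eigen-based functor, and StridedDiagonal is an invented name.

#include <cstdint>
#include <vector>

// Extracts the generalized diagonal of an axis compressed from `count`
// repeated axes of size `dim`; `inner` is the product of the trailing dims
// (5 in the [3, 3, 3, 5] example above).
std::vector<float> StridedDiagonal(const std::vector<float>& input,
                                   int64_t dim, int64_t count, int64_t inner) {
  int64_t reshape_dim = 1;
  for (int64_t i = 0; i < count; ++i) reshape_dim *= dim;  // dim^count
  const int64_t stride =
      (dim > 1 && count > 1) ? (reshape_dim - 1) / (dim - 1) : 1;
  std::vector<float> out(dim * inner);
  for (int64_t d = 0; d < dim; ++d)
    for (int64_t j = 0; j < inner; ++j)
      out[d * inner + j] = input[(d * stride) * inner + j];
  return out;
}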
+ if (ShouldSwapFreeAndContract(*labels, label_types)) { + *swap_free_and_contract = true; + } else { + absl::c_sort(permutation, [&](int i, int j) { + int label_i = (*labels)[i]; + int label_j = (*labels)[j]; + return std::tie(label_types[label_i], label_i) < + std::tie(label_types[label_j], label_j); + }); + } + // Transpose the input so that EinsumDimensionTypes are in order. + TF_RETURN_IF_ERROR(TransposeOperand(ctx, input, permutation, + &input_transposed)); + PermuteLabels(permutation, labels); + + // Take the generalized diagonal for dimensions with repeated axis labels. + Tensor input_deduped; + labels->erase(std::unique(labels->begin(), labels->end()), labels->end()); + TF_RETURN_IF_ERROR( + StrideOrInflate(ctx, input_transposed, *labels, label_counts, + false /* should_inflate */, &input_deduped)); + + // Reshape denotes the rank-5 shape [broadcast, batch, free, contract, + // reduce] where we've compacted the dimensions of each EinsumDimensionType. + absl::InlinedVector reshape(5, 1); + // The output shape is [batch shape] + [free size, contract size] + // That is, the batch shape is preserved (for broadcasting while + // contracting) while the free dims and contract dims are compressed to one + // dimension each. + TensorShape output_shape; + for (int label_idx = 0; label_idx < labels->size(); ++label_idx) { + const int label = labels->at(label_idx); + int64_t dim = input_deduped.dim_size(label_idx); + if (label_types[label] == EinsumDimensionType::kBroadcasting || + label_types[label] == EinsumDimensionType::kBatch) { + TF_RETURN_IF_ERROR(output_shape.AddDimWithStatus(dim)); + } else if (label_types[label] == EinsumDimensionType::kFree) { + free_labels->push_back(label); + } + reshape[label_types[label]] *= dim; + } + if (*swap_free_and_contract) + std::swap(reshape[EinsumDimensionType::kFree], + reshape[EinsumDimensionType::kContract]); + TF_RETURN_IF_ERROR( + output_shape.AddDimWithStatus(reshape[EinsumDimensionType::kFree])); + TF_RETURN_IF_ERROR( + output_shape.AddDimWithStatus(reshape[EinsumDimensionType::kContract])); + + if (reshape[EinsumDimensionType::kReduce] == + 1) { // No need to actually reduce. + return CopyFrom(input_deduped, output_shape, output); + } + TF_RETURN_IF_ERROR( + ctx->allocate_temp(DataTypeToEnum::value, output_shape, output)); + using Reducer = Eigen::internal::SumReducer; + using Index = typename TTypes::Tensor::Index; + // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. + const int64_t output_size = reshape[kBroadcasting] * reshape[kBatch] * + reshape[kFree] * reshape[kContract]; + functor::ReduceFunctor::Reduce( + ctx, output->shaped({output_size}), + const_cast(input_deduped) + .shaped({output_size, reshape[kReduce]}), + Eigen::array({1}), Reducer()); + return absl::OkStatus(); + } + + // Reshapes a Tensor of shape [b0,b1...bk,N,M] to [prod(b0,b1...bk),N,M]. + static absl::Status ReshapeToRank3(const Tensor& input, int batch_size, + Tensor* output) { + const int rank = input.dims(); + TensorShape output_shape = {batch_size, input.dim_size(rank - 2), + input.dim_size(rank - 1)}; + return CopyFrom(input, output_shape, output); + } + + // Contracts the inputs along the last axis (or the second last if the + // corresponding value of swap_free_and_contract is true). The batch + // dimensions are broadcast to the output shape. + // TODO(anudhyan): BatchMatMul might devolve into a component-wise + // multiplication when the matrix shape is [1,1]; in this case BatchMatMul + // functor would be very inefficient. 
The functor should detect if this is the + // case and perform componentwise multiplication functor instead. + template + static absl::Status ContractOperands( + OpKernelContext* ctx, absl::Span inputs, + absl::Span swap_free_and_contract, Tensor* output) { + if (inputs.size() == 1) + return CopyFrom(inputs[0], inputs[0].shape(), output); + MatMulBCast bcast(inputs[0].shape().dim_sizes(), + inputs[1].shape().dim_sizes()); + if (!bcast.IsValid()) { + return errors::InvalidArgument( + "Invalid broadcasting dimensions: ", inputs[0].shape().DebugString(), + " vs. ", inputs[1].shape().DebugString()); + } + Tensor lhs; + TF_RETURN_IF_ERROR(ReshapeToRank3(inputs[0], bcast.x_batch_size(), &lhs)); + Tensor rhs; + TF_RETURN_IF_ERROR(ReshapeToRank3(inputs[1], bcast.y_batch_size(), &rhs)); + TensorShape output_shape = bcast.output_batch_shape(); + for (int i = 0; i < inputs.size(); ++i) { + const int64_t free_axis = + inputs[i].dims() - (swap_free_and_contract[i] ? 1 : 2); + TF_RETURN_IF_ERROR( + output_shape.AddDimWithStatus(inputs[i].dim_size(free_axis))); + } + bool trans_x = swap_free_and_contract[0]; + bool trans_y = !swap_free_and_contract[1]; + TF_RETURN_IF_ERROR( + ctx->allocate_temp(DataTypeToEnum::value, output_shape, output)); + if (lhs.NumElements() == 0 || rhs.NumElements() == 0) { + functor::SetZeroFunctor set_zero; + set_zero(ctx->eigen_device(), output->flat()); + return absl::OkStatus(); + } + Tensor output_reshaped; + TF_RETURN_IF_ERROR( + ReshapeToRank3(*output, bcast.output_batch_size(), &output_reshaped)); + LaunchBatchMatMul::Launch(ctx, lhs, rhs, /*adj_x=*/false, + /*adj_y=*/false, trans_x, trans_y, + /*grad_x=*/false, /*grad_y=*/false, + bcast, &output_reshaped); + return absl::OkStatus(); + } +}; + +template +class EinsumOp : public OpKernel { + public: + explicit EinsumOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("equation", &equation_)); + OP_REQUIRES_OK( + c, ParseEinsumEquation(equation_, &input_labels_, &output_labels_, + &label_types_, &input_label_counts_, + &output_label_counts_, &input_has_ellipsis_, + &output_has_ellipsis_)); + } + + void Compute(OpKernelContext* ctx) override { + OpInputList inputs; + OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &inputs)); + + OperandLabels input_labels(input_labels_); + Labels output_labels(output_labels_); + std::vector label_types(label_types_); + OperandLabelCounts input_label_counts(input_label_counts_); + LabelCounts output_label_counts(output_label_counts_); + LabelToDimSizes label_to_dim_sizes; + + OP_REQUIRES_OK(ctx, EinsumHelper::ProcessDimensions( + inputs, input_has_ellipsis_, output_has_ellipsis_, + &input_labels, &output_labels, &label_types, + &input_label_counts, &output_label_counts, + &label_to_dim_sizes)); + + // The reduction phase (a) sums across reduction dimensions, (b) takes + // generalized diagonals, and (c) reshapes it into shape + // [(broadcasting) batch shape] + [F,C] + // where F and C denote the total (compacted) size of free and contract + // dimensions, respectively. 
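// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// The reduction phase described above turns e.g. the equation "abc,cd->ad"
// into (a) summing out the reduce label 'b' from the first operand and then
// (b) an ordinary contraction over the contract label 'c'. A minimal eager
// Eigen::Tensor version of that two-step decomposition, assuming Eigen's
// unsupported Tensor module is available (it is already used by these
// headers) and that the inputs are small enough to evaluate eagerly:
#include <unsupported/Eigen/CXX11/Tensor>

inline Eigen::Tensor<float, 2> EinsumAbcCdToAd(
    const Eigen::Tensor<float, 3>& lhs,    // indexed [a, b, c]
    const Eigen::Tensor<float, 2>& rhs) {  // indexed [c, d]
  // (a) Reduce: sum out axis 'b' -> shape [a, c].
  const Eigen::array<int, 1> reduce_b = {1};
  const Eigen::Tensor<float, 2> lhs_reduced = lhs.sum(reduce_b);
  // (b) Contract: pair axis 1 of lhs_reduced with axis 0 of rhs -> [a, d].
  const Eigen::array<Eigen::IndexPair<int>, 1> contract_c = {
      Eigen::IndexPair<int>(1, 0)};
  return lhs_reduced.contract(rhs, contract_c);
}
// --------------------------------------------------------------------------------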
+ const int num_inputs = inputs.size(); + OperandLabels free_labels(num_inputs); + absl::InlinedVector inputs_reduced(num_inputs); + absl::InlinedVector swap_free_and_contract(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + OP_REQUIRES_OK(ctx, + EinsumHelper::ReduceOperand( + ctx, inputs[i], label_types, input_label_counts[i], + &input_labels[i], &free_labels[i], + &swap_free_and_contract[i], &inputs_reduced[i])); + } + + // After reduction, the inputs should be reshaped to Tensors suitable for + // contraction. If num_inputs is 1, the reduced input is simply forwarded to + // the output. + Tensor contraction_output_reshaped; + OP_REQUIRES_OK(ctx, EinsumHelper::ContractOperands( + ctx, inputs_reduced, swap_free_and_contract, + &contraction_output_reshaped)); + + // Copy the batch labels from the contraction output. Recover the batch + // shape, which may have been broadcasted. + TensorShape result_shape = contraction_output_reshaped.shape(); + result_shape.RemoveLastDims(2); + + int num_labels = label_types.size(); + Labels result_labels; + // All batch dimensions should be present in the contracted result. First + // the broadcasting dimensions, then the named batch dimensions. + for (int label = 0; label < num_labels; ++label) { + if (label_types[label] == EinsumDimensionType::kBroadcasting) + result_labels.push_back(label); + } + for (int label = 0; label < num_labels; ++label) { + if (label_types[label] == EinsumDimensionType::kBatch) + result_labels.push_back(label); + } + for (int i = 0; i < num_inputs; ++i) { + for (int label : free_labels[i]) { + result_labels.push_back(label); + OP_REQUIRES_OK( + ctx, result_shape.AddDimWithStatus(label_to_dim_sizes[label])); + } + } + + // Reshape the contraction (or reduction) result to its expanded shape: + // [(broadcasted) batch shape] + [free shape 0] + [free shape 1]. + Tensor contraction_output; + OP_REQUIRES_OK( + ctx, EinsumHelper::CopyFrom(contraction_output_reshaped, result_shape, + &contraction_output)); + + // Inflate the output if necessary. (E.g. for the equation 'i->iii' which + // may arise while computing gradient of a regular Einsum). + // TODO(anudhyan): It's possible that Eigen's contract and inflate can be + // chained here to avoid materializing an intermediate. + Tensor output_inflated; + OP_REQUIRES_OK( + ctx, EinsumHelper::StrideOrInflate( + ctx, contraction_output, result_labels, output_label_counts, + true /* should_inflate */, &output_inflated)); + if (output_inflated.dims() > contraction_output.dims()) { + // We inflated the output. Modify result labels accordingly. + Labels inflated_labels; + for (int label : result_labels) { + inflated_labels.insert(inflated_labels.end(), + output_label_counts[label], label); + } + result_labels.swap(inflated_labels); + } + // Find the permutation to map the result labels to the output labels. Note + // that both the result and the final output may have the repeated labels, + // in which case the permutation preserves the left-to-right ordering. + // E.g. if result labels are [0, 0, 1] and output is [0, l, 0] then the + // permutation should be [0, 2, 1]. We also use the fact that repeated + // labels in the result are adjacent to each other. + std::vector output_permutation(output_labels.size()); + std::vector label_to_position(num_labels, -1); + for (int i = 0; i < result_labels.size(); ++i) { + // Remember the position of only the leftmost result label. 
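// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// The two loops around this point compute the permutation that maps result
// labels to output labels while preserving the left-to-right order of
// repeated labels. Standalone version of the same logic: result labels
// {0, 0, 1} and output labels {0, 1, 0} produce the permutation {0, 2, 1}.
#include <vector>

inline std::vector<int> RepeatedLabelPermutation(
    const std::vector<int>& result_labels,
    const std::vector<int>& output_labels, int num_labels) {
  std::vector<int> label_to_position(num_labels, -1);
  for (int i = 0; i < static_cast<int>(result_labels.size()); ++i) {
    // Remember only the leftmost occurrence of each label.
    if (label_to_position[result_labels[i]] == -1) {
      label_to_position[result_labels[i]] = i;
    }
  }
  std::vector<int> permutation(output_labels.size());
  for (int i = 0; i < static_cast<int>(output_labels.size()); ++i) {
    permutation[i] = label_to_position[output_labels[i]];
    // Repeated labels are adjacent in the result, so the next occurrence of
    // this label maps to the following position.
    label_to_position[output_labels[i]] += 1;
  }
  return permutation;
}
// --------------------------------------------------------------------------------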
+ if (label_to_position[result_labels[i]] == -1) { + label_to_position[result_labels[i]] = i; + } + } + for (int i = 0; i < output_labels.size(); ++i) { + output_permutation[i] = label_to_position[output_labels[i]]; + // We have found the leftmost occurrence. The next one would be adjacent. + label_to_position[output_labels[i]] += 1; + } + Tensor output; + OP_REQUIRES_OK(ctx, EinsumHelper::TransposeOperand( + ctx, output_inflated, output_permutation, &output)); + ctx->set_output(0, std::move(output)); + } + + string TraceString(const OpKernelContext& ctx, bool verbose) const override { + string op = profiler::TraceMeOp(name_view(), type_string_view()); + string equation = strings::StrCat("(", equation_, ")"); + if (verbose) { + string shape = ShapeTraceString(ctx); + if (!shape.empty()) { + return tsl::profiler::TraceMeEncode( + std::move(op), {{"equation", equation}, {"shape", shape}}); + } + } + return tsl::profiler::TraceMeEncode(std::move(op), + {{"equation", equation}}); + } + + private: + string equation_; + OperandLabels input_labels_; + Labels output_labels_; + std::vector label_types_; + OperandLabelCounts input_label_counts_; + LabelCounts output_label_counts_; + absl::InlinedVector input_has_ellipsis_; + bool output_has_ellipsis_ = false; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T, N) \ + template <> \ + void StrideFunctor::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + const Eigen::DSizes& strides, \ + typename TTypes::Tensor output); \ + extern template struct StrideFunctor; \ + template <> \ + void InflateFunctor::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + const Eigen::DSizes& strides, \ + typename TTypes::Tensor output); \ + extern template struct InflateFunctor; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, 1); \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); \ + DECLARE_GPU_SPEC(T, 6); + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); +// TODO(rocm): Enable once complex types are supported. +#if GOOGLE_CUDA +DECLARE_GPU_SPECS(complex64); +DECLARE_GPU_SPECS(complex128); +#endif +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPECS +} // namespace functor +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_EINSUM_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eye_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eye_functor.h new file mode 100644 index 00000000..c77372f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/eye_functor.h @@ -0,0 +1,32 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_EYE_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_EYE_FUNCTOR_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct EyeFunctor { + void operator()(const Device& device, + typename TTypes::Tensor matrix_batch); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/linalg_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/linalg_ops_common.h new file mode 100644 index 00000000..b4b98921 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/linalg_ops_common.h @@ -0,0 +1,224 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_LINALG_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_LINALG_OPS_COMMON_H_ + +// Classes to support linear algebra functionality, similar to the numpy.linalg +// module. Supports batch computation on several matrices at once, sharding the +// computations across different threads if necessary. +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +// Base class for linear algebra operators. +template +class LinearAlgebraOp : public OpKernel { + public: + explicit LinearAlgebraOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override; + + protected: + using TensorShapes = absl::InlinedVector; + // Returns the number of leading inputs that are to be treated as matrix + // inputs. By default this is all the inputs. Derived classes can override + // this to tell the base class to ignore one or more trailing inputs. + virtual int NumMatrixInputs(const OpKernelContext* context) const { + return context->num_inputs(); + } + + // Returns true if the number of inputs and their shapes are as expected. + // Many ops take a single square input matrix, so we provide that as a default + // implementation for convenience. 
+ virtual void ValidateInputMatrixShapes( + OpKernelContext* context, const TensorShapes& input_matrix_shapes) const { + ValidateSingleSquareMatrix(context, input_matrix_shapes); + } + + // Convenience validators for common cases: + // + // Validate op taking a single matrix A. + static void ValidateSingleMatrix(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + // Validate op taking a single square matrix A. + static void ValidateSingleSquareMatrix( + OpKernelContext* context, const TensorShapes& input_matrix_shapes); + // Validate op taking two matrices A and B that have the same number of rows. + static void ValidateSolver(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + // Validate op taking two matrices A and B that have the same number of rows + // and A is square. + static void ValidateSquareSolver(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + + // Returns the output shapes of each individual matrix operation. Output + // matrices shapes must be rank 0, 1, or 2. Scalar outputs are rank 0. + // + // The derived class may return a number of shapes (N) less than + // context->num_outputs() (M) to indicate that a only leading subset of + // the outputs will be populated. In this case, a dummy scalar tensor with + // value zero will be return for the last M-N outputs. + // + // For many ops, the output dimensions are the same as the input dimensions, + // so we provide that as a default implementation for convenience. + virtual TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const { + return input_matrix_shapes; + } + + // Returns the cost per matrix operation. This is used to determine the + // number of threads to use for parallelizing calls to ComputeMatrix in + // batch mode. Cost per unit is assumed to be roughly 1ns, based on comments + // in core/util/work_sharder.cc. Many linear algebra ops take roughly max(m,n) + // * min(m,n)^2, where the first input matrix is m-by-n. We provide that as a + // default implementation for convenience. + virtual int64_t GetCostPerUnit( + const TensorShapes& input_matrix_shapes) const { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double cost = std::max(m, n) * std::min(m, n) * std::min(m, n); + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } + + // Returns true if it is safe to forward (alias) input to output buffer + // and expect the kernel to perform the computation inplace. 
+ virtual bool EnableInputForwarding() const { return true; } + + using InputMatrix = Eigen::Matrix; + using InputConstMatrixMap = Eigen::Map; + using InputMatrixMap = Eigen::Map; + using InputConstVectorMap = + Eigen::Map>; + using InputConstMatrixMaps = gtl::InlinedVector; + using InputMatrixMaps = gtl::InlinedVector; + using InputRealScalar = typename Eigen::NumTraits::Real; + + using OutputMatrix = Eigen::Matrix; + using OutputConstMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + using OutputConstVectorMap = + Eigen::Map>; + using OutputConstMatrixMaps = gtl::InlinedVector; + using OutputMatrixMaps = gtl::InlinedVector; + using OutputRealScalar = typename Eigen::NumTraits::Real; + + // backward compatibility + using Scalar = OutputScalar; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + using ConstVectorMap = + Eigen::Map>; + using ConstMatrixMaps = gtl::InlinedVector; + using MatrixMaps = gtl::InlinedVector; + using RealScalar = typename Eigen::NumTraits::Real; + + // Performs a single matrix computation given input matrices, and + // stores the result in outputs. For batch operations, this will be called + // repeatedly for a single call to Compute() when multiple matrices exist in + // input Tensors with rank > 2. In this case the calls to ComputeMatrix are + // parallelized. The number of threads used is determined by a cost model from + // the value returned by GetCostPerUnit(). + virtual void ComputeMatrix(OpKernelContext* context, + const InputConstMatrixMaps& inputs, + OutputMatrixMaps* outputs) = 0; + + private: + using TensorInputs = absl::InlinedVector; + using TensorOutputs = absl::InlinedVector; + // This function maps 2-d slices (matrices) of the input and output tensors + // using Eigen::Map and calls ComputeMatrix implemented in terms of the + // Eigen::MatrixBase API by the derived class. + // + // The 'matrix_index' parameter specifies the index of the matrix to be used + // from each input tensor, and the index of the matrix to be written to each + // output tensor. The input matrices are in row major order, and located at + // the memory addresses + // inputs[i].flat().data() + + // matrix_index * input_matrix_shapes[i].num_elements() + // for i in 0...inputs.size()-1. + // The output matrices are in row major order, and located at the memory + // address + // outputs[i]->flat().data() + + // matrix_index * output_matrix_shapes[i].num_elements(). + // for i in 0...outputs.size()-1. + // + void ComputeTensorSlice(OpKernelContext* context, int64_t matrix_index, + const TensorInputs& inputs, + const TensorShapes& input_matrix_shapes, + const TensorOutputs& outputs, + const TensorShapes& output_matrix_shapes); + + void AnalyzeInputs(OpKernelContext* context, TensorInputs* inputs, + TensorShapes* input_matrix_shapes, + TensorShape* batch_shape); + + void PrepareOutputs(OpKernelContext* context, + const TensorShapes& input_matrix_shapes, + const TensorShape& batch_shape, TensorOutputs* outputs, + TensorShapes* output_matrix_shapes); +}; + +// Declare LinearAlgebraOp, which is explicitly instantiated in +// linalg_ops_common.cc for half,float, double, complex64, and complex128. 
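// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// A hypothetical kernel showing how this base class is meant to be used:
// derived ops implement only ComputeMatrix() (plus optional validators and
// cost model), while batching, sharding and output allocation are inherited.
// The class name below is illustrative and does not exist in TensorFlow; it
// assumes the vendored headers are on the include path.
#include "tensorflow/core/kernels/linalg/linalg_ops_common.h"

namespace tensorflow {

template <class Scalar>
class HypotheticalMatrixSquareOp : public LinearAlgebraOp<Scalar> {
 public:
  typedef LinearAlgebraOp<Scalar> Base;
  using TensorShapes = typename Base::TensorShapes;
  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
  using MatrixMaps = typename Base::MatrixMaps;

  explicit HypotheticalMatrixSquareOp(OpKernelConstruction* context)
      : Base(context) {}

  // Roughly one dense n x n matmul per input matrix, saturated at kint64max
  // like the default cost model above.
  int64_t GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
    const double n = static_cast<double>(input_matrix_shapes[0].dim_size(0));
    const double cost = 2.0 * n * n * n;
    return cost >= static_cast<double>(kint64max) ? kint64max
                                                  : static_cast<int64_t>(cost);
  }

  // Computes A * A for each (square) matrix in the batch; the default
  // ValidateSingleSquareMatrix() and GetOutputMatrixShapes() already fit.
  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                     MatrixMaps* outputs) final {
    outputs->at(0).noalias() = inputs[0] * inputs[0];
  }
};

}  // namespace tensorflow
// --------------------------------------------------------------------------------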
+extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; + +} // namespace tensorflow + +#define INHERIT_LINALG_TYPEDEFS(Scalar) \ + typedef LinearAlgebraOp Base; \ + using RealScalar = typename Eigen::NumTraits::Real; \ + using Matrix = typename Base::Matrix; \ + using MatrixMap = typename Base::MatrixMap; \ + using MatrixMaps = typename Base::MatrixMaps; \ + using ConstMatrixMap = typename Base::ConstMatrixMap; \ + using ConstMatrixMaps = typename Base::ConstMatrixMaps; \ + using ConstVectorMap = typename Base::ConstVectorMap; \ + using TensorShapes = typename Base::TensorShapes; + +#define REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar) \ + REGISTER_KERNEL_BUILDER( \ + Name(OpName).Device(DEVICE_CPU).TypeConstraint("T"), OpClass) + +#define REGISTER_LINALG_OP_GPU(OpName, OpClass, Scalar) \ + REGISTER_KERNEL_BUILDER( \ + Name(OpName).Device(DEVICE_GPU).TypeConstraint("T"), OpClass) + +// Deprecated, use one of the device-specific macros above. +#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \ + REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar) + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_LINALG_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_band_part_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_band_part_op.h new file mode 100644 index 00000000..2f68eba6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_band_part_op.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_BAND_PART_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_BAND_PART_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct MatrixBandPartFunctor { + void operator()(OpKernelContext* context, const Device& device, + int num_upper_diags, int num_lower_diags, + typename TTypes::ConstTensor input, + typename TTypes::Tensor output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_BAND_PART_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_diag_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_diag_op.h new file mode 100644 index 00000000..01c875ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_diag_op.h @@ -0,0 +1,74 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_DIAG_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_DIAG_OP_H_ + +// Generator definition for MatrixDiagOp, must be compilable by nvcc. + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +// Reads the diagonal packing alignment. +void ReadAlignment(OpKernelConstruction* context, + bool* left_align_superdiagonal, + bool* left_align_subdiagonal); + +// Calculates diagonal length and content offset (from aligning) of a diagonal. +// Returns a pair of integers {diag_len, content_offset}: +// - diag_len: The length of the diag_index-th diagonal. +// - content_offset: Each diagonal is stored as a row in the compact format. +// If the diagonal is shorter than max_diag_len, its content is aligned +// either to the left or right. content_offset is the index in the row +// where the first element of the diag-index-th diagonal is stored. It is +// always zero when the diagonal is left-aligned. +std::pair ComputeDiagLenAndContentOffset( + int diag_index, int max_diag_len, int num_rows, int num_cols, + bool left_align_superdiagonal, bool left_align_subdiagonal); + +template +struct MatrixDiagPart { + EIGEN_ALWAYS_INLINE static void Compute( + OpKernelContext* context, const Device& device, + typename TTypes::ConstTensor& input, + typename TTypes::Tensor& output, const Eigen::Index lower_diag_index, + const Eigen::Index upper_diag_index, const Eigen::Index max_diag_len, + const T padding_value, const bool left_align_superdiagonal, + const bool left_align_subdiagonal); +}; + +template +struct MatrixDiag { + EIGEN_ALWAYS_INLINE static void Compute( + OpKernelContext* context, const Device& device, + typename TTypes::ConstTensor& diag, + typename TTypes::Tensor& output, + const Eigen::Index lower_diag_index, const Eigen::Index upper_diag_index, + const Eigen::Index max_diag_len, const T padding_value, + const bool left_align_superdiagonal, const bool left_align_subdiagonal); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_DIAG_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_set_diag_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_set_diag_op.h new file mode 100644 index 00000000..449a3607 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_set_diag_op.h @@ -0,0 +1,42 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SET_DIAG_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SET_DIAG_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct MatrixSetDiag { + static void Compute(OpKernelContext* context, const Device& device, + typename TTypes::ConstTensor& input, + typename TTypes::ConstTensor& diag, + typename TTypes::Tensor& output, + const Eigen::Index lower_diag_index, + const Eigen::Index upper_diag_index, + const Eigen::Index max_diag_len, + const bool left_align_superdiagonal, + const bool left_align_subdiagonal); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SET_DIAG_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_solve_ls_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_solve_ls_op_impl.h new file mode 100644 index 00000000..c75c494e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_solve_ls_op_impl.h @@ -0,0 +1,166 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SOLVE_LS_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SOLVE_LS_OP_IMPL_H_ + +// See docs in ../ops/linalg_ops.cc. 
+ +#include "Eigen/Cholesky" // from @eigen_archive +#include "Eigen/Core" // from @eigen_archive +#include "Eigen/QR" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class MatrixSolveLsOp : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit MatrixSolveLsOp(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("fast", &fast_)); + } + + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + // Tell the base class to ignore the regularization parameter + // in context->input(2). + int NumMatrixInputs(const OpKernelContext* context) const final { return 2; } + + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSolver(context, input_matrix_shapes); + } + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1), + input_matrix_shapes[1].dim_size(1)})}); + } + + int64_t GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double num_rhss = static_cast(input_matrix_shapes[1].dim_size(1)); + double cost = std::max(m, n) * std::min(m, n) * (std::min(m, n) + num_rhss); + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } + + bool EnableInputForwarding() const final { return false; } + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& matrix = inputs[0]; + const ConstMatrixMap& rhs = inputs[1]; + const auto& l2_regularizer_in = context->input(2); + OP_REQUIRES( + context, TensorShapeUtils::IsScalar(l2_regularizer_in.shape()), + errors::InvalidArgument("l2_regularizer must be scalar, got shape ", + l2_regularizer_in.shape().DebugString())); + const double l2_regularizer = l2_regularizer_in.scalar()(); + OP_REQUIRES(context, l2_regularizer >= 0, + errors::InvalidArgument("l2_regularizer must be >= 0.")); + + const int64_t rows = matrix.rows(); + const int64_t cols = matrix.cols(); + if (rows == 0 || cols == 0 || rhs.rows() == 0 || rhs.cols() == 0) { + // The result is the empty matrix. + return; + } + if (fast_) { + // The fast branch assumes that matrix is not rank deficient and + // not too ill-conditioned. Specifically, the reciprocal condition number + // should be greater than the square root of the machine precision, i.e. + // 1 / cond(matrix) > sqrt(std::numeric_limits::epsilon()). + // This branch solves over- or underdetermined least-squares problems + // via the normal equations and Cholesky decomposition. 
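// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// Standalone Eigen version of the overdetermined fast path implemented just
// below: min ||A*x - b||^2 + l2 * ||x||^2 is solved through the normal
// equations (A^T A + l2 * I) x = A^T b with a Cholesky (LLT) factorization.
// Real code must check llt.info(), as the kernel below does, because the
// normal equations square the condition number of A.
#include <Eigen/Cholesky>
#include <Eigen/Core>

inline Eigen::VectorXd RegularizedNormalEquationsSolve(
    const Eigen::MatrixXd& a, const Eigen::VectorXd& b, double l2) {
  Eigen::MatrixXd gramian = a.transpose() * a;   // A^T A  (cols x cols)
  gramian.diagonal().array() += l2;              // + l2 * I
  const Eigen::LLT<Eigen::MatrixXd> llt(gramian);
  return llt.solve(a.transpose() * b);           // x = (A^T A + l2 I)^{-1} A^T b
}
// --------------------------------------------------------------------------------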
+ if (rows >= cols) { + // Overdetermined case (rows >= cols): Solves the ordinary (possibly + // regularized) least-squares problem + // min || A * X - RHS ||_F^2 + l2_regularizer ||X||_F^2 + // by solving the normal equations + // (A^T * A + l2_regularizer * I) X = A^T RHS + // using Cholesky decomposition. + Matrix gramian(cols, cols); + gramian.template triangularView() = + matrix.adjoint() * matrix; + if (l2_regularizer > 0) { + gramian += + (Scalar(l2_regularizer) * Matrix::Ones(cols, 1)).asDiagonal(); + } + const Eigen::LLT, Eigen::Lower> llt(gramian); + OP_REQUIRES( + context, llt.info() == Eigen::Success, + errors::InvalidArgument("Input matrix was rank deficient or " + "ill-conditioned. Try setting fast=False " + "or provide a larger l2_regularizer > 0.")); + outputs->at(0).noalias() = matrix.adjoint() * rhs; + llt.solveInPlace(outputs->at(0)); + } else { + // Underdetermined case (rows < cols): Solves the minimum-norm problem + // min ||X||_F^2 s.t. A*X = RHS + // by solving the normal equations of the second kind + // (A * A^T + l2_regularizer * I) Z = RHS, X = A^T * Z + // using Cholesky decomposition. + Matrix gramian(rows, rows); + gramian.template triangularView() = + matrix * matrix.adjoint(); + if (l2_regularizer > 0) { + gramian += + (Scalar(l2_regularizer) * Matrix::Ones(rows, 1)).asDiagonal(); + } + const Eigen::LLT, Eigen::Lower> llt(gramian); + OP_REQUIRES( + context, llt.info() == Eigen::Success, + errors::InvalidArgument("Input matrix was rank deficient or " + "ill-conditioned. Try setting fast=False " + "or provide an l2_regularizer > 0.")); + outputs->at(0).noalias() = matrix.adjoint() * llt.solve(rhs); + } + } else { + // Use complete orthogonal decomposition which is backwards stable and + // will compute the minimum-norm solution for rank-deficient matrices. + // This is 6-7 times slower than the fast path. + // + // TODO(rmlarsen): The implementation of + // Eigen::CompleteOrthogonalDecomposition is not blocked, so for + // matrices that do not fit in cache, it is significantly slower than + // the equivalent blocked LAPACK routine xGELSY (e.g. Eigen is ~3x + // slower for 4k x 4k matrices). + // See http://www.netlib.org/lapack/lawnspdf/lawn114.pdf + outputs->at(0) = matrix.completeOrthogonalDecomposition().solve(rhs); + } + } + + private: + bool fast_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_SOLVE_LS_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_impl.h new file mode 100644 index 00000000..8e524347 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_impl.h @@ -0,0 +1,416 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/linalg_ops.cc. 
+// +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_TRIANGULAR_SOLVE_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_TRIANGULAR_SOLVE_OP_IMPL_H_ + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/matmul_bcast.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_solvers.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +se::DeviceMemory AsDeviceMemory(const Scalar* gpu_memory) { + se::DeviceMemoryBase wrapped(const_cast(gpu_memory)); + se::DeviceMemory typed(wrapped); + return typed; +} + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Sequential batch matrix triangular solve kernel that calls Eigen's +// matrix triangular solve. +template +struct SequentialMatrixTriangularSolveKernel { + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + using RealScalar = typename Eigen::NumTraits::Real; + + static ConstMatrixMap ConstTensorSliceToEigenMatrix(const Tensor& t, + int slice) { + return ConstMatrixMap( + t.flat().data() + slice * t.dim_size(1) * t.dim_size(2), + t.dim_size(1), t.dim_size(2)); + } + + static MatrixMap TensorSliceToEigenMatrix(Tensor* t, int slice) { + return MatrixMap( + t->flat().data() + slice * t->dim_size(1) * t->dim_size(2), + t->dim_size(1), t->dim_size(2)); + } + + static void Run(const Tensor& in_x, const Tensor& in_y, bool lower, + bool adjoint, const MatMulBCast& bcast, Tensor* out, + int start, int limit) { + const bool should_bcast = bcast.IsBroadcastingRequired(); + const auto& x_batch_indices = bcast.x_batch_indices(); + const auto& y_batch_indices = bcast.y_batch_indices(); + for (int64_t i = start; i < limit; ++i) { + const int64_t x_batch_index = should_bcast ? x_batch_indices[i] : i; + const int64_t y_batch_index = should_bcast ? 
y_batch_indices[i] : i; + auto matrix = ConstTensorSliceToEigenMatrix(in_x, x_batch_index); + auto rhs = ConstTensorSliceToEigenMatrix(in_y, y_batch_index); + auto output = TensorSliceToEigenMatrix(out, i); + if (lower) { + auto triangle = matrix.template triangularView(); + if (adjoint) { + output.noalias() = triangle.adjoint().solve(rhs); + } else { + output.noalias() = triangle.solve(rhs); + } + } else { + auto triangle = matrix.template triangularView(); + if (adjoint) { + output.noalias() = triangle.adjoint().solve(rhs); + } else { + output.noalias() = triangle.solve(rhs); + } + } + } + } +}; + +template +struct LaunchBatchMatrixTriangularSolve; + +template +struct LaunchBatchMatrixTriangularSolve { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adjoint, bool lower, + const MatMulBCast& bcast, Tensor* out) { + // Number of matrix triangular solves i.e. size of the batch. + const int64_t batch_size = bcast.output_batch_size(); + const int64_t cost_per_unit = + in_x.dim_size(1) * in_x.dim_size(1) * in_y.dim_size(2) / 2; + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using RealScalar = typename Eigen::NumTraits::Real; + + Shard(worker_threads.num_threads, worker_threads.workers, batch_size, + cost_per_unit, + [&in_x, &in_y, adjoint, lower, &bcast, out](int start, int limit) { + SequentialMatrixTriangularSolveKernel::Run( + in_x, in_y, lower, adjoint, bcast, out, start, limit); + }); + } +}; + +template +class BaseMatrixTriangularSolveOp : public OpKernel { + public: + explicit BaseMatrixTriangularSolveOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_)); + OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_)); + } + + ~BaseMatrixTriangularSolveOp() override {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + + ValidateInputTensors(ctx, in0, in1); + if (!ctx->status().ok()) { + return; + } + + MatMulBCast bcast(in0.shape().dim_sizes(), in1.shape().dim_sizes()); + OP_REQUIRES( + ctx, bcast.IsValid(), + errors::InvalidArgument( + "In[0] and In[1] must have compatible batch dimensions: ", + in0.shape().DebugString(), " vs. ", in1.shape().DebugString())); + + TensorShape out_shape = bcast.output_batch_shape(); + auto batch_size = bcast.output_batch_size(); + auto d0 = in0.dim_size(in0.dims() - 2); + auto d1 = in0.dim_size(in0.dims() - 1); + Tensor in0_reshaped; + OP_REQUIRES( + ctx, + in0_reshaped.CopyFrom(in0, TensorShape({bcast.x_batch_size(), d0, d1})), + errors::Internal("Failed to reshape In[0] from ", + in0.shape().DebugString())); + auto d2 = in1.dim_size(in1.dims() - 2); + auto d3 = in1.dim_size(in1.dims() - 1); + Tensor in1_reshaped; + OP_REQUIRES( + ctx, + in1_reshaped.CopyFrom(in1, TensorShape({bcast.y_batch_size(), d2, d3})), + errors::Internal("Failed to reshape In[1] from ", + in1.shape().DebugString())); + if (adjoint_) std::swap(d0, d1); + OP_REQUIRES(ctx, d1 == d2, + errors::InvalidArgument( + "In[0] mismatch In[1] shape: ", d1, " vs. 
", d2, ": ", + in0.shape().DebugString(), " ", in1.shape().DebugString(), + " ", lower_, " ", adjoint_)); + OP_REQUIRES_OK(ctx, out_shape.AddDimWithStatus(d0)); + OP_REQUIRES_OK(ctx, out_shape.AddDimWithStatus(d3)); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + if (out->NumElements() == 0) { + return; + } + Tensor out_reshaped; + OP_REQUIRES(ctx, + out_reshaped.CopyFrom(*out, TensorShape({batch_size, d0, d3})), + errors::Internal("Failed to reshape output from ", + out->shape().DebugString())); + LaunchBatchMatrixTriangularSolve::Launch( + ctx, in0_reshaped, in1_reshaped, adjoint_, lower_, bcast, + &out_reshaped); + } + + private: + virtual void ValidateInputTensors(OpKernelContext* ctx, const Tensor& in0, + const Tensor& in1) = 0; + bool lower_; + bool adjoint_; +}; + +template +class MatrixTriangularSolveOp + : public BaseMatrixTriangularSolveOp { + public: + explicit MatrixTriangularSolveOp(OpKernelConstruction* context) + : BaseMatrixTriangularSolveOp(context) {} + + ~MatrixTriangularSolveOp() override {} + + private: + void ValidateInputTensors(OpKernelContext* ctx, const Tensor& in0, + const Tensor& in1) override { + const auto in0_num_dims = in0.dims(); + OP_REQUIRES( + ctx, in0_num_dims >= 2, + errors::InvalidArgument("In[0] ndims must be >= 2: ", in0_num_dims)); + + const auto in1_num_dims = in1.dims(); + OP_REQUIRES( + ctx, in1_num_dims >= 2, + errors::InvalidArgument("In[1] ndims must be >= 2: ", in1_num_dims)); + + const auto in0_last_dim = in0.dim_size(in0_num_dims - 1); + const auto in0_prev_dim = in0.dim_size(in0_num_dims - 2); + OP_REQUIRES(ctx, in0_last_dim == in0_prev_dim, + errors::InvalidArgument( + "In[0] matrices in the last dimensions must be square (", + in0_last_dim, " =/= ", in0_prev_dim, ")")); + } +}; + +#define REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_CPU(TYPE) \ + REGISTER_KERNEL_BUILDER(Name("MatrixTriangularSolve") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + MatrixTriangularSolveOp); \ + REGISTER_KERNEL_BUILDER(Name("BatchMatrixTriangularSolve") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + MatrixTriangularSolveOp); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +struct LaunchBatchMatrixTriangularSolve { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adjoint, bool lower, + const MatMulBCast& bcast, Tensor* out) { + auto* stream = context->op_device_context()->stream(); + + const uint64 m = in_x.dim_size(1); + const uint64 n = out->dim_size(2); + + // Do a memcpy when we don't need to broadcast. 
+ if (!bcast.IsBroadcastingRequired() || out->shape() == in_y.shape()) { + auto src_device_mem = AsDeviceMemory(in_y.template flat().data()); + auto dst_device_mem = AsDeviceMemory(out->template flat().data()); + OP_REQUIRES_OK(context, stream->MemcpyD2D(&dst_device_mem, src_device_mem, + bcast.y_batch_size() * m * n * + sizeof(Scalar))); + } else { + std::vector out_ptrs; + std::vector b_tmp_ptrs; + auto* b_base_ptr = in_y.template flat().data(); + const std::vector& b_batch_indices = bcast.y_batch_indices(); + for (int64_t i = 0; i < bcast.y_batch_size(); ++i) { + b_tmp_ptrs.push_back(b_base_ptr + i * m * n); + } + for (int64_t i = 0; i < bcast.output_batch_size(); ++i) { + auto src_device_mem = AsDeviceMemory(b_tmp_ptrs[b_batch_indices[i]]); + auto dst_device_mem = + AsDeviceMemory(out->template flat().data() + i * m * n); + OP_REQUIRES_OK(context, + stream->MemcpyD2D(&dst_device_mem, src_device_mem, + m * n * sizeof(Scalar))); + } + } + + if (out->NumElements() == 0) { + return; + } + +#if GOOGLE_CUDA + + cublasSideMode_t side = CUBLAS_SIDE_RIGHT; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT; + + // Cublas does + // output = matrix \ rhs + // where matrix, rhs and output are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // output' = rhs' / matrix' (' stands for transpose) + // Upper/lower needs to be swapped for this. + + uplo = lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + trans = adjoint ? CUBLAS_OP_C : CUBLAS_OP_N; + +#elif TENSORFLOW_USE_ROCM + rocblas_side side = rocblas_side_right; + rocblas_fill uplo; + rocblas_operation trans; + rocblas_diagonal diag = rocblas_diagonal_non_unit; + + // rocblas does + // output = matrix \ rhs + // where matrix, rhs and output are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // output' = rhs' / matrix' (' stands for transpose) + // Upper/lower needs to be swapped for this. + + uplo = lower ? rocblas_fill_upper : rocblas_fill_lower; + trans = adjoint ? rocblas_operation_conjugate_transpose + : rocblas_operation_none; + +#endif + + auto solver = absl::make_unique(context); + const uint64 leading_dim_matrix = m; + const uint64 leading_dim_output = n; + const uint64 colmajor_rows = n; + const uint64 colmajor_cols = m; + + const int64_t batch_size = bcast.output_batch_size(); + std::vector a_ptrs; + std::vector out_ptrs; + std::vector a_tmp_ptrs; + a_ptrs.reserve(batch_size); + out_ptrs.reserve(batch_size); + a_tmp_ptrs.reserve(bcast.x_batch_size()); + auto* a_base_ptr = in_x.template flat().data(); + auto* out_base_ptr = out->template flat().data(); + + if (!bcast.IsBroadcastingRequired()) { + for (int64_t i = 0; i < batch_size; ++i) { + a_ptrs.push_back(a_base_ptr + i * m * m); + out_ptrs.push_back(out_base_ptr + i * m * n); + } + } else { + const std::vector& a_batch_indices = bcast.x_batch_indices(); + for (int64_t i = 0; i < bcast.x_batch_size(); ++i) { + a_tmp_ptrs.push_back(a_base_ptr + i * m * m); + } + for (int64_t i = 0; i < batch_size; ++i) { + a_ptrs.push_back(a_tmp_ptrs[a_batch_indices[i]]); + out_ptrs.push_back(out_base_ptr + i * m * n); + } + } + + typedef Scalar Coefficient; + const Scalar alpha = Scalar(1.0); + + // TODO(b/146763573): Consider using Trsv here when the right hand side is + // a vector. This will require an explicit transpose since Trsv assumes + // CUBLAS_SIDE_LEFT. 
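// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// The side/uplo/trans juggling above exists because cuBLAS/rocBLAS solve in
// column-major while these tensors are row-major: a row-major matrix
// reinterpreted as column-major is its transpose, so instead of
//   output = matrix \ rhs
// the kernel asks BLAS for the right-sided solve
//   output^T = rhs^T / matrix^T,
// which also flips lower-triangular to upper-triangular. A standalone Eigen
// check of that identity (real scalars, so no conjugation is involved):
#include <Eigen/Core>

inline bool TransposedTriangularSolveMatches() {
  Eigen::Matrix3d L;
  L << 2, 0, 0,
       1, 3, 0,
       4, 5, 6;                                  // lower triangular, invertible
  const Eigen::Matrix3d B = Eigen::Matrix3d::Random();
  const Eigen::Matrix3d X = L.triangularView<Eigen::Lower>().solve(B);
  // Right-sided solve of the transposed system: Y * L^T = B^T, so Y = X^T.
  const Eigen::Matrix3d Y = L.transpose()
                                .triangularView<Eigen::Upper>()
                                .solve<Eigen::OnTheRight>(B.transpose());
  return (X - Y.transpose()).norm() < 1e-9;
}
// --------------------------------------------------------------------------------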
+ if (batch_size == 1) { + OP_REQUIRES_OK( + context, + solver->Trsm(side, uplo, trans, diag, colmajor_rows, colmajor_cols, + &alpha, a_ptrs[0], leading_dim_matrix /*lda*/, + out_ptrs[0], leading_dim_output /*ldb*/)); + } else { + // Heuristic for choosing between batched interface vs. non-batched + // interface. This is inspired by matrix_solve_op and can probably be + // tuned. + // TODO(b/146763573): Tune this heuristic. + const int kMaxMatrixSizeToBatchSizeRatio = 128; + const bool use_batched_solver = + m <= kMaxMatrixSizeToBatchSizeRatio * batch_size; + if (use_batched_solver) { + OP_REQUIRES_OK( + context, solver->TrsmBatched( + side, uplo, trans, diag, colmajor_rows, colmajor_cols, + &alpha, &a_ptrs[0], leading_dim_matrix /*lda*/, + &out_ptrs[0], leading_dim_output /*ldb*/, batch_size)); + } else { + for (int batch = 0; batch < batch_size; ++batch) { + OP_REQUIRES_OK( + context, solver->Trsm(side, uplo, trans, diag, colmajor_rows, + colmajor_cols, &alpha, a_ptrs[batch], + leading_dim_matrix /*lda*/, out_ptrs[batch], + leading_dim_output /*ldb*/)); + } + } + } + } +}; + +#define REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_GPU(TYPE) \ + REGISTER_KERNEL_BUILDER(Name("MatrixTriangularSolve") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + MatrixTriangularSolveOp); \ + REGISTER_KERNEL_BUILDER(Name("BatchMatrixTriangularSolve") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + MatrixTriangularSolveOp); + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_MATRIX_TRIANGULAR_SOLVE_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/qr_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/qr_op_impl.h new file mode 100644 index 00000000..c5a1823f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/qr_op_impl.h @@ -0,0 +1,318 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_QR_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_QR_OP_IMPL_H_ + +// See docs in ../ops/linalg_ops.cc. +// +// This header file is used by the individual qr_*op*.cc files for registering +// individual kernels. A separate file is used for each instantiated kernel to +// improve compilation times. 
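// --- Illustrative sketch (editor's addition, not part of the vendored header) ---
// QrOp in this header returns (Q, R) with shapes [m, m] and [m, n] when
// full_matrices is true, and [m, min(m, n)] and [min(m, n), n] otherwise.
// A standalone Eigen version of the economy-size case for comparison:
#include <algorithm>
#include <Eigen/Core>
#include <Eigen/QR>

inline void EconomyQr(const Eigen::MatrixXd& a, Eigen::MatrixXd* q,
                      Eigen::MatrixXd* r) {
  const Eigen::Index m = a.rows();
  const Eigen::Index n = a.cols();
  const Eigen::Index k = std::min(m, n);
  const Eigen::HouseholderQR<Eigen::MatrixXd> qr(a);
  // Expand only the first k Householder reflectors into an m x k Q.
  *q = qr.householderQ() * Eigen::MatrixXd::Identity(m, k);
  // R is the upper triangle of the top k x n block of the packed factor.
  *r = qr.matrixQR().topRows(k).triangularView<Eigen::Upper>();
}
// --------------------------------------------------------------------------------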
+#include +#include + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif + +#include "Eigen/QR" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/kernels/linalg/eye_functor.h" +#include "tensorflow/core/kernels/linalg/matrix_band_part_op.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/util/gpu_solvers.h" +#endif + +namespace tensorflow { + +template +class QrOp : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit QrOp(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_)); + } + + using TensorShapes = typename Base::TensorShapes; + + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSingleMatrix(context, input_matrix_shapes); + } + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64_t m = input_matrix_shapes[0].dim_size(0); + int64_t n = input_matrix_shapes[0].dim_size(1); + int64_t min_size = std::min(m, n); + if (full_matrices_) { + return TensorShapes({TensorShape({m, m}), TensorShape({m, n})}); + } else { + return TensorShapes( + {TensorShape({m, min_size}), TensorShape({min_size, n})}); + } + } + + int64_t GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double max_size = std::max(m, n); + double min_size = std::min(m, n); + double cost = 2 * max_size * min_size * min_size - + 2 * min_size * min_size * min_size / 3.; + // TODO(jpoulson): Increase the cost if full_matrices is true in a manner + // that reflects the algorithm used for the expansion. + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } + + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + Eigen::HouseholderQR qr(inputs[0]); + const int m = inputs[0].rows(); + const int n = inputs[0].cols(); + const int min_size = std::min(m, n); + + if (full_matrices_) { + outputs->at(0) = qr.householderQ(); + outputs->at(1) = qr.matrixQR().template triangularView(); + } else { + // TODO(jpoulson): Exploit the fact that Householder transformations can + // be expanded faster than they can be applied to an arbitrary matrix + // (Cf. LAPACK's DORGQR). 
+ Matrix tmp = Matrix::Identity(m, min_size); + outputs->at(0) = qr.householderQ() * tmp; + auto qr_top = qr.matrixQR().block(0, 0, min_size, n); + outputs->at(1) = qr_top.template triangularView(); + } + } + + private: + bool full_matrices_; + + QrOp(const QrOp&) = delete; + void operator=(const QrOp&) = delete; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +typedef Eigen::GpuDevice GPUDevice; + +template +class QrOpGpu : public AsyncOpKernel { + public: + explicit QrOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_)); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) final { + const Tensor& input = context->input(0); + const int ndims = input.dims(); + const int64_t m = input.dim_size(ndims - 2); + const int64_t n = input.dim_size(ndims - 1); + const int64_t min_size = std::min(m, n); + const int64_t batch_size = + input.template flat_inner_dims().dimension(0); + + // Validate inputs. + OP_REQUIRES_ASYNC( + context, ndims >= 2, + errors::InvalidArgument("Input must have rank >= 2, got ", ndims), + done); + + // Allocate output. + // If full_matrices_ is true then Q is m x m and R is m x n. + // Otherwise, Q is m x min(m, n), and R is min(m, n) x n. + Tensor* q; + TensorShape q_shape = input.shape(); + q_shape.set_dim(ndims - 1, full_matrices_ ? m : min_size); + OP_REQUIRES_OK_ASYNC(context, context->allocate_output(0, q_shape, &q), + done); + Tensor* r; + TensorShape r_shape = input.shape(); + r_shape.set_dim(ndims - 2, full_matrices_ ? m : min_size); + OP_REQUIRES_OK_ASYNC(context, context->allocate_output(1, r_shape, &r), + done); + + if (input.NumElements() == 0) { + done(); + return; + } + + // TODO(rmlarsen): Convert to std::make_unique when available. + std::unique_ptr solver(new GpuSolver(context)); + + // Allocate temporaries. + Tensor input_transposed; + TensorShape transposed_shape = input.shape(); + transposed_shape.set_dim(ndims - 2, input.dim_size(ndims - 1)); + transposed_shape.set_dim(ndims - 1, input.dim_size(ndims - 2)); + + OP_REQUIRES_OK_ASYNC( + context, + solver->allocate_scoped_tensor(DataTypeToEnum::value, + transposed_shape, &input_transposed), + done); + + Tensor tau; + OP_REQUIRES_OK_ASYNC(context, + solver->allocate_scoped_tensor( + DataTypeToEnum::value, + TensorShape({batch_size, min_size}), &tau), + done); + + // Transpose input, since cuSolver uses column-major, while TensorFlow uses + // row-major storage. + const GPUDevice& device = context->eigen_device(); + OP_REQUIRES_OK_ASYNC( + context, DoMatrixTranspose(device, input, &input_transposed), done); + + // Compute QR decomposition in-place in input_transposed. + std::vector dev_info; + dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "geqrf")); + auto input_transposed_reshaped = + input_transposed.flat_inner_dims(); + auto tau_matrix = tau.matrix(); + auto r_reshaped = r->flat_inner_dims(); + for (int batch = 0; batch < batch_size; ++batch) { + OP_REQUIRES_OK_ASYNC( + context, + solver->Geqrf(m, n, &input_transposed_reshaped(batch, 0, 0), m, + &tau_matrix(batch, 0), + dev_info.back().mutable_data() + batch), + done); + } + +#if GOOGLE_CUDA + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + cublasSideMode_t side = CUBLAS_SIDE_LEFT; +#elif TENSORFLOW_USE_ROCM + rocblas_operation transa = rocblas_operation_transpose; + rocblas_operation transb = rocblas_operation_none; + rocblas_side side = rocblas_side_left; +#endif + + // Generate R. 
R is equal to the upper triangle of the decomposition + // stored in input_transposed. Crop, transpose (to get back to row-major) + // and copy it to the output buffer. + if (full_matrices_ || m == n) { + OP_REQUIRES_OK_ASYNC( + context, DoMatrixTranspose(device, input_transposed, r), done); + } else { + const Scalar alpha(1); + const Scalar beta(0); + const Scalar* dummy = nullptr; + for (int batch = 0; batch < batch_size; ++batch) { + OP_REQUIRES_OK_ASYNC( + context, + solver->Geam(transa, transb, n, full_matrices_ ? m : min_size, + &alpha, &input_transposed_reshaped(batch, 0, 0), m, + &beta, dummy, n, &r_reshaped(batch, 0, 0), n), + done); + } + } + // Extract the upper triangle of r (i.e. zero out the strictly lower + // triangle). + functor::MatrixBandPartFunctor band_part; + auto r_reshaped_const = + const_cast(r)->flat_inner_dims(); + band_part(context, device, 0 /* num_lower_diags */, + -1 /* num_upper_diags */, r_reshaped_const, r_reshaped); + + // Generate Q from the decomposition in input_transposed. + if (m != n && (full_matrices_ || m < n)) { + // Generate full m x m matrix Q by computing the product Q^T * I, + // where the transpose is to get back to row-major form. + // In the complex case we actually form Q^H * I and conjugate it + // to get Q in row-major form. + functor::EyeFunctor eye; + auto q_reshaped = q->flat_inner_dims(); + eye(device, q_reshaped); +#if GOOGLE_CUDA + cublasOperation_t trans = CublasAdjointOp(); +#elif TENSORFLOW_USE_ROCM + rocblas_operation trans = RocblasAdjointOp(); +#endif + for (int batch = 0; batch < batch_size; ++batch) { + // Notice: It appears that Unmqr does not write a zero into *info upon + // success (probably a bug), so we simply re-use the info array already + // zeroed by Geqrf above. + OP_REQUIRES_OK_ASYNC( + context, + solver->Unmqr(side, trans, m, m, min_size, + &input_transposed_reshaped(batch, 0, 0), m, + &tau_matrix(batch, 0), &q_reshaped(batch, 0, 0), m, + dev_info.back().mutable_data() + batch), + done); + } + if (Eigen::NumTraits::IsComplex) { + functor::UnaryFunctor> conj; + conj(device, q->flat() /*out*/, + const_cast(q)->flat() /*in*/); + } + } else { + // Generate m x n matrix Q. In this case we can use the more efficient + // algorithm in Ungqr to generate Q in place. + dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "orgqr")); + for (int batch = 0; batch < batch_size; ++batch) { + OP_REQUIRES_OK_ASYNC( + context, + solver->Ungqr( + m, n, min_size, &input_transposed_reshaped(batch, 0, 0), m, + &tau_matrix(batch, 0), dev_info.back().mutable_data() + batch), + done); + } + OP_REQUIRES_OK_ASYNC( + context, DoMatrixTranspose(device, input_transposed, q), done); + } + + // Asynchronously check return status from cuSolver kernels. + GpuSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info, + std::move(done)); + } + + private: + bool full_matrices_; + + QrOpGpu(const QrOpGpu&) = delete; + void operator=(const QrOpGpu&) = delete; +}; + +#endif // GOOGLE_CUDA + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_QR_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_impl.h new file mode 100644 index 00000000..4fba705f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_impl.h @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_SELF_ADJOINT_EIG_V2_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_SELF_ADJOINT_EIG_V2_OP_IMPL_H_ + +// See docs in ../ops/linalg_ops.cc. + +#include "Eigen/Core" // from @eigen_archive +#include "Eigen/Eigenvalues" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/denormal.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class SelfAdjointEigV2Op : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit SelfAdjointEigV2Op(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_)); + } + + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64_t n = input_matrix_shapes[0].dim_size(0); + if (compute_v_) { + return TensorShapes({TensorShape({n}), TensorShape({n, n})}); + } else { + return TensorShapes({TensorShape({n})}); + } + } + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const int64_t rows = inputs[0].rows(); + if (rows == 0) { + // If X is an empty matrix (0 rows, 0 col), X * X' == X. + // Therefore, we return X. + return; + } + + // This algorithm relies on denormals, so switch them back on locally. + port::ScopedDontFlushDenormal dont_flush_denormals; + + Eigen::SelfAdjointEigenSolver eig( + inputs[0], + compute_v_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly); + // TODO(rmlarsen): Output more detailed error info on failure. + OP_REQUIRES( + context, eig.info() == Eigen::Success, + errors::InvalidArgument("Self-adjoint eigen decomposition was not " + "successful. The input might not be valid.")); + + outputs->at(0) = eig.eigenvalues().template cast(); + if (compute_v_) { + outputs->at(1) = eig.eigenvectors(); + } + } + + private: + bool compute_v_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_SELF_ADJOINT_EIG_V2_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/svd_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/svd_op_impl.h new file mode 100644 index 00000000..4e674585 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg/svd_op_impl.h @@ -0,0 +1,135 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_SVD_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_SVD_OP_IMPL_H_ + +// See docs in ../ops/linalg_ops.cc. +// +// This header file is used by the individual svd_*op*.cc files for registering +// individual kernels. A separate file is used for each instantiated kernel to +// improve compilation times. +#include + +#include "Eigen/SVD" // from @eigen_archive +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class SvdOp : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit SvdOp(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_)); + OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_)); + } + + using TensorShapes = typename Base::TensorShapes; + + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSingleMatrix(context, input_matrix_shapes); + } + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64_t m = input_matrix_shapes[0].dim_size(0); + int64_t n = input_matrix_shapes[0].dim_size(1); + int64_t min_size = std::min(m, n); + if (compute_uv_) { + return TensorShapes({TensorShape({min_size}), + TensorShape({m, full_matrices_ ? m : min_size}), + TensorShape({n, full_matrices_ ? n : min_size})}); + } else { + return TensorShapes({TensorShape({min_size})}); + } + } + + // TODO(rmlarsen): This should depend on compute_uv. See b/30409375. + int64_t GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double cost = 12 * std::max(m, n) * std::min(m, n) * std::min(m, n); + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } + + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + int64_t n = inputs[0].cols(); + int64_t m = inputs[0].rows(); + const bool empty = (m == 0 || n == 0); + int options = 0; // Don't compute singular vectors; + if (compute_uv_) { + options = full_matrices_ ? 
Eigen::ComputeFullU | Eigen::ComputeFullV + : Eigen::ComputeThinU | Eigen::ComputeThinV; + } + + if (empty) { + // For an empty matrix where only one dimension is zero, we still set + // U or V to the unit matrix for the dimension that is non-zero. + if (compute_uv_ && full_matrices_) { + if (m > 0) { + outputs->at(1) = Matrix::Identity(m, m); + } else { + outputs->at(2) = Matrix::Identity(n, n); + } + } + return; + } + + Eigen::BDCSVD svd(inputs[0], options); + if (svd.info() != Eigen::Success) { + LOG(ERROR) << "Eigen::BDCSVD failed with error code " << svd.info(); + outputs->at(0).fill(std::numeric_limits::quiet_NaN()); + if (compute_uv_) { + outputs->at(1).fill(std::numeric_limits::quiet_NaN()); + outputs->at(2).fill(std::numeric_limits::quiet_NaN()); + } + } else { + outputs->at(0) = svd.singularValues().template cast(); + if (compute_uv_) { + outputs->at(1) = svd.matrixU(); + outputs->at(2) = svd.matrixV(); + } + } + } + + private: + bool compute_uv_; + bool full_matrices_; + + SvdOp(const SvdOp&) = delete; + void operator=(const SvdOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_SVD_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/linalg_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg_ops_common.h new file mode 100644 index 00000000..0aa69801 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/linalg_ops_common.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_ + +// Temporary forwarding header. +#include "tensorflow/core/kernels/linalg/linalg_ops_common.h" + +#endif // TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/list_kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/list_kernels.h new file mode 100644 index 00000000..9837b087 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/list_kernels.h @@ -0,0 +1,1137 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/tensor_list.h" +#include "tensorflow/core/kernels/tensor_list_util.h" +#include "tensorflow/core/lib/core/coding.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/util/tensor_ops_util.h" +#include "tensorflow/core/util/util.h" + +// stream.h isn't available in some platforms such as Android, iOS, ChromiumOS, +// and Fuchsia. Only include it for platforms that PluggableDevice is tested on. +#if !defined(PLUGGABLE_DEVICE_SUPPORTED) && \ + (__x86_64__ || __i386__ || defined(__APPLE__) || defined(_WIN32)) && \ + !defined(ANDROID) && !defined(__ANDROID__) && !TARGET_OS_IOS && \ + !defined(PLATFORM_CHROMIUMOS) && !defined(__Fuchsia__) +#define PLUGGABLE_DEVICE_SUPPORTED +#endif + +#ifdef PLUGGABLE_DEVICE_SUPPORTED +#include "xla/stream_executor/stream.h" +#endif + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +absl::Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out); + +absl::Status GetElementShapeFromInput(OpKernelContext* c, + const TensorList& tensor_list, int index, + PartialTensorShape* element_shape); + +absl::Status GetInputList(OpKernelContext* c, int index, + const TensorList** list); + +absl::Status ForwardInputOrCreateNewList(OpKernelContext* c, + int32_t input_index, + int32_t output_index, + const TensorList& input_list, + TensorList** output_list); + +// TODO(penporn): Move this to a proper place. 
+inline bool IsPluggableDevice(OpKernelContext* c) { + return c->op_device_context() && c->op_device_context()->IsPluggableDevice(); +} + +template +inline void SetZero(OpKernelContext* ctx, Tensor& tensor) { +#ifdef PLUGGABLE_DEVICE_SUPPORTED + if (IsPluggableDevice(ctx)) { + auto ptr = + se::DeviceMemoryBase(tensor.flat().data(), tensor.TotalBytes()); + auto stream = ctx->op_device_context()->stream(); + auto result = stream->MemZero(&ptr, tensor.TotalBytes()).ok(); + DCHECK_EQ(true, result); + } else { +#endif // PLUGGABLE_DEVICE_SUPPORTED + functor::SetZeroFunctor()(ctx->eigen_device(), + tensor.flat()); +#ifdef PLUGGABLE_DEVICE_SUPPORTED + } +#endif // PLUGGABLE_DEVICE_SUPPORTED +} + +template +inline void CopyTensorPluggableDevice(OpKernelContext* ctx, Tensor& src, + Tensor& dst) { +#ifdef PLUGGABLE_DEVICE_SUPPORTED + auto src_t = src.unaligned_flat(); + auto dst_t = dst.flat(); + DCHECK(DataTypeCanUseMemcpy(DataTypeToEnum::v())); + auto src_ptr = se::DeviceMemoryBase(src_t.data(), src.TotalBytes()); + auto dst_ptr = se::DeviceMemoryBase(dst_t.data(), dst.TotalBytes()); + auto stream = ctx->op_device_context()->stream(); + auto result = stream->Memcpy(&dst_ptr, src_ptr, src.TotalBytes()).ok(); + DCHECK_EQ(true, result); +#else + LOG(FATAL) // Crash OK. + << "PluggableDevice is not supported on this platform."; +#endif // PLUGGABLE_DEVICE_SUPPORTED +} + +template +inline void CopyTensor(OpKernelContext* ctx, Tensor& src, Tensor& dst) { + auto src_t = src.unaligned_flat(); + auto dst_t = dst.flat(); + dst_t.device(ctx->eigen_device()) = src_t; +} + +template +void ConcatPluggableDevice( + OpKernelContext* context, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { +#ifdef PLUGGABLE_DEVICE_SUPPORTED + DCHECK(DataTypeCanUseMemcpy(DataTypeToEnum::v())); + + se::Stream* stream = context->op_device_context()->stream(); + + size_t num_inputs = inputs.size(); + std::vector sizes; + sizes.reserve(num_inputs); + int64 row_size = 0; + for (const auto& input : inputs) { + sizes.push_back(input->dimension(1)); + row_size += sizes.back(); + } + + T* out = &(*output)(0, 0); + std::vector inp; + inp.reserve(num_inputs); + for (const auto& input : inputs) { + inp.push_back(&(*input)(0, 0)); + } + const int64 dim0 = output->dimension(0); + for (int64 i = 0; i < dim0; ++i) { + for (int64 j = 0; j < num_inputs; ++j) { + auto size = sizes[j]; + se::DeviceMemoryBase out_base{out, size * sizeof(T)}; + se::DeviceMemoryBase inp_base{const_cast(inp[j]), size * sizeof(T)}; + OP_REQUIRES_OK(context, + stream->Memcpy(&out_base, inp_base, size * sizeof(T))); + out += size; + inp[j] += size; + } + } +#else + LOG(FATAL) // Crash OK. 
+ << "PluggableDevice is not supported on this platform."; +#endif // PLUGGABLE_DEVICE_SUPPORTED +} + +template +class TensorListStack : public OpKernel { + public: + typedef std::vector::ConstMatrix>> + ConstMatrixVector; + explicit TensorListStack(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + OP_REQUIRES_OK(c, c->GetAttr("num_elements", &num_elements_)); + } + + void Compute(OpKernelContext* c) override { + const TensorList* tensor_list = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list)); + OP_REQUIRES( + c, element_dtype_ == tensor_list->element_dtype, + errors::InvalidArgument( + "Invalid data types; op elements ", DataTypeString(element_dtype_), + " but list elements ", DataTypeString(tensor_list->element_dtype))); + if (num_elements_ != -1) { + OP_REQUIRES(c, tensor_list->tensors().size() == num_elements_, + errors::InvalidArgument( + "Operation expected a list with ", num_elements_, + " elements but got a list with ", + tensor_list->tensors().size(), " elements.")); + } + PartialTensorShape partial_element_shape; + OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 1, + &partial_element_shape)); + OP_REQUIRES( + c, + partial_element_shape.IsFullyDefined() || + !tensor_list->tensors().empty(), + errors::InvalidArgument("Tried to stack elements of an empty ", + "list with non-fully-defined element_shape: ", + partial_element_shape.DebugString())); + + // Check that `element_shape` input tensor is compatible with the shapes of + // element tensors. + if (!tensor_list->element_shape.IsFullyDefined()) { + for (int i = 0; i < tensor_list->tensors().size(); ++i) { + const Tensor& t = tensor_list->tensors()[i]; + if (t.dtype() != DT_INVALID) { + PartialTensorShape tmp = partial_element_shape; + OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); + } + } + } + + // Compute the shape of the output tensor by pre-pending the leading dim to + // the element_shape. 
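A minimal standalone sketch of the shape rule described in the comment above: the stacked output shape is the fully defined element shape with the list length prepended as dimension 0. It uses plain std::vector in place of TensorShape, and StackedShape is an illustrative name, not a TensorFlow API.

#include <cstdint>
#include <iostream>
#include <vector>

// Output shape for stacking: prepend the number of list entries to the
// element shape, e.g. 5 elements of shape {2, 3} stack into {5, 2, 3}.
std::vector<int64_t> StackedShape(const std::vector<int64_t>& element_shape,
                                  int64_t num_elements) {
  std::vector<int64_t> out;
  out.reserve(element_shape.size() + 1);
  out.push_back(num_elements);  // leading dim = list length
  out.insert(out.end(), element_shape.begin(), element_shape.end());
  return out;
}

int main() {
  for (int64_t d : StackedShape({2, 3}, 5)) std::cout << d << ' ';  // prints: 5 2 3
  std::cout << '\n';
}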
+ TensorShape element_shape; + OP_REQUIRES(c, partial_element_shape.AsTensorShape(&element_shape), + errors::InvalidArgument( + "Tried to stack list which only contains uninitialized ", + "tensors and has a non-fully-defined element_shape: ", + partial_element_shape.DebugString())); + TensorShape output_shape = element_shape; + output_shape.InsertDim(0, tensor_list->tensors().size()); + Tensor* output; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); + if (output->NumElements() == 0) { + return; + } + + ConstMatrixVector inputs_flat; + inputs_flat.reserve(tensor_list->tensors().size()); + Tensor zeros; + for (const auto& t : tensor_list->tensors()) { + if (t.dtype() != DT_INVALID) { + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + t.shaped({1, t.NumElements()}))); + } else { + if (!zeros.NumElements()) { + AllocatorAttributes attr; + if (element_dtype_ == DT_VARIANT) { + attr.set_on_host(true); + } + OP_REQUIRES_OK( + c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr)); + SetZero(c, zeros); + } + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + const_cast(zeros).shaped( + {1, zeros.NumElements()}))); + } + } + auto output_flat = output->shaped({1, output->NumElements()}); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (std::is_same::value) { + ConcatGPU(c, inputs_flat, output, &output_flat); + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (IsPluggableDevice(c)) { + ConcatPluggableDevice(c, inputs_flat, &output_flat); + } else { + ConcatCPU(c->device(), inputs_flat, &output_flat); + } + } + + private: + int num_elements_; + DataType element_dtype_; +}; + +template +class TensorListGetItem : public OpKernel { + public: + explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + } + + void Compute(OpKernelContext* c) override { + const TensorList* l = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &l)); + OP_REQUIRES(c, element_dtype_ == l->element_dtype, + errors::InvalidArgument("Invalid data types; op elements ", + DataTypeString(element_dtype_), + " but list elements ", + DataTypeString(l->element_dtype))); + int32_t index = c->input(1).scalar()(); + OP_REQUIRES(c, index < l->tensors().size(), + errors::InvalidArgument("Trying to access element ", index, + " in a list with ", l->tensors().size(), + " elements.")); + if (l->tensors()[index].dtype() != DT_INVALID) { + c->set_output(0, l->tensors()[index]); + } else { + PartialTensorShape partial_element_shape; + OP_REQUIRES_OK( + c, GetElementShapeFromInput(c, *l, 2, &partial_element_shape)); + TensorShape element_shape; + // If l->element_shape and the element_shape input are both not fully + // defined, try to infer the shape from other list elements. This requires + // that all initialized list elements have the same shape. + // NOTE(srbs): This might be a performance bottleneck since we are + // iterating over the entire list here. This is necessary for feature + // parity with TensorArray.read. TensorArray has a mode in which all + // elements are required to be of the same shape, TensorList does not. + // In that mode TensorArray sets the array's element_shape on the first + // write call. We could do something similar here if needed. 
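The loop that follows performs this inference with PartialTensorShape::MergeWith. As a rough standalone sketch of the merge rule, assuming -1 marks an unknown dimension (MergeShapes is an illustrative helper, not the TensorFlow implementation):

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Merge a partially known shape with a concrete element shape: unknown dims
// (-1) adopt the known value, and conflicting known dims are rejected.
std::vector<int64_t> MergeShapes(std::vector<int64_t> partial,
                                 const std::vector<int64_t>& known) {
  if (partial.size() != known.size())
    throw std::invalid_argument("rank mismatch");
  for (std::size_t i = 0; i < partial.size(); ++i) {
    if (partial[i] == -1) partial[i] = known[i];
    else if (partial[i] != known[i]) throw std::invalid_argument("dim mismatch");
  }
  return partial;
}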
+ if (!partial_element_shape.IsFullyDefined()) { + for (const Tensor& t : l->tensors()) { + if (t.dtype() != DT_INVALID) { + PartialTensorShape tmp = partial_element_shape; + OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); + } + } + } + OP_REQUIRES( + c, partial_element_shape.AsTensorShape(&element_shape), + errors::InvalidArgument("Trying to read an uninitialized tensor but ", + "element_shape is not fully defined: ", + partial_element_shape.DebugString(), + " and no list element is set.")); + Tensor* result; + AllocatorAttributes attr; + if (element_dtype_ == DT_VARIANT) { + attr.set_on_host(true); + } + OP_REQUIRES_OK(c, c->allocate_output(0, element_shape, &result, attr)); + SetZero(c, *result); + } + } + + private: + DataType element_dtype_; +}; + +template +class TensorListPopBack : public OpKernel { + public: + explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + } + + void Compute(OpKernelContext* c) override { + const TensorList* l = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &l)); + OP_REQUIRES(c, element_dtype_ == l->element_dtype, + errors::InvalidArgument("Invalid data types; op elements ", + DataTypeString(element_dtype_), + " but list elements ", + DataTypeString(l->element_dtype))); + + OP_REQUIRES(c, !l->tensors().empty(), + errors::InvalidArgument("Trying to pop from an empty list.")); + + const Tensor& t = l->tensors().back(); + if (t.dtype() != DT_INVALID) { + c->set_output(1, t); + } else { + PartialTensorShape partial_element_shape; + OP_REQUIRES_OK( + c, GetElementShapeFromInput(c, *l, 1, &partial_element_shape)); + TensorShape element_shape; + OP_REQUIRES( + c, partial_element_shape.AsTensorShape(&element_shape), + errors::InvalidArgument("Trying to read an uninitialized tensor but ", + "element_shape is not fully defined.", + partial_element_shape.DebugString())); + Tensor* result; + AllocatorAttributes attr; + if (element_dtype_ == DT_VARIANT) { + attr.set_on_host(true); + } + OP_REQUIRES_OK(c, c->allocate_output(1, element_shape, &result, attr)); + SetZero(c, *result); + } + + TensorList* output_list = nullptr; + OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); + output_list->tensors().pop_back(); + } + + private: + DataType element_dtype_; +}; + +template +class TensorListConcat : public OpKernel { + public: + using ConstMatrixVector = + std::vector::ConstMatrix>>; + explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + if (c->HasAttr("element_shape")) { + OP_REQUIRES_OK(c, c->GetAttr("element_shape", &element_shape_)); + } + } + + void Compute(OpKernelContext* c) override { + PartialTensorShape element_shape_except_first_dim; + if (!element_shape_.unknown_rank()) { + auto dim_sizes = element_shape_.dim_sizes(); + OP_REQUIRES(c, !dim_sizes.empty(), + errors::InvalidArgument("element_shape must not be empty")); + element_shape_except_first_dim = + PartialTensorShape(absl::Span(dim_sizes).subspan(1)); + } + // Check that the input Variant tensor is indeed a TensorList and has the + // correct element type. 
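A rough analogue of the check announced above, with std::any standing in for the DT_VARIANT input: the kernel must confirm that the handle really holds a list and that the stored element type matches the op's element_dtype attribute before touching its contents. FakeList and GetListOrThrow are illustrative names only.

#include <any>
#include <stdexcept>
#include <string>

// A type-erased handle is only usable after both checks pass.
struct FakeList {
  std::string element_dtype;
};

const FakeList& GetListOrThrow(const std::any& input,
                               const std::string& expected_dtype) {
  const FakeList* list = std::any_cast<FakeList>(&input);
  if (list == nullptr) throw std::invalid_argument("Input is not a list.");
  if (list->element_dtype != expected_dtype)
    throw std::invalid_argument("Invalid data types; op elements " +
                                expected_dtype + " but list elements " +
                                list->element_dtype);
  return *list;
}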
+ const TensorList* tensor_list = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list)); + OP_REQUIRES( + c, element_dtype_ == tensor_list->element_dtype, + errors::InvalidArgument( + "Invalid data types; op elements ", DataTypeString(element_dtype_), + " but list elements ", DataTypeString(tensor_list->element_dtype))); + // The leading dimension of all list elements if they are all the same. + // This is used as the leading dim of uninitialized tensors in the list + // if leading_dims is not provided. + int64_t first_dim = -1; + if (c->num_inputs() > 1) { + // TensorListConcatV2 + PartialTensorShape element_shape; + OP_REQUIRES_OK( + c, GetElementShapeFromInput(c, *tensor_list, 1, &element_shape)); + OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1, + errors::InvalidArgument( + "Concat requires elements to be at least vectors, ", + "found scalars instead.")); + // Split `element_shape` into `first_dim` and + // `element_shape_except_first_dim`. + first_dim = element_shape.dim_size(0); + element_shape_except_first_dim = element_shape; + element_shape_except_first_dim.RemoveDim(0); + } + // If the TensorList is empty, element_shape_except_first_dim must be fully + // defined. + OP_REQUIRES(c, + !tensor_list->tensors().empty() || + element_shape_except_first_dim.IsFullyDefined(), + errors::InvalidArgument( + "All except the first dimension must be fully defined ", + "when concating an empty tensor list. element_shape: ", + element_shape_except_first_dim.DebugString())); + // 1. Check that `element_shape_except_first_dim` input tensor is + // compatible with the shapes of element tensors. + // 2. Check that the elements have the same shape except the first dim. + // 3. If `first_dim` is known, check that it is compatible with the leading + // dims of all elements. + // 4. If `first_dim` is unknown (-1), check whether all initialized + // elements have the same leading dim and if so set `first_dim` to that + // value. 
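Check 4 is the subtle one. A standalone sketch of the inference it describes, with -1 standing both for an uninitialized element and for "no consistent value" (InferFirstDim is an illustrative name):

#include <cstdint>
#include <vector>

// Infer the shared leading dimension from the initialized elements only.
int64_t InferFirstDim(const std::vector<int64_t>& leading_dims) {
  int64_t inferred = -1;
  for (int64_t d : leading_dims) {
    if (d == -1) continue;              // uninitialized element, skip
    if (inferred == -1) inferred = d;   // first initialized element seen
    else if (inferred != d) return -1;  // inconsistent leading dims, give up
  }
  return inferred;
}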
+ if (!tensor_list->element_shape.IsFullyDefined()) { + bool check_dim = (first_dim == -1); + int64_t inferred_first_dim = first_dim; + for (int i = 0; i < tensor_list->tensors().size(); ++i) { + const Tensor& t = tensor_list->tensors()[i]; + if (t.dtype() != DT_INVALID) { + PartialTensorShape tmp = element_shape_except_first_dim; + OP_REQUIRES( + c, TensorShapeUtils::IsVectorOrHigher(t.shape()), + errors::InvalidArgument("Concat saw a scalar shape at index ", i, + " but requires at least vectors.")); + TensorShape shape_except_first_dim = TensorShape( + absl::Span(t.shape().dim_sizes()).subspan(1)); + OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim, + &element_shape_except_first_dim)); + OP_REQUIRES(c, first_dim == -1 || first_dim == t.shape().dim_size(0), + errors::InvalidArgument( + "First entry of element_shape input does not match ", + "the first dim of list element at index: ", i, + " Expected: ", first_dim, + " Actual: ", t.shape().dim_size(0))); + if (check_dim) { + if (inferred_first_dim == -1) { + inferred_first_dim = t.shape().dim_size(0); + } else if (inferred_first_dim != t.shape().dim_size(0)) { + inferred_first_dim = -1; + check_dim = false; + } + } + } + } + first_dim = inferred_first_dim; + } + TensorShape output_shape; + OP_REQUIRES(c, element_shape_except_first_dim.AsTensorShape(&output_shape), + errors::InvalidArgument( + "Trying to concat list with only uninitialized tensors ", + "but element_shape_except_first_dim is not fully defined: ", + element_shape_except_first_dim.DebugString())); + // Build the lengths_tensor and leading dim of the output tensor by + // iterating over all element tensors. + Tensor* lengths_tensor = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(1, + TensorShape({static_cast( + tensor_list->tensors().size())}), + &lengths_tensor)); + auto lengths_tensor_vec = lengths_tensor->vec(); + int64_t leading_dim = 0; + for (size_t i = 0; i < tensor_list->tensors().size(); i++) { + int64_t dim; + if (tensor_list->tensors()[i].dtype() != DT_INVALID) { + dim = tensor_list->tensors()[i].shape().dim_size(0); + } else { + // If leading_dims is not provided or does not contain an entry for + // index i use the inferred `first_dim` if set. + if ((c->num_inputs() <= 2 || i >= c->input(2).NumElements()) && + first_dim != -1) { + dim = first_dim; + } else { + OP_REQUIRES(c, c->num_inputs() > 2, + errors::InvalidArgument( + "Concating lists with uninitialized tensors is not ", + "supported in this version of TensorListConcat. ", + "Consider updating your GraphDef to run the newer ", + "version.")); + OP_REQUIRES(c, i < c->input(2).NumElements(), + errors::InvalidArgument( + "List contains uninitialized tensor at index ", i, + " but leading_dims has only ", + c->input(2).NumElements(), " elements.")); + dim = c->input(2).vec()(i); + } + } + leading_dim += dim; + lengths_tensor_vec(i) = dim; + } + output_shape.InsertDim(0, leading_dim); + Tensor* output; + // Allocate the output tensor and fill it up with the concated element + // tensors. + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); + if (output->NumElements() == 0) { + return; + } + + ConstMatrixVector inputs_flat; + inputs_flat.reserve(tensor_list->tensors().size()); + // Store the zeros tensors in a vector to prevent them from being GC'ed till + // concat is complete. 
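The reason for that vector is lifetime, not bookkeeping: inputs_flat stores non-owning matrix views into the zeros tensors, so the owning tensors must outlive the concat that reads through those views. A small sketch of the same pattern with std::string_view in place of TTypes<T>::ConstMatrix (ConcatViews is illustrative):

#include <string>
#include <string_view>
#include <vector>

std::string ConcatViews() {
  std::vector<std::string> owners;        // keeps the backing storage alive
  std::vector<std::string_view> views;    // non-owning, like inputs_flat
  owners.push_back(std::string(3, '0'));  // placeholder "zeros" element
  views.push_back(owners.back());
  std::string out;
  for (std::string_view v : views) out.append(v);  // the "concat" reads the views
  return out;  // only now may `owners` be destroyed
}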
+ std::vector zeros_vec; + for (int i = 0; i < tensor_list->tensors().size(); i++) { + const Tensor& element_tensor = tensor_list->tensors()[i]; + if (element_tensor.dtype() != DT_INVALID) { + if (element_tensor.NumElements() > 0) { + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + element_tensor.shaped({1, element_tensor.NumElements()}))); + } + } else { + AllocatorAttributes attr; + if (element_dtype_ == DT_VARIANT) { + attr.set_on_host(true); + } + TensorShape element_shape = output_shape; + element_shape.set_dim(0, lengths_tensor_vec(i)); + zeros_vec.emplace_back(); + Tensor& zeros = zeros_vec.back(); + OP_REQUIRES_OK( + c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr)); + SetZero(c, zeros); + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + const_cast(zeros).shaped( + {1, zeros.NumElements()}))); + } + } + auto output_flat = output->shaped({1, output->NumElements()}); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (std::is_same::value) { + ConcatGPU(c, inputs_flat, output, &output_flat); + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (IsPluggableDevice(c)) { + ConcatPluggableDevice(c, inputs_flat, &output_flat); + } else { + ConcatCPU(c->device(), inputs_flat, &output_flat); + } + } + + private: + DataType element_dtype_; + PartialTensorShape element_shape_; +}; + +template +class TensorListSplit : public OpKernel { + public: + TensorListSplit(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + Tensor* output_tensor; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr)); + PartialTensorShape element_shape; + OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape)); + OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1, + errors::InvalidArgument( + "TensorListSplit requires element_shape to be at least of ", + "rank 1, but saw: ", element_shape.DebugString())); + TensorList output_list; + const Tensor& input_tensor = c->input(0); + output_list.element_dtype = input_tensor.dtype(); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()), + errors::InvalidArgument( + "Tensor must be at least a vector, but saw shape: ", + input_tensor.shape().DebugString())); + TensorShape tensor_shape_without_first_dim(input_tensor.shape()); + tensor_shape_without_first_dim.RemoveDim(0); + PartialTensorShape element_shape_without_first_dim; + if (!element_shape.unknown_rank()) { + element_shape_without_first_dim = + PartialTensorShape(element_shape.dim_sizes()); + element_shape_without_first_dim.RemoveDim(0); + } + OP_REQUIRES(c, + element_shape_without_first_dim.IsCompatibleWith( + tensor_shape_without_first_dim), + errors::InvalidArgument( + "tensor shape ", input_tensor.shape().DebugString(), + " is not compatible with element_shape ", + element_shape.DebugString())); + output_list.element_shape = element_shape; + const Tensor& lengths = c->input(2); + OP_REQUIRES(c, TensorShapeUtils::IsVector(lengths.shape()), + errors::InvalidArgument( + "Expected lengths to be a vector, received shape: ", + lengths.shape().DebugString())); + output_list.tensors().reserve(lengths.shape().dim_size(0)); + + const auto copy_tensor = IsPluggableDevice(c) + ? 
&CopyTensorPluggableDevice + : &CopyTensor; + + int64_t start = 0; + int64_t end = 0; + for (int i = 0; i < lengths.shape().dim_size(0); ++i) { + int64_t length = lengths.vec()(i); + OP_REQUIRES( + c, length >= 0, + errors::InvalidArgument("Invalid value in lengths: ", length)); + end = start + length; + OP_REQUIRES(c, end <= input_tensor.shape().dim_size(0), + errors::InvalidArgument("Attempting to slice [", start, ", ", + end, "] from tensor with length ", + input_tensor.shape().dim_size(0))); + Tensor tmp = input_tensor.Slice(start, end); + start = end; + // TODO(apassos) maybe not always align; but weird compiler bugs seem to + // prevent this. + Tensor aligned; + OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned)); + copy_tensor(c, tmp, aligned); + output_list.tensors().emplace_back(aligned); + } + OP_REQUIRES(c, end == input_tensor.shape().dim_size(0), + errors::InvalidArgument( + "Unused values in tensor. Length of tensor: ", + input_tensor.shape().dim_size(0), " Values used: ", end)); + output_tensor->scalar()() = std::move(output_list); + } +}; + +template +class TensorListGather : public OpKernel { + public: + typedef std::vector::ConstMatrix>> + ConstMatrixVector; + explicit TensorListGather(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + } + + void Compute(OpKernelContext* c) override { + const TensorList* tensor_list = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list)); + OP_REQUIRES( + c, element_dtype_ == tensor_list->element_dtype, + errors::InvalidArgument( + "Invalid data types; op elements ", DataTypeString(element_dtype_), + " but list elements ", DataTypeString(tensor_list->element_dtype))); + const Tensor& indices = c->input(1); + PartialTensorShape partial_element_shape; + OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 2, + &partial_element_shape)); + OP_REQUIRES( + c, partial_element_shape.IsFullyDefined() || indices.NumElements() > 0, + errors::InvalidArgument("Tried to gather 0-elements from " + "a list with non-fully-defined shape: ", + partial_element_shape.DebugString())); + + // Check that `element_shape` input tensor is compatible with the shapes of + // element tensors. + if (!tensor_list->element_shape.IsFullyDefined()) { + for (int index = 0; index < indices.NumElements(); ++index) { + const int i = indices.flat()(index); + + OP_REQUIRES(c, 0 <= i && i < tensor_list->tensors().size(), + absl::InvalidArgumentError(absl::StrCat( + "Trying to gather element ", i, " in a list with ", + tensor_list->tensors().size(), " elements."))); + + const Tensor& t = tensor_list->tensors()[i]; + if (t.dtype() != DT_INVALID) { + PartialTensorShape tmp = partial_element_shape; + OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); + } + } + } + + // Compute the shape of the output tensor by pre-pending the leading dim to + // the element_shape. 
+ TensorShape element_shape; + OP_REQUIRES( + c, partial_element_shape.AsTensorShape(&element_shape), + errors::InvalidArgument("Tried to gather uninitialized tensors from a ", + "list with non-fully-defined element_shape: ", + partial_element_shape.DebugString())); + TensorShape output_shape = element_shape; + output_shape.InsertDim(0, indices.NumElements()); + Tensor* output; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); + if (output->NumElements() == 0) { + return; + } + + ConstMatrixVector inputs_flat; + inputs_flat.reserve(indices.NumElements()); + Tensor zeros; + for (int index = 0; index < indices.NumElements(); ++index) { + const int i = indices.flat()(index); + OP_REQUIRES( + c, i < tensor_list->tensors().size(), + errors::InvalidArgument("Index ", i, " out o range; list only has ", + tensor_list->tensors().size(), " elements.")); + const Tensor& t = tensor_list->tensors()[i]; + if (t.dtype() != DT_INVALID) { + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + t.shaped({1, t.NumElements()}))); + } else { + if (!zeros.NumElements()) { + AllocatorAttributes attr; + if (element_dtype_ == DT_VARIANT) { + attr.set_on_host(true); + } + OP_REQUIRES_OK( + c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr)); + SetZero(c, zeros); + } + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + const_cast(zeros).shaped( + {1, zeros.NumElements()}))); + } + } + auto output_flat = output->shaped({1, output->NumElements()}); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (std::is_same::value) { + ConcatGPU(c, inputs_flat, output, &output_flat); + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (IsPluggableDevice(c)) { + ConcatPluggableDevice(c, inputs_flat, &output_flat); + } else { + ConcatCPU(c->device(), inputs_flat, &output_flat); + } + } + + private: + DataType element_dtype_; +}; + +template +class TensorListFromTensor : public OpKernel { + public: + TensorListFromTensor(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + Tensor* output_tensor; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr)); + PartialTensorShape element_shape; + OP_REQUIRES( + c, !TensorShapeUtils::IsMatrixOrHigher(c->input(1).shape()), + errors::InvalidArgument( + "TensorListFromTensor: element_shape must be at most rank 1 but ", + "has the shape of ", c->input(1).shape().DebugString())); + OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape)); + TensorList output_list; + const Tensor& t = c->input(0); + output_list.element_dtype = t.dtype(); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()), + errors::InvalidArgument( + "Tensor must be at least a vector, but saw shape: ", + t.shape().DebugString())); + TensorShape output_shape(t.shape()); + output_shape.RemoveDim(0); + OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape), + errors::InvalidArgument( + "Specified a list with shape ", element_shape.DebugString(), + " from a tensor with shape ", output_shape.DebugString())); + output_list.element_shape = element_shape; + output_list.tensors().reserve(t.shape().dim_size(0)); + + const auto copy_tensor = IsPluggableDevice(c) + ? 
&CopyTensorPluggableDevice + : &CopyTensor; + + for (int i = 0; i < t.shape().dim_size(0); ++i) { + Tensor tmp = t.Slice(i, i + 1); + TensorShape tmp_shape = tmp.shape(); + tmp_shape.RemoveDim(0); + OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape), + errors::Unknown("Unexpected shape error.")); + // TODO(apassos) maybe not always align; but weird compiler bugs seem to + // prevent this. + Tensor aligned; + OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned)); + copy_tensor(c, tmp, aligned); + output_list.tensors().push_back(aligned); + } + output_tensor->scalar()() = std::move(output_list); + } +}; + +// Scatters values in `value` into `list`. Assumes that `indices` are valid. +template +absl::Status Scatter(OpKernelContext* c, const Tensor& value, + const Tensor& indices, TensorList* list) { + const auto copy_tensor = IsPluggableDevice(c) ? &CopyTensorPluggableDevice + : &CopyTensor; + for (int index = 0; index < indices.NumElements(); ++index) { + const int i = indices.flat()(index); + Tensor tmp = value.Slice(index, index + 1); + TensorShape tmp_shape = tmp.shape(); + tmp_shape.RemoveDim(0); + if (!tmp.CopyFrom(tmp, tmp_shape)) { + return errors::Unknown("Unexpected shape error."); + } + // TODO(apassos) maybe not always align; but weird compiler bugs seem to + // prevent this. + Tensor aligned; + TF_RETURN_IF_ERROR(c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned)); + // TODO(apassos) do all slices in a single kernel invocation instead of + // many small ones. + copy_tensor(c, tmp, aligned); + std::swap(list->tensors()[i], aligned); + } + return absl::OkStatus(); +} + +template +class TensorListScatterIntoExistingList : public OpKernel { + public: + TensorListScatterIntoExistingList(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + const TensorList* l = nullptr; + OP_REQUIRES_OK(c, GetInputList(c, 0, &l)); + const Tensor& input_tensor = c->input(1); + const Tensor& indices = c->input(2); + + // Check that inputs are valid. + OP_REQUIRES(c, input_tensor.dtype() == l->element_dtype, + errors::InvalidArgument( + "Invalid data types; input tensor type: ", + DataTypeString(input_tensor.dtype()), + " list element_type: ", DataTypeString(l->element_dtype))); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()), + errors::InvalidArgument( + "Tensor must be at least a vector, but saw shape: ", + input_tensor.shape().DebugString())); + OP_REQUIRES(c, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument( + "Expected indices to be a vector, but received shape: ", + indices.shape().DebugString())); + OP_REQUIRES( + c, indices.NumElements() == input_tensor.shape().dim_size(0), + errors::InvalidArgument( + "Expected len(indices) == tensor.shape[0], but saw: ", + indices.NumElements(), " vs. ", input_tensor.shape().dim_size(0))); + + // Resize the list if needed to accommodate all indices. + TensorList* output_list = nullptr; + OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); + const auto indices_vec = indices.vec(); + int32_t max_index = + (indices.NumElements() == 0) + ? -1 + : *std::max_element(indices_vec.data(), + indices_vec.data() + indices.NumElements()); + if (max_index + 1 > output_list->tensors().size()) { + output_list->tensors().resize(max_index + 1); + } + + // Scatter the values. 
+ OP_REQUIRES_OK(c, + Scatter(c, input_tensor, indices, output_list)); + } +}; + +template +class TensorListScatter : public OpKernel { + public: + TensorListScatter(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + Tensor* output_tensor; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr)); + Tensor indices = c->input(1); + PartialTensorShape element_shape; + OP_REQUIRES( + c, !TensorShapeUtils::IsMatrixOrHigher(c->input(2).shape()), + errors::InvalidArgument( + "TensorListScatter: element_shape must be at most rank 1 but has ", + "the shape of ", c->input(2).shape().DebugString())); + OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape)); + // TensorListScatterV2 passes the num_elements input, TensorListScatter does + // not. + int num_elements = -1; + if (c->num_inputs() >= 4) { + OP_REQUIRES(c, TensorShapeUtils::IsScalar(c->input(3).shape()), + errors::InvalidArgument("num_elements must be a scalar")); + num_elements = c->input(3).scalar()(); + } + OP_REQUIRES(c, num_elements >= -1, + errors::InvalidArgument( + "TensorListScatter expects num_elements >= -1, found: ", + num_elements)); + TensorList output_list; + const Tensor& input_tensor = c->input(0); + output_list.element_dtype = input_tensor.dtype(); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()), + errors::InvalidArgument( + "Tensor must be at least a vector, but saw shape: ", + input_tensor.shape().DebugString())); + TensorShape output_shape(input_tensor.shape()); + output_shape.RemoveDim(0); + OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape), + errors::InvalidArgument( + "Specified a list with shape ", element_shape.DebugString(), + " from a tensor with shape ", output_shape.DebugString())); + output_list.element_shape = element_shape; + + OP_REQUIRES(c, indices.NumElements() == input_tensor.shape().dim_size(0), + errors::InvalidArgument( + "Invalid number of rows in input tensor. Expected: ", + indices.NumElements(), + " Actual: ", input_tensor.shape().dim_size(0))); + + // Validate indices and resize output_list.tensors to fit the highest index. 
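A condensed sketch of the rule the following block enforces: indices must be non-negative, must stay below num_elements when that is fixed, and the list grows to cover both the highest index and num_elements, with the new slots left uninitialized (DT_INVALID tensors in the kernel). ValidatedListSize is an illustrative name, not part of the op.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::size_t ValidatedListSize(const std::vector<int32_t>& indices,
                              int64_t num_elements) {  // -1 means "unbounded"
  int64_t highest = -1;
  for (int32_t i : indices) {
    if (i < 0) throw std::invalid_argument("indices must be non-negative");
    if (num_elements != -1 && i >= num_elements)
      throw std::invalid_argument("index exceeds num_elements");
    highest = std::max<int64_t>(highest, i);
  }
  const int64_t required = (num_elements == -1) ? 0 : num_elements;
  return static_cast<std::size_t>(std::max(highest + 1, required));
}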
+ { + int highest_index = -1; + for (int index = 0; index < indices.NumElements(); ++index) { + const int i = indices.flat()(index); + OP_REQUIRES( + c, i >= 0, + errors::InvalidArgument( + "Indices in TensorListScatter must all be non-negative.")); + OP_REQUIRES(c, num_elements == -1 || i < num_elements, + errors::InvalidArgument( + "TensorListScatter: Trying to scatter at index ", i, + " in list with size ", num_elements)); + if (i > highest_index) { + highest_index = i; + } + } + output_list.tensors().resize(std::max(highest_index + 1, num_elements), + Tensor(DT_INVALID)); + } + + OP_REQUIRES_OK(c, + Scatter(c, input_tensor, indices, &output_list)); + output_tensor->scalar()() = std::move(output_list); + } +}; + +template +absl::Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a, + const TensorList& b, TensorList* out) { + return TensorListBinaryAdd(c, a, b, out, BinaryAddTensors); +} + +template +absl::Status TensorListZerosLike(OpKernelContext* c, const TensorList& x, + TensorList* y) { + return TensorListZerosLike(c, x, y, ZerosLikeTensor); +} + +template +class TensorListPushBackBatch : public OpKernel { + public: + explicit TensorListPushBackBatch(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_)); + } + + void Compute(OpKernelContext* c) override { + const Tensor& input = c->input(1); + OP_REQUIRES(c, element_dtype_ == input.dtype(), + errors::InvalidArgument("Invalid data types; list elements ", + DataTypeString(element_dtype_), + " but tried to append ", + DataTypeString(input.dtype()))); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input.shape()), + errors::InvalidArgument( + "Expected tensor to be at least a vector, but saw shape: ", + input.shape().DebugString())); + + const TensorShape& tls_shape = c->input(0).shape(); + + // For purposes of input forwarding, we want the least restrictive + // AllocatorAttributes possible. If we need to allocate later, + // we'll request the DT_VARIANT be allocated on host. + AllocatorAttributes attr; + + std::unique_ptr tls_alias = c->forward_input( + 0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tls_shape, + DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr); + + bool ok_to_alias = tls_alias != nullptr; + if (tls_alias && tls_alias->dtype() == DT_VARIANT && + tls_alias->NumElements() > 0) { + auto alias_t = tls_alias->flat(); + for (int i = 0; i < tls_alias->NumElements(); ++i) { + TensorList* tl_i = alias_t(i).get(); + if (tl_i == nullptr || !tl_i->RefCountIsOne()) { + ok_to_alias = false; + break; + } + } + } + const Tensor& tls = ok_to_alias ? *tls_alias : c->input(0); + + OP_REQUIRES(c, tls.dtype() == DT_VARIANT, + errors::InvalidArgument( + "Expected input_handles dtype to be Variant, but saw: ", + DataTypeString(tls.dtype()))); + OP_REQUIRES(c, TensorShapeUtils::IsVector(tls_shape), + errors::InvalidArgument( + "Expected input_handles to be a vector, but saw shape: ", + tls_shape.DebugString())); + const int64_t batch_size = tls.NumElements(); + OP_REQUIRES(c, input.dim_size(0) == batch_size, + errors::InvalidArgument( + "Expected tensor.shape[0] == input_handles.size, but saw ", + input.dim_size(0), " vs. 
", batch_size)); + auto tls_t = tls.vec(); + + TensorShape input_element_shape = input.shape(); + input_element_shape.RemoveDim(0); + std::vector tl_batch; + for (int64_t b = 0; b < batch_size; ++b) { + const TensorList* l = tls_t(b).get(); + OP_REQUIRES(c, l != nullptr, + errors::InvalidArgument("Input handle at index ", b, + " is not a list. Saw: '", + tls_t(b).DebugString(), "'")); + OP_REQUIRES( + c, l->element_shape.IsCompatibleWith(input_element_shape), + errors::InvalidArgument( + "Tried to append a tensor with incompatible shape to a " + "list at index ", + b, ". Op element shape: ", input_element_shape.DebugString(), + " list shape: ", l->element_shape.DebugString())); + OP_REQUIRES(c, element_dtype_ == l->element_dtype, + errors::InvalidArgument( + "Invalid data type at index ", b, "; op elements ", + DataTypeString(element_dtype_), " but list elements ", + DataTypeString(l->element_dtype))); + tl_batch.push_back(l); + } + + Tensor* result; + + if (ok_to_alias) { + result = tls_alias.get(); + c->set_output(0, *result); + } else { + // DT_VARIANT tensors always allocated on host. + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK( + c, c->allocate_output(0, TensorShape{batch_size}, &result, attr)); + } + + if (batch_size == 0) { + return; + } + + auto input_t = input.flat_outer_dims(); + auto result_t = result->vec(); + + for (int64_t b = 0; b < batch_size; ++b) { + if (!ok_to_alias) { + result_t(b) = tl_batch[b]->Copy(); + } + TensorList* output = result_t(b).get(); + DCHECK(output != nullptr); + Tensor frame; + OP_REQUIRES_OK( + c, c->allocate_temp(element_dtype_, input_element_shape, &frame)); + if (input_element_shape.num_elements() > 0) { + auto frame_t = frame.flat(); + // TODO(penporn): Get this if out of the batch loop. + if (IsPluggableDevice(c)) { + // The chip method need Eigen Device, so need to use Tensor.Slice + // instead of chip for pluggable device. The input should be reshaped + // to 2-D and so can be sliced by batch dim. + auto input_t_shape = + TensorShape({input_t.dimension(0), input_t.dimension(1)}); + auto input_reshaped = Tensor(); + OP_REQUIRES(c, input_reshaped.CopyFrom(input, input_t_shape), + errors::Unknown("Unexpected shape error.")); + + auto input_batch = input_reshaped.Slice(b, b + 1); + CopyTensorPluggableDevice(c, input_batch, frame); + } else { + frame_t.device(c->eigen_device()) = + input_t.template chip<0>(b); + } + } + output->tensors().push_back(std::move(frame)); + } + } + + private: + DataType element_dtype_; +}; + +} // namespace tensorflow + +#undef PLUGGABLE_DEVICE_SUPPORTED +#endif // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/logging_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/logging_ops.h new file mode 100644 index 00000000..5cb12139 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/logging_ops.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class AssertOp : public OpKernel { + public: + explicit AssertOp(OpKernelConstruction* c); + void Compute(OpKernelContext* ctx) override; + + private: + int32 summarize_ = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/logistic-loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/logistic-loss.h new file mode 100644 index 00000000..d848a1f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/logistic-loss.h @@ -0,0 +1,134 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_ + +#include + +#include "tensorflow/core/kernels/loss.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +class LogisticLossUpdater : public DualLossUpdater { + public: + // Adding vs. Averaging in Distributed Primal-Dual Optimization. + // Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter + // Richtarik, Martin Takac http://arxiv.org/abs/1502.03508 + double ComputeUpdatedDual(const int num_loss_partitions, const double label, + const double example_weight, + const double current_dual, const double wx, + const double weighted_example_norm) const final { + // Newton algorithm converges quadratically so 10 steps will be largely + // enough to achieve a very good precision + static const int newton_total_steps = 10; + double x = 0; + for (int i = 0; i < newton_total_steps; ++i) { + x = NewtonStep(x, num_loss_partitions, label, wx, example_weight, + weighted_example_norm, current_dual); + } + return 0.5 * (1 + tanh(x)) / label; + } + + // Dual of logistic loss function. + // https://en.wikipedia.org/wiki/Convex_conjugate + double ComputeDualLoss(const double current_dual, const double example_label, + const double example_weight) const final { + // Dual of the logistic loss function is + // ay * log(ay) + (1-ay) * log (1-ay), where a is the dual variable. + const double ay = current_dual * example_label; + const double log_ay = (ay > 0) ? log(ay) : 0; + const double one_minus_ay = 1 - ay; + const double log_one_minus_ay = (one_minus_ay > 0) ? log(one_minus_ay) : 0; + return ((ay * log_ay) + (one_minus_ay * log_one_minus_ay)) * example_weight; + } + + // Logistic loss for binary classification. 
+ // https://en.wikipedia.org/wiki/Loss_functions_for_classification + double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const final { + // Logistic loss: + // log(1 + e^(-ywx)) + // log(e^0 + e^(-ywx)) + // a + log(e^(0-a) + e^(-ywx - a)), where a is max(0, -ywx) + // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ + const double y_wx = example_label * wx; + if (y_wx > 0) { + // 0 + log(e^(0) + e^(-ywx - 0)) + // log(1 + e^(-ywx)) + return log1p(exp(-y_wx)) * example_weight; + } + // -ywx + log(e^(ywx) + e^(-ywx + ywx)) + // log(e^(ywx) + e^(0)) - ywx + // log(1 + e^(ywx)) - ywx + return (log1p(exp(y_wx)) - y_wx) * example_weight; + } + + // Derivative of logistic loss + double PrimalLossDerivative(const double wx, const double label, + const double example_weight) const final { + double inverse_exp_term = 0; + if (label * wx > 0) { + inverse_exp_term = exp(-label * wx) / (1 + exp(-label * wx)); + } else { + inverse_exp_term = 1 / (1 + exp(label * wx)); + } + return -inverse_exp_term * label * example_weight; + } + + // The smoothness constant is 4 since the derivative of logistic loss, which + // is exp(-x) / (1 + exp(-x)) can be shown to 0.25-Lipschitz (its derivative + // is bounded by 0.25) + double SmoothnessConstant() const final { return 4; } + + // Converts binary example labels from 0.0 or 1.0 to -1.0 or 1.0 respectively + // as expected by logistic regression. + absl::Status ConvertLabel(float* const example_label) const final { + if (*example_label == 0.0) { + *example_label = -1; + return absl::OkStatus(); + } + if (*example_label == 1.0) { + return absl::OkStatus(); + } + return errors::InvalidArgument( + "Only labels of 0.0 or 1.0 are supported right now. " + "Found example with label: ", + *example_label); + } + + private: + // We use Newton algorithm on a modified function (see readme.md). + double NewtonStep(const double x, const int num_loss_partitions, + const double label, const double wx, + const double example_weight, + const double weighted_example_norm, + const double current_dual) const { + const double tanhx = tanh(x); + const double numerator = -2 * label * x - wx - + num_loss_partitions * weighted_example_norm * + example_weight * + (0.5 * (1 + tanhx) / label - current_dual); + const double denominator = + -2 * label - num_loss_partitions * weighted_example_norm * + example_weight * (1 - tanhx * tanhx) * 0.5 / label; + return x - numerator / denominator; + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_init_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_init_op.h new file mode 100644 index 00000000..e94db921 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_init_op.h @@ -0,0 +1,34 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_ + +#include "tensorflow/core/kernels/initializable_lookup_table.h" + +namespace tensorflow { +namespace lookup { + +// Helper function to initialize an InitializableLookupTable from a text file. +absl::Status InitializeTableFromTextFile(const string& filename, + int64_t vocab_size, char delimiter, + int32_t key_index, int32_t value_index, + Env* env, + InitializableLookupTable* table); + +} // namespace lookup +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_op.h new file mode 100644 index 00000000..daa7f6e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_table_op.h @@ -0,0 +1,352 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/kernels/lookup_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// Lookup table op that supports different table implementations specified by +// the 'Container' template. Container must be derived from LookupInterface. The +// key and value are of the templated type "key_dtype" and "value_dtype" +// respectively. +template +class LookupTableOp : public OpKernel { + public: + // ctx is not owned by this class. + explicit LookupTableOp(OpKernelConstruction* ctx) + : OpKernel(ctx), table_set_(false) { + if (ctx->output_type(0) == DT_RESOURCE) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(tensorflow::DT_RESOURCE, + tensorflow::TensorShape({}), &table_)); + } else { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(tensorflow::DT_STRING, + tensorflow::TensorShape({2}), &table_)); + } + OP_REQUIRES_OK( + ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_)); + } + + // ctx is not owned by this function. 
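For orientation: this header only declares the templated kernel; upstream TensorFlow instantiates it through kernel-registration macros in lookup_table_op.cc, which is not part of this diff. A minimal sketch of that pattern is below; the attribute names and dtypes are illustrative, not copied from upstream.

// Illustrative only: wiring a concrete CPU HashTable kernel to the HashTableV2
// op. Container is the table implementation, followed by the key and value
// dtypes described in the class comment above.
REGISTER_KERNEL_BUILDER(
    Name("HashTableV2")
        .Device(DEVICE_CPU)
        .TypeConstraint<int64_t>("key_dtype")
        .TypeConstraint<int64_t>("value_dtype"),
    LookupTableOp<lookup::HashTable<int64_t, int64_t>, int64_t, int64_t>);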
+ void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + + if (!table_set_) { + OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(), + use_node_name_sharing_)); + } + + auto creator = + [ctx, this](lookup::LookupInterface** ret) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + lookup::LookupInterface* container = new Container(ctx, this); + if (!ctx->status().ok()) { + container->Unref(); + return ctx->status(); + } + if (ctx->track_allocations()) { + ctx->record_persistent_memory_allocation( + container->MemoryUsed() + table_.AllocatedBytes()); + } + *ret = container; + return absl::OkStatus(); + }; + + lookup::LookupInterface* table = nullptr; + OP_REQUIRES_OK(ctx, + cinfo_.resource_manager() + ->template LookupOrCreate( + cinfo_.container(), cinfo_.name(), &table, creator)); + core::ScopedUnref unref_me(table); + + OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes( + *table, DataTypeToEnum::v(), + DataTypeToEnum::v(), cinfo_.name())); + + if (ctx->expected_output_dtype(0) == DT_RESOURCE) { + if (!table_set_) { + auto h = table_.template scalar(); + h() = MakeResourceHandle( + ctx, cinfo_.container(), cinfo_.name()); + } + ctx->set_output(0, table_); + } else { + if (!table_set_) { + auto h = table_.template flat(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + } + ctx->set_output_ref(0, &mu_, &table_); + } + table_set_ = true; + } + + ~LookupTableOp() override { + // If the table object was not shared, delete it. + if (table_set_ && cinfo_.resource_is_private_to_kernel()) { + if (!cinfo_.resource_manager() + ->template Delete(cinfo_.container(), + cinfo_.name()) + .ok()) { + // Do nothing; the resource can have been deleted by session resets. + } + } + } + + private: + mutex mu_; + Tensor table_ TF_GUARDED_BY(mu_); + bool table_set_ TF_GUARDED_BY(mu_); + ContainerInfo cinfo_; + bool use_node_name_sharing_; + + LookupTableOp(const LookupTableOp&) = delete; + void operator=(const LookupTableOp&) = delete; +}; + +// An anonymous version of LookupTableOp, which creates a new table resource +// everytime `Compute` is called. The resource can only be accessed by the +// returned resource handle (e.g. it can't be looked up by a name in a resource +// manager). The resource will be automatically deleted when all resource +// handles pointing to it are gone. +template +class AnonymousLookupTableOp : public OpKernel { + public: + explicit AnonymousLookupTableOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table = new Container(ctx, this); + if (!ctx->status().ok()) { + table->Unref(); + return; + } + Tensor table_tensor; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(tensorflow::DT_RESOURCE, + tensorflow::TensorShape({}), &table_tensor)); + if (ctx->track_allocations()) { + ctx->record_persistent_memory_allocation(table->MemoryUsed() + + table_tensor.AllocatedBytes()); + } + table_tensor.scalar()() = + ResourceHandle::MakeRefCountingHandle( + table, ctx->device()->name()); + ctx->set_output(0, table_tensor); + } + + private: + AnonymousLookupTableOp(const AnonymousLookupTableOp&) = delete; + void operator=(const AnonymousLookupTableOp&) = delete; +}; + +namespace lookup { + +// Ensure that the compiler cannot elide a copy into a local, for +// bounds checking on source tensors that might be updated asynchronously for +// integral types. However non-integer variables are not allowed and therefore +// the local copy is unnecessary. 
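The comment above is terse about why the forced copy matters for integral types: the bounds check and the later use must observe the same value even if another thread rewrites the source tensor in between. Below is a self-contained toy illustration of that check-then-use discipline; a plain std::atomic stands in for the tensor storage, and all names are illustrative rather than upstream code.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Shared storage that another thread may update at any time.
std::atomic<int64_t> shared_key{3};
constexpr int64_t kNumBuckets = 8;

int64_t LookupSlot() {
  // Take one snapshot, in the spirit of SubtleMustCopyIfIntegral: the bounds
  // check and the use below both see this same local value.
  const int64_t key = shared_key.load();
  if (key < 0 || key >= kNumBuckets) return -1;
  return key;  // guaranteed to be the value that passed the check
}

int main() { std::printf("%lld\n", static_cast<long long>(LookupSlot())); }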
+template +T SubtleMustCopyIfIntegral(const T& value) { + return internal::SubtleMustCopy(value); +} + +inline const tstring& SubtleMustCopyIfIntegral(const tstring& value) { + return value; +} + +inline const float SubtleMustCopyIfIntegral(const float value) { return value; } + +inline const double SubtleMustCopyIfIntegral(const double value) { + return value; +} + +inline const Variant& SubtleMustCopyIfIntegral(const Variant& value) { + return value; +} + +inline const ResourceHandle& SubtleMustCopyIfIntegral( + const ResourceHandle& value) { + return value; +} + +// Returns a unique node name starting with "base". +std::string UniqueNodeName(const std::string& base); + +// Lookup table that wraps an flat_hash_map, where the key and value data type +// is specified. +// +// This table is recommended for any variations to key values. +// +// For look up, the table is required to be initialized (allocated +// and populated). Once the table is marked as initialized it becomes read-only. +// +// Sample use case: +// +// HashTable table; // int64 -> int64. +// table.Initialize(...); +// table.Find(in_t, &out_t, default_t) +// +template +class HashTable : public InitializableLookupTable { + public: + HashTable(OpKernelContext* ctx, OpKernel* kernel) {} + + absl::Status AsGraphDef(GraphDefBuilder* builder, Node** out) const override { + // We set use_node_name_sharing with a unique node name so that the resource + // can outlive the HashTableV2 kernel. This means that the lifetime of the + // HashTable resource will be tied to the lifetime of the resource manager + // it is created in. + // TODO(b/181695913): Provide a mechanism for deleting this resource + // earlier when appropriate. + Node* hash_table_node = ops::SourceOp( + "HashTableV2", builder->opts() + .WithName(UniqueNodeName("HashTableFromGraphDef")) + .WithAttr("key_dtype", key_dtype()) + .WithAttr("value_dtype", value_dtype()) + .WithAttr("use_node_name_sharing", true)); + if (table_.empty()) { + *out = hash_table_node; + return absl::OkStatus(); + } + + if (initializer_serializer_ == nullptr) { + std::string message = + "Failed to serialize lookup table: no initialization function was " + "specified. 
Falling back to serializing a handle to the table."; + LOG(WARNING) << message; + return errors::Unimplemented(message); + } + Node* initializer; + TF_RETURN_IF_ERROR(initializer_serializer_->AsGraphDef( + builder, hash_table_node, &initializer)); + *out = ops::UnaryOp("Identity", hash_table_node, + builder->opts().WithControlInput(initializer)); + return absl::OkStatus(); + } + + size_t size() const override { + if (!is_initialized()) + return 0; + else + return table_.size(); + } + + absl::Status ExportValues(OpKernelContext* context) override { + if (!is_initialized()) { + return errors::Aborted("HashTable is not initialized."); + } + + const int64_t size = table_.size(); + + Tensor* keys; + Tensor* values; + TF_RETURN_IF_ERROR( + context->allocate_output("keys", TensorShape({size}), &keys)); + TF_RETURN_IF_ERROR( + context->allocate_output("values", TensorShape({size}), &values)); + + auto keys_data = keys->flat(); + auto values_data = values->flat(); + int64_t i = 0; + for (auto it = table_.begin(); it != table_.end(); ++it, ++i) { + keys_data(i) = it->first; + values_data(i) = it->second; + } + return absl::OkStatus(); + } + + DataType key_dtype() const override { return DataTypeToEnum::v(); } + + DataType value_dtype() const override { return DataTypeToEnum::v(); } + + protected: + absl::Status DoPrepare(size_t size) override { + if (is_initialized()) { + return errors::Aborted("HashTable already initialized."); + } + if (size > 0) { + table_.reserve(size); + } + return absl::OkStatus(); + }; + + absl::Status DoLazyPrepare(std::function size_fn) override { + return DoPrepare(size_fn()); + } + + absl::Status DoInsert(const Tensor& keys, const Tensor& values) override { + const auto key_values = keys.flat(); + const auto value_values = values.flat(); + for (int64_t i = 0; i < key_values.size(); ++i) { + auto&& key = SubtleMustCopyIfIntegral(key_values(i)); + auto&& value = SubtleMustCopyIfIntegral(value_values(i)); + auto result = table_.try_emplace(key, value); + if (!result.second && result.first->second != value) { + return errors::FailedPrecondition( + "HashTable has different value for same key. Key ", key, " has ", + result.first->second, " and trying to add value ", value); + } + } + return absl::OkStatus(); + } + + absl::Status DoFind(const Tensor& key, Tensor* value, + const Tensor& default_value) override { + const V default_val = default_value.flat()(0); + const auto key_values = key.flat(); + auto value_values = value->flat(); + + for (int64_t i = 0; i < key_values.size(); ++i) { + value_values(i) = gtl::FindWithDefault( + table_, SubtleMustCopyIfIntegral(key_values(i)), default_val); + } + return absl::OkStatus(); + } + + int64_t MemoryUsed() const override { + if (!is_initialized()) { + return 0; + } + const int64_t num_elements = table_.size(); + return num_elements * (sizeof(K) + sizeof(V)); + } + + private: + absl::flat_hash_map table_; +}; + +} // namespace lookup + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_util.h new file mode 100644 index 00000000..677c6a56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/lookup_util.h @@ -0,0 +1,76 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/initializable_lookup_table.h" + +namespace tensorflow { +namespace data { +class DatasetBase; +} // namespace data +} // namespace tensorflow + +namespace tensorflow { +namespace lookup { + +// Gets the LookupTable stored in the ctx->resource_manager() with key +// passed by attribute with name input_name, returns null if the table +// doesn't exist. Use GetResourceLookupTable() or GetReferenceLookupTable() if +// the input dtype is known. +absl::Status GetLookupTable(absl::string_view input_name, OpKernelContext* ctx, + LookupInterface** table); +absl::Status GetResourceLookupTable(absl::string_view input_name, + OpKernelContext* ctx, + LookupInterface** table); +absl::Status GetReferenceLookupTable(absl::string_view input_name, + OpKernelContext* ctx, + LookupInterface** table); + +// Gets the InitializableLookupTable stored in the +// ctx->resource_manager() with key passed by attribute with name +// input_name, returns null if the table doesn't exist. +absl::Status GetInitializableLookupTable(absl::string_view input_name, + OpKernelContext* ctx, + InitializableLookupTable** table); + +// Verify that the given key_dtype and value_dtype matches the corresponding +// table's data types. +absl::Status CheckTableDataTypes(const LookupInterface& table, + DataType key_dtype, DataType value_dtype, + const string& table_name); + +// Initializes `table` from `filename`. +absl::Status InitializeTableFromTextFile(const string& filename, + int64_t vocab_size, char delimiter, + int32_t key_index, int32_t value_index, + int64_t offset, Env* env, + InitializableLookupTable* table); + +// Initializes `table` from `filename`. `func` may specify how to represent the +// initializer as a graphdef, so that the table can be serialized as metadata. +absl::Status InitializeTableFromTextFile( + const string& filename, int64_t vocab_size, char delimiter, + int32_t key_index, int32_t value_index, int64_t offset, Env* env, + std::unique_ptr serializer, + InitializableLookupTable* table); + +} // namespace lookup +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/loss.h new file mode 100644 index 00000000..85893ba8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/loss.h @@ -0,0 +1,59 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_LOSS_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class DualLossUpdater { + public: + virtual ~DualLossUpdater() {} + + // Compute update dual (alpha), based on a single example. Various strategies + // can be employed here, like newton step and/or line search or approximate + // step that decreases the dual sub-optimality. + virtual double ComputeUpdatedDual( + const int num_loss_partitions, const double label, + const double example_weight, const double current_dual, const double wx, + const double weighted_example_norm) const = 0; + + // Compute dual loss based on the current dual (alpha), example label (y) + // and example weight (cost). + virtual double ComputeDualLoss(const double current_dual, + const double example_label, + const double example_weight) const = 0; + + // Compute the primal loss based on current estimate of log-odds(wx), + // example label (y) and example weight (cost). + virtual double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const = 0; + + // Primal loss derivative used to compute the dual residue in AdaSDCA + virtual double PrimalLossDerivative(const double wx, + const double example_label, + const double example_weight) const = 0; + + // This is gamma such that the loss derivative is 1/gamma Lipschitz + virtual double SmoothnessConstant() const = 0; + + // Converts binary example labels from 0.0 or 1.0 to appropriate range for + // each loss function. + virtual absl::Status ConvertLabel(float* const example_label) const = 0; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/map_kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/map_kernels.h new file mode 100644 index 00000000..6949ff55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/map_kernels.h @@ -0,0 +1,255 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_MAP_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_MAP_KERNELS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/tensor_map.h" +#include "tensorflow/core/util/batch_util.h" +#include "tensorflow/core/util/tensor_ops_util.h" + +namespace tensorflow { + +inline absl::Status GetInputMap(OpKernelContext* ctx, int index, + const TensorMap** ret_map) { + if (!TensorShapeUtils::IsScalar(ctx->input(index).shape())) { + return errors::InvalidArgument("Input map must be a scalar. Saw: ", + ctx->input(index).shape().DebugString()); + } + const TensorMap* map = ctx->input(index).scalar()().get(); + if (map == nullptr) { + return errors::InvalidArgument( + "Input handle is not a map. Saw: '", + ctx->input(index).scalar()().DebugString(), "'"); + } + *ret_map = map; + return absl::OkStatus(); +} + +// TODO(kattian): change into templated function +inline absl::Status ForwardInputOrCreateNewMap(OpKernelContext* ctx, + int32_t input_index, + int32_t output_index, + const TensorMap& input_map, + TensorMap** output_map) { + // Attempt to forward the input tensor to the output if possible. + std::unique_ptr maybe_output = ctx->forward_input( + input_index, output_index, DT_VARIANT, TensorShape{}, + ctx->input_memory_type(input_index), AllocatorAttributes()); + Tensor* output_tensor; + if (maybe_output != nullptr && maybe_output->dtype() == DT_VARIANT && + maybe_output->NumElements() == 1) { + output_tensor = maybe_output.get(); + TensorMap* tmp_out = output_tensor->scalar()().get(); + if (tmp_out == nullptr) { + return errors::InvalidArgument( + "Expected input ", input_index, " to be a TensorMap but saw ", + output_tensor->scalar()().TypeName()); + } + if (tmp_out->RefCountIsOne()) { + // Woohoo, forwarding succeeded! + ctx->set_output(output_index, *output_tensor); + *output_map = tmp_out; + return absl::OkStatus(); + } + } + + // If forwarding is not possible allocate a new output tensor and copy + // the `input_map` to it. 
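The forwarding logic above is a copy-on-write decision keyed on the variant's reference count: mutate in place only when this kernel holds the sole reference, otherwise fall through to the copy below. A stripped-down, self-contained sketch of the same decision follows; a plain shared_ptr stands in for the ref-counted TensorMap and is illustrative only.

#include <map>
#include <memory>
#include <string>

using Map = std::map<std::string, int>;

// Mirrors the RefCountIsOne() fast path: reuse the existing object when we are
// the only owner, clone it before writing otherwise.
std::shared_ptr<Map> PrepareForWrite(const std::shared_ptr<Map>& in) {
  if (in.use_count() == 1) return in;   // sole owner: safe to mutate in place
  return std::make_shared<Map>(*in);    // shared: deep-copy first
}

int main() {
  auto a = std::make_shared<Map>();
  auto same = PrepareForWrite(a);  // a was the only owner, so this is a itself
  auto copy = PrepareForWrite(a);  // a and `same` now share it, so a real copy
  (*copy)["k"] = 1;                // leaves a / same untouched
  return 0;
}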
+ AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR( + ctx->allocate_output(output_index, {}, &output_tensor, attr)); + output_tensor->scalar()() = input_map.Copy(); + + *output_map = output_tensor->scalar()().get(); + return absl::OkStatus(); +} + +class EmptyTensorMap : public OpKernel { + public: + explicit EmptyTensorMap(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* result; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr)); + TensorMap empty; + result->scalar()() = std::move(empty); + } +}; + +class TensorMapSize : public OpKernel { + public: + explicit TensorMapSize(OpKernelConstruction* ctx) : OpKernel(ctx) {} + ~TensorMapSize() override {} + + void Compute(OpKernelContext* ctx) override { + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + Tensor* result; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result)); + result->scalar()() = map->tensors().size(); + } +}; + +class TensorMapLookup : public OpKernel { + public: + explicit TensorMapLookup(OpKernelConstruction* ctx) : OpKernel(ctx) {} + ~TensorMapLookup() override {} + + void Compute(OpKernelContext* ctx) override { + const TensorKey& key = ctx->input(1); + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + + OP_REQUIRES( + ctx, map->tensors().find(key) != map->tensors().end(), + errors::InvalidArgument("Trying to lookup non-existent key. Could not " + "find key \"" + + key.SummarizeValue(100) + "\".")); + + ctx->set_output(0, map->tensors().find(key)->second); + } +}; + +class TensorMapInsert : public OpKernel { + public: + explicit TensorMapInsert(OpKernelConstruction* ctx) : OpKernel(ctx) {} + ~TensorMapInsert() override {} + + void Compute(OpKernelContext* ctx) override { + const TensorKey& key = ctx->input(1); + const Tensor& value = ctx->input(2); + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + + TensorMap* output_map = nullptr; + OP_REQUIRES_OK(ctx, + ForwardInputOrCreateNewMap(ctx, 0, 0, *map, &output_map)); + output_map->replace(key, value); + } +}; + +class TensorMapErase : public OpKernel { + public: + explicit TensorMapErase(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const TensorKey& key = ctx->input(1); + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + + OP_REQUIRES( + ctx, map->tensors().find(key) != map->tensors().end(), + errors::InvalidArgument("Trying to erase non-existent item. 
Could not " + "find key \"" + + key.SummarizeValue(100) + "\".")); + + TensorMap* output_map = nullptr; + OP_REQUIRES_OK(ctx, + ForwardInputOrCreateNewMap(ctx, 0, 0, *map, &output_map)); + output_map->tensors().erase(key); + } +}; + +class TensorMapHasKey : public OpKernel { + public: + explicit TensorMapHasKey(OpKernelConstruction* ctx) : OpKernel(ctx) {} + ~TensorMapHasKey() override {} + + void Compute(OpKernelContext* ctx) override { + const TensorKey& key = ctx->input(1); + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + Tensor* result; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result)); + result->scalar()() = map->tensors().find(key) != map->tensors().end(); + } +}; + +class TensorMapStackKeys : public OpKernel { + public: + explicit TensorMapStackKeys(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("key_dtype", &key_dtype_)); + } + ~TensorMapStackKeys() override {} + + void Compute(OpKernelContext* ctx) override { + const TensorMap* map = nullptr; + OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); + + OP_REQUIRES(ctx, map->size() != 0, + errors::InvalidArgument( + "TensorMapStackKeys cannot be called on empty map.")); + + auto it = map->tensors().begin(); + TensorShape output_shape = it->first.shape(); + output_shape.InsertDim(0, map->tensors().size()); + Tensor* result; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &result)); + + int i = 0; + size_t sz = map->tensors().size(); + TensorShape key_shape = it->first.shape(); + while (it != map->tensors().end() && i < sz) { + OP_REQUIRES( + ctx, it->first.dtype() == key_dtype_, + errors::InvalidArgument("Key does not match requested dtype.")); + OP_REQUIRES( + ctx, it->first.shape() == key_shape, + errors::InvalidArgument("Keys must all have the same shape.")); + OP_REQUIRES_OK(ctx, batch_util::CopyElementToSlice(it->first, result, i)); + i++; + it++; + } + } + + private: + DataType key_dtype_; +}; + +template +absl::Status TensorMapBinaryAdd(OpKernelContext* ctx, const TensorMap& a, + const TensorMap& b, TensorMap* out) { + // Binary add returns a map containing the union of keys. + // Values with keys in the intersection are added. + out->tensors() = a.tensors(); + for (const std::pair& p : b.tensors()) { + absl::flat_hash_map::iterator it = + out->tensors().find(p.first); + if (it != out->tensors().end()) { + Tensor out_tensor; + TF_RETURN_IF_ERROR( + BinaryAddTensors(ctx, p.second, it->second, &out_tensor)); + it->second = out_tensor; + } else { + out->tensors().emplace(p.first, p.second); + } + } + return absl::OkStatus(); +} + +template +absl::Status TensorMapZerosLike(OpKernelContext* ctx, const TensorMap& x, + TensorMap* y) { + // Zeros like returns an empty map. + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MAP_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op.h new file mode 100644 index 00000000..94a39794 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op.h @@ -0,0 +1,69 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/hash/hash.h" + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +namespace tensorflow { +namespace functor { + +// Helpers to define tensor needed by MatMul op. +template +struct MatMulTypes { + typedef Eigen::TensorMap, Eigen::Aligned> + out_type; + typedef Eigen::TensorMap, + Eigen::Aligned> + in_type; +}; + +template +void MatMul(const Device& d, Out out, In0 in0, In1 in1, + const DimPair& dim_pair) { + out.device(d) = in0.contract(in1, dim_pair); +} + +template +struct MatMulFunctor { + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. + void operator()( + const Device& d, typename MatMulTypes::out_type out, + typename MatMulTypes::in_type in0, + typename MatMulTypes::in_type in1, + const Eigen::array, 1>& dim_pair); +}; + +} // end namespace functor + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +typedef Eigen::GpuDevice GPUDevice; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op_impl.h new file mode 100644 index 00000000..50517dc9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_op_impl.h @@ -0,0 +1,1156 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc. 
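matmul_op.h above expresses MatMul as a rank-2 Eigen tensor contraction. For readers new to that API, here is a self-contained sketch (it assumes the Eigen unsupported Tensor module is on the include path, as the header itself requires) showing that contracting dimension 1 of the left operand against dimension 0 of the right one is an ordinary matrix product.

#include <iostream>

#include "unsupported/Eigen/CXX11/Tensor"

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 2);
  a.setValues({{1, 2, 3}, {4, 5, 6}});
  b.setValues({{1, 0}, {0, 1}, {1, 1}});
  // The same kind of dim_pair the MatMul functor receives: contract a's
  // second dimension with b's first one (no transposes).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<float, 2> c = a.contract(b, dims);
  std::cout << c << "\n";  // the 2x2 matrix product a * b
  return 0;
}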
+ +#ifndef TENSORFLOW_CORE_KERNELS_MATMUL_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_MATMUL_OP_IMPL_H_ + +#define EIGEN_USE_THREADS + +#include +#include +#include +#include +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/bfloat16.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/matmul_autotune.h" +#include "tensorflow/core/util/matmul_bcast.h" +#include "tensorflow/core/util/work_sharder.h" + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "xla/stream_executor/host_or_device_scalar.h" +#include "tensorflow/core/kernels/gpu_utils.h" +#include "tensorflow/core/kernels/matmul_util.h" +#include "tensorflow/core/kernels/numeric_options_utils.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" +#include "xla/stream_executor/cuda/cuda_blas_lt.h" +#endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +#include "rocm/rocm_config.h" +#if TF_HIPBLASLT +#include "xla/stream_executor/rocm/hip_blas_lt.h" +#endif +#endif + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace { + +// Returns the pair of dimensions along which to perform Tensor contraction to +// emulate matrix multiplication. +// For matrix multiplication of 2D Tensors X and Y, X is contracted along +// second dimension and Y is contracted along the first dimension (if neither X +// nor Y is adjointed). The dimension to contract along is switched when any +// operand is adjointed. +// See http://en.wikipedia.org/wiki/Tensor_contraction +inline Eigen::IndexPair ContractionDims(bool adj_x, + bool adj_y) { + return Eigen::IndexPair(adj_x ? 0 : 1, adj_y ? 1 : 0); +} + +// Parallel batch matmul kernel based on the multi-threaded tensor contraction +// in Eigen. +template +struct ParallelMatMulKernel { + static void Conjugate(const OpKernelContext* context, Tensor* out) { + const Eigen::ThreadPoolDevice d = context->eigen_cpu_device(); + auto z = out->tensor(); + z.device(d) = z.conjugate(); + } + + static void Run(const OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, + bool trans_y, const MatMulBCast& bcast, Tensor* out, + int batch_size) { + static_assert(IsComplex, "Complex type expected."); + auto Tx = in_x.tensor(); + auto Ty = in_y.tensor(); + auto Tz = out->tensor(); + // We use the identities + // conj(a) * conj(b) = conj(a * b) + // conj(a) * b = conj(a * conj(b)) + // to halve the number of cases. The final conjugation of the result is + // done at the end of LaunchBatchMatMul::Launch(). 
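Spelled out, the two identities the comment above relies on are plain complex-matrix algebra (a supplementary note, not upstream text), with the bar denoting elementwise conjugation:

\[
\overline{A}\,\overline{B} \;=\; \overline{A B},
\qquad
\overline{A}\,B \;=\; \overline{A\,\overline{B}},
\]

so when exactly one operand is adjointed only the right operand needs to be conjugated inside the contraction (the adj_x != adj_y branch below), and any remaining conjugation is applied once to the finished product by the Conjugate call at the end of LaunchBatchMatMul::Launch().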
+ Eigen::array, 1> contract_pairs; + contract_pairs[0] = ContractionDims(adj_x || trans_x, adj_y || trans_y); + const Eigen::ThreadPoolDevice d = context->eigen_cpu_device(); + + const bool should_bcast = bcast.IsBroadcastingRequired(); + const auto& x_batch_indices = bcast.x_batch_indices(); + const auto& y_batch_indices = bcast.y_batch_indices(); + // TODO(rmlarsen): Consider launching these contractions asynchronously. + for (int64_t i = 0; i < batch_size; ++i) { + const int64_t x_batch_index = should_bcast ? x_batch_indices[i] : i; + const int64_t y_batch_index = should_bcast ? y_batch_indices[i] : i; + + auto x = Tx.template chip<0>(x_batch_index); + auto z = Tz.template chip<0>(i); + if (adj_x != adj_y) { + auto y = Ty.template chip<0>(y_batch_index).conjugate(); + z.device(d) = x.contract(y, contract_pairs); + } else { + auto y = Ty.template chip<0>(y_batch_index); + z.device(d) = x.contract(y, contract_pairs); + } + } + } +}; + +// The Eigen contraction kernel used here is very large and slow to compile, +// so we partially specialize ParallelMatMulKernel for real types to avoid all +// but one of the instantiations. +template +struct ParallelMatMulKernel { + static void Conjugate(const OpKernelContext* context, Tensor* out) {} + + static void Run(const OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, + bool trans_y, const MatMulBCast& bcast, Tensor* out, + int batch_size) { + const bool should_bcast = bcast.IsBroadcastingRequired(); + const Eigen::ThreadPoolDevice d = context->eigen_cpu_device(); + Eigen::array, 1> contract_pairs; + contract_pairs[0] = ContractionDims(adj_x || trans_x, adj_y || trans_y); + if (batch_size == 1 && !should_bcast) { + auto Tx = in_x.flat_inner_dims(); + auto Ty = in_y.flat_inner_dims(); + auto Tz = out->flat_inner_dims(); + Tz.device(d) = Tx.contract(Ty, contract_pairs); + } else { + auto Tx = in_x.tensor(); + auto Ty = in_y.tensor(); + auto Tz = out->tensor(); + const auto& x_batch_indices = bcast.x_batch_indices(); + const auto& y_batch_indices = bcast.y_batch_indices(); + // TODO(rmlarsen): Consider launching these contractions asynchronously. + for (int64_t i = 0; i < batch_size; ++i) { + const int64_t x_batch_index = should_bcast ? x_batch_indices[i] : i; + const int64_t y_batch_index = should_bcast ? y_batch_indices[i] : i; + auto x = Tx.template chip<0>(x_batch_index); + auto y = Ty.template chip<0>(y_batch_index); + auto z = Tz.template chip<0>(i); + + z.device(d) = x.contract(y, contract_pairs); + } + } + } +}; + +// Basic y-combinator implementation. +template +struct YCombinatorImpl { + Func func; + template + decltype(auto) operator()(Args&&... args) const { + return func(*this, std::forward(args)...); + } +}; + +template +YCombinatorImpl> YCombinator(Func&& func) { + return YCombinatorImpl>{std::forward(func)}; +} + +// Sequential batch matmul kernel that calls the regular Eigen matmul. +// We prefer this over the tensor contraction because it performs +// better on vector-matrix and matrix-vector products. 
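The template heads of YCombinatorImpl and YCombinator above appear to have lost their parameter lists in this hunk (likely stripped angle-bracket text). The sketch below restores the conventional shape of this basic y-combinator and shows a toy use; treat it as a reconstruction under that assumption, not as the upstream source.

#include <cstdint>
#include <iostream>
#include <type_traits>
#include <utility>

// Reconstructed shape of the helper above: stores a callable and passes itself
// back in as the first argument, so a lambda can recurse without std::function.
template <typename Func>
struct YCombinatorImpl {
  Func func;
  template <typename... Args>
  decltype(auto) operator()(Args&&... args) const {
    return func(*this, std::forward<Args>(args)...);
  }
};

template <typename Func>
YCombinatorImpl<std::decay_t<Func>> YCombinator(Func&& func) {
  return YCombinatorImpl<std::decay_t<Func>>{std::forward<Func>(func)};
}

int main() {
  auto factorial = YCombinator([](auto& self, uint64_t n) -> uint64_t {
    return n <= 1 ? 1 : n * self(n - 1);
  });
  std::cout << factorial(5) << "\n";  // prints 120
  return 0;
}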
+template +struct SequentialMatMulKernel { + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + static ConstMatrixMap ConstTensorSliceToEigenMatrix(const Tensor& t, + int slice) { + return ConstMatrixMap( + t.flat().data() + slice * t.dim_size(1) * t.dim_size(2), + t.dim_size(1), t.dim_size(2)); + } + + static MatrixMap TensorSliceToEigenMatrix(Tensor* t, int slice) { + return MatrixMap( + t->flat().data() + slice * t->dim_size(1) * t->dim_size(2), + t->dim_size(1), t->dim_size(2)); + } + + static void Run(const Tensor& in_x, const Tensor& in_y, bool adj_x, + bool adj_y, bool trans_x, bool trans_y, + const MatMulBCast& bcast, Tensor* out, int start, int limit) { + const bool should_bcast = bcast.IsBroadcastingRequired(); + const auto& x_batch_indices = bcast.x_batch_indices(); + const auto& y_batch_indices = bcast.y_batch_indices(); + for (int64_t i = start; i < limit; ++i) { + const int64_t x_batch_index = should_bcast ? x_batch_indices[i] : i; + const int64_t y_batch_index = should_bcast ? y_batch_indices[i] : i; + auto x = ConstTensorSliceToEigenMatrix(in_x, x_batch_index); + auto y = ConstTensorSliceToEigenMatrix(in_y, y_batch_index); + auto z = TensorSliceToEigenMatrix(out, i); + // Assume at most one of adj_x or trans_x is true. Similarly, for adj_y + // and trans_y. + if (!adj_x && !trans_x) { + if (!adj_y && !trans_y) { + z.noalias() = x * y; + } else if (adj_y) { + z.noalias() = x * y.adjoint(); + } else { // trans_y == true + z.noalias() = x * y.transpose(); + } + } else if (adj_x) { + if (!adj_y && !trans_y) { + z.noalias() = x.adjoint() * y; + } else if (adj_y) { + z.noalias() = x.adjoint() * y.adjoint(); + } else { // trans_y == true + z.noalias() = x.adjoint() * y.transpose(); + } + } else { // trans_x == true + if (!adj_y && !trans_y) { + z.noalias() = x.transpose() * y; + } else if (adj_y) { + z.noalias() = x.transpose() * y.adjoint(); + } else { // trans_y == true + z.noalias() = x.transpose() * y.transpose(); + } + } + } + } +}; + +// For single-batch multiplications, manually parallize by splitting the output +// matrix. +template +struct SingleBatchParallelMatMulKernel { + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + static ConstMatrixMap ConstTensorToEigenMatrix(const Tensor& t) { + return ConstMatrixMap(t.flat().data(), t.dim_size(1), + t.dim_size(2)); + } + + static MatrixMap TensorToEigenMatrix(Tensor* t) { + return MatrixMap(t->flat().data(), t->dim_size(1), t->dim_size(2)); + } + + static void Run(const CPUDevice& device, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, + bool trans_y, Tensor* out) { + using Eigen::Index; + Eigen::ThreadPoolInterface* pool = device.getPool(); + + Index m = (trans_x || adj_x) ? in_x.dim_size(2) : in_x.dim_size(1); + Index k = (trans_x || adj_x) ? in_x.dim_size(1) : in_x.dim_size(2); + Index n = (trans_y || adj_y) ? in_y.dim_size(1) : in_y.dim_size(2); + + auto x_mat = ConstTensorToEigenMatrix(in_x); + auto y_mat = ConstTensorToEigenMatrix(in_y); + auto out_mat = TensorToEigenMatrix(out); + + // Computes a block of the output matrix. + auto compute_matmul_block = [&x_mat, &y_mat, &out_mat, adj_x, trans_x, + adj_y, trans_y](Index row, Index col, + Index nrows, Index ncols) { + auto z = out_mat.block(row, col, nrows, ncols); + + // Assume at most one of adj_x or trans_x is true. Similarly, for adj_y + // and trans_y. 
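The ConstMatrixMap/MatrixMap aliases at the top of SequentialMatMulKernel view slices of the input and output tensors as row-major Eigen matrices without copying. A self-contained sketch of that map-then-multiply pattern follows; buffers and shapes are illustrative.

#include <iostream>

#include "Eigen/Core"

int main() {
  using Matrix =
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  float a_buf[6] = {1, 2, 3, 4, 5, 6};  // 2x3, row-major
  float b_buf[6] = {1, 0, 0, 1, 1, 1};  // 3x2, row-major
  float c_buf[4] = {};                  // 2x2 output written in place
  Eigen::Map<const Matrix> a(a_buf, 2, 3);
  Eigen::Map<const Matrix> b(b_buf, 3, 2);
  Eigen::Map<Matrix> c(c_buf, 2, 2);
  c.noalias() = a * b;  // result lands directly in c_buf, no aliasing temporary
  std::cout << c << "\n";
  return 0;
}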
+ if (!adj_x && !trans_x) { + auto x = x_mat.middleRows(row, nrows); + if (!adj_y && !trans_y) { + auto y = y_mat.middleCols(col, ncols); + z = x * y; + } else if (adj_y) { + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x * y.adjoint(); + } else { // trans_y == true + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x * y.transpose(); + } + } else if (adj_x) { + auto x = x_mat.middleCols(row, nrows); + if (!adj_y && !trans_y) { + auto y = y_mat.middleCols(col, ncols); + z.noalias() = x.adjoint() * y; + } else if (adj_y) { + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x.adjoint() * y.adjoint(); + } else { // trans_y == true + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x.adjoint() * y.transpose(); + } + } else { // trans_x == true + auto x = x_mat.middleCols(row, nrows); + if (!adj_y && !trans_y) { + auto y = y_mat.middleCols(col, ncols); + z.noalias() = x.transpose() * y; + } else if (adj_y) { + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x.transpose() * y.adjoint(); + } else { // trans_y == true + auto y = y_mat.middleRows(col, ncols); + z.noalias() = x.transpose() * y.transpose(); + } + } + }; + + // Split the work across n threads, unless the total amount of work + // is small (e.g. 128 * 128) - in which case use fewer threads. This is + // the same heuristic value used in LaunchBatchMatMul below. + const int64_t kMaxCostOuterParallelism = 128 * 128; + Index work_limit = std::max((m * k * n) / pool->NumThreads(), + kMaxCostOuterParallelism); + // Blocks should have a size no smaller than 8 * kPacketSize, except perhaps + // for tail blocks. + constexpr int kPacketSize = Eigen::internal::packet_traits::size; + constexpr Index kBlockMin = 8 * kPacketSize; + + // Precompute how many blocks there will be. + auto compute_blocks = YCombinator([k, work_limit, kBlockMin]( + auto& compute_blocks, Index row, + Index col, Index nrows, + Index ncols) -> Index { + Index work = nrows * k * ncols; + Index blocks = 0; + while (work > work_limit && (nrows > kBlockMin || ncols > kBlockMin)) { + if (nrows > ncols) { + Index half = Eigen::divup(nrows / 2, kBlockMin) * kBlockMin; + blocks += 1 + compute_blocks(row + half, col, nrows - half, ncols); + nrows = half; + } else { + Index half = Eigen::divup(ncols / 2, kBlockMin) * kBlockMin; + blocks += 1 + compute_blocks(row, col + half, nrows, ncols - half); + ncols = half; + } + work = nrows * k * ncols; + } + return blocks; + }); + Index total_blocks = 1 + compute_blocks(0, 0, m, n); + + // Recursively split work according to the exact same heuristic as above. + Eigen::Barrier barrier(total_blocks); + auto handle_range = YCombinator( + [k, pool, &barrier, work_limit, kBlockMin, &compute_matmul_block]( + auto& handle_range, Index row, Index col, Index nrows, + Index ncols) -> void { + Index work = nrows * k * ncols; + while (work > work_limit && + (nrows > kBlockMin || ncols > kBlockMin)) { + if (nrows > ncols) { + Index half = Eigen::divup(nrows / 2, kBlockMin) * kBlockMin; + pool->Schedule([&handle_range, row, half, col, nrows, ncols]() { + handle_range(row + half, col, nrows - half, ncols); + }); + nrows = half; + } else { + Index half = Eigen::divup(ncols / 2, kBlockMin) * kBlockMin; + pool->Schedule([&handle_range, row, half, col, nrows, ncols]() { + handle_range(row, col + half, nrows, ncols - half); + }); + ncols = half; + } + work = nrows * k * ncols; + } + + if (nrows > 0 && ncols > 0) { + // Compute the output block. 
+ compute_matmul_block(row, col, nrows, ncols); + } + barrier.Notify(); + }); + handle_range(0, 0, m, n); + barrier.Wait(); + } +}; + +} // namespace + +template +struct LaunchBatchMatMul; + +template +struct LaunchBatchMatMul { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, + bool trans_y, bool grad_x, bool grad_y, + const MatMulBCast& bcast, Tensor* out) { + typedef ParallelMatMulKernel::IsComplex> + ParallelMatMulKernel; + bool conjugate_result = false; + + // Number of matrix multiplies i.e. size of the batch. + const int64_t batch_size = bcast.output_batch_size(); + const int64_t cost_per_unit = + in_x.dim_size(1) * in_x.dim_size(2) * out->dim_size(2); + const int64_t small_dim = std::min( + std::min(in_x.dim_size(1), in_x.dim_size(2)), out->dim_size(2)); + // NOTE(nikhilsarda): This heuristic is optimal in benchmarks as of + // Jan 21, 2020. + const int64_t kMaxCostOuterParallelism = 128 * 128; // heuristic. + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + // TODO(rmlarsen): Reconsider the heuristics now that we have asynchronous + // evaluation in Eigen Tensor. + if (small_dim > 1 && + (batch_size == 1 || cost_per_unit > kMaxCostOuterParallelism)) { + // Parallelize over inner dims. + // For large matrix products it is counter-productive to parallelize + // over the batch dimension. + ParallelMatMulKernel::Run(context, in_x, in_y, adj_x, adj_y, trans_x, + trans_y, bcast, out, batch_size); + conjugate_result = adj_x; + } else if (batch_size > 1) { + // Parallelize over outer dims. For small matrices and large batches, it + // is counter-productive to parallelize the inner matrix multiplies. + Shard(worker_threads.num_threads, worker_threads.workers, batch_size, + cost_per_unit, + [&in_x, &in_y, adj_x, adj_y, trans_x, trans_y, &bcast, out]( + int start, int limit) { + SequentialMatMulKernel::Run(in_x, in_y, adj_x, adj_y, + trans_x, trans_y, bcast, out, + start, limit); + }); + } else if (cost_per_unit > kMaxCostOuterParallelism) { + // Split along output blocks. + SingleBatchParallelMatMulKernel::Run(context->eigen_cpu_device(), + in_x, in_y, adj_x, adj_y, + trans_x, trans_y, out); + } else { + // Single small multiplication. + SequentialMatMulKernel::Run(in_x, in_y, adj_x, adj_y, trans_x, + trans_y, bcast, out, 0, batch_size); + } + + if (conjugate_result) { + // We used one of the identities + // conj(a) * conj(b) = conj(a * b) + // conj(a) * b = conj(a * conj(b)) + // above, we need to conjugate the final output. This is a + // no-op for non-complex types. + ParallelMatMulKernel::Conjugate(context, out); + } + } +}; + +#if GOOGLE_CUDA || TF_HIPBLASLT + +namespace { +// A dummy type to group matmul autotune results together. 
+struct BlasLtMatmulAutoTuneGroup { + static string name() { return "MatmulLt"; } +}; + +typedef AutotuneSingleton> + AutoTuneBatchMatmul; + +} // namespace + +#endif // GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +class BlasScratchAllocator : public se::ScratchAllocator { + public: + using Stream = se::Stream; + using DeviceMemoryBytes = se::DeviceMemory; + + BlasScratchAllocator(OpKernelContext* context) + : memory_limit_(0), total_byte_size_(0), context_(context) {} + + BlasScratchAllocator(OpKernelContext* context, int64_t memory_limit) + : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} + + int64_t GetMemoryLimitInBytes() override { return memory_limit_; } + + tsl::StatusOr AllocateBytes(int64_t byte_size) override { + Tensor temporary_memory; + + if (memory_limit_ > 0 && byte_size > memory_limit_) { + return tsl::Status{ + absl::StatusCode::kUnavailable, + absl::StrCat("Requested memory size (", byte_size, + ") exceeds the memory limit (", memory_limit_, ").")}; + } + AllocationAttributes allocation_attr; + allocation_attr.retry_on_failure = false; + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return tsl::Status{ + absl::StatusCode::kUnavailable, + absl::StrCat("Failed to allocate requested memory of (", byte_size, + ").")}; + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + total_byte_size_ += byte_size; + return tsl::StatusOr(DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + int64 TotalByteSize() { return total_byte_size_; } + + private: + int64_t memory_limit_; + int64_t total_byte_size_; + OpKernelContext* context_; + std::vector allocated_tensors_; +}; + +template +struct LaunchBatchMatMul { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, + bool trans_y, bool grad_x, bool grad_y, + const MatMulBCast& bcast, Tensor* out) { + se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, + se::blas::Transpose::kTranspose, + se::blas::Transpose::kConjugateTranspose}; + const uint64 m = in_x.dim_size(adj_x || trans_x ? 2 : 1); + const uint64 k = in_x.dim_size(adj_x || trans_x ? 1 : 2); + const uint64 n = in_y.dim_size(adj_y || trans_y ? 1 : 2); + const int64_t batch_size = bcast.output_batch_size(); + auto blas_transpose_a = trans[adj_x ? 2 : (trans_x ? 1 : 0)]; + auto blas_transpose_b = trans[adj_y ? 2 : (trans_y ? 
1 : 0)]; + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + typedef se::DeviceMemory DeviceMemoryType; + std::vector a_device_memory; + std::vector b_device_memory; + std::vector c_device_memory; + std::vector a_ptrs; + std::vector b_ptrs; + std::vector c_ptrs; + a_device_memory.reserve(bcast.x_batch_size()); + b_device_memory.reserve(bcast.y_batch_size()); + c_device_memory.reserve(batch_size); + a_ptrs.reserve(batch_size); + b_ptrs.reserve(batch_size); + c_ptrs.reserve(batch_size); + auto* a_base_ptr = in_x.template flat().data(); + auto* b_base_ptr = in_y.template flat().data(); + auto* c_base_ptr = out->template flat().data(); + uint64 a_stride; + uint64 b_stride; + uint64 c_stride; + + bool is_full_broadcast = + std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1; + + // Use float as coefficient type for half and bfloat16 precision inputs, + // otherwise use the input type. + constexpr bool is_16bit_input = std::is_same_v || + std::is_same_v; + using Coefficient = std::conditional_t; + + se::blas::CallContext call_context = se::blas::CallContext::kNone; + OP_REQUIRES(context, grad_x == false || grad_y == false, + errors::InvalidArgument( + "At least 1 of grad_x and grad_y shall be false")); + if (grad_x) { + call_context = se::blas::CallContext::kBackpropInput1; + } + if (grad_y) { + call_context = se::blas::CallContext::kBackpropInput2; + } +#if GOOGLE_CUDA || TF_HIPBLASLT + static const bool use_autotune = MatmulAutotuneEnable(); + bool bCublasLtSupport = true; + + const auto& cc = + stream->parent()->GetDeviceDescription().gpu_compute_capability(); + if (auto* procm = std::get_if(&cc)) { + bCublasLtSupport = procm->gfx9_mi200_or_later(); + } + + if (EnableCublasLtGemm() && bCublasLtSupport) { + static const int64_t max_scratch_size = + GetWorkspaceLimit(1LL << 32); // 4GB by default + + bool requires_mixed_broadcasting = + bcast.IsBroadcastingRequired() && !is_full_broadcast; + + if (!requires_mixed_broadcasting) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + + BlasLtMatmulPlanParams matmul_params{ + se::blas::ToDataType::value, + static_cast(m), + static_cast(n), + static_cast(k), + blas_transpose_a, + blas_transpose_b, + static_cast(batch_size), + /*broadcast_a=*/bcast.x_batch_size() == 1, + /*broadcast_b=*/bcast.y_batch_size() == 1}; + + std::optional max_algorithm_count; + if (!use_autotune) max_algorithm_count = 1; + absl::Mutex* pmu = nullptr; + auto plan_and_algorithms_or = PlanAndAlgorithms::GetOrCreate( + stream, matmul_params, &pmu, max_algorithm_count); + OP_REQUIRES_OK(context, plan_and_algorithms_or.status()); + absl::MutexLock lock(pmu); + const auto* plan_and_algorithms = + std::move(plan_and_algorithms_or).value(); + auto n_algorithms = plan_and_algorithms->algorithms.size(); + + se::blas::AlgorithmConfig algorithm_config(se::blas::kNoAlgorithm); + if (!use_autotune) { + algorithm_config.set_algorithm(0); + } else if (!AutoTuneBatchMatmul::GetInstance()->Find( + matmul_params, &algorithm_config)) { + VLOG(4) << "Autotuning BlasLtMatmul over " << n_algorithms + << " algorithms."; + se::blas::ProfileResult best_result; + se::blas::ProfileResult profile_result; + + for (size_t i = 0; i != n_algorithms; ++i) { + // 
Create a new scratch allocator with every autotuning run so that + // scratch space is deallocated between runs. + BlasScratchAllocator scratch_allocator(context, max_scratch_size); + Status cublas_launch_status = plan_and_algorithms->ExecuteOnStream( + stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i, + scratch_allocator, se::DeviceMemoryBase{}, &profile_result); + + VLOG(4) << " Autotune algorithm " << i + << " result: " << profile_result.elapsed_time_in_ms() + << " ms, valid=" << profile_result.is_valid() + << ", workspace_size=" + << plan_and_algorithms->algorithms[i].workspace_size; + + if (cublas_launch_status.ok() && profile_result.is_valid() && + profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + // Use index into algorithms array, instead of cublas internal ID. + best_result.set_algorithm(i); + } + } + + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + // Each matmul parameter set gets one pass of + // autotune. If no algorithms works, kNoAlgorithm is added to the + // autotune map. + AutoTuneBatchMatmul::GetInstance()->Insert(matmul_params, + algorithm_config); + } + se::blas::AlgorithmType algorithm_idx = algorithm_config.algorithm(); + OP_REQUIRES(context, 0 <= algorithm_idx && algorithm_idx < n_algorithms, + errors::Internal("Missing/invalid BatchMatmul algorithm")); + BlasScratchAllocator scratch_allocator(context, max_scratch_size); + VLOG(4) << "Calling BlasLtMatMul: a.shape=(" << bcast.x_batch_size() + << ", " << in_x.dim_size(1) << ", " << in_x.dim_size(2) + << "), b.shape=(" << bcast.y_batch_size() << ", " + << in_y.dim_size(1) << ", " << in_y.dim_size(2) << "), m=" << m + << ", n=" << n << ", k=" << k << ", batch_size=" << batch_size + << "trans_x = " << trans_x << "trans_y = " << trans_y + << "adj_x = " << adj_x << "adj_y = " << adj_y; + + OP_REQUIRES_OK(context, plan_and_algorithms->ExecuteOnStream( + stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], + algorithm_idx, scratch_allocator)); + } else { // requires mixed broadcasting + const std::vector& a_batch_indices = bcast.x_batch_indices(); + const std::vector& b_batch_indices = bcast.y_batch_indices(); + for (int64_t i = 0; i < bcast.x_batch_size(); ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + } + for (int64_t i = 0; i < bcast.y_batch_size(); ++i) { + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + } + for (int64_t i = 0; i < batch_size; ++i) { + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory[a_batch_indices[i]]); + b_ptrs.push_back(&b_device_memory[b_batch_indices[i]]); + c_ptrs.push_back(&c_device_memory.back()); + } + + BlasScratchAllocator scratch_allocator(context, max_scratch_size); + auto blas = stream->parent()->AsBlas(); + OP_REQUIRES(context, blas != nullptr, + absl::InternalError("No blas support for stream")); + bool blas_launch_status = blas->DoBlasGemmBatched( + stream, blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, adj_y || trans_y ? k : n, + a_ptrs, adj_x || trans_x ? 
m : k, static_cast(0.0), + c_ptrs, n, batch_size, GetNumericOptions(), &scratch_allocator, + call_context); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed: a.shape=", + in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size)); + } + } + } else { +#endif // GOOGLE_CUDA + bool use_strided_batched = + (!bcast.IsBroadcastingRequired() || is_full_broadcast) && + batch_size > 1; + if (use_strided_batched) { + a_stride = bcast.x_batch_size() != 1 ? m * k : 0; + b_stride = bcast.y_batch_size() != 1 ? k * n : 0; + c_stride = m * n; + a_device_memory.push_back(AsDeviceMemory(a_base_ptr)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + } else if (!bcast.IsBroadcastingRequired()) { + for (int64_t i = 0; i < batch_size; ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + } + } else { + const std::vector& a_batch_indices = bcast.x_batch_indices(); + const std::vector& b_batch_indices = bcast.y_batch_indices(); + for (int64_t i = 0; i < bcast.x_batch_size(); ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + } + for (int64_t i = 0; i < bcast.y_batch_size(); ++i) { + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + } + for (int64_t i = 0; i < batch_size; ++i) { + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory[a_batch_indices[i]]); + b_ptrs.push_back(&b_device_memory[b_batch_indices[i]]); + c_ptrs.push_back(&c_device_memory.back()); + } + } + + // Blas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A', where ' stands for transpose (not adjoint). + // TODO(yangzihao): Choose the best of the three strategies using + // autotune. + auto blas = stream->parent()->AsBlas(); + OP_REQUIRES(context, blas != nullptr, + absl::InternalError("No blas support for stream")); + if (batch_size == 1) { + // This is a regular matrix*matrix or matrix*vector multiply. Avoid the + // overhead of the scratch allocator and the batch interface. + // TODO(benbarsdell): Use fp16 Gemv if it becomes supported by CUBLAS + if constexpr (!std::is_same_v && + !std::is_same_v) { + if (n == 1 && + blas_transpose_b != se::blas::Transpose::kConjugateTranspose && + blas_transpose_a != se::blas::Transpose::kConjugateTranspose) { + // This is a matrix*vector multiply so use GEMV to compute A * b. + // Here we are multiplying in the natural order, so we have to flip + // the transposition flag to compensate for the tensor being stored + // row-major. Since GEMV doesn't provide a way to just conjugate an + // argument, we have to defer those cases to GEMM below. + auto gemv_trans_a = + blas_transpose_a == se::blas::Transpose::kTranspose + ? se::blas::Transpose::kNoTranspose + : se::blas::Transpose::kTranspose; + bool blas_launch_status = blas->DoBlasGemv( + stream, gemv_trans_a, adj_x || trans_x ? 
m : k, + adj_x || trans_x ? k : m, static_cast(1.0), + *(a_ptrs[0]), adj_x || trans_x ? m : k, *(b_ptrs[0]), 1, + static_cast(0.0), c_ptrs[0], 1); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMV launch failed : a.shape=", + in_x.shape().DebugString(), ", b.shape=", + in_y.shape().DebugString(), ", m=", m, ", n=", n, ", k=", k)); + } + return; + } + } + + OP_REQUIRES_OK( + context, + blas->BlasGemm(stream, blas_transpose_b, blas_transpose_a, n, m, k, + *(b_ptrs[0]), adj_y || trans_y ? k : n, *(a_ptrs[0]), + adj_x || trans_x ? m : k, c_ptrs[0], n, + GetNumericOptions(), call_context)); + } else if (use_strided_batched) { + OP_REQUIRES_OK( + context, blas->BlasGemmStridedBatched( + stream, blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), *b_ptrs[0], + adj_y || trans_y ? k : n, b_stride, *a_ptrs[0], + adj_x || trans_x ? m : k, a_stride, + static_cast(0.0), c_ptrs[0], n, c_stride, + batch_size, GetNumericOptions(), call_context)); + } else { + BlasScratchAllocator scratch_allocator(context); + bool blas_launch_status = blas->DoBlasGemmBatched( + stream, blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, adj_y || trans_y ? k : n, + a_ptrs, adj_x || trans_x ? m : k, static_cast(0.0), + c_ptrs, n, batch_size, GetNumericOptions(), &scratch_allocator, + call_context); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed : a.shape=", + in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size)); + } + } +#if GOOGLE_CUDA || TF_HIPBLASLT + } +#endif // GOOGLE_CUDA + } +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +inline void FastConvertToFloat(const T* src, float* dst, int64_t size) { + Eigen::Map> src_eigen(src, size); + Eigen::Map dst_eigen(dst, size); + dst_eigen = src_eigen.template cast(); +} + +template +inline void FastConvertFromFloat(const float* src, T* dst, int64_t size) { + Eigen::Map src_eigen(src, size); + Eigen::Map> dst_eigen(dst, size); + dst_eigen = src_eigen.template cast(); +} + +template <> +inline void FastConvertToFloat(const bfloat16* src, float* dst, + int64_t size) { + BFloat16ToFloat(src, dst, size); +} + +template <> +inline void FastConvertFromFloat(const float* src, bfloat16* dst, + int64_t size) { + FloatToBFloat16(src, dst, size); +} + +template +class BaseBatchMatMulOp : public OpKernel { + public: + explicit BaseBatchMatMulOp(OpKernelConstruction* context, + bool is_legacy_matmul) + : OpKernel(context) { + if (is_legacy_matmul) { + // The old MatMul kernel has "transpose_a/transpose_b" attributes. 
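// Illustrative sketch (not part of this patch): why the LaunchBatchMatMul code
// above calls the column-major BLAS routines with the operands swapped
// ("C' = B' x A'"). A column-major GEMM that computes B^T x A^T into a
// column-major buffer produces exactly the bytes of a row-major A x B, so no
// explicit transpose or copy is needed. Naive loops, plain C++, no BLAS
// dependency; small dense matrices chosen only for clarity.
#include <cassert>
#include <cstdio>
#include <vector>

// Column-major GEMM: C(m x n) = A(m x k) * B(k x n), leading dims = row counts.
static void GemmColMajor(int m, int n, int k, const std::vector<float>& A,
                         const std::vector<float>& B, std::vector<float>* C) {
  C->assign(static_cast<size_t>(m) * n, 0.0f);
  for (int j = 0; j < n; ++j)
    for (int p = 0; p < k; ++p)
      for (int i = 0; i < m; ++i)
        (*C)[i + j * m] += A[i + p * m] * B[p + j * k];
}

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
  std::vector<float> a = {1, 2, 3, 4, 5, 6};     // A[i][j] = a[i*3 + j]
  std::vector<float> b = {7, 8, 9, 10, 11, 12};  // B[i][j] = b[i*2 + j]
  // Reinterpreted as column-major, the same buffers already hold A^T (3x2)
  // and B^T (2x3). Compute column-major C' = B^T * A^T (2x2):
  std::vector<float> c;
  GemmColMajor(/*m=*/2, /*n=*/2, /*k=*/3, /*A=*/b, /*B=*/a, &c);
  // Reading c as row-major gives C = A * B directly.
  assert(c[0 * 2 + 0] == 58 && c[0 * 2 + 1] == 64);
  assert(c[1 * 2 + 0] == 139 && c[1 * 2 + 1] == 154);
  std::printf("C = [[%g, %g], [%g, %g]]\n", c[0], c[1], c[2], c[3]);
  return 0;
}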
+ OP_REQUIRES_OK(context, context->GetAttr("transpose_a", &trans_x_)); + OP_REQUIRES_OK(context, context->GetAttr("transpose_b", &trans_y_)); + adj_x_ = false; + adj_y_ = false; + OP_REQUIRES_OK(context, context->GetAttr("grad_a", &grad_input_1_)); + OP_REQUIRES_OK(context, context->GetAttr("grad_b", &grad_input_2_)); + } else { + OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_)); + OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_)); + trans_x_ = false; + trans_y_ = false; + OP_REQUIRES_OK(context, context->GetAttr("grad_x", &grad_input_1_)); + OP_REQUIRES_OK(context, context->GetAttr("grad_y", &grad_input_2_)); + } + } + + ~BaseBatchMatMulOp() override {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + + const absl::Status s = ValidateInputTensors(ctx, in0, in1); + if (!s.ok()) { + ctx->SetStatus(s); + return; + } + + MatMulBCast bcast(in0.shape().dim_sizes(), in1.shape().dim_sizes()); + OP_REQUIRES( + ctx, bcast.IsValid(), + errors::InvalidArgument( + "In[0] and In[1] must have compatible batch dimensions: ", + in0.shape().DebugString(), " vs. ", in1.shape().DebugString())); + + TensorShape out_shape = bcast.output_batch_shape(); + auto batch_size = bcast.output_batch_size(); + auto d0 = in0.dim_size(in0.dims() - 2); + auto d1 = in0.dim_size(in0.dims() - 1); + Tensor in0_reshaped; + OP_REQUIRES( + ctx, + in0_reshaped.CopyFrom(in0, TensorShape({bcast.x_batch_size(), d0, d1})), + errors::Internal("Failed to reshape In[0] from ", + in0.shape().DebugString())); + auto d2 = in1.dim_size(in1.dims() - 2); + auto d3 = in1.dim_size(in1.dims() - 1); + Tensor in1_reshaped; + OP_REQUIRES( + ctx, + in1_reshaped.CopyFrom(in1, TensorShape({bcast.y_batch_size(), d2, d3})), + errors::Internal("Failed to reshape In[1] from ", + in1.shape().DebugString())); + if (adj_x_ || trans_x_) std::swap(d0, d1); + if (adj_y_ || trans_y_) std::swap(d2, d3); + OP_REQUIRES( + ctx, d1 == d2, + errors::InvalidArgument( + "Matrix size-incompatible: In[0]: ", in0.shape().DebugString(), + ", In[1]: ", in1.shape().DebugString())); + OP_REQUIRES_OK(ctx, out_shape.AddDimWithStatus(d0)); + OP_REQUIRES_OK(ctx, out_shape.AddDimWithStatus(d3)); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + if (out->NumElements() == 0) { + return; + } + if (in0.NumElements() == 0 || in1.NumElements() == 0) { + functor::SetZeroFunctor f; + f(ctx->eigen_device(), out->flat()); + return; + } + Tensor out_reshaped; + OP_REQUIRES(ctx, + out_reshaped.CopyFrom(*out, TensorShape({batch_size, d0, d3})), + errors::Internal("Failed to reshape output from ", + out->shape().DebugString())); + + // b/307285203: There seems to be an overly aggressive compiler optimization + // that optimizes away these data pointers unless we explicitly check them. 
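// Illustrative sketch (not part of this patch): the shape bookkeeping that
// Compute() performs above, shown standalone. Each operand is viewed as
// [batch, rows, cols]; an adjoint/transpose flag swaps the last two dims, the
// contracted dimensions must agree, and the output is [batch, d0, d3].
// Batch broadcasting (handled by MatMulBCast in the real op) is omitted, and
// error reporting is reduced to a bool; the helper name is hypothetical.
#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

struct MatmulDims {
  int64_t batch, rows, cols;
};

// Returns false if the contracted dimensions are incompatible.
static bool BatchMatmulOutputShape(std::array<int64_t, 3> x,  // [b, d0, d1]
                                   std::array<int64_t, 3> y,  // [b, d2, d3]
                                   bool adj_or_trans_x, bool adj_or_trans_y,
                                   MatmulDims* out) {
  if (adj_or_trans_x) std::swap(x[1], x[2]);
  if (adj_or_trans_y) std::swap(y[1], y[2]);
  if (x[2] != y[1]) return false;  // inner dimensions must match
  *out = {x[0], x[1], y[2]};       // [batch, d0, d3]
  return true;
}

int main() {
  MatmulDims out;
  // x stored as [8, 32, 64] with trans_x, y as [8, 32, 16]  ->  [8, 64, 16].
  if (BatchMatmulOutputShape({8, 32, 64}, {8, 32, 16},
                             /*adj_or_trans_x=*/true, /*adj_or_trans_y=*/false,
                             &out)) {
    std::printf("out = [%lld, %lld, %lld]\n",
                static_cast<long long>(out.batch),
                static_cast<long long>(out.rows),
                static_cast<long long>(out.cols));
  }
  return 0;
}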
+ OP_REQUIRES(ctx, + in0_reshaped.data() != nullptr && + in1_reshaped.data() != nullptr && + out_reshaped.data() != nullptr, + absl::InternalError("Null data pointer encountered.")); + if constexpr (std::is_same_v && std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + Tensor in0_reshaped_float, in1_reshaped_float, out_reshaped_float; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, in0_reshaped.shape(), + &in0_reshaped_float)); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, in1_reshaped.shape(), + &in1_reshaped_float)); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, out_reshaped.shape(), + &out_reshaped_float)); + + // TODO: Avoid extra copy to make (b)float16 matmul efficient on CPU. + FastConvertToFloat(in0_reshaped.flat().data(), + in0_reshaped_float.flat().data(), + in0_reshaped.NumElements()); + FastConvertToFloat(in1_reshaped.flat().data(), + in1_reshaped_float.flat().data(), + in1_reshaped.NumElements()); + + LaunchBatchMatMul::Launch( + ctx, in0_reshaped_float, in1_reshaped_float, adj_x_, adj_y_, trans_x_, + trans_y_, grad_input_1_, grad_input_2_, bcast, &out_reshaped_float); + FastConvertFromFloat(out_reshaped_float.flat().data(), + out_reshaped.flat().data(), + out->NumElements()); + } else { + // Cast tensor to desired type to reuse Eigen. + // TODO(b/178749687): remove this cast if Eigen supports this natively. + if constexpr (!std::is_same::value) { + in0_reshaped = CastTensor(in0_reshaped); + } + if constexpr (!std::is_same::value) { + in1_reshaped = CastTensor(in1_reshaped); + } + LaunchBatchMatMul::Launch( + ctx, in0_reshaped, in1_reshaped, adj_x_, adj_y_, trans_x_, trans_y_, + grad_input_1_, grad_input_2_, bcast, &out_reshaped); + } + } + + protected: + virtual absl::Status ValidateInputTensors(OpKernelContext* ctx, + const Tensor& in0, + const Tensor& in1) = 0; + + private: + // TODO(171979567) Make the ops take both adj and transpose attributes. + bool adj_x_ = false; + bool adj_y_ = false; + bool trans_x_ = false; + bool trans_y_ = false; + bool grad_input_1_ = false; + bool grad_input_2_ = false; + + // Cast `t` from `SrcT` to `DstT`. + template + Tensor CastTensor(const Tensor& t) { + Tensor res = Tensor(DataTypeToEnum::v(), t.shape()); + res.flat() = t.flat().template cast(); + return res; + } +}; + +// BatchMatMul Op implementation which disallows broadcasting. +template +class BatchMatMulOp : public BaseBatchMatMulOp { + public: + explicit BatchMatMulOp(OpKernelConstruction* context) + : BaseBatchMatMulOp(context, is_legacy_matmul) {} + + ~BatchMatMulOp() override {} + + private: + absl::Status ValidateInputTensors(OpKernelContext* ctx, const Tensor& in0, + const Tensor& in1) override { + // Disallow broadcasting support. Ensure that all batch dimensions of the + // input tensors match. + if (in0.dims() != in1.dims()) { + return errors::InvalidArgument( + "In[0] and In[1] has different ndims: ", in0.shape().DebugString(), + " vs. 
", in1.shape().DebugString()); + } + const int ndims = in0.dims(); + if (is_legacy_matmul) { + if (ndims != 2) { + return errors::InvalidArgument("In[0] and In[1] ndims must be == 2: ", + ndims); + } + } else { + if (ndims < 2) { + return errors::InvalidArgument("In[0] and In[1] ndims must be >= 2: ", + ndims); + } + for (int i = 0; i < ndims - 2; ++i) { + if (in0.dim_size(i) != in1.dim_size(i)) { + return errors::InvalidArgument( + "In[0].dim(", i, ") and In[1].dim(", i, + ") must be the same: ", in0.shape().DebugString(), " vs ", + in1.shape().DebugString()); + } + } + } + return absl::OkStatus(); + } +}; + +// BatchMatMul Op implementation with broadcasting support. +template +class BatchMatMulV2Op : public BaseBatchMatMulOp { + public: + explicit BatchMatMulV2Op(OpKernelConstruction* context) + : BaseBatchMatMulOp(context, + /* is_legacy_matmul= */ false) { + } + + ~BatchMatMulV2Op() override {} + + private: + absl::Status ValidateInputTensors(OpKernelContext* ctx, const Tensor& in0, + const Tensor& in1) override { + // Enable broadcasting support. Validity of broadcasting is checked in + // BaseBatchMatMulOp. + if (in0.dims() < 2) { + return errors::InvalidArgument("In[0] ndims must be >= 2: ", in0.dims()); + } + if (in1.dims() < 2) { + return errors::InvalidArgument("In[1] ndims must be >= 2: ", in1.dims()); + } + return absl::OkStatus(); + } +}; + +// Register for MatMul, BatchMatMul, BatchMatMulv2 where Tin = Tout. +#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ + BatchMatMulOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), \ + BatchMatMulV2Op); \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ + BatchMatMulOp) + +#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint("T"), \ + BatchMatMulOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMulV2").Device(DEVICE_GPU).TypeConstraint("T"), \ + BatchMatMulV2Op); \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_GPU).TypeConstraint("T"), \ + BatchMatMulOp) + +// Register for BatchMatMulv3 where Ta, Tb and Tout are not the same. +#define REGISTER_BATCH_MATMUL_TOUT_CPU(Ta, Tb, Tout) \ + REGISTER_KERNEL_BUILDER(Name("BatchMatMulV3") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Ta") \ + .TypeConstraint("Tb") \ + .TypeConstraint("Tout"), \ + BatchMatMulV2Op) + +#define REGISTER_BATCH_MATMUL_TOUT_GPU(Ta, Tb, Tout) \ + REGISTER_KERNEL_BUILDER(Name("BatchMatMulV3") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Ta") \ + .TypeConstraint("Tb") \ + .TypeConstraint("Tout"), \ + BatchMatMulV2Op) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MATMUL_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_util.h new file mode 100644 index 00000000..0b73b881 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/matmul_util.h @@ -0,0 +1,88 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_MATMUL_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_MATMUL_UTIL_H_ + +#include +#include + +#if TENSORFLOW_USE_ROCM +#include "rocm/rocm_config.h" +#endif + +#if GOOGLE_CUDA || TF_HIPBLASLT + +#include "absl/container/flat_hash_map.h" +#include "xla/stream_executor/device_memory.h" +#include "xla/stream_executor/gpu/gpu_blas_lt.h" +#include "tensorflow/core/framework/types.h" +#include "tsl/platform/types.h" + +namespace tensorflow { + +// Get a workspace limit from the environment variable, which is in MB. +// Return the workspace memory limit in bytes. If no value is set, return the +// default value. +int64_t GetWorkspaceLimit(int64_t default_value_in_bytes); + +struct BlasLtMatmulPlanParams { + std::string ToString() const; + bool operator==(const BlasLtMatmulPlanParams& other) const; + + se::blas::DataType dtype; + size_t m; + size_t n; + size_t k; + se::blas::Transpose trans_a; + se::blas::Transpose trans_b; + size_t batch_count = 1; + bool broadcast_a = false; + bool broadcast_b = false; + se::gpu::BlasLt::Epilogue epilogue = se::gpu::BlasLt::Epilogue::kDefault; +}; + +struct PlanAndAlgorithms { + static StatusOr GetOrCreate( + se::Stream* stream, const BlasLtMatmulPlanParams& params, + absl::Mutex** pmu, std::optional max_algorithm_count = std::nullopt); + + Status ExecuteOnStream( + se::Stream* stream, const se::DeviceMemoryBase& a, + const se::DeviceMemoryBase& b, se::DeviceMemoryBase& c, + size_t algorithm_idx, se::ScratchAllocator& scratch_allocator, + const se::DeviceMemoryBase& bias = se::DeviceMemoryBase{}, + se::blas::ProfileResult* profile_result = nullptr) const; + + se::gpu::BlasLt::MatmulPlanPtr plan; + std::vector algorithms; +}; + +namespace internal { + +inline auto AsTuple(const BlasLtMatmulPlanParams& p) { + return std::make_tuple(p.dtype, p.m, p.n, p.k, p.trans_a, p.trans_b, + p.batch_count, p.broadcast_a, p.broadcast_b, + p.epilogue); +} + +} // namespace internal + +template +H AbslHashValue(H h, const BlasLtMatmulPlanParams& params) { + return H::combine(std::move(h), internal::AsTuple(params)); +} + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TF_HIPBLASLT + +#endif // TENSORFLOW_CORE_KERNELS_MATMUL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op.h new file mode 100644 index 00000000..7c1d91d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op.h @@ -0,0 +1,55 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_ +// Functor definition for MaxPoolingOp, must be compilable by nvcc. + +#include "xla/tsl/framework/fixedpoint/FixedPoint.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/kernels/eigen_pooling.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +template +struct SpatialMaxPooling { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, int window_rows, + int window_cols, int row_stride, int col_stride, + const Eigen::PaddingType& padding) { + // Because we swap the layout, we swap the row/cols as well + output.swap_layout().device(d) = + Eigen::SpatialMaxPooling(input.swap_layout(), window_cols, window_rows, + col_stride, row_stride, padding); + } +}; + +template +struct SpatialMaxPooling { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, int window_rows, + int window_cols, int row_stride, int col_stride, + const Eigen::PaddingType& padding) {} +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op_gpu.h new file mode 100644 index 00000000..650a01e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/maxpooling_op_gpu.h @@ -0,0 +1,86 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +namespace functor { +// Run the forward pass of max pooling, optionally writing the argmax indices to +// the mask array, if it is not nullptr. If mask is passed in as nullptr, the +// argmax indices are not written. 
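// Illustrative sketch (not part of this patch): a CPU reference for the
// forward-with-optional-argmax contract declared below. For every output cell
// it takes the max over the (possibly padded) window and, when a mask pointer
// is supplied, records the flat index of that max so a backward pass can
// scatter gradients. Single image, single channel, plain C++; the real GPU
// functor additionally handles batch, channels, NaN propagation and the
// include_batch_in_index option.
#include <cstdint>
#include <limits>
#include <vector>

static void MaxPool2DReference(const std::vector<float>& in, int height,
                               int width, int kernel_h, int kernel_w,
                               int stride_h, int stride_w, int pad_t, int pad_l,
                               int out_h, int out_w, std::vector<float>* out,
                               std::vector<int64_t>* mask /* may be null */) {
  out->assign(static_cast<size_t>(out_h) * out_w,
              -std::numeric_limits<float>::infinity());
  if (mask) mask->assign(out->size(), -1);
  for (int oy = 0; oy < out_h; ++oy) {
    for (int ox = 0; ox < out_w; ++ox) {
      float best = -std::numeric_limits<float>::infinity();
      int64_t best_idx = -1;
      for (int ky = 0; ky < kernel_h; ++ky) {
        for (int kx = 0; kx < kernel_w; ++kx) {
          const int iy = oy * stride_h - pad_t + ky;
          const int ix = ox * stride_w - pad_l + kx;
          // Padded positions are simply skipped.
          if (iy < 0 || iy >= height || ix < 0 || ix >= width) continue;
          const int64_t idx = static_cast<int64_t>(iy) * width + ix;
          if (in[idx] > best) {
            best = in[idx];
            best_idx = idx;
          }
        }
      }
      (*out)[static_cast<size_t>(oy) * out_w + ox] = best;
      if (mask) (*mask)[static_cast<size_t>(oy) * out_w + ox] = best_idx;
    }
  }
}
// Example: a 4x4 input with a 2x2 kernel, stride 2 and no padding yields a
// 2x2 output plus, if requested, the four winning input indices.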
+template +struct MaxPoolForwardWithOptionalArgmax { + bool operator()(const T* bottom_data, const int batch, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + const int pad_t, const int pad_l, T* top_data, int64_t* mask, + const Eigen::GpuDevice& d, bool propagate_nans, + const bool include_batch_in_index); +}; + +struct MaxPoolForwardNoMask_NCHW_VECT_C { + bool operator()(const int32* bottom_data, const int batch, const int height, + const int width, int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + const int pad_t, const int pad_l, int32* top_data, + const Eigen::GpuDevice& d); +}; + +template +struct MaxPoolBackwardWithArgmax { + bool operator()(const int output_size, const int input_size, + const T* top_diff, const int64_t* mask, const int top_offset, + const int bottom_offset, T* bottom_diff, + const Eigen::GpuDevice& d, const bool include_batch_in_index); +}; + +template +struct MaxPoolGradBackwardWithArgmax { + bool operator()(const int output_size, const int input_size, + const T* top_diff, const int64_t* mask, const int top_offset, + const int bottom_offset, T* bottom_diff, + const Eigen::GpuDevice& d, const bool include_batch_in_index); +}; + +template +struct MaxPoolGradBackwardNoMask { + bool operator()(TensorFormat data_format, const T* bottom_data, + const T* output_data, const int batch, + const int pooled_height, const int pooled_width, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, const int pad_l, + const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/meta_support.h b/third_party/tflite-hdrs/tensorflow/core/kernels/meta_support.h new file mode 100644 index 00000000..b1e81b4f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/meta_support.h @@ -0,0 +1,112 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_META_SUPPORT_H_ +#define TENSORFLOW_CORE_KERNELS_META_SUPPORT_H_ + +#include "meta/multi_thread_gemm.h" +#include "meta/multi_thread_transform.h" +#include "meta/quantized_mul_kernels.h" +#include "meta/streams.h" +#include "meta/transform_kernels.h" + +#include "tensorflow/core/framework/numeric_types.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace meta { + +// Gemmlowp/meta is a small library of optimized Arm32/64 kernels for quantized +// matrix multiplication and other quantized computations. 
+ +// Set the maximum number of threads of computation that the internal workers +// pool can use. If num_threads is 0, then use intra_op_parallelism_threads. +void SetNumThreads(int num_threads); + +int GetNumThreads(); + +// Toggle the internal workers pool. If set to false, the computations will +// use the worker pool passed each time in the OpKernelContext. If set to true +// then the OpKernelContext will be ignored, and the internal optimized workers +// pool will be used. +// +// The internal workers pool is disabled by default (false). +void SetUseLocalContext(bool use_local_context); + +bool GetUseLocalContext(); + +// Toggles the codepath. Enabled by default (true) on supported platforms. +void SetEnabled(bool enabled); + +// Returns true if the codepath is supported and is enabled. Use this call +// before calling the compute functions. If the codepath is not supported, and +// any of the compute function is called, the library will log a FATAL error. +bool IsSupportedAndEnabled(); + +// Calculate the quantized matrix multiplication: +// +// for (i, j) in [0, m) x [0, n) do +// c_data[i, j] := +// sum((a_data[i, l] + offset_a) * (b_data[l, j] + offset_b)) : l in [0, k) +// +// If transpose_a is false the lhs operand has row major layout, otherwise +// column major. Similarly transpose_b describes the layout of the rhs operand. +// lda, ldb, and ldc are the strides of the lhs operand, rhs operand and the +// result arrays. +void QuantizedGemm(OpKernelContext* context, bool transpose_a, bool transpose_b, + const quint8* a_data, const quint8* b_data, qint32* c_data, + int m, int n, int k, int offset_a, int offset_b, int lda, + int ldb, int ldc); + +// Take an array of numbers from the range [input_min, input_max] quantized +// uniformly to int32 values, recover their float values, and then quantize +// them back uniformly to the range [output_min, output_max] as uint8. +// Saturate the uint8 values. +void Requantize(OpKernelContext* context, const qint32* input, int count, + float input_min, float input_max, float output_min, + float output_max, quint8* output); + +// Take an array of numbers from the range [range_min, range_max] quantized +// uniformly to uint8 values and recover their float values. +void Dequantize(OpKernelContext* context, const quint8* input, int count, + float range_min, float range_max, float* output); + +// Take an array of float values and quantize them uniformly to the range +// [range_min, range_max] expressed as uint8. Saturate the uint8 values. +void Quantize(OpKernelContext*, const float* input, int count, float range_min, + float range_max, quint8* output); + +// Take two arrays: the inputs and the bias quantized uniformly in the ranges +// [input_min, input_max], and [bias_min, bias_max] accordingly, as uint8 +// values. Recover their float values. Add the values. Quantize them back +// uniformly to the range [output_min, output_max] as int32. Saturate the +// int32 values. +void QuantizedBiasAdd(OpKernelContext* context, const quint8* input, + int input_count, const quint8* bias, int bias_count, + float input_min, float input_max, float bias_min, + float bias_max, float output_min, float output_max, + qint32* output); + +// Take an array of uint8 values and clamp them to the range [clamp_min, +// clamp_max]. 
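// Illustrative sketch (not part of this patch): naive reference loops for the
// quantized contracts documented above, which could serve to sanity-check the
// optimized gemmlowp/meta kernels. Row-major, no transposition, plain C++;
// the real QuantizedGemm also honors transpose flags and lda/ldb/ldc strides,
// and the affine uint8 mapping below (q -> range_min + q * range / 255) is a
// simplification of the scheme the quantized ops use.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// c[i, j] = sum_l (a[i, l] + offset_a) * (b[l, j] + offset_b)
static void QuantizedGemmReference(const std::vector<uint8_t>& a,
                                   const std::vector<uint8_t>& b,
                                   std::vector<int32_t>* c, int m, int n, int k,
                                   int offset_a, int offset_b) {
  c->assign(static_cast<size_t>(m) * n, 0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      int32_t acc = 0;
      for (int l = 0; l < k; ++l)
        acc += (static_cast<int32_t>(a[i * k + l]) + offset_a) *
               (static_cast<int32_t>(b[l * n + j]) + offset_b);
      (*c)[i * n + j] = acc;
    }
}

// Recover the float represented by a uint8 quantized uniformly on
// [range_min, range_max].
static float DequantizeReference(uint8_t q, float range_min, float range_max) {
  const float scale = (range_max - range_min) / 255.0f;
  return range_min + scale * static_cast<float>(q);
}

// Quantize a float uniformly onto [range_min, range_max] as uint8, saturating.
static uint8_t QuantizeReference(float v, float range_min, float range_max) {
  const float scale = (range_max - range_min) / 255.0f;
  const float q = std::round((v - range_min) / scale);
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}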
+void Clamp(OpKernelContext* context, const quint8* input, int input_count, + quint8 clamp_min, quint8 clamp_max, quint8* output); + +} // namespace meta +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_META_SUPPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc.h new file mode 100644 index 00000000..790b5f7b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Basic class for computing MFCCs from spectrogram slices. + +#ifndef TENSORFLOW_CORE_KERNELS_MFCC_H_ +#define TENSORFLOW_CORE_KERNELS_MFCC_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/mfcc_dct.h" +#include "tensorflow/core/kernels/mfcc_mel_filterbank.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +class Mfcc { + public: + Mfcc(); + bool Initialize(int input_length, double input_sample_rate); + + // Input is a single squared-magnitude spectrogram frame. The input spectrum + // is converted to linear magnitude and weighted into bands using a + // triangular mel filterbank, and a discrete cosine transform (DCT) of the + // values is taken. Output is populated with the lowest dct_coefficient_count + // of these values. 
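// Illustrative usage sketch (not part of this patch): configuring and running
// the Mfcc helper on one squared-magnitude spectrogram frame. The vector
// element type is assumed to be double (elided in the excerpt above), the
// frame length must equal the FFT-bin count passed to Initialize(), and the
// limit/channel/coefficient values below are arbitrary example choices.
#include <vector>

#include "tensorflow/core/kernels/mfcc.h"

void ComputeMfccForFrame(const std::vector<double>& squared_magnitudes,
                         double sample_rate_hz,
                         std::vector<double>* coefficients) {
  tensorflow::Mfcc mfcc;
  // All set_* calls must happen before Initialize().
  mfcc.set_lower_frequency_limit(20.0);
  mfcc.set_upper_frequency_limit(4000.0);
  mfcc.set_filterbank_channel_count(40);
  mfcc.set_dct_coefficient_count(13);
  if (!mfcc.Initialize(static_cast<int>(squared_magnitudes.size()),
                       sample_rate_hz)) {
    coefficients->clear();
    return;
  }
  mfcc.Compute(squared_magnitudes, coefficients);
}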
+ void Compute(const std::vector& spectrogram_frame, + std::vector* output) const; + + void set_upper_frequency_limit(double upper_frequency_limit) { + CHECK(!initialized_) << "Set frequency limits before calling Initialize."; + upper_frequency_limit_ = upper_frequency_limit; + } + + void set_lower_frequency_limit(double lower_frequency_limit) { + CHECK(!initialized_) << "Set frequency limits before calling Initialize."; + lower_frequency_limit_ = lower_frequency_limit; + } + + void set_filterbank_channel_count(int filterbank_channel_count) { + CHECK(!initialized_) << "Set channel count before calling Initialize."; + filterbank_channel_count_ = filterbank_channel_count; + } + + void set_dct_coefficient_count(int dct_coefficient_count) { + CHECK(!initialized_) << "Set coefficient count before calling Initialize."; + dct_coefficient_count_ = dct_coefficient_count; + } + + private: + MfccMelFilterbank mel_filterbank_; + MfccDct dct_; + bool initialized_; + double lower_frequency_limit_; + double upper_frequency_limit_; + int filterbank_channel_count_; + int dct_coefficient_count_; + Mfcc(const Mfcc&) = delete; + void operator=(const Mfcc&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MFCC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_dct.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_dct.h new file mode 100644 index 00000000..e7982d6a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_dct.h @@ -0,0 +1,45 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Basic minimal DCT class for MFCC speech processing. + +#ifndef TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_ +#define TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class MfccDct { + public: + MfccDct(); + bool Initialize(int input_length, int coefficient_count); + void Compute(const std::vector& input, + std::vector* output) const; + + private: + bool initialized_; + int coefficient_count_; + int input_length_; + std::vector > cosines_; + MfccDct(const MfccDct&) = delete; + void operator=(const MfccDct&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_mel_filterbank.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_mel_filterbank.h new file mode 100644 index 00000000..293d7745 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mfcc_mel_filterbank.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Basic class for applying a mel-scale mapping to a power spectrum. + +#ifndef TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_ +#define TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_ + +#include +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class MfccMelFilterbank { + public: + MfccMelFilterbank(); + bool Initialize(int input_length, // Number of unique FFT bins fftsize/2+1. + double input_sample_rate, int output_channel_count, + double lower_frequency_limit, double upper_frequency_limit); + + // Takes a squared-magnitude spectrogram slice as input, computes a + // triangular-mel-weighted linear-magnitude filterbank, and places the result + // in output. + void Compute(const std::vector& input, + std::vector* output) const; + + private: + double FreqToMel(double freq) const; + bool initialized_; + int num_channels_; + double sample_rate_; + int input_length_; + std::vector center_frequencies_; // In mel, for each mel channel. + + // Each FFT bin b contributes to two triangular mel channels, with + // proportion weights_[b] going into mel channel band_mapper_[b], and + // proportion (1 - weights_[b]) going into channel band_mapper_[b] + 1. + // Thus, weights_ contains the weighting applied to each FFT bin for the + // upper-half of the triangular band. + std::vector weights_; // Right-side weight for this fft bin. + + // FFT bin i contributes to the upper side of mel channel band_mapper_[i] + std::vector band_mapper_; + int start_index_; // Lowest FFT bin used to calculate mel spectrum. + int end_index_; // Highest FFT bin used to calculate mel spectrum. + + MfccMelFilterbank(const MfccMelFilterbank&) = delete; + void operator=(const MfccMelFilterbank&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_batch_matmul_helper.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_batch_matmul_helper.h new file mode 100644 index 00000000..d7c1da14 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_batch_matmul_helper.h @@ -0,0 +1,104 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_BATCH_MATMUL_HELPER_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_BATCH_MATMUL_HELPER_H_ +#if defined(INTEL_MKL) + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace tensorflow { + +struct MklBatchMatMulHelper { + using dims = dnnl::memory::dims; + // This method makes the rank (ndims) of input same as the output by creating + // new axes to the input. For example, if input shape is [a, b, c, d] and + // output shape is [e, f, g, h, i, j], then the reshaped input would have a + // shape of [1, 1, a, b, c, d]. + void ExpandInputDimsToOutputShape(const TensorShape& input_shape, + const TensorShape& output_shape, + dims* reshaped_dims) { + auto ndims_input = input_shape.dims(); + auto ndims_output = output_shape.dims(); + auto dim_offset = ndims_output - ndims_input; + DCHECK(dim_offset > 0); + reshaped_dims->clear(); + reshaped_dims->resize(ndims_output, 1); + auto input_dims = input_shape.dim_sizes(); + for (int dim_idx = 0; dim_idx < ndims_input; ++dim_idx) + reshaped_dims->at(dim_idx + dim_offset) = input_dims[dim_idx]; + } + + std::unique_ptr CreateMatMulParams( + string& prefix, const TensorShape& lhs_shape, + const TensorShape& rhs_shape, const TensorShape& out_shape, bool& adj_x, + bool& adj_y) { + const auto ndims_lhs = lhs_shape.dims(); + const auto ndims_rhs = rhs_shape.dims(); + const auto ndims_out = out_shape.dims(); + auto lhs_dims = TFShapeToMklDnnDims(lhs_shape); + auto rhs_dims = TFShapeToMklDnnDims(rhs_shape); + auto out_dims = TFShapeToMklDnnDims(out_shape); + + // DNNL matmul_primitive requires ranks of inputs and output to be same. + // Create dnnl::memory::dims for inputs and output of same rank. + // It is assumed here that MatMulBCast object creates output_batch_shape as + // a conforming superset of input batch shapes, i.e., ndims_out >= + // ndims_lhs and ndims_out >= ndims_rhs. 
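// Illustrative sketch (not part of this patch): the rank-expansion rule that
// ExpandInputDimsToOutputShape above applies before the calls just below. An
// input of rank r is aligned to an output of rank R >= r by prepending
// (R - r) unit dimensions, e.g. [a, b, c, d] against a rank-6 output becomes
// [1, 1, a, b, c, d]. Plain vectors, standalone helper name.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> ExpandToRank(const std::vector<int64_t>& dims,
                                         size_t output_rank) {
  assert(dims.size() <= output_rank);
  std::vector<int64_t> expanded(output_rank, 1);
  std::copy(dims.begin(), dims.end(),
            expanded.begin() + (output_rank - dims.size()));
  return expanded;
}
// Example: ExpandToRank({4, 8, 16, 32}, 6) == {1, 1, 4, 8, 16, 32}.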
+ if (ndims_lhs < ndims_out) { + ExpandInputDimsToOutputShape(lhs_shape, out_shape, &lhs_dims); + } + if (ndims_rhs < ndims_out) { + ExpandInputDimsToOutputShape(rhs_shape, out_shape, &rhs_dims); + } + auto lhs_strides = CalculateTFStrides(lhs_dims); + auto rhs_strides = CalculateTFStrides(rhs_dims); + auto out_strides = CalculateTFStrides(out_dims); + + if (adj_x) { + int m_idx = ndims_out - 1; + int k_idx = ndims_out - 2; + memory::dim m = lhs_dims[m_idx]; // number of rows in x + std::swap(lhs_dims[m_idx], lhs_dims[k_idx]); + lhs_strides[m_idx] = m; + lhs_strides[k_idx] = 1; + } + + if (adj_y) { + int k_idx = ndims_out - 1; + int n_idx = ndims_out - 2; + memory::dim k = rhs_dims[k_idx]; // number of columns in x + std::swap(rhs_dims[k_idx], rhs_dims[n_idx]); + rhs_strides[k_idx] = k; + rhs_strides[n_idx] = 1; + } + + return std::make_unique(prefix, lhs_dims, rhs_dims, + out_dims, lhs_strides, rhs_strides, + out_strides); + } +}; + +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_BATCH_MATMUL_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_conv_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_conv_ops.h new file mode 100644 index 00000000..eac82bea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_conv_ops.h @@ -0,0 +1,711 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_CONV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_CONV_OPS_H_ + +#ifdef INTEL_MKL +#include +#include +#include + +#include "dnnl.hpp" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/util/onednn_env_vars.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +using dnnl::convolution_forward; +using dnnl::prop_kind; +using dnnl::stream; + +namespace tensorflow { + +#ifndef ENABLE_ONEDNN_V3 +// Op descriptor is no longer supported in oneDNN v3.x. Instead, primitive +// descriptor will directly accept primitive parameters during creation. 
+using ConvFwdDesc = dnnl::convolution_forward::desc; +#endif // !ENABLE_ONEDNN_V3 +using ConvFwdPd = dnnl::convolution_forward::primitive_desc; + +class MklDnnConvUtil { + protected: + OpKernelContext* context_; // We don't own this. + std::vector strides_; + std::vector dilations_; + Padding padding_; + TensorFormat data_format_; + + public: + MklDnnConvUtil(OpKernelContext* context, const std::vector& strides, + Padding pad, TensorFormat fm, + const std::vector& dilations, bool is_depthwise = false) + : context_(context), + strides_(strides), + dilations_(dilations), + padding_(pad), + data_format_(fm) {} + + virtual ~MklDnnConvUtil() { context_ = nullptr; } + + // Calculate Convolution strides + virtual inline void GetStridesInMklOrder(memory::dims* strides) { + // For now we take the stride from the second and third dimensions only + // (we do not support striding on the batch or depth dimension). + DCHECK(strides); + if (strides_.size() == 4) { + int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + int stride_cols = GetTensorDim(strides_, data_format_, 'W'); + *strides = {stride_rows, stride_cols}; + } else if (strides_.size() == 5) { + int stride_planes = GetTensorDim(strides_, data_format_, '0'); + int stride_rows = GetTensorDim(strides_, data_format_, '1'); + int stride_cols = GetTensorDim(strides_, data_format_, '2'); + *strides = {stride_planes, stride_rows, stride_cols}; + } + } + + // Calculate Convolution dilations + virtual inline void GetDilationsInMklOrder(memory::dims* dilations) { + // For now we take the dilation from the second and third dimensions only + // (we do not support dilation on the batch or depth dimension). + DCHECK(dilations); + if (dilations_.size() == 4) { + int dilations_rows = GetTensorDim(dilations_, data_format_, 'H'); + int dilations_cols = GetTensorDim(dilations_, data_format_, 'W'); + *dilations = {dilations_rows, dilations_cols}; + } else if (dilations_.size() == 5) { + int dilations_planes = GetTensorDim(dilations_, data_format_, '0'); + int dilations_rows = GetTensorDim(dilations_, data_format_, '1'); + int dilations_cols = GetTensorDim(dilations_, data_format_, '2'); + *dilations = {dilations_planes, dilations_rows, dilations_cols}; + } + } + + // Calculate Convolution input size in oneDNN order. oneDNN + // requires input in NCHW/NCDHW format. Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. 
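// Illustrative sketch (not part of this patch): the per-format dimension
// lookup that GetStridesInMklOrder / GetDilationsInMklOrder above rely on.
// For a rank-4 attribute vector laid out as NHWC or NCHW, the 'N', 'H', 'W',
// 'C' characters map to fixed indices; the helpers then keep only the spatial
// ('H', 'W') entries, since striding or dilation on the batch and depth
// dimensions is not supported. Standalone names, plain C++.
#include <cstddef>
#include <stdexcept>
#include <vector>

enum class Format4D { kNHWC, kNCHW };

static size_t DimIndex(Format4D format, char dim) {
  switch (dim) {
    case 'N': return 0;
    case 'C': return format == Format4D::kNHWC ? 3 : 1;
    case 'H': return format == Format4D::kNHWC ? 1 : 2;
    case 'W': return format == Format4D::kNHWC ? 2 : 3;
    default: throw std::invalid_argument("unknown dimension character");
  }
}

static int SpatialStride(const std::vector<int>& strides, Format4D format,
                         char dim) {
  return strides.at(DimIndex(format, dim));
}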
+ virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape, + memory::dims* input_dims) { +#define CHECK_BOUNDS(val, err_msg) \ + do { \ + OP_REQUIRES(context_, \ + FastBoundsCheck(val, std::numeric_limits::max()), \ + errors::InvalidArgument(err_msg)); \ + } while (0) + + DCHECK(input_dims); + + // Input channel + int64 input_depth_raw = GetTensorDim(input_shape, data_format_, 'C'); + int input_depth = static_cast(input_depth_raw); + + // Input batch + int64 input_batch_raw = GetTensorDim(input_shape, data_format_, 'N'); + CHECK_BOUNDS(input_batch_raw, "Input batch too large"); + int input_batch = static_cast(input_batch_raw); + + if (strides_.size() == 4) { // NCHW format for Conv2D + // Input rows/height + int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); + CHECK_BOUNDS(input_rows_raw, "Input rows too large"); + int input_rows = static_cast(input_rows_raw); + + // Input columns/width + int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); + CHECK_BOUNDS(input_cols_raw, "Input cols too large"); + int input_cols = static_cast(input_cols_raw); + + // oneDNN always requires input in NCHW format Conv2D. + std::vector input_sizes(4, -1); + input_sizes[MklDnnDims::Dim_N] = input_batch; + input_sizes[MklDnnDims::Dim_C] = input_depth; + input_sizes[MklDnnDims::Dim_H] = input_rows; + input_sizes[MklDnnDims::Dim_W] = input_cols; + *input_dims = input_sizes; + } else if (strides_.size() == 5) { // NCDHW format for Conv3D + // Input planes/third-dimension + int64 input_planes_raw = GetTensorDim(input_shape, data_format_, '0'); + CHECK_BOUNDS(input_planes_raw, "Input depth too large"); + int input_planes = static_cast(input_planes_raw); + + // Input rows/height + int64 input_rows_raw = GetTensorDim(input_shape, data_format_, '1'); + CHECK_BOUNDS(input_rows_raw, "Input rows too large"); + int input_rows = static_cast(input_rows_raw); + + // Input columns/width + int64 input_cols_raw = GetTensorDim(input_shape, data_format_, '2'); + CHECK_BOUNDS(input_cols_raw, "Input cols too large"); + int input_cols = static_cast(input_cols_raw); + + // oneDNN always requires input in NCDHW format for Conv3D. + std::vector input_sizes(5, -1); + input_sizes[MklDnnDims3D::Dim3d_N] = input_batch; + input_sizes[MklDnnDims3D::Dim3d_C] = input_depth; + input_sizes[MklDnnDims3D::Dim3d_D] = input_planes; + input_sizes[MklDnnDims3D::Dim3d_H] = input_rows; + input_sizes[MklDnnDims3D::Dim3d_W] = input_cols; + *input_dims = input_sizes; + } +#undef CHECK_BOUNDS + } + + // Calculate Convolution filter size in oneDNN order. + // oneDNN requires filter in OIHW (Conv2D) or OIDHW (Conv3D) format. + // Function does not return anything. + // But errors arising from sanity checks are returned in context's + // status. This function differs from GetConvFilterSizeInMklOrder in + // parameter for input - it accepts src_shape since Convolution Backward + // Input gets shape of input tensor rather than actual tensor (Convolution + // forward gets actual tensor as input). + // + // TODO(intel-tf): Add similar function for input and filter in MklShape. + virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape, + const TensorShape& filter_shape, + memory::dims* filter_dims, + bool* is_grouped_convolution, + bool is_depthwise) { + DCHECK(filter_dims); + + OP_REQUIRES(context_, filter_shape.dims() == strides_.size(), + errors::InvalidArgument((strides_.size() == 4) + ? 
"filter must be 4-dimensional: " + : "filter must be 5-dimensional: ", + filter_shape.DebugString())); + + for (int i = 0; i < ((strides_.size() == 4) ? 3 : 5); i++) { + OP_REQUIRES(context_, + FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); + } + + int input_depth = GetTensorDim(input_shape, data_format_, 'C'); + + if (strides_.size() == 4) { // Conv2D + // TF filter is always in (rows, cols, in_depth, out_depth) order. + int filter_rows = + static_cast(filter_shape.dim_size(TF_2DFILTER_DIM_H)); + int filter_cols = + static_cast(filter_shape.dim_size(TF_2DFILTER_DIM_W)); + int filter_in_depth = + static_cast(filter_shape.dim_size(TF_2DFILTER_DIM_I)); + int filter_out_depth = + static_cast(filter_shape.dim_size(TF_2DFILTER_DIM_O)); + OP_REQUIRES(context_, input_depth % filter_in_depth == 0, + errors::InvalidArgument( + "input depth must be evenly divisible by filter depth: ", + input_depth, " vs ", filter_in_depth)); + *is_grouped_convolution = filter_in_depth != input_depth; + int group_count = input_depth / filter_in_depth; + OP_REQUIRES(context_, group_count > 0, + errors::InvalidArgument( + "grouped convolution must have at least one group: ", + group_count, " groups")); + + // oneDNN always needs filter in OIHW format for regular convolutions + // and GOIHW for grouped/depthwise convolutions, + // OIHW = (out_depth, in_depth, rows, cols) + // GOIHW = (group, out_depth, in_depth, rows, cols) + // Specifically for depthwise G=filter_indepth, O=filter_outdepth, I=1 + if (is_depthwise) { + std::vector filter_sizes(5, -1); + filter_sizes[MKL_GROUP_FILTER_DIM_G] = filter_in_depth; + filter_sizes[MKL_GROUP_FILTER_DIM_O] = filter_out_depth; + filter_sizes[MKL_GROUP_FILTER_DIM_I] = 1; + filter_sizes[MKL_GROUP_FILTER_DIM_H] = filter_rows; + filter_sizes[MKL_GROUP_FILTER_DIM_W] = filter_cols; + *filter_dims = filter_sizes; + } else if (*is_grouped_convolution) { + // TODO(intel-tf): Directly set filter_dims. Same for other places. + std::vector filter_sizes(5, -1); + filter_sizes[MKL_GROUP_FILTER_DIM_G] = group_count; + filter_sizes[MKL_GROUP_FILTER_DIM_O] = filter_out_depth / group_count; + filter_sizes[MKL_GROUP_FILTER_DIM_I] = filter_in_depth; + filter_sizes[MKL_GROUP_FILTER_DIM_H] = filter_rows; + filter_sizes[MKL_GROUP_FILTER_DIM_W] = filter_cols; + *filter_dims = filter_sizes; + } else { + std::vector filter_sizes(4, -1); + filter_sizes[MklDnnDims::Dim_O] = filter_out_depth; + filter_sizes[MklDnnDims::Dim_I] = filter_in_depth; + filter_sizes[MklDnnDims::Dim_H] = filter_rows; + filter_sizes[MklDnnDims::Dim_W] = filter_cols; + *filter_dims = filter_sizes; + } + } else { // Conv3D + OP_REQUIRES(context_, input_depth == filter_shape.dim_size(3), + errors::InvalidArgument( + "input and filter must have the same depth: ", + input_depth, " vs ", filter_shape.dim_size(3))); + + // TF filter is always in (planes, rows, cols, in_depth, out_depth) order. + int filter_planes = + static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_P)); + int filter_rows = + static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_H)); + int filter_cols = + static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_W)); + int filter_in_depth = + static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_I)); + int filter_out_depth = + static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_O)); + + // oneDNN always needs filter in OIDHW format. 
+ // OIDHW = (out_depth, in_depth, planes, rows, cols) + std::vector filter_sizes(5, -1); + filter_sizes[MklDnnDims3D::Dim3d_O] = filter_out_depth; + filter_sizes[MklDnnDims3D::Dim3d_I] = filter_in_depth; + filter_sizes[MklDnnDims3D::Dim3d_D] = filter_planes; + filter_sizes[MklDnnDims3D::Dim3d_H] = filter_rows; + filter_sizes[MklDnnDims3D::Dim3d_W] = filter_cols; + *filter_dims = filter_sizes; + } + } + + // Calculate Convolution filter size in oneDNN order. + // oneDNN requires filter in OIHW (Conv2D) or OIDHW(Conv3D format. + // Function does not return anything. But errors arising from sanity + // checks are returned in context's status. + virtual inline void GetFilterSizeInMklOrder(size_t src_index, + size_t filter_index, + memory::dims* filter_dims, + bool* is_grouped_convolution, + bool is_depthwise) { + DCHECK(filter_dims); + GetFilterSizeInMklOrder(GetTfShape(context_, src_index), + GetTfShape(context_, filter_index), filter_dims, + is_grouped_convolution, is_depthwise); + } + + // Calculate Bias size for 2D or 3D Convolution. Function does not + // return anything, but may set an error in context status. + virtual inline void GetBiasSizeInMklOrder(size_t bias_index, + memory::dims* bias_dims) { + const Tensor& bias = MklGetInput(context_, bias_index); + if (bias.dims() > 1) { + if (strides_.size() == 4) { + OP_REQUIRES( + context_, bias.dims() <= 4, + errors::InvalidArgument("For NHWC format, bias should have " + "4 or less dimensions", + bias.shape().DebugString())); + } else if (strides_.size() == 5) { + OP_REQUIRES( + context_, bias.dims() <= 5, + errors::InvalidArgument("For NDHWC format, bias should have " + "5 or less dimensions", + bias.shape().DebugString())); + } + // Make sure all the dims except channel(last) is 1 + for (int i = 0; i < bias.dims() - 1; i++) { + OP_REQUIRES( + context_, bias.dim_size(i) == 1, + errors::InvalidArgument("For bias_dims > 1, all except the last " + "dimension (channel) must be 1: ", + bias.shape().DebugString())); + } + *bias_dims = {static_cast(bias.dim_size(bias.dims() - 1))}; + } else { + *bias_dims = {static_cast(bias.dim_size(0))}; + } + } + + // Function to calculate output and padding size for 2D/3D convolution. + // + // Calculate output shape of Convolution in oneDNN and TensorFlow order. + // oneDNN uses NCHW(Conv2D) or NCDHW(Conv3D) for output order. + // But TensorFlow output will be in NHWC||NCHW(Conv2D) or + // NDHWC||NCDHW(Conv3D) format depending on data format. + // Function also calculates left, right, top and bottom pads. + // Function does not return any status which is set with context status. + // + // TODO(intel-tf): Add similar function for input and filter in MklShape. 
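// Illustrative sketch (not part of this patch): the filter-layout permutation
// performed by GetFilterSizeInMklOrder above for the regular and depthwise
// 2-D cases. TensorFlow stores a Conv2D filter as HWIO =
// (rows, cols, in_depth, out_depth); oneDNN wants OIHW, and depthwise
// convolutions want the grouped form GOIHW = (group, out_depth, in_depth,
// rows, cols) with G = in_depth and I = 1. Standalone helper names.
#include <cstdint>
#include <vector>

// filter_hwio = {rows, cols, in_depth, out_depth}
static std::vector<int64_t> FilterToOIHW(const std::vector<int64_t>& f) {
  return {f[3], f[2], f[0], f[1]};
}

static std::vector<int64_t> DepthwiseFilterToGOIHW(
    const std::vector<int64_t>& f) {
  // G = in_depth, O = channel multiplier (TF "out_depth"), I = 1.
  return {f[2], f[3], 1, f[0], f[1]};
}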
+ virtual inline void GetOutputAndPadSizeInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + const memory::dims& strides, const memory::dims& dilations, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r, bool is_grouped_convolution, + bool pad_enabled = false, bool is_depthwise = false) { + DCHECK(output_dims_tf_order); + DCHECK(output_dims_mkl_order); + DCHECK(pad_l); + DCHECK(pad_r); + + bool is_conv2d = (strides_.size() == 4); + int input_planes, input_rows, input_cols; + if (is_conv2d) { + input_rows = GetTensorDim(input_shape, data_format_, 'H'); + input_cols = GetTensorDim(input_shape, data_format_, 'W'); + } else { + input_planes = GetTensorDim(input_shape, data_format_, '0'); + input_rows = GetTensorDim(input_shape, data_format_, '1'); + input_cols = GetTensorDim(input_shape, data_format_, '2'); + } + + // Filter dimension + // Conv2D: + // First dimension: rows/height. + // Second dimension: cols/width. + // Conv3D: + // First dimension: planes/depth. + // Second dimension: rows/height. + // Third dimension: cols/width. + + int filter_planes, filter_rows, filter_cols; + if (is_conv2d) { + filter_rows = filter_shape.dim_size(TF_2DFILTER_DIM_H); + filter_cols = filter_shape.dim_size(TF_2DFILTER_DIM_W); + } else { + filter_planes = filter_shape.dim_size(TF_3DFILTER_DIM_P); + filter_rows = filter_shape.dim_size(TF_3DFILTER_DIM_H); + filter_cols = filter_shape.dim_size(TF_3DFILTER_DIM_W); + } + + int stride_planes, stride_rows, stride_cols; + int dilation_planes, dilation_rows, dilation_cols; + if (is_conv2d) { + // Conv2D stride is a vector of 2 elements: {s_r, s_c} + stride_rows = strides[0]; + stride_cols = strides[1]; + dilation_rows = dilations[0]; + dilation_cols = dilations[1]; + } else { + // Conv3D stride is a vector of 3 elements: {s_d, s_r, s_c} + stride_planes = strides[0]; + stride_rows = strides[1]; + stride_cols = strides[2]; + dilation_planes = dilations[0]; + dilation_rows = dilations[1]; + dilation_cols = dilations[2]; + } + + // Output batch is same as input batch. + int out_batch = GetTensorDim(input_shape, data_format_, 'N'); + int out_depth; + + // TODO(intel-tf) add support for 3-D Depthwise + + // Output depth is same as last dimension for filters for regular + // convolutions and group convolutions. For depthwise it is in_depth * + // channel_multiplier. The channel_multiplier is the last dimension of + // TF filter for depthwise convolutions. + if (is_depthwise) { + out_depth = (filter_shape.dim_size(TF_2DFILTER_DIM_I) * + filter_shape.dim_size(TF_2DFILTER_DIM_O)); + } else if (is_grouped_convolution) { + out_depth = filter_shape.dim_size(TF_2DFILTER_DIM_O); + } else { + out_depth = filter_shape.dim_size( + is_conv2d ? 
static_cast(TF_2DFILTER_DIM_O) + : static_cast(TF_3DFILTER_DIM_O)); + } + + int64 out_rows = 0, out_cols = 0, out_planes = 0; + int64 pad_top = 0, pad_bottom = 0, pad_left = 0, pad_right = 0; + int64 pad_front, pad_back; + + if (is_conv2d) { + Padding padding_type; + if (pad_enabled) { + padding_type = Padding::EXPLICIT; + pad_top = static_cast((*pad_l)[0]); + pad_left = static_cast((*pad_l)[1]); + pad_bottom = static_cast((*pad_r)[0]); + pad_right = static_cast((*pad_r)[1]); + } else { + padding_type = padding_; + } + OP_REQUIRES_OK(context_, + GetWindowedOutputSizeVerbose( + input_rows, filter_rows, dilation_rows, stride_rows, + padding_type, &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, + GetWindowedOutputSizeVerbose( + input_cols, filter_cols, dilation_cols, stride_cols, + padding_type, &out_cols, &pad_left, &pad_right)); + } else { + Padding padding_type; + if (pad_enabled) { + padding_type = Padding::EXPLICIT; + pad_front = static_cast((*pad_l)[0]); + pad_top = static_cast((*pad_l)[1]); + pad_left = static_cast((*pad_l)[2]); + pad_back = static_cast((*pad_r)[0]); + pad_bottom = static_cast((*pad_r)[1]); + pad_right = static_cast((*pad_r)[2]); + } else { + padding_type = padding_; + } + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_planes, filter_planes, dilation_planes, + stride_planes, padding_type, &out_planes, + &pad_front, &pad_back)); + OP_REQUIRES_OK(context_, + GetWindowedOutputSizeVerbose( + input_rows, filter_rows, dilation_rows, stride_rows, + padding_type, &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, + GetWindowedOutputSizeVerbose( + input_cols, filter_cols, dilation_cols, stride_cols, + padding_type, &out_cols, &pad_left, &pad_right)); + } + + if (is_conv2d) { + // If pad_enabled, i.e., pad and conv op are fused, then + // all pads are already passed from pad op through + // *pad_l and *pad_r and they don't need to be set here. + if (!pad_enabled) { + *pad_l = {static_cast(pad_top), static_cast(pad_left)}; + *pad_r = {static_cast(pad_bottom), static_cast(pad_right)}; + } + } else { + // If pad_enabled, i.e., pad and conv op are fused, then + // all pads are already passed from pad op through + // *pad_l and *pad_r and they don't need to be set here. + if (!pad_enabled) { + *pad_l = {static_cast(pad_front), static_cast(pad_top), + static_cast(pad_left)}; + *pad_r = {static_cast(pad_back), static_cast(pad_bottom), + static_cast(pad_right)}; + } + } + // Tensorflow output is in data_format order. + // Conv2D: NHWC or NCHW + // Conv3D: NDHWC or NCDHW + // oneDNN uses asymmetric padding. 
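// Illustrative sketch (not part of this patch): the arithmetic behind the
// GetWindowedOutputSizeVerbose calls above, for one spatial dimension. SAME
// padding keeps out = ceil(in / stride) and splits the required padding
// between the two sides, with any odd element going to the bottom/right
// (hence the asymmetric padding noted above); VALID uses no padding and the
// caller must ensure the effective kernel fits in the input.
#include <algorithm>
#include <cstdint>

struct WindowedOutput {
  int64_t size, pad_before, pad_after;
};

static WindowedOutput ComputeWindowedOutput(int64_t in, int64_t kernel,
                                            int64_t dilation, int64_t stride,
                                            bool same_padding) {
  const int64_t effective_kernel = (kernel - 1) * dilation + 1;
  WindowedOutput out{0, 0, 0};
  if (same_padding) {
    out.size = (in + stride - 1) / stride;  // ceil(in / stride)
    const int64_t pad_total =
        std::max<int64_t>((out.size - 1) * stride + effective_kernel - in, 0);
    out.pad_before = pad_total / 2;
    out.pad_after = pad_total - out.pad_before;
  } else {  // VALID
    out.size = (in - effective_kernel) / stride + 1;
  }
  return out;
}
// Example: in=7, kernel=3, dilation=1, stride=2, SAME -> size 4, pads {1, 2}.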
+ TensorShape out_shape; + if (is_conv2d) { + OP_REQUIRES_OK( + context_, ShapeFromFormatWithStatus(data_format_, out_batch, out_rows, + out_cols, out_depth, &out_shape)); + } else { + OP_REQUIRES_OK(context_, ShapeFromFormatWithStatus( + data_format_, out_batch, + {{out_planes, out_rows, out_cols}}, + out_depth, &out_shape)); + } + *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); + if (is_grouped_convolution) { + int out_depth = GetTensorDim(out_shape, data_format_, 'C'); + int input_depth = GetTensorDim(input_shape, data_format_, 'C'); + int filter_in_depth = + static_cast(filter_shape.dim_size(TF_2DFILTER_DIM_I)); + int num_groups = input_depth / filter_in_depth; + OP_REQUIRES( + context_, out_depth % num_groups == 0 && out_depth >= num_groups, + errors::InvalidArgument( + "output depth must be evenly divisible by number of groups: ", + out_depth, " vs ", num_groups)); + } + if (is_conv2d) { + // For Conv2D, oneDNN always needs output in NCHW format. + std::vector output_sizes(4, -1); + output_sizes[MklDnnDims::Dim_N] = out_batch; + output_sizes[MklDnnDims::Dim_C] = out_depth; + output_sizes[MklDnnDims::Dim_H] = static_cast(out_rows); + output_sizes[MklDnnDims::Dim_W] = static_cast(out_cols); + *output_dims_mkl_order = output_sizes; + } else { + std::vector output_sizes(5, -1); + output_sizes[MklDnnDims3D::Dim3d_N] = out_batch; + output_sizes[MklDnnDims3D::Dim3d_C] = out_depth; + output_sizes[MklDnnDims3D::Dim3d_D] = static_cast(out_planes); + output_sizes[MklDnnDims3D::Dim3d_H] = static_cast(out_rows); + output_sizes[MklDnnDims3D::Dim3d_W] = static_cast(out_cols); + *output_dims_mkl_order = output_sizes; + } + } + + // Calculate output and pad size of forward Convolution operator. + // See comment on GetConvOutputAndPadSizeInMklOrder for parameters. + // + // Function does not return anything, but sets error in context status. + inline void GetOutputAndPadSizeInMklOrder( + size_t src_index, size_t filter_index, const memory::dims& strides, + const memory::dims& dilations, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r, bool is_grouped_convolution, bool is_depthwise) { + DCHECK(output_dims_tf_order); + DCHECK(output_dims_mkl_order); + DCHECK(pad_l); + DCHECK(pad_r); + + auto input_tf_shape = GetTfShape(context_, src_index); + auto filter_tf_shape = GetTfShape(context_, filter_index); + + if (strides_.size() == 4) { + // Conv2D + OP_REQUIRES(context_, input_tf_shape.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input_tf_shape.DebugString())); + OP_REQUIRES(context_, filter_tf_shape.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional", + filter_tf_shape.DebugString())); + } else { + // Conv3D + OP_REQUIRES(context_, input_tf_shape.dims() == 5, + errors::InvalidArgument("input must be 5-dimensional", + input_tf_shape.DebugString())); + OP_REQUIRES(context_, filter_tf_shape.dims() == 5, + errors::InvalidArgument("filter must be 5-dimensional", + filter_tf_shape.DebugString())); + } + + GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides, + dilations, output_dims_tf_order, + output_dims_mkl_order, pad_l, pad_r, + is_grouped_convolution, is_depthwise); + } + + // Wrapper function to calculate input, filter, and output sizes of + // Conv2D/Conv3D in MKL order: + // Conv2D: NCHW for input and output; OIHW for filter. + // Conv3D: NCDHW for input and output; OIDHW for filter. + // Function also calculates output shape in Tensorflow order. 
+ // Additionally, it also calculates strides and paddings. + // + // Function does not return anything, but sets error in context status. + inline void GetConvFwdSizesInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + memory::dims* input_dims, memory::dims* filter_dims, + memory::dims* strides, memory::dims* dilations, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r, bool* is_grouped_convolution, + bool pad_enabled = false, bool is_depthwise = false) { + DCHECK(input_dims); + DCHECK(filter_dims); + DCHECK(strides); + DCHECK(dilations); + DCHECK(output_dims_tf_order); + DCHECK(output_dims_mkl_order); + DCHECK(pad_l); + DCHECK(pad_r); + + GetInputSizeInMklOrder(input_shape, input_dims); + if (!context_->status().ok()) return; + GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims, + is_grouped_convolution, is_depthwise); + if (!context_->status().ok()) return; + GetStridesInMklOrder(strides); + GetDilationsInMklOrder(dilations); + GetOutputAndPadSizeInMklOrder( + input_shape, filter_shape, *strides, *dilations, output_dims_tf_order, + output_dims_mkl_order, pad_l, pad_r, *is_grouped_convolution, + pad_enabled, is_depthwise); + if (!context_->status().ok()) return; + } +}; + +///////////////////////////////////////////////////////////////////// +/// Common class that implements ConvBackpropFilter and Input +///////////////////////////////////////////////////////////////////// + +template +class MklConvBackpropCommonOp : public OpKernel { + public: + ~MklConvBackpropCommonOp() {} + explicit MklConvBackpropCommonOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format_str; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + int stride_n = GetTensorDim(strides_, data_format_, 'N'); + int stride_c = GetTensorDim(strides_, data_format_, 'C'); + OP_REQUIRES( + context, (stride_n == 1 && stride_c == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + + // Depthwise Convolution doesn't have dilation parameter + if (!is_depthwise) { + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_)); + if (strides_.size() == 4) { + // Check Conv2D dilations + OP_REQUIRES( + context, dilations_.size() == 4, + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); + int dilation_n = GetTensorDim(dilations_, data_format_, 'N'); + int dilation_c = GetTensorDim(dilations_, data_format_, 'C'); + int dilation_h = GetTensorDim(dilations_, data_format_, 'H'); + int dilation_w = GetTensorDim(dilations_, data_format_, 'W'); + OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilations in the batch and depth dimensions.")); + OP_REQUIRES( + context, dilation_h > 0 && dilation_w > 0, + errors::InvalidArgument("Dilated rates should be larger than 0.")); + } + } else { + // Set dilations as 1 for depthwise conv + // for future support to align with Tensorflow + dilations_ = {1, 1, 1, 1}; + } + + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + protected: + // data members accessible to derived classes. 
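[Editor's note] The constructor above rejects strides and dilations in the batch ('N') and channel ('C') dimensions and requires positive spatial factors, using GetTensorDim to index by data format. A small standalone sketch of that validation, assuming only the NHWC/NCHW layouts handled here; DimIndex and ValidateWindowAttr are illustrative stand-ins, not TensorFlow helpers.

#include <stdexcept>
#include <string>
#include <vector>

// Returns the index of dimension 'dim' ('N','H','W','C') in the given layout.
int DimIndex(const std::string& format, char dim) {
  const std::string order = (format == "NCHW") ? "NCHW" : "NHWC";
  return static_cast<int>(order.find(dim));
}

// Mirrors the attribute checks: 4 values, no striding/dilation in N or C,
// and strictly positive spatial factors.
void ValidateWindowAttr(const std::vector<int>& v, const std::string& format,
                        const char* name) {
  if (v.size() != 4)
    throw std::invalid_argument(std::string(name) + " must have 4 values");
  if (v[DimIndex(format, 'N')] != 1 || v[DimIndex(format, 'C')] != 1)
    throw std::invalid_argument(std::string(name) +
                                " not supported in batch/depth dimensions");
  if (v[DimIndex(format, 'H')] <= 0 || v[DimIndex(format, 'W')] <= 0)
    throw std::invalid_argument(std::string(name) +
                                " must be > 0 in spatial dimensions");
}

int main() {
  ValidateWindowAttr({1, 2, 2, 1}, "NHWC", "strides");    // passes
  ValidateWindowAttr({1, 1, 2, 2}, "NCHW", "dilations");  // passes
  return 0;
}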
+ std::vector dilations_; + std::vector strides_; + Padding padding_; + TensorFormat data_format_; // NCHW or NHWC +}; + +///////////////////////////////////////////////////////////////////// +/// Dummy Mkl op that is just used for operators that are intermediate +/// output of node fusion in the graph +///////////////////////////////////////////////////////////////////// + +template +class MklDummyOp : public OpKernel { + public: + ~MklDummyOp() {} + + explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + TF_CHECK_OK( + errors::Unimplemented("This is a dummy op." + "It should not have been invoked.")); + } +}; + +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_CONV_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_eltwise_activation_base_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_eltwise_activation_base_op.h new file mode 100644 index 00000000..a1d1268d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_eltwise_activation_base_op.h @@ -0,0 +1,351 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_ELTWISE_ACTIVATION_BASE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_ELTWISE_ACTIVATION_BASE_OP_H_ + +// See docs in ../ops/mkl_nn_ops.cc. + +#ifdef INTEL_MKL + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "dnnl.hpp" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_util.h" +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) +#include "tensorflow/core/platform/mutex.h" +#endif + +using dnnl::algorithm; +using dnnl::eltwise_forward; +using dnnl::memory; +using dnnl::prop_kind; +using dnnl::stream; + +using EltwiseFwdActivationPd = dnnl::eltwise_forward::primitive_desc; + +namespace tensorflow { +#ifndef ENABLE_ONEDNN_V3 +#define GET_MEMORY_DESC(md) md.data +#else +#define GET_MEMORY_DESC(md) md +#endif // !ENABLE_ONEDNN_V3 + +// TODO(tf-onednn): Consolidate this class with `MklEltWiseFwdParams` +// in `mkl_relu_op.cc`. 
+// +// The implementation of this class is very similar to it and it +// should be consolidated to one class +template +class MklEltwiseFwdActivationParams { + public: + memory::dims src_dims; + memory::desc src_md; +#ifdef ENABLE_ONEDNN_V3 + memory::desc dst_md; +#endif // ENABLE_ONEDNN_V3 + algorithm alg_kind; + float alpha; + float beta; + + MklEltwiseFwdActivationParams(memory::dims src_dims, memory::desc src_md, +#ifdef ENABLE_ONEDNN_V3 + memory::desc dst_md, +#endif // ENABLE_ONEDNN_V3 + algorithm alg_kind, float alpha, float beta) + : src_dims(src_dims), + src_md(src_md), +#ifdef ENABLE_ONEDNN_V3 + dst_md(dst_md), +#endif // ENABLE_ONEDNN_V3 + alg_kind(alg_kind), + alpha(alpha), + beta(beta) { + } +}; + +template +class MklEltwiseFwdActivationPrimitive : public MklPrimitive { + public: + explicit MklEltwiseFwdActivationPrimitive( + const MklEltwiseFwdActivationParams& fwdParams) + : MklPrimitive(engine(engine::kind::cpu, 0)) { + // create eltwise primitive + if (context_.eltwise_fwd == nullptr) { + Setup(fwdParams); + } + } + + ~MklEltwiseFwdActivationPrimitive() {} + + // Eltwise forward execute + // src_data: input data buffer of src + // dst_data: output data buffer of dst + void Execute(const T* src_data, T* dst_data, OpKernelContext* op_context) { +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex_lock lock(primitive_execution_mu_); +#endif + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + context_.dst_mem->set_data_handle(static_cast(dst_data)); + DCHECK_EQ(context_.fwd_primitives.size(), + context_.fwd_primitives_args.size()); + + std::vector net; + net.push_back(eltwise_forward(*context_.fwd_pd)); + std::vector net_args; + net_args.push_back( + {{DNNL_ARG_SRC, *context_.src_mem}, {DNNL_ARG_DST, *context_.dst_mem}}); + // execute eltwise_fwd primitve + ExecutePrimitive(net, &net_args, GetEngine(), op_context); + + // After execution, set data handle back. 
+ context_.src_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); + } + + std::shared_ptr GetEltwiseFwdActivationPd() { + return context_.fwd_pd; + } + + private: + // Primitive reuse context for eltwise Fwd ops: Relu, Elu, Tanh + struct EltwiseFwdActivationContext { + // oneDNN memory + std::shared_ptr src_mem; + std::shared_ptr dst_mem; + + // desc & primitive desc +#ifndef ENABLE_ONEDNN_V3 + std::shared_ptr fwd_desc; +#endif // !ENABLE_ONEDNN_V3 + std::shared_ptr fwd_pd; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr dst_md; + + // memory primitive desc + std::shared_ptr src_mpd; + + // Eltwise primitive + std::shared_ptr eltwise_fwd; + + std::vector fwd_primitives; + + std::vector> fwd_primitives_args; + + EltwiseFwdActivationContext() + : src_mem(nullptr), + dst_mem(nullptr), +#ifndef ENABLE_ONEDNN_V3 + fwd_desc(nullptr), +#endif // !ENABLE_ONEDNN_V3 + fwd_pd(nullptr), + src_md(nullptr), + dst_md(nullptr), + src_mpd(nullptr), + eltwise_fwd(nullptr) { + } + }; + + // Eltwise forward primitive setup + void Setup(const MklEltwiseFwdActivationParams& fwdParams) { + // create memory descriptors for eltwise data with specified format + context_.src_md.reset(new memory::desc(GET_MEMORY_DESC(fwdParams.src_md))); + context_.src_mpd.reset(new memory::desc(*context_.src_md)); + + // Create an eltwise forward descriptor and primitive descriptor +#ifndef ENABLE_ONEDNN_V3 + context_.fwd_desc.reset(new eltwise_forward::desc( + prop_kind::forward, fwdParams.alg_kind, *context_.src_md, + fwdParams.alpha, fwdParams.beta)); + context_.fwd_pd.reset( + new EltwiseFwdActivationPd(*context_.fwd_desc, cpu_engine_)); +#else + context_.dst_md.reset(new memory::desc(fwdParams.dst_md)); + context_.fwd_pd.reset(new EltwiseFwdActivationPd( + cpu_engine_, prop_kind::forward, fwdParams.alg_kind, *context_.src_md, + *context_.dst_md, fwdParams.alpha, fwdParams.beta)); +#endif // !ENABLE_ONEDNN_V3 + auto fwd_pd = context_.fwd_pd.get(); + + // Create memory primitive based on dummy data + context_.src_mem.reset( + new memory(fwd_pd->src_desc(), cpu_engine_, DummyData)); + context_.dst_mem.reset( + new memory(fwd_pd->dst_desc(), cpu_engine_, DummyData)); + // Create eltwise primitive and add it to net + context_.eltwise_fwd.reset(new eltwise_forward(*context_.fwd_pd)); + context_.fwd_primitives_args.push_back( + {{DNNL_ARG_SRC, *context_.src_mem}, {DNNL_ARG_DST, *context_.dst_mem}}); + context_.fwd_primitives.push_back(*context_.eltwise_fwd); + } + + struct EltwiseFwdActivationContext context_; + +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex primitive_execution_mu_; +#endif +}; + +template +class MklEltwiseFwdActivationPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklEltwiseFwdActivationPrimitive* Get( + const MklEltwiseFwdActivationParams& fwdParams) { + MklEltwiseFwdActivationPrimitive* eltwise_forward = nullptr; + + // Get a eltwise fwd primitive from the cached pool + eltwise_forward = static_cast*>( + MklEltwiseFwdActivationPrimitiveFactory::GetInstance() + .GetEltwiseFwdActivation(fwdParams)); + if (eltwise_forward == nullptr) { + eltwise_forward = new MklEltwiseFwdActivationPrimitive(fwdParams); + MklEltwiseFwdActivationPrimitiveFactory::GetInstance() + .SetEltwiseFwdActivation(fwdParams, eltwise_forward); + } + + return eltwise_forward; + } + + static MklEltwiseFwdActivationPrimitiveFactory& GetInstance() { + static MklEltwiseFwdActivationPrimitiveFactory instance_; + return instance_; + } + + private: + 
MklEltwiseFwdActivationPrimitiveFactory() {} + ~MklEltwiseFwdActivationPrimitiveFactory() {} + + static string CreateKey(const MklEltwiseFwdActivationParams& fwdParams) { + string prefix = "eltwise_fwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(fwdParams.src_dims); + key_creator.AddAsKey(static_cast(fwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(fwdParams.alpha)); + key_creator.AddAsKey(static_cast(fwdParams.beta)); + return key_creator.GetKey(); + } + + MklPrimitive* GetEltwiseFwdActivation( + const MklEltwiseFwdActivationParams& fwdParams) { + string key = CreateKey(fwdParams); + return this->GetOp(key); + } + + void SetEltwiseFwdActivation( + const MklEltwiseFwdActivationParams& fwdParams, MklPrimitive* op) { + string key = CreateKey(fwdParams); + this->SetOp(key, op); + } +}; + +template +class MklEltwiseFwdActivationOpBase : public OpKernel { + public: + ~MklEltwiseFwdActivationOpBase() {} + + explicit MklEltwiseFwdActivationOpBase(OpKernelConstruction* context, + float alpha, float beta) + : OpKernel(context), alpha_(alpha), beta_(beta) {} + virtual void Compute_Scalar(OpKernelContext* context) = 0; + + void Compute(OpKernelContext* context) override { + try { + const Tensor& src_tensor = context->input(0); + TensorShape src_shape = src_tensor.shape(); + if (src_tensor.dims() == 0) { + Compute_Scalar(context); + return; + } + // Allocate output (dst) tensor + TensorShape dst_shape = src_shape; + Tensor* dst_tensor = nullptr; + // Nothing to compute, return. + if (src_shape.num_elements() == 0) { + OP_REQUIRES_OK(context, + context->allocate_output( + GetTensorDataIndex(0, context->num_outputs()), + dst_shape, &dst_tensor)); + return; + } + // Set DNN primitive - src + MklDnnData src(&cpu_engine); + memory::dims src_dims; + memory::desc src_md({}, memory::data_type::undef, + memory::format_tag::undef); + + src_dims = TFShapeToMklDnnDims(src_tensor.shape()); + auto src_strides = CalculateTFStrides(src_dims); + + // Create blocked memory descriptor + src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); + +#ifdef ENABLE_ONEDNN_V3 + memory::desc dst_md = src_md; +#endif // ENABLE_ONEDNN_V3 + + // Try to get an eltwise forward primitive from caching pool + MklEltwiseFwdActivationParams fwdParams(src_dims, src_md, +#ifdef ENABLE_ONEDNN_V3 + dst_md, +#endif // ENABLE_ONEDNN_V3 + alg_kind, alpha_, beta_); + MklEltwiseFwdActivationPrimitive* eltwise_fwd = + MklEltwiseFwdActivationPrimitiveFactory::Get(fwdParams); + + const T* src_data = src_tensor.flat().data(); + + OP_REQUIRES_OK(context, context->allocate_output( + GetTensorDataIndex(0, context->num_outputs()), + dst_shape, &dst_tensor)); + + T* dst_data = dst_tensor->flat().data(); + // execute eltwise + eltwise_fwd->Execute(src_data, dst_data, context); + } catch (dnnl::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); + } + } + + private: + engine cpu_engine = engine(engine::kind::cpu, 0); + + protected: + float alpha_; + float beta_; +}; + +// TODO : Implement Eltwise bwd / eltwiseGrad class + +#undef GET_MEMORY_DESC + +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_ELTWISE_ACTIVATION_BASE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_kernel_util.h 
b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_kernel_util.h new file mode 100644 index 00000000..da600fb0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_kernel_util.h @@ -0,0 +1,135 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_KERNEL_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_KERNEL_UTIL_H_ + +#ifdef INTEL_MKL + +#include "dnnl.hpp" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/public/session.h" +#include "tsl/platform/status.h" + +using dnnl::memory; + +using dnnl::memory; + +namespace tensorflow { + +class MklTestingUtil { + public: + static void RunMklQuantizeOp(const Tensor& input, const float input_min, + const float input_max, DataType type, + string mode, Tensor* output); + static void RunDequantizeOp(const Tensor& input, const Tensor& input_min, + const Tensor& input_max, string mode, + Tensor* output); + + static void RunGraph(const tensorflow::GraphDef graph_def, + const string& fetch, Tensor* output); + template + static void ComputeMinMax(const Tensor& tf_tensor, T* tensor_min, + T* tensor_max) { + auto eigen_tensor = tf_tensor.flat(); + Eigen::Tensor min = eigen_tensor.minimum(); + Eigen::Tensor max = eigen_tensor.maximum(); + *tensor_min = min(); + *tensor_max = max(); + } + + // This utility function mimics Quantization of float/bfloat16 tensor with + // oneDNN backend QuantizeV2 operation. Since the op signature requires min + // and max values to be in float type, min_tensor and max_tensor should have + // their dtype set to DT_FLOAT. + template + static Status GetQuantizationTensors(const Tensor& input, Tensor* output, + DataType out_type, const string mode, + Tensor* min_tensor, Tensor* max_tensor) { + if (min_tensor->dtype() != DT_FLOAT || max_tensor->dtype() != DT_FLOAT) { + return absl::UnimplementedError("Tensor must be float32."); + } + T min; + T max; + ComputeMinMax(input, &min, &max); + + float adjusted_min = static_cast(min); + float adjusted_max = static_cast(max); + if (mode == "SCALED") { + if (output->dtype() != DT_QINT8) { + return absl::UnimplementedError("Tensor must be QInt8 in SCALED mode."); + } + float range = std::max(std::abs(adjusted_min), std::abs(adjusted_max)); + adjusted_min = -range; + adjusted_max = range; + } + RunMklQuantizeOp(input, adjusted_min, adjusted_max, out_type, mode, output); + min_tensor->flat()(0) = adjusted_min; + max_tensor->flat()(0) = adjusted_max; + + return OkStatus(); + } +}; + +#ifdef ENABLE_ONEDNN_V3 +// Since oneDNN v3.x exposes only an opaque memory descriptor, it is no longer +// possible to cache the entire filter memory descriptor as is. So we store +// all relevant information about it in the following class. +// +// TODO(intel-tf): When oneDNN major version changes to v4.x, weight +// caching may not work as expected if the underlying memory descriptor +// has changed (i.e. 
compared to v3.x). We have to return a status here +// to catch oneDNN major version change to avoid unexpected results. +class FilterMemoryDesc { + public: + FilterMemoryDesc() {} + + explicit FilterMemoryDesc(int ndims, int inner_nblks, + memory::data_type data_type, + const memory::dims& dims, + const memory::dims& inner_blks, + const memory::dims& inner_idxs, + const memory::dims& strides) + : ndims_(ndims), + inner_nblks_(inner_nblks), + data_type_(data_type), + dims_(dims), + inner_blks_(inner_blks), + inner_idxs_(inner_idxs), + strides_(strides) {} + + ~FilterMemoryDesc() {} + + bool operator==(const FilterMemoryDesc& other) const { + return (ndims_ == other.ndims_ && inner_nblks_ == other.inner_nblks_ && + data_type_ == other.data_type_ && dims_ == other.dims_ && + inner_blks_ == other.inner_blks_ && + inner_idxs_ == other.inner_idxs_ && strides_ == other.strides_); + } + + private: + int ndims_; + int inner_nblks_; + memory::data_type data_type_; + memory::dims dims_; + memory::dims inner_blks_; + memory::dims inner_idxs_; + memory::dims strides_; +}; +#endif // ENABLE_ONEDNN_V3 +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_KERNEL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h new file mode 100644 index 00000000..8af21582 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h @@ -0,0 +1,1219 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_MATMUL_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_MATMUL_OPS_COMMON_H_ + +#if defined(INTEL_MKL) +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "dnnl.hpp" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/kernels/mkl/mkl_kernel_util.h" +#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/util/onednn_env_vars.h" +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) +#include "tensorflow/core/platform/mutex.h" +#endif + +using dnnl::inner_product_forward; +using dnnl::primitive_attr; +using dnnl::prop_kind; +using dnnl::stream; + +namespace tensorflow { + +#ifndef ENABLE_ONEDNN_V3 +#define APPEND_ELTWISE(scale, alg, alpha, beta) \ + append_eltwise(scale, alg, alpha, beta) +#define APPEND_ELTWISE_RELU6(scale, alpha, beta) \ + append_eltwise(scale, dnnl::algorithm::eltwise_bounded_relu, alpha, beta) +#define OUTPUT_SCALE_DCHECK (post_op_param.name == "output_scale") +#define SET_MKL_LAYOUT(md) SetMklLayout(&md) +#define TSCALED_BIAS Tbias +#else +#define APPEND_ELTWISE(scale, alg, alpha, beta) \ + append_eltwise(alg, alpha, beta); \ + (void)scale +#define APPEND_ELTWISE_RELU6(scale, alpha, beta) \ + append_eltwise(dnnl::algorithm::eltwise_clip, 0.0, alpha); \ + (void)scale; \ + (void)beta +#define OUTPUT_SCALE_DCHECK \ + (post_op_param.name == "src_scale") || \ + (post_op_param.name == "wei_scale") || \ + (post_op_param.name == "dst_scale") +#define SET_MKL_LAYOUT(md) SetMklLayout(md) +#define TSCALED_BIAS float +#endif // !ENABLE_ONEDNN_V3 + +#if !defined(ENABLE_ONEDNN_OPENMP) && !defined(ENABLE_ONEDNN_V3) +#define FWD_STREAM , *fwd_stream +#else +#define FWD_STREAM +#endif // !ENABLE_ONEDNN_OPENMP && !ENABLE_ONEDNN_V3 + +static Eigen::internal::CacheSizes cache_sizes = Eigen::internal::CacheSizes(); + +typedef Eigen::ThreadPoolDevice CPUDevice; +inline bool ExecuteSingleThreadedGemm(int64_t m, int64_t n, int64_t k, + int bytes) { + // Ideally we would like to determine blocking and then come up with + // a heuristic but what we are targeting are very small models whose + // total size is < x*L2. So we will do this simple calculation + // to determine if the matrix multiplication should be run on a single thread. + // TODO(Intel-tf): this needs to be vastly improved, perhaps at a lower level + // than the integration. + ptrdiff_t l2_size = cache_sizes.m_l2; + constexpr float kHeuristicMultiplier = 1.01; + const float mul_size = bytes * (m * n + k * (m + n)); + const float l2_heur = l2_size * kHeuristicMultiplier; + return (mul_size >= 0 && mul_size < l2_heur); +} + +// This structure aggregates multiple inputs to MklDnnMatMul* methods. 
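[Editor's note] ExecuteSingleThreadedGemm above routes very small matmuls to a single thread when the A, B and C buffers fit comfortably in L2, using the byte count bytes * (m*n + k*(m+n)) against 1.01 * L2. A standalone sketch of the same arithmetic, with the L2 size passed in explicitly instead of read from Eigen's CacheSizes; FitsInL2 and the 1 MiB figure are illustrative assumptions.

#include <cstdint>
#include <iostream>

// Returns true when an m-by-k times k-by-n product (element size 'bytes')
// is small enough that threading overhead likely outweighs the benefit.
bool FitsInL2(int64_t m, int64_t n, int64_t k, int bytes, int64_t l2_bytes) {
  constexpr double kHeuristicMultiplier = 1.01;
  const double working_set =
      static_cast<double>(bytes) * (m * n + k * (m + n));
  return working_set >= 0 && working_set < l2_bytes * kHeuristicMultiplier;
}

int main() {
  const int64_t l2 = 1 << 20;  // assume a 1 MiB L2 cache for illustration
  std::cout << FitsInL2(64, 64, 64, /*bytes=*/4, l2) << "\n";    // 1: ~48 KiB
  std::cout << FitsInL2(1024, 1024, 1024, 4, l2) << "\n";        // 0: ~12 MiB
  return 0;
}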
+struct MklDnnMatMulFwdParams { + memory::dims src_dims; + memory::dims weight_dims; + memory::dims bias_dims; + memory::dims dst_dims; + memory::format_tag src_format; + memory::format_tag weight_format; + memory::format_tag dst_format; + string dtypes = string(""); + bool const_weight; + struct PostOpParam { + string name; + std::vector param; + string partial_key; + }; + std::vector post_op_params; + string input_quant_mode; + + MklDnnMatMulFwdParams( + memory::dims src_dims, memory::dims weight_dims, memory::dims bias_dims, + memory::dims dst_dims, + memory::format_tag src_format = memory::format_tag::any, + memory::format_tag weight_format = memory::format_tag::any, + memory::format_tag dst_format = memory::format_tag::any, + bool const_weight = false) + : src_dims(src_dims), + weight_dims(weight_dims), + bias_dims(bias_dims), + dst_dims(dst_dims), + src_format(src_format), + weight_format(weight_format), + dst_format(dst_format), + const_weight(const_weight) {} +}; + +// With quantization, input, weight, bias, and output can have different types. +// So we use different template parameters for each type. +// TODO(intel-tf): The template type "T" is currently used to match the +// templatized class MklPrimitiveFactory (tensorflow/core/util/mkl_util.h). +// In the future, with the removal of "T" from MklPrimitiveFactory, this class +// needs to drop "T". +template +class MklDnnMatMulFwdPrimitive : public MklPrimitive { + public: + explicit MklDnnMatMulFwdPrimitive( + const MklDnnMatMulFwdParams& matmulFwdParams) + : MklPrimitive(engine(engine::kind::cpu, 0)) { + // Create matmul primitive + if (context_.matmul_fwd == nullptr) { + Setup(matmulFwdParams); + } + } + + ~MklDnnMatMulFwdPrimitive() {} + + dnnl::memory::desc GetScratchPadDesc() { + return context_.fwd_pd->scratchpad_desc(); + } + + // Inner-product forward execute with bias: + // - src_data: input data buffer of src + // - weight_data: input data buffer of weight + // - bias_data: input data buffer of bias + // - dst_data: output data buffer of dst + // - sp_data: scratchpad data + void Execute(const Tinput* src_data, const Tweight* weight_data, + const void* bias_data, Toutput* dst_data, + const MklDnnMatMulFwdParams& matmul_fwd_params, void* sp_data, + std::shared_ptr fwd_stream) { +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex_lock lock(primitive_execution_mu_); +#endif + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)) FWD_STREAM); + context_.weight_mem->set_data_handle( + static_cast(const_cast(weight_data)) FWD_STREAM); + context_.bias_mem->set_data_handle(const_cast(bias_data) FWD_STREAM); + context_.dst_mem->set_data_handle(static_cast(dst_data) FWD_STREAM); + context_.sp_mem->set_data_handle(sp_data FWD_STREAM); + auto const& post_op_params = matmul_fwd_params.post_op_params; + if (!post_op_params.empty()) { + for (auto const& post_op_param : post_op_params) { + if (post_op_param.name == "src_scale") { + context_.src_scale_mem->set_data_handle(static_cast( + const_cast(post_op_param.param.data())) FWD_STREAM); + } else if (post_op_param.name == "wei_scale") { + context_.wei_scale_mem->set_data_handle(static_cast( + const_cast(post_op_param.param.data())) FWD_STREAM); + } else if (post_op_param.name == "dst_scale") { + context_.dst_scale_mem->set_data_handle(static_cast( + const_cast(post_op_param.param.data())) FWD_STREAM); + } + } + } + + execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args); + + // After execution, set data handle back + 
context_.src_mem->set_data_handle(DummyData); + context_.weight_mem->set_data_handle(DummyData); + context_.bias_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); + } + + std::shared_ptr + GetPrimitiveDesc() const { + return context_.fwd_pd; + } + + private: + // Primitive reuse context for inner-product Fwd op + struct MklDnnMatMulFwdContext { + // oneDNN memory. + std::shared_ptr src_mem; + std::shared_ptr weight_mem; + std::shared_ptr bias_mem; + std::shared_ptr dst_mem; + std::shared_ptr sp_mem; + // Quantization scale related memory + std::shared_ptr src_scale_mem; + std::shared_ptr wei_scale_mem; + std::shared_ptr dst_scale_mem; + + // Descriptor and primitive-descriptor for forward inner-product. +#ifndef ENABLE_ONEDNN_V3 + std::shared_ptr fwd_desc; +#endif // !ENABLE_ONEDNN_V3 + std::shared_ptr fwd_pd; + + // Memory descriptors. + std::shared_ptr src_md; + std::shared_ptr weight_md; + std::shared_ptr bias_md; + std::shared_ptr dst_md; + // Quantization scale related memory descriptors + std::shared_ptr src_scale_md; + std::shared_ptr wei_scale_md; + std::shared_ptr dst_scale_md; + + // Inner-product primitive. + std::shared_ptr matmul_fwd; + std::vector fwd_primitives; + + std::vector> net_args; + + MklDnnMatMulFwdContext() + : src_mem(nullptr), + weight_mem(nullptr), + bias_mem(nullptr), + dst_mem(nullptr), + sp_mem(nullptr), + src_scale_mem(nullptr), + wei_scale_mem(nullptr), + dst_scale_mem(nullptr), +#ifndef ENABLE_ONEDNN_V3 + fwd_desc(nullptr), +#endif // ENABLE_ONEDNN_V3 + fwd_pd(nullptr), + src_md(nullptr), + weight_md(nullptr), + bias_md(nullptr), + dst_md(nullptr), + src_scale_md(nullptr), + wei_scale_md(nullptr), + dst_scale_md(nullptr), + matmul_fwd(nullptr) { + } + }; + + void Setup(const MklDnnMatMulFwdParams& matmul_fwd_params) { + // Create memory descriptors for inner-product data without specified + // format. + context_.src_md.reset(new memory::desc({matmul_fwd_params.src_dims}, + MklDnnType(), + matmul_fwd_params.src_format)); + + context_.weight_md.reset(new memory::desc({matmul_fwd_params.weight_dims}, + MklDnnType(), +#ifdef DNNL_AARCH64_USE_ACL + memory::format_tag::any)); +#else + matmul_fwd_params.weight_format)); +#endif + + context_.dst_md.reset(new memory::desc({matmul_fwd_params.dst_dims}, + MklDnnType(), + matmul_fwd_params.dst_format)); + + memory::data_type bias_dt; +#ifndef ENABLE_ONEDNN_V3 + bias_dt = MklDnnType(); +#else + if (std::is_same::value) { + // For QuantizedMatMul, bias needs to be passed to oneDNN as float of + // bfloat16 (even if Tbias is qint32). + if (std::is_same::value && + matmul_fwd_params.input_quant_mode == "SCALED") { + bias_dt = MklDnnType(); + } else { + bias_dt = MklDnnType(); + } + } else { + bias_dt = MklDnnType(); + } +#endif // !ENABLE_ONEDNN_V3 + context_.bias_md.reset(new memory::desc({matmul_fwd_params.bias_dims}, + bias_dt, memory::format_tag::any)); + + // Create an inner-product. +#ifndef ENABLE_ONEDNN_V3 + context_.fwd_desc.reset(new inner_product_forward::desc( + matmul_fwd_params.const_weight ? 
prop_kind::forward_inference + : prop_kind::forward_training, + *context_.src_md, *context_.weight_md, *context_.bias_md, + *context_.dst_md)); + context_.fwd_pd.reset(new inner_product_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); +#endif // !ENABLE_ONEDNN_V3 + + // Check if there is any fusion as post-ops + auto const& post_op_params = matmul_fwd_params.post_op_params; + dnnl::primitive_attr post_ops_attr; + post_ops_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + dnnl::post_ops post_ops; + std::unordered_map is_scale_set; + if (!post_op_params.empty()) { + for (auto const& post_op_param : post_op_params) { + if (post_op_param.name == "Relu" || post_op_param.name == "LeakyRelu") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_relu, + op_alpha, op_beta); + } else if (post_op_param.name == "Relu6") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE_RELU6(op_scale, op_alpha, op_beta); + } else if (post_op_param.name == "Elu") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_elu, + op_alpha, op_beta); + } else if (post_op_param.name == "GeluApproximate") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_gelu_tanh, + op_alpha, op_beta); + } else if (post_op_param.name == "GeluExact") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_gelu_erf, + op_alpha, op_beta); + } else if (post_op_param.name == "Tanh") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_tanh, + op_alpha, op_beta); + } else if (post_op_param.name == "Sigmoid") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_logistic, + op_alpha, op_beta); + } else if (post_op_param.name == "linear") { + DCHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.APPEND_ELTWISE(op_scale, dnnl::algorithm::eltwise_linear, + op_alpha, op_beta); +#ifndef ENABLE_ONEDNN_V3 + } else if (post_op_param.name == "output_scale") { + if (post_op_param.param.size() == 1) { + post_ops_attr.set_output_scales(0, post_op_param.param); + } else { + post_ops_attr.set_output_scales(2, post_op_param.param); + } +#else + } else if (post_op_param.name == "src_scale") { + is_scale_set.insert({"src", true}); + post_ops_attr.set_scales_mask(DNNL_ARG_SRC, 0); + 
context_.src_scale_md.reset(new memory::desc({1}, MklDnnType(), + memory::format_tag::x)); + context_.src_scale_mem.reset( + new memory(*context_.src_scale_md, cpu_engine_, DummyData)); + } else if (post_op_param.name == "wei_scale") { + is_scale_set.insert({"wei", true}); + const int scale_size = post_op_param.param.size(); + const int mask = scale_size == 1 ? 0 : 1; + post_ops_attr.set_scales_mask(DNNL_ARG_WEIGHTS, mask); + context_.wei_scale_md.reset(new memory::desc( + {scale_size}, MklDnnType(), memory::format_tag::x)); + context_.wei_scale_mem.reset( + new memory(*context_.wei_scale_md, cpu_engine_, DummyData)); + } else if (post_op_param.name == "dst_scale") { + is_scale_set.insert({"dst", true}); + const int scale_size = post_op_param.param.size(); + const int mask = scale_size == 1 ? 0 : 1; + post_ops_attr.set_scales_mask(DNNL_ARG_DST, mask); + context_.dst_scale_md.reset(new memory::desc({1}, MklDnnType(), + memory::format_tag::x)); + context_.dst_scale_mem.reset( + new memory(*context_.dst_scale_md, cpu_engine_, DummyData)); +#endif // !ENABLE_ONEDNN_V3 + } else if (post_op_param.name == "sum") { + DCHECK_EQ(post_op_param.param.size(), 1); + float op_scale = post_op_param.param[0]; + post_ops.append_sum(op_scale); + + } else { + DCHECK((post_op_param.name == "Relu") || + (post_op_param.name == "Relu6") || + (post_op_param.name == "Elu") || + (post_op_param.name == "GeluApproximate") || + (post_op_param.name == "GeluExact") || + (post_op_param.name == "Tanh") || + (post_op_param.name == "Sigmoid") || + (post_op_param.name == "sum") || + (post_op_param.name == "Leakyrelu") || OUTPUT_SCALE_DCHECK); + } + } + post_ops_attr.set_post_ops(post_ops); + } + +#ifndef ENABLE_ONEDNN_V3 + context_.fwd_pd.reset(new inner_product_forward::primitive_desc( + *context_.fwd_desc, post_ops_attr, cpu_engine_)); +#else + context_.fwd_pd.reset(new inner_product_forward::primitive_desc( + cpu_engine_, + matmul_fwd_params.const_weight ? prop_kind::forward_inference + : prop_kind::forward_training, + *context_.src_md, *context_.weight_md, *context_.bias_md, + *context_.dst_md, post_ops_attr)); +#endif // !ENABLE_ONEDNN_V3 + + // Create memory primitive based on dummy data + context_.src_mem.reset( + new memory(context_.fwd_pd.get()->src_desc(), cpu_engine_, DummyData)); + context_.weight_mem.reset(new memory(context_.fwd_pd.get()->weights_desc(), + cpu_engine_, DummyData)); + context_.dst_mem.reset( + new memory(context_.fwd_pd.get()->dst_desc(), cpu_engine_, DummyData)); + context_.bias_mem.reset( + new memory(context_.fwd_pd.get()->bias_desc(), cpu_engine_, DummyData)); + auto scratchpad_md = context_.fwd_pd->scratchpad_desc(); + context_.sp_mem.reset( + new dnnl::memory(scratchpad_md, cpu_engine_, DummyData)); + + // Create inner-product primitive. 
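[Editor's note] Setup() above maps fusion names such as "Relu", "Relu6", "Elu", "Tanh", "Sigmoid" and "linear" onto oneDNN eltwise post-ops applied to the inner-product output (under ENABLE_ONEDNN_V3, "Relu6" is expressed as a clip, which is what APPEND_ELTWISE_RELU6 expands to). The sketch below only illustrates what a subset of those fused activations compute elementwise; ApplyPostOp is a hypothetical name and not part of oneDNN.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>

// Elementwise activation matching the fused post-op with the given name.
// alpha/beta carry the same meaning as the post-op parameters above.
double ApplyPostOp(const std::string& name, double x, double alpha,
                   double beta) {
  if (name == "Relu")    return x > 0 ? x : alpha * x;  // LeakyRelu if alpha != 0
  if (name == "Relu6")   return std::min(std::max(x, 0.0), 6.0);  // clip(0, 6)
  if (name == "Elu")     return x > 0 ? x : alpha * (std::exp(x) - 1.0);
  if (name == "Tanh")    return std::tanh(x);
  if (name == "Sigmoid") return 1.0 / (1.0 + std::exp(-x));
  if (name == "linear")  return alpha * x + beta;
  return x;  // unknown fusion: identity in this sketch
}

int main() {
  std::cout << ApplyPostOp("Relu6", 7.5, 0, 0) << "\n";       // 6
  std::cout << ApplyPostOp("linear", 2.0, 3.0, 1.0) << "\n";  // 7
  return 0;
}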
+ context_.matmul_fwd.reset(new inner_product_forward(*context_.fwd_pd)); + std::unordered_map net_args = { + {DNNL_ARG_SRC, *context_.src_mem}, + {DNNL_ARG_WEIGHTS, *context_.weight_mem}, + {DNNL_ARG_BIAS, *context_.bias_mem}, + {DNNL_ARG_SCRATCHPAD, *context_.sp_mem}, + {DNNL_ARG_DST, *context_.dst_mem}}; +#ifdef ENABLE_ONEDNN_V3 + if (is_scale_set["src"]) { + net_args.insert( + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, *context_.src_scale_mem}); + } + if (is_scale_set["wei"]) { + net_args.insert( + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, *context_.wei_scale_mem}); + } + if (is_scale_set["dst"]) { + net_args.insert( + {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, *context_.dst_scale_mem}); + } +#endif // ENABLE_ONEDNN_V3 + context_.net_args.push_back(net_args); + context_.fwd_primitives.push_back(*context_.matmul_fwd); + return; + } + + struct MklDnnMatMulFwdContext context_; + +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + // Guards Execution() + mutex primitive_execution_mu_; +#endif +}; + +template +class MklDnnMatMulFwdPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklDnnMatMulFwdPrimitive* Get( + const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims, bool do_not_cache) { + MklDnnMatMulFwdPrimitive* matmul_fwd = + nullptr; + + if (do_not_cache) { + // Always create new primitive + matmul_fwd = + new MklDnnMatMulFwdPrimitive( + mkldnn_matmul_fwd_dims); + } else { + // Try to find a suitable one in pool + matmul_fwd = dynamic_cast< + MklDnnMatMulFwdPrimitive*>( + MklDnnMatMulFwdPrimitiveFactory::GetInstance() + .GetMklDnnMatMulFwd(mkldnn_matmul_fwd_dims)); + if (matmul_fwd == nullptr) { + matmul_fwd = + new MklDnnMatMulFwdPrimitive( + mkldnn_matmul_fwd_dims); + MklDnnMatMulFwdPrimitiveFactory::GetInstance() + .SetMklDnnMatMulFwd(mkldnn_matmul_fwd_dims, matmul_fwd); + } + } + return matmul_fwd; + } + + private: + MklDnnMatMulFwdPrimitiveFactory() {} + ~MklDnnMatMulFwdPrimitiveFactory() {} + + static MklDnnMatMulFwdPrimitiveFactory& GetInstance() { + static MklDnnMatMulFwdPrimitiveFactory instance_; + return instance_; + } + + static string CreateKey(const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims) { + string prefix = "matmul_fwd_"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.src_dims); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.weight_dims); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.bias_dims); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.dst_dims); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.dtypes); + key_creator.AddAsKey(mkldnn_matmul_fwd_dims.weight_format); + + // Generate keys for post-ops + for (auto const& post_op_param : mkldnn_matmul_fwd_dims.post_op_params) { + if (post_op_param.name == "Relu" || post_op_param.name == "Relu6" || + post_op_param.name == "Elu" || post_op_param.name == "Tanh" || + post_op_param.name == "Sigmoid" || + post_op_param.name == "LeakyRelu" || + post_op_param.name == "GeluApproximate" || + post_op_param.name == "GeluExact" || post_op_param.name == "linear") { + DCHECK_EQ(post_op_param.param.size(), 3); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); + key_creator.AddAsKey(post_op_param.param[1]); + key_creator.AddAsKey(post_op_param.param[2]); + } else if (post_op_param.name == "sum") { + DCHECK_EQ(post_op_param.param.size(), 1); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); +#ifndef ENABLE_ONEDNN_V3 + } else if (post_op_param.name == "output_scale") { 
+#else + } else if (post_op_param.name == "src_scale" || + post_op_param.name == "wei_scale" || + post_op_param.name == "dst_scale") { +#endif // !ENABLE_ONEDNN_V3 + key_creator.AddAsKey(post_op_param.name); + if (post_op_param.partial_key.empty()) { + DCHECK_GE(post_op_param.param.size(), 1); + // Old Quantized MatMul kernels do not create part of key beforehand + // as primitive caching-key-creation optimization. + key_creator.AddAsKey(post_op_param.param[0]); + } else { + // New Quantized MatMul kernels pre-create partial key. + key_creator.AddAsKey(post_op_param.partial_key); + } + } else { + return string("not_a_key"); + } + } + return key_creator.GetKey(); + } + + MklPrimitive* GetMklDnnMatMulFwd( + const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims) { + string key = CreateKey(mkldnn_matmul_fwd_dims); + return this->GetOp(key); + } + + void SetMklDnnMatMulFwd(const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims, + MklPrimitive* op) { + string key = CreateKey(mkldnn_matmul_fwd_dims); + this->SetOp(key, op); + } +}; + +template +class MklDnnMatMulOpBase : public OpKernel { + public: + explicit MklDnnMatMulOpBase(OpKernelConstruction* context) + : OpKernel(context) {} + void Compute(OpKernelContext* context) override = 0; + + // Allocate output tensor. + virtual void AllocateOutputTensor( + OpKernelContext* context, + const inner_product_forward::primitive_desc& mkldnn_matmul_prim_desc, + const memory::dims& output_dims_mkl_order, + MklTensorFormat output_tf_format, Tensor** output_tensor, + bool native_format = false) { + DCHECK(output_tensor); + auto dst_pd = mkldnn_matmul_prim_desc.dst_desc(); + + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SET_MKL_LAYOUT(dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); + + TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput))); + + if (native_format) { + output_tf_shape = output_mkl_shape.GetTfShape(); + } + // Allocate Output Tensor + AllocateOutputSetMklShape(context, kOutputIndexDst, output_tensor, + output_tf_shape, output_mkl_shape, native_format); + } + + // TF_LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot + // be acquired before entering the function, since it is acquired + // inside the function. + inline bool IsWeightCacheEmpty(OpKernelContext* context) + TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock lock(mu_); + return (weight_oi_.NumElements() == 0); + } + + // Cache the converted weight in a tensor. + // Only one thread can execute this method at any given time. 
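[Editor's note] CreateKey() above serializes shapes, dtypes and post-op parameters into a string, and the factory keeps exactly one compiled primitive per distinct key (GetOp/SetOp on the base factory). A minimal standalone sketch of that caching pattern, with KeyBuilder, Primitive and PrimitiveFactory as simplified stand-ins for FactoryKeyCreator and the MklPrimitiveFactory machinery.

#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for FactoryKeyCreator: joins fields with delimiters.
class KeyBuilder {
 public:
  template <typename T>
  void Add(const T& v) { os_ << v << ':'; }
  void Add(const std::vector<int64_t>& dims) {
    for (auto d : dims) os_ << d << 'x';
    os_ << ':';
  }
  std::string Key() const { return os_.str(); }
 private:
  std::ostringstream os_;
};

struct Primitive {  // placeholder for a compiled oneDNN primitive
  explicit Primitive(std::string k) : key(std::move(k)) {}
  std::string key;
};

// One primitive per distinct key; reused by ops with identical parameters.
class PrimitiveFactory {
 public:
  std::shared_ptr<Primitive> Get(const std::string& key) {
    auto it = pool_.find(key);
    if (it != pool_.end()) return it->second;      // cache hit
    auto prim = std::make_shared<Primitive>(key);  // build once
    pool_.emplace(key, prim);
    return prim;
  }
 private:
  std::unordered_map<std::string, std::shared_ptr<Primitive>> pool_;
};

int main() {
  KeyBuilder kb;
  kb.Add(std::string("matmul_fwd_"));
  kb.Add(std::vector<int64_t>{32, 64});   // src dims
  kb.Add(std::vector<int64_t>{64, 128});  // weight dims
  PrimitiveFactory factory;
  auto a = factory.Get(kb.Key());
  auto b = factory.Get(kb.Key());
  std::cout << (a == b) << "\n";  // 1: second lookup reuses the cached object
  return 0;
}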
+ void CacheWeight( + OpKernelContext* context, + const std::shared_ptr& + matmul_fwd_pd, + Tweight* weight_data, const Tensor& weight_tensor, + MklDnnData& weight, const memory::desc& weight_md) + TF_LOCKS_EXCLUDED(mu_) { + mutex_lock lock(mu_); + const Tensor& weight_t = weight_oi_; + + // If the weights are already cached, there's nothing to do + if (weight_t.NumElements() > 0) { + return; + } + +#ifdef ENABLE_ONEDNN_V3 + // For now, cache weights only for blocked format + if (weight_md.get_format_kind() != memory::format_kind::blocked) { + return; + } +#endif // ENABLE_ONEDNN_V3 + + // reorder and cache the weight + weight.SetUsrMem(weight_md, &weight_tensor); + weight.CheckReorderToOpMem(matmul_fwd_pd.get()->weights_desc(), cpu_engine_, + context); + weight_data = static_cast(weight.GetOpMem().get_data_handle()); + + size_t weight_size = matmul_fwd_pd.get()->weights_desc().get_size(); + TensorShape weight_tf_shape; + weight_tf_shape.AddDim(weight_size / sizeof(Tweight)); + + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + weight_tf_shape, &weight_oi_)); + + void* weight_oi_t_data = weight.GetTensorBuffer(&weight_oi_); + memcpy(weight_oi_t_data, weight_data, weight_size); + + // cache the memory descriptor + auto expected_md = matmul_fwd_pd->weights_desc(); +#ifndef ENABLE_ONEDNN_V3 + TensorShape weight_mkl_format; + weight_mkl_format.AddDim(sizeof(expected_md) / sizeof(Tweight)); + + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + weight_mkl_format, &weight_oi_md_)); + *reinterpret_cast(weight_oi_md_.flat().data()) = + expected_md; +#else + weight_oi_md_ = FilterMemoryDesc( + expected_md.get_ndims(), expected_md.get_inner_nblks(), + expected_md.get_data_type(), expected_md.get_dims(), + expected_md.get_inner_blks(), expected_md.get_inner_idxs(), + expected_md.get_strides()); +#endif // !ENABLE_ONEDNN_V3 + } + + Tweight* GetCachedWeight(OpKernelContext* context, + const memory::desc& expected_md) + TF_LOCKS_EXCLUDED(mu_) { + tf_shared_lock lock(mu_); + const Tensor& weight_t = weight_oi_; +#ifndef ENABLE_ONEDNN_V3 + const Tensor& weight_md_t = weight_oi_md_; + + // Check if the memory descriptor of the cached weight is same as + // expected_md. if so use the cached memory, else return NULL + if (weight_md_t.flat().size()) { + const memory::desc& stored_md = + *(static_cast(weight_md_t.data())); + if (stored_md == expected_md) { + return static_cast( + const_cast(weight_t.flat().data())); + } + } + return nullptr; +#else + // Return the cached weights only if the dimensions of the cached weights + // and the current weights match. Otherwise, return nullptr. + // + // TODO(intel-tf): The following check assumes that all dimensions are + // known before checking for equality. We may have to modify it in the + // future once we support runtime dimensions (especially if the dimensions + // are still unknown at this point). 
+ if (weight_oi_md_ == + FilterMemoryDesc(expected_md.get_ndims(), expected_md.get_inner_nblks(), + expected_md.get_data_type(), expected_md.get_dims(), + expected_md.get_inner_blks(), + expected_md.get_inner_idxs(), + expected_md.get_strides())) { + return static_cast( + const_cast(weight_t.flat().data())); + } + return nullptr; +#endif // !ENABLE_ONEDNN_V3 + } + + bool IsBiasCacheEmpty() TF_LOCKS_EXCLUDED(bias_cache_mutex_) { + tf_shared_lock lock(bias_cache_mutex_); + return (cached_bias_data_pt_.NumElements() == 0); + } + + virtual bool IsCachedBiasValid(float, float) + TF_SHARED_LOCKS_REQUIRED(bias_cache_mutex_) { + return false; + } + + void CacheBias(OpKernelContext* ctx, const Tensor& temp_scaled_bias_tensor, + float min_input, float max_input) + TF_LOCKS_EXCLUDED(bias_cache_mutex_) { + mutex_lock lock(bias_cache_mutex_); + if (cached_bias_data_pt_.NumElements() > 0) { + return; + } + OP_REQUIRES_OK(ctx, ctx->allocate_temp(temp_scaled_bias_tensor.dtype(), + temp_scaled_bias_tensor.shape(), + &cached_bias_data_pt_)); + tensor::DeepCopy(temp_scaled_bias_tensor, &cached_bias_data_pt_); + saved_min_input_ = min_input; + saved_max_input_ = max_input; + } + + void GetCachedBias(float min_input, float max_input, void** bias_data) + TF_LOCKS_EXCLUDED(bias_cache_mutex_) { + tf_shared_lock lock(bias_cache_mutex_); + const Tensor& cached_bias_data = cached_bias_data_pt_; + if (IsCachedBiasValid(min_input, max_input)) { + *bias_data = static_cast(const_cast( + cached_bias_data.flat().data())); + } else { + *bias_data = nullptr; + } + } + + engine cpu_engine_ = engine(engine::kind::cpu, 0); + + protected: + // Tensor to save reordered weight + mutex mu_; + Tensor weight_oi_ TF_GUARDED_BY(mu_); +#ifndef ENABLE_ONEDNN_V3 + Tensor weight_oi_md_ TF_GUARDED_BY(mu_); +#else + FilterMemoryDesc weight_oi_md_ TF_GUARDED_BY(mu_); +#endif // !ENABLE_ONEDNN_V3 + + bool is_weight_const_; + + bool is_bias_const_; + mutex bias_cache_mutex_; + // Persistent tensor for cached bias. 
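[Editor's note] The weight and bias caches above take a shared lock for the cheap "is it cached?" and lookup paths and an exclusive lock for the one-time population (CacheWeight/CacheBias), so repeated executions reuse the reordered data. Below is a standalone sketch of that locking pattern using std::shared_mutex in place of TF's mutex, tf_shared_lock and mutex_lock; OneTimeCache is an illustrative name only.

#include <iostream>
#include <mutex>
#include <shared_mutex>
#include <vector>

// Caches an expensively prepared buffer once; later calls reuse it.
class OneTimeCache {
 public:
  bool Empty() const {
    std::shared_lock<std::shared_mutex> lock(mu_);  // cheap, concurrent reads
    return cached_.empty();
  }
  void Fill(const std::vector<float>& prepared) {
    std::unique_lock<std::shared_mutex> lock(mu_);  // exclusive, one writer
    if (!cached_.empty()) return;                   // another thread won the race
    cached_ = prepared;
  }
  const float* Get() const {
    std::shared_lock<std::shared_mutex> lock(mu_);
    return cached_.empty() ? nullptr : cached_.data();
  }
 private:
  mutable std::shared_mutex mu_;
  std::vector<float> cached_;
};

int main() {
  OneTimeCache cache;
  if (cache.Empty()) cache.Fill({1.f, 2.f, 3.f});  // e.g. reordered weights
  std::cout << (cache.Get() != nullptr) << "\n";   // 1
  return 0;
}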
+ Tensor cached_bias_data_pt_ TF_GUARDED_BY(bias_cache_mutex_); + float saved_min_input_ = -std::numeric_limits::infinity(); + float saved_max_input_ = std::numeric_limits::infinity(); + + const int kInputIndexSrc = 0; + const int kInputIndexWeight = 1; + const int kInputIndexBias = 2; + const int kOutputIndexDst = 0; +}; + +using dnnl::matmul; + +namespace { + +struct MklMatMulParams { + string prefix; + memory::dims a_dims; + memory::dims b_dims; + memory::dims c_dims; + memory::dims a_strides; + memory::dims b_strides; + memory::dims c_strides; + memory::dim a_nnz; + struct PostOpParam { + string name; + std::vector param; + memory::dims dims; + memory::data_type data_type; + memory::format_tag format_tag; + }; + std::vector post_op_params; + + MklMatMulParams(string prefix, memory::dims a_dims, memory::dims b_dims, + memory::dims c_dims, memory::dims a_strides, + memory::dims b_strides, memory::dims c_strides, + memory::dim a_nnz = 0) + : prefix(prefix), + a_dims(a_dims), + b_dims(b_dims), + c_dims(c_dims), + a_strides(a_strides), + b_strides(b_strides), + c_strides(c_strides), + a_nnz(a_nnz) {} +}; + +template +class MklMatMulPrimitive : public MklPrimitive { + public: + explicit MklMatMulPrimitive(const MklMatMulParams& params) + : MklPrimitive(engine(engine::kind::cpu, 0)) { + // Create matmul primitive + Setup(params); + } + + ~MklMatMulPrimitive() {} + + dnnl::memory::desc GetScratchPadDesc() { + return context_.prim_desc->scratchpad_desc(); + } + + void Execute(const std::shared_ptr& stream, const Tlhs* a_data, + const Trhs* b_data, const Toutput* c_data, void* sp_data, + void* mul_data = nullptr, void* add_data = nullptr, + const int32_t* a_col_indices = nullptr, + const int32_t* a_row_pointers = nullptr) { +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex_lock lock(primitive_execution_mu_); +#endif +#if !defined(ENABLE_ONEDNN_OPENMP) && !defined(ENABLE_ONEDNN_V3) + context_.a_mem->set_data_handle( + static_cast(const_cast(a_data)), *stream); + context_.b_mem->set_data_handle( + static_cast(const_cast(b_data)), *stream); + context_.c_mem->set_data_handle( + static_cast(const_cast(c_data)), *stream); + + if (sp_data != nullptr) context_.sp_mem->set_data_handle(sp_data, *stream); + if (mul_data != nullptr) + context_.mul_mem->set_data_handle(mul_data, *stream); + if (add_data != nullptr) + context_.add_mem->set_data_handle(add_data, *stream); +#else + if constexpr (CSR) { + context_.a_mem->set_data_handle( + static_cast(const_cast(a_data)), 0); + context_.a_mem->set_data_handle( + static_cast(const_cast(a_col_indices)), 1); + context_.a_mem->set_data_handle( + static_cast(const_cast(a_row_pointers)), 2); + } else { + context_.a_mem->set_data_handle( + static_cast(const_cast(a_data))); + } + context_.b_mem->set_data_handle( + static_cast(const_cast(b_data))); + context_.c_mem->set_data_handle( + static_cast(const_cast(c_data))); + if (sp_data != nullptr) context_.sp_mem->set_data_handle(sp_data); + if (mul_data != nullptr) context_.mul_mem->set_data_handle(mul_data); + if (add_data != nullptr) context_.add_mem->set_data_handle(add_data); +#endif // !ENABLE_ONEDNN_OPENMP && !ENABLE_ONEDNN_V3 + execute_primitives(context_.matmul_primitives, stream, context_.net_args); + + // After execution, set data handle back + context_.a_mem->set_data_handle(DummyData); + context_.b_mem->set_data_handle(DummyData); + context_.c_mem->set_data_handle(DummyData); + if (sp_data != nullptr) context_.sp_mem->set_data_handle(DummyData); + if (mul_data != nullptr) 
context_.mul_mem->set_data_handle(DummyData); + if (add_data != nullptr) context_.add_mem->set_data_handle(DummyData); + } + + std::shared_ptr GetPrimitiveDesc() const { + return context_.prim_desc; + } + + private: + // Primitive reuse context for MatMul op + struct MklMatMulContext { + // oneDNN memory. + std::shared_ptr a_mem; + std::shared_ptr b_mem; + std::shared_ptr c_mem; + std::shared_ptr mul_mem; + std::shared_ptr add_mem; + std::shared_ptr sp_mem; + + // Descriptor and primitive-descriptor for MatMul. +#ifndef ENABLE_ONEDNN_V3 + std::shared_ptr desc; +#endif // !ENABLE_ONEDNN_V3 + std::shared_ptr prim_desc; + + // Memory descriptors. + std::shared_ptr a_md; + std::shared_ptr b_md; + std::shared_ptr c_md; + std::shared_ptr mul_md; + std::shared_ptr add_md; + + // MatMul primitive. + std::vector matmul_primitives; + std::vector> net_args; + + MklMatMulContext() + : a_mem(nullptr), + b_mem(nullptr), + c_mem(nullptr), + mul_mem(nullptr), + add_mem(nullptr), + sp_mem(nullptr), +#ifndef ENABLE_ONEDNN_V3 + desc(nullptr), +#endif // !ENABLE_ONEDNN_V3 + prim_desc(nullptr), + a_md(nullptr), + b_md(nullptr), + c_md(nullptr), + mul_md(nullptr), + add_md(nullptr) { + } + }; + + void Setup(const MklMatMulParams& params) { + std::shared_ptr matmul_primitive = nullptr; + + // Create MatMul descriptor and primitive descriptor. + if constexpr (CSR) { + // If it's a CSR matrix. +#ifdef ENABLE_ONEDNN_V3 + const auto tmp = memory::desc::csr( + params.a_dims, MklDnnType(), params.a_nnz, + dnnl::memory::data_type::s32, dnnl::memory::data_type::s32); + context_.a_md.reset(new memory::desc(tmp)); +#endif // ENABLE_ONEDNN_V3 + } else { + context_.a_md.reset(new memory::desc({params.a_dims}, MklDnnType(), + params.a_strides)); + } + + context_.b_md.reset(new memory::desc({params.b_dims}, MklDnnType(), +#ifdef DNNL_AARCH64_USE_ACL + memory::format_tag::any)); +#else + params.b_strides)); +#endif + context_.c_md.reset(new memory::desc({params.c_dims}, MklDnnType(), + params.c_strides)); + + // Create matmul. +#ifndef ENABLE_ONEDNN_V3 + context_.desc.reset( + new matmul::desc(*context_.a_md, *context_.b_md, *context_.c_md)); +#endif // !ENABLE_ONEDNN_V3 + + // Check if there is any fusion as post-ops + auto const& post_op_params = params.post_op_params; + dnnl::primitive_attr post_ops_attr; + dnnl::post_ops post_ops; + if (!post_op_params.empty()) { + for (auto const& post_op_param : post_op_params) { + if (post_op_param.name == "output_scale") { +#ifndef ENABLE_ONEDNN_V3 + // TODO(intel-tf): Verify if this code is needed. If not, it needs to + // be removed. 
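[Editor's note] When the CSR template flag is set, Setup() above describes the A operand in compressed sparse row form (values plus s32 column indices and row pointers, with params.a_nnz non-zeros), and Execute() binds those three buffers to the one sparse memory object. The sketch below only builds a CSR view of a dense matrix to make the three-buffer layout concrete; DenseToCsr is a hypothetical helper, not oneDNN API.

#include <cstdint>
#include <iostream>
#include <vector>

// Builds CSR arrays (values, column indices, row pointers) for a dense
// row-major 'rows' x 'cols' matrix. row_ptrs ends up with rows+1 entries.
void DenseToCsr(const std::vector<float>& dense, int rows, int cols,
                std::vector<float>* values, std::vector<int32_t>* col_indices,
                std::vector<int32_t>* row_ptrs) {
  row_ptrs->assign(1, 0);
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      const float v = dense[r * cols + c];
      if (v != 0.0f) {
        values->push_back(v);
        col_indices->push_back(c);
      }
    }
    row_ptrs->push_back(static_cast<int32_t>(values->size()));
  }
}

int main() {
  // 2x3 matrix with 3 non-zeros (a_nnz == 3):
  //   [1 0 2]
  //   [0 0 3]
  std::vector<float> values;
  std::vector<int32_t> cols, rows;
  DenseToCsr({1, 0, 2, 0, 0, 3}, 2, 3, &values, &cols, &rows);
  std::cout << values.size() << "\n";  // 3
  std::cout << rows.back() << "\n";    // 3 (row_ptrs = {0, 2, 3})
  return 0;
}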
+ DCHECK_EQ(post_op_param.param.size(), 1); + std::vector scales; + scales.push_back(post_op_param.param[0]); + post_ops_attr.set_output_scales(0, scales); +#endif // !ENABLE_ONEDNN_V3 + } else if (post_op_param.name == "mul") { + context_.mul_md.reset(new memory::desc({post_op_param.dims}, + post_op_param.data_type, + post_op_param.format_tag)); + post_ops.append_binary(dnnl::algorithm::binary_mul, *context_.mul_md); + } else if (post_op_param.name == "add") { + context_.add_md.reset(new memory::desc({post_op_param.dims}, + post_op_param.data_type, + post_op_param.format_tag)); + post_ops.append_binary(dnnl::algorithm::binary_add, *context_.add_md); + } else { + DCHECK((post_op_param.name == "output_scale")); + } + } + post_ops_attr.set_post_ops(post_ops); + } + post_ops_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); +#ifndef ENABLE_ONEDNN_V3 + context_.prim_desc.reset( + new matmul::primitive_desc(*context_.desc, post_ops_attr, cpu_engine_)); +#else + context_.prim_desc.reset( + new matmul::primitive_desc(cpu_engine_, *context_.a_md, *context_.b_md, + *context_.c_md, post_ops_attr)); +#endif // !ENABLE_ONEDNN_V3 + + // Create memory primitive based on dummy data. + if constexpr (CSR) { + context_.a_mem.reset(new dnnl::memory(*context_.a_md, cpu_engine_, + std::vector(3, DummyData))); + } else { + context_.a_mem.reset( + new dnnl::memory(*context_.a_md, cpu_engine_, DummyData)); + } +#ifdef DNNL_AARCH64_USE_ACL + context_.b_mem.reset(new dnnl::memory( + context_.prim_desc.get()->weights_desc(), cpu_engine_, DummyData)); +#else + context_.b_mem.reset( + new dnnl::memory(*context_.b_md, cpu_engine_, DummyData)); +#endif + context_.c_mem.reset( + new dnnl::memory(*context_.c_md, cpu_engine_, DummyData)); + auto scratchpad_md = context_.prim_desc->scratchpad_desc(); + context_.sp_mem.reset( + new dnnl::memory(scratchpad_md, cpu_engine_, DummyData)); + + // Create matmul primitive. 
+ matmul_primitive.reset(new dnnl::matmul(*context_.prim_desc)); + context_.net_args.push_back({{DNNL_ARG_SRC, *context_.a_mem}, + {DNNL_ARG_WEIGHTS, *context_.b_mem}, + {DNNL_ARG_SCRATCHPAD, *context_.sp_mem}, + {DNNL_ARG_DST, *context_.c_mem}}); + if (!post_op_params.empty()) { + int count = 0; + for (auto const& post_op_param : post_op_params) { + if (post_op_param.name == "mul") { + context_.mul_mem.reset( + new dnnl::memory(*context_.mul_md, cpu_engine_, DummyData)); + context_.net_args[0].insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(count) | DNNL_ARG_SRC_1, + *context_.mul_mem}); + count++; + } else if (post_op_param.name == "add") { + context_.add_mem.reset( + new dnnl::memory(*context_.add_md, cpu_engine_, DummyData)); + context_.net_args[0].insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(count) | DNNL_ARG_SRC_1, + *context_.add_mem}); + count++; + } + } + } + + context_.matmul_primitives.push_back(*matmul_primitive); + return; + } + + struct MklMatMulContext context_; +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex primitive_execution_mu_; +#endif +}; + +template +class MklMatMulPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklMatMulPrimitive* Get( + const MklMatMulParams& params, bool do_not_cache) { + MklMatMulPrimitive* matmul_prim = nullptr; + + if (do_not_cache) { + // Always create new primitive + matmul_prim = new MklMatMulPrimitive(params); + } else { + // Try to find a suitable one in pool + matmul_prim = dynamic_cast*>( + MklMatMulPrimitiveFactory::GetInstance() + .GetMklMatMul(params)); + if (matmul_prim == nullptr) { + matmul_prim = new MklMatMulPrimitive(params); + MklMatMulPrimitiveFactory::GetInstance() + .SetMklMatMul(params, matmul_prim); + } + } + + return matmul_prim; + } + + private: + MklMatMulPrimitiveFactory() {} + ~MklMatMulPrimitiveFactory() {} + + static MklMatMulPrimitiveFactory& GetInstance() { + static MklMatMulPrimitiveFactory instance_; + return instance_; + } + + static string CreateKey(const MklMatMulParams& params) { + FactoryKeyCreator key_creator; + key_creator.AddAsKey(params.prefix); + key_creator.AddAsKey(params.a_dims); + key_creator.AddAsKey(params.b_dims); + key_creator.AddAsKey(params.c_dims); + key_creator.AddAsKey(params.a_strides); + key_creator.AddAsKey(params.b_strides); + key_creator.AddAsKey(params.c_strides); + key_creator.AddAsKey(typeid(T).name()); + key_creator.AddAsKey(typeid(Tlhs).name()); + key_creator.AddAsKey(typeid(Trhs).name()); + key_creator.AddAsKey(typeid(Toutput).name()); + + // Generate keys for post-ops + for (auto const& post_op_param : params.post_op_params) { + if (post_op_param.name == "output_scale") { + DCHECK_EQ(post_op_param.param.size(), 1); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); + } else if (post_op_param.name == "mul" || post_op_param.name == "add") { + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.dims); + } else { + return string("not_a_key"); + } + } + return key_creator.GetKey(); + } + + MklPrimitive* GetMklMatMul(const MklMatMulParams& params) { + string key = CreateKey(params); + return this->GetOp(key); + } + + void SetMklMatMul(const MklMatMulParams& params, MklPrimitive* op) { + string key = CreateKey(params); + this->SetOp(key, op); + } +}; + +template +void dnnl_gemm(char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + float beta, T* c, int64_t ldc, OpKernelContext* ctx = nullptr) { + using dims = 
dnnl::memory::dims; + + // Prepare strides based on the transa and transb flags: transposed + // matrices have strides swapped + dims a_dims = dims{m, k}; + dims b_dims = dims{k, n}; + dims c_dims = dims{m, n}; + dims a_strides = tolower(transa) == 'n' ? dims{lda, 1} : dims{1, lda}; + dims b_strides = tolower(transb) == 'n' ? dims{ldb, 1} : dims{1, ldb}; + dims c_strides = dims{ldc, 1}; + + // MklMatMul uses const alpha and beta, make guarantee here to ensure + // they are never changed. + DCHECK_EQ(alpha, 1.0f); + DCHECK_EQ(beta, 0.f); + + MklMatMulParams params("dnnl_gemm", a_dims, b_dims, c_dims, a_strides, + b_strides, c_strides); + auto st = ExecuteSingleThreadedGemm(m, n, k, sizeof(T)); + // Create the oneDNN wrapper over Eigen threadpool and set max threads + // in oneDNN. + Eigen::ThreadPoolInterface* eigen_interface = + EigenThreadPoolFromTfContext(ctx); + tsl::OneDnnThreadPool eigen_tp(eigen_interface, ThreadPoolUseCallerThread(), + st ? 1 : -1); + MklMatMulPrimitive* matmul_prim = + MklMatMulPrimitiveFactory::Get(params, 0); + + UserScratchPad scratch_pad; + scratch_pad.AllocateSPTensor(matmul_prim, ctx); + // Execute matmul primitive. + + std::shared_ptr cpu_stream; + + cpu_stream.reset(CreateStream(&eigen_tp, matmul_prim->GetEngine())); + matmul_prim->Execute(cpu_stream, a, b, c, scratch_pad.Get()); +} + +} // anonymous namespace + +#undef APPEND_ELTWISE +#undef APPEND_ELTWISE_RELU6 +#undef OUTPUT_SCALE_DCHECK +#undef SET_MKL_LAYOUT +#undef TSCALED_BIAS + +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_MATMUL_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.h new file mode 100644 index 00000000..da031d5c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.h @@ -0,0 +1,808 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_POOLING_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_POOLING_OPS_COMMON_H_ + +#ifdef INTEL_MKL + +#include +#include +#include + +#include "dnnl.hpp" +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/framework/ops_util.h" +#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/util/padding.h" +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) +#include "tensorflow/core/platform/mutex.h" +#endif + +namespace tensorflow { + +#ifndef ENABLE_ONEDNN_V3 +#define GET_DIMS data.dims +#define SET_MKL_LAYOUT(md) SetMklLayout(&md) +#else +#define GET_DIMS get_dims() +#define SET_MKL_LAYOUT(md) SetMklLayout(md) +#endif // !ENABLE_ONEDNN_V3 + +using dnnl::pooling_backward; +using dnnl::pooling_forward; +using dnnl::prop_kind; +using dnnl::stream; + +using PoolingFwdPd = dnnl::pooling_forward::primitive_desc; +using PoolingBwdPd = dnnl::pooling_backward::primitive_desc; + +struct MklPoolingParams { + memory::dims src_dims; + memory::dims dst_dims; + memory::dims filter_dims; + memory::dims strides; +#ifdef ENABLE_ONEDNN_V3 + memory::dims dilations; +#endif // ENABLE_ONEDNN_V3 + memory::dims padding_left; + memory::dims padding_right; + dnnl::algorithm alg_kind; + dnnl::prop_kind prop_kind; + memory::format_tag src_format; + memory::desc src_md; + bool native_format; + + MklPoolingParams(memory::dims src_dims, memory::dims dst_dims, + memory::dims filter_dims, memory::dims strides, +#ifdef ENABLE_ONEDNN_V3 + memory::dims dilations, +#endif // ENABLE_ONEDNN_V3 + memory::dims padding_left, memory::dims padding_right, + dnnl::algorithm alg_kind, dnnl::prop_kind prop_kind, + memory::format_tag src_format, memory::desc src_md, + bool native_format) + : src_dims(src_dims), + dst_dims(dst_dims), + filter_dims(filter_dims), + strides(strides), +#ifdef ENABLE_ONEDNN_V3 + dilations(dilations), +#endif // ENABLE_ONEDNN_V3 + padding_left(padding_left), + padding_right(padding_right), + alg_kind(alg_kind), + prop_kind(prop_kind), + src_format(src_format), + src_md(src_md), + native_format(native_format) { + } +}; + +template +class MklPoolingFwdPrimitive : public MklPrimitive { + public: + explicit MklPoolingFwdPrimitive(const MklPoolingParams& fwdParams) + : MklPrimitive(engine(engine::kind::cpu, 0)) { + if (context_.fwd == nullptr) Setup(fwdParams); + } + + ~MklPoolingFwdPrimitive() {} + + // Pooling forward execute + // src_data: input data buffer of src + // ws_data: output data buffer of workspace + // dst_data: output data buffer of dst + void Execute(const T* src_data, T* dst_data, void* ws_data, + std::shared_ptr fwd_stream); + + std::shared_ptr GetPoolingFwdPd() const { + return context_.fwd_pd; + } + + memory::format_tag GetSrcMemoryFormat() const { return context_.src_fmt; } + memory::format_tag GetDstMemoryFormat() const { return context_.dst_fmt; } + + private: + void Setup(const MklPoolingParams& fwdParams); + + struct PoolingFwdContext { + // Algorithm. + dnnl::algorithm alg_kind; + + // Kind of propagation, forward or backward. + dnnl::prop_kind prop_kind; + + // Expected memory format. + memory::format_tag src_fmt; + memory::format_tag dst_fmt; + memory::format_tag ws_fmt; + + // Workspace shape. + memory::data_type ws_dt; + size_t ws_size; + + // oneDNN memory, just dummy data. 
+ std::shared_ptr ws_mem; + std::shared_ptr src_mem; + std::shared_ptr dst_mem; + + // Pooling forward descriptor and primitive descriptor. +#ifndef ENABLE_ONEDNN_V3 + std::shared_ptr fwd_desc; +#endif // !ENABLE_ONEDNN_V3 + std::shared_ptr fwd_pd; + + // Memory descriptor. + std::shared_ptr src_md; + std::shared_ptr dst_md; + + // Pooling primitive + std::shared_ptr fwd; + std::shared_ptr fwd_stream; + std::vector fwd_primitives; + + std::vector> net_args; + + PoolingFwdContext() + : src_fmt(memory::format_tag::any), + dst_fmt(memory::format_tag::any), + ws_fmt(memory::format_tag::any), + ws_dt(memory::data_type::u8), + ws_size(0), + ws_mem(nullptr), + src_mem(nullptr), + dst_mem(nullptr), +#ifndef ENABLE_ONEDNN_V3 + fwd_desc(nullptr), +#endif // !ENABLE_ONEDNN_V3 + fwd_pd(nullptr), + src_md(nullptr), + dst_md(nullptr), + fwd(nullptr) { + } + }; + + struct PoolingFwdContext context_; + +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex primitive_execution_mu_; +#endif +}; + +template +class MklPoolingFwdPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklPoolingFwdPrimitive* Get(const MklPoolingParams& fwdParams) { + MklPoolingFwdPrimitive* pooling_forward = nullptr; + // Get pooling primitive from the pool + pooling_forward = static_cast*>( + MklPoolingFwdPrimitiveFactory::GetInstance().GetPoolingFwd( + fwdParams)); + + if (pooling_forward == nullptr) { + pooling_forward = new MklPoolingFwdPrimitive(fwdParams); + MklPoolingFwdPrimitiveFactory::GetInstance().SetPoolingFwd( + fwdParams, pooling_forward); + } + return pooling_forward; + } + + static MklPoolingFwdPrimitiveFactory& GetInstance() { + static MklPoolingFwdPrimitiveFactory instance_; + return instance_; + } + + private: + MklPoolingFwdPrimitiveFactory() {} + ~MklPoolingFwdPrimitiveFactory() {} + + // The key to be created will be used to get/set pooling + // primitive op from reuse perspective. + // A pooling key is a string which concates key parameters + // as well as algorithm kind (max versus avg). 
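+  // Illustrative sketch (not part of the upstream TensorFlow header): a
+  // caller typically obtains and reuses a cached forward primitive like
+  // this, e.g. for T = float:
+  //
+  //   MklPoolingFwdPrimitive<float>* pooling_fwd =
+  //       MklPoolingFwdPrimitiveFactory<float>::Get(fwdParams);
+  //   pooling_fwd->Execute(src_data, dst_data, ws_data, fwd_stream);
+  //
+  // Two Get() calls with identical MklPoolingParams produce the same key
+  // below and therefore share one primitive instead of rebuilding it.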
+ static string CreateKey(const MklPoolingParams& fwdParams) { + string prefix = "pooling_fwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(fwdParams.src_dims); + key_creator.AddAsKey(fwdParams.dst_dims); + key_creator.AddAsKey(fwdParams.filter_dims); + key_creator.AddAsKey(fwdParams.strides); +#ifdef ENABLE_ONEDNN_V3 + key_creator.AddAsKey(fwdParams.dilations); +#endif // ENABLE_ONEDNN_V3 + key_creator.AddAsKey(fwdParams.padding_left); + key_creator.AddAsKey(fwdParams.padding_right); + key_creator.AddAsKey(fwdParams.src_format); + key_creator.AddAsKey(static_cast(fwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(fwdParams.prop_kind)); + return key_creator.GetKey(); + } + + MklPrimitive* GetPoolingFwd(const MklPoolingParams& fwdParams) { + string key = CreateKey(fwdParams); + return this->GetOp(key); + } + + void SetPoolingFwd(const MklPoolingParams& fwdParams, MklPrimitive* op) { + string key = CreateKey(fwdParams); + this->SetOp(key, op); + } +}; + +template +class MklPoolingBwdPrimitive : public MklPrimitive { + public: + explicit MklPoolingBwdPrimitive(const MklPoolingParams& bwdParams) + : MklPrimitive(engine(engine::kind::cpu, 0)) { + if (context_.bwd == nullptr) Setup(bwdParams); + } + + ~MklPoolingBwdPrimitive() {} + + // Pooling backward execute + // diff_dst_data: input data buffer of diff_dst + // diff_src_data: output data buffer of diff_src + // ws_data: input data buffer of workspace + void Execute(const T* diff_dst_data, T* diff_src_data, const void* ws_data, + std::shared_ptr bwd_stream); + + public: + std::shared_ptr GetPoolingFwdPd() const { + return context_.fwd_pd; + } + std::shared_ptr GetPoolingBwdPd() const { + return context_.bwd_pd; + } + + dnnl::memory::data_type GetWorkspaceDataType() const { + return context_.ws_dt; + } + + private: + void Setup(const MklPoolingParams& bwdParams); + + // Primitive reuse context for pooling bwd ops + struct PoolingBwdContext { + // Algorithm. + dnnl::algorithm alg_kind; + + // Expected memory format. + memory::format_tag diff_src_fmt; + memory::format_tag diff_dst_fmt; + memory::format_tag ws_fmt; + + // Workspace attribute. + dnnl::memory::data_type ws_dt; + + // oneDNN memory. + std::shared_ptr ws_mem; + std::shared_ptr diff_src_mem; + std::shared_ptr diff_dst_mem; + + // Memory descriptors. + std::shared_ptr src_md; + std::shared_ptr dst_md; + + // Forward and backward pooling descriptors and primitive descriptors. +#ifndef ENABLE_ONEDNN_V3 + std::shared_ptr fwd_desc; + std::shared_ptr bwd_desc; +#endif // !ENABLE_ONEDNN_V3 + std::shared_ptr fwd_pd; + std::shared_ptr bwd_pd; + + // Backward pooling primitive. 
+ std::shared_ptr bwd; + std::shared_ptr bwd_stream; + + std::vector bwd_primitives; + std::vector> net_args; + + PoolingBwdContext() + : diff_src_fmt(memory::format_tag::any), + diff_dst_fmt(memory::format_tag::any), + ws_fmt(memory::format_tag::any), + ws_dt(memory::data_type::u8), + ws_mem(nullptr), + diff_src_mem(nullptr), + diff_dst_mem(nullptr), + src_md(nullptr), + dst_md(nullptr), +#ifndef ENABLE_ONEDNN_V3 + fwd_desc(nullptr), + bwd_desc(nullptr), +#endif // !ENABLE_ONEDNN_V3 + fwd_pd(nullptr), + bwd_pd(nullptr), + bwd(nullptr) { + } + }; + + struct PoolingBwdContext context_; +#if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP) + mutex primitive_execution_mu_; +#endif +}; + +template +class MklPoolingBwdPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklPoolingBwdPrimitive* Get(const MklPoolingParams& bwdParams) { + MklPoolingBwdPrimitive* pooling_backward = nullptr; + + // Find a pooling backward primitive from the pool. + // If it does not exist, create a new one. + pooling_backward = static_cast*>( + MklPoolingBwdPrimitiveFactory::GetInstance().GetPoolingBwd( + bwdParams)); + if (pooling_backward == nullptr) { + pooling_backward = new MklPoolingBwdPrimitive(bwdParams); + MklPoolingBwdPrimitiveFactory::GetInstance().SetPoolingBwd( + bwdParams, pooling_backward); + } + return pooling_backward; + } + + static MklPoolingBwdPrimitiveFactory& GetInstance() { + static MklPoolingBwdPrimitiveFactory instance_; + return instance_; + } + + private: + MklPoolingBwdPrimitiveFactory() {} + ~MklPoolingBwdPrimitiveFactory() {} + + // The key to be created will be used to get/set pooling + // primitive op from reuse perspective. + // A pooling key is a string which concates key parameters + // as well as algorithm kind (max versus avg). 
+ static string CreateKey(const MklPoolingParams& bwdParams) { + string prefix = "pooling_bwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(bwdParams.src_dims); + key_creator.AddAsKey(bwdParams.dst_dims); + key_creator.AddAsKey(bwdParams.filter_dims); + key_creator.AddAsKey(bwdParams.strides); +#ifdef ENABLE_ONEDNN_V3 + key_creator.AddAsKey(bwdParams.dilations); +#endif // ENABLE_ONEDNN_V3 + key_creator.AddAsKey(bwdParams.padding_left); + key_creator.AddAsKey(bwdParams.padding_right); + key_creator.AddAsKey(bwdParams.src_format); + key_creator.AddAsKey(static_cast(bwdParams.alg_kind)); + return key_creator.GetKey(); + } + + MklPrimitive* GetPoolingBwd(const MklPoolingParams& bwdParams) { + string key = CreateKey(bwdParams); + return this->GetOp(key); + } + + void SetPoolingBwd(const MklPoolingParams& bwdParams, MklPrimitive* op) { + string key = CreateKey(bwdParams); + this->SetOp(key, op); + } +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; + +struct MklPoolParameters { + int depth; + + int tensor_in_planes; // Pool3D + int tensor_in_cols; + int tensor_in_rows; + int tensor_in_batch; + + int window_planes; // Pool3D + int window_rows; + int window_cols; + int depth_window; + + int planes_stride; // Pool3D + int row_stride; + int col_stride; + int depth_stride; + +#ifdef ENABLE_ONEDNN_V3 + int planes_dilation; // Pool3D + int row_dilation; + int col_dilation; +#endif // ENABLE_ONEDNN_V3 + + int64 out_planes; // Pool3D + int64 out_height; + int64 out_width; + int out_depth; + + int64 pad_P1; // Pool3D + int64 pad_P2; // Pool3D + int64 pad_left; + int64 pad_right; + int64 pad_top; + int64 pad_bottom; + int pad_depth; + + TensorFormat data_format; + MklPoolParameters() + : depth(0), + tensor_in_planes(0), + tensor_in_cols(0), + tensor_in_rows(0), + tensor_in_batch(0), + window_planes(0), + window_rows(0), + window_cols(0), + depth_window(0), + planes_stride(0), + row_stride(0), + col_stride(0), + depth_stride(0), +#ifdef ENABLE_ONEDNN_V3 + planes_dilation(0), + row_dilation(0), + col_dilation(0), +#endif // ENABLE_ONEDNN_V3 + out_planes(0), + out_height(0), + out_width(0), + out_depth(0), + pad_P1(0), + pad_P2(0), + pad_left(0), + pad_right(0), + pad_top(0), + pad_bottom(0), + pad_depth(0), + data_format(TensorFormat::FORMAT_NCHW) { + } + + // Updates context->status if there is an invalid input. + void Init(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, + TensorFormat data_format, const TensorShape& tensor_in_shape); + void Init(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, + TensorFormat data_format, const MklDnnShape* mkl_in_shape); + + private: + // Common initialization for TensorFlow and MKL formats + void Init(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, + TensorFormat data_format); +}; + +template +class MklPoolingOpBase : public OpKernel { + public: + explicit MklPoolingOpBase(OpKernelConstruction* context) + : OpKernel(context), workspace_enabled_(false) { + string data_format; + if (std::is_same::value || std::is_same::value) { + // Current quantized convolution doesn't have data_format attribute. 
+ data_format = "NHWC"; + } else { + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + } + OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_), + absl::InvalidArgumentError("Invalid data format")); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); + OP_REQUIRES(context, this->ksize_.size() == 4 || this->ksize_.size() == 5, + absl::InvalidArgumentError("Sliding window ksize field must " + "specify 4 or 5 dimensions")); + for (int i = 0; i < this->ksize_.size(); ++i) { + OP_REQUIRES(context, this->ksize_[i] > 0, + errors::InvalidArgument( + absl::StrCat("Sliding window ksize must be positive. The " + "specified or inferred ksize is: ", + absl::StrJoin(ksize_, ",")))); + } + + OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); + OP_REQUIRES(context, this->stride_.size() == 4 || this->stride_.size() == 5, + absl::InvalidArgumentError("Sliding window strides field must " + "specify 4 or 5 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); + OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, + absl::UnimplementedError("Pooling is not yet supported on the " + "batch dimension.")); + bool is_pool2d = (this->ksize_.size() == 4); + this->tensor_format_mkldnn_ = + is_pool2d ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_) + : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_); + + this->data_format_mkldnn_ = + MklTensorFormatToMklDnnDataFormat(this->tensor_format_mkldnn_); + + // We may not get this attribute for this node if it does not go through + // graph rewrite pass. So we do not check for error while retrieving this + // attribute value. + auto status = + context->GetAttr("workspace_enabled", &this->workspace_enabled_); + (void)status; + } + void Compute(OpKernelContext* context) override = 0; + + protected: + // Calculate output shape of pooling op in oneDNN and TensorFlow order. + // oneDNN uses NCHW(Pool2D) or NCDHW(Pool3D) for output order. + // But TensorFlow output will be in NHWC/NCHW(Pool2D) or + // NDHWC/NCDHW(Pool3D) format depending on data format. Function expects + // output height and width to have already been int32 bounds-checked. + void GetOutputDims(const MklPoolParameters& mkl_pool_params, + memory::dims* output_dims_mkl_order) { + if (this->ksize_.size() == 4) { + // Pooling2D: oneDNN always needs output in NCHW format. + *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, + mkl_pool_params.out_depth, + static_cast(mkl_pool_params.out_height), + static_cast(mkl_pool_params.out_width)}; + } else { + // Pooling3D: oneDNN always needs output in NCDHW format. 
+ *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, + mkl_pool_params.out_depth, + static_cast(mkl_pool_params.out_planes), + static_cast(mkl_pool_params.out_height), + static_cast(mkl_pool_params.out_width)}; + } + } + + void InitMklPoolParameters(OpKernelContext* context, + MklPoolParameters* pool_params, + const MklDnnShape& original_input_mkl_shape, + const TensorShape& input_tensor_shape) { + if (!original_input_mkl_shape.IsMklTensor()) { + pool_params->Init(context, this->ksize_, this->stride_, this->padding_, + this->data_format_tf_, input_tensor_shape); + } else { + pool_params->Init(context, this->ksize_, this->stride_, this->padding_, + this->data_format_tf_, &original_input_mkl_shape); + } + } + + void PoolParamsToDims(const MklPoolParameters* pool_params, + memory::dims* filter_dims, memory::dims* strides, +#ifdef ENABLE_ONEDNN_V3 + memory::dims* dilations, +#endif // ENABLE_ONEDNN_V3 + memory::dims* padding_left, memory::dims* padding_right, + bool is_pool2d) { + if (is_pool2d) { + // Pool2D + *filter_dims = + memory::dims({pool_params->window_rows, pool_params->window_cols}); + *strides = + memory::dims({pool_params->row_stride, pool_params->col_stride}); +#ifdef ENABLE_ONEDNN_V3 + *dilations = + memory::dims({pool_params->row_dilation, pool_params->col_dilation}); +#endif // ENABLE_ONEDNN_V3 + *padding_left = memory::dims({static_cast(pool_params->pad_top), + static_cast(pool_params->pad_left)}); + *padding_right = memory::dims({static_cast(pool_params->pad_bottom), + static_cast(pool_params->pad_right)}); + } else { + // Pool3D + *filter_dims = + memory::dims({pool_params->window_planes, pool_params->window_rows, + pool_params->window_cols}); + *strides = + memory::dims({pool_params->planes_stride, pool_params->row_stride, + pool_params->col_stride}); +#ifdef ENABLE_ONEDNN_V3 + *dilations = + memory::dims({pool_params->planes_dilation, pool_params->row_dilation, + pool_params->col_dilation}); +#endif // ENABLE_ONEDNN_V3 + + *padding_left = memory::dims({static_cast(pool_params->pad_P1), + static_cast(pool_params->pad_top), + static_cast(pool_params->pad_left)}); + *padding_right = memory::dims({static_cast(pool_params->pad_P2), + static_cast(pool_params->pad_bottom), + static_cast(pool_params->pad_right)}); + } + } + + void AllocateEmptyOutputTensor(OpKernelContext* context, + const int kOutputIndex, + MklPoolParameters* pool_params, + const memory::dims output_dims_mkl_order, + Tensor** output_tensor) { + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + TensorShape output_tf_shape; + if (pool_params->data_format == TensorFormat::FORMAT_NCHW) { + output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order); + } else { + memory::dims output_dims_order; + // determine Pooling2D (NHWC) or Pooling3D (NDHWC) + if (this->ksize_.size() == 4) { + output_dims_order = {pool_params->tensor_in_batch, + static_cast(pool_params->out_height), + static_cast(pool_params->out_width), + pool_params->out_depth}; + } else { + output_dims_order = {pool_params->tensor_in_batch, + static_cast(pool_params->out_planes), + static_cast(pool_params->out_height), + static_cast(pool_params->out_width), + pool_params->out_depth}; + } + output_tf_shape = MklDnnDimsToTFShape(output_dims_order); + } + AllocateOutputSetMklShape(context, kOutputIndex, output_tensor, + output_tf_shape, output_mkl_shape, + native_format_); + DCHECK(output_tensor); + } + + // Checks to make sure that the memory we need to allocate + // is a multiple of sizeof(T) + // returns the number of elements 
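+  // Worked example: a descriptor of 10 bytes with sizeof(T) == 4 returns 3,
+  // i.e. 12 bytes get allocated, so the TF tensor is never smaller than the
+  // oneDNN layout it has to hold.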
+ size_t GetNumTElements(const memory::desc& pd) { + size_t num_bytes = pd.get_size(); + size_t ret_val = num_bytes / sizeof(T); + if (num_bytes % sizeof(T) != 0) { + ret_val++; + } + return ret_val; + } + + std::vector ksize_; + std::vector stride_; + Padding padding_; + TensorFormat data_format_tf_; + MklTensorFormat tensor_format_mkldnn_; + memory::format_tag data_format_mkldnn_; + bool workspace_enabled_; + bool native_format_ = false; +}; + +template +class MklPoolingForwardOpBase : public MklPoolingOpBase { + public: + explicit MklPoolingForwardOpBase(OpKernelConstruction* context) + : MklPoolingOpBase(context) {} + void Compute(OpKernelContext* context) override = 0; + + protected: + void ConfigureInput(OpKernelContext* context, + const MklDnnShape& input_mkl_shape, + const Tensor& input_tensor, + MklPoolParameters* pool_params, + MklDnnData* dnn_data_input) { + DCHECK(pool_params); + DCHECK(dnn_data_input); + TensorShape input_tensor_shape = input_tensor.shape(); + if (input_tensor.NumElements() != 0) { + memory::desc input_md = + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc( + (this->ksize_.size() == 4) + ? TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(input_tensor_shape, + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); + dnn_data_input->SetUsrMem(input_md, &input_tensor); + + if (this->ksize_.size() == 5) { + // Pool3D + std::vector input_sizes(5, -1); + input_sizes[MklDnnDims3D::Dim3d_N] = input_md.GET_DIMS[0]; + input_sizes[MklDnnDims3D::Dim3d_C] = input_md.GET_DIMS[1]; + input_sizes[MklDnnDims3D::Dim3d_D] = input_md.GET_DIMS[2]; + input_sizes[MklDnnDims3D::Dim3d_H] = input_md.GET_DIMS[3]; + input_sizes[MklDnnDims3D::Dim3d_W] = input_md.GET_DIMS[4]; + dnn_data_input->SetOpMemDesc(input_sizes, this->data_format_mkldnn_); + } + } + this->InitMklPoolParameters(context, pool_params, input_mkl_shape, + input_tensor_shape); + } + + void AllocateOutputTensor(OpKernelContext* context, + const PoolingFwdPd& pool_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const MklTensorFormat& output_tf_format, + Tensor** output_tensor) { + TensorShape output_tf_shape; + DCHECK(output_tensor); + memory::desc dst_pd = pool_fwd_prim_desc.dst_desc(); + + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SET_MKL_LAYOUT(dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); + // Only allocate enough space for the elements we need. 
+ output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); + + if (this->native_format_) { + output_tf_shape = output_mkl_shape.GetTfShape(); + } + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape, + this->native_format_); + DCHECK(*output_tensor); + } + + void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor, + const MklDnnShape& input_mkl_shape) { + if (!input_mkl_shape.IsMklTensor()) { + OP_REQUIRES( + context, input_tensor.dims() == 4 || input_tensor.dims() == 5, + absl::InvalidArgumentError("Input must be 4 or 5-dimensional")); + } else { + OP_REQUIRES( + context, + input_mkl_shape.GetDimension() == 4 || + input_mkl_shape.GetDimension() == 5, + absl::InvalidArgumentError("Input shape must be 4 or 5-dimensional")); + } + } + const int kInputTensorIndexInput = 0; + const int kOutputTensorIndexOutput = 0; +}; // MklPoolingForwardBaseOp + +template +class MklPoolingBackwardOpBase : public MklPoolingOpBase { + public: + explicit MklPoolingBackwardOpBase(OpKernelConstruction* context) + : MklPoolingOpBase(context) {} + void Compute(OpKernelContext* context) override = 0; + + protected: + const int kOutputTensorIndexOutput = 0; + + void AllocateOutputTensor(OpKernelContext* context, + const PoolingBwdPd& pool_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const MklTensorFormat& output_tf_format, + Tensor** output_tensor) { + DCHECK(output_tensor); + memory::desc dst_pd = pool_bkwd_prim_desc.diff_src_desc(); + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SET_MKL_LAYOUT(dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); + + TensorShape output_tf_shape; + output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); + if (this->native_format_) { + output_tf_shape = output_mkl_shape.GetTfShape(); + } + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape, + this->native_format_); + DCHECK(*output_tensor); + } +}; + +#undef GET_DIMS +#undef SET_MKL_LAYOUT + +} // namespace tensorflow + +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_POOLING_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h new file mode 100644 index 00000000..0b6319c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h @@ -0,0 +1,93 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_MKL_QUANTIZED_CONV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_MKL_QUANTIZED_CONV_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor.h" + +#ifdef INTEL_MKL + +namespace tensorflow { +template +float MklFloatForOneQuantizedLevel(float range_min, float range_max) { + int64 highest = static_cast(Eigen::NumTraits::highest()); + int64 lowest = static_cast(Eigen::NumTraits::lowest()); + + // Adjusting for having a symmetric range. + // for example: for 8-bit [-127, 127] as opposed to [-128, 127]. + if (lowest < -highest) ++lowest; + + const float float_for_one_quantized_level = + (range_max - range_min) / (highest - lowest); + return float_for_one_quantized_level; +} + +template +void MklQuantizationRangeForMultiplication(float min_a, float max_a, + float min_b, float max_b, + float* min_c, float* max_c) { + const float a_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_a, max_a); + const float b_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_b, max_b); + + const int64 c_highest = static_cast(Eigen::NumTraits::highest()); + const int64 c_lowest = static_cast(Eigen::NumTraits::lowest()); + const float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + + *min_c = c_float_for_one_quant_level * c_lowest; + *max_c = c_float_for_one_quant_level * c_highest; +} + +template +void MklQuantizationRangeForMultiplication(float min_a, float max_a, + const Tensor& min_b_vector, + const Tensor& max_b_vector, + Tensor** min_c_vector, + Tensor** max_c_vector) { + DCHECK(min_b_vector.NumElements() == (*min_c_vector)->NumElements()); + DCHECK(max_b_vector.NumElements() == (*max_c_vector)->NumElements()); + size_t n_channel = min_b_vector.NumElements(); + const int64 c_highest = static_cast(Eigen::NumTraits::highest()); + const int64 c_lowest = static_cast(Eigen::NumTraits::lowest()); + const float* min_b = min_b_vector.flat().data(); + const float* max_b = max_b_vector.flat().data(); + float* min_c = (*min_c_vector)->flat().data(); + float* max_c = (*max_c_vector)->flat().data(); + +#ifdef ENABLE_ONEDNN_OPENMP +#pragma omp parallel for +#endif // ENABLE_ONEDNN_OPENMP + // TODO(intel-tf): Add eigen parallel_for + for (int64_t n = 0; n < n_channel; ++n) { + float a_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_a, max_a); + float b_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_b[n], max_b[n]); + float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + min_c[n] = c_float_for_one_quant_level * c_lowest; + max_c[n] = c_float_for_one_quant_level * c_highest; + } +} + +} // namespace tensorflow + +#endif // INTEL_MKL + +#endif // TENSORFLOW_CORE_KERNELS_MKL_MKL_QUANTIZED_CONV_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_binary_ops_test.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_binary_ops_test.h new file mode 100644 index 00000000..59bf0e77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_binary_ops_test.h @@ -0,0 +1,474 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_BINARY_OPS_TEST_H_ +#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_BINARY_OPS_TEST_H_ + +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/mlir_generated/base_ops_test.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +// Base class for `BinaryOpsTest` fixture that has to be defined with a custom +// TF device if you want to use the test macros in this file. +class BinaryOpsTestBase : public OpsTestBase { + protected: + // This method should set the TF device, e.g. DEVICE_CPU, DEVICE_GPU. + void SetUp() override = 0; + + template + void SetOpKernel(const std::string& op_name, const TensorShape& lhs_shape, + const absl::InlinedVector& lhs_input, + const TensorShape& rhs_shape, + const absl::InlinedVector& rhs_input, + const test::OpsTestConfig& config) { + auto builder = NodeDefBuilder("some_name", op_name) + .Input(FakeInput(DataTypeToEnum::v())) + .Input(FakeInput(DataTypeToEnum::v())); + if (config.add_t) { + builder.Attr(config.input_attribute, DataTypeToEnum::v()); + } + if (config.add_tout) { + builder.Attr(config.output_attribute, DataTypeToEnum::v()); + } + TF_ASSERT_OK(builder.Finalize(node_def())); + + TF_ASSERT_OK(InitOp()); + AddInputFromArray(lhs_shape, lhs_input); + AddInputFromArray(rhs_shape, rhs_input); + } + + // Run fully specified tests. + + template + void RunAndExpectResult(const std::string& op_name, + const TensorShape& lhs_shape, + const absl::InlinedVector& lhs_input, + const TensorShape& rhs_shape, + const absl::InlinedVector& rhs_input, + const TensorShape& expected_shape, + const absl::InlinedVector& expected_output, + const test::OpsTestConfig& config) { + SetOpKernel(op_name, lhs_shape, lhs_input, rhs_shape, rhs_input, + config); + TF_ASSERT_OK(RunOpKernel()); + + // Compare output to expectation. + Tensor expected_tensor(allocator(), DataTypeToEnum::value, + expected_shape); + test::FillValues(&expected_tensor, expected_output); + if (config.expect_strictly_equal) { + test::ExpectEqual(expected_tensor, *GetOutput(0), + config.supress_tolerance ? 
test::Tolerance::kNone + : test::Tolerance::kDefault); + } else { + test::ExpectClose(expected_tensor, *GetOutput(0), config.atol, + config.rtol); + } + } + + template + void RunAndExpectInvalidArgument(const std::string& op_name, + const TensorShape& lhs_shape, + const absl::InlinedVector& lhs_input, + const TensorShape& rhs_shape, + const absl::InlinedVector& rhs_input, + const test::OpsTestConfig& config) { + SetOpKernel(op_name, lhs_shape, lhs_input, rhs_shape, rhs_input, + config); + auto status = RunOpKernel(); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); + } + + // Run common test cases. + + template + void TestIncompatibleShapes(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + const test::OpsTestConfig& config) { + // Prepare incompatibly shaped inputs. + TensorShape lhs_shape{3}; + TensorShape rhs_shape{2}; + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, lhs_shape.num_elements()); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, rhs_shape.num_elements()); + + RunAndExpectInvalidArgument(op_name, lhs_shape, repeated_lhs_input, + rhs_shape, repeated_rhs_input, config); + } + + template + void TestEqualShapes(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + int64_t input_size = shape.num_elements(); + CHECK(lhs_input.size() <= input_size && rhs_input.size() <= input_size && + "expect input shape to hold all input values"); + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, input_size); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, input_size); + + // Compute expected results. + absl::InlinedVector expected_output; + for (auto it_lhs = repeated_lhs_input.begin(), + it_rhs = repeated_rhs_input.begin(), + end = repeated_lhs_input.end(); + it_lhs != end; ++it_lhs, ++it_rhs) { + auto lhs = static_cast(*it_lhs); + auto rhs = static_cast(*it_rhs); + auto result = static_cast(baseline_callback(lhs, rhs)); + expected_output.push_back(result); + } + + RunAndExpectResult(op_name, shape, repeated_lhs_input, shape, + repeated_rhs_input, shape, expected_output, + config); + } + + template + void TestOneScalar(const std::string& op_name, T scalar_input, + const TensorShape& other_shape, + const absl::InlinedVector& other_input, + BaselineOutT (*baseline_callback)(BaselineT, BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape scalar_shape{}; + CHECK(other_input.size() <= other_shape.num_elements() && + "expect other input shape to hold all input values"); + auto repeated_other_input = + test::RepeatInputToMatchShape(other_input, other_shape.num_elements()); + + // Compute expected results. 
+ absl::InlinedVector expected_output; + for (auto it = repeated_other_input.begin(), + end = repeated_other_input.end(); + it != end; ++it) { + auto scalar = static_cast(scalar_input); + auto other_value = static_cast(*it); + auto result = static_cast(baseline_callback(scalar, other_value)); + expected_output.push_back(result); + } + + auto scalar_input_vector = test::InputAsVector({scalar_input}); + RunAndExpectResult(op_name, scalar_shape, scalar_input_vector, + other_shape, repeated_other_input, + /*expected_shape=*/other_shape, expected_output, + config); + } + + template + void TestOneEffectiveScalar(const std::string& op_name, T scalar_input, + const TensorShape& other_shape, + const absl::InlinedVector& other_input, + BaselineOutT (*baseline_callback)(BaselineT, + BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape effective_scalar_shape{1, 1, 1, 1, 1, 1, 1}; + CHECK(other_input.size() <= other_shape.num_elements() && + "expect other input shape to hold all input values"); + auto repeated_other_input = + test::RepeatInputToMatchShape(other_input, other_shape.num_elements()); + + // Compute expected results. + absl::InlinedVector expected_output; + for (auto it = repeated_other_input.begin(), + end = repeated_other_input.end(); + it != end; ++it) { + auto scalar = static_cast(scalar_input); + auto other_value = static_cast(*it); + auto result = static_cast(baseline_callback(scalar, other_value)); + expected_output.push_back(result); + } + + auto scalar_input_vector = test::InputAsVector({scalar_input}); + TensorShape expected_shape = other_shape; + while (expected_shape.dims() < effective_scalar_shape.dims()) { + expected_shape.InsertDim(0, 1); + } + RunAndExpectResult( + op_name, effective_scalar_shape, scalar_input_vector, other_shape, + repeated_other_input, expected_shape, expected_output, config); + } + + template + void TestBroadcastingExpand(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, + BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape lhs_shape{1}; + TensorShape rhs_shape{6}; + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, lhs_shape.num_elements()); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, rhs_shape.num_elements()); + + // Compute expected results. + std::vector lhs_indices = {0, 0, 0, 0, 0, 0}; + std::vector rhs_indices = {0, 1, 2, 3, 4, 5}; + auto expected_output = + ComputeExpectedOutput( + lhs_indices, repeated_lhs_input, rhs_indices, repeated_rhs_input, + baseline_callback); + + RunAndExpectResult( + op_name, lhs_shape, repeated_lhs_input, rhs_shape, repeated_rhs_input, + /*expected_shape=*/rhs_shape, expected_output, config); + } + + template + void TestBroadcastingInDim(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, + BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape lhs_shape{3}; + TensorShape rhs_shape{2, 3}; + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, lhs_shape.num_elements()); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, rhs_shape.num_elements()); + + // Compute expected results. 
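+    // Note: broadcasting the rank-1 lhs of shape {3} against the {2, 3} rhs
+    // repeats lhs elements 0, 1, 2 once per row, which is exactly what the
+    // hard-coded index vectors below encode.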
+ std::vector lhs_indices = {0, 1, 2, 0, 1, 2}; + std::vector rhs_indices = {0, 1, 2, 3, 4, 5}; + auto expected_output = + ComputeExpectedOutput( + lhs_indices, repeated_lhs_input, rhs_indices, repeated_rhs_input, + baseline_callback); + + RunAndExpectResult( + op_name, lhs_shape, repeated_lhs_input, rhs_shape, repeated_rhs_input, + /*expected_shape=*/rhs_shape, expected_output, config); + } + + template + void TestBroadcasting(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape lhs_shape{2, 1}; + TensorShape rhs_shape{3}; + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, lhs_shape.num_elements()); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, rhs_shape.num_elements()); + + // Compute expected results. + TensorShape expected_shape{2, 3}; + std::vector lhs_indices = {0, 0, 0, 1, 1, 1}; + std::vector rhs_indices = {0, 1, 2, 0, 1, 2}; + auto expected_output = + ComputeExpectedOutput( + lhs_indices, repeated_lhs_input, rhs_indices, repeated_rhs_input, + baseline_callback); + + RunAndExpectResult(op_name, lhs_shape, repeated_lhs_input, + rhs_shape, repeated_rhs_input, expected_shape, + expected_output, config); + } + + template + void TestBroadcastingRank6(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, + BaselineT), + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape lhs_shape{1, 2, 3, 1, 2, 1}; + TensorShape rhs_shape{1, 1, 1, 2, 3}; + auto repeated_lhs_input = + test::RepeatInputToMatchShape(lhs_input, lhs_shape.num_elements()); + auto repeated_rhs_input = + test::RepeatInputToMatchShape(rhs_input, rhs_shape.num_elements()); + + // Compute expected results. + TensorShape expected_shape{1, 2, 3, 1, 2, 3}; + std::vector lhs_indices = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, + 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11}; + std::vector rhs_indices = { + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + }; + auto expected_output = + ComputeExpectedOutput( + lhs_indices, repeated_lhs_input, rhs_indices, repeated_rhs_input, + baseline_callback); + + RunAndExpectResult(op_name, lhs_shape, repeated_lhs_input, + rhs_shape, repeated_rhs_input, expected_shape, + expected_output, config); + } + + template + void TestEmptyShapeBroadcasting(const std::string& op_name, + const absl::InlinedVector& lhs_input, + const absl::InlinedVector& rhs_input, + const test::OpsTestConfig& config) { + // Prepare inputs. + TensorShape lhs_shape{2, 0, 1}; + TensorShape rhs_shape{2, 0, 5}; + absl::InlinedVector empty_input = {}; + + // Define expected result. 
+ TensorShape expected_shape{2, 0, 5}; + absl::InlinedVector expected_output = {}; + + RunAndExpectResult(op_name, lhs_shape, empty_input, rhs_shape, + empty_input, expected_shape, expected_output, + config); + } + + private: + template + absl::InlinedVector ComputeExpectedOutput( + std::vector lhs_indices, absl::InlinedVector lhs_input, + std::vector rhs_indices, absl::InlinedVector rhs_input, + BaselineOutT (*baseline_callback)(BaselineT, BaselineT)) { + absl::InlinedVector expected_output; + for (int64_t i = 0; i < lhs_indices.size(); i++) { + auto lhs = static_cast(lhs_input[lhs_indices[i]]); + auto rhs = static_cast(rhs_input[rhs_indices[i]]); + auto result = static_cast(baseline_callback(lhs, rhs)); + expected_output.push_back(result); + } + return expected_output; + } +}; + +// Macros to easily generate common test cases. The macros use `BinaryOpsTest` +// fixture in order to share implementation across GPU and CPU platform tests. +// For specific inputs, please define your own test fixtures. +#define GENERATE_DEFAULT_NO_BROADCASTING_TESTS_2( \ + op_name, test_name, T, BaselineT, OutT, BaselineOutT, lhs_input, \ + rhs_input, baseline_callback, config) \ + TEST_F(BinaryOpsTest, op_name##EqShapes##test_name) { \ + TestEqualShapes( \ + #op_name, /*shape=*/test::DefaultInputShape(), lhs_input, rhs_input, \ + baseline_callback, config); \ + } \ + TEST_F(BinaryOpsTest, op_name##IncompatibleShapes##test_name) { \ + TestIncompatibleShapes(#op_name, lhs_input, rhs_input, config); \ + } + +#define GENERATE_DEFAULT_TESTS_2(op_name, test_name, T, BaselineT, OutT, \ + BaselineOutT, lhs_input, rhs_input, \ + baseline_callback, config) \ + \ + GENERATE_DEFAULT_NO_BROADCASTING_TESTS_2( \ + op_name, test_name, T, BaselineT, OutT, BaselineOutT, lhs_input, \ + rhs_input, baseline_callback, config) \ + \ + TEST_F(BinaryOpsTest, op_name##OneScalar##test_name) { \ + TestOneScalar( \ + #op_name, /*scalar_input=*/lhs_input.front(), \ + /*other_shape=*/test::DefaultInputShape(), /*other_input=*/rhs_input, \ + baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##TestOneEffectiveScalar##test_name) { \ + TestOneEffectiveScalar( \ + #op_name, /*scalar_input=*/lhs_input.front(), \ + /*other_shape=*/test::DefaultInputShape(), /*other_input=*/rhs_input, \ + baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##BroadcastingExpand##test_name) { \ + TestBroadcastingExpand( \ + #op_name, lhs_input, rhs_input, baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##BroadcastingInDim##test_name) { \ + TestBroadcastingInDim( \ + #op_name, lhs_input, rhs_input, baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##Broadcasting##test_name) { \ + TestBroadcasting( \ + #op_name, lhs_input, rhs_input, baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##BroadcastingRank6##test_name) { \ + TestBroadcastingRank6( \ + #op_name, lhs_input, rhs_input, baseline_callback, config); \ + } \ + \ + TEST_F(BinaryOpsTest, op_name##EmptyShapeBroadcasting##test_name) { \ + TestEmptyShapeBroadcasting( \ + #op_name, lhs_input, rhs_input, config); \ + } + +#define GENERATE_DEFAULT_TESTS(op_name, test_name, T, OutT, baseline_callback, \ + config) \ + GENERATE_DEFAULT_TESTS_2(op_name, test_name, T, T, OutT, OutT, \ + test::DefaultInput(), test::DefaultInput(), \ + baseline_callback, config) + +#define GENERATE_DEFAULT_TESTS_WITH_SPECIFIC_INPUT_VALUES( \ + op_name, test_name, T, OutT, lhs_input, rhs_input, baseline_callback, \ + config) \ + 
GENERATE_DEFAULT_TESTS_2(op_name, test_name, T, T, OutT, OutT, lhs_input, \ + rhs_input, baseline_callback, config) + +#define GENERATE_DEFAULT_NO_BROADCASTING_TESTS(op_name, test_name, T, OutT, \ + baseline_callback) \ + GENERATE_DEFAULT_NO_BROADCASTING_TESTS_2( \ + op_name, test_name, T, T, OutT, OutT, test::DefaultInput(), \ + test::DefaultInput(), baseline_callback, \ + test::OpsTestConfig().ExpectStrictlyEqual()) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_BINARY_OPS_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_gpu_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_gpu_op.h new file mode 100644 index 00000000..c299e1c7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_gpu_op.h @@ -0,0 +1,117 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_GPU_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_GPU_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/mlir_generated/base_op.h" + +namespace tensorflow { + +/// Register kernels. + +#define REGISTER_ALIASED_GPU_KERNEL(tf_op, mlir_op, input_type, output_type) \ + REGISTER_ALIASED_KERNEL(tf_op, mlir_op, GPU, input_type, output_type, \ + /*no additional_cstrs*/) + +// clang-format off +#define REGISTER_GPU_KERNEL(tf_op, input_type, output_type) \ + REGISTER_KERNEL(tf_op, GPU, input_type, output_type, /*no additional_cstrs*/) +// clang-format on + +#define REGISTER_COMPLEX_GPU_KERNEL(tf_op, input_type, output_type) \ + REGISTER_COMPLEX_KERNEL(tf_op, GPU, input_type, output_type) + +#define REGISTER_GPU_KERNEL_NO_TYPE_CONSTRAINT(tf_op, input_type) \ + REGISTER_KERNEL_NO_TYPE_CONSTRAINT(tf_op, GPU, input_type) + +/// Unary kernels. 
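+// Note: the GPU-specific macros in this header only pin the platform
+// argument to GPU and forward to the platform-generic GENERATE_* /
+// REGISTER_* macros from base_op.h; a generated kernel .cc file typically
+// invokes one of them once per supported element type.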
+
+#define GENERATE_AND_REGISTER_UNARY_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_UNARY_KERNEL(tf_op, GPU, input_type, \
+                                     /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_UNARY_GPU_KERNEL2(tf_op, input_type, \
+                                                output_type) \
+  GENERATE_AND_REGISTER_UNARY_KERNEL2(tf_op, GPU, input_type, output_type, \
+                                      /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_UNARY_GPU_KERNEL3( \
+    tf_op, input_type, output_type, casted_input_type, casted_output_type) \
+  GENERATE_AND_REGISTER_UNARY_KERNEL3(tf_op, GPU, input_type, output_type, \
+                                      casted_input_type, casted_output_type, \
+                                      /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_UNARY_JIT_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_UNARY_JIT_KERNEL(tf_op, GPU, input_type, \
+                                         /*no additional_cstrs*/)
+
+#define GENERATE_UNARY_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_UNARY_KERNEL(tf_op, GPU, input_type)
+
+#define GENERATE_UNARY_GPU_KERNEL2(tf_op, input_type, output_type) \
+  GENERATE_UNARY_KERNEL2(tf_op, GPU, input_type, output_type)
+
+#define GENERATE_UNARY_GPU_KERNEL3(tf_op, input_type, output_type, \
+                                   casted_input_type, casted_output_type) \
+  GENERATE_UNARY_KERNEL3(tf_op, GPU, input_type, output_type, \
+                         casted_input_type, casted_output_type)
+
+/// Binary kernels.
+
+#define GENERATE_AND_REGISTER_BINARY_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_BINARY_KERNEL(tf_op, GPU, input_type, \
+                                      /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_BINARY_GPU_KERNEL2(tf_op, input_type, \
+                                                 output_type) \
+  GENERATE_AND_REGISTER_BINARY_KERNEL2(tf_op, GPU, input_type, output_type, \
+                                       /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_BINARY_GPU_KERNEL3( \
+    tf_op, input_type, output_type, casted_input_type, casted_output_type) \
+  GENERATE_AND_REGISTER_BINARY_KERNEL3(tf_op, GPU, input_type, output_type, \
+                                       casted_input_type, casted_output_type, \
+                                       /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_BINARY_JIT_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_BINARY_JIT_KERNEL(tf_op, GPU, input_type, \
+                                          /*no additional_cstrs*/)
+
+#define GENERATE_BINARY_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_BINARY_KERNEL(tf_op, GPU, input_type)
+
+#define GENERATE_BINARY_GPU_KERNEL2(tf_op, input_type, output_type) \
+  GENERATE_BINARY_KERNEL2(tf_op, GPU, input_type, output_type)
+
+#define GENERATE_BINARY_GPU_KERNEL3(tf_op, input_type, output_type, \
+                                    casted_input_type, casted_output_type) \
+  GENERATE_BINARY_KERNEL3(tf_op, GPU, input_type, output_type, \
+                          casted_input_type, casted_output_type)
+
+/// Ternary kernels.
+
+#define GENERATE_AND_REGISTER_TERNARY_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_TERNARY_KERNEL(tf_op, GPU, input_type, \
+                                       /*no additional_cstrs*/)
+
+#define GENERATE_AND_REGISTER_TERNARY_JIT_GPU_KERNEL(tf_op, input_type) \
+  GENERATE_AND_REGISTER_TERNARY_JIT_KERNEL(tf_op, GPU, input_type, \
+                                           /*no additional_cstrs*/)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_GPU_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_op.h
new file mode 100644
index 00000000..c7e92540
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_op.h
@@ -0,0 +1,346 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OP_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/ExecutionEngine/CRunnerUtils.h" // from @llvm-project +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +// Unranked memref descriptor as it is expected and returned by the external +// MLIR-generated "C" function. +struct UnrankedMemRef { + int64_t rank; + void* descriptor; +}; + +// Returns a pointer to an allocated MlirTensorBuffer that takes ownership of +// pre-allocated memory. +TensorBuffer* GetMlirTensorBuffer(const void* ptr, size_t size, + Allocator* allocator); + +/// Used to allocate descriptors on stack when they are small. + +constexpr int kMaxRankForOnStackDescriptors = 10; + +static constexpr size_t GetSizeOfDescriptor(int rank) { + return sizeof(void*) * (2 * rank + 3); +} + +using DescriptorBuffer = + llvm::SmallVector; + +/// Converts tensors to memory descriptors and back. + +UnrankedMemRef ConvertTensorToDescriptor(const Tensor& tensor, + DescriptorBuffer& buffer); + +TensorShape ExtractShapeFromDescriptor(UnrankedMemRef unranked_descriptor); + +template +Tensor ConvertDescriptorToTensor(UnrankedMemRef unranked_descriptor, + DataType TfDataType, Allocator* allocator) { + void* base_ptr = static_cast(unranked_descriptor.descriptor)[0]; + TensorShape result_shape = ExtractShapeFromDescriptor(unranked_descriptor); + TensorBuffer* buffer = GetMlirTensorBuffer( + base_ptr, sizeof(ElemType) * result_shape.num_elements(), allocator); + + // Tensor takes ownership of the buffer. + Tensor tensor{TfDataType, result_shape, buffer}; + // When Tensor is constructed, its ref-counter is incremented. We need to + // decrement it back. + buffer->Unref(); + return tensor; +} + +// OpKernel with Compute function that converts input tensors to unranked +// memref descriptors and calls the MLIR-generated unranked kernel. The outputs +// are converted back to tensors using MlirTensorBuffer to take ownership of +// pre-allocated memory. +template +class MLIROpKernel : public OpKernel { + public: + explicit MLIROpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + VLOG(4) << ctx->op_kernel().TraceString(*ctx, true); + + // Convert tensor arguments to unranked memory descriptors. + llvm::SmallVector buffers(ctx->num_inputs()); + llvm::SmallVector args; + for (int i = 0; i < ctx->num_inputs(); ++i) { + args.push_back(ConvertTensorToDescriptor(ctx->input(i), buffers[i])); + } + + UnrankedMemRef result_desc = Invoke(ctx, args); + if (!ctx->status().ok()) { + free(result_desc.descriptor); + return; + } + void* result_data_ptr = static_cast(result_desc.descriptor)[0]; + + // Detect input buffer reuse. 
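+    // If the MLIR-generated kernel wrote its result into one of the input buffers, forward that input as the output (bitcast when the output type differs) instead of wrapping the same memory in a new TensorBuffer below.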
+ for (int i = 0, end = ctx->num_inputs(); i < end; ++i) { + const Tensor& input = ctx->input(i); + if (input.data() == result_data_ptr) { + // Run a bitcast in case the output type is different. + Tensor output; + TensorShape result_shape = ExtractShapeFromDescriptor(result_desc); + OP_REQUIRES_OK( + ctx, output.BitcastFrom(input, CastedTfDataType, result_shape)); + + ctx->set_output(0, output); + free(result_desc.descriptor); + return; + } + } + + tensorflow::AllocatorAttributes attrs; + auto* allocator = ctx->get_allocator(attrs); + Tensor result_tensor = ConvertDescriptorToTensor( + result_desc, TfDataType, allocator); + if (TfDataType != CastedTfDataType) { + Tensor casted_result_tensor; + OP_REQUIRES_OK( + ctx, casted_result_tensor.BitcastFrom(result_tensor, CastedTfDataType, + result_tensor.shape())); + result_tensor = casted_result_tensor; + } + free(result_desc.descriptor); + ctx->set_output(0, result_tensor); + } + + protected: + virtual UnrankedMemRef Invoke( + OpKernelContext* ctx, llvm::SmallVectorImpl& args) = 0; +}; + +/// Generate C function and kernel names. + +#define MLIR_FUNCTION(tf_op, platform, input_type, output_type) \ + _mlir_ciface_##tf_op##_##platform##_##input_type##_##output_type + +#define MLIR_OP(tf_op, platform, input_type, output_type) \ + Mlir##tf_op##platform##input_type##output_type##Op + +/// Register kernels. + +#define REGISTER_ALIASED_KERNEL(tf_op, mlir_op, platform, input_type, \ + output_type, additional_cstrs) \ + REGISTER_KERNEL_BUILDER( \ + Name(#tf_op) \ + .Device(DEVICE_##platform) \ + .TypeConstraint::Type>("T") \ + additional_cstrs, \ + MLIR_OP(mlir_op, platform, input_type, output_type)); + +#define REGISTER_KERNEL(tf_op, platform, input_type, output_type, \ + additional_cstrs) \ + REGISTER_ALIASED_KERNEL(tf_op, tf_op, platform, input_type, output_type, \ + additional_cstrs) + +#define REGISTER_COMPLEX_KERNEL(tf_op, platform, input_type, output_type) \ + REGISTER_KERNEL_BUILDER( \ + Name(#tf_op) \ + .Device(DEVICE_##platform) \ + .TypeConstraint::Type>("T") \ + .TypeConstraint::Type>("Tout"), \ + MLIR_OP(tf_op, platform, input_type, output_type)); + +#define REGISTER_KERNEL_NO_TYPE_CONSTRAINT(tf_op, platform, input_type) \ + REGISTER_KERNEL_BUILDER(Name(#tf_op).Device(DEVICE_##platform), \ + MLIR_OP(tf_op, platform, input_type, input_type)); + +/// Unary kernels. 
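As a concrete reading of the naming macros above, an illustrative instantiation of `Abs` on GPU with DT_FLOAT input and output expands to:

MLIR_FUNCTION(Abs, GPU, DT_FLOAT, DT_FLOAT)  // -> _mlir_ciface_Abs_GPU_DT_FLOAT_DT_FLOAT (external C symbol)
MLIR_OP(Abs, GPU, DT_FLOAT, DT_FLOAT)        // -> MlirAbsGPUDT_FLOATDT_FLOATOp (generated OpKernel class)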
+ +#define GENERATE_AND_REGISTER_UNARY_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_UNARY_KERNEL(tf_op, platform, input_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, input_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_UNARY_KERNEL2(tf_op, platform, input_type, \ + output_type, additional_cstrs) \ + GENERATE_UNARY_KERNEL(tf_op, platform, input_type, output_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, output_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_UNARY_KERNEL3( \ + tf_op, platform, input_type, output_type, casted_input_type, \ + casted_output_type, additional_cstrs) \ + GENERATE_UNARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + REGISTER_KERNEL(tf_op, platform, casted_input_type, casted_output_type, \ + additional_cstrs) + +#define GENERATE_AND_REGISTER_UNARY_JIT_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_AND_REGISTER_UNARY_KERNEL(tf_op, platform, input_type, \ + .Label(kJitKernelLabel) additional_cstrs) + +#define GENERATE_UNARY_KERNEL(tf_op, platform, input_type) \ + GENERATE_UNARY_KERNEL2(tf_op, platform, input_type, input_type) + +#define GENERATE_UNARY_KERNEL2(tf_op, platform, input_type, output_type) \ + GENERATE_UNARY_KERNEL3(tf_op, platform, input_type, output_type, input_type, \ + output_type) + +#define GENERATE_UNARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + extern "C" void MLIR_FUNCTION(tf_op, platform, input_type, output_type)( \ + UnrankedMemRef * result, OpKernelContext * ctx, UnrankedMemRef * arg); \ + \ + namespace { \ + class MLIR_OP(tf_op, platform, casted_input_type, casted_output_type) \ + : public MLIROpKernel::Type, \ + casted_output_type> { \ + public: \ + using MLIROpKernel::MLIROpKernel; \ + \ + UnrankedMemRef Invoke( \ + OpKernelContext* ctx, \ + llvm::SmallVectorImpl& args) override { \ + UnrankedMemRef result; \ + MLIR_FUNCTION(tf_op, platform, input_type, output_type) \ + (&result, ctx, &args[0]); \ + return result; \ + } \ + }; \ + } + +/// Binary kernels. 
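Putting the unary pieces above together: an illustrative instantiation such as GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, GPU, DT_FLOAT, /*no additional_cstrs*/) expands roughly as follows (sketch only; the real macro also threads the casted types and the MLIROpKernel template arguments):

// Declaration of the MLIR-generated entry point.
extern "C" void _mlir_ciface_Abs_GPU_DT_FLOAT_DT_FLOAT(
    UnrankedMemRef* result, OpKernelContext* ctx, UnrankedMemRef* arg);

namespace {
// Thin OpKernel whose Invoke() forwards to the generated C function.
class MlirAbsGPUDT_FLOATDT_FLOATOp
    : public MLIROpKernel</*template arguments elided in this sketch*/> {
 public:
  using MLIROpKernel::MLIROpKernel;
  UnrankedMemRef Invoke(OpKernelContext* ctx,
                        llvm::SmallVectorImpl<UnrankedMemRef>& args) override {
    UnrankedMemRef result;
    _mlir_ciface_Abs_GPU_DT_FLOAT_DT_FLOAT(&result, ctx, &args[0]);
    return result;
  }
};
}  // namespace

// Registration produced by REGISTER_KERNEL.
REGISTER_KERNEL_BUILDER(
    Name("Abs").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    MlirAbsGPUDT_FLOATDT_FLOATOp);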
+ +#define GENERATE_AND_REGISTER_BINARY_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_BINARY_KERNEL(tf_op, platform, input_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, input_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_BINARY_KERNEL2(tf_op, platform, input_type, \ + output_type, additional_cstrs) \ + GENERATE_BINARY_KERNEL2(tf_op, platform, input_type, output_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, output_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_BINARY_KERNEL3( \ + tf_op, platform, input_type, output_type, casted_input_type, \ + casted_output_type, additional_cstrs) \ + GENERATE_BINARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + REGISTER_KERNEL(tf_op, platform, casted_input_type, casted_output_type, \ + additional_cstrs) + +#define GENERATE_AND_REGISTER_BINARY_JIT_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_AND_REGISTER_BINARY_KERNEL( \ + tf_op, platform, input_type, .Label(kJitKernelLabel) additional_cstrs) + +#define GENERATE_BINARY_KERNEL(tf_op, platform, input_type) \ + GENERATE_BINARY_KERNEL2(tf_op, platform, input_type, input_type) + +#define GENERATE_BINARY_KERNEL2(tf_op, platform, input_type, output_type) \ + GENERATE_BINARY_KERNEL3(tf_op, platform, input_type, output_type, \ + input_type, output_type) + +#define GENERATE_BINARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + extern "C" void MLIR_FUNCTION(tf_op, platform, input_type, output_type)( \ + UnrankedMemRef * result, OpKernelContext * ctx, UnrankedMemRef * arg0, \ + UnrankedMemRef * arg1); \ + \ + namespace { \ + class MLIR_OP(tf_op, platform, casted_input_type, casted_output_type) \ + : public MLIROpKernel::Type, \ + casted_output_type> { \ + public: \ + using MLIROpKernel::MLIROpKernel; \ + \ + UnrankedMemRef Invoke( \ + OpKernelContext* ctx, \ + llvm::SmallVectorImpl& args) override { \ + UnrankedMemRef result; \ + MLIR_FUNCTION(tf_op, platform, input_type, output_type) \ + (&result, ctx, &args[0], &args[1]); \ + return result; \ + } \ + }; \ + } + +/// Ternary kernels. 
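The *_JIT_* variants above differ from the ahead-of-time ones only in appending `.Label(kJitKernelLabel)` through the `additional_cstrs` parameter, so a registration coming out of GENERATE_AND_REGISTER_BINARY_JIT_KERNEL looks roughly like this (op and type are illustrative):

REGISTER_KERNEL_BUILDER(
    Name("AddV2")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T")
        .Label(kJitKernelLabel),
    MlirAddV2GPUDT_FLOATDT_FLOATOp);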
+ +#define GENERATE_AND_REGISTER_TERNARY_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_TERNARY_KERNEL(tf_op, platform, input_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, input_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_TERNARY_KERNEL2(tf_op, platform, input_type, \ + output_type, additional_cstrs) \ + GENERATE_TERNARY_KERNEL2(tf_op, platform, input_type, output_type) \ + REGISTER_KERNEL(tf_op, platform, input_type, output_type, additional_cstrs) + +#define GENERATE_AND_REGISTER_TERNARY_KERNEL3( \ + tf_op, platform, input_type, output_type, casted_input_type, \ + casted_output_type, additional_cstrs) \ + GENERATE_TERNARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + REGISTER_KERNEL(tf_op, platform, casted_input_type, casted_output_type, \ + additional_cstrs) + +#define GENERATE_AND_REGISTER_TERNARY_JIT_KERNEL(tf_op, platform, input_type, \ + additional_cstrs) \ + GENERATE_AND_REGISTER_TERNARY_KERNEL( \ + tf_op, platform, input_type, .Label(kJitKernelLabel) additional_cstrs) + +#define GENERATE_TERNARY_KERNEL(tf_op, platform, input_type) \ + GENERATE_TERNARY_KERNEL2(tf_op, platform, input_type, input_type) + +#define GENERATE_TERNARY_KERNEL2(tf_op, platform, input_type, output_type) \ + GENERATE_TERNARY_KERNEL3(tf_op, platform, input_type, output_type, \ + input_type, output_type) + +#define GENERATE_TERNARY_KERNEL3(tf_op, platform, input_type, output_type, \ + casted_input_type, casted_output_type) \ + extern "C" void MLIR_FUNCTION(tf_op, platform, input_type, output_type)( \ + UnrankedMemRef * result, OpKernelContext * ctx, UnrankedMemRef * arg0, \ + UnrankedMemRef * arg1, UnrankedMemRef * arg2); \ + \ + namespace { \ + class MLIR_OP(tf_op, platform, casted_input_type, casted_output_type) \ + : public MLIROpKernel::Type, \ + casted_output_type> { \ + public: \ + using MLIROpKernel::MLIROpKernel; \ + \ + UnrankedMemRef Invoke( \ + OpKernelContext* ctx, \ + llvm::SmallVectorImpl& args) override { \ + UnrankedMemRef result; \ + MLIR_FUNCTION(tf_op, platform, input_type, output_type) \ + (&result, ctx, &args[0], &args[1], &args[2]); \ + return result; \ + } \ + }; \ + } + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_ops_test.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_ops_test.h new file mode 100644 index 00000000..d7a2a2d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_ops_test.h @@ -0,0 +1,324 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OPS_TEST_H_ +#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OPS_TEST_H_ + +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace test { + +template +using is_integer = llvm::is_one_of; + +/// Helper functions to create or derive inputs of the right type and size. + +template +absl::InlinedVector InputAsVector( + std::initializer_list input) { + absl::InlinedVector result; + result.reserve(input.size()); + for (const LiteralT& value : input) { + result.push_back(static_cast(value)); + } + return result; +} + +template +absl::InlinedVector RepeatInputToMatchShape( + absl::InlinedVector input, int64_t size) { + absl::InlinedVector result; + result.reserve(size); + for (int64_t i = 0; i < size; i++) { + auto value = input[i % input.size()]; + result.push_back(value); + } + return result; +} + +template +absl::InlinedVector RepeatElements(absl::InlinedVector input, + int64_t num_repeats) { + absl::InlinedVector result; + result.reserve(input.size() * num_repeats); + for (T value : input) { + for (int64_t i = 0; i < num_repeats; ++i) { + result.push_back(value); + } + } + return result; +} + +/// Helper functions to get default input shapes. + +TensorShape DefaultInputShape(); +TensorShape DefaultInputShapeExceedingInt32(); + +/// Helper functions to configure tests. + +struct OpsTestConfig { + bool add_t = true; + bool add_tout = false; + // Only used for gpu_unary_ops_test. + bool expect_buffer_reuse = true; + bool expect_strictly_equal = false; + bool supress_tolerance = false; + // Negative atol/rtol will make ExpectClose use the default. + double atol = -1; + double rtol = -1; + std::string input_attribute = "T"; + std::string output_attribute = "Tout"; + bool jit_compilation = false; + OpsTestConfig ExpectStrictlyEqual() { + OpsTestConfig config = *this; + config.expect_strictly_equal = true; + return config; + } + OpsTestConfig SuppressTolerance() { + OpsTestConfig config = *this; + config.supress_tolerance = true; + return config; + } + OpsTestConfig NoBufferReuse() { + OpsTestConfig config = *this; + config.expect_buffer_reuse = false; + return config; + } + OpsTestConfig AddTout() { + OpsTestConfig config = *this; + config.add_tout = true; + return config; + } + OpsTestConfig NoT() { + OpsTestConfig config = *this; + config.add_t = false; + return config; + } + OpsTestConfig RTol(double new_rtol) { + OpsTestConfig config = *this; + config.rtol = new_rtol; + return config; + } + OpsTestConfig ATol(double new_atol) { + OpsTestConfig config = *this; + config.atol = new_atol; + return config; + } + OpsTestConfig InputAttribute(const std::string& attr) { + OpsTestConfig config = *this; + config.input_attribute = attr; + return config; + } + OpsTestConfig OutputAttribute(const std::string& attr) { + OpsTestConfig config = *this; + config.output_attribute = attr; + return config; + } + OpsTestConfig JITCompilation() { + OpsTestConfig config = *this; + config.jit_compilation = true; + return config; + } +}; + +/// Helper functions to get more specific input data. 
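One note on the OpsTestConfig struct above: every setter returns a modified copy, so test parameters are composed by chaining on a temporary, e.g. (illustrative):

auto config = test::OpsTestConfig()
                  .ExpectStrictlyEqual()
                  .NoBufferReuse()
                  .AddTout()
                  .RTol(1e-5);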
+ +template ::value, + bool> = true> +absl::InlinedVector NearZeroAndExtremeInput() { + return InputAsVector({-std::numeric_limits::infinity(), + -0.1, -0.0, 0.0, 0.1, + std::numeric_limits::infinity()}); +} + +template ::value, bool> = true> +absl::InlinedVector NearZeroAndExtremeInput() { + return InputAsVector({std::numeric_limits::min(), + std::numeric_limits::min() + 1, -1, 0, 1, + std::numeric_limits::max()}); +} + +template ::value, + bool> = true> +absl::InlinedVector NearZeroInfAndNanInput() { + return InputAsVector({-std::numeric_limits::quiet_NaN(), + -std::numeric_limits::infinity(), + -0.1, -0.0, 0.0, 0.1, + std::numeric_limits::infinity(), + std::numeric_limits::quiet_NaN()}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInputGreaterEqualOne() { + return test::InputAsVector( + {18.0, 9.0, 1.0, std::numeric_limits::max(), 42.0, 2.0, 1.0, + std::sqrt(std::numeric_limits::max()), 9.0, 18.0}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInputGreaterThanZero() { + return test::InputAsVector({18.0, 9.0, 1e-6, 1.0, 0.1, 1e-6, 0.1, + 0.2, 0.3, 0.5, 0.7, 0.9, 9.0, 18.0}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInputGreaterOrEqualToZero() { + return test::InputAsVector({18.0, 9.0, 1e-6, 0.0, 0.1, 1e-6, 0.1, + 0.2, 0.3, 0.5, 0.7, 0.9, 9.0, 18.0}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInputNonZero() { + return test::InputAsVector({18.0, 9.0, 1e-6, -0.1, 0.1, 1e-6, 0.1, + 0.2, 0.3, 0.5, 0.7, 0.9, 9.0, 18.0}); +} + +template ::value, bool> = true> +absl::InlinedVector DefaultInputNonZero() { + return test::InputAsVector({-18, -9, -1, 1, 3, 4, 5, 7, 9, 10, 18}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInputBetweenZeroAndOne() { + return test::InputAsVector({-0.999, -0.9, -0.8, -0.5, -0.1, -0.001, + -0, 0, 0.001, 0.1, 0.5, 0.8, 0.9, + 0.999}); +} + +template ::value, bool> = true> +absl::InlinedVector DefaultInputLessThanBitwidth() { + auto max_shift = sizeof(T) * 8 - 1; + absl::InlinedVector v; + for (auto i = 0; i < max_shift; ++i) v.push_back(i); + return v; +} + +/// Helper functions to get default input data. 
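The specific-input helpers above are overload sets selected via enable_if on the element type, so the same spelling works for floating-point, integral, and complex instantiations. Illustrative calls (the exact set of supported element types is an assumption here):

auto float_edges = test::NearZeroAndExtremeInput<float>();    // -inf, -0.1, -0.0, 0.0, 0.1, +inf
auto int_edges   = test::NearZeroAndExtremeInput<int64_t>();  // min, min+1, -1, 0, 1, max
auto shifts      = test::DefaultInputLessThanBitwidth<int32_t>();  // 0 .. 30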
+ +template ::value, bool> = true> +absl::InlinedVector DefaultInput() { + return InputAsVector({-18, -9, -1, 0, 0, 1, 1, 2, 3, 5, 7, 9, 9, 18}); +} + +template ::value, + bool> = true> +absl::InlinedVector DefaultInput() { + return InputAsVector({-18.0, -9.0, -0.7, -0.5, -0.3, -0.2, -0.1, + -1e-6, -0.0, 0.0, 1e-6, 0.1, 0.2, 0.3, 0.5, + 0.7, 0.9, 18.0}); +} + +template , + std::complex>::value, + bool> = true> +absl::InlinedVector DefaultInput() { + using ElementType = typename T::value_type; + auto input = test::DefaultInput(); + absl::InlinedVector complex_input; + for (ElementType value : input) { + complex_input.emplace_back(value, -value); + } + return complex_input; +} + +template , + std::complex>::value, + bool> = true> +absl::InlinedVector ComplexInputFromValues( + const absl::InlinedVector& real, + const absl::InlinedVector& imag) { + using ElementType = typename T::value_type; + absl::InlinedVector complex_input; + CHECK_EQ(real.size(), imag.size()); + for (size_t i = 0; i < real.size() && i < imag.size(); ++i) { + complex_input.emplace_back(real[i], imag[i]); + } + return complex_input; +} + +template , + std::complex>::value, + bool> = true> +absl::InlinedVector DefaultInputNonZero() { + auto real = test::DefaultInputNonZero(); + auto imag = real; + std::reverse(imag.begin(), imag.end()); + return test::ComplexInputFromValues(real, imag); +} + +template , + std::complex>::value, + bool> = true> +absl::InlinedVector DefaultInputGreaterOrEqualToZero() { + auto real = test::DefaultInputGreaterOrEqualToZero(); + auto imag = real; + std::reverse(imag.begin(), imag.end()); + return test::ComplexInputFromValues(real, imag); +} + +template , + std::complex>::value, + bool> = true> +absl::InlinedVector NearZeroInfAndNanInput() { + using ElementType = typename T::value_type; + auto input = test::NearZeroInfAndNanInput(); + absl::InlinedVector real; + absl::InlinedVector imag; + for (ElementType r : input) { + for (ElementType i : input) { + real.push_back(r); + imag.push_back(i); + } + } + return test::ComplexInputFromValues(real, imag); +} + +template ::value, bool> = true> +absl::InlinedVector DefaultInput() { + return InputAsVector({true, false, true, true, false}); +} + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_OPS_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_unary_ops_test.h b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_unary_ops_test.h new file mode 100644 index 00000000..5edb7e7d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/mlir_generated/base_unary_ops_test.h @@ -0,0 +1,219 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_UNARY_OPS_TEST_H_ +#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_UNARY_OPS_TEST_H_ + +#include + +#include "absl/container/inlined_vector.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/mlir_generated/base_ops_test.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +// Base class for `UnaryOpsTest` fixture that has to be defined with a custom TF +// device if you want to use the test macros in this file. +class UnaryOpsTestBase : public OpsTestBase { + protected: + // This method should set the TF device, e.g. DEVICE_CPU, DEVICE_GPU. + void SetUp() override = 0; + + template + void SetOpKernel(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + const test::OpsTestConfig& config) { + NodeDefBuilder builder("some_name", op_name); + builder.Input(FakeInput(DataTypeToEnum::v())); + if (config.add_t) { + builder.Attr(config.input_attribute, DataTypeToEnum::v()); + } + if (config.add_tout) { + builder.Attr(config.output_attribute, DataTypeToEnum::v()); + } + TF_ASSERT_OK(builder.Finalize(node_def())); + + TF_ASSERT_OK(InitOp()); + AddInputFromArray(shape, input); + } + + template + void RunAndExpectResult(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + const absl::InlinedVector& expected_output, + const test::OpsTestConfig& config) { + SetOpKernel(op_name, shape, input, config); + TF_ASSERT_OK(RunOpKernel()); + + // Assert buffer reuse if expected. + if (config.expect_buffer_reuse) { + void* arg_ptr_on_device = context_->input(0).data(); + void* result_ptr_on_device = context_->mutable_output(0)->data(); + ASSERT_EQ(arg_ptr_on_device, result_ptr_on_device); + } + + // Assert expected results. + Tensor expected_tensor(allocator(), DataTypeToEnum::value, shape); + test::FillValues(&expected_tensor, expected_output); + if (config.expect_strictly_equal) { + test::ExpectEqual(expected_tensor, *GetOutput(0), + config.supress_tolerance ? test::Tolerance::kNone + : test::Tolerance::kDefault); + } else { + test::ExpectClose(expected_tensor, *GetOutput(0), kAbsoluteTolerance, + kRelativeTolerance); + } + + // For JIT-compiled kernels, expect exactly one entry in the JIT cache for + // the current test. The cache is not affected by other tests as we always + // set up a new environment. + if (config.jit_compilation) { + ResourceMgr* mgr = context_->resource_manager(); + mlir::kernel_gen::tf_framework::JITCache* cache; + TF_ASSERT_OK(mgr->Lookup( + mgr->default_container(), + mlir::kernel_gen::tf_framework::JITCache::kDefaultResourceName, + &cache)); + core::ScopedUnref cache_ref(cache); + ASSERT_EQ(cache->Size(), 1); + } + } + + template + void TestImpl(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + const BaselineCallback& baseline_callback, + const test::OpsTestConfig& config) { + // Prepare inputs and compute expected results. 
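+    // Note: `input` may hold fewer values than shape.num_elements(); RepeatInputToMatchShape tiles it to the full element count before the baseline callback is applied element-wise.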
+ CHECK(input.size() <= shape.num_elements()); + auto repeated_input = + test::RepeatInputToMatchShape(input, shape.num_elements()); + absl::InlinedVector expected_output = + ComputeExpectedOutput(repeated_input, + baseline_callback); + + RunAndExpectResult(op_name, shape, repeated_input, expected_output, + config); + } + + template + void Test(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + const BaselineCallback& baseline_callback, + const test::OpsTestConfig& config) { + TestImpl(op_name, shape, input, baseline_callback, + config); + } + + // Allow deduction of overloaded function with const ref input. + template + void Test(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + BaselineOutT (*baseline_callback)(const BaselineT&), + const test::OpsTestConfig& config) { + TestImpl(op_name, shape, input, baseline_callback, + config); + } + + // Allow deduction of overloaded function with value input. + template + void Test(const std::string& op_name, const TensorShape& shape, + const absl::InlinedVector& input, + BaselineOutT (*baseline_callback)(BaselineT), + const test::OpsTestConfig& config) { + TestImpl(op_name, shape, input, baseline_callback, + config); + } + + template + void TestEmptyShape(const std::string& op_name, + const test::OpsTestConfig& config) { + TensorShape shape{0, 1, 2}; + absl::InlinedVector empty_input = {}; + absl::InlinedVector expected_output = {}; + RunAndExpectResult(op_name, shape, empty_input, expected_output, + config); + } + + private: + constexpr static double kAbsoluteTolerance = 0.001; + constexpr static double kRelativeTolerance = 0.001; + + template + absl::InlinedVector ComputeExpectedOutput( + absl::InlinedVector input, + const BaselineCallback& baseline_callback) { + absl::InlinedVector expected_output; + expected_output.reserve(input.size()); + for (int64_t i = 0; i < input.size(); i++) { + auto arg = static_cast(input[i]); + auto result = static_cast(baseline_callback(arg)); + expected_output.push_back(result); + } + return expected_output; + } +}; + +// Macros to easily generate common test cases. The macros use `UnaryOpsTest` +// fixture in order to share implementation across GPU and CPU platform tests. +// For specific inputs, please define your own test fixtures. 
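The macros defined below are typically driven from a per-device test file that supplies the `UnaryOpsTest` fixture. A hedged sketch of such a file (device setup and baseline are illustrative, not part of this header):

// Hypothetical gpu_unary_ops_test.cc-style usage.
class UnaryOpsTest : public UnaryOpsTestBase {
 protected:
  void SetUp() override {
    // Create the device under test and hand it to OpsTestBase.
    std::unique_ptr<tensorflow::Device> device_gpu(
        tensorflow::DeviceFactory::NewDevice("GPU", {}, "/job:a/replica:0/task:0"));
    SetDevice(tensorflow::DEVICE_GPU, std::move(device_gpu));
  }
};

GENERATE_DEFAULT_TEST(Abs, DT_FLOAT, DT_FLOAT, std::abs,
                      test::OpsTestConfig().ExpectStrictlyEqual())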
+#define GENERATE_DEFAULT_TEST(op_name, InT, OutT, baseline_callback, config) \ + GENERATE_DEFAULT_TEST_2(op_name, InT, InT, OutT, OutT, baseline_callback, \ + config) + +#define GENERATE_DEFAULT_TEST_2(op_name, InT, BaselineT, OutT, BaselineOutT, \ + baseline_callback, config) \ + GENERATE_DEFAULT_TEST_WITH_SPECIFIC_INPUT_VALUES_2( \ + op_name, InT, BaselineT, OutT, BaselineOutT, \ + test::DefaultInput(), baseline_callback, config) + +#define GENERATE_DEFAULT_TEST_WITH_SPECIFIC_INPUT_VALUES( \ + op_name, InT, OutT, input_values, baseline_callback, config) \ + GENERATE_DEFAULT_TEST_WITH_SPECIFIC_INPUT_VALUES_2( \ + op_name, InT, InT, OutT, OutT, input_values, baseline_callback, config) + +#define GENERATE_DEFAULT_TEST_WITH_SPECIFIC_INPUT_VALUES_2( \ + op_name, InT, BaselineT, OutT, BaselineOutT, input_values, \ + baseline_callback, config) \ + TEST_F(UnaryOpsTest, op_name##InT##OutT) { \ + using NativeT = EnumToDataType::Type; \ + using NativeBaselineT = EnumToDataType::Type; \ + using NativeOutT = EnumToDataType::Type; \ + using NativeBaselineOutT = EnumToDataType::Type; \ + Test( \ + #op_name, test::DefaultInputShape(), input_values, baseline_callback, \ + config); \ + } \ + TEST_F(UnaryOpsTest, op_name##InT##OutT##EmptyShape) { \ + using NativeT = EnumToDataType::Type; \ + using NativeOutT = EnumToDataType::Type; \ + TestEmptyShape(#op_name, config); \ + } + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_BASE_UNARY_OPS_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/multinomial_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/multinomial_op.h new file mode 100644 index 00000000..34e21236 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/multinomial_op.h @@ -0,0 +1,30 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_ + +namespace tensorflow { + +namespace functor { + +// Generic helper functor for the Multinomial Op. +template +struct MultinomialFunctor; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/nextafter_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/nextafter_op.h new file mode 100644 index 00000000..89a39f49 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/nextafter_op.h @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_ +#define TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/cwise_ops.h" + +namespace tensorflow { +namespace functor { + +template +struct nextafter_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x1, + const T& x2) const { + return std::nextafter(x1, x2); + } +}; + +template +struct nextafter : base> {}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/no_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/no_op.h new file mode 100644 index 00000000..9e16d069 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/no_op.h @@ -0,0 +1,32 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_NO_OP_H_ +#define TENSORFLOW_CORE_KERNELS_NO_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class NoOp : public OpKernel { + public: + explicit NoOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override {} + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_NO_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/nth_element_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/nth_element_op.h new file mode 100644 index 00000000..7a5ec3d0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/nth_element_op.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct NthElementFunctor { + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/numeric_options_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/numeric_options_utils.h new file mode 100644 index 00000000..ced38d37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/numeric_options_utils.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_NUMERIC_OPTIONS_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_NUMERIC_OPTIONS_UTILS_H_ + +#include "xla/stream_executor/numeric_options.h" +#include "xla/tsl/util/determinism.h" +#include "tensorflow/core/util/env_var.h" +#include "tsl/platform/tensor_float_32_utils.h" + +namespace tensorflow { + +inline stream_executor::NumericOptions GetNumericOptions() { + return stream_executor::NumericOptions{ + /*require_determinism=*/tsl::OpDeterminismRequired(), + /*allow_tf32=*/tsl::tensor_float_32_execution_enabled()}; +} + +inline stream_executor::NumericOptions GetNumericOptionsForCuDnn() { + static bool cudnn_deterministic_env_var = [] { + bool cudnn_deterministic = false; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC", + /*default_val=*/false, + &cudnn_deterministic)); + return cudnn_deterministic; + }(); + stream_executor::NumericOptions result = GetNumericOptions(); + result.require_determinism |= cudnn_deterministic_env_var; + return result; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_NUMERIC_OPTIONS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/one_hot_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/one_hot_op.h new file mode 100644 index 00000000..afcf287a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/one_hot_op.h @@ -0,0 +1,125 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/array_ops.cc + +#ifndef TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_ +// Generator definition for OneHotOp, must be compilable by nvcc. + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace generator { + +template +class OneGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + OneGenerator(const typename TTypes::ConstMatrix& indices, + const typename TTypes::ConstScalar& on_value, + const typename TTypes::ConstScalar& off_value) + : indices_(indices), on_value_(on_value), off_value_(off_value) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& pre_depth_suff) const { + return (indices_(pre_depth_suff[0], pre_depth_suff[2]) == pre_depth_suff[1]) + ? on_value_() + : off_value_(); + } + + private: + const typename TTypes::ConstMatrix indices_; + const typename TTypes::ConstScalar on_value_; + const typename TTypes::ConstScalar off_value_; +}; + +} // namespace generator + +namespace functor { + +template +struct OneHot { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, const typename TTypes::ConstMatrix& indices, + const typename TTypes::ConstScalar& on_value, + const typename TTypes::ConstScalar& off_value, + typename TTypes::Tensor* output) { + generator::OneGenerator generator(indices, on_value, off_value); + output->device(d) = output->generate(generator); + } +}; + +template +struct OneHot { + EIGEN_ALWAYS_INLINE static void Compute( + const CPUDevice& d, const typename TTypes::ConstMatrix& indices, + const typename TTypes::ConstScalar& on_value, + const typename TTypes::ConstScalar& off_value, + typename TTypes::Tensor* output) { + // Pre-fill output with `off_value`. + output->device(d) = output->constant(off_value()); + + // Iterate through indices and update on_value elements in the output. + Eigen::Index prefix_size = output->dimensions()[0]; + Eigen::Index depth_size = output->dimensions()[1]; + Eigen::Index suffix_size = output->dimensions()[2]; + + // Cost of setting one `on_value` coefficient. 
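+    // Eigen's parallelFor uses this per-coefficient cost estimate to decide how finely to shard the index loops below across threads.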
+ double bytes_loaded = sizeof(T); + double bytes_stored = sizeof(T); + double cycles = 0.0; + const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, cycles); + + if (suffix_size == 1) { + const auto func = [&](Eigen::Index start, Eigen::Index end) -> void { + for (Eigen::Index i = start; i < end; ++i) { + const TI depth = internal::SubtleMustCopy(indices(i, 0)); + if (FastBoundsCheck(depth, depth_size)) { + (*output)(i, depth, 0) = on_value(); + } + } + }; + d.parallelFor(prefix_size, cost, func); + } else { + const auto func = [&](Eigen::Index start, Eigen::Index end) -> void { + for (Eigen::Index i = start; i < end; ++i) { + const Eigen::Index d0 = i / suffix_size; + const Eigen::Index d1 = i - (d0 * suffix_size); + const TI depth = internal::SubtleMustCopy(indices(d0, d1)); + if (FastBoundsCheck(depth, depth_size)) { + (*output)(d0, depth, d1) = on_value(); + } + } + }; + d.parallelFor(prefix_size * suffix_size, cost * suffix_size, func); + } + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/ops_testutil.h b/third_party/tflite-hdrs/tensorflow/core/kernels/ops_testutil.h new file mode 100644 index 00000000..ef4a7cd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/ops_testutil.h @@ -0,0 +1,212 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ +#define TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace test { + +void SetOutputAttrs(OpKernelContext::Params* params, + std::vector* attrs); + +} // namespace test + +// Helpful functions to test operators. +// +// This class will eventually be replaced / heavily modified +// to use the BrainClient interface. +class OpsTestBase : public ::testing::Test { + public: + OpsTestBase(); + + ~OpsTestBase() override; + + // Allow kernel unit tests to run on GPU + void SetDevice(const DeviceType& device_type, std::unique_ptr device); + + void set_node_def(const NodeDef& node_def); + + // Clients can manipulate the underlying NodeDef via this accessor. + NodeDef* node_def(); + + // Initializes an operator that takes in 'input_types' as input + // and output types as output. + // + // Returns the status of initialization. + absl::Status InitOp(); + + // Only use this directly if you have a deprecated op that you need to test. + absl::Status InitOpWithGraphVersion(int graph_def_version); + + // Adds an input for every element described by the shape. + // 'input_mapping' maps an index (0...NumElements(shape)) to a + // value. + // + // TODO(vrv): Replace with something like a BrainClient Feed. + template + void AddInput(const TensorShape& shape, std::function input_mapping) { + test::FillFn(AddInput(DataTypeToEnum::v(), shape), input_mapping); + } + + // Like AddInput but takes in an explicit arrayslice of data. 
+ template + void AddInputFromArray(const TensorShape& shape, + const gtl::ArraySlice data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Convenience function to add an input and populate it with the elements from + // an initializer list converting the types as needed. + template + void AddInputFromList(const TensorShape& shape, + std::initializer_list data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Adds a Resource type as input. If is empty, uses the default + // container name. + template + void AddResourceInput(const string& container, const string& name, + T* resource) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + ResourceMgr* rm = device_->resource_manager(); + std::string container_name = + container.empty() ? rm->default_container() : container; + EXPECT_TRUE(rm->Create(container_name, name, resource).ok()); + AddResourceInputInternal(container_name, name, TypeIndex::Make()); + } + + // Runs an operation producing 'num_outputs' outputs. + // + // Returns the context's status after running the operation. + absl::Status RunOpKernel(); + + // Returns the tensor input for 'input_index'. + // + // REQUIRES: 0 <= input_index < context_->num_inputs() + const Tensor& GetInput(int input_index) const; + + TensorValue mutable_input(int input_index); + + // Returns the tensor output for 'output_index'. + // + // REQUIRES: 0 <= output_index < context_->num_outputs() + Tensor* GetOutput(int output_index); + + Allocator* allocator(); + + OpKernel* op_kernel(); + + const DataTypeVector& output_types() const; + + void set_session_metadata(SessionMetadata session_metadata) { + session_metadata_ = std::move(session_metadata); + } + + const SessionMetadata& session_metadata() const { return session_metadata_; } + + protected: + void CreateContext(); + Tensor* AddInput(DataType dtype, const TensorShape& shape); + void AddResourceInputInternal(const std::string& container_name, + const std::string& name, + const TypeIndex& type_index); + + // device_mgr_ owns device_. + std::unique_ptr device_mgr_; + Device* device_; + + // The device allocator, or the managed_allocator_ below if running on GPU. + Allocator* allocator_; + + std::unique_ptr kernel_; + std::unique_ptr step_container_; + NodeDef node_def_; + DataTypeVector input_types_; + DeviceType device_type_; + + mutex lock_for_refs_; // Used as the Mutex for inputs added as refs + + absl::InlinedVector inputs_; + // Owns Tensors. + std::vector tensors_; + // Copies of the outputs in unified memory (host and device accessible). + std::vector managed_outputs_; + + // AllocatorAttributes for the allocators of the outputs. + std::vector out_alloc_attrs_; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper_; + CancellationManager default_cancellation_manager_; + std::unique_ptr params_; + std::unique_ptr context_; + // Unified memory allocator, only used when running on GPU. 
+ std::unique_ptr managed_allocator_; + + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + std::unique_ptr thread_pool_; + + SessionMetadata session_metadata_; + + private: + OpsTestBase(const OpsTestBase&) = delete; + void operator=(const OpsTestBase&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/ops_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/ops_util.h new file mode 100644 index 00000000..842dd798 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/ops_util.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_ + +// Placeholder for the ops_util library that is moved under core/framework. +#include "tensorflow/core/framework/ops_util.h" + +#endif // TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/pad_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/pad_op.h new file mode 100644 index 00000000..34a19dfc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/pad_op.h @@ -0,0 +1,56 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_PAD_OP_H_ +#define TENSORFLOW_CORE_KERNELS_PAD_OP_H_ +// Functor definition for PadOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by PadOp to do the computations. +template +struct Pad { + // Pad "input" into "output", as specified by "paddings" and "pad_value". + // See pad_op.cc for details. + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + Eigen::array, Dims> paddings, + T pad_value) { + MaybeWith32BitIndexing( + [&](auto output32, auto input32) { + output32.device(d) = input32.pad(paddings, pad_value); + }, + output, input); + } +}; + +template +struct Pad { + // In the scalar case we simply copy the input. 
+ void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + Eigen::array, 0>, T) { + output.device(d) = input; + } +}; +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_PAD_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/padding_fifo_queue.h b/third_party/tflite-hdrs/tensorflow/core/kernels/padding_fifo_queue.h new file mode 100644 index 00000000..74107e80 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/padding_fifo_queue.h @@ -0,0 +1,90 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_ +#define TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/typed_queue.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class PaddingFIFOQueue : public FIFOQueue { + public: + PaddingFIFOQueue(int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, + const string& name); + + absl::Status Initialize() override; + + // Implementations of QueueInterface methods -------------------------------- + + void TryDequeueMany(int num_elements, OpKernelContext* ctx, + bool allow_small_batch, + CallbackWithTuple callback) override; + absl::Status MatchesNodeDef(const NodeDef& node_def) override; + + protected: + absl::Status ValidateManyTuple(const Tuple& tuple) override; + absl::Status ValidateTuple(const Tuple& tuple) override; + absl::Status CompatibleNodeDefShapes(const NodeDef& node_def) const; + + // Convert a list of PartialTensorShape to a list of + // TensorShape. + // Any unknown dimension sizes are converted to 0. + // REQUIRED: All the input shapes have well defined rank. + static std::vector ConvertShapesPartialDimensionsToZero( + absl::Span partial_shapes); + + // Sets the values in the given element to zero. + static absl::Status SetElementZero(Tensor* element); + + // Copies element into the index^th slice (in the first dimension) + // of parent. Allows for the parent's slice to have a larger size + // than the element, and copies the element into the upper left hand + // corner of the slice. 
+ static absl::Status CopyElementToLargerSlice(const Tensor& element, + Tensor* parent, int index); + + std::vector partial_shapes_; + + private: + ~PaddingFIFOQueue() override {} + + static absl::Status GetElementComponent(const PaddingFIFOQueue::Tuple& tuple, + int component, OpKernelContext* ctx, + Tensor* out_tensor); + + static absl::Status IsSameSizeExceptZerosInFirst(const TensorShape& first, + const TensorShape& second); + + PaddingFIFOQueue(const PaddingFIFOQueue&) = delete; + void operator=(const PaddingFIFOQueue&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/parameterized_truncated_normal_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/parameterized_truncated_normal_op.h new file mode 100644 index 00000000..4df75c78 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/parameterized_truncated_normal_op.h @@ -0,0 +1,66 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace functor { + +// Sample a truncated normal random variable, with mean, stddev, minval, and +// maxval parameters for each batch. Uses two rejection sampling algorithms +// described in http://rd.springer.com/article/10.1007/BF00143942 and a randn +// rejection sampler when most of the normal is inside the bounds. +// +// Either minval may be -infinity, or maxval may be +infinity. If the interval +// (minval, maxval) is empty, the result is NaN. +template +struct TruncatedNormalFunctor { + void operator()(OpKernelContext* ctx, const Device& d, int64_t num_batches, + int64_t samples_per_batch, int64_t num_elements, + typename TTypes::ConstFlat means, + typename TTypes::ConstFlat stddevs, + typename TTypes::ConstFlat minvals, + typename TTypes::ConstFlat maxvals, + const random::PhiloxRandom& gen, + typename TTypes::Flat output); +}; + +// This version supports broadcasting of the arguments, as well as puts +// the sample dimension on the left. 
+template +struct TruncatedNormalFunctorV2 { + void operator()(OpKernelContext* ctx, const Device& d, int64_t num_batches, + int64_t samples_per_batch, int64_t num_elements, + const BCastList<4>& bcast, + typename TTypes::ConstFlat means, + typename TTypes::ConstFlat stddevs, + typename TTypes::ConstFlat minvals, + typename TTypes::ConstFlat maxvals, + const random::PhiloxRandom& gen, + typename TTypes::Flat output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/partitioned_function_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/partitioned_function_ops.h new file mode 100644 index 00000000..2b2ec8ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/partitioned_function_ops.h @@ -0,0 +1,73 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_ + +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +class NameAttrList; +class ConfigProto; + +// A `PartitionedCallOp` asynchronously executes a function, potentially across +// multiple devices but within a single process. The kernel places and +// partitions a given function's underlying graph, and executes each of the +// partitioned subgraphs as a function. +// +// TODO(akshayka): Support distributed execution. +class PartitionedCallOp : public AsyncOpKernel { + public: + explicit PartitionedCallOp(OpKernelConstruction* ctx); + + ~PartitionedCallOp() override; + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + protected: + absl::Status FillOutputDevices( + const FunctionLibraryRuntime& lib, const Device& cpu_device, + AttrSlice attrs, FunctionLibraryRuntime::InstantiateOptions* opts); + + absl::Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx, + std::vector* inputs, + FunctionLibraryRuntime::Handle* handle); + + void RunFunction(FunctionLibraryRuntime::Handle handle, + const std::vector& inputs, + FunctionLibraryRuntime* lib, OpKernelContext* ctx, + DoneCallback done); + + // Using unique pointers to avoid including proto headers in kernel headers + std::unique_ptr func_; + std::unique_ptr config_proto_; + string executor_type_; + bool shared_rendezvous_; + mutex mu_; + // Cache the handle per FLR because this kernel may be instantiated for + // a stateful op, different invocations of it may use different FLRs. + // Different device placements of PartitionedCallOp also use + // different FLRs. 
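+  // Sketch of the assumed flow (the .cc is not part of this header set):
+  // ComputeAsync() looks up the calling FLR in this map under mu_; on a miss
+  // it calls Instantiate() and caches the returned handle, and on a hit it
+  // reuses the cached handle and proceeds directly to RunFunction().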
+ gtl::FlatMap handles_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/poisson-loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/poisson-loss.h new file mode 100644 index 00000000..d946b066 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/poisson-loss.h @@ -0,0 +1,109 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_POISSON_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_POISSON_LOSS_H_ + +#include + +#include "tensorflow/core/kernels/loss.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +class PoissonLossUpdater : public DualLossUpdater { + public: + // Update is found by a Newton algorithm (see readme.md). + double ComputeUpdatedDual(const int num_loss_partitions, const double label, + const double example_weight, + const double current_dual, const double wx, + const double weighted_example_norm) const final { + // Newton algorithm converges quadratically so 10 steps will be largely + // enough to achieve a very good precision + static const int newton_total_steps = 10; + // Initialize the Newton optimization at x such that + // exp(x) = label - current_dual + const double y_minus_a = label - current_dual; + double x = (y_minus_a > 0) ? log(y_minus_a) : 0; + for (int i = 0; i < newton_total_steps; ++i) { + x = NewtonStep(x, num_loss_partitions, label, wx, example_weight, + weighted_example_norm, current_dual); + } + return label - exp(x); + } + + // Dual of poisson loss function. + // https://en.wikipedia.org/wiki/Convex_conjugate + double ComputeDualLoss(const double current_dual, const double example_label, + const double example_weight) const final { + // Dual of the poisson loss function is + // (y-a)*(log(y-a)-1), where a is the dual variable. + // It is defined only for a::max(); + } + return y_minus_a * (log(y_minus_a) - 1) * example_weight; + } + + double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const final { + return (exp(wx) - wx * example_label) * example_weight; + } + + double PrimalLossDerivative(const double wx, const double label, + const double example_weight) const final { + return (exp(wx) - label) * example_weight; + } + + // TODO(chapelle): We need to introduce a maximum_prediction parameter, + // expose that parameter to the user and have this method return + // 1.0/maximum_prediction. + // Setting this at 1 for now, it only impacts the adaptive sampling. + double SmoothnessConstant() const final { return 1; } + + absl::Status ConvertLabel(float* const example_label) const final { + if (*example_label < 0.0) { + return errors::InvalidArgument( + "Only non-negative labels can be used with the Poisson log loss. 
" + "Found example with label: ", *example_label); + } + return absl::OkStatus(); + } + + private: + // One Newton step (see readme.md). + double NewtonStep(const double x, const int num_loss_partitions, + const double label, const double wx, + const double example_weight, + const double weighted_example_norm, + const double current_dual) const { + const double expx = exp(x); + const double numerator = + x - wx - num_loss_partitions * weighted_example_norm * + example_weight * (label - current_dual - expx); + const double denominator = + 1 + num_loss_partitions * weighted_example_norm * example_weight * expx; + return x - numerator / denominator; + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d.h b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d.h new file mode 100644 index 00000000..c0a589ff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d.h @@ -0,0 +1,80 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_ +#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +enum PoolingType { MAX, AVG }; + +template +struct LaunchPoolingOp; + +template +struct LaunchAvgPooling3dGradOp; + +template +struct LaunchMaxPooling3dGradOp; + +template +struct LaunchMaxPooling3dGradGradOp; + +// A helper class to manage sizes and shapes for 3d pooling operations. +struct Pool3dParameters { + // Updates context->status if there is an invalid input. + Pool3dParameters(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, + TensorFormat data_format, + const TensorShape& tensor_in_shape); + + // Returns the shape of the output for "forward" pooling operations. + absl::Status forward_output_shape(TensorShape* shape); + + int depth; + + int tensor_in_planes; + int tensor_in_cols; + int tensor_in_rows; + int tensor_in_batch; + + int window_planes; + int window_cols; + int window_rows; + int depth_window; + + int plane_stride; + int col_stride; + int row_stride; + int depth_stride; + + int64_t out_plane; + int64_t out_height; + int64_t out_width; + + int64_t pad_planes; + int64_t pad_cols; + int64_t pad_rows; + + TensorFormat data_format; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d_gpu.h new file mode 100644 index 00000000..002964a3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_3d_gpu.h @@ -0,0 +1,48 @@ +/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_ + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +namespace functor { +template +struct MaxPool3dGradBackward { + bool operator()(TensorFormat data_format, const T* bottom_data, + const T* output_data, const int batch, const int pooled_plane, + const int pooled_height, const int pooled_width, + const int channels, const int plane, const int height, + const int width, const int kernel_p, const int kernel_h, + const int kernel_w, const int stride_p, const int stride_h, + const int stride_w, const int pad_p, const int pad_t, + const int pad_l, const T* top_diff, T* bottom_diff, + const Eigen::GpuDevice& d); +}; +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common.h new file mode 100644 index 00000000..bb5dda56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common.h @@ -0,0 +1,681 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_ + +#include + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/avgpooling_op.h" +#include "tensorflow/core/kernels/maxpooling_op.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/util/work_sharder.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// A helper class to manage sizes and shapes for pooling operations. +struct PoolParameters { + // Updates context->status if there is an invalid input. + // explicit_paddings has eight elements if padding==EXPLIICT, and zero + // elements otherwise. + PoolParameters(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, + std::vector explicit_paddings, + TensorFormat data_format, const TensorShape& tensor_in_shape); + + // Returns the shape of the output for "forward" pooling operations. + absl::Status forward_output_shape(TensorShape* shape); + + int depth; + + int tensor_in_cols; + int tensor_in_rows; + int tensor_in_batch; + + int window_rows; + int window_cols; + int depth_window; + + int row_stride; + int col_stride; + int depth_stride; + + int64_t out_height; + int64_t out_width; + int out_depth; + + int64_t pad_top; + int64_t pad_bottom; + int64_t pad_left; + int64_t pad_right; + + int pad_depth; + + TensorFormat data_format; +}; + +// An implementation of MaxPooling (forward). +// TODO (yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op, +// QuantizedMaxPoolingOp depends on MaxPoolingOp so keep intact for now +template +class MaxPoolingOp : public OpKernel { + public: + explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) { + string data_format; + auto status = context->GetAttr("data_format", &data_format); + if (status.ok()) { + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ", + "on device type ", + DeviceTypeString(context->device_type()))); + } else { + data_format_ = FORMAT_NHWC; + } + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES( + context, + ksize_[0] > 0 && ksize_[1] > 0 && ksize_[2] > 0 && ksize_[3] > 0, + errors::InvalidArgument( + absl::StrCat("Sliding window ksize must be positive. 
The " + "specified or inferred ksize is: ", + absl::StrJoin(ksize_, ",")))); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + if (padding_ == Padding::EXPLICIT) { + OP_REQUIRES_OK( + context, context->GetAttr("explicit_paddings", &explicit_paddings_)); + } + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + PoolParameters params{ + context, ksize_, stride_, padding_, explicit_paddings_, + FORMAT_NHWC, tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + Tensor* output = nullptr; + TensorShape params_forward_output_shape; + OP_REQUIRES_OK(context, + params.forward_output_shape(¶ms_forward_output_shape)); + OP_REQUIRES_OK(context, context->allocate_output( + 0, params_forward_output_shape, &output)); + + if (params.depth_window > 1) { + // Validate spec against the current implementation. A + // relaxation of these requirements would be ideal. + OP_REQUIRES(context, params.depth % params.depth_window == 0, + errors::Unimplemented( + "Depthwise max pooling requires " + "the depth window to evenly divide the input depth.")); + OP_REQUIRES( + context, params.depth_window == params.depth_stride, + errors::Unimplemented("Depthwise max pooling requires " + "the depth window to equal the depth stride.")); + OP_REQUIRES( + context, padding_ != EXPLICIT, + errors::Unimplemented("Depthwise max pooling does not support " + "explicit padding.")); + + DepthwiseMaxPool(context, output, tensor_in, params); + } else { + // MaxPoolingOp is only called on the GPU when the eigen_tensor label + // is used. In this case, explicit padding is not supported + if (std::is_same::value && + padding_ == Padding::EXPLICIT) { + context->SetStatus(errors::Unimplemented( + "MaxPoolingOp does not support explicit padding.")); + return; + } + SpatialMaxPool(context, output, tensor_in, params, padding_); + } + } + + private: + // Single-threaded implementation of DepthwiseMaxPool which + // does not handle all of the same options as SpatialMaxPool + // (strict assumptions on no padding, stride). + // + // TODO(vrv): implement a more general depthwise-max pool that works + // on GPU as well. + void DepthwiseMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params) { + Eigen::Map> + in_by_pool(tensor_in.flat().data(), params.depth_window, + tensor_in.NumElements() / params.depth_window); + Eigen::Map> out_by_pool( + output->flat().data(), 1, output->NumElements()); + out_by_pool = in_by_pool.colwise().maxCoeff(); + } + + void SpatialMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params, + const Padding& padding) { + if (output->NumElements() == 0) { + return; + } + // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an + // EigenMatrix version that is currently faster than Eigen's + // Spatial MaxPooling implementation. + // + // TODO(vrv): Remove this once we no longer need it. 
+ if (std::is_same::value) { + Eigen::PaddingType pt = BrainPadding2EigenPadding(padding); + functor::SpatialMaxPooling()( + context->eigen_device(), output->tensor(), + tensor_in.tensor(), params.window_rows, params.window_cols, + params.row_stride, params.col_stride, pt); + } else { + typedef Eigen::Map> + ConstEigenMatrixMap; + typedef Eigen::Map> + EigenMatrixMap; + + ConstEigenMatrixMap in_mat(tensor_in.flat().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * + params.tensor_in_batch); + EigenMatrixMap out_mat( + output->flat().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // max value. + auto shard = [¶ms, &in_mat, &out_mat](int64_t start, int64_t limit) { + const int32_t in_rows = params.tensor_in_rows; + const int32_t in_cols = params.tensor_in_cols; + const int32_t pad_top = params.pad_top; + const int32_t pad_left = params.pad_left; + const int32_t window_rows = params.window_rows; + const int32_t window_cols = params.window_cols; + const int32_t row_stride = params.row_stride; + const int32_t col_stride = params.col_stride; + const int32_t out_height = params.out_height; + const int32_t out_width = params.out_width; + + { + // Initializes the output tensor with MIN. + const int32_t output_image_size = + out_height * out_width * params.depth; + EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, + 1, (limit - start) * output_image_size); + out_shard.setConstant(Eigen::NumTraits::lowest()); + } + + for (int32_t b = start; b < limit; ++b) { + const int32_t out_offset_batch = b * out_height; + for (int32_t h = 0; h < in_rows; ++h) { + for (int32_t w = 0; w < in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int32_t hpad = h + pad_top; + const int32_t wpad = w + pad_left; + const int32_t h_start = + (hpad < window_rows) ? 0 + : (hpad - window_rows) / row_stride + 1; + const int32_t h_end = std::min(hpad / row_stride + 1, out_height); + const int32_t w_start = + (wpad < window_cols) ? 0 + : (wpad - window_cols) / col_stride + 1; + const int32_t w_end = std::min(wpad / col_stride + 1, out_width); + // compute elementwise max + const int32_t in_offset = (b * in_rows + h) * in_cols + w; + for (int32_t ph = h_start; ph < h_end; ++ph) { + const int32_t out_offset_base = + (out_offset_batch + ph) * out_width; + for (int32_t pw = w_start; pw < w_end; ++pw) { + const int32_t out_offset = out_offset_base + pw; + out_mat.col(out_offset) = + out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset)); + } + } + } + } + } + }; + + // TODO(andydavis) Consider sharding across batch x rows x cols. + // TODO(andydavis) Consider a higher resolution shard cost model. 
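+ // Worked example for the (h_start, h_end) inversion in the shard lambda
+ // above (illustrative values): with window_rows == 3, row_stride == 2,
+ // pad_top == 0 and out_height >= 3, input row h == 5 gives hpad == 5,
+ // h_start == (5 - 3) / 2 + 1 == 2 and h_end == min(5 / 2 + 1, out_height)
+ // == 3, so only output row 2 (covering input rows 4..6) gets cwiseMax-ed
+ // with this input column.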
+ const int64_t shard_cost = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + Shard(worker_threads.num_threads, worker_threads.workers, + params.tensor_in_batch, shard_cost, shard); + } + } + + std::vector ksize_; + std::vector stride_; + Padding padding_; + std::vector explicit_paddings_; + TensorFormat data_format_; +}; + +template +struct LaunchMaxPoolingNoMask_NCHW_VECT_C; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template <> +struct LaunchMaxPoolingNoMask_NCHW_VECT_C { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& input, Tensor* output) { +#if GOOGLE_CUDA + bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()( + reinterpret_cast(input.flat().data()), + params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, + params.depth, params.out_height, params.out_width, params.window_rows, + params.window_cols, params.row_stride, params.col_stride, + params.pad_top, params.pad_left, + reinterpret_cast(output->flat().data()), + context->eigen_gpu_device()); + if (!status) { + context->SetStatus(errors::Internal( + "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C")); + } +#else + // ROCm TODO: add support __vmaxs4 on ROCm + context->SetStatus(errors::Internal( + "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C")); +#endif // GOOGLE_CUDA + } +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +class MaxPoolingV2Op : public OpKernel { + public: + explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) { + string data_format; + auto status = context->GetAttr("data_format", &data_format); + if (status.ok()) { + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + context, + data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C, + errors::InvalidArgument( + "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. 
Got: ", + data_format)); + } else { + data_format_ = FORMAT_NHWC; + } + if (context->num_inputs() == 1) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES( + context, + ksize_[0] > 0 && ksize_[1] > 0 && ksize_[2] > 0 && ksize_[3] > 0, + errors::InvalidArgument("Sliding window ksize must be positive.")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + + std::vector ksize = ksize_; + std::vector stride = stride_; + + if (context->num_inputs() != 1) { + const Tensor& tensor_ksize = context->input(1); + auto value_ksize = tensor_ksize.flat(); + ksize.resize(tensor_ksize.shape().num_elements()); + std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); + + const Tensor& tensor_stride = context->input(2); + auto value_stride = tensor_stride.flat(); + stride.resize(tensor_stride.shape().num_elements()); + std::copy_n(&value_stride(0), stride.size(), stride.begin()); + } + + OP_REQUIRES(context, ksize.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES( + context, ksize[0] > 0 && ksize[1] > 0 && ksize[2] > 0 && ksize[3] > 0, + errors::InvalidArgument("Sliding window ksize must be positive.")); + OP_REQUIRES(context, stride.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + + PoolParameters params{ + context, + ksize, + stride, + padding_, + /*explicit_paddings=*/{}, + data_format_, + tensor_in.shape(), + }; + if (!context->status().ok()) { + return; + } + + Tensor* output = nullptr; + TensorShape params_forward_output_shape; + OP_REQUIRES_OK(context, + params.forward_output_shape(¶ms_forward_output_shape)); + OP_REQUIRES_OK(context, context->allocate_output( + 0, params_forward_output_shape, &output)); + + if (params.depth_window > 1) { + // Validate spec against the current implementation. A + // relaxation of these requirements would be ideal. + OP_REQUIRES(context, params.depth % params.depth_window == 0, + errors::Unimplemented( + "Depthwise max pooling requires " + "the depth window to evenly divide the input depth.")); + OP_REQUIRES( + context, params.depth_window == params.depth_stride, + errors::Unimplemented("Depthwise max pooling requires " + "the depth window to equal the depth stride.")); + + DepthwiseMaxPool(context, output, tensor_in, params); + } else { + SpatialMaxPool(context, output, tensor_in, params, padding_); + } + } + + private: + // Single-threaded implementation of DepthwiseMaxPool which + // does not handle all of the same options as SpatialMaxPool + // (strict assumptions on no padding, stride). + // + // TODO(vrv): implement a more general depthwise-max pool that works + // on GPU as well. 
+ void DepthwiseMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params) { + Eigen::Map> + in_by_pool(tensor_in.flat().data(), params.depth_window, + tensor_in.NumElements() / params.depth_window); + Eigen::Map> out_by_pool( + output->flat().data(), 1, output->NumElements()); + out_by_pool = in_by_pool.colwise().maxCoeff(); + } + + void SpatialMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params, + const Padding& padding) { + if (output->NumElements() == 0) { + return; + } + // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an + // EigenMatrix version that is currently faster than Eigen's + // Spatial MaxPooling implementation. + // + // TODO(vrv): Remove this once we no longer need it. +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (std::is_same::value) { + Eigen::PaddingType pt = BrainPadding2EigenPadding(padding); + if (std::is_same::value) { + LaunchMaxPoolingNoMask_NCHW_VECT_C::launch( + context, params, tensor_in, output); + } else { + functor::SpatialMaxPooling()( + context->eigen_device(), output->tensor(), + tensor_in.tensor(), params.window_rows, params.window_cols, + params.row_stride, params.col_stride, pt); + } + } else +#endif + { + typedef Eigen::Map> + ConstEigenMatrixMap; + typedef Eigen::Map> + EigenMatrixMap; + + ConstEigenMatrixMap in_mat(tensor_in.flat().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * + params.tensor_in_batch); + EigenMatrixMap out_mat( + output->flat().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // max value. + auto shard = [¶ms, &in_mat, &out_mat](int64_t start, int64_t limit) { + const int32_t in_rows = params.tensor_in_rows; + const int32_t in_cols = params.tensor_in_cols; + const int32_t pad_top = params.pad_top; + const int32_t pad_left = params.pad_left; + const int32_t window_rows = params.window_rows; + const int32_t window_cols = params.window_cols; + const int32_t row_stride = params.row_stride; + const int32_t col_stride = params.col_stride; + const int32_t out_height = params.out_height; + const int32_t out_width = params.out_width; + + { + // Initializes the output tensor with MIN. + const int32_t output_image_size = + out_height * out_width * params.depth; + EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, + 1, (limit - start) * output_image_size); + out_shard.setConstant(Eigen::NumTraits::lowest()); + } + + for (int32_t b = start; b < limit; ++b) { + const int32_t out_offset_batch = b * out_height; + for (int32_t h = 0; h < in_rows; ++h) { + for (int32_t w = 0; w < in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int32_t hpad = h + pad_top; + const int32_t wpad = w + pad_left; + const int32_t h_start = + (hpad < window_rows) ? 
0 + : (hpad - window_rows) / row_stride + 1; + const int32_t h_end = std::min(hpad / row_stride + 1, out_height); + const int32_t w_start = + (wpad < window_cols) ? 0 + : (wpad - window_cols) / col_stride + 1; + const int32_t w_end = std::min(wpad / col_stride + 1, out_width); + // compute elementwise max + const int32_t in_offset = (b * in_rows + h) * in_cols + w; + for (int32_t ph = h_start; ph < h_end; ++ph) { + const int32_t out_offset_base = + (out_offset_batch + ph) * out_width; + for (int32_t pw = w_start; pw < w_end; ++pw) { + const int32_t out_offset = out_offset_base + pw; + out_mat.col(out_offset) = + out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset)); + } + } + } + } + } + }; + + // TODO(andydavis) Consider sharding across batch x rows x cols. + // TODO(andydavis) Consider a higher resolution shard cost model. + const int64_t shard_cost = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + Shard(worker_threads.num_threads, worker_threads.workers, + params.tensor_in_batch, shard_cost, shard); + } + } + + std::vector ksize_; + std::vector stride_; + Padding padding_; + TensorFormat data_format_; +}; + +template +void SpatialAvgPool(OpKernelContext* context, Tensor* output, + const Tensor& input, const PoolParameters& params, + const Padding& padding) { + if (output->NumElements() == 0) { + return; + } + typedef Eigen::Map> + ConstEigenMatrixMap; + typedef Eigen::Map> + EigenMatrixMap; + + auto in_flat = input.flat(); + auto out_flat = output->flat(); + + auto shard = [¶ms, &in_flat, &out_flat](int64_t start, int64_t limit) { + // Calculate indices for this shards chunk of work. + const int64_t input_image_size = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + const int64_t output_image_size = + params.out_width * params.out_height * params.depth; + const int64_t shard_batch_size = limit - start; + + ConstEigenMatrixMap in_mat( + in_flat.data() + start * input_image_size, params.depth, + params.tensor_in_cols * params.tensor_in_rows * shard_batch_size); + EigenMatrixMap out_mat( + out_flat.data() + start * output_image_size, params.depth, + params.out_width * params.out_height * shard_batch_size); + Eigen::Matrix out_count(out_mat.cols()); + out_count.setZero(); + + // Initializes output to zero. + out_mat.setZero(); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // average value. + for (int b = 0; b < shard_batch_size; ++b) { + for (int h = 0; h < params.tensor_in_rows; ++h) { + for (int w = 0; w < params.tensor_in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + params.pad_top; + const int wpad = w + params.pad_left; + const int h_start = + (hpad < params.window_rows) + ? 0 + : (hpad - params.window_rows) / params.row_stride + 1; + const int h_end = + std::min(hpad / params.row_stride + 1, params.out_height); + const int w_start = + (wpad < params.window_cols) + ? 
0 + : (wpad - params.window_cols) / params.col_stride + 1; + const int w_end = + std::min(wpad / params.col_stride + 1, params.out_width); + const int in_offset = + (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; + Eigen::DSizes in_indices(0, in_offset); + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_offset = + (b * params.out_height + ph) * params.out_width + pw; + out_mat.col(out_offset) += in_mat.col(in_offset); + out_count(out_offset) += T(1); + } + } + } + } + } + + DCHECK_GT(out_count.minCoeff(), T(0)); + out_mat.array().rowwise() /= out_count.transpose().array(); + }; + + const int64_t work_unit_size = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + // NOTE: Constants in calculation below were estimated based on benchmarking. + // Nanoseconds/work_unit for benchmarks ranged from 0.01 to 0.001, and + // so the factor 0.01 (i.e. 1/100) with a max of 10000, was chosen to limit + // the work unit cost to an operating range in which it empirically performed + // best. + const int64_t work_unit_cost = std::max(int64_t{10000}, work_unit_size / 100); + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, + params.tensor_in_batch, work_unit_cost, shard); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common_gpu.h new file mode 100644 index 00000000..c5d51e59 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/pooling_ops_common_gpu.h @@ -0,0 +1,70 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ + +#include +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/avgpooling_op.h" +#include "tensorflow/core/kernels/maxpooling_op.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +// A helper class that launch the cudnn pooling forward operations. 
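+// A typical call site, in sketch form (variable names here are illustrative,
+// and kMaximum is the StreamExecutor pooling mode assumed for max pooling):
+//   DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+//                            ksize, stride, padding, explicit_paddings,
+//                            data_format, tensor_in, out_shape,
+//                            propagate_nans);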
+template +class DnnPoolingOp { + public: + typedef GPUDevice Device; + static void Compute(OpKernelContext* context, + se::dnn::PoolingMode pooling_mode, + const std::vector& size, + const std::vector& stride, Padding padding, + std::vector explicit_paddings, + TensorFormat data_format, const Tensor& tensor_in, + const TensorShape& tensor_out_shape, bool propagate_nans); +}; + +// A helper class that launch the cudnn pooling backward operations. +// The original input and output tensors are optional for AvgPoolGrad, but +// mandatory for MaxPoolGrad. +template +class DnnPoolingGradOp { + public: + typedef GPUDevice Device; + static void Compute(OpKernelContext* context, + se::dnn::PoolingMode pooling_mode, + const std::vector& size, + const std::vector& stride, Padding padding, + std::vector explicit_paddings, + TensorFormat data_format, const Tensor* tensor_in, + const Tensor* tensor_out, const Tensor& out_backprop, + const TensorShape& tensor_in_shape, bool propagate_nans); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/population_count_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/population_count_op.h new file mode 100644 index 00000000..2c981296 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/population_count_op.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct PopulationCount { + void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, + TTypes::Flat output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/priority_queue.h b/third_party/tflite-hdrs/tensorflow/core/kernels/priority_queue.h new file mode 100644 index 00000000..f7ca800a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/priority_queue.h @@ -0,0 +1,95 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_ +#define TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/typed_queue.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +using PriorityTensorPair = std::pair; + +struct ComparePriorityTensorPair { + // 0 is a higher priority than 1, -MAX_LONG is a higher priority + // than MAX_LONG, etc. Values coming in with a smaller + // priority number will bubble to the front of the queue. + bool operator()(const PriorityTensorPair& lhs, + const PriorityTensorPair& rhs) const { + return lhs.first > rhs.first; + } +}; + +class PriorityQueue + : public TypedQueue, + ComparePriorityTensorPair> > { + public: + PriorityQueue(int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, + const string& name); + + absl::Status Initialize() + override; // Must be called before any other method. + + // Implementations of QueueInterface methods -------------------------------- + + void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override; + void TryDequeueMany(int num_elements, OpKernelContext* ctx, + bool allow_small_batch, + CallbackWithTuple callback) override; + absl::Status MatchesNodeDef(const NodeDef& node_def) override; + absl::Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const; + absl::Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const; + + int32 size() const override { + mutex_lock lock(mu_); + return queues_[0].size(); + } + + private: + ~PriorityQueue() override {} + + // Helper for dequeuing a single element from queues_. + void DequeueLocked(OpKernelContext* ctx, Tuple* tuple) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + static absl::Status GetElementComponentFromBatch(const Tuple& tuple, + int index, int component, + OpKernelContext* ctx, + Tensor* out_element); + + PriorityQueue(const PriorityQueue&) = delete; + void operator=(const PriorityQueue&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/quantization_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/quantization_utils.h new file mode 100644 index 00000000..88bee911 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/quantization_utils.h @@ -0,0 +1,968 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_ + +#include +#define EIGEN_USE_THREADS + +// This is a set of functions that standardizes how quantized values are +// interpreted as float numbers. +// All of the current implementations are for reference and have not been +// optimized. They should be implementable using fixed point representations +// to avoid a dependency on floating-point hardware. + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define QUANTIZATION_UTILS_USE_NEON +#include +#endif + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK +#include "public/gemmlowp.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/threadpool.h" + +namespace tensorflow { + +// We have to be able to detect and handle overflows in int32, so this function +// uses doubles and int64's to make sure we have enough room. +template +inline int64_t FloatToQuantizedUnclamped(float input, float range_min, + float range_max) { + const int64_t lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + if (range_min == range_max) { + return lowest_quantized; + } + const int number_of_bits = sizeof(T) * 8; + const int64_t number_of_steps = static_cast(1) << number_of_bits; + const double range_adjust = (number_of_steps / (number_of_steps - 1.0)); + const double range = ((range_max - range_min) * range_adjust); + const double range_scale = (number_of_steps / range); + int64_t quantized = + (round(input * range_scale) - round(range_min * range_scale)); + quantized += lowest_quantized; + return quantized; +} + +template <> +inline int64_t FloatToQuantizedUnclamped(float input, float range_min, + float range_max) { + return -1; +} + +// This converts the float into the final quantized type, clamping/saturating +// any over or underflows. +template +T FloatToQuantized(float input, float range_min, float range_max) { + if (std::is_same::value) { + // Specialization for float. This is used in reference implementation + // for float which is useful to compare performance between float + // and quantized type. + return input; + } + int64_t quantized = FloatToQuantizedUnclamped(input, range_min, range_max); + const int64_t lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + const int64_t highest_quantized = + static_cast(Eigen::NumTraits::highest()); + quantized = std::max(quantized, lowest_quantized); + quantized = std::min(quantized, highest_quantized); + return static_cast(static_cast(quantized)); +} + +template +float QuantizedToFloat(T input, float range_min, float range_max) { + if (std::is_same::value) { + // Specialization for float. This is used in reference implementation + // for float which is useful to compare performance between float + // and quantized type. 
+ return input; + } + if (range_min == range_max) { + return range_min; + } + const int number_of_bits = sizeof(T) * 8; + const int64_t number_of_steps = static_cast(1) << number_of_bits; + const double range_adjust = (number_of_steps / (number_of_steps - 1.0)); + const double range = ((range_max - range_min) * range_adjust); + const double range_scale = (range / number_of_steps); + const int64_t lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + const double offset_input = static_cast(input) - lowest_quantized; + // For compatibility with DEQUANTIZE_WITH_EIGEN, we should convert + // range_scale to a float, otherwise range_min_rounded might be slightly + // different. + const double range_min_rounded = + std::round(range_min / static_cast(range_scale)) * + static_cast(range_scale); + const double result = range_min_rounded + (offset_input * range_scale); + return static_cast(result); +} + +template +float FloatForOneQuantizedLevel(float range_min, float range_max) { + const int64_t highest = static_cast(Eigen::NumTraits::highest()); + const int64_t lowest = static_cast(Eigen::NumTraits::lowest()); + const float float_for_one_quantized_level = + (range_max - range_min) / (highest - lowest); + return float_for_one_quantized_level; +} + +template +void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b, + float max_b, float* min_c, + float* max_c) { + const float a_float_for_one_quant_level = + FloatForOneQuantizedLevel(min_a, max_a); + const float b_float_for_one_quant_level = + FloatForOneQuantizedLevel(min_b, max_b); + + const int64_t c_highest = + static_cast(Eigen::NumTraits::highest()); + const int64_t c_lowest = static_cast(Eigen::NumTraits::lowest()); + const float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + + *min_c = c_float_for_one_quant_level * c_lowest; + *max_c = c_float_for_one_quant_level * c_highest; +} + +// input_array is an eigen Tensor. q2f is a QuantizedToFloatStruct. +// This evaluates to an eigen tensor expression, to be used like: +// auto tensor = DEQUANTIZE_WITH_EIGEN(input_tensor, q2f); +#define DEQUANTIZE_WITH_EIGEN(input_array, q2f) \ + ((q2f.range_min_rounded - q2f.lowest_quantized() * q2f.range_scale) + \ + input_array.template cast() * q2f.range_scale) + +// input_array is an eigen Tensor. f2q is a FloatToQuantizedStruct. +// OutputType is the type of output (e.g. quint8). +// This evaluates to an eigen tensor expression, to be used like: +// auto tensor = QUANTIZE_WITH_EIGEN(input_tensor, f2q, T); +#define QUANTIZE_WITH_EIGEN(input_array, f2q, OutputType) \ + ((input_array * f2q.range_scale).round() - \ + (f2q.range_min_scaled - f2q.lowest_quantized())) \ + .cwiseMax(f2q.lower_bound_float()) \ + .cwiseMin(f2q.upper_bound_float()) \ + .template cast() \ + .template cast() + +// For use with DEQUANTIZE_WITH_EIGEN. +template +struct QuantizedToFloatStruct { + static constexpr int number_of_bits = sizeof(T) * 8; + static constexpr int64_t number_of_steps = static_cast(1) + << number_of_bits; + + static float lowest_quantized() { + return static_cast(Eigen::NumTraits::lowest()); + } + + QuantizedToFloatStruct(float range_min, float range_max) + : range_min(range_min), + range_scale((range_max - range_min) / (number_of_steps - 1.0)), + range_min_rounded(range_max == range_min + ? range_min + : std::round(range_min / range_scale) * + range_scale) {} + + const float range_min; + const float range_scale; + const float range_min_rounded; +}; + +// For use with QUANTIZE_WITH_EIGEN. 
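+// Worked example of the mapping implemented above and encoded by the structs
+// below (range chosen for illustration): quantizing to quint8 over
+// [-1.0f, 1.0f] gives range_scale == 255 / 2 == 127.5, so
+// FloatToQuantized<quint8>(0.0f, -1.0f, 1.0f) == round(0) - round(-127.5) ==
+// 128, FloatToQuantized<quint8>(-1.0f, -1.0f, 1.0f) == 0, and
+// FloatToQuantized<quint8>(1.0f, -1.0f, 1.0f) saturates to 255; in the other
+// direction, QuantizedToFloat<quint8>(128, -1.0f, 1.0f) is approximately 0.0f.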
+template +struct FloatToQuantizedStruct { + static constexpr int number_of_bits = sizeof(T) * 8; + static constexpr int64_t number_of_steps = static_cast(1) + << number_of_bits; + static constexpr double range_adjust = + (number_of_steps / (number_of_steps - 1.0)); + + // Casting QInt32's lowest or highest to a float gives a float that can't be + // cast back to int32 or QInt32. Instead, use bounds that can be converted + // back to int32 without going outside the range of an int32. + static float lower_bound_float() { + return Eigen::numext::maxi( + static_cast(Eigen::NumTraits::lowest()), -2.147483648e+09f); + } + static float upper_bound_float() { + return Eigen::numext::mini( + static_cast(Eigen::NumTraits::highest()), +2.147483520e+09f); + } + + static float lowest_quantized() { + return static_cast(Eigen::NumTraits::lowest()); + } + + FloatToQuantizedStruct(float range_min, float range_max) + : range_min(range_min), + range_scale(range_max == range_min + ? 0.0 + : (number_of_steps - 1.0) / (range_max - range_min)), + range_min_scaled(std::round(range_min * range_scale)) {} + + const float range_min; + const float range_scale; + const float range_min_scaled; +}; + +template +inline T2 RequantizeInNewRange(T1 input, float min_input, float max_input, + float min_new, float max_new) { + const float input_float = QuantizedToFloat(input, min_input, max_input); + return FloatToQuantized(input_float, min_new, max_new); +} + +template +inline void RequantizeManyInNewRange(const T1* input, int64_t count, + float min_input, float max_input, + float min_output, float max_output, + T2* output) { + for (size_t index = 0; index < count; ++index) { + const float input_float = + QuantizedToFloat(input[index], min_input, max_input); + output[index] = FloatToQuantized(input_float, min_output, max_output); + } +} + +// Because converting 32-bit accumulated results down to eight bit is a common +// case, we have a specialized code path to handle it as efficiently as +// possible using only fixed-point math for the inner loop. +inline void RequantizeManyInNewRangeReference(const qint32* input, + int64_t count, float min_input, + float max_input, float min_output, + float max_output, + quint8* output) { + // Initially we calculate all the constants we need once, before we go into + // the inner loop. If this is updated, also update the Eigen version. + const int fp_shift = 16; + const float input_range = max_input - min_input; + const float output_range = max_output - min_output; + const float recip_output_range = + output_range == 0.0 ? 0.0 : (255.0 / output_range); + const float input_rezero = (min_input + max_input) / 2.0; + const int64_t range_scale_fp = + output_range == 0.0 ? 0.0 + : static_cast(255.0 * (1 << fp_shift) * + input_range / output_range); + const int64_t input_offset_fp = + static_cast(input_rezero * recip_output_range * (1 << fp_shift)); + const int64_t output_offset_fp = + output_range == 0.0 + ? 0 + : std::lround((1 << fp_shift) * (min_output * 255.0) / output_range); + const int64_t rounding_delta = 1 << (fp_shift - 1); + + // Inside this loop we just do minimal adds, multiplies, and shifts, in a way + // that could be easily adapted for a SIMD implementation. It should also be + // possible to perform all the calculations in 32-bit rather than 64, but + // that's not been implemented yet. 
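+  // In rough terms: a qint32 code x stands for the float value
+  // x * input_range / 2^32 + input_rezero, and the target quint8 code is
+  // round((float_value - min_output) * 255 / output_range).  The loop below
+  // evaluates that expression with 16 fractional bits: (x * range_scale_fp)
+  // >> 32 supplies the x-dependent term, input_offset_fp and output_offset_fp
+  // fold in the two constant terms, and adding rounding_delta before the
+  // final >> fp_shift performs the rounding.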
+ for (int64_t index = 0; index < count; ++index) { + const int64_t input_value = static_cast(input[index]); + const int64_t fp_value = + ((input_value * range_scale_fp) >> 32) + input_offset_fp; + const int64_t offset_intermediate = fp_value - output_offset_fp; + const int64_t round_intermediate = offset_intermediate + rounding_delta; + int64_t quantized_int64 = round_intermediate >> fp_shift; + quantized_int64 = std::max(quantized_int64, int64_t{0}); + quantized_int64 = std::min(quantized_int64, int64_t{255}); + output[index] = static_cast(static_cast(quantized_int64)); + } +} + +// Another common case is converting eight bit inputs up to thirty two bits, so +// we have specialized fixed-point code to accelerate that. There is also a NEON +// version for ARM devices below. +inline void RequantizeManyInNewRange8To32BitReference( + const quint8* input, int64_t count, float min_input, float max_input, + float min_output, float max_output, qint32* output) { + const float code_0_float = QuantizedToFloat(0, min_input, max_input); + const float code_1_float = QuantizedToFloat(1, min_input, max_input); + const int64_t code_0_int64 = + FloatToQuantizedUnclamped(code_0_float, min_output, max_output); + const int64_t code_1_int64 = + FloatToQuantizedUnclamped(code_1_float, min_output, max_output); + const int32_t mult_int32 = code_1_int64 - code_0_int64; + const int64_t lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + const int64_t highest_quantized = + static_cast(Eigen::NumTraits::highest()); + for (int64_t i = 0; i < count; ++i) { + const int64_t input_value = static_cast(input[i]); + int64_t output_value = code_0_int64 + (input_value * mult_int32); + output_value = std::max(output_value, lowest_quantized); + output_value = std::min(output_value, highest_quantized); + output[i] = static_cast(output_value); + } +} + +#ifdef QUANTIZATION_UTILS_USE_NEON +// Speeds up the 32->8bit conversion using fixed-point arithmetic and NEON SIMD +// intrinsics for ARM platforms. +inline void RequantizeManyInNewRangeNeon(const qint32* input, int64 count, + float min_input, float max_input, + float min_output, float max_output, + quint8* output) { + // Initially we calculate all the constants we need once, before we go into + // the inner loop. If this is updated, also update the Eigen version. + const int fp_shift = 16; + + // Calculate range variables in advance. + // Input range. + const float input_range = max_input - min_input; + // Output range. + const float output_range = max_output - min_output; + // Ratio of output range. + const float recip_output_range = + output_range == 0.0 ? 0.0 : (255.0 / output_range); + // Average of input range as zero position of input. + const float input_rezero = (min_input + max_input) / 2.0; + // In-out range scale. + const int32 range_scale_fp = + output_range == 0.0 ? 0.0 + : static_cast(255.0 * (1 << (fp_shift - 16)) * + input_range / output_range); + // Input zero position offset to output. + const int32 input_offset_fp = + static_cast(input_rezero * recip_output_range * (1 << fp_shift)); + // Output min offset. + const int32 output_offset_fp = + output_range == 0.0 + ? 
0 + : static_cast((1 << fp_shift) * (min_output * 255.0) / + output_range); + const int32 rounding_delta = 1 << (fp_shift - 1); + + // broadcast range to each lane + const int32x4_t range_scale_fp_32x4 = vmovq_n_s32(range_scale_fp); + const int32x4_t input_offset_fp_32x4 = vmovq_n_s32(input_offset_fp); + const int32x4_t output_offset_fp_32x4 = vmovq_n_s32(output_offset_fp); + const int32x4_t rounding_delta_32x4 = vmovq_n_s32(rounding_delta); + + int64 index = 0; + // Use SIMD to requantize. + for (; index < (count - 7); index += 8) { + const int32* input_ptr = &(input->value) + index; + const int32x4_t input_value_low_32x4 = vld1q_s32(input_ptr); + const int32x4_t input_value_high_32x4 = vld1q_s32(input_ptr + 4); + const int32x4_t fp_value_low_32x4 = vaddq_s32( + input_offset_fp_32x4, + vmulq_s32(vshrq_n_s32(input_value_low_32x4, 16), range_scale_fp_32x4)); + const int32x4_t fp_value_high_32x4 = vaddq_s32( + input_offset_fp_32x4, + vmulq_s32(vshrq_n_s32(input_value_high_32x4, 16), range_scale_fp_32x4)); + const int32x4_t offset_intermediate_low_32x4 = + vsubq_s32(fp_value_low_32x4, output_offset_fp_32x4); + const int32x4_t offset_intermediate_high_32x4 = + vsubq_s32(fp_value_high_32x4, output_offset_fp_32x4); + const int32x4_t round_intermediate_low_32x4 = + vaddq_s32(offset_intermediate_low_32x4, rounding_delta_32x4); + const int32x4_t round_intermediate_high_32x4 = + vaddq_s32(offset_intermediate_high_32x4, rounding_delta_32x4); + const int16x4_t quantized_low_16x4 = + vqmovn_s32(vshrq_n_s32(round_intermediate_low_32x4, fp_shift)); + const int16x4_t quantized_high_16x4 = + vqmovn_s32(vshrq_n_s32(round_intermediate_high_32x4, fp_shift)); + const uint8x8_t quantized_8x8 = + vqmovun_s16(vcombine_s16(quantized_low_16x4, quantized_high_16x4)); + uint8* output_ptr = &(output->value) + index; + vst1_u8(output_ptr, quantized_8x8); + } + + // Requantize remaining elements in array without SIMD. + for (; index < count; ++index) { + const int32 input_value = static_cast(input[index]); + const int32 fp_value = + static_cast( + (static_cast(input_value >> 16) * (range_scale_fp))) + + input_offset_fp; + const int32 offset_intermediate = fp_value - output_offset_fp; + const int32 round_intermediate = offset_intermediate + rounding_delta; + int32 quantized_int32 = round_intermediate >> fp_shift; + quantized_int32 = std::max(quantized_int32, 0); + quantized_int32 = std::min(quantized_int32, 255); + output[index] = static_cast(static_cast(quantized_int32)); + } +} + +template <> +inline void RequantizeManyInNewRange( + const qint32* input, int64 count, float min_input, float max_input, + float min_output, float max_output, quint8* output) { + const float input_range = max_input - min_input; + const float output_range = max_output - min_output; + if ((input_range / output_range) > 16384.0f) { + // Our NEON implementation uses 32-bit math and can't handle very + // large ranges, so fall back to the reference implementation. We don't + // expect these to be common in models, so this shouldn't be a performance + // problem in practice. + RequantizeManyInNewRangeReference(input, count, min_input, max_input, + min_output, max_output, output); + } else { + RequantizeManyInNewRangeNeon(input, count, min_input, max_input, min_output, + max_output, output); + } +} + +// NEON accelerated 16bit rounded division by 2^n. 
+template +inline int16x8_t Divide16x8PowRound(const int16x8_t val) { + const int16x8_t val_sign = vshrq_n_s16(val, 15); + const int16x8_t val_xor = veorq_s16(val, val_sign); + const int16x8_t val_pos = vsubq_s16(val_xor, val_sign); + const int16x8_t shifted_val_pos = vrshrq_n_s16(val_pos, POW); + const int16x8_t shifted_val_pos_xor = veorq_s16(shifted_val_pos, val_sign); + const int16x8_t shifted_val = vsubq_s16(shifted_val_pos_xor, val_sign); + return shifted_val; +} + +// NEON accelerated 64bit rounded division by 2^n. +template +inline int64x2_t Divide64x2PowRound(const int64x2_t val) { + const int64x2_t val_sign = vshrq_n_s64(val, 63); + const int64x2_t val_xor = veorq_s64(val, val_sign); + const int64x2_t val_pos = vsubq_s64(val_xor, val_sign); + const int64x2_t shifted_val_pos = vrshrq_n_s64(val_pos, POW); + const int64x2_t shifted_val_pos_xor = veorq_s64(shifted_val_pos, val_sign); + const int64x2_t shifted_val = vsubq_s64(shifted_val_pos_xor, val_sign); + return shifted_val; +} + +// NEON accelerated 16bit division by 2^n. +// CAVEAT: The input must be greater than min-int16 to avoid underflow. +template +inline int16x8_t Divide16x8Pow(const int16x8_t val) { + static constexpr int16 FIRST_BIT_VAL = 0x0000000000000001; + static const int16x8_t FIRST_BIT = vmovq_n_s16(FIRST_BIT_VAL); + const int16x8_t val_sign = vshrq_n_s16(val, 15); + const int16x8_t neg_offset = vandq_s16(val_sign, FIRST_BIT); + const int16x8_t val_with_offset = vsubq_s16(val, neg_offset); + const int16x8_t shifted_wo_offset = + vsraq_n_s16(neg_offset, val_with_offset, POW); + return shifted_wo_offset; +} + +// NEON accelerated 64bit division by 2^n. +// CAVEAT: The input must be greater than min-int64 to avoid underflow. +template +inline int64x2_t Divide64x2Pow(const int64x2_t val) { + static constexpr int64 FIRST_BIT_VAL = 0x0000000000000001; + static const int64x2_t FIRST_BIT = vmovq_n_s64(FIRST_BIT_VAL); + const int64x2_t val_sign = vshrq_n_s64(val, 63); + const int64x2_t neg_offset = vandq_s64(val_sign, FIRST_BIT); + const int64x2_t val_with_offset = vsubq_s64(val, neg_offset); + const int64x2_t shifted_wo_offset = + vsraq_n_s64(neg_offset, val_with_offset, POW); + return shifted_wo_offset; +} + +// 32bit x 2 NEON accelerated lerp computation. 
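+// Both lerp helpers below evaluate the usual bilinear interpolation
+//   top    = top_left    + (top_right    - top_left)    * x_lerp
+//   bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
+//   result = top + (bottom - top) * y_lerp
+// where x_lerp and y_lerp are RESOLUTION-bit fixed-point fractions, so each
+// multiply is paired with one of the divide-by-2^POW helpers above.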
+template +inline int32x2_t ComputeLerp32x2(const int32x2_t top_left, + const int32x2_t top_right, + const int32x2_t bottom_left, + const int32x2_t bottom_right, + const int32x2_t x_lerp, + const int32x2_t y_lerp) { + static_assert(RESOLUTION < 31, "RESOLUTION must be less than 31"); + constexpr int32 RESOLUTION_MULT32 = (1 << RESOLUTION); + static const int32x2_t RESOLUTION_MULT32x2 = vmov_n_s32(RESOLUTION_MULT32); + + const int64x2_t top_left_x_res = vmull_s32(top_left, RESOLUTION_MULT32x2); + const int64x2_t bottom_left_x_res = + vmull_s32(bottom_left, RESOLUTION_MULT32x2); + + const int32x2_t top_right_sub_top_left = vsub_s32(top_right, top_left); + const int64x2_t top_x_res = + vmlal_s32(top_left_x_res, top_right_sub_top_left, x_lerp); + const int32x2_t bottom_right_sub_bottom_left = + vsub_s32(bottom_right, bottom_left); + const int64x2_t bottom_x_res = + vmlal_s32(bottom_left_x_res, bottom_right_sub_bottom_left, x_lerp); + + const int64x2_t bottom_sub_top_x_res = vsubq_s64(bottom_x_res, top_x_res); + const int64x2_t bottom_sub_top = + Divide64x2Pow(bottom_sub_top_x_res); + const int32x2_t bottom_sub_top_32 = vqmovn_s64(bottom_sub_top); + const int64x2_t top_add_bottom_sub_top_mul_ylerp_x_res = + vmlal_s32(top_x_res, bottom_sub_top_32, y_lerp); + const int64x2_t retval = + Divide64x2PowRound(top_add_bottom_sub_top_mul_ylerp_x_res); + const int32x2_t retval32 = vqmovn_s64(retval); + return retval32; +} + +// 8bit x 8 NEON accelerated lerp computation. +template +inline uint8x8_t ComputeLerp8x8(const uint8x8_t top_left8x8, + const uint8x8_t top_right8x8, + const uint8x8_t bottom_left8x8, + const uint8x8_t bottom_right8x8, + const int16x8_t x_lerp, + const int16x8_t y_lerp) { + static_assert(RESOLUTION < 8, "RESOLUTION must be less than 8"); + constexpr uint8 RESOLUTION_MULT_VAL = (1 << RESOLUTION); + static const uint8x8_t RESOLUTION_MULT = vdup_n_u8(RESOLUTION_MULT_VAL); + + const int16x8_t top_left_x_res = + vreinterpretq_s16_u16(vmull_u8(top_left8x8, RESOLUTION_MULT)); + const int16x8_t bottom_left_x_res = + vreinterpretq_s16_u16(vmull_u8(bottom_left8x8, RESOLUTION_MULT)); + + const int16x8_t top_right_sub_top_left = + vreinterpretq_s16_u16(vsubl_u8(top_right8x8, top_left8x8)); + const int16x8_t top_x_res = + vmlaq_s16(top_left_x_res, top_right_sub_top_left, x_lerp); + + const int16x8_t bottom_right_sub_bottom_left = + vreinterpretq_s16_u16(vsubl_u8(bottom_right8x8, bottom_left8x8)); + const int16x8_t bottom_x_res = + vmlaq_s16(bottom_left_x_res, bottom_right_sub_bottom_left, x_lerp); + + const int16x8_t bottom_sub_top_x_res = vsubq_s16(bottom_x_res, top_x_res); + const int16x8_t bottom_sub_top = + Divide16x8Pow(bottom_sub_top_x_res); + const int16x8_t top_add_bottom_sub_top_mul_ylerp_x_res = + vmlaq_s16(top_x_res, bottom_sub_top, y_lerp); + const int16x8_t retval16 = + Divide16x8PowRound(top_add_bottom_sub_top_mul_ylerp_x_res); + const uint8x8_t retval = vmovn_u16(vreinterpretq_u16_s16(retval16)); + return retval; +} + +// Requantize 8 x 8 quints to 8 x 32 qints in parallel by neon +// Return std::array instead of pointer to leverage return value optimization +inline std::array Requantize8x8To32Neon( + const uint8* input_ptr, const int64x2_t input_0_64x2, + const int32x2_t input_mult_32x2) { + const uint8x8_t input_value_8x8 = vld1_u8(input_ptr); + const int16x8_t input_value_16x8 = + vreinterpretq_s16_u16(vmovl_u8(input_value_8x8)); + const int16x4_t input_value_low_16x4 = vget_low_s16(input_value_16x8); + const int16x4_t input_value_high_16x4 = vget_high_s16(input_value_16x8); + 
const int32x4_t input_value_low_32x4 = vmovl_s16(input_value_low_16x4); + const int32x4_t input_value_high_32x4 = vmovl_s16(input_value_high_16x4); + const int32x2_t input_value_low_low_32x2 = vget_low_s32(input_value_low_32x4); + const int32x2_t input_value_low_high_32x2 = + vget_high_s32(input_value_low_32x4); + const int32x2_t input_value_high_low_32x2 = + vget_low_s32(input_value_high_32x4); + const int32x2_t input_value_high_high_32x2 = + vget_high_s32(input_value_high_32x4); + const int64x2_t mult_result_low_low_64x2 = + vmlal_s32(input_0_64x2, input_value_low_low_32x2, input_mult_32x2); + const int64x2_t mult_result_low_high_64x2 = + vmlal_s32(input_0_64x2, input_value_low_high_32x2, input_mult_32x2); + const int64x2_t mult_result_high_low_64x2 = + vmlal_s32(input_0_64x2, input_value_high_low_32x2, input_mult_32x2); + const int64x2_t mult_result_high_high_64x2 = + vmlal_s32(input_0_64x2, input_value_high_high_32x2, input_mult_32x2); + const int32x2_t output_value_low_low_32x2 = + vqmovn_s64(mult_result_low_low_64x2); + const int32x2_t output_value_low_high_32x2 = + vqmovn_s64(mult_result_low_high_64x2); + const int32x2_t output_value_high_low_32x2 = + vqmovn_s64(mult_result_high_low_64x2); + const int32x2_t output_value_high_high_32x2 = + vqmovn_s64(mult_result_high_high_64x2); + const int32x4_t output_value_low_32x4 = + vcombine_s32(output_value_low_low_32x2, output_value_low_high_32x2); + const int32x4_t output_value_high_32x4 = + vcombine_s32(output_value_high_low_32x2, output_value_high_high_32x2); + return std::array{ + {output_value_low_32x4, output_value_high_32x4}}; +} + +// Speeds up the 8->32bit conversion using fixed-point arithmetic and NEON SIMD +// intrinsics for ARM platforms. +template <> +inline void RequantizeManyInNewRange( + const quint8* input, int64 count, float min_input, float max_input, + float min_output, float max_output, qint32* output) { + // Pre-calculate zero position and multiplier. + // Calculate 0 and 1 value in float. + const float code_0_float = QuantizedToFloat(0, min_input, max_input); + const float code_1_float = QuantizedToFloat(1, min_input, max_input); + + // Cast 0 and 1 value in int64. + const int64 code_0_int64 = + FloatToQuantizedUnclamped(code_0_float, min_output, max_output); + const int64 code_1_int64 = + FloatToQuantizedUnclamped(code_1_float, min_output, max_output); + + // Calculate multiplier. + const int32 mult_int32 = static_cast(code_1_int64 - code_0_int64); + + // Broadcast 0 position and multiplier to lanes + const int64x2_t code_0_64x2 = vmovq_n_s64(code_0_int64); + const int32x2_t mult_32x2 = vmov_n_s32(mult_int32); + + int64 i = 0; + + // Use SIMD to requantize array. + for (; i < (count - 7); i += 8) { + const uint8* input_ptr = &(input->value) + i; + int32* output_ptr = &(output->value) + i; + const std::array output_value = + Requantize8x8To32Neon(input_ptr, code_0_64x2, mult_32x2); + vst1q_s32(output_ptr + 0, output_value[0]); + vst1q_s32(output_ptr + 4, output_value[1]); + } + + // Requantize remaining elements in array without SIMD. 
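+  // At most seven elements can be left over here, because the SIMD loop above
+  // consumes the input in groups of eight.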
+ const int64 lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + const int64 highest_quantized = + static_cast(Eigen::NumTraits::highest()); + + for (; i < count; ++i) { + const int64 input_value = static_cast(input[i]); + int64 output_value = code_0_int64 + (input_value * mult_int32); + output_value = std::max(output_value, lowest_quantized); + output_value = std::min(output_value, highest_quantized); + output[i] = static_cast(output_value); + } +} + +#else + +// If SIMD implementations aren't available, then use these default reference +// versions. +template <> +inline void RequantizeManyInNewRange( + const qint32* input, int64_t count, float min_input, float max_input, + float min_output, float max_output, quint8* output) { + RequantizeManyInNewRangeReference(input, count, min_input, max_input, + min_output, max_output, output); +} + +template <> +inline void RequantizeManyInNewRange( + const quint8* input, int64_t count, float min_input, float max_input, + float min_output, float max_output, qint32* output) { + RequantizeManyInNewRange8To32BitReference(input, count, min_input, max_input, + min_output, max_output, output); +} + +#endif + +template +struct int64_right_shift_op { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const int64_t operator()(const int64_t a) const { + return a >> shift; + } +}; + +// See RequantizeManyInNewRange() for a non-eigen reference implementation. +template +inline void RequantizeManyInNewRangeUsingEigen( + const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input, + float max_input, float min_output, float max_output, Tensor* output) { + auto input_array = input.flat(); + QuantizedToFloatStruct q2f(min_input, max_input); + auto input_float = DEQUANTIZE_WITH_EIGEN(input_array, q2f); + FloatToQuantizedStruct f2q(min_output, max_output); + auto input_requantized = QUANTIZE_WITH_EIGEN(input_float, f2q, T2); + + output->flat().device(device) = input_requantized; +} + +// See RequantizeManyInNewRange() for a non-eigen reference implementation. +// +// Because converting 32-bit accumulated results down to eight bit is a common +// case, we have a specialized code path to handle it as efficiently as +// possible using only fixed-point math for the inner loop. +template <> +inline void RequantizeManyInNewRangeUsingEigen( + const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input, + float max_input, float min_output, float max_output, Tensor* output) { + // Initially we calculate all the constants we need once, before we go into + // the inner loop. If this is updated, also update the non-Eigen version. + const int fp_shift = 16; + const float input_range = max_input - min_input; + const float output_range = max_output - min_output; + const float recip_output_range = + output_range == 0.0 ? 0.0 : (255.0 / output_range); + const float input_rezero = (min_input + max_input) / 2.0; + const int64_t range_scale_fp = + output_range == 0.0 ? 0.0 + : static_cast(255.0 * (1 << fp_shift) * + input_range / output_range); + const int64_t input_offset_fp = + static_cast(input_rezero * recip_output_range * (1 << fp_shift)); + const int64_t output_offset_fp = + output_range == 0.0 + ? 0 + : std::lround((1 << fp_shift) * (min_output * 255.0) / output_range); + const int64_t rounding_delta = 1 << (fp_shift - 1); + + // Inside this eigen expression we just do minimal adds, multiplies, and + // shifts. It should be possible to perform all the calculations in 32-bit + // rather than 64, but that's not been implemented yet. 
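+  // This is the same computation as the scalar loop in
+  // RequantizeManyInNewRangeReference(), expressed as one Eigen pipeline:
+  //   q = clamp((((x * range_scale_fp) >> 32) + input_offset_fp
+  //              - output_offset_fp + rounding_delta) >> fp_shift, 0, 255)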
+  auto input_array = input.flat<qint32>();
+  auto fp_value = ((input_array.template cast<int64_t>() * range_scale_fp)
+                       .unaryExpr(int64_right_shift_op<32>())) +
+                  (input_offset_fp - output_offset_fp + rounding_delta);
+  auto intermediate = fp_value.unaryExpr(int64_right_shift_op<fp_shift>());
+  auto input_requantized = intermediate.cwiseMax(int64_t{0})
+                               .cwiseMin(int64_t{255})
+                               .template cast<int32>()
+                               .template cast<quint8>();
+  output->flat<quint8>().device(device) = input_requantized;
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void FloatTensorToQuantizedInPlaceUsingEigen(
+    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
+    float max, Tensor* result) {
+  DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
+  auto flat_input = input.flat<float>();
+  auto flat_result = result->flat<T>();
+  DCHECK_EQ(flat_input.size(), flat_result.size());
+
+  FloatToQuantizedStruct<T> f2q(min, max);
+  flat_result.device(device) = QUANTIZE_WITH_EIGEN(flat_input, f2q, T);
+}
+
+template <class T>
+void FloatTensorToQuantizedInPlace(const Tensor& input, float min, float max,
+                                   Tensor* result) {
+  DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
+  auto flat_input = input.flat<float>();
+  auto flat_result = result->flat<T>();
+  const int data_size = flat_input.size();
+  DCHECK(data_size == flat_result.size());
+  for (int i = 0; i < data_size; ++i) {
+    flat_result(i) = FloatToQuantized<T>(flat_input(i), min, max);
+  }
+}
+
+template <class T>
+Tensor FloatTensorToQuantized(const Tensor& input, float min, float max) {
+  Tensor result(DataTypeToEnum<T>::v(), input.shape());
+  FloatTensorToQuantizedInPlace<T>(input, min, max, &result);
+  return result;
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void QuantizedTensorToFloatInPlaceUsingEigen(
+    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
+    float max, Tensor* result) {
+  DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
+  auto flat_input = input.flat<T>();
+  auto flat_result = result->flat<float>();
+  const int data_size = flat_input.size();
+  DCHECK(data_size == flat_result.size());
+
+  QuantizedToFloatStruct<T> q2f(min, max);
+  flat_result.device(device) = DEQUANTIZE_WITH_EIGEN(flat_input, q2f);
+}
+
+// REQUIRES: 'result->NumElements() == input.NumElements()'
+template <class T>
+void QuantizedTensorToFloatInPlace(const Tensor& input, float min, float max,
+                                   Tensor* result) {
+  DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
+  auto flat_input = input.flat<T>();
+  auto flat_result = result->flat<float>();
+  const int data_size = flat_input.size();
+  DCHECK(data_size == flat_result.size());
+  for (int i = 0; i < data_size; ++i) {
+    flat_result(i) = QuantizedToFloat<T>(flat_input(i), min, max);
+  }
+}
+
+template <class T>
+Tensor QuantizedTensorToFloat(const Tensor& input, float min, float max) {
+  Tensor result(DT_FLOAT, input.shape());
+  QuantizedTensorToFloatInPlace<T>(input, min, max, &result);
+  return result;
+}
+
+void GetOutputMinAndMaxForQuantizedAdd(float input_min, float input_max,
+                                       float smaller_input_min,
+                                       float smaller_input_max,
+                                       float* output_min, float* output_max);
+
+// Add <input> and <smaller_input>.  If <smaller_input> has fewer elements than
+// <input>, then it is broadcast onto <input>.
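+// For example (shapes assumed only for illustration): adding a [C]-element
+// bias to an [N, H, W, C] activation tensor repeats the bias N * H * W times
+// across the flattened input, which is exactly the
+// input.NumElements() / smaller_input.NumElements() replication used below.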
+template +void QuantizedAddUsingEigen(const Eigen::ThreadPoolDevice& device, + const Tensor& input, float input_min, + float input_max, const Tensor& smaller_input, + float smaller_input_min, float smaller_input_max, + Tensor* output, float* output_min, + float* output_max) { + const auto& input_flat = input.flat(); + const auto& smaller_input_flat = smaller_input.flat(); + auto output_flat = output->flat(); + + GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min, + smaller_input_max, output_min, output_max); + // To do addition properly, we need to compensate for a possibly unbalanced + // zero point in the total representation. The quantized value that + // represents the real number zero needs to be subtracted before addition to + // make sure that the identity of zero + zero = zero holds. + const T3 zero_in_total_space = + FloatToQuantized(0.0f, *output_min, *output_max); + + const int64_t input_element_count = input.NumElements(); + const int64_t smaller_input_element_count = smaller_input.NumElements(); + + QuantizedToFloatStruct input_q2f(input_min, input_max); + QuantizedToFloatStruct smaller_input_q2f(smaller_input_min, + smaller_input_max); + FloatToQuantizedStruct f2q(*output_min, *output_max); + + auto smaller_input_float = + DEQUANTIZE_WITH_EIGEN(smaller_input_flat, smaller_input_q2f); + auto smaller_input_in_total_space = + QUANTIZE_WITH_EIGEN(smaller_input_float, f2q, T3); + + auto input_float = DEQUANTIZE_WITH_EIGEN(input_flat, input_q2f); + auto input_in_total_space = QUANTIZE_WITH_EIGEN(input_float, f2q, T3); + + Eigen::array bcast; + bcast[0] = input_element_count / smaller_input_element_count; + output_flat.device(device) = + input_in_total_space + + (smaller_input_in_total_space.broadcast(bcast) + zero_in_total_space); +} + +// This is a reference implementation of the bias addition for quantized +// buffers, designed to provide a clear specification for the result we +// want. We'll want to specialize this for particular hardware, and +// probably even fuse it with matrix multiplications in a lot of cases. It's +// important to show the clamping behavior we want in particular. +template +void QuantizedAdd(const Eigen::ThreadPoolDevice& device, const Tensor& input, + float input_min, float input_max, const Tensor& smaller_input, + float smaller_input_min, float smaller_input_max, + Tensor* output, float* output_min, float* output_max) { + const auto& input_flat = input.flat(); + const auto& smaller_input_flat = smaller_input.flat(); + auto output_flat = output->flat(); + + GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min, + smaller_input_max, output_min, output_max); + // To do addition properly, we need to compensate for a possibly unbalanced + // zero point in the total representation. The quantized value that + // represents the real number zero needs to be subtracted before addition to + // make sure that the identity of zero + zero = zero holds. 
+ const T3 zero_in_total_space = + FloatToQuantized(0.0f, *output_min, *output_max); + + const int64_t input_element_count = input.NumElements(); + const int64_t smaller_input_element_count = smaller_input.NumElements(); + + float total_min = *output_min; + float total_max = *output_max; + const size_t how_many_iterations = + (input_element_count / smaller_input_element_count); + for (size_t iteration = 0; iteration < how_many_iterations; ++iteration) { + const size_t offset = iteration * smaller_input_element_count; + for (int c = 0; c < smaller_input_element_count; ++c) { + const int index = (offset + c); + // The two numbers we're going to add can each be in very different + // ranges (e.g. the quantized value '127' may represent very different + // real numbers in both) so we need to convert them to a common range + // before we sum them. + const T1 input_value = input_flat(index); + const T3 input_in_total_space = RequantizeInNewRange( + input_value, input_min, input_max, total_min, total_max); + const T2 smaller_input_value = smaller_input_flat(c); + const T3 smaller_input_in_total_space = + RequantizeInNewRange(smaller_input_value, smaller_input_min, + smaller_input_max, total_min, total_max); + const T3 total_pre = input_in_total_space + smaller_input_in_total_space; + // As noted above, we need to compensate for the offset of the actual + // zero point in the space we're operating in. + const T3 total = total_pre + zero_in_total_space; + output_flat(index) = total; + } + } +} + +// See gemmlowp/internal/multi_thread_gemm.h for the semantics of Execute. +class TensorflowGemmlowpWorkersPool { + public: + TensorflowGemmlowpWorkersPool(thread::ThreadPool* workers) + : workers_(workers) {} + + ~TensorflowGemmlowpWorkersPool() { + // This workaround ensures that all worker tasks have exited methods in the + // BlockingCounter. Without this, there is a race where the context is torn + // down while the counter is in use. + counter_to_decrement_when_ready_.Reset(0); + } + + void Execute(const std::vector& tasks) { + assert(!tasks.empty()); + assert(workers_ != nullptr); + counter_to_decrement_when_ready_.Reset(tasks.size()); + for (gemmlowp::Task* task : tasks) { + workers_->Schedule([this, task]() { + // TODO(cwhipkey): get a local_allocator from a thread local storage. + gemmlowp::Allocator local_allocator; + CHECK(task != nullptr); + task->local_allocator = &local_allocator; + task->Run(); + counter_to_decrement_when_ready_.DecrementCount(); + }); + } + counter_to_decrement_when_ready_.Wait(); + for (gemmlowp::Task* task : tasks) { + delete task; + } + } + + private: + thread::ThreadPool* const workers_; + + // The BlockingCounter used to wait for the workers. 
+ gemmlowp::BlockingCounter counter_to_decrement_when_ready_; + + TensorflowGemmlowpWorkersPool(const TensorflowGemmlowpWorkersPool&) = delete; + void operator=(const TensorflowGemmlowpWorkersPool&) = delete; +}; + +class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase { + public: + TensorflowGemmContext(int num_threads, thread::ThreadPool* workers) + : workers_pool_(workers) { + set_max_num_threads(num_threads); + } + + TensorflowGemmlowpWorkersPool* workers_pool() { return &workers_pool_; } + + private: + TensorflowGemmlowpWorkersPool workers_pool_; + + TensorflowGemmContext(const TensorflowGemmContext&) = delete; + void operator=(const TensorflowGemmContext&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/quantize_and_dequantize_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/quantize_and_dequantize_op.h new file mode 100644 index 00000000..253d667a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/quantize_and_dequantize_op.h @@ -0,0 +1,322 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +enum QuantizerRoundMode { + // Round half up: if the fraction of y is exactly 0.5, then + // round(y) = y + 0.5 + // E.g., -5.5 gets rounded to -5, -5.4 goes to -5, + // 5.4 goes to 5, and 5.5 goes to 6. + ROUND_HALF_UP, + // Round half to even: if the fraction of y is exactly 0.5, then round(y) is + // the nearest even integer to y. + // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes + // -24, and -24.5 gets rounded to 24. + ROUND_HALF_TO_EVEN, +}; + +namespace functor { + +// TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'. 
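+// The functors below perform "fake" quantization: the input is clamped to the
+// chosen range, scaled onto the simulated integer grid, rounded, and then
+// scaled straight back, i.e. roughly
+//   output = round(clamp(x, min_range, max_range) * scale) * inverse_scale,
+// so values stay in the floating-point type T throughout (see
+// ClampScaleAndRound() and ScaleAndRound() further down).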
+ +template +struct QuantizeAndDequantizeOneScaleFunctor { + void operator()(const Device& d, typename TTypes::ConstVec input, + bool signed_input, int num_bits, bool range_given, + Tensor* input_min_tensor, Tensor* input_max_tensor, + QuantizerRoundMode round_mode, bool narrow_range, + typename TTypes::Vec output); +}; + +template +struct QuantizeAndDequantizePerChannelFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor input, + bool signed_input, int num_bits, bool range_given, + Tensor* input_min_tensor, Tensor* input_max_tensor, + QuantizerRoundMode round_mode, bool narrow_range, + typename TTypes::Tensor output); +}; + +template +struct QuantizeAndDequantizeOneScaleGradientFunctor { + void operator()(const Device& d, typename TTypes::ConstFlat gradient, + typename TTypes::ConstFlat input, + typename TTypes::ConstScalar input_min, + typename TTypes::ConstScalar input_max, + typename TTypes::Flat input_backprop, + typename TTypes::Scalar input_min_backprop, + typename TTypes::Scalar input_max_backprop); +}; + +template +struct QuantizeAndDequantizePerChannelGradientFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor gradient, + typename TTypes::ConstTensor input, + const Tensor* input_min_tensor, + const Tensor* input_max_tensor, + typename TTypes::Tensor input_backprop, + typename TTypes::Flat input_min_backprop, + typename TTypes::Flat input_max_backprop); +}; + +// The implementation below runs on both CPU and GPU. +template ::Vec, + typename ConstVec = typename TTypes::ConstVec> +void ClampScaleAndRound(const Device& d, ConstVec input, T min_range, + T max_range, T scale, T inverse_scale, Func round_func, + Vec output) { + output.device(d) = (input.cwiseMin(max_range).cwiseMax(min_range) * scale) + .unaryExpr(round_func) * + inverse_scale; +} + +// The implementation below runs on both CPU and GPU. +template ::Vec, + typename ConstVec = typename TTypes::ConstVec> +void ClampScaleAndRound(const Device& d, ConstVec input, T min_range, + T max_range, T scale, T inverse_scale, + QuantizerRoundMode round_mode, Vec output) { + switch (round_mode) { + case ROUND_HALF_TO_EVEN: + ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale, + Eigen::internal::scalar_round_half_to_even_op(), + output); + break; + case ROUND_HALF_UP: + ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale, + Eigen::internal::scalar_round_up_op(), output); + break; + } +} + +// The implementation below runs on both CPU and GPU. +template ::Vec, + typename ConstVec = typename TTypes::ConstVec> +void ScaleAndRound(const Device& d, ConstVec input, T scale, T inverse_scale, + Func round_func, Vec output) { + output.device(d) = (input * scale).unaryExpr(round_func) * inverse_scale; +} + +// The implementation below runs on both CPU and GPU. 
+template ::Vec, + typename ConstVec = typename TTypes::ConstVec> +void ScaleAndRound(const Device& d, ConstVec input, T scale, T inverse_scale, + QuantizerRoundMode round_mode, Vec output) { + switch (round_mode) { + case ROUND_HALF_TO_EVEN: + ScaleAndRound(d, input, scale, inverse_scale, + Eigen::internal::scalar_round_half_to_even_op(), output); + break; + case ROUND_HALF_UP: + ScaleAndRound(d, input, scale, inverse_scale, + Eigen::internal::scalar_round_up_op(), output); + break; + } +} + +template +void ComputeQuantizationRange(bool signed_input, int num_bits, + QuantizerRoundMode round_mode, bool narrow_range, + T* min_range, T* max_range, T* scale, + T* inverse_scale) { + // Calculate the range for the simulated integer quantization: + // e.g. [-127,127] for signed = true, narrow_range = true, num_bits = 8, + // or [-128,127] for signed = true, narrow_range = false, num_bits = 8, + // or [0, 255] for signed = false, num_bits = 8. + const int64_t min_quantized = + signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1 + : -(1ULL << (num_bits - 1)) + : 0; + const int64_t max_quantized = + signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1; + // Determine the maximum scaling factor that would scale + // [min_range, max_range] to not exceed [min_quantized, max_quantized], + // while keeping 0 unchanged. + const T scale_from_min_side = (min_quantized * *min_range > 0) + ? min_quantized / *min_range + : std::numeric_limits::max(); + const T scale_from_max_side = (max_quantized * *max_range > 0) + ? max_quantized / *max_range + : std::numeric_limits::max(); + + // Note: Avoids changing the side of the range that determines scale. + if (scale_from_min_side < scale_from_max_side) { + *scale = scale_from_min_side; + *inverse_scale = *min_range / min_quantized; + *max_range = max_quantized * *inverse_scale; + } else { + *scale = scale_from_max_side; + *inverse_scale = *max_range / max_quantized; + *min_range = min_quantized * *inverse_scale; + } +} + +// The implementation below runs on both CPU and GPU. +template +struct QuantizeAndDequantizeOneScaleImpl { + static void Compute(const Device& d, typename TTypes::ConstVec input, + bool signed_input, int num_bits, bool range_given, + Tensor* input_min_tensor, Tensor* input_max_tensor, + QuantizerRoundMode round_mode, bool narrow_range, + typename TTypes::Vec output) { + T min_range; + T max_range; + auto input_min = input_min_tensor->scalar(); + auto input_max = input_max_tensor->scalar(); + if (!range_given) { + input_min.device(d) = input.minimum(); + input_max.device(d) = input.maximum(); + d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T)); + d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T)); + } else { + // Copy the range values from their respective tensors on the host. + min_range = input_min_tensor->scalar()(); + max_range = input_max_tensor->scalar()(); + } + + T scale, inverse_scale; + ComputeQuantizationRange(signed_input, num_bits, round_mode, narrow_range, + &min_range, &max_range, &scale, &inverse_scale); + + if (range_given) { + // Note: The clamping here is to avoid overflow in the quantized type. + // The semantics of the op does not guarantee to clamp to the specified + // min_range and max_range - because we may have changed either min_range + // or max_range. 
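+      // For instance (example values only): with signed_input = true,
+      // num_bits = 8, narrow_range = false and a requested range of
+      // [-3.0, 6.0], ComputeQuantizationRange() keeps max_range = 6.0,
+      // picks scale = 127 / 6.0, and widens min_range to
+      // -128 * (6.0 / 127) ~= -6.05, so the clamp below uses the adjusted
+      // range rather than the requested one.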
+ ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale, + round_mode, output); + } else { + ScaleAndRound(d, input, scale, inverse_scale, round_mode, output); + } + } +}; + +// The implementation below runs on both CPU and GPU. + +template +struct QuantizeAndDequantizePerChannelImpl { + static void Compute(const Device& d, typename TTypes::ConstTensor input, + bool signed_input, int num_bits, bool range_given, + Tensor* input_min_tensor, Tensor* input_max_tensor, + QuantizerRoundMode round_mode, bool narrow_range, + typename TTypes::Tensor output) { + using Index = typename tensorflow::TTypes::ConstTensor::Index; + int num_channels = input.dimension(1); + auto input_min = input_min_tensor->vec(); + auto input_max = input_max_tensor->vec(); + std::vector min_range(num_channels); + std::vector max_range(num_channels); + + if (!range_given) { + Eigen::IndexList, Eigen::type2index<2> > reduce_dims; + input_min.device(d) = input.minimum(reduce_dims); + input_max.device(d) = input.maximum(reduce_dims); + d.memcpyDeviceToHost(min_range.data(), input_min.data(), + num_channels * sizeof(T)); + d.memcpyDeviceToHost(max_range.data(), input_max.data(), + num_channels * sizeof(T)); + } else { + // Copy the range values from their respective tensors on the host. + std::memcpy(min_range.data(), input_min_tensor->vec().data(), + num_channels * sizeof(T)); + std::memcpy(max_range.data(), input_max_tensor->vec().data(), + num_channels * sizeof(T)); + } + + for (Index i = 0; i < num_channels; ++i) { + const auto input_chip = input.template chip<1>(i); + auto output_chip = output.template chip<1>(i); + + T scale, inverse_scale; + ComputeQuantizationRange(signed_input, num_bits, round_mode, narrow_range, + &min_range[i], &max_range[i], &scale, + &inverse_scale); + if (range_given) { + ClampScaleAndRound(d, input_chip, min_range[i], max_range[i], scale, + inverse_scale, round_mode, output_chip); + } else { + ScaleAndRound(d, input_chip, scale, inverse_scale, round_mode, + output_chip); + } + } + } +}; + +template +struct QuantizeAndDequantizeOneScaleGradientImpl { + static void Compute(const Device& d, typename TTypes::ConstFlat gradient, + typename TTypes::ConstFlat input, + typename TTypes::ConstScalar input_min, + typename TTypes::ConstScalar input_max, + typename TTypes::Flat input_backprop, + typename TTypes::Scalar input_min_backprop, + typename TTypes::Scalar input_max_backprop) { + const T min_val = input_min(); + const T max_val = input_max(); + const auto in_range = + (input >= min_val && input <= max_val) + .select(input.constant(1.0f), input.constant(0.0f)); + input_backprop.device(d) = gradient * in_range; + input_min_backprop.device(d) = input_min_backprop.constant(0.0f); + input_max_backprop.device(d) = input_max_backprop.constant(0.0f); + } +}; + +template +struct QuantizeAndDequantizePerChannelGradientImpl { + static void Compute(const Device& d, + typename TTypes::ConstTensor gradient, + typename TTypes::ConstTensor input, + const Tensor* input_min_tensor, + const Tensor* input_max_tensor, + typename TTypes::Tensor input_backprop, + typename TTypes::Flat input_min_backprop, + typename TTypes::Flat input_max_backprop) { + using Index = typename tensorflow::TTypes::ConstTensor::Index; + auto input_min = input_min_tensor->vec(); + auto input_max = input_max_tensor->vec(); + int num_channels = input.dimension(1); + for (Index i = 0; i < num_channels; ++i) { + const auto gradient_chip = gradient.template chip<1>(i); + const auto input_chip = input.template chip<1>(i); + const T 
min_val = input_min(i); + const T max_val = input_max(i); + const auto in_range = + (input_chip >= min_val && input_chip <= max_val) + .select(input_chip.constant(1.0f), input_chip.constant(0.0f)); + input_backprop.template chip<1>(i).device(d) = gradient_chip * in_range; + } + input_min_backprop.device(d) = input_min_backprop.constant(0.0f); + input_max_backprop.device(d) = input_max_backprop.constant(0.0f); + } +}; + +} // end of namespace functor +} // end of namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/queue_base.h b/third_party/tflite-hdrs/tensorflow/core/kernels/queue_base.h new file mode 100644 index 00000000..d39ab454 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/queue_base.h @@ -0,0 +1,188 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ +#define TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ + +#include +#include + +#include "absl/base/macros.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Functionality common to asynchronous QueueInterface implementations. +class QueueBase : public QueueInterface { + public: + // As a possible value of 'capacity'. + static constexpr int32_t kUnbounded = INT_MAX; + + // Args: + // component_dtypes: The types of each component in a queue-element tuple. + // component_shapes: The shapes of each component in a queue-element tuple, + // which must either be empty (if the shapes are not specified) or + // or have the same size as component_dtypes. + // name: A name to use for the queue. 
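+  //   capacity: The maximum number of elements the queue may hold at once;
+  //     kUnbounded (declared above) is intended to mean "no practical limit".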
+ QueueBase(int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, + const string& name); + + // Implementations of QueueInterface methods -------------------------------- + const DataTypeVector& component_dtypes() const override { + return component_dtypes_; + } + + absl::Status ValidateTuple(const Tuple& tuple) override; + absl::Status ValidateManyTuple(const Tuple& tuple) override; + + void Close(OpKernelContext* ctx, bool cancel_pending_enqueues, + DoneCallback callback) override; + + // Other public methods ----------------------------------------------------- + const std::vector& component_shapes() const { + return component_shapes_; + } + + int32 capacity() const { return capacity_; } + + bool is_closed() const override { + mutex_lock lock(mu_); + return closed_; + } + + // Copies the index^th slice (in the first dimension) of parent into element. + static absl::Status CopySliceToElement(const Tensor& parent, Tensor* element, + int64_t index); + + // Copies element into the index^th slice (in the first dimension) of parent. + // NOTE(mrry): This method is deprecated. Use + // `tensorflow::batch_util::CopySliceToElement()` defined in + // "./batch_util.h" instead. + ABSL_DEPRECATED( + "Use `tensorflow::batch_util::CopySliceToElement()` defined in " + "\"./batch_util.h\" instead.") + static absl::Status CopyElementToSlice(const Tensor& element, Tensor* parent, + int64_t index); + + protected: + enum Action { kEnqueue, kDequeue }; + enum RunResult { kNoProgress, kProgress, kComplete }; + + // Tries to enqueue/dequeue (or close) based on whatever is at the + // front of enqueue_attempts_/dequeue_attempts_. Appends to + // *finished the callback for any finished attempt (so it may be + // called once mu_ is released). Returns true if any progress was + // made. + struct CleanUp { + CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm) + : finished(f), to_deregister(ct), cm(cm) {} + DoneCallback finished; + CancellationToken to_deregister; + CancellationManager* cm; + }; + + // Returns the number of components in a queue-element tuple. + int32 num_components() const { return component_dtypes_.size(); } + + // True if shapes were specified. If so, inputs will be validated + // against them, etc. + bool specified_shapes() const { return component_shapes_.size() > 0; } + + // Code common to Validate*Tuple(). + absl::Status ValidateTupleCommon(const Tuple& tuple) const; + + TensorShape ManyOutShape(int i, int64_t batch_size) { + TensorShape shape({batch_size}); + shape.AppendShape(component_shapes_[i]); + return shape; + } + + void Cancel(Action action, CancellationManager* cancellation_manager, + CancellationToken token); + + // Helper for cancelling all pending Enqueue(Many) operations when + // Close is called with cancel_pending_enqueues. + void CloseAndCancel(); + + bool TryAttemptLocked(Action action, std::vector* clean_up) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Tries to make progress on the enqueues or dequeues at the front + // of the *_attempts_ queues. + void FlushUnlocked(); + + ~QueueBase() override; + + // Helpers for implementing MatchesNodeDef(). 
+ static string ShapeListString(const absl::Span& shapes); + absl::Status MatchesNodeDefOp(const NodeDef& node_def, + const string& op) const; + absl::Status MatchesNodeDefCapacity(const NodeDef& node_def, + int32_t capacity) const; + absl::Status MatchesNodeDefTypes(const NodeDef& node_def) const; + absl::Status MatchesNodeDefShapes(const NodeDef& node_def) const; + + protected: + const int32 capacity_; + const DataTypeVector component_dtypes_; + const std::vector component_shapes_; + const string name_; + mutable mutex mu_; + bool closed_ TF_GUARDED_BY(mu_); + + struct Attempt; + typedef std::function RunCallback; + struct Attempt { + int32 elements_requested; + DoneCallback done_callback; // must be run outside mu_ + OpKernelContext* context; + CancellationManager* cancellation_manager; // not owned + CancellationToken cancellation_token; + RunCallback run_callback; // must be run while holding mu_ + bool is_cancelled; + Tuple tuple; + // tuples is used by some implementations allowing dynamic shapes. + std::vector tuples; + + Attempt(int32_t elements_requested, DoneCallback done_callback, + OpKernelContext* context, CancellationManager* cancellation_manager, + CancellationToken cancellation_token, RunCallback run_callback) + : elements_requested(elements_requested), + done_callback(done_callback), + context(context), + cancellation_manager(cancellation_manager), + cancellation_token(cancellation_token), + run_callback(run_callback), + is_cancelled(false) {} + }; + std::deque enqueue_attempts_ TF_GUARDED_BY(mu_); + std::deque dequeue_attempts_ TF_GUARDED_BY(mu_); + + QueueBase(const QueueBase&) = delete; + void operator=(const QueueBase&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/queue_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/queue_op.h new file mode 100644 index 00000000..57a771d9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/queue_op.h @@ -0,0 +1,279 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_QUEUE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_QUEUE_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Defines a QueueOp, an abstract class for Queue construction ops. 
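+//
+// A concrete construction kernel is expected to subclass TypedQueueOp (below)
+// and build its queue inside CreateResource(), along these lines (sketch
+// only; MyQueue stands in for a real TypedQueue implementation):
+//
+//   class MyQueueOp : public TypedQueueOp {
+//    public:
+//     explicit MyQueueOp(OpKernelConstruction* ctx) : TypedQueueOp(ctx) {}
+//
+//    private:
+//     absl::Status CreateResource(QueueInterface** ret) override {
+//       MyQueue* queue = new MyQueue(capacity_, component_types_);
+//       return CreateTypedQueue(queue, ret);
+//     }
+//   };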
+class QueueOp : public ResourceOpKernel { + public: + QueueOp(OpKernelConstruction* context); + + void Compute(OpKernelContext* context) override; + + protected: + // Variables accessible by subclasses + int32 capacity_; + DataTypeVector component_types_; + + private: + absl::Status VerifyResource(QueueInterface* queue) override; +}; + +class TypedQueueOp : public QueueOp { + public: + using QueueOp::QueueOp; + + protected: + template + absl::Status CreateTypedQueue(TypedQueue* queue, QueueInterface** ret) { + if (queue == nullptr) { + return errors::ResourceExhausted("Failed to allocate queue."); + } + *ret = queue; + return queue->Initialize(); + } +}; + +// Queue manipulator kernels + +class QueueOpKernel : public AsyncOpKernel { + public: + explicit QueueOpKernel(OpKernelConstruction* context); + + void ComputeAsync(OpKernelContext* ctx, DoneCallback callback) final; + + protected: + virtual void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) = 0; +}; + +class QueueAccessOpKernel : public QueueOpKernel { + public: + explicit QueueAccessOpKernel(OpKernelConstruction* context); + + protected: + int64_t timeout_; +}; + +// Defines an EnqueueOp, the execution of which enqueues a tuple of +// tensors in the given Queue. +// +// The op has 1 + k inputs, where k is the number of components in the +// tuples stored in the given Queue: +// - Input 0: queue handle. +// - Input 1: 0th element of the tuple. +// - ... +// - Input (1+k): kth element of the tuple. +class EnqueueOp : public QueueAccessOpKernel { + public: + explicit EnqueueOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + private: + EnqueueOp(const EnqueueOp&) = delete; + void operator=(const EnqueueOp&) = delete; +}; + +// Defines an EnqueueManyOp, the execution of which slices each +// component of a tuple of tensors along the 0th dimension, and +// enqueues tuples of slices in the given Queue. +// +// The op has 1 + k inputs, where k is the number of components in the +// tuples stored in the given Queue: +// - Input 0: queue handle. +// - Input 1: 0th element of the tuple. +// - ... +// - Input (1+k): kth element of the tuple. +// +// N.B. All tuple components must have the same size in the 0th +// dimension. +class EnqueueManyOp : public QueueAccessOpKernel { + public: + explicit EnqueueManyOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + ~EnqueueManyOp() override; + + private: + EnqueueManyOp(const EnqueueManyOp&) = delete; + void operator=(const EnqueueManyOp&) = delete; +}; + +// Defines a DequeueOp, the execution of which dequeues a tuple of +// tensors from the given Queue. +// +// The op has one input, which is the handle of the appropriate +// Queue. The op has k outputs, where k is the number of components in +// the tuples stored in the given Queue, and output i is the ith +// component of the dequeued tuple. 
+class DequeueOp : public QueueAccessOpKernel { + public: + explicit DequeueOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + ~DequeueOp() override; + + private: + DequeueOp(const DequeueOp&) = delete; + void operator=(const DequeueOp&) = delete; +}; + +// Defines a DequeueManyOp, the execution of which concatenates the +// requested number of elements from the given Queue along the 0th +// dimension, and emits the result as a single tuple of tensors. +// +// The op has two inputs: +// - Input 0: the handle to a queue. +// - Input 1: the number of elements to dequeue. +// +// The op has k outputs, where k is the number of components in the +// tuples stored in the given Queue, and output i is the ith component +// of the dequeued tuple. +class DequeueManyOp : public QueueAccessOpKernel { + public: + explicit DequeueManyOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + ~DequeueManyOp() override; + + private: + DequeueManyOp(const DequeueManyOp&) = delete; + void operator=(const DequeueManyOp&) = delete; +}; + +// Defines a DequeueUpToOp, the execution of which concatenates the +// requested number of elements from the given Queue along the 0th +// dimension, and emits the result as a single tuple of tensors. +// +// The difference between this op and DequeueMany is the handling when +// the Queue is closed. While the DequeueMany op will return if there +// an error when there are less than num_elements elements left in the +// closed queue, this op will return between 1 and +// min(num_elements, elements_remaining_in_queue), and will not block. +// If there are no elements left, then the standard DequeueMany error +// is returned. +// +// This op only works if the underlying Queue implementation accepts +// the allow_small_batch = true parameter to TryDequeueMany. +// If it does not, an errors::Unimplemented exception is returned. +// +// The op has two inputs: +// - Input 0: the handle to a queue. +// - Input 1: the number of elements to dequeue. +// +// The op has k outputs, where k is the number of components in the +// tuples stored in the given Queue, and output i is the ith component +// of the dequeued tuple. +// +// The op has one attribute: allow_small_batch. If the Queue supports +// it, setting this to true causes the queue to return smaller +// (possibly zero length) batches when it is closed, up to however +// many elements are available when the op executes. In this case, +// the Queue does not block when closed. +class DequeueUpToOp : public QueueAccessOpKernel { + public: + explicit DequeueUpToOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + ~DequeueUpToOp() override; + + private: + DequeueUpToOp(const DequeueUpToOp&) = delete; + void operator=(const DequeueUpToOp&) = delete; +}; + +// Defines a QueueCloseOp, which closes the given Queue. Closing a +// Queue signals that no more elements will be enqueued in it. +// +// The op has one input, which is the handle of the appropriate Queue. 
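+//
+// If the op's cancel_pending_enqueues attribute is set, closing also cancels
+// any Enqueue(Many) attempts that are still blocked on the queue (see
+// QueueBase::CloseAndCancel()).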
+class QueueCloseOp : public QueueOpKernel { + public: + explicit QueueCloseOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + private: + bool cancel_pending_enqueues_; + QueueCloseOp(const QueueCloseOp&) = delete; + void operator=(const QueueCloseOp&) = delete; +}; + +// Defines a QueueSizeOp, which computes the number of elements in the +// given Queue, and emits it as an output tensor. +// +// The op has one input, which is the handle of the appropriate Queue; +// and one output, which is a single-element tensor containing the current +// size of that Queue. +class QueueSizeOp : public QueueOpKernel { + public: + explicit QueueSizeOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + private: + QueueSizeOp(const QueueSizeOp&) = delete; + void operator=(const QueueSizeOp&) = delete; +}; + +class QueueIsClosedOp : public QueueOpKernel { + public: + explicit QueueIsClosedOp(OpKernelConstruction* context); + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override; + + private: + QueueIsClosedOp(const QueueIsClosedOp&) = delete; + void operator=(const QueueIsClosedOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_QUEUE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h new file mode 100644 index 00000000..7dc63ac8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h @@ -0,0 +1,189 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include +#include "absl/strings/match.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ragged_tensor_variant.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +#ifndef TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_TO_VARIANT_OP_TEST_H_ +#define TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_TO_VARIANT_OP_TEST_H_ + +namespace tensorflow { + +class RaggedTensorToVariantKernelTest : public ::tensorflow::OpsTestBase { + protected: + // Builds the tensorflow test graph for the RaggedTensorToVariant op, and + // populates the `splits` input with the given values. + template + void BuildEncodeRaggedTensorGraph( + const std::vector>& ragged_splits, + const TensorShape& ragged_values_shape, + const std::vector& ragged_values, const bool batched) { + const auto values_dtype = DataTypeToEnum::v(); + const auto splits_dtype = DataTypeToEnum::v(); + int64_t num_splits = ragged_splits.size(); + TF_ASSERT_OK( + NodeDefBuilder("tested_op", "RaggedTensorToVariant") + .Input(FakeInput(num_splits, splits_dtype)) // ragged_splits + .Input(FakeInput(values_dtype)) // ragged_values + .Attr("RAGGED_RANK", num_splits) + .Attr("Tvalues", values_dtype) + .Attr("Tsplits", splits_dtype) + .Attr("batched_input", batched) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + for (const auto& splits : ragged_splits) { + int64_t splits_size = splits.size(); + AddInputFromArray(TensorShape({splits_size}), splits); + } + AddInputFromArray(ragged_values_shape, ragged_values); + } + + template + void BuildEncodeRaggedTensorGraph( + const std::vector>& ragged_splits, + const TensorShape& ragged_values_shape, const VALUE_TYPE& ragged_values, + const bool batched) { + const auto values_dtype = DataTypeToEnum::v(); + const auto splits_dtype = DataTypeToEnum::v(); + int64_t num_splits = ragged_splits.size(); + TF_ASSERT_OK( + NodeDefBuilder("tested_op", "RaggedTensorToVariant") + .Input(FakeInput(num_splits, splits_dtype)) // ragged_splits + .Input(FakeInput(values_dtype)) // ragged_values + .Attr("RAGGED_RANK", num_splits) + .Attr("Tvalues", values_dtype) + .Attr("Tsplits", splits_dtype) + .Attr("batched_input", batched) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + for (const auto& splits : ragged_splits) { + int64_t splits_size = splits.size(); + AddInputFromArray(TensorShape({splits_size}), splits); + } + AddInput(ragged_values_shape, + [&ragged_values](int i) { return ragged_values; }); + } + + template + RaggedTensorVariant CreateVariantFromRagged( + const std::vector>& ragged_splits, + const TensorShape& ragged_values_shape, + const std::vector& ragged_values) { + RaggedTensorVariant encoded; + for (auto ragged_split : ragged_splits) { + int splits_size = ragged_split.size(); + Tensor splits(DataTypeToEnum::v(), + TensorShape({splits_size})); + test::FillValues(&splits, ragged_split); + encoded.append_splits(splits); + } + Tensor 
values(DataTypeToEnum::v(), ragged_values_shape); + test::FillValues(&values, ragged_values); + encoded.set_values(values); + return encoded; + } + + template + RaggedTensorVariant CreateVariantFromRagged( + const std::vector>& ragged_splits, + const std::vector& ragged_values) { + int num_values = ragged_values.size(); + return CreateVariantFromRagged(ragged_splits, {num_values}, ragged_values); + } + + template + void ExpectRaggedTensorVariantEqual(const RaggedTensorVariant& expected, + const RaggedTensorVariant& actual) { + test::ExpectTensorEqual(actual.values(), expected.values()); + EXPECT_EQ(actual.ragged_rank(), expected.ragged_rank()); + for (int i = 0; i < actual.ragged_rank(); ++i) { + test::ExpectTensorEqual(actual.splits(i), expected.splits(i)); + } + } +}; + +class RaggedTensorToVariantGradientKernelTest + : public ::tensorflow::OpsTestBase { + protected: + // Builds the tensorflow test graph for the RaggedTensorToVariantGradient op, + // and populates the `encoded_ragged_grad`, `row_splits` and + // `dense_values_shape` input with the given values. + template + void BuildEncodeRaggedTensorGradientGraph( + const std::vector& encoded_ragged_grad, + const std::vector& row_splits, + const std::vector& dense_values_shape) { + const auto values_dtype = DataTypeToEnum::v(); + const auto splits_dtype = DataTypeToEnum::v(); + + TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorToVariantGradient") + .Input(FakeInput(DT_VARIANT)) // encoded_ragged_grad + .Input(FakeInput(splits_dtype)) // row_splits + .Input(FakeInput(DT_INT32)) // dense_values_shape + .Attr("Tvalues", values_dtype) + .Attr("Tsplits", splits_dtype) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + + int64_t encoded_ragged_grad_size = encoded_ragged_grad.size(); + AddInputFromArray(TensorShape({encoded_ragged_grad_size}), + encoded_ragged_grad); + + int64_t splits_size = row_splits.size(); + AddInputFromArray(TensorShape({splits_size}), row_splits); + + int64_t dense_values_shape_size = dense_values_shape.size(); + AddInputFromArray(TensorShape({dense_values_shape_size}), + dense_values_shape); + } + + template + RaggedTensorVariant CreateVariantFromRagged( + const std::vector>& ragged_splits, + const TensorShape& ragged_values_shape, + const std::vector& ragged_values) { + RaggedTensorVariant encoded; + for (auto ragged_split : ragged_splits) { + int splits_size = ragged_split.size(); + Tensor splits(DataTypeToEnum::v(), + TensorShape({splits_size})); + test::FillValues(&splits, ragged_split); + encoded.append_splits(splits); + } + Tensor values(DataTypeToEnum::v(), ragged_values_shape); + test::FillValues(&values, ragged_values); + encoded.set_values(values); + return encoded; + } +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_TO_VARIANT_OP_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_variant.h b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_variant.h new file mode 100644 index 00000000..1d2066b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_tensor_variant.h @@ -0,0 +1,110 @@ +#include "tensorflow/core/framework/tensor_key.h" +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_VARIANT_H_ +#define TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_VARIANT_H_ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/kernels/cwise_ops_common.h" +#include "tensorflow/core/util/tensor_ops_util.h" + +namespace tensorflow { + +// Class used to store a RaggedTensor as a Variant scalar. +class RaggedTensorVariant { + public: + RaggedTensorVariant() {} + RaggedTensorVariant(Tensor values, const std::vector& nested_splits) + : values_(std::move(values)), nested_splits_(nested_splits) {} + + // Variant support methods. + string TypeName() const; + string DebugString() const; + void Encode(VariantTensorData* data) const; + bool Decode(const VariantTensorData& data); + + // The flat_values of the RaggedTensor. + const Tensor& values() const { return values_; } + Tensor* mutable_values() { return &values_; } + void set_values(const Tensor& new_values) { values_ = new_values; } + + // The nested row_splits of the RaggedTensor. + int ragged_rank() const { return nested_splits_.size(); } + const std::vector& nested_splits() const { return nested_splits_; } + std::vector* mutable_nested_splits() { return &nested_splits_; } + const Tensor& splits(int i) const { return nested_splits_[i]; } + Tensor* mutable_splits(int i) { return &nested_splits_[i]; } + void set_nested_splits(const std::vector& nested_splits) { + nested_splits_ = nested_splits; + } + void append_splits(const Tensor& splits) { nested_splits_.push_back(splits); } + + private: + Tensor values_; + std::vector nested_splits_; +}; + +template +absl::Status RaggedTensorVariantZerosLike(OpKernelContext* c, + const RaggedTensorVariant& x, + RaggedTensorVariant* y) { + y->set_nested_splits(x.nested_splits()); + TF_RETURN_IF_ERROR( + ZerosLikeTensor(c, x.values(), y->mutable_values())); + return absl::OkStatus(); +} + +template +absl::Status RaggedTensorVariantBinaryAdd(OpKernelContext* c, + const RaggedTensorVariant& x, + const RaggedTensorVariant& y, + RaggedTensorVariant* out) { + if (x.values().dtype() != y.values().dtype()) { + return errors::InvalidArgument( + "Can't add RaggedTensorVariants of different dtypes. One is ", + DataTypeString(x.values().dtype()), " and the other is ", + DataTypeString(y.values().dtype())); + } + if (x.ragged_rank() != y.ragged_rank()) { + return errors::InvalidArgument( + "Can't add RaggedTensorVariants of different ragged rank. 
", "One is ", + x.ragged_rank(), " and the other is ", y.ragged_rank()); + } + for (int i = 0; i < x.ragged_rank(); ++i) { + if (TensorKey(x.splits(i)) != TensorKey(y.splits(i))) { + return errors::InvalidArgument( + "Can't add RaggedTensorVariants with different row_splits."); + } + } + out->set_nested_splits(x.nested_splits()); + TF_RETURN_IF_ERROR(BinaryAddTensors(c, x.values(), y.values(), + out->mutable_values())); + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_VARIANT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_utils.h new file mode 100644 index 00000000..3ccd34a5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/ragged_utils.h @@ -0,0 +1,77 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// Utility functions for RaggedTensor + +// Verifies that the splits are valid for ragged tensor +template +absl::Status RaggedTensorVerifySplits(const Tensor& ragged_splits, + bool check_last_element, + int64_t num_ragged_values) { + auto flat_ragged_splits = ragged_splits.flat(); + + if (ragged_splits.dims() != 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid ragged splits: ragged splits must be rank 1 but is rank ", + ragged_splits.dims())); + } + + if (ragged_splits.NumElements() < 1) { + return absl::InvalidArgumentError( + "Invalid ragged splits: ragged splits must have at least one splits, " + "but is empty"); + } + + if (flat_ragged_splits(0) != static_cast(0)) { + return absl::InvalidArgumentError( + absl::StrCat("Invalid ragged splits: first element of ragged splits " + " must be 0 but is ", + flat_ragged_splits(0))); + } + + SPLIT_TYPE last_split = 0; + for (int j = 1; j < ragged_splits.dim_size(0); j++) { + auto split = flat_ragged_splits(j); + if (split < last_split) { + return absl::InvalidArgumentError( + absl::StrCat("Invalid ragged splits: ragged splits must be " + "monotonically increasing, but ragged_splits[", + j, "]=", split, " is smaller than row_splits[", j - 1, + "]=", last_split)); + } + last_split = split; + } + + if (check_last_element & last_split != num_ragged_values) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid ragged splits: last element of ragged splits must be ", + "the number of ragged values(", num_ragged_values, ") but is ", + last_split)); + } + + return absl::OkStatus(); +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_binomial_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_binomial_op.h new file mode 
100644
index 00000000..e701e5ff
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_binomial_op.h
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_BINOMIAL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_BINOMIAL_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+// Sample a binomial random variable, with probs and counts for each batch.
+// Uses binomial inversion and a transformed rejection sampling method as
+// described in
+// https://pdfs.semanticscholar.org/471b/c2726e25bbf8801ef781630a2c13f654268e.pdf.
+// Two different algorithms are employed, depending on the size of
+// counts * probs (or counts * (1 - probs) if probs > 0.5).
+// If counts * probs < 10, we simply sum up Geometric random variables until
+// they exceed count, and the number we used is binomially distributed.
+// In expectation, this will take O(counts * probs) time and require about the
+// same number of random variates.
+// This can be much cheaper than summing Bernoulli random variates, which would
+// always need O(counts) of them (so this approach requires fewer uniform
+// r.v.s and can be faster).
+//
+// If counts * probs > 10, we use a transformed-rejection algorithm based on
+// pairs of uniform random variates due to Hormann.
+// https://pdfs.semanticscholar.org/471b/c2726e25bbf8801ef781630a2c13f654268e.pdf
+// This algorithm has higher acceptance rates for large counts * probs, as the
+// proposal distribution becomes quite tight, requiring approximately two
+// uniform random variates as counts * probs becomes large.
+template <typename Device, typename T, typename U>
+struct RandomBinomialFunctor {
+  void operator()(OpKernelContext* ctx, const Device& d, int64_t num_batches,
+                  int64_t samples_per_batch, int64_t num_elements,
+                  typename TTypes<T>::ConstFlat counts,
+                  typename TTypes<T>::ConstFlat probs,
+                  const random::PhiloxRandom& gen,
+                  typename TTypes<U>::Flat output);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_BINOMIAL_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_index_shuffle.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_index_shuffle.h
new file mode 100644
index 00000000..68b52ad6
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_index_shuffle.h
@@ -0,0 +1,45 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_INDEX_SHUFFLE_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_INDEX_SHUFFLE_H_
+
+#include <array>
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace random {
+
+// Returns the position of `index` in a permutation of [0, ..., max_index].
+//
+// `index` must be a number in [0, ..., max_index].
+// `key` is the random key for the permutation.
+// The returned index will also be in [0, ..., max_index]. For a fixed `key`
+// and `max_index`, the mapping from all possible `index` values to the
+// returned values forms a bijection.
+// `rounds` must be a positive even integer >= 4. Larger values improve the
+// 'randomness' of permutations for small `max_index` values. The time to
+// compute the result scales linearly with the number of rounds. We recommend
+// 8 rounds as a good trade-off.
+//
+// For more details on the algorithm see the top of the cc file.
+uint64_t index_shuffle(const uint64_t index, const std::array<uint32_t, 3>& key,
+                       const uint64_t max_index, const int32_t rounds);
+
+}  // namespace random
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_INDEX_SHUFFLE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op.h
new file mode 100644
index 00000000..ea16f54e
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op.h
@@ -0,0 +1,64 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_
+
+#include "unsupported/Eigen/CXX11/Tensor"  // from @eigen_archive
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+class OpKernelContext;
+
+namespace functor {
+
+template <typename Device, class Distribution>
+struct FillPhiloxRandom;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+// Declares the partially CPU-specialized functor struct.
+//
+// NOTE: Due to inlining done by the compiler, you may need to add
+// explicit instantiation of the functor in random_op.cc. See example
+// functor::FillPhiloxRandom.
+//
+// This functor can take the PhiloxRandom input from either device memory `key`
+// and `counter` or a stack value `gen`. If both `key` and `counter` are not
+// nullptr, they provide the input; otherwise `gen` provides the input.
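To make the `key`/`counter` versus `gen` contract above concrete, here is a minimal sketch of what the functor boils down to for a single output group, assuming the TensorFlow headers added in this patch are on the include path (the function name OneUniformGroup is invented for illustration): the stateful path leaves `key`/`counter` null and passes a stack-constructed PhiloxRandom, and the distribution turns each generator state into kResultElementCount values.

#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/lib/random/random_distributions.h"

// Fills one group of outputs the way FillPhiloxRandom does per group, using
// the "stack value `gen`" input mode (i.e. key == nullptr, counter == nullptr).
void OneUniformGroup(float* out) {
  using tensorflow::random::PhiloxRandom;
  using Dist = tensorflow::random::UniformDistribution<PhiloxRandom, float>;
  PhiloxRandom gen(/*seed=*/301);   // stateful-op style: generator passed by value
  Dist dist;
  const auto samples = dist(&gen);  // one fixed-size group of uniform floats
  for (int i = 0; i < Dist::kResultElementCount; ++i) out[i] = samples[i];
}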
+template +struct FillPhiloxRandom { + void operator()(OpKernelContext* ctx, const CPUDevice& d, const uint64* key, + const uint64* counter, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64_t size, + Distribution dist); +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +typedef Eigen::GpuDevice GPUDevice; +// Declares the partially GPU-specialized functor struct. +template +struct FillPhiloxRandom { + void operator()(OpKernelContext* ctx, const GPUDevice& d, const uint64* key, + const uint64* counter, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64_t size, + Distribution dist); +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_cpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_cpu.h new file mode 100644 index 00000000..cfa927c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_cpu.h @@ -0,0 +1,193 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_CPU_H_ +#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_CPU_H_ + +#define EIGEN_USE_THREADS + +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/random_op.h" +#include "tensorflow/core/kernels/random_ops_util.h" +#include "tensorflow/core/lib/hash/crc32c.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/guarded_philox_random.h" +#include "tensorflow/core/util/work_sharder.h" + +#if EIGEN_COMP_GNUC && __cplusplus > 199711L +#define DISABLE_FLOAT_EQUALITY_WARNING \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop") +#else +#define DISABLE_FLOAT_EQUALITY_WARNING +#define ENABLE_FLOAT_EQUALITY_WARNING +#endif + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +using random::PhiloxRandom; +using random::SingleSampleAdapter; + +// The default implementation of the functor, which should never be invoked +// But we still need to provide implementation for now for the linker to work, +// since we do not support all the distributions yet. 
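The comment above describes a common pattern: declare a primary template whose body only fails at run time, so unsupported device/distribution combinations still link, and put the real work in per-device specializations. A self-contained sketch of that pattern with invented names (not TensorFlow code):

#include <cstdio>
#include <stdexcept>

struct CpuDevice {};
struct GpuDevice {};

// Primary template: links, but using it is a runtime error.
template <typename Device>
struct Fill {
  void operator()(int* data, int n) {
    (void)data;
    (void)n;
    throw std::runtime_error("Fill is not specialized for this device");
  }
};

// Per-device specialization that does the real work.
template <>
struct Fill<CpuDevice> {
  void operator()(int* data, int n) {
    for (int i = 0; i < n; ++i) data[i] = i;
  }
};

int main() {
  int buf[4];
  Fill<CpuDevice>()(buf, 4);     // OK: uses the CPU specialization
  // Fill<GpuDevice>()(buf, 4);  // compiles and links, but would throw at run time
  std::printf("%d %d %d %d\n", buf[0], buf[1], buf[2], buf[3]);
  return 0;
}

The trade-off is that a missing specialization is only detected when the op actually runs, which is exactly what the error message above reports.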
+template +struct FillPhiloxRandom { + typedef typename Distribution::ResultElementType T; + void operator()(OpKernelContext* ctx, const Device&, const uint64* key, + const uint64* counter, random::PhiloxRandom gen, T* data, + int64_t size, Distribution dist) { + OP_REQUIRES( + ctx, false, + errors::Internal( + "Default `FillPhiloxRandom` implementation should not be executed. " + "The cause of this error is probably that `FillPhiloxRandom` does " + "not support this device or random distribution yet.")); + } +}; + +// A class to fill a specified range of random groups +template +struct FillPhiloxRandomTask; + +// Specialization for distribution that takes a fixed number of samples for +// each output. +template +struct FillPhiloxRandomTask { + typedef typename Distribution::ResultElementType T; + static void Run(random::PhiloxRandom gen, T* data, int64_t size, + int64_t start_group, int64_t limit_group, Distribution dist) { + const int kGroupSize = Distribution::kResultElementCount; + + gen.Skip(start_group); + int64_t offset = start_group * kGroupSize; + + // First fill all the full-size groups + int64_t limit_group_full = std::min(limit_group, size / kGroupSize); + for (int64_t index = start_group; index < limit_group_full; ++index) { + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + // If there are any remaining elements that need to be filled, process them + if (limit_group_full < limit_group) { + int64_t remaining_size = size - limit_group_full * kGroupSize; + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Specialization for distribution that takes a variable number of samples for +// each output. This will be slower due to the generality. 
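Before the variable-sample case, it helps to see the bookkeeping of the fixed-samples-per-output task above in isolation. A small self-contained sketch (FakeDist and DescribeGroups are invented stand-ins, not TensorFlow code) of how a range of generator groups maps onto the output buffer:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for a Distribution with kResultElementCount == 4.
struct FakeDist {
  static constexpr int kResultElementCount = 4;
};

// Mirrors the fixed-sample indexing above: all full groups first, then a tail.
void DescribeGroups(int64_t size, int64_t start_group, int64_t limit_group) {
  const int kGroupSize = FakeDist::kResultElementCount;
  int64_t offset = start_group * kGroupSize;
  const int64_t limit_group_full = std::min(limit_group, size / kGroupSize);
  for (int64_t g = start_group; g < limit_group_full; ++g) {
    std::printf("group %lld -> outputs [%lld, %lld)\n", (long long)g,
                (long long)offset, (long long)(offset + kGroupSize));
    offset += kGroupSize;
  }
  if (limit_group_full < limit_group) {
    const int64_t remaining = size - limit_group_full * kGroupSize;
    std::printf("tail group -> outputs [%lld, %lld)\n", (long long)offset,
                (long long)(offset + remaining));
  }
}

int main() {
  // size = 10 with groups of 4: two full groups ([0,4), [4,8)) and a tail of 2.
  DescribeGroups(/*size=*/10, /*start_group=*/0, /*limit_group=*/3);
  return 0;
}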
+template +struct FillPhiloxRandomTask { + typedef typename Distribution::ResultElementType T; + static constexpr int64_t kReservedSamplesPerOutput = 256; + + static void Run(random::PhiloxRandom base_gen, T* data, int64_t size, + int64_t start_group, int64_t limit_group, Distribution dist) { + const int kGroupSize = Distribution::kResultElementCount; + + static const int kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; + + int64_t offset = start_group * kGroupSize; + + // First fill all the full-size groups + int64_t limit_group_full = std::min(limit_group, size / kGroupSize); + int64_t group_index; + for (group_index = start_group; group_index < limit_group_full; + ++group_index) { + // Reset the generator to the beginning of the output group region + // This is necessary if we want the results to be independent of order + // of work + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter single_samples(&gen); + + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + // If there are any remaining elements that need to be filled, process them + if (limit_group_full < limit_group) { + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter single_samples(&gen); + + int64_t remaining_size = size - limit_group_full * kGroupSize; + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Partial specialization for CPU to fill the entire region with randoms +// It splits the work into several tasks and run them in parallel +template +void FillPhiloxRandom::operator()( + OpKernelContext* ctx, const CPUDevice&, const uint64* key, + const uint64* counter, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64_t size, + Distribution dist) { + if (key != nullptr && counter != nullptr) { + gen = GetPhiloxRandomFromCounterKeyMem(counter, key); + } + + const int kGroupSize = Distribution::kResultElementCount; + + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + + int64_t total_group_count = (size + kGroupSize - 1) / kGroupSize; + + const int kGroupCost = + random::PhiloxRandom::kResultElementCount * + (random::PhiloxRandom::kElementCost + Distribution::kElementCost); + + Shard(worker_threads.num_threads, worker_threads.workers, total_group_count, + kGroupCost, + [&gen, data, size, dist](int64_t start_group, int64_t limit_group) { + FillPhiloxRandomTask< + Distribution, + Distribution::kVariableSamplesPerOutput>::Run(gen, data, size, + start_group, + limit_group, dist); + }); +} + +} // namespace functor + + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RANDOM_OP_CPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_gpu.h new file mode 100644 index 00000000..f8efa21d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_op_gpu.h @@ -0,0 +1,255 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_ + +#if defined(__CUDACC__) || TENSORFLOW_USE_ROCM + +#include "tensorflow/core/kernels/random_op.h" +#include "tensorflow/core/kernels/random_ops_util.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +namespace functor { + +template +struct FillPhiloxRandomKernel; + +template +struct FillPhiloxRandomKernel { + typedef typename Distribution::ResultElementType T; + PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter, + random::PhiloxRandom gen, T* data, int64 size, + Distribution dist); +}; + +template +struct FillPhiloxRandomKernel { + typedef typename Distribution::ResultElementType T; + PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter, + random::PhiloxRandom base_gen, T* data, + int64 size, Distribution dist); +}; + +template +class SampleCopier { + public: + inline __device__ void operator()( + T* __restrict__ buf, + const tensorflow::random::Array& array) const { +#pragma unroll + for (int i = 0; i < ElementCount; i++) { + buf[i] = array[i]; + } + } +}; + +template <> +class SampleCopier { + public: + // Copies the elements from the array to buf. buf must be 128-bit aligned, + // which is true for tensor data, and all offsets that are a multiple of the + // vector size (because the vectors are 128 bits long). + inline __device__ void operator()( + float* __restrict__ buf, + const tensorflow::random::Array& array) const { + // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they + // have 32-bit alignment vs 128-bit alignment. There seems to be no + // performance loss when assigning each element to a vector. + float4 vec; + vec.x = array[0]; + vec.y = array[1]; + vec.z = array[2]; + vec.w = array[3]; + float4* buf_vector = reinterpret_cast(buf); + *buf_vector = vec; + } +}; + +template <> +class SampleCopier { + public: + // Copies the elements from the array to buf. buf must be 128-bit aligned, + // which is true for tensor data, and all offsets that are a multiple of the + // vector size (because the vectors are 128 bits long). + inline __device__ void operator()( + int32* __restrict__ buf, + const tensorflow::random::Array& array) const { + ::int4 vec; + vec.x = array[0]; + vec.y = array[1]; + vec.z = array[2]; + vec.w = array[3]; + ::int4* buf_vector = reinterpret_cast<::int4*>(buf); + *buf_vector = vec; + } +}; + +template <> +class SampleCopier { + public: + // Copies the elements from the array to buf. buf must be 128-bit aligned, + // which is true for tensor data, and all offsets that are a multiple of the + // vector size (because the vectors are 128 bits long). 
+ inline __device__ void operator()( + double* __restrict__ buf, + const tensorflow::random::Array& array) const { + double2 vec; + vec.x = array[0]; + vec.y = array[1]; + double2* buf_vector = reinterpret_cast(buf); + *buf_vector = vec; + } +}; + +template <> +class SampleCopier { + public: + // Copies the elements from the array to buf. buf must be 128-bit aligned, + // which is true for tensor data, and all offsets that are a multiple of the + // vector size (because the vectors are 128 bits long). + inline __device__ void operator()( + int64* __restrict__ buf, + const tensorflow::random::Array& array) const { + longlong2 vec; + vec.x = array[0]; + vec.y = array[1]; + longlong2* buf_vector = reinterpret_cast(buf); + *buf_vector = vec; + } +}; + +// A cuda kernel to fill the data with random numbers from the specified +// distribution. Each output takes a fixed number of samples. +template +PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( + const uint64* key, const uint64* counter, random::PhiloxRandom gen, T* data, + int64 size, Distribution dist) { + const int kGroupSize = Distribution::kResultElementCount; + + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + int64 offset = thread_id * kGroupSize; + if (key != nullptr && counter != nullptr) { + gen = GetPhiloxRandomFromCounterKeyMem(counter, key); + } + gen.Skip(thread_id); + + const SampleCopier copier; + while (offset + kGroupSize <= size) { + const typename Distribution::ResultType samples = dist(&gen); + copier(&data[offset], samples); + + offset += total_thread_count * kGroupSize; + gen.Skip(total_thread_count - 1); + } + + typename Distribution::ResultType samples = dist(&gen); + for (int i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } +} + +// A cuda kernel to fill the data with random numbers from the specified +// distribution. Each output takes a variable number of samples. 
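For contrast with the variable-sample kernel that follows, the fixed-sample kernel above interleaves threads across the output: thread t starts at offset t * kGroupSize, strides by total_thread_count * kGroupSize, and skips the generator by total_thread_count - 1 after each group so the Philox counter stays aligned with the output offset. A host-side sketch (invented names, no CUDA required) of which outputs and generator groups a single thread covers:

#include <cstdint>
#include <cstdio>

// Lists the output ranges one "thread" would fill, mirroring the grid-stride
// loop of the fixed-sample kernel: one generator group per stride step.
void ThreadCoverage(int64_t size, int thread_id, int total_thread_count,
                    int group_size) {
  int64_t offset = int64_t(thread_id) * group_size;
  int64_t generator_group = thread_id;  // gen.Skip(thread_id) before the loop
  while (offset + group_size <= size) {
    std::printf("thread %d: outputs [%lld, %lld) from generator group %lld\n",
                thread_id, (long long)offset, (long long)(offset + group_size),
                (long long)generator_group);
    offset += int64_t(total_thread_count) * group_size;
    generator_group += total_thread_count;  // Skip(total_thread_count - 1) + the call itself
  }
}

int main() {
  // 32 outputs, 4 threads, 4 samples per generator call: thread 0 covers
  // [0,4) and [16,20), thread 1 covers [4,8) and [20,24), and so on.
  for (int t = 0; t < 4; ++t) ThreadCoverage(32, t, 4, 4);
  return 0;
}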
+template +PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( + const uint64* key, const uint64* counter, random::PhiloxRandom base_gen, + T* data, int64 size, Distribution dist) { + if (key != nullptr && counter != nullptr) { + base_gen = GetPhiloxRandomFromCounterKeyMem(counter, key); + } + + using random::PhiloxRandom; + using random::SingleSampleAdapter; + + const int kReservedSamplesPerOutput = 256; + const int kGroupSize = Distribution::kResultElementCount; + const int kGeneratorSkipPerOutputGroup = kGroupSize * + kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; + + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + int64 group_index = thread_id; + int64 offset = group_index * kGroupSize; + + while (offset < size) { + // Since each output takes a variable number of samples, we need to + // realign the generator to the beginning for the current output group + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter single_samples(&gen); + + typename Distribution::ResultType samples = dist(&single_samples); + + for (int i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } + + offset += (total_thread_count - 1) * kGroupSize; + group_index += total_thread_count; + } +} + +// A simple launch pad to call the correct function templates to fill the data +template +__global__ void __launch_bounds__(1024) + FillPhiloxRandomKernelLaunch(const uint64* key, const uint64* counter, + random::PhiloxRandom base_gen, + typename Distribution::ResultElementType* data, + int64 size, Distribution dist) { + FillPhiloxRandomKernel() + .Run(key, counter, base_gen, data, size, dist); +} + +// Partial specialization for GPU +template +void FillPhiloxRandom::operator()( + OpKernelContext*, const GPUDevice& d, const uint64* key, + const uint64* counter, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64 size, + Distribution dist) { + if (size == 0) return; + const int32 block_size = d.maxGpuThreadsPerBlock(); + const int32 num_blocks = + std::min( + d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), + size + block_size - 1) / + block_size; + TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch, + num_blocks, block_size, 0, d.stream(), key, + counter, gen, data, size, dist)); +} + +} // namespace functor +} // namespace tensorflow + +#endif // defined(__CUDACC__) || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_ops_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_ops_util.h new file mode 100644 index 00000000..b9904569 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_ops_util.h @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OPS_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_RANDOM_OPS_UTIL_H_ + +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +using random::PhiloxRandom; + +// The following 2 functions use the contract "lower 32 bits for the first +// uint32, higher 32 bits for the second". Note that this is endian-neutral, +// unlike a direct memory copy `memcpy(output, &input, 8)`. +PHILOX_DEVICE_INLINE void Uint64ToUint32s(uint64 input, uint32* output1, + uint32* output2) { + *output1 = static_cast(input); + *output2 = static_cast(input >> 32); +} + +PHILOX_DEVICE_INLINE uint64 Uint32sToUint64(uint32 input1, uint32 input2) { + auto u64_1 = static_cast(input1); + auto u64_2 = static_cast(input2); + return u64_1 | (u64_2 << 32); +} + +PHILOX_DEVICE_INLINE PhiloxRandom::ResultType GetCounterFromMem( + uint64 const* ptr) { + PhiloxRandom::ResultType counter; + Uint64ToUint32s(ptr[0], &counter[0], &counter[1]); + Uint64ToUint32s(ptr[1], &counter[2], &counter[3]); + return counter; +} + +PHILOX_DEVICE_INLINE void WriteCounterToMem( + PhiloxRandom::ResultType const& counter, uint64* ptr) { + ptr[0] = Uint32sToUint64(counter[0], counter[1]); + ptr[1] = Uint32sToUint64(counter[2], counter[3]); +} + +PHILOX_DEVICE_INLINE PhiloxRandom::Key GetKeyFromMem(uint64 const* ptr) { + PhiloxRandom::Key key; + Uint64ToUint32s(ptr[0], &key[0], &key[1]); + return key; +} + +PHILOX_DEVICE_INLINE void WriteKeyToMem(PhiloxRandom::Key const& key, + uint64* ptr) { + *ptr = Uint32sToUint64(key[0], key[1]); +} + +PHILOX_DEVICE_INLINE PhiloxRandom GetPhiloxRandomFromCounterKeyMem( + uint64 const* counter_ptr, uint64 const* key_ptr) { + return PhiloxRandom(GetCounterFromMem(counter_ptr), GetKeyFromMem(key_ptr)); +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RANDOM_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/random_poisson_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/random_poisson_op.h new file mode 100644 index 00000000..ca0dad4b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/random_poisson_op.h @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_ +#define TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/random/simple_philox.h" + +namespace tensorflow { + +namespace functor { + +// Generic helper functor for the Random Poisson Op. 
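Returning briefly to the packing helpers in random_ops_util.h above: the "lower 32 bits first, upper 32 bits second" contract is easiest to check with a round trip. A minimal sketch, assuming that header is reachable on the include path under its guard path tensorflow/core/kernels/random_ops_util.h:

#include <cassert>

#include "tensorflow/core/kernels/random_ops_util.h"

int main() {
  const tensorflow::uint64 x = 0x0123456789abcdefULL;
  tensorflow::uint32 lo = 0, hi = 0;
  tensorflow::Uint64ToUint32s(x, &lo, &hi);  // lower 32 bits first...
  assert(lo == 0x89abcdefu);                 // ...then the upper 32 bits,
  assert(hi == 0x01234567u);                 // regardless of host endianness.
  assert(tensorflow::Uint32sToUint64(lo, hi) == x);
  return 0;
}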
+template +struct PoissonFunctor { + void operator()(OpKernelContext* ctx, const Device& d, const T* rate_flat, + int64_t num_rate, int64_t num_samples, + const random::PhiloxRandom& rng, U* samples_flat); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/range_sampler.h b/third_party/tflite-hdrs/tensorflow/core/kernels/range_sampler.h new file mode 100644 index 00000000..c49bbcc5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/range_sampler.h @@ -0,0 +1,243 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ +#define TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ + +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/random/distribution_sampler.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/lib/random/weighted_picker.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { +using Env = tsl::Env; + +// Abstract subclass for sampling from the set of non-negative integers +// [0, range) +class RangeSampler { + public: + explicit RangeSampler(int64_t range) : range_(range) { CHECK_GT(range_, 0); } + virtual ~RangeSampler(); + + // Sample a single value + virtual int64_t Sample(random::SimplePhilox* rnd) const = 0; + + // The probability that a single call to Sample() returns the given value. + // Assumes that value is in [0, range). No range checking is done. + virtual float Probability(int64_t value) const = 0; + + // Fill "batch" with samples from the distribution. + // If unique=true, then we re-pick each element until we get a + // value distinct from all previously picked values in the batch. + void SampleBatch(random::SimplePhilox* rnd, bool unique, + absl::Span batch) const; + + // Fill "batch" with samples from the distribution, and report + // "expected counts". + // + // The "expected count" of a value is an estimate of the expected + // number of occurrences of the value in the batch returned by a + // call to this function with the given parameters. If unique=true, + // the expected count is an inclusion probability. For details on + // this estimation, see the comment to "ExpectedCountHelper" in the + // .cc file. + // + // Expected counts for the elements of the returned "batch" are reported + // in the aligned array "batch_expected_count". + // + // The user can optionally provide "extras", containing values in the range. + // The expected counts for the extras are reported in the aligned array + // "extras_expected_count". 
+ // + // "batch_expected_count" must have size equal to 0 or to the size of "batch". + // "extras" and "extras_expected_count" must have equal size. + void SampleBatchGetExpectedCount( + random::SimplePhilox* rnd, bool unique, absl::Span batch, + absl::Span batch_expected_count, absl::Span extras, + absl::Span extras_expected_count) const; + + // Same as SampleBatchGetExpectedCount (see above), but with avoided values. + // We repick to avoid all of the values in "avoided_values". + // "avoided_values" is only supported with unique=true. If + // unique=false, then avoided_values must be empty. + virtual void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, absl::Span batch, + absl::Span batch_expected_count, absl::Span extras, + absl::Span extras_expected_count, + absl::Span avoided_values) const; + + // Does this sampler need to be updated with values, e.g. UnigramSampler + virtual bool NeedsUpdates() const { return false; } + + // Updates the underlying distribution + virtual void Update(absl::Span values) { + LOG(FATAL) << "Update not supported for this sampler type."; + } + + int64_t range() { return range_; } + + protected: + const int64_t range_; +}; + +// An AllSampler only samples batches of size equal to range. +// It returns the entire range. +// It cannot sample single values. +class AllSampler : public RangeSampler { + public: + explicit AllSampler(int64_t range); + + ~AllSampler() override {} + + int64_t Sample(random::SimplePhilox* rnd) const override { + LOG(FATAL) << "Should not be called"; + return 0; + } + + float Probability(int64_t value) const override { + LOG(FATAL) << "Should not be called"; + return 0; + } + + void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, absl::Span batch, + absl::Span batch_expected_count, absl::Span extras, + absl::Span extras_expected_count, + absl::Span avoided_values) const override; +}; + +class UniformSampler : public RangeSampler { + public: + explicit UniformSampler(int64_t range); + + ~UniformSampler() override {} + + int64_t Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64_t value) const override; + + private: + const float inv_range_; +}; + +class LogUniformSampler : public RangeSampler { + public: + explicit LogUniformSampler(int64_t range); + + ~LogUniformSampler() override {} + + int64_t Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64_t value) const override; + + private: + const double log_range_; +}; + +// Thread-unsafe unigram sampler +class ThreadUnsafeUnigramSampler : public RangeSampler { + public: + explicit ThreadUnsafeUnigramSampler(int64_t range); + ~ThreadUnsafeUnigramSampler() override {} + + int64_t Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64_t value) const override; + + bool NeedsUpdates() const override { return true; } + void Update(absl::Span values) override; + + private: + random::WeightedPicker picker_; +}; + +// Thread-safe unigram sampler +class UnigramSampler : public RangeSampler { + public: + explicit UnigramSampler(int64_t range); + ~UnigramSampler() override {} + + int64_t Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64_t value) const override; + + // Overriding at a high level results in far fewer lock acquisitions. 
+ void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, absl::Span batch, + absl::Span batch_expected_count, absl::Span extras, + absl::Span extras_expected_count, + absl::Span avoided_values) const override; + + bool NeedsUpdates() const override { return true; } + void Update(absl::Span values) override; + + private: + ThreadUnsafeUnigramSampler unsafe_sampler_ TF_GUARDED_BY(mu_); + mutable mutex mu_; +}; + +// A unigram sampler that uses a fixed unigram distribution read from a +// file or passed in as an in-memory array instead of building up the +// distribution from data on the fly. There is also an option to skew the +// distribution by applying a distortion power to the weights. +class FixedUnigramSampler : public RangeSampler { + public: + FixedUnigramSampler(int64_t range, float distortion, int32_t num_reserved_ids, + int32_t num_shards, int32_t shard); + // The vocab_file is assumed to be a CSV, with the last entry of each row a + // value representing the counts or probabilities for the corresponding ID. + absl::Status SetDistributionSampler(Env* env, const string& vocab_file); + absl::Status SetDistributionSampler(const std::vector& unigrams); + float Probability(int64_t value) const override; + + int64_t Sample(random::SimplePhilox* rnd) const override; + + private: + // Underlying distribution sampler. + std::unique_ptr dist_sampler_; + // Weights for individual samples. The probability of a sample i is defined + // as weights_.at(i) / total_weight_. + std::vector weights_; + // The total weights of all samples. + float total_weight_; + // Sharding information of the sampler. The whole vocabulary is sharded + // into num_shards_ smaller ranges and each sampler is responsible for one + // such smaller range, identified by the shard number. + int32 num_shards_; + int32 shard_; + float distortion_; + // Fill the sampler with the appropriate number of reserved IDs. + void FillReservedIds(int32_t num_reserved_ids); + // Load IDs to sample from a CSV file. It is assumed that the last item of + // each row contains a count or probability for the corresponding ID. + absl::Status LoadFromFile(Env* env, const string& vocab_file, + float distortion); + // Load from an in-memory array. + void LoadFromUnigrams(const std::vector& unigrams, float distortion); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/record_yielder.h b/third_party/tflite-hdrs/tensorflow/core/kernels/record_yielder.h new file mode 100644 index 00000000..7e4c0f5a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/record_yielder.h @@ -0,0 +1,160 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_ +#define TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { + +// RecordYielder produces value records from a set of tfrecord files +// in a random order. +// +// It guarantees that: +// 1) all records in tfrecords are yielded within every epoch; +// 2) each record is yielded only once within every epoch; +// 3) the order in which records are yielded is highly randomized. +// 4) the peak memory usage is roughly avg record size * +// (opts.bufsize + opts.parallelism * 16). +// +// Usage example: +// RecordYielder::Options opts; +// opts.file_pattern = "input-*"; +// opts.seed = 301; +// opts.bufsize = 1000000; // A randomized buffer with 1M records. +// opts.parallelism = 8; // Uses 8 tfrecord iterators to iterate +// // through all files. +// RecordYielder yielder(opts); +// string val; +// while (true) { +// yielder.YieldOne(&val); +// // process val +// } +// +// RecordYielder can be accessed by multiple threads concurrently. +class RecordYielder { + public: + struct Options { + // Glob pattern for tfrecords. + string file_pattern; + + // Random seed. It determines how data files are shuffled and how + // records are shuffled. + int64_t seed = 0; + + // Each epoch, all files are first shuffled according to the + // random seed and the epoch number, and then all files are + // left-shifted by file_shuffle_shift_ratio * num_files slots. If + // file_shuffle_shift_ratio is not within [0, 1), the + // implementation clip it to [0, 1). + float file_shuffle_shift_ratio = 0; + + // Randomization buffer keeps these many records. + uint64 bufsize = 1; + + // Uses these many concurrent tfrecord iterators to iterate through + // tfrecords. + int32 parallelism = 1; + + string compression_type; + }; + + explicit RecordYielder(OpKernelConstruction* context, + const RecordYielder::Options& opts); + ~RecordYielder(); + + RecordYielder(const RecordYielder&) = delete; + RecordYielder& operator=(const RecordYielder&) = delete; + + // Yields one 'value'. + absl::Status YieldOne(tstring* value); + + // Returns the current epoch number. + int64_t current_epoch() const { return epoch_; } + + private: + typedef RecordYielder ME; + + Options opts_; + + // Backgrounds threads. Owned. + thread::ThreadPool* thread_; + + // Epoch number. + std::atomic epoch_; + + mutex mu_; + + // Turned to true when this is deleted. + bool stop_ TF_GUARDED_BY(mu_) = false; + absl::Status status_ TF_GUARDED_BY(mu_); + + // PRG used for randomization. + std::mt19937_64 rnd_ TF_GUARDED_BY(mu_); + + // Randomization buffer. + std::vector buf_ TF_GUARDED_BY(mu_); + + // True iff we are draining an epoch. + bool epoch_end_ = false; + + int64_t num_records_added_in_epoch_ = 0; + int64_t num_records_yielded_in_epoch_ = 0; + + // Trigger when the main loop has exited. + Notification main_loop_done_; + + // condition_variables. 
+ condition_variable buf_empty_; + bool BufEmpty() const TF_SHARED_LOCKS_REQUIRED(mu_) { + return stop_ || buf_.empty(); + } + + condition_variable buf_not_full_; + bool BufNotFull() const TF_SHARED_LOCKS_REQUIRED(mu_) { + return stop_ || buf_.size() < opts_.bufsize; + } + + condition_variable buf_enough_; + bool BufEnough() const TF_SHARED_LOCKS_REQUIRED(mu_) { + // NOTE: Unless we are finishing an epoch, we want to make sure + // the buf_ contains enough randomized elements before yielding + // any. + return stop_ || !status_.ok() || (epoch_end_ && !buf_.empty()) || + (!epoch_end_ && + buf_.size() >= std::max(1, opts_.bufsize / 2)); + } + + void MainLoop(); + struct Shard; + void ShardLoop(Shard* shard); + bool ShouldFinish(const absl::Status& s); + bool Add(std::vector* values); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_gpu_kernels.cu.h new file mode 100644 index 00000000..a82e6c47 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -0,0 +1,1412 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_ +#define TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/util/gpu_device_functions.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/permutation_input_iterator.h" +#include "tensorflow/core/util/transform_output_iterator.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +struct SqrtOfReal { + __host__ __device__ T operator()(const T& a) const { + return T(Eigen::numext::sqrt(Eigen::numext::real(a))); + } +}; + +template +struct Sum { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a + b; + } +}; + +template +struct Prod { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; + +template +struct Square { + __host__ __device__ T operator()(const T& a) const { + return Prod()(a, Eigen::numext::conj(a)); + } +}; + +template +struct DividesBy { + T divisor; + + __host__ __device__ explicit DividesBy(T divisor) : divisor(divisor) {} + + __host__ __device__ OUT_T operator()(const T& x) const { return x / divisor; } +}; + +struct MaxPropagateNaN { + template + __host__ __device__ inline T operator()(const T& a, const T& b) const { + return (a != a ? a : (a > b ? 
a : b)); + } +}; + +struct MinPropagateNaN { + template + __host__ __device__ inline T operator()(const T& a, const T& b) const { + return (a != a ? a : (a < b ? a : b)); + } +}; + +#if GOOGLE_CUDA +// TODO(rocm) : enable this once ROCm platform has support for complex datatypes +// +// needed to work around a compiler bug in nvcc - it doesn't seem to like +// the overloaded ops for std::complex +template <> +struct DividesBy> { + cuFloatComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdivf(make_cuComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; + +template <> +struct DividesBy> { + cuDoubleComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuDoubleComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdiv(make_cuDoubleComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; +#endif // GOOGLE_CUDA + +template +struct DividesBy { + float divisor; + + __host__ __device__ explicit DividesBy(float divisor) : divisor(divisor) {} + + __host__ __device__ T operator()(const float& x) const { + return T(x / divisor); + } +}; + +template +struct HalfToFloat { + __host__ __device__ float operator()(const T& x) const { + return static_cast(x); + } +}; + +template +struct FloatToHalf { + __host__ __device__ T operator()(const float& x) const { + return static_cast(x); + } +}; + +struct And { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a && b; + } +}; + +struct Or { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a || b; + } +}; + +// each block does a grid strided loop and reduces its values locally +// the case of one block is used for low latency small reductions to scalars +template +__global__ __launch_bounds__(1024) void BlockReduceKernel( + T in, OUT_T out, int num_elems, Op op, + typename std::iterator_traits::value_type initVal) { + const int bid = blockIdx.x; + const int tid = threadIdx.x; + + const int gid = bid * blockDim.x + tid; + const int stride = blockDim.x * gridDim.x; + + typedef typename std::iterator_traits::value_type value_type; + + value_type sum = initVal; + if (gid < num_elems) { + sum = in[gid]; + for (int pos = gid + stride; pos < num_elems; pos += stride) { + sum = op(sum, in[pos]); + } + } + + typedef gpuprim::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + // only include input values in the reduction + // + // elements: ----------------- + // grid: |====|====|====|====|====| + const int num_elements_to_reduce = + max(min(static_cast(num_elems - bid * blockDim.x), num_threads), 0); + + sum = BlockReduce(temp_storage).Reduce(sum, op, num_elements_to_reduce); + + if (tid == 0) out[bid] = sum; +} + +// maps a warp to each row +template +__global__ __launch_bounds__(1024) void RowReduceKernel( + T in, OUT_T out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + // Defensive index computation to avoid integer overflow. 
+ assert(blockDim.x % TF_RED_WARPSIZE == 0); + int warps_per_block = blockDim.x / TF_RED_WARPSIZE; + int warp_index = threadIdx.x / TF_RED_WARPSIZE; + const int row = blockIdx.x * warps_per_block + warp_index; + const int lane = threadIdx.x % TF_RED_WARPSIZE; + + if (num_cols == 1) { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + if (gid < num_rows) out[gid] = in[gid]; + return; + } + + value_type sum = initVal; + int col = lane; + + if (row < num_rows && col < num_cols) { + sum = in[row * num_cols + col]; + col += TF_RED_WARPSIZE; + for (; col < num_cols; col += TF_RED_WARPSIZE) { + sum = op(sum, in[row * num_cols + col]); + } + } + + typedef gpuprim::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + sum = + WarpReduce(temp_storage).Reduce(sum, op, min(num_cols, TF_RED_WARPSIZE)); + + if (row < num_rows && lane == 0) out[row] = sum; +} + +template +struct storage_type { + T1 val; + __host__ __device__ storage_type() {} + __host__ __device__ operator T1() { return val; } + __host__ __device__ storage_type& operator=(const T1& in) { + val = in; + return *this; + } +}; + +template +struct storage_type> { + T2 real; + T2 imag; + __host__ __device__ storage_type() {} + __host__ __device__ operator std::complex() { + return std::complex(real, imag); + } + __host__ __device__ storage_type>& operator=( + const std::complex& in) { + real = in.real(); + imag = in.imag(); + return *this; + } +}; + +// Works only if there are <= 16 columns +// each warps sums over multiple rows at once +template +__global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel( + T in, OUT_T out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int rows_per_warp = TF_RED_WARPSIZE / num_cols; + + const int lane = threadIdx.x % TF_RED_WARPSIZE; + const int lane_row = lane / num_cols; + + const int start_row_warp = + rows_per_warp * (blockIdx.y * blockDim.y + threadIdx.y); + const int start_row_lane = start_row_warp + lane_row; + int row = start_row_lane; + int col = lane % num_cols; + + value_type sum = initVal; + if (row * num_cols + col < num_rows * num_cols) + sum = in[row * num_cols + col]; + + // 1D array necessary due to bug in CUDA 9 compiler. + // TODO(nluehr) revert to 2D array when compiler is ready. 
+ // This is to mimic the following, but without any constructors: + // __shared__ storage_type partial_sums[TF_RED_WARPSIZE * + // (TF_RED_WARPSIZE+1)]; +#if GOOGLE_CUDA + __shared__ __align__(alignof(value_type)) char + partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) * + sizeof(value_type)]; + value_type* partial_sums = reinterpret_cast(partial_sums_raw); +#elif TENSORFLOW_USE_ROCM + __shared__ storage_type + partial_sums[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1)]; +#endif + + row += rows_per_warp * gridDim.y * blockDim.y; + for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) { + int global_pos = row * num_cols + col; + if (global_pos < (num_rows * num_cols)) + sum = op(sum, in[row * num_cols + col]); + } + + const int rows_in_this_warp = min(rows_per_warp, num_rows - start_row_warp); + // not the most efficient way to do this sum + for (int i = 1; i < rows_in_this_warp; ++i) { + value_type tmp = gpuprim::ShuffleIndex( + sum, static_cast(threadIdx.x + i * num_cols), 0xffffffff); + if (lane < num_cols) sum = op(sum, tmp); + } + + if (lane < num_cols) + partial_sums[lane * (TF_RED_WARPSIZE + 1) + threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < num_cols) { + value_type s = partial_sums[threadIdx.x * (TF_RED_WARPSIZE + 1)]; + + if (blockDim.y > 1) { + for (int row = 1; row < blockDim.y; ++row) { + value_type t = partial_sums[threadIdx.x * (TF_RED_WARPSIZE + 1) + row]; + s = op(s, t); + } + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// Maps each block to a column range TF_RED_WARPSIZE wide +template +__global__ __launch_bounds__(1024) void ColumnReduceKernel( + T in, OUT_T out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * TF_RED_WARPSIZE + threadIdx.x; + + value_type sum = initVal; + if (row < num_rows && col < num_cols) sum = in[row * num_cols + col]; + + // 1D array necessary due to bug in CUDA 9 compiler. + // TODO(nluehr) revert to 2D array when compiler is ready. 
+ // This is to mimic the following, but without constructors: + // __shared__ storage_type partial_sums[TF_RED_WARPSIZE * + // (TF_RED_WARPSIZE + 1)]; +#if GOOGLE_CUDA + __shared__ __align__(alignof(value_type)) char + partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) * + sizeof(value_type)]; + value_type* partial_sums = reinterpret_cast(partial_sums_raw); +#elif TENSORFLOW_USE_ROCM + __shared__ storage_type + partial_sums[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1)]; +#endif + + row += gridDim.y * blockDim.y; + + if (col < num_cols) { + for (; row < num_rows; row += gridDim.y * blockDim.y) { + sum = op(sum, in[row * num_cols + col]); + } + } + + partial_sums[threadIdx.x * (TF_RED_WARPSIZE + 1) + threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && col < num_cols) { + value_type s = partial_sums[threadIdx.x * (TF_RED_WARPSIZE + 1)]; + + // only include input values in the reduction + // elem block_rows + // - = + // - = + // # # block boundary + // - = + // - = + // # # block boundary + // - = + // = + const int numRowsThisBlock = + min(static_cast(blockDim.y), num_rows - blockIdx.y * blockDim.y); + + for (int row = 1; row < numRowsThisBlock; ++row) { + value_type t = partial_sums[threadIdx.x * (TF_RED_WARPSIZE + 1) + row]; + s = op(s, t); + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// does multiple warp size segmented reductions in parallel +// segments cannot cross warp boundaries (mainly used for reducing the segments +// that come from the Max16Columns column reduction kernel) +template +__global__ __launch_bounds__(1024) void CleanupSegments( + T partial_sums, OUT_T out, int num_rows, int num_cols, int segment_size, + Op op, typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + value_type val = initVal; + if (tid < segment_size * num_cols) val = partial_sums[tid]; + + typedef gpuprim::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + const bool head_flag = (threadIdx.x % segment_size) == 0; + value_type sum = + WarpReduce(temp_storage).HeadSegmentedReduce(val, head_flag, op); + + if (head_flag && tid < segment_size * num_cols) { + out[tid / segment_size] = sum; + } +} + +// assigns one thread to a column +template +__global__ __launch_bounds__(1024) void ColumnReduceSimpleKernel( + T in, OUT_T out, int num_planes, int num_rows, int num_cols, Op op) { + typedef typename std::iterator_traits::value_type value_type; + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + const int elems_per_plane = num_rows * num_cols; + + const int plane = gid / num_cols; + const int col = gid % num_cols; + + if (plane >= num_planes) return; + + if (num_rows == 1) { + out[plane * elems_per_plane + col] = in[plane * elems_per_plane + col]; + return; + } + + value_type sum = op(in[plane * elems_per_plane + col], + in[plane * elems_per_plane + num_cols + col]); + for (int row = 2; row < num_rows; ++row) { + sum = op(sum, in[plane * elems_per_plane + row * num_cols + col]); + } + + out[plane * num_cols + col] = sum; +} + +namespace { +constexpr int kUnroll = 8; +} + +template +__device__ __inline__ T ComputeSum(IN_T in_, const int plane, + const int num_out_rows, int num_rows, + int num_cols, const int col, Op op) { + const int out_rows = num_rows / (2 * kUnroll); + const int num_rem_rows = num_rows % (2 * kUnroll); + const int elems_per_plane = num_rows * num_cols; + T reg[2 * kUnroll]; + T sum; + int offset = 
0; + if (out_rows != 0) { + for (int i = 0; i < 2 * kUnroll; i++) { + reg[i] = + in_[plane * elems_per_plane + i * (num_out_rows * num_cols) + col]; + } + sum = reg[0]; + for (int i = 1; i < 2 * kUnroll; i++) { + sum = op(sum, reg[i]); + } + offset = 2 * kUnroll * (num_out_rows * num_cols); + } + + if (col < num_cols && num_rem_rows > 0) { + reg[0] = in_[plane * elems_per_plane + offset + 0 * num_cols + col]; + if (out_rows != 0) { + sum = op(sum, reg[0]); + } else { + sum = reg[0]; + } + for (int i = 1; i < num_rem_rows; i++) { + reg[0] = in_[plane * elems_per_plane + offset + i * num_cols + col]; + sum = op(sum, reg[0]); + } + } + return sum; +} + +template +__global__ __launch_bounds__(1024) void ColumnReduceInToTempKernel( + void* __restrict__ temp, int temp_in_offset, int temp_out_offset, IN_T in, + int num_planes, int num_rows, int num_cols, Op op) { + typedef typename std::iterator_traits::value_type value_type; + + value_type* t = (value_type*)temp; + value_type* out_ = t + temp_out_offset; + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + const int num_out_rows = max(1, num_rows / (2 * kUnroll)); + const int plane = gid / (num_out_rows * num_cols); + const int col = gid % (num_out_rows * num_cols); + + if (plane >= num_planes) return; + + value_type sum; + if (temp_in_offset == -1) { + auto in_ = in; + sum = ComputeSum(in_, plane, num_out_rows, num_rows, + num_cols, col, op); + } else { + auto in_ = t + temp_in_offset; + sum = ComputeSum(in_, plane, num_out_rows, + num_rows, num_cols, col, op); + } + out_[plane * num_out_rows * num_cols + col] = sum; +} + +template +__global__ __launch_bounds__(1024) void ColumnReduceTempToOutKernel( + void* __restrict__ temp, int temp_in_offset, T in, OUT_T out, + int num_planes, int num_rows, int num_cols, Op op) { + typedef typename std::iterator_traits::value_type value_type; + value_type* t = (value_type*)temp; + const int tid = threadIdx.x; + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int elems_per_plane = num_rows * num_cols; + + if (num_rows == 1) { + if (gid >= num_planes * num_cols) return; + if (temp_in_offset == -1) { + auto in_ = in; + out[gid] = in_[gid]; + } else { + auto in_ = t + temp_in_offset; + out[gid] = in_[gid]; + } + return; + } + + const int planes_per_block = 1; + const int plane = blockIdx.x * planes_per_block + tid / elems_per_plane; + // A thread block contains one or multiple plane(s), + // i.e. 
num_rows * num_cols <= blockDim.x + const int col = tid % elems_per_plane; + const int local_plane = plane % planes_per_block; + + if (tid >= planes_per_block * elems_per_plane || plane >= num_planes) return; + + GPU_DYNAMIC_SHARED_MEM_DECL(8, char, ss); + value_type* const smem = reinterpret_cast(ss); + + if (temp_in_offset == -1) { + auto in_ = in; + smem[local_plane * elems_per_plane + col] = + in_[plane * elems_per_plane + col]; + } else { + auto in_ = t + temp_in_offset; + smem[local_plane * elems_per_plane + col] = + in_[plane * elems_per_plane + col]; + } + __syncthreads(); + + int num_in_rows = num_rows; + int num_out_rows; + int num_rem_rows; + + int in_offset = 0; + int out_offset = blockDim.x; + + int in_elems_per_plane = elems_per_plane; + int out_elems_per_plane; + + while (num_in_rows > 1) { + num_out_rows = num_in_rows / 2; + num_rem_rows = num_in_rows % 2; + out_elems_per_plane = num_out_rows * num_cols; + + if (col < out_elems_per_plane) { + value_type sum; + sum = op(smem[in_offset + local_plane * in_elems_per_plane + col], + smem[in_offset + local_plane * in_elems_per_plane + + out_elems_per_plane + col]); + if (num_rem_rows == 1 && col < num_cols) { + sum = op(sum, smem[in_offset + local_plane * in_elems_per_plane + + 2 * out_elems_per_plane + col]); + } + smem[out_offset + local_plane * out_elems_per_plane + col] = sum; + } + + num_in_rows = num_out_rows; + in_elems_per_plane = out_elems_per_plane; + int t_offset = in_offset; + in_offset = out_offset; + out_offset = t_offset; + __syncthreads(); + } + + if (col < num_cols) { + out[plane * num_cols + col] = + smem[in_offset + local_plane * out_elems_per_plane + col]; + } +} + +struct RowOffset { + __host__ __device__ explicit RowOffset(const int& cols) : cols_(cols) {} + + __host__ __device__ int operator()(const int& x) const { return cols_ * x; } + + int cols_; +}; + +struct GatherOp { + __host__ __device__ GatherOp(const int& extent_x, const int& extent_y, + const int& extent_z, bool kOne) + : extent_x_(extent_x), + extent_y_(extent_y), + extent_z_(extent_z), + kOne_(kOne) { + if (kOne_) + group_size_ = extent_y_; + else + group_size_ = extent_x_ * extent_z_; + } + + __host__ __device__ int operator()(const int& ind) const { + const int group = kOne_ ? ind / group_size_ : ind % group_size_; + const int offset = kOne_ ? 
ind % group_size_ : ind / group_size_; + + const int x = group / extent_z_; + const int z = group % extent_z_; + + return x * extent_y_ * extent_z_ + z + offset * extent_z_; + } + + int extent_x_; + int extent_y_; + int extent_z_; + bool kOne_; + int group_size_; +}; + +template +void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int in_size, Op op, T init, + const gpuStream_t& cu_stream) { + // handle situations where low latency is important better than CUB + if (in_size <= 4096) { + const int num_blocks = 1; + const int num_threads = 256; + TF_CHECK_OK(GpuLaunchKernel(BlockReduceKernel, + num_blocks, num_threads, 0, cu_stream, in, out, + in_size, op, init)); + return; + } else if (in_size <= 1 << 18) { + const int num_threads = 256; + const int num_blocks = + std::min(TF_RED_WARPSIZE, Eigen::divup(in_size, num_threads)); + // it seems like tailoring this to the GPU + // would be more effective, but all attempts + // at making this a multiple of the number of + // multiprocessors have lead to lower perf + // in general + // TODO(eriche) investigate this more + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, + TensorShape({static_cast(num_blocks * sizeof(T))}), + &temp_storage)); + + TF_CHECK_OK(GpuLaunchKernel(BlockReduceKernel, + num_blocks, num_threads, 0, cu_stream, in, + (T*)temp_storage.flat().data(), in_size, + op, init)); + + // take care that we only reduce blocks that had some valid elements in them + // TODO(eriche): CUB currently has a bug in HeadSegmentedReduce that + // requires it to be used with a full warp. Can reduce TF_RED_WARPSIZE -> + // num_blocks when this is fixed. + TF_CHECK_OK(GpuLaunchKernel(CleanupSegments, 1, + TF_RED_WARPSIZE, 0, cu_stream, + (T*)temp_storage.flat().data(), out, 1, + 1, num_blocks, op, init)); + return; + } + + size_t temp_storage_bytes = 0; + auto reduce = [&](void* temp_storage_ptr) { + auto success = + gpuprim::DeviceReduce::Reduce(temp_storage_ptr, temp_storage_bytes, in, + out, in_size, op, init, cu_stream); + + OP_REQUIRES( + ctx, success == 0, + errors::Internal("CUB reduce error ", GpuGetErrorString(success))); + }; + + reduce(nullptr); // Get required amount of temp storage. + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + reduce(temp_storage.flat().data()); // Do reduction. 
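LaunchScalarReduction above invokes the reduce lambda twice: first with a null workspace pointer so CUB only reports temp_storage_bytes, then again with the allocated buffer. A minimal sketch of the same idiom against plain CUB and cudaMalloc, assuming the stock cub::DeviceReduce::Sum signature; TensorFlow instead obtains the scratch space through allocate_temp on the op kernel context.

// Editor's sketch (hypothetical standalone helper) of the two-phase
// CUB temporary-storage pattern used above.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

float SumOnDevice(const float* d_in, int n) {
  float* d_out = nullptr;
  cudaMalloc(&d_out, sizeof(float));

  // Phase 1: null workspace, CUB only fills in the required byte count.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  // Phase 2: allocate the workspace and run the real reduction.
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  float result = 0.f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_temp);
  cudaFree(d_out);
  return result;
}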
+} + +template +void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows, + int num_cols, Op op, T init, + const gpuStream_t& cu_stream) { + if (num_cols < 1024) { + const int threads_per_block = 128; + const int warps_per_block = threads_per_block / TF_RED_WARPSIZE; + int num_blocks = (num_rows + warps_per_block - 1) / warps_per_block; + + TF_CHECK_OK(GpuLaunchKernel(RowReduceKernel, num_blocks, + threads_per_block, 0, cu_stream, in, out, + num_rows, num_cols, op, init)); + return; + } + + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(num_cols); + gpuprim::CountingInputIterator counting_iter(0); + gpuprim::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + size_t temp_storage_bytes = 0; + auto reduce = [&](void* temp_storage_ptr) { + auto success = gpuprim::DeviceSegmentedReduce::Reduce( + temp_storage_ptr, temp_storage_bytes, in, out, num_rows, transform_iter, + transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + GpuGetErrorString(success))); + }; + + reduce(nullptr); // Get required amount of temp storage. + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + reduce(temp_storage.flat().data()); // Do reduction. +} + +template +void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const gpuStream_t& cu_stream) { + int rows_per_warp = TF_RED_WARPSIZE / extent_y; + dim3 block_dim( + TF_RED_WARPSIZE, + std::min(Eigen::divup(extent_x, rows_per_warp), (1024 / TF_RED_WARPSIZE)), + 1); + dim3 grid_dim(1, + Eigen::divup(static_cast(extent_x), + rows_per_warp * block_dim.y), + 1); + + grid_dim.y = std::min((int)grid_dim.y, TF_RED_WARPSIZE); + + if (grid_dim.y > 2 && grid_dim.y < TF_RED_WARPSIZE) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + TF_CHECK_OK(GpuLaunchKernel(ColumnReduceMax16ColumnsKernel, + grid_dim, block_dim, 0, cu_stream, in, out, + extent_x, extent_y, op, init)); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + TF_CHECK_OK(GpuLaunchKernel(ColumnReduceMax16ColumnsKernel, + grid_dim, block_dim, 0, cu_stream, in, + (T*)temp_storage.flat().data(), + extent_x, extent_y, op, init)); + + dim3 new_grid_dim( + (grid_dim.y * extent_y + (TF_RED_WARPSIZE - 1)) / TF_RED_WARPSIZE, 1, + 1); + dim3 num_threads(128, 1, 1); + TF_CHECK_OK(GpuLaunchKernel(CleanupSegments, new_grid_dim, + num_threads, 0, cu_stream, + (T*)temp_storage.flat().data(), out, + extent_x, extent_y, grid_dim.y, op, init)); + } +} + +template +void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, + T init, const gpuStream_t& cu_stream) { + dim3 block_dim(TF_RED_WARPSIZE, std::min(extent_x, (1024 / TF_RED_WARPSIZE)), + 1); + dim3 grid_dim((extent_y + (TF_RED_WARPSIZE - 1)) / TF_RED_WARPSIZE, 1, 1); + + if (grid_dim.x < 16) + grid_dim.y = std::min((extent_x + (TF_RED_WARPSIZE - 1)) / TF_RED_WARPSIZE, + TF_RED_WARPSIZE); + + if (grid_dim.y > 2 && grid_dim.y < TF_RED_WARPSIZE) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + TF_CHECK_OK(GpuLaunchKernel(ColumnReduceKernel, grid_dim, + block_dim, 0, cu_stream, in, out, 
extent_x, + extent_y, op, init)); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + + TF_CHECK_OK(GpuLaunchKernel( + ColumnReduceKernel, grid_dim, block_dim, 0, cu_stream, in, + (T*)temp_storage.flat().data(), extent_x, extent_y, op, init)); + + dim3 new_grid_dim( + (grid_dim.y * extent_y + (TF_RED_WARPSIZE - 1)) / TF_RED_WARPSIZE, 1, + 1); + TF_CHECK_OK(GpuLaunchKernel(CleanupSegments, new_grid_dim, + block_dim, 0, cu_stream, + (T*)temp_storage.flat().data(), out, + extent_x, extent_y, grid_dim.y, op, init)); + } +} + +template +void LaunchColumnReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const gpuStream_t& cu_stream) { + if (extent_y <= 16) { + LaunchColumnReduction_LTE16Cols(ctx, out, in, extent_x, extent_y, op, init, + cu_stream); + } else if (extent_y <= 4096) { + LaunchColumnReduction_LTE4096Cols(ctx, out, in, extent_x, extent_y, op, + init, cu_stream); + } else { + int threads_per_block = 128; + int num_blocks = Eigen::divup(extent_y, threads_per_block); + + TF_CHECK_OK(GpuLaunchKernel(ColumnReduceSimpleKernel, + num_blocks, threads_per_block, 0, cu_stream, in, + out, 1, extent_x, extent_y, op)); + } +} + +template +void Launch3DYReductionSimple(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, int extent_z, Op op, + T init, const gpuStream_t& cu_stream) { + int threads_per_block = 128; + int num_blocks = + (extent_x * extent_z + threads_per_block - 1) / threads_per_block; + + // TODO(eriche): this won't be very good in the case of small x + // small z and large y. + TF_CHECK_OK(GpuLaunchKernel(ColumnReduceSimpleKernel, + num_blocks, threads_per_block, 0, cu_stream, in, + out, extent_x, extent_y, extent_z, op)); +} + +template +void Launch3DYReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const gpuStream_t& cu_stream) { + int threads_per_block = 128; + + int n_group_in = extent_y; + int n_size = extent_z; + + // Calculate and allocate temporary space + std::size_t temp_storage_bytes = 0; + // A plane's size is n_group_in * n_size. We make sure no single plane crosses + // more than one thread block, meaning a thread block will handle one whole + // plane or multiple planes in the second stage. Also, It may handle a partial + // plane when n_size is too large and the while-loop will stop at + // n_group_in = 1, where we directly copy the temp to output in the next + // stage. 
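The staged kernels driven by the loop that follows repeatedly halve the middle dimension through a temporary buffer. For reference, the result they compute can be written as one naive CPU loop over an input viewed as [extent_x, extent_y, extent_z] reduced over its middle (Y) axis; a sketch under my own naming, not TensorFlow's.

// Editor's reference sketch of the semantics of the staged Y-axis reduction.
#include <vector>

template <typename T, typename Op>
std::vector<T> ReduceMiddleAxis(const std::vector<T>& in, int extent_x,
                                int extent_y, int extent_z, Op op, T init) {
  std::vector<T> out(static_cast<size_t>(extent_x) * extent_z, init);
  for (int x = 0; x < extent_x; ++x)
    for (int y = 0; y < extent_y; ++y)
      for (int z = 0; z < extent_z; ++z)
        out[x * extent_z + z] = op(out[x * extent_z + z],
                                   in[(x * extent_y + y) * extent_z + z]);
  return out;
}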
+ while (n_group_in >= 2 && n_group_in * n_size > threads_per_block) { + int n_group_out = std::max(1, n_group_in / (2 * kUnroll)); + temp_storage_bytes += n_group_out * n_size; + n_group_in = n_group_out; + } + temp_storage_bytes *= extent_x * sizeof(T); + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + // Reduction + n_group_in = extent_y; + int temp_in_offset = -1; + int temp_out_offset = 0; + int num_blocks; + while (n_group_in >= 2 && n_group_in * n_size > threads_per_block) { + int n_group_out = std::max(1, n_group_in / (2 * kUnroll)); + num_blocks = + Eigen::divup(extent_x * n_group_out * n_size, threads_per_block); + TF_CHECK_OK(GpuLaunchKernel( + ColumnReduceInToTempKernel, num_blocks, threads_per_block, 0, + cu_stream, (void*)(temp_storage.flat().data()), temp_in_offset, + temp_out_offset, in, extent_x, n_group_in, extent_z, op)); + + n_group_in = n_group_out; + temp_in_offset = temp_out_offset; + temp_out_offset = temp_in_offset + extent_x * n_group_out * n_size; + } + + if (n_group_in * n_size <= threads_per_block) { + num_blocks = extent_x; + } else { + DCHECK_EQ(1, n_group_in); + num_blocks = Eigen::divup(extent_x * n_size, threads_per_block); + } + + TF_CHECK_OK(GpuLaunchKernel( + ColumnReduceTempToOutKernel, num_blocks, + threads_per_block, 2 * sizeof(T) * threads_per_block, cu_stream, + (void*)(temp_storage.flat().data()), temp_in_offset, in, out, + extent_x, n_group_in, extent_z, op)); +} + +template +void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const gpuStream_t& cu_stream) { + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(extent_x * extent_z); + gpuprim::CountingInputIterator counting_iter(0); + gpuprim::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + GatherOp gather_op(extent_x, extent_y, extent_z, false); + typedef gpuprim::TransformInputIterator> + gatherIterType; + gatherIterType gather_iter(counting_iter, gather_op); + + PermutationInputIterator permute_iter(in, + gather_iter); + + std::size_t temp_storage_bytes = 0; + auto reduce = [&](void* temp_storage_ptr) { + auto success = gpuprim::DeviceSegmentedReduce::Reduce( + temp_storage_ptr, temp_storage_bytes, permute_iter, out, extent_y, + transform_iter, transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + GpuGetErrorString(success))); + }; + + reduce(nullptr); // Get required amount of temp storage. + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + reduce(temp_storage.flat().data()); // Do reduction. 
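The GatherOp / PermutationInputIterator combination in Launch3DXZReduction turns the axes-{0,2} reduction into a segmented reduction: with kOne == false, every element sharing a Y index is mapped into one contiguous segment of length extent_x * extent_z. A small standalone CPU check of that index mapping; the program and its names are mine.

// Editor's sketch: summing each segment produced by the GatherOp mapping
// reproduces a reduction of a [X, Y, Z] tensor over axes 0 and 2.
#include <cstdio>
#include <vector>

int main() {
  const int X = 2, Y = 3, Z = 4;
  std::vector<float> in(X * Y * Z);
  for (size_t i = 0; i < in.size(); ++i) in[i] = static_cast<float>(i);

  const int group_size = X * Z;  // the kOne == false path of GatherOp
  std::vector<float> out(Y, 0.f);
  for (int ind = 0; ind < X * Y * Z; ++ind) {
    const int group = ind % group_size;   // enumerates the (x, z) pairs
    const int offset = ind / group_size;  // the Y index, i.e. the segment id
    const int x = group / Z;
    const int z = group % Z;
    // Same address arithmetic as GatherOp: x*Y*Z + offset*Z + z.
    out[offset] += in[x * Y * Z + offset * Z + z];
  }
  for (int y = 0; y < Y; ++y) std::printf("out[%d] = %f\n", y, out[y]);
}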
+} + +namespace reduction_op_helper { + +template +struct IsSum { + constexpr static bool value = + (std::is_same::value || + std::is_same>::value || + std::is_same>::value); +}; + +template +struct IsMax { + constexpr static bool value = + (std::is_same::value || + std::is_same::value || + std::is_same< + Op, Eigen::internal::MaxReducer>::value); +}; + +template +struct IsMin { + constexpr static bool value = + (std::is_same::value || + std::is_same::value || + std::is_same< + Op, Eigen::internal::MinReducer>::value); +}; + +template +struct IsProd { + constexpr static bool value = + (std::is_same>::value || + std::is_same>::value); +}; + +template +struct IdentityValue { + static_assert(IsSum::value || IsMax::value || + IsMin::value || IsProd::value || + std::is_same::value || std::is_same::value, + "IdentityValue not yet defined for this type"); + + template + U operator()( + typename std::enable_if::value, U>::type t = U(0)) { + return t; + } + + template + U operator()(typename std::enable_if::value, U>::type t = + Eigen::NumTraits::lowest()) { + return t; + } + + template + U operator()(typename std::enable_if::value, U>::type t = + Eigen::NumTraits::highest()) { + return t; + } + + template + U operator()( + typename std::enable_if::value, U>::type t = U(1)) { + return t; + } + + template + U operator()(typename std::enable_if::value, + bool>::type t = true) { + return t; + } + + template + U operator()(typename std::enable_if::value, + bool>::type t = false) { + return t; + } +}; + +} // namespace reduction_op_helper + +template +void ReduceImpl(OpKernelContext* ctx, OUT_T out, IN_T in, int in_rank, + int in_dim0, int in_dim1, int in_dim2, int out_rank, + const ReductionAxes& reduction_axes, Op op) { + T init = reduction_op_helper::IdentityValue()(); + const gpuStream_t& cu_stream = GetGpuStream(ctx); + if (out_rank == 0) { + const int in_size = in_dim0 * in_dim1 * in_dim2; + LaunchScalarReduction(ctx, out, in, in_size, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 1) { // row reduction + LaunchRowReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 0) { // column reduction + LaunchColumnReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 3 && out_rank == 2 && reduction_axes[0] == 1) { + int elems_per_thread = in_dim1 / (in_dim0 * in_dim2); + if (elems_per_thread >= 16) { + Launch3DYReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else { + Launch3DYReductionSimple(ctx, out, in, in_dim0, in_dim1, in_dim2, op, + init, cu_stream); + } + } else if (in_rank == 3 && out_rank == 1 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) { + Launch3DXZReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else { + std::stringstream ss; + ss << "Invalid reduction requested: in_rank, out_rank, axes " << in_rank + << " " << out_rank; + if (out_rank == 1) ss << " " << reduction_axes[0]; + if (out_rank == 2) ss << " " << reduction_axes[1]; + LOG(FATAL) << ss.str(); + } +} + +template +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer); +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::SumReducer& reducer) { + ReduceImpl, T*, T*, ReductionAxes>( + 
ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + Sum()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::SumReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +// Specialization for bfloat16 with fp32 accumulation. +template <> +struct ReduceFunctor> { + template + static void Reduce( + OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::SumReducer& reducer) { + typedef gpuprim::TransformInputIterator, + Eigen::bfloat16*> + inputIterType; + inputIterType input_itr((Eigen::bfloat16*)in.data(), + HalfToFloat()); + + typedef TransformOutputIterator> + outputIterType; + outputIterType itr((Eigen::bfloat16*)out.data(), + FloatToHalf()); + + ReduceImpl(ctx, itr, input_itr, in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), + reduction_axes, gpuprim::Sum()); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const Eigen::internal::SumReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +// TODO(rmlarsen): Specialize for float16. +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::EuclideanNormReducer& reducer) { + typedef gpuprim::TransformInputIterator, T*> inputIterType; + inputIterType input_itr((T*)in.data(), Square()); + typedef TransformOutputIterator> outputIterType; + outputIterType output_itr((T*)out.data(), SqrtOfReal()); + ReduceImpl, outputIterType, inputIterType, ReductionAxes>( + ctx, output_itr, input_itr, in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + Sum()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const functor::EuclideanNormReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::MeanReducer& reducer) { + int divisor = 1; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + + DividesBy div_op(static_cast(divisor)); + TransformOutputIterator> itr((T*)out.data(), div_op); + ReduceImpl, TransformOutputIterator>, T*, + ReductionAxes>(ctx, itr, (T*)in.data(), in.rank(), + in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), + reduction_axes, Sum()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const functor::MeanReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +void ReduceMeanWithFloatAccumulationImpl( + OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::MeanReducer& reducer) { + float divisor = 1.f; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + DividesBy div_op(divisor); + + typedef gpuprim::TransformInputIterator, T*> + inputIterType; + inputIterType input_itr((T*)in.data(), HalfToFloat()); + + typedef TransformOutputIterator> outputIterType; + outputIterType itr((T*)out.data(), div_op); + + ReduceImpl( + ctx, itr, input_itr, in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + gpuprim::Sum()); +} + +template <> +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::MeanReducer& reducer) { + ReduceMeanWithFloatAccumulationImpl(ctx, out, in, reduction_axes, reducer); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const functor::MeanReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template <> +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::MeanReducer& reducer) { + ReduceMeanWithFloatAccumulationImpl(ctx, out, in, reduction_axes, reducer); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const functor::MeanReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce( + OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MaxReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + MaxPropagateNaN()); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const Eigen::internal::MaxReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce( + OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MinReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), reduction_axes, + MinPropagateNaN()); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const Eigen::internal::MinReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::ProdReducer& reducer) { + ReduceImpl, T*, T*, ReductionAxes>( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + Prod()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::ProdReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::AndReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + And()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::AndReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::OrReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, Or()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::OrReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops.h new file mode 100644 index 00000000..510fbc93 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops.h @@ -0,0 +1,207 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_ + +// Functor definitions for Reduction ops, must be compilable by nvcc. 
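The functors declared in this header ultimately delegate to Eigen's Tensor reduce(). For orientation, a minimal standalone example of an axis reduction on a plain Eigen tensor, spelled both with the sum() shorthand and with an explicit SumReducer as ReduceEigenImpl does below; the example is mine, not part of the header.

// Editor's orientation sketch: reducing an Eigen Tensor over one axis.
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> m(2, 3);
  m.setValues({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}});

  // Reduce over dimension 0, leaving a length-3 tensor of column sums.
  Eigen::array<Eigen::Index, 1> reduce_dim{{0}};
  Eigen::Tensor<float, 1> col_sums = m.sum(reduce_dim);

  // The same reduction spelled with an explicit reducer object.
  Eigen::internal::SumReducer<float> reducer;
  Eigen::Tensor<float, 1> col_sums2 = m.reduce(reduce_dim, reducer);

  std::cout << col_sums << "\n" << col_sums2 << "\n";  // 5 7 9, twice
}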
+ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct ReducerTraits { + enum { IsScalarIdentity = true }; +}; + +// Dummy class used for template specialization for mean reduction, which is +// accomplished by SumReducer and on-the-fly division by the reduction factor. +template +struct MeanReducer { + Scalar initialize() const { return Scalar(0); } +}; + +// Dummy class used for template specialization for l2-norm reduction. +template +struct EuclideanNormReducer { + Scalar initialize() const { return Scalar(0); } +}; + +template +struct ReducerTraits> { + enum { IsScalarIdentity = false }; +}; + +template +struct ReduceEigenImpl { + void operator()(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, const Reducer& reducer) { + out.device(d) = in.reduce(reduction_axes, reducer); + } +}; + +// Specialization for BF16 Reducer to fix accuracy. +// TODO: All BF16 reducers should have specializations to fix accuracy. +#define CASTING_SPECIALIZATION(Reducer, ScalarType, IntermediateType) \ + template \ + struct ReduceEigenImpl> { \ + void operator()(const Device& d, OUT_T out, IN_T in, \ + const ReductionAxes& reduction_axes, \ + const Reducer& reducer) { \ + static_assert(std::is_same::value, \ + ""); \ + Reducer intermediate_reducer; \ + auto in_as_intermediate = in.template cast(); \ + out.device(d) = \ + in_as_intermediate.reduce(reduction_axes, intermediate_reducer) \ + .template cast(); \ + } \ + }; + +CASTING_SPECIALIZATION(Eigen::internal::SumReducer, bfloat16, float); +#undef CASTING_SPECIALIZATION + +template +struct ReduceEigenImpl> { + void operator()(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::MeanReducer& reducer) { + static_assert(std::is_same::value, ""); + Eigen::internal::SumReducer sum_reducer; + out.device(d) = in.reduce(reduction_axes, sum_reducer) / + static_cast(in.size() / out.size()); + } +}; + +// Specialization for which we do the reduction in IntermediateType to +// avoid integer overflow and fix bfloat16 accuracy in some models. +#define CASTING_SPECIALIZATION(ScalarType, IntermediateType) \ + template \ + struct ReduceEigenImpl> { \ + void operator()(const Device& d, OUT_T out, IN_T in, \ + const ReductionAxes& reduction_axes, \ + const functor::MeanReducer& reducer) { \ + static_assert(std::is_same::value, \ + ""); \ + Eigen::internal::SumReducer sum_reducer; \ + out.device(d) = (in.template cast().reduce( \ + reduction_axes, sum_reducer) / \ + static_cast(in.size() / out.size())) \ + .template cast(); \ + } \ + } + +CASTING_SPECIALIZATION(uint8, uint64); +CASTING_SPECIALIZATION(uint16, uint64); +CASTING_SPECIALIZATION(uint32, uint64); +CASTING_SPECIALIZATION(int8, int64_t); +CASTING_SPECIALIZATION(int16, int64_t); +CASTING_SPECIALIZATION(int32, int64_t); +CASTING_SPECIALIZATION(bfloat16, float); +#undef CASTING_SPECIALIZATION + +// TODO(rmlarsen): Refactor this such that taking the sqrt can be optional +// controlled by an attribute. 
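The CASTING_SPECIALIZATION blocks above accumulate in a wider IntermediateType before dividing because summing narrow integers (or bfloat16) in their own type overflows or loses precision long before the division happens. A small illustration with uint8; the example is mine.

// Editor's illustration of why the mean specializations accumulate in a
// wider type: a uint8 running sum wraps modulo 256, so the mean is wrong.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint8_t> v(1000, 200);  // true mean is 200

  uint8_t narrow_sum = 0;
  uint64_t wide_sum = 0;
  for (uint8_t x : v) {
    narrow_sum = static_cast<uint8_t>(narrow_sum + x);  // wraps: 200000 % 256 = 64
    wide_sum += x;                                      // exact: 200000
  }

  std::printf("wrapped sum %u -> bogus mean %zu\n",
              static_cast<unsigned>(narrow_sum),
              static_cast<size_t>(narrow_sum) / v.size());  // 64 -> 0
  std::printf("wide sum %llu -> mean %llu\n",
              static_cast<unsigned long long>(wide_sum),
              static_cast<unsigned long long>(wide_sum / v.size()));  // 200
}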
+template +struct ReduceEigenImpl> { + void operator()(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::EuclideanNormReducer& reducer) { + static_assert(std::is_same::value, ""); + Eigen::internal::SumReducer sum_reducer; + out.device(d) = + (in * in.conjugate()).reduce(reduction_axes, sum_reducer).sqrt(); + } +}; + +template +struct ReduceEigenImpl> { + void operator()(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const functor::EuclideanNormReducer& reducer) { + static_assert(std::is_same::value, ""); + Eigen::internal::SumReducer sum_reducer; + auto in_as_float = in.template cast(); + out.device(d) = (in_as_float * in_as_float.conjugate()) + .reduce(reduction_axes, sum_reducer) + .sqrt() + .template cast(); + } +}; + +// For most reducers, the identity is Reducer::initialize() +template +struct Identity { + static auto identity(const Reducer& reducer) + -> decltype(reducer.initialize()) { + return reducer.initialize(); + } +}; + +// MeanReducer is a special case, since it doesn't technically have an identity. +// Thus, ideally we'd return nan. However, mean is instantiated for integer +// types as well, so we do the nan override only for floating point types. +#define FIX_MEAN_IDENTITY(T) \ + template <> \ + struct Identity> { \ + static T identity(const functor::MeanReducer&) { \ + return Eigen::NumTraits::quiet_NaN(); \ + } \ + }; +FIX_MEAN_IDENTITY(Eigen::half) +FIX_MEAN_IDENTITY(Eigen::bfloat16) +FIX_MEAN_IDENTITY(float) +FIX_MEAN_IDENTITY(double) +#undef FIX_MEAN_IDENTITY + +template +void FillIdentityEigenImpl(const Device& d, OUT_T out, const Reducer& reducer) { + MaybeWith32BitIndexing( + [&](auto out32) { + out32.device(d) = out32.constant(Identity::identity(reducer)); + }, + out); +} + +template +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer); + + template + static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common.h new file mode 100644 index 00000000..6ce777f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common.h @@ -0,0 +1,279 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is an internal header file intended to only be included as the +// front-matter in the implementation files of various reduction ops. It +// is a header file because we split the various reduction ops into their +// own compilation units to get more parallelism in compilation. 
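For reference, these are the identity elements each reducer in the previous header falls back to when the reduced extent is empty, expressed with std::numeric_limits (whose lowest()/max() correspond to Eigen's NumTraits lowest()/highest()). The struct below is a sketch of mine mirroring Identity and FIX_MEAN_IDENTITY, not TensorFlow code.

// Editor's sketch: identity elements used to fill empty reductions.
#include <cstdio>
#include <limits>

template <typename T>
struct ReductionIdentities {
  static constexpr T sum = T(0);
  static constexpr T prod = T(1);
  static constexpr T max = std::numeric_limits<T>::lowest();  // identity for max
  static constexpr T min = std::numeric_limits<T>::max();     // identity for min
  // Mean has no true identity; for floating-point types TF yields NaN.
  static T mean_of_empty() { return std::numeric_limits<T>::quiet_NaN(); }
};

int main() {
  std::printf("sum %f, prod %f, max id %e, min id %e, empty mean %f\n",
              double(ReductionIdentities<float>::sum),
              double(ReductionIdentities<float>::prod),
              double(ReductionIdentities<float>::max),
              double(ReductionIdentities<float>::min),
              double(ReductionIdentities<float>::mean_of_empty()));
}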
+ +#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ + +#define EIGEN_USE_THREADS + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +struct Constants { + // Derive Index type. int (32-bit) or long (64-bit) depending on the + // compile-time configuration. "float" here is not relevant. + // TODO(zhifengc): Moves the definition to TTypes. + typedef TTypes::Tensor::Index Index; + Eigen::array kZero; + Eigen::array kOne; + Eigen::array kZeroTwo; + + Constants() { + kZero[0] = 0; + kOne[0] = 1; + kZeroTwo[0] = 0; + kZeroTwo[1] = 2; + } +}; + +struct ConstantsBase { + const Eigen::IndexList> kZero; + const Eigen::IndexList> kOne; + const Eigen::IndexList, Eigen::type2index<2>> kZeroTwo; +}; +template <> +struct Constants : ConstantsBase {}; + +class ReductionHelper { + public: + ReductionHelper() : reduce_first_axis_(false) {} + + absl::Status Simplify(const Tensor& data, const Tensor& axis, + const bool keep_dims); + + // We need to do roughly: + // tmp_out = allocate(out_reshape()) + // tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes) + // out = tmp_out.reshape(out_shape) + + // The reduction result must be allocated with this shape. + TensorShape out_reshape() const; + + // The final output shape must be allocated with this shape. + TensorShape out_shape() const; + + // The reduction is on a reshaped tensor of this rank. + int ndims() const { return data_reshape_.size(); } + + // True if need to reduce the 0-th dimension. + bool reduce_first_axis() const { return reduce_first_axis_; } + + // The output is reshaped. + template + typename TTypes::Tensor out(Tensor* out) { + return out->shaped(out_reshape_); + } + + // The input is reshaped. + template + typename TTypes::ConstTensor in(const Tensor& data) { + return data.shaped(data_reshape_); + } + + // Shape of shuffled input + TensorShape data_reshape() const { + TensorShape shape; + for (auto s : data_reshape_) shape.AddDim(s); + return shape; + } + + // Shape with all reduction dimensions at the end + TensorShape shuffled_shape(); + + // Permutation of reduced dims needed to put reduction dimensions at the end + absl::InlinedVector permutation(); + + private: + bool reduce_first_axis_; // True if need to reduce the 0-th dimension. + absl::InlinedVector + data_reshape_; // Reshape data before reduction. + absl::InlinedVector out_shape_; // The final output shape. + absl::InlinedVector + out_reshape_; // Reshape output for reduction. +}; + +// For operations where the output is a reduction function along some +// dimensions of the input. 
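ReductionHelper::Simplify, declared further down in this header, collapses the input into at most three logical dimensions by merging adjacent axes that are uniformly reduced or uniformly kept, which is why the dispatch in ReductionOp only needs the rank-0-to-3 cases. A sketch of that collapsing step; names and the return type are mine, not TensorFlow's.

// Editor's sketch: merge runs of adjacent dimensions that are all reduced
// or all kept, so an arbitrary-rank reduction becomes a small-rank one.
#include <cstdint>
#include <vector>

struct CollapsedDims {
  std::vector<int64_t> dims;       // merged extents, alternating reduced/kept runs
  bool first_dim_reduced = false;  // whether dims[0] is a reduced run
};

CollapsedDims Collapse(const std::vector<int64_t>& shape,
                       const std::vector<bool>& reduced) {
  CollapsedDims out;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == 1) continue;  // size-1 axes change nothing
    if (out.dims.empty()) {
      out.first_dim_reduced = reduced[i];
      out.dims.push_back(shape[i]);
      continue;
    }
    // Is the current run (the last entry of dims) a reduced run?
    const bool last_run_reduced = out.first_dim_reduced
                                      ? (out.dims.size() % 2 == 1)
                                      : (out.dims.size() % 2 == 0);
    if (reduced[i] == last_run_reduced) {
      out.dims.back() *= shape[i];  // extend the current run
    } else {
      out.dims.push_back(shape[i]);  // start a new run
    }
  }
  return out;
}
// E.g. shape [2,3,4,5] with reduced = {true,true,false,false} collapses to
// dims {6, 20} with the first (reduced) run of size 6.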
+template +class ReductionOp : public OpKernel { + public: + explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + const DataType dt = DataTypeToEnum::v(); + const DataType pt = DataTypeToEnum::v(); + OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, pt}, {dt})); + + OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& data = ctx->input(0); + const Tensor& axes = ctx->input(1); + VLOG(1) << "data shape: " << data.shape().DebugString(); + VLOG(1) << "axes : " << axes.SummarizeValue(10); + + ReductionHelper helper; + OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_)); + CHECK_GE(helper.ndims(), 0); + + bool is_scalar_identity = functor::ReducerTraits::IsScalarIdentity; + bool is_trivial = helper.ndims() == 0 || + (helper.ndims() == 1 && !helper.reduce_first_axis()); + if (is_scalar_identity && is_trivial) { + Tensor out; + // Special case. Reduces nothing and does not alter the input values. + if (!out.CopyFrom(data, helper.out_shape())) { + ctx->SetStatus(errors::Internal("Error during reduction copy.")); + } + ctx->set_output(0, out); + return; + } + + // We must allocate temp tensors using the same alloc attr as + // output(0) because it is returned as output(0) in the end. + const AllocatorAttributes alloc_attr = ctx->output_alloc_attr(0); + + Tensor tmp_out; + typedef functor::ReduceFunctor Functor; + Constants constants; + const Device& d = ctx->eigen_device(); + Reducer reducer; + + if (data.NumElements() > 0 && is_trivial && !is_scalar_identity) { + OP_REQUIRES_OK(ctx, ctx->allocate_temp(ctx->expected_output_dtype(0), + TensorShape({data.NumElements()}), + &tmp_out, alloc_attr)); + Functor::Reduce(ctx, tmp_out.flat(), + data.shaped({1, data.NumElements()}), + constants.kZero, reducer); + } else { + // A temporary tensor whose size matches the size of the reduced + // output. + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(ctx->expected_output_dtype(0), + helper.out_reshape(), &tmp_out, alloc_attr)); + + if (tmp_out.NumElements() == 0) { + // Nothing to do, fall through to final reshaping. + } else if (data.NumElements() == 0) { + // Degenerate reduction where the input is empty but the output is + // nonempty (thus tmp_out.NumElements() > 0), and we must fill the + // output with identity elements. Example: tf.reduce_sum(tf.zeros((0, + // 3)), [0]). Eigen sometimes crashes in this case, so we do it + // manually. + Functor::FillIdentity(d, tmp_out.flat(), reducer); + } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) { + // Reduce to a scalar. + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), + constants.kZero, reducer); + } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) { + // Can be viewed as a reduction of a matrix along 1st dimension. + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), + constants.kZero, reducer); + } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) { + // Can be viewed as a reduction of a matrix along 2nd dimension. + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), + constants.kOne, reducer); + } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) { + // Can be viewed as a reduction of a 3D tensor along 1st and 3rd + // dimensions. + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), + constants.kZeroTwo, reducer); + } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) { + // Can be viewed as a reduction of a 3D tensor along 2nd dimension. 
+ Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), + constants.kOne, reducer); + } else { + // If we don't hit one of the cases above, transpose the data so that + // all reduced dimensions are last and reuse the 2-D -> 1-D case. + Tensor data_reshaped; + OP_REQUIRES(ctx, data_reshaped.CopyFrom(data, helper.data_reshape()), + errors::Internal("Error during reduction copy.")); + Tensor shuffled; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + helper.shuffled_shape(), + &shuffled, alloc_attr)); + OP_REQUIRES_OK(ctx, DoTranspose(d, data_reshaped, helper.permutation(), + &shuffled)); + const int64_t unreduced = tmp_out.NumElements(); + const int64_t reduced = shuffled.NumElements() / unreduced; + const Tensor& const_shuffled = shuffled; + Functor::Reduce(ctx, tmp_out.flat(), + const_shuffled.shaped({unreduced, reduced}), + constants.kOne, reducer); + } + } + + // Set the real output using the contents of the reduction but the + // real expected output shape. The number of elements should + // match between the two shapes. + Tensor out; + OP_REQUIRES(ctx, out.CopyFrom(tmp_out, helper.out_shape()), + errors::Internal("Error during reduction copy.")); + ctx->set_output(0, out); + } + + private: + // True if the number of dimensions should be maintained. + bool keep_dims_; +}; + +namespace functor { + +template +struct ReduceFunctorBase { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer) { + const Device& d = ctx->eigen_device(); + ReduceEigenImpl reducer_impl; + reducer_impl(d, out, in, reduction_axes, reducer); + } + + template + static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; + +template +struct ReduceFunctor + : ReduceFunctorBase {}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common_gpu.h new file mode 100644 index 00000000..b7bdb07c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reduction_ops_common_gpu.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with GPU support +#endif + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer); + + template + static void FillIdentity(const Eigen::GpuDevice& d, OUT_T out, + const Reducer& reducer); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/redux_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/redux_functor.h new file mode 100644 index 00000000..41ab917a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/redux_functor.h @@ -0,0 +1,337 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_ + +#define EIGEN_USE_THREADS + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; + +namespace functor { + +// Compute reduction over outer dimensions. +// Example: +// input: [D1, D2, ... , DN] +// -> +// output: [Di, ... , DN] where i belongs to set [1,N] +template +struct ReduceOuterDimensions { + ReduceOuterDimensions() {} + + template + void operator()(const CPUDevice& device, + const Eigen::DSizes& input_dims, + const Tensor& input, Tensor* output) const { + // Compute inner and outer dim after reshaping into 2d tensor. + const int num_output_dims = output->dims(); + auto output_dims = output->template flat().dimensions(); + + Eigen::Index inner_dim = 1, outer_dim = 1; + for (int i = 0; i < num_dims - num_output_dims; ++i) + outer_dim *= input_dims[i]; + for (int i = num_dims - num_output_dims; i < num_dims; ++i) + inner_dim *= input_dims[i]; + + if (1 == outer_dim) { + // Nothing to do but passing input to output. + output->template flat() = + input.template flat().template cast().reshape( + output_dims); + return; + } + + // Get device thread num. + const Eigen::Index num_threads = device.numThreads(); + + // If the inner dim parallelism is large enough + // TODO(ezhulenev): There seems to be no benefits in going this route. 
Check + // if this can be improved, or use better heuristic? + if (inner_dim > num_threads * 32) { + // Do not create more blocks than there are threads in a pool. + const Eigen::Index num_blocks = num_threads; + + // Block size along the outer dimension. + const Eigen::Index inner_block_size = Eigen::divup(inner_dim, num_blocks); + const InputT* input_data = input.template flat().data(); + + // Allocate temporary buffer for partial reductions. + Eigen::Tensor buffer( + {inner_dim}); + buffer.setZero(); + AccumT* buffer_data = buffer.data(); + + using Buffer = Eigen::TensorMap< + Eigen::Tensor, + Eigen::Unaligned>; + + using Input = Eigen::TensorMap< + Eigen::Tensor, + Eigen::Unaligned>; + + const auto compute = [inner_dim, outer_dim, num_blocks, inner_block_size, + input_data, buffer_data]( + Eigen::Index start, Eigen::Index limit) -> void { + DCHECK(start >= 0 && limit <= num_blocks); + Eigen::Index inner_dim_start = start * inner_block_size; + Eigen::Index inner_dim_limit = limit * inner_block_size; + inner_dim_limit = std::min(inner_dim, inner_dim_limit); + Eigen::Index my_job_len = inner_dim_limit - inner_dim_start; + + const InputT* my_job_start = input_data + inner_dim_start; + Buffer buf(buffer_data + inner_dim_start, my_job_len); + + for (Eigen::Index i = 0; i < outer_dim; ++i) { + auto in = Input(my_job_start + i * inner_dim, my_job_len); + auto cast = in.template cast(); + buf = Eigen::TensorCwiseBinaryOp(buf, cast); + } + }; + + // Compute cost of reducing a single block. + const Eigen::Index compute_size = outer_dim * inner_block_size; + const Eigen::Index compute_input_bytes = compute_size * sizeof(InputT); + const Eigen::TensorOpCost cost( + compute_input_bytes, + 0, // We'll be mostly writing to L1, assume store cost is 0 + compute_size * Eigen::internal::functor_traits::Cost); + + device.parallelFor(num_blocks, cost, compute); + + // Write final result to the output. + output->template flat() = + buffer.template cast().reshape(output_dims); + } else { + // Compute block size along the outer dimension for efficiency. + const Eigen::Index parallel_cell_size = inner_dim; + const Eigen::Index total_workload = outer_dim * inner_dim; + const Eigen::Index max_parallelism = total_workload / parallel_cell_size; + + const Eigen::Index min_block_workload = 2000; + const Eigen::Index min_block_size = + Eigen::divup(min_block_workload, parallel_cell_size); + const Eigen::Index max_num_blocks = std::min( + max_parallelism, Eigen::divup(total_workload, min_block_size)); + + // Do not create more blocks than there are threads in a pool. + const Eigen::Index num_blocks = std::min(max_num_blocks, num_threads); + + // Block size along the outer dimension. + const Eigen::Index outer_block_size = Eigen::divup(outer_dim, num_blocks); + + const InputT* input_data = input.template flat().data(); + + // Allocate temporary buffer for partial reductions. 
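As an illustration, stripped of the thread-pool blocking above and below, ReduceOuterDimensions performs the computation in this hypothetical sketch (not part of the vendored header): view the input as {outer, inner} and fold the leading dimension away.

#include "unsupported/Eigen/CXX11/Tensor"

// out[k] = sum over o of in[o, k], where o runs over d0*d1 and k over d2.
inline Eigen::Tensor<float, 1> ReduceOuterSketch(
    const Eigen::Tensor<float, 3>& t) {
  Eigen::array<Eigen::Index, 2> two_d{
      {t.dimension(0) * t.dimension(1), t.dimension(2)}};
  Eigen::array<int, 1> outer_axis{{0}};
  return t.reshape(two_d).sum(outer_axis);  // one value per inner element
}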
+ Tensor buffer(DataTypeToEnum::v(), {num_blocks, inner_dim}); + buffer.template flat().setZero(); + AccumT* buffer_data = buffer.template flat().data(); + + using Buffer = Eigen::TensorMap< + Eigen::Tensor, + Eigen::Unaligned>; + + using Input = Eigen::TensorMap< + Eigen::Tensor, + Eigen::Unaligned>; + + const auto compute = [inner_dim, num_blocks, outer_block_size, + buffer_data, input_data, outer_dim]( + Eigen::Index start, Eigen::Index limit) -> void { + DCHECK(start >= 0 && limit <= num_blocks); + Eigen::Index outer_dim_start = start * outer_block_size; + Eigen::Index outer_dim_limit = limit * outer_block_size; + outer_dim_limit = std::min(outer_dim, outer_dim_limit); + + Buffer buf(buffer_data + start * inner_dim, inner_dim); + for (Eigen::Index i = outer_dim_start; i < outer_dim_limit; ++i) { + auto in = Input(input_data + i * inner_dim, inner_dim); + auto cast = in.template cast(); + buf = Eigen::TensorCwiseBinaryOp(buf, cast); + } + }; + + // Compute cost of reducing a single block. + const Eigen::Index compute_size = outer_block_size * inner_dim; + const Eigen::Index compute_input_bytes = compute_size * sizeof(InputT); + const Eigen::TensorOpCost cost( + compute_input_bytes, + 0, // We'll be mostly writing to L1, assume store cost is 0 + compute_size * Eigen::internal::functor_traits::Cost); + + device.parallelFor(num_blocks, cost, compute); + + // Aggregate partial results from temporary buffer into first block. + auto buf0 = Buffer(buffer_data, inner_dim); + // Just sum the buffer up, as inner dimensions is not large in this case. + for (int i = 1; i < num_blocks; ++i) { + auto buf = Buffer(buffer_data + i * inner_dim, inner_dim); + buf0 = Eigen::TensorCwiseBinaryOp(buf0, buf); + } + // Write final result to the output. + output->template flat() = + buf0.template cast().reshape(output_dims); + } + } +}; + +// Compute reduction to some serial middle dimensions (like a axis). +// Example: +// input: [D1, D2, ... , DN] +// -> +// output: [Di, ... , Dj] where i & j belongs to set [1,N]. +template +struct ReduceMiddleDimensions { + ReduceMiddleDimensions() {} + + template + void operator()(const CPUDevice& device, + const Eigen::DSizes& input_dims, + const Tensor& input, Tensor* output, + const int axis_begin_dim) const { + // Compute dims after reshaping into 3d tensor. + const int num_output_dims = output->dims(); + auto output_dims = output->template flat().dimensions(); + + Eigen::Index inner_dim = 1, middle_dim = 1, outer_dim = 1; + for (int i = 0; i < axis_begin_dim; ++i) outer_dim *= input_dims[i]; + for (int i = axis_begin_dim; i < axis_begin_dim + num_output_dims; ++i) + middle_dim *= input_dims[i]; + for (int i = axis_begin_dim + num_output_dims; i < num_dims; ++i) + inner_dim *= input_dims[i]; + + if ((1 == inner_dim * outer_dim)) { + // Nothing to do. + output->template flat() = + input.template flat().template cast().reshape( + output_dims); + return; + } + + // Compute block size along the outer dimension for efficiency. + const Eigen::Index parallel_cell_size = inner_dim; + const Eigen::Index max_parallelism = outer_dim * middle_dim; + const Eigen::Index total_workload = max_parallelism * inner_dim; + + const Eigen::Index min_block_workload = 2000; + const Eigen::Index min_block_size = + Eigen::divup(min_block_workload, parallel_cell_size); + const Eigen::Index max_num_blocks = + std::min(max_parallelism, Eigen::divup(total_workload, min_block_size)); + + // Do not create more blocks than there are threads in a pool. 
+ const Eigen::Index num_threads = device.numThreads(); + const Eigen::Index num_blocks = std::min(max_num_blocks, num_threads); + + // Block size along the outer dimension. + const Eigen::Index outer_block_size = + Eigen::divup(total_workload, num_blocks); + + const InputT* input_data = input.template flat().data(); + + // Allocate temporary buffer for partial reductions. + Eigen::Tensor buffer(num_blocks, middle_dim); + buffer.setZero(); + AccumT* buffer_data = buffer.data(); + + using Buffer = Eigen::TensorMap>; + using Input = Eigen::TensorMap>; + + Eigen::array reduction_axis = {0}; + Reducer reducer; + const BinaryFunctor binary_op; + + const auto compute = [inner_dim, middle_dim, input_data, buffer_data, + total_workload, num_blocks, outer_block_size, + reduction_axis, reducer, binary_op]( + Eigen::Index start, Eigen::Index limit) -> void { + DCHECK(start >= 0 && limit <= num_blocks); + Eigen::Index block_start = start * outer_block_size; + Eigen::Index block_limit = limit * outer_block_size; + block_limit = std::min(total_workload, block_limit); + Buffer buf(buffer_data + start * middle_dim, middle_dim); + + const int align_start = + ((block_start + inner_dim - 1) / inner_dim) * inner_dim; + const int align_end = (block_limit / inner_dim) * inner_dim; + + Eigen::Index coordinate = block_start / inner_dim % middle_dim; + Eigen::Tensor reduced = + Input(&input_data[block_start], align_start - block_start) + .reduce(reduction_axis, reducer) + .template cast(); + + buf(coordinate) = binary_op(buf(coordinate), reduced(0)); + + coordinate = align_start / inner_dim % middle_dim; + for (int i = align_start; i < align_end; i += inner_dim) { + reduced = Input(&input_data[i], inner_dim) + .reduce(reduction_axis, reducer) + .template cast(); + buf(coordinate) = binary_op(buf(coordinate), reduced(0)); + ++coordinate; + if (middle_dim == coordinate) coordinate = 0; + } + + reduced = Input(&input_data[align_end], block_limit - align_end) + .reduce(reduction_axis, reducer) + .template cast(); + buf(coordinate) = binary_op(buf(coordinate), reduced(0)); + }; + + // Compute cost of reducing a single block. + const Eigen::Index compute_size = outer_block_size * inner_dim; + const Eigen::Index compute_input_bytes = compute_size * sizeof(InputT); + const Eigen::TensorOpCost cost( + compute_input_bytes, + 0, // We'll be mostly writing to L1, assume store cost is 0 + compute_size * Eigen::internal::functor_traits::Cost); + + device.parallelFor(num_blocks, cost, compute); + + using Output = Eigen::TensorMap< + Eigen::Tensor, + Eigen::Unaligned>; + // Aggregate partial results from temporary buffer into first block. + auto buf0 = Output(buffer_data, middle_dim); + // TODO(ezhulenev): Parallelize this loop for large inner dimensions? + for (int i = 1; i < num_blocks; ++i) { + auto buf = Output(buffer_data + i * middle_dim, middle_dim); + buf0 = Eigen::TensorCwiseBinaryOp(buf0, buf); + } + + // Write final result to the output. + output->template flat() = + buf0.template cast().reshape(output_dims); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reference_gemm.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reference_gemm.h new file mode 100644 index 00000000..9d0bb60e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reference_gemm.h @@ -0,0 +1,96 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
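For reference, the alignment and partial-buffer bookkeeping in ReduceMiddleDimensions above all serves a single computation, made explicit by this hypothetical unblocked sketch (not part of the vendored header): keep the middle axis and sum away the outer and inner ones.

#include "unsupported/Eigen/CXX11/Tensor"

// out[j] = sum over i, k of in[i, j, k]: what the blocked kernel computes for a
// rank-3 input with axis_begin_dim == 1 and a rank-1 output.
inline Eigen::Tensor<float, 1> ReduceMiddleSketch(
    const Eigen::Tensor<float, 3>& t) {
  Eigen::array<int, 2> outer_and_inner{{0, 2}};
  return t.sum(outer_and_inner);
}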
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_ +#define TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_ + +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/platform/types.h" + +// This is an unoptimized but debuggable implementation of the GEMM matrix +// multiply function, used to compare to faster but more opaque versions, or +// for bit depths or argument combinations that aren't supported by optimized +// code. +// It assumes the row-major convention used by TensorFlow, and implements +// C = A * B, like the standard BLAS GEMM interface. If the transpose flags are +// true, then the relevant matrix is treated as stored in column-major order. + +namespace tensorflow { +template +void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c, + size_t m, size_t n, size_t k, const T1* a, int32_t offset_a, + size_t lda, const T2* b, int32_t offset_b, size_t ldb, T3* c, + int32_t shift_c, int32_t offset_c, int32_t mult_c, + size_t ldc) { + int a_i_stride; + int a_l_stride; + if (transpose_a) { + a_i_stride = 1; + a_l_stride = lda; + } else { + a_i_stride = lda; + a_l_stride = 1; + } + int b_j_stride; + int b_l_stride; + if (transpose_b) { + b_j_stride = ldb; + b_l_stride = 1; + } else { + b_j_stride = 1; + b_l_stride = ldb; + } + int c_i_stride; + int c_j_stride; + if (transpose_c) { + c_i_stride = 1; + c_j_stride = ldc; + } else { + c_i_stride = ldc; + c_j_stride = 1; + } + + const int32_t highest = static_cast(Eigen::NumTraits::highest()); + const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); + const int32_t rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1)); + + int i, j, l; + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + int32_t total = 0; + for (l = 0; l < k; l++) { + const size_t a_index = ((i * a_i_stride) + (l * a_l_stride)); + const int32_t a_value = static_cast(a[a_index]) - offset_a; + const size_t b_index = ((j * b_j_stride) + (l * b_l_stride)); + const int32_t b_value = static_cast(b[b_index]) - offset_b; + total += (a_value * b_value); + } + const size_t c_index = ((i * c_i_stride) + (j * c_j_stride)); + int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c); + if (output > highest) { + output = highest; + } + if (output < lowest) { + output = lowest; + } + c[c_index] = static_cast(output); + } + } +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op.h new file mode 100644 index 00000000..4b64a69f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op.h @@ -0,0 +1,283 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
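A small usage sketch for ReferenceGemm above, assuming the stripped template parameter list is <class T1, class T2, class T3> as in upstream TensorFlow. With all quantization offsets at zero, shift 0 and multiplier 1, the routine reduces to a plain row-major integer matmul; the helper name below is illustrative only.

#include <cstdint>
#include "tensorflow/core/kernels/reference_gemm.h"

inline void ReferenceGemmUsageSketch() {
  const uint8_t a[6] = {1, 2, 3, 4, 5, 6};     // A: 2x3 row-major, lda = 3
  const uint8_t b[6] = {7, 8, 9, 10, 11, 12};  // B: 3x2 row-major, ldb = 2
  int32_t c[4] = {0};                          // C: 2x2 row-major, ldc = 2
  tensorflow::ReferenceGemm<uint8_t, uint8_t, int32_t>(
      /*transpose_a=*/false, /*transpose_b=*/false, /*transpose_c=*/false,
      /*m=*/2, /*n=*/2, /*k=*/3,
      a, /*offset_a=*/0, /*lda=*/3,
      b, /*offset_b=*/0, /*ldb=*/2,
      c, /*shift_c=*/0, /*offset_c=*/0, /*mult_c=*/1, /*ldc=*/2);
  // c now holds {58, 64, 139, 154}, i.e. A * B.
}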
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/nn_ops.cc. + +#ifndef TENSORFLOW_CORE_KERNELS_RELU_OP_H_ +#define TENSORFLOW_CORE_KERNELS_RELU_OP_H_ + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/relu_op_functor.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +template +class ReluOp : public UnaryElementWiseOp> { + public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu functor; + functor(context->eigen_device(), input.flat(), + output->flat()); + } +}; + +// Out of line check to save code space (we have this code once, rather +// than once for every NDIMS * NumTypes * Num_different_relu_variants +// functions. +struct ReluHelpers { + static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g, + const Tensor& a) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + } + static bool ValidateSameSize(OpKernelContext* context, const Tensor& g, + const Tensor& a) { + ValidateSameSizeHelper(context, g, a); + return context->status().ok(); + } +}; + +template +class ReluGradOp : public BinaryElementWiseOp> { + public: + using BinaryElementWiseOp>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): either the inputs that were passed to ReluOp(), or its + // outputs (using either one yields the same result here). 
+ // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); + } +}; + +template +void ReluGradOp::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::ReluGrad functor; + functor(context->eigen_device(), g.flat(), a.flat(), + output->flat()); +} + +template +class Relu6Op : public UnaryElementWiseOp> { + public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu6 functor; + functor(context->eigen_device(), input.flat(), + output->flat()); + } +}; + +template +class Relu6GradOp : public BinaryElementWiseOp> { + public: + using BinaryElementWiseOp>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to Relu6Op() + // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); + } +}; + +template +void Relu6GradOp::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::Relu6Grad functor; + functor(context->eigen_device(), g.flat(), a.flat(), + output->flat()); +} + +template +class LeakyReluOp : public UnaryElementWiseOp> { + public: + explicit LeakyReluOp(OpKernelConstruction* context) + : UnaryElementWiseOp>(context) { + float alpha_tmp; + OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp)); + alpha_ = T(alpha_tmp); + } + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::LeakyRelu functor; + functor({context->eigen_device(), input.flat(), alpha_, + output->flat()}); + } + + private: + T alpha_; +}; + +template +class LeakyReluGradOp + : public BinaryElementWiseOp> { + public: + explicit LeakyReluGradOp(OpKernelConstruction* context) + : BinaryElementWiseOp>(context) { + float alpha_tmp; + OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp)); + alpha_ = T(alpha_tmp); + } + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, T alpha, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): either the inputs that were passed to LeakyReluOp(), or its + // outputs (using either one yields the same result here). 
+ // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, alpha_, output); + } + + private: + T alpha_; +}; + +template +void LeakyReluGradOp::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, + const Tensor& a, T alpha, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::LeakyReluGrad functor; + functor(context->eigen_device(), g.flat(), a.flat(), alpha, + output->flat()); +}; + +template +class EluOp : public UnaryElementWiseOp> { + public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Elu functor; + functor(context->eigen_device(), input.flat(), + output->flat()); + } +}; + +template +class EluGradOp : public BinaryElementWiseOp> { + public: + using BinaryElementWiseOp>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (outputs): outputs of the EluOp() + // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); + } +}; + +template +void EluGradOp::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::EluGrad functor; + functor(context->eigen_device(), g.flat(), a.flat(), + output->flat()); +} + +template +class SeluOp : public UnaryElementWiseOp> { + public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Selu functor; + functor(context->eigen_device(), input.flat(), + output->flat()); + } +}; + +template +class SeluGradOp : public BinaryElementWiseOp> { + public: + using BinaryElementWiseOp>::BinaryElementWiseOp; + + void OperateNoTemplate(OpKernelContext* context, const Tensor& g, + const Tensor& a, Tensor* output); + + // INPUTS: + // g (gradients): backpropagated gradients + // a (outputs): outputs of the SeluOp() + // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OperateNoTemplate(context, g, a, output); + } +}; + +template +void SeluGradOp::OperateNoTemplate(OpKernelContext* context, + const Tensor& g, const Tensor& a, + Tensor* output) { + if (!ReluHelpers::ValidateSameSize(context, g, a)) return; + functor::SeluGrad functor; + functor(context->eigen_device(), g.flat(), a.flat(), + output->flat()); +} + +} // namespace tensorflow + +#undef EIGEN_USE_THREADS + +#endif // TENSORFLOW_CORE_KERNELS_RELU_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op_functor.h new file mode 100644 index 00000000..cacef949 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/relu_op_functor.h @@ -0,0 +1,215 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_ +// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by ReluOp to do the computations. +template +struct Relu { + // Computes Relu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + activations.device(d) = + features.template cwiseMax(static_cast(0)); + } +}; + +// Functor used by ReluGradOp to do the computations. +template +struct ReluGrad { + // Computes ReluGrad backprops. + // + // gradients: gradients backpropagated to the Relu op. + // features: either the inputs that were passed to the Relu or, or its + // outputs (using either one yields the same result here). + // backprops: gradients to backpropagate to the Relu inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, + typename TTypes::Tensor backprops) { + // NOTE: When the activation is exactly zero, we do not propagate the + // associated gradient value. This allows the output of the Relu to be used, + // as well as its input. + backprops.device(d) = + gradients * (features > static_cast(0)).template cast(); + } +}; + +// Functor used by Relu6Op to do the computations. +template +struct Relu6 { + // Computes Relu6 activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + activations.device(d) = + features.template cwiseMax(static_cast(0)) + .template cwiseMin(static_cast(6)); + } +}; + +// Functor used by ReluGradOp to do the computations. +template +struct Relu6Grad { + // Computes Relu6Grad backprops. + // + // gradients: gradients backpropagated to the Relu6 op. + // features: inputs that where passed to the Relu6 op, or its outputs. + // backprops: gradients to backpropagate to the Relu6 inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, + typename TTypes::Tensor backprops) { + // NOTE: When the activation is exactly zero or six, we + // make sure not to propagate the associated gradient + // value. This allows "features" to be either the input or the output of + // the relu6. + backprops.device(d) = gradients * ((features > static_cast(0)) * + (features < static_cast(6))) + .template cast(); + } +}; + +// Functor used by LeakyReluOp to do the computations. +template +struct LeakyRelu { + // Computes LeakyRelu activation. + // + // features: any shape. + // activations: same shape as "features". 
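Per element, the two gradient functors above reduce to the scalar rules below (hypothetical reference helpers, useful for spot-checking a kernel). The strict inequalities are why "features" may be either the op's input or its output.

// Gradient passes through only on the open interval where the activation is
// not clipped; at exactly 0 (and 6 for Relu6) it is dropped.
inline float ReluGradRef(float grad, float feature) {
  return feature > 0.0f ? grad : 0.0f;
}
inline float Relu6GradRef(float grad, float feature) {
  return (feature > 0.0f && feature < 6.0f) ? grad : 0.0f;
}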
+ + // Need to bundle the args (to the LeakyRelu functor) within a struct + // Not doing so leads to Eigen kernel args not getting populated + // corretly for Eigen::half type (when building on the ROCM platform) + struct LeakyReluArgs { + const Device& d; + typename TTypes::ConstTensor features; + T alpha; + typename TTypes::Tensor activations; + }; + void operator()(LeakyReluArgs args) { + // Note that alpha might be > 1 or < 0, so we don't use cwiseMax here. + args.activations.device(args.d) = + (args.features > static_cast(0)) + .select(args.features, args.features * args.alpha); + } +}; + +// Functor used by LeakyReluGradOp to do the computations. +template +struct LeakyReluGrad { + // Computes LeakyReluGrad backprops. + // + // gradients: gradients backpropagated to the LeakyRelu op. + // features: either the inputs that were passed to the LeakyRelu or, or its + // outputs (using either one yields the same result here). + // backprops: gradients to backpropagate to the LeakyRelu inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, T alpha, + typename TTypes::Tensor backprops) { + backprops.device(d) = + (features > static_cast(0)).select(gradients, gradients * alpha); + } +}; + +// Functor used by EluOp to do the computations. +template +struct Elu { + // Computes Elu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + // features.constant(?) + activations.device(d) = + (features < static_cast(0)) + .select(features.exp() - features.constant(static_cast(1)), + features); + } +}; + +// Functor used by EluGradOp to do the computations. +template +struct EluGrad { + // Computes EluGrad backprops. + // + // gradients: gradients backpropagated to the Elu op. + // activations: outputs of the Elu op. + // backprops: gradients to backpropagate to the Elu inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor activations, + typename TTypes::Tensor backprops) { + backprops.device(d) = + (activations < static_cast(0)) + .select((activations + static_cast(1)) * gradients, gradients); + } +}; + +// Functor used by SeluOp to do the computations. +template +struct Selu { + // Computes Selu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + // features.constant(?) + const auto scale = static_cast(1.0507009873554804934193349852946); + const auto scale_alpha = static_cast(1.7580993408473768599402175208123); + const auto one = static_cast(1); + const auto zero = static_cast(0); + activations.device(d) = + (features < zero) + .select(scale_alpha * (features.exp() - features.constant(one)), + scale * features); + } +}; + +// Functor used by SeluGradOp to do the computations. +template +struct SeluGrad { + // Computes SeluGrad backprops. + // + // gradients: gradients backpropagated to the Selu op. + // activations: outputs of the Selu op. + // backprops: gradients to backpropagate to the Selu inputs. 
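Scalar reference versions of the remaining activation functors above, mirroring the Eigen expressions and using the same SELU constants as the header (hypothetical helpers, not part of the vendored file):

#include <cmath>

inline float LeakyReluRef(float x, float alpha) {
  return x > 0.0f ? x : alpha * x;  // alpha may be > 1 or < 0, so no max()
}
inline float EluRef(float x) {
  return x < 0.0f ? std::exp(x) - 1.0f : x;
}
inline float SeluRef(float x) {
  const float scale = 1.0507009873554804934193349852946f;
  const float scale_alpha = 1.7580993408473768599402175208123f;
  return x < 0.0f ? scale_alpha * (std::exp(x) - 1.0f) : scale * x;
}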
+ void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor activations, + typename TTypes::Tensor backprops) { + const auto scale = static_cast(1.0507009873554804934193349852946); + const auto scale_alpha = static_cast(1.7580993408473768599402175208123); + backprops.device(d) = + (activations < static_cast(0)) + .select(gradients * (activations + scale_alpha), gradients * scale); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_op.h new file mode 100644 index 00000000..dd603374 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_op.h @@ -0,0 +1,168 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/overflow.h" + +namespace tensorflow { + +// Note that this op is subclassed for QuantizedReshapeOp. +class ReshapeOp : public OpKernel { + public: + explicit ReshapeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& sizes = context->input(1); + // Preliminary validation of sizes. + OP_REQUIRES( + context, + (TensorShapeUtils::IsVector(sizes.shape()) || + // TODO(rmlarsen): Disallow legacy use of scalars to represent shape. + TensorShapeUtils::IsScalar(sizes.shape())), + errors::InvalidArgument("sizes input must be 1-D, not ", + sizes.shape().DebugString())); + OP_REQUIRES( + context, sizes.NumElements() < TensorShape::MaxDimensions(), + errors::InvalidArgument("too many dimensions: must be < ", + TensorShape::MaxDimensions(), ", but received ", + sizes.NumElements())); + + // Compute the output shape. Determine product of specified + // dimensions, and find the index of the unspecified one. 
+ TensorShape shape; + int64_t product = 1; + int unknown_index = -1; + bool sizes_has_zero_dim; + switch (sizes.dtype()) { + case DT_INT32: + OP_REQUIRES_OK(context, + ValidateSizes(sizes, &product, &unknown_index, + &shape, &sizes_has_zero_dim)); + break; + case DT_INT64: + OP_REQUIRES_OK(context, + ValidateSizes(sizes, &product, &unknown_index, + &shape, &sizes_has_zero_dim)); + break; + default: + context->CtxFailure(errors::InvalidArgument( + "desired shape must be a DT_INT32 or DT_INT64 vector, not a ", + DataTypeString(sizes.dtype()))); + return; + } + if (unknown_index != -1) { + int64_t input_num_elements = 1; + bool input_has_zero_dim = false; + for (int dim = 0; dim < input.dims(); dim++) { + // For zero dimension, we don't count it into `input_num_elements` + // unless `sizes` has no zero dimension, so we are still able to + // infer shapes for other dimensions. + if (input.dim_size(dim) > 0 || !sizes_has_zero_dim) { + input_num_elements *= input.dim_size(dim); + } else { + input_has_zero_dim = true; + } + } + + const int64_t missing = input_num_elements / product; + if (!input_has_zero_dim) { + OP_REQUIRES( + context, product * missing == input_num_elements, + errors::InvalidArgument( + "Input to reshape is a tensor with ", input_num_elements, + " values, but the requested shape requires a multiple of ", + product)); + } + shape.set_dim(unknown_index, missing); + } + OP_REQUIRES(context, shape.num_elements() == input.NumElements(), + errors::InvalidArgument("Input to reshape is a tensor with ", + input.NumElements(), + " values, but the requested shape has ", + shape.num_elements())); + + // Actually produce the reshaped output. + Tensor output(input.dtype()); + CHECK(output.CopyFrom(input, shape)); + context->set_output(0, std::move(output)); + } + + bool IsExpensive() override { return false; } + + private: + template + absl::Status ValidateSizes(const Tensor& sizes, int64_t* product, + int* unknown_index, TensorShape* shape, + bool* has_zero_dim) { + *product = 1; + *unknown_index = -1; + *has_zero_dim = false; + const int64_t num_dims = sizes.NumElements(); + auto Svec = sizes.flat(); + for (int d = 0; d < num_dims; ++d) { + const Tshape size = Svec(d); + if (size == -1) { + if (*unknown_index != -1) { + return errors::InvalidArgument( + "Only one input size may be -1, not both ", *unknown_index, + " and ", d); + } + *unknown_index = d; + TF_RETURN_IF_ERROR(shape->AddDimWithStatus(1)); + } else if (size < 0) { + return errors::InvalidArgument("Size ", d, + " must be non-negative, not ", size); + } else if (size == 0) { + // We don't include zero-sized dimension in product, so that we can + // still calculate number of elements for non-zero-sized dimensions and + // therefore infer their shapes. 
+ TF_RETURN_IF_ERROR(shape->AddDimWithStatus(size)); + *has_zero_dim = true; + } else { + if (MultiplyWithoutOverflow(shape->num_elements(), size) < 0) { + string msg; + for (int ii = 0; ii < num_dims; ++ii) { + if (ii != 0) { + strings::StrAppend(&msg, ", "); + } + strings::StrAppend(&msg, Svec(ii)); + } + return errors::InvalidArgument("Shape [", msg, + "] has too many elements"); + } + TF_RETURN_IF_ERROR(shape->AddDimWithStatus(size)); + (*product) *= size; + } + } + return absl::OkStatus(); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_util.h new file mode 100644 index 00000000..1945712c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reshape_util.h @@ -0,0 +1,52 @@ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class OpKernelContext; +class Tensor; + +// Reshapes the input indices and input shape to the target shape. +// Note: This template is explicitly instantiated for CPU and GPU devices. +template +void ReshapeSparseTensor(OpKernelContext *context, + const Tensor &input_indices_in, + const Tensor &input_shape_in, + const Tensor &target_shape_in, int output_indices_idx, + int output_shape_idx); + +namespace functor { + +template +struct ReshapeSparseTensorFunctor { + absl::Status operator()( + OpKernelContext *context, const TensorShape &input_shape, + const TensorShape &output_shape, + typename TTypes::ConstMatrix input_indices, + typename TTypes::Matrix output_indices) const; +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_ops.h new file mode 100644 index 00000000..1c8d7998 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_ops.h @@ -0,0 +1,99 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
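The -1 handling in ReshapeOp::Compute and ValidateSizes above boils down to the arithmetic in this hypothetical sketch, which ignores zero-sized dimensions and overflow checks for brevity. For a 24-element input and sizes {2, -1, 4}, the known product is 8, so the missing dimension is 3.

#include <cstdint>
#include <vector>

// Infer a single -1 entry so the total element count matches the input.
inline std::vector<int64_t> InferReshapeSketch(int64_t input_num_elements,
                                               std::vector<int64_t> sizes) {
  int64_t product = 1;
  int unknown_index = -1;
  for (int d = 0; d < static_cast<int>(sizes.size()); ++d) {
    if (sizes[d] == -1) unknown_index = d;
    else product *= sizes[d];
  }
  if (unknown_index != -1) sizes[unknown_index] = input_num_elements / product;
  return sizes;  // {2, -1, 4} with 24 elements -> {2, 3, 4}
}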
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" + +namespace tensorflow { + +class VarHandleOp : public OpKernel { + public: + explicit VarHandleOp(OpKernelConstruction* c); + void Compute(OpKernelContext* ctx) override; + const Tensor* const_tensor() const override { + return is_anonymous_ ? nullptr : &const_tensor_; + } + + private: + // Same fields as in ResourceHandleOp. + bool is_anonymous_; + string container_; + string name_; + string debug_name_; + Tensor const_tensor_; + + DtypeAndPartialTensorShape dtype_and_shape_; +}; + +class ReadVariableOp : public OpKernel { + public: + explicit ReadVariableOp(OpKernelConstruction* c); + void Compute(OpKernelContext* ctx) override; + + private: + DataType dtype_; +}; + +class ReadVariablesOp : public OpKernel { + public: + explicit ReadVariablesOp(OpKernelConstruction* c); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override { return false; } + + private: + DataTypeVector dtypes_; +}; + +class DestroyResourceOp : public OpKernel { + public: + explicit DestroyResourceOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + bool ignore_lookup_error_; +}; + +class DisableCopyOnReadOp : public OpKernel { + public: + explicit DisableCopyOnReadOp(OpKernelConstruction* c) : OpKernel(c) {} + void Compute(OpKernelContext* ctx) override; +}; + +template +class VariableShapeOp : public OpKernel { + public: + explicit VariableShapeOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + core::RefCountPtr variable; + OP_REQUIRES_OK(ctx, + LookupResource(ctx, HandleFromInput(ctx, 0), &variable)); + variable->mu()->lock_shared(); + TensorShape shape = variable->tensor()->shape(); + variable->mu()->unlock_shared(); + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {shape.dims()}, &output)); + for (int i = 0; i < shape.dims(); ++i) { + output->flat()(i) = shape.dim_size(i); + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_util.h new file mode 100644 index 00000000..1222b4eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/resource_variable_util.h @@ -0,0 +1,28 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
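VariableShapeOp above reads the shape with the variable's lock taken in shared mode, so concurrent readers are not serialized. A minimal standard-library illustration of that locking pattern (hypothetical type; the real code uses TensorFlow's mutex, not std::shared_mutex):

#include <shared_mutex>

struct SharedShapeReadSketch {
  mutable std::shared_mutex mu;
  int dims = 0;
  int Dims() const {
    std::shared_lock<std::shared_mutex> lock(mu);  // like lock_shared()/unlock_shared()
    return dims;
  }
};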
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_UTIL_H_ + +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { + +absl::Status ValidateAssignUpdateVariableOpShapes( + const TensorShape& variable_shape, const TensorShape& value_shape); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RESOURCE_VARIABLE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_op.h new file mode 100644 index 00000000..a2de766a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_op.h @@ -0,0 +1,48 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by ReverseOp to do the computations. +template +struct Reverse { + void operator()(const Device& d, typename TTypes::ConstTensor input, + const Eigen::array& reverse_dims, + typename TTypes::Tensor output) { + output.device(d) = input.reverse(reverse_dims); + } +}; + +template +struct Reverse { + void operator()(const Device& d, typename TTypes::ConstTensor input, + const Eigen::array& reverse_dims, + typename TTypes::Tensor output) { + // Reversing a scalar is copying it. + output.device(d) = input; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_sequence_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_sequence_op.h new file mode 100644 index 00000000..f25794f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/reverse_sequence_op.h @@ -0,0 +1,78 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_ +// Generator definition for ReverseSequenceOp, must be compilable by nvcc. 
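A minimal sketch of what the Reverse functor in reverse_op.h above evaluates when only the second axis of a rank-2 input is flagged in reverse_dims (hypothetical helper; the real functor is templated over Device, T and rank):

#include "unsupported/Eigen/CXX11/Tensor"

inline Eigen::Tensor<int, 2> ReverseColumnsSketch(const Eigen::Tensor<int, 2>& t) {
  Eigen::array<bool, 2> reverse_dims{{false, true}};  // flip axis 1 only
  return t.reverse(reverse_dims);
}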
+ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace generator { + +template +class ReverseGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReverseGenerator( + typename TTypes::ConstTensor input, int32_t batch_dim, + int32_t seq_dim, typename TTypes::ConstVec seq_lengths) + : input_(input), + batch_dim_(batch_dim), + seq_dim_(seq_dim), + seq_lengths_(seq_lengths) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& coords) const { + Eigen::array new_coords = coords; + if (coords[seq_dim_] < seq_lengths_(coords[batch_dim_])) { + new_coords[seq_dim_] = + seq_lengths_(coords[batch_dim_]) - coords[seq_dim_] - 1; + } + + return input_(new_coords); + } + + private: + typename TTypes::ConstTensor input_; + int32 batch_dim_; + int32 seq_dim_; + typename TTypes::ConstVec seq_lengths_; +}; + +} // namespace generator + +namespace functor { + +template +struct ReverseSequence { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes::ConstTensor input, + int32_t batch_dim, int32_t seq_dim, + typename TTypes::ConstVec seq_lengths, + typename TTypes::Tensor output) { + generator::ReverseGenerator generator(input, batch_dim, + seq_dim, seq_lengths); + output.device(d) = input.generate(generator); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/blas_gemm.h b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/blas_gemm.h new file mode 100644 index 00000000..dabacedd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/blas_gemm.h @@ -0,0 +1,97 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
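Per batch row, the ReverseGenerator above mirrors only the first seq_len elements along the sequence axis and leaves the padding untouched. A hypothetical scalar sketch over one [time] row (names illustrative only):

#include <algorithm>
#include <cstdint>
#include <vector>

inline std::vector<int> ReverseSequenceRowSketch(const std::vector<int>& row,
                                                 int64_t seq_len) {
  std::vector<int> out(row);
  const int64_t len =
      std::min<int64_t>(seq_len, static_cast<int64_t>(row.size()));
  for (int64_t t = 0; t < len; ++t) {
    out[t] = row[len - 1 - t];  // new_coords[seq_dim] = seq_len - t - 1
  }
  return out;  // {10, 20, 30, 40} with seq_len = 3 -> {30, 20, 10, 40}
}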
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RNN_BLAS_GEMM_H_ +#define TENSORFLOW_CORE_KERNELS_RNN_BLAS_GEMM_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_activations.h" +#include "tensorflow/core/platform/types.h" + +#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) +#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h" +#endif + +namespace tensorflow { +class OpKernelContext; +namespace functor { + +template +struct TensorCuBlasGemm { + void operator()(OpKernelContext* ctx, bool transa, bool transb, uint64 m, + uint64 n, uint64 k, float alpha, const T* a, int lda, + const T* b, int ldb, float beta, T* c, int ldc); +}; + +template +struct gemm_compute_type { + typedef T type; +}; + +template <> +struct gemm_compute_type { + typedef float type; +}; + +template +struct TensorBlasGemm; + +template +struct TensorBlasGemm { + static void compute(OpKernelContext* ctx, const Device& d, bool transa, + bool transb, typename gemm_compute_type::type alpha, + typename TTypes::ConstMatrix a, + typename TTypes::ConstMatrix b, + typename gemm_compute_type::type beta, + typename TTypes::Matrix c) { + int64_t m = c.dimensions()[0]; + int64_t n = c.dimensions()[1]; + int64_t k = transa ? a.dimensions()[0] : a.dimensions()[1]; + + TensorCuBlasGemm()(ctx, transb, transa, n, m, k, alpha, b.data(), + transb ? k : n, a.data(), transa ? m : k, beta, + c.data(), n); + } +}; + +template +struct TensorBlasGemm { + static void compute(OpKernelContext* ctx, const Device& d, bool transa, + bool transb, typename gemm_compute_type::type alpha, + typename TTypes::ConstMatrix a, + typename TTypes::ConstMatrix b, + typename gemm_compute_type::type beta, + typename TTypes::Matrix c) { + Eigen::array, 1> contract_pairs; + contract_pairs[0] = + Eigen::IndexPair(transa == false, transb == true); + if (alpha == typename gemm_compute_type::type(1.f) && + beta == typename gemm_compute_type::type(0.f)) { + c.device(d) = a.contract(b, contract_pairs); + } else if (alpha == typename gemm_compute_type::type(1.f) && + beta == typename gemm_compute_type::type(1.f)) { + c.device(d) += a.contract(b, contract_pairs); + } else { + c.device(d) = c.constant(T(alpha)) * a.contract(b, contract_pairs) + + c.constant(T(beta)) * c; + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RNN_BLAS_GEMM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/gru_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/gru_ops.h new file mode 100644 index 00000000..8799401d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/gru_ops.h @@ -0,0 +1,189 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
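In the non-cuBLAS specialization above, the transpose flags are encoded in the contraction pair: with no transposes the pair is (1, 0), i.e. contract A's columns with B's rows, which is an ordinary matrix product. A hypothetical rank-2 float sketch of that path:

#include "unsupported/Eigen/CXX11/Tensor"

inline Eigen::Tensor<float, 2> ContractMatmulSketch(
    const Eigen::Tensor<float, 2>& a, const Eigen::Tensor<float, 2>& b) {
  Eigen::array<Eigen::IndexPair<int>, 1> pairs{{Eigen::IndexPair<int>(1, 0)}};
  return a.contract(b, pairs);  // c(i, j) = sum_k a(i, k) * b(k, j)
}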
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RNN_GRU_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_RNN_GRU_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/rnn/blas_gemm.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace functor { + +struct GRUCell { + GRUCell(const int batch_size, const int input_size, const int cell_size) + : batch_size_(batch_size), + input_size_(input_size), + cell_size_(cell_size) {} + + inline Eigen::array x_offsets() const { return {0, 0}; } + + inline Eigen::array x_extends() const { + return {batch_size_, input_size_}; + } + + inline Eigen::array h_offsets() const { + return {0, input_size_}; + } + + inline Eigen::array h_extends() const { + return {batch_size_, cell_size_}; + } + + inline Eigen::array ru_r_offset() const { + return {0, 0}; + } + + inline Eigen::array ru_u_offset() const { + return {0, cell_size_}; + } + + inline Eigen::array cell_extents() const { + return {batch_size_, cell_size_}; + } + + protected: + const int batch_size_; + const int input_size_; + const int cell_size_; +}; + +template +struct GRUBlockCellFprop : public GRUCell { + GRUBlockCellFprop(const int batch_size, const int input_size, + const int cell_size) + : GRUCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, const Device& d, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix h_prev, + typename TTypes::ConstMatrix w_ru, typename TTypes::ConstMatrix w_c, + typename TTypes::ConstVec b_ru, typename TTypes::ConstVec b_c, + typename TTypes::Matrix r_u_bar, typename TTypes::Matrix r, + typename TTypes::Matrix u, typename TTypes::Matrix c, + typename TTypes::Matrix h, typename TTypes::Matrix x_h_prev, + typename TTypes::Matrix x_h_prevr) { + // Concat x_h_prev = [x, h_prev]. + x_h_prev.slice(x_offsets(), x_extends()).device(d) = x; + x_h_prev.slice(h_offsets(), h_extends()).device(d) = h_prev; + + // r_u_bar = x_h_prev * w_ru + b_ru + typename TTypes::ConstMatrix const_x_h_prev(x_h_prev.data(), + x_h_prev.dimensions()); + TensorBlasGemm::compute( + ctx, d, false, false, typename gemm_compute_type::type(1.f), + const_x_h_prev, w_ru, typename gemm_compute_type::type(0.f), + r_u_bar); + + // Creating a bias matrix for adding by broadcasting 'b_ru' + Eigen::array broadcast_shape({batch_size_, 1}); + Eigen::array b_ru_shape({1, b_ru.dimensions()[0]}); + r_u_bar.device(d) += b_ru.reshape(b_ru_shape).broadcast(broadcast_shape); + + // Slice r_u_bar into r, u and apply the sigmoid. + r.device(d) = (r_u_bar.slice(ru_r_offset(), cell_extents())).sigmoid(); + u.device(d) = (r_u_bar.slice(ru_u_offset(), cell_extents())).sigmoid(); + + // Concat x_h_prevr = [x,h_prev*r] + x_h_prevr.slice(x_offsets(), x_extends()).device(d) = x; + x_h_prevr.slice(h_offsets(), h_extends()).device(d) = h_prev * r; + + // c = tanh(x_h_prevr*w_c+b_c), Note b_c is broadcasted before adding. 
+ typename TTypes::ConstMatrix const_x_h_prevr(x_h_prevr.data(), + x_h_prevr.dimensions()); + TensorBlasGemm::compute( + ctx, d, false, false, typename gemm_compute_type::type(1.f), + const_x_h_prevr, w_c, typename gemm_compute_type::type(0.f), c); + + Eigen::array b_c_shape({1, b_c.dimensions()[0]}); + c.device(d) += (b_c.reshape(b_c_shape).broadcast(broadcast_shape)); + c.device(d) = c.tanh(); + + // h= u*h_prev + (1-u)*c + h.device(d) = u * (h_prev - c) + c; + } +}; + +template +struct GRUBlockCellBprop : public GRUCell { + GRUBlockCellBprop(const int batch_size, const int input_size, + const int cell_size) + : GRUCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, const Device& d, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix h_prev, + typename TTypes::ConstMatrix w_ru, typename TTypes::ConstMatrix w_c, + typename TTypes::ConstVec b_ru, typename TTypes::ConstVec b_c, + typename TTypes::ConstMatrix r, typename TTypes::ConstMatrix u, + typename TTypes::ConstMatrix c, typename TTypes::ConstMatrix d_h, + typename TTypes::Matrix d_x, typename TTypes::Matrix d_h_prev, + typename TTypes::Matrix d_c_bar, + typename TTypes::Matrix d_r_bar_u_bar, + typename TTypes::Matrix d_r_bar, typename TTypes::Matrix d_u_bar, + typename TTypes::Matrix d_hr, + typename TTypes::Matrix d_x_comp1_and_h_prev_comp1, + typename TTypes::Matrix d_x_comp2_and_h_prevr) { + // d_c_bar = d_h*(1-u)*(1-(c*c)) + d_c_bar.device(d) = + ((d_h * (u.constant(T(1)) - u)) * (c.constant(T(1)) - c * c)); + + // d_u_bar = d_h*(h-c)*(u*(1-u)) + d_u_bar.device(d) = d_h * (h_prev - c) * u * (u.constant(T(1)) - u); + + // [2nd_component_of_d_x d_h_prevr] = d_c_bar X w_c^T + typename TTypes::ConstMatrix const_d_c_bar(d_c_bar.data(), + d_c_bar.dimensions()); + TensorBlasGemm::compute( + ctx, d, false, true, typename gemm_compute_type::type(1.f), + const_d_c_bar, w_c, typename gemm_compute_type::type(0.f), + d_x_comp2_and_h_prevr); + + d_hr.device(d) = d_x_comp2_and_h_prevr.slice(h_offsets(), h_extends()); + d_r_bar.device(d) = (d_hr * h_prev * r) * (r.constant(T(1)) - r); + + // d_r_bar_u_bar = concatenate(d_r_bar, d_u_bar) along axis = 1. + d_r_bar_u_bar.slice(ru_r_offset(), cell_extents()).device(d) = d_r_bar; + d_r_bar_u_bar.slice(ru_u_offset(), cell_extents()).device(d) = d_u_bar; + + // [1st_component_of_d_x 1st_component_of_d_h_prev] = [d_r_bar d_u_bar] X + // w_ru^T + typename TTypes::ConstMatrix const_d_r_bar_u_bar( + d_r_bar_u_bar.data(), d_r_bar_u_bar.dimensions()); + TensorBlasGemm::compute( + ctx, d, false, true, typename gemm_compute_type::type(1.f), + const_d_r_bar_u_bar, w_ru, typename gemm_compute_type::type(0.f), + d_x_comp1_and_h_prev_comp1); + + // d_x = d_x_comp1 + d_x_comp2 + d_x.device(d) = (d_x_comp1_and_h_prev_comp1 + d_x_comp2_and_h_prevr) + .slice(x_offsets(), x_extends()); + + // d_h_prev = d_h_comp1 + d_hr*r + d_h*u + d_h_prev.device(d) = + d_x_comp1_and_h_prev_comp1.slice(h_offsets(), h_extends()) + + (d_hr * r) + (d_h * u); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RNN_GRU_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/lstm_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/lstm_ops.h new file mode 100644 index 00000000..f2457531 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/rnn/lstm_ops.h @@ -0,0 +1,308 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
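For a single input feature and a single unit, the gate algebra that GRUBlockCellFprop above implements with two GEMMs collapses to the scalar step in this hypothetical sketch (weight names are illustrative; sigmoid gates, tanh candidate). The last line shows that h = u*(h_prev - c) + c is the usual blend u*h_prev + (1 - u)*c with one multiply saved.

#include <cmath>

inline float GruCellStepSketch(float x, float h_prev,
                               float w_xr, float w_hr, float b_r,    // reset gate
                               float w_xu, float w_hu, float b_u,    // update gate
                               float w_xc, float w_hc, float b_c) {  // candidate
  const auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
  const float r = sigmoid(w_xr * x + w_hr * h_prev + b_r);
  const float u = sigmoid(w_xu * x + w_hu * h_prev + b_u);
  const float c = std::tanh(w_xc * x + w_hc * (h_prev * r) + b_c);
  return u * (h_prev - c) + c;  // == u*h_prev + (1 - u)*c
}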
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_RNN_LSTM_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_RNN_LSTM_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_activations.h" +#include "tensorflow/core/kernels/rnn/blas_gemm.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +class OpKernelContext; + +enum GateLayout { ICFO, IFCO }; + +constexpr int gate_c_offset(GateLayout gate_layout, int cell_size) { + return (gate_layout == ICFO) ? cell_size : cell_size * 2; +} + +constexpr int gate_f_offset(GateLayout gate_layout, int cell_size) { + return (gate_layout == ICFO) ? cell_size * 2 : cell_size; +} + +namespace functor { + +template +struct TensorZero { + void operator()(const Device& d, typename TTypes::Flat t) { + t.device(d) = t.constant(T(0)); + } +}; + +template +struct TensorUnalignedZero { + void operator()(const Device& d, typename TTypes::UnalignedFlat t) { + t.device(d) = t.constant(T(0)); + } +}; + +template +struct TensorCopy { + void operator()(const Device& d, typename TTypes::ConstFlat src, + typename TTypes::Flat dst) { + dst.device(d) = src; + } +}; + +template +struct TensorCopyUnaligned { + void operator()(const Device& d, typename TTypes::UnalignedConstFlat src, + typename TTypes::Flat dst) { + dst.device(d) = src; + } +}; + +template +struct TensorCopyToUnaligned { + void operator()(const Device& d, typename TTypes::ConstFlat src, + typename TTypes::UnalignedFlat dst) { + dst.device(d) = src; + } +}; + +template +struct TensorAdd { + void operator()(const Device& d, typename TTypes::ConstFlat a, + typename TTypes::ConstFlat b, typename TTypes::Flat c) { + c.device(d) = a + b; + } +}; + +template +struct TensorZeroPadding { + void operator()(const Device& d, const int64_t time_idx, + typename TTypes::ConstVec seq_len, + typename TTypes::Vec mask, typename TTypes::Matrix m) { + // mask is shape [batch_size]. + mask.device(d) = seq_len.constant(time_idx) < seq_len; + + // m_shape is [batch_size, 1]. + Eigen::array m_shape({m.dimensions()[0], 1}); + // broadcast_shape is [1, units]. + Eigen::array broadcast_shape({1, m.dimensions()[1]}); + + // m is shape [batch_size, units]. 
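The reshape/broadcast that TensorZeroPadding sets up above (and applies in the statement just below) is plain per-row masking: batch entries whose sequence has already ended at time_idx get their row zeroed. A minimal sketch on flat arrays, assuming row-major [batch_size, units] storage and illustrative names:

#include <cstdint>
#include <vector>

// Zero out rows for batch entries whose sequence length is exceeded at step
// time_idx: mask[b] = (time_idx < seq_len[b]), then m[b, :] *= mask[b].
void ZeroPadStep(int64_t time_idx, const std::vector<int64_t>& seq_len,
                 std::vector<float>& m, int units) {
  const int batch_size = static_cast<int>(seq_len.size());
  for (int b = 0; b < batch_size; ++b) {
    const float mask = (time_idx < seq_len[b]) ? 1.0f : 0.0f;
    for (int j = 0; j < units; ++j) m[b * units + j] *= mask;
  }
}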
+ m.device(d) = m * mask.reshape(m_shape).broadcast(broadcast_shape); + } +}; + +struct LSTMBlockCell { + LSTMBlockCell(const int batch_size, const int input_size, const int cell_size) + : batch_size_(batch_size), + input_size_(input_size), + cell_size_(cell_size) {} + + int batch_size() const { return batch_size_; } + + int input_size() const { return input_size_; } + + int cell_size() const { return cell_size_; } + + inline Eigen::array gates_i_offsets() const { + return {0, 0}; + } + + inline Eigen::array gates_c_offsets( + const GateLayout gate_layout) const { + return {0, gate_c_offset(gate_layout, cell_size_)}; + } + + inline Eigen::array gates_f_offsets( + const GateLayout gate_layout) const { + return {0, gate_f_offset(gate_layout, cell_size_)}; + } + + inline Eigen::array gates_o_offsets() const { + return {0, cell_size_ * 3}; + } + + inline Eigen::array cell_extents() const { + return {batch_size_, cell_size_}; + } + + inline Eigen::array xh_x_offsets() const { + return {0, 0}; + } + + inline Eigen::array xh_x_extents() const { + return {batch_size_, input_size_}; + } + + inline Eigen::array xh_h_offsets() const { + return {0, input_size_}; + } + + inline Eigen::array xh_h_extents() const { + return {batch_size_, cell_size_}; + } + + protected: + const int batch_size_; + const int input_size_; + const int cell_size_; +}; + +// See lstm_ops.cc for CPUDevice implementation and lstm_ops_gpu.cu.cc for +// GPUDevice implementation. +template +struct LSTMBlockCellFprop : public LSTMBlockCell { + LSTMBlockCellFprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMBlockCell(batch_size, input_size, cell_size) {} + + void operator()(OpKernelContext* ctx, const Device& d, + const float forget_bias, const float cell_clip, + bool use_peephole, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, + typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, + typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, + typename TTypes::ConstVec b, typename TTypes::Matrix xh, + typename TTypes::Matrix i, typename TTypes::Matrix cs, + typename TTypes::Matrix f, typename TTypes::Matrix o, + typename TTypes::Matrix ci, typename TTypes::Matrix co, + typename TTypes::Matrix gates, + typename TTypes::Matrix h); +}; + +// See lstm_ops.cc for CPUDevice implementation and lstm_ops_gpu.cu.cc for +// GPUDevice implementation. 
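The two gate layouts above differ only in where the cell ('c') and forget ('f') blocks sit inside the fused [batch, 4*cell_size] gate matrix; the constexpr helpers encode exactly that. A small compile-time check of the offsets, assuming the declarations above are in scope:

// With cell_size == 8 the fused gate matrix is split into column blocks:
//   ICFO: i=[0,8)  c=[8,16)   f=[16,24)  o=[24,32)
//   IFCO: i=[0,8)  f=[8,16)   c=[16,24)  o=[24,32)
static_assert(tensorflow::gate_c_offset(tensorflow::ICFO, 8) == 8,  "ICFO: c follows i");
static_assert(tensorflow::gate_f_offset(tensorflow::ICFO, 8) == 16, "ICFO: f follows c");
static_assert(tensorflow::gate_c_offset(tensorflow::IFCO, 8) == 16, "IFCO: c follows f");
static_assert(tensorflow::gate_f_offset(tensorflow::IFCO, 8) == 8,  "IFCO: f follows i");
// The i block is always first and the o block always last (offset 3 * cell_size).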
+template +struct LSTMBlockCellBprop : public LSTMBlockCell { + LSTMBlockCellBprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMBlockCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, const Device& d, bool use_peephole, + typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, typename TTypes::ConstVec b, + typename TTypes::ConstMatrix i, typename TTypes::ConstMatrix cs, + typename TTypes::ConstMatrix f, typename TTypes::ConstMatrix o, + typename TTypes::ConstMatrix ci, typename TTypes::ConstMatrix co, + typename TTypes::ConstMatrix cs_grad, + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, + typename TTypes::Matrix df, typename TTypes::Matrix di, + typename TTypes::Matrix dgates, + typename TTypes::Matrix cs_prev_grad, typename TTypes::Vec wci_grad, + typename TTypes::Vec wcf_grad, typename TTypes::Vec wco_grad); +}; + +template +struct BlockLSTMBprop : public LSTMBlockCell { + BlockLSTMBprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMBlockCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, const Device& d, bool use_peephole, + typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, typename TTypes::ConstVec b, + typename TTypes::Matrix xh, typename TTypes::ConstMatrix i, + typename TTypes::ConstMatrix cs, typename TTypes::ConstMatrix f, + typename TTypes::ConstMatrix o, typename TTypes::ConstMatrix ci, + typename TTypes::ConstMatrix co, + typename TTypes::ConstMatrix cs_grad, + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, + typename TTypes::Matrix df, typename TTypes::Matrix di, + typename TTypes::Matrix dgates, + typename TTypes::Matrix cs_prev_grad, + typename TTypes::Matrix h_prev_grad, + typename TTypes::Matrix xh_grad, typename TTypes::Matrix x_grad, + typename TTypes::Matrix w_grad, typename TTypes::Vec wci_grad, + typename TTypes::Vec wcf_grad, typename TTypes::Vec wco_grad, + typename TTypes::Vec b_grad) { + // do[t] = sigm'(o[t]) .* dh[t] .* co[t] + do_.device(d) = o * (o.constant(T(1)) - o) * h_grad * co; + + // dcs[t] += tanh'(cs[t]) .* dh[t] .* o[t] + dcs[t + 1] .* f[t + 1] + dcs.device(d) = (co.constant(T(1)) - co * co) * h_grad * o + cs_grad; + + Eigen::array p_shape({1, cell_size_}); + Eigen::array p_broadcast_shape({batch_size_, 1}); + if (use_peephole) { + dcs.device(d) = + dcs + do_ * wco.reshape(p_shape).broadcast(p_broadcast_shape); + } + + // dci[t] = tanh'(ci[t]) dcs[t] i[t] + dci.device(d) = (ci.constant(T(1)) - ci * ci) * dcs * i; + + // df[t] = sigm'(f[t]) dcs[t] cs[t - 1] + df.device(d) = f * (f.constant(T(1)) - f) * dcs * cs_prev; + + // di[t] = sigm'(i[t]) dcs[t] ci[t] + di.device(d) = i * (i.constant(T(1)) - i) * dcs * ci; + + dgates.slice(gates_i_offsets(), cell_extents()).device(d) = di; + dgates.slice(gates_c_offsets(gate_layout), cell_extents()).device(d) = dci; + dgates.slice(gates_f_offsets(gate_layout), cell_extents()).device(d) = df; + dgates.slice(gates_o_offsets(), cell_extents()).device(d) = do_; + 
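Element-wise, the gate gradients computed above reduce to a handful of scalar formulas. A per-element sketch that mirrors the comments (peephole terms omitted), useful only as a cross-check against the Eigen expressions, not as a replacement for them:

#include <cmath>

// Per-element LSTM backprop on post-activation values: sigm'(z) = z*(1-z),
// tanh'(z) = 1 - z*z.  Peephole contributions are omitted for brevity.
struct LstmGateGrads {
  float do_, dcs, dci, df, di;
};

inline LstmGateGrads LstmCellBackprop(float i, float cs_prev, float f, float o,
                                      float ci, float co, float cs_grad,
                                      float h_grad) {
  LstmGateGrads g;
  g.do_ = o * (1.0f - o) * h_grad * co;             // do[t]
  g.dcs = (1.0f - co * co) * h_grad * o + cs_grad;  // dcs[t]
  g.dci = (1.0f - ci * ci) * g.dcs * i;             // dci[t]
  g.df  = f * (1.0f - f) * g.dcs * cs_prev;         // df[t]
  g.di  = i * (1.0f - i) * g.dcs * ci;              // di[t]
  return g;
}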
+ cs_prev_grad.device(d) = dcs * f; + if (use_peephole) { + cs_prev_grad.device(d) = + cs_prev_grad + + di * wci.reshape(p_shape).broadcast(p_broadcast_shape) + + df * wcf.reshape(p_shape).broadcast(p_broadcast_shape); + } + + // xh_grad. + typename TTypes::ConstMatrix const_dgates(dgates.data(), + dgates.dimensions()); + TensorBlasGemm::compute( + ctx, d, false, true, 1.f, const_dgates, w, 0.f, xh_grad); + + // xh. + xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x; + xh.slice(xh_h_offsets(), xh_h_extents()).device(d) = h_prev; + typename TTypes::ConstMatrix const_xh(xh.data(), xh.dimensions()); + + // x_grad. + x_grad.device(d) = xh_grad.slice(xh_x_offsets(), xh_x_extents()); + h_prev_grad.device(d) = xh_grad.slice(xh_h_offsets(), xh_h_extents()); + + // w_grad. + TensorBlasGemm::compute( + ctx, d, true, false, 1.f, const_xh, const_dgates, 1.f, w_grad); + + // b_grad. + b_grad.device(d) += dgates.sum(Eigen::array({0})); + + if (use_peephole) { + wci_grad.device(d) += (di * cs_prev).sum(Eigen::array({0})); + wcf_grad.device(d) += (df * cs_prev).sum(Eigen::array({0})); + wco_grad.device(d) += (do_ * cs).sum(Eigen::array({0})); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RNN_LSTM_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/roll_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/roll_op.h new file mode 100644 index 00000000..7ae1d8f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/roll_op.h @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_ROLL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_ROLL_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct Roll { + // dim_size - the size of each dimension + // dim_range - the number of indices over in the flattened tensor + // you need to skip in order to make it over from one side of a dimension + // to the other. Used to make the shifts wrap around after a threshold. 
+ // threshold - the index for each dimension that the roll starts to wrap + // back to the front + // isd - inner shift dimension + void operator()(const OpKernelContext* context, const int64_t num_elements, + const int num_dims, const absl::Span dim_size, + const T* input, T* output, + const absl::Span threshold, + const absl::Span dim_range, const int64_t isd); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_ROLL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/save_restore_tensor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/save_restore_tensor.h new file mode 100644 index 00000000..f5fac541 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/save_restore_tensor.h @@ -0,0 +1,73 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_ +#define TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_ + +#include "tensorflow/core/util/tensor_slice_reader.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +class OpKernelContext; + +// Legacy / V1 checkpoint format. + +// Save input tensors in *context to a writer built from builder_func(). +// context must have the following inputs: +// 0: a single element string tensor that contains the file name. +// 1: names for the remaining tensors +// If save_slices is true: +// 2: shape and slice specifications. +// rest: tensors to save +void SaveTensors( + OpKernelContext* context, + checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func, + bool save_slices); + +// Reads a single tensor from the reader built from open_func() and produces +// it as context->output(restore_index). "preferred_shard" is the same the +// TensorSliceReader preferred_shard parameter. +// +// context must have the following inputs: +// 0: a single element string tensor that contains the file name. +// 1: string tensor that names the outputs to be restored. +// If restore_slice is true: +// 2: shape and slice specification of the tensors to restore. +// +// restore_index indicates the variable name and slice to lookup +// in context(1) and (2). +void RestoreTensor(OpKernelContext* context, + checkpoint::TensorSliceReader::OpenTableFunction open_func, + int preferred_shard, bool restore_slice, int restore_index); + +// V2 checkpoint format. + +// Invokes the V2 checkpoint read path to read tensors. +// +// "context" is only used for allocating outputs. In particular, the inputs are +// explicitly provided and not accessed via the "input(i)" methods. +// REQUIRES: +// * "prefix" has 1 element, DT_STRING. +// * "tensor_names" and "shape_and_slices" shaped {N}, both DT_STRING. +// * "dtypes" has N elements, the datatypes of the to-restore tensors. 
+absl::Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix, + const Tensor& tensor_names, + const Tensor& shape_and_slices, + absl::Span dtypes); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops.h new file mode 100644 index 00000000..ad3f2e1e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops.h @@ -0,0 +1,146 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::Index Index; + +// TODO(b/154339590): Needs to be vectorized. +template +struct Scan { + void operator()(const Device& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, const Reducer& reducer, + const bool reverse, const bool exclusive) { + // Perform the reverse ops directly with Eigen, which avoids copying the + // tensor twice compared to using individual ops. + Eigen::array dims; + dims[0] = false; + dims[1] = reverse; + dims[2] = false; + MaybeWith32BitIndexing( + [&](auto in32, auto out32) { + out32.device(d) = + in32.reverse(dims).scan(1, reducer, exclusive).reverse(dims); + }, + in, out); + } +}; + +template +struct LogSumExp { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + auto mi = Eigen::internal::scalar_min_op()(a, b); + auto ma = Eigen::internal::scalar_max_op()(a, b); + + auto sub = Eigen::internal::scalar_difference_op(); + auto add = Eigen::internal::scalar_sum_op(); + auto exp = Eigen::internal::scalar_exp_op(); + auto log1p = Eigen::internal::scalar_log1p_op(); + auto cmp_lt = + Eigen::internal::scalar_cmp_op(); + + auto logsumexp = add(log1p(exp(sub(mi, ma))), ma); + return cmp_lt(ma, Eigen::NumTraits::lowest()) ? 
ma : logsumexp; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const T& a, + const T& b) const { + auto mi = Eigen::internal::pmin(a, b); + auto ma = Eigen::internal::pmax(a, b); + using Eigen::internal::padd; + using Eigen::internal::pcmp_lt; + using Eigen::internal::pexp; + using Eigen::internal::plog1p; + using Eigen::internal::pset1; + using Eigen::internal::psub; + + auto logsumexp = padd(plog1p(pexp(psub(mi, ma))), ma); + return pselect(pcmp_lt(ma, pset1(Eigen::NumTraits::lowest())), ma, + logsumexp); + } +}; + +template +struct LogSumExpReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + LogSumExp logsumexp; + *accum = logsumexp(*accum, t); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, + Packet* accum) const { + LogSumExp logsumexp; + *accum = logsumexp.packetOp(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return -Eigen::NumTraits::infinity(); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return Eigen::internal::pset1(initialize()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet + finalizePacket(const Packet& vaccum) const { + return vaccum; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T + finalizeBoth(const T saccum, const Packet& vaccum) const { + auto max_reducer = Eigen::internal::MaxReducer(); + auto sum_reducer = Eigen::internal::SumReducer(); + auto exp = Eigen::internal::scalar_exp_op(); + auto cmp_lt = + Eigen::internal::scalar_cmp_op(); + auto log = Eigen::internal::scalar_log_op(); + auto add = Eigen::internal::scalar_sum_op(); + + using Eigen::internal::pexp; + using Eigen::internal::psub; + + // `ma = max(x1, ..., xn)` + // If the max of all of the `xi` is `-infinity` then the result is + // -infinity. If the max is larger than `-infinity` then it's safe to use + // for normalization even if the other elements are `-infinity`. + // + // `logsumexp(x1, ..., xn) = ma + log (exp(x1 - ma) + ... + exp(xn - ma))` + auto ma = max_reducer.finalizeBoth(saccum, vaccum); + auto logsumexp = add(log(sum_reducer.finalizeBoth( + exp(saccum - ma), pexp(psub(vaccum, pset1(ma))))), + ma); + return cmp_lt(ma, Eigen::NumTraits::lowest()) ? initialize() : logsumexp; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops_gpu.h new file mode 100644 index 00000000..15b4e5e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scan_ops_gpu.h @@ -0,0 +1,334 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#define CUB_USE_COOPERATIVE_GROUPS + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/scan_ops.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/gpu_launch_config.h" +#include "tensorflow/core/util/permutation_input_iterator.h" +#include "tensorflow/core/util/permutation_output_iterator.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::Index Index; + +namespace functor { + +// Map a contiguous range to the actual memory locations depending on which +// axis the scan is taking place over and whether or not reversed. +struct MapIndexToLocation { + __host__ __device__ MapIndexToLocation(int dimx, int dimy, int dimz, + bool reverse = false) + : dimx_(dimx), dimy_(dimy), dimz_(dimz), reverse_(reverse) {} + + __host__ __device__ int operator()(int id) const { + if (dimx_ == 1) { + int row = id % dimy_; + int col = id / dimy_; + + if (reverse_) return (dimy_ - row - 1) * dimz_ + col; + + return row * dimz_ + col; + } else if (dimz_ == 1) { + if (reverse_) { + int row = id / dimy_; + int col = id % dimy_; + return row * dimy_ + (dimy_ - col - 1); + } + return id; + } else { + int col = id % dimy_; + int tmp = id / dimy_; + + int row1 = id / (dimy_ * dimz_); + int col1 = tmp % dimz_; + + if (reverse_) + return row1 * dimy_ * dimz_ + (dimy_ - col - 1) * dimz_ + col1; + + return row1 * dimy_ * dimz_ + col * dimz_ + col1; + } + } + + int dimx_; + int dimy_; + int dimz_; + bool reverse_; +}; + +template +struct BlockPrefixCallbackOp { + // Running prefix + T running_total_; + Op op_; + + __device__ BlockPrefixCallbackOp(T running_total, Op op) + : running_total_(running_total), op_(op) {} + + // Callback operator to be entered by the first warp of threads in the block. + // tid 0 is responsible for returning a value for seeding the block-wide scan. 
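MapIndexToLocation above is what lets the scan kernel walk each sequence with a plain counting iterator: for a column scan (dimx == 1), consecutive ids map to locations strided by dimz, optionally reversed along the scan axis. A small host-side illustration of that branch, with made-up dimensions:

#include <cstdio>

// Reproduces the dimx == 1 branch of MapIndexToLocation: consecutive ids walk
// down one column (stride dimz), optionally reversed along the scan axis.
int ColumnScanLocation(int id, int dimy, int dimz, bool reverse) {
  const int row = id % dimy;
  const int col = id / dimy;
  return (reverse ? (dimy - row - 1) : row) * dimz + col;
}

int main() {
  // dimy = 4 rows, dimz = 3 columns: ids 0..3 cover column 0 at offsets
  // 0, 3, 6, 9 (or 9, 6, 3, 0 when reversed).
  for (int id = 0; id < 4; ++id)
    std::printf("%d -> %d (rev %d)\n", id, ColumnScanLocation(id, 4, 3, false),
                ColumnScanLocation(id, 4, 3, true));
}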
+ __device__ T operator()(T block_aggregate) { + T old_prefix = running_total_; + running_total_ = op_(old_prefix, block_aggregate); + return old_prefix; + } +}; + +template +struct Sum { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a + b; + } +}; + +template +struct Prod { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; + +template +struct IsSum { + constexpr static bool value = + (std::is_same>::value || + std::is_same>::value); +}; + +template +struct IsProd { + constexpr static bool value = + (std::is_same>::value || + std::is_same>::value); +}; + +template +struct IsLogSumExp { + constexpr static bool value = (std::is_same>::value || + std::is_same>::value); +}; + +template +struct IdentityValue { + static_assert(IsSum::value || IsProd::value || + IsLogSumExp::value, + "IdentityValue not yet defined for this type."); + + template + __host__ __device__ U operator()( + typename std::enable_if::value, U>::type t = U(0)) { + return t; + } + + template + __host__ __device__ U operator()( + typename std::enable_if::value, U>::type t = U(1)) { + return t; + } + + template + __host__ __device__ U + operator()(typename std::enable_if::value, U>::type t = + U(Eigen::NumTraits::lowest())) { + return t; + } +}; + +// Each block is mapped to one sequence. A contiguous range is mapped to the +// appropriate locations in memory by the permutation iterators. This is +// ideal for 1-D and row based scans. Column scans would be better if they +// did a block load and then locally transposed. CUB's device wide scan is not +// used in the large 1D case, even though it would be more efficient, because +// it is not deterministic. +template +__launch_bounds__(BlockDim) __global__ + void scan_kernel(const T* in, T* out, int dimx, int dimy, int dimz, + bool exclusive, bool reverse, Op op) { + typedef gpuprim::BlockLoad + BlockLoad; + typedef gpuprim::BlockStore + BlockStore; + typedef gpuprim::BlockScan BlockScan; + + // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + __shared__ union { + typename BlockLoad::TempStorage load; + typename BlockScan::TempStorage scan; + typename BlockStore::TempStorage store; + } temp_storage; + + int problem_length = dimy; + + // Initialize running total + BlockPrefixCallbackOp prefix_op(IdentityValue()(), op); + + MapIndexToLocation map_op(dimx, dimy, dimz, reverse); + int block_start = problem_length * blockIdx.x; + // Have the block iterate over segments of items + for (int block_offset = block_start; + block_offset < block_start + problem_length; + block_offset += BlockDim * ItemsPerThread) { + int valid_items = min(BlockDim * ItemsPerThread, + problem_length - (block_offset % problem_length)); + + // first construct a counting iterator that has the desired start point + typedef gpuprim::TransformInputIterator> + MapIterType; + + gpuprim::CountingInputIterator counting_iter(block_offset); + + // Next map the iterator to the actual locations in memory + MapIterType map_iter(counting_iter, map_op); + + PermutationInputIterator permutein_iter(in, + map_iter); + PermutationOutputIterator permuteout_iter(out, + map_iter); + + // Load a segment of consecutive items that are blocked across threads + T thread_data[ItemsPerThread]; + BlockLoad(temp_storage.load).Load(permutein_iter, thread_data, valid_items); + __syncthreads(); + + // Collectively compute the block-wide scan + if (exclusive) { + BlockScan(temp_storage.scan) + .ExclusiveScan(thread_data, thread_data, op, prefix_op); + 
} else { + BlockScan(temp_storage.scan) + .InclusiveScan(thread_data, thread_data, op, prefix_op); + } + __syncthreads(); + + // Store scanned items to output segment + BlockStore(temp_storage.store) + .Store(permuteout_iter, thread_data, valid_items); + __syncthreads(); + } +} + +template +void LaunchScan(const GPUDevice& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, Op op, const bool reverse, + const bool exclusive) { + const int items_per_thread = 4; + + int dimx = in.dimension(0); + int dimy = in.dimension(1); + int dimz = in.dimension(2); + int num_blocks = dimx * dimz; + + int ideal_block_size = dimy / items_per_thread; + const int rocm_threads_per_warp = 64; + ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp); + + // There seems to be a bug when the type is not float and block_size 1024. + // Launch on the smallest power of 2 block size that we can. + if (ideal_block_size >= 1024 && std::is_same::value) { + const int block_size = 1024; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); + } else if (ideal_block_size >= 512) { + const int block_size = 512; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); + } else if (ideal_block_size >= 256) { + const int block_size = 256; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); + } else if (ideal_block_size >= 128) { + const int block_size = 128; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); +#if TENSORFLOW_COMPILER_IS_HIP_CLANG + // HIP-CLANG has some kind of problem here with 32 threads (possibly because + // the warpsize is 64). 
Reenable when working properly + } else if (true) { +#else + } else if (ideal_block_size >= 64) { +#endif + const int block_size = 64; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); + } else { + const int block_size = 32; + TF_CHECK_OK( + GpuLaunchKernel(scan_kernel, + num_blocks, block_size, 0, d.stream(), in.data(), + out.data(), dimx, dimy, dimz, exclusive, reverse, op)); + } +} + +template +struct Scan, T> { + void operator()(const GPUDevice& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, + const Eigen::internal::SumReducer& reducer, + const bool reverse, const bool exclusive) { + LaunchScan>(d, in, out, Sum(), reverse, exclusive); + } +}; + +template +struct Scan, T> { + void operator()(const GPUDevice& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, + const Eigen::internal::ProdReducer& reducer, + const bool reverse, const bool exclusive) { + LaunchScan>(d, in, out, Prod(), reverse, exclusive); + } +}; + +template +struct Scan, T> { + void operator()(const GPUDevice& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, + const LogSumExpReducer& reducer, const bool reverse, + const bool exclusive) { + LaunchScan>(d, in, out, LogSumExp(), reverse, exclusive); + } +}; + +} // namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor.h new file mode 100644 index 00000000..dcfae9b7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor.h @@ -0,0 +1,414 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_ + +#include + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/determinism.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +class OpKernelContext; +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace scatter_op { + +enum class UpdateOp { ASSIGN, ADD, SUB, MUL, DIV, MIN, MAX }; + +namespace internal { + +template +struct Assign {}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p = u; + } + template + static void RunScalar(Params p, Update u) { + p.setConstant(u); + } +}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p += u; + } + template + static void RunScalar(Params p, Update u) { + p = p + u; + } +}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p -= u; + } + template + static void RunScalar(Params p, Update u) { + p = p + static_cast(-u); + } +}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p *= u; + } + template + static void RunScalar(Params p, Update u) { + p = p * u; + } +}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p /= u; + } + template + static void RunScalar(Params p, Update u) { + p = p / u; + } +}; +template <> +struct Assign { + // This method requires that Params and Update are tensor types. + template + static void Run(Params p, Update u) { + p = p.cwiseMin(u); + } + // Same thing, but for Update being a scalar type. + template + static void RunScalar(Params p, Update u) { + p = p.cwiseMin(u); + } +}; +template <> +struct Assign { + template + static void Run(Params p, Update u) { + p = p.cwiseMax(u); + } + template + static void RunScalar(Params p, Update u) { + p = p.cwiseMax(u); + } +}; + + +} // namespace internal +} // namespace scatter_op + +namespace functor { +template +struct ScatterFunctor { + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices); +}; + +template +struct ScatterFunctorBase { + Index ParallelExecute(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + const Index kMaxLocks = 1024; + const Index entries_per_lock = (limit + kMaxLocks - 1) / kMaxLocks; + // To reduce the number of locks and the memory usage, we divide the whole + // index space into kMaxLocks regions with each lock serializing access to + // a region. + mutex accessed[kMaxLocks]; + std::atomic bad_index(-1); + auto ParallelScatter = [&](Index start, Index end) { + for (Index i = start; i < end; ++i) { + // Grab the index and check its validity. 
Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in + // between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) { + bad_index = i; + return; + } + const Index lock_id = index / entries_per_lock; + // Copy last Ndim-1 dimensions of updates[i] to params[index] + { + mutex_lock l(accessed[lock_id]); + scatter_op::internal::Assign::Run(params.template chip<0>(index), + updates.template chip<0>(i)); + } + } + }; + const float kMovingCost = 2.5f; + float shard_cost = kMovingCost * params.dimension(1); + const DeviceBase::CpuWorkerThreads& worker_threads = + *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, shard_cost, + ParallelScatter); // TODO: Come up with a good cost estimate. + return bad_index; + } + Index SerialExecute(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + for (Index i = 0; i < N; ++i) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in + // between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Copy last Ndim-1 dimensions of updates[i] to params[index] + scatter_op::internal::Assign::Run(params.template chip<0>(index), + updates.template chip<0>(i)); + } + return -1; + } + + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { +#ifdef PLATFORM_GOOGLE + // The parallel version is significantly slower internally. Only call the + // serial version for now. + // TODO(penporn): Avoid locking in parallelization (sort beforehand). + return SerialExecute(c, d, params, updates, indices); +#else + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + const Index min_n_threshold = 1024; + const Index ser_par_ratio = 10000; + // For parallelizing the updates, duplicate entries need to be handled + // correctly. Multiple updates to the same index has to be serialized. + // This can lead to lock contention which may nullify the benefits of + // parallelization. Assuming uniform random distribution of the indices, we + // come up with a rough heuristic and determine whether the updates execute + // serially or parallelly. Also if 'N' is small, overheads of parallel + // execution outweigh its benefits and hence we check the value of N. + const bool execute_serial = N < min_n_threshold || + (N / limit) > ser_par_ratio || + OpDeterminismRequired(); + if (execute_serial) + return SerialExecute(c, d, params, updates, indices); + else + return ParallelExecute(c, d, params, updates, indices); +#endif // PLATFORM_GOOGLE + } +}; + +template +struct ScatterFunctorVariantAssignBase { + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). 
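The execute_serial heuristic above reads as: stay serial when there is too little work, when updates per destination row are dense enough that lock contention would dominate, or when determinism is required. A standalone restatement with the thresholds copied from the code above and the determinism flag passed in as a parameter:

#include <cstdint>

// Restates the serial-vs-parallel decision from ScatterFunctorBase above.
// n    - number of indices/updates
// rows - params.dimension(0), i.e. the number of destination rows (assumed > 0)
bool ScatterShouldRunSerially(int64_t n, int64_t rows, bool determinism_required) {
  constexpr int64_t kMinNThreshold = 1024;  // too little work to parallelize
  constexpr int64_t kSerParRatio = 10000;   // expected updates per row; large
                                            // values mean heavy lock contention
  return n < kMinNThreshold || (n / rows) > kSerParRatio || determinism_required;
}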
+ const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + const Index cols = static_cast(params.dimension(1)); + DCHECK_EQ(N, updates.dimension(0)); + DCHECK_EQ(cols, updates.dimension(1)); + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Copy last Ndim-1 dimensions of updates[i] to params[index] + for (int j = 0; j < cols; ++j) { + const Variant& to_scatter = updates(i, j); + params(index, j) = to_scatter; + } + } + return -1; + } +}; + +template +struct ScatterFunctor + : ScatterFunctorVariantAssignBase {}; + +template +struct ScatterFunctor + : ScatterFunctorVariantAssignBase {}; + + +template +struct ScatterFunctorBase { + Index operator()(OpKernelContext* c, const CPUDevice& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + if (!std::is_same::value) { + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in + // between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + memmove(params.data() + index * params.dimension(1), + updates.data() + i * updates.dimension(1), + updates.dimension(1) * sizeof(T)); + } + } else { + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in + // between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Copy last Ndim-1 dimensions of updates[i] to params[index] + scatter_op::internal::Assign::Run( + params.template chip<0>(index), updates.template chip<0>(i)); + } + } + return -1; + } +}; + +template +struct ScatterFunctor + : ScatterFunctorBase {}; + + +template +struct ScatterScalarFunctor { + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + const typename TTypes::ConstScalar update, + typename TTypes::ConstFlat indices); +}; + +template +struct ScatterScalarFunctorBase { + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + const typename TTypes::ConstScalar update, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in between). 
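The "grab the index once, then check it" discipline described in the comment above (and implemented just below via SubtleMustCopy and FastBoundsCheck) guards against the index changing between the check and the use. A generic sketch of the same pattern on plain containers; names and types here are illustrative:

#include <cstdint>
#include <vector>

// Copy-then-check: read each index exactly once into a local, validate the
// local, and index with the local.  Re-reading indices[i] after the check
// would reintroduce the race the comment above warns about.
// Returns the position of the first bad index, or -1 if all are in range.
int64_t CopyRows(const std::vector<float>& updates,    // [n, cols] row-major
                 const std::vector<int64_t>& indices,  // [n]
                 std::vector<float>& params,           // [rows, cols] row-major
                 int64_t rows, int64_t cols) {
  const int64_t n = static_cast<int64_t>(indices.size());
  for (int64_t i = 0; i < n; ++i) {
    const int64_t index = indices[i];          // single read into a local
    if (index < 0 || index >= rows) return i;  // FastBoundsCheck equivalent
    for (int64_t j = 0; j < cols; ++j)
      params[index * cols + j] = updates[i * cols + j];
  }
  return -1;
}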
+ const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Broadcast update to params[index] + scatter_op::internal::Assign::RunScalar( + params.template chip<0>(index), update()); + } + return -1; + } +}; + +template +struct ScatterScalarFunctorVariantAssignBase { + Index operator()(OpKernelContext* c, const Device& d, + typename TTypes::Matrix params, + const typename TTypes::ConstScalar update, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + const Index cols = static_cast(params.dimension(1)); + const Variant& to_scatter = update(); + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Broadcast update to params[index] + for (Index j = 0; j < cols; ++j) { + params(index, j) = to_scatter; + } + } + return -1; + } +}; + +template +struct ScatterScalarFunctor + : ScatterScalarFunctorVariantAssignBase {}; +template +struct ScatterScalarFunctor + : ScatterScalarFunctorVariantAssignBase {}; + + +template +struct ScatterScalarFunctorBase { + Index operator()(OpKernelContext* c, const CPUDevice& d, + typename TTypes::Matrix params, + const typename TTypes::ConstScalar update, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Broadcast update to params[index] + scatter_op::internal::Assign::RunScalar( + params.template chip<0>(index), update()); + } + return -1; + } +}; + +template +struct ScatterScalarFunctor + : ScatterScalarFunctorBase {}; + + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor_gpu.cu.h new file mode 100644 index 00000000..61868b78 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -0,0 +1,179 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/scatter_functor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace scatter_op_gpu { + +template +struct ScatterOpKernelBody; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { *dest = src; } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicAdd(dest, src); + } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicSub(dest, src); + } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicMul(dest, src); + } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicDiv(dest, src); + } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicMin(dest, src); + } +}; + +template +struct ScatterOpKernelBody { + __device__ void operator()(T* __restrict__ dest, T src) const { + GpuAtomicMax(dest, src); + } +}; + +template +__global__ void ScatterOpCustomKernel(T* __restrict__ params, + const T* __restrict__ updates, + const Index* __restrict__ indices, + Index first_dim_size, Index updates_size, + Index indices_size) { + Index update_block = updates_size / indices_size; + ScatterOpKernelBody body; + GPU_1D_KERNEL_LOOP(i, updates_size) { + int indices_i = i / update_block; + int updates_i = i; + int param_first_index = indices[indices_i]; + if (!(param_first_index >= 0 && param_first_index < first_dim_size)) { + // Ignore indices that are out of range. + continue; + } + int64 params_i = param_first_index * update_block + (i % update_block); + body(¶ms[params_i], ldg(updates + updates_i)); + } +} + +template +__global__ void ScatterScalarOpCustomKernel(T* __restrict__ params, + const T* __restrict__ update, + const Index* __restrict__ indices, + Index first_dim_size, + Index indices_size, + Index synthesized_updates_size) { + Index update_block = synthesized_updates_size / indices_size; + ScatterOpKernelBody body; + GPU_1D_KERNEL_LOOP(i, synthesized_updates_size) { + int indices_i = i / update_block; + int param_first_index = indices[indices_i]; + const T update_val = *update; + if (!(param_first_index >= 0 && param_first_index < first_dim_size)) { + // Ignore indices that are out of range. + continue; + } + int params_i = param_first_index * update_block + (i % update_block); + body(¶ms[params_i], update_val); + } +} + +} // namespace scatter_op_gpu + +namespace functor { +// Specialization for a GPU device. +template +struct ScatterFunctor { + Index operator()(OpKernelContext* c, const GPUDevice& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + // TODO(b/31801742): Implement indices range check. The hardest part is + // with returning a value after the range check, as we do not want to do + // device to host memcpy during a stream. 
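Each thread of ScatterOpCustomKernel above handles one flat element of 'updates'; the arithmetic splits that position into (destination index, offset within the row) and recombines it with the scattered row, silently skipping out-of-range rows. A CPU-side sketch of the same mapping for the ASSIGN case, with illustrative names:

#include <cstdint>
#include <vector>

// Mirrors the index arithmetic of ScatterOpCustomKernel for UpdateOp::ASSIGN:
// flat update position i -> (indices_i, column) -> flat params position.
void ScatterAssignCpu(std::vector<float>& params, const std::vector<float>& updates,
                      const std::vector<int64_t>& indices, int64_t first_dim_size) {
  const int64_t updates_size = static_cast<int64_t>(updates.size());
  const int64_t indices_size = static_cast<int64_t>(indices.size());
  const int64_t update_block = updates_size / indices_size;  // elements per row
  for (int64_t i = 0; i < updates_size; ++i) {
    const int64_t indices_i = i / update_block;
    const int64_t row = indices[indices_i];
    if (row < 0 || row >= first_dim_size) continue;  // ignore out-of-range rows
    params[row * update_block + (i % update_block)] = updates[i];
  }
}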
+ const Index first_dim_size = params.dimension(0); + const Index indices_size = indices.size(); + const Index updates_size = updates.size(); + GpuLaunchConfig config = GetGpuLaunchConfig(updates_size, d); + TF_CHECK_OK(GpuLaunchKernel( + scatter_op_gpu::ScatterOpCustomKernel, config.block_count, + config.thread_per_block, 0, d.stream(), params.data(), updates.data(), + indices.data(), first_dim_size, updates_size, indices_size)); + return -1; + } +}; + +template +struct ScatterScalarFunctor { + Index operator()(OpKernelContext* c, const GPUDevice& d, + typename TTypes::Matrix params, + const typename TTypes::ConstScalar update, + typename TTypes::ConstFlat indices) { + // TODO(b/31801742): Implement indices range check. The hardest part is + // with returning a value after the range check, as we do not want to do + // device to host memcpy during a stream. + const Index first_dim_size = params.dimension(0); + const Index indices_size = indices.size(); + const Index synthesized_updates_size = indices_size * params.dimension(1); + GpuLaunchConfig config = GetGpuLaunchConfig(synthesized_updates_size, d); + TF_CHECK_OK(GpuLaunchKernel( + scatter_op_gpu::ScatterScalarOpCustomKernel, + config.block_count, config.thread_per_block, 0, d.stream(), + params.data(), update.data(), indices.data(), first_dim_size, + indices_size, synthesized_updates_size)); + return -1; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op.h new file mode 100644 index 00000000..b736d4b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op.h @@ -0,0 +1,74 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +class OpKernelContext; + +namespace scatter_nd_op { + +enum class UpdateOp { ASSIGN, ADD, SUB, MIN, MAX }; + +} // namespace scatter_nd_op + +namespace functor { + +// Functor used by ScatterOp to do the computations. +template +struct ScatterNdFunctor { + // Returns -1 on success or a nonnegative i s.t. indices[i] is a bad index. 
+ Index operator()( + const Device& d, const Index slice_size, + const Eigen::array output_shape_prefix, + typename TTypes::Tensor Tparams, + typename TTypes::ConstTensor Tindices, + typename TTypes::ConstTensor Tupdates, + typename TTypes::Tensor Toutput); +}; + +// Scatter updates into indices in Tensor out. The argument allocate +// controls whether 'out' should be created. If allocate is true, +// *out will be updated to the scattered tensor upon successful completion. +// If allocate is false, out must point to a Tensor allocated with the +// right type (T) and shape. This tensor will not be zeroed out +// before the scatter is executed. +template +absl::Status DoScatterNd(OpKernelContext* c, const Tensor& indices, + const Tensor& updates, const TensorShape& shape, + Tensor* out, bool allocate); + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h new file mode 100644 index 00000000..c4cc570b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -0,0 +1,199 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_ + +// Functor definitions for ScatterND ops, must be compilable by nvcc. 
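The ScatterNdFunctor contract above (return -1 on success, otherwise the position of the first bad index row) relies on flattening each IXDIM-column index row into a single row offset, which the CPU implementation that follows does with row-major strides over output_shape_prefix. A standalone sketch of that flattening with a per-coordinate bounds check:

#include <cstdint>
#include <vector>

// Flattens a multi-dimensional index into a row offset using row-major strides
// over the leading output dimensions.  Returns false if any coordinate is out
// of range (the caller can then report the offending row).
bool FlattenScatterNdIndex(const std::vector<int64_t>& index,  // [ixdim]
                           const std::vector<int64_t>& shape,  // [ixdim]
                           int64_t* offset) {
  const int ixdim = static_cast<int>(shape.size());
  std::vector<int64_t> strides(ixdim, 1);
  for (int d = ixdim - 2; d >= 0; --d) strides[d] = strides[d + 1] * shape[d + 1];
  int64_t flat = 0;
  for (int d = 0; d < ixdim; ++d) {
    if (index[d] < 0 || index[d] >= shape[d]) return false;  // bad coordinate
    flat += index[d] * strides[d];
  }
  *offset = flat;
  return true;
}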
+ +#define EIGEN_USE_THREADS + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/scatter_nd_op.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +class OpKernelContext; + +// Specialization of UpdateExecutor to CPU +namespace update_executor { + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input value, + Update update, Output output); +}; + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input /* input */, + Update update, Output output) { + output.device(device) = update; + } +}; + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input /* input */, + Update update, Output output) { + output.device(device) += update; + } +}; + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input /* input */, + Update update, Output output) { + output.device(device) -= update; + } +}; + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input /* input */, + Update update, Output output) { + output.device(device) = output.cwiseMin(update); + } +}; + +template +class UpdateExecutor { + public: + EIGEN_STRONG_INLINE static void Execute(const T& device, Input /* input */, + Update update, Output output) { + output.device(device) = output.cwiseMax(update); + } +}; + +} // namespace update_executor + +namespace functor { + +// Implementation of update functor for CPU. +template +struct ScatterNdFunctor { + Index operator()( + const CPUDevice& d, const Index slice_size, + const Eigen::array output_shape_prefix, + typename TTypes::Tensor Tparams, + typename TTypes::ConstTensor Tindices, + typename TTypes::ConstTensor Tupdates, + typename TTypes::Tensor Toutput) { + // error_loc is -1 if there's no out-of-bounds index, + // otherwise it is the location of an OOB index in Tindices. + Index error_loc = -1; + + const Eigen::DenseIndex batch_size = Tindices.dimension(0); + + Index batch_strides[IXDIM]; + if (IXDIM > 0) { + batch_strides[IXDIM - 1] = 1; + } + for (int dim = IXDIM - 2; dim >= 0; --dim) { + batch_strides[dim] = + batch_strides[dim + 1] * output_shape_prefix[dim + 1]; + } + + for (Eigen::DenseIndex loc = 0; loc < batch_size; ++loc) { + Index i = 0; + bool out_of_bounds = false; + for (int dim = 0; dim < IXDIM; ++dim) { + const Index ix_d = internal::SubtleMustCopy(Tindices(loc, dim)); + out_of_bounds |= !FastBoundsCheck(ix_d, output_shape_prefix[dim]); + i += ix_d * batch_strides[dim]; + } + if (TF_PREDICT_FALSE(out_of_bounds)) { + error_loc = loc; + // Don't break the loop here, but continue to update the rest because + // the caller might ignore bad indices. 
+ continue; + } else { + auto input_chip = Toutput.template chip<0>(i); + auto output_chip = input_chip; + auto update_chip = Tupdates.template chip<0>(loc); + update_executor::UpdateExecutor< + CPUDevice, decltype(input_chip), decltype(update_chip), + decltype(output_chip), OP>::Execute(d, input_chip, update_chip, + output_chip); + } + } + + return error_loc; + } +}; + +#define REGISTER_SCATTER_ND_FULL(T, Index, op) \ + template Index \ + ScatterNdFunctor::operator()( \ + const CPUDevice& d, const Index slice_size, \ + const Eigen::array \ + output_shape_prefix, \ + typename TTypes::Tensor Tparams, \ + typename TTypes::ConstTensor Tindices, \ + typename TTypes::ConstTensor Tupdates, \ + typename TTypes::Tensor Toutput) + +#define REGISTER_SCATTER_ND_INDEX(type, op) \ + REGISTER_SCATTER_ND_FULL(type, int32, op); \ + REGISTER_SCATTER_ND_FULL(type, int64, op) + +#define REGISTER_SCATTER_ND_UPDATE(type) \ + REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::ASSIGN); + +#define REGISTER_SCATTER_ND_MATH(type) \ + REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::ADD); \ + REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB); + +#define REGISTER_SCATTER_ND_MIN_MAX(type) \ + REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::MAX); \ + REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::MIN); + +TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE); +REGISTER_SCATTER_ND_INDEX(tstring, scatter_nd_op::UpdateOp::ADD); +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_ND_MIN_MAX); +TF_CALL_bool(REGISTER_SCATTER_ND_MATH); + +#undef REGISTER_SCATTER_ND_MATH +#undef REGISTER_SCATTER_ND_MIN_MAX +#undef REGISTER_SCATTER_ND_UPDATE +#undef REGISTER_SCATTER_ND_INDEX +#undef REGISTER_SCATTER_ND_FULL +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_util.h new file mode 100644 index 00000000..5095e925 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/scatter_nd_util.h @@ -0,0 +1,47 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_SCATTER_ND_UTIL_H_ + +#include "xla/tsl/util/env_var.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { + +// Validates the input shapes for the ScatterNdUpdateOp +absl::Status ValidateScatterNdUpdateShape(const TensorShape& params_shape, + const TensorShape& indices_shape, + const TensorShape& updates_shape); + +inline bool DisableScatterOpDeterminism() { + static bool cached_disable = [] { + bool disable = false; + // When determinism is enabled, the kernels for various scatter ops like + // ScatterNdAdd will still use the faster non-deterministic versions if this + // environmental variable is true. This is useful if the user is certain the + // scatter inputs don't have duplicate indices (in which cases scatter ops + // are always deterministic), since the deterministic implementations are + // currently slow. + TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_DISABLE_SCATTER_OP_DETERMINISM", + /*default_val=*/false, &disable)); + return disable; + }(); + return cached_disable; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SCATTER_ND_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sdca_internal.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sdca_internal.h new file mode 100644 index 00000000..8f5ac038 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sdca_internal.h @@ -0,0 +1,394 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_ +#define TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_ + +#define EIGEN_USE_THREADS + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/loss.h" +#include "tensorflow/core/lib/core/coding.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/random/distribution_sampler.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/util/guarded_philox_random.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +namespace sdca { + +// Statistics computed with input (ModelWeights, Example). +struct ExampleStatistics { + // Logits for each class. 
+ // For binary case, this should be a vector of length 1; while for multiclass + // case, this vector has the same length as the number of classes, where each + // value corresponds to one class. + // Use InlinedVector to avoid heap allocation for small number of classes. + absl::InlinedVector wx; + + // Logits for each class, using the previous weights. + absl::InlinedVector prev_wx; + + // Sum of squared feature values occurring in the example divided by + // L2 * sum(example_weights). + double normalized_squared_norm = 0; + + // Num_weight_vectors equals to the number of classification classes in the + // multiclass case; while for binary case, it is 1. + ExampleStatistics(const int num_weight_vectors) + : wx(num_weight_vectors, 0.0), prev_wx(num_weight_vectors, 0.0) {} +}; + +class Regularizations { + public: + Regularizations() {} + + // Initialize() must be called immediately after construction. + absl::Status Initialize(OpKernelConstruction* const context) { + TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_)); + TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_)); + shrinkage_ = symmetric_l1_ / symmetric_l2_; + return absl::OkStatus(); + } + + // Proximal SDCA shrinking for L1 regularization. + double Shrink(const double weight) const { + const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0); + if (shrinked > 0.0) { + return std::copysign(shrinked, weight); + } + return 0.0; + } + + // Vectorized float variant of the above. + Eigen::Tensor EigenShrinkVector( + const Eigen::Tensor weights) const { + // Proximal step on the weights which is sign(w)*|w - shrinkage|+. + return weights.sign() * ((weights.abs() - weights.constant(shrinkage_)) + .cwiseMax(weights.constant(0.0))); + } + + // Matrix float variant of the above. + Eigen::Tensor EigenShrinkMatrix( + const Eigen::Tensor weights) const { + // Proximal step on the weights which is sign(w)*|w - shrinkage|+. + return weights.sign() * ((weights.abs() - weights.constant(shrinkage_)) + .cwiseMax(weights.constant(0.0))); + } + + float symmetric_l2() const { return symmetric_l2_; } + + private: + float symmetric_l1_ = 0; + float symmetric_l2_ = 0; + + // L1 divided by L2, pre-computed for use during weight shrinking. + double shrinkage_ = 0; + + Regularizations(const Regularizations&) = delete; + void operator=(const Regularizations&) = delete; +}; + +class ModelWeights; + +// Struct describing a single example. +class Example { + public: + // Compute matrix vector product between weights (a matrix) and features + // (a vector). This method also computes the normalized example norm used + // in SDCA update. + // For multiclass case, num_weight_vectors equals to the number of classes; + // while for binary case, it is 1. + const ExampleStatistics ComputeWxAndWeightedExampleNorm( + const int num_loss_partitions, const ModelWeights& model_weights, + const Regularizations& regularization, + const int num_weight_vectors) const; + + float example_label() const { return example_label_; } + + float example_weight() const { return example_weight_; } + + double squared_norm() const { return squared_norm_; } + + // Sparse features associated with the example. + // Indices and Values are the associated feature index, and values. Values + // can be optionally absent, in which we case we implicitly assume a value of + // 1.0f. + struct SparseFeatures { + std::unique_ptr::UnalignedConstVec> indices; + std::unique_ptr::UnalignedConstVec> + values; // nullptr encodes optional. 
+ }; + + // A dense vector which is a row-slice of the underlying matrix. + struct DenseVector { + // Returns a row slice from the matrix. + Eigen::TensorMap> Row() + const { + return Eigen::TensorMap>( + data_matrix.data() + row_index * data_matrix.dimension(1), + data_matrix.dimension(1)); + } + + // Returns a row slice as a 1 * F matrix, where F is the number of features. + Eigen::TensorMap> + RowAsMatrix() const { + return Eigen::TensorMap>( + data_matrix.data() + row_index * data_matrix.dimension(1), 1, + data_matrix.dimension(1)); + } + + const TTypes::ConstMatrix data_matrix; + const int64_t row_index; + }; + + private: + std::vector sparse_features_; + std::vector> dense_vectors_; + + float example_label_ = 0; + float example_weight_ = 0; + double squared_norm_ = 0; // sum squared norm of the features. + + // Examples fills Example in a multi-threaded way. + friend class Examples; + + // ModelWeights use each example for model update w += \alpha * x_{i}; + friend class ModelWeights; +}; + +// Weights related to features. For example, say you have two sets of sparse +// features i.e. age bracket and country, then FeatureWeightsDenseStorage hold +// the parameters for it. We keep track of the original weight passed in and the +// delta weight which the optimizer learns in each call to the optimizer. +class FeatureWeightsDenseStorage { + public: + FeatureWeightsDenseStorage(const TTypes::Matrix nominals, + TTypes::Matrix deltas) + : nominals_(nominals), deltas_(deltas) { + CHECK_GT(deltas.rank(), 1); + } + + // Check if a feature index is with-in the bounds. + bool IndexValid(const int64_t index) const { + return index >= 0 && index < deltas_.dimension(1); + } + + // Nominals here are the original weight matrix. + TTypes::Matrix nominals() const { return nominals_; } + + // Delta weights during mini-batch updates. + TTypes::Matrix deltas() const { return deltas_; } + + // Updates delta weights based on active dense features in the example and + // the corresponding dual residual. + void UpdateDenseDeltaWeights( + const Eigen::ThreadPoolDevice& device, + const Example::DenseVector& dense_vector, + const std::vector& normalized_bounded_dual_delta); + + private: + // The nominal value of the weight for a feature (indexed by its id). + const TTypes::Matrix nominals_; + // The accumulated delta weight for a feature (indexed by its id). + TTypes::Matrix deltas_; +}; + +// Similar to FeatureWeightsDenseStorage, but the underlying weights are stored +// in an unordered map. +class FeatureWeightsSparseStorage { + public: + FeatureWeightsSparseStorage(const TTypes::Vec indices, + const TTypes::Matrix nominals, + TTypes::Matrix deltas) + : nominals_(nominals), deltas_(deltas) { + // Create a map from sparse index to the dense index of the underlying + // storage. + for (int64_t j = 0; j < indices.size(); ++j) { + indices_to_id_[indices(j)] = j; + } + } + + // Check if a feature index exists. + bool IndexValid(const int64_t index) const { + return indices_to_id_.find(index) != indices_to_id_.end(); + } + + // Nominal value at a particular feature index and class label. + float nominals(const int class_id, const int64_t index) const { + auto it = indices_to_id_.find(index); + return nominals_(class_id, it->second); + } + + // Delta weights during mini-batch updates. 
+ float deltas(const int class_id, const int64_t index) const { + auto it = indices_to_id_.find(index); + return deltas_(class_id, it->second); + } + + // Updates delta weights based on active sparse features in the example and + // the corresponding dual residual. + void UpdateSparseDeltaWeights( + const Eigen::ThreadPoolDevice& device, + const Example::SparseFeatures& sparse_features, + const std::vector& normalized_bounded_dual_delta); + + private: + // The nominal value of the weight for a feature (indexed by its id). + const TTypes::Matrix nominals_; + // The accumulated delta weight for a feature (indexed by its id). + TTypes::Matrix deltas_; + // Map from feature index to an index to the dense vector. + std::unordered_map indices_to_id_; +}; + +// Weights in the model, wraps both current weights, and the delta weights +// for both sparse and dense features. +class ModelWeights { + public: + ModelWeights() {} + + bool SparseIndexValid(const int col, const int64_t index) const { + return sparse_weights_[col].IndexValid(index); + } + + bool DenseIndexValid(const int col, const int64_t index) const { + return dense_weights_[col].IndexValid(index); + } + + // Go through all the features present in the example, and update the + // weights based on the dual delta. + void UpdateDeltaWeights( + const Eigen::ThreadPoolDevice& device, const Example& example, + const std::vector& normalized_bounded_dual_delta); + + absl::Status Initialize(OpKernelContext* const context); + + const std::vector& sparse_weights() const { + return sparse_weights_; + } + + const std::vector& dense_weights() const { + return dense_weights_; + } + + private: + std::vector sparse_weights_; + std::vector dense_weights_; + + ModelWeights(const ModelWeights&) = delete; + void operator=(const ModelWeights&) = delete; +}; + +// Examples contains all the training examples that SDCA uses for a mini-batch. +class Examples { + public: + Examples() {} + + // Returns the Example at |example_index|. + const Example& example(const int example_index) const { + return examples_.at(example_index); + } + + int sampled_index(const int id) const { return sampled_index_[id]; } + + // Adaptive SDCA in the current implementation only works for + // binary classification, where the input argument for num_weight_vectors + // is 1. + absl::Status SampleAdaptiveProbabilities( + const int num_loss_partitions, const Regularizations& regularization, + const ModelWeights& model_weights, + const TTypes::Matrix example_state_data, + const std::unique_ptr& loss_updater, + const int num_weight_vectors); + + void RandomShuffle(); + + int num_examples() const { return examples_.size(); } + + int num_features() const { return num_features_; } + + // Initialize() must be called immediately after construction. + absl::Status Initialize(OpKernelContext* const context, + const ModelWeights& weights, int num_sparse_features, + int num_sparse_features_with_values, + int num_dense_features); + + private: + // Reads the input tensors, and builds the internal representation for sparse + // features per example. This function modifies the |examples| passed in + // to build the sparse representations. 
+ static absl::Status CreateSparseFeatureRepresentation( + const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, + int num_sparse_features, const ModelWeights& weights, + const OpInputList& sparse_example_indices_inputs, + const OpInputList& sparse_feature_indices_inputs, + const OpInputList& sparse_feature_values_inputs, + std::vector* const examples); + + // Reads the input tensors, and builds the internal representation for dense + // features per example. This function modifies the |examples| passed in + // to build the sparse representations. + static absl::Status CreateDenseFeatureRepresentation( + const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, + int num_dense_features, const ModelWeights& weights, + const OpInputList& dense_features_inputs, + std::vector* const examples); + + // Computes squared example norm per example i.e |x|^2. This function modifies + // the |examples| passed in and adds the squared norm per example. + static absl::Status ComputeSquaredNormPerExample( + const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples, + int num_sparse_features, int num_dense_features, + std::vector* const examples); + + // All examples in the batch. + std::vector examples_; + + // Adaptive sampling variables. + std::vector probabilities_; + std::vector sampled_index_; + std::vector sampled_count_; + + int num_features_ = 0; + + Examples(const Examples&) = delete; + void operator=(const Examples&) = delete; +}; + +} // namespace sdca +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/searchsorted_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/searchsorted_op.h new file mode 100644 index 00000000..fb4ade03 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/searchsorted_op.h @@ -0,0 +1,54 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace functor { + +template +struct UpperBoundFunctor { + // Searches for values in sorted_inputs and returns the greatest possible + // index where they maintain sorted order. + static absl::Status Compute( + OpKernelContext* context, + const typename TTypes::ConstTensor& sorted_inputs, + const typename TTypes::ConstTensor& values, int batch_size, + int num_inputs, int num_values, + typename TTypes::Tensor* output); +}; + +template +struct LowerBoundFunctor { + // Searches for values in sorted_inputs and returns the lowest possible + // index where they maintain sorted order. 
+ static absl::Status Compute( + OpKernelContext* context, + const typename TTypes::ConstTensor& sorted_inputs, + const typename TTypes::ConstTensor& values, int batch_size, + int num_inputs, int num_values, + typename TTypes::Tensor* output); +}; +} // namespace functor + +} // end namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_SEARCHSORTED_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops.h new file mode 100644 index 00000000..93aa9636 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops.h @@ -0,0 +1,173 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +class OpKernelContext; + +bool UseDeterministicSegmentReductions(); +bool DisableSegmentReductionOpDeterminismExceptions(); + +// Type of SparseSegmentReduction operation to perform gradient of. +enum class SparseSegmentReductionOperation { kSum, kMean, kSqrtN }; + +namespace functor { + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Note that we define this ourselves to avoid a dependency on gpuprim. +struct Sum { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return a + b; + } +}; + +struct Prod { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; + +// Note that we don't use gpuprim::Min/Max because they use operator<, which is +// not implemented for AlignedVector types. +struct Min { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return min(a, b); + } +}; + +struct Max { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return max(a, b); + } +}; + +template +struct ReduceOpIsAssociative {}; +template +struct ReduceOpIsAssociative : std::is_integral {}; +template +struct ReduceOpIsAssociative : std::is_integral {}; +template +struct ReduceOpIsAssociative : std::true_type {}; +template +struct ReduceOpIsAssociative : std::true_type {}; + +typedef Eigen::GpuDevice GPUDevice; +// Functor for SegmentReductionGPUOp. +// output_rows: the number of output segments (unique segment ids in +// 'segment_ids'). +// segment_ids_shape: shape of 'segment_ids' tensor. +// segment_ids: unsorted map from input to output segment ids at which to +// perform segment sum operation. +// data_size: size of input data tensor. +// data: input data tensor. 
+// output: output reshaped to {output_rows, output.size/output_rows} +template +struct SegmentReductionFunctor { + void operator()(OpKernelContext* ctx, const GPUDevice& d, + const Index output_rows, const TensorShape& segment_ids_shape, + bool is_mean, typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output); + static constexpr bool atomic_reduction_is_associative = + ReduceOpIsAssociative::value; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +struct UnsortedSegmentFunctor { + void operator()(OpKernelContext* ctx, const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + typename TTypes::ConstTensor data, + typename TTypes::Tensor output); +}; + +// Initial value functors. +template +struct Zero { + EIGEN_STRONG_INLINE T operator()() const { return T(0); } +}; + +template +struct One { + EIGEN_STRONG_INLINE T operator()() const { return T(1); } +}; + +template +struct Lowest { + EIGEN_STRONG_INLINE T operator()() const { + return Eigen::NumTraits::lowest(); + } +}; + +template +struct Highest { + EIGEN_STRONG_INLINE T operator()() const { + return Eigen::NumTraits::highest(); + } +}; + +template +struct SparseSegmentReductionFunctor { + absl::Status operator()(OpKernelContext* context, bool is_mean, bool is_sqrtn, + T default_value, + typename TTypes::ConstTensor input, + typename TTypes::ConstVec indices, + typename TTypes::ConstVec segment_ids, + typename TTypes::Tensor output); +}; + +template +struct SparseSegmentGradFunctor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + Tensor* output); +}; + +template +struct SparseSegmentGradV2Functor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + const TensorShape& dense_output_shape, + typename AsyncOpKernel::DoneCallback done); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h new file mode 100644 index 00000000..f0ba0ce2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h @@ -0,0 +1,1413 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/gpu_prim_helpers.h" +#include "tensorflow/core/kernels/segment_reduction_ops.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/util/determinism.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/gpu_device_functions.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/gpu_solvers.h" // For ScratchSpace +#include "tensorflow/core/util/permutation_input_iterator.h" + +#if (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +#include "tensorflow/core/platform/rocm.h" +#endif + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +// Non/Atomic reduction functors for the gpu. +#define DEFINE_REDUCE_UPDATE_OP_GPU(name, func) \ + struct name##OpGpu { \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest, \ + const T& value) { \ + func; \ + } \ + }; +DEFINE_REDUCE_UPDATE_OP_GPU(AtomicSum, GpuAtomicAdd(dest, value)) +DEFINE_REDUCE_UPDATE_OP_GPU(AtomicProd, GpuAtomicMul(dest, value)) +DEFINE_REDUCE_UPDATE_OP_GPU(AtomicMax, GpuAtomicMax(dest, value)) +DEFINE_REDUCE_UPDATE_OP_GPU(AtomicMin, GpuAtomicMin(dest, value)) +DEFINE_REDUCE_UPDATE_OP_GPU(NonAtomicSum, *dest += value) +DEFINE_REDUCE_UPDATE_OP_GPU(NonAtomicProd, *dest *= value) +DEFINE_REDUCE_UPDATE_OP_GPU(NonAtomicMax, *dest = max(*dest, value)) +DEFINE_REDUCE_UPDATE_OP_GPU(NonAtomicMin, *dest = min(*dest, value)) +#undef DEFINE_REDUCE_UPDATE_OP_GPU + +template +struct ReduceUpdateOpFor {}; + +#define DEFINE_REDUCE_UPDATE_OP_FOR(reduce_op, atomic, nonatomic) \ + template <> \ + struct ReduceUpdateOpFor { \ + using atomic_op = atomic; \ + using nonatomic_op = nonatomic; \ + }; +DEFINE_REDUCE_UPDATE_OP_FOR(functor::Sum, AtomicSumOpGpu, NonAtomicSumOpGpu) +DEFINE_REDUCE_UPDATE_OP_FOR(functor::Prod, AtomicProdOpGpu, NonAtomicProdOpGpu) +DEFINE_REDUCE_UPDATE_OP_FOR(functor::Max, AtomicMaxOpGpu, NonAtomicMaxOpGpu) +DEFINE_REDUCE_UPDATE_OP_FOR(functor::Min, AtomicMinOpGpu, NonAtomicMinOpGpu) +#undef DEFINE_REDUCE_UPDATE_OP_FOR + +// PR#61339: MSVC does not support compound-assignment operators on device + +// SortedSegmentReductionFunctor kernel reduces input data just as +// UnsortedSegmentReductionCustomKernel does except that input data +// is partitioned along the outer reduction dimension. This is +// because consecutive rows (elements in a row share the same +// outer dimension index) in the flattened 2D input data likely +// belong to the same segment in sorted segment sum operation. +// Therefore such partitioning strategy has two advantages over +// the UnsortedSegmentReductionFunctor kernel: +// 1. Each thread reduces across multiple rows before writing +// answers to the global memory, we can therefore +// write reduction results to global memory less often. +// 2. We may know that the current thread is the only contributor +// to an output element because of the increasing nature of segment +// ids. In such cases, we do not need to use atomic operations +// to write results to global memory. 
+// In the flattened view of input data (with only outer and inner +// dimension), every thread processes a strip of input data of +// size OuterDimTileSize x 1. This strip runs across multiple +// rows of input data and all reduction elements share one inner +// dimension index. +template +__global__ void SortedSegmentReductionCustomKernel( + const Index input_outer_dim_size, const Index inner_dim_size, + const Index output_outer_dim_size, const Index* __restrict__ segment_ids, + const T* __restrict__ input, T* __restrict__ output, + const Index total_stripe_count, const T initial_value) { + for (int stripe_index : GpuGridRangeX(total_stripe_count)) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index input_outer_dim_index_base = + stripe_index / inner_dim_size * Index(OuterDimTileSize); + + T reduce_res = initial_value; + Index first_segment_id = segment_ids[input_outer_dim_index_base]; + Index last_output_segment_id = output_outer_dim_size; + + const Index actual_stripe_height = + min(Index(OuterDimTileSize), + input_outer_dim_size - input_outer_dim_index_base); + for (Index j = 0; j < actual_stripe_height; j++) { + Index current_output_segment_id = + segment_ids[input_outer_dim_index_base + j]; + // Decide whether to write result to global memory. Result is only written + // to global memory if we move to another segment. Otherwise we can keep + // accumulating locally. + if (current_output_segment_id > last_output_segment_id) { + const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + // Decide whether to write result to global memory using atomic + // operations. + if (last_output_segment_id == first_segment_id) { + AtomicReductionF()(output + output_index, reduce_res); + } else { + ReductionF()(output + output_index, reduce_res); + } + reduce_res = initial_value; + } + ReductionF()( + &reduce_res, + ldg(input + (input_outer_dim_index_base + j) * inner_dim_size + + segment_offset)); + last_output_segment_id = current_output_segment_id; + } + // For the last result in a strip, always write using atomic operations + // due to possible race conditions with threads computing + // the following strip. 
+ const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + AtomicReductionF()(output + output_index, reduce_res); + } +} + +template +__global__ void SegmentMeanNormalizeKernel( + SegmentId nsegments, Index ninner, + const Index* __restrict__ segment_offsets, // [nsegments + 1] + T* __restrict__ output) { // [nsegments, ninner] + for (SegmentId seg : GpuGridRangeY(nsegments)) { + SegmentId segment_size = segment_offsets[seg + 1] - segment_offsets[seg]; + segment_size = max(segment_size, Index(1)); // Avoid division by zero + T inv_norm = T(1) / static_cast(segment_size); + for (Index i : GpuGridRangeX(ninner)) { + output[seg * ninner + i] *= inv_norm; + } + } +} + +template +Status LaunchSegmentMeanNormalizeKernel( + const GPUDevice& d, SegmentId nsegments, Index ninner, + const Index* __restrict__ segment_offsets, // [nsegments + 1] + T* __restrict__ output) { // [nsegments, ninner] + Gpu2DLaunchConfig config = GetGpu2DLaunchConfig( + ninner, nsegments, d, SegmentMeanNormalizeKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(SegmentMeanNormalizeKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), nsegments, ninner, segment_offsets, + output); +} + +template +__global__ void SegmentSetEmptyKernel( + SegmentId nsegments, Index ninner, + const Index* __restrict__ segment_offsets, // [nsegments + 1] + const T empty_value, + T* __restrict__ output) { // [nsegments, ninner] + for (SegmentId seg : GpuGridRangeY(nsegments)) { + SegmentId segment_size = segment_offsets[seg + 1] - segment_offsets[seg]; + if (segment_size == 0) { + for (Index i : GpuGridRangeX(ninner)) { + output[seg * ninner + i] = empty_value; + } + } + } +} + +template +Status LaunchSegmentSetEmptyKernel( + const GPUDevice& d, SegmentId nsegments, Index ninner, + const Index* __restrict__ segment_offsets, // [nsegments + 1] + const T empty_value, + T* __restrict__ output) { // [nsegments, ninner] + Gpu2DLaunchConfig config = GetGpu2DLaunchConfig( + ninner, nsegments, d, SegmentSetEmptyKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(SegmentSetEmptyKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), nsegments, ninner, segment_offsets, + empty_value, output); +} + +// UnsortedSegmentSumKernel processes 'input_total_size' elements. +// Each element is mapped from input to output by a combination of its +// 'segment_ids' mapping and 'inner_dim_size'. 
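As a plain host-side reference for what the unsorted-segment GPU kernel below computes (the name and signature here are illustrative only, not part of the vendored header): every input row is reduced into the output row selected by its segment id, rows with out-of-range ids are skipped, and every output element starts from the op's initial value.

#include <cstdint>
#include <functional>
#include <vector>

void UnsortedSegmentReduceRef(
    const std::vector<float>& input,          // [num_rows, inner_dim_size]
    const std::vector<int64_t>& segment_ids,  // [num_rows]
    int64_t inner_dim_size, int64_t num_segments, float initial_value,
    const std::function<float(float, float)>& reduce,
    std::vector<float>* output) {             // [num_segments, inner_dim_size]
  output->assign(num_segments * inner_dim_size, initial_value);
  const int64_t num_rows = static_cast<int64_t>(segment_ids.size());
  for (int64_t row = 0; row < num_rows; ++row) {
    const int64_t seg = segment_ids[row];
    if (seg < 0 || seg >= num_segments) continue;  // Ignore invalid ids.
    for (int64_t j = 0; j < inner_dim_size; ++j) {
      float& dst = (*output)[seg * inner_dim_size + j];
      dst = reduce(dst, input[row * inner_dim_size + j]);
    }
  }
}

Called with reduce = std::plus<float>() and initial_value = 0 this matches an unsorted segment sum; min, max, and prod differ only in the reduction functor and initial value, which is exactly how the atomic/non-atomic GPU functors above are parameterized.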
+template +__global__ void UnsortedSegmentCustomKernel( + const int64_t input_outer_dim_size, const int64_t inner_dim_size, + const int64_t output_outer_dim_size, const Index* __restrict__ segment_ids, + const T* __restrict__ input, T* __restrict__ output) { + const int64_t input_total_size = input_outer_dim_size * inner_dim_size; + for (int64_t input_index : GpuGridRangeX(input_total_size)) { + const int64_t input_segment_index = input_index / inner_dim_size; + const int64_t segment_offset = input_index % inner_dim_size; + const Index output_segment_index = segment_ids[input_segment_index]; + if (output_segment_index < 0 || + output_segment_index >= output_outer_dim_size) { + continue; + } + const int64_t output_index = + output_segment_index * inner_dim_size + segment_offset; + KernelReductionFunctor()(output + output_index, ldg(input + input_index)); + } +} + +template +__global__ void SegmentOffsetsKernel( + Toffsets size, Tsegmentids nsegments, + const Tsegmentids* __restrict__ segment_ids, // [size] + Toffsets* __restrict__ segment_offsets) { // [nsegments + 1] + GPU_1D_KERNEL_LOOP(i, size + 1) { + // IDs are clipped to [-1, nsegments] so that out-of-bounds IDs are ignored. + // Note that we can't report invalid IDs from the GPU without incurring + // additional overhead. + auto clip = [&](Tsegmentids id) { + return min(max(Tsegmentids(-1), id), nsegments); + }; + const Tsegmentids cur_id = (i < size) ? clip(segment_ids[i]) : nsegments; + const Tsegmentids prev_id = + (i == 0) ? Tsegmentids(-1) : clip(segment_ids[i - 1]); + // At segment boundaries, write the offset for this ID and any missing IDs + // since the previous one. + for (Tsegmentids id = prev_id + 1; id <= cur_id; ++id) { + segment_offsets[id] = i; + } + } +} + +// Finds the start offset of each segment in the given sorted segment_ids +// vector. Missing IDs are given the same offset as the next ID so that they +// represent empty ranges. Invalid IDs (those that are outside the range +// [0, nsegments)) are ignored. The value at segment_offsets[0] is set to the +// start index of the first valid ID (e.g., 0 if all IDs are valid), and the +// value at segment_offsets[nsegments] is set to the end index of the last valid +// ID (e.g., nsegments if all IDs are valid). +template +Status LaunchSegmentOffsetsKernel( + const GPUDevice& d, Toffsets size, Tsegmentids nsegments, + const Tsegmentids* segment_ids, // [size] + Toffsets* segment_offsets) { // [nsegments + 1] + GpuLaunchConfig config = GetGpuLaunchConfig( + size + 1, d, &SegmentOffsetsKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(SegmentOffsetsKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), size, nsegments, segment_ids, + segment_offsets); +} + +template +struct RealTypeIfComplex { + using type = T; +}; + +template +struct RealTypeIfComplex> { + using type = Real; +}; + +// Reduces along columns of the thread block, returning the result in the first +// row of threads. +template +__device__ T ReduceBlockAlongCols(ReduceOp reduce_op, const T& value, + bool is_valid) { + GPU_DYNAMIC_SHARED_MEM_DECL(/*ALIGN=*/16, char, shared_memory_raw); + T* const shared_partial_reduction = + reinterpret_cast(shared_memory_raw); // [blockDim.y, blockDim.x] + const int x = threadIdx.x; + const int y = threadIdx.y; + T reduced = value; + // Reduce over the y dimension of the block. 
+ for (unsigned k = blockDim.y / 2; k > 0; k /= 2) { + if (is_valid && y < 2 * k) { + shared_partial_reduction[y * blockDim.x + x] = reduced; + } + __syncthreads(); + if (is_valid && y < k) { + reduced = reduce_op(reduced, + shared_partial_reduction[(y + k) * blockDim.x + x]); + } + __syncthreads(); + } + return reduced; +} + +// This kernel uses a 2D thread decomposition. The x dimension maps to the inner +// dimension of the input/output. The y grid dimension maps to segments, and y +// threads within a block cooperate to reduce over the block's segment. +// Note that Tinit is needed because Tvec and Treducevec may be vector types, +// but Tinit is always a scalar type. +// Note that the first dimension of input_vec is nouter if indices is not +// provided; otherwise it is indexed indirectly via indices and can have any +// size (as long as it spans at least the maximum value in indices). This also +// applies to the weights vector. +template +__global__ void SegmentReduceVectorKernel( + Toffsets nouter, Toffsets ninner_vec, Tsegmentids nsegments, + ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value, + bool is_mean, bool is_sqrtn, + const Tvec* __restrict__ input_vec, // [nouter or any, ninner_vec] + const Toffsets* __restrict__ segment_offsets, // [nsegments + 1] + const Tindices* __restrict__ indices, // [nouter] (optional) + const Tweights* __restrict__ weights, // [nouter or any] (optional) + Tvec* __restrict__ output_vec) { // [nsegments, ninner_vec] + const int num_blocks_x = (ninner_vec - 1) / blockDim.x + 1; + // Grid-stride loop over inner dimension blocks. + for (Toffsets blk_x = blockIdx.x; blk_x < num_blocks_x; blk_x += gridDim.x) { + const Toffsets x = threadIdx.x + blk_x * blockDim.x; + const Toffsets y = threadIdx.y; + const bool x_ok = x < ninner_vec; + // Grid-stride loop over segment blocks, each processing one segment. + for (Tsegmentids seg = blockIdx.y; seg < nsegments; seg += gridDim.y) { + // Load segment range. + const Toffsets begin = segment_offsets[seg]; + const Toffsets end = segment_offsets[seg + 1]; + // Reduce over the segment. + Treducevec result = Treducevec(initial_value); + // Loop over the segment, reducing blockDim.y elements at a time. + for (Toffsets y_offset = begin; y_offset < end; y_offset += blockDim.y) { + const bool y_ok = (y_offset + y) < end; + // Perform indirect lookup if required. + const Toffsets y_idx = + indices && y_ok ? indices[y_offset + y] : y_offset + y; + const int64_t input_idx = static_cast(y_idx) * ninner_vec + x; + // Load the input row from global mem. + Treducevec block_result = + x_ok && y_ok ? input_vec[input_idx] : Tvec(initial_value); + // Apply weights if provided. + if (weights && y_ok) block_result = block_result * Tvec(weights[y_idx]); + // Reduce along the columns of the block, returning result in first row. + block_result = ReduceBlockAlongCols(reduce_op, block_result, x_ok); + if (y == 0 && x_ok) { + result = reduce_op(result, block_result); + } + } + // First row of the block stores the result to global memory. + if (y == 0 && x_ok) { + if (begin == end) { + // Empty segment. + result = Treducevec(empty_segment_value); + } else { + Tweights total_weight(end - begin); + // Normalize the results if necessary. + if (is_mean) { + result = result / Treducevec(total_weight); + } else if (is_sqrtn) { + result = + result / Treducevec(sqrt(static_cast(total_weight))); + } + } + // Cast from Treducevec to Tvec. 
+ const int64_t output_idx = static_cast(seg) * ninner_vec + x; + output_vec[output_idx] = static_cast(result); + } + } + } +} + +// Reduces input matrix within segments over the outer dimension. Empty segments +// always output empty_segment_value. +// If is_mean or is_sqrtn is true, the results are normalized using the +// corresponding function. +// If indices is not nullptr, input rows are accessed indirectly as +// input[indices[i]], instead of input[i]. +// Note: Treducevec is to allow reducing in higher precision than Tvec. +template +Status LaunchSegmentReduceVectorKernel( + const GPUDevice& d, Toffsets nouter, Toffsets ninner_vec, + Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value, + Tinit empty_segment_value, bool is_mean, bool is_sqrtn, + const Tvec* input_vec, // [nouter or any, ninner_vec] + const Toffsets* segment_offsets, // [nsegments + 1] + const Tindices* indices, // [nouter] (optional) + const Tweights* weights, // [nouter or any] (optional) + Tvec* output_vec) { // [nsegments, ninner_vec] + static constexpr const int kMaxGridX = (1u << 31) - 1; + static constexpr const int kMaxGridY = (1u << 16) - 1; + const int max_block_size = 1024; // Can be tuned for perf (<= 1024) + const int min_block_size = 64; // Can be tuned for perf + const Toffsets ninner_pow2 = Toffsets(1) << Log2Ceiling64(ninner_vec); + // This is a heuristic that first allocates threads in the block to the inner + // (x) dimension (which is most efficient) and then allocates the rest to the + // reduction (y) dimension (which is less efficient but increases + // parallelism). + int block_x = std::min(ninner_pow2, static_cast(max_block_size)); + const Toffsets avg_reduce_size = + Eigen::divup(nouter, static_cast(nsegments)); + const Toffsets avg_reduce_size_pow2 = Toffsets(1) + << Log2Ceiling64(avg_reduce_size); + dim3 block( + block_x, + std::min(static_cast(Eigen::divup(min_block_size, block_x)), + avg_reduce_size_pow2)); + dim3 grid(std::min(Eigen::divup(ninner_vec, static_cast(block.x)), + static_cast(kMaxGridX)), + std::min(nsegments, static_cast(kMaxGridY))); + unsigned shared_memory_bytes = block.x * block.y * sizeof(Treducevec); + return GpuLaunchKernel( + SegmentReduceVectorKernel, + grid, block, shared_memory_bytes, d.stream(), nouter, ninner_vec, + nsegments, reduce_op, initial_value, empty_segment_value, is_mean, + is_sqrtn, input_vec, segment_offsets, indices, weights, output_vec); +} + +template +__global__ void SegmentReduceEpilogueKernel( + Tsegmentids nsegments, Tinit empty_segment_value, bool is_mean, + bool is_sqrtn, + const Treducevec* __restrict__ output_raw, // [nsegments] + const Toffsets* __restrict__ segment_offsets, // [nsegments + 1] + Tvec* __restrict__ output) { // [nsegments] + GPU_1D_KERNEL_LOOP(seg, nsegments) { + Toffsets segment_size = segment_offsets[seg + 1] - segment_offsets[seg]; + Treducevec val = output_raw[seg]; + if (segment_size == 0) { + // Empty segment. + val = Treducevec(empty_segment_value); + } else if (is_mean) { + val = val / Treducevec(segment_size); + } else if (is_sqrtn) { + val = val / Treducevec(sqrt(static_cast( + typename RealTypeIfComplex::type(segment_size)))); + } + // Cast from Treducevec to Tvec. + output[seg] = static_cast(val); + } +} + +// Normalizes output_raw based on segment size and casts from Treducevec to +// Tvec. If Tvec == Treducevec, this is safe to call with output_raw == output. 
+// Note that Treducevec is the type that was used for the reduction, which may +// be a higher-precision type than the output type Tvec (e.g., float vs. half). +template +Status LaunchSegmentReduceEpilogueKernel( + const GPUDevice& d, Tsegmentids nsegments, Tinit empty_segment_value, + bool is_mean, bool is_sqrtn, + const Treducevec* output_raw, // [nsegments] + const Toffsets* segment_offsets, // [nsegments + 1] + Tvec* output) { // [nsegments] + GpuLaunchConfig config = GetGpuLaunchConfig( + nsegments, d, + &SegmentReduceEpilogueKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(SegmentReduceEpilogueKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), nsegments, empty_segment_value, is_mean, + is_sqrtn, output_raw, segment_offsets, output); +} + +template +struct CastFunctor { + template + __device__ Tto operator()(const T& val) const { + return static_cast(val); + } +}; + +template +struct LookupAndScaleAndCastInputsFunctor { + LookupAndScaleAndCastInputsFunctor(const Tvec* input_vec, + const Tindices* indices, + const Tweights* weights) + : input_vec_(input_vec), indices_(indices), weights_(weights) {} + + template + __device__ Treducevec operator()(Toffsets idx) const { + if (indices_) idx = indices_[idx]; + Treducevec result = static_cast(input_vec_[idx]); + if (weights_) result = result * Tvec(weights_[idx]); + return result; + } + + private: + const Tvec* __restrict__ input_vec_; + const Tindices* __restrict__ indices_; + const Tweights* __restrict__ weights_; +}; + +template +struct CastIterator { + using FunctorTy = + LookupAndScaleAndCastInputsFunctor; + using InputIteratorTy = gpuprim::CountingInputIterator; + using IteratorTy = + gpuprim::TransformInputIterator; +}; + +template +typename CastIterator::IteratorTy +MakeLookupAndScaleAndCastInputsIterator(const Tvec* input_vec, + const Tindices* indices, + const Tweights* weights) { + using CastIteratorTy = + CastIterator; + typename CastIteratorTy::FunctorTy functor(input_vec, indices, weights); + return typename CastIteratorTy::IteratorTy( + typename CastIteratorTy::InputIteratorTy(Toffsets(0)), functor); +} + +template +Status SegmentReduceGPUImplNoInnerDim( + OpKernelContext* ctx, Toffsets nouter, Tsegmentids nsegments, + ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value, + bool is_mean, bool is_sqrtn, + const Tvec* input_vec, // [nouter or any] + const Toffsets* segment_offsets, // [nsegments + 1] + const Tindices* indices, // [nouter] (optional) + const Tweights* weights, // [nouter or any] (optional) + Tvec* output_vec) { // [nsegments] + // Here we use gpuprim::DeviceSegmentedReduce (which is optimized for this + // shape) and add the additional required functionality using fancy input + // iterators and an epilogue kernel. + + // Note: This reinterpret cast is only needed to avoid compilation error + // when Tvec != Treducevec; the result is only used if Tvec == Treducevec. + Treducevec* output_raw_ptr = reinterpret_cast(output_vec); + Tensor output_raw; + bool need_temp_output = !std::is_same::value; + if (need_temp_output) { + // Note: We must allocate and reinterpret as bytes because Treducevec may + // be a vector type and they are not supported as Tensor dtypes. 
+ TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, + TensorShape({static_cast(nsegments * sizeof(Treducevec))}), + &output_raw)); + output_raw_ptr = + reinterpret_cast(output_raw.flat().data()); + } + auto input_iter = + MakeLookupAndScaleAndCastInputsIterator( + input_vec, indices, weights); + TF_RETURN_IF_ERROR(GpuSegmentedReduce(ctx, nsegments, reduce_op, + Treducevec(initial_value), input_iter, + segment_offsets, output_raw_ptr)); + bool need_epilogue = !std::is_same::value || + initial_value != empty_segment_value || is_mean || + is_sqrtn; + if (need_epilogue) { + const GPUDevice& device = ctx->eigen_gpu_device(); + // Normalize based on the segment size and cast results back to T. + TF_RETURN_IF_ERROR(LaunchSegmentReduceEpilogueKernel( + device, nsegments, empty_segment_value, is_mean, is_sqrtn, + output_raw_ptr, segment_offsets, output_vec)); + } + return OkStatus(); +} + +template +Status SegmentReduceGPUImpl( + OpKernelContext* ctx, Toffsets nouter, Toffsets ninner_vec, + Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value, + Tinit empty_segment_value, bool is_mean, bool is_sqrtn, + const Tvec* input_vec, // [nouter or any, ninner_vec] + const Tsegmentids* segment_ids, // [nouter] + const Tindices* indices, // [nouter] (optional) + const Tweights* weights, // [nouter or any] (optional) + Tvec* output_vec) { // [nsegments, ninner_vec] + const GPUDevice& device = ctx->eigen_gpu_device(); + + if (nouter == 0) { + // Just set output to empty_segment_value. + GPUDevice d = ctx->template eigen_device(); + int64_t output_size = static_cast(nsegments) * ninner_vec; + GpuLaunchConfig config = GetGpuLaunchConfig(output_size, d); + return GpuLaunchKernel(SetToValue, config.block_count, + config.thread_per_block, 0, d.stream(), output_size, + output_vec, empty_segment_value); + } + + // Allocate and compute segment_offsets. + Tensor segment_offsets; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({nsegments + 1}), + &segment_offsets)); + Toffsets* segment_offsets_ptr = segment_offsets.flat().data(); + TF_RETURN_IF_ERROR(LaunchSegmentOffsetsKernel( + device, nouter, nsegments, segment_ids, segment_offsets_ptr)); + + const Toffsets avg_reduce_size = + Eigen::divup(nouter, static_cast(nsegments)); + // This avg_reduce_size threshold is a performance heuristic. + if (ninner_vec == 1 && avg_reduce_size >= 512) { + // Here we use a gpuprim-based implementation that doesn't support an + // inner dimension but can be significantly faster for large reductions. + return SegmentReduceGPUImplNoInnerDim( + ctx, nouter, nsegments, reduce_op, initial_value, empty_segment_value, + is_mean, is_sqrtn, input_vec, segment_offsets_ptr, indices, weights, + output_vec); + } + // Here we use a custom kernel that is optimized for ninner_vec >= ~64 and + // gives decent performance for smaller cases. It also handles indices, + // casting to/from Treducevec, and normalizing the output. 
+ return LaunchSegmentReduceVectorKernel( + device, nouter, ninner_vec, nsegments, reduce_op, initial_value, + empty_segment_value, is_mean, is_sqrtn, input_vec, segment_offsets_ptr, + indices, weights, output_vec); +} + +template +struct SegmentReduceGPUVectorized { + template + struct Impl { + template + Status operator()(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner, + Tsegmentids nsegments, ReduceOp reduce_op, + T initial_value, T empty_segment_value, bool is_mean, + bool is_sqrtn, const T* input, + const Tsegmentids* segment_ids, const Tindices* indices, + const Tweights* weights, T* output) { + DCHECK_EQ(ninner % vec_size, 0); + DCHECK_EQ(reinterpret_cast(input) % vec_size, 0); + DCHECK_EQ(reinterpret_cast(output) % vec_size, 0); + Toffsets ninner_vec = ninner / vec_size; + using Tvec = AlignedVector; + using Treducevec = AlignedVector; + const Tvec* input_vec = reinterpret_cast(input); + Tvec* output_vec = reinterpret_cast(output); + + return SegmentReduceGPUImpl( + ctx, nouter, ninner_vec, nsegments, reduce_op, initial_value, + empty_segment_value, is_mean, is_sqrtn, input_vec, segment_ids, + indices, weights, output_vec); + } + }; +}; + +// Reduces input matrix within segments over the outer dimension. Empty segments +// always output empty_segment_value. +// The segment_ids vector must be sorted. +// If is_mean or is_sqrtn is true, the results are normalized using the +// corresponding function. +// If indices is not nullptr, input rows are accessed indirectly as +// input[indices[i]], instead of input[i]. +// The implementation is deterministic. +// Note: Treduce is to allow reducing in higher precision than T. +template +Status SegmentReduceGPU(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner, + Tsegmentids nsegments, ReduceOp reduce_op, + T initial_value, T empty_segment_value, bool is_mean, + bool is_sqrtn, + const T* input, // [nouter or any, ninner] + const Tsegmentids* segment_ids, // [nouter] + const Tindices* indices, // [nouter] (optional) + const Tweights* weights, // [nouter or any] (optional) + T* output) { // [nsegments, ninner] + if (ninner == 0 || nsegments == 0) return OkStatus(); + return DispatchToVectorized< + T, SegmentReduceGPUVectorized::template Impl>( + MinAlignmentOf(input, output, ninner), ctx, nouter, ninner, nsegments, + reduce_op, initial_value, empty_segment_value, is_mean, is_sqrtn, input, + segment_ids, indices, weights, output); +} + +template +__global__ void SegmentWeightsKernel( + SegmentId nsegments, SparseSegmentReductionOperation operation, + const Index* __restrict__ segment_offsets, // [nsegments + 1] + Tweights* __restrict__ weights) { // [nsegments] + GPU_1D_KERNEL_LOOP(i, nsegments) { + Index segment_size = segment_offsets[i + 1] - segment_offsets[i]; + segment_size = max(segment_size, Index(1)); // Avoid division by zero + if (operation == SparseSegmentReductionOperation::kMean) { + weights[i] = Tweights(1) / static_cast(segment_size); + } else if (operation == SparseSegmentReductionOperation::kSqrtN) { + weights[i] = Tweights(1) / sqrt(static_cast(segment_size)); + } + } +} + +template +Status LaunchSegmentWeightsKernel( + const GPUDevice& d, SegmentId nsegments, + SparseSegmentReductionOperation operation, + const Index* segment_offsets, // [nsegments + 1] + Tweights* weights) { // [nsegments] + GpuLaunchConfig config = GetGpuLaunchConfig( + nsegments, d, &SegmentWeightsKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(SegmentWeightsKernel, + config.block_count, 
config.thread_per_block, 0, + d.stream(), nsegments, operation, segment_offsets, + weights); +} + +template +struct ReduceType { + using type = T; +}; + +// Sum fp16 values using an fp32 accumulator to avoid numerical issues. +template <> +struct ReduceType { + using type = float; +}; + +template <> +struct ReduceType { + using type = float; +}; + +namespace functor { + +template +void SegmentReductionFunctor< + T, Index, InitialValueF, EmptySegmentValueF, + ReductionF>::operator()(OpKernelContext* ctx, const GPUDevice& d, + const Index output_rows, + const TensorShape& segment_ids_shape, bool is_mean, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output) { + if (output.size() == 0) { + return; + } + + // Launch kernel(s) to compute sorted segment reduction. + // Notes: + // *) 'input_total_size' is the total number of elements to process. + // *) 'segment_ids.shape' is a prefix of data's shape. + // *) 'input_outer_dim_size' is the total number of segments to process. + const Index input_total_size = data_size; + const Index input_outer_dim_size = segment_ids.dimension(0); + const Index input_inner_dim_size = input_total_size / input_outer_dim_size; + const Index num_segments = output.size() / input_inner_dim_size; + + bool use_deterministic_kernels = + UseDeterministicSegmentReductions() || + (OpDeterminismRequired() && !ReduceOpIsAssociative::value); + + // TODO(benbarsdell): If there are no performance concerns with the new + // deterministic kernels, remove this runtime check and the old + // non-deterministic kernels. + if (!use_deterministic_kernels) { + // Set 'output' to initial value. + GpuLaunchConfig config = GetGpuLaunchConfig(output.size(), d); + const T initial_value = InitialValueF()(); + TF_CHECK_OK(GpuLaunchKernel(SetToValue, config.block_count, + config.thread_per_block, 0, d.stream(), + output.size(), output.data(), initial_value)); + if (data_size == 0 || segment_ids_shape.num_elements() == 0) { + return; + } + + const int OuterDimTileSize = 8; + + const Index input_outer_dim_num_stripe = + Eigen::divup(input_outer_dim_size, Index(OuterDimTileSize)); + + const Index total_stripe_count = + input_inner_dim_size * input_outer_dim_num_stripe; + + config = GetGpuLaunchConfig(total_stripe_count, d); + TF_CHECK_OK(GpuLaunchKernel( + SortedSegmentReductionCustomKernel< + T, Index, OuterDimTileSize, + typename ReduceUpdateOpFor::nonatomic_op, + typename ReduceUpdateOpFor::atomic_op>, + config.block_count, config.thread_per_block, 0, d.stream(), + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data(), total_stripe_count, + initial_value)); + + const T empty_value = EmptySegmentValueF()(); + if (is_mean || initial_value != empty_value) { + Tensor segment_offsets; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({num_segments + 1}), + &segment_offsets)); + Index* segment_offsets_ptr = segment_offsets.flat().data(); + OP_REQUIRES_OK(ctx, LaunchSegmentOffsetsKernel( + d, input_outer_dim_size, num_segments, + segment_ids.data(), segment_offsets_ptr)); + + if (is_mean) { + OP_REQUIRES_OK(ctx, LaunchSegmentMeanNormalizeKernel( + d, num_segments, input_inner_dim_size, + segment_offsets_ptr, output.data())); + } + if (initial_value != empty_value) { + OP_REQUIRES_OK( + ctx, LaunchSegmentSetEmptyKernel( + d, num_segments, input_inner_dim_size, segment_offsets_ptr, + empty_value, output.data())); + } + } + } else { + using Treduce = typename 
ReduceType::type; + using Tweights = typename RealTypeIfComplex::type; + OP_REQUIRES_OK( + ctx, + SegmentReduceGPU( + ctx, input_outer_dim_size, input_inner_dim_size, num_segments, + ReductionF(), InitialValueF()(), EmptySegmentValueF()(), + /*is_mean=*/is_mean, /*is_sqrtn=*/false, data, segment_ids.data(), + /*indices=*/static_cast(nullptr), + /*weights=*/static_cast(nullptr), output.data())); + } +} + +template +struct UnsortedSegmentFunctor { + void operator()(OpKernelContext* ctx, const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat unsorted_segment_ids, + typename TTypes::ConstTensor data, + typename TTypes::Tensor output) { + if (output.size() == 0) { + return; + } + + bool use_deterministic_kernels = + UseDeterministicSegmentReductions() || + (!ReduceOpIsAssociative::value && + OpDeterminismRequired()); + + bool determinism_requirement_met = + use_deterministic_kernels || + ReduceOpIsAssociative::value || + !OpDeterminismRequired() || + DisableSegmentReductionOpDeterminismExceptions(); + OP_REQUIRES( + ctx, determinism_requirement_met, + errors::Unimplemented( + "Deterministic GPU implementation of unsorted segment reduction op" + " not available.")); + + // Launch kernel(s) to compute unsorted segment reduction. + // Notes: + // *) 'data_size' is the total number of elements to process. + // *) 'segment_ids.shape' is a prefix of data's shape. + // *) 'input_outer_dim_size' is the total number of segments to process. + const Index input_outer_dim_size = unsorted_segment_ids.dimension(0); + const Index input_inner_dim_size = data.dimension(1); + const Index output_outer_dim_size = output.dimension(0); + const Index num_segments = output.size() / input_inner_dim_size; + + // TODO(benbarsdell): If there are no performance concerns with the new + // deterministic kernels, remove this runtime check and the old + // non-deterministic kernels. + if (!use_deterministic_kernels) { + // Set 'output' to initial value. + GPUDevice d = ctx->template eigen_device(); + GpuLaunchConfig config = GetGpuLaunchConfig(output.size(), d); + TF_CHECK_OK(GpuLaunchKernel( + SetToValue, config.block_count, config.thread_per_block, 0, + d.stream(), output.size(), output.data(), InitialValueF()())); + const int64_t data_size = data.size(); + if (data_size == 0 || segment_ids_shape.num_elements() == 0) { + return; + } + config = GetGpuLaunchConfig(data_size, d); + TF_CHECK_OK(GpuLaunchKernel( + UnsortedSegmentCustomKernel< + T, Index, typename ReduceUpdateOpFor::atomic_op>, + config.block_count, config.thread_per_block, 0, d.stream(), + input_outer_dim_size, input_inner_dim_size, output_outer_dim_size, + unsorted_segment_ids.data(), data.data(), output.data())); + } else { + // Allocate temporary space and sort segment_ids, then call the sorted + // implem. + Tensor segment_ids; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + TensorShape({static_cast(input_outer_dim_size)}), + &segment_ids)); + Index* segment_ids_ptr = segment_ids.flat().data(); + Tensor sorted_indices; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + TensorShape({static_cast(input_outer_dim_size)}), + &sorted_indices)); + Index* sorted_indices_ptr = sorted_indices.flat().data(); + // Note: We must sort using all bits here because unsorted_segment_ids + // may contain negative values. 
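+      // For example, an id of -1 has its sign bit (and every other bit) set,
+      // so a reduced /*num_bits=*/ bound, as used elsewhere in this file for
+      // known-non-negative keys, could order it incorrectly. Negative ids are
+      // valid inputs here (rows with negative ids do not contribute to any
+      // output segment), so the sort below uses the full key width.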
+ OP_REQUIRES_OK( + ctx, GpuRadixSort(ctx, input_outer_dim_size, + /*keys_in=*/unsorted_segment_ids.data(), + /*keys_out=*/segment_ids_ptr, + /*indices_in=*/static_cast(nullptr), + /*indices_out=*/sorted_indices_ptr)); + using Treduce = typename ReduceType::type; + using Tweights = typename RealTypeIfComplex::type; + OP_REQUIRES_OK( + ctx, + SegmentReduceGPU( + ctx, input_outer_dim_size, input_inner_dim_size, num_segments, + ReductionF(), /*initial_value=*/InitialValueF()(), + /*empty_segment_value=*/InitialValueF()(), /*is_mean=*/false, + /*is_sqrtn=*/false, /*input=*/data.data(), + /*segment_ids=*/segment_ids_ptr, /*indices=*/sorted_indices_ptr, + /*weights=*/static_cast(nullptr), output.data())); + } + } +}; + +template +Status SparseSegmentReductionFunctor::operator()( + OpKernelContext* context, bool is_mean, bool is_sqrtn, T default_value, + typename TTypes::ConstTensor input, + typename TTypes::ConstVec indices, + typename TTypes::ConstVec segment_ids, + typename TTypes::Tensor output) { + using ReduceOp = functor::Sum; + using Treduce = typename ReduceType::type; + using Tweights = typename RealTypeIfComplex::type; + Index nouter = segment_ids.size(); + Index ninner = input.dimension(1); + SegmentId nsegments = output.dimension(0); + return SegmentReduceGPU( + context, /*nouter=*/nouter, /*ninner=*/ninner, + /*nsegments=*/nsegments, /*reduce_op=*/ReduceOp(), + /*initial_value=*/T(0), + /*empty_segment_value=*/default_value, + /*is_mean=*/is_mean, /*is_sqrtn=*/is_sqrtn, + /*input=*/input.data(), /*segment_ids=*/segment_ids.data(), + /*indices=*/indices.data(), /*weights=*/static_cast(nullptr), + /*output=*/output.data()); +} + +template +struct SparseSegmentGradFunctor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + Tensor* output) { + const GPUDevice& device = context->eigen_gpu_device(); + + auto output_flat = output->flat_outer_dims(); + const SegmentId nsegments = input_flat.dimension(0); + const Index ninner = input_flat.dimension(1); + const Index nouter = indices_vec.dimension(0); + const Index noutput = output_flat.dimension(0); + + // Allocate and compute segment weights (for Mean/SqrtN operations only). + Tensor weights; + using Tweights = typename RealTypeIfComplex::type; + Tweights* weights_ptr = nullptr; + if (operation != SparseSegmentReductionOperation::kSum) { + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum::value, + TensorShape({nsegments}), &weights)); + weights_ptr = weights.flat().data(); + // Allocate and compute segment_offsets. + Tensor segment_offsets; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nsegments + 1}), + &segment_offsets)); + Index* segment_offsets_ptr = segment_offsets.flat().data(); + OP_REQUIRES_OK(context, LaunchSegmentOffsetsKernel( + device, nouter, nsegments, segment_vec.data(), + segment_offsets_ptr)); + // Compute the weights based on the segment sizes using segment_offsets. + OP_REQUIRES_OK(context, LaunchSegmentWeightsKernel( + device, nsegments, operation, + segment_offsets_ptr, weights_ptr)); + } + + const Index* sorted_indices_ptr = indices_vec.data(); + const SegmentId* sorted_segment_ptr = segment_vec.data(); + Tensor tmp_sorted_indices; + Tensor tmp_sorted_segment; + if (noutput > 1) { + // Sort indices and permute segments. 
+ OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + TensorShape({nouter}), &tmp_sorted_indices)); + Index* tmp_sorted_indices_ptr = tmp_sorted_indices.flat().data(); + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + TensorShape({nouter}), &tmp_sorted_segment)); + SegmentId* tmp_sorted_segment_ptr = + tmp_sorted_segment.flat().data(); + OP_REQUIRES_OK(context, + GpuRadixSort(context, nouter, + /*keys_in=*/indices_vec.data(), + /*keys_out=*/tmp_sorted_indices_ptr, + /*indices_in=*/segment_vec.data(), + /*indices_out=*/tmp_sorted_segment_ptr, + /*num_bits=*/Log2Ceiling64(noutput))); + sorted_indices_ptr = tmp_sorted_indices_ptr; + sorted_segment_ptr = tmp_sorted_segment_ptr; + } + + // Compute the gradient using a weighted SegmentReduceGPU with the segment + // IDs and indices swapped. + using ReduceOp = functor::Sum; + using Treduce = typename ReduceType::type; + OP_REQUIRES_OK( + context, + SegmentReduceGPU( + context, /*nouter=*/static_cast(nouter), + /*ninner=*/static_cast(ninner), + /*nsegments=*/noutput, + /*reduce_op=*/ReduceOp(), + /*initial_value=*/T(0), + /*empty_segment_value=*/T(0), + /*is_mean=*/false, /*is_sqrtn=*/false, + /*input=*/input_flat.data(), /*segment_ids=*/sorted_indices_ptr, + /*indices=*/sorted_segment_ptr, /*weights=*/weights_ptr, + /*output=*/output_flat.data())); + } +}; + +template +struct EdgeIndicatorFunctor { + EdgeIndicatorFunctor(const TindicesCompact* sorted_indices) + : sorted_indices_(sorted_indices) {} + + template + __device__ bool operator()(Idx idx) const { + return idx == 0 ? false : sorted_indices_[idx] != sorted_indices_[idx - 1]; + } + + private: + const TindicesCompact* __restrict__ sorted_indices_; +}; + +template +__global__ void ScatterUniqueIndicesKernel( + Toffsets nouter, + EdgeIndicatorIter sorted_indices_edge_indicator, // [nouter] + const TindicesCompact* __restrict__ sorted_indices, // [nouter] + const Toffsets* __restrict__ sorted_indices_ids, // [nouter] + Tindices* __restrict__ sorted_unique_indices) { // [num_unique] + for (int i : GpuGridRangeX(nouter)) { + if (i == 0 || sorted_indices_edge_indicator[i]) { + sorted_unique_indices[sorted_indices_ids[i]] = + static_cast(sorted_indices[i]); + } + } +} + +template +Status LaunchScatterUniqueIndicesKernel( + const GPUDevice& d, Toffsets nouter, + EdgeIndicatorIter sorted_indices_edge_indicator, // [nouter] + const TindicesCompact* __restrict__ sorted_indices, // [nouter] + const Toffsets* __restrict__ sorted_indices_ids, // [nouter] + Tindices* __restrict__ sorted_unique_indices) { // [num_unique] + GpuLaunchConfig config = GetGpuLaunchConfig( + nouter, d, + &ScatterUniqueIndicesKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(ScatterUniqueIndicesKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), nouter, sorted_indices_edge_indicator, + sorted_indices, sorted_indices_ids, + sorted_unique_indices); +} + +template +struct SparseSegmentGradV2Functor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + const TensorShape& dense_output_shape, + typename AsyncOpKernel::DoneCallback done) { + const GPUDevice& device = context->eigen_gpu_device(); + + const int64_t nsegments = input_flat.dimension(0); + const int64_t ninner64 = input_flat.dimension(1); + const int64_t nouter64 = indices_vec.dimension(0); + // Note: 
nouter and ninner are not expected to be huge, so we use int32 to + // save memory bandwidth. + using Toffsets = int32; + OP_REQUIRES_ASYNC(context, nouter64 <= std::numeric_limits::max(), + absl::InvalidArgumentError( + absl::StrCat("Indices vector of length ", nouter64, + " is too large to fit in int32.")), + done); + const Toffsets nouter = static_cast(nouter64); + OP_REQUIRES_ASYNC(context, ninner64 <= std::numeric_limits::max(), + absl::InvalidArgumentError(absl::StrCat( + "Inner data dimension of size ", ninner64, + " is too large to fit in int32.")), + done); + const Toffsets ninner = static_cast(ninner64); + + // Cast indices to 32-bit to save memory bandwidth (the cost of the cast is + // worth it because the vector is used multiple times). + // Note that we can currently assume int32 is safe because the op's dense + // output_dim0 input is always int32. + using TindicesCompact = int32; + Tensor tmp_indices_internal; + const TindicesCompact* indices_internal_ptr; + if constexpr (std::is_same::value) { + indices_internal_ptr = indices_vec.data(); + } else { + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nouter}), &tmp_indices_internal), + done); + auto indices_vec_internal = tmp_indices_internal.flat(); + indices_vec_internal.device(device) = + indices_vec.template cast(); + indices_internal_ptr = indices_vec_internal.data(); + } + + // Cast segment IDs to smallest possible type to save memory bandwidth. + if (nsegments <= std::numeric_limits::max()) { + CastSegmentIdsThenImpl( + context, operation, nouter, ninner, nsegments, input_flat.data(), + tmp_indices_internal, indices_internal_ptr, segment_vec, + dense_output_shape, done); + } else if (sizeof(Tsegmentids) > sizeof(int32) && + nsegments <= std::numeric_limits::max()) { + CastSegmentIdsThenImpl( + context, operation, nouter, ninner, nsegments, input_flat.data(), + tmp_indices_internal, indices_internal_ptr, segment_vec, + dense_output_shape, done); + } else { + Impl( + context, operation, nouter, ninner, nsegments, input_flat.data(), + tmp_indices_internal, indices_internal_ptr, Tensor(), + segment_vec.data(), dense_output_shape, done); + } + } + + private: + using Tweights = typename RealTypeIfComplex::type; + + template + void CastSegmentIdsThenImpl( + OpKernelContext* context, SparseSegmentReductionOperation operation, + Toffsets nouter, Toffsets ninner, Tsegmentids_internal nsegments, + const T* input, Tensor indices_tensor, const TindicesCompact* indices, + typename TTypes::ConstVec segment_vec, + const TensorShape& dense_output_shape, + typename AsyncOpKernel::DoneCallback done) { + const GPUDevice& device = context->eigen_gpu_device(); + Tensor tmp_segment_internal; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nouter}), &tmp_segment_internal), + done); + auto segment_vec_internal = + tmp_segment_internal.flat(); + segment_vec_internal.device(device) = + segment_vec.template cast(); + + Impl( + context, operation, nouter, ninner, nsegments, input, indices_tensor, + indices, tmp_segment_internal, segment_vec_internal.data(), + dense_output_shape, done); + } + + template + void Impl(OpKernelContext* context, SparseSegmentReductionOperation operation, + Toffsets nouter, Toffsets ninner, Tsegmentids_internal nsegments, + const T* input, Tensor indices_tensor, + const TindicesCompact* indices, Tensor segment_ids_tensor, + const Tsegmentids_internal* segment_ids, + const TensorShape& dense_output_shape, + typename 
AsyncOpKernel::DoneCallback done) { + const int64_t dense_output_dim0 = dense_output_shape.dim_size(0); + + // Allocate and compute segment weights (for Mean/SqrtN operations only). + Tensor tmp_weights; + Tweights* weights_ptr = nullptr; + if (operation != SparseSegmentReductionOperation::kSum) { + ComputeSegmentWeights(context, operation, nsegments, nouter, segment_ids, + &tmp_weights, done); + weights_ptr = tmp_weights.flat().data(); + } + + const TindicesCompact* sorted_indices_ptr = indices; + const Tsegmentids_internal* permuted_segment_ptr = segment_ids; + Tensor tmp_sorted_indices; + Tensor tmp_permuted_segment; + if (dense_output_dim0 > 1) { + // Sort indices and permute segments. + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nouter}), &tmp_sorted_indices), + done); + TindicesCompact* tmp_sorted_indices_ptr = + tmp_sorted_indices.flat().data(); + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nouter}), &tmp_permuted_segment), + done); + Tsegmentids_internal* tmp_permuted_segment_ptr = + tmp_permuted_segment.flat().data(); + OP_REQUIRES_OK_ASYNC( + context, + GpuRadixSort(context, nouter, + /*keys_in=*/indices, + /*keys_out=*/tmp_sorted_indices_ptr, + /*indices_in=*/segment_ids, + /*indices_out=*/tmp_permuted_segment_ptr, + /*num_bits=*/Log2Ceiling64(dense_output_dim0)), + done); + sorted_indices_ptr = tmp_sorted_indices_ptr; + permuted_segment_ptr = tmp_permuted_segment_ptr; + // The original tensors are no longer needed. + indices_tensor = Tensor(); + indices = nullptr; + segment_ids_tensor = Tensor(); + segment_ids = nullptr; + } + + using CountIter = gpuprim::CountingInputIterator; + using EdgeIndicatorIter = gpuprim::TransformInputIterator< + Toffsets, EdgeIndicatorFunctor, CountIter>; + EdgeIndicatorIter sorted_indices_edge_indicator( + CountIter(0), + EdgeIndicatorFunctor(sorted_indices_ptr)); + + Tensor tmp_sorted_indices_unique_ids; + OP_REQUIRES_OK_ASYNC(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nouter}), + &tmp_sorted_indices_unique_ids), + done); + Toffsets* sorted_indices_unique_ids_ptr = + tmp_sorted_indices_unique_ids.flat().data(); + OP_REQUIRES_OK_ASYNC( + context, + GpuInclusivePrefixSum(context, nouter, sorted_indices_edge_indicator, + sorted_indices_unique_ids_ptr), + done); + + se::Stream* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream, + absl::InternalError("No GPU stream available."), done); + + // Copy the last element of sorted_indices_unique_ids back to the host to + // obtain num_unique. 
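+    // For example, with sorted_indices = [1, 1, 4, 4, 7] the edge indicator
+    // is [0, 0, 1, 0, 1] and its inclusive prefix sum is [0, 0, 1, 1, 2]:
+    // each row is tagged with the rank of its unique index, and the last
+    // element plus one (here 2 + 1 = 3) is the number of unique indices.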
+ ScratchSpace last_idx_host(context, 1, /*on_host=*/true); + OP_REQUIRES_OK_ASYNC( + context, + stream->Memcpy(last_idx_host.mutable_data(), + se::DeviceMemoryBase(const_cast( + sorted_indices_unique_ids_ptr) + + (nouter - 1), + sizeof(*last_idx_host.data())), + sizeof(*last_idx_host.data())), + done); + + auto async_finish_computation = + [this, context, dense_output_shape, nouter, ninner, input, + indices_tensor, tmp_sorted_indices, sorted_indices_ptr, + tmp_sorted_indices_unique_ids, sorted_indices_unique_ids_ptr, + segment_ids_tensor, tmp_permuted_segment, permuted_segment_ptr, + sorted_indices_edge_indicator, tmp_weights, weights_ptr, last_idx_host, + done]() -> void { + const GPUDevice& device = context->eigen_gpu_device(); + Toffsets num_unique = (*last_idx_host.data()) + 1; + + std::unique_ptr scoped_activation = + context->op_device_context()->stream()->parent()->Activate(); + + TensorShape output_shape = dense_output_shape; + OP_REQUIRES_OK_ASYNC(context, + output_shape.SetDimWithStatus(0, num_unique), done); + Tensor* output = nullptr; + T* output_ptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + output_ptr = output->flat().data(); + + // Compute the gradient using a weighted SegmentReduceGPU with the segment + // IDs and indices swapped. + using ReduceOp = functor::Sum; + using Treduce = typename ReduceType::type; + OP_REQUIRES_OK_ASYNC(context, + SegmentReduceGPU( + context, /*nouter=*/nouter, + /*ninner=*/ninner, + /*nsegments=*/num_unique, + /*reduce_op=*/ReduceOp(), + /*initial_value=*/T(0), + /*empty_segment_value=*/T(0), + /*is_mean=*/false, /*is_sqrtn=*/false, + /*input=*/input, + /*segment_ids=*/sorted_indices_unique_ids_ptr, + /*indices=*/permuted_segment_ptr, + /*weights=*/weights_ptr, + /*output=*/output_ptr), + done); + + Tensor* sorted_unique_indices = nullptr; + Tindices* sorted_unique_indices_ptr; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(1, TensorShape({num_unique}), + &sorted_unique_indices), + done); + sorted_unique_indices_ptr = + sorted_unique_indices->flat().data(); + + OP_REQUIRES_OK_ASYNC( + context, + LaunchScatterUniqueIndicesKernel( + device, nouter, sorted_indices_edge_indicator, sorted_indices_ptr, + sorted_indices_unique_ids_ptr, sorted_unique_indices_ptr), + done); + + done(); + }; + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, async_finish_computation); + } + + template + void ComputeSegmentWeights(OpKernelContext* context, + SparseSegmentReductionOperation operation, + Tsegmentids_internal nsegments, Toffsets nouter, + const Tsegmentids_internal* segment_ids, + Tensor* tmp_weights, + typename AsyncOpKernel::DoneCallback done) { + const GPUDevice& device = context->eigen_gpu_device(); + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nsegments}), tmp_weights), + done); + Tweights* weights_ptr = tmp_weights->flat().data(); + // Allocate and compute segment_offsets. + Tensor tmp_segment_offsets; + OP_REQUIRES_OK_ASYNC(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({nsegments + 1}), + &tmp_segment_offsets), + done); + Toffsets* segment_offsets_ptr = tmp_segment_offsets.flat().data(); + OP_REQUIRES_OK_ASYNC( + context, + LaunchSegmentOffsetsKernel(device, nouter, nsegments, segment_ids, + segment_offsets_ptr), + done); + // Compute the weights based on the segment sizes using segment_offsets. 
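+    // SegmentWeightsKernel writes 1 / segment_size for kMean and
+    // 1 / sqrt(segment_size) for kSqrtN, clamping empty segments to size 1.
+    // E.g. segment_offsets = [0, 2, 2, 5] gives sizes [2, 0, 3] and kMean
+    // weights [0.5, 1.0, 1/3].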
+ OP_REQUIRES_OK_ASYNC( + context, + LaunchSegmentWeightsKernel(device, nsegments, operation, + segment_offsets_ptr, weights_ptr), + done); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_impl.h new file mode 100644 index 00000000..d087bfae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/segment_reduction_ops_impl.h @@ -0,0 +1,1488 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc. + +#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_IMPL_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/platform/types.h" +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include "absl/container/flat_hash_map.h" +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/segment_reduction_ops.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/bfloat16.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/determinism.h" +#include "tensorflow/core/util/util.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#if GOOGLE_CUDA +#include "tensorflow/core/util/gpu_solvers.h" + +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +#include "tensorflow/core/util/gpu_solvers.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace internal { + +absl::Status ValidateSegmentReduction(OpKernelContext* c, const Tensor& input, + const Tensor& segment_ids); +absl::Status ValidateUnsortedSegmentReduction(OpKernel* op_kernel, + OpKernelContext* context, + const Tensor& data, + const Tensor& segment_ids, + const 
Tensor& num_segments); +absl::Status ValidateSparseSegmentReduction(OpKernelContext* context, + const Tensor& input, + const Tensor& indices, + const Tensor& segment_ids, + bool has_num_segments); +} // namespace internal + +// This operator handles reducing segments along the first dimension. +// See core/ops/math_ops.cc for more details. +template +class SegmentReductionOp : public OpKernel { + public: + explicit SegmentReductionOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& segment_ids = context->input(1); + + OP_REQUIRES_OK(context, internal::ValidateSegmentReduction(context, input, + segment_ids)); + + const int64_t num_indices = segment_ids.NumElements(); + auto input_flat = input.flat_outer_dims(); + const int64_t num_col = input_flat.dimension(1); + + const auto segment_vec = segment_ids.vec(); + // Note that the current implementation assumes that segment_vec values are + // sorted. + const Index output_rows = + num_indices > 0 + ? internal::SubtleMustCopy(segment_vec(num_indices - 1)) + 1 + : 0; + OP_REQUIRES(context, output_rows >= 0, + errors::InvalidArgument("segment ids must be >= 0")); + + OP_REQUIRES(context, input.dims() >= 1, + errors::InvalidArgument("Shape must be at least rank 1")); + + TensorShape output_shape = input.shape(); + // Since we're changing the first dimension of the shape, we need to make + // sure the new shape won't overflow. + OP_REQUIRES_OK(context, output_shape.SetDimWithStatus(0, output_rows)); + + // Note that we do not initialize the output buffer with a default value, so + // we need to explicitly set missing indices to the default value. + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + if (num_indices == 0) return; + OP_REQUIRES(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0")); + auto output_flat = output->flat_outer_dims(); + + Eigen::IndexList > dims_to_reduce; + Index start = 0, end = 1; + + Index uninitialized_index = 0; // Index from which the output is not set. + Index out_index = internal::SubtleMustCopy(segment_vec(start)); + + // TODO(agarwal): if this loop becomes a bottleneck, consider sharding it + // across threads. + Eigen::DSizes out_slice_shape(num_col); + while (end <= num_indices) { + // We initialize next_index to 0 to avoid "warning: 'next_index' may be + // used uninitialized in this function" in the Mac build (since the + // compiler isn't smart enough to realize the code is safe). + Index next_index = 0; + if (end < num_indices) { + next_index = internal::SubtleMustCopy(segment_vec(end)); + if (out_index == next_index) { + ++end; + continue; + } + // We have a new segment here. Verify that the segment ids are growing. + OP_REQUIRES(context, out_index < next_index, + errors::InvalidArgument("segment ids are not increasing")); + } + + // Process segment [start, end) + const T* in_slice_ptr = &input_flat(start, 0); + typedef Eigen::TensorMap, + Eigen::Unaligned> + OutT; + + OP_REQUIRES( + context, FastBoundsCheck(out_index, output_rows), + errors::InvalidArgument( + "Segment id ", out_index, " out of range [0, ", output_rows, + "), possibly because 'segment_ids' input is not sorted.")); + + // If there is a gap between two indices, we need to set that gap to the + // default value. 
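+      // E.g. with sorted segment_ids = [0, 0, 3, 3], segments 1 and 2 have
+      // no input rows, so output rows 1 and 2 are filled with the default
+      // value here instead of being left uninitialized.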
+ if (out_index > uninitialized_index) { + Eigen::DSizes gap_slice_shape( + out_index - uninitialized_index, num_col); + Eigen::TensorMap, Eigen::Unaligned> + gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape); + gap_slice.setConstant(T(default_value)); + } + + T* out_slice_ptr = &output_flat(out_index, 0); + OutT out_slice(out_slice_ptr, out_slice_shape); + // We don't use out_slice.device(context->eigen_device) + // because these pieces of work are likely to be very small and + // the context switching overhead dwarfs any benefit we get from + // using another thread to do this work. + if (start == end - 1) { + typedef Eigen::TensorMap, + Eigen::Unaligned> + InT; + InT in_slice(in_slice_ptr, out_slice_shape); + out_slice = in_slice; + } else { + Eigen::DSizes in_slice_shape(end - start, + num_col); + typedef Eigen::TensorMap, + Eigen::Unaligned> + InT; + InT in_slice(in_slice_ptr, in_slice_shape); + + out_slice = in_slice.reduce(dims_to_reduce, Reducer()); + } + if (end >= num_indices) break; + start = end; + ++end; + uninitialized_index = out_index + 1; + out_index = next_index; + } + } +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// SegmentReductionGPUOp is a segment reduction operator implemented for GPU +// only. +// TODO: This implementation of SegmentReductionGPUOp is sometimes slower than +// its unsorted counterpart (mostly when problem size is small). +// This is due to the following two main reasons and a cost-effective way +// to resolve these problems is desirable. +// 1. Sorted segment reduction requires a memory transfer from device to host +// in order to know the size of the output dimension whereas unsorted +// segment reduction receives the size of the output dimension as an input +// parameter. +// 2. Sorted segment reduction is essentially a tiled version of unsorted +// segment reduction and therefore such optimization comes at an inherent +// cost. However such cost may not be justified when the problem size is +// small. When to use the tiled version or the untiled version depends on +// many factors including data alignments, ratio of calculation to memory +// traffic and obviously, the problem sizes. 
+template +class SegmentReductionGPUOp : public AsyncOpKernel { + public: + explicit SegmentReductionGPUOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& input = context->input(0); + const Tensor& segment_ids = context->input(1); + + OP_REQUIRES_ASYNC( + context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector."), done); + + OP_REQUIRES_ASYNC(context, input.dims() >= 1, + errors::InvalidArgument("Shape must be at least rank 1"), + done); + + const int64_t num_indices = segment_ids.NumElements(); + OP_REQUIRES_ASYNC( + context, num_indices == input.dim_size(0), + errors::InvalidArgument( + "segment_ids should be the same size as dimension 0 of" + " input."), + done); + + if (num_indices == 0) { + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, 0); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + done(); + return; + } + + se::DeviceMemoryBase output_rows_device( + const_cast(segment_ids).template flat().data() + + (num_indices - 1)); + ScratchSpace output_rows_host(context, 1, /* on_host */ true); + + auto stream = context->op_device_context()->stream(); + OP_REQUIRES_OK_ASYNC(context, + stream->Memcpy(output_rows_host.mutable_data(), + output_rows_device, sizeof(Index)), + done); + + SegmentReductionFunctor functor_; + auto create_and_check_output = [context, output_rows_host, &input, + &segment_ids, &functor_, done]() { + // Ensure that within the callback, the proper GPU settings are + // configured. + auto stream = context->op_device_context()->stream(); + std::unique_ptr scoped_activation = + stream->parent()->Activate(); + + Index output_rows = *output_rows_host.data(); + output_rows++; + OP_REQUIRES_ASYNC(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0"), + done); + + TensorShape output_shape = input.shape(); + // Since we're changing the first dimension of the shape, we need to make + // sure the new shape won't overflow. + OP_REQUIRES_OK_ASYNC(context, + output_shape.SetDimWithStatus(0, output_rows), done); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + + bool use_deterministic_kernels = + UseDeterministicSegmentReductions() || + (!SegmentReductionFunctor::atomic_reduction_is_associative && + OpDeterminismRequired()); + + // The determinism check is here, rather than inside the functor (as it is + // for the unsorted segment reduction ops) because the done callback + // (required for OP_REQUIRES_ASYNC) is not available inside the functor. 
+ bool determinism_requirement_met = + use_deterministic_kernels || + SegmentReductionFunctor::atomic_reduction_is_associative || + !OpDeterminismRequired() || + DisableSegmentReductionOpDeterminismExceptions(); + OP_REQUIRES_ASYNC( + context, determinism_requirement_met, + errors::Unimplemented( + "Deterministic GPU implementation of sorted segment reduction op" + " not available."), + done); + + auto output_flat = output->flat_outer_dims(); + auto data_ptr = input.template flat().data(); + auto segment_flat = segment_ids.flat(); + functor_(context, context->eigen_device(), output_rows, + segment_ids.shape(), IsMean, segment_flat, input.NumElements(), + data_ptr, output_flat); + + done(); + }; + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, create_and_check_output); + } +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// ____________________________________________________________________________ +// Unsorted segment reduction ops. + +namespace functor { + +// The ReductionFunctor implementation for CPU. +template +struct UnsortedSegmentFunctor { + void operator()(OpKernelContext* ctx, const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + typename TTypes::ConstTensor data, + typename TTypes::Tensor output) { + auto cpu_device = ctx->eigen_cpu_device(); + output.device(cpu_device) = output.constant(InitialValueF()()); + if (data.size() == 0) { + return; + } + + // This functor will reduce `N` rows input to `num_segments` rows output. + const int64_t N = segment_ids.dimension(0); + const int64_t num_segments = output.dimension(0); + const int64_t inner_dim = data.dimension(1); + const T* data_ptr = data.data(); + T* out_ptr = output.data(); + ReductionF reduction; + + const bool is_inner_dim_1d = inner_dim == 1; + + // `num_real_segment` counts the rows actually reduced from input, + // the rows with negative segment index will be excluded. + // It will be used for cost model. + int64_t num_real_segment = N; + // `num_reductions` counts the rows actually reduced in output, + // the rows only filled with InitialValueF() will be excluded. + int64_t num_reductions = 0; + // `row_counter` records how many input rows will be reduced in each + // output row, the row only fills with InitialValueF() will keep 0. + // Length of non-zero elements is `num_reductions`. + std::vector row_counter(num_segments, 0); + + for (int64_t i = 0; i < N; ++i) { + Index j = internal::SubtleMustCopy(segment_ids(i)); + if (j < 0) { + --num_real_segment; + continue; + } + OP_REQUIRES(ctx, FastBoundsCheck(j, num_segments), + errors::InvalidArgument( + "segment_ids", SliceDebugString(segment_ids_shape, i), + " = ", j, " is out of range [0, ", num_segments, ")")); + if (row_counter[j] == 0) num_reductions++; + row_counter[j]++; + } + + // Nothing to reduce. All output values equal to `InitialValueF()`. + if (num_reductions == 0) return; + + // Parallelize by `num_segments`. It's simple, efficient and safe + // (no data dependency): + // + // input segment_ids num_segments operation + // | a0 | | 0 | worker 1: |0| f(a0, a1) + // | b0 | | 1 | worker 2: |1| f(b0, b1) + // N | c0 | | 2 | --> worker 3: |2| f(c0) + // | b1 | | 1 | + // | a1 | | 0 | + // + // TODO(intel-tf): Balance workload in `row_counter` to make parallelism + // more efficient. 
+ auto reductionWorker = [&](int64_t begin, int64_t end) -> void { + for (int64_t i = 0; i < N; i++) { + Index j = internal::SubtleMustCopy(segment_ids(i)); + // If `j` is in work scope of this worker, do the reduction. + if (j >= begin && j < end) { + reduction(data.template chip<0>(i), output.template chip<0>(j)); + } + } + }; + auto reductionWorker1D = [&](int64_t begin, int64_t end) -> void { + for (int64_t i = 0; i < N; i++) { + Index j = internal::SubtleMustCopy(segment_ids(i)); + // If `j` is in work scope of this worker, do the reduction. + if (j >= begin && j < end) { + reduction(data_ptr[i], out_ptr[j]); + } + } + }; + // Reduction functors includes Sum, Max, Min, etc. Simply consider it + // will cost 5 cycles per operation. + const int64_t kAverTaskSize = num_real_segment / num_segments; + const int64_t compute_cycles = 5 * inner_dim * kAverTaskSize; + const int64_t input_bytes = sizeof(T) * inner_dim * kAverTaskSize; + const int64_t output_bytes = sizeof(T) * inner_dim * kAverTaskSize; + const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles); + if (is_inner_dim_1d) { + cpu_device.parallelFor(num_segments, cost, reductionWorker1D); + } else { + cpu_device.parallelFor(num_segments, cost, reductionWorker); + } + } +}; + +template +using MatrixChip = Eigen::TensorChippingOp<0l, typename TTypes::Matrix>; + +template +using constMatrixChip = + Eigen::TensorChippingOp<0l, const typename TTypes::ConstMatrix>; + +// reduction functors +template +struct SumOp { + void operator()(const constMatrixChip data, MatrixChip output) { + output += data; + } + void operator()(const T& data, T& output) { output += data; } +}; + +template +struct MaxOp { + void operator()(const constMatrixChip data, MatrixChip output) { + output = data.cwiseMax(output); + } + void operator()(const T& data, T& output) { output = std::max(data, output); } +}; + +template +struct MinOp { + void operator()(const constMatrixChip data, MatrixChip output) { + output = data.cwiseMin(output); + } + void operator()(const T& data, T& output) { output = std::min(data, output); } +}; + +template +struct ProdOp { + void operator()(const constMatrixChip data, MatrixChip output) { + output *= data; + } + void operator()(const T& data, T& output) { output *= data; } +}; +} // namespace functor + +// The UnsortedSegmentReduction OpKernel. The DeviceReductionFunctor +// is the device specific implementation of the reduction. These device +// specific implementations are templated themselves with the corresponding +// initial value functors and reduction functors. +template +class UnsortedSegmentReductionOp : public OpKernel { + public: + explicit UnsortedSegmentReductionOp(OpKernelConstruction* context) + : OpKernel(context), reduction_functor_(DeviceReductionFunctor()) {} + + void Compute(OpKernelContext* context) override { + const Tensor& data = context->input(0); + const Tensor& segment_ids = context->input(1); + const Tensor& num_segments = context->input(2); + OP_REQUIRES_OK(context, + internal::ValidateUnsortedSegmentReduction( + this, context, data, segment_ids, num_segments)); + const auto segment_flat = segment_ids.flat(); + const Index output_rows = internal::SubtleMustCopy(static_cast( + num_segments.dtype() == DT_INT32 ? 
num_segments.scalar()() + : num_segments.scalar()())); + OP_REQUIRES(context, output_rows >= 0, + errors::InvalidArgument("Input num_segments == ", output_rows, + " must not be negative.")); + TensorShape output_shape; + OP_REQUIRES_OK(context, output_shape.AddDimWithStatus(output_rows)); + for (int i = segment_ids.dims(); i < data.dims(); i++) { + OP_REQUIRES_OK(context, output_shape.AddDimWithStatus(data.dim_size(i))); + } + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + auto output_flat = output->flat_outer_dims(); + auto data_flat = data.flat_inner_outer_dims(segment_ids.dims() - 1); + reduction_functor_(context, segment_ids.shape(), segment_flat, data_flat, + output_flat); + } + + protected: + DeviceReductionFunctor reduction_functor_; +}; + +// ____________________________________________________________________________ +// Sparse segment reduction ops. + +// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented +// by two dense tensors, one containing the data, and the other containing +// indices into the data. +// +// The template parameters are: +// * Device: An Eigen device object, on which the kernel will execute. +// * T: The value type. +// * Index: The element type of the indices tensor (int32 or int64). +// * SegmentId: The element type of the segment_ids tensor (int32 or int64). +template +class SparseSegmentReductionOpBase : public OpKernel { + public: + explicit SparseSegmentReductionOpBase(OpKernelConstruction* context, + bool is_mean, bool is_sqrtn, + bool has_num_segments, T default_value) + : OpKernel(context), + dtidx_(DataTypeToEnum::v()), + is_mean_(is_mean), + is_sqrtn_(is_sqrtn), + has_num_segments_(has_num_segments), + default_value_(default_value) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + + OP_REQUIRES_OK( + context, internal::ValidateSparseSegmentReduction( + context, input, indices, segment_ids, has_num_segments_)); + + Index output_rows = -1; + if (has_num_segments_) { + const Tensor& num_segments = context->input(3); + // Note that there is a Tnumsegments parameter on the op, but it is not + // plumbed through to here and so always takes its default value of int32. + output_rows = internal::SubtleMustCopy(num_segments.scalar()()); + } + const int64_t num_indices = indices.NumElements(); + + auto input_flat = input.flat_outer_dims(); + const int64_t num_col = input_flat.dimension(1); + const auto indices_vec = indices.vec(); + const auto segment_vec = segment_ids.vec(); + // Note that the current implementation assumes that segment_vec values are + // sorted. + const SegmentId last_segment_id = + num_indices > 0 ? segment_vec(num_indices - 1) : 0; + int64_t limit = dtidx_ == DataType::DT_INT32 ? kint32max : kint64max; + + OP_REQUIRES( + context, last_segment_id < limit, + errors::InvalidArgument("Last segment id must be < kintmax, got ", + last_segment_id, " limit ", limit)); + + const SegmentId last_segment_id_plus_one = + num_indices > 0 + ? 
internal::SubtleMustCopy(segment_vec(num_indices - 1)) + 1 + : 0; + + if (has_num_segments_) { + OP_REQUIRES( + context, output_rows >= last_segment_id_plus_one, + errors::InvalidArgument("segment ids must be < num_segments")); + } else { + output_rows = last_segment_id_plus_one; + } + OP_REQUIRES(context, output_rows >= 0, + errors::InvalidArgument("segment ids must be >= 0")); + + TensorShape output_shape = input.shape(); + OP_REQUIRES_OK( + context, output_shape.SetDimWithStatus(/*d=*/0, /*size=*/output_rows)); + + // Note that we do not initialize the output buffer with a default value, so + // we need to explicitly set missing indices to the default value. + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + if (num_indices == 0) { + if (output_rows > 0) { + output->flat_outer_dims().setConstant(default_value_); + } + return; + } + OP_REQUIRES(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0")); + auto output_flat = output->flat_outer_dims(); + + // If we use DT_BFLOAT16 or DT_HALF, we need to use DT_FLOAT for + // accumulation. We create a temp tensor to perform this accumulation for + // every segment. + Tensor temp; + if (input.dtype() == DT_BFLOAT16 || input.dtype() == DT_HALF) { + TensorShape temp_shape = output_shape; + OP_REQUIRES_OK(context, temp_shape.SetDimWithStatus(/*d=*/0, /*size=*/1)); + temp = tensorflow::Tensor(DT_FLOAT, temp_shape); + } + auto temp_flat = temp.flat_outer_dims(); + + int64_t start = 0, end = 1; + // Index from which the output is not initialized. + SegmentId uninitialized_index = 0; + SegmentId out_index = internal::SubtleMustCopy(segment_vec(start)); + + while (true) { + // We initialize next_index to 0 to avoid "warning: 'next_index' may be + // used uninitialized in this function" in the Mac build (since the + // compiler isn't smart enough to realize the code is safe). + SegmentId next_index = 0; + if (end < num_indices) { + next_index = internal::SubtleMustCopy(segment_vec(end)); + if (out_index == next_index) { + ++end; + continue; + } + // We have a new segment here. Verify that the segment ids are growing. + OP_REQUIRES(context, out_index < next_index, + errors::InvalidArgument("segment ids are not increasing")); + } + + OP_REQUIRES( + context, FastBoundsCheck(out_index, output_rows), + errors::InvalidArgument( + "Segment id ", out_index, " out of range [0, ", output_rows, + "), possibly because 'segment_ids' input is not sorted.")); + + // If there is a gap between two indices, we need to set that gap to the + // default value. + if (out_index > uninitialized_index) { + Eigen::DSizes gap_slice_shape( + out_index - uninitialized_index, num_col); + Eigen::TensorMap, Eigen::Unaligned> + gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape); + gap_slice.setConstant(default_value_); + } + + auto out = output_flat.template chip<0>(out_index); + auto temp = temp_flat.template chip<0>(0); + const int bad_offset = Reduce(input_flat, indices_vec, start, + end - start, out, temp); + OP_REQUIRES(context, bad_offset < 0, + errors::InvalidArgument( + "Bad: indices[", start + bad_offset, + "] == ", indices_vec(start + bad_offset), + " out of range [0, ", input_flat.dimension(0), ")")); + + start = end; + ++end; + uninitialized_index = out_index + 1; + out_index = next_index; + if (end > num_indices) break; + } + + // Fill the gap at the end with the default value. 
+ if (uninitialized_index < output_rows) { + Eigen::DSizes gap_slice_shape( + output_rows - uninitialized_index, num_col); + Eigen::TensorMap, Eigen::Unaligned> + gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape); + gap_slice.setConstant(default_value_); + } + } + + private: + const DataType dtidx_; + + template + using EnableIfBfloat16OrHalf = + typename std::enable_if::value || + std::is_same::value, + int>::type; + template + using EnableIfNotBfloat16OrHalf = + typename std::enable_if::value && + !std::is_same::value, + int>::type; + + template = 0> + EIGEN_ALWAYS_INLINE auto fetch_val( + const typename TTypes::ConstMatrix& input_flat, Tindex index) { + return input_flat.template chip<0>(index); + } + + template = 0> + EIGEN_ALWAYS_INLINE auto fetch_val( + const typename TTypes::ConstMatrix& input_flat, Tindex index) { + return input_flat.template chip<0>(index).template cast(); + } + + template + EIGEN_ALWAYS_INLINE Tout get_scaling_factor(int64_t num) { + Tout m(1); + if (is_mean_ && (num < 10)) { + m = Tout(num); + } + if (is_sqrtn_ && (num < 10)) { + m = Tout(sqrt(num)); + } + return Tout(1) / m; + } + + template = 0> + int64_t Reduce( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64_t start, + int64_t num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + return ReduceImpl(input_flat, indices_vec, start, num, + out, get_scaling_factor(num)); + } + + template = 0> + int64_t Reduce( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64_t start, + int64_t num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + int64_t res = + ReduceImpl(input_flat, indices_vec, start, num, + temp, get_scaling_factor(num)); + out = temp.template cast(); + return res; + } + + template + int64_t ReduceImpl( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64_t start, + int64_t num, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + const Tout scaling_factor) { +#define INDEX(n, i) \ + const auto index##n = indices_vec(start + (i)); \ + if (!FastBoundsCheck(index##n, input_flat.dimension(0))) return (i); + +#define L(n) fetch_val(input_flat, index##n) + + if (num == 1) { + INDEX(0, 0); + out = L(0); + } else { + int64_t r = num & 7; + switch (r) { + case 2: { + INDEX(0, 0); + INDEX(1, 1); + out = (L(0) + L(1)) * scaling_factor; + break; + } + case 3: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + out = (L(0) + L(1) + L(2)) * scaling_factor; + break; + } + case 4: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + out = (L(0) + L(1) + L(2) + L(3)) * scaling_factor; + break; + } + case 5: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + out = (L(0) + L(1) + L(2) + L(3) + L(4)) * scaling_factor; + break; + } + case 6: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) * scaling_factor; + break; + } + case 7: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + out = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) * scaling_factor; + break; + } + case 0: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + INDEX(7, 7); + out = (L(0) + L(1) + L(2) 
+ L(3) + L(4) + L(5) + L(6) + L(7)) * + scaling_factor; + r = 8; + break; + } + case 1: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + INDEX(7, 7); + INDEX(8, 8); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) * + scaling_factor; + r = 9; + break; + } + } + for (; r < num; r += 8) { + INDEX(0, r); + INDEX(1, r + 1); + INDEX(2, r + 2); + INDEX(3, r + 3); + INDEX(4, r + 4); + INDEX(5, r + 5); + INDEX(6, r + 6); + INDEX(7, r + 7); + out += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); + } + if (is_mean_ && num >= 10) { + out = out / static_cast(num); + } + if (is_sqrtn_ && num >= 10) { + out = out / static_cast(sqrt(num)); + } + } + + return -1; +#undef L +#undef INDEX + } + + const bool is_mean_; + const bool is_sqrtn_; + const bool has_num_segments_; + const T default_value_; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +// Specialization for GPU. Must be Async because may need to wait for a host to +// device memcpy before allocating output. +template +class SparseSegmentReductionOpBase + : public AsyncOpKernel { + public: + explicit SparseSegmentReductionOpBase(OpKernelConstruction* context, + bool is_mean, bool is_sqrtn, + bool has_num_segments, T default_value) + : AsyncOpKernel(context), + is_mean_(is_mean), + is_sqrtn_(is_sqrtn), + has_num_segments_(has_num_segments), + default_value_(default_value) {} + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + + OP_REQUIRES_OK_ASYNC( + context, + internal::ValidateSparseSegmentReduction( + context, input, indices, segment_ids, has_num_segments_), + done); + + ScratchSpace last_segment_id_host(context, 1, /*on_host=*/true); + + auto create_and_check_output = [this, context, input, indices, segment_ids, + last_segment_id_host, done]() { + // Ensure that within the callback, the proper GPU settings are + // configured. + auto stream = context->op_device_context()->stream(); + std::unique_ptr scoped_activation = + stream->parent()->Activate(); + + SegmentId last_segment_id = *last_segment_id_host.data(); + SegmentId output_rows = last_segment_id + 1; + OP_REQUIRES_ASYNC(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0"), + done); + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, output_rows); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + + auto input_flat = input.flat_outer_dims(); + const auto indices_vec = indices.vec(); + const auto segment_ids_vec = segment_ids.vec(); + auto output_flat = output->flat_outer_dims(); + + functor::SparseSegmentReductionFunctor functor; + OP_REQUIRES_OK_ASYNC( + context, + functor(context, is_mean_, is_sqrtn_, default_value_, input_flat, + indices_vec, segment_ids_vec, output_flat), + done); + done(); + }; + + if (has_num_segments_) { + // No need to do any device to host memcpy, just compute synchronously. + const Tensor& num_segments_t = context->input(3); + SegmentId num_segments = + internal::SubtleMustCopy(num_segments_t.dtype() == DT_INT32 + ? 
num_segments_t.scalar()() + : num_segments_t.scalar()()); + *last_segment_id_host.mutable_data() = num_segments - 1; + create_and_check_output(); + } else { + const int64_t num_indices = indices.NumElements(); + if (num_indices == 0) { + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, 0); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + done(); + return; + } + + // Need to copy last element of segment_ids from device to host, and then + // asynchronously allocate the output and finish the computation. + se::DeviceMemoryBase last_segment_id_device( + const_cast(segment_ids).template flat().data() + + (num_indices - 1)); + auto stream = context->op_device_context()->stream(); + OP_REQUIRES_OK_ASYNC( + context, + stream->Memcpy(last_segment_id_host.mutable_data(), + last_segment_id_device, sizeof(SegmentId)), + done); + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, create_and_check_output); + } + } + + private: + const bool is_mean_; + const bool is_sqrtn_; + const bool has_num_segments_; + const T default_value_; +}; + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +class SparseSegmentReductionMeanOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, true /*is_mean*/, false /*is_sqrtn*/, + false /* has_num_segments */, T(0) /* default_value */) {} +}; + +template +class SparseSegmentReductionMeanWithNumSegmentsOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionMeanWithNumSegmentsOp( + OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, true /*is_mean*/, false /*is_sqrtn*/, + true /* has_num_segments */, T(0) /* default_value */) {} +}; + +template +class SparseSegmentReductionSqrtNOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionSqrtNOp(OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, false /*is_mean*/, true /*is_sqrtn*/, + false /* has_num_segments */, T(0) /* default_value */) {} +}; + +template +class SparseSegmentReductionSqrtNWithNumSegmentsOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionSqrtNWithNumSegmentsOp( + OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, false /*is_mean*/, true /*is_sqrtn*/, + true /* has_num_segments */, T(0) /* default_value */) {} +}; + +template +class SparseSegmentReductionSumOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionSumOp(OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, false /*is_mean*/, false /*is_sqrtn*/, + false /* has_num_segments */, T(0) /* default_value */) {} +}; + +template +class SparseSegmentReductionSumWithNumSegmentsOp + : public SparseSegmentReductionOpBase { + public: + explicit SparseSegmentReductionSumWithNumSegmentsOp( + OpKernelConstruction* context) + : SparseSegmentReductionOpBase( + context, false /*is_mean*/, false /*is_sqrtn*/, + true /* has_num_segments */, T(0) /* default_value */) {} +}; + +namespace functor { + +template +struct SparseSegmentGradFunctor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + Tensor* output) { + auto 
output_flat = output->flat_outer_dims(); + const int64_t N = indices_vec.size(); + const SegmentId M = output_flat.dimension(0); + + // Note that similar to SparseSegmentMean, we assume that segment_vec is + // already sorted and has non-negative values. + const SegmentId num_segments = input_flat.dimension(0); + const SegmentId last_segment_id_plus_one = + internal::SubtleMustCopy(segment_vec(N - 1)) + 1; + OP_REQUIRES(context, last_segment_id_plus_one <= num_segments, + absl::InvalidArgumentError("Invalid number of segments")); + + const auto scaling_or = + ComputeScalingFactors(operation, segment_vec, num_segments); + OP_REQUIRES_OK(context, scaling_or.status()); + const std::vector& scaling = scaling_or.value(); + + // If we use DT_BFLOAT16 or DT_HALF, we need to use DT_FLOAT for + // accumulation. We create a temp tensor to perform this accumulation for + // every segment. + Tensor temp; + if (output->dtype() == DT_BFLOAT16 || output->dtype() == DT_HALF) { + temp = tensorflow::Tensor(DT_FLOAT, output->shape()); + } + auto temp_flat = temp.flat_outer_dims(); + + if (output->dtype() == DT_BFLOAT16 || output->dtype() == DT_HALF) { + temp_flat.setZero(); + } else { + output_flat.setZero(); + } + + for (int64_t i = 0; i < N; ++i) { + const Index output_idx = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(context, FastBoundsCheck(output_idx, M), + absl::InvalidArgumentError(absl::StrCat( + "Index ", output_idx, " out of range [0, ", M, ")."))); + + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(idx, num_segments), + absl::InvalidArgumentError(absl::StrCat( + "Segment id ", idx, " out of range [0, ", num_segments, ")."))); + + const double scale = operation == SparseSegmentReductionOperation::kSum + ? 1.0 + : scaling[idx]; + Accumulate(input_flat.template chip<0>(idx), scale, + output_flat.template chip<0>(output_idx), + temp_flat.template chip<0>(output_idx)); + } + + // Copy the contents of the temp tensor to the output tensor. + if (output->dtype() == DT_BFLOAT16 || output->dtype() == DT_HALF) { + output_flat = temp_flat.template cast(); + } + } + + private: + template + using EnableIfBfloat16OrHalf = + typename std::enable_if::value || + std::is_same::value, + int>::type; + template + using EnableIfNotBfloat16OrHalf = + typename std::enable_if::value && + !std::is_same::value, + int>::type; + + template = 0> + void Accumulate( + Eigen::TensorChippingOp<0, const typename TTypes::ConstMatrix> in, + double scale, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + out += in * static_cast(scale); + } + + template = 0> + void Accumulate( + Eigen::TensorChippingOp<0, const typename TTypes::ConstMatrix> in, + double scale, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + temp += in.template cast() * static_cast(scale); + } + + // Compute scaling factors for input. 
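+  // E.g. for segment_vec = [0, 0, 1] and num_segments = 2 the per-segment
+  // counts are [2, 1], giving scaling [0.5, 1.0] for kMean and
+  // [1/sqrt(2), 1.0] for kSqrtN; for kSum this returns an empty vector and a
+  // scale of 1.0 is used for every row.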
+ absl::StatusOr> ComputeScalingFactors( + SparseSegmentReductionOperation operation, + typename TTypes::ConstVec segment_vec, + const SegmentId num_segments) { + if (operation == SparseSegmentReductionOperation::kSum) { + return std::vector(0); + } + + std::vector scaling(num_segments, 0); + + for (int64_t i = 0; i < segment_vec.size(); ++i) { + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + if (!FastBoundsCheck(idx, num_segments)) { + return absl::InvalidArgumentError(absl::StrCat( + "Segment id ", idx, " out of range [0, ", num_segments, ").")); + } + scaling[idx] += 1; + } + + if (operation == SparseSegmentReductionOperation::kMean) { + for (size_t i = 0; i < scaling.size(); ++i) { + scaling[i] = 1.0 / std::max(scaling[i], 1.0); + } + } else { + for (size_t i = 0; i < scaling.size(); ++i) { + scaling[i] = 1.0 / sqrt(std::max(scaling[i], 1.0)); + } + } + + return scaling; + } +}; + +template +struct SparseSegmentGradV2Functor { + void operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + const TensorShape& dense_output_shape, + typename AsyncOpKernel::DoneCallback /*done*/) { + const int64_t N = indices_vec.size(); + const int64_t M = dense_output_shape.dim_size(0); + const SegmentId num_segments = input_flat.dimension(0); + const SegmentId last_segment_id_plus_one = + internal::SubtleMustCopy(segment_vec(N - 1)) + 1; + // Note: We do bounds-checking up front here so that it operates in the same + // order as the V1 implementation. + OP_REQUIRES(context, last_segment_id_plus_one <= num_segments, + errors::InvalidArgument("Invalid number of segments")); + for (int64_t i = 0; i < N; ++i) { + const Index output_idx = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(context, FastBoundsCheck(output_idx, M), + errors::InvalidArgument("Index ", output_idx, + " out of range [0, ", M, ").")); + const SegmentId segment_id = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(segment_id, num_segments), + errors::InvalidArgument("Segment id ", segment_id, + " out of range [0, ", num_segments, ").")); + } + + std::vector permutation; + permutation.reserve(N); + for (int64_t i = 0; i < N; ++i) { + permutation.push_back(i); + } + std::stable_sort( + permutation.begin(), permutation.end(), + [&](Index a, Index b) { return indices_vec(a) < indices_vec(b); }); + std::vector sorted_indices; + std::vector permuted_segments; + sorted_indices.reserve(N); + permuted_segments.reserve(N); + for (Index j : permutation) { + sorted_indices.push_back(indices_vec(j)); + permuted_segments.push_back(segment_vec(j)); + } + + // Maps indices to unique index IDs. + absl::flat_hash_map unique_indices_map; + // The unique ID for each original index. + std::vector unique_index_ids; + unique_index_ids.reserve(N); + for (Index output_idx : sorted_indices) { + auto iter = + unique_indices_map.emplace(output_idx, unique_indices_map.size()) + .first; + Index unique_id = iter->second; + unique_index_ids.push_back(unique_id); + } + const int64_t num_unique = unique_indices_map.size(); + + // The original index for each unique ID. 
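For the V2 gradient path above, the output is compacted to the unique indices: a permutation is stable-sorted by index value, each distinct index gets a dense id in first-seen order, and the segment ids are carried along in the same order before the V1 functor is invoked. A standalone sketch of that deduplication step, assuming std::vector inputs; the struct and function names are illustrative only.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

struct DedupResult {
  std::vector<int64_t> unique_indices;    // original index for each unique id
  std::vector<int64_t> unique_index_ids;  // dense id for each sorted element
  std::vector<int64_t> permuted_segments; // segment ids in the sorted order
};

DedupResult DedupIndices(const std::vector<int64_t>& indices,
                         const std::vector<int64_t>& segment_ids) {
  const int64_t n = static_cast<int64_t>(indices.size());
  std::vector<int64_t> perm(n);
  for (int64_t i = 0; i < n; ++i) perm[i] = i;
  // Stable sort keeps duplicate indices in their original relative order.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](int64_t a, int64_t b) { return indices[a] < indices[b]; });

  DedupResult r;
  std::unordered_map<int64_t, int64_t> id_of_index;
  for (int64_t j : perm) {
    auto it = id_of_index.emplace(indices[j],
                                  static_cast<int64_t>(id_of_index.size())).first;
    if (it->second == static_cast<int64_t>(r.unique_indices.size())) {
      r.unique_indices.push_back(indices[j]);  // first time this index appears
    }
    r.unique_index_ids.push_back(it->second);
    r.permuted_segments.push_back(segment_ids[j]);
  }
  return r;
}

int main() {
  DedupResult r = DedupIndices(/*indices=*/{5, 3, 5, 7}, /*segment_ids=*/{0, 0, 1, 1});
  for (int64_t v : r.unique_indices) std::cout << v << ' ';    // 3 5 7
  std::cout << '\n';
  for (int64_t v : r.unique_index_ids) std::cout << v << ' ';  // 0 1 1 2
  std::cout << '\n';
}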
+ Tensor* unique_indices = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(1, {num_unique}, &unique_indices)); + typename TTypes::Vec unique_indices_vec = + unique_indices->vec(); + for (const auto& idx_and_id : unique_indices_map) { + unique_indices_vec(idx_and_id.second) = idx_and_id.first; + } + + TensorShape output_shape = dense_output_shape; + OP_REQUIRES_OK(context, output_shape.SetDimWithStatus(0, num_unique)); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + // Call the V1 implementation with the unique/permuted indices/segments. + typename TTypes::ConstVec unique_index_ids_vec( + unique_index_ids.data(), unique_index_ids.size()); + typename TTypes::ConstVec permuted_segment_vec( + permuted_segments.data(), permuted_segments.size()); + SparseSegmentGradFunctor()( + context, operation, input_flat, unique_index_ids_vec, + permuted_segment_vec, output); + } +}; + +} // namespace functor + +// Implements the common logic for the gradients of SparseSegmentReduction +// kernels. +// +// The template parameters are: +// * Device: An Eigen device object, on which the kernel will execute. +// * T: The value type. +// * Index: The element type of the indices tensor (int32 or int64). +// * SegmentId: The element type of the segment_ids tensor (int32 or int64). +template +class SparseSegmentGradOpBase : public OpKernel { + public: + explicit SparseSegmentGradOpBase(OpKernelConstruction* context, + SparseSegmentReductionOperation operation) + : OpKernel(context), operation_(operation) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + const Tensor& output_dim0 = context->input(3); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices should be a vector.")); + OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector.")); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(output_dim0.shape()), + errors::InvalidArgument("output_dim0 should be a scalar.")); + + const int64_t N = indices.NumElements(); + OP_REQUIRES(context, N == segment_ids.NumElements(), + errors::InvalidArgument( + "segment_ids and indices should have same size.")); + const SegmentId M = internal::SubtleMustCopy(output_dim0.scalar()()); + + auto input_flat = input.flat_outer_dims(); + const auto indices_vec = indices.vec(); + const auto segment_vec = segment_ids.vec(); + + TensorShape output_shape = input.shape(); + OP_REQUIRES_OK(context, output_shape.SetDimWithStatus(0, M)); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + if (M == 0 || N == 0) return; + + functor::SparseSegmentGradFunctor()( + context, operation_, input_flat, indices_vec, segment_vec, output); + } + + private: + const SparseSegmentReductionOperation operation_; +}; + +template +class SparseSegmentSumGradOp + : public SparseSegmentGradOpBase { + public: + explicit SparseSegmentSumGradOp(OpKernelConstruction* context) + : SparseSegmentGradOpBase( + context, SparseSegmentReductionOperation::kSum) {} +}; + +template +class SparseSegmentMeanGradOp + : public SparseSegmentGradOpBase { + public: + explicit SparseSegmentMeanGradOp(OpKernelConstruction* context) + : SparseSegmentGradOpBase( + context, SparseSegmentReductionOperation::kMean) {} +}; + +template +class 
SparseSegmentSqrtNGradOp + : public SparseSegmentGradOpBase { + public: + explicit SparseSegmentSqrtNGradOp(OpKernelConstruction* context) + : SparseSegmentGradOpBase( + context, SparseSegmentReductionOperation::kSqrtN) {} +}; + +template +class SparseSegmentGradV2OpCommon { + public: + absl::Status operator()(OpKernelContext* context, + SparseSegmentReductionOperation operation, + typename AsyncOpKernel::DoneCallback done = nullptr) { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + const Tensor& dense_output_dim0 = context->input(3); + + if (!TensorShapeUtils::IsVector(indices.shape())) { + return errors::InvalidArgument("indices should be a vector."); + } + if (!TensorShapeUtils::IsVector(segment_ids.shape())) { + return errors::InvalidArgument("segment_ids should be a vector."); + } + if (!TensorShapeUtils::IsScalar(dense_output_dim0.shape())) { + return errors::InvalidArgument("dense_output_dim0 should be a scalar."); + } + + const int64_t N = indices.NumElements(); + if (N != segment_ids.NumElements()) { + return errors::InvalidArgument( + "segment_ids and indices should have same size."); + } + const int32_t M = + internal::SubtleMustCopy(dense_output_dim0.scalar()()); + TensorShape dense_output_shape = input.shape(); + TF_RETURN_IF_ERROR(dense_output_shape.SetDimWithStatus(0, M)); + + if (M == 0 || N == 0) { + TensorShape output_shape = input.shape(); + TF_RETURN_IF_ERROR(output_shape.SetDimWithStatus(0, 0)); + Tensor* output = nullptr; + TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, &output)); + Tensor* sorted_unique_indices = nullptr; + TF_RETURN_IF_ERROR(context->allocate_output(1, TensorShape({0}), + &sorted_unique_indices)); + return absl::OkStatus(); + } + + auto input_flat = input.flat_outer_dims(); + const auto indices_vec = indices.vec(); + const auto segment_vec = segment_ids.vec(); + + functor::SparseSegmentGradV2Functor()( + context, operation, input_flat, indices_vec, segment_vec, + dense_output_shape, done); + + return absl::OkStatus(); + } +}; + +template +class SparseSegmentGradV2OpBase {}; + +// The CPU implementation is synchronous. +template +class SparseSegmentGradV2OpBase + : public OpKernel { + public: + explicit SparseSegmentGradV2OpBase(OpKernelConstruction* context, + SparseSegmentReductionOperation operation) + : OpKernel(context), operation_(operation) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK( + context, (SparseSegmentGradV2OpCommon()( + context, operation_))); + } + + private: + const SparseSegmentReductionOperation operation_; +}; + +// The GPU implementation is asynchronous. 
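The wrapper above dispatches on the device type: the primary template is empty, the CPU specialization is a synchronous kernel, and the GPU specialization that follows is asynchronous and threads a completion callback through the shared logic. A minimal sketch of that specialization pattern, using invented device tags and a stand-in helper rather than the real OpKernel/AsyncOpKernel classes, to show why the common code takes an optional done callback.

#include <functional>
#include <iostream>
#include <utility>

struct CpuDevice {};  // stand-ins for the Eigen device tags
struct GpuDevice {};

using DoneCallback = std::function<void()>;

// Shared logic; the callback is only used by the asynchronous flavor.
void RunCommon(const char* who, DoneCallback done = nullptr) {
  std::cout << who << " running the common grad-v2 logic\n";
  if (done) done();  // on GPU this would fire after a stream event completes
}

template <typename Device>
class GradV2Op {};  // primary template intentionally empty

template <>
class GradV2Op<CpuDevice> {  // synchronous
 public:
  void Compute() { RunCommon("CPU"); }
};

template <>
class GradV2Op<GpuDevice> {  // asynchronous
 public:
  void ComputeAsync(DoneCallback done) { RunCommon("GPU", std::move(done)); }
};

int main() {
  GradV2Op<CpuDevice>().Compute();
  GradV2Op<GpuDevice>().ComputeAsync([] { std::cout << "done callback\n"; });
}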
+template +class SparseSegmentGradV2OpBase + : public AsyncOpKernel { + public: + explicit SparseSegmentGradV2OpBase(OpKernelConstruction* context, + SparseSegmentReductionOperation operation) + : AsyncOpKernel(context), operation_(operation) {} + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC( + context, + (SparseSegmentGradV2OpCommon()( + context, operation_, done)), + done); + } + + private: + const SparseSegmentReductionOperation operation_; +}; + +template +class SparseSegmentSumGradV2Op + : public SparseSegmentGradV2OpBase { + public: + explicit SparseSegmentSumGradV2Op(OpKernelConstruction* context) + : SparseSegmentGradV2OpBase( + context, SparseSegmentReductionOperation::kSum) {} +}; + +template +class SparseSegmentMeanGradV2Op + : public SparseSegmentGradV2OpBase { + public: + explicit SparseSegmentMeanGradV2Op(OpKernelConstruction* context) + : SparseSegmentGradV2OpBase( + context, SparseSegmentReductionOperation::kMean) {} +}; + +template +class SparseSegmentSqrtNGradV2Op + : public SparseSegmentGradV2OpBase { + public: + explicit SparseSegmentSqrtNGradV2Op(OpKernelConstruction* context) + : SparseSegmentGradV2OpBase( + context, SparseSegmentReductionOperation::kSqrtN) {} +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sendrecv_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sendrecv_ops.h new file mode 100644 index 00000000..34f27d10 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sendrecv_ops.h @@ -0,0 +1,58 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class SendOp : public OpKernel { + public: + explicit SendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + string TraceString(const OpKernelContext& ctx, bool verbose) const override; + + private: + string key_prefix_; + Rendezvous::ParsedKey parsed_key_; + bool hostmem_sendrecv_; + + SendOp(const SendOp&) = delete; + void operator=(const SendOp&) = delete; +}; + +class RecvOp : public AsyncOpKernel { + public: + explicit RecvOp(OpKernelConstruction* ctx); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + string TraceString(const OpKernelContext& ctx, bool verbose) const override; + + private: + string key_prefix_; + Rendezvous::ParsedKey parsed_key_; + bool hostmem_sendrecv_; + + RecvOp(const RecvOp&) = delete; + void operator=(const RecvOp&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sequence_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sequence_ops.h new file mode 100644 index 00000000..fc81643c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sequence_ops.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SEQUENCE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SEQUENCE_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +namespace functor { + +template +struct RangeFunctor { + void operator()(OpKernelContext* context, int64_t size, T start, T delta, + typename TTypes::Flat output) const; +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SEQUENCE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/shape_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/shape_ops.h new file mode 100644 index 00000000..d9c64c76 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/shape_ops.h @@ -0,0 +1,269 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_ + +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/variant_op_registry.h" + +namespace tensorflow { + +namespace shape_op_helpers { +inline absl::Status GetShape(OpKernelContext* ctx, int input_index, + TensorShape* shape) { + *shape = ctx->input(input_index).shape(); + return absl::OkStatus(); +} +} // namespace shape_op_helpers + +template +class ShapeOp : public OpKernel { + public: + explicit ShapeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape)); + const int rank = shape.dims(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out)); + auto vec = out->vec(); + for (int i = 0; i < rank; ++i) { + int64_t dim_size = shape.dim_size(i); + if (out->dtype() == DT_INT32) { + OP_REQUIRES( + ctx, FastBoundsCheck(dim_size, std::numeric_limits::max()), + errors::InvalidArgument("Shape output type is 32-bit ", " but dim ", + i, " is ", dim_size)); + } + vec(i) = static_cast(dim_size); + } + } + + bool IsExpensive() override { return false; } +}; + +template +class ShapeNOp : public OpKernel { + public: + explicit ShapeNOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + for (int i = 0; i < ctx->num_inputs(); ++i) { + TensorShape shape; + OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, i, &shape)); + const int dims = shape.dims(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, {dims}, &out)); + auto vec = out->vec(); + + for (int j = 0; j < dims; ++j) { + int64_t dim_size = shape.dim_size(j); + if (out->dtype() == DT_INT32) { + OP_REQUIRES( + ctx, FastBoundsCheck(dim_size, std::numeric_limits::max()), + errors::InvalidArgument("ShapeN output type is 32-bit but shape ", + i, " dim ", j, " is ", dim_size)); + } + vec(j) = static_cast(dim_size); + } + } + } + + bool IsExpensive() override { return false; } +}; + +class RankOp : public OpKernel { + public: + explicit RankOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape)); + const int rank = shape.dims(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + out->scalar()() = rank; + } + + bool IsExpensive() override { return false; } +}; + +template +class SizeOp : public OpKernel { + public: + explicit SizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape)); + const int64_t size = shape.num_elements(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + if (out->dtype() == DT_INT32) { + OP_REQUIRES( + ctx, FastBoundsCheck(size, std::numeric_limits::max()), + 
errors::InvalidArgument("Number of elements was larger than " + "representable by 32-bit output type")); + } + out->scalar()() = static_cast(size); + } + + bool IsExpensive() override { return false; } +}; + +template +class ExpandDimsOp : public OpKernel { + public: + explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& input_t = ctx->input(0); + OP_REQUIRES(ctx, input_t.dtype() != DT_VARIANT, + errors::InvalidArgument("ExpandDims on Variant not supported")); + + const Tensor& dim_t = ctx->input(1); + OP_REQUIRES( + ctx, (dim_t.NumElements() == 1), + errors::InvalidArgument("'dim' must be a tensor with a single value")); + DCHECK_EQ(dim_t.dtype(), DataTypeToEnum::v()); + Tdim dim = *static_cast(DMAHelper::base(&dim_t)); + const TensorShape& input_shape = input_t.shape(); + int input_dims = input_shape.dims(); + OP_REQUIRES(ctx, dim >= -1 - input_dims && dim <= input_dims, + errors::InvalidArgument("Tried to expand dim index ", dim, + " for tensor with ", input_dims, + " dimensions.")); + + // We emulate numpy's interpretation of the dim axis when + // -input.dims() >= dim <= input.dims(). + if (dim < 0) { + // Clamp to the end if needed. + dim = std::min(dim + input_dims + 1, input_dims); + } + + // Compute new shape with an additional dimension. + absl::InlinedVector output_shape_vec(input_dims + 1); + for (int64_t i = 0; i < dim; ++i) { + output_shape_vec[i] = input_shape.dim_size(i); + } + output_shape_vec[dim] = 1; + for (int64_t i = dim + 1; i < input_dims + 1; ++i) { + output_shape_vec[i] = input_shape.dim_size(i - 1); + } + TensorShape output_shape(output_shape_vec); + + Tensor output_t; + if (!output_t.CopyFrom(input_t, output_shape)) { + // This should never happen, since the sizes of the input and output + // should always be the same (we only expand the dimension with 1). + ctx->SetStatus( + errors::Internal("Could not expand dimension with input shape ", + ctx->input(0).shape().DebugString(), + " and output shape ", output_shape.DebugString())); + } + ctx->set_output(0, std::move(output_t)); + } + + bool IsExpensive() override { return false; } +}; + +class SqueezeOp : public OpKernel { + public: + explicit SqueezeOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + std::vector squeeze_dims; + OP_REQUIRES_OK(ctx, ctx->GetAttr("squeeze_dims", &squeeze_dims)); + squeeze_dims_.insert(squeeze_dims.begin(), squeeze_dims.end()); + } + + void Compute(OpKernelContext* ctx) override { + OP_REQUIRES(ctx, ctx->input(0).dtype() != DT_VARIANT, + errors::InvalidArgument("Squeeze on Variant not supported")); + + auto existing_dims = ctx->input(0).shape().dim_sizes(); + const int existing_dims_size = static_cast(existing_dims.size()); + std::vector new_shape; + + std::unordered_set wrapped_squeeze_dims; + wrapped_squeeze_dims.reserve(squeeze_dims_.size()); + // Validate squeeze dims against the input. + for (int32_t dim : squeeze_dims_) { + OP_REQUIRES( + ctx, (dim >= -ctx->input(0).dims() && dim < ctx->input(0).dims()), + errors::InvalidArgument("Tried to squeeze dim index ", dim, + " for tensor with ", ctx->input(0).dims(), + " dimensions.")); + // If dim is < 0, we wrap around (-1 means the last element). + if (dim < 0) { + dim = existing_dims_size + dim; + } + + wrapped_squeeze_dims.insert(dim); + } + + for (int i = 0; i < existing_dims_size; ++i) { + auto existing_dim = existing_dims[i]; + + // If squeeze_set is non-empty, only squeeze those dimensions. 
+ if (!wrapped_squeeze_dims.empty()) { + if (wrapped_squeeze_dims.count(i) > 0) { + OP_REQUIRES(ctx, existing_dim == 1, + errors::InvalidArgument( + "Can not squeeze dim[", i, + "], expected a dimension of 1, got ", existing_dim)); + } else { + // This dimension is not being squeezed. + new_shape.push_back(existing_dim); + } + } else { + // Copy over all non-1-length dimensions. + if (existing_dim != 1) { + new_shape.push_back(existing_dim); + } + } + } + + const TensorShape output_shape(new_shape); + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output)); + if (!output->CopyFrom(ctx->input(0), output_shape)) { + // This should never happen, since the sizes of the input and + // output should always be the same. + ctx->SetStatus(errors::Internal("Could not squeeze input with shape ", + ctx->input(0).shape().DebugString(), + " and output shape ", + output_shape.DebugString())); + } + } + + bool IsExpensive() override { return false; } + + private: + std::unordered_set squeeze_dims_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/shuffle_common.h b/third_party/tflite-hdrs/tensorflow/core/kernels/shuffle_common.h new file mode 100644 index 00000000..0eea7fd4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/shuffle_common.h @@ -0,0 +1,102 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Common utilities for random shuffling. + +#ifndef TENSORFLOW_CORE_KERNELS_SHUFFLE_COMMON_H_ +#define TENSORFLOW_CORE_KERNELS_SHUFFLE_COMMON_H_ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" + +namespace tensorflow { + +// TODO(irving): If performance is critical, generate output directly instead +// of an in-place shuffle using a pseudorandom permutation like +// +// https://github.com/otherlab/geode/blob/master/geode/random/permute.cpp +// +// This is probably also the right thing if we want a GPU version of shuffling. + +// We use our own version of std::random_shuffle to guarantee that exactly +// size - 1 samples are used. 
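The comment above promises that the hand-rolled shuffle draws exactly size - 1 random samples: it is a Fisher-Yates pass that swaps position i with a uniform pick from the remaining tail and stops one element short of the end. A standalone sketch with a counting RNG wrapper to make the sample count visible; it assumes std::mt19937 rather than the Philox generator the kernel uses.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <random>
#include <utility>
#include <vector>

int main() {
  std::mt19937 gen(42);
  int64_t samples = 0;
  // uniform(n) returns a value in [0, n), counting how often it is called.
  auto uniform = [&](uint64_t n) -> uint64_t {
    ++samples;
    return std::uniform_int_distribution<uint64_t>(0, n - 1)(gen);
  };

  std::vector<int> v{0, 1, 2, 3, 4, 5, 6, 7};
  // Fisher-Yates: swap element i with a uniformly chosen element in [i, end).
  for (size_t i = 0; i + 1 < v.size(); ++i) {
    std::swap(v[i], v[i + uniform(v.size() - i)]);
  }

  assert(samples == static_cast<int64_t>(v.size()) - 1);
  for (int x : v) std::cout << x << ' ';
  std::cout << "\n(" << samples << " samples drawn)\n";
}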
+template +static inline void ShuffleRange(Iter first, Iter last, Random& uniform) { + if (first == last) return; + const auto stop = last - 1; + for (auto i = first; i != stop; ++i) { + using std::iter_swap; + iter_swap(i, i + uniform(last - i)); + } +} + +template +static void IndexedShuffle(const int64_t size, const InT& input_mat, + OutT output_mat, Random& uniform) { + std::vector permutation(size); + for (IntT i = 0; i < size; i++) { + permutation[i] = i; + } + ShuffleRange(permutation.begin(), permutation.end(), uniform); + for (IntT i = 0; i < size; i++) { + output_mat.template chip<0>(i) = input_mat.template chip<0>(permutation[i]); + } +} + +template +absl::Status RandomShuffle( + OpKernelContext* context, const Tensor& input, int output_idx, + std::function get_rng) { + if (input.NumElements() <= 1 || input.dim_size(0) <= 1) { + // No shuffling is required, so copy input directly to output + context->set_output(output_idx, input); + } else { + // Reserve enough random samples for shuffling + const int64_t size = input.dim_size(0); + const int64_t samples = size - 1; + auto rng = get_rng(samples); + random::SingleSampleAdapter single(&rng); + const auto uniform = [&single](uint32 n) { return single() % n; }; + + if (input.dims() == 1) { + // For 1D data, copy and then shuffle in place + context->set_output(output_idx, tensor::DeepCopy(input)); + auto vec = context->mutable_output(output_idx)->vec(); + ShuffleRange(vec.data(), vec.data() + size, uniform); + } else { + // For >= 2D, shuffle indices and then copy across + Tensor* output = nullptr; + TF_RETURN_IF_ERROR( + context->allocate_output(output_idx, input.shape(), &output)); + const auto input_mat = input.flat_outer_dims(); + auto output_mat = output->flat_outer_dims(); + if (size < kint32max) { + IndexedShuffle(size, input_mat, output_mat, uniform); + } else { + IndexedShuffle(size, input_mat, output_mat, uniform); + } + } + } + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SHUFFLE_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op.h new file mode 100644 index 00000000..1992c604 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op.h @@ -0,0 +1,45 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_OP_H_ + +// Functor definition for SliceOp, must be compilable by nvcc. 
+ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct Slice { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& slice_sizes) { + MaybeWith32BitIndexing( + [&](auto output32, auto input32, auto slice_indices32, + auto slice_sizes32) { + output32.device(d) = input32.slice(slice_indices32, slice_sizes32); + }, + output, input, slice_indices, slice_sizes); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SLICE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op_cpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op_cpu_impl.h new file mode 100644 index 00000000..9eda840a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/slice_op_cpu_impl.h @@ -0,0 +1,39 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/slice_op.h" + +namespace tensorflow { + +using CpuDevice = Eigen::ThreadPoolDevice; + +#define DEFINE_CPU_KERNELS(T) \ + template struct functor::Slice; + +TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS); + +#undef DEFINE_CPU_KERNELS + + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/smooth-hinge-loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/smooth-hinge-loss.h new file mode 100644 index 00000000..8dc2c806 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/smooth-hinge-loss.h @@ -0,0 +1,114 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_ + +#include + +#include "tensorflow/core/kernels/loss.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class SmoothHingeLossUpdater : public DualLossUpdater { + public: + // Computes the updated dual variable (corresponding) to a single example. The + // updated dual value maximizes the objective function of the dual + // optimization problem associated with smooth hinge loss. The computations + // are detailed in readme.md. + double ComputeUpdatedDual(const int num_partitions, const double label, + const double example_weight, + const double current_dual, const double wx, + const double weighted_example_norm) const final { + // Intuitively there are 3 cases: + // a. new optimal value of the dual variable falls within the admissible + // range [0, 1]. In this case we set new dual to this value. + // b. new optimal value is < 0. Then, because of convexity, the optimal + // valid value for new dual = 0 + // c. new optimal value > 1.0. Then new optimal value should be set to 1.0. + const double candidate_optimal_dual = + current_dual + + (label - wx - gamma * current_dual) / + (num_partitions * example_weight * weighted_example_norm + gamma); + if (label * candidate_optimal_dual < 0) { + return 0.0; + } + if (label * candidate_optimal_dual > 1.0) { + return label; + } + return candidate_optimal_dual; + } + + double ComputeDualLoss(const double current_dual, const double example_label, + const double example_weight) const final { + // For binary classification, there are 2 conjugate functions, one per + // label value (-1 and 1). + const double y_alpha = current_dual * example_label; // y \alpha + if (y_alpha < 0 || y_alpha > 1.0) { + return std::numeric_limits::max(); + } + return (-y_alpha + 0.5 * gamma * current_dual * current_dual) * + example_weight; + } + + double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const final { + const double y_wx = example_label * wx; + if (y_wx >= 1) return 0; + if (y_wx <= 1 - gamma) return (1 - y_wx - gamma / 2) * example_weight; + return (1 - y_wx) * (1 - y_wx) * example_weight * 0.5 / gamma; + } + + // Converts binary example labels from 0.0 or 1.0 to -1.0 or 1.0 respectively + // as expected by smooth hinge loss. + absl::Status ConvertLabel(float* const example_label) const final { + if (*example_label == 0.0) { + *example_label = -1; + return absl::OkStatus(); + } + if (*example_label == 1.0) { + return absl::OkStatus(); + } + return errors::InvalidArgument( + "Only labels of 0.0 or 1.0 are supported right now. 
" + "Found example with label: ", + *example_label); + } + + double PrimalLossDerivative(const double wx, const double label, + const double example_weight) const final { + if (label * wx >= 1) { + return 0; + } + if (label * wx <= 1 - gamma) { + return -label; + } + return (wx - label) / gamma; + } + + double SmoothnessConstant() const final { return gamma; } + + private: + // Smoothness constant of smooth hinge loss + // TODO(sibyl-Aix6ihai): expose this parameter + const double gamma = 1; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_ +// TENSORFLOW_KERNELS_SMOOTH_HINGE_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/snapshot_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/snapshot_op.h new file mode 100644 index 00000000..1047b470 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/snapshot_op.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif + +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SnapshotOp. +template +struct Snapshot { + void operator()(const Device& device, + typename TTypes::ConstTensor input, + typename TTypes::Tensor output) { + device.memcpy(output.data(), input.data(), input.size() * sizeof(Scalar)); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/softmax_op_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/softmax_op_functor.h new file mode 100644 index 00000000..2ce16ce8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/softmax_op_functor.h @@ -0,0 +1,95 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_ +// Functor definition for SoftmaxOp, must be compilable by nvcc. 
+ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftmaxOp to do the computations. +template +struct SoftmaxFunctor { + // Computes Softmax or LogSoftmax activation. + // + // logits: dim: batch_size, num_classes. + // softmax: dims: batch_size, num_classes. + // log: boolean + void operator()(const Device& d, typename TTypes::ConstMatrix logits, + typename TTypes::Matrix softmax, const bool log); +}; + +// Eigen code implementing SoftmaxFunctor::operator() or +// LogSoftmaxFunctor::operator(). +// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template +struct SoftmaxEigenImpl { + static void Compute(const Device& d, typename TTypes::ConstMatrix logits, + typename TTypes::Matrix softmax, const bool log) { + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. + Eigen::IndexList > along_class; + Eigen::IndexList > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList, int> one_by_class; + one_by_class.set(1, num_classes); + + // shifted_logits = logits - max(logits along classes); + auto shifted_logits = (logits - logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + if (log) { + // Calculate the log of the softmax + // softmax = logits - max(logits along classes); + softmax.device(d) = shifted_logits; + // softmax = softmax - log(sum(exp(softmax along classes))); + softmax.device(d) = (softmax - softmax.exp() + .sum(along_class) + .log() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } else { + // NOTE(touts): If you modify this implementation please run + // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc. + // + // softmax = exp(logits - max(logits along classes)); + softmax.device(d) = shifted_logits.exp(); + // softmax = softmax * (1 / sum(softmax along classes)); + softmax.device(d) = (softmax * softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/softplus_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/softplus_op.h new file mode 100644 index 00000000..1fa271a6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/softplus_op.h @@ -0,0 +1,79 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_ +// Functor definition for SoftplusOp and SoftplusGradOp, must be compilable by +// nvcc. + +// clang-format off +#include "tensorflow/core/platform/bfloat16.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +// clang-format on +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftplusOp to do the computations. +template +struct Softplus { + // Computes Softplus activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + // Choose a threshold on x below which exp(x) may underflow + // when added to 1, but for which exp(x) is always within epsilon of the + // true softplus(x). Offset of 2 from machine epsilon checked + // experimentally for float16, float32, float64. Checked against + // softplus implemented with numpy's log1p and numpy's logaddexp. + static const T threshold = + Eigen::numext::log(Eigen::NumTraits::epsilon()) + T(2); + // Value above which exp(x) may overflow, but softplus(x) == x + // is within machine epsilon. + auto too_large = features > features.constant(-threshold); + // Value below which exp(x) may underflow, but softplus(x) == exp(x) + // is within machine epsilon. + auto too_small = features < features.constant(threshold); + auto features_exp = features.exp(); + activations.device(d) = too_large.select( + features, // softplus(x) ~= x for x large + too_small.select(features_exp, // softplus(x) ~= exp(x) for x small + features_exp.log1p())); + } +}; + +// Functor used by SoftplusGradOp to do the computations. +template +struct SoftplusGrad { + // Computes SoftplusGrad backprops. + // + // gradients: gradients backpropagated to the Softplus op. + // features: inputs that where passed to the Softplus op. + // backprops: gradients to backpropagate to the Softplus inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, + typename TTypes::Tensor backprops) { + backprops.device(d) = + gradients / ((-features).exp() + features.constant(T(1))); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/softsign_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/softsign_op.h new file mode 100644 index 00000000..15de7288 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/softsign_op.h @@ -0,0 +1,60 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
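The Softplus functor above picks between three regimes around a threshold derived from machine epsilon: for large x it returns x (softplus(x) is x to within epsilon), for very negative x it returns exp(x), and otherwise log1p(exp(x)); the gradient is the logistic sigmoid. A standalone double-precision sketch of the same branching; the helper names are illustrative.

#include <cmath>
#include <iostream>
#include <limits>

// Numerically stable softplus(x) = log(1 + exp(x)).
double Softplus(double x) {
  // Below this threshold exp(x) underflows when added to 1; above its negation
  // softplus(x) equals x to within machine epsilon.
  static const double threshold =
      std::log(std::numeric_limits<double>::epsilon()) + 2.0;
  if (x > -threshold) return x;            // softplus(x) ~= x for large x
  if (x < threshold) return std::exp(x);   // softplus(x) ~= exp(x) for small x
  return std::log1p(std::exp(x));
}

// d/dx softplus(x) = sigmoid(x) = 1 / (1 + exp(-x)), scaled by the upstream grad.
double SoftplusGrad(double upstream_grad, double x) {
  return upstream_grad / (std::exp(-x) + 1.0);
}

int main() {
  for (double x : {-800.0, -1.0, 0.0, 1.0, 800.0}) {
    std::cout << "softplus(" << x << ") = " << Softplus(x)
              << ", grad = " << SoftplusGrad(1.0, x) << "\n";
  }
}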
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_ +// Functor definition for SoftsignOp and SoftsignGradOp, must be compilable by +// nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftsignOp to do the computations. +template +struct Softsign { + // Computes Softsign activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + activations.device(d) = + features / (features.abs() + features.constant(T(1))); + } +}; + +// Functor used by SoftsignGradOp to do the computations. +template +struct SoftsignGrad { + // Computes SoftsignGrad backprops. + // + // gradients: gradients backpropagated to the Softsign op. + // features: inputs that were passed to the Softsign op. + // backprops: gradients to backpropagate to the Softsign inputs. + void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, + typename TTypes::Tensor backprops) { + backprops.device(d) = + gradients / (features.abs() + features.constant(T(1))).square(); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/spacetobatch_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/spacetobatch_functor.h new file mode 100644 index 00000000..7838b5e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/spacetobatch_functor.h @@ -0,0 +1,114 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Maximum number of non-collapsible blocked dimensions supported by the +// {SpaceToBatch,BatchToSpace}ND operation. To change the limit, modify this +// constant and the TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS macro definition +// below. +constexpr int kMaxSpaceToBatchBlockDims = 4; + +// Expands to: +// MACRO(1, ## __VA_ARGS__) +// ... +// MACRO(kMaxSpaceToBatchBlockDims, ## __VA_ARGS__) +// +// Note: The space between the number and the comma is necessary for proper GCC +// comma handling: https://gcc.gnu.org/onlinedocs/cpp/Variadic-Macros.html +#define TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(MACRO, ...) 
\ + MACRO(1 /**/, ##__VA_ARGS__) \ + MACRO(2 /**/, ##__VA_ARGS__) \ + MACRO(3 /**/, ##__VA_ARGS__) \ + MACRO(4 /**/, ##__VA_ARGS__) \ + /**/ + +namespace internal { +namespace spacetobatch { + +template +void SubtleMustCopyFlatHelper(const Tensor& t, OutputType* output) { + const int64_t num_elements = t.shape().num_elements(); + output->resize(num_elements); + auto eigen_vec = t.flat(); + for (int64_t i = 0; i < num_elements; ++i) { + (*output)[i] = SubtleMustCopy(eigen_vec(i)); + } +} + +// Copies flat contents of `t` to std::vector-like `*output`, which is resized +// as needed. `OutputType` may be either `std::vector` or +// `gtl::InlinedVector`. +// +// Precondition: t.dtype() must be either DT_INT32 or DT_INT64. +template +void SubtleMustCopyFlat(const Tensor& t, OutputType* output) { + if (t.dtype() == DT_INT32) { + SubtleMustCopyFlatHelper(t, output); + } else { + SubtleMustCopyFlatHelper(t, output); + } +} + +} // namespace spacetobatch +} // namespace internal + +namespace functor { + +// Functor used by {SpaceToBatch,BatchToSpace}{ND,}Op to do the conversion. +// +// If B2S is false, then this performs the space-to-batch conversion. If B2S is +// true, then this performs the inverse batch-to-space conversion. +template +struct SpaceToBatchFunctor { + using InputT = typename std::conditional::type; + using OutputT = typename std::conditional::type; + // Implements the space to batch conversion. + // + // space_tensor: input tensor of space-to-batch operation. If B2S = false, + // then this is the input to the conversion. If B2S = true, then this + // is the output of the conversion. + // block_size: array of shape [NUM_BLOCK_DIMS] specifying the block sizes for + // dimensions 1 through NUM_BLOCK_DIMS. + // paddings: row-major array of shape [NUM_BLOCK_DIMS, 2] specifying the + // start and end padding for dimensions 1 through NUM_BLOCK_DIMS. + // batch_tensor: output tensor of the space-to-batch operation. If + // B2S = false, then this is the output of the conversion. If B2S = true, + // then this is the input to the conversion. + // + // The caller must ensure that the dimensions of the tensors are correct. + absl::Status operator()( + const Device& d, + typename TTypes::Tensor space_tensor, + const int64_t block_shape[NUM_BLOCK_DIMS], + const int64_t paddings[NUM_BLOCK_DIMS * 2], + typename TTypes::Tensor batch_tensor); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/spacetodepth_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/spacetodepth_op.h new file mode 100644 index 00000000..3cb1df5b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/spacetodepth_op.h @@ -0,0 +1,57 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ +// Functor definition for XentOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SpaceToDepthOp to do the computations. +// Implements a family of Space to Depth transforms for a 4D 'input' tensor +// to a 4D 'output' tensor, both tensors use type 'T' and layout 'data_format'. +// These transforms divide the vertical and horizontal image sizes by +// 'block_size', and multiply the depth dimension size by +// (block_size * block_size). The offset within each block_size * block_size +// patch within the image is combined with the input channel index to form +// the output channel index, with the Y, X coordinates within each block of +// the input image used as the high order component of the output channel. +// e.g. for data_format = NHWC: +// Each element in the input tensor can be specified via 6 coordinates, +// ordered by decreasing memory layout significance as: +// n,oY,bY,oX,bX,iC (where n=batch index, oX, oY means X or Y coordinates +// within the output image, bX, bY means coordinates +// within the input block, iC means input channels). +// The output would be a transpose to the following layout: +// n,oY,oX,bY,bX,iC +template +struct SpaceToDepthOpFunctor { + void operator()(const Device& d, typename TTypes::ConstTensor input, + int block_size, typename TTypes::Tensor output); + + // This 5-D version is to support NCHW_VECT_C. + void operator()(const Device& d, typename TTypes::ConstTensor input, + int block_size, typename TTypes::Tensor output); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/kernels.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/kernels.h new file mode 100644 index 00000000..aff14ca0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/kernels.h @@ -0,0 +1,257 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
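The functor described above treats SpaceToDepth as a pure re-indexing: each block_size x block_size spatial patch is folded into the channel dimension, so for NHWC the output channel is (bY * block_size + bX) * in_channels + iC. A standalone NHWC sketch over flat std::vector storage; it assumes H and W are divisible by the block size and is not the TF kernel.

#include <cstdint>
#include <iostream>
#include <vector>

// NHWC space-to-depth: [N, H, W, C] -> [N, H/bs, W/bs, bs*bs*C].
std::vector<float> SpaceToDepthNHWC(const std::vector<float>& in, int64_t n,
                                    int64_t h, int64_t w, int64_t c, int64_t bs) {
  const int64_t oh = h / bs, ow = w / bs, oc = bs * bs * c;
  std::vector<float> out(n * oh * ow * oc);
  for (int64_t b = 0; b < n; ++b)
    for (int64_t y = 0; y < h; ++y)
      for (int64_t x = 0; x < w; ++x)
        for (int64_t ch = 0; ch < c; ++ch) {
          const int64_t oy = y / bs, by = y % bs;  // output row, row in block
          const int64_t ox = x / bs, bx = x % bs;  // output col, col in block
          const int64_t out_ch = (by * bs + bx) * c + ch;
          out[((b * oh + oy) * ow + ox) * oc + out_ch] =
              in[((b * h + y) * w + x) * c + ch];
        }
  return out;
}

int main() {
  // A 1x2x2x1 input with block_size 2 becomes 1x1x1x4: [1, 2, 3, 4].
  std::vector<float> in{1, 2, 3, 4};
  for (float v : SpaceToDepthNHWC(in, 1, 2, 2, 1, 2)) std::cout << v << ' ';
  std::cout << '\n';
}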
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_KERNELS_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_KERNELS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/sparse/sparse_matrix.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +// Calculates number of nonzero entries per batch of a sorted rank-3 +// SparseTensor's indices. indices is expected to have columns +// corresponding to [batch, row, column], where indices[:,0] < B. +// +// REQUIRES: +// indices.dimension(1) == 3 +// nnz_per_batch.dimension(0) == B +template +struct CalculateNNZPerBatchMatrixFromIndices { + absl::Status operator()(OpKernelContext* c, + TTypes::ConstMatrix indices, + TTypes::Vec nnz_per_batch); +}; + +// Split a subset of a SparseTensors' indices into two vectors: +// COO row inds and COO col inds. Outputs are: +// +// coo_row_ind = indices[:, row_dim] +// coo_col_ind = indices[:, row_dim + 1] +// +// where n = coo_row_ind.size() +// and row_dim = #cols(indices) - 1 +// +// REQUIRES: +// host_dense_shape.size() in [2, 3] +// indices.dim_size(1) == host_dense_shape.size() +// coo_row_ind.size() == coo_col_ind.size() +// coo_row_ind.size() == indices.dim_size(0) +template +struct SparseTensorToCOOSparseMatrix { + void operator()(const Device& d, TTypes::ConstVec host_dense_shape, + TTypes::ConstMatrix indices, + TTypes::Vec coo_row_ind, + TTypes::Vec coo_col_ind); +}; + +// Write coo batch, row, and column vectors to output matrix indices: +// +// indices[:, row_dim] = coo_row_ind +// indices[:, col_dim] = coo_col_ind +// +// where row_dim = #cols(indices) - 1 and n = coo_row_ind.size(). +// In addition, if #cols(indices) == 3, also store the batch: +// +// indices[i, 0] = batch_of(i) where +// host_batch_ptrs(batch_of(i)) <= i < host_batch_ptrs(batch_of(i) + 1) +// +// REQUIRES: +// +// host_dense_shape.size() in [2, 3] +// indices.dim_size(1) == host_dense_shape.size() +// host_batch_ptr.size() == +// coo_row_ind.size() == coo_col_ind.size() +// +template +struct COOSparseMatrixToSparseTensor { + absl::Status operator()(OpKernelContext* c, + TTypes::ConstVec host_dense_shape, + TTypes::ConstVec host_batch_ptrs, + TTypes::Vec coo_row_ind, + TTypes::ConstVec coo_col_ind, + TTypes::Matrix indices); +}; + +// Convert a vector of coo row indices to csr row pointers. +// +// REQUIRES: +// +// csr_row_ptr.size() == rows + 1. +// max(coo_row_ptr) < rows. +// +template +struct COOSparseMatrixToCSRSparseMatrix { + absl::Status operator()(OpKernelContext* c, const int rows, const int cols, + TTypes::UnalignedVec coo_row_ind, + TTypes::UnalignedVec csr_row_ptr); +}; + +// Convert a matrix of (batched) coo row and column indices to CSR SparseMatrix +// batch ptrs, csr row pointers and coo column indices. +// +// REQUIRES: +// batch_ptr.size() == batch_size + 1 +// csr_row_ptr.size() == batch_size * (num_rows + 1) +// csr_col_ind.size() == total_nnz +// batch_size == 1 if rank == 2 +// +// where +// total_nnz = indices.dim_size(0) +// rank = indices.dim_size(1) +// Also csr_row_ptr should be initially filled with zeros. 
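The CPU functor described above turns (possibly batched) COO indices into CSR form: count the nonzeros that land in each row, prefix-sum the counts into row pointers, then scatter the column indices. A standalone single-batch sketch, assuming (row, col) index pairs sorted in row-major order and a zero-filled row-pointer array, as the comment requires; names are illustrative.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Builds csr_row_ptr (size num_rows + 1) and csr_col_ind from COO indices.
void CooToCsr(const std::vector<std::pair<int64_t, int64_t>>& coo,  // (row, col)
              int64_t num_rows, std::vector<int32_t>& csr_row_ptr,
              std::vector<int32_t>& csr_col_ind) {
  csr_row_ptr.assign(num_rows + 1, 0);  // must start zero-filled
  csr_col_ind.resize(coo.size());
  // Pass 1: histogram of nonzeros per row, shifted by one slot.
  for (const auto& rc : coo) ++csr_row_ptr[rc.first + 1];
  // Pass 2: prefix sum turns the counts into row offsets.
  for (int64_t r = 0; r < num_rows; ++r) csr_row_ptr[r + 1] += csr_row_ptr[r];
  // Pass 3: scatter column indices into their row ranges.
  std::vector<int32_t> next(csr_row_ptr.begin(), csr_row_ptr.end() - 1);
  for (const auto& rc : coo) {
    csr_col_ind[next[rc.first]++] = static_cast<int32_t>(rc.second);
  }
}

int main() {
  // A 3x4 matrix with nonzeros at (0,1), (0,3), (2,0).
  std::vector<std::pair<int64_t, int64_t>> coo{{0, 1}, {0, 3}, {2, 0}};
  std::vector<int32_t> row_ptr, col_ind;
  CooToCsr(coo, /*num_rows=*/3, row_ptr, col_ind);
  for (int v : row_ptr) std::cout << v << ' ';  // 0 2 2 3
  std::cout << "| ";
  for (int v : col_ind) std::cout << v << ' ';  // 1 3 0
  std::cout << '\n';
}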
+// +struct SparseTensorToCSRSparseMatrixCPUFunctor { + absl::Status operator()(int64_t batch_size, int num_rows, int num_cols, + TTypes::ConstMatrix indices, + TTypes::Vec batch_ptr, + TTypes::Vec csr_row_ptr, + TTypes::Vec csr_col_ind); +}; + +// Convert a vector of csr row pointers to coo row indices. +// +// REQUIRES: +// +// coo_row_ptr.size() == nnz. +// csr_row_ptr[-1] == nnz. +// +template +struct CSRSparseMatrixToCOOSparseMatrix { + absl::Status operator()(OpKernelContext* c, + TTypes::UnalignedConstVec csr_row_ptr, + TTypes::UnalignedVec coo_row_ind); +}; + +// Calculates C = matmul(A, B) or C = matmul(A, B)^T, where A is in CSR format +// and B and C are dense. +template +struct CSRSparseMatrixMatMul { + explicit CSRSparseMatrixMatMul(const bool transpose_output); + absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + typename TTypes::ConstMatrix b, + typename TTypes::Matrix c); +}; + +// Calculates y = A * x, y = A^T * x, or y = A^H * x, where A is in CSR format +// and x and y are dense vectors. +template +class CSRSparseMatrixMatVec { + CSRSparseMatrixMatVec(bool transpose_a, bool adjoint_a); + absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + const T* x, T* y); +}; + +// Calculates C = functor(A, B) where A and B are CSR and C is CSR +// with a different sparsity pattern. +template +struct CSRStructureModifyingFunctor { + virtual ~CSRStructureModifyingFunctor() {} + + virtual absl::Status Initialize() = 0; + + virtual absl::Status GetWorkspaceSize(const ConstCSRComponent& a, + const ConstCSRComponent& b, + size_t* bufferSize) = 0; + + virtual absl::Status GetOutputStructure(const ConstCSRComponent& a, + const ConstCSRComponent& b, + TTypes::UnalignedVec c_row_ptr, + int* output_nnz, void* workspace) = 0; + + virtual absl::Status Compute(const ConstCSRComponent& a, + const ConstCSRComponent& b, + CSRComponent* c, void* workspace) = 0; +}; + +// Calculates C = alpha * A + beta * B, where A and B are in CSR +// format, and alpha and beta are scalars on the host. +template +struct CSRSparseMatrixAdd : public CSRStructureModifyingFunctor { + explicit CSRSparseMatrixAdd(OpKernelContext* ctx, const T alpha, + const T beta); +}; + +// Calculates C = matmul(A, B), where A, B, and C are in CSR format. +template +struct CSRSparseSparseMatrixMatMul + : public CSRStructureModifyingFunctor { + explicit CSRSparseSparseMatrixMatMul(OpKernelContext* ctx, bool transpose_a, + bool transpose_b); +}; + +// Calculates Y = transpose(X) where X and Y are CSR format components. +template +struct CSRSparseMatrixTransposeComponent { + absl::Status operator()(OpKernelContext* ctx, const ConstCSRComponent& x, + CSRComponent* y); +}; + +// Calculates Y = transpose(X) where X and Y are in CSR format. +template +struct CSRSparseMatrixTranspose { + absl::Status operator()(OpKernelContext* ctx, bool conjugate, + const CSRSparseMatrix& input_matrix, + CSRSparseMatrix* output_matrix); +}; + +// Calculates Y = softmax(X) where X and Y are in CSR format; +// missing coefficients in X are treates as -inf (logits of 0 probability). 
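Because absent coefficients act as -inf (probability 0), the softmax declared just below reduces to an independent softmax over the stored values of each row. A rough per-row sketch under that reading, on plain arrays rather than the CSRSparseMatrix type (illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: softmax over the stored entries of each CSR row, in place.
void CsrRowSoftmax(const std::vector<int>& row_ptr, std::vector<float>& values) {
  for (size_t r = 0; r + 1 < row_ptr.size(); ++r) {
    const int begin = row_ptr[r], end = row_ptr[r + 1];
    if (begin == end) continue;  // empty row: nothing stored, nothing to normalize
    const float row_max =
        *std::max_element(values.begin() + begin, values.begin() + end);
    float sum = 0.0f;
    for (int i = begin; i < end; ++i)
      sum += (values[i] = std::exp(values[i] - row_max));  // stable exponentials
    for (int i = begin; i < end; ++i) values[i] /= sum;
  }
}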
+template +struct CSRSparseMatrixSoftmax { + absl::Status operator()(OpKernelContext* ctx, const CSRSparseMatrix& logits, + typename TTypes::Vec softmax_values); +}; + +template +struct CSRSparseMatrixSoftmaxGrad { + absl::Status operator()(OpKernelContext* ctx, const CSRSparseMatrix& softmax, + const CSRSparseMatrix& grad_softmax, + typename TTypes::Vec gradient_values); +}; + +template +class CSRSparseMatrixMulScalar { + public: + explicit CSRSparseMatrixMulScalar() {} + + absl::Status Compute(OpKernelContext* ctx, const CSRSparseMatrix& a, + typename TTypes::ConstScalar b, CSRSparseMatrix* c); +}; + +template +class CSRSparseMatrixBatchMulVec { + public: + explicit CSRSparseMatrixBatchMulVec() {} + + absl::Status Compute(OpKernelContext* ctx, const CSRSparseMatrix& a, + typename TTypes::ConstFlat b, CSRSparseMatrix* c); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/mat_mul_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/mat_mul_op.h new file mode 100644 index 00000000..3e55cfbc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/mat_mul_op.h @@ -0,0 +1,1018 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_MAT_MUL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_MAT_MUL_OP_H_ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif + +#include "Eigen/Core" // from @eigen_archive +#include "Eigen/SparseCore" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/cwise_ops_common.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/sparse/kernels.h" +#include "tensorflow/core/kernels/sparse/sparse_matrix.h" +#include "tensorflow/core/kernels/sparse/transpose_op.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/threadpool.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/util/cuda_sparse.h" +#include "tensorflow/core/util/gpu_solvers.h" +#endif + +namespace tensorflow { + +// TODO(anudhyan): These constants may be tuned based on the performance of +// 'benchmark_sparse_matrix_mat_vec_mul'. We would like to find constants +// which work across hardware platforms for typical matrix sizes. 
It should be +// possible to observe at least 30-50% improvement as we increase the number +// of threads by 1. If not, then it may we worth increasing kMaxShards and +// kNumShardsPerThread. However, once we have too many shards, latency may be +// dominated by per-shard overhead. +// +// Maximum number of shards into which to divide the computation for each CSR +// Sparse Matrix instance. +static constexpr int32_t kMaxShards = 20; +// Number of shards allocated to each thread. +static constexpr int32_t kNumShardsPerThread = 3; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// Abstract OpKernel to compute sparse-dense matrix multiplication. +// +// Implements a kernel which, given a SparseMatrix `a` and dense Tensor `b`, +// computes a dense Tensor `c` satisfying `c = a * b` where * denotes matrix +// multiplication. +// +// The boolean attributes `transpose_a` and `adjoint_a` will transpose or +// adjoint `a` before multiplication, respectively. At most one of these +// attributes must be set to True. Corresponding attributes will transpose or +// adjoint `b` or the output (after multiplication). +// +// The rank of both `a` and `b` must be equal and their shapes must be +// compatible for matrix multiplication. Otherwise, InvalidArgument runtime +// errors will be thrown. Only rank 2 or rank 3 inputs are supported. +// +template +class CSRMatMulOp : public OpKernel { + public: + explicit CSRMatMulOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("transpose_a", &transpose_a_)); + OP_REQUIRES_OK(c, c->GetAttr("transpose_b", &transpose_b_)); + bool adjoint_a; + OP_REQUIRES_OK(c, c->GetAttr("adjoint_a", &adjoint_a)); + OP_REQUIRES(c, !(adjoint_a && transpose_a_), + absl::InvalidArgumentError( + "Only one of adjoint_a and transpose_a may be true.")); + bool adjoint_b; + OP_REQUIRES_OK(c, c->GetAttr("adjoint_b", &adjoint_b)); + OP_REQUIRES(c, !(adjoint_b && transpose_b_), + absl::InvalidArgumentError( + "Only one of adjoint_b and transpose_b may be true.")); + OP_REQUIRES_OK(c, c->GetAttr("transpose_output", &transpose_output_)); + OP_REQUIRES_OK(c, c->GetAttr("conjugate_output", &conjugate_output_)); + transpose_a_ |= adjoint_a; + transpose_b_ |= adjoint_b; + if (is_complex::value) { + conjugate_a_ = adjoint_a; + conjugate_b_ = adjoint_b; + } else { + conjugate_a_ = false; + conjugate_b_ = false; + } + } + + ~CSRMatMulOp() override {} + + absl::Status ValidateInputs(const CSRSparseMatrix& sparse_matrix_a, + const Tensor& dense_tensor_b, int* rank, + int64_t* batch_size) { + if (sparse_matrix_a.dtype() != dense_tensor_b.dtype()) { + return absl::InvalidArgumentError(absl::StrCat( + "Input types don't match. a.dtype == ", + DataTypeString(sparse_matrix_a.dtype()), + " vs. b.dtype == ", DataTypeString(dense_tensor_b.dtype()))); + } + *rank = sparse_matrix_a.dims(); + // TODO(ebrevdo): Add support for broadcasting matmul. + if (*rank != dense_tensor_b.dims()) { + return absl::InvalidArgumentError( + absl::StrCat("Ranks of a and b must match, saw: ", *rank, " vs. ", + dense_tensor_b.dims(), ".")); + } + // A valid CSR SparseMatrix has rank 2 or rank 3. + *batch_size = (*rank == 2) ? 1 : dense_tensor_b.dim_size(0); + if (sparse_matrix_a.batch_size() != *batch_size) { + return absl::InvalidArgumentError(absl::StrCat( + "Batch sizes of a and b must match, saw: ", + sparse_matrix_a.batch_size(), " vs. 
", *batch_size, ".")); + } + const auto& a_dense_shape = sparse_matrix_a.dense_shape().vec(); + const int64_t a_inner_dim = + a_dense_shape(this->transpose_a_ ? *rank - 2 : *rank - 1); + const int64_t b_inner_dim = + dense_tensor_b.dim_size(this->transpose_b_ ? *rank - 1 : *rank - 2); + if (a_inner_dim != b_inner_dim) { + return absl::InvalidArgumentError( + absl::StrCat("Inner product dimensions of A and B do not agree. ", + "Shapes are: ", TensorShape(a_dense_shape).DebugString(), + " vs. ", dense_tensor_b.shape().DebugString())); + } + return absl::OkStatus(); + } + + public: + bool transpose_a_; + bool transpose_b_; + bool conjugate_a_; + bool conjugate_b_; + bool transpose_output_; + bool conjugate_output_; +}; + +// CPU Kernel to compute sparse-dense matrix multiplication. +// +// Uses Eigen SparseMatrix to compute the sparse-dense multiplication between +// a CSR SparseMatrix `a` and dense Tensor `b`. If intra-op parallelism is +// available, the implementation parallelizes the computation across each row +// of the sparse matrix. +template +class CSRMatMulCPUOp : public CSRMatMulOp { + using SparseMatrix = Eigen::SparseMatrix; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + public: + explicit CSRMatMulCPUOp(OpKernelConstruction* c) + : CSRMatMulOp(c) {} + + ~CSRMatMulCPUOp() override {} + + void Compute(OpKernelContext* ctx) final { + const CSRSparseMatrix* sparse_matrix_a; + OP_REQUIRES_OK(ctx, ExtractVariantFromInput(ctx, 0, &sparse_matrix_a)); + const Tensor& matrix_b = ctx->input(1); + + int rank; + int64_t batch_size; + OP_REQUIRES_OK(ctx, this->ValidateInputs(*sparse_matrix_a, matrix_b, &rank, + &batch_size)); + + const auto dense_shape = sparse_matrix_a->dense_shape().vec(); + int64_t num_lhs_rows = dense_shape(rank - 2); + int64_t num_lhs_cols = dense_shape(rank - 1); + int64_t num_rhs_rows = matrix_b.dim_size(rank - 2); + int64_t num_rhs_cols = matrix_b.dim_size(rank - 1); + + if (this->transpose_a_) { + std::swap(num_lhs_rows, num_lhs_cols); + } + + // Possibly transpose the dense Tensor b. + const Tensor* rhs = &matrix_b; + Tensor b_transposed; + if (this->transpose_b_) { + OP_REQUIRES_OK( + ctx, TransposeAndConjugateTensor(ctx, matrix_b, this->conjugate_b_, + &b_transposed)); + rhs = &b_transposed; + std::swap(num_rhs_rows, num_rhs_cols); + } + + // If we're transposing the output, then allocate a temporary buffer to + // store the output. Otherwise allocate the output directly. + Tensor* output = nullptr; + Tensor* matmul_result = nullptr; + Tensor output_transposed; + OP_REQUIRES_OK( + ctx, AllocateOutput(ctx, rank, batch_size, num_lhs_rows, num_rhs_cols, + this->transpose_output_, &output, + &output_transposed, &matmul_result)); + + if (!this->transpose_a_) { + SparseDenseMatMulWithoutTransposedLHS( + ctx, batch_size, num_lhs_rows, *sparse_matrix_a, *rhs, matmul_result); + } else { // transpose_a_ == true + SparseDenseMatMulWithTransposedLHS(ctx, batch_size, num_lhs_rows, + num_lhs_cols, *sparse_matrix_a, *rhs, + matmul_result); + } + + // Transpose (and conjugate) the output if necessary. + // Note that conjugate is only true if transpose is also true. + if (this->transpose_output_) { + OP_REQUIRES_OK( + ctx, TransposeAndConjugateAllocatedTensor( + ctx, output_transposed, this->conjugate_output_, output)); + } else if (this->conjugate_output_) { + functor::maybe_conj_inplace::run( + ctx->eigen_device(), output); + } + } + + private: + // Allocates the output with the appropriate shape. 
Additionally, if + // transpose_output is True, allocates a temporary buffer with the transposed + // output. 'matmul_result' points to either output or output_transposed, based + // on whether transpose_output is True. + absl::Status AllocateOutput(OpKernelContext* ctx, const int32_t rank, + const int64_t batch_size, const int64_t num_rows, + const int64_t num_cols, + const bool transpose_output, Tensor** output, + Tensor* output_transposed, + Tensor** matmul_result) { + TensorShape output_shape; + if (rank == 3) { + TF_RETURN_IF_ERROR(output_shape.AddDimWithStatus(batch_size)); + } + + if (!transpose_output) { + output_shape.AppendShape({num_rows, num_cols}); + TF_RETURN_IF_ERROR(ctx->allocate_output(0, output_shape, output)); + *matmul_result = *output; + } else { + TensorShape output_transposed_shape = output_shape; + output_transposed_shape.AppendShape({num_rows, num_cols}); + output_shape.AppendShape({num_cols, num_rows}); + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum::value, + output_transposed_shape, + output_transposed)); + TF_RETURN_IF_ERROR(ctx->allocate_output(0, output_shape, output)); + *matmul_result = output_transposed; + } + return absl::OkStatus(); + } + + // Returns an Eigen::Ref expression of a sparse sub-matrix from the given + // contiguous segment of rows of the CSR Sparse Matrix. + Eigen::Ref GetSparseMatrixRef( + const CSRSparseMatrix& csr_matrix, const int batch_index, + const int64_t row_begin, const int64_t num_shard_rows, + std::vector* row_ptrs) { + // Compute the row pointers of the sparse sub-matrix. + row_ptrs->resize(num_shard_rows + 1); + const int64_t row_offset = + csr_matrix.row_pointers_vec(batch_index)(row_begin); + for (int64_t row_idx = 0; row_idx <= num_shard_rows; ++row_idx) { + row_ptrs->at(row_idx) = + csr_matrix.row_pointers_vec(batch_index)(row_begin + row_idx) - + row_offset; + } + const int64_t num_cols = + csr_matrix.dense_shape().vec()(csr_matrix.dims() - 1); + return Eigen::Map( + num_shard_rows /* num_rows */, num_cols /* num_cols */, + row_ptrs->at(num_shard_rows) /* total_nnz */, row_ptrs->data(), + csr_matrix.col_indices_vec(batch_index).data() + row_offset, + csr_matrix.values_vec(batch_index).data() + row_offset); + } + + // Sparse-Dense Matrix Multiplication between a CSRSparseMatrix (LHS) and a + // dense Tensor (RHS). + void SparseDenseMatMulWithoutTransposedLHS(OpKernelContext* ctx, + const int64_t batch_size, + const int64_t num_lhs_rows, + const CSRSparseMatrix& lhs, + const Tensor& rhs, + Tensor* output) { + // Parallelize matrix multiplication across batch dimensions and across + // rows in each batch. 
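To make the sharding constants concrete: the block size computed just below splits the row range into at least max(kMaxShards, kNumShardsPerThread * num_threads) shards. With 8 intra-op threads (an assumed example), that is max(20, 3 * 8) = 24 shards, so a 24,000-row sparse matrix is processed in blocks of roughly 1,000 rows, each handled by the ParallelFor callback.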
+ auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + const int32_t num_threads = worker_threads.num_threads; + const int64_t block_size = + num_lhs_rows / std::max(kMaxShards, kNumShardsPerThread * num_threads); + const int64_t num_rhs_rows = rhs.dim_size(rhs.dims() - 2); + const int64_t num_rhs_cols = rhs.dim_size(rhs.dims() - 1); + worker_threads.workers->ParallelFor( + batch_size * num_lhs_rows /* total */, + thread::ThreadPool::SchedulingParams( + thread::ThreadPool::SchedulingStrategy:: + kFixedBlockSize /* strategy */, + absl::nullopt /* cost_per_unit */, block_size), + [&](int64_t batch_and_row_begin, int64_t batch_and_row_end) { + HandleBatchAndRowRange( + num_lhs_rows, batch_and_row_begin, batch_and_row_end, + [&](int64_t batch_idx, int64_t row_begin, int64_t row_end) { + const int64_t num_shard_rows = row_end - row_begin; + + // Define an Eigen::SparseMatrix over the row range: + // [row_begin, row_end) of the CSR SparseMatrix A. + std::vector row_ptrs; + auto sparse_matrix = GetSparseMatrixRef( + lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs); + + // Map the corresponding rows of the rhs. + ConstMatrixMap rhs_map(rhs.flat().data() + batch_idx * + num_rhs_rows * + num_rhs_cols, + num_rhs_rows, num_rhs_cols); + + // Write to the corresponding rows of the output matrix. + MatrixMap output_map( + output->flat().data() + + batch_idx * num_lhs_rows * num_rhs_cols + + row_begin * num_rhs_cols, + num_shard_rows, num_rhs_cols); + output_map.noalias() = sparse_matrix * rhs_map; + }); + }); + } + + // Sparse-Dense Matrix Multiplication assuming the CSRSparseMatrix (LHS) is + // to be transposed before the operation. + void SparseDenseMatMulWithTransposedLHS(OpKernelContext* ctx, + const int64_t batch_size, + const int64_t num_lhs_rows, + const int64_t num_lhs_cols, + const CSRSparseMatrix& lhs, + const Tensor& rhs, Tensor* output) { + auto device = ctx->eigen_device(); + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + const int32_t num_threads = worker_threads.num_threads; + const int64_t num_rhs_rows = rhs.dim_size(rhs.dims() - 2); + const int64_t num_rhs_cols = rhs.dim_size(rhs.dims() - 1); + // Usually, we want to avoid transposing the sparse matrix A since it may be + // an expensive operation. Instead, we use the identity (A^T B) = (B^T A)^T. + // We don't actually transpose B or the output because it is more convenient + // to have them in column major form. + // + // However, if A is hypersparse and B and C are huge, transposing A will be + // cheaper. In the future, we should have a cost model estimating the cost + // of transposing all matrices (A, B, C) to decide which variant to use. + + // Each thread writes to its own copy of the matrix product. These + // `num_threads` copies are summed together to obtain the final result. + Tensor matmul_result_buffer; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({num_threads + 1, + output->NumElements()}), + &matmul_result_buffer)); + functor::SetZeroFunctor set_zero; + set_zero(device, matmul_result_buffer.flat()); + + // Parallelize matrix multiplication across batch dimensions and across + // columns of A^T in each batch. These correspond to rows of A. 
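The identity (A^T B) = (B^T A)^T invoked earlier in this function is easy to sanity-check on small dense matrices; a minimal sketch (the include path for Eigen is an assumption, and the kernel of course applies the identity to a sparse A):

#include <Eigen/Dense>

// Sketch: verify (A^T * B) == (B^T * A)^T on random dense matrices.
bool TransposedLhsIdentityHolds() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(3, 2);
  Eigen::MatrixXf B = Eigen::MatrixXf::Random(3, 4);
  Eigen::MatrixXf lhs = A.transpose() * B;                  // 2 x 4
  Eigen::MatrixXf rhs = (B.transpose() * A).transpose();    // 2 x 4
  return lhs.isApprox(rhs);
}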
+ const int64_t block_size = + num_lhs_cols / std::max(kMaxShards, kNumShardsPerThread * num_threads); + worker_threads.workers->ParallelForWithWorkerId( + batch_size * num_lhs_cols /* total */, + thread::ThreadPool::SchedulingParams( + thread::ThreadPool::SchedulingStrategy:: + kFixedBlockSize /* strategy */, + absl::nullopt /* cost_per_unit */, block_size), + [&](int64_t batch_and_row_begin, int64_t batch_and_row_end, int tid) { + HandleBatchAndRowRange( + num_lhs_cols, batch_and_row_begin, batch_and_row_end, + [&](int64_t batch_idx, int64_t row_begin, int64_t row_end) { + const int64_t num_shard_rows = row_end - row_begin; + + // Define a new sparse sub-matrix from the row range + // [row_begin, row_end) of the sparse matrix A. + std::vector row_ptrs; + auto sparse_matrix = GetSparseMatrixRef( + lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs); + + // Map the corresponding `num_shard_rows` columns of B^T. + // This is the same as taking the `num_shard_rows` rows of B. + ConstMatrixMap b_dense_map( + rhs.flat().data() + + batch_idx * num_rhs_rows * num_rhs_cols + + row_begin * num_rhs_cols, + num_shard_rows, num_rhs_cols); + + // Map to the corresponding rows of the output. + MatrixMap output_map( + matmul_result_buffer.flat().data() + + tid * batch_size * num_lhs_rows * num_rhs_cols + + batch_idx * num_lhs_rows * num_rhs_cols, + num_lhs_rows, num_rhs_cols); + + // Compute the product C^T = B^T * A; restricted to the row + // range in the current shard. + if (this->conjugate_a_) { + output_map.transpose().noalias() += + b_dense_map.transpose() * sparse_matrix.conjugate(); + } else { + output_map.transpose().noalias() += + b_dense_map.transpose() * sparse_matrix; + } + }); + }); + + // Sum across each thread's matmul result. + using Reducer = Eigen::internal::SumReducer; + using Index = typename TTypes::Tensor::Index; + output->flat().device(device) = matmul_result_buffer.matrix().reduce( + Eigen::array({0}), Reducer()); + } + + // Given a range [batch_and_row_begin, batch_and_row_end) which is a + // contiguous subset of [0, num_rows * batch_size), calls the function + // fn(batch_idx, row_begin, row_end) for each batch index + // and the row range [row_begin, row_end) contained in the batch. + void HandleBatchAndRowRange( + const int64_t num_rows, const int64_t batch_and_row_begin, + const int64_t batch_and_row_end, + const std::function& fn) { + // Obtain the batch indices overlapping with the current shard. + const int64_t batch_begin = batch_and_row_begin / num_rows; + const int64_t batch_end_inclusive = batch_and_row_end / num_rows; + + for (int64_t batch_idx = batch_begin; batch_idx <= batch_end_inclusive; + ++batch_idx) { + // Find the contiguous set of rows which are contained in this shard as + // well as the current batch. We intersect with interval [batch_idx * + // num_rows, (batch_idx + 1) * num_rows) which denotes the current batch. + const int64_t current_batch_row_begin = + std::max(batch_and_row_begin, batch_idx * num_rows); + const int64_t current_batch_row_end = + std::min(batch_and_row_end, (batch_idx + 1) * num_rows); + + const int64_t row_begin = current_batch_row_begin % num_rows; + const int64_t num_shard_rows = + current_batch_row_end - current_batch_row_begin; + // Edge case for when current_batch_row_end is the first index of a new + // row. + if (num_shard_rows == 0) continue; + + fn(batch_idx, row_begin, row_begin + num_shard_rows); + } + } + + // Transposes (and optionally, conjugates) a given Tensor. 
Also allocates the + // required memory for the output Tensor. + absl::Status TransposeAndConjugateTensor(OpKernelContext* ctx, + const Tensor& input, bool conjugate, + Tensor* output) { + TensorShape transposed_shape = input.shape(); + transposed_shape.set_dim(input.dims() - 1, + input.dim_size(input.dims() - 2)); + transposed_shape.set_dim(input.dims() - 2, + input.dim_size(input.dims() - 1)); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(DataTypeToEnum::value, transposed_shape, output)); + return TransposeAndConjugateAllocatedTensor(ctx, input, conjugate, output); + } + + // Transposes (and optionally, conjugates) a given Tensor. The output should + // be already allocated. + absl::Status TransposeAndConjugateAllocatedTensor(OpKernelContext* ctx, + const Tensor& input, + bool conjugate, + Tensor* output) { + if (conjugate) { + TF_RETURN_IF_ERROR(DoConjugateMatrixTranspose( + ctx->eigen_device(), input, output)); + } else { + TF_RETURN_IF_ERROR( + DoMatrixTranspose(ctx->eigen_device(), input, output)); + } + return absl::OkStatus(); + } +}; + +// GPU Kernel to compute sparse-dense matrix multiplication. +template +class CSRMatMulGPUOp : public CSRMatMulOp { + using SparseMatrix = Eigen::SparseMatrix; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + public: + explicit CSRMatMulGPUOp(OpKernelConstruction* c) + : CSRMatMulOp(c) {} + + ~CSRMatMulGPUOp() override {} + + void Compute(OpKernelContext* ctx) final { + const CSRSparseMatrix* a_matrix; + OP_REQUIRES_OK(ctx, ExtractVariantFromInput(ctx, 0, &a_matrix)); + const Tensor& b_t = ctx->input(1); + + int rank; + int64_t batch_size; + OP_REQUIRES_OK(ctx, + this->ValidateInputs(*a_matrix, b_t, &rank, &batch_size)); + + const Tensor& a_dense_shape_t = a_matrix->dense_shape(); + TensorShape a_dense_tensor_shape; + auto a_dense_shape = a_dense_shape_t.vec(); + OP_REQUIRES_OK( + ctx, TensorShapeUtils::MakeShape(a_dense_shape, &a_dense_tensor_shape)); + + const int row_dim = (rank == 2) ? 0 : 1; + const int64_t a_outer_dim = a_dense_tensor_shape.dim_size( + this->transpose_a_ ? row_dim + 1 : row_dim); + const int64_t b_inner_dim = + b_t.shape().dim_size(this->transpose_b_ ? row_dim + 1 : row_dim); + const int64_t b_outer_dim = + b_t.dim_size(this->transpose_b_ ? row_dim : row_dim + 1); + const int64_t b_slice_size = b_inner_dim * b_outer_dim; + + TensorShape c_shape; + if (rank == 3) { + OP_REQUIRES_OK(ctx, c_shape.AddDimWithStatus(batch_size)); + } + if (this->transpose_output_) { + OP_REQUIRES_OK(ctx, c_shape.AddDimWithStatus(b_outer_dim)); + OP_REQUIRES_OK(ctx, c_shape.AddDimWithStatus(a_outer_dim)); + } else { + OP_REQUIRES_OK(ctx, c_shape.AddDimWithStatus(a_outer_dim)); + OP_REQUIRES_OK(ctx, c_shape.AddDimWithStatus(b_outer_dim)); + } + + const int64_t c_matrix_lhs = c_shape.dim_size(row_dim); + const int64_t c_matrix_rhs = c_shape.dim_size(row_dim + 1); + const int64_t c_slice_size = c_matrix_lhs * c_matrix_rhs; + Tensor* c_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, c_shape, &c_t)); + + const GPUDevice& d = ctx->eigen_device(); + bool use_matrix_vector_multiply = (b_outer_dim == 1); +#if TENSORFLOW_USE_ROCM + // ROCm hipsparse does not implement csrmv with transposed input a + use_matrix_vector_multiply = + use_matrix_vector_multiply && !this->transpose_a_; +#endif + if (use_matrix_vector_multiply) { + // Call matrix-vector multiply if b is a vector. 
+ TTypes::ConstVec a_dense_shape_comp( + a_dense_shape.data() + row_dim, 2); + Tensor b_conj_t; + const T* b_base_ptr = b_t.template flat().data(); + bool conjugate_a = this->conjugate_a_; + bool conjugate_output = this->conjugate_output_; + if (this->conjugate_b_) { + if (conjugate_a) { + // In this case we can use the identity + // conj(a) * conj(b) = conj(a * b) + // instead of creating a conjugated copy of b. + conjugate_a = false; + conjugate_output = !conjugate_output; + } else { + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_temp( + {1}, DataTypeToEnum::value, b_t.shape(), &b_conj_t)); + functor::maybe_conj::run(d, b_t, &b_conj_t); + b_base_ptr = b_conj_t.template flat().data(); + } + } + + functor::CSRSparseMatrixMatVec csr_spmv(this->transpose_a_, + conjugate_a); + for (int i = 0; i < batch_size; ++i) { + auto a_row_ptr = a_matrix->row_pointers_vec(i); + auto a_col_ind = a_matrix->col_indices_vec(i); + auto a_values = a_matrix->values_vec(i); + ConstCSRComponent a_comp{a_row_ptr, a_col_ind, a_values, + a_dense_shape_comp}; + const T* b_i = b_base_ptr + i * b_slice_size; + T* c_i = &c_t->template flat()(i * c_slice_size); + absl::Status s = csr_spmv.Compute(ctx, a_comp, b_i, c_i); + OP_REQUIRES_OK(ctx, s); + } + if (conjugate_output) { + functor::maybe_conj_inplace::run(d, c_t); + } + return; + } + + functor::CSRSparseMatrixMatMul csr_spmmadd( + this->transpose_output_); + + Tensor c_mat_col_major_t; + if (!this->transpose_output_) { + // If transpose_output is false, we'll need to transpose the (col + // major) output of the csrgemm call to get proper (row-major) + // output. Which means we need to keep a temporary buffer to + // store the intermediate gemm output. + TensorShape c_mat_col_major_shape; + if (rank == 2) { + c_mat_col_major_shape = TensorShape({c_matrix_rhs, c_matrix_lhs}); + } else { + c_mat_col_major_shape = + TensorShape({batch_size, c_matrix_rhs, c_matrix_lhs}); + } + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::value, + c_mat_col_major_shape, &c_mat_col_major_t)); + } + + // If transpose_output is true, return the direct (column-major i.e., + // transposed) output of the csrgemm call. Otherwise we'll need + // to transpose it to row major format. + auto c_mat_col_major = (this->transpose_output_) + ? c_t->flat() + : c_mat_col_major_t.flat(); + + // Possibly transpose a. + const CSRSparseMatrix* a_input_matrix; + // If we need to transpose a, we will store the result temporarily + // in the object below. + CSRSparseMatrix a_matrix_transposed; + if (!this->transpose_a_) { + a_input_matrix = a_matrix; + } else { + functor::CSRSparseMatrixTranspose transpose; + OP_REQUIRES_OK(ctx, transpose(ctx, this->conjugate_a_, *a_matrix, + &a_matrix_transposed)); + a_input_matrix = &a_matrix_transposed; + } + + auto a_input_dense_shape = a_input_matrix->dense_shape().vec(); + + // Possibly transpose b. 
+ Tensor b_t_input; + if (!this->transpose_b_) { + b_t_input = b_t; + } else { + TensorShape b_t_transposed_shape; + if (rank == 3) { + OP_REQUIRES_OK(ctx, b_t_transposed_shape.AddDimWithStatus(batch_size)); + } + OP_REQUIRES_OK(ctx, b_t_transposed_shape.AddDimWithStatus( + b_t.dim_size(row_dim + 1))); + OP_REQUIRES_OK( + ctx, b_t_transposed_shape.AddDimWithStatus(b_t.dim_size(row_dim))); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + b_t_transposed_shape, &b_t_input)); + const GPUDevice& d = ctx->eigen_device(); + if (this->conjugate_b_) { + OP_REQUIRES_OK(ctx, DoConjugateMatrixTranspose(d, b_t /*input*/, + &b_t_input /*output*/)); + } else { + OP_REQUIRES_OK( + ctx, DoMatrixTranspose(d, b_t /*input*/, &b_t_input /*output*/)); + } + } + + // Dense shape of a batch component of A. + TTypes::ConstVec a_input_dense_shape_comp( + a_input_dense_shape.data() + row_dim, 2); + + auto b = b_t_input.flat(); + + for (int i = 0; i < batch_size; ++i) { + auto a_row_ptr = a_input_matrix->row_pointers_vec(i); + auto a_col_ind = a_input_matrix->col_indices_vec(i); + auto a_values = a_input_matrix->values_vec(i); + typename TTypes::UnalignedConstMatrix b_i(b.data() + i * b_slice_size, + {b_inner_dim, b_outer_dim}); + typename TTypes::UnalignedMatrix c_mat_col_major_i( + c_mat_col_major.data() + i * c_slice_size, + {c_matrix_lhs, c_matrix_rhs}); + ConstCSRComponent a_comp{a_row_ptr, a_col_ind, a_values, + a_input_dense_shape_comp}; + absl::Status s = csr_spmmadd.Compute(ctx, a_comp, b_i, c_mat_col_major_i); + OP_REQUIRES_OK(ctx, s); + } + + if (!this->transpose_output_) { + // We need to return values in row major format, so transpose + // the column-major values in c_mat_col_major_t to row-major output c_t. + OP_REQUIRES_OK(ctx, DoMatrixTranspose(d, /*input=*/c_mat_col_major_t, + /*output=*/c_t)); + } + if (this->conjugate_output_) { + functor::maybe_conj_inplace::run(d, c_t); + } + } +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace functor { + +namespace gpu_data_type { + +// GPUDataType::type translates from a C++ type (e.g. float) to a +// GPUDataType_t (e.g. CUDA_R_32F). 
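As a concrete illustration of the trait pattern described above, one specialization maps a C++ scalar type to the corresponding cuSPARSE data-type enum; a sketch of the float and double cases for the CUDA branch (the header location of cudaDataType_t is an assumption, and the vendored header additionally covers Eigen::half and the complex types):

#include <library_types.h>  // cudaDataType_t, CUDA_R_32F, ... (assumed CUDA toolkit header)

// Sketch: C++ type -> GPU data-type enum trait.
template <typename T>
struct GPUDataType;

template <>
struct GPUDataType<float> {
  static constexpr cudaDataType_t type = CUDA_R_32F;
};

template <>
struct GPUDataType<double> {
  static constexpr cudaDataType_t type = CUDA_R_64F;
};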
+template +struct GPUDataType; + +template <> +struct GPUDataType { +#if GOOGLE_CUDA + static constexpr cudaDataType_t type = CUDA_R_16F; +#else + static constexpr hipDataType type = HIP_R_16F; +#endif +}; + +template <> +struct GPUDataType { +#if GOOGLE_CUDA + static constexpr cudaDataType_t type = CUDA_R_32F; +#else + static constexpr hipDataType type = HIP_R_32F; +#endif +}; + +template <> +struct GPUDataType> { +#if GOOGLE_CUDA + static constexpr cudaDataType_t type = CUDA_C_32F; +#else + static constexpr hipDataType type = HIP_C_32F; +#endif +}; + +template <> +struct GPUDataType { +#if GOOGLE_CUDA + static constexpr cudaDataType_t type = CUDA_R_64F; +#else + static constexpr hipDataType type = HIP_R_64F; +#endif +}; + +template <> +struct GPUDataType> { +#if GOOGLE_CUDA + static constexpr cudaDataType_t type = CUDA_C_64F; +#else + static constexpr hipDataType type = HIP_C_64F; +#endif +}; + +} // namespace gpu_data_type + +template +class CSRSparseMatrixMatMul { + public: + explicit CSRSparseMatrixMatMul(const bool transpose_output) + : transpose_output_(transpose_output) {} + + Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + typename TTypes::UnalignedConstMatrix b, + typename TTypes::UnalignedMatrix c) { + GpuSparse cuda_sparse(ctx); + TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); + { + // Use Csrmm/SpMM to calculate: + // C = alpha * op(A) * op(B) + beta * C + // where alpha = 1.0, beta = 0.0, A is sparse and B and C are dense. + // Note that Csrmm/Spmm assumes B and C are in column-major form; so we + // use transB == true, and manually transpose the output in place + // using blasgeam. + // TODO(ebrevdo,rmlarsen): Add support for transposition and adjoint. + + // Create alpha and beta scalars; alpha = 1.0, beta = 0.0 + // TODO(ebrevdo,rmlarsen): Add support for non-trivial alpha and beta. + const T alpha = 1; + const T beta = 0; + + // A is (m, k), Bt is (ldb, k) and Ct is (ldc, n) + const int k = b.dimension(0); + DCHECK_EQ(k, a.dense_shape_host(1)); + + // If transpose_output_ is true, then the c matrix we receive + // here is the direct row major output (into which we will store + // csrgemm's col major output). Otherwise it's a + // temporary tensor that will store the column major output that + // will eventually be transposed. + const int m = c.dimension(transpose_output_ ? 1 : 0); + const int n = c.dimension(transpose_output_ ? 0 : 1); + DCHECK_EQ(m, a.dense_shape_host(0)); + DCHECK_EQ(n, b.dimension(1)); + const int nnz = a.values.size(); + DCHECK_EQ(nnz, a.col_ind.size()); + + // ldb: leading dimension of B. If op(B)=B, it must be at least max(1, k) + // if op(A) = A and at least max (1, m) otherwise. If op(B) != B, it must + // be at least max(1, n). + const int ldb = n; + // ldc: leading dimension of C. It must be at least max(1, m) if + // op(A) = A and at least max(1, k) otherwise. + const int ldc = m; + + // transA must be non-transpose if transB is transpose (cusparse + // limitation). +#if GOOGLE_CUDA + const gpusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE; +#elif TENSORFLOW_USE_ROCM + const gpusparseOperation_t transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; +#endif + + // transB: b is row-major, and cusparse requires col-major b (or + // equivalently transB == transpose). this version is actually more + // efficient. 
+#if GOOGLE_CUDA && CUDA_VERSION >= 10020 + + const gpusparseOperation_t transB = CUSPARSE_OPERATION_TRANSPOSE; + gpusparseSpMatDescr_t matA; + gpusparseDnMatDescr_t matB, matC; + + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsr( + &matA, m, k, nnz, const_cast(a.row_ptr.data()), + const_cast(a.col_ind.data()), const_cast(a.values.data()), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + gpu_data_type::GPUDataType::type)); + + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateDnMat( + &matB, n, k, ldb, const_cast(b.data()), + gpu_data_type::GPUDataType::type, CUSPARSE_ORDER_COL)); + + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateDnMat( + &matC, m, n, ldc, c.data(), gpu_data_type::GPUDataType::type, + CUSPARSE_ORDER_COL)); + +#if CUDA_VERSION >= 12000 + cusparseSpMMAlg_t algo = CUSPARSE_SPMM_ALG_DEFAULT; +#else + cusparseSpMMAlg_t algo = CUSPARSE_MM_ALG_DEFAULT; +#endif + size_t bufferSize = 0; + TF_RETURN_IF_ERROR(cuda_sparse.SpMMBufferSize( + transA, transB, &alpha, matA, matB, &beta, matC, algo, &bufferSize)); + + Tensor buffer; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(bufferSize)}), &buffer)); + DCHECK(buffer.flat().data() != nullptr); + + TF_RETURN_IF_ERROR(cuda_sparse.SpMM(transA, transB, &alpha, matA, matB, + &beta, matC, algo, + buffer.flat().data())); + + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matB)); + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matC)); + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroySpMat(matA)); + +#elif TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 40200 + // Use SPMM + const gpusparseOperation_t transB = HIPSPARSE_OPERATION_TRANSPOSE; + gpusparseSpMatDescr_t matA; + gpusparseDnMatDescr_t matB, matC; + + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateCsr( + &matA, m, k, nnz, const_cast(a.row_ptr.data()), + const_cast(a.col_ind.data()), const_cast(a.values.data()), + HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, + gpu_data_type::GPUDataType::type)); + + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateDnMat( + &matB, n, k, ldb, const_cast(b.data()), + gpu_data_type::GPUDataType::type, HIPSPARSE_ORDER_COLUMN)); + + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateDnMat( + &matC, m, n, ldc, c.data(), gpu_data_type::GPUDataType::type, + HIPSPARSE_ORDER_COLUMN)); + + size_t bufferSize = 0; + TF_RETURN_IF_ERROR(cuda_sparse.SpMMBufferSize( + transA, transB, &alpha, matA, matB, &beta, matC, + HIPSPARSE_MM_ALG_DEFAULT, &bufferSize)); + + Tensor buffer; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(bufferSize)}), &buffer)); + DCHECK(buffer.flat().data() != nullptr); + + TF_RETURN_IF_ERROR(cuda_sparse.SpMM(transA, transB, &alpha, matA, matB, + &beta, matC, HIPSPARSE_MM_ALG_DEFAULT, + buffer.flat().data())); + + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseDestroyDnMat(matB)); + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseDestroyDnMat(matC)); + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseDestroySpMat(matA)); + +#else + +#if GOOGLE_CUDA + + const gpusparseOperation_t transB = CUSPARSE_OPERATION_TRANSPOSE; + + gpusparseMatDescr_t descrA; + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descrA)); + TF_RETURN_IF_GPUSPARSE_ERROR( + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + TF_RETURN_IF_GPUSPARSE_ERROR( + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + +#elif TENSORFLOW_USE_ROCM + + const gpusparseOperation_t transB = HIPSPARSE_OPERATION_TRANSPOSE; + + gpusparseMatDescr_t descrA; + 
TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateMatDescr(&descrA)); + TF_RETURN_IF_GPUSPARSE_ERROR( + se::wrap::hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL)); + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseSetMatIndexBase( + descrA, HIPSPARSE_INDEX_BASE_ZERO)); +#endif // GOOGLE_CUDA + + TF_RETURN_IF_ERROR( + cuda_sparse.Csrmm(transA, transB, m, n, k, nnz, &alpha, descrA, + a.values.data(), a.row_ptr.data(), a.col_ind.data(), + b.data(), ldb, &beta, c.data(), ldc)); + +#endif // GOOGLE_CUDA && CUDA_VERSION >= 10020 + } + + return OkStatus(); + } + + private: + bool transpose_output_; +}; + +template +class CSRSparseMatrixMatVec { + public: + CSRSparseMatrixMatVec(bool transpose_a, bool conjugate_a) + : transA_(TransposeAndConjugateToGpuSparseOp(transpose_a, conjugate_a, + &status_)) {} + + Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + const T* x, T* y) { + TF_RETURN_IF_ERROR(status_); + GpuSparse cuda_sparse(ctx); + TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); + { + // Use Csrmv to calculate: + // y = alpha * op(A) * x + beta * y + // where alpha = 1.0, beta = 0.0, A is a sparse matrix and x and y are + // dense vectors. + + // Create alpha and beta scalars; alpha = 1.0, beta = 0.0 + // TODO(rmlarsen,ebrevdo): Add support for general alpha, beta. + const T alpha = 1; + const T beta = 0; + +#if GOOGLE_CUDA && CUDA_VERSION < 10020 + gpusparseMatDescr_t descrA; + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descrA)); + TF_RETURN_IF_GPUSPARSE_ERROR( + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + TF_RETURN_IF_GPUSPARSE_ERROR( + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); +#elif TENSORFLOW_USE_ROCM + gpusparseMatDescr_t descrA; + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateMatDescr(&descrA)); + TF_RETURN_IF_GPUSPARSE_ERROR( + se::wrap::hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL)); + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseSetMatIndexBase( + descrA, HIPSPARSE_INDEX_BASE_ZERO)); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + + const int m = a.dense_shape_host(0); + const int n = a.dense_shape_host(1); + const int nnz = a.values.size(); + DCHECK_EQ(nnz, a.col_ind.size()); +#if GOOGLE_CUDA && (CUDA_VERSION >= 10020) + TF_RETURN_IF_ERROR(cuda_sparse.Csrmv(transA_, m, n, nnz, &alpha, + a.values.data(), a.row_ptr.data(), + a.col_ind.data(), x, &beta, y)); +#else + TF_RETURN_IF_ERROR(cuda_sparse.Csrmv(transA_, m, n, nnz, &alpha, descrA, + a.values.data(), a.row_ptr.data(), + a.col_ind.data(), x, &beta, y)); +#endif + } + + return OkStatus(); + } + + private: + Status status_; + const gpusparseOperation_t transA_; +}; + +} // namespace functor + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_MAT_MUL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/sparse_matrix.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/sparse_matrix.h new file mode 100644 index 00000000..8e5ff45f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/sparse_matrix.h @@ -0,0 +1,655 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_SPARSE_MATRIX_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_SPARSE_MATRIX_H_ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +class CSRSparseMatrix { + // CreateCSRSparseMatrix is the main method used to construct a + // CSRSparseMatrix. The representations for both 2D and 3D + // (batched) CSR Sparse Matrices are the same: + // + // dtype: The datatype of the values. + // dense_shape: The dense shape of the matrix. + // * Host int64 vector, size 2 or 3. + // * Takes on values: (rows, cols) or (batch_size, rows, cols). + // batch_pointers: Batch offset pointers into col_indices and values. + // * Host int32 vector, size (batch_size + 1). + // * Takes on values: (0, nnz[0], nnz[0] + nnz[1], ..., total_nnz). + // row_pointers: Row offset pointers into col_indices and values. + // * Device int32 vector, size ((rows + 1) * batch_size). + // * Each block of size (rows + 1) takes on values: + // (0, num_rows{b}[0], num_rows{b}[0] + num_rows{b}[1], ..., nnz[b]). + // for b = 0 .. batch_size - 1. + // col_indices: Column values for the given row and column index. + // * Device int32 vector, size total_nnz. + // values: Actual values for the given row and column index. + // * Device dtype vector, size total_nnz. + // + // The storage agreement is such that for a given (batch, row, ix): + // offset = batch_pointers(batch) + row_pointers(batch * (rows + 1) + row) + // col = col_indices(offset + ix) + // val = values(offset + ix) + // where ix < #nnz columns in (batch, row). + // Then: + // matrix(batch, row, col) = val. + // + // All other elements in the dense representation are treated as 0 / empty. 
+ // + // For example, for a 2D sparse matrix m shaped (3, 4) such that: + // + // m[0, 0] = 1.0 + // m[0, 1] = 2.0 + // m[0, 2] = 3.0 + // m[2, 2] = 4.0 + // m[2, 3] = 5.0 + // + // The corresponding representation is: + // + // dtype: DT_FLOAT + // dense_shape: (3, 4) + // batch_pointers: (0, 5) + // row_pointers: (0, 3, 3, 5) + // col_indices: concat((0, 1, 2), (), (2, 3)) + // values: concat((1.0, 2.0, 3.0), (), (4.0, 5.0)) + // + // For a 3D sparse matrix m shaped (2, 3, 4) such that: + // + // m[0, 0, 0] = 1.0 + // m[0, 0, 2] = 2.0 + // m[0, 2, 3] = 3.0 + // m[1, 0, 3] = 4.0 + // m[1, 1, 0] = 5.0 + // + // The corresponding representation is: + // dtype: DT_FLOAT + // dense_shape: (2, 3, 4) + // batch_pointers: (0, 3, 5) + // row_pointers: concat((0, 2, 2, 3), (0, 1, 2, 2)) + // col_indices: concat(concat((0, 2), (), (3,)), + // concat((3,), (), (0,))) + // values: concat(concat((1.0, 2.0), (3.0,), ()), + /// concat((4.0,), (5.0,), ())) + // + public: + static constexpr const char kTypeName[] = "tensorflow::CSRSparseMatrix"; + + CSRSparseMatrix() : metadata_{false, DT_INVALID} {} + + CSRSparseMatrix(const CSRSparseMatrix& rhs) + : metadata_(rhs.metadata_), + dense_shape_(rhs.dense_shape_), + batch_pointers_(rhs.batch_pointers_), + row_pointers_(rhs.row_pointers_), + col_indices_(rhs.col_indices_), + values_(rhs.values_) { + SetupVecs(); + } + + CSRSparseMatrix(CSRSparseMatrix&& rhs) + : metadata_(rhs.metadata_), + dense_shape_(std::move(rhs.dense_shape_)), + batch_pointers_(std::move(rhs.batch_pointers_)), + row_pointers_(std::move(rhs.row_pointers_)), + col_indices_(std::move(rhs.col_indices_)), + values_(std::move(rhs.values_)) { + SetupVecs(); + rhs.metadata_.validated = false; + rhs.metadata_.dtype = DT_INVALID; + rhs.ClearVecs(); + } + + CSRSparseMatrix& operator=(CSRSparseMatrix&& rhs) { + if (this == &rhs) return *this; + metadata_ = rhs.metadata_; + metadata_.validated = rhs.metadata_.validated; + dense_shape_ = std::move(rhs.dense_shape_); + batch_pointers_ = std::move(rhs.batch_pointers_); + row_pointers_ = std::move(rhs.row_pointers_); + col_indices_ = std::move(rhs.col_indices_); + values_ = std::move(rhs.values_); + SetupVecs(); + rhs.metadata_ = {false, DT_INVALID}; + rhs.ClearVecs(); + return *this; + } + + static absl::Status CreateCSRSparseMatrix( + DataType dtype, + const Tensor& dense_shape, // on host + const Tensor& batch_pointers, // on host + const Tensor& row_pointers, const Tensor& col_indices, + const Tensor& values, CSRSparseMatrix* matrix) { + *matrix = CSRSparseMatrix(dtype, dense_shape, batch_pointers, row_pointers, + col_indices, values); + absl::Status s = matrix->Validate(); + matrix->metadata_.validated = s.ok(); + matrix->SetupVecs(); + return s; + } + + absl::Status Validate() const { + return ValidateTypesAndShapes(metadata_.dtype, dense_shape_, + batch_pointers_, row_pointers_, col_indices_, + values_); + } + + void Clear() { + metadata_ = {false, DT_INVALID}; + dense_shape_ = Tensor(); + batch_pointers_ = Tensor(); + row_pointers_ = Tensor(); + col_indices_ = Tensor(); + values_ = Tensor(); + ClearVecs(); + } + + bool valid() const { + return metadata_.validated && dense_shape_.IsInitialized() && + batch_pointers_.IsInitialized() && row_pointers_.IsInitialized() && + col_indices_.IsInitialized() && values_.IsInitialized() && + dense_shape_.NumElements() > 1 && + batch_pointers_.NumElements() > 0 && row_pointers_.NumElements() > 0; + } + + DataType dtype() const { + DCHECK(valid()); + return metadata_.dtype; + } + + inline int dims() const { + 
DCHECK(valid()); + return dense_shape_.NumElements(); + } + + inline int nnz(int batch) const { + DCHECK_LT(batch, batch_size()); + return (*batch_pointers_vec_)(batch + 1) - (*batch_pointers_vec_)(batch); + } + + inline int batch_offset(int batch) const { + DCHECK_LT(batch, batch_size()); + return (*batch_pointers_vec_)(batch); + } + + inline int total_nnz() const { + DCHECK(valid()); + return (*batch_pointers_vec_)(batch_size()); + } + + inline Tensor& dense_shape() { + DCHECK(valid()); + return dense_shape_; + } + + inline const Tensor& dense_shape() const { + DCHECK(valid()); + return dense_shape_; + } + + inline TTypes::UnalignedVec row_pointers_vec(int batch) { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int64_t rows = dense_shape().vec()((dims() == 2) ? 0 : 1); + const int offset = batch * (rows + 1); + return TTypes::UnalignedVec(row_pointers_vec_->data() + offset, + rows + 1); + } + + inline TTypes::UnalignedConstVec row_pointers_vec(int batch) const { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int64_t rows = dense_shape().vec()((dims() == 2) ? 0 : 1); + const int offset = batch * (rows + 1); + return TTypes::UnalignedConstVec(row_pointers_vec_->data() + offset, + rows + 1); + } + + inline TTypes::UnalignedVec col_indices_vec(int batch) { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int offset = (*batch_pointers_vec_)(batch); + const int nnz_in_batch = nnz(batch); + return TTypes::UnalignedVec(col_indices_vec_->data() + offset, + nnz_in_batch); + } + + inline TTypes::UnalignedConstVec col_indices_vec(int batch) const { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int offset = (*batch_pointers_vec_)(batch); + const int nnz_in_batch = nnz(batch); + return TTypes::UnalignedConstVec(col_indices_vec_->data() + offset, + nnz_in_batch); + } + + template + inline typename TTypes::UnalignedVec values_vec(int batch) { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int offset = (*batch_pointers_vec_)(batch); + const int nnz_in_batch = nnz(batch); + return typename TTypes::UnalignedVec(values().vec().data() + offset, + nnz_in_batch); + } + + template + inline typename TTypes::UnalignedConstVec values_vec(int batch) const { + DCHECK(valid()); + DCHECK_LT(batch, batch_size()); + const int offset = (*batch_pointers_vec_)(batch); + const int nnz_in_batch = nnz(batch); + return typename TTypes::UnalignedConstVec( + values().vec().data() + offset, nnz_in_batch); + } + + inline Tensor& row_pointers() { + DCHECK(valid()); + return row_pointers_; + } + + inline const Tensor& row_pointers() const { + DCHECK(valid()); + return row_pointers_; + } + + inline Tensor& col_indices() { + DCHECK(valid()); + return col_indices_; + } + + inline const Tensor& col_indices() const { + DCHECK(valid()); + return col_indices_; + } + + inline Tensor& values() { + DCHECK(valid()); + return values_; + } + + inline const Tensor& values() const { + DCHECK(valid()); + return values_; + } + + inline Tensor& batch_pointers() { + DCHECK(valid()); + return batch_pointers_; + } + + inline const Tensor& batch_pointers() const { + DCHECK(valid()); + return batch_pointers_; + } + + std::string TypeName() const { return kTypeName; } + + // TODO(ebrevdo): A better debug string. + std::string DebugString() const { return dense_shape_.DebugString(); } + + // Returns the number of elements. This is equal to 1 if the + // CSRSparseMatrix is a singleton matrix (dense_shape is length 2). 
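Tying the accessors back to the storage agreement documented at the top of the class: reading a single element m(batch, row, col) is a batch offset plus a row-pointer offset, followed by a scan of that row's slice of col_indices. A rough standalone sketch on raw arrays (illustrative only; rows is the per-batch row count from dense_shape):

#include <cstdint>

// Sketch: element lookup under the CSRSparseMatrix storage agreement.
// Absent entries read as 0.
float CsrLookup(const int32_t* batch_pointers, const int32_t* row_pointers,
                const int32_t* col_indices, const float* values,
                int64_t rows, int batch, int row, int col) {
  const int32_t row_begin =
      batch_pointers[batch] + row_pointers[batch * (rows + 1) + row];
  const int32_t row_end =
      batch_pointers[batch] + row_pointers[batch * (rows + 1) + row + 1];
  for (int32_t ix = row_begin; ix < row_end; ++ix)
    if (col_indices[ix] == col) return values[ix];
  return 0.0f;  // not stored
}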
+ int batch_size() const { + DCHECK(valid()); + return batch_pointers_.NumElements() - 1; + } + + bool Decode(const VariantTensorData& p) { + if (p.tensors_.empty()) return false; + Metadata metadata; + if (!p.get_metadata(&metadata)) return false; + const bool validated = metadata.validated; + const DataType dtype = metadata.dtype; + + // p.tensors_ should contain tensors {dense_shape, batch_pointers, + // row_pointers, col_indices, values}. + if (p.tensors_.size() != 5) return false; + + Tensor dense_shape = p.tensors_[0]; + if (dense_shape.dtype() != DT_INT64) return false; + if (dense_shape.dims() != 1) return false; + int rank = dense_shape.dim_size(0); + if (rank < 2 || rank > 3) return false; + + Tensor batch_pointers(p.tensors_[1]); + Tensor row_pointers(p.tensors_[2]); + Tensor col_indices(p.tensors_[3]); + Tensor values(p.tensors_[4]); + + // Check that the validated bool is consistent with the data. + absl::Status s = ValidateTypesAndShapes(dtype, dense_shape, batch_pointers, + row_pointers, col_indices, values); + if (s.ok() != validated) return false; + + // Save to this object. + metadata_ = metadata; + dense_shape_ = std::move(dense_shape); + batch_pointers_ = std::move(batch_pointers); + row_pointers_ = std::move(row_pointers); + col_indices_ = std::move(col_indices); + values_ = std::move(values); + SetupVecs(); + return true; + } + + void Encode(VariantTensorData* p) const { + DCHECK(valid()); + + // Store metadata_ to p's metadata + p->set_metadata(metadata_); + + // Store dense_shape, row_pointers, col_indices, and values to p->tensors_. + p->tensors_.reserve(5); + p->tensors_.push_back(dense_shape_); + p->tensors_.push_back(batch_pointers_); + p->tensors_.push_back(row_pointers_); + p->tensors_.push_back(col_indices_); + p->tensors_.push_back(values_); + } + + // This static method copies CSRSparseMatrices in all directions: + // Host->Device, Device->Host, and Device->Device. + static absl::Status DeviceCopy( + const CSRSparseMatrix& from, CSRSparseMatrix* to, + const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) { + VLOG(2) << "DeviceCopy from type: " << DataTypeString(from.dtype()) + << " and shape: " << from.dense_shape().DebugString(); + Tensor to_row_ptr(DT_INT32); + Tensor to_col_ind(DT_INT32); + Tensor to_values(from.dtype()); + TF_RETURN_IF_ERROR(copy(from.row_pointers(), &to_row_ptr)); + TF_RETURN_IF_ERROR(copy(from.col_indices(), &to_col_ind)); + TF_RETURN_IF_ERROR(copy(from.values(), &to_values)); + return CreateCSRSparseMatrix(from.dtype(), + from.dense_shape(), // Always on host. + from.batch_pointers(), // Always on host. 
+ to_row_ptr, to_col_ind, to_values, to); + } + + private: + CSRSparseMatrix(DataType dtype, const Tensor& dense_shape, + const Tensor& batch_pointers, const Tensor& row_pointers, + const Tensor& col_indices, const Tensor& values) + : metadata_{false, dtype}, + dense_shape_(dense_shape), + batch_pointers_(batch_pointers), + row_pointers_(row_pointers), + col_indices_(col_indices), + values_(values) {} + + void SetupVecs() { + if (!metadata_.validated) return; + batch_pointers_vec_.reset( + new TTypes::Vec(batch_pointers_.vec())); + row_pointers_vec_.reset(new TTypes::Vec(row_pointers_.vec())); + col_indices_vec_.reset(new TTypes::Vec(col_indices_.vec())); + } + + void ClearVecs() { + batch_pointers_vec_.reset(); + row_pointers_vec_.reset(); + col_indices_vec_.reset(); + } + + static absl::Status ValidateTypesAndShapes(DataType dtype, + const Tensor& dense_shape, + const Tensor& batch_pointers, + const Tensor& row_pointers, + const Tensor& col_indices, + const Tensor& values) { + // TODO(ebrevdo): Consider adding support for other floating point types + // (namely, float16). + if (dtype != DT_FLOAT && dtype != DT_DOUBLE && dtype != DT_COMPLEX64 && + dtype != DT_COMPLEX128) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: dtype = ", DataTypeString(dtype), + " not in {float32, float64, complex64, complex128}"); + } + // dense_shape checks + if (dense_shape.dtype() != DT_INT64) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: dense_shape.dtype() = ", + DataTypeString(dense_shape.dtype()), " != int64"); + } + if (dense_shape.dims() != 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: dense_shape should be a vector, but saw " + "tensor: ", + dense_shape.DebugString()); + } + int rank = dense_shape.dim_size(0); + if (rank < 2 || rank > 3) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: dense_shape should be a 2- or 3- vector, " + "but saw: ", + dense_shape.SummarizeValue(5)); + } + auto dense_shape_t = dense_shape.vec(); + const int64_t batch_size = (rank == 2) ? 1 : dense_shape_t(0); + const int64_t num_rows = (rank == 2) ? dense_shape_t(0) : dense_shape_t(1); + + if (batch_pointers.dtype() != DT_INT32) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: batch_pointers.dtype() = ", + DataTypeString(batch_pointers.dtype()), " != int32"); + } + if (batch_pointers.dims() != 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: batch_indices is not a vector, saw " + "shape: ", + batch_pointers.shape().DebugString()); + } + + // batch size checks + if (batch_size != batch_pointers.NumElements() - 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: dense_shape is ", + dense_shape.SummarizeValue(5), + " but batch pointers implies batch size is ", + batch_pointers.NumElements() - 1); + } + + if (row_pointers.dtype() != DT_INT32) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: row_pointers.dtype() = ", + DataTypeString(row_pointers.dtype()), " != int32"); + } + if (row_pointers.dims() != 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: row_pointers is not a vector, saw " + "shape: ", + row_pointers.shape().DebugString()); + } + if (row_pointers.dim_size(0) != batch_size * (num_rows + 1)) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: row_pointers should have size batch_size " + "* (num_rows + 1), saw shapes: ", + dense_shape.DebugString(), " vs. 
", + row_pointers.shape().DebugString()); + } + if (col_indices.dtype() != DT_INT32) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: col_indices.dtype() = ", + DataTypeString(col_indices.dtype()), " != int32"); + } + if (col_indices.dims() != 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: col_indices is not a vector, saw shape: ", + col_indices.shape().DebugString()); + } + if (values.dtype() != dtype) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: values.dtype() = ", + DataTypeString(values.dtype()), + " != dtype = ", DataTypeString(dtype)); + } + if (values.dims() != 1) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: values is not a vector, saw shape: ", + values.shape().DebugString()); + } + if (col_indices.dim_size(0) != values.dim_size(0)) { + return errors::InvalidArgument( + "CSRSparseMatrix::Validate: size(col_indices) = ", + col_indices.dim_size(0), " != size(values) = ", values.dim_size(0)); + } + return absl::OkStatus(); + } + + struct Metadata { + bool validated; + DataType dtype; + }; + Metadata metadata_; + Tensor dense_shape_; + Tensor batch_pointers_; + Tensor row_pointers_; + Tensor col_indices_; + Tensor values_; + std::unique_ptr::Vec> batch_pointers_vec_; + std::unique_ptr::Vec> row_pointers_vec_; + std::unique_ptr::Vec> col_indices_vec_; +}; + +// Call BinaryFunctor()(ctx, a, b, c) +// where T depends on a.dtype(). T will be one of: float, double, +// complex64, complex128. +template class BinaryFunctor> +absl::Status CSRSparseMatrixBinaryHelper(OpKernelContext* ctx, + const CSRSparseMatrix& a, + const CSRSparseMatrix& b, + CSRSparseMatrix* c) { + DataType dt = a.dtype(); + if (dt != b.dtype()) { + return errors::InvalidArgument( + "CSRSparseMatrixBinaryHelper: Inconsistent dtypes for input matrices, " + "a " + "dtype: ", + DataTypeString(dt), ", b dtype: ", DataTypeString(b.dtype())); + } + switch (dt) { + case DT_FLOAT: { + BinaryFunctor functor(ctx); + return functor(a, b, c); + } + case DT_DOUBLE: { + BinaryFunctor functor(ctx); + return functor(a, b, c); + } + case DT_COMPLEX64: { + BinaryFunctor functor(ctx); + return functor(a, b, c); + } + case DT_COMPLEX128: { + BinaryFunctor functor(ctx); + return functor(a, b, c); + } + default: + return errors::InvalidArgument( + "CSRSparseMatrixBinaryHelper: a.dtype (", DataTypeString(dt), + ") is not one of: float, double, complex64, complex128"); + } +} + +// Call UnaryFunctor()(ctx, a, b) +// where T depends on a.dtype(). T will be one of: float, double, +// complex64, complex128. 
+template <typename Device, template <typename, typename> class UnaryFunctor>
+absl::Status CSRSparseMatrixUnaryHelper(OpKernelContext* ctx,
+                                        const CSRSparseMatrix& a,
+                                        CSRSparseMatrix* b) {
+  DataType dt = a.dtype();
+  switch (dt) {
+    case DT_FLOAT: {
+      UnaryFunctor<Device, float> functor(ctx);
+      return functor(a, b);
+    }
+    case DT_DOUBLE: {
+      UnaryFunctor<Device, double> functor(ctx);
+      return functor(a, b);
+    }
+    case DT_COMPLEX64: {
+      UnaryFunctor<Device, complex64> functor(ctx);
+      return functor(a, b);
+    }
+    case DT_COMPLEX128: {
+      UnaryFunctor<Device, complex128> functor(ctx);
+      return functor(a, b);
+    }
+    default:
+      return errors::InvalidArgument(
+          "CSRSparseMatrixUnaryHelper: a.dtype (", DataTypeString(dt),
+          ") is not one of: float, double, complex64, complex128");
+  }
+}
+
+template <typename T>
+struct ConstCSRComponent {
+  TTypes<int32>::UnalignedConstVec row_ptr;
+  TTypes<int32>::UnalignedConstVec col_ind;
+  typename TTypes<T>::UnalignedConstVec values;
+  TTypes<int64_t>::ConstVec dense_shape_host;
+};
+
+template <typename T>
+struct CSRComponent {
+  TTypes<int32>::UnalignedVec row_ptr;
+  TTypes<int32>::UnalignedVec col_ind;
+  typename TTypes<T>::UnalignedVec values;
+  TTypes<int64_t>::Vec dense_shape_host;
+};
+
+template <typename T>
+absl::Status ExtractVariantFromInput(OpKernelContext* ctx, int index,
+                                     const T** value) {
+  const Tensor& input_t = ctx->input(index);
+  if (!TensorShapeUtils::IsScalar(input_t.shape())) {
+    return errors::InvalidArgument(
+        "Invalid input matrix: Shape must be rank 0 but is rank ",
+        input_t.dims());
+  }
+  const Variant& input_variant = input_t.scalar<Variant>()();
+  *value = input_variant.get<T>();
+  if (*value == nullptr) {
+    return errors::InvalidArgument("Could not retrieve Variant input ", index);
+  }
+  if (!(*value)->valid()) {
+    return errors::InvalidArgument("Variant input ", index, " is not valid.");
+  }
+  return absl::OkStatus();
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_SPARSE_MATRIX_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/transpose_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/transpose_op.h
new file mode 100644
index 00000000..2a8f0671
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/transpose_op.h
@@ -0,0 +1,73 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TRANSPOSE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_TRANSPOSE_OP_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" + +namespace tensorflow { +namespace functor { + +template +struct maybe_conj_inplace { + static void run(const Device& d, Tensor* t) {} +}; + +template +struct maybe_conj_inplace { + static void run(const Device& d, Tensor* t) { + functor::UnaryFunctor> conj; + conj(d, t->flat() /*out*/, + const_cast(t)->flat() /*in*/); + } +}; + +template +struct maybe_conj_inplace { + static void run(const Device& d, Tensor* t) { + functor::UnaryFunctor> conj; + conj(d, t->flat() /*out*/, + const_cast(t)->flat() /*in*/); + } +}; + +template +struct maybe_conj { + static void run(const Device& d, const Tensor& in, Tensor* out) { *out = in; } +}; + +template +struct maybe_conj { + static void run(const Device& d, const Tensor& in, Tensor* out) { + functor::UnaryFunctor> conj; + conj(d, out->flat() /*out*/, in.flat() /*in*/); + } +}; + +template +struct maybe_conj { + static void run(const Device& d, const Tensor& in, Tensor* out) { + functor::UnaryFunctor> conj; + conj(d, out->flat() /*out*/, in.flat() /*in*/); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_TRANSPOSE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/zeros_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/zeros_op.h new file mode 100644 index 00000000..2a86089e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse/zeros_op.h @@ -0,0 +1,85 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_ZEROS_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_ZEROS_OP_H_ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#endif + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/sparse/sparse_matrix.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +struct CSRSparseMatrixZeros { + absl::Status operator()(OpKernelContext* c, DataType dtype, + const Tensor& dense_shape_t, + CSRSparseMatrix* matrix) { + auto dense_shape = dense_shape_t.vec(); + const int rank = dense_shape.size(); + if (!(rank == 2 || rank == 3)) { + return errors::InvalidArgument("sparse tensor must have rank == 2 or 3; ", + "but dense shape has ", rank, " entries"); + } + const int64_t batch_size = (rank == 2) ? 1 : dense_shape(0); + const int64_t rows = dense_shape((rank == 2) ? 0 : 1); + + Tensor batch_ptr_t(cpu_allocator(), DT_INT32, + TensorShape({batch_size + 1})); + batch_ptr_t.vec().setZero(); // On host. + + Allocator* allocator = c->device()->GetAllocator(AllocatorAttributes()); + // An all-zeros CSR matrix is composed of an empty set of column + // indices, an empty set of values, and a vector of all zero row + // pointers. The length of the row pointers vector is #rows + 1. + // Each row pointer is just an offset into the cols and + // values vectors, and those are empty, all coefficients are zero. + Tensor csr_row_ptr_t; + Tensor coo_col_ind_t(allocator, DT_INT32, TensorShape({0})); + Tensor csr_values_t(allocator, dtype, TensorShape({0})); + const Device& d = c->eigen_device(); + functor::SetZeroFunctor set_zero; + TF_RETURN_IF_ERROR(c->allocate_temp( + DT_INT32, TensorShape({batch_size * (rows + 1)}), &csr_row_ptr_t)); + set_zero(d, csr_row_ptr_t.flat()); + + TF_RETURN_IF_ERROR(CSRSparseMatrix::CreateCSRSparseMatrix( + dtype, dense_shape_t, batch_ptr_t, csr_row_ptr_t, coo_col_ind_t, + csr_values_t, matrix)); + + return absl::OkStatus(); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_ZEROS_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_concat_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_concat_op.h new file mode 100644 index 00000000..c13ae502 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_concat_op.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_CONCAT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_CONCAT_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct SparseConcatFunctor {
+  void operator()(OpKernelContext* context, const OpInputList& inds,
+                  const OpInputList& vals, const OpInputList& shapes,
+                  int concat_dim);
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_CONCAT_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_conditional_accumulator.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_conditional_accumulator.h
new file mode 100644
index 00000000..9d45d52b
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_conditional_accumulator.h
@@ -0,0 +1,438 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
+
+#include "tensorflow/core/kernels/typed_conditional_accumulator_base.h"
+
+namespace tensorflow {
+
+/**
+ * An aggregation object for adding sparse gradients, represented as a tuple of
+ * indices, values, and a (possibly empty) shape.
+ *
+ * The two main methods of this class are TryApplyGrad and TryTakeGrad.
+ *
+ * TryApplyGrad tries to add a gradient to the accumulator. The attempt is
+ * successful if local_step >= global_step, i.e., if the gradient is not stale,
+ * having been computed using up-to-date information. Otherwise, the gradient
+ * is silently dropped.
+ *
+ * TryTakeGrad logs an attempt to read the average gradient. The attempt is
+ * blocked until the number of gradients accumulated (via TryApplyGrad) equals
+ * or exceeds the number requested by TryTakeGrad.
+ * Once this condition is satisfied, the following actions are taken:
+ * (1) the value of the average gradient is returned
+ * (2) the count of accumulated gradients is reset to 0
+ * (3) the internal global_step value (current_global_step_) is incremented by 1
+ *
+ * SparseConditionalAccumulator is the datatype-dependent templated sub-class of
+ * ConditionalAccumulatorBase. It implements the virtual arithmetic methods used
+ * for aggregating, averaging, allocating, and returning indexed slices.
+ */ +template +class SparseConditionalAccumulator + : public TypedConditionalAccumulatorBase< + std::tuple> { + public: + SparseConditionalAccumulator(const DataType& dtype, + const PartialTensorShape& shape, + const string& name, const string& reduction_type) + : TypedConditionalAccumulatorBase< + std::tuple>( + dtype, shape, name, reduction_type), + accum_val_(std::make_unique()) {} + + protected: + std::unique_ptr> accum_idx_vec_; + std::unique_ptr> count_element_; + + std::unique_ptr accum_val_; + + typedef Eigen::TensorMap, + Eigen::Unaligned> + SliceT; + typedef Eigen::TensorMap, + Eigen::Unaligned> + SliceConstT; + + absl::Status ValidateShape( + std::tuple* tensor, + bool has_known_shape) TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + const Tensor* tensor_idx = std::get<0>(*tensor); + const Tensor* tensor_val = std::get<1>(*tensor); + const Tensor* tensor_shape = std::get<2>(*tensor); + int64_t grad_val_dims = tensor_val->dims(); + int64_t grad_dims = grad_val_dims; + + // Compare with provided shape + if (has_known_shape) { + if (shape_.dims() > tensor_shape->NumElements()) { + return errors::InvalidArgument( + "Shape mismatch: expected shape rank at least ", shape_.dims(), + ", got ", tensor_shape->NumElements()); + } + const auto tensor_shape_flat = tensor_shape->flat(); + for (int64_t i = 0; i < shape_.dims(); i++) { + if (shape_.dim_size(i) != -1 && + shape_.dim_size(i) != tensor_shape_flat(i)) { + return errors::InvalidArgument("Shape mismatch: expected shape dim ", + i, " to be ", shape_.dim_size(i), + ", got ", tensor_shape_flat(i)); + } + } + } + // Check that indices are within limits + if (shape_.dims() > 0 && shape_.dim_size(0) != -1 && + tensor_idx->dims() > 0) { + for (int64_t i = 0; i < tensor_idx->dim_size(0); i++) { + if (tensor_idx->vec()(i) >= shape_.dim_size(0)) { + return errors::InvalidArgument( + "Shape mismatch: index of slice ", i, " exceeded limits of shape", + "; index is ", tensor_idx->vec()(i), " exceeded ", + shape_.dim_size(0)); + } + } + } + + // Check values compatibility with accumulated gradient if available + if (counter_ > 0) { + int64_t accum_val_dims = accum_val_->dims(); + if (accum_val_dims != grad_val_dims) { + return errors::InvalidArgument("Shape mismatch: expected values rank ", + accum_val_dims, ", got ", grad_val_dims); + } + for (int64_t i = 1; i < accum_val_dims; i++) { + if (accum_val_->dim_size(i) != tensor_val->dim_size(i)) { + return errors::InvalidArgument("Shape mismatch: expected values dim ", + i, " to be ", accum_val_->dim_size(i), + ", got ", tensor_val->dim_size(i)); + } + } + } else { + // If there are no accumulated gradients, check against shape_ + if (shape_.dims() > grad_dims) { + return errors::InvalidArgument( + "Shape mismatch: expected values rank at least ", shape_.dims(), + ", got ", grad_dims); + } + // Check that values have correct dimensions + for (int64_t i = 1; i < shape_.dims(); i++) { + if (shape_.dim_size(i) != -1 && + shape_.dim_size(i) != tensor_val->dim_size(i)) { + return errors::InvalidArgument("Shape mismatch: expected values dim ", + i, " to be ", shape_.dim_size(i), + ", got ", tensor_val->dim_size(i)); + } + } + } + + return absl::OkStatus(); + } + + void AllocateAndAssignToAccumGradFunction( + OpKernelContext* ctx, + std::tuple* grad) override { + const Tensor* grad_idx = std::get<0>(*grad); + const Tensor* grad_val = std::get<1>(*grad); + + const int64_t nnz = grad_idx->dim_size(0); + + // Assign indices + accum_idx_vec_ = std::make_unique>(); + accum_idx_vec_->reserve(nnz); + for (int i = 0; i 
< nnz; i++) { + accum_idx_vec_->push_back(grad_idx->vec()(i)); + } + + // Assign values to accum_val_tensor + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(dtype_, grad_val->shape(), accum_val_.get())); + accum_val_->flat().device(ctx->template eigen_device()) = + grad_val->flat(); + + // Assign count_element_ + count_element_ = std::make_unique>(nnz, 1); + + // Do not need shape; Assume that the op has checked that the shapes match, + // so grad's shape == shape_ + } + + void AddToAccumGradFunction( + OpKernelContext* ctx, + std::tuple* grad) override { + // Modeled after third_party/tensorflow/core/kernels/sparse_add_op + + const Tensor* grad_idx = std::get<0>(*grad); + const Tensor* grad_val = std::get<1>(*grad); + + const int64_t accum_nnz = accum_idx_vec_->size(); + const int64_t grad_nnz = grad_idx->dim_size(0); + + // Source enumerates the origin of a non-zero element: whether it is from + // the new gradient, the accumulated gradient, or the sum of both. + enum Source { from_accum, from_grad, from_accum_and_grad }; + + // (1) do a pass over inputs, and append values and indices to vectors + std::vector> entries_to_copy; + entries_to_copy.reserve(accum_nnz + grad_nnz); + + // Pass over all non-zero elements of both the gradient and the accumulated + // value, to identify where each non-zero element of the sum comes from. + // The input and output indexed slices are assumed to be ordered along + // increasing dimension number. + int64_t i = 0, j = 0; + int64_t sum_nnz = 0; + while (i < accum_nnz && j < grad_nnz) { + sum_nnz++; + switch (cmp(accum_idx_vec_.get(), grad_idx, i, j)) { + case -1: + entries_to_copy.emplace_back(from_accum, i, -1); + ++i; + break; + case 0: + entries_to_copy.emplace_back(from_accum_and_grad, i, j); + ++i; + ++j; + break; + case 1: + entries_to_copy.emplace_back(from_grad, -1, j); + ++j; + break; + } + } + + // Handle leftovers + while (i < accum_nnz) { + sum_nnz++; + entries_to_copy.emplace_back(from_accum, i, -1); + ++i; + } + while (j < grad_nnz) { + sum_nnz++; + entries_to_copy.emplace_back(from_grad, -1, j); + ++j; + } + + // (2) Copy or sum the non-zero elements into sum_indices and sum_tensor + std::vector* sum_indices_vec = new std::vector(); + sum_indices_vec->reserve(sum_nnz); + + std::vector* sum_counts = new std::vector(); + sum_counts->reserve(sum_nnz); + + Tensor* sum_tensor = new Tensor(); + + TensorShape sum_shape = grad_val->shape(); + sum_shape.set_dim(0, sum_nnz); + + OP_REQUIRES_OK(ctx, ctx->allocate_temp(dtype_, sum_shape, sum_tensor)); + auto sum_flat = sum_tensor->flat_outer_dims(); + auto accum_flat = accum_val_->flat_outer_dims(); + auto grad_flat = grad_val->flat_outer_dims(); + + const int64_t num_col = grad_flat.dimension(1); + + Eigen::DSizes slice_shape(num_col); + + for (i = 0; i < sum_nnz; ++i) { + const Source src = std::get<0>(entries_to_copy[i]); + const int64_t idx_a = std::get<1>(entries_to_copy[i]); + const int64_t idx_b = std::get<2>(entries_to_copy[i]); + T* sum_slice_ptr = &sum_flat(i, 0); + SliceT sum_slice(sum_slice_ptr, slice_shape); + if (src == from_accum) { + // Element comes from accumulator; directly copy data structures over + sum_indices_vec->push_back(accum_idx_vec_->at(idx_a)); + T* accum_slice_ptr = &accum_flat(idx_a, 0); + SliceT accum_slice(accum_slice_ptr, slice_shape); + sum_slice = accum_slice; + sum_counts->push_back(count_element_->at(idx_a)); + } else if (src == from_accum_and_grad) { + // Element is a sum of accumulated value and new gradient; + // compute sum here + 
sum_indices_vec->push_back(accum_idx_vec_->at(idx_a)); + const T* grad_slice_ptr = &grad_flat(idx_b, 0); + SliceConstT grad_slice(grad_slice_ptr, slice_shape); + T* accum_slice_ptr = &accum_flat(idx_a, 0); + SliceT accum_slice(accum_slice_ptr, slice_shape); + sum_slice = grad_slice + accum_slice; + sum_counts->push_back(count_element_->at(idx_a) + 1); + } else if (src == from_grad) { + // Element comes from new gradient; make a copy of indices and values + sum_indices_vec->push_back(grad_idx->vec()(idx_b)); + const T* grad_slice_ptr = &grad_flat(idx_b, 0); + SliceConstT grad_slice(grad_slice_ptr, slice_shape); + sum_slice = grad_slice; + sum_counts->push_back(1); + } + } + + // (3) Keep output, i.e., switch pointers to point to new data structures + // representing the sum + // Indices + accum_idx_vec_.reset(sum_indices_vec); + // Values + accum_val_.reset(sum_tensor); + // Counts + count_element_.reset(sum_counts); + + // No need to copy shape, since shape remains the same after sum. + } + + void DivideAccumGradByCounter(OpKernelContext* ctx) override + TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + const int64_t nnz = count_element_->size(); + auto accum_flat = accum_val_->flat_outer_dims(); + std::vector count_typet; + std::transform(count_element_->begin(), count_element_->end(), + std::back_inserter(count_typet), + TypeConverter::ConvertUToT); + + // Option 1: divide all by counter + /* + std::transform( + &accum_flat(0,0), &accum_flat(nnz,0), &accum_flat(0,0), + std::bind2nd(std::divides(), + TypeConverter::ConvertUToT(this->counter_))); + */ + + // Option 2: average element-wise + Eigen::DSizes slice_shape(accum_flat.dimension(1)); + for (int64_t i = 0; i < nnz; i++) { + T* accum_slice_ptr = &accum_flat(i, 0); + SliceT accum_slice(accum_slice_ptr, slice_shape); + accum_slice.device(ctx->template eigen_device()) = + accum_slice / count_typet[i]; + } + } + + bool SetOutput(OpKernelContext* ctx) override { + bool is_successful = true; + if (is_successful) is_successful = ReturnIdxTensor(ctx); + if (is_successful) is_successful = ReturnValTensor(ctx); + if (is_successful) is_successful = ReturnShapeTensor(ctx); + return is_successful; + } + + bool GetAndValidateTensorInputForApplyGrad( + OpKernelContext* ctx, + std::tuple** tensor) override + TF_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) { + // TODO(xinghao, jmchen): The roundabout way of getting attr from + // OpKernelContext (instead of OpKernelConstruction) is a hack, and should + // be fixed if it affects efficiency. 
+ bool has_known_shape = false; + OP_REQUIRES_OK_BOOLEAN( + ctx, GetNodeAttr(ctx->op_kernel().def(), "has_known_shape", + &has_known_shape)); + + // Get input gradient tensors + const Tensor* grad_idx_tensor; + OP_REQUIRES_OK_BOOLEAN(ctx, + ctx->input("gradient_indices", &grad_idx_tensor)); + const Tensor* grad_val_tensor; + OP_REQUIRES_OK_BOOLEAN(ctx, + ctx->input("gradient_values", &grad_val_tensor)); + const Tensor* grad_shape_tensor = nullptr; + if (has_known_shape) { + OP_REQUIRES_OK_BOOLEAN(ctx, + ctx->input("gradient_shape", &grad_shape_tensor)); + } + + // Checks + OP_REQUIRES_BOOLEAN( + ctx, TensorShapeUtils::IsVector(grad_idx_tensor->shape()), + errors::InvalidArgument( + "Input indices should be vector but received shape: ", + grad_idx_tensor->shape().DebugString())); + const int64_t nnz = grad_idx_tensor->dim_size(0); + OP_REQUIRES_BOOLEAN( + ctx, grad_val_tensor->dims() > 0, + errors::InvalidArgument("Values cannot be 0-dimensional.")); + OP_REQUIRES_BOOLEAN(ctx, grad_val_tensor->dim_size(0) == nnz, + errors::InvalidArgument("Expected ", nnz, + " non-empty input values, got ", + grad_val_tensor->dim_size(0))); + + *tensor = new std::tuple( + grad_idx_tensor, grad_val_tensor, grad_shape_tensor); + + OP_REQUIRES_OK_BOOLEAN(ctx, this->ValidateShape(*tensor, has_known_shape)); + + return true; + } + + void CleanUpGradTensor(std::tuple* tensor) override { + if (tensor != nullptr) delete tensor; + } + + private: + inline int cmp(std::vector* a_idx, const Tensor* b_idx, + const int64_t a_row, const int64_t b_row) { + const int64_t a = a_idx->at(a_row); + const int64_t b = b_idx->vec()(b_row); + if (a < b) { + return -1; + } else if (a > b) { + return 1; + } + return 0; + } + + inline bool ReturnIdxTensor(OpKernelContext* ctx) { + Tensor* idx_tensor; + const int64_t nnz = accum_idx_vec_->size(); + OP_REQUIRES_OK_BOOLEAN(ctx, ctx->allocate_output(0, {nnz}, &idx_tensor)); + // If allocate_output fails, OP_REQUIRES_OK_BOOLEAN will short-circuit + // the remaining code and just return false + auto idx_tensor_vec = idx_tensor->vec(); + for (int i = 0; i < nnz; ++i) { + idx_tensor_vec(i) = accum_idx_vec_->at(i); + } + return true; + } + + inline bool ReturnValTensor(OpKernelContext* ctx) { + ctx->set_output(1, *accum_val_); + return true; + } + + inline bool ReturnShapeTensor(OpKernelContext* ctx) { + int64_t accum_val_dims = accum_val_->dims(); + Tensor* shape_tensor; + OP_REQUIRES_OK_BOOLEAN( + ctx, ctx->allocate_output(2, {accum_val_dims}, &shape_tensor)); + // If allocate_output fails, OP_REQUIRES_OK_BOOLEAN will short-circuit + // the remaining code and just return false + + // First dim of shape is defined by shape_, others by accum_val_->shape + shape_tensor->flat()(0) = + (shape_.dims() > 0) ? shape_.dim_size(0) : -1; + for (int64_t i = 1; i < accum_val_dims; i++) { + shape_tensor->flat()(i) = accum_val_->dim_size(i); + } + return true; + } + + SparseConditionalAccumulator(const SparseConditionalAccumulator&) = delete; + void operator=(const SparseConditionalAccumulator&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_matmul_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_matmul_op.h new file mode 100644 index 00000000..589a65af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_matmul_op.h @@ -0,0 +1,501 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_ + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/types.h" + +#if defined(PLATFORM_WINDOWS) +#include "xla/tsl/platform/windows/intrinsics_port.h" +#endif + +namespace Eigen { +namespace internal { + +// Return the float representation of the bfloat16 value +// in the lower 16-bits of input +template +EIGEN_DEVICE_FUNC inline Packet pexpand_bf16_l(const Packet& from) { + tensorflow::uint32 tmp; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + tmp = (reinterpret_cast(from)) & 0xffff0000; +#else + tmp = (reinterpret_cast(from) << 16) & 0xffff0000; +#endif + return reinterpret_cast(tmp); +} + +// Return the float representation of the bfloat16 value +// in the upper 16-bits of input +template +EIGEN_DEVICE_FUNC inline Packet pexpand_bf16_u(const Packet& from) { + tensorflow::uint32 tmp; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + tmp = (reinterpret_cast(from) << 16) & 0xffff0000; +#else + tmp = (reinterpret_cast(from)) & 0xffff0000; +#endif + return reinterpret_cast(tmp); +} + +// Specialization non-scalar version on non-sse. 
+// Enable vectorization on z13 and higher +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \ + defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR) +template +EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) { + float r[4]; + tensorflow::uint32 p[4]; + pstoreu(r, from); + tensorflow::uint32* ir = reinterpret_cast(r); + p[0] = (ir[0] << 16) & 0xffff0000; + p[1] = ir[0] & 0xffff0000; + p[2] = (ir[1] << 16) & 0xffff0000; + p[3] = ir[1] & 0xffff0000; + return ploadu(reinterpret_cast(p)); +} + +template +EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) { + float r[4]; + tensorflow::uint32 p[4]; + pstoreu(r, from); + tensorflow::uint32* ir = reinterpret_cast(r); + p[0] = (ir[2] << 16) & 0xffff0000; + p[1] = ir[2] & 0xffff0000; + p[2] = (ir[3] << 16) & 0xffff0000; + p[3] = ir[3] & 0xffff0000; + return ploadu(reinterpret_cast(p)); +} +#endif + +template +EIGEN_DEVICE_FUNC inline Packet pinterleave4x64(const Packet& from) { + return from; +} + +template +EIGEN_DEVICE_FUNC inline Packet pbroadcast_first(const Packet& a) { + return a; +} + +template +EIGEN_DEVICE_FUNC inline Packet pbroadcast_second(const Packet& a) { + assert(false && "Not applicable to Scalar Values"); + return a; +} + +template +EIGEN_DEVICE_FUNC inline Packet pbroadcast_third(const Packet& a) { + assert(false && "Not applicable to Scalar Values"); + return a; +} + +template +EIGEN_DEVICE_FUNC inline Packet pbroadcast_fourth(const Packet& a) { + assert(false && "Not applicable to Scalar Values"); + return a; +} + +template +EIGEN_DEVICE_FUNC inline Packet pload4bf16( + const typename unpacket_traits::type* from) { + assert(false && "Not applicable to Scalar Values"); + return Packet(); +} + +template +EIGEN_DEVICE_FUNC inline Packet pload2bf16( + const typename unpacket_traits::type* from) { + assert(false && "Not applicable to Scalar Values"); + return Packet(); +} + +// Specialization for pload4bf16 and pload2bf16 for non-sse. +// Enable vectorization on z13 and higher. 
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) || \ + defined(EIGEN_VECTORIZE_NEON) || defined(EIGEN_VECTORIZE_ZVECTOR) +template <> +EIGEN_STRONG_INLINE Packet4f pload4bf16(const float* from) { + tensorflow::uint32 p[4]; + const tensorflow::uint32* ir = + reinterpret_cast(from); + p[0] = (ir[0] << 16) & 0xffff0000; + p[1] = ir[0] & 0xffff0000; + p[2] = (ir[1] << 16) & 0xffff0000; + p[3] = ir[1] & 0xffff0000; + return ploadu(reinterpret_cast(p)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload2bf16(const float* from) { + tensorflow::uint32 p[4]; + const tensorflow::uint32* ir = + reinterpret_cast(from); + p[0] = (ir[0] << 16) & 0xffff0000; + p[1] = ir[0] & 0xffff0000; + p[2] = (ir[0] << 16) & 0xffff0000; + p[3] = ir[0] & 0xffff0000; + return ploadu(reinterpret_cast(p)); +} +#endif + +#if defined(EIGEN_VECTORIZE_NEON) +// Return a packet with the first value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { + return pset1(pfirst(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f pbroadcast_first(const Packet2f& a) { + return pset1(pfirst(a)); +} + +// Return a packet with the second value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { + return pset1(vgetq_lane_f32(a, 1)); +} +template <> +EIGEN_STRONG_INLINE Packet2f pbroadcast_second(const Packet2f& a) { + return pset1(vget_lane_f32(a, 1)); +} + +// Return a packet with the third value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { + return pset1(vgetq_lane_f32(a, 2)); +} + +// Return a packet with the fourth value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { + return pset1(vgetq_lane_f32(a, 3)); +} +#endif + +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) +// Return a packet with the first value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { + return vec_splat(a, 0); +} + +// Return a packet with the second value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { + return vec_splat(a, 1); +} + +// Return a packet with the third value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { + return vec_splat(a, 2); +} + +// Return a packet with the fourth value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { + return vec_splat(a, 3); +} +#endif + +#ifdef EIGEN_VECTORIZE_SSE2 +// For PacketSize of 4 floats the Packet is not modified +template <> +EIGEN_STRONG_INLINE Packet4f pinterleave4x64(const Packet4f& from) { + return from; +} + +// Return a Packet with 4 floats loaded from 4 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet4f pload4bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); + return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); +} + +// Return a Packet with 2 floats loaded from 2 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet4f pload2bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); + return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); +} + +// Return a Packet with 4 floats expanded from 4 bfloat16 values 
+// in the lower half of the 128-bit lane +template +EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castps_si128(from); + return _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)); +} + +// Return a Packet with 4 floats expanded from 4 bfloat16 values +// in the upper half of the 128-bit lane +template +EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castps_si128(from); + return _mm_castsi128_ps(_mm_unpackhi_epi16(zero, tmp)); +} + +// Return a packet with the first value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { + return _mm_set1_ps(pfirst(a)); +} + +// Return a packet with the second value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { + return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 1))); +} + +// Return a packet with the third value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { + return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 2))); +} + +// Return a packet with the fourth value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { + return _mm_set1_ps(_mm_cvtss_f32(_mm_shuffle_ps(a, a, 3))); +} + +#endif + +#ifdef EIGEN_VECTORIZE_AVX512 +template <> +EIGEN_STRONG_INLINE Packet16f +pbroadcast_first(const Packet16f& a_in) { + Packet4f a = _mm512_castps512_ps128(a_in); + return _mm512_broadcastss_ps(a); +} +template <> +EIGEN_STRONG_INLINE Packet16f +pbroadcast_second(const Packet16f& a_in) { + Packet4f a = _mm512_castps512_ps128(a_in); + return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); +} +template <> +EIGEN_STRONG_INLINE Packet16f +pbroadcast_third(const Packet16f& a_in) { + Packet4f a = _mm512_castps512_ps128(a_in); + return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); +} +template <> +EIGEN_STRONG_INLINE Packet16f +pbroadcast_fourth(const Packet16f& a_in) { + Packet4f a = _mm512_castps512_ps128(a_in); + return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); +} +template <> +EIGEN_STRONG_INLINE Packet8d pbroadcast_first(const Packet8d& a_in) { + Packet2d a = _mm512_castpd512_pd128(a_in); + return _mm512_broadcastsd_pd(a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pbroadcast_second(const Packet8d& a_in) { + Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3); + return _mm512_broadcastsd_pd(a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pbroadcast_third(const Packet8d& a_in) { + Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1); + return _mm512_broadcastsd_pd(a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth(const Packet8d& a_in) { + Packet2d a = + _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3); + return _mm512_broadcastsd_pd(a); +} +template <> +EIGEN_STRONG_INLINE Packet16i +pbroadcast_first(const Packet16i& a_in) { + Packet4i a = _mm512_castsi512_si128(a_in); + return _mm512_broadcastd_epi32(a); +} +template <> +EIGEN_STRONG_INLINE Packet16i +pbroadcast_second(const Packet16i& a_in) { + Packet4i a = _mm512_castsi512_si128(a_in); + return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1))); +} +template <> +EIGEN_STRONG_INLINE Packet16i +pbroadcast_third(const Packet16i& a_in) { + 
Packet4i a = _mm512_castsi512_si128(a_in); + return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2))); +} +template <> +EIGEN_STRONG_INLINE Packet16i +pbroadcast_fourth(const Packet16i& a_in) { + Packet4i a = _mm512_castsi512_si128(a_in); + return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3))); +} +#endif + +#ifdef EIGEN_VECTORIZE_AVX +// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords +template <> +EIGEN_STRONG_INLINE Packet8f pinterleave4x64(const Packet8f& from) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from), + _MM_SHUFFLE(3, 1, 2, 0))); +#else + auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2); + auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3); + auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4); + auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5); + auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4); + tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5); + tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2); + tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3); + return _mm256_castsi256_ps(tmp5); +#endif +} +// Return a Packet with 4 floats loaded from 4 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet8f pload4bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); + return _mm256_castps128_ps256( + _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); +} +// Return a Packet with 2 floats loaded from 2 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet8f pload2bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); + return _mm256_castps128_ps256( + _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); +} + +#ifdef EIGEN_VECTORIZE_AVX512 +// Return a Packet with 4 floats loaded from 4 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet16f pload4bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from)); + return _mm512_castps128_ps512( + _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); +} +// Return a Packet with 2 floats loaded from 2 bfloat16 values +template <> +EIGEN_STRONG_INLINE Packet16f pload2bf16(const float* from) { + __m128i zero = _mm_setzero_si128(); + __m128i tmp = _mm_castps_si128(_mm_load_ps1(from)); + return _mm512_castps128_ps512( + _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp))); +} +#endif + +// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half +// of the 128-bit lane +template +EIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_l(const Packet8f& from) { +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i zero = _mm256_setzero_si256(); + __m256i tmp = _mm256_castps_si256(from); + return _mm256_castsi256_ps(_mm256_unpacklo_epi16(zero, tmp)); +#else + __m128i zero = _mm_setzero_si128(); + __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0)); + __m128i res_l = _mm_unpacklo_epi16(zero, low); + __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1)); + __m128i res_h = _mm_unpacklo_epi16(zero, high); + __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l)); + res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1); + return res; +#endif +} + +// For each 128-bit lane convert 4 bfloat to 4 float values from the upper half +// of the 128-bit lane +template +EIGEN_DEVICE_FUNC inline Packet8f pexpand_bf16_u(const Packet8f& from) { +#ifdef 
EIGEN_VECTORIZE_AVX2 + __m256i zero = _mm256_setzero_si256(); + __m256i tmp = _mm256_castps_si256(from); + return _mm256_castsi256_ps(_mm256_unpackhi_epi16(zero, tmp)); +#else + __m128i zero = _mm_setzero_si128(); + __m128i low = _mm_castps_si128(_mm256_extractf128_ps(from, 0)); + __m128i res_l = _mm_unpackhi_epi16(zero, low); + __m128i high = _mm_castps_si128(_mm256_extractf128_ps(from, 1)); + __m128i res_h = _mm_unpackhi_epi16(zero, high); + __m256 res = _mm256_castps128_ps256(_mm_castsi128_ps(res_l)); + res = _mm256_insertf128_ps(res, _mm_castsi128_ps(res_h), 1); + return res; +#endif +} + +// Return a packet with the first value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet8f pbroadcast_first(const Packet8f& a) { + return _mm256_set1_ps(pfirst(a)); +} + +// Return a packet with the second value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet8f pbroadcast_second(const Packet8f& a) { + return _mm256_set1_ps( + _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 1)))); +} + +// Return a packet with the third value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet8f pbroadcast_third(const Packet8f& a) { + return _mm256_set1_ps( + _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 2)))); +} + +// Return a packet with the fourth value of the input Packet replicated +template <> +EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth(const Packet8f& a) { + return _mm256_set1_ps( + _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3)))); +} + +#endif + +#ifdef EIGEN_VECTORIZE_AVX512 + +template +EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) { + return _mm512_castsi512_ps(_mm512_slli_epi32( + _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))), + 16)); +} + +template +EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) { + Packet16i tmp = _mm512_castps_si512(from); + Packet16i tmp2 = _mm512_alignr_epi32(tmp, tmp, 8); + return _mm512_castsi512_ps(_mm512_slli_epi32( + _mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp2)), 16)); +} + +#endif +} // namespace internal +} // namespace Eigen +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_reorder_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_reorder_op.h new file mode 100644 index 00000000..0af44c55 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_reorder_op.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_REORDER_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_REORDER_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct SparseReorderFunctor {
+  void operator()(OpKernelContext* context, const Tensor& input_ind,
+                  const Tensor& input_val, const Tensor& input_shape_in);
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_REORDER_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_grad_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_grad_op.h
new file mode 100644
index 00000000..6358ed02
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_grad_op.h
@@ -0,0 +1,40 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_GRAD_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_GRAD_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct SparseSliceGradFunctor {
+  void operator()(OpKernelContext* ctx,
+                  typename TTypes<T>::ConstFlat backprop_val_grad,
+                  typename TTypes<int64_t>::ConstMatrix input_indices_mat,
+                  typename TTypes<int64_t>::ConstFlat input_start_flat,
+                  typename TTypes<int64_t>::ConstMatrix output_indices_mat,
+                  typename TTypes<T>::Flat val_grad) const;
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_GRAD_OP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_op.h
new file mode 100644
index 00000000..62e0b0cc
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_slice_op.h
@@ -0,0 +1,39 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +namespace functor { + +template +struct SparseSliceFunctor { + void operator()(OpKernelContext* context, const Tensor& input_indices, + const Tensor& input_values, const Tensor& input_shape, + const Tensor& input_start, const Tensor& input_size, + typename AsyncOpKernel::DoneCallback done = nullptr) const; +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_SLICE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_split_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_split_op.h new file mode 100644 index 00000000..7fba47a4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_split_op.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_SPLIT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_SPLIT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace functor { + +template +struct SparseSplitFunctor { + void operator()(OpKernelContext* context, const Tensor& input_indices, + const Tensor& input_values, const TensorShape& dense_shape, + const int64_t axis, const int num_split, + typename AsyncOpKernel::DoneCallback done = nullptr); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_SPLIT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_add_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_add_op.h new file mode 100644 index 00000000..44a85785 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_add_op.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/scatter_functor.h" + +namespace tensorflow { +namespace functor { + +// TODO(zongheng): this should be a general functor that powers SparseAdd and +// ScatterNd ops. It should be moved to its own head file, once the other ops +// are implemented. +template +struct ScatterNdFunctor { + // Returns -1 on success or a nonnegative i s.t. indices[i] is a bad index. + Index operator()(const Device& d, typename TTypes::ConstMatrix indices, + typename TTypes::ConstFlat updates, + typename TTypes::Tensor out); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h new file mode 100644 index 00000000..fef151ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h @@ -0,0 +1,85 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +namespace functor { + +template +struct SparseTensorDenseMatMulFunctor { + static EIGEN_ALWAYS_INLINE absl::Status Compute( + OpKernelContext* ctx, typename TTypes::Matrix out, + typename TTypes::ConstMatrix a_indices, + typename TTypes::ConstVec a_values, typename TTypes::ConstMatrix b); +}; + +template +class MaybeAdjoint; + +template +class MaybeAdjoint { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( + const typename MATRIX::Index i, const typename MATRIX::Index j) const { + return m_(i, j); + } + + private: + const MATRIX m_; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T MaybeConj(T v) { + return Eigen::numext::conj(v); +} + +template +class MaybeAdjoint { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( + const typename MATRIX::Index i, const typename MATRIX::Index j) const { + return Eigen::numext::conj(m_(j, i)); + } + + private: + const MATRIX m_; +}; + +template +struct SumType { + using type = T; +}; + +template <> +struct SumType { + using type = float; // Use fp32 accumulator for fp16 input values +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_to_dense_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_to_dense_op_gpu.h new file mode 100644 index 00000000..c19ffa72 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_to_dense_op_gpu.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TO_DENSE_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_TO_DENSE_OP_GPU_H_ + +#include "xla/stream_executor/device_memory.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace functor { +template +struct LaunchSparseToDense { + void operator()(OpKernelContext* c, AsyncOpKernel::DoneCallback done, + AsyncOpKernel* op, bool validate_indices, + const Tensor& indices, const Tensor& values, + const Tensor& shape, const T default_value, Tensor* dense); +}; + +} // namespace functor + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_TO_DENSE_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_utils.h new file mode 100644 index 00000000..8f86b518 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_utils.h @@ -0,0 +1,88 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helpers for writing OpKernels for sparse tensors. +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace sparse_utils { + +// Find the index i of the first element for which +// indices_mat(sparse_index_begin, 0) < indices_mat(i, 0). +// The search is conducted in the open interval +// [sparse_index_begin, indices_mat.dimension(0)) and when no such i is found, +// indices_mat.dimension(0) is returned. +// indices_mat(k, 0) should be non-decreasing over the interval +// [begin, indices_mat.dimension(0)). +// Requires 0 <= sparse_index_begin < indices_mat.dimension(0). +template +Tindices FindNextDenseRowStartIndex( + const Tindices sparse_index_begin, + const typename TTypes::ConstMatrix& indices_mat); + +// Returns the vector v of indices in indices_mat at which new dense matrix +// rows begin. +// v.front() = 0, v.back() = indices_mat.dimension(0), and for i > 0, +// v[i] - v[i-1] is the length of the ith dense row in indices_mat. +// *contains_empty_rows = true if and only if indices_mat contains empty rows +// (rows without values) between row 0 and the last row. +template +std::vector GetStartIndicesOfEachDenseRow( + const typename TTypes::ConstMatrix& indices_mat, + bool* contains_empty_rows); + +// Converts tensor.vec to an std::vector object, appends +// the value num_nonzero_entries_in_sparse_mat, and returns the result. 
+template +std::vector ParseRowStartIndices( + const tensorflow::Tensor& tensor, + const Tindices num_nonzero_entries_in_sparse_mat); + +// Returns true if and only if the sparse matrix indices_mat whose row start +// indices are represented by row_start_indices has empty dense rows +// (between its first and last dense rows). +// This function satisfies the identity row_start_indices == +// GetStartIndicesOfEachDenseRow(indices_mat, &return_value). +template +bool ContainsEmptyRows(const std::vector& row_start_indices); + +// Methods for validating sparse indices. +enum class IndexValidation { + kNone, // Indices are not used by the op, or are not directly accessible + // (e.g. on GPU). + kOrdered, // Indices must be unique, in lexicographical order, and within + // safe bounds. + kUnordered // Indices must be within safe bounds, but may repeat or appear + // out-of-order. +}; + +// Validates the three component tensors of a sparse tensor have the proper +// shapes. Also validates index values according to the method supplied. +template +absl::Status ValidateSparseTensor(const Tensor& indices, const Tensor& values, + const Tensor& shape, + IndexValidation index_validation); + +} // namespace sparse_utils +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_xent_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_xent_op.h new file mode 100644 index 00000000..d0ad3c4b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/sparse_xent_op.h @@ -0,0 +1,232 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_ +// Functor definition for SparseXentOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace sparse_xent_helpers { + +template +typename TTypes::Tensor32Bit To32BitConst( + typename TTypes::Vec in) { + return To32Bit(typename TTypes::ConstVec(in.data(), in.dimensions())); +} + +template +typename TTypes::Tensor32Bit To32BitConst( + typename TTypes::Matrix in) { + return To32Bit(typename TTypes::ConstMatrix(in.data(), in.dimensions())); +} + +} // namespace sparse_xent_helpers + +namespace generator { + +// Generator for calculation of the sparse Xent loss. +// This generator takes the logits, the sum of the exponentiated +// logits, and the label indices. For each minibatch entry, ignoring +// the batch index b, it calculates: +// +// loss[j] = (log(sum_exp_logits) - logits[j]) * 1{ j == label } +// +// for j = 0 .. num_classes. 
This value must be summed over all j for +// the final loss. +template +class SparseXentLossGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SparseXentLossGenerator( + typename TTypes::Tensor32Bit logits, + typename TTypes::Tensor32Bit sum_exp_logits, + typename TTypes::Tensor32Bit labels, + const Index max_depth) + : logits_(logits), + sum_exp_logits_(sum_exp_logits), + labels_(labels), + max_depth_(max_depth) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& coords) const { + const int batch = coords[0]; + const int depth = coords[1]; + const Index label = tensorflow::internal::SubtleMustCopy(labels_(batch)); + if (!FastBoundsCheck(label, max_depth_)) { + return Eigen::NumTraits::quiet_NaN(); + } + return TF_PREDICT_FALSE(label == depth) + ? (Eigen::numext::log(sum_exp_logits_(batch)) - logits_(coords)) + : T(0.0); + }; + + private: + typename TTypes::Tensor32Bit logits_; + typename TTypes::Tensor32Bit sum_exp_logits_; + typename TTypes::Tensor32Bit labels_; + const Index max_depth_; +}; + +// Generator for calculation of the sparse Xent gradient. +// This generator takes the exponentiated logits, their sums, and the label +// indices. For each minibatch entry, ignoring the batch index b, it calculates: +// +// exp_logits[j] / sum_exp_logits - 1{ j == label } +// +// for j = 0 .. num_classes. +template +class SparseXentGradGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SparseXentGradGenerator( + typename TTypes::Tensor32Bit exp_logits, + typename TTypes::Tensor32Bit sum_exp_logits, + typename TTypes::Tensor32Bit labels, + const Index max_depth) + : exp_logits_(exp_logits), + sum_exp_logits_(sum_exp_logits), + labels_(labels), + max_depth_(max_depth) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& coords) const { + const int batch = coords[0]; + const int depth = coords[1]; + const Index label = tensorflow::internal::SubtleMustCopy(labels_(batch)); + if (!FastBoundsCheck(label, max_depth_)) { + return Eigen::NumTraits::quiet_NaN(); + } + T subtract = TF_PREDICT_FALSE(depth == label) ? T(1.0) : T(0.0); + return exp_logits_(coords) / sum_exp_logits_(batch) - subtract; + }; + + private: + typename TTypes::Tensor32Bit exp_logits_; + typename TTypes::Tensor32Bit sum_exp_logits_; + typename TTypes::Tensor32Bit labels_; + const Index max_depth_; +}; + +} // namespace generator + +namespace functor { + +template +struct RowMaxReduction { + // Computes the maximum across the rows of logits + // + // logits: batch_size, num_classes. + // maximum: temporary tensor, dims: batch_size, 1 + static inline void Compute(OpKernelContext* ctx, + typename TTypes::ConstMatrix logits, + typename TTypes::Vec maximum) { + Eigen::IndexList > along_row; + Device d = ctx->eigen_device(); + To32Bit(maximum).device(d) = To32Bit(logits).maximum(along_row); + } +}; + +// Functor used by SparseXentOp to do the computations. +template +struct SparseXentFunctor { + // Computes Cross Entropy loss and backprop. + // + // logits: batch_size, num_classes. + // labels: num_classes. + // scratch: temporary tensor, dims: batch_size, 1 + // loss: output tensor for the loss, dims: batch_size. + // backprop: output tensor for the backprop, dims: batch_size, num_classes. + void operator()(OpKernelContext* ctx, typename TTypes::ConstMatrix logits, + typename TTypes::ConstVec labels, + typename TTypes::Vec scratch, typename TTypes::Vec loss, + typename TTypes::Matrix backprop); +}; + +// Eigen code implementing SparseXentFunctor::operator(). 
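// ---------------------------------------------------------------------------
// Editorial illustration (not part of the vendored header): the loss and
// backprop formulas from the generator comments above, written out for a
// single row of logits using plain std::vector inputs. The max-logit
// subtraction mirrors the numerically stable form used by the Eigen
// implementation; assumes a non-empty row and 0 <= label < logits.size().
#include <algorithm>
#include <cmath>
#include <vector>

inline float SparseXentRowSketch(const std::vector<float>& logits, int label,
                                 std::vector<float>* backprop) {
  const float max_logit = *std::max_element(logits.begin(), logits.end());
  float sum_exp = 0.0f;
  for (float l : logits) sum_exp += std::exp(l - max_logit);
  // loss = log(sum_j exp(logits[j] - max)) - (logits[label] - max)
  const float loss = std::log(sum_exp) - (logits[label] - max_logit);
  backprop->resize(logits.size());
  for (int j = 0; j < static_cast<int>(logits.size()); ++j) {
    // backprop[j] = exp(logits[j] - max) / sum_exp - 1{j == label}
    (*backprop)[j] =
        std::exp(logits[j] - max_logit) / sum_exp - (j == label ? 1.0f : 0.0f);
  }
  return loss;
}
// ---------------------------------------------------------------------------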
+// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template +struct SparseXentEigenImpl { + static void Compute(OpKernelContext* ctx, + typename TTypes::ConstMatrix logits, + typename TTypes::ConstVec labels, + typename TTypes::Vec scratch, + typename TTypes::Vec loss, + typename TTypes::Matrix backprop) { + // NOTE(touts): This duplicates some of the computations in softmax_op + // because we need the intermediate (logits -max(logits)) values to + // avoid a log(exp()) in the computation of the loss. + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. + Eigen::IndexList > along_class; + Eigen::IndexList > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList batch_only; + batch_only.set(0, batch_size); + Eigen::IndexList, int> one_by_class; + one_by_class.set(1, num_classes); + + // scratch = max_logits along classes. + RowMaxReduction::Compute(ctx, logits, scratch); + + Device d = ctx->eigen_device(); + // backprop = logits - max_logits. + To32Bit(backprop).device(d) = + To32Bit(logits) - + To32Bit(scratch).reshape(batch_by_one).broadcast(one_by_class); + + // scratch = sum(exp(logits - max_logits)) along classes. + To32Bit(scratch).device(d) = To32Bit(backprop).exp().sum(along_class); + + // sum(-labels * + // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) + // along classes + generator::SparseXentLossGenerator sparse_xent_loss_gen( + sparse_xent_helpers::To32BitConst(backprop), + sparse_xent_helpers::To32BitConst(scratch), To32Bit(labels), + backprop.dimension(1) /* max_depth */); + To32Bit(loss).device(d) = + To32Bit(backprop).generate(sparse_xent_loss_gen).sum(along_class); + + // backprop: prob - labels, where + // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) + To32Bit(backprop).device(d) = To32Bit(backprop).exp(); + generator::SparseXentGradGenerator sparse_xent_grad_gen( + sparse_xent_helpers::To32BitConst(backprop), + sparse_xent_helpers::To32BitConst(scratch), To32Bit(labels), + backprop.dimension(1) /* max_depth */); + To32Bit(backprop).device(d) = + To32Bit(backprop).generate(sparse_xent_grad_gen); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/special_math/special_math_op_misc_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/special_math/special_math_op_misc_impl.h new file mode 100644 index 00000000..6b8bb7cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/special_math/special_math_op_misc_impl.h @@ -0,0 +1,724 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPECIAL_MATH_SPECIAL_MATH_OP_MISC_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_SPECIAL_MATH_SPECIAL_MATH_OP_MISC_IMPL_H_ + +#define _USE_MATH_DEFINES +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cwise_ops.h" + +namespace Eigen { +namespace internal { + +// Implementation of Dawson's integral based on Cephes. + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_dawsn_interval_1(const Scalar& x) { + // Rational approximation on [0, 3.25) + const Scalar AN[] = { + Scalar(1.13681498971755972054E-11), Scalar(8.49262267667473811108E-10), + Scalar(1.94434204175553054283E-8), Scalar(9.53151741254484363489E-7), + Scalar(3.07828309874913200438E-6), Scalar(3.52513368520288738649E-4), + Scalar(-8.50149846724410912031E-4), Scalar(4.22618223005546594270E-2), + Scalar(-9.17480371773452345351E-2), Scalar(9.99999999999999994612E-1), + }; + const Scalar AD[] = { + Scalar(2.40372073066762605484E-11), Scalar(1.48864681368493396752E-9), + Scalar(5.21265281010541664570E-8), Scalar(1.27258478273186970203E-6), + Scalar(2.32490249820789513991E-5), Scalar(3.25524741826057911661E-4), + Scalar(3.48805814657162590916E-3), Scalar(2.79448531198828973716E-2), + Scalar(1.58874241960120565368E-1), Scalar(5.74918629489320327824E-1), + Scalar(1.00000000000000000539E0), + }; + const Scalar x2 = x * x; + Scalar y = (x * internal::ppolevl::run(x2, AN)) / + internal::ppolevl::run(x2, AD); + return y; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_dawsn_interval_2(const Scalar& x) { + // Rational approximation on [3.25, 6.25) + const Scalar BN[] = { + Scalar(5.08955156417900903354E-1), Scalar(-2.44754418142697847934E-1), + Scalar(9.41512335303534411857E-2), Scalar(-2.18711255142039025206E-2), + Scalar(3.66207612329569181322E-3), Scalar(-4.23209114460388756528E-4), + Scalar(3.59641304793896631888E-5), Scalar(-2.14640351719968974225E-6), + Scalar(9.10010780076391431042E-8), Scalar(-2.40274520828250956942E-9), + Scalar(3.59233385440928410398E-11), + }; + const Scalar BD[] = { + Scalar(1.0), + Scalar(-6.31839869873368190192E-1), + Scalar(2.36706788228248691528E-1), + Scalar(-5.31806367003223277662E-2), + Scalar(8.48041718586295374409E-3), + Scalar(-9.47996768486665330168E-4), + Scalar(7.81025592944552338085E-5), + Scalar(-4.55875153252442634831E-6), + Scalar(1.89100358111421846170E-7), + Scalar(-4.91324691331920606875E-9), + Scalar(7.18466403235734541950E-11), + }; + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + + const Scalar inverse_x = one / x; + const Scalar inverse_x2 = inverse_x * inverse_x; + Scalar z = (internal::ppolevl::run(inverse_x2, BN) / + (x * internal::ppolevl::run(inverse_x2, BD))); + Scalar y = inverse_x2 * z + inverse_x; + return half * y; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_dawsn_interval_3(const Scalar& x) { + // Rational approximation on [6.25, 1.0e9) + const Scalar CN[] = { + Scalar(-5.90592860534773254987E-1), Scalar(6.29235242724368800674E-1), + Scalar(-1.72858975380388136411E-1), Scalar(1.64837047825189632310E-2), + Scalar(-4.86827613020462700845E-4), + }; + const Scalar CD[] = { + Scalar(1.0), + Scalar(-2.69820057197544900361E0), + 
Scalar(1.73270799045947845857E0), + Scalar(-3.93708582281939493482E-1), + Scalar(3.44278924041233391079E-2), + Scalar(-9.73655226040941223894E-4), + }; + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + + const Scalar inverse_x = one / x; + Scalar inverse_x2 = inverse_x * inverse_x; + Scalar z = (internal::ppolevl::run(inverse_x2, CN) / + (x * internal::ppolevl::run(inverse_x2, CD))); + Scalar y = inverse_x2 * z + inverse_x; + return half * y; + return pmul(half, y); +} + +template +struct dawsn_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + const Scalar half = Scalar(0.5); + const Scalar a = Scalar(3.25); + const Scalar b = Scalar(6.25); + const Scalar c = Scalar(1.0e9); + + Scalar abs_x = pabs(x); + + Scalar dawsn; + if (abs_x < a) { + dawsn = generic_dawsn_interval_1(abs_x); + } else if (abs_x < b) { + dawsn = generic_dawsn_interval_2(abs_x); + } else if (abs_x < c) { + dawsn = generic_dawsn_interval_3(abs_x); + } else { + dawsn = half / x; + } + + if (x < Scalar(0)) { + dawsn = -dawsn; + } + return dawsn; + } +}; + +// Implementation of exponential integral, based on Cephes. + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_1(const Scalar& x) { + /* 0 < x <= 2 + Ei(x) - EUL - ln(x) = x A(x)/B(x) + Theoretical peak relative error 9.73e-18 */ + const Scalar A[] = { + Scalar(-5.350447357812542947283E0), Scalar(2.185049168816613393830E2), + Scalar(-4.176572384826693777058E3), Scalar(5.541176756393557601232E4), + Scalar(-3.313381331178144034309E5), Scalar(1.592627163384945414220E6), + }; + const Scalar B[] = { + Scalar(1.0), + Scalar(-5.250547959112862969197E1), + Scalar(1.259616186786790571525E3), + Scalar(-1.756549581973534652631E4), + Scalar(1.493062117002725991967E5), + Scalar(-7.294949239640527645655E5), + Scalar(1.592627163384945429726E6), + }; + + // Euler gamma. 
+ const Scalar EUL = Scalar(0.5772156649015329); + + const Scalar f = (internal::ppolevl::run(x, A) / + internal::ppolevl::run(x, B)); + return x * f + EUL + numext::log(x); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_2(const Scalar& x) { + /* 2 <= x <= 4 + x exp(-x) Ei(x) - 1 = 1/x A6(1/x) / B6(1/x) + Theoretical absolute error = 4.89e-17 */ + const Scalar A6[] = { + Scalar(1.981808503259689673238E-2), Scalar(-1.271645625984917501326E0), + Scalar(-2.088160335681228318920E0), Scalar(2.755544509187936721172E0), + Scalar(-4.409507048701600257171E-1), Scalar(4.665623805935891391017E-2), + Scalar(-1.545042679673485262580E-3), Scalar(7.059980605299617478514E-5), + }; + const Scalar B6[] = { + Scalar(1.0), + Scalar(1.476498670914921440652E0), + Scalar(5.629177174822436244827E-1), + Scalar(1.699017897879307263248E-1), + Scalar(2.291647179034212017463E-2), + Scalar(4.450150439728752875043E-3), + Scalar(1.727439612206521482874E-4), + Scalar(3.953167195549672482304E-5), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A6) / + internal::ppolevl::run(w, B6)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_3(const Scalar& x) { + /* 4 <= x <= 8 + x exp(-x) Ei(x) - 1 = 1/x A5(1/x) / B5(1/x) + Theoretical absolute error = 2.20e-17 */ + const Scalar A5[] = { + Scalar(-1.373215375871208729803E0), Scalar(-7.084559133740838761406E-1), + Scalar(1.580806855547941010501E0), Scalar(-2.601500427425622944234E-1), + Scalar(2.994674694113713763365E-2), Scalar(-1.038086040188744005513E-3), + Scalar(4.371064420753005429514E-5), Scalar(2.141783679522602903795E-6), + }; + const Scalar B5[] = { + Scalar(1.0), + Scalar(8.585231423622028380768E-1), + Scalar(4.483285822873995129957E-1), + Scalar(7.687932158124475434091E-2), + Scalar(2.449868241021887685904E-2), + Scalar(8.832165941927796567926E-4), + Scalar(4.590952299511353531215E-4), + Scalar(-4.729848351866523044863E-6), + Scalar(2.665195537390710170105E-6), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A5) / + internal::ppolevl::run(w, B5)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_4(const Scalar& x) { + /* 8 <= x <= 16 + x exp(-x) Ei(x) - 1 = 1/x R(1/x) + Theoretical peak absolute error = 1.07e-17 */ + const Scalar A2[] = { + Scalar(-2.106934601691916512584E0), Scalar(1.732733869664688041885E0), + Scalar(-2.423619178935841904839E-1), Scalar(2.322724180937565842585E-2), + Scalar(2.372880440493179832059E-4), Scalar(-8.343219561192552752335E-5), + Scalar(1.363408795605250394881E-5), Scalar(-3.655412321999253963714E-7), + Scalar(1.464941733975961318456E-8), Scalar(6.176407863710360207074E-10), + }; + const Scalar B2[] = { + Scalar(1.0), + Scalar(-2.298062239901678075778E-1), + Scalar(1.105077041474037862347E-1), + Scalar(-1.566542966630792353556E-2), + Scalar(2.761106850817352773874E-3), + Scalar(-2.089148012284048449115E-4), + Scalar(1.708528938807675304186E-5), + Scalar(-4.459311796356686423199E-7), + Scalar(1.394634930353847498145E-8), + Scalar(6.150865933977338354138E-10), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A2) / + internal::ppolevl::run(w, B2)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar 
+generic_expint_interval_5(const Scalar& x) { + /* 16 <= x <= 32 + x exp(-x) Ei(x) - 1 = 1/x A4(1/x) / B4(1/x) + Theoretical absolute error = 1.22e-17 */ + const Scalar A4[] = { + Scalar(-2.458119367674020323359E-1), Scalar(-1.483382253322077687183E-1), + Scalar(7.248291795735551591813E-2), Scalar(-1.348315687380940523823E-2), + Scalar(1.342775069788636972294E-3), Scalar(-7.942465637159712264564E-5), + Scalar(2.644179518984235952241E-6), Scalar(-4.239473659313765177195E-8), + }; + const Scalar B4[] = { + Scalar(1.0), + Scalar(-1.044225908443871106315E-1), + Scalar(-2.676453128101402655055E-1), + Scalar(9.695000254621984627876E-2), + Scalar(-1.601745692712991078208E-2), + Scalar(1.496414899205908021882E-3), + Scalar(-8.462452563778485013756E-5), + Scalar(2.728938403476726394024E-6), + Scalar(-4.239462431819542051337E-8), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A4) / + internal::ppolevl::run(w, B4)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_6(const Scalar& x) { + /* 32 <= x <= 64 + x exp(-x) Ei(x) - 1 = 1/x A7(1/x) / B7(1/x) + Theoretical absolute error = 7.71e-18 */ + const Scalar A7[] = { + Scalar(1.212561118105456670844E-1), Scalar(-5.823133179043894485122E-1), + Scalar(2.348887314557016779211E-1), Scalar(-3.040034318113248237280E-2), + Scalar(1.510082146865190661777E-3), Scalar(-2.523137095499571377122E-5), + }; + const Scalar B7[] = { + Scalar(1.0), + Scalar(-1.002252150365854016662E0), + Scalar(2.928709694872224144953E-1), + Scalar(-3.337004338674007801307E-2), + Scalar(1.560544881127388842819E-3), + Scalar(-2.523137093603234562648E-5), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A7) / + internal::ppolevl::run(w, B7)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_expint_interval_7(const Scalar& x) { + /* x > 64 + x exp(-x) Ei(x) - 1 = 1/x A3(1/x)/B3(1/x) + Theoretical absolute error = 6.15e-17 */ + const Scalar A3[] = { + Scalar(-7.657847078286127362028E-1), Scalar(6.886192415566705051750E-1), + Scalar(-2.132598113545206124553E-1), Scalar(3.346107552384193813594E-2), + Scalar(-3.076541477344756050249E-3), Scalar(1.747119316454907477380E-4), + Scalar(-6.103711682274170530369E-6), Scalar(1.218032765428652199087E-7), + Scalar(-1.086076102793290233007E-9), + }; + const Scalar B3[] = { + Scalar(1.0), + Scalar(-1.888802868662308731041E0), + Scalar(1.066691687211408896850E0), + Scalar(-2.751915982306380647738E-1), + Scalar(3.930852688233823569726E-2), + Scalar(-3.414684558602365085394E-3), + Scalar(1.866844370703555398195E-4), + Scalar(-6.345146083130515357861E-6), + Scalar(1.239754287483206878024E-7), + Scalar(-1.086076102793126632978E-9), + }; + + const Scalar one = Scalar(1.0); + Scalar w = one / x; + Scalar f = (internal::ppolevl::run(w, A3) / + internal::ppolevl::run(w, B3)); + f = w * f + one; + return numext::exp(x) * w * f; +} + +template +struct expint_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + const Scalar zero = Scalar(0.0); + const Scalar two = Scalar(2.0); + const Scalar four = Scalar(4.0); + const Scalar eight = Scalar(8.0); + const Scalar sixteen = Scalar(16.0); + const Scalar thirty_two = Scalar(32.0); + const Scalar sixty_four = Scalar(64.0); + const Scalar nan = Scalar(NumTraits::quiet_NaN()); + + if (x < zero) { + return nan; + } + + 
if (x < two) { + return generic_expint_interval_1(x); + } else if (x < four) { + return generic_expint_interval_2(x); + } else if (x < eight) { + return generic_expint_interval_3(x); + } else if (x < sixteen) { + return generic_expint_interval_4(x); + } else if (x < thirty_two) { + return generic_expint_interval_5(x); + } else if (x < sixty_four) { + return generic_expint_interval_6(x); + } + return generic_expint_interval_7(x); + } +}; + +// Implementation of Fresnel cosine and sine integrals, based on Cephes. + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_fresnel_cos_interval_1(const Scalar& x) { + const Scalar CN[] = { + Scalar(-4.98843114573573548651E-8), Scalar(9.50428062829859605134E-6), + Scalar(-6.45191435683965050962E-4), Scalar(1.88843319396703850064E-2), + Scalar(-2.05525900955013891793E-1), Scalar(9.99999999999999998822E-1), + }; + const Scalar CD[] = { + Scalar(3.99982968972495980367E-12), Scalar(9.15439215774657478799E-10), + Scalar(1.25001862479598821474E-7), Scalar(1.22262789024179030997E-5), + Scalar(8.68029542941784300606E-4), Scalar(4.12142090722199792936E-2), + Scalar(1.00000000000000000118E0), + }; + + const Scalar x2 = x * x; + Scalar x4 = x2 * x2; + return (x * internal::ppolevl::run(x4, CN) / + internal::ppolevl::run(x4, CD)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_fresnel_sin_interval_1(const Scalar& x) { + const Scalar SN[] = { + Scalar(-2.99181919401019853726E3), Scalar(7.08840045257738576863E5), + Scalar(-6.29741486205862506537E7), Scalar(2.54890880573376359104E9), + Scalar(-4.42979518059697779103E10), Scalar(3.18016297876567817986E11), + }; + const Scalar SD[] = { + Scalar(1.0), + Scalar(2.81376268889994315696E2), + Scalar(4.55847810806532581675E4), + Scalar(5.17343888770096400730E6), + Scalar(4.19320245898111231129E8), + Scalar(2.24411795645340920940E10), + Scalar(6.07366389490084639049E11), + }; + + const Scalar x2 = x * x; + Scalar x4 = x2 * x2; + Scalar z = x * x2; + return (z * internal::ppolevl::run(x4, SN) / + internal::ppolevl::run(x4, SD)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar +generic_fresnel_asymp(const Scalar& x, bool use_sin) { + const Scalar FN[] = { + Scalar(4.21543555043677546506E-1), Scalar(1.43407919780758885261E-1), + Scalar(1.15220955073585758835E-2), Scalar(3.45017939782574027900E-4), + Scalar(4.63613749287867322088E-6), Scalar(3.05568983790257605827E-8), + Scalar(1.02304514164907233465E-10), Scalar(1.72010743268161828879E-13), + Scalar(1.34283276233062758925E-16), Scalar(3.76329711269987889006E-20), + }; + const Scalar FD[] = { + Scalar(1.0), + Scalar(7.51586398353378947175E-1), + Scalar(1.16888925859191382142E-1), + Scalar(6.44051526508858611005E-3), + Scalar(1.55934409164153020873E-4), + Scalar(1.84627567348930545870E-6), + Scalar(1.12699224763999035261E-8), + Scalar(3.60140029589371370404E-11), + Scalar(5.88754533621578410010E-14), + Scalar(4.52001434074129701496E-17), + Scalar(1.25443237090011264384E-20), + }; + const Scalar GN[] = { + Scalar(5.04442073643383265887E-1), Scalar(1.97102833525523411709E-1), + Scalar(1.87648584092575249293E-2), Scalar(6.84079380915393090172E-4), + Scalar(1.15138826111884280931E-5), Scalar(9.82852443688422223854E-8), + Scalar(4.45344415861750144738E-10), Scalar(1.08268041139020870318E-12), + Scalar(1.37555460633261799868E-15), Scalar(8.36354435630677421531E-19), + Scalar(1.86958710162783235106E-22), + }; + const Scalar GD[] = { + Scalar(1.0), + Scalar(1.47495759925128324529E0), + Scalar(3.37748989120019970451E-1), + 
Scalar(2.53603741420338795122E-2), + Scalar(8.14679107184306179049E-4), + Scalar(1.27545075667729118702E-5), + Scalar(1.04314589657571990585E-7), + Scalar(4.60680728146520428211E-10), + Scalar(1.10273215066240270757E-12), + Scalar(1.38796531259578871258E-15), + Scalar(8.39158816283118707363E-19), + Scalar(1.86958710162783236342E-22), + }; + + const Scalar HALF_PI = Scalar(1.5707963267948966); + const Scalar PI = Scalar(EIGEN_PI); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + + const Scalar x2 = x * x; + const Scalar t = one / pmul(PI, x2); + Scalar u = t * t; + + Scalar f = one - u * (internal::ppolevl::run(u, FN) / + internal::ppolevl::run(u, FD)); + Scalar g = (t * internal::ppolevl::run(u, GN) / + internal::ppolevl::run(u, GD)); + + const Scalar z = HALF_PI * x2; + const Scalar c = numext::cos(z); + const Scalar s = numext::sin(z); + const Scalar y = one / (PI * x); + if (use_sin) { + Scalar intermediate = f * c; + intermediate += g * s; + return half - intermediate * y; + } + Scalar intermediate = f * s; + intermediate -= g * c; + return half + intermediate * y; +} + +template +struct fresnel_cos_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + const Scalar zero = Scalar(0.); + const Scalar half = Scalar(0.5); + const Scalar a = Scalar(2.5625); + const Scalar b = Scalar(36974.0); + + const Scalar abs_x = numext::abs(x); + + if (abs_x > b) { + if (x < zero) { + return -half; + } + return half; + } + + const Scalar x2 = x * x; + + Scalar fresnel_cos; + if (x2 < a) { + fresnel_cos = generic_fresnel_cos_interval_1(abs_x); + } else { + fresnel_cos = generic_fresnel_asymp(abs_x, false); + } + if (x < zero) { + return -fresnel_cos; + } + return fresnel_cos; + } +}; + +template +struct fresnel_sin_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + const Scalar zero = Scalar(0.); + const Scalar half = Scalar(0.5); + const Scalar a = Scalar(2.5625); + const Scalar b = Scalar(36974.0); + const Scalar abs_x = numext::abs(x); + + if (abs_x > b) { + if (x < zero) { + return -half; + } + return half; + } + + const Scalar x2 = x * x; + + Scalar fresnel_sin; + if (x2 < a) { + fresnel_sin = generic_fresnel_sin_interval_1(abs_x); + } else { + fresnel_sin = generic_fresnel_asymp(abs_x, true); + } + + if (x < zero) { + return -fresnel_sin; + } + return fresnel_sin; + } +}; + +// Implementation of Spence's Integral based on Cephes. +template +struct spence_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& x) const { + const Scalar A[] = { + Scalar(4.65128586073990045278E-5), Scalar(7.31589045238094711071E-3), + Scalar(1.33847639578309018650E-1), Scalar(8.79691311754530315341E-1), + Scalar(2.71149851196553469920E0), Scalar(4.25697156008121755724E0), + Scalar(3.29771340985225106936E0), Scalar(1.00000000000000000126E0), + }; + const Scalar B[] = { + Scalar(6.90990488912553276999E-4), Scalar(2.54043763932544379113E-2), + Scalar(2.82974860602568089943E-1), Scalar(1.41172597751831069617E0), + Scalar(3.63800533345137075418E0), Scalar(5.03278880143316990390E0), + Scalar(3.54771340985225096217E0), Scalar(9.99999999999999998740E-1), + }; + const Scalar zero = Scalar(0.0); + const Scalar one = Scalar(1.0); + const Scalar three_halves = Scalar(1.5); + const Scalar two = Scalar(2.0); + const Scalar half = Scalar(0.5); + const Scalar nan = Scalar(NumTraits::quiet_NaN()); + // pi**2 / 6. 
+ const Scalar PI2O6 = Scalar(EIGEN_PI * EIGEN_PI / 6.0); + + if (x < zero) { + return nan; + } else if (x == zero) { + return PI2O6; + } else if (x == one) { + return zero; + } + + Scalar y; + if (x < two) { + y = x; + } else { + y = one / x; + } + + Scalar w; + if (three_halves < y) { + w = one / y - one; + } else { + if (y < half) { + w = -y; + } else { + w = y - one; + } + } + Scalar spence = -w * (internal::ppolevl::run(w, A) / + internal::ppolevl::run(w, B)); + Scalar z = numext::log(y); + if (y < half) { + spence = -z * numext::log1p(-y) + PI2O6 - spence; + } + if (three_halves < x) { + spence = -half * z * z - spence; + } + return spence; + } +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { +namespace functor { + +template +struct dawsn : base> {}; + +template +struct expint : base> {}; + +template +struct fresnel_cos : base> {}; + +template +struct fresnel_sin : base> {}; + +template +struct spence : base> {}; + +// Bessel Functions + +template +struct bessel_i0 : base> {}; + +template +struct bessel_i0e : base> {}; + +template +struct bessel_i1 : base> {}; + +template +struct bessel_i1e : base> {}; + +template +struct bessel_k0 : base> {}; + +template +struct bessel_k0e : base> {}; + +template +struct bessel_k1 : base> {}; + +template +struct bessel_k1e : base> {}; + +template +struct bessel_j0 : base> {}; + +template +struct bessel_j1 : base> {}; + +template +struct bessel_y0 : base> {}; + +template +struct bessel_y1 : base> {}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPECIAL_MATH_SPECIAL_MATH_OP_MISC_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram.h b/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram.h new file mode 100644 index 00000000..4b6b9c8b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram.h @@ -0,0 +1,126 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Class for generating spectrogram slices from a waveform. +// Initialize() should be called before calls to other functions. Once +// Initialize() has been called and returned true, The Compute*() functions can +// be called repeatedly with sequential input data (ie. the first element of the +// next input vector directly follows the last element of the previous input +// vector). Whenever enough audio samples are buffered to produce a +// new frame, it will be placed in output. Output is cleared on each +// call to Compute*(). This class is thread-unsafe, and should only be +// called from one thread at a time. 
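//
// Usage sketch (editorial; the window/step values and input length are
// arbitrary example choices, not defaults of this class):
//
//   tensorflow::Spectrogram spectrogram;
//   if (spectrogram.Initialize(/*window_length=*/512, /*step_length=*/256)) {
//     std::vector<double> audio(16000, 0.0);  // e.g. 1 s of silence at 16 kHz
//     std::vector<std::vector<double>> power_slices;
//     spectrogram.ComputeSquaredMagnitudeSpectrogram(audio, &power_slices);
//     // power_slices[t] holds output_frequency_channels() squared-magnitude
//     // values for frame t; a later call continues from the samples still
//     // buffered internally unless Reset() is called first.
//   }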
+// With the default parameters, the output of this class should be very +// close to the results of the following MATLAB code: +// overlap_samples = window_length_samples - step_samples; +// window = hann(window_length_samples, 'periodic'); +// S = abs(spectrogram(audio, window, overlap_samples)).^2; + +#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_ +#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_ + +#include +#include +#include + +#include "third_party/fft2d/fft.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +class Spectrogram { + public: + Spectrogram() : initialized_(false) {} + ~Spectrogram() {} + + // Initializes the class with a given window length and step length + // (both in samples). Internally a Hann window is used as the window + // function. Returns true on success, after which calls to Process() + // are possible. window_length must be greater than 1 and step + // length must be greater than 0. + bool Initialize(int window_length, int step_length); + + // Initialize with an explicit window instead of a length. + bool Initialize(const std::vector& window, int step_length); + + // Reset internal variables. + // Spectrogram keeps internal state: remaining input data from previous call. + // As a result it can produce different number of frames when you call + // ComputeComplexSpectrogram multiple times (even though input data + // has the same size). As it is shown in + // MultipleCallsToComputeComplexSpectrogramMayYieldDifferentNumbersOfFrames + // in tensorflow/core/kernels/spectrogram_test.cc. + // But if you need to compute Spectrogram on input data without keeping + // internal state (and clear remaining input data from the previous call) + // you have to call Reset() before computing Spectrogram. + // For example in tensorflow/core/kernels/spectrogram_op.cc + bool Reset(); + + // Processes an arbitrary amount of audio data (contained in input) + // to yield complex spectrogram frames. After a successful call to + // Initialize(), Process() may be called repeatedly with new input data + // each time. The audio input is buffered internally, and the output + // vector is populated with as many temporally-ordered spectral slices + // as it is possible to generate from the input. The output is cleared + // on each call before the new frames (if any) are added. + // + // The template parameters can be float or double. + template + bool ComputeComplexSpectrogram( + const std::vector& input, + std::vector>>* output); + + // This function works as the one above, but returns the power + // (the L2 norm, or the squared magnitude) of each complex value. + template + bool ComputeSquaredMagnitudeSpectrogram( + const std::vector& input, + std::vector>* output); + + // Return reference to the window function used internally. + const std::vector& GetWindow() const { return window_; } + + // Return the number of frequency channels in the spectrogram. + int output_frequency_channels() const { return output_frequency_channels_; } + + private: + template + bool GetNextWindowOfSamples(const std::vector& input, + int* input_start); + void ProcessCoreFFT(); + + int fft_length_; + int output_frequency_channels_; + int window_length_; + int step_length_; + bool initialized_; + int samples_to_next_step_; + + std::vector window_; + std::vector fft_input_output_; + std::deque input_queue_; + + // Working data areas for the FFT routines. 
+ std::vector fft_integer_working_area_; + std::vector fft_double_working_area_; + + Spectrogram(const Spectrogram&) = delete; + void operator=(const Spectrogram&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram_test_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram_test_utils.h new file mode 100644 index 00000000..d4187076 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/spectrogram_test_utils.h @@ -0,0 +1,81 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// Reads a wav format file into a vector of floating-point values with range +// -1.0 to 1.0. +bool ReadWaveFileToVector(const string& file_name, std::vector* data); + +// Reads a binary file containing 32-bit floating point values in the +// form [real_1, imag_1, real_2, imag_2, ...] into a rectangular array +// of complex values where row_length is the length of each inner vector. +bool ReadRawFloatFileToComplexVector( + const string& file_name, int row_length, + std::vector > >* data); + +// Reads a CSV file of numbers in the format 1.1+2.2i,1.1,2.2i,3.3j into data. +void ReadCSVFileToComplexVectorOrDie( + const string& file_name, + std::vector > >* data); + +// Reads a 2D array of floats from an ASCII text file, where each line is a row +// of the array, and elements are separated by commas. +void ReadCSVFileToArrayOrDie(const string& filename, + std::vector >* array); + +// Write a binary file containing 64-bit floating-point values for +// reading by, for example, MATLAB. +bool WriteDoubleVectorToFile(const string& file_name, + const std::vector& data); + +// Write a binary file containing 32-bit floating-point values for +// reading by, for example, MATLAB. +bool WriteFloatVectorToFile(const string& file_name, + const std::vector& data); + +// Write a binary file containing 64-bit floating-point values for +// reading by, for example, MATLAB. +bool WriteDoubleArrayToFile(const string& file_name, int size, + const double* data); + +// Write a binary file containing 32-bit floating-point values for +// reading by, for example, MATLAB. +bool WriteFloatArrayToFile(const string& file_name, int size, + const float* data); + +// Write a binary file in the format read by +// ReadRawDoubleFileToComplexVector above. +bool WriteComplexVectorToRawFloatFile( + const string& file_name, + const std::vector > >& data); + +// Generate a sine wave with the provided parameters, and populate +// data with the samples. 
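// ---------------------------------------------------------------------------
// Editorial sketch: a conventional implementation of the SineWave() helper
// declared just below, included only to make the expected output concrete.
// The actual amplitude and phase conventions live in the corresponding .cc
// file, so treat this as an assumption rather than the real definition.
#include <cmath>
#include <vector>

inline void SineWaveSketch(int sample_rate, float frequency,
                           float duration_seconds, std::vector<float>* data) {
  const double kPi = 3.14159265358979323846;
  const int num_samples = static_cast<int>(sample_rate * duration_seconds);
  data->clear();
  data->reserve(num_samples);
  for (int i = 0; i < num_samples; ++i) {
    // sample i of a unit-amplitude sine at `frequency` Hz
    data->push_back(
        static_cast<float>(std::sin(2.0 * kPi * frequency * i / sample_rate)));
  }
}
// ---------------------------------------------------------------------------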
+void SineWave(int sample_rate, float frequency, float duration_seconds, + std::vector* data); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib.h b/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib.h new file mode 100644 index 00000000..28257ed4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib.h @@ -0,0 +1,55 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_ +#define TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_ +// Functor definition for SplitOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template +struct SplitCustom { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& slice_sizes); +}; + +template +struct Split { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& slice_sizes); +}; + +template +struct Split { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& slice_sizes); +}; + + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib_gpu.h new file mode 100644 index 00000000..ae767b07 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/split_lib_gpu.h @@ -0,0 +1,60 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_ + +#define EIGEN_USE_THREADS +#define EIGEN_USE_GPU + +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/gpu_device_array_gpu.h" +#include "tensorflow/core/kernels/split_lib.h" + +namespace tensorflow { + +template +struct SplitOpGPULaunch { + void Run(const Eigen::GpuDevice& d, const T* input, int32_t prefix_dim_size, + int32_t split_dim_size, int32_t suffix_dim_size, + const GpuDeviceArrayStruct& output_ptr_data); +}; + +template +struct SplitVOpGPULaunch { + void Run(const Eigen::GpuDevice& d, bool fixed, const T* input, + int total_cols, int total_rows, + const GpuDeviceArrayStruct& output_scan, + const GpuDeviceArrayStruct& output_ptr_data); +}; + +// Explicit instantiations in split_lib_gpu.cu.cc. +#define REGISTER_GPU_KERNEL(T) \ + extern template struct SplitOpGPULaunch; \ + extern template struct SplitVOpGPULaunch; \ + extern template struct SplitVOpGPULaunch; \ + extern template struct SplitVOpGPULaunch; + +TF_CALL_uint8(REGISTER_GPU_KERNEL); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/squared-loss.h b/third_party/tflite-hdrs/tensorflow/core/kernels/squared-loss.h new file mode 100644 index 00000000..3b334d68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/squared-loss.h @@ -0,0 +1,73 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_ +#define TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_ + +#include "tensorflow/core/kernels/loss.h" + +namespace tensorflow { + +class SquaredLossUpdater : public DualLossUpdater { + public: + // Closed form solution that decreases the dual squared loss. + // See page 23 of http://arxiv.org/pdf/1309.2375v2.pdf for the derivation of + // the update rule when the example weights are equal to 1.0. + // Note: There is a typo in the formula in the paper: the denominator should + // be 1 + ||x_i||^2/(\lambda n) (without the 2 multiplier). + // + // The CoCoA+ modification is detailed in readme.md. + double ComputeUpdatedDual(const int num_loss_partitions, const double label, + const double example_weight, + const double current_dual, const double wx, + const double weighted_example_norm) const final { + const double delta_numerator = label - current_dual - wx; + const double delta_denominator = + 1 + num_loss_partitions * weighted_example_norm * example_weight; + return current_dual + delta_numerator / delta_denominator; + } + + // Dual of squared loss function. 
+ // https://en.wikipedia.org/wiki/Convex_conjugate + double ComputeDualLoss(const double current_dual, const double example_label, + const double example_weight) const final { + // Dual of the squared loss function = b * (y + b/2), where b is the + // dual variable and y is the label. This is Dual(-b). + return current_dual * (0.5 * current_dual - example_label) * example_weight; + } + + // Squared loss for linear regression. + double ComputePrimalLoss(const double wx, const double example_label, + const double example_weight) const final { + const double error = wx - example_label; + return error * error * example_weight * 0.5; + } + + inline double PrimalLossDerivative(const double wx, const double label, + const double example_weight) const final { + return (wx - label) * example_weight; + } + + inline double SmoothnessConstant() const final { return 1.0; } + + // Labels don't require conversion for linear regression. + absl::Status ConvertLabel(float* const example_label) const final { + return absl::OkStatus(); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stack.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stack.h new file mode 100644 index 00000000..a9c6a607 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stack.h @@ -0,0 +1,77 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STACK_H_ +#define TENSORFLOW_CORE_KERNELS_STACK_H_ + +// See docs in ../ops/data_flow_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// A per-run local stack. The stack uses a "per-step" resource manager which +// ensures that correct garbage collection on error or successful completion. +class StackOp : public OpKernel { + public: + explicit StackOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + + private: + DataType elem_type_; + string stack_name_; + + StackOp(const StackOp&) = delete; + void operator=(const StackOp&) = delete; +}; + +class StackPushOp : public AsyncOpKernel { + public: + StackPushOp(OpKernelConstruction* context, bool allow_swapping); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + bool IsExpensive() override; + + private: + bool swap_memory_ = false; +}; + +// Templated helper to make it easier to register kernels with or without +// swapping. 
+template +class TemplatedStackPushOp : public StackPushOp { + public: + TemplatedStackPushOp(OpKernelConstruction* context) + : StackPushOp(context, allow_swapping) {} +}; + +class StackPopOp : public AsyncOpKernel { + public: + explicit StackPopOp(OpKernelConstruction* context); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + bool IsExpensive() override; +}; + +class StackCloseOp : public OpKernel { + public: + explicit StackCloseOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops.h new file mode 100644 index 00000000..21a08fa0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops.h @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_ + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/random/philox_random.h" + +namespace tensorflow { + +// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained +// in b/111604096 and cl/171681867), so we use signed int here. We choose int64 +// instead of int32 because `VarHandleOp` doesn't support int32 on GPU, and +// because of the "int32 problem". +using StateElementType = int64_t; +static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64; +static constexpr DataType ALGORITHM_DTYPE = STATE_ELEMENT_DTYPE; + +using random::PhiloxRandom; + +static constexpr int64_t PHILOX_MIN_STATE_SIZE = + (PhiloxRandom::ResultType::kElementCount + + PhiloxRandom::Key::kElementCount) / + 2; +static constexpr int64_t THREEFRY_MIN_STATE_SIZE = 2; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h new file mode 100644 index 00000000..74eb40f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h @@ -0,0 +1,114 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_ + +#include "tensorflow/core/kernels/random_ops_util.h" +#include "tensorflow/core/kernels/stateful_random_ops.h" + +namespace tensorflow { + +PHILOX_DEVICE_INLINE PhiloxRandom +GetPhiloxRandomFromMem(StateElementType const* ptr) { + auto ptr_ = reinterpret_cast(ptr); + return GetPhiloxRandomFromCounterKeyMem(ptr_, ptr_ + 2); +} + +PHILOX_DEVICE_INLINE void WritePhiloxRandomToMem(PhiloxRandom const& philox, + StateElementType* ptr) { + auto ptr_ = reinterpret_cast(ptr); + WriteCounterToMem(philox.counter(), ptr_); + WriteKeyToMem(philox.key(), ptr_ + 2); +} + +PHILOX_DEVICE_INLINE PhiloxRandom SkipPhiloxRandom(PhiloxRandom const& philox, + uint64 output_size) { + auto new_philox = philox; + // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change it + // just here. + auto delta = output_size * 256; + new_philox.Skip(delta); // do the actual increasing + return new_philox; +} + +PHILOX_DEVICE_INLINE void UpdateMemWithPhiloxRandom(PhiloxRandom const& philox, + uint64 output_size, + StateElementType* ptr) { + auto new_philox = SkipPhiloxRandom(philox, output_size); + WritePhiloxRandomToMem(new_philox, ptr); +} + +PHILOX_DEVICE_INLINE void UpdateCounterMemWithPhiloxRandom( + PhiloxRandom::ResultType const& counter, uint64 output_size, + StateElementType* ptr) { + auto philox = PhiloxRandom(counter, PhiloxRandom::Key() /*dummy*/); + auto new_philox = SkipPhiloxRandom(philox, output_size); + WriteCounterToMem(new_philox.counter(), reinterpret_cast(ptr)); +} + +namespace functor { + +// A per-device helper function that does the actual work for +// `UpdateVariableAndFill`. +// Reason to use functor: C++ doesn't allow function-template partial +// specialization. +template +struct UpdateVariableAndFill_Philox; + +template +struct RngSkip_Philox; + +} // end namespace functor + +using CPUDevice = Eigen::ThreadPoolDevice; + +class ScopedUnlockUnrefVar; + +struct UpdateVariableAndFill_Philox_Arg { + int64_t output_size; + int64_t alg_tag_skip; + ScopedUnlockUnrefVar* state_var_guard; + Tensor* state_tensor; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +using GPUDevice = Eigen::GpuDevice; + +namespace functor { + +// Declares the partially GPU-specialized functor structs. 
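// ---------------------------------------------------------------------------
// Editorial illustration (not TensorFlow API): how the helpers above pack the
// Philox state into the int64 state vector. The 128-bit counter occupies
// state[0] and state[1] and the 64-bit key occupies state[2], which is where
// PHILOX_MIN_STATE_SIZE = (4 + 2) / 2 = 3 in stateful_random_ops.h comes from
// (4 counter words plus 2 key words of 32 bits, two per int64 element).
#include <cstdint>

struct PhiloxStateLayoutSketch {  // hypothetical mirror; field names illustrative
  std::uint32_t counter[4];  // written at ptr_, i.e. state[0..1]
  std::uint32_t key[2];      // written at ptr_ + 2, i.e. state[2]
};
static_assert(sizeof(PhiloxStateLayoutSketch) == 3 * sizeof(std::int64_t),
              "counter + key fill exactly PHILOX_MIN_STATE_SIZE elements");
// ---------------------------------------------------------------------------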
+// must be kept at <=6 arguments because of a gcc/clang ABI incompatibility bug +template +struct UpdateVariableAndFill_Philox { + void operator()(OpKernelContext* ctx, const GPUDevice& device, + Distribution dist, UpdateVariableAndFill_Philox_Arg* arg, + typename Distribution::ResultElementType* output_data); +}; + +template <> +struct RngSkip_Philox { + void operator()(const GPUDevice& device, const StateElementType* in_data, + uint64 delta, StateElementType* out_data); +}; + +} // end namespace functor + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_gamma_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_gamma_op.h new file mode 100644 index 00000000..426dbd5e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_gamma_op.h @@ -0,0 +1,90 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_GAMMA_OP_H_ +#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_GAMMA_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/random/philox_random.h" + +namespace tensorflow { + +namespace functor { + +// This functor can take the PhiloxRandom input from either device memory `key` +// and `counter` or a stack value `random`. If both `key` and `counter` are not +// nullptr, they provide the input; otherwise `random` provides the input. +template +struct StatelessRandomGammaFunctor { + static absl::Status Fill(OpKernelContext* ctx, const T* alpha_flat, + int64_t num_samples, int64_t num_alphas, + int64_t samples_per_alpha, const uint64* key, + const uint64* counter, + const random::PhiloxRandom& random, T* samples_flat); +}; + +} // namespace functor + +// Buffer that holds multiple samples. Operator()(random::PhiloxRandom*) returns +// a single sample from this buffer. If the buffer is empty, it first generates +// new samples using the provided distribution. +// +// If the call to Distribution::operator() returns samples[0...N-1], then this +// class returns samples in the following order: +// +// samples[N-1], samples[N-2],..., samples[1], samples[0] +// +// For comparison, random::SingleSampleAdapter returns samples in +// the following order: +// +// samples[0], samples[1],...,samples[N-2], samples[N-1]. 
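// ---------------------------------------------------------------------------
// Editorial illustration (nothing here is TensorFlow API): why the buffer
// below hands samples back in reverse. A fake distribution returns a fixed
// batch so the consumption order is easy to see; the decrement-then-index
// step mirrors RandomSampleBuffer::operator() below.
#include <array>
#include <cstdio>

struct FakeBatchDistribution {
  static constexpr int kResultElementCount = 4;
  using ResultType = std::array<int, kResultElementCount>;
  ResultType operator()() const { return {10, 20, 30, 40}; }
};

inline void PrintBufferOrderSketch() {
  FakeBatchDistribution distribution;
  FakeBatchDistribution::ResultType results = distribution();
  int remaining = FakeBatchDistribution::kResultElementCount;
  while (remaining > 0) {
    --remaining;
    std::printf("%d ", results[remaining]);  // prints: 40 30 20 10
  }
  // random::SingleSampleAdapter, by contrast, yields 10 20 30 40.
}
// ---------------------------------------------------------------------------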
+// +template +class RandomSampleBuffer { + public: + typedef typename Distribution::ResultElementType ResultElementType; + + PHILOX_DEVICE_INLINE + explicit RandomSampleBuffer(Distribution* distribution) + : distribution_(distribution), remaining_numbers_(0) {} + + PHILOX_DEVICE_INLINE + ResultElementType operator()(random::PhiloxRandom* random) { + if (remaining_numbers_ == 0) { + results_ = (*distribution_)(random); + remaining_numbers_ = Distribution::kResultElementCount; + } + + remaining_numbers_--; + return results_[remaining_numbers_]; + } + + // Mark this buffer as empty. The next call to operator() will fill it + // with new random numbers. + PHILOX_DEVICE_INLINE + void Clear() { remaining_numbers_ = 0; } + + private: + typedef typename Distribution::ResultType ResultType; + + Distribution* distribution_; + ResultType results_; + int remaining_numbers_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_GAMMA_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops.h new file mode 100644 index 00000000..42ce3bff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/random/random_distributions.h" + +namespace tensorflow { + +// Generates a key and counter that can be used to seed a PhiloxRandom, +// generator, based on the seed value in `seed_t`. +// +// REQUIRES: `seed_t` must be a length-2 vector of type DT_INT{32,64}. +// `out_key` and `out_counter` must be non-null. +absl::Status GenerateKey(Tensor seed_t, random::PhiloxRandom::Key* out_key, + random::PhiloxRandom::ResultType* out_counter); + +// A base class for kernels of stateless RNG ops that take shape and seed as the +// first 2 inputs. +class StatelessRandomOpBase : public OpKernel { + public: + explicit StatelessRandomOpBase(OpKernelConstruction* context); + + void Compute(OpKernelContext* context) override; + + protected: + // The part of Compute that depends on device, type, and distribution. + // Must be a tail call because it doesn't report error via return value. 
+ virtual void Fill(OpKernelContext* context, random::PhiloxRandom random, + Tensor* output) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2.h new file mode 100644 index 00000000..0b5b8945 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_H_ +#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { + +inline absl::Status CheckKeyCounterShape(int minimum_counter_size, + TensorShape const& key_shape, + TensorShape const& counter_shape) { + if (!(key_shape.dims() == 1 && key_shape.dim_size(0) == RNG_KEY_SIZE)) { + return errors::InvalidArgument( + "key must have shape [", RNG_KEY_SIZE, "], not ", + key_shape.DebugString(), + ". (Note that batched keys are not supported yet.)"); + } + if (!(counter_shape.dims() == 1 && + counter_shape.dim_size(0) >= minimum_counter_size)) { + return errors::InvalidArgument( + "counter must be a vector with length at least ", minimum_counter_size, + "; got shape: ", counter_shape.DebugString(), + ". (Note that batched counters are not supported yet.)"); + } + return absl::OkStatus(); +} + +// A base class for kernels of stateless RNG ops that take shape, key, counter +// and algorithm as the first 4 inputs. +class StatelessRandomOpBaseWithKeyCounter : public OpKernel { + public: + explicit StatelessRandomOpBaseWithKeyCounter(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + protected: + // The part of Compute that depends on device, type, and distribution. + // Must be a tail call because it doesn't report error via return value. + virtual void Fill(OpKernelContext* ctx, Algorithm alg, const Tensor& key, + const Tensor& counter, Tensor* output) = 0; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2_util.h new file mode 100644 index 00000000..a5798342 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stateless_random_ops_v2_util.h @@ -0,0 +1,86 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
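`CheckKeyCounterShape` above only inspects shapes: the key must be a length-`RNG_KEY_SIZE` vector and the counter a vector at least as long as the algorithm requires. A sketch of the same checks on plain dimension vectors, assuming a key size of 1 (the actual `RNG_KEY_SIZE` constant lives in `rng_alg.h`, outside this diff), with a hypothetical `CheckShapes` helper:

```cpp
// Sketch of the key/counter shape checks, on plain shape vectors.
// kKeySize = 1 is an assumption standing in for RNG_KEY_SIZE.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

constexpr int64_t kKeySize = 1;

// Returns an empty string on success, otherwise an error message.
std::string CheckShapes(const std::vector<int64_t>& key_shape,
                        const std::vector<int64_t>& counter_shape,
                        int64_t min_counter_size) {
  if (key_shape.size() != 1 || key_shape[0] != kKeySize) {
    return "key must have shape [" + std::to_string(kKeySize) + "]";
  }
  if (counter_shape.size() != 1 || counter_shape[0] < min_counter_size) {
    return "counter must be a vector with length at least " +
           std::to_string(min_counter_size);
  }
  return "";
}

int main() {
  // Philox's 128-bit counter spans two 64-bit words, hence min size 2 here.
  std::printf("ok: %s\n", CheckShapes({1}, {2}, 2).empty() ? "yes" : "no");
  std::printf("ok: %s\n", CheckShapes({1}, {1}, 2).empty() ? "yes" : "no");
  return 0;
}
```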
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_UTIL_H_ + +// Utilities for V2 stateless random ops' (non-XLA) kernels. + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/random_op.h" +#include "tensorflow/core/kernels/stateless_random_ops_v2.h" +#include "tensorflow/core/lib/random/random_distributions.h" + +namespace tensorflow { + +template +absl::Status GetScalar(const Tensor& tensor, int input_idx, T* result) { + auto dtype = DataTypeToEnum::v(); + if (tensor.dims() != 0) { + return errors::InvalidArgument("input ", std::to_string(input_idx), + " (0-based) must have shape [], not ", + tensor.shape().DebugString()); + } + if (tensor.dtype() != dtype) { + return errors::InvalidArgument("dtype of input ", std::to_string(input_idx), + " (0-based) must be ", DataTypeString(dtype), + ", not ", DataTypeString(tensor.dtype())); + } + *result = tensor.flat()(0); + return absl::OkStatus(); +} + +inline absl::StatusOr> +GetKeyCounterAlgFromInputs(OpKernelContext* ctx, int key_input_idx, + int counter_input_idx, int alg_input_idx) { + const Tensor& key_t = ctx->input(key_input_idx); + const Tensor& counter_t = ctx->input(counter_input_idx); + const Tensor& alg_t = ctx->input(alg_input_idx); + + int alg_id; + TF_RETURN_IF_ERROR(GetScalar(alg_t, alg_input_idx, &alg_id)); + Algorithm alg = Algorithm(alg_id); + if (alg == RNG_ALG_AUTO_SELECT) { + alg = RNG_ALG_PHILOX; + } + + TF_RETURN_IF_ERROR( + CheckKeyCounterShape(alg, key_t.shape(), counter_t.shape())); + return std::make_tuple(key_t, counter_t, alg); +} + +template +void FillRandomTensor(OpKernelContext* ctx, Algorithm alg, const Tensor& key, + const Tensor& counter, Distribution dist, + Tensor* tensor) { + typedef typename Distribution::ResultElementType T; + auto flat = tensor->flat(); + if (alg == RNG_ALG_PHILOX) { + // Reuse the compute kernels from the stateful random ops + auto key_data = key.flat().data(); + auto counter_data = counter.flat().data(); + functor::FillPhiloxRandom()( + ctx, ctx->eigen_device(), key_data, counter_data, + random::PhiloxRandom() /*dummy*/, flat.data(), flat.size(), dist); + } else { + OP_REQUIRES(ctx, false, + errors::InvalidArgument("Unsupported algorithm id: ", alg)); + } +} +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_V2_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/stochastic_cast_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/stochastic_cast_op.h new file mode 100644 index 00000000..a1039b7f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/stochastic_cast_op.h @@ -0,0 +1,140 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
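`GetKeyCounterAlgFromInputs` and `FillRandomTensor` above resolve `RNG_ALG_AUTO_SELECT` to Philox and reject any other algorithm id. A compact sketch of that dispatch with a stand-in enum (the real `Algorithm` values come from `rng_alg.h`):

```cpp
// Sketch of the algorithm-dispatch pattern above. The enum values are
// stand-ins; they do not mirror the numeric RNG_ALG_* constants.
#include <cstdio>
#include <optional>
#include <string>

enum class Alg { kAutoSelect, kPhilox, kThreefry };

// Resolve auto-select to a concrete algorithm, reject anything unsupported.
std::optional<std::string> FillWith(Alg alg) {
  if (alg == Alg::kAutoSelect) alg = Alg::kPhilox;
  switch (alg) {
    case Alg::kPhilox:
      // ... the Philox fill kernel would run here ...
      return std::nullopt;  // success
    default:
      return "Unsupported algorithm id";
  }
}

int main() {
  std::printf("%s\n", FillWith(Alg::kAutoSelect) ? "error" : "filled");
  std::printf("%s\n", FillWith(Alg::kThreefry) ? "error" : "filled");
  return 0;
}
```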
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STOCHASTIC_CAST_OP_H_ +#define TENSORFLOW_CORE_KERNELS_STOCHASTIC_CAST_OP_H_ + +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace internal { + +// Base class that dispatches random algorithm, key and counter for +// StochasticCast ops. +class StochasticCastOpBase : public OpKernel { + public: + explicit StochasticCastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; + + protected: + // Subclasses can implement this rounding kernel with assumption that random + // algorithm, key, counter have been given. + virtual void RoundOff(OpKernelContext* ctx, Algorithm alg, const Tensor& key, + const Tensor& counter, Tensor* output) = 0; +}; + +} // namespace internal +} // namespace tensorflow + +namespace Eigen { +namespace internal { + +template +struct StochasticRoundToIntOp { + static_assert(std::is_integral::value, + "Integer type expected"); + typedef tensorflow::random::UniformDistribution + Distribution; + const Scalar max = + static_cast(std::numeric_limits::max()); + const Scalar min = + static_cast(std::numeric_limits::min()); + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC explicit StochasticRoundToIntOp( + Generator* g) + : gen(g) {} + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar + operator()(const Scalar& s) const { + if (TF_PREDICT_FALSE(Eigen::numext::isnan(s))) { + return Scalar{0}; + } + if (s >= max) { + return max; + } + if (s <= min) { + return min; + } + // Already integer, doesn't need to be rounded. + if (Eigen::numext::floor(s) == s) { + return s; + } + // In order to match comparison-based algorithm on some hardware + // implementations which rounds abs(operand) up when random < + // abs(fractional), we deal with positive and negative operands differently. + // TODO(b/232442915): Revisit RNG multi-threading issue when needed. 
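The scalar path (continued just below) rounds a value up with probability equal to its fractional part, so the expected result equals the input. A self-contained sketch of that rule on doubles, with the NaN case kept and the integer-range clamping omitted:

```cpp
// Standalone sketch of the scalar stochastic-rounding rule above. Not the
// Eigen functor itself; just the same arithmetic on doubles.
#include <cmath>
#include <cstdio>
#include <random>

long long StochasticRound(double s, double u /* uniform in [0,1) */) {
  if (std::isnan(s)) return 0;
  if (std::floor(s) == s) return static_cast<long long>(s);
  // Negative and positive operands use mirrored thresholds, as in the header.
  return static_cast<long long>(
      s < 0 ? std::floor(s + u) : std::floor(s + 1.0 - u));
}

int main() {
  std::mt19937 gen(42);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double x = 2.25;  // should round to 3 about 25% of the time
  double sum = 0.0;
  const int kTrials = 100000;
  for (int i = 0; i < kTrials; ++i) sum += StochasticRound(x, uniform(gen));
  std::printf("input %.2f, mean rounded value %.4f\n", x, sum / kTrials);
  return 0;
}
```

Running this prints a mean close to 2.25: every individual result is an integer, but the rounding is unbiased in expectation.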
+ Distribution dist; + Scalar random = dist(gen)[0]; + if (s < 0) { + return Eigen::numext::floor(s + random); + } else { + return Eigen::numext::floor(s + Scalar{1} - random); + } + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet packetOp(const Packet& p) const { + constexpr size_t kPacketSize = + Eigen::internal::unpacket_traits::size; + Scalar unpacked_random[kPacketSize]; + Distribution dist; + auto const sample = dist(gen); + for (int i = 0; i < kPacketSize; i += Distribution::kResultElementCount) { + int granularity = std::min(Distribution::kResultElementCount, + static_cast(kPacketSize - i)); + std::copy(&sample[0], &sample[0] + granularity, &unpacked_random[i]); + } + Packet random = pload(unpacked_random); + Packet rounded = + pselect(pcmp_eq(pfloor(p), p), p, + pselect(pcmp_lt(p, pzero(p)), pfloor(padd(p, random)), + pfloor(padd(p, psub(pset1(1), random))))); + // Handles out of range inputs. + Packet result = + pselect(pcmp_le(pset1(max), p), pset1(max), rounded); + result = + pselect(pcmp_le(p, pset1(min)), pset1(min), result); + // Handles NaN input. + return pselect(pcmp_eq(p, p), result, pset1(0)); + } + Generator* gen; +}; + +template +struct functor_traits< + StochasticRoundToIntOp> { + enum { + Cost = 3 * NumTraits::AddCost, + PacketAccess = + packet_traits::HasCmp && packet_traits::HasRound, + }; +}; + +// TODO(b/232442915): Add support for rounding floats to lower precision floats. + +} // namespace internal +} // namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_STOCHASTIC_CAST_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op.h new file mode 100644 index 00000000..439f22e7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op.h @@ -0,0 +1,123 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_ + +// Functor definition for StridedSliceOp, must be compilable by nvcc. 
+ +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/strided_slice_op.h" + +namespace tensorflow { +namespace functor { + +template +struct StridedSlice { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& start_indices, + const Eigen::DSizes& stop_indices, + const Eigen::DSizes& strides) { + MaybeWith32BitIndexing( + [&](auto output32, auto input32, const auto& start_indices32, + const auto& stop_indices32, const auto& strides32) { + output32.device(d) = + input32.stridedSlice(start_indices32, stop_indices32, strides32); + }, + output, input, start_indices, stop_indices, strides); + } +}; + +template +struct InitOutput { + static void run(const Device& d, typename TTypes::Tensor output) { + output.device(d) = output.constant(T(0)); + } +}; + +template +struct InitOutput { + static void run(const Device& d, + typename TTypes::Tensor output) { + output.device(d) = output.constant(ResourceHandle()); + } +}; + +template +struct InitOutput { + static void run(const Device& d, + typename TTypes::Tensor output) { + output.device(d) = output.constant(tstring()); + } +}; + +template +struct StridedSliceGrad { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& start_indices, + const Eigen::DSizes& stop_indices, + const Eigen::DSizes& strides) { + InitOutput::run(d, output); + MaybeWith32BitIndexing( + [&](auto output32, const auto& start_indices32, + const auto& stop_indices32, const auto& strides32) { + output32.stridedSlice(start_indices32, stop_indices32, strides32) + .device(d) = input; + }, + output, start_indices, stop_indices, strides); + } +}; + +template +struct StridedSliceAssign { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input, + const Eigen::DSizes& start_indices, + const Eigen::DSizes& stop_indices, + const Eigen::DSizes& strides, + const StridedSliceAssignBCast& bcast) { + MaybeWith32BitIndexing( + [&](auto output32, auto input32, const auto& start_indices32, + const auto& stop_indices32, const auto& strides32) { + if (bcast.IsBroadcastingRequired()) { + output32.stridedSlice(start_indices32, stop_indices32, strides32) + .device(d) = input32.broadcast(bcast.bcast()); + } else { + output32.stridedSlice(start_indices32, stop_indices32, strides32) + .device(d) = input32; + } + }, + output, input, start_indices, stop_indices, strides); + } +}; + +template +struct StridedSliceAssignScalar { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input) { + output.device(d) = input; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_gpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_gpu_impl.h new file mode 100644 index 00000000..23a3ff86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_gpu_impl.h @@ -0,0 +1,63 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
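Each functor above ultimately evaluates Eigen's `stridedSlice`, which gathers `input[begin + k * stride]` along every dimension. A one-dimensional, Eigen-free sketch of that indexing (positive strides only):

```cpp
// 1-D sketch of the begin/end/stride indexing that StridedSlice performs.
// Positive strides only; the kernels above also handle the general case.
#include <cstdio>
#include <vector>

std::vector<int> StridedSlice1D(const std::vector<int>& input, int begin,
                                int end, int stride) {
  std::vector<int> out;
  for (int i = begin; i < end; i += stride) out.push_back(input[i]);
  return out;
}

int main() {
  std::vector<int> v = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  // begin=1, end=8, stride=3 selects indices 1, 4, 7.
  for (int x : StridedSlice1D(v, 1, 8, 3)) std::printf("%d ", x);
  std::printf("\n");  // prints: 1 4 7
  return 0;
}
```

`StridedSliceGrad` runs the mapping in reverse: `InitOutput` first fills the larger output with zeros (or empty strings/handles), and the incoming gradient is then scattered back into the strided positions.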
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/strided_slice_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSlice; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceGrad; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssignScalar; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_impl.h new file mode 100644 index 00000000..01e58c9b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/strided_slice_op_impl.h @@ -0,0 +1,304 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_ + +// Functor definition for StridedSliceOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/register_types_traits.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/slice_op.h" +#include "tensorflow/core/kernels/strided_slice_op.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/mem.h" + +namespace tensorflow { + +template +void HandleStridedSliceCase(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const TensorShape& processing_shape, + bool is_simple_slice, Tensor* result); + +template +void HandleStridedSliceGradCase(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const TensorShape& processing_shape, + bool is_simple_slice, Tensor* result); + +template +class HandleStridedSliceAssignCase { + public: + void operator()(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const StridedSliceAssignBCast& bcast, Tensor* result); +}; +} // namespace tensorflow + +// The actual implementation. 
This is designed so multiple +// translation units can include this file in the form +// +// #define STRIDED_SLICE_INSTANTIATE_DIM 1 +// #include +// #undef STRIDED_SLICE_INSTANTIATE_DIM +// +#ifdef STRIDED_SLICE_INSTANTIATE_DIM + +namespace tensorflow { + +template +void HandleStridedSliceCase(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const TensorShape& processing_shape, + bool is_simple_slice, Tensor* result) { + typedef typename proxy_type::type Proxy; + + absl::InlinedVector processing_dims = + processing_shape.dim_sizes(); + if (is_simple_slice) { + Eigen::DSizes begin_di; + Eigen::DSizes sizes_di; + for (int i = 0; i < NDIM; ++i) { + begin_di[i] = begin[i]; + sizes_di[i] = end[i] - begin[i]; + } + functor::Slice()( + context->eigen_device(), + result->bit_casted_shaped(processing_dims), + context->input(0).bit_casted_tensor(), begin_di, sizes_di); + } else { + Eigen::DSizes begin_di; + Eigen::DSizes end_di; + Eigen::DSizes strides_di; + for (int i = 0; i < NDIM; ++i) { + begin_di[i] = begin[i]; + end_di[i] = end[i]; + strides_di[i] = strides[i]; + } + functor::StridedSlice()( + context->eigen_device(), + result->bit_casted_shaped(processing_dims), + context->input(0).bit_casted_tensor(), begin_di, end_di, + strides_di); + } +} + +template +void HandleStridedSliceGradCase(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const TensorShape& processing_shape, + bool is_simple_slice, Tensor* result) { + absl::InlinedVector processing_dims = + processing_shape.dim_sizes(); + + Eigen::DSizes begin_di; + Eigen::DSizes end_di; + Eigen::DSizes strides_di; + for (int i = 0; i < NDIM; ++i) { + begin_di[i] = begin[i]; + end_di[i] = end[i]; + strides_di[i] = strides[i]; + } + + typedef typename proxy_type::type Proxy; + functor::StridedSliceGrad()( + context->eigen_device(), result->bit_casted_tensor(), + context->input(4).bit_casted_shaped(processing_dims), + begin_di, end_di, strides_di); +} + +template +void HandleStridedSliceAssignCase::operator()( + OpKernelContext* context, const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const StridedSliceAssignBCast& bcast, Tensor* result) { + typedef typename proxy_type::type Proxy; + Eigen::DSizes begin_di; + Eigen::DSizes end_di; + Eigen::DSizes strides_di; + for (int i = 0; i < NDIM; ++i) { + begin_di[i] = begin[i]; + end_di[i] = end[i]; + strides_di[i] = strides[i]; + } + + constexpr int kRhsInput = 4; + const Tensor& input = context->input(kRhsInput); + functor::StridedSliceAssign()( + context->eigen_device(), result->bit_casted_tensor(), + input.bit_casted_shaped(bcast.reshape()), begin_di, end_di, + strides_di, bcast); +} + +template +class HandleStridedSliceAssignCase { + public: + enum { NDIM_PROXY = 1 }; + void operator()(OpKernelContext* context, + const absl::Span& begin, + const absl::Span& end, + const absl::Span& strides, + const StridedSliceAssignBCast& bcast, Tensor* result) { + absl::InlinedVector processing_dims(1); + processing_dims[0] = 1; + + typedef typename proxy_type::type Proxy; + functor::StridedSliceAssignScalar()( + context->eigen_device(), + result->bit_casted_shaped(processing_dims), + context->input(4).bit_casted_shaped(processing_dims)); + } +}; + +// NOTE(aselle): according to bsteiner, we need this because otherwise +// nvcc instantiates templates that are invalid. strided_slice_op_gpu.cu +// handles instantiates externally. 
It is important that this is done +// before the HandleXXCase's are instantiated to avoid duplicate +// specialization errors. + +#define PREVENT_INSTANTIATE_DIM1_AND_UP(T, NDIM) \ + namespace functor { \ + template <> \ + void StridedSlice::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& start, \ + const Eigen::DSizes& stop, \ + const Eigen::DSizes& strides); \ + extern template struct StridedSlice; \ + template <> \ + void Slice::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& indices, \ + const Eigen::DSizes& sizes); \ + extern template struct Slice; \ + template <> \ + void StridedSliceGrad::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& start, \ + const Eigen::DSizes& stop, \ + const Eigen::DSizes& strides); \ + extern template struct StridedSliceGrad; \ + template <> \ + void StridedSliceAssign::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& start, \ + const Eigen::DSizes& stop, \ + const Eigen::DSizes& strides, \ + const StridedSliceAssignBCast& bcast); \ + extern template struct StridedSliceAssign; \ + } // namespace functor +#define PREVENT_INSTANTIATE_DIM0_ONLY(T, NDIM) \ + namespace functor { \ + template <> \ + void StridedSliceAssignScalar::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input); \ + extern template struct StridedSliceAssignScalar; \ + } // namespace functor + +// Dimension 0 only instantiates some functors. So we only need +// to prevent ones defined by PREVENT_INSTANTIATE_DIM0_ONLY +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if STRIDED_SLICE_INSTANTIATE_DIM == 0 +#define PREVENT_INSTANTIATE(T, NDIM) PREVENT_INSTANTIATE_DIM0_ONLY(T, NDIM) +#else +#define PREVENT_INSTANTIATE(T, NDIM) PREVENT_INSTANTIATE_DIM1_AND_UP(T, NDIM) +#endif +#else +#define PREVENT_INSTANTIATE(T, NDIM) +#endif + +#define INSTANTIATE_DIM1_AND_UP_HANDLERS(DEVICE, T, DIM) \ + template void HandleStridedSliceCase( \ + OpKernelContext * context, const gtl::ArraySlice& begin, \ + const gtl::ArraySlice& end, \ + const gtl::ArraySlice& strides, \ + const TensorShape& processing_shape, bool is_simple_slice, \ + Tensor* result); \ + template void HandleStridedSliceGradCase( \ + OpKernelContext * context, const gtl::ArraySlice& begin, \ + const gtl::ArraySlice& end, \ + const gtl::ArraySlice& strides, \ + const TensorShape& processing_shape, bool is_simple_slice, \ + Tensor* result); + +#define INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) \ + template class HandleStridedSliceAssignCase; + +// Only some kernels need to be instantiated on dim 0. 
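The `extern template struct ...` lines above are explicit instantiation declarations: they stop the including translation unit from instantiating the functors itself, so only the definitions emitted by `strided_slice_op_gpu.cu` exist. A generic, single-file sketch of that mechanism with a hypothetical `SliceLike` template (in the real code the instantiation definition lives in a separate .cu file rather than the same file):

```cpp
#include <cstdio>

template <typename T, int NDIM>
struct SliceLike {
  T Pick(const T* data, int stride) const { return data[stride * NDIM]; }
};

// Explicit instantiation *declaration*: do not instantiate this
// specialization implicitly here; a definition is promised elsewhere.
extern template struct SliceLike<float, 2>;

int main() {
  const float data[] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  SliceLike<float, 2> s;
  std::printf("%g\n", s.Pick(data, 3));  // data[3 * 2] == 6
  return 0;
}

// Explicit instantiation *definition*: the one place the code is emitted.
// In the kernels above this lives in strided_slice_op_gpu.cu.
template struct SliceLike<float, 2>;
```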
+#if STRIDED_SLICE_INSTANTIATE_DIM == 0 +#define INSTANTIATE(DEVICE, T, DIM) \ + INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) +#else +#define INSTANTIATE(DEVICE, T, DIM) \ + INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) \ + INSTANTIATE_DIM1_AND_UP_HANDLERS(DEVICE, T, DIM) +#endif + +#define DECLARE_FOR_N_CPU(T) \ + INSTANTIATE(CPUDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) + +#define PREVENT_FOR_N_GPU(T) \ + PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM) + +#define DECLARE_FOR_N_GPU(T) \ + INSTANTIATE(GPUDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +TF_CALL_GPU_PROXY_TYPES(PREVENT_FOR_N_GPU); +TF_CALL_COMPLEX_TYPES(PREVENT_FOR_N_GPU); + +TF_CALL_INTEGRAL_TYPES(DECLARE_FOR_N_GPU); +TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N_GPU); +#endif // END GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU); +TF_CALL_QUANTIZED_TYPES(DECLARE_FOR_N_CPU); +TF_CALL_float8_e5m2(DECLARE_FOR_N_CPU); +TF_CALL_float8_e4m3fn(DECLARE_FOR_N_CPU); + +#undef INSTANTIATE +#undef DECLARE_FOR_N_CPU +#undef DECLARE_FOR_N_GPU + +} // end namespace tensorflow + +#endif // END STRIDED_SLICE_INSTANTIATE_DIM +#endif // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h new file mode 100644 index 00000000..f9119259 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h @@ -0,0 +1,67 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_FAST_OP_H_ +#define TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_FAST_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +template +class StringToHashBucketOp : public OpKernel { + public: + explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_buckets", &num_buckets_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + const auto& input_flat = input_tensor->flat(); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("output", input_tensor->shape(), + &output_tensor)); + auto output_flat = output_tensor->flat(); + + typedef decltype(input_flat.size()) Index; + for (Index i = 0; i < input_flat.size(); ++i) { + const uint64 input_hash = hash(input_flat(i)); + const uint64 bucket_id = input_hash % num_buckets_; + // The number of buckets is always in the positive range of int64 so is + // the resulting bucket_id. 
Casting the bucket_id from uint64 to int64 is + // safe. + output_flat(i) = static_cast(bucket_id); + } + } + + private: + int64_t num_buckets_; + + StringToHashBucketOp(const StringToHashBucketOp&) = delete; + void operator=(const StringToHashBucketOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_FAST_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_op.h new file mode 100644 index 00000000..71fba9b6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/string_to_hash_bucket_op.h @@ -0,0 +1,75 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +template +class StringToKeyedHashBucketOp : public OpKernel { + public: + explicit StringToKeyedHashBucketOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_buckets", &num_buckets_)); + + std::vector key; + OP_REQUIRES_OK(ctx, ctx->GetAttr("key", &key)); + OP_REQUIRES(ctx, key.size() == 2, + errors::InvalidArgument("Key must have 2 elements")); + std::memcpy(key_, key.data(), sizeof(key_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + const auto& input_flat = input_tensor->flat(); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("output", input_tensor->shape(), + &output_tensor)); + auto output_flat = output_tensor->flat(); + + typedef decltype(input_flat.size()) Index; + for (Index i = 0; i < input_flat.size(); ++i) { + const uint64 input_hash = hash(key_, input_flat(i)); + const uint64 bucket_id = input_hash % num_buckets_; + // The number of buckets is always in the positive range of int64 so is + // the resulting bucket_id. Casting the bucket_id from uint64 to int64 is + // safe. 
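Both hash-bucket kernels reduce to `bucket = hash(string) % num_buckets`, with the keyed variant mixing the caller-supplied `key_` into the hash. A sketch of the unkeyed mapping, using `std::hash` purely as a stand-in (it carries no cross-build stability guarantee, unlike the fingerprint these ops rely on):

```cpp
// Sketch of the hash-bucket mapping above. std::hash is only a placeholder
// for the fingerprint/keyed hash the real kernels use.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

int64_t ToBucket(const std::string& s, int64_t num_buckets) {
  const uint64_t h = std::hash<std::string>{}(s);
  // num_buckets is positive, so the modulo fits in int64 (see comment above).
  return static_cast<int64_t>(h % static_cast<uint64_t>(num_buckets));
}

int main() {
  const std::vector<std::string> words = {"cat", "dog", "cat"};
  for (const auto& w : words) {
    std::printf("%s -> bucket %lld\n", w.c_str(),
                static_cast<long long>(ToBucket(w, /*num_buckets=*/16)));
  }
  // Equal strings always land in the same bucket; distinct strings may collide.
  return 0;
}
```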
+ output_flat(i) = static_cast(bucket_id); + } + } + + private: + int64_t num_buckets_; + uint64 key_[2]; + + StringToKeyedHashBucketOp(const StringToKeyedHashBucketOp&) = delete; + void operator=(const StringToKeyedHashBucketOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRING_TO_HASH_BUCKET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/string_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/string_util.h new file mode 100644 index 00000000..58230d3d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/string_util.h @@ -0,0 +1,88 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +// Enumeration for unicode encodings. Used by ops such as +// tf.strings.unicode_encode and tf.strings.unicode_decode. +enum class UnicodeEncoding { UTF8, UTF16BE, UTF32BE }; + +// Enumeration for character units. Used by string such as +// tf.strings.length and tf.substr. +// TODO(edloper): Add support for: UTF32_CHAR, etc. +enum class CharUnit { BYTE, UTF8_CHAR }; + +// Whether or not the given byte is the trailing byte of a UTF-8/16/32 char. +inline bool IsTrailByte(char x) { return static_cast(x) < -0x40; } + +// Sets `encoding` based on `str`. +absl::Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding); + +// Sets `unit` value based on `str`. +absl::Status ParseCharUnit(const string& str, CharUnit* unit); + +// Returns the number of Unicode characters in a UTF-8 string. +// Result may be incorrect if the input string is not valid UTF-8. +int32 UTF8StrLen(const string& str); + +// Get the next UTF8 character position starting at the given position and +// skipping the given number of characters. Position is a byte offset, and +// should never be `null`. The function return true if successful. However, if +// the end of the string is reached before the requested characters, then the +// position will point to the end of string and this function will return false. +template +bool ForwardNUTF8CharPositions(const absl::string_view in, + const T num_utf8_chars_to_shift, T* pos) { + const size_t size = in.size(); + T utf8_chars_counted = 0; + while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) { + // move forward one utf-8 character + do { + ++*pos; + } while (*pos < size && IsTrailByte(in[*pos])); + ++utf8_chars_counted; + } + return utf8_chars_counted == num_utf8_chars_to_shift; +} + +// Get the previous UTF8 character position starting at the given position and +// skipping the given number of characters. Position is a byte offset with a +// positive value, relative to the beginning of the string, and should never be +// `null`. 
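`IsTrailByte` flags UTF-8 continuation bytes (bit pattern `10xxxxxx`), and the position helpers step over runs of them. A standalone sketch using the equivalent mask test:

```cpp
// Sketch of the trail-byte logic above: a UTF-8 continuation byte has the
// form 10xxxxxx, so counting characters means counting non-trail bytes.
#include <cstdio>
#include <string>

bool IsTrail(char c) { return (static_cast<unsigned char>(c) & 0xC0) == 0x80; }

int Utf8Len(const std::string& s) {
  int n = 0;
  for (char c : s) {
    if (!IsTrail(c)) ++n;
  }
  return n;
}

// Advance *pos by n characters; returns false if the string ends first.
bool ForwardNChars(const std::string& s, int n, size_t* pos) {
  int moved = 0;
  while (moved < n && *pos < s.size()) {
    do {
      ++*pos;  // step one byte, then skip any continuation bytes
    } while (*pos < s.size() && IsTrail(s[*pos]));
    ++moved;
  }
  return moved == n;
}

int main() {
  const std::string s = "a\xC3\xA9z";  // 'a', U+00E9 (two bytes), 'z'
  std::printf("chars: %d\n", Utf8Len(s));  // 3
  size_t pos = 0;
  ForwardNChars(s, 2, &pos);
  std::printf("byte offset after 2 chars: %zu\n", pos);  // 3
  return 0;
}
```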
The function return true if successful. However, if the beginning of +// the string is reached before the requested character, then the position will +// point to the beginning of the string and this function will return false. +template +bool BackNUTF8CharPositions(const absl::string_view in, + const T num_utf8_chars_to_shift, T* pos) { + const size_t start = 0; + T utf8_chars_counted = 0; + while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) { + // move back one utf-8 character + do { + --*pos; + } while (IsTrailByte(in[*pos]) && *pos > start); + ++utf8_chars_counted; + } + return utf8_chars_counted == num_utf8_chars_to_shift; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/summary_interface.h b/third_party/tflite-hdrs/tensorflow/core/kernels/summary_interface.h new file mode 100644 index 00000000..f423d4ab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/summary_interface.h @@ -0,0 +1,64 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ +#define TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ + +#include + +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Event; +class GraphDef; + +// Main interface for the summary writer resource. +class SummaryWriterInterface : public ResourceBase { + public: + virtual ~SummaryWriterInterface() override {} + + // Flushes all unwritten messages in the queue. + virtual absl::Status Flush() = 0; + + // These are called in the OpKernel::Compute methods for the summary ops. + virtual absl::Status WriteTensor(int64_t global_step, Tensor t, + const string& tag, + const string& serialized_metadata) = 0; + + virtual absl::Status WriteScalar(int64_t global_step, Tensor t, + const string& tag) = 0; + + virtual absl::Status WriteHistogram(int64_t global_step, Tensor t, + const string& tag) = 0; + + virtual absl::Status WriteImage(int64_t global_step, Tensor t, + const string& tag, int max_images, + Tensor bad_color) = 0; + + virtual absl::Status WriteAudio(int64_t global_step, Tensor t, + const string& tag, int max_outputs_, + float sample_rate) = 0; + + virtual absl::Status WriteGraph(int64_t global_step, + std::unique_ptr graph) = 0; + + virtual absl::Status WriteEvent(std::unique_ptr e) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_array.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_array.h new file mode 100644 index 00000000..aef4a97b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_array.h @@ -0,0 +1,629 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_ + +#include + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/aggregate_ops.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace tensor_array { + +// Full implementations are in tensor_array.cc +template +absl::Status AddToTensor(OpKernelContext* ctx, Tensor* sum, + const Tensor* current, const Tensor* add) { + return errors::InvalidArgument( + "tensor_array::AddToTensor type not supported: ", + DataTypeString(DataTypeToEnum::value)); +} + +#define TENSOR_ARRAY_WRITE_OR_ADD(Device, T) \ + template <> \ + Status AddToTensor(OpKernelContext * ctx, Tensor * sum, \ + const Tensor* current, const Tensor* add); + +#define TENSOR_ARRAY_WRITE_OR_ADD_CPU(T) TENSOR_ARRAY_WRITE_OR_ADD(CPUDevice, T) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU) +#undef TENSOR_ARRAY_WRITE_OR_ADD_CPU + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T) +TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +#undef TENSOR_ARRAY_WRITE_OR_ADD_GPU + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#undef TENSOR_ARRAY_WRITE_OR_ADD + +template +absl::Status TensorSetZero(OpKernelContext* ctx, Tensor* value) { + return errors::InvalidArgument( + "tensor_array::TensorSetZero type not supported: ", + DataTypeString(DataTypeToEnum::value)); +} + +#define TENSOR_ARRAY_SET_ZERO(Device, T) \ + template <> \ + Status TensorSetZero(OpKernelContext * ctx, Tensor * value); + +#define TENSOR_ARRAY_SET_ZERO_CPU(T) TENSOR_ARRAY_SET_ZERO(CPUDevice, T) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU); +TF_CALL_bool(TENSOR_ARRAY_SET_ZERO_CPU); +#undef TENSOR_ARRAY_SET_ZERO_CPU + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define TENSOR_ARRAY_SET_ZERO_GPU(T) TENSOR_ARRAY_SET_ZERO(GPUDevice, T) +TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); +#undef TENSOR_ARRAY_SET_ZERO_GPU + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#undef TENSOR_ARRAY_SET_ZERO + +} // namespace tensor_array + +// The TensorArray object keeps an array of Tensors. It allows reading from the +// array and writing to the array. 
+// +// Important properties: +// * Usually, writing to a particular index in the TensorArray is allowed at +// most once per index. In a special case, writes with the flag +// multiple_writes_aggregate allow multiple writes to the same +// index. In this case, the writes are summed. +// * Multiple reads are supported. +// * Deep copies of Tensors are rarely made. The only time they are made is +// when WriteOrAggregate is called at least twice on the same index with the +// flag multiple_writes_aggregate = True. +// * Reading and Writing to the array is protected by a mutex. +// All operations on a TensorArray are thread-safe. +// * A TensorArray may be preemptively closed, which releases all +// memory associated with it. +// +// These properties together allow the TensorArray to work as a +// functional object and makes gradient computation easy. For +// example: +// * Write-Once semantics mean the gradient of a TensorArray Read never has to +// worry which of multiple writes to that index the gradient value +// is meant for. +// * Read-Many semantics (when using clear_after_read=false) allow the +// TensorArray to be read, packed, or concatenated multiple times; +// and the gradient operations use the multiple_writes_aggregate +// flag to aggregate the backprop writes. Multiple backprop writes to +// the same index are partial gradients corresponding to the +// multiple reads of that index in the forward phase. +// +class TensorArray : public ResourceBase { + public: + static std::atomic tensor_array_counter; + + // Construct a TensorArray for holding Tensors of type 'dtype' with + // 'N' elements. While the underlying storage is a std::vector and + // can hold more than MAX_INT entries, in practice we do not expect + // users to construct this many Tensors for storage in a TensorArray. + TensorArray(const string& key, const DataType& dtype, const Tensor& handle, + int32_t N, const PartialTensorShape& element_shape, + bool identical_element_shapes, bool dynamic_size, + bool multiple_writes_aggregate, bool is_grad, int32_t marked_size, + bool clear_after_read) + : key_(key), + dtype_(dtype), + handle_(handle), + closed_(false), + dynamic_size_(dynamic_size), + multiple_writes_aggregate_(multiple_writes_aggregate), + gradients_disallowed_(false), + clear_after_read_(clear_after_read), + is_grad_(is_grad), + marked_size_(marked_size), + element_shape_(element_shape), + identical_element_shapes_(identical_element_shapes), + tensors_(N) {} + + // Write Tensor 'value' to index 'index'. + // + // Preconditions: + // * The TensorArray is not closed + // * If the array has dynamic size: + // The index is >= 0 + // Otherwise: + // The index is in [0, N) where N == Size() + // * The dtype of the Tensor in 'value' matches the TensorArray's dtype. + // * If multiple_writes_aggregate is false: + // The Tensor at 'index' has not yet been written to. + // * If multiple_writes_aggregate is true: + // The Tensor at 'index' has the same shape as value. + // + // Side effects: + // * On the first write to 'index': + // - The underlying Tensor in 'value' has a new reference to it. + // - The index 'index' is marked as written. + // * If multiple_writes_aggregate is false, subsequent writes to 'index' + // raise an InvalidArgument error. + // * If multiple_writes_aggregate is true, subsequent writes to 'index': + // - The underlying Tensors in 'value' and from the first write + // are released and a local Tensor is created. + // - Index 'index' is also marked as local_copy. 
+ // - The gradients_disallowed flag is set true (GradientsAllowed() + // will now return false). + // + // Note, value is passed as a pointer because we its underlying + // Tensor's shape is accessed. Otherwise it is not modified. + template + absl::Status WriteOrAggregate(OpKernelContext* ctx, const int32_t index, + const Tensor* value) { + mutex_lock l(mu_); + return LockedWriteOrAggregate(ctx, index, value); + } + + template + absl::Status WriteOrAggregateMany(OpKernelContext* ctx, + const std::vector& indices, + std::vector* values) { + mutex_lock l(mu_); + int32_t i = 0; + for (const int32_t ix : indices) { + absl::Status s = + LockedWriteOrAggregate(ctx, ix, &(*values)[i]); + ++i; + TF_RETURN_IF_ERROR(s); + } + return absl::OkStatus(); + } + + // Read from index 'index' into Tensor 'value'. + // + // Preconditions: + // * The TensorArray is not closed + // * The index is in [0, N) + // * The Tensor at 'index' has been written to. + // * The Tensor at 'index' has not been read from with flag + // clear_after_read = true. + // + // Side effects: + // * If clear_after_read is true, the reference to the underlying + // Tensor is deleted. + // * The reference to the underlying Tensor at 'index' is copied to + // the returned '*value'. + // * The index is marked as read (it cannot be rewritten to). + template + absl::Status Read(OpKernelContext* ctx, const int32_t index, Tensor* value) { + mutex_lock l(mu_); + return LockedRead(ctx, index, value); + } + + template + absl::Status ReadMany(OpKernelContext* ctx, const std::vector& indices, + std::vector* values) { + mutex_lock l(mu_); + values->clear(); + values->resize(indices.size()); + int32_t i = 0; + for (const int32_t ix : indices) { + absl::Status s = LockedRead(ctx, ix, &(*values)[i]); + ++i; + if (!s.ok()) return s; + } + return absl::OkStatus(); + } + + DataType ElemType() const { return dtype_; } + + PartialTensorShape ElemShape() { + mutex_lock l(mu_); + return element_shape_; + } + + absl::Status SetElemShape(const PartialTensorShape& candidate) { + mutex_lock l(mu_); + PartialTensorShape new_element_shape_; + absl::Status s = element_shape_.MergeWith(candidate, &new_element_shape_); + if (!s.ok()) { + return s; + } + element_shape_ = new_element_shape_; + return absl::OkStatus(); + } + + string DebugString() const override { + mutex_lock l(mu_); + CHECK(!closed_); + return strings::StrCat("TensorArray[", tensors_.size(), "]"); + } + + bool IsClosed() { + mutex_lock l(mu_); + return closed_; + } + + // Return the size of the TensorArray. + absl::Status Size(int32* size) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + *size = tensors_.size(); + return absl::OkStatus(); + } + + // Record the size of the TensorArray after an unpack or split. + absl::Status SetMarkedSize(int32_t size) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + if (!is_grad_) { + marked_size_ = size; + } + return absl::OkStatus(); + } + + // Return the marked size of the TensorArray. + absl::Status MarkedSize(int32* size) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + *size = marked_size_; + return absl::OkStatus(); + } + + // Return the size that should be used by pack or concat op. + absl::Status PackOrConcatSize(int32* size) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + *size = is_grad_ ? marked_size_ : tensors_.size(); + return absl::OkStatus(); + } + + // Once a TensorArray is being used for gradient calculations, it + // should be marked as no longer resizeable. 
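The write rules described above reduce to: each index is written at most once, unless `multiple_writes_aggregate` is set, in which case later writes are summed into the stored value and gradients are disallowed from then on. A toy, TensorFlow-free model of just that rule, with doubles standing in for Tensors:

```cpp
// Toy model of the write-once / aggregate-on-rewrite rule described above.
// Elements are doubles instead of Tensors; errors are reported as a bool.
#include <cstdio>
#include <optional>
#include <vector>

class ToyArray {
 public:
  ToyArray(int n, bool aggregate) : slots_(n), aggregate_(aggregate) {}

  // Returns false on an illegal second write.
  bool Write(int index, double value) {
    std::optional<double>& slot = slots_[index];
    if (!slot.has_value()) {
      slot = value;                 // first write: just store
      return true;
    }
    if (!aggregate_) return false;  // write-once violated
    *slot += value;                 // aggregate (e.g. summed backprop writes)
    gradients_disallowed_ = true;
    return true;
  }

  std::optional<double> Read(int index) const { return slots_[index]; }
  bool GradientsAllowed() const { return !gradients_disallowed_; }

 private:
  std::vector<std::optional<double>> slots_;
  bool aggregate_;
  bool gradients_disallowed_ = false;
};

int main() {
  ToyArray ta(/*n=*/2, /*aggregate=*/true);
  ta.Write(0, 1.5);
  ta.Write(0, 2.5);  // second write is summed rather than rejected
  std::printf("slot 0 = %.1f, gradients allowed: %s\n", *ta.Read(0),
              ta.GradientsAllowed() ? "yes" : "no");  // 4.0, no
  return 0;
}
```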
+ void DisableDynamicSize() { + mutex_lock l(mu_); + dynamic_size_ = false; + } + + bool HasDynamicSize() { + mutex_lock l(mu_); + return dynamic_size_; + } + + bool GradientsAllowed() { + mutex_lock l(mu_); + return !gradients_disallowed_; + } + + bool HasIdenticalElementShapes() const { return identical_element_shapes_; } + + // Copy the TensorShapes from another TensorArray into this one. + // If `shapes_to_prepend` is set, expands the rank of the copied shape by + // prepending the passed in shape prefix to the shape values in `rhs`. + // The sizes of the two TensorArrays must match and this one + // may not have any entries filled in. This performs a "soft copy", + // essentially filling the current TensorArray with virtual + // zero-tensors, which will be replaced by future aggregate writes, + // or instantiated by future reads. Requires a non-const pointer + // to the rhs to access its mutex. + absl::Status CopyShapesFrom(TensorArray* rhs, + const TensorShape* shape_to_prepend); + + // Clear the TensorArray, including any Tensor references, and mark as closed. + void ClearAndMarkClosed() { + mutex_lock l(mu_); + tensors_.clear(); + closed_ = true; + } + + mutex* mu() { return &mu_; } + Tensor* handle() { return &handle_; } + + ResourceHandle resource_handle(OpKernelContext* ctx) { + return ctx->step_container()->MakeResourceHandle( + key_, *ctx->device()); + } + + private: + absl::Status LockedWrite(OpKernelContext* ctx, const int32_t index, + Tensor* value) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + template + absl::Status LockedWriteOrAggregate(OpKernelContext* ctx, const int32_t index, + const Tensor* value) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + template + absl::Status LockedRead(OpKernelContext* ctx, const int32_t index, + Tensor* value) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Status LockedReturnIfClosed() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + " has already been closed."); + } + return absl::OkStatus(); + } + + const string key_; + + const DataType dtype_; + Tensor handle_; + + mutable mutex mu_; + + // Marks that the tensor_array_ has been cleared. + bool closed_ TF_GUARDED_BY(mu_); + + // Writes are allowed to grow the array. + bool dynamic_size_; + + // Multiple writes to the same index will result in summation of the + // values (used by backprop) + const bool multiple_writes_aggregate_; + + // If multiple Writes were attempted (e.g. via attribute + // multiple_writes_aggregate), then gradients are disallowed. + bool gradients_disallowed_ TF_GUARDED_BY(mu_); + + // After a read at an index, clear away its Tensor to release memory. + const bool clear_after_read_; + + // True iff this is a gradient tensor array. + const bool is_grad_; + + // The size of the TensorArray after a (legacy) unpack or split is performed. + // -1 if there has been no unpack or split performed on the TensorArray. + int32 marked_size_; + + // The shape of each element in the TensorArray, may be partially known or not + // known at all. + PartialTensorShape element_shape_ TF_GUARDED_BY(mu_); + + // Whether all elements in the TensorArray have identical shapes. + // This allows certain behaviors, like dynamically checking for + // consistent shapes on write, and being able to fill in properly + // shaped zero tensors on stack -- even if the initial element_shape + // was not fully defined. 
+ const bool identical_element_shapes_; + + // TensorAndState is used to keep track of the Tensors stored in the + // TensorArray, along with their shapes, and a boolean that determines whether + // they have already been read or not. + struct TensorAndState { + TensorAndState() + : written(false), read(false), cleared(false), local_copy(false) {} + Tensor tensor; + TensorShape shape; + bool written; // True if a Tensor has been written to the index. + bool read; // True if a Tensor has been written to and read from the index. + bool cleared; // True if a tensor has been read with + // clear_after_read = true; + + // Used by writes when multiple_writes_aggregate is true. In this + // case, the first time a value is written, it is a shallow copy. + // The second time a value is written, it is aggregated. However, + // in this case a new Tensor must be constructed to hold the + // aggregated value. This flag marks that such a Tensor is being + // used. All future writes will aggregate to the existing local Tensor. + bool local_copy; + }; + // The list of underlying Tensors and states. + std::vector tensors_ TF_GUARDED_BY(mu_); +}; + +template +absl::Status TensorArray::LockedWriteOrAggregate(OpKernelContext* ctx, + const int32_t index, + const Tensor* value) { + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + size_t index_size = static_cast(index); + if (index < 0 || (!dynamic_size_ && index_size >= tensors_.size())) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), ": Tried to write to index ", + index, " but array is not resizeable and size is: ", tensors_.size()); + } + if (dynamic_size_) { + // We must grow the internal TensorArray + if (index_size >= tensors_.capacity()) { + tensors_.reserve(2 * (index_size + 1)); + } + if (index_size >= tensors_.size()) { + tensors_.resize(index_size + 1); + } + } + TensorAndState& t = tensors_[index]; + + if (value->dtype() != dtype_) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", index, + " because the value dtype is ", DataTypeString(value->dtype()), + " but TensorArray dtype is ", DataTypeString(dtype_), "."); + } + if (!element_shape_.IsCompatibleWith(value->shape())) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", index, + " because the value shape is ", value->shape().DebugString(), + " which is incompatible with the TensorArray's inferred element " + "shape: ", + element_shape_.DebugString(), " (consider setting infer_shape=False)."); + } else if (identical_element_shapes_ && !element_shape_.IsFullyDefined()) { + element_shape_ = PartialTensorShape(value->shape().dim_sizes()); + } + + if (t.read) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", + index, " because it has already been read."); + } + + if (!multiple_writes_aggregate_ && t.written) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", + index, + " because it has already been written to."); + } + + if (t.written) { + DCHECK(multiple_writes_aggregate_); + + // Check that value shape matches t.shape + if (value->shape() != t.shape) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not aggregate to TensorArray index ", index, + " because the existing shape is ", t.shape.DebugString(), + " but the new input shape is ", value->shape().DebugString(), "."); + } + + if 
(!t.tensor.IsInitialized() || t.tensor.NumElements() == 0) { + // If existing_t == nullptr but written == true, then what was stored + // was just a shape, which just means zeros. So all we must do in this + // case is copy the reference over and return early. + t.tensor = *value; + return absl::OkStatus(); + } + + Tensor* existing_t = &t.tensor; + + if (t.local_copy) { + absl::Status s = tensor_array::AddToTensor(ctx, existing_t, + existing_t, value); + TF_RETURN_IF_ERROR(s); + } else { + Tensor local_tensor; + TF_RETURN_IF_ERROR( + ctx->allocate_temp(dtype_, existing_t->shape(), &local_tensor)); + absl::Status s = tensor_array::AddToTensor(ctx, &local_tensor, + existing_t, value); + TF_RETURN_IF_ERROR(s); + t.tensor = local_tensor; + t.local_copy = true; + } + + // We've aggregated the values, so disallow backprop on this + // TensorArray. + gradients_disallowed_ = true; + } else { + t.tensor = *value; + t.shape = value->shape(); + t.written = true; + } + return absl::OkStatus(); +} + +template +absl::Status TensorArray::LockedRead(OpKernelContext* ctx, const int32_t index, + Tensor* value) { + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + if ((index < 0) || + (!is_grad_ && (static_cast(index) >= tensors_.size()))) { + return errors::InvalidArgument("Tried to read from index ", index, + " but array size is: ", tensors_.size()); + } + size_t index_t = static_cast(index); + if ((is_grad_ && (index_t >= tensors_.size() || !tensors_[index].written)) || + (!is_grad_ && (index_t < tensors_.size() && !tensors_[index].written))) { + // Special case returning zeros if this is a gradient read that happens + // after a stop_gradients call with dynamic forward TensorArrays. + // There is sometimes a race condition where the gradient is not + // written due to stop_gradients, but is later read. + TensorShape element_shape; + if (is_grad_ && index_t < tensors_.size() && + tensors_[index].shape.dims() > 0) { + // A gradient TensorArray has more specific gradient information + // available for each entry. A forward TensorArray must rely on + // the global element_shape_ to fill in zeros on read. + element_shape = tensors_[index].shape; + } else if (!element_shape_.IsFullyDefined()) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not read from TensorArray index ", index, + ". Furthermore, the element shape is not fully defined: ", + element_shape_.DebugString(), + ". It is possible you are working with a resizeable TensorArray and " + "stop_gradients is not allowing the gradients to be written. If you " + "set the full " + "element_shape property on the forward TensorArray, the proper " + "all-zeros tensor " + "will be returned instead of incurring this error."); + } else { + element_shape_.AsTensorShape(&element_shape); // Always succeeds. + } + if (index_t >= tensors_.size()) { + // Fill in tensors_ up to index to have known shape. 
+ size_t old_tensors_size = tensors_.size(); + tensors_.resize(index + 1); + for (size_t i = old_tensors_size; i < index + 1; ++i) { + tensors_[i].shape = element_shape; + tensors_[i].written = true; + } + } else { + tensors_[index].shape = element_shape; + tensors_[index].written = true; + } + } + + TensorAndState& t = tensors_[index]; + + if (t.cleared) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not read index ", index, + " twice because it was cleared after a " + "previous read (perhaps try setting " + "clear_after_read = false?)."); + } + + if (!t.tensor.IsInitialized() || t.tensor.NumElements() == 0) { + // We stored just a shape, but no value. This means create and + // return zeros of the appropriate shape. + TF_RETURN_IF_ERROR(ctx->allocate_temp(dtype_, t.shape, &t.tensor)); + if (t.shape.num_elements() > 0) { + absl::Status s = tensor_array::TensorSetZero(ctx, &t.tensor); + if (!s.ok()) return s; + } + } + + // Data is available inside the tensor, copy the reference over. + *value = t.tensor; + + if (clear_after_read_) { + t.tensor = Tensor(); + t.cleared = true; + } + t.read = true; + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_cord.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_cord.h new file mode 100644 index 00000000..2d3d4e3f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_cord.h @@ -0,0 +1,363 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_CORD_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_CORD_H_ + +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/variant_tensor_data.h" + +namespace tensorflow { + +typedef void (*CordRepReleaser)(void*); + +class TensorCord { + // A TensorCord keeps a view into some data, and a cleanup method to clean up + // that data when the TensorCord destructor is called. Copying a TensorCord + // increments a reference count to the cleanup method, and so the cleanup + // method is only called when all copies of the original TensorCord are + // cleared. + // + // Example: + // + // const string& s = t.scalar()(); + // TensorCord tc(s, &t); + // ASSERT_EQ(s, tc.view()); + // TensorCord copy(tc); + // tc = TensorCord(); // cleanup not called; the reference is held by `copy`. + // copy = TensorCord(); // cleanup happens now, the reference is destroyed. 
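The example above is essentially shared ownership of a view plus a cleanup callback that runs when the last copy disappears. A minimal stand-in for that behaviour, with std::shared_ptr supplying the reference count (MiniCord and StringReleaser are illustrative, not the real TensorCord implementation):

    // Sketch only: a view plus a releaser that fires when the last copy of
    // the cord is destroyed.
    #include <cstdio>
    #include <memory>
    #include <string>
    #include <string_view>

    class MiniCord {
     public:
      using Releaser = void (*)(void*);

      MiniCord() = default;
      MiniCord(std::string_view view, Releaser releaser, void* memory)
          : view_(view), rep_(memory, releaser) {}  // deleter == releaser

      std::string_view view() const { return view_; }

     private:
      std::string_view view_;
      std::shared_ptr<void> rep_;  // refcount; runs the releaser at zero
    };

    static void StringReleaser(void* p) { delete static_cast<std::string*>(p); }

    int main() {
      auto* s = new std::string("some backing storage");
      MiniCord tc(*s, &StringReleaser, s);
      MiniCord copy = tc;   // two references to the backing string
      tc = MiniCord();      // releaser not called; `copy` still holds it
      std::printf("%.*s\n", static_cast<int>(copy.view().size()),
                  copy.view().data());
      copy = MiniCord();    // last reference gone: StringReleaser runs here
    }

The second example in the header comment, continued below, applies the same idea with an explicit TensorProto deleter instead of a string.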
+ // + // Another example: + // + // void TensorProtoDeleter(void* ptr) { + // delete static_cast(ptr); + // } + // + // auto p = std::make_unique(...); + // absl::string_view content(p->tensor_content()); + // TensorCord tc(content, TensorProtoDeleter, p.release()); + // + + public: + static constexpr const char kTypeName[] = "tensorflow::TensorCord"; + + TensorCord() : chunks_() {} + + ~TensorCord(); + + // Args: + // `view`: should point to a location in memory that is guaranteed to remain + // valid until `releaser` is called. + // `releaser`: A callback that will be executed when there are no references + // left on `view`. It will be called via `releaser(memory)`. + // `memory`: The argument passed to `releaser` when it is called. + // + // You are STRONGLY advised to provide a non-null `releaser`, and a pointer + // to the underlying data (while ensuring that the data will not be deleted + // until `releaser(memory)` is called). Otherwise the TensorCord may + // outlive the data backing `view`. + TensorCord(absl::string_view view, CordRepReleaser releaser, + void* memory = nullptr) + : chunks_({new CordRep(view, releaser, memory)}) {} + + // Args: + // `view`: should point to a location in memory backed by `tensor`, + // e.g., `view` is a string_view on a tstring which is an element + // of `tensor`. Furthermore, the associated tstring is not expected + // to be modified in such a way that the underlying memory will + // be changed after this TensorCord is created. + TensorCord(absl::string_view view, Tensor* tensor) + : chunks_({NewCordRepFromTensor(view, tensor)}) {} + + // Disallow construction with empty callback or empty tensor. + TensorCord(absl::string_view view, std::nullptr_t, void* memory) = delete; + TensorCord(absl::string_view view, std::nullptr_t) = delete; + + TensorCord(const TensorCord& other); + + TensorCord(TensorCord&& other) noexcept; + + TensorCord& operator=(const TensorCord& other); + + TensorCord& operator=(TensorCord&& other) noexcept; + + void Append(const TensorCord& other); + + void Append(absl::string_view view, CordRepReleaser releaser, + void* memory = nullptr); + + void Append(absl::string_view view, Tensor* tensor); + + // Disallow Appends with empty callbacks or empty tensors. + void Append(absl::string_view view, std::nullptr_t, void* memory) = delete; + void Append(absl::string_view view, std::nullptr_t) = delete; + + size_t size() const; + bool empty() const { return size() == 0; } + + // NOTE: This performs an expensive copy of the underlying data. + explicit operator string() const; + + class ChunkIterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = absl::string_view; + using difference_type = ptrdiff_t; + using pointer = const value_type*; + using reference = value_type; + + ChunkIterator& operator++(); + + ChunkIterator operator++(int) { + ChunkIterator tmp(*this); + operator++(); + return tmp; + } + + bool operator==(const ChunkIterator& other) const { + return (cord_ == other.cord_ && chunk_index_ == other.chunk_index_); + } + + bool operator!=(const ChunkIterator& other) const { + return !(*this == other); + } + reference operator*() const { + assert(cord_ != nullptr); + return view_; + } + pointer operator->() const { + assert(cord_ != nullptr); + return &view_; + } + + friend class TensorCord; + + private: + // Constructs a `begin()` iterator from `cord`. 
+ explicit ChunkIterator(const TensorCord* cord, int chunk_index); + + const TensorCord* const cord_; + int chunk_index_; + absl::string_view view_; + }; + + class ChunkRange { + public: + explicit ChunkRange(const TensorCord* cord) : cord_(cord) {} + + ChunkIterator begin() const { return ChunkIterator(cord_, 0); } + + ChunkIterator end() const { + return ChunkIterator(cord_, cord_->chunks_.size()); + } + + private: + const TensorCord* cord_; + }; + + // Note that the ordinary caveats of temporary lifetime extension apply: + // + // void Process() { + // for (absl::string_view chunk : CordFactory().Chunks()) { + // // The temporary Cord returned by CordFactory has been destroyed! + // } + // } + ChunkRange Chunks() const { return ChunkRange(this); } + + ChunkIterator chunk_begin() const { return ChunkIterator(this, 0); } + + ChunkIterator chunk_end() const { + return ChunkIterator(this, chunks_.size()); + } + + static string TypeName() { return kTypeName; } + + string DebugString() const { + return absl::StrCat(""); + } + + void Encode(VariantTensorData* data) const; + + bool Decode(VariantTensorData data); + + private: + void Cleanup(); + + class CordRep : public core::RefCounted { + public: + CordRep(absl::string_view view, CordRepReleaser releaser, + void* arg = nullptr) + : is_inline_(false), rep_(view, releaser, arg) {} + + // **WARNING** Only use this constructor if + // view.size() < CordRep::kMaxInlineSize. + explicit CordRep(absl::string_view view) : is_inline_(true), rep_(view) {} + + ~CordRep() override; + + absl::string_view view() const { + if (is_inline_) { + return absl::string_view( + rep_.internal.data() + 1, + *reinterpret_cast(rep_.internal.data())); + } else { + return rep_.external.view; + } + } + + private: + friend class TensorCord; + + struct ExternalRep { + absl::string_view view; + CordRepReleaser releaser; + void* arg; + + ExternalRep(absl::string_view view_, CordRepReleaser releaser_, + void* arg_) + : view(view_), releaser(releaser_), arg(arg_) {} + }; + + // We save the size in the first byte, so subtract 1. + static constexpr int kMaxInlineSize = sizeof(ExternalRep) - 1; + static_assert(kMaxInlineSize < 255, + "Cannot store size of InlineRep in a single byte."); + + // The first byte stores the size as a uint8. The rest of the bytes are the + // string itself. + using InlineRep = std::array; + + // Member variables. 
+ const bool is_inline_; + const union _rep_union { + InlineRep internal; + ExternalRep external; + + _rep_union(absl::string_view view, CordRepReleaser releaser, void* arg) + : external(view, releaser, arg) {} + + explicit _rep_union(absl::string_view view) { + DCHECK_LT(view.size(), kMaxInlineSize); + *reinterpret_cast(internal.data()) = view.size(); + std::memcpy(static_cast(internal.data() + 1), view.data(), + view.size()); + } + } rep_; + }; + + static TensorBuffer* TensorBufWithRef(Tensor* tensor); + static void TensorBufReleaser(void* tensor_buffer); + static void StringReleaser(void* str_ptr); + static CordRep* NewCordRepFromTensor(absl::string_view view, Tensor* tensor); + + absl::InlinedVector chunks_; +}; + +inline TensorCord::TensorCord(const TensorCord& other) + : chunks_(other.chunks_) { + for (auto* rep : chunks_) { + rep->Ref(); + } +} + +inline TensorCord::TensorCord(TensorCord&& other) noexcept + : chunks_(std::move(other.chunks_)) { + other.chunks_.clear(); +} + +inline TensorCord& TensorCord::operator=(const TensorCord& other) { + Cleanup(); + chunks_ = other.chunks_; + for (auto* rep : chunks_) { + rep->Ref(); + } + return *this; +} + +inline TensorCord& TensorCord::operator=(TensorCord&& other) noexcept { + Cleanup(); + std::swap(chunks_, other.chunks_); + return *this; +} + +inline void TensorCord::Append(const TensorCord& other) { + for (auto* rep : other.chunks_) { + chunks_.push_back(rep); + rep->Ref(); + } +} + +inline void TensorCord::Append(absl::string_view view, CordRepReleaser releaser, + void* memory) { + chunks_.push_back(new CordRep(view, releaser, memory)); +} + +inline void TensorCord::Append(absl::string_view view, Tensor* tensor) { + chunks_.push_back(NewCordRepFromTensor(view, tensor)); +} + +inline size_t TensorCord::size() const { + return (chunks_.empty()) + ? 0 + : std::accumulate(chunk_begin(), chunk_end(), 0, + [](size_t acc, absl::string_view b) { + return acc + b.size(); + }); +} + +inline TensorCord::ChunkIterator& TensorCord::ChunkIterator::operator++() { + assert(cord_ != nullptr); + assert(chunk_index_ < cord_->chunks_.size()); + chunk_index_ += 1; + if (chunk_index_ != cord_->chunks_.size()) { + view_ = cord_->chunks_[chunk_index_]->view(); + } + return *this; +} + +inline TensorCord::ChunkIterator::ChunkIterator(const TensorCord* cord, + int index) + : cord_(cord), chunk_index_(index) { + if (index < cord_->chunks_.size()) { + view_ = cord_->chunks_[index]->view(); + } +} + +inline TensorCord::CordRep* TensorCord::NewCordRepFromTensor( + absl::string_view view, Tensor* tensor) { + if (view.size() <= TensorCord::CordRep::kMaxInlineSize) { + return new CordRep(view); + } else { + return new CordRep(view, &TensorBufReleaser, TensorBufWithRef(tensor)); + } +} + +inline void TensorCord::Cleanup() { + if (chunks_.empty()) return; + for (auto* rep : chunks_) { + rep->Unref(); + } + chunks_.clear(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_CORD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_flag_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_flag_utils.h new file mode 100644 index 00000000..f20ecad7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_flag_utils.h @@ -0,0 +1,78 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helpers for parsing tensors as runtime flags. +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_ + +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tensor_flag_utils { + +// Converts tensor.vec to an std::vector object, appends +// the value num_nonzero_entries_in_sparse_mat, and returns the result. +template +std::vector ParseRowStartIndices( + const tensorflow::Tensor& tensor, + const Tindices num_nonzero_entries_in_sparse_mat); + +// Returns OkStatus() if and only if config is a float scalar or a matrix with +// dimensions M x 3. If config is a scalar then config must be in the range +// [0, 1.0). If config is a matrix then config must have shape M x 3, all of +// its entries must be positive, and entries in the last column may not +// exceed 1.0. If config is a matrix then it may not be empty. +absl::Status ValidateSparseMatrixShardingConfig(const Tensor& config); + +// Returns OkStatus() if and only if config is a float scalar or a non-empty +// matrix with dimensions M x 2. +absl::Status ValidateScalarQuantityShardingConfig(const Tensor& config); + +// Returns the last entry of the first row in config_mat for which the first +// two entries are no smaller than the respective entries in key. If no such +// row exists then returns the last entry in the last row in config_mat. +// config_mat may not be empty. +template +MatrixType FindConfigValueForKey( + const typename TTypes::ConstMatrix& config_mat, + const std::pair& key); + +// Returns the last entry of the first row in config_mat for which the first +// two entries are no smaller than the respective entries in key. If no such +// row exists then returns the last entry in the last row in config_mat. +// config_mat may not be empty. +template +MatrixType FindConfigValueForKey( + const typename TTypes::ConstMatrix& config_mat, const K key); + +// Returns largest multiple of bucket_size less than value. +// Expects 1 <= bucket_size <= value. +template +Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size); + +// Returns the largest power of bucket_size less than value. +// Expects 1 <= bucket_size <= value. If bucket_size = 1, returns 1. +template +Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size); + +} // namespace tensor_flag_utils +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list.h new file mode 100644 index 00000000..5d3921cf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list.h @@ -0,0 +1,160 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_LIST_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_LIST_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/core/refcount.h" + +namespace tensorflow { + +// Variant compatible type for a list of tensors. This is mutable but instances +// should never be mutated after stored in a variant tensor. +// +// **NOTE**: TensorList stores a refcounted container of tf::Tensor objects, +// which are accessible via TensorList::tensors(). Because it is refcounted, +// straight copies of the form: +// +// TensorList b = a; +// b.tensors().push_back(t); // WARNING: This modifies a.tensors(). +// +// Do not create a true copy of the underlying container - but instead increment +// a reference count. Modifying b.tensors() modifies a.tensors(). In this way, +// TensorList should be considered similar to the tf::Tensor object. +// +// In order to get a copy of the underlying list, use the Copy method: +// +// TensorList b = a.Copy(); +// b.tensors().push_back(t); // This does not modify a.tensors(). +// +// Note that this is not a deep copy: the memory locations of the underlying +// tensors will still point to the same locations of the corresponding tensors +// in the original. To truly perform a deep copy, Device and Type-specific +// code needs to be applied to the underlying tensors as usual. +// +// The most important implication of RefCounted TLs is that OpKernels +// wishing to reuse TensorList inputs as outputs via context->forward_input() +// need to perform an additional check on the refcount of the TensorList, +// to ensure aliasing can be performed safely. 
For example: +// +// bool can_alias = false; +// auto fw = c->forward_input(..., DT_VARIANT, {}, ...); +// if (fw && fw->dtype() == DT_VARIANT && fw->NumElements() == 1) { +// auto* tl = fw->scalar()().get(); +// if (tl && tl->RefCountIsOne()) { +// can_alias = true; +// } +// } +// +class TensorList { + public: + TensorList() : tensors_(new Tensors) {} + ~TensorList(); + + TensorList(const TensorList& other) + : element_shape(other.element_shape), + element_dtype(other.element_dtype), + max_num_elements(other.max_num_elements), + tensors_(other.tensors_) { + tensors_->Ref(); + } + + TensorList(TensorList&& rhs) + : element_shape(std::move(rhs.element_shape)), + element_dtype(rhs.element_dtype), + max_num_elements(rhs.max_num_elements), + tensors_(rhs.tensors_) { + rhs.tensors_ = nullptr; + } + + TensorList& operator=(const TensorList& rhs) { + if (this == &rhs) return *this; + element_shape = rhs.element_shape; + element_dtype = rhs.element_dtype; + max_num_elements = rhs.max_num_elements; + tensors_->Unref(); + tensors_ = rhs.tensors_; + tensors_->Ref(); + return *this; + } + + TensorList& operator=(TensorList&& rhs) { + if (this == &rhs) return *this; + element_shape = rhs.element_shape; + element_dtype = rhs.element_dtype; + max_num_elements = rhs.max_num_elements; + std::swap(tensors_, rhs.tensors_); + return *this; + } + + static const char kTypeName[]; + + string TypeName() const { return kTypeName; } + + void Encode(VariantTensorData* data) const; + + bool Decode(const VariantTensorData& data); + + // TODO(apassos) fill this out + string DebugString() const { return "TensorList"; } + + PartialTensorShape element_shape; + + DataType element_dtype; + + // The maximum allowed size of `tensors`. Defaults to -1 meaning that the size + // of `tensors` is unbounded. + int max_num_elements = -1; + + // Access to the underlying tensor container. + std::vector& tensors() { return tensors_->values_; } + const std::vector& tensors() const { return tensors_->values_; } + + // Get a new TensorList containing a copy of the underlying tensor container. + TensorList Copy() const { + TensorList out; + out.element_shape = element_shape; + out.element_dtype = element_dtype; + out.max_num_elements = max_num_elements; + // This performs a copy of the std::vector. + out.tensors_->values_ = tensors_->values_; + return out; + } + + // Is this TensorList the only one with a reference to the underlying + // container? + bool RefCountIsOne() const { return tensors_->RefCountIsOne(); } + + private: + class Tensors : public core::RefCounted { + public: + std::vector values_; + }; + Tensors* tensors_; +}; + +#if defined(PLATFORM_GOOGLE) +// TODO(ebrevdo): Identify why Variant inline size is smaller on mobile devices. +// For 32-bit devices, it's acceptable not to inline. +static_assert(Variant::CanInlineType() || sizeof(void*) < 8, + "Must be able to inline TensorList into a Variant"); +#endif +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_LIST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list_util.h new file mode 100644 index 00000000..7ffabce8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_list_util.h @@ -0,0 +1,43 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_LIST_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_LIST_UTIL_H_ + +#include + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class OpKernelContext; +class TensorList; +class Tensor; + +absl::Status TensorListBinaryAdd( + OpKernelContext* c, const TensorList& a, const TensorList& b, + TensorList* out, + std::function + binary_add_func); + +absl::Status TensorListZerosLike( + OpKernelContext* c, const TensorList& x, TensorList* y, + std::function + zeros_like_func); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_LIST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_map.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_map.h new file mode 100644 index 00000000..cb4c827c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_map.h @@ -0,0 +1,181 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_MAP_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_MAP_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_key.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/core/refcount.h" + +namespace tensorflow { + +// Variant compatible type for a map of tensors. This is mutable but instances +// should never be mutated after stored in a variant tensor. +// +// **NOTE**: TensorMap stores a refcounted container of tf::Tensor objects, +// which are accessible via TensorMap::tensors(). Because it is refcounted, +// straight copies of the form: +// +// TensorMap b = a; +// b.tensors().insert(k,v); // WARNING: This modifies a.tensors(). +// +// Do not create a true copy of the underlying container - but instead increment +// a reference count. Modifying b.tensors() modifies a.tensors(). In this way, +// TensorMap should be considered similar to the tf::Tensor object. +// +// In order to get a copy of the underlying map, use the Copy method: +// +// TensorMap b = a.Copy(); +// b.tensors().insert(k, v); // This does not modify a.tensors(). +// +// Note that this is not a deep copy: the memory locations of the underlying +// tensors will still point to the same locations of the corresponding tensors +// in the original. 
To truly perform a deep copy, Device and Type-specific +// code needs to be applied to the underlying tensors as usual. +// +// The most important implication of RefCounted TensorMaps is that OpKernels +// wishing to reuse TensorMap inputs as outputs via context->forward_input() +// need to perform an additional check on the refcount of the TensorList, +// to ensure aliasing can be performed safely. For example: +// +// bool can_alias = false; +// auto fw = c->forward_input(..., DT_VARIANT, {}, ...); +// if (fw && fw->dtype() == DT_VARIANT && fw->NumElements() == 1) { +// auto* tl = fw->scalar()().get(); +// if (tl && tl->RefCountIsOne()) { +// can_alias = true; +// } +// } +// +class TensorMap { + public: + TensorMap() : tensors_(new Tensors) {} + ~TensorMap(); + + TensorMap(const TensorMap& other) : tensors_(other.tensors_) { + tensors_->Ref(); + } + + TensorMap(TensorMap&& rhs) : tensors_(rhs.tensors_) { + rhs.tensors_ = nullptr; + } + + TensorMap& operator=(const TensorMap& rhs) { + if (this == &rhs) return *this; + tensors_->Unref(); + tensors_ = rhs.tensors_; + tensors_->Ref(); + return *this; + } + + TensorMap& operator=(TensorMap&& rhs) { + if (this == &rhs) return *this; + std::swap(tensors_, rhs.tensors_); + return *this; + } + + static const char kTypeName[]; + + string TypeName() const { return kTypeName; } + + void Encode(VariantTensorData* data) const; + + bool Decode(const VariantTensorData& data); + + // TODO(apassos) fill this out + string DebugString() const { return "TensorMap"; } + + // Access to the underlying tensor container. + absl::flat_hash_map& tensors() { + return tensors_->values_; + } + + const absl::flat_hash_map& tensors() const { + return tensors_->values_; + } + + // Get a new TensorMap containing a copy of the underlying tensor container. + TensorMap Copy() const { + TensorMap out; + // This performs a copy of the absl::hashmap. + out.tensors_->values_ = tensors_->values_; + return out; + } + + // Insert key and value if the key does not already exist. + // Returns true if the insertion happens. + bool insert(const TensorKey& key, const Tensor& value) { + auto r = tensors_->values_.try_emplace(key, value); + return r.second; + } + + // Lookup given key. Returns iterator to found key or end. + absl::flat_hash_map::iterator find(TensorKey key) { + return tensors_->values_.find(key); + } + + Tensor& lookup(TensorKey key) { return tensors_->values_.find(key)->second; } + + Tensor& operator[](TensorKey& k) { return tensors_->values_[k]; } + + bool replace(const TensorKey& k, const Tensor& v) { + tensors_->values_[k] = v; + return true; + } + + // Removes element with given key. Return size of removed element. + size_t erase(TensorKey key) { return tensors_->values_.erase(key); } + + // Size returns the number of elements in the map + size_t size() const { return tensors_->values_.size(); } + + std::vector keys() const { + std::vector keys; + keys.reserve(tensors_->values_.size()); + absl::flat_hash_map::iterator it = + tensors_->values_.begin(); + while (it != tensors_->values_.end()) { + keys.push_back(it->first); + it++; + } + return keys; + } + + // Is this TensorMap the only one with a reference to the underlying + // container? + bool RefCountIsOne() const { return tensors_->RefCountIsOne(); } + + private: + class Tensors : public core::RefCounted { + public: + absl::flat_hash_map values_; + }; + Tensors* tensors_; +}; + +#if defined(PLATFORM_GOOGLE) +// TODO(ebrevdo): Identify why Variant inline size is smaller on mobile devices. 
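The copy-versus-alias rule spelled out in the class comment reduces to a refcounted container: plain assignment shares it, Copy() duplicates it, and RefCountIsOne() tells a kernel whether in-place reuse is safe. A standalone sketch with std::shared_ptr standing in for the core::RefCounted Tensors holder (MiniMap is illustrative, not the real TensorMap):

    // Sketch only: assignment aliases the container, Copy() does not.
    #include <cassert>
    #include <map>
    #include <memory>
    #include <string>

    class MiniMap {
     public:
      MiniMap() : values_(std::make_shared<std::map<std::string, int>>()) {}

      std::map<std::string, int>& values() { return *values_; }

      // Like TensorMap::Copy(): a new container holding the same entries.
      MiniMap Copy() const {
        MiniMap out;
        *out.values_ = *values_;
        return out;
      }

      bool RefCountIsOne() const { return values_.use_count() == 1; }

     private:
      std::shared_ptr<std::map<std::string, int>> values_;
    };

    int main() {
      MiniMap a;
      a.values()["k"] = 1;

      MiniMap aliased = a;           // shares the container
      aliased.values()["k"] = 2;
      assert(a.values()["k"] == 2);  // a sees the change
      assert(!a.RefCountIsOne());    // two owners: in-place reuse is unsafe

      MiniMap copied = a.Copy();     // independent container
      copied.values()["k"] = 3;
      assert(a.values()["k"] == 2);  // a is unaffected
    }
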
+// For 32-bit devices, it's acceptable not to inline. +static_assert(Variant::CanInlineType() || sizeof(void*) < 8, + "Must be able to inline TensorMap into a Variant"); +#endif +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_MAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_to_hash_bucket_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_to_hash_bucket_op.h new file mode 100644 index 00000000..cdf7dab2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tensor_to_hash_bucket_op.h @@ -0,0 +1,80 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_TO_HASH_BUCKET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_TENSOR_TO_HASH_BUCKET_OP_H_ + +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct LaunchTensorToHashBucket { + void operator()(OpKernelContext* c, const int64_t num_buckets, const T* input, + const int num_elems, int64_t* output) { + string format = "%"; + switch (DataTypeToEnum::value) { + case DT_INT8: + case DT_INT16: + case DT_INT32: + strings::Appendf(&format, "d"); + break; + case DT_INT64: + strings::Appendf(&format, "lld"); + break; + default: + bool type_not_supported = true; + OP_REQUIRES( + c, !type_not_supported, + errors::InvalidArgument("Type not supported: ", + DataTypeString(DataTypeToEnum::value))); + } + + for (int i = 0; i < num_elems; ++i) { + string input_str = strings::Printf(format.c_str(), input[i]); + const uint64 input_hash = Fingerprint64(input_str); + const uint64 bucket_id = input_hash % num_buckets; + // The number of buckets is always in the positive range of int64 so is + // the resulting bucket_id. Casting the bucket_id from uint64 to int64 is + // safe. 
+ output[i] = static_cast(bucket_id); + } + } +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +struct LaunchTensorToHashBucket { + void operator()(OpKernelContext* c, const int64_t num_buckets, const T* input, + const int num_elems, int64_t* output); +}; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TENSOR_TO_HASH_BUCKET_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor.h new file mode 100644 index 00000000..d5f27eca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor.h @@ -0,0 +1,110 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace internal { + +// Device-specific naive implementation for Tile. + +template +void TileSimple(const Eigen::ThreadPoolDevice& d, Tensor* out, + const Tensor& in); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +template +void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in, + const gtl::ArraySlice broadcast_array) { + Eigen::array b; + for (int i = 0; i < NDIM; ++i) b[i] = broadcast_array[i]; + MaybeWith32BitIndexing( + [&](auto out32, auto in32) { out32.device(d) = in32.broadcast(b); }, + out->tensor(), in.tensor()); +} + +template +void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in, + const gtl::ArraySlice) { + auto x = in.tensor(); + auto y = out->tensor(); + // In the scalar case we simply copy the input. 
+ y.device(d) = x; +} + +} // end namespace internal + +namespace functor { + +template +struct Tile { + void operator()(const Device& d, Tensor* out, const Tensor& in, + const gtl::ArraySlice broadcast_array) const { + switch (in.dims()) { + case 0: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 1: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 2: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 3: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 4: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 5: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 6: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + case 7: + internal::TileUsingEigen(d, out, in, + broadcast_array); + break; + default: + internal::TileSimple(d, out, in); + break; + } + } +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_cpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_cpu.h new file mode 100644 index 00000000..dee100e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_cpu.h @@ -0,0 +1,57 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_CPU_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_CPU_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/tile_functor.h" + +namespace tensorflow { +namespace internal { + +template +void TileSimpleImpl(const Device& d, Tensor* out, const Tensor& in) { + const int ndims = in.dims(); + const int64_t nelem = out->NumElements(); + absl::InlinedVector in_strides = + ComputeStride(in.shape()); + absl::InlinedVector out_strides = + ComputeStride(out->shape()); + const T* p = in.flat().data(); + T* q = out->flat().data(); + + for (int64_t o_idx = 0; o_idx < nelem; ++o_idx) { + int64_t i_idx = 0; + int64_t t = o_idx; + for (int i = 0; i < ndims; ++i) { + i_idx += t / out_strides[i] % in.dim_size(i) * in_strides[i]; + t %= out_strides[i]; + } + q[o_idx] = p[i_idx]; + } +} + +template +void TileSimple(const Eigen::ThreadPoolDevice& d, Tensor* out, + const Tensor& in) { + return TileSimpleImpl(d, out, in); +} + +} // namespace internal +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_CPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_gpu.h new file mode 100644 index 00000000..8d825a68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_functor_gpu.h @@ -0,0 +1,91 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/tile_functor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +namespace internal { + +template +__global__ void TileKernel(int nthreads, const T* __restrict__ src, + const int32* __restrict__ buf, const int32 ndims, + T* __restrict__ dst) { + const int32* in_strides = buf; + const int32* out_strides = buf + ndims; + const int32* in_dim_sizes = buf + ndims * 2; + GPU_1D_KERNEL_LOOP(o_idx, nthreads) { + int32 i_idx = 0; + int32 t = o_idx; + for (int i = 0; i < ndims; ++i) { + i_idx += t / out_strides[i] % in_dim_sizes[i] * in_strides[i]; + t %= out_strides[i]; + } + dst[o_idx] = ldg(src + i_idx); + } +} + +template +void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) { + // Ensures we can use 32-bit index. + const int64 in_nelem = in.NumElements(); + CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU"; + const int64 out_nelem = out->NumElements(); + CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU"; + // Pack strides and input dimension sizes into one buffer. + const int32 ndims = in.dims(); + gtl::InlinedVector host_buf(ndims * 3); + gtl::InlinedVector in_strides = ComputeStride(in.shape()); + gtl::InlinedVector out_strides = ComputeStride(out->shape()); + for (int i = 0; i < ndims; ++i) { + host_buf[i] = in_strides[i]; + host_buf[ndims + i] = out_strides[i]; + host_buf[ndims * 2 + i] = in.dim_size(i); + } + // Copies the input strides, output strides and input dimension sizes to the + // device. + auto num_bytes = sizeof(int32) * host_buf.size(); + auto dev_buf = d.allocate(num_bytes); + // NOTE: host_buf is not allocated by GpuHostAllocator, and + // therefore we are doing a sync copy effectively. + d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes); + // Launch kernel to q[...] = p[...]. + const T* p = in.flat().data(); + T* q = out->flat().data(); + GpuLaunchConfig cfg = GetGpuLaunchConfig(out_nelem, d); + TF_CHECK_OK( + GpuLaunchKernel(TileKernel, cfg.block_count, cfg.thread_per_block, 0, + d.stream(), cfg.virtual_thread_count, p, + reinterpret_cast(dev_buf), ndims, q)); + // Safe to deallocate immediately after the kernel launch. 
+ d.deallocate(dev_buf); +} + +} // end namespace internal +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_cpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_cpu_impl.h new file mode 100644 index 00000000..066954a1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/tile_ops_impl.h" + +namespace tensorflow { + +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +// Register functors used for TileGradientOp. +#define DEFINE_DIM(T, NDIM) \ + template struct TileGrad; \ + template struct ReduceAndReshape; +#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) + +TF_CALL_float(DEFINE_TYPE); +TF_CALL_bfloat16(DEFINE_TYPE); +TF_CALL_double(DEFINE_TYPE); +TF_CALL_int16(DEFINE_TYPE); +TF_CALL_int32(DEFINE_TYPE); +TF_CALL_int64(DEFINE_TYPE); +TF_CALL_half(DEFINE_TYPE); +TF_CALL_complex64(DEFINE_TYPE); +TF_CALL_complex128(DEFINE_TYPE); + +#undef DEFINE_DIM +#undef DEFINE_TYPE + + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_gpu_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_gpu_impl.h new file mode 100644 index 00000000..f1bbbf1e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_gpu_impl.h @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_ + +// Header used to split up compilation of GPU tile ops. 
For each type you want +// to have tile ops, create a .cu.cc file containing +// +// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// #include "tensorflow/core/kernels/tile_ops_gpu_impl.h" +// DEFINE_TILE_OPS(NDIM) +// #endif // GOOGLE_CUDA +// +// where NDIM is an integer. +// +// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc. + +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error "This header must be included inside with CUDA or ROCm defined" +#endif + +#define EIGEN_USE_GPU + +#include +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/kernels/tile_ops_impl.h" + +#define DEFINE_DIM(T, NDIM) \ + template struct TileGrad; \ + template struct ReduceAndReshape; + +#define DEFINE_TILE_OPS(NDIM) \ + namespace tensorflow { \ + namespace functor { \ + DEFINE_DIM(int16, NDIM) \ + DEFINE_DIM(int32, NDIM) \ + DEFINE_DIM(int64, NDIM) \ + DEFINE_DIM(Eigen::half, NDIM) \ + DEFINE_DIM(Eigen::bfloat16, NDIM) \ + DEFINE_DIM(float, NDIM) \ + DEFINE_DIM(double, NDIM) \ + DEFINE_DIM(complex64, NDIM) \ + DEFINE_DIM(complex128, NDIM) \ + } \ + } + +#endif // TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_impl.h b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_impl.h new file mode 100644 index 00000000..9f9a11b4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/tile_ops_impl.h @@ -0,0 +1,71 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_ +#define TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct TileGrad { + void operator()(const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in, + const Eigen::DSizes& indices, + const Eigen::DSizes& sizes, + bool first) const { + if (first) { + out.device(d) = in.slice(indices, sizes); + } else { + out.device(d) += in.slice(indices, sizes); + } + } +}; + +template +struct TileGrad { + void operator()(const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in, + const Eigen::DSizes&, + const Eigen::DSizes&, + bool first) const { + if (first) { + out.device(d) = in; + } else { + out.device(d) += in; + } + } +}; + +template +struct ReduceAndReshape { + void operator()( + const Device& d, typename TTypes::Tensor out, + typename TTypes::ConstTensor in, + const Eigen::DSizes& reduce_dim, + const Eigen::DSizes& reshape_dim) const { + out.device(d) = in.sum(reduce_dim).reshape(reshape_dim); + } +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op.h new file mode 100644 index 00000000..cdebb07f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op.h @@ -0,0 +1,42 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_H_ +#define TENSORFLOW_CORE_KERNELS_TOPK_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +namespace functor { + +template +struct TopKFunctor { + static absl::Status Compute(OpKernelContext* context, bool sorted, int k, + const typename TTypes::ConstTensor& input, + const int64_t num_rows, const int64_t num_cols, + typename TTypes::Tensor values, + typename TTypes::Tensor indices); +}; + +} // end namespace functor + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TOPK_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op_gpu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op_gpu.h new file mode 100644 index 00000000..26162abc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/topk_op_gpu.h @@ -0,0 +1,597 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/gpu_prim_helpers.h" +#include "tensorflow/core/kernels/topk_op.h" +#include "tensorflow/core/lib/gtl/top_n.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace impl { + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; + + // Test-only. + static bool greater(const Entry& a, const Entry& b) { + if (a.value == b.value) { + return a.index < b.index; + } + return a.value > b.value; + } +}; + +template +struct LinearData { + typedef impl::Entry Entry; + + __device__ Entry& operator[](std::size_t index) const { return data[index]; } + + __device__ int get_index(int i) const { return data[i].index; } + __device__ T get_value(int i) const { return data[i].value; } + + Entry* const data; +}; + +template +struct IndirectLinearData { + typedef impl::Entry Entry; + + __device__ Entry& operator[](std::size_t index) const { return data[index]; } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { return data[i].value; } + + Entry* const data; + Entry* const backing_data; +}; + +template +struct StridedData { + typedef impl::Entry Entry; + + __device__ Entry& operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { return (*this)[i].index; } + __device__ T get_value(int i) const { return (*this)[i].value; } + + Entry* const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. 
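Before the templated IndexedHeap that follows, the ordering it encodes can be shown with an ordinary comparator: entries are compared by value, and ties are broken by preferring the lower index (the PreferIndices::kLower case). A small standalone illustration (SimpleEntry and MaxHeapAbove are illustrative names):

    // Sketch only: the value-then-lower-index ordering used for a max-heap.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct SimpleEntry {
      int index;
      float value;
    };

    // True if `a` should sit above `b` in a max-heap that prefers lower
    // indices on ties.
    inline bool MaxHeapAbove(const SimpleEntry& a, const SimpleEntry& b) {
      if (a.value == b.value) return a.index < b.index;
      return a.value > b.value;
    }

    int main() {
      std::vector<SimpleEntry> entries = {
          {0, 1.f}, {1, 3.f}, {2, 3.f}, {3, 0.5f}};
      // std::make_heap wants a "less-than" comparison, i.e. cmp(a, b) is true
      // when a belongs *below* b, so the arguments are swapped.
      auto below = [](const SimpleEntry& a, const SimpleEntry& b) {
        return MaxHeapAbove(b, a);
      };
      std::make_heap(entries.begin(), entries.end(), below);
      // Indices 1 and 2 both hold the maximum 3.f; the lower index wins.
      assert(entries.front().index == 1 && entries.front().value == 3.f);
    }

heapTopK further down uses the same Entry type with the min-heap variant of this comparison to keep the current top-k candidates while scanning the input.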
+template class Data, typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + const Data data; + __device__ IndexedHeap(const Data& d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, const Entry& entry) { data[i] = entry; } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. + break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { push_down(0, k); } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + const int left = 2 * node + 1; + const int right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(const Entry& entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ const Entry& root() { return data[0]; } +}; + +template class Data, typename T> +__device__ IndexedHeap make_indexed_heap( + typename Data::Entry* data) { + return IndexedHeap{Data{data}}; +} + +// heapTopK walks over [input, input+length) with `step_size` stride starting at +// `start_index`. +// It builds a top-`k` heap that is stored in `heap_entries` using `Accessor` to +// access elements in `heap_entries`. If sorted=true, the elements will be +// sorted at the end. +template class Data = LinearData> +__device__ void heapTopK(const T* __restrict__ input, int length, int k, + Entry* __restrict__ heap_entries, + bool sorted = false, int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. 
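The loop that follows is the heart of the selection: any candidate smaller than the heap's root can never enter the top k. The same idea as a hedged CPU analogue of heapTopK (TopK below is an illustrative stand-in built on std::priority_queue, not the kernel's code path):

```cpp
#include <algorithm>
#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

// Host-side analogue of heapTopK: keep the k largest values in a min-heap;
// anything smaller than the heap's root can never be in the top k.
std::vector<std::pair<float, int>> TopK(const std::vector<float>& input, int k) {
  using Entry = std::pair<float, int>;  // (value, index)
  auto cmp = [](const Entry& a, const Entry& b) { return a > b; };  // min-heap
  std::priority_queue<Entry, std::vector<Entry>, decltype(cmp)> heap(cmp);
  for (int i = 0; i < static_cast<int>(input.size()); ++i) {
    if (static_cast<int>(heap.size()) < k) {
      heap.push({input[i], i});
    } else if (input[i] > heap.top().first) {
      heap.pop();
      heap.push({input[i], i});
    }
  }
  std::vector<Entry> result;
  while (!heap.empty()) { result.push_back(heap.top()); heap.pop(); }
  std::reverse(result.begin(), result.end());  // descending by value
  return result;
}

int main() {
  for (const auto& [v, i] : TopK({3.f, 9.f, 1.f, 7.f, 5.f}, 3))
    std::printf("value=%g index=%d\n", v, i);
}
```

The GPU version distributes exactly this loop across threads, each owning one interleaved heap in shared memory, and then merges the per-thread results in mergeShards.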
+ for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and their indices +// to top_k_indices. +// `top_k_heap` is used as temporary storage for the merge heap. +template +__device__ void mergeShards(int num_shards, int k, + Entry* __restrict__ entries, + Entry* __restrict__ top_k_heap, T* top_k_values, + int* top_k_indices) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + const int heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{ + IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign(slot, {slot, entries[slot].value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + const auto entry = entries[shard]; + const auto root = min_heap.root(); + if (entry.value < root.value) { + continue; + } + if (entry.value == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{ + IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + const int last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + const Entry& max_element = max_heap.root(); + top_k_values[rank] = max_element.value; + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. + max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, + heap_size); + } + + // rank == last_k. 
+ const Entry& max_element = max_heap.root(); + top_k_values[last_k] = max_element.value; + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + } +} + +#if GOOGLE_CUDA +extern __shared__ char shared_memory[]; +#endif // GOOGLE_CUDA + +template +#if TENSORFLOW_USE_ROCM +__attribute__((amdgpu_flat_work_group_size(1, 256))) +#endif // TENSORFLOW_USE_ROCM +__global__ void +TopKKernel(const T* __restrict__ input, int length, int k, bool sorted, + T* __restrict__ output, int* __restrict__ indices) { +#if TENSORFLOW_USE_ROCM + HIP_DYNAMIC_SHARED(char, shared_memory); +#endif // TENSORFLOW_USE_ROCM + + const int batch_index = blockIdx.x; + const T* batch_input = input + batch_index * length; + + const int thread_index = threadIdx.x; + const int thread_count = blockDim.x; + + Entry* shared_entries = (Entry*)shared_memory; + + heapTopK(batch_input, length, k, shared_entries, true, + thread_index, thread_count); + + __syncthreads(); + if (thread_index == 0) { + const int offset = batch_index * k; + auto batch_output = output + offset; + auto batch_indices = indices + offset; + Entry* top_k_heap = shared_entries + thread_count * k; + + // TODO(blackhc): Erich says: Performance can likely be improved + // significantly by having the merge be done by multiple threads rather than + // just one. ModernGPU has some nice primitives that could help with this. + mergeShards(thread_count, k, shared_entries, top_k_heap, batch_output, + batch_indices); + } +} + +template +cudaError LaunchTopKKernel(const gpuStream_t& stream, int num_shards, + const T* input, int batch_size, int length, int k, + bool sorted, T* output, int* indices) { + // This code assumes that k is small enough that the computation + // fits inside shared memory (hard coded to 48KB). In practice this + // means k <= 3072 for T=float/int32 and k <= 2048 for T=double/int64. + // The calculation is: + // shared_memory_size / (2 * (sizeof(int) + sizeof(T))) < k. + + // Use as many shards as possible. + if (num_shards <= 0) { + constexpr auto shared_memory_size = 48 << 10; // 48 KB + const auto heap_size = k * sizeof(Entry); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + if (num_shards <= 0) { + num_shards = 1; + } + auto shard_size = length / num_shards; + auto min_shard_size = 2 * k; + if (shard_size < min_shard_size) { + num_shards = length / min_shard_size; + } + if (num_shards <= 0) { + num_shards = 1; +#if GOOGLE_CUDA + } else if (num_shards > 1024) { + num_shards = 1024; + } +#elif TENSORFLOW_USE_ROCM + // ROCm can't execute with 1024 and requires an explicit + // amdgpu_flat_work_group_size attribute with >256 + } else if (num_shards > 256) { + num_shards = 256; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + } + // We are limited by the amount of shared memory we have per block. 
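The shard count chosen above is driven entirely by the 48 KB shared-memory budget mentioned in the comment: each shard needs a k-entry heap, plus one extra heap for the merge. A small, purely illustrative check of that bound (the 8-byte Entry assumes T = float):

```cpp
#include <cstdint>
#include <cstdio>

// Back-of-the-envelope check of the shared-memory budget described above,
// assuming a 48 KB limit and an 8-byte Entry (int index + float value).
int main() {
  const int64_t shared_memory_size = 48 << 10;
  const int64_t entry_size = sizeof(int) + sizeof(float);  // 8 bytes
  for (int k : {128, 1024, 3072}) {
    const int64_t heap_size = k * entry_size;
    const int64_t num_shards = shared_memory_size / heap_size - 1;
    std::printf("k=%d -> heap=%lld bytes, num_shards<=%lld\n", k,
                static_cast<long long>(heap_size),
                static_cast<long long>(num_shards));
  }
}
```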
+ auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry); + + TF_CHECK_OK(GpuLaunchKernel(TopKKernel, batch_size, num_shards, + shared_memory_size, stream, input, length, k, + sorted, output, indices)); + return cudaGetLastError(); +} + +struct SegmentOffsetCreator { + EIGEN_DEVICE_FUNC + SegmentOffsetCreator(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { + return idx * num_cols_; + } + + int num_cols_; +}; + +struct ColumnIndexCreator { + ColumnIndexCreator(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( + const Eigen::array& ix) const { + return ix[0] % num_cols_; + } + + int num_cols_; +}; + +template +Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, + int num_cols, int k, + typename TTypes::Tensor values, + TTypes::Tensor indices) { + const GPUDevice& d = ctx->eigen_device(); + const auto& cu_stream = GetGpuStream(ctx); + size_t temp_storage_bytes = -1; + + // TODO(ebrevdo): Once gpuprim supports iterators for ValueT replace that + // tensor with an iterator that directly returns the correct value. + Tensor input_indices; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT32, TensorShape({num_rows, num_cols}), &input_indices)); + auto input_indices_t = To32Bit(input_indices.flat()); + input_indices_t.device(d) = + input_indices_t.generate(ColumnIndexCreator(num_cols)); + + gpuprim::CountingInputIterator counting_iter(0); + gpuprim::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetCreator(num_cols)); + + Tensor temp_values; + Tensor temp_indices; + T* sorted_values_ptr; + int* sorted_indices_ptr; + if (k == num_cols) { + // Doing a full sort, no intermediate values needed. + sorted_values_ptr = values.data(); + sorted_indices_ptr = indices.data(); + } else { + // Need to create intermediate values for sorting. + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT32, TensorShape({num_rows, num_cols}), &temp_indices)); + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({num_rows, num_cols}), + &temp_values)); + sorted_indices_ptr = temp_indices.flat().data(); + sorted_values_ptr = temp_values.flat().data(); + } + + bool ran_nonsegmented_version = false; + if (num_rows == 1) { + // Note: DeviceSegmentedRadixSort is very slow when num_segments=1 because + // it only uses 1 SM per segment. Calling the un-segmented version is much + // faster in this case. 
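For reference, the begin/end offsets produced lazily by the counting and transform iterators above are simply multiples of num_cols, one per row of the [num_rows, num_cols] batch. A host-side picture of the same values (SegmentOffsets is an illustrative helper, not part of this header):

```cpp
#include <cstdio>
#include <vector>

// Segment i of a row-major [num_rows, num_cols] batch begins at element
// i * num_cols, and its end offset is the next segment's begin offset.
std::vector<int> SegmentOffsets(int num_rows, int num_cols) {
  std::vector<int> offsets(num_rows + 1);
  for (int i = 0; i <= num_rows; ++i) offsets[i] = i * num_cols;
  return offsets;
}

int main() {
  for (int off : SegmentOffsets(/*num_rows=*/3, /*num_cols=*/5))
    std::printf("%d ", off);  // 0 5 10 15
  std::printf("\n");
}
```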
+ TF_RETURN_IF_ERROR( + GpuRadixSortDescending(ctx, num_cols, /*keys_in=*/input, + /*keys_out=*/sorted_values_ptr, + /*indices_in=*/input_indices_t.data(), + /*indices_out=*/sorted_indices_ptr, + /*num_bits=*/sizeof(T) * 8)); + ran_nonsegmented_version = true; + } + if (!ran_nonsegmented_version) { + auto err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ input, + /* d_keys_out */ sorted_values_ptr, + /* d_values_in */ input_indices_t.data(), + /* d_values_out */ sorted_indices_ptr, + /* num_items */ num_cols * num_rows, + /* num_segments */ num_rows, + /* d_begin_offsets */ segment_offsets_t, + /* d_end_offsets */ segment_offsets_t + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(T) * 8, + /* stream */ cu_stream); + if (err != cudaSuccess) { + return errors::Internal( + "TopKOp: Could not launch " + "gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "temp_storage_bytes, status: ", + cudaGetErrorString(err)); + } + Tensor temp_storage; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ temp_storage.flat().data(), + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ input, + /* d_keys_out */ sorted_values_ptr, + /* d_values_in */ input_indices_t.data(), + /* d_values_out */ sorted_indices_ptr, + /* num_items */ num_cols * num_rows, + /* num_segments */ num_rows, + /* d_begin_offsets */ segment_offsets_t, + /* d_end_offsets */ segment_offsets_t + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(T) * 8, + /* stream */ cu_stream); + if (err != cudaSuccess) { + return errors::Internal( + "TopKOp: Could not launch " + "gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to sort " + "input, " + "temp_storage_bytes: ", + temp_storage_bytes, ", status: ", cudaGetErrorString(err)); + } + } + if (k < num_cols) { + // Need to copy subsets of sorted_indices and sorted_outputs to + // indices and outputs. + const Eigen::DSizes slice_indices{0, 0}; + const Eigen::DSizes slice_sizes{num_rows, k}; + To32Bit(indices).device(d) = + To32Bit(temp_indices.matrix()).slice(slice_indices, slice_sizes); + To32Bit(values).device(d) = + To32Bit(temp_values.matrix()).slice(slice_indices, slice_sizes); + } + return OkStatus(); +} + +} // namespace impl + +namespace functor { + +template +struct TopKFunctor { + static EIGEN_ALWAYS_INLINE Status + Compute(OpKernelContext* context, bool sorted, int k, + const typename TTypes::ConstTensor& input, const int64 num_rows, + const int64 num_cols, typename TTypes::Tensor values, + typename TTypes::Tensor indices) { + // For small k, use the heap implementation. For larger k, use + // the in-place gpuprim sort. For k == num_cols, always use the + // in-place gpuprim sort. The thresholds for n and k were determined + // empirically. 
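The condition in the dispatch below reads most easily as a small predicate; the thresholds are the empirically chosen ones from the code, reproduced here purely for illustration:

```cpp
#include <cstdio>

// Mirrors the dispatch condition used below: take the sort-based path for
// narrow rows, full sorts, or large k; otherwise use the heap kernel.
bool UseSortBasedTopK(int num_cols, int k) {
  return num_cols <= 1000 || k == num_cols || k >= 100;
}

int main() {
  std::printf("%d %d %d\n",
              UseSortBasedTopK(10000, 8),    // heap kernel (0)
              UseSortBasedTopK(512, 8),      // sort path (1)
              UseSortBasedTopK(10000, 256)); // sort path (1)
}
```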
+ if (num_cols <= 1000 || k == num_cols || k >= 100) { + return impl::LaunchSortKernel(context, input.data(), num_rows, num_cols, + k, values, indices); + } else { + const auto& cu_stream = GetGpuStream(context); + auto err = impl::LaunchTopKKernel(cu_stream, /* num_shards */ 0, + input.data(), num_rows, num_cols, k, + sorted, values.data(), indices.data()); + if (err != cudaSuccess) { + return errors::Internal( + "Could not launch TopKKernel: ", cudaGetErrorString(err), "."); + } else { + return OkStatus(); + } + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/training_op_helpers.h b/third_party/tflite-hdrs/tensorflow/core/kernels/training_op_helpers.h new file mode 100644 index 00000000..83ee04fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/training_op_helpers.h @@ -0,0 +1,301 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "xla/tsl/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tsl/platform/mutex.h" + +namespace tensorflow { + +// Must be called before performing a sparse operation on a variable. Ensures +// that no concurrent dense operations can happen while holding the variable's +// lock. +// @param ctx OpKernelContext for variable tensor cloning +// @param var Variable to be shared +// @param lock_held Whether the variable mutex was already held or not +// NOTE: This function uses variable's `copy_on_read_mode` flag to decide if +// it should immediately return or continue to lock the variable mutex for more +// processing, and always sets the `copy_on_read_mode` flag to true when this +// function returns. However, there is no guarantee that another op won't set +// the `copy_on_read_mode` flag back to false after this function. +// Therefore, for the operation that requires `copy_on_read` to stay true during +// its execution, the caller needs to lock the variable mutex outside and call +// this function with `lock_held = true` to avoid double locking. 
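The definition that follows is essentially a double-checked test of the atomic copy_on_read_mode flag under the variable's mutex. A generic, self-contained sketch of that pattern (CopyOnReadState and EnsureCopyOnRead are illustrative names, not the TensorFlow types):

```cpp
#include <atomic>
#include <cstdio>
#include <mutex>

// Generic double-checked pattern analogous to EnsureSparseVariableAccess:
// check the atomic flag, take the lock, re-check, then do the one-time work.
struct CopyOnReadState {
  std::atomic<bool> copy_on_read_mode{false};
  std::mutex mu;
};

void EnsureCopyOnRead(CopyOnReadState* s) {
  if (s->copy_on_read_mode.load()) return;   // fast path, no lock
  std::lock_guard<std::mutex> lock(s->mu);
  if (s->copy_on_read_mode.load()) return;   // another thread won the race
  // ... one-time work (e.g. copying the backing tensor) would go here ...
  s->copy_on_read_mode.store(true);
}

int main() {
  CopyOnReadState state;
  EnsureCopyOnRead(&state);
  std::printf("copy_on_read_mode=%d\n", static_cast<int>(state.copy_on_read_mode.load()));
}
```

The only differences in the header above are that the one-time work is the tensor copy, and that callers who need the flag to stay set must hold the mutex themselves, as the comment explains.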
+template +absl::Status EnsureSparseVariableAccess(OpKernelContext* ctx, Var* var) { + if (var->copy_on_read_mode.load()) { + return absl::OkStatus(); + } + + tsl::mutex_lock ml(*var->mu()); + + // It may be possible that there are multiple threads that invoke + // `EnsureSparseVariableAccess` at the same time. If so, the first thread that + // enters this critical section will set the `copy_on_read_mode` flag to true. + // All other threads can then exit this critical section immediately. + if (var->copy_on_read_mode.load()) { + return absl::OkStatus(); + } + + // Once copy-on-read mode is True the refcount is guaranteed to be 1. This can + // also happen if there are no concurrent reads of the variable and + // copy-on-read mode is false. + if (var->tensor()->RefCountIsOne()) { + var->copy_on_read_mode.store(true); + return absl::OkStatus(); + } + Tensor tmp; + if (std::is_same::value) { + tsl::AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR(ctx->allocate_temp(var->tensor()->dtype(), + var->tensor()->shape(), &tmp, attr)); + + const auto elements_in = var->tensor()->flat(); + auto elements_out = tmp.flat(); + for (int64_t i = 0; i < elements_in.size(); ++i) { + elements_out(i) = elements_in(i); + } + } else { + tsl::AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + TF_RETURN_IF_ERROR(ctx->allocate_temp(var->tensor()->dtype(), + var->tensor()->shape(), &tmp, attr)); + functor::DenseUpdate copy_functor; + copy_functor(ctx->eigen_device(), tmp.flat(), + const_cast(var->tensor())->flat()); + } + *var->tensor() = tmp; + var->copy_on_read_mode.store(true); + return absl::OkStatus(); +} + +// Utility structure that releases a sequence of borrowed mutexes when it is +// deleted. +class VariableInputLockHolder { + public: + VariableInputLockHolder( + std::vector vars, + std::unique_ptr> locks, + std::unique_ptr> shared_locks) + : vars_(std::move(vars)), + locks_(std::move(locks)), + shared_locks_(std::move(shared_locks)) {} + + VariableInputLockHolder(VariableInputLockHolder&& other) + : vars_(std::move(other.vars_)), + locks_(std::move(other.locks_)), + shared_locks_(std::move(other.shared_locks_)) {} + + ~VariableInputLockHolder() { + // Release the locks before unrefing the Vars, because each lock + // is potentially borrowed from a Var in vars_. + locks_.reset(); + for (Var* var : vars_) { + var->Unref(); + } + } + + private: + std::vector vars_; + // NOTE: Use a `std::unique_ptr` instead of moving in a vector directly, + // because a `std::vector` is not movable on all platforms. + std::unique_ptr> locks_; + std::unique_ptr> shared_locks_; +}; + +// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`. +// +// If `input` corresponds to a `DT_RESOURCE`-type variable input, +// `*maybe_resource` will be updated to contain the underlying resource, and the +// caller will be responsible for calling `Unref()` on that resource. +template +tsl::mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input, + Var** maybe_resource) { + *maybe_resource = nullptr; + if (ctx->input_dtype(input) == DT_RESOURCE) { + if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) { + return (*maybe_resource)->mu(); + } else { + ctx->CtxFailureWithWarning( + absl::InternalError("Invalid variable reference.")); + return nullptr; + } + } + return ctx->input_ref_mutex(input); +} + +// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes +// in address order to mitigate deadlock. 
Returns a structure that, when +// deleted, will release the acquired mutexes. Safe to pass duplicates - will +// only lock each distinct mutex once. If sparse is true, will ensure the +// variable gets switched to copy-on-read mode before trying to acquire the +// locks. If do_lock is false, returns immediately for reference variables. For +// resource variables in copy-on-read-mode, it will grab a shared lock if +// do_lock is false, exclusive lock otherwise. Note that this silently doesn't +// lock mutexes for invalid variable references; in all usages this is followed +// by GetInputTensor which will signal a failure. +template +VariableInputLockHolder MaybeLockVariableInputMutexesInOrder( + OpKernelContext* ctx, bool do_lock, bool sparse, + const std::vector& input_ids) { + bool any_resource = false; + for (auto i : input_ids) { + if (ctx->input_dtype(i) == DT_RESOURCE) { + any_resource = true; + break; + } + } + if (!do_lock && !any_resource) { + return VariableInputLockHolder({}, {}, {}); + } + std::vector vars; + std::vector mutexes; + std::vector acquire_order; + for (auto input : input_ids) { + Var* var; + tsl::mutex* mutex = GetTrainingVariableMutex(ctx, input, &var); + if (var) vars.push_back(var); + // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3). + if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) { + acquire_order.push_back(mutexes.size()); + mutexes.push_back(mutex); + } + } + + if (sparse) { + for (Var* var : vars) { + EnsureSparseVariableAccess(ctx, var).IgnoreError(); + } + } + + std::sort(acquire_order.begin(), acquire_order.end(), + [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); + + auto locks = std::make_unique>(); + auto shared_locks = std::make_unique>(); + locks->reserve(acquire_order.size()); + + for (auto acquire : acquire_order) { + tsl::mutex* mu = mutexes[acquire]; + if (mu != nullptr) { + if (!sparse || do_lock) { + locks->emplace_back(*mu); + } else { + shared_locks->emplace_back(*mu); + } + } + } + auto variableInputLock = + VariableInputLockHolder(vars, std::move(locks), std::move(shared_locks)); + return variableInputLock; +} + +void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input, + int output); + +// This is for use with ResourceVariables to ensure *tensor has a +// reference count of 1 before you update it. +// REQUIRES: If you pass in variable->tensor(), *variable->mu() must be held. +template +absl::Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor, + bool copy_on_read_mode) { + if (copy_on_read_mode || !tensor->RefCountIsOne()) { + // Tensor's buffer is in use by some read, so we need to copy before + // updating. + Tensor tmp; + if (std::is_same::value) { + tsl::AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(tensor->dtype(), tensor->shape(), &tmp, attr)); + + const auto elements_in = tensor->flat(); + auto elements_out = tmp.flat(); + for (int64_t i = 0; i < elements_in.size(); ++i) { + elements_out(i) = elements_in(i); + } + } else { + tsl::AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(tensor->dtype(), tensor->shape(), &tmp, attr)); + functor::DenseUpdate copy_functor; + copy_functor(ctx->eigen_device(), tmp.flat(), + const_cast(tensor)->flat()); + } + *tensor = tmp; + } + return absl::OkStatus(); +} + +// This gives you `*out`, a tensor you can update, corresponding to a variable +// passed as input index `input`. 
This handles the differences between +// reference and resource variables. + +// For reference variables we can just grab the tensor, grabbing the lock if +// `lock_held` is False. +// +// For resource variables: +// * If sparse is true: return the underlying tensor. +// * If sparse is false: ensure its refcount is 1 (by potentially copying its +// contents), and then return the underlying tensor. +// `lock_held` is ignored for resource variables. +template +absl::Status GetInputTensorFromVariable(OpKernelContext* ctx, int input, + bool lock_held, bool sparse, + Tensor* out) { + if (ctx->input_dtype(input) == DT_RESOURCE) { + core::RefCountPtr var; + TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var)); + if (sparse) { + var->mu()->assert_held_shared(); + *out = *var->tensor(); + return absl::OkStatus(); + } + var->mu()->assert_held(); + TF_RETURN_IF_ERROR(PrepareToUpdateVariable( + ctx, var->tensor(), var->copy_on_read_mode.load())); + *out = *var->tensor(); + return absl::OkStatus(); + } + *out = ctx->mutable_input(input, lock_held); + return absl::OkStatus(); +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/training_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/training_ops.h new file mode 100644 index 00000000..8f986d13 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/training_ops.h @@ -0,0 +1,322 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace functor { + +// Each training algorithm has a ApplyXYZ functor struct declared in +// this header file. They are specialized for different devices +// (CPUDevice in training_ops.cc or GPUDevice in training_ops_gpu.cc). 
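For a sense of what these functors compute, a hedged sketch of the gradient-descent case with plain Eigen vectors standing in for the TTypes flat views (the real CPU specialization lives in training_ops.cc and evaluates through the Eigen device; ApplyGradientDescentCpu is an illustrative name):

```cpp
#include <cstdio>
#include <Eigen/Dense>

// Roughly the update ApplyGradientDescent performs: var <- var - alpha * delta.
void ApplyGradientDescentCpu(Eigen::Ref<Eigen::VectorXf> var, float alpha,
                             const Eigen::Ref<const Eigen::VectorXf>& delta) {
  var -= alpha * delta;
}

int main() {
  Eigen::VectorXf var(3), delta(3);
  var << 1.f, 2.f, 3.f;
  delta << 0.5f, 0.5f, 0.5f;
  ApplyGradientDescentCpu(var, /*alpha=*/0.1f, delta);
  std::printf("%g %g %g\n", var(0), var(1), var(2));  // 0.95 1.95 2.95
}
```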
+ +template +struct ApplyGradientDescent { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::ConstScalar alpha, + typename TTypes::ConstFlat delta); +}; + +template +struct ApplyAdadelta { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::Flat accum_update, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar rho, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad); +}; + +template +struct SparseApplyAdadelta { + void operator()(const Device& d, typename TTypes::Matrix var, + typename TTypes::Matrix accum, + typename TTypes::Matrix accum_update, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar rho, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstFlat indices); +}; + +template +struct FobosElasticNet { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyProximalGradientDescent { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyAdagrad { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstFlat grad, bool update_slots); +}; + +template +struct ApplyAdagradV2 { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool update_slots); +}; + +template +struct ApplyAdagradDA { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat gradient_accum, + typename TTypes::Flat gradient_squared_accum, + typename TTypes::ConstScalar lr, int64_t global_step, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + +template +struct SparseApplyAdagrad { + // Note that epsilon is ignored if has_epsilon is false. 
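The sparse variant declared next applies the usual Adagrad update row by row, selecting variable rows through the indices vector. A hedged sketch with plain std::vector rows standing in for the matrix views (SparseAdagradUpdate is an illustrative name; epsilon is omitted, matching the has_epsilon == false case noted above):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// For each gradient row i targeting variable row indices[i]:
//   accum_row += grad_row^2;  var_row -= lr * grad_row / sqrt(accum_row).
void SparseAdagradUpdate(std::vector<std::vector<float>>& var,
                         std::vector<std::vector<float>>& accum,
                         const std::vector<std::vector<float>>& grad,
                         const std::vector<int>& indices, float lr) {
  for (size_t i = 0; i < indices.size(); ++i) {
    auto& v = var[indices[i]];
    auto& a = accum[indices[i]];
    const auto& g = grad[i];
    for (size_t j = 0; j < g.size(); ++j) {
      a[j] += g[j] * g[j];
      v[j] -= lr * g[j] / std::sqrt(a[j]);
    }
  }
}

int main() {
  std::vector<std::vector<float>> var{{1.f, 1.f}, {2.f, 2.f}};
  std::vector<std::vector<float>> accum{{0.f, 0.f}, {0.f, 0.f}};
  SparseAdagradUpdate(var, accum, {{0.5f, 0.5f}}, {1}, /*lr=*/0.1f);
  std::printf("%g %g\n", var[1][0], var[1][1]);  // 1.9 1.9
}
```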
+ absl::Status operator()(const Device& d, typename TTypes::Matrix var, + typename TTypes::Matrix accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstVec indices, + int64_t inner_dim, bool update_slots); +}; + +template +struct ApplyProximalAdagrad { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstFlat grad); +}; + +template +struct SparseApplyProximalAdagrad { + absl::Status operator()(const Device& d, typename TTypes::Matrix var, + typename TTypes::Matrix accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstVec indices, + int64_t inner_dim); +}; + +template +struct ApplyFtrl { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::Flat linear, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar lr_power); +}; + +template +struct ApplyFtrlMultiplyLinearByLr { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::Flat linear, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar lr_power); +}; + +template +struct ApplyFtrlV2 { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::Flat linear, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar l2_shrinkage, + typename TTypes::ConstScalar lr_power); +}; + +template +struct ApplyFtrlV2MultiplyLinearByLr { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::Flat linear, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar l2_shrinkage, + typename TTypes::ConstScalar lr_power); +}; + +template +struct SparseApplyFtrl { + absl::Status operator()(const Device& d, typename TTypes::Matrix var_flat, + typename TTypes::Matrix accum_flat, + typename TTypes::Matrix linear_flat, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar l2_shrinkage, + typename TTypes::ConstScalar lr_power, + typename TTypes::ConstMatrix grad_flat, + typename TTypes::ConstVec indices_vec, + int64_t inner_dim, bool multiply_linear_by_lr); +}; + +template +struct ApplyMomentum { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar momentum, bool use_nesterov); +}; + +template +struct ApplyKerasMomentum { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstFlat grad, + typename TTypes::ConstScalar momentum, bool use_nesterov); +}; + +template +struct SparseApplyKerasMomentum { + Tindex 
operator()(const Device& d, typename TTypes::Matrix var, + typename TTypes::Matrix accum, + typename TTypes::ConstScalar lr, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstFlat indices, + typename TTypes::ConstScalar momentum, + bool use_nesterov); +}; + +template +struct ApplyAdam { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::ConstScalar beta1_power, + typename TTypes::ConstScalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov); +}; + +template +struct ApplyAdamWithAmsgrad { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Flat vhat, + typename TTypes::ConstScalar beta1_power, + typename TTypes::ConstScalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyAdaMax { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::ConstScalar beta1_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyRMSProp { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat ms, typename TTypes::Flat mom, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar rho, + typename TTypes::ConstScalar momentum, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyCenteredRMSProp { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat mg, typename TTypes::Flat ms, + typename TTypes::Flat mom, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar rho, + typename TTypes::ConstScalar momentum, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyAddSign { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar alpha, + typename TTypes::ConstScalar sign_decay, + typename TTypes::ConstScalar beta, + typename TTypes::ConstFlat grad); +}; + +template +struct ApplyPowerSign { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar logbase, + typename TTypes::ConstScalar sign_decay, + typename TTypes::ConstScalar beta, + typename TTypes::ConstFlat grad); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_functor.h b/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_functor.h new file mode 100644 index 00000000..f4c905b1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_functor.h @@ -0,0 +1,258 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRANSPOSE_FUNCTOR_H_ +#define TENSORFLOW_CORE_KERNELS_TRANSPOSE_FUNCTOR_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +// Transpose tensor 'in' into tensor 'out' according to dimension +// permutation 'perm'. +// +// REQUIRES: in.dtype() == out->dtype() +// REQUIRES: in.dims() == out->dims() +// REQUIRES: in.dims() == perm.size() +// REQUIRES: in.dim_size(perm[i]) == out->dim_size(i) +template +absl::Status DoTranspose(const Device& device, const Tensor& in, + const absl::Span perm, Tensor* out); + +// Conjugate and transpose tensor 'in' into tensor 'out' according to dimension +// permutation 'perm'. +// +// REQUIRES: in.dtype() == out->dtype() +// REQUIRES: in.dims() == out->dims() +// REQUIRES: in.dims() == perm.size() +// REQUIRES: in.dim_size(perm[i]) == out->dim_size(i) +template +absl::Status DoConjugateTranspose(const Device& device, const Tensor& in, + const absl::Span perm, + Tensor* out); + +// Convenience versions of DoTranspose that only swap the last (inner) two +// dimensions. +template +absl::Status DoMatrixTranspose(const Device& device, const Tensor& in, + Tensor* out); + +// Convenience versions of DoConjugateTranspose that only swap the last (inner) +// two dimensions. +template +absl::Status DoConjugateMatrixTranspose(const Device& device, const Tensor& in, + Tensor* out); + +// Primary device specific functor to be specialized for each device and type. +template +struct Transpose { + static void run(const Device& d, const Tensor& in, + const absl::Span perm, Tensor* out); +}; + +// Implementation details. +namespace internal { + +typedef absl::InlinedVector TransposeDimsVec; +typedef absl::InlinedVector TransposePermsVec; + +// Helper function that takes a tensor shape, a permutation, combines the +// neighboring shapes if their indices in the permutation are consecutive. +// The function outputs the combined shape and new permutation. +// Example: Tensor shape {2, 3, 4, 5, 120} and permutation {0, 4, 1, 2, 3} will +// produce new shape {2, 60, 120} and new permutation {0, 2, 1}. +inline void ReduceTransposeDimensions(const TensorShape& shape, + absl::Span perm, + TransposePermsVec* new_perm, + TransposeDimsVec* new_dims) { + CHECK_EQ(shape.dims(), perm.size()); + if (shape.dims() == 1) { + // If input dimension is already 1, no need to reduce dimension. + new_perm->resize(1); + (*new_perm)[0] = perm[0]; + (*new_dims)[0] = shape.dim_size(0); + return; + } + TransposePermsVec new_dim_position(shape.dims(), -1); + TransposeDimsVec combined_dims(shape.dims(), 0); + int cur_head = perm[0]; + new_dim_position[cur_head] = 0; + combined_dims[0] = shape.dim_size(cur_head); + int dim_idx = 0; + for (int perm_idx = 1; perm_idx < shape.dims(); ++perm_idx) { + // If two indices in permutation are consecutive numbers, combine their + // dimensions. 
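The loop below implements the coalescing described in the comment: runs of source dimensions that stay adjacent under the permutation collapse into one combined dimension. A standalone sketch of the same idea with plain vectors (CoalesceDims is an illustrative name), reproducing the {2, 3, 4, 5, 120} / {0, 4, 1, 2, 3} example from the comment above:

```cpp
#include <cstdio>
#include <vector>

// Merge runs of consecutive source dimensions into one combined dimension.
void CoalesceDims(const std::vector<long long>& dims, const std::vector<int>& perm,
                  std::vector<long long>* new_dims, std::vector<int>* new_perm) {
  const int n = static_cast<int>(dims.size());
  std::vector<int> new_pos(n, -1);        // source dim -> combined dim index
  std::vector<long long> combined(n, 0);  // combined sizes, in permuted order
  int cur = perm[0], idx = 0;
  new_pos[cur] = 0;
  combined[0] = dims[cur];
  for (int p = 1; p < n; ++p) {
    if (cur + 1 == perm[p]) {   // consecutive in the source: merge
      cur = perm[p];
      combined[idx] *= dims[cur];
    } else {                    // otherwise start a new combined dimension
      cur = perm[p];
      new_pos[cur] = ++idx;
      combined[idx] = dims[cur];
    }
  }
  new_perm->assign(idx + 1, 0);
  new_dims->assign(idx + 1, 0);
  int out = 0;
  for (int i = 0; i < n; ++i) {
    if (new_pos[i] >= 0) {
      (*new_perm)[out] = new_pos[i];
      (*new_dims)[out] = combined[new_pos[i]];
      ++out;
    }
  }
}

int main() {
  std::vector<long long> nd;
  std::vector<int> np;
  CoalesceDims({2, 3, 4, 5, 120}, {0, 4, 1, 2, 3}, &nd, &np);
  for (long long d : nd) std::printf("%lld ", d);  // 2 60 120
  std::printf("| ");
  for (int p : np) std::printf("%d ", p);          // 0 2 1
  std::printf("\n");
}
```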
+ if (cur_head + 1 == perm[perm_idx]) { + cur_head = perm[perm_idx]; + combined_dims[dim_idx] *= shape.dim_size(cur_head); + } else { + // Else start a new dimension. + cur_head = perm[perm_idx]; + dim_idx++; + new_dim_position[cur_head] = dim_idx; + combined_dims[dim_idx] = shape.dim_size(cur_head); + } + } + // Compact the new permutations and dimension sizes. + new_perm->resize(dim_idx + 1); + new_dims->resize(dim_idx + 1); + dim_idx = 0; + for (int i = 0; i < new_dim_position.size(); ++i) { + if (new_dim_position[i] >= 0) { + int new_perm_idx = new_dim_position[i]; + (*new_perm)[dim_idx] = new_perm_idx; + (*new_dims)[dim_idx] = combined_dims[new_perm_idx]; + dim_idx++; + } + } +} + +// If all non-singleton dimensions remain in ascending order, the shuffled +// singletons can be transposed by a reshape, saving a memory allocation & copy. +// |permutation| must be a permutation of {0, .., input_shape.dims() - 1}. +// That is, for all i, 0 <= perm[i] < input_shape.dims(). +// In practice, this is checked in TransposeOp::Compute prior to calling this +// function, and the function sits here to facilitate unit testing. +inline bool NonSingletonDimensionsAlign(const TensorShape& input_shape, + const std::vector& permutation) { + int last_nonsingleton_perm_dim = -1; + for (int perm_dim : permutation) { + if (input_shape.dim_size(perm_dim) == 1) { + continue; + } + if (perm_dim < last_nonsingleton_perm_dim) { + return false; + } + last_nonsingleton_perm_dim = perm_dim; + } + return true; +} + +// Uses Eigen to transpose. +template +void TransposeUsingEigen(const Device& d, const Tensor& in, + const absl::Span perm, bool conjugate, + Tensor* out) { + Eigen::array p; + for (int i = 0; i < NDIMS; ++i) p[i] = perm[i]; + auto x = typename TTypes::ConstTensor( + reinterpret_cast(in.tensor_data().data()), + in.shape().AsEigenDSizes()); + auto y = typename TTypes::Tensor( + reinterpret_cast(const_cast(out->tensor_data().data())), + out->shape().AsEigenDSizes()); + if (conjugate) { + y.device(d) = x.conjugate().shuffle(p); + } else { + y.device(d) = x.shuffle(p); + } +} + +template +absl::Status DoTransposeImpl(const Device& d, const Tensor& in, + const absl::Span perm, bool conjugate, + Tensor* out) { + CHECK_EQ(in.dims(), out->dims()); + CHECK_EQ(in.dims(), perm.size()); + CHECK_EQ(in.dtype(), out->dtype()); + switch (in.dtype()) { + case DT_BOOL: + case DT_INT8: + case DT_QINT8: + case DT_QUINT8: + case DT_UINT8: + case DT_FLOAT8_E5M2: + case DT_FLOAT8_E4M3FN: + Transpose::run(d, in, perm, out); + break; + + case DT_BFLOAT16: + case DT_HALF: + case DT_INT16: + case DT_QINT16: + case DT_QUINT16: + case DT_UINT16: + Transpose::run(d, in, perm, out); + break; + + case DT_FLOAT: + case DT_INT32: + case DT_QINT32: + case DT_UINT32: + Transpose::run(d, in, perm, out); + break; + + case DT_DOUBLE: + case DT_INT64: + case DT_UINT64: + Transpose::run(d, in, perm, out); + break; + + case DT_COMPLEX64: + if (conjugate) { +#if defined(__ANDROID__) and !defined(__clang__) + // Workaround for GCC compiler bug in Android toolchain. 
+ return errors::Unimplemented( + "Conjugate transpose of complex64 not supported for GCC on " + "Android."); +#else + Transpose::run(d, in, perm, out); +#endif + } else { + Transpose::run(d, in, perm, out); + } + break; + + case DT_COMPLEX128: + if (conjugate) { + Transpose::run(d, in, perm, + out); + } else { + Transpose::run(d, in, perm, + out); + } + break; + + case DT_STRING: + Transpose::run(d, in, perm, out); + break; + + default: + return errors::Unimplemented("Unsupported dtype on CPU: ", in.dtype()); + } + return absl::OkStatus(); +} + +template +inline absl::Status DoMatrixTransposeImpl(const Device& device, + const Tensor& in, bool conjugate, + Tensor* out) { + const int ndims = in.dims(); + if (ndims == 0) return absl::OkStatus(); + TransposePermsVec perm(ndims); + std::iota(perm.begin(), perm.end(), 0); + std::swap(perm[ndims - 2], perm[ndims - 1]); + return DoTransposeImpl(device, in, perm, conjugate, out); +} + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRANSPOSE_FUNCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_op.h new file mode 100644 index 00000000..8f0405b6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/transpose_op.h @@ -0,0 +1,106 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +class TransposeOp : public OpKernel { + public: + explicit TransposeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; + + protected: + virtual absl::Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + absl::Span perm, + Tensor* out) = 0; + virtual bool IsConjugate() const { return false; } +}; + +class TransposeCpuOp : public TransposeOp { + public: + explicit TransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + absl::Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + absl::Span perm, Tensor* out) override; +}; + +#if defined(INTEL_MKL) +class MklTransposeCpuOp : public TransposeOp { + public: + explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; +}; +#endif // INTEL_MKL + +class TransposeGpuOp : public TransposeOp { + public: + explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + absl::Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + absl::Span perm, Tensor* out) override; +}; + + +// Conjugating transpose ops. 
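Both the plain and the conjugating variants below eventually call DoTranspose with an explicit permutation; for the matrix-transpose convenience wrappers that permutation is just the identity with the last two axes swapped, as DoMatrixTransposeImpl builds it. A minimal sketch (MatrixTransposePerm is an illustrative name):

```cpp
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

// Identity permutation with the last two axes swapped, e.g. {0, 1, 3, 2}.
std::vector<int> MatrixTransposePerm(int ndims) {
  std::vector<int> perm(ndims);
  std::iota(perm.begin(), perm.end(), 0);
  if (ndims >= 2) std::swap(perm[ndims - 2], perm[ndims - 1]);
  return perm;
}

int main() {
  for (int p : MatrixTransposePerm(4)) std::printf("%d ", p);  // 0 1 3 2
  std::printf("\n");
}
```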
+class ConjugateTransposeCpuOp : public TransposeOp { + public: + explicit ConjugateTransposeCpuOp(OpKernelConstruction* ctx) + : TransposeOp(ctx) {} + + protected: + absl::Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + absl::Span perm, Tensor* out) override; + bool IsConjugate() const override { return true; } +}; + +#if defined(INTEL_MKL) +class MklConjugateTransposeCpuOp : public TransposeOp { + public: + explicit MklConjugateTransposeCpuOp(OpKernelConstruction* ctx) + : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; + bool IsConjugate() const override { return true; } +}; +#endif // INTEL_MKL + +class ConjugateTransposeGpuOp : public TransposeOp { + public: + explicit ConjugateTransposeGpuOp(OpKernelConstruction* ctx) + : TransposeOp(ctx) {} + + protected: + absl::Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + absl::Span perm, Tensor* out) override; + bool IsConjugate() const override { return true; } +}; + + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/typed_conditional_accumulator_base.h b/third_party/tflite-hdrs/tensorflow/core/kernels/typed_conditional_accumulator_base.h new file mode 100644 index 00000000..f6574416 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/typed_conditional_accumulator_base.h @@ -0,0 +1,95 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_ +#define TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_ + +#include "tensorflow/core/kernels/conditional_accumulator_base.h" + +namespace tensorflow { + +/* + * TypedConditionalAccumulatorBase is a templated companion of + * ConditionalAccumulatorBase which allows for subclasses to use different + * types for the input gradients. (See ConditionalAccumulator and + * SparseConditionalAccumulator.) + * + * TypedConditionalAccumulatorBase defines virtual methods and implements + * methods which depend on the gradient type. These are mainly methods that are + * used for adding a new gradient to the accumulator. + */ +template +class TypedConditionalAccumulatorBase : public ConditionalAccumulatorBase { + public: + TypedConditionalAccumulatorBase(const DataType& dtype, + const PartialTensorShape& shape, + const string& name, + const string& reduction_type) + : ConditionalAccumulatorBase(dtype, shape, name, reduction_type) {} + + /** + * Attempts to add a gradient to the accumulator. An ApplyGrad attempt is + * successful (i.e., has its gradient applied) if its local_step >= + * current_global_step_ at the time the attempt is processed. Otherwise, if + * local_step < current_global_step_, the stale gradient is silently dropped. 
+ * + * local_step: Time-step at which the gradient was computed. + * grad: Gradient tensor to be added to the accumulator. + * ctx: Context in which the op is executed. + */ + void TryApplyGrad(int64_t local_step, OpKernelContext* ctx) override { + { + mutex_lock l(mu_); + if (local_step >= current_global_step_) { + GradientTensorType* grad = nullptr; + bool is_valid = GetAndValidateTensorInputForApplyGrad(ctx, &grad); + if (is_valid) { + if (counter_ > 0) { + AddToAccumGradFunction(ctx, grad); + } else { + AllocateAndAssignToAccumGradFunction(ctx, grad); + } + counter_++; + } + CleanUpGradTensor(grad); + } + } + FlushUnlocked(); + } + + protected: + // Virtual methods to be implemented by sub-classes for different datatypes. + // Implements arithmetic operations specific to datatype. + virtual void AllocateAndAssignToAccumGradFunction( + OpKernelContext* ctx, GradientTensorType* grad) = 0; + + virtual void AddToAccumGradFunction(OpKernelContext* ctx, + GradientTensorType* grad) = 0; + + // Method for extracting and validating input provided in an OpKernelContext. + // Returns true if input was successfully retrieved and is valid. + // Gradient is returned via the GradientTensorType** tensor. + virtual bool GetAndValidateTensorInputForApplyGrad( + OpKernelContext* ctx, GradientTensorType** tensor) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) = 0; + + // Method for cleaning up any memory allocated in + // GetAndValidateTensorInputForApplyGrad + virtual void CleanUpGradTensor(GradientTensorType* tensor) = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/typed_queue.h b/third_party/tflite-hdrs/tensorflow/core/kernels/typed_queue.h new file mode 100644 index 00000000..e4c82f0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/typed_queue.h @@ -0,0 +1,118 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ +#define TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// TypedQueue builds on QueueBase, with backing class (SubQueue) +// known and stored within. Shared methods that need to have access +// to the backed data sit in this class. +template +class TypedQueue : public QueueBase { + public: + TypedQueue(const int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, + const string& name); + + virtual absl::Status Initialize(); // Must be called before any other method. 
+ + int64_t MemoryUsed() const override; + + protected: + std::vector queues_ TF_GUARDED_BY(mu_); +}; // class TypedQueue + +template +TypedQueue::TypedQueue( + int32_t capacity, const DataTypeVector& component_dtypes, + const std::vector& component_shapes, const string& name) + : QueueBase(capacity, component_dtypes, component_shapes, name) {} + +template +absl::Status TypedQueue::Initialize() { + if (component_dtypes_.empty()) { + return errors::InvalidArgument("Empty component types for queue ", name_); + } + if (!component_shapes_.empty() && + component_dtypes_.size() != component_shapes_.size()) { + return errors::InvalidArgument( + "Different number of component types. ", + "Types: ", DataTypeSliceString(component_dtypes_), + ", Shapes: ", ShapeListString(component_shapes_)); + } + + mutex_lock lock(mu_); + queues_.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + queues_.push_back(SubQueue()); + } + return absl::OkStatus(); +} + +template +inline int64_t SizeOf(const SubQueue& sq) { + static_assert(sizeof(SubQueue) != sizeof(SubQueue), "SubQueue size unknown."); + return 0; +} + +template <> +inline int64_t SizeOf(const std::deque& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * sq.front().AllocatedBytes(); +} + +template <> +inline int64_t SizeOf(const std::vector& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * sq.front().AllocatedBytes(); +} + +using TensorPair = std::pair; + +template +int64_t SizeOf(const std::priority_queue& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * (sizeof(TensorPair) + sq.top().second.AllocatedBytes()); +} + +template +inline int64_t TypedQueue::MemoryUsed() const { + int memory_size = 0; + mutex_lock l(mu_); + for (const auto& sq : queues_) { + memory_size += SizeOf(sq); + } + return memory_size; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/math_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/math_utils.h new file mode 100644 index 00000000..5cd9c1b4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/math_utils.h @@ -0,0 +1,334 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_MATH_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_MATH_UTILS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +namespace internal { + +// Multiply by the effective quantized multiplier and shift. 
+// Caller is responsible for guaranteeing: +// quantized_multiplier >= 0 +// shift >= -31 && shift <= 30 +// The usage of this function is restricted to "multiply by quantized_multiplier +// and shift which were calcluated from QuantizeMultiplier() function below", +// so the conditions are expected to be met. +// +// Reference (TFLite MultiplyByQuantizedMultiplier with TFLITE_SINGLE_ROUNDING): +// https://github.com/tensorflow/tensorflow/blob/47c640a961874f644cd071752835c7b792450bb8/tensorflow/lite/kernels/internal/common.h#L145 +// Above implementation refers from ruy MultiplyByQuantizedMultiplier +// (https://github.com/google/ruy/blob/97ebb72aa0655c0af98896b317476a5d0dacad9c/ruy/apply_multiplier.cc) +// +// After mutiplying fixed point quantized_multiplier, apply single rounding +// operation (addition of 'round' to result and then shift right by +// total_shift). where round=(1 << (30 - shift)) and total_shift=(31 - shift) +inline int32_t MultiplyByQuantizedMultiplier(int32_t x, + int32_t quantized_multiplier, + int shift) { + const int64_t total_shift = 31 - shift; + const int64_t round = static_cast(1) << (total_shift - 1); + int64_t result = x * static_cast(quantized_multiplier) + round; + result = result >> total_shift; + + result = std::clamp( + result, static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max())); + return static_cast(result); +} + +} // namespace internal + +// Quantize eigen Tensor input_tensor using given inv_scale and zero_point, +// using the formula: +// quantized_val = floor(input_val * inv_scale + 0.5f) + zero_point +// +// The caller is reponsible for the validity of the inv_scale (Avoid precision +// loss from taking inverse, and ensure that inv_scale is a finite number.) +template +void AffineQuantize(const ConstTensorTin& input_tensor, float inv_scale, + int32_t zero_point, int32_t quantization_min_val, + int32_t quantization_max_val, TensorTout quantized_tensor) { + quantized_tensor = ((input_tensor.template cast() * inv_scale + 0.5f) + .floor() + .template cast() + + zero_point) + .cwiseMin(quantization_max_val) + .cwiseMax(quantization_min_val) + .template cast(); +} + +// Dequantize eigen Tensor input_tensor using given scale and zero_point, using +// the formula: +// dequantized_val = (input_val - zero_point) * scale +template +void AffineDequantize(const ConstTensorTin& input_tensor, float scale, + int32_t zero_point, TensorTout dequantized_tensor) { + dequantized_tensor = (((input_tensor.template cast() - zero_point)) + .template cast() * + scale) + .template cast(); +} + +// Given a portion of input float tensor, quantizes the data and writes output +// to the corresponding portion in quantized_tensor. The quantization scale and +// zero_point is calculated using the input data min and max. +// This function is used for dynamic range quantization in hybrid (float x qint) +// kernels. +// +// This function behavior aligns with TFLite AsymmetricQuantize() +// (https://github.com/tensorflow/tensorflow/blob/779d3824c8b38a622773940011ced0388697b951/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc#L72) +// to achieve feature parity with TFLite which is required since supporting +// mobile executions is the one of the major use cases. 
The behavior is same +// except for following difference: TFLite AsymmetricQuantize() uses round(input +// / scale + zero_point), while AffineQuantize() uses floor(input_val * +// (1./scale) + 0.5) + zero_point +template +absl::Status AsymmetricQuantize(const ConstTensorTin& input_tensor, + int32_t quantization_min_val, + int32_t quantization_max_val, float& scale, + int32& zero_point, + TensorTout quantized_tensor) { + if (quantization_min_val >= quantization_max_val) { + // NOLINTNEXTLINE + return errors::InvalidArgument( + "quantization_min_val must be smaller than quantization_max_val. " + "Given ", + quantization_min_val, ", ", quantization_max_val); + } + + Eigen::Tensor input_tensor_min = + input_tensor.minimum(); + Eigen::Tensor input_tensor_max = + input_tensor.maximum(); + const double rmin = static_cast(std::min(0.0f, input_tensor_min())); + const double rmax = static_cast(std::max(0.0f, input_tensor_max())); + const double qmin_double = quantization_min_val; + const double qmax_double = quantization_max_val; + + float inv_scale = 0; + scale = (rmax - rmin) / (qmax_double - qmin_double); + if (rmax - rmin != 0) { + // Re-calculate the inverse instead of using (1./scale), to avoid loss of + // precision. + inv_scale = (qmax_double - qmin_double) / (rmax - rmin); + } + if (scale == 0 || !std::isfinite(inv_scale)) { + quantized_tensor.setZero(); + scale = 1.0; + zero_point = 0; + return absl::OkStatus(); + } + + // Using the scale calculated from the quantization range and data range, + // calculate zero point from quantization min and quantization max. + // Among those two, choose the zero point that has smaller error. + const double zero_point_from_min = qmin_double - rmin / scale; + const double zero_point_from_max = qmax_double - rmax / scale; + const double zero_point_from_min_error = + std::abs(qmin_double) + std::abs(rmin / scale); + const double zero_point_from_max_error = + std::abs(qmax_double) + std::abs(rmax / scale); + const double zero_point_double = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + int8_t nudged_zero_point = 0; + if (zero_point_double <= qmin_double) { + nudged_zero_point = quantization_min_val; + } else if (zero_point_double >= qmax_double) { + nudged_zero_point = quantization_max_val; + } else { + nudged_zero_point = static_cast(round(zero_point_double)); + } + zero_point = nudged_zero_point; + + AffineQuantize(input_tensor, inv_scale, zero_point, quantization_min_val, + quantization_max_val, quantized_tensor); + return absl::OkStatus(); +} + +// Given double_multiplier, quantize it where it is represented by two int32_t, +// quantized_multiplier and shift. +// +// double_multiplier must be a positive finite number. Otherwise returns +// InvalidArgument. +// +// Output quantized_multiplier is clamped to range [0, INT32_MAX], +// and shift is clamped to range [-31, 30]. +absl::Status QuantizeMultiplier(double double_multiplier, + int32_t& quantized_multiplier, int32_t& shift); + +// Requantize input_val given quantized effective_muliplier|shift and +// input|output zero_point. +// Effective multiplier and shift should be calculated from effective scale +// which is: +// (product of input scales) / (product of output scales). 
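Standalone worked example of the fixed-point arithmetic described above (the helper name and the numbers are mine, and the final clamp to the int32 range is omitted): under the formula round = 1 << (30 - shift), total_shift = 31 - shift, an effective multiplier of 0.5 (for instance input_scale 0.5 with output_scale 1.0) can be encoded as quantized_multiplier = 1 << 30 with shift = 0, because (1 << 30) * 2^(0 - 31) = 0.5.

#include <cstdint>
#include <iostream>

// Re-implements only the single-rounding multiply, without the clamp, to show
// the numbers involved.
int32_t MultiplyByQuantizedMultiplierSketch(int32_t x, int32_t quantized_multiplier,
                                            int shift) {
  const int64_t total_shift = 31 - shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  const int64_t result = int64_t{x} * quantized_multiplier + round;
  return static_cast<int32_t>(result >> total_shift);
}

int main() {
  const int32_t quantized_multiplier = int32_t{1} << 30;  // encodes 0.5 with shift = 0
  std::cout << MultiplyByQuantizedMultiplierSketch(100, quantized_multiplier, 0) << " "    // 50
            << MultiplyByQuantizedMultiplierSketch(101, quantized_multiplier, 0) << "\n";  // 51 (50.5 rounds up)
}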
+template +Tout AffineRequantizeWithQuantizedMultiplierAndShift( + Tin input_val, int32_t effective_quantized_multiplier, int effective_shift, + int32_t input_zero_point, int32_t output_zero_point, + int32_t quantization_min_val, int32_t quantization_max_val) { + const int32_t input = static_cast(input_val) - input_zero_point; + + const int32_t unclamped = + internal::MultiplyByQuantizedMultiplier( + input, effective_quantized_multiplier, effective_shift) + + output_zero_point; + + // Clamp with [quantization_min_val, quantization_max_val]. + return static_cast( + std::max(std::min(unclamped, quantization_max_val), + quantization_min_val)); +} + +namespace internal { + +// Requantize from per-tensor to per-tensor. +template +absl::Status PerTensorToPerTensorRequantize( + const Tensor& input, float input_scale, int32_t input_zero_point, + float output_scale, int32_t output_zero_point, int32_t quantization_min_val, + int32_t quantization_max_val, Tensor& output) { + const double effective_multiplier = + static_cast(input_scale) / output_scale; + int32_t effective_quantized_multiplier; + int32_t effective_shift; + TF_RETURN_IF_ERROR(QuantizeMultiplier( + effective_multiplier, effective_quantized_multiplier, effective_shift)); + + output.flat() = input.flat().unaryExpr( + [effective_quantized_multiplier, effective_shift, input_zero_point, + output_zero_point, quantization_min_val, + quantization_max_val](Tin input_val) { + return AffineRequantizeWithQuantizedMultiplierAndShift( + input_val, effective_quantized_multiplier, effective_shift, + input_zero_point, output_zero_point, quantization_min_val, + quantization_max_val); + }); + return absl::OkStatus(); +} + +// Requantize where the input or output contains any per-axis quantized cases. +// - From per-tensor to per-axis. +// - From per-axis to per-tensor. +// - From per-axis to per-axis. +template +absl::Status PerAxisRequantize(OpKernelContext* context, const Tensor& input, + const Tensor& input_scales, + const Tensor& input_zero_points, + const Tensor& output_scales, + const Tensor& output_zero_points, + int quantization_axis, + int32_t quantization_min_val, + int32_t quantization_max_val, Tensor& output) { + const bool input_per_axis_quantization = input_scales.dims() == 1; + const bool output_per_axis_quantization = output_scales.dims() == 1; + const auto& per_axis_scales_shape = input_per_axis_quantization + ? input_scales.shape() + : output_scales.shape(); + + Tensor effective_quantized_multipliers; + TF_RETURN_IF_ERROR(context->allocate_temp(DT_INT32, per_axis_scales_shape, + &effective_quantized_multipliers)); + Tensor effective_shifts; + TF_RETURN_IF_ERROR(context->allocate_temp(DT_INT32, per_axis_scales_shape, + &effective_shifts)); + + const float* input_scales_data = input_scales.flat().data(); + const float* output_scales_data = output_scales.flat().data(); + int32_t* effective_quantized_multipliers_data = + effective_quantized_multipliers.flat().data(); + int32_t* effective_shifts_data = effective_shifts.flat().data(); + + const int64_t quantization_dim_size = output.dim_size(quantization_axis); + + for (int64_t i = 0; i < quantization_dim_size; ++i) { + const double effective_multiplier = + static_cast( + input_scales_data[input_per_axis_quantization ? i : 0]) / + output_scales_data[output_per_axis_quantization ? 
i : 0]; + TF_RETURN_IF_ERROR(QuantizeMultiplier( + effective_multiplier, effective_quantized_multipliers_data[i], + effective_shifts_data[i])); + } + + const int32* input_zero_points_data = input_zero_points.flat().data(); + const int32* output_zero_points_data = + output_zero_points.flat().data(); + + auto input_tensor = + input.template flat_inner_outer_dims(quantization_axis - 1); + auto output_tensor = + output.template flat_inner_outer_dims(quantization_axis - 1); + + for (int i = 0; i < quantization_dim_size; ++i) { + output_tensor.template chip<1>(i) = + input_tensor.template chip<1>(i).unaryExpr( + [effective_quantized_multipliers_data, effective_shifts_data, + input_zero_points_data, output_zero_points_data, + quantization_min_val, quantization_max_val, + input_per_axis_quantization, output_per_axis_quantization, + i](Tin input_val) { + return AffineRequantizeWithQuantizedMultiplierAndShift( + input_val, effective_quantized_multipliers_data[i], + effective_shifts_data[i], + input_zero_points_data[input_per_axis_quantization ? i : 0], + output_zero_points_data[output_per_axis_quantization ? i : 0], + quantization_min_val, quantization_max_val); + }); + } + return absl::OkStatus(); +} + +} // namespace internal + +template +absl::Status EvalRequantize( + OpKernelContext* context, const Tensor& input, const Tensor& input_scales, + const Tensor& input_zero_points, const Tensor& output_scales, + const Tensor& output_zero_points, int input_quantization_axis, + int output_quantization_axis, int32_t quantization_min_val, + int32_t quantization_max_val, Tensor& output) { + if (input_quantization_axis == -1 && output_quantization_axis == -1) { + return internal::PerTensorToPerTensorRequantize( + input, input_scales.scalar()(), + input_zero_points.scalar()(), output_scales.scalar()(), + output_zero_points.scalar()(), quantization_min_val, + quantization_max_val, output); + } else { + const int quantization_axis = input_quantization_axis >= 0 + ? input_quantization_axis + : output_quantization_axis; + return internal::PerAxisRequantize( + context, input, input_scales, input_zero_points, output_scales, + output_zero_points, quantization_axis, quantization_min_val, + quantization_max_val, output); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_MATH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/tensor_utils.h b/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/tensor_utils.h new file mode 100644 index 00000000..4a303a3f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/uniform_quant_ops/tensor_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_TENSOR_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_TENSOR_UTILS_H_ + +#include "tensorflow/core/framework/ops_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace tensorflow { + +// Returns if all elements in given tensors are positive. +template +bool AllElementsPositive(const Tensor& tensor) { + Eigen::Tensor positive = + (tensor.flat() > 0).all(); + return positive(); +} + +// Given data tensor's shape and quantization params, returns if the shapes are +// valid. +absl::Status QuantizationAxisAndShapeValid(const TensorShape& data_shape, + const TensorShape& scales_shape, + const TensorShape& zero_points_shape, + int quantization_axis); + +// Given in_shape and perm to transpose, returns out shape after the transpose. +// perm must be a permutation of [0, 1, ..., in_shape.rank - 1]. The caller is +// responsible for guaranteeing it. +TensorShape TransposedShape(const TensorShape& in_shape, + const absl::Span perm); + +// Given in Tensor and perm to transpose, transpose in Tensor and write to out +// Tensor. +// perm must be a permutation of [0, 1, ..., in_shape.rank - 1]. The caller is +// responsible for guaranteeing it. +// Reference: +// https://github.com/tensorflow/tensorflow/blob/c09dc18b15a56f3e72a08c9f3a53e7ef347d159d/tensorflow/core/kernels/transpose_functor_cpu.cc#L35 +template +void Transpose(const Tensor& in, const absl::Span perm, + Tensor& out) { + absl::InlinedVector in_strides = + ComputeStride(in.shape()); + absl::InlinedVector out_strides = + ComputeStride(out.shape()); + const T* in_data = in.flat().data(); + T* out_data = out.flat().data(); + + for (int64_t out_idx = 0; out_idx < out.NumElements(); ++out_idx) { + int64_t in_idx = 0; + int64_t remain_out_idx = out_idx; + for (int dim = 0; dim < out.dims(); ++dim) { + const int64_t ratio = remain_out_idx / out_strides[dim]; + remain_out_idx -= ratio * out_strides[dim]; + in_idx += ratio * in_strides[perm[dim]]; + } + out_data[out_idx] = in_data[in_idx]; + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_UNIFORM_QUANT_OPS_TENSOR_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/unique_op_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/unique_op_gpu.cu.h new file mode 100644 index 00000000..23d7f89d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/unique_op_gpu.cu.h @@ -0,0 +1,449 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
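Standalone sketch of the stride walk performed by Transpose() above, for a made-up 2x3 row-major input and perm = {1, 0}; the real function operates on Tensor buffers and absl::InlinedVector strides, but the index arithmetic is the same.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> in_shape = {2, 3}, out_shape = {3, 2}, perm = {1, 0};
  const std::vector<int> in = {1, 2, 3, 4, 5, 6};  // row-major 2x3
  std::vector<int> out(in.size());

  // Row-major strides, mirroring what ComputeStride() provides in the header.
  auto strides = [](const std::vector<int64_t>& shape) {
    std::vector<int64_t> s(shape.size());
    int64_t acc = 1;
    for (int d = static_cast<int>(shape.size()) - 1; d >= 0; --d) {
      s[d] = acc;
      acc *= shape[d];
    }
    return s;
  };
  const auto in_strides = strides(in_shape);
  const auto out_strides = strides(out_shape);

  // Decompose each output index into coordinates, then recombine them with the
  // permuted input strides, exactly as in the loop above.
  for (int64_t out_idx = 0; out_idx < static_cast<int64_t>(out.size()); ++out_idx) {
    int64_t in_idx = 0, remain = out_idx;
    for (size_t dim = 0; dim < out_shape.size(); ++dim) {
      const int64_t ratio = remain / out_strides[dim];
      remain -= ratio * out_strides[dim];
      in_idx += ratio * in_strides[perm[dim]];
    }
    out[out_idx] = in[in_idx];
  }
  for (int v : out) std::cout << v << ' ';  // prints: 1 4 2 5 3 6
}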
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_UNIQUE_OP_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_UNIQUE_OP_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/gpu_prim_helpers.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/core/util/gpu_solvers.h" // For ScratchSpace + +#if TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +#endif + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace unique_op_gpu { + +// Returns true iff index is at the end of a segment (which is equivalent to the +// beginning of the next segment). +template +struct SegmentIndicatorFunctor { + const T* __restrict__ sorted_input_ptr_; + SegmentIndicatorFunctor(const T* sorted_input_ptr) + : sorted_input_ptr_(sorted_input_ptr) {} + __device__ bool operator()(const TIndex& i) const { + return i > 0 && sorted_input_ptr_[i] != sorted_input_ptr_[i - 1]; + } +}; + +template +__global__ void ExtractFirstOccurrenceIndicesKernel( + int64_t input_size, int64_t uniq_size, + const TIndex* __restrict__ sorted_input_inds, + const TIndex* __restrict__ sorted_input_unique_ids, + TIndex* __restrict__ unique_input_inds, TIndex* __restrict__ segment_ends) { + GPU_1D_KERNEL_LOOP(i, input_size) { + TIndex sorted_input_unique_id = sorted_input_unique_ids[i]; + if (i == 0 || sorted_input_unique_id != sorted_input_unique_ids[i - 1]) { + unique_input_inds[sorted_input_unique_id] = sorted_input_inds[i]; + if (segment_ends) { + if (i == 0) { + // First thread writes the last element. + segment_ends[uniq_size - 1] = input_size; + } else { + segment_ends[sorted_input_unique_id - 1] = i; + } + } + } + } +} + +// Scatters the index of the first occurrence of each unique input value to +// unique_input_inds. +// If segment_ends is not nullptr, it is filled with the end index of each +// unique value's range in the sorted input (the last element is always set +// to input_size). 
+template +Status ExtractFirstOccurrenceIndices(const GPUDevice& d, int64_t input_size, + int64_t uniq_size, + const TIndex* sorted_input_inds, + const TIndex* sorted_input_unique_ids, + TIndex* unique_input_inds, + TIndex* segment_ends) { + CHECK_GT(input_size, 0); // Crash OK + GpuLaunchConfig config = GetGpuLaunchConfig( + input_size, d, &ExtractFirstOccurrenceIndicesKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(ExtractFirstOccurrenceIndicesKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), input_size, uniq_size, sorted_input_inds, + sorted_input_unique_ids, unique_input_inds, + segment_ends); +} + +template +__global__ void GatherOutputsAndInvertPermutationKernel( + int64_t uniq_size, const T* __restrict__ input, + const TIndex* __restrict__ sorted_unique_input_inds, + const TIndex* __restrict__ sorted_unique_perm, + const TIndex* __restrict__ segment_ends, T* __restrict__ output, + TIndex* __restrict__ inv_sorted_unique_perm, TIndex* __restrict__ count) { + GPU_1D_KERNEL_LOOP(i, uniq_size) { + output[i] = input[sorted_unique_input_inds[i]]; + auto j = sorted_unique_perm[i]; + inv_sorted_unique_perm[j] = i; + if (count) { + TIndex beg = j == 0 ? 0 : segment_ends[j - 1]; + TIndex end = segment_ends[j]; + count[i] = end - beg; + } + } +} + +// Gathers input values using sorted_unique_input_inds, and inverts the +// permutation specified by sorted_unique_perm. +template +Status GatherOutputsAndInvertPermutation(const GPUDevice& d, int64_t uniq_size, + const T* input, + const TIndex* sorted_unique_input_inds, + const TIndex* sorted_unique_perm, + const TIndex* segment_ends, T* output, + TIndex* inv_sorted_unique_perm, + TIndex* count) { + if (uniq_size == 0) return OkStatus(); + GpuLaunchConfig config = GetGpuLaunchConfig( + uniq_size, d, &GatherOutputsAndInvertPermutationKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(GatherOutputsAndInvertPermutationKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), uniq_size, input, sorted_unique_input_inds, + sorted_unique_perm, segment_ends, output, + inv_sorted_unique_perm, count); +} + +template +__global__ void LookupAndScatterUniqueIdsKernel( + int64_t input_size, const TIndex* sorted_input_inds, + const TIndex* __restrict__ sorted_input_unique_ids, + const TIndex* __restrict__ inv_sorted_unique_perm, + TIndex* __restrict__ idx) { + GPU_1D_KERNEL_LOOP(i, input_size) { + idx[sorted_input_inds[i]] = + inv_sorted_unique_perm[sorted_input_unique_ids[i]]; + } +} + +// Maps the values of sorted_input_unique_ids and scatters them to idx using +// sorted_input_inds. +template +Status LookupAndScatterUniqueIds(const GPUDevice& d, int64_t input_size, + const TIndex* sorted_input_inds, + const TIndex* sorted_input_unique_ids, + const TIndex* inv_sorted_unique_perm, + TIndex* idx) { + CHECK_GT(input_size, 0); // Crash OK + GpuLaunchConfig config = GetGpuLaunchConfig( + input_size, d, &LookupAndScatterUniqueIdsKernel, + /*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0); + return GpuLaunchKernel(LookupAndScatterUniqueIdsKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), input_size, sorted_input_inds, + sorted_input_unique_ids, inv_sorted_unique_perm, idx); +} + +} // namespace unique_op_gpu + +// This only supports Unique[WithCounts], not Unique[WithCounts]V2. 
+template +class UniqueOpGPU : public AsyncOpKernel { + public: + explicit UniqueOpGPU(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + template + void AllocateTemp(OpKernelContext* context, int64_t size, Tensor* tensor, + U** tensor_data, DoneCallback done) const { + OP_REQUIRES_OK_ASYNC(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({size}), tensor), + done); + *tensor_data = tensor->flat().data(); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& input = context->input(0); + // TODO(dga): Make unique polymorphic for returning int32 and int64 + // vectors to support large tensors. + OP_REQUIRES_ASYNC(context, + input.NumElements() <= std::numeric_limits::max(), + errors::InvalidArgument( + "unique does not support input tensors larger than ", + std::numeric_limits::max(), " elements"), + done); + + OP_REQUIRES_ASYNC(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector."), + done); + + se::Stream* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream, + errors::Internal("No GPU stream available."), done); + + int64_t input_size = input.NumElements(); + bool has_count_output = num_outputs() > 2; + if (input_size == 0) { + // Early exit for trivial case. + Tensor* t = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, TensorShape({0}), &t), done); + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(1, TensorShape({0}), &t), done); + if (has_count_output) { + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(2, TensorShape({0}), &t), done); + } + done(); + return; + } + + // The algorithm implemented here is as follows: + // input = [3, 5, 3, 4, 1, 4, 9, 8, 6, 3, 5, 7, 8, 8, 4, 6, 4, 2, 5, 6] + // 1) Sort the input to group equal values together in segments. + // sorted_input, sorted_input_inds = sort(input) + // sorted_input: + // [1, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 8, 8, 8, 9] + // sorted_input_inds: + // [4, 17, 0, 2, 9, 3, 5, 14, 16, 1, 10, 18, 8, 15, 19, 11, 7, 12, 13, 6] + // 2) Identify the boundaries between segments and use prefix sum to + // compute the unique ID for each sorted value. + // sorted_input_unique_ids = prefix_sum(indicator(sorted_input)) + // indicator(sorted_input): + // [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1] + // sorted_input_unique_ids: + // [0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 7, 7, 7, 8] + // 3) Extract the input index of the first occurrence of each unique value. + // If counts are required, also extract the end index of each segment. + // unique_input_inds[sorted_input_unique_ids] = + // sorted_input_inds (@ indicator) + // segment_ends[sorted_input_unique_ids[i] - 1] = i (@ indicator) + // unique_input_inds: [4, 17, 0, 3, 1, 8, 11, 7, 6] + // segment_ends: [1, 2, 5, 9, 12, 15, 16, 19, 20] + // 4) Sort the extracted unique input indices to put them in order of + // first appearance. + // sorted_unique_input_inds, sorted_unique_perm = + // sort(unique_input_inds) + // sorted_unique_input_inds: [0, 1, 3, 4, 6, 7, 8, 11, 17] + // sorted_unique_perm: [2, 4, 3, 0, 8, 7, 5, 6, 1] + // 5) Gather the sorted unique input values to produce output, and invert + // the second sort permutation to produce an inverse ID mapping. If + // counts are required, also take the adjacent difference between + // segment_ends indices to produce counts. 
+ // output = input[sorted_unique_input_inds] + // inv_sorted_unique_perm[sorted_unique_perm[i]] = i + // counts = adjacent_difference(segment_ends) + // output: [3, 5, 4, 1, 9, 8, 6, 7, 2] + // inv_sorted_unique_perm: [3, 8, 0, 2, 1, 6, 7, 5, 4] + // counts: [3, 3, 4, 1, 1, 3, 3, 1, 1] + // 6) Look up unique IDs via the inverse ID mapping and scatter them using + // the original sort permutation to produce the indices output. + // idx[sorted_input_inds] = + // inv_sorted_unique_perm[sorted_input_unique_ids] + // idx: [0, 1, 0, 2, 3, 2, 4, 5, 6, 0, 1, 7, 5, 5, 2, 6, 2, 8, 1, 6] + + Tensor sorted_input_inds; + TIndex* sorted_input_inds_ptr = nullptr; + AllocateTemp(context, input_size, &sorted_input_inds, + &sorted_input_inds_ptr, done); + if (!context->status().ok()) return; + + Tensor sorted_input; + T* sorted_input_ptr = nullptr; + AllocateTemp(context, input_size, &sorted_input, &sorted_input_ptr, done); + if (!context->status().ok()) return; + + const T* input_ptr = input.flat().data(); + OP_REQUIRES_OK_ASYNC( + context, + GpuRadixSort(context, input_size, /*keys_in=*/input_ptr, + /*keys_out=*/sorted_input_ptr, + /*indices_in=*/static_cast(nullptr), + /*indices_out=*/sorted_input_inds_ptr), + done); + + using namespace unique_op_gpu; + + // Create a fancy input iterator to indicate segment boundaries. + gpuprim::CountingInputIterator counting_iter(0); + gpuprim::TransformInputIterator, + gpuprim::CountingInputIterator> + segment_indicator_iter(counting_iter, {sorted_input_ptr}); + + Tensor sorted_input_unique_ids; + TIndex* sorted_input_unique_ids_ptr = nullptr; + AllocateTemp(context, input_size, &sorted_input_unique_ids, + &sorted_input_unique_ids_ptr, done); + if (!context->status().ok()) return; + + OP_REQUIRES_OK_ASYNC( + context, + GpuInclusivePrefixSum(context, input_size, segment_indicator_iter, + sorted_input_unique_ids_ptr), + done); + + // Copy the last element of sorted_input_unique_ids back to the host to + // obtain uniq_size. 
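+    // (The unique ids come from an inclusive prefix sum whose first entry is
+    // 0, so the number of unique values is the last id plus one.)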
+ ScratchSpace last_idx_host(context, 1, /*on_host=*/true); + OP_REQUIRES_OK_ASYNC( + context, + stream->Memcpy(last_idx_host.mutable_data(), + se::DeviceMemoryBase( + const_cast(sorted_input_unique_ids_ptr) + + (input_size - 1), + sizeof(*last_idx_host.data())), + sizeof(*last_idx_host.data())), + done); + + auto async_finish_computation = [this, context, input_size, input_ptr, + sorted_input_inds, sorted_input_inds_ptr, + sorted_input_unique_ids, + sorted_input_unique_ids_ptr, last_idx_host, + has_count_output, done]() -> void { + const GPUDevice& device = context->eigen_gpu_device(); + int64 uniq_size = (*last_idx_host.data()) + 1; + + std::unique_ptr scoped_activation = + context->op_device_context()->stream()->parent()->Activate(); + + Tensor unique_input_inds; + TIndex* unique_input_inds_ptr = nullptr; + AllocateTemp(context, uniq_size, &unique_input_inds, + &unique_input_inds_ptr, done); + if (!context->status().ok()) return; + + Tensor segment_ends; + TIndex* segment_ends_ptr = nullptr; + if (has_count_output) { + AllocateTemp(context, uniq_size, &segment_ends, &segment_ends_ptr, + done); + if (!context->status().ok()) return; + } + + OP_REQUIRES_OK_ASYNC( + context, + ExtractFirstOccurrenceIndices( + device, input_size, uniq_size, sorted_input_inds_ptr, + sorted_input_unique_ids_ptr, unique_input_inds_ptr, + segment_ends_ptr), + done); + + Tensor sorted_unique_input_inds; + TIndex* sorted_unique_input_inds_ptr = nullptr; + AllocateTemp(context, uniq_size, &sorted_unique_input_inds, + &sorted_unique_input_inds_ptr, done); + if (!context->status().ok()) return; + + Tensor sorted_unique_perm; + TIndex* sorted_unique_perm_ptr = nullptr; + AllocateTemp(context, uniq_size, &sorted_unique_perm, + &sorted_unique_perm_ptr, done); + if (!context->status().ok()) return; + + // Sort by input index so that output is in order of appearance. + OP_REQUIRES_OK_ASYNC( + context, + GpuRadixSort(context, uniq_size, + /*keys_in=*/unique_input_inds_ptr, + /*keys_out=*/sorted_unique_input_inds_ptr, + /*indices_in=*/static_cast(nullptr), + /*indices_out=*/sorted_unique_perm_ptr, + /*num_bits=*/Log2Ceiling(input_size)), + done); + + // Free temporary tensor that is no longer needed. + unique_input_inds = Tensor(); + unique_input_inds_ptr = nullptr; + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(0, TensorShape({uniq_size}), &output), done); + T* output_ptr = output->flat().data(); + + Tensor inv_sorted_unique_perm; + TIndex* inv_sorted_unique_perm_ptr = nullptr; + AllocateTemp(context, uniq_size, &inv_sorted_unique_perm, + &inv_sorted_unique_perm_ptr, done); + if (!context->status().ok()) return; + + TIndex* count_ptr = nullptr; + if (has_count_output) { + Tensor* count = nullptr; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(2, TensorShape({uniq_size}), &count), + done); + count_ptr = count->flat().data(); + } + + // Compute output and counts (if necessary). + OP_REQUIRES_OK_ASYNC( + context, + GatherOutputsAndInvertPermutation( + device, uniq_size, input_ptr, sorted_unique_input_inds_ptr, + sorted_unique_perm_ptr, segment_ends_ptr, output_ptr, + inv_sorted_unique_perm_ptr, count_ptr), + done); + + // Free temporary tensors that are no longer needed. 
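+      // (Assigning a default-constructed Tensor releases this reference to the
+      // underlying buffers, so the memory can be reused before the remaining
+      // outputs are allocated.)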
+ sorted_unique_perm = Tensor(); + sorted_unique_perm_ptr = nullptr; + sorted_unique_input_inds = Tensor(); + sorted_unique_input_inds_ptr = nullptr; + segment_ends = Tensor(); + segment_ends_ptr = nullptr; + + Tensor* idx = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(1, TensorShape({input_size}), &idx), + done); + TIndex* idx_ptr = idx->flat().data(); + + // Compute indices output. + OP_REQUIRES_OK_ASYNC( + context, + LookupAndScatterUniqueIds(device, input_size, sorted_input_inds_ptr, + sorted_input_unique_ids_ptr, + inv_sorted_unique_perm_ptr, idx_ptr), + done); + + done(); + }; + + context->device() + ->tensorflow_accelerator_device_info() + ->event_mgr->ThenExecute(stream, async_finish_computation); + } +}; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CORE_KERNELS_UNIQUE_OP_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/variable_ops.h b/third_party/tflite-hdrs/tensorflow/core/kernels/variable_ops.h new file mode 100644 index 00000000..035b583a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/variable_ops.h @@ -0,0 +1,47 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class VariableOp : public OpKernel { + public: + explicit VariableOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + + private: + DataType dtype_; + TensorShape shape_; + ContainerInfo cinfo_; + + VariableOp(const VariableOp&) = delete; + void operator=(const VariableOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/variant_ops_util.h b/third_party/tflite-hdrs/tensorflow/core/kernels/variant_ops_util.h new file mode 100644 index 00000000..d6d1e831 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/variant_ops_util.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_VARIANT_OPS_UTIL_H_ +#define TENSORFLOW_CORE_KERNELS_VARIANT_OPS_UTIL_H_ + +#include + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class OpKernelContext; +class Tensor; +class Variant; + +void AddNVariant(OpKernelContext* ctx, + std::function + binary_add_variant); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_VARIANT_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/where_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/where_op.h new file mode 100644 index 00000000..fceea011 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/where_op.h @@ -0,0 +1,65 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_WHERE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_WHERE_OP_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +#define TF_CALL_WHERE_GPU_TYPES(m) \ + TF_CALL_int8(m); \ + TF_CALL_uint8(m); \ + TF_CALL_int64(m); \ + TF_CALL_float(m); \ + TF_CALL_double(m); \ + TF_CALL_complex64(m); \ + TF_CALL_complex128(m); \ + TF_CALL_bool(m); + +namespace functor { + +template +struct NumTrue { + EIGEN_ALWAYS_INLINE static absl::Status Compute( + OpKernelContext* ctx, const Device& d, + typename TTypes::ConstFlat input, + typename TTypes::UnalignedScalar num_true); +}; + +template +struct Where { + // Copies indices of true values in input into output. The pointer + // found_true should sit on the host. Compute should copy the + // number of true elements found into it. At the end, if + // *found_true != output.dimension(0), + // then the input may have changed between the initial counting of + // the true values and the call to Where. 
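+  // For example, with input = [true, false, false, true, true], NumTrue
+  // reports 3, output is allocated with 3 rows, and Compute writes the flat
+  // indices 0, 3 and 4; a *found_true of anything other than 3 then signals
+  // that the input changed in between.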
+ EIGEN_ALWAYS_INLINE static absl::Status Compute( + OpKernelContext* ctx, const Device& d, + typename TTypes::ConstTensor input, + typename TTypes::Matrix output, TIndex* found_true); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_WHERE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/where_op_gpu.cu.h b/third_party/tflite-hdrs/tensorflow/core/kernels/where_op_gpu.cu.h new file mode 100644 index 00000000..5eb03ec6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/where_op_gpu.cu.h @@ -0,0 +1,353 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_ +#define TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#define EIGEN_USE_GPU + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/gpu_prim.h" +#include "tensorflow/core/kernels/where_op.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +__global__ void PropagateWhereIndicesKernel( + const TIndex output_rows, const typename Eigen::array strides, + int64* __restrict__ output) { + // TODO(ebrevdo): Use a multi-dimensional loop, increasing the + // dimensions of individual indices manually, instead of relying on + // a scalar loop variable and using integer division. 
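+  // The first element of each NDIM-wide output row holds a flat row-major
+  // index written by the flagged-select step (see Where::Compute below); the
+  // loop rewrites it in place as NDIM coordinates via repeated division by
+  // the row-major strides.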
+ GPU_1D_KERNEL_LOOP(i, output_rows) { + TIndex index_value = ldg(output + NDIM * i); +#pragma unroll + for (int c = 0; c < NDIM; ++c) { + *(output + NDIM * i + c) = index_value / strides[c]; + index_value %= strides[c]; + } + } +} + +namespace { + +template +struct IsNonzero { + EIGEN_DEVICE_FUNC IsNonzero() : zero(T(0)) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const T& x) const { + return (x != zero); + } + const T zero; +}; + +template +struct CubDeviceReduceCount { + gpuError_t operator()(void* d_temp_storage, size_t& temp_storage_bytes, + const T* d_in, TIndex* d_out, int num_items, + gpuStream_t stream = 0, + bool debug_synchronous = false) { + IsNonzero is_nonzero; + gpuprim::TransformInputIterator, const T*> + is_nonzero_iter(d_in, is_nonzero); + return gpuprim::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, + is_nonzero_iter, d_out, num_items, stream, + debug_synchronous); + } +}; + +template +struct CubDeviceReduceCount { + gpuError_t operator()(void* d_temp_storage, size_t& temp_storage_bytes, + const bool* d_in, TIndex* d_out, int num_items, + gpuStream_t stream = 0, + bool debug_synchronous = false) { + return gpuprim::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, + d_out, num_items, stream, + debug_synchronous); + } +}; + +template +struct CubDeviceSelectFlaggedCounter; + +template +struct CubDeviceSelectFlaggedCounter { + gpuError_t operator()(void* d_temp_storage, size_t& temp_storage_bytes, + const T* d_flags, OutputIterator d_out, + TIndex* d_num_selected_out, int num_items, + gpuStream_t stream = 0, + bool debug_synchronous = false) { + gpuprim::CountingInputIterator select_counter(0); + IsNonzero is_nonzero; + gpuprim::TransformInputIterator, const T*> + is_nonzero_iter(d_flags, is_nonzero); + return gpuprim::DeviceSelect::Flagged( + d_temp_storage, temp_storage_bytes, select_counter /*d_in*/, + is_nonzero_iter /*d_flags*/, d_out, d_num_selected_out, num_items, + stream, debug_synchronous); + } +}; + +template +struct CubDeviceSelectFlaggedCounter { + gpuError_t operator()(void* d_temp_storage, size_t& temp_storage_bytes, + const T* d_flags, OutputIterator d_out, + TIndex* d_num_selected_out, int num_items, + gpuStream_t stream = 0, + bool debug_synchronous = false) { + gpuprim::CountingInputIterator select_counter(0); + return gpuprim::DeviceSelect::Flagged( + d_temp_storage, temp_storage_bytes, select_counter /*d_in*/, d_flags, + d_out, d_num_selected_out, num_items, stream, debug_synchronous); + } +}; + +} // namespace + +template +struct NumTrue { + EIGEN_ALWAYS_INLINE static Status Compute( + OpKernelContext* ctx, const GPUDevice& d, + typename TTypes::ConstFlat input, + typename TTypes::UnalignedScalar num_true) { + const auto& cu_stream = GetGpuStream(ctx); + + std::size_t temp_storage_bytes = 0; + const T* input_data = input.data(); + TIndex* num_true_data = num_true.data(); + + // TODO(ebrevdo): sum doesn't work; perhaps need a different + // iterator? 
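+    // Standard two-phase gpuprim pattern: the first call, with a null
+    // temp-storage pointer, only computes temp_storage_bytes; the actual
+    // reduction happens in the second call once the scratch tensor exists.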
+ auto reducer = CubDeviceReduceCount(); + auto first_success = reducer(/*temp_storage*/ nullptr, temp_storage_bytes, + /*d_in*/ input_data, + /*d_out*/ num_true_data, + /*num_items*/ input.size(), + /*stream*/ cu_stream); + + if (first_success != gpuSuccess) { + return errors::Internal( + "WhereOp: Could not launch gpuprim::DeviceReduce::Sum to calculate " + "temp_storage_bytes, status: ", + GpuGetErrorString(first_success)); + } + + Tensor temp_storage; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + auto second_success = reducer( + /*temp_storage*/ temp_storage.flat().data(), temp_storage_bytes, + /*d_in*/ input_data, + /*d_out*/ num_true_data, + /*num_items*/ input.size(), + /*stream*/ cu_stream); + + if (second_success != gpuSuccess) { + return errors::Internal( + "WhereOp: Could not launch gpuprim::DeviceReduce::Sum to count " + "number of true / nonzero indices. temp_storage_bytes: ", + temp_storage_bytes, ", status: ", GpuGetErrorString(second_success)); + } + + return OkStatus(); + } +}; + +#define NUMTRUE_GPU_FUNCTOR(T) \ + template struct NumTrue; \ + template struct NumTrue; + +// We only need to declare the NumTrue functor once, but this file is +// included from where_op_gpu_impl_X.cu.cc for X=1,2,... +// Only declare for X = 1. +#if GPU_PROVIDED_DIM == 1 + +TF_CALL_WHERE_GPU_TYPES(NUMTRUE_GPU_FUNCTOR); + +#endif // GPU_PROVIDED_DIM == 1 + +#undef NUMTRUE_GPU_FUNCTOR + +template +class WhereOutputIterator { + public: + // Required iterator traits + typedef WhereOutputIterator self_type; + typedef std::ptrdiff_t difference_type; + typedef void value_type; + typedef void pointer; + typedef int64& reference; + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust + // 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, thrust::random_access_traversal_tag, + value_type, + reference>::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag + iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + WhereOutputIterator(int64* ptr, const Eigen::DenseIndex max_row) + : ptr_(ptr), max_row_(max_row) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int64& operator[](int n) const { + // If the selection mechanism finds too many true values (because + // the input tensor changed between allocation of output and now), + // we may accidentally try to write past the allowable memory. If + // valid is false, then we don't do this. Instead, we'll read off + // the number of items found in Flagged()'s d_num_selected_out at + // the end and confirm that it matches the number of rows of output. + const bool valid = FastBoundsCheck(n, max_row_); + return *(ptr_ + (valid ? 
(NDIM * n) : 0)); + } + + private: + int64* ptr_; + const Eigen::DenseIndex max_row_; +}; + +template +Eigen::array CalculateStrides( + typename TTypes::ConstTensor input) { + const Eigen::DSizes dims = input.dimensions(); + Eigen::array strides; + EIGEN_STATIC_ASSERT((static_cast(decltype(input)::Layout) == + static_cast(Eigen::RowMajor)), + INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR); + strides[NDIM - 1] = 1; + for (int i = NDIM - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return strides; +} + +template +struct Where { + EIGEN_ALWAYS_INLINE static Status Compute( + OpKernelContext* ctx, const GPUDevice& d, + typename TTypes::ConstTensor input, + typename TTypes::Matrix output, TIndex* found_true_host) { + if (output.dimension(0) == 0) { + // Nothing to do. + return OkStatus(); + } + + const auto& cu_stream = GetGpuStream(ctx); + + std::size_t temp_storage_bytes = 0; + + Tensor found_true_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({}), &found_true_t)); + TIndex* found_true_device = found_true_t.scalar().data(); + + WhereOutputIterator output_iterator( + output.data(), + /* max_row */ output.dimension(0)); + + typedef std::decay DT; + CubDeviceSelectFlaggedCounter< + T, TIndex, decltype(output_iterator) /*OutputIterator*/, + std::is_convertible::value /*IsConvertibleToBool*/> + counter; + auto first_success = counter(/*temp_storage*/ nullptr, temp_storage_bytes, + /*d_flags*/ input.data(), + /*d_out*/ output_iterator, + /*d_num_selected_out*/ found_true_device, + /*num_items*/ input.size(), + /*stream*/ cu_stream); + if (first_success != gpuSuccess) { + return errors::Internal( + "WhereOp: Could not launch gpuprim::DeviceSelect::Flagged to " + "calculate " + "temp_storage_bytes, status: ", + GpuGetErrorString(first_success)); + } + + Tensor temp_storage; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + + auto second_success = counter( + /*temp_storage*/ temp_storage.flat().data(), temp_storage_bytes, + /*d_flags*/ input.data(), + /*d_out*/ output_iterator, + /*d_num_selected_out*/ found_true_device, + /*num_items*/ input.size(), + /*stream*/ cu_stream); + + if (second_success != gpuSuccess) { + return errors::Internal( + "WhereOp: Could not launch gpuprim::DeviceSelect::Flagged to copy " + "indices out, status: ", + GpuGetErrorString(second_success)); + } + + // TODO(ebrevdo): Find a way to synchronously copy back data from + // found_true_device to *found_true_host. 
+ + const Eigen::array strides = + CalculateStrides(input); + const TIndex output_rows = output.dimension(0); + GpuLaunchConfig config = GetGpuLaunchConfig(output_rows, d); + TF_CHECK_OK(GpuLaunchKernel(PropagateWhereIndicesKernel, + config.block_count, config.thread_per_block, 0, + d.stream(), output_rows, strides, + output.data())); + + return OkStatus(); + } +}; + +#define DECLARE_GPU_SPEC_INDEX(Dims, T, TIndex) \ + template struct Where + +#define DECLARE_GPU_SPEC(T) \ + DECLARE_GPU_SPEC_INDEX(GPU_PROVIDED_DIM, T, int32); \ + DECLARE_GPU_SPEC_INDEX(GPU_PROVIDED_DIM, T, int64) + +TF_CALL_WHERE_GPU_TYPES(DECLARE_GPU_SPEC); + +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPEC_INDEX + +} // namespace functor + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/winograd_transform.h b/third_party/tflite-hdrs/tensorflow/core/kernels/winograd_transform.h new file mode 100644 index 00000000..4f4067e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/kernels/winograd_transform.h @@ -0,0 +1,377 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_ +#define TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_ + +#include "tensorflow/core/kernels/deep_conv2d.h" + +namespace tensorflow { + +// Winograd DeepConv2DTransform implementation for 3x3 filters. 
+// Details: +// *) Arithmetic complexity of computations: Shmuel Winograd +// *) Fast Algorithms for Convolutional Neural Networks: Lavin, Gray + +template +class WinogradTransform : public DeepConv2DTransform { + public: + typedef typename DeepConv2DTransform::Shape Shape; + + WinogradTransform() + : filter_shape_(3, 3), input_shape_(4, 4), output_shape_(2, 2) {} + + virtual void GetFilterTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const; + + virtual void GetInputTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const; + + virtual void GetOutputTransformMatrix(const int64_t rows, const int64_t cols, + T* transform_matrix) const; + + virtual const Shape& filter_shape() const { return filter_shape_; } + virtual const Shape& input_shape() const { return input_shape_; } + virtual const Shape& output_shape() const { return output_shape_; } + + private: + const Shape filter_shape_; + const Shape input_shape_; + const Shape output_shape_; +}; + +// The filter transform matrix is the kronecker product 'M * M' of the +// following matrix 'M': +// +// [ 1 0 0 ] +// [ 1/2 1/2 1/2 ] +// [ 1/2 -1/2 1/2 ] +// [ 0 0 1 ] +// +// The data layout of 'transform_matrix': +// [input_tile_spatial_size, filter_spatial_size] +// +template +void WinogradTransform::GetFilterTransformMatrix(const int64_t rows, + const int64_t cols, + T* transform_matrix) const { + CHECK_GT(rows, 0); + CHECK_GT(cols, 0); + memset(transform_matrix, 0, sizeof(T) * rows * cols); + + // Sub matrix [0,0] + transform_matrix[0 * cols + 0] = T(1.0); + + transform_matrix[1 * cols + 0] = T(0.5); + transform_matrix[1 * cols + 1] = T(0.5); + transform_matrix[1 * cols + 2] = T(0.5); + + transform_matrix[2 * cols + 0] = T(0.5); + transform_matrix[2 * cols + 1] = T(-0.5); + transform_matrix[2 * cols + 2] = T(0.5); + + transform_matrix[3 * cols + 2] = T(1.0); + + // Sub matrix [1,0] + transform_matrix[4 * cols + 0] = T(0.5); + + transform_matrix[5 * cols + 0] = T(0.25); + transform_matrix[5 * cols + 1] = T(0.25); + transform_matrix[5 * cols + 2] = T(0.25); + + transform_matrix[6 * cols + 0] = T(0.25); + transform_matrix[6 * cols + 1] = T(-0.25); + transform_matrix[6 * cols + 2] = T(0.25); + + transform_matrix[7 * cols + 2] = T(0.5); + + // Sub matrix [1,1] + transform_matrix[4 * cols + 3] = T(0.5); + + transform_matrix[5 * cols + 3] = T(0.25); + transform_matrix[5 * cols + 4] = T(0.25); + transform_matrix[5 * cols + 5] = T(0.25); + + transform_matrix[6 * cols + 3] = T(0.25); + transform_matrix[6 * cols + 4] = T(-0.25); + transform_matrix[6 * cols + 5] = T(0.25); + + transform_matrix[7 * cols + 5] = T(0.5); + + // Sub matrix [1,2] + transform_matrix[4 * cols + 6] = T(0.5); + + transform_matrix[5 * cols + 6] = T(0.25); + transform_matrix[5 * cols + 7] = T(0.25); + transform_matrix[5 * cols + 8] = T(0.25); + + transform_matrix[6 * cols + 6] = T(0.25); + transform_matrix[6 * cols + 7] = T(-0.25); + transform_matrix[6 * cols + 8] = T(0.25); + + transform_matrix[7 * cols + 8] = T(0.5); + + // Sub matrix [2,0] + transform_matrix[8 * cols + 0] = T(0.5); + + transform_matrix[9 * cols + 0] = T(0.25); + transform_matrix[9 * cols + 1] = T(0.25); + transform_matrix[9 * cols + 2] = T(0.25); + + transform_matrix[10 * cols + 0] = T(0.25); + transform_matrix[10 * cols + 1] = T(-0.25); + transform_matrix[10 * cols + 2] = T(0.25); + + transform_matrix[11 * cols + 2] = T(0.5); + + // Sub matrix [2,1] + transform_matrix[8 * cols + 3] = T(-0.5); + + transform_matrix[9 * cols + 3] = T(-0.25); + 
transform_matrix[9 * cols + 4] = T(-0.25); + transform_matrix[9 * cols + 5] = T(-0.25); + + transform_matrix[10 * cols + 3] = T(-0.25); + transform_matrix[10 * cols + 4] = T(0.25); + transform_matrix[10 * cols + 5] = T(-0.25); + + transform_matrix[11 * cols + 5] = T(-0.5); + + // Sub matrix [2,2] + transform_matrix[8 * cols + 6] = T(0.5); + + transform_matrix[9 * cols + 6] = T(0.25); + transform_matrix[9 * cols + 7] = T(0.25); + transform_matrix[9 * cols + 8] = T(0.25); + + transform_matrix[10 * cols + 6] = T(0.25); + transform_matrix[10 * cols + 7] = T(-0.25); + transform_matrix[10 * cols + 8] = T(0.25); + + transform_matrix[11 * cols + 8] = T(0.5); + + // Sub matrix [3,2] + transform_matrix[12 * cols + 6] = T(1.0); + + transform_matrix[13 * cols + 6] = T(0.5); + transform_matrix[13 * cols + 7] = T(0.5); + transform_matrix[13 * cols + 8] = T(0.5); + + transform_matrix[14 * cols + 6] = T(0.5); + transform_matrix[14 * cols + 7] = T(-0.5); + transform_matrix[14 * cols + 8] = T(0.5); + + transform_matrix[15 * cols + 8] = T(1.0); +} + +// The input transform matrix is the kronecker product 'M * M' of the +// following matrix 'M': +// +// [1 0 -1 0] +// [0 1 1 0] +// [0 -1 1 0] +// [0 1 0 -1] +// +// Data layout of 'transform_matrix': +// [tile_spatial_size, tile_spatial_size] +// +template +void WinogradTransform::GetInputTransformMatrix(const int64_t rows, + const int64_t cols, + T* transform_matrix) const { + CHECK_GT(rows, 0); + CHECK_GT(cols, 0); + memset(transform_matrix, 0, sizeof(T) * rows * cols); + + // Sub matrix [0,0] + transform_matrix[0 * cols + 0] = T(1.0); + transform_matrix[0 * cols + 2] = T(-1.0); + + transform_matrix[1 * cols + 1] = T(1.0); + transform_matrix[1 * cols + 2] = T(1.0); + + transform_matrix[2 * cols + 1] = T(-1.0); + transform_matrix[2 * cols + 2] = T(1.0); + + transform_matrix[3 * cols + 1] = T(1.0); + transform_matrix[3 * cols + 3] = T(-1.0); + + // Sub matrix [0,2] + transform_matrix[0 * cols + 8] = T(-1.0); + transform_matrix[0 * cols + 10] = T(1.0); + + transform_matrix[1 * cols + 9] = T(-1.0); + transform_matrix[1 * cols + 10] = T(-1.0); + + transform_matrix[2 * cols + 9] = T(1.0); + transform_matrix[2 * cols + 10] = T(-1.0); + + transform_matrix[3 * cols + 9] = T(-1.0); + transform_matrix[3 * cols + 11] = T(1.0); + + // Sub matrix [1,1] + transform_matrix[4 * cols + 4] = T(1.0); + transform_matrix[4 * cols + 6] = T(-1.0); + + transform_matrix[5 * cols + 5] = T(1.0); + transform_matrix[5 * cols + 6] = T(1.0); + + transform_matrix[6 * cols + 5] = T(-1.0); + transform_matrix[6 * cols + 6] = T(1.0); + + transform_matrix[7 * cols + 5] = T(1.0); + transform_matrix[7 * cols + 7] = T(-1.0); + + // Sub matrix [1,2] + transform_matrix[4 * cols + 8] = T(1.0); + transform_matrix[4 * cols + 10] = T(-1.0); + + transform_matrix[5 * cols + 9] = T(1.0); + transform_matrix[5 * cols + 10] = T(1.0); + + transform_matrix[6 * cols + 9] = T(-1.0); + transform_matrix[6 * cols + 10] = T(1.0); + + transform_matrix[7 * cols + 9] = T(1.0); + transform_matrix[7 * cols + 11] = T(-1.0); + + // Sub matrix [2,1] + transform_matrix[8 * cols + 4] = T(-1.0); + transform_matrix[8 * cols + 6] = T(1.0); + + transform_matrix[9 * cols + 5] = T(-1.0); + transform_matrix[9 * cols + 6] = T(-1.0); + + transform_matrix[10 * cols + 5] = T(1.0); + transform_matrix[10 * cols + 6] = T(-1.0); + + transform_matrix[11 * cols + 5] = T(-1.0); + transform_matrix[11 * cols + 7] = T(1.0); + + // Sub matrix [2,2] + transform_matrix[8 * cols + 8] = T(1.0); + transform_matrix[8 * cols + 10] = T(-1.0); + + 
transform_matrix[9 * cols + 9] = T(1.0); + transform_matrix[9 * cols + 10] = T(1.0); + + transform_matrix[10 * cols + 9] = T(-1.0); + transform_matrix[10 * cols + 10] = T(1.0); + + transform_matrix[11 * cols + 9] = T(1.0); + transform_matrix[11 * cols + 11] = T(-1.0); + + // Sub matrix [3,1] + transform_matrix[12 * cols + 4] = T(1.0); + transform_matrix[12 * cols + 6] = T(-1.0); + + transform_matrix[13 * cols + 5] = T(1.0); + transform_matrix[13 * cols + 6] = T(1.0); + + transform_matrix[14 * cols + 5] = T(-1.0); + transform_matrix[14 * cols + 6] = T(1.0); + + transform_matrix[15 * cols + 5] = T(1.0); + transform_matrix[15 * cols + 7] = T(-1.0); + + // Sub matrix [3,3] + transform_matrix[12 * cols + 12] = T(-1.0); + transform_matrix[12 * cols + 14] = T(1.0); + + transform_matrix[13 * cols + 13] = T(-1.0); + transform_matrix[13 * cols + 14] = T(-1.0); + + transform_matrix[14 * cols + 13] = T(1.0); + transform_matrix[14 * cols + 14] = T(-1.0); + + transform_matrix[15 * cols + 13] = T(-1.0); + transform_matrix[15 * cols + 15] = T(1.0); +}; + +// The output transform matrix is the kronecker product 'M * M' of the +// following matrix 'M': +// +// [1 1 1 0] +// [0 1 -1 -1] +// +// Data layout of 'transform_matrix': +// [out_tile_spatial_size, tile_spatial_size] +// +template +void WinogradTransform::GetOutputTransformMatrix(const int64_t rows, + const int64_t cols, + T* transform_matrix) const { + CHECK_GT(rows, 0); + CHECK_GT(cols, 0); + memset(transform_matrix, 0, sizeof(T) * rows * cols); + + // Sub matrix [0,0] + transform_matrix[0 * cols + 0] = T(1.0); + transform_matrix[0 * cols + 1] = T(1.0); + transform_matrix[0 * cols + 2] = T(1.0); + + transform_matrix[1 * cols + 1] = T(1.0); + transform_matrix[1 * cols + 2] = T(-1.0); + transform_matrix[1 * cols + 3] = T(-1.0); + + // Sub matrix [0,1] + transform_matrix[0 * cols + 4] = T(1.0); + transform_matrix[0 * cols + 5] = T(1.0); + transform_matrix[0 * cols + 6] = T(1.0); + + transform_matrix[1 * cols + 5] = T(1.0); + transform_matrix[1 * cols + 6] = T(-1.0); + transform_matrix[1 * cols + 7] = T(-1.0); + + // Sub matrix [0,2] + transform_matrix[0 * cols + 8] = T(1.0); + transform_matrix[0 * cols + 9] = T(1.0); + transform_matrix[0 * cols + 10] = T(1.0); + + transform_matrix[1 * cols + 9] = T(1.0); + transform_matrix[1 * cols + 10] = T(-1.0); + transform_matrix[1 * cols + 11] = T(-1.0); + + // Sub matrix [1,1] + transform_matrix[2 * cols + 4] = T(1.0); + transform_matrix[2 * cols + 5] = T(1.0); + transform_matrix[2 * cols + 6] = T(1.0); + + transform_matrix[3 * cols + 5] = T(1.0); + transform_matrix[3 * cols + 6] = T(-1.0); + transform_matrix[3 * cols + 7] = T(-1.0); + + // Sub matrix [1,2] + transform_matrix[2 * cols + 8] = T(-1.0); + transform_matrix[2 * cols + 9] = T(-1.0); + transform_matrix[2 * cols + 10] = T(-1.0); + + transform_matrix[3 * cols + 9] = T(-1.0); + transform_matrix[3 * cols + 10] = T(1.0); + transform_matrix[3 * cols + 11] = T(1.0); + + // Sub matrix [1,3] + transform_matrix[2 * cols + 12] = T(-1.0); + transform_matrix[2 * cols + 13] = T(-1.0); + transform_matrix[2 * cols + 14] = T(-1.0); + + transform_matrix[3 * cols + 13] = T(-1.0); + transform_matrix[3 * cols + 14] = T(1.0); + transform_matrix[3 * cols + 15] = T(1.0); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/kernels/xent_op.h b/third_party/tflite-hdrs/tensorflow/core/kernels/xent_op.h new file mode 100644 index 00000000..07870f50 --- /dev/null +++ 
b/third_party/tflite-hdrs/tensorflow/core/kernels/xent_op.h @@ -0,0 +1,115 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_XENT_OP_H_ +#define TENSORFLOW_CORE_KERNELS_XENT_OP_H_ +// Functor definition for XentOp, must be compilable by nvcc. + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive + +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by XentOp to do the computations. +template +struct XentFunctor { + // Computes Cross Entropy loss and backprop. + // + // logits: batch_size, num_classes. + // labels: batch_size, num_classes. + // scratch: temporary tensor, dims: batch_size, 1 + // loss: output tensor for the loss, dims: batch_size. + // backprop: output tensor for the backprop, dims: batch_size, num_classes. + void operator()(const Device &d, + const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, + typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, + typename TTypes::Vec loss, + typename TTypes::Matrix backprop); +}; + +// Eigen code implementing XentFunctor::operator(). +// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template +struct XentEigenImpl { + static void Compute(const Device &d, + const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, + typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, + typename TTypes::Vec loss, + typename TTypes::Matrix backprop) { + // NOTE(touts): This duplicates some of the computations in softmax_op + // because we need the intermediate (logits -max(logits)) values to + // avoid a log(exp()) in the computation of the loss. + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = shape[kBatchDim]; + const int num_classes = shape[kClassDim]; + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. + Eigen::IndexList > along_class; + Eigen::IndexList > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList batch_only; + batch_only.set(0, batch_size); + Eigen::IndexList, int> one_by_class; + one_by_class.set(1, num_classes); + + // max_logits along classes. + scratch.reshape(batch_only).device(d) = + logits.broadcast(logits_bcast).maximum(along_class); + + // logits - max_logits. + backprop.device(d) = + logits.broadcast(logits_bcast) - scratch.broadcast(one_by_class); + + // sum(exp(logits - max_logits)) along classes. + scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class); + + // NOTE(keveman): Eigen on GPU dispatches to an optimized implementation + // for an expression of the form lhs = rhs.sum(). 
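The functor above is the Eigen-vectorized form of a small amount of scalar arithmetic. As a reading aid, here is a one-row scalar restatement of the same steps (max shift, log-sum-exp, loss, gradient); XentRow is a hypothetical helper for illustration, not a TensorFlow API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar version of the computation in XentEigenImpl::Compute for one row:
// shift by max(logits) and reuse the shifted logits for both loss and gradient.
float XentRow(const std::vector<float>& logits,
              const std::vector<float>& labels,
              std::vector<float>* backprop) {
  const float max_logit = *std::max_element(logits.begin(), logits.end());
  backprop->resize(logits.size());
  float sum_exp = 0.0f;
  for (std::size_t i = 0; i < logits.size(); ++i) {
    (*backprop)[i] = logits[i] - max_logit;   // logits - max(logits)
    sum_exp += std::exp((*backprop)[i]);
  }
  const float log_sum_exp = std::log(sum_exp);
  float loss = 0.0f;
  for (std::size_t i = 0; i < logits.size(); ++i) {
    // -labels * ((logits - max) - log(sum(exp(logits - max))))
    loss += labels[i] * (log_sum_exp - (*backprop)[i]);
    // prob - labels, with prob = exp(logits - max) / sum(exp(logits - max))
    (*backprop)[i] = std::exp((*backprop)[i]) / sum_exp - labels[i];
  }
  return loss;
}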
+ // lhs = -rhs.sum() doesn't match the above pattern, so folding in the + // negation before calling sum(). + // sum(-labels * + // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) + // along classes + loss.device(d) = (labels.broadcast(labels_bcast) * + (scratch.log().eval().broadcast(one_by_class) - backprop)) + .eval() + .sum(along_class); + + // backprop: prob - labels, where + // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) + backprop.device(d) = (backprop.exp() / scratch.broadcast(one_by_class)) - + labels.broadcast(labels_bcast); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_XENT_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/bfloat16/bfloat16.h b/third_party/tflite-hdrs/tensorflow/core/lib/bfloat16/bfloat16.h new file mode 100644 index 00000000..d6ac77b6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/bfloat16/bfloat16.h @@ -0,0 +1,21 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_ +#define TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_ + +#include "tensorflow/core/platform/bfloat16.h" + +#endif // TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/arena.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/arena.h new file mode 100644 index 00000000..14d80422 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/arena.h @@ -0,0 +1,111 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// TODO(vrv): Switch this to an open-sourced version of Arena. + +#ifndef TENSORFLOW_CORE_LIB_CORE_ARENA_H_ +#define TENSORFLOW_CORE_LIB_CORE_ARENA_H_ + +#include + +#include + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace core { + +// This class is "thread-compatible": different threads can access the +// arena at the same time without locking, as long as they use only +// const methods. +class Arena { + public: + // Allocates a thread-compatible arena with the specified block size. 
+ explicit Arena(const size_t block_size); + ~Arena(); + + char* Alloc(const size_t size) { + return reinterpret_cast(GetMemory(size, 1)); + } + + char* AllocAligned(const size_t size, const size_t alignment) { + return reinterpret_cast(GetMemory(size, alignment)); + } + + void Reset(); + +// This should be the worst-case alignment for any type. This is +// good for IA-32, SPARC version 7 (the last one I know), and +// supposedly Alpha. i386 would be more time-efficient with a +// default alignment of 8, but ::operator new() uses alignment of 4, +// and an assertion will fail below after the call to MakeNewBlock() +// if you try to use a larger alignment. +#ifdef __i386__ + static const int kDefaultAlignment = 4; +#else + static constexpr int kDefaultAlignment = 8; +#endif + + protected: + bool SatisfyAlignment(const size_t alignment); + void MakeNewBlock(const uint32 alignment); + void* GetMemoryFallback(const size_t size, const int align); + void* GetMemory(const size_t size, const int align) { + assert(remaining_ <= block_size_); // an invariant + if (size > 0 && size < remaining_ && align == 1) { // common case + void* result = freestart_; + freestart_ += size; + remaining_ -= size; + return result; + } + return GetMemoryFallback(size, align); + } + + size_t remaining_; + + private: + struct AllocatedBlock { + char* mem; + size_t size; + }; + + // Allocate new block of at least block_size, with the specified + // alignment. + // The returned AllocatedBlock* is valid until the next call to AllocNewBlock + // or Reset (i.e. anything that might affect overflow_blocks_). + AllocatedBlock* AllocNewBlock(const size_t block_size, + const uint32 alignment); + + const size_t block_size_; + char* freestart_; // beginning of the free space in most recent block + char* freestart_when_empty_; // beginning of the free space when we're empty + // STL vector isn't as efficient as it could be, so we use an array at first + size_t blocks_alloced_; // how many of the first_blocks_ have been alloced + AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary + // if the first_blocks_ aren't enough, expand into overflow_blocks_. + std::vector* overflow_blocks_; + + void FreeBlocks(); // Frees all except first block + + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; +}; + +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_CORE_ARENA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/bitmap.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/bitmap.h new file mode 100644 index 00000000..86e825db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/bitmap.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
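A hypothetical caller of the Arena interface above could look like the sketch below; the block size, allocation sizes, and alignment are invented for illustration. Memory is only reclaimed in bulk, via Reset() or the destructor.

#include "tensorflow/core/lib/core/arena.h"

void ArenaDemo() {
  tensorflow::core::Arena arena(8 << 10);   // 8 KB blocks
  char* a = arena.Alloc(128);               // unaligned fast path
  char* b = arena.AllocAligned(256, 16);    // 16-byte aligned
  // ... fill a and b; there is no per-allocation free ...
  (void)a;
  (void)b;
  arena.Reset();                            // drop everything, keep the first block
}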
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_BITMAP_H_ +#define TENSORFLOW_CORE_LIB_CORE_BITMAP_H_ + +#include "xla/tsl/lib/core/bitmap.h" + +namespace tensorflow { +namespace core { + +using Bitmap = tsl::core::Bitmap; + +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_CORE_BITMAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/bits.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/bits.h new file mode 100644 index 00000000..8bcc448b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/bits.h @@ -0,0 +1,42 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_BITS_H_ +#define TENSORFLOW_CORE_LIB_CORE_BITS_H_ + +#include "xla/tsl/lib/core/bits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) + +// Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0. +using ::tsl::Log2Floor; +using ::tsl::Log2Floor64; + +// Return ceiling(log2(n)) for positive integer n. Returns -1 iff n == 0. +using ::tsl::Log2Ceiling; +using ::tsl::Log2Ceiling64; + +using ::tsl::NextPowerOfTwo; +using ::tsl::NextPowerOfTwo64; + +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_CORE_BITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/coding.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/coding.h new file mode 100644 index 00000000..47b645eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/coding.h @@ -0,0 +1,26 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
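A quick illustration of the bits.h contracts quoted above, using the re-exported names; BitsDemo itself is just a scratch example.

#include "tensorflow/core/lib/core/bits.h"

void BitsDemo() {
  int f = tensorflow::Log2Floor(40);    // 5, since 2^5 = 32 <= 40 < 2^6
  int c = tensorflow::Log2Ceiling(40);  // 6, since 40 <= 2^6 = 64
  int z = tensorflow::Log2Floor(0);     // -1 by the documented contract
  (void)f;
  (void)c;
  (void)z;
}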
+==============================================================================*/ + +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef TENSORFLOW_CORE_LIB_CORE_CODING_H_ +#define TENSORFLOW_CORE_LIB_CORE_CODING_H_ + +#include "tensorflow/core/platform/coding.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/errors.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/errors.h new file mode 100644 index 00000000..94154429 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/errors.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_ERRORS_H_ +#define TENSORFLOW_CORE_LIB_CORE_ERRORS_H_ + +#include "tensorflow/core/platform/errors.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_ERRORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/notification.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/notification.h new file mode 100644 index 00000000..c22f695f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/notification.h @@ -0,0 +1,23 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_ +#define TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_ + +// Notification implementation is platform-dependent, to support +// alternative synchronization primitives. +#include "tensorflow/core/platform/notification.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/raw_coding.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/raw_coding.h new file mode 100644 index 00000000..b4adbb7f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/raw_coding.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
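The scheme described at the top of coding.h can be restated from scratch. The sketch below hand-rolls the two encodings purely for illustration; it does not claim the exact signatures of the helpers that platform/coding.h re-exports, and EncodeFixed32LE / PutVarint32Demo are made-up names.

#include <cstdint>
#include <string>

// Fixed-length: least-significant byte first, regardless of host endianness.
void EncodeFixed32LE(uint32_t v, std::string* dst) {
  for (int i = 0; i < 4; ++i) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

// Varint: 7 payload bits per byte, high bit set on every byte except the last,
// so values < 128 cost a single byte. A length-prefixed string is then just a
// varint length followed by the raw bytes.
void PutVarint32Demo(uint32_t v, std::string* dst) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}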
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_ +#define TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_ + +#include "tensorflow/core/platform/raw_coding.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/refcount.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/refcount.h new file mode 100644 index 00000000..3bc634af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/refcount.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_REFCOUNT_H_ +#define TENSORFLOW_CORE_LIB_CORE_REFCOUNT_H_ + +#include "tensorflow/core/platform/refcount.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_REFCOUNT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/status.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/status.h new file mode 100644 index 00000000..2146cbd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/status.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_STATUS_H_ +#define TENSORFLOW_CORE_LIB_CORE_STATUS_H_ + +#include "tensorflow/core/platform/status.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_STATUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/status_test_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/status_test_util.h new file mode 100644 index 00000000..3c604ee8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/status_test_util.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_ +#define TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_ + +#include "xla/tsl/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +#endif // TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/stringpiece.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/stringpiece.h new file mode 100644 index 00000000..d00ce8c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/stringpiece.h @@ -0,0 +1,31 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// StringPiece is a simple structure containing a pointer into some external +// storage and a size. The user of a StringPiece must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a StringPiece without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same StringPiece must use +// external synchronization. + +#ifndef TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_ +#define TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_ + +#include "tensorflow/core/platform/stringpiece.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool.h new file mode 100644 index 00000000..4aa4b69b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_ +#define TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_ + +#include "tensorflow/core/platform/threadpool.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_interface.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_interface.h new file mode 100644 index 00000000..1a51e38e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_interface.h @@ -0,0 +1,21 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_THREADPOOL_INTERFACE_H_ +#define TENSORFLOW_CORE_LIB_CORE_THREADPOOL_INTERFACE_H_ + +#include "tensorflow/core/platform/threadpool_interface.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_THREADPOOL_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_options.h b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_options.h new file mode 100644 index 00000000..64f7e647 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/core/threadpool_options.h @@ -0,0 +1,21 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_CORE_THREADPOOL_OPTIONS_H_ +#define TENSORFLOW_CORE_LIB_CORE_THREADPOOL_OPTIONS_H_ + +#include "tensorflow/core/platform/threadpool_options.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_CORE_THREADPOOL_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/db/sqlite.h b/third_party/tflite-hdrs/tensorflow/core/lib/db/sqlite.h new file mode 100644 index 00000000..992001e4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/db/sqlite.h @@ -0,0 +1,457 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_DB_SQLITE_H_ +#define TENSORFLOW_CORE_LIB_DB_SQLITE_H_ + +#include +#include +#include + +#include "absl/log/check.h" +#include "sqlite3.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/status.h" + +/// TensorFlow SQLite Veneer +/// +/// - Memory safety +/// - Less boilerplate +/// - Removes deprecated stuff +/// - Pretends UTF16 doesn't exist +/// - Transaction compile-time safety +/// - Statically loads our native extensions +/// - Error reporting via tensorflow::Status et al. +/// +/// SQLite>=3.8.2 needs to be supported until April 2019, which is when +/// Ubuntu 14.04 LTS becomes EOL. + +namespace tensorflow { + +class SqliteLock; +class SqliteStatement; +class SqliteTransaction; + +/// \brief SQLite connection object. +/// +/// The SQLite connection is closed automatically by the destructor. +/// Reference counting ensures that happens after its statements are +/// destructed. +/// +/// Instances are reference counted and can be shared between threads. +/// This class offers the same thread safety behaviors as the SQLite +/// API itself. +/// +/// This veneer uses auto-commit mode by default, which means a 4ms +/// fsync() happens after every write unless a SqliteTransaction is +/// used or WAL mode is enabled beforehand. +class TF_LOCKABLE Sqlite : public core::RefCounted { + public: + /// \brief Closes SQLite connection, which can take milliseconds. + ~Sqlite() override; + + /// \brief Opens SQLite database file. + /// + /// Most users will want to set flags to SQLITE_OPEN_READWRITE | + /// SQLITE_OPEN_CREATE. There are many other open flags; here are + /// notes on a few of them: + /// + /// - SQLITE_OPEN_READONLY: Allowed if no WAL journal is active. + /// - SQLITE_OPEN_SHAREDCACHE: Will be ignored because this veneer + /// doesn't support the unlock notify API. + /// - SQLITE_OPEN_NOMUTEX: Means access to this connection MUST be + /// serialized by the caller in accordance with the same contracts + /// implemented by this API. + /// + /// This function sets PRAGMA values from TF_SQLITE_* environment + /// variables. See sqlite.cc to learn more. + static absl::Status Open(const string& path, int flags, Sqlite** db); + + /// \brief Creates SQLite statement. + /// + /// This routine should never fail if sql is valid and does not + /// reference tables. When tables are referenced, system calls are + /// needed which can take microseconds. When the schema changes, this + /// routine will retry automatically and then possibly fail. + /// + /// The returned statement holds a reference to this object. + absl::Status Prepare(const absl::string_view& sql, SqliteStatement* stmt); + SqliteStatement PrepareOrDie(const absl::string_view& sql); + + /// \brief Returns extended result code of last error. 
+ /// + /// If the most recent API call was successful, the result is + /// undefined. The legacy result code can be obtained by saying + /// errcode() & 0xff. + int errcode() const TF_EXCLUSIVE_LOCKS_REQUIRED(this) { + return sqlite3_extended_errcode(db_); + } + + /// \brief Returns pointer to current error message state. + const char* errmsg() const TF_EXCLUSIVE_LOCKS_REQUIRED(this) { + return sqlite3_errmsg(db_); + } + + /// \brief Returns rowid assigned to last successful insert. + int64_t last_insert_rowid() const TF_EXCLUSIVE_LOCKS_REQUIRED(this) { + return sqlite3_last_insert_rowid(db_); + } + + /// \brief Returns number of rows directly changed by last write. + int64_t changes() const TF_EXCLUSIVE_LOCKS_REQUIRED(this) { + return sqlite3_changes(db_); + } + + private: + friend class SqliteLock; + friend class SqliteStatement; + friend class SqliteTransaction; + + Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit, + sqlite3_stmt* rollback) noexcept + : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {} + + sqlite3* const db_; + sqlite3_stmt* const begin_; + sqlite3_stmt* const commit_; + sqlite3_stmt* const rollback_; + bool is_in_transaction_ = false; + + Sqlite(const Sqlite&) = delete; + void operator=(const Sqlite&) = delete; +}; + +/// \brief SQLite prepared statement. +/// +/// Instances can only be shared between threads if caller serializes +/// access from first Bind*() to *Reset(). +/// +/// When reusing a statement in a loop, be certain to not have jumps +/// betwixt Bind*() and *Reset(). +class SqliteStatement { + public: + /// \brief Initializes an empty statement to be assigned later. + SqliteStatement() noexcept = default; + + /// \brief Finalizes statement. + /// + /// This can take milliseconds if it was blocking the Sqlite + /// connection object from being freed. + ~SqliteStatement() { + sqlite3_finalize(stmt_); + if (db_ != nullptr) db_->Unref(); + } + + /// \brief Returns true if statement is initialized. + explicit operator bool() const { return stmt_ != nullptr; } + + /// \brief Returns SQL text from when this query was prepared. + const char* sql() const { return sqlite3_sql(stmt_); } + + /// \brief Number of bytes bound since last *Reset(). + uint64 size() { return size_; } + + /// \brief Executes query for fetching arbitrary rows. + /// + /// `is_done` will always be set to true unless SQLITE_ROW is + /// returned by the underlying API. If status() is already in an + /// error state, then this method is a no-op and the existing status + /// is returned. + /// + /// The OrDie version returns `!is_done` which, if true, indicates a + /// row is available. + /// + /// This statement should be Reset() or destructed when finished with + /// the result. + absl::Status Step(bool* is_done); + bool StepOrDie() TF_MUST_USE_RESULT; + + /// \brief Executes query when only one row is desired. + /// + /// If a row isn't returned, an internal error Status is returned + /// that won't be reflected in the connection error state. + /// + /// This statement should be Reset() or destructed when finished with + /// the result. + absl::Status StepOnce(); + const SqliteStatement& StepOnceOrDie(); + + /// \brief Executes query, ensures zero rows returned, then Reset(). + /// + /// If a row is returned, an internal error Status is returned that + /// won't be reflected in the connection error state. + absl::Status StepAndReset(); + void StepAndResetOrDie(); + + /// \brief Resets statement so it can be executed again. 
+ /// + /// Implementation note: This method diverges from canonical API + /// behavior by calling sqlite3_clear_bindings() in addition to + /// sqlite3_reset(). That makes the veneer safer; we haven't found a + /// super compelling reason yet to call them independently. + void Reset(); + + /// \brief Binds signed 64-bit integer to 1-indexed query parameter. + void BindInt(int parameter, int64_t value) { + Update(sqlite3_bind_int64(stmt_, parameter, value), parameter); + size_ += sizeof(int64_t); + } + void BindInt(const char* parameter, int64_t value) { + BindInt(GetParameterIndex(parameter), value); + } + + /// \brief Binds double to 1-indexed query parameter. + void BindDouble(int parameter, double value) { + Update(sqlite3_bind_double(stmt_, parameter, value), parameter); + size_ += sizeof(double); + } + void BindDouble(const char* parameter, double value) { + BindDouble(GetParameterIndex(parameter), value); + } + + /// \brief Copies UTF-8 text to 1-indexed query parameter. + /// + /// If NUL characters are present, they will still go in the DB and + /// be successfully retrieved by ColumnString(); however, the + /// behavior of these values with SQLite functions is undefined. + /// + /// When using the unsafe methods, the data must not be changed or + /// freed until this statement is Reset() or finalized. + void BindText(int parameter, const absl::string_view& text) { + Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), + SQLITE_TRANSIENT, SQLITE_UTF8), + parameter); + size_ += text.size(); + } + void BindText(const char* parameter, const absl::string_view& text) { + BindText(GetParameterIndex(parameter), text); + } + void BindTextUnsafe(int parameter, const absl::string_view& text) { + Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), + SQLITE_STATIC, SQLITE_UTF8), + parameter); + size_ += text.size(); + } + void BindTextUnsafe(const char* parameter, const absl::string_view& text) { + BindTextUnsafe(GetParameterIndex(parameter), text); + } + + /// \brief Copies binary data to 1-indexed query parameter. + /// + /// When using the unsafe methods, the data must not be changed or + /// freed until this statement is Reset() or finalized. + void BindBlob(int parameter, const absl::string_view& blob) { + Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), + SQLITE_TRANSIENT), + parameter); + size_ += blob.size(); + } + void BindBlob(const char* parameter, const absl::string_view& blob) { + BindBlob(GetParameterIndex(parameter), blob); + } + void BindBlobUnsafe(int parameter, const absl::string_view& blob) { + Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), + SQLITE_STATIC), + parameter); + size_ += blob.size(); + } + void BindBlobUnsafe(const char* parameter, const absl::string_view& text) { + BindBlobUnsafe(GetParameterIndex(parameter), text); + } + + /// \brief Returns number of columns in result set. + int ColumnCount() const TF_MUST_USE_RESULT { + return sqlite3_column_count(stmt_); + } + + /// \brief Returns type of 0-indexed column value in row data. + /// + /// Please note that SQLite is dynamically typed and the type of a + /// particular column can vary from row to row. + int ColumnType(int column) const TF_MUST_USE_RESULT { + return sqlite3_column_type(stmt_, column); + } + + /// \brief Returns 0-indexed column from row result coerced as an integer. 
+ int64_t ColumnInt(int column) const TF_MUST_USE_RESULT { + return sqlite3_column_int64(stmt_, column); + } + + /// \brief Returns 0-indexed column from row result coerced as a double. + double ColumnDouble(int column) const TF_MUST_USE_RESULT { + return sqlite3_column_double(stmt_, column); + } + + /// \brief Copies 0-indexed column from row result coerced as a string. + /// + /// NULL values are returned as empty string. This method should be + /// used for both BLOB and TEXT columns. See also: ColumnType(). + string ColumnString(int column) const TF_MUST_USE_RESULT { + auto data = sqlite3_column_blob(stmt_, column); + if (data == nullptr) return ""; + return {static_cast(data), + static_cast(ColumnSize(column))}; + } + + /// \brief Returns pointer to binary data at 0-indexed column. + /// + /// Empty values are returned as NULL. The returned memory will no + /// longer be valid the next time Step() or Reset() is called. No NUL + /// terminator is added. + absl::string_view ColumnStringUnsafe(int column) const TF_MUST_USE_RESULT { + return {static_cast(sqlite3_column_blob(stmt_, column)), + static_cast(ColumnSize(column))}; + } + + /// \brief Returns number of bytes stored at 0-indexed column. + int ColumnSize(int column) const TF_MUST_USE_RESULT { + return sqlite3_column_bytes(stmt_, column); + } + + /// \brief Move constructor, after which is reset to empty. + SqliteStatement(SqliteStatement&& other) noexcept + : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) { + other.db_ = nullptr; + other.stmt_ = nullptr; + other.bind_error_ = SQLITE_OK; + } + + /// \brief Move assignment, after which is reset to empty. + SqliteStatement& operator=(SqliteStatement&& other) noexcept { + if (&other != this) { + if (db_ != nullptr) db_->Unref(); + if (stmt_ != nullptr) sqlite3_finalize(stmt_); + db_ = other.db_; + stmt_ = other.stmt_; + bind_error_ = other.bind_error_; + size_ = other.size_; + other.db_ = nullptr; + other.stmt_ = nullptr; + other.bind_error_ = SQLITE_OK; + other.size_ = 0; + } + return *this; + } + + private: + friend class Sqlite; + + SqliteStatement(Sqlite* db, sqlite3_stmt* stmt) noexcept + : db_(db), stmt_(stmt) { + db_->Ref(); + } + + void Update(int rc, int parameter) { + // Binding strings can fail if they exceed length limit. + if (TF_PREDICT_FALSE(rc != SQLITE_OK)) { + if (bind_error_ == SQLITE_OK) { + bind_error_ = rc; + bind_error_parameter_ = parameter; + } + } + } + + int GetParameterIndex(const char* parameter) { + int index = sqlite3_bind_parameter_index(stmt_, parameter); + DCHECK(index > 0); // OK to compile away since it'll fail again + return index; + } + + Sqlite* db_ = nullptr; + sqlite3_stmt* stmt_ = nullptr; + int bind_error_ = SQLITE_OK; + int bind_error_parameter_ = 0; + uint64 size_ = 0; + + SqliteStatement(const SqliteStatement&) = delete; + void operator=(const SqliteStatement&) = delete; +}; + +/// \brief Reentrant SQLite connection object lock +/// +/// This is a no-op if SQLITE_OPEN_NOMUTEX was used. 
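Putting the pieces of the veneer above together, a hypothetical caller (the file path, table, and values are invented; error handling is reduced to TF_CHECK_OK and the *OrDie helpers declared above) might look like:

#include <cstdint>
#include <string>

#include "tensorflow/core/lib/db/sqlite.h"

void SqliteDemo() {
  tensorflow::Sqlite* db = nullptr;
  TF_CHECK_OK(tensorflow::Sqlite::Open(
      "/tmp/example.db", SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, &db));

  db->PrepareOrDie("CREATE TABLE IF NOT EXISTS t (id INTEGER, name TEXT)")
      .StepAndResetOrDie();

  auto insert = db->PrepareOrDie("INSERT INTO t (id, name) VALUES (?, ?)");
  insert.BindInt(1, 42);          // parameters are 1-indexed
  insert.BindText(2, "meaning");
  insert.StepAndResetOrDie();

  auto select = db->PrepareOrDie("SELECT id, name FROM t");
  bool is_done = false;
  TF_CHECK_OK(select.Step(&is_done));
  while (!is_done) {
    const int64_t id = select.ColumnInt(0);        // columns are 0-indexed
    const std::string name = select.ColumnString(1);
    (void)id;
    (void)name;
    TF_CHECK_OK(select.Step(&is_done));
  }
  db->Unref();  // connection is reference counted; live statements keep it alive
}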
+class TF_SCOPED_LOCKABLE SqliteLock { + public: + explicit SqliteLock(Sqlite& db) TF_EXCLUSIVE_LOCK_FUNCTION(db) + : mutex_(sqlite3_db_mutex(db.db_)) { + sqlite3_mutex_enter(mutex_); + } + SqliteLock(Sqlite& db, std::try_to_lock_t) TF_EXCLUSIVE_LOCK_FUNCTION(db) + : mutex_(sqlite3_db_mutex(db.db_)) { + if (TF_PREDICT_FALSE(sqlite3_mutex_try(mutex_) != SQLITE_OK)) { + is_locked_ = false; + } + } + ~SqliteLock() TF_UNLOCK_FUNCTION() { + if (is_locked_) sqlite3_mutex_leave(mutex_); + } + explicit operator bool() const { return is_locked_; } + + private: + sqlite3_mutex* const mutex_; + bool is_locked_ = true; + SqliteLock(const SqliteLock&) = delete; + void operator=(const SqliteLock&) = delete; +}; +#define SqliteLock(x) static_assert(0, "sqlite_lock_decl_missing_name"); + +/// \brief SQLite transaction scope. +/// +/// This class acquires an exclusive lock on the connection object (if +/// mutexes weren't disabled) and runs BEGIN / ROLLBACK automatically. +/// Unlike SqliteLock this scope is non-reentrant. To avoid program +/// crashes, business logic should use the TF_EXCLUSIVE_LOCK_FUNCTION and +/// TF_LOCKS_EXCLUDED annotations as much as possible. +class TF_SCOPED_LOCKABLE SqliteTransaction { + public: + /// \brief Locks db and begins deferred transaction. + /// + /// This will crash if a transaction is already active. + explicit SqliteTransaction(Sqlite& db) TF_EXCLUSIVE_LOCK_FUNCTION(db); + + /// \brief Runs ROLLBACK and unlocks. + ~SqliteTransaction() TF_UNLOCK_FUNCTION(); + + /// \brief Commits transaction. + /// + /// If this is successful, a new transaction will be started, which + /// is rolled back when exiting the scope. + absl::Status Commit(); + + private: + void Begin(); + Sqlite* const db_; + + SqliteTransaction(const SqliteTransaction&) = delete; + void operator=(const SqliteTransaction&) = delete; +}; + +#define SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(...) \ + TF_EXCLUSIVE_LOCKS_REQUIRED(__VA_ARGS__) +#define SQLITE_TRANSACTIONS_EXCLUDED(...) TF_LOCKS_EXCLUDED(__VA_ARGS__) + +inline SqliteStatement Sqlite::PrepareOrDie(const absl::string_view& sql) { + SqliteStatement stmt; + TF_CHECK_OK(Prepare(sql, &stmt)); + return stmt; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_DB_SQLITE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gif/gif_io.h b/third_party/tflite-hdrs/tensorflow/core/lib/gif/gif_io.h new file mode 100644 index 00000000..ae7d5125 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gif/gif_io.h @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Functions to read and write images in GIF format. +// +// The advantage over image/codec/png{enc,dec}oder.h is that this library +// supports both 8 and 16 bit images. +// +// The decoding routine accepts binary image data as a StringPiece. 
These are +// implicitly constructed from strings or char* so they're completely +// transparent to the caller. They're also very cheap to construct so this +// doesn't introduce any additional overhead. +// +// The primary benefit of StringPieces being, in this case, that APIs already +// returning StringPieces (e.g., Bigtable Scanner) or Cords (e.g., IOBuffer; +// only when they're flat, though) or protocol buffer fields typed to either of +// these can be decoded without copying the data into a C++ string. + +#ifndef TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_ +#define TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gif { + +uint8* Decode(const void* srcdata, int datasize, + const std::function& allocate_output, + string* error_string, bool expand_animations = true); + +} // namespace gif +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/array_slice.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/array_slice.h new file mode 100644 index 00000000..ddacf4d2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/array_slice.h @@ -0,0 +1,42 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_ +#define TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_ + +#include "absl/base/macros.h" +#include "absl/types/span.h" +// TODO(timshen): This is kept only because lots of targets transitively depend +// on it. Remove all targets' dependencies. +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace gtl { + +template +using ArraySlice ABSL_DEPRECATE_AND_INLINE() = absl::Span; + +template +using MutableArraySlice ABSL_DEPRECATE_AND_INLINE() = absl::Span; + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/cleanup.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/cleanup.h new file mode 100644 index 00000000..3e54f828 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/cleanup.h @@ -0,0 +1,113 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// MakeCleanup(f) returns an RAII cleanup object that calls 'f' in its
+// destructor. The easiest way to use MakeCleanup is with a lambda argument,
+// capturing the return value in an 'auto' local variable. Most users will not
+// need more sophisticated syntax than that.
+//
+// Example:
+//   void func() {
+//     FILE* fp = fopen("data.txt", "r");
+//     if (fp == nullptr) return;
+//     auto fp_cleaner = gtl::MakeCleanup([fp] { fclose(fp); });
+//     // No matter what, fclose(fp) will happen.
+//     DataObject d;
+//     while (ReadDataObject(fp, &d)) {
+//       if (d.IsBad()) {
+//         LOG(ERROR) << "Bad Data";
+//         return;
+//       }
+//       PushGoodData(d);
+//     }
+//   }
+//
+// You can use Cleanup<F> directly, instead of using MakeCleanup and auto,
+// but there's rarely a reason to do that.
+//
+// You can call 'release()' on a Cleanup object to cancel the cleanup.
+
+#ifndef TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
+#define TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
+
+#include <type_traits>
+#include <utility>
+
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace gtl {
+
+// A move-only RAII object that calls a stored cleanup functor when
+// destroyed. Cleanup<F> is the return type of gtl::MakeCleanup(F).
+template <typename F>
+class Cleanup {
+ public:
+  Cleanup() : released_(true), f_() {}
+
+  template <typename G>
+  explicit Cleanup(G&& f)          // NOLINT
+      : f_(std::forward<G>(f)) {}  // NOLINT(build/c++11)
+
+  Cleanup(Cleanup&& src)  // NOLINT
+      : released_(src.is_released()), f_(src.release()) {}
+
+  // Implicitly move-constructible from any compatible Cleanup<G>.
+  // The source will be released as if src.release() were called.
+  // A moved-from Cleanup can be safely destroyed or reassigned.
+  template <typename G>
+  Cleanup(Cleanup<G>&& src)  // NOLINT
+      : released_(src.is_released()), f_(src.release()) {}
+
+  // Assignment to a Cleanup object behaves like destroying it
+  // and making a new one in its place, analogous to unique_ptr
+  // semantics.
+  Cleanup& operator=(Cleanup&& src) {  // NOLINT
+    if (!released_) f_();
+    released_ = src.released_;
+    f_ = src.release();
+    return *this;
+  }
+
+  ~Cleanup() {
+    if (!released_) f_();
+  }
+
+  // Releases the cleanup function instead of running it.
+  // Hint: use c.release()() to run early.
+  F release() {
+    released_ = true;
+    return std::move(f_);
+  }
+
+  bool is_released() const { return released_; }
+
+ private:
+  static_assert(!std::is_reference<F>::value, "F must not be a reference");
+
+  bool released_ = false;
+  F f_;
+};
+
+template <typename F, typename DecayF = typename std::decay<F>::type>
+TF_MUST_USE_RESULT Cleanup<DecayF> MakeCleanup(F&& f) {
+  return Cleanup<DecayF>(std::forward<F>(f));
+}
+
+}  // namespace gtl
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/compactptrset.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/compactptrset.h
new file mode 100644
index 00000000..6655ac92
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/compactptrset.h
@@ -0,0 +1,29 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_ +#define TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_ + +#include "xla/tsl/lib/gtl/compactptrset.h" + +namespace tensorflow { +namespace gtl { + +using ::tsl::gtl::CompactPointerSet; // NOLINT(misc-unused-using-decls) + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/edit_distance.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/edit_distance.h new file mode 100644 index 00000000..94a5ad68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/edit_distance.h @@ -0,0 +1,108 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_EDIT_DISTANCE_H_ +#define TENSORFLOW_CORE_LIB_GTL_EDIT_DISTANCE_H_ + +#include + +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace tensorflow { +namespace gtl { + +// Calculate the Levenshtein Edit Distance between two contiguous +// sequences, s and t, of type T. +// +// The Levenshtein distance is a symmetric distance defined as the +// smallest number of insertions, deletions, and substitutions +// required to convert sequence s to t (and vice versa). +// Note, this distance does not consider transpositions. +// +// For more details and a reference implementation, see: +// https://en.wikipedia.org/wiki/Levenshtein_distance +// +// This implementation has time complexity O(|s|*|t|) +// and space complexity O(min(|s|, |t|)), where +// |x| := x.size() +// +// A simple call to LevenshteinDistance looks like: +// +// int64 dist = LevenshteinDistance("hi", "bye", std::equal_to()); +// +template +inline int64_t LevenshteinDistance(const gtl::ArraySlice s, + const gtl::ArraySlice t, const Cmp& cmp) { + const int64_t s_size = s.size(); + const int64_t t_size = t.size(); + + if (t_size > s_size) return LevenshteinDistance(t, s, cmp); + + const T* s_data = s.data(); + const T* t_data = t.data(); + + if (t_size == 0) return s_size; + if (s == t) return 0; + + // Create work vector + absl::InlinedVector scratch_holder(t_size); + + int64_t* scratch = scratch_holder.data(); + + // Special case for i = 0: Distance between empty string and string + // of length j is just j. 
+ for (size_t j = 1; j < t_size; ++j) scratch[j - 1] = j; + + for (size_t i = 1; i <= s_size; ++i) { + // Invariant: scratch[j - 1] equals cost(i - 1, j). + int substitution_base_cost = i - 1; + int insertion_cost = i + 1; + for (size_t j = 1; j <= t_size; ++j) { + // Invariants: + // scratch[k - 1] = cost(i, k) for 0 < k < j. + // scratch[k - 1] = cost(i - 1, k) for j <= k <= t_size. + // substitution_base_cost = cost(i - 1, j - 1) + // insertion_cost = cost(i, j - 1) + const int replacement_cost = cmp(s_data[i - 1], t_data[j - 1]) ? 0 : 1; + const int substitution_cost = substitution_base_cost + replacement_cost; + const int deletion_cost = scratch[j - 1] + 1; + + // Select the cheapest edit. + const int cheapest = // = cost(i, j) + std::min(deletion_cost, std::min(insertion_cost, substitution_cost)); + + // Restore invariant for the next iteration of the loop. + substitution_base_cost = scratch[j - 1]; // = cost(i - 1, j) + scratch[j - 1] = cheapest; // = cost(i, j) + insertion_cost = cheapest + 1; // = cost(i, j) + 1 + } + } + return scratch[t_size - 1]; +} + +template +inline int64_t LevenshteinDistance(const Container1& s, const Container2& t, + const Cmp& cmp) { + return LevenshteinDistance( + gtl::ArraySlice(s.data(), s.size()), + gtl::ArraySlice(t.data(), t.size()), + cmp); +} + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_EDIT_DISTANCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatmap.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatmap.h new file mode 100644 index 00000000..3b112a71 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatmap.h @@ -0,0 +1,33 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ +#define TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ + +#include "xla/tsl/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatrep.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { + +using tsl::gtl::FlatMap; // NOLINT(misc-unused-using-decls) + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatrep.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatrep.h new file mode 100644 index 00000000..59caa4b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatrep.h @@ -0,0 +1,31 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
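The single scratch-row recurrence that edit_distance.h documents is easier to follow outside the template machinery (whose angle-bracketed parameters were stripped in this rendering of the diff). The sketch below is not the vendored code, just a plain std::string restatement of the same O(|s|*|t|) time, O(min(|s|,|t|)) space algorithm; the function name is made up:

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Levenshtein distance using one scratch row. Mirrors the scheme described
// in edit_distance.h: iterate over the longer string row by row, keeping
// only cost(i, *) for the row currently being filled.
int64_t LevenshteinDistanceDemo(std::string s, std::string t) {
  if (t.size() > s.size()) std::swap(s, t);  // make t the shorter string
  if (t.empty()) return static_cast<int64_t>(s.size());

  // scratch[j] == cost(i, j) for the current row.
  std::vector<int64_t> scratch(t.size() + 1);
  for (size_t j = 0; j <= t.size(); ++j) scratch[j] = j;  // row i == 0

  for (size_t i = 1; i <= s.size(); ++i) {
    int64_t diag = scratch[0];  // cost(i - 1, j - 1)
    scratch[0] = i;             // cost(i, 0)
    for (size_t j = 1; j <= t.size(); ++j) {
      const int64_t up = scratch[j];  // cost(i - 1, j)
      const int64_t substitution = diag + (s[i - 1] == t[j - 1] ? 0 : 1);
      const int64_t deletion = up + 1;
      const int64_t insertion = scratch[j - 1] + 1;
      scratch[j] = std::min({substitution, deletion, insertion});
      diag = up;
    }
  }
  return scratch[t.size()];
}

// e.g. LevenshteinDistanceDemo("hi", "bye") == 3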
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ +#define TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ + +#include "xla/tsl/lib/gtl/flatrep.h" + +namespace tensorflow { +namespace gtl { +namespace internal { + +using tsl::gtl::internal::FlatRep; // NOLINT(misc-unused-using-decls) + +} // namespace internal +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatset.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatset.h new file mode 100644 index 00000000..fcb7ed96 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/flatset.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ +#define TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ + +#include "xla/tsl/lib/gtl/flatset.h" + +namespace tensorflow { +namespace gtl { + +using tsl::gtl::FlatSet; // NOLINT(misc-unused-using-decls) + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/inlined_vector.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/inlined_vector.h new file mode 100644 index 00000000..df9d1a24 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/inlined_vector.h @@ -0,0 +1,33 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_ +#define TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_ + +#include "xla/tsl/lib/gtl/inlined_vector.h" // IWYU pragma: export +// TODO(kramerb): This is kept only because lots of targets transitively depend +// on it. Remove all targets' dependencies. 
+#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { + +using ::tsl::gtl::InlinedVector; // NOLINT(misc-unused-using-decls) + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/int_type.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/int_type.h new file mode 100644 index 00000000..c161ee91 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/int_type.h @@ -0,0 +1,30 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + + +#ifndef TENSORFLOW_CORE_LIB_GTL_INT_TYPE_H_ +#define TENSORFLOW_CORE_LIB_GTL_INT_TYPE_H_ + +#include "xla/tsl/lib/gtl/int_type.h" + +namespace tensorflow { +namespace gtl { + +using ::tsl::gtl::IntType; // NOLINT(misc-unused-using-decls) + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_INT_TYPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/iterator_range.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/iterator_range.h new file mode 100644 index 00000000..ca980fd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/iterator_range.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This provides a very simple, boring adaptor for a begin and end iterator +// into a range type. This should be used to build range views that work well +// with range based for loops and range based constructors. +// +// Note that code here follows more standards-based coding conventions as it +// is mirroring proposed interfaces for standardization. +// +// Converted from chandlerc@'s code to Google style by joshl@. 
+ +#ifndef TENSORFLOW_CORE_LIB_GTL_ITERATOR_RANGE_H_ +#define TENSORFLOW_CORE_LIB_GTL_ITERATOR_RANGE_H_ + +#include "xla/tsl/lib/gtl/iterator_range.h" + +namespace tensorflow { +namespace gtl { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::gtl::iterator_range; +using ::tsl::gtl::make_range; +// NOLINTEND(misc-unused-using-decls) +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_ITERATOR_RANGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/manual_constructor.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/manual_constructor.h new file mode 100644 index 00000000..4431f5e1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/manual_constructor.h @@ -0,0 +1,245 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// ManualConstructor statically-allocates space in which to store some +// object, but does not initialize it. You can then call the constructor +// and destructor for the object yourself as you see fit. This is useful +// for memory management optimizations, where you want to initialize and +// destroy an object multiple times but only allocate it once. +// +// (When I say ManualConstructor statically allocates space, I mean that +// the ManualConstructor object itself is forced to be the right size.) + +#ifndef TENSORFLOW_CORE_LIB_GTL_MANUAL_CONSTRUCTOR_H_ +#define TENSORFLOW_CORE_LIB_GTL_MANUAL_CONSTRUCTOR_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mem.h" + +namespace tensorflow { +namespace gtl { +namespace internal { + +// +// Provides a char array with the exact same alignment as another type. The +// first parameter must be a complete type, the second parameter is how many +// of that type to provide space for. +// +// TF_LIB_GTL_ALIGNED_CHAR_ARRAY(struct stat, 16) storage_; +// +// Because MSVC and older GCCs require that the argument to their alignment +// construct to be a literal constant integer, we use a template instantiated +// at all the possible powers of two. 
+#ifndef SWIG +template +struct AlignType {}; +template +struct AlignType<0, size> { + typedef char result[size]; +}; +#if defined(_MSC_VER) +#define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X)) +#define TF_LIB_GTL_ALIGN_OF(T) __alignof(T) +#else +#define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __attribute__((aligned(X))) +#define TF_LIB_GTL_ALIGN_OF(T) __alignof__(T) +#endif + +#if defined(TF_LIB_GTL_ALIGN_ATTRIBUTE) + +#define TF_LIB_GTL_ALIGNTYPE_TEMPLATE(X) \ + template \ + struct AlignType { \ + typedef TF_LIB_GTL_ALIGN_ATTRIBUTE(X) char result[size]; \ + } + +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(1); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(2); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(4); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(8); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(16); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(32); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(64); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(128); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(256); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(512); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(1024); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(2048); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(4096); +TF_LIB_GTL_ALIGNTYPE_TEMPLATE(8192); +// Any larger and MSVC++ will complain. + +#define TF_LIB_GTL_ALIGNED_CHAR_ARRAY(T, Size) \ + typename tensorflow::gtl::internal::AlignType::result + +#undef TF_LIB_GTL_ALIGNTYPE_TEMPLATE +#undef TF_LIB_GTL_ALIGN_ATTRIBUTE + +#else // defined(TF_LIB_GTL_ALIGN_ATTRIBUTE) +#error "You must define TF_LIB_GTL_ALIGNED_CHAR_ARRAY for your compiler." +#endif // defined(TF_LIB_GTL_ALIGN_ATTRIBUTE) + +#else // !SWIG + +// SWIG can't represent alignment and doesn't care about alignment on data +// members (it works fine without it). +template +struct AlignType { + typedef char result[Size]; +}; +#define TF_LIB_GTL_ALIGNED_CHAR_ARRAY(T, Size) \ + tensorflow::gtl::internal::AlignType::result + +// Enough to parse with SWIG, will never be used by running code. +#define TF_LIB_GTL_ALIGN_OF(Type) 16 + +#endif // !SWIG + +} // namespace internal +} // namespace gtl + +template +class ManualConstructor { + public: + // No constructor or destructor because one of the most useful uses of + // this class is as part of a union, and members of a union cannot have + // constructors or destructors. And, anyway, the whole point of this + // class is to bypass these. + + // Support users creating arrays of ManualConstructor<>s. This ensures that + // the array itself has the correct alignment. + static void* operator new[](size_t size) { + return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type)); + } + static void operator delete[](void* mem) { port::AlignedFree(mem); } + + inline Type* get() { return reinterpret_cast(space_); } + inline const Type* get() const { + return reinterpret_cast(space_); + } + + inline Type* operator->() { return get(); } + inline const Type* operator->() const { return get(); } + + inline Type& operator*() { return *get(); } + inline const Type& operator*() const { return *get(); } + + inline void Init() { new (space_) Type; } + +// Init() constructs the Type instance using the given arguments +// (which are forwarded to Type's constructor). In C++11, Init() can +// take any number of arguments of any type, and forwards them perfectly. +// On pre-C++11 platforms, it can take up to 11 arguments, and may not be +// able to forward certain kinds of arguments. +// +// Note that Init() with no arguments performs default-initialization, +// not zero-initialization (i.e it behaves the same as "new Type;", not +// "new Type();"), so it will leave non-class types uninitialized. +#ifdef LANG_CXX11 + template + inline void Init(Ts&&... 
args) { // NOLINT + new (space_) Type(std::forward(args)...); // NOLINT + } +#else // !defined(LANG_CXX11) + template + inline void Init(const T1& p1) { + new (space_) Type(p1); + } + + template + inline void Init(const T1& p1, const T2& p2) { + new (space_) Type(p1, p2); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3) { + new (space_) Type(p1, p2, p3); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4) { + new (space_) Type(p1, p2, p3, p4); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5) { + new (space_) Type(p1, p2, p3, p4, p5); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6) { + new (space_) Type(p1, p2, p3, p4, p5, p6); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7) { + new (space_) Type(p1, p2, p3, p4, p5, p6, p7); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8) { + new (space_) Type(p1, p2, p3, p4, p5, p6, p7, p8); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9) { + new (space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9, const T10& p10) { + new (space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9, const T10& p10, const T11& p11) { + new (space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); + } +#endif // LANG_CXX11 + + inline void Destroy() { get()->~Type(); } + + private: + TF_LIB_GTL_ALIGNED_CHAR_ARRAY(Type, 1) space_; +}; + +#undef TF_LIB_GTL_ALIGNED_CHAR_ARRAY +#undef TF_LIB_GTL_ALIGN_OF + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_MANUAL_CONSTRUCTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/map_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/map_util.h new file mode 100644 index 00000000..47d28e7d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/map_util.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file provides utility functions for use with STL map-like data +// structures, such as std::map and hash_map. Some functions will also work with +// sets, such as ContainsKey(). 
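Because ManualConstructor deliberately never runs constructors or destructors itself, the calling pattern is easy to get wrong. A minimal sketch of the intended Init()/Destroy() pairing, assuming the vendored header is on the include path; the std::string payload is just an example:

#include <string>

#include "tensorflow/core/lib/gtl/manual_constructor.h"

void ManualConstructorDemo() {
  // Storage with the size and alignment of std::string, but no object yet;
  // a std::string only exists between Init() and Destroy().
  tensorflow::ManualConstructor<std::string> slot;

  slot.Init("hello");       // placement-new: std::string("hello")
  slot->append(", world");  // operator-> reaches the constructed object
  slot.Destroy();           // explicit destructor call

  slot.Init(3, 'x');        // the same storage can be reused: "xxx"
  slot.Destroy();
}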
+ +#ifndef TENSORFLOW_CORE_LIB_GTL_MAP_UTIL_H_ +#define TENSORFLOW_CORE_LIB_GTL_MAP_UTIL_H_ + +#include "xla/tsl/lib/gtl/map_util.h" + +namespace tensorflow { +namespace gtl { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::gtl::EraseKeyReturnValuePtr; +using ::tsl::gtl::FindOrNull; +using ::tsl::gtl::FindPtrOrNull; +using ::tsl::gtl::FindWithDefault; +using ::tsl::gtl::InsertIfNotPresent; +using ::tsl::gtl::InsertOrUpdate; +using ::tsl::gtl::LookupOrInsert; +using ::tsl::gtl::ReverseMap; +// NOLINTEND(misc-unused-using-decls) +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_MAP_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/priority_queue_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/priority_queue_util.h new file mode 100644 index 00000000..93bf3d30 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/priority_queue_util.h @@ -0,0 +1,55 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_ +#define TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_ + +#include +#include +#include + +namespace tensorflow { +namespace gtl { + +// Removes the top element from a std::priority_queue and returns it. +// Supports movable types. +template +T ConsumeTop(std::priority_queue* q) { + // std::priority_queue is required to implement pop() as if it + // called: + // std::pop_heap() + // c.pop_back() + // unfortunately, it does not provide access to the removed element. + // If the element is move only (such as a unique_ptr), there is no way to + // reclaim it in the standard API. std::priority_queue does, however, expose + // the underlying container as a protected member, so we use that access + // to extract the desired element between those two calls. + using Q = std::priority_queue; + struct Expose : Q { + using Q::c; + using Q::comp; + }; + auto& c = q->*&Expose::c; + auto& comp = q->*&Expose::comp; + std::pop_heap(c.begin(), c.end(), comp); + auto r = std::move(c.back()); + c.pop_back(); + return r; +} + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/subtle/map_traits.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/subtle/map_traits.h new file mode 100644 index 00000000..c4cca1fb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/subtle/map_traits.h @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
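The protected-member trick behind ConsumeTop is the interesting part of priority_queue_util.h, and it is easier to see with the template parameter lists that this rendering of the diff dropped. The sketch below restates it standalone (names suffixed Demo are made up), followed by a move-only usage that plain top()/pop() cannot express:

#include <algorithm>
#include <memory>
#include <queue>
#include <utility>
#include <vector>

// Pop the top element of a std::priority_queue and move it out. The derived
// "Expose" type re-exports the protected container (c) and comparator (comp)
// so we can pop_heap ourselves and salvage the element before pop_back().
template <typename T, typename Container, typename Cmp>
T ConsumeTopDemo(std::priority_queue<T, Container, Cmp>* q) {
  using Q = std::priority_queue<T, Container, Cmp>;
  struct Expose : Q {
    using Q::c;
    using Q::comp;
  };
  auto& c = q->*&Expose::c;
  auto& comp = q->*&Expose::comp;
  std::pop_heap(c.begin(), c.end(), comp);  // moves the top element to the back
  T r = std::move(c.back());
  c.pop_back();
  return r;
}

// Usage with a move-only element type.
struct PtrLess {
  bool operator()(const std::unique_ptr<int>& a,
                  const std::unique_ptr<int>& b) const {
    return *a < *b;
  }
};

std::unique_ptr<int> TakeLargest(
    std::priority_queue<std::unique_ptr<int>,
                        std::vector<std::unique_ptr<int>>, PtrLess>* q) {
  // top() returns a const reference and pop() discards the element, so a
  // unique_ptr could not be recovered through the standard interface.
  return ConsumeTopDemo(q);
}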
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Traits classes for performing uniform lookup on different map value types. +// +// The access is computed as follows: +// +// 1. If T has a `first` or `second` field, use them. +// 2. Otherwise if it has `key()` or `value()` methods, use them. +// 3. Otherwise the program is ill-formed. +#ifndef TENSORFLOW_CORE_LIB_GTL_SUBTLE_MAP_TRAITS_H_ +#define TENSORFLOW_CORE_LIB_GTL_SUBTLE_MAP_TRAITS_H_ + +#include "xla/tsl/lib/gtl/subtle/map_traits.h" + +namespace tensorflow { +namespace gtl { +namespace subtle { +namespace internal_map_traits { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::gtl::subtle::internal_map_traits::GetKey; +using ::tsl::gtl::subtle::internal_map_traits::GetMapped; +using ::tsl::gtl::subtle::internal_map_traits::Rank0; +using ::tsl::gtl::subtle::internal_map_traits::Rank1; +// NOLINTEND(misc-unused-using-decls) + +} // namespace internal_map_traits +} // namespace subtle +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_SUBTLE_MAP_TRAITS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/gtl/top_n.h b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/top_n.h new file mode 100644 index 00000000..1f871e61 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/gtl/top_n.h @@ -0,0 +1,336 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This simple class finds the top n elements of an incrementally provided set +// of elements which you push one at a time. If the number of elements exceeds +// n, the lowest elements are incrementally dropped. At the end you get +// a vector of the top elements sorted in descending order (through Extract() or +// ExtractNondestructive()), or a vector of the top elements but not sorted +// (through ExtractUnsorted() or ExtractUnsortedNondestructive()). +// +// The value n is specified in the constructor. If there are p elements pushed +// altogether: +// The total storage requirements are O(min(n, p)) elements +// The running time is O(p * log(min(n, p))) comparisons +// If n is a constant, the total storage required is a constant and the running +// time is linear in p. +// +// NOTE(zhifengc): There is a way to do this in O(min(n, p)) storage and O(p) +// runtime. The basic idea is to repeatedly fill up a buffer of 2 * n elements, +// discarding the lowest n elements whenever the buffer is full using a linear- +// time median algorithm. 
This may have better performance when the input +// sequence is partially sorted. +// +// NOTE(zhifengc): This class should be redesigned to avoid reallocating a +// vector for each Extract. + +#ifndef TENSORFLOW_CORE_LIB_GTL_TOP_N_H_ +#define TENSORFLOW_CORE_LIB_GTL_TOP_N_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace gtl { + +// Cmp is an stl binary predicate. Note that Cmp is the "greater" predicate, +// not the more commonly used "less" predicate. +// +// If you use a "less" predicate here, the TopN will pick out the bottom N +// elements out of the ones passed to it, and it will return them sorted in +// ascending order. +// +// TopN is rule-of-zero copyable and movable if its members are. +template > +class TopN { + public: + // The TopN is in one of the three states: + // + // o UNORDERED: this is the state an instance is originally in, + // where the elements are completely orderless. + // + // o BOTTOM_KNOWN: in this state, we keep the invariant that there + // is at least one element in it, and the lowest element is at + // position 0. The elements in other positions remain + // unsorted. This state is reached if the state was originally + // UNORDERED and a peek_bottom() function call is invoked. + // + // o HEAP_SORTED: in this state, the array is kept as a heap and + // there are exactly limit_ elements in the array. This + // state is reached when at least (limit_+1) elements are + // pushed in. + // + // The state transition graph is at follows: + // + // peek_bottom() (limit_+1) elements pushed + // UNORDERED --------------> BOTTOM_KNOWN --------------------> HEAP_SORTED + // | ^ + // | (limit_+1) elements pushed | + // +-----------------------------------------------------------+ + + enum State { UNORDERED, BOTTOM_KNOWN, HEAP_SORTED }; + using UnsortedIterator = typename std::vector::const_iterator; + + // 'limit' is the maximum number of top results to return. + explicit TopN(size_t limit) : TopN(limit, Cmp()) {} + TopN(size_t limit, const Cmp &cmp) : limit_(limit), cmp_(cmp) {} + + size_t limit() const { return limit_; } + + // Number of elements currently held by this TopN object. This + // will be no greater than 'limit' passed to the constructor. + size_t size() const { return elements_.size(); } + + bool empty() const { return size() == 0; } + + // If you know how many elements you will push at the time you create the + // TopN object, you can call reserve to preallocate the memory that TopN + // will need to process all 'n' pushes. Calling this method is optional. + void reserve(size_t n) { + // We may need limit_+1 for the case where we transition from an unsorted + // set of limit_ elements to a heap. + elements_.reserve(std::min(n, limit_ + 1)); + } + + // Push 'v'. If the maximum number of elements was exceeded, drop the + // lowest element and return it in 'dropped' (if given). If the maximum is not + // exceeded, 'dropped' will remain unchanged. 'dropped' may be omitted or + // nullptr, in which case it is not filled in. + // Requires: T is CopyAssignable, Swappable + void push(const T &v) { push(v, nullptr); } + void push(const T &v, T *dropped) { PushInternal(v, dropped); } + + // Move overloads of push. 
+ // Requires: T is MoveAssignable, Swappable + void push(T &&v) { // NOLINT(build/c++11) + push(std::move(v), nullptr); + } + void push(T &&v, T *dropped) { // NOLINT(build/c++11) + PushInternal(std::move(v), dropped); + } + + // Peeks the bottom result without calling Extract() + const T &peek_bottom(); + + // Extract the elements as a vector sorted in descending order. The caller + // assumes ownership of the vector and must delete it when done. This is a + // destructive operation. The only method that can be called immediately + // after Extract() is Reset(). + std::vector *Extract(); + + // Similar to Extract(), but makes no guarantees the elements are in sorted + // order. As with Extract(), the caller assumes ownership of the vector and + // must delete it when done. This is a destructive operation. The only + // method that can be called immediately after ExtractUnsorted() is Reset(). + std::vector *ExtractUnsorted(); + + // A non-destructive version of Extract(). Copy the elements in a new vector + // sorted in descending order and return it. The caller assumes ownership of + // the new vector and must delete it when done. After calling + // ExtractNondestructive(), the caller can continue to push() new elements. + std::vector *ExtractNondestructive() const; + + // A non-destructive version of Extract(). Copy the elements to a given + // vector sorted in descending order. After calling + // ExtractNondestructive(), the caller can continue to push() new elements. + // Note: + // 1. The given argument must to be allocated. + // 2. Any data contained in the vector prior to the call will be deleted + // from it. After the call the vector will contain only the elements + // from the data structure. + void ExtractNondestructive(std::vector *output) const; + + // A non-destructive version of ExtractUnsorted(). Copy the elements in a new + // vector and return it, with no guarantees the elements are in sorted order. + // The caller assumes ownership of the new vector and must delete it when + // done. After calling ExtractUnsortedNondestructive(), the caller can + // continue to push() new elements. + std::vector *ExtractUnsortedNondestructive() const; + + // A non-destructive version of ExtractUnsorted(). Copy the elements into + // a given vector, with no guarantees the elements are in sorted order. + // After calling ExtractUnsortedNondestructive(), the caller can continue + // to push() new elements. + // Note: + // 1. The given argument must to be allocated. + // 2. Any data contained in the vector prior to the call will be deleted + // from it. After the call the vector will contain only the elements + // from the data structure. + void ExtractUnsortedNondestructive(std::vector *output) const; + + // Return an iterator to the beginning (end) of the container, + // with no guarantees about the order of iteration. These iterators are + // invalidated by mutation of the data structure. + UnsortedIterator unsorted_begin() const { return elements_.begin(); } + UnsortedIterator unsorted_end() const { return elements_.end(); } + + // Accessor for comparator template argument. + Cmp *comparator() { return &cmp_; } + + // This removes all elements. If Extract() or ExtractUnsorted() have been + // called, this will put it back in an empty but useable state. 
+ void Reset(); + + private: + template + void PushInternal(U &&v, T *dropped); // NOLINT(build/c++11) + + // elements_ can be in one of two states: + // elements_.size() <= limit_ && state_ != HEAP_SORTED: + // elements_ is an unsorted vector of elements pushed so far. + // elements_.size() == limit_ && state_ == HEAP_SORTED: + // elements_ is an stl heap. + std::vector elements_; + size_t limit_; // Maximum number of elements to find + Cmp cmp_; // Greater-than comparison function + State state_ = UNORDERED; +}; + +// ---------------------------------------------------------------------- +// Implementations of non-inline functions + +template +template +void TopN::PushInternal(U &&v, T *dropped) { // NOLINT(build/c++11) + if (limit_ == 0) { + if (dropped) *dropped = std::forward(v); // NOLINT(build/c++11) + return; + } + if (state_ != HEAP_SORTED) { + // We may temporarily extend one beyond limit_ elements here. This is + // necessary for finding and removing the smallest element. + elements_.push_back(std::forward(v)); // NOLINT(build/c++11) + if (elements_.size() == limit_ + 1) { + // Transition from unsorted vector to a heap. + std::make_heap(elements_.begin(), elements_.end(), cmp_); + std::pop_heap(elements_.begin(), elements_.end(), cmp_); + if (dropped) *dropped = std::move(elements_.back()); + elements_.pop_back(); // Restore to size limit_. + state_ = HEAP_SORTED; + } else if (state_ == UNORDERED || + cmp_(elements_.back(), elements_.front())) { + // Easy case: we just push the new element back + } else { + // To maintain the BOTTOM_KNOWN state, we need to make sure that + // the element at position 0 is always the smallest. So we put + // the new element at position 0 and push the original bottom + // element in the back. + // Warning: this code is subtle. + using std::swap; + swap(elements_.front(), elements_.back()); + } + + } else { + // Only insert the new element if it is greater than the least element. + if (cmp_(v, elements_.front())) { + // Remove the top (smallest) element of the min heap, then push the new + // value in. 
+ std::pop_heap(elements_.begin(), elements_.end(), cmp_); + if (dropped) *dropped = std::move(elements_.back()); + elements_.back() = std::forward(v); + std::push_heap(elements_.begin(), elements_.end(), cmp_); + } else { + if (dropped) *dropped = std::forward(v); // NOLINT(build/c++11) + } + } +} + +template +const T &TopN::peek_bottom() { + CHECK(!empty()); + if (state_ == UNORDERED) { + // We need to do a linear scan to find out the bottom element + int min_candidate = 0; + for (size_t i = 1; i < elements_.size(); ++i) { + if (cmp_(elements_[min_candidate], elements_[i])) { + min_candidate = i; + } + } + // By swapping the element at position 0 and the minimal + // element, we transition to the BOTTOM_KNOWN state + if (min_candidate != 0) { + using std::swap; + swap(elements_[0], elements_[min_candidate]); + } + state_ = BOTTOM_KNOWN; + } + return elements_.front(); +} + +template +std::vector *TopN::Extract() { + auto out = new std::vector; + out->swap(elements_); + if (state_ != HEAP_SORTED) { + std::sort(out->begin(), out->end(), cmp_); + } else { + std::sort_heap(out->begin(), out->end(), cmp_); + } + return out; +} + +template +std::vector *TopN::ExtractUnsorted() { + auto out = new std::vector; + out->swap(elements_); + return out; +} + +template +std::vector *TopN::ExtractNondestructive() const { + auto out = new std::vector; + ExtractNondestructive(out); + return out; +} + +template +void TopN::ExtractNondestructive(std::vector *output) const { + CHECK(output); + *output = elements_; + if (state_ != HEAP_SORTED) { + std::sort(output->begin(), output->end(), cmp_); + } else { + std::sort_heap(output->begin(), output->end(), cmp_); + } +} + +template +std::vector *TopN::ExtractUnsortedNondestructive() const { + auto elements = new std::vector; + ExtractUnsortedNondestructive(elements); + return elements; +} + +template +void TopN::ExtractUnsortedNondestructive(std::vector *output) const { + CHECK(output); + *output = elements_; +} + +template +void TopN::Reset() { + elements_.clear(); + state_ = UNORDERED; +} + +} // namespace gtl +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_GTL_TOP_N_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/hash/crc32c.h b/third_party/tflite-hdrs/tensorflow/core/lib/hash/crc32c.h new file mode 100644 index 00000000..7e8c8307 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/hash/crc32c.h @@ -0,0 +1,38 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
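A compact usage sketch of the push/Extract workflow that top_n.h describes, assuming the vendored header is on the include path; the limit and input values are arbitrary. Note that Extract() returns a heap-allocated vector the caller must own:

#include <memory>
#include <vector>

#include "tensorflow/core/lib/gtl/top_n.h"

std::vector<int> ThreeLargest(const std::vector<int>& values) {
  // The default comparator is the "greater" predicate, so this keeps the
  // three largest values and Extract() returns them in descending order.
  tensorflow::gtl::TopN<int> top(3);
  top.reserve(values.size());        // optional preallocation
  for (int v : values) top.push(v);  // lowest elements are dropped past the limit

  // Extract() is destructive; only Reset() may be called on `top` afterwards.
  std::unique_ptr<std::vector<int>> out(top.Extract());
  return *out;  // e.g. {9, 7, 5} for input {5, 1, 9, 3, 7}
}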
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_HASH_CRC32C_H_ +#define TENSORFLOW_CORE_LIB_HASH_CRC32C_H_ + +#include + +#include "xla/tsl/lib/hash/crc32c.h" +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace crc32c { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::crc32c::Extend; +using tsl::crc32c::kMaskDelta; +using tsl::crc32c::Mask; +using tsl::crc32c::Unmask; +using tsl::crc32c::Value; +// NOLINTEND(misc-unused-using-decls) +} // namespace crc32c +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_HASH_CRC32C_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/hash/hash.h b/third_party/tflite-hdrs/tensorflow/core/lib/hash/hash.h new file mode 100644 index 00000000..fa2cc295 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/hash/hash.h @@ -0,0 +1,23 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Simple hash functions used for internal data structures + +#ifndef TENSORFLOW_CORE_LIB_HASH_HASH_H_ +#define TENSORFLOW_CORE_LIB_HASH_HASH_H_ + +#include "tensorflow/core/platform/hash.h" + +#endif // TENSORFLOW_CORE_LIB_HASH_HASH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/histogram/histogram.h b/third_party/tflite-hdrs/tensorflow/core/lib/histogram/histogram.h new file mode 100644 index 00000000..281e190f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/histogram/histogram.h @@ -0,0 +1,41 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_ +#define TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_ + +#include +#include + +#include "xla/tsl/lib/histogram/histogram.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +using tsl::HistogramProto; // NOLINT + +namespace histogram { + +using tsl::histogram::Histogram; // NOLINT +using tsl::histogram::ThreadSafeHistogram; // NOLINT + +} // namespace histogram +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/block.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/block.h new file mode 100644 index 00000000..d3cfb88f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/block.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_BLOCK_H_ +#define TENSORFLOW_CORE_LIB_IO_BLOCK_H_ + +#include "xla/tsl/lib/io/block.h" +#include "tensorflow/core/lib/io/iterator.h" + +namespace tensorflow { +namespace table { +using tsl::table::Block; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_BLOCK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/block_builder.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/block_builder.h new file mode 100644 index 00000000..b47278cb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/block_builder.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_BLOCK_BUILDER_H_ +#define TENSORFLOW_CORE_LIB_IO_BLOCK_BUILDER_H_ + +#include "xla/tsl/lib/io/block_builder.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace table { +using tsl::table::BlockBuilder; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_BLOCK_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/buffered_inputstream.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/buffered_inputstream.h new file mode 100644 index 00000000..15023e6a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/buffered_inputstream.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_ +#define TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_ + +#include "xla/tsl/lib/io/buffered_inputstream.h" +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { +namespace io { +using tsl::io::BufferedInputStream; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/cache.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/cache.h new file mode 100644 index 00000000..3afd011f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/cache.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_CACHE_H_ +#define TENSORFLOW_CORE_LIB_IO_CACHE_H_ + +#include "xla/tsl/lib/io/cache.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +using tsl::Slice; // NOLINT(misc-unused-using-decls) +namespace table { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::table::Cache; +using tsl::table::NewLRUCache; +// NOLINTEND(misc-unused-using-decls) +} // namespace table +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/compression.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/compression.h new file mode 100644 index 00000000..628de375 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/compression.h @@ -0,0 +1,34 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_ +#define TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_ + +#include "xla/tsl/lib/io/compression.h" + +namespace tensorflow { +namespace io { +namespace compression { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::io::compression::kGzip; +using tsl::io::compression::kNone; +using tsl::io::compression::kSnappy; +using tsl::io::compression::kZlib; +// NOLINTEND(misc-unused-using-decls) +} // namespace compression +} // namespace io +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/format.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/format.h new file mode 100644 index 00000000..49f96d19 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/format.h @@ -0,0 +1,36 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_FORMAT_H_ +#define TENSORFLOW_CORE_LIB_IO_FORMAT_H_ + +#include "xla/tsl/lib/io/format.h" +#include "tensorflow/core/lib/io/table_builder.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace table { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::table::BlockContents; +using tsl::table::BlockHandle; +using tsl::table::kBlockTrailerSize; +using tsl::table::kTableMagicNumber; +using tsl::table::ReadBlock; +// NOLINTEND(misc-unused-using-decls) +} // namespace table +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_FORMAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/inputbuffer.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/inputbuffer.h new file mode 100644 index 00000000..2573a816 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/inputbuffer.h @@ -0,0 +1,32 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_INPUTBUFFER_H_ +#define TENSORFLOW_CORE_LIB_IO_INPUTBUFFER_H_ + +#include "xla/tsl/lib/io/inputbuffer.h" +#include "tensorflow/core/platform/coding.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +using tsl::io::InputBuffer; // NOLINT(misc-unused-using-decls) +} +} + +#endif // TENSORFLOW_CORE_LIB_IO_INPUTBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/inputstream_interface.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/inputstream_interface.h new file mode 100644 index 00000000..f38489d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/inputstream_interface.h @@ -0,0 +1,31 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_ +#define TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_ + +#include "xla/tsl/lib/io/inputstream_interface.h" +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +using tsl::io::InputStreamInterface; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/iterator.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/iterator.h new file mode 100644 index 00000000..4f3c0960 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/iterator.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. + +#ifndef TENSORFLOW_CORE_LIB_IO_ITERATOR_H_ +#define TENSORFLOW_CORE_LIB_IO_ITERATOR_H_ + +#include "xla/tsl/lib/io/iterator.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace table { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::table::Iterator; +using tsl::table::NewEmptyIterator; +using tsl::table::NewErrorIterator; +// NOLINTEND(misc-unused-using-decls) +} // namespace table +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_ITERATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/path.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/path.h new file mode 100644 index 00000000..f5deacd1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/path.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_PATH_H_ +#define TENSORFLOW_CORE_LIB_IO_PATH_H_ + +#include "tensorflow/core/platform/path.h" + +#endif // TENSORFLOW_CORE_LIB_IO_PATH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/proto_encode_helper.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/proto_encode_helper.h new file mode 100644 index 00000000..8ca1d5be --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/proto_encode_helper.h @@ -0,0 +1,31 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_ +#define TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_ + +#include "xla/tsl/lib/io/proto_encode_helper.h" +#include "tensorflow/core/platform/coding.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace io { +using tsl::io::ProtoEncodeHelper; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/random_inputstream.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/random_inputstream.h new file mode 100644 index 00000000..70651bc6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/random_inputstream.h @@ -0,0 +1,30 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_ +#define TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_ + +#include "xla/tsl/lib/io/random_inputstream.h" +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { +namespace io { +using tsl::io::RandomAccessInputStream; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/record_reader.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/record_reader.h new file mode 100644 index 00000000..c2a06c6b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/record_reader.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_ +#define TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_ + +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/stringpiece.h" +#if !defined(IS_SLIM_BUILD) +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/lib/io/zlib_inputstream.h" +#endif // IS_SLIM_BUILD +#include "xla/tsl/lib/io/record_reader.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::io::RecordReader; +using tsl::io::RecordReaderOptions; +using tsl::io::SequentialRecordReader; +// NOLINTEND(misc-unused-using-decls) +} // namespace io +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/record_writer.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/record_writer.h new file mode 100644 index 00000000..602de00e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/record_writer.h @@ -0,0 +1,41 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_ +#define TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_ + +#include "tensorflow/core/lib/hash/crc32c.h" +#include "tensorflow/core/platform/coding.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#if !defined(IS_SLIM_BUILD) +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/lib/io/zlib_outputbuffer.h" +#endif // IS_SLIM_BUILD +#include "xla/tsl/lib/io/record_writer.h" +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::io::RecordWriter; +using tsl::io::RecordWriterOptions; +// NOLINTEND(misc-unused-using-decls) +} // namespace io +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/table.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/table.h new file mode 100644 index 00000000..0045829a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/table.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_H_ +#define TENSORFLOW_CORE_LIB_IO_TABLE_H_ + +#include "xla/tsl/lib/io/table.h" +#include "tensorflow/core/lib/io/iterator.h" + +namespace tensorflow { +namespace table { +using tsl::table::Table; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_TABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/table_builder.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/table_builder.h new file mode 100644 index 00000000..52e27e9a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/table_builder.h @@ -0,0 +1,38 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). 
+// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. + +#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_ +#define TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_ + +#include "xla/tsl/lib/io/table_builder.h" +#include "tensorflow/core/lib/io/table_options.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace table { +using tsl::table::TableBuilder; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/table_options.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/table_options.h new file mode 100644 index 00000000..c16d4aca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/table_options.h @@ -0,0 +1,32 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_ +#define TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_ + +#include "xla/tsl/lib/io/table_options.h" + +namespace tensorflow { +namespace table { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::table::CompressionType; +using tsl::table::kNoCompression; +using tsl::table::kSnappyCompression; +using tsl::table::Options; +// NOLINTEND(misc-unused-using-decls) +} // namespace table +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/two_level_iterator.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/two_level_iterator.h new file mode 100644 index 00000000..c2b94de7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/two_level_iterator.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
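A quick usage sketch for the record_reader.h / record_writer.h shims introduced above (TFRecord-style I/O). This is illustrative only: it assumes the usual Env/WritableFile plumbing from tensorflow/core/platform/env.h and reduces error handling to TF_CHECK_OK.

```cpp
#include <memory>
#include <string>

#include "tensorflow/core/lib/io/record_reader.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/status.h"

// Round-trips one record through the forwarded TFRecord classes.
void RecordRoundTrip(const std::string& fname) {
  tensorflow::Env* env = tensorflow::Env::Default();

  // Write a single record.
  std::unique_ptr<tensorflow::WritableFile> out;
  TF_CHECK_OK(env->NewWritableFile(fname, &out));
  tensorflow::io::RecordWriter writer(out.get());  // tsl::io::RecordWriter via the shim
  TF_CHECK_OK(writer.WriteRecord("hello tfrecord"));
  TF_CHECK_OK(writer.Close());

  // Read it back sequentially.
  std::unique_ptr<tensorflow::RandomAccessFile> in;
  TF_CHECK_OK(env->NewRandomAccessFile(fname, &in));
  tensorflow::io::SequentialRecordReader reader(in.get());
  tensorflow::tstring record;
  TF_CHECK_OK(reader.ReadRecord(&record));
}
```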
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_TWO_LEVEL_ITERATOR_H_ +#define TENSORFLOW_CORE_LIB_IO_TWO_LEVEL_ITERATOR_H_ + +#include "xla/tsl/lib/io/two_level_iterator.h" +#include "tensorflow/core/lib/io/iterator.h" + +namespace tensorflow { +namespace table { +using tsl::table::NewTwoLevelIterator; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_TWO_LEVEL_ITERATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_compression_options.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_compression_options.h new file mode 100644 index 00000000..a0d43378 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_compression_options.h @@ -0,0 +1,28 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_ZLIB_COMPRESSION_OPTIONS_H_ +#define TENSORFLOW_CORE_LIB_IO_ZLIB_COMPRESSION_OPTIONS_H_ + +#include "xla/tsl/lib/io/zlib_compression_options.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +using tsl::io::ZlibCompressionOptions; // NOLINT(misc-unused-using-decls) +} +} + +#endif // TENSORFLOW_CORE_LIB_IO_ZLIB_COMPRESSION_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_inputstream.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_inputstream.h new file mode 100644 index 00000000..086493e3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_inputstream.h @@ -0,0 +1,33 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_ZLIB_INPUTSTREAM_H_ +#define TENSORFLOW_CORE_LIB_IO_ZLIB_INPUTSTREAM_H_ + +#include "xla/tsl/lib/io/zlib_inputstream.h" +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +using tsl::io::ZlibInputStream; // NOLINT(misc-unused-using-decls); +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_ZLIB_INPUTSTREAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_outputbuffer.h b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_outputbuffer.h new file mode 100644 index 00000000..7d3950f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/io/zlib_outputbuffer.h @@ -0,0 +1,34 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_IO_ZLIB_OUTPUTBUFFER_H_ +#define TENSORFLOW_CORE_LIB_IO_ZLIB_OUTPUTBUFFER_H_ + +#include "xla/tsl/lib/io/zlib_outputbuffer.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace io { +using tsl::io::ZlibOutputBuffer; // NOLINT(misc-unused-using-decls) +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_IO_ZLIB_OUTPUTBUFFER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_handle.h b/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_handle.h new file mode 100644 index 00000000..8b2dd418 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_handle.h @@ -0,0 +1,61 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares the functions and structures for memory I/O with libjpeg +// These functions are not meant to be used directly, see jpeg_mem.h instead. 
+ +#ifndef TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_ +#define TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_ + +#include "tensorflow/core/platform/jpeg.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace jpeg { + +// Handler for fatal JPEG library errors: clean up & return +void CatchError(j_common_ptr cinfo); + +typedef struct { + struct jpeg_destination_mgr pub; + JOCTET *buffer; + int bufsize; + int datacount; + tstring *dest; +} MemDestMgr; + +typedef struct { + struct jpeg_source_mgr pub; + const unsigned char *data; + unsigned long int datasize; + bool try_recover_truncated_jpeg; +} MemSourceMgr; + +void SetSrc(j_decompress_ptr cinfo, const void *data, + unsigned long int datasize, bool try_recover_truncated_jpeg); + +// JPEG destination: we will store all the data in a buffer "buffer" of total +// size "bufsize", if the buffer overflows, we will be in trouble. +void SetDest(j_compress_ptr cinfo, void *buffer, int bufsize); +// Same as above, except that buffer is only used as a temporary structure and +// is emptied into "destination" as soon as it fills up. +void SetDest(j_compress_ptr cinfo, void *buffer, int bufsize, + tstring *destination); + +} // namespace jpeg +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_mem.h b/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_mem.h new file mode 100644 index 00000000..200e129b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/jpeg/jpeg_mem.h @@ -0,0 +1,163 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines functions to compress and uncompress JPEG files +// to and from memory. It provides interfaces for raw images +// (data array and size fields). +// Direct manipulation of JPEG strings are supplied: Flip, Rotate, Crop.. + +#ifndef TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_ +#define TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_ + +#include +#include + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/jpeg.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace jpeg { + +// Flags for Uncompress +struct UncompressFlags { + // ratio can be 1, 2, 4, or 8 and represent the denominator for the scaling + // factor (eg ratio = 4 means that the resulting image will be at 1/4 original + // size in both directions). + int ratio = 1; + + // The number of bytes per pixel (1, 3 or 4), or 0 for autodetect. + int components = 0; + + // If true, decoder will use a slower but nicer upscaling of the chroma + // planes (yuv420/422 only). 
+ bool fancy_upscaling = true; + + // If true, will attempt to fill in missing lines of truncated files + bool try_recover_truncated_jpeg = false; + + // The minimum required fraction of lines read before the image is accepted. + float min_acceptable_fraction = 1.0; + + // The distance in bytes from one scanline to the other. Should be at least + // equal to width*components*sizeof(JSAMPLE). If 0 is passed, the stride + // used will be this minimal value. + int stride = 0; + + // Setting of J_DCT_METHOD enum in jpeglib.h, for choosing which + // algorithm to use for DCT/IDCT. + // + // Setting this has a quality/speed trade-off implication. + J_DCT_METHOD dct_method = JDCT_DEFAULT; + + // Settings of crop window before decompression. + bool crop = false; + // Vertical coordinate of the top-left corner of the result in the input. + int crop_x = 0; + // Horizontal coordinate of the top-left corner of the result in the input. + int crop_y = 0; + // Width of the output image. + int crop_width = 0; + // Height of the output image. + int crop_height = 0; +}; + +// Uncompress some raw JPEG data given by the pointer srcdata and the length +// datasize. +// - width and height are the address where to store the size of the +// uncompressed image in pixels. May be nullptr. +// - components is the address where the number of read components are +// stored. This is *output only*: to request a specific number of +// components use flags.components. May be nullptr. +// - nwarn is the address in which to store the number of warnings. +// May be nullptr. +// The function returns a pointer to the raw uncompressed data or NULL if +// there was an error. The caller of the function is responsible for +// freeing the memory (using delete []). +uint8* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int* width, int* height, + int* components, // Output only: useful with autodetect + int64_t* nwarn); + +// Version of Uncompress that allocates memory via a callback. The callback +// arguments are (width, height, components). If the size is known ahead of +// time this function can return an existing buffer; passing a callback allows +// the buffer to be shaped based on the JPEG header. The caller is responsible +// for freeing the memory *even along error paths*. +uint8* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int64_t* nwarn, + std::function<uint8*(int, int, int)> allocate_output); + +// Read jpeg header and get image information. Returns true on success. +// The width, height, and components points may be null. +bool GetImageInfo(const void* srcdata, int datasize, int* width, int* height, + int* components); + +// Note: (format & 0xff) = number of components (<=> bytes per pixels) +enum Format { + FORMAT_GRAYSCALE = 0x001, // 1 byte/pixel + FORMAT_RGB = 0x003, // 3 bytes/pixel RGBRGBRGBRGB... + FORMAT_RGBA = 0x004, // 4 bytes/pixel RGBARGBARGBARGBA... + FORMAT_ABGR = 0x104 // 4 bytes/pixel ABGRABGRABGR...
+}; + +// Flags for compression +struct CompressFlags { + // Encoding of the input data for compression + Format format; + + // Quality of the compression from 0-100 + int quality = 95; + + // If true, create a jpeg image that loads progressively + bool progressive = false; + + // If true, reduce jpeg size without changing quality (at the cost of CPU/RAM) + bool optimize_jpeg_size = false; + + // See http://en.wikipedia.org/wiki/Chroma_subsampling + bool chroma_downsampling = true; + + // Resolution + int density_unit = 1; // 1 = in, 2 = cm + int x_density = 300; + int y_density = 300; + + // If not empty, embed this XMP metadata in the image header + absl::string_view xmp_metadata; + + // The distance in bytes from one scanline to the other. Should be at least + // equal to width*components*sizeof(JSAMPLE). If 0 is passed, the stride + // used will be this minimal value. + int stride = 0; +}; + +// Compress some raw image given in srcdata, the data is a 2D array of size +// stride*height with one of the formats enumerated above. +// The encoded data is returned as a string. +// If not empty, XMP metadata can be embedded in the image header +// On error, returns the empty string (which is never a valid jpeg). +tstring Compress(const void* srcdata, int width, int height, + const CompressFlags& flags); + +// On error, returns false and sets output to empty. +bool Compress(const void* srcdata, int width, int height, + const CompressFlags& flags, tstring* output); + +} // namespace jpeg +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/llvm_rtti/llvm_rtti.h b/third_party/tflite-hdrs/tensorflow/core/lib/llvm_rtti/llvm_rtti.h new file mode 100644 index 00000000..a159e76c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/llvm_rtti/llvm_rtti.h @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_LLVM_RTTI_LLVM_RTTI_H_ +#define TENSORFLOW_CORE_LIB_LLVM_RTTI_LLVM_RTTI_H_ + +#include "llvm/Support/Casting.h" + +namespace tensorflow { +using llvm::dyn_cast; +using llvm::isa; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_LLVM_RTTI_LLVM_RTTI_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/math/math_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/math/math_util.h new file mode 100644 index 00000000..39bae7f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/math/math_util.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
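The jpeg_mem.h declarations above (UncompressFlags, Uncompress, CompressFlags, Compress) are enough for a small in-memory round trip. A minimal sketch, with the input buffer and error handling left as placeholders:

```cpp
#include <cstdint>

#include "tensorflow/core/lib/jpeg/jpeg_mem.h"

// Decodes an in-memory JPEG to RGB pixels and re-encodes it at quality 90.
void JpegRoundTrip(const void* jpeg_bytes, int jpeg_len) {
  namespace jpeg = tensorflow::jpeg;

  jpeg::UncompressFlags uflags;
  uflags.components = 3;  // request RGB output
  int width = 0, height = 0, components = 0;
  int64_t nwarn = 0;
  tensorflow::uint8* pixels = jpeg::Uncompress(jpeg_bytes, jpeg_len, uflags,
                                               &width, &height, &components,
                                               &nwarn);
  if (pixels == nullptr) return;  // decode failed

  jpeg::CompressFlags cflags;
  cflags.format = jpeg::FORMAT_RGB;
  cflags.quality = 90;
  tensorflow::tstring encoded = jpeg::Compress(pixels, width, height, cflags);
  (void)encoded;

  delete[] pixels;  // Uncompress allocates with new[]; the caller frees
}
```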
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_ +#define TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_ + +#include "xla/tsl/lib/math/math_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::MathUtil; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader-inl.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader-inl.h new file mode 100644 index 00000000..f7be2b62 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader-inl.h @@ -0,0 +1,48 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_INL_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_INL_H_ + +#include +#include +#include +#include +#include + +#include "xla/tsl/lib/monitoring/cell_reader-inl.h" +#include "tensorflow/core/lib/monitoring/collected_metrics.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/test_utils.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +namespace testing { +namespace internal { +using tsl::monitoring::testing::internal::CollectMetrics; +using tsl::monitoring::testing::internal::GetDelta; +using tsl::monitoring::testing::internal::GetLatestPoint; +using tsl::monitoring::testing::internal::GetLatestValueOrDefault; +using tsl::monitoring::testing::internal::GetMetricKind; +using tsl::monitoring::testing::internal::GetPoints; +using tsl::monitoring::testing::internal::GetValue; +} // namespace internal +} // namespace testing +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_INL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader.h new file mode 100644 index 00000000..fead3ceb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/cell_reader.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "xla/tsl/lib/monitoring/cell_reader.h" +#include "tensorflow/core/lib/monitoring/cell_reader-inl.h" +#include "tensorflow/core/lib/monitoring/collected_metrics.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +namespace testing { +using tsl::monitoring::testing::CellReader; +} // namespace testing +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_CELL_READER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collected_metrics.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collected_metrics.h new file mode 100644 index 00000000..fe707016 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collected_metrics.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Standard format in which the metrics are collected, before being exported. +// These are to be used only by the CollectionRegistry and exporters which +// collect metrics using the CollectionRegistry. 
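cell_reader.h re-exports the TSL CellReader test helper. A sketched test-side interaction, assuming a hypothetical counter metric defined through the counter.h shim that appears later in this diff:

```cpp
#include <cstdint>

#include "tensorflow/core/lib/monitoring/cell_reader.h"
#include "tensorflow/core/lib/monitoring/counter.h"

namespace {

// Hypothetical metric used only for this sketch.
auto* example_counter = tensorflow::monitoring::Counter<1>::New(
    "/tensorflow/example/counter", "Example counter for the sketch.", "op");

// Observes the per-label delta of the counter from a test.
void ExerciseCellReader() {
  tensorflow::monitoring::testing::CellReader<int64_t> reader(
      "/tensorflow/example/counter");
  example_counter->GetCell("MatMul")->IncrementBy(2);
  const int64_t delta = reader.Delta("MatMul");  // expected: 2
  (void)delta;
}

}  // namespace
```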
+ +#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_ + +#include +#include +#include +#include + +#include "xla/tsl/lib/monitoring/collected_metrics.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/types.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::CollectedMetrics; +using tsl::monitoring::MetricDescriptor; +using tsl::monitoring::Point; +using tsl::monitoring::PointSet; +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collection_registry.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collection_registry.h new file mode 100644 index 00000000..fa379115 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/collection_registry.h @@ -0,0 +1,79 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_ + +#include "xla/tsl/lib/monitoring/collection_registry.h" +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "tensorflow/core/platform/platform.h" +// clang-format on +// We use a null implementation for mobile platforms. 
+#ifdef IS_MOBILE_PLATFORM + +#include +#include +#include + +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/macros.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::CollectionRegistry; +using tsl::monitoring::MetricCollector; +using tsl::monitoring::MetricCollectorGetter; +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#else // !defined(IS_MOBILE_PLATFORM) + +#include +#include +#include +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/monitoring/collected_metrics.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace monitoring { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::monitoring::CollectionRegistry; +using tsl::monitoring::Exporter; +using tsl::monitoring::MetricCollector; +using tsl::monitoring::MetricCollectorGetter; +using tsl::monitoring::exporter_registration::ExporterRegistration; +using tsl::monitoring::internal::Collector; +namespace test_util { +class CollectionRegistryTestAccess; +} // namespace test_util +// NOLINTEND(misc-unused-using-decls) +} // namespace monitoring +} // namespace tensorflow + +#endif // IS_MOBILE_PLATFORM + +#endif // TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/counter.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/counter.h new file mode 100644 index 00000000..35f68891 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/counter.h @@ -0,0 +1,43 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_ + +#include "xla/tsl/lib/monitoring/counter.h" +#ifdef IS_MOBILE_PLATFORM +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#else +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#endif +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { + +using tsl::monitoring::Counter; +using tsl::monitoring::CounterCell; + +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/gauge.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/gauge.h new file mode 100644 index 00000000..301f3683 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/gauge.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_ + +#include "xla/tsl/lib/monitoring/gauge.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::Gauge; +using tsl::monitoring::GaugeCell; + +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/metric_def.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/metric_def.h new file mode 100644 index 00000000..bd256d50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/metric_def.h @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
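With the counter.h and gauge.h shims above, defining and updating metrics looks the same as with the underlying tsl::monitoring classes. A sketch with illustrative metric names:

```cpp
#include <string>

#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"

namespace {

// Illustrative metrics: one label on the counter, none on the gauge.
auto* request_count = tensorflow::monitoring::Counter<1>::New(
    "/example/requests", "Number of requests seen.", "method");
auto* backend_name = tensorflow::monitoring::Gauge<std::string, 0>::New(
    "/example/backend", "Currently selected backend.");

void RecordRequest() {
  request_count->GetCell("GET")->IncrementBy(1);
  backend_name->GetCell()->Set("gpu");
}

}  // namespace
```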
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ + +#include +#include +#include +#include + +#include "xla/tsl/lib/monitoring/metric_def.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::MetricDef; +using tsl::monitoring::MetricKind; +using tsl::monitoring::ValueType; +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/percentile_sampler.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/percentile_sampler.h new file mode 100644 index 00000000..8ac77500 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/percentile_sampler.h @@ -0,0 +1,67 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_PERCENTILE_SAMPLER_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_PERCENTILE_SAMPLER_H_ + +// clang-format off +// Required for IS_MOBILE_PLATFORM +#include "tensorflow/core/platform/platform.h" +#include "xla/tsl/lib/monitoring/percentile_sampler.h" +// clang-format on + +// We replace this implementation with a null implementation for mobile +// platforms. 
+#ifdef IS_MOBILE_PLATFORM + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/macros.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { + +using tsl::monitoring::PercentileSampler; +using tsl::monitoring::PercentileSamplerCell; + +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#else // IS_MOBILE_PLATFORM + +#include +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace monitoring { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::monitoring::PercentileSampler; +using tsl::monitoring::PercentileSamplerCell; +// NOLINTEND(misc-unused-using-decls) +} // namespace monitoring +} // namespace tensorflow + +#endif // IS_MOBILE_PLATFORM +#endif // TENSORFLOW_CORE_LIB_MONITORING_PERCENTILE_SAMPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/sampler.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/sampler.h new file mode 100644 index 00000000..e794890a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/sampler.h @@ -0,0 +1,54 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_ + +#include "xla/tsl/lib/monitoring/sampler.h" +#ifdef IS_MOBILE_PLATFORM + +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#else // IS_MOBILE_PLATFORM + +#include + +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#endif +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { + +using tsl::monitoring::Buckets; +using tsl::monitoring::Sampler; +using tsl::monitoring::SamplerCell; +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/test_utils.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/test_utils.h new file mode 100644 index 00000000..a479c878 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/test_utils.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_MONITORING_TEST_UTILS_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_TEST_UTILS_H_ + +#include + +#include "xla/tsl/lib/monitoring/test_utils.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/statusor.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +namespace testing { +using tsl::monitoring::testing::Histogram; +using tsl::monitoring::testing::Percentiles; + +} // namespace testing +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/timed.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/timed.h new file mode 100644 index 00000000..c8ec0b8c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/timed.h @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
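sampler.h forwards Sampler, SamplerCell and the Buckets helper; a sketched histogram-style metric (names and bucket layout are illustrative):

```cpp
#include "tensorflow/core/lib/monitoring/sampler.h"

namespace {

// Illustrative histogram metric: 16 exponential buckets starting at 1, doubling.
auto* latency_sampler = tensorflow::monitoring::Sampler<1>::New(
    {"/example/latency", "Request latency in milliseconds.", "method"},
    tensorflow::monitoring::Buckets::Exponential(1, 2, 16));

void RecordLatency(double millis) {
  latency_sampler->GetCell("GET")->Add(millis);
}

}  // namespace
```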
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_ + +#include "xla/tsl/lib/monitoring/timed.h" +#include "tensorflow/core/platform/env_time.h" +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::MakeTimed; +using tsl::monitoring::Timed; +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/types.h b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/types.h new file mode 100644 index 00000000..d84a7402 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/monitoring/types.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_TYPES_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_TYPES_H_ + +#include +#include + +#include "xla/tsl/lib/monitoring/types.h" +#include "tensorflow/core/platform/types.h" + +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace monitoring { +using tsl::monitoring::PercentilePoint; +using tsl::monitoring::Percentiles; +using tsl::monitoring::UnitOfMeasure; + +} // namespace monitoring +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) +#endif // TENSORFLOW_CORE_LIB_MONITORING_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/png/png_io.h b/third_party/tflite-hdrs/tensorflow/core/lib/png/png_io.h new file mode 100644 index 00000000..a7fff84c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/png/png_io.h @@ -0,0 +1,116 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Functions to read and write images in PNG format. 
+// +// The advantage over image/codec/png{enc,dec}ocder.h is that this library +// supports both 8 and 16 bit images. +// +// The decoding routine accepts binary image data as a StringPiece. These are +// implicitly constructed from strings or char* so they're completely +// transparent to the caller. They're also very cheap to construct so this +// doesn't introduce any additional overhead. +// +// The primary benefit of StringPieces being, in this case, that APIs already +// returning StringPieces (e.g., Bigtable Scanner) or Cords (e.g., IOBuffer; +// only when they're flat, though) or protocol buffer fields typed to either of +// these can be decoded without copying the data into a C++ string. + +#ifndef TENSORFLOW_CORE_LIB_PNG_PNG_IO_H_ +#define TENSORFLOW_CORE_LIB_PNG_PNG_IO_H_ + +#include <string> +#include <utility> +#include <vector> + +#include "absl/base/casts.h" +#include "tensorflow/core/platform/png.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace png { + +// Handy container for decoding information and struct pointers +struct DecodeContext { + const uint8* data; + int data_left; + png_structp png_ptr; + png_infop info_ptr; + png_uint_32 width, height; + int num_passes; + int color_type; + int bit_depth; + int channels; + bool need_to_synthesize_16; + bool error_condition; + DecodeContext() : png_ptr(nullptr), info_ptr(nullptr) {} +}; + +bool DecodeHeader(absl::string_view png_string, int* width, int* height, + int* components, int* channel_bit_depth, + std::vector<std::pair<std::string, std::string> >* metadata); + +// Sample usage for reading PNG: +// +// string png_string; /* fill with input PNG format data */ +// DecodeContext context; +// CHECK(CommonInitDecode(png_string, 3 /*RGB*/, 8 /*uint8*/, &context)); +// char* image_buffer = new char[3*context.width*context.height]; +// CHECK(CommonFinishDecode(absl::bit_cast<png_bytep>(image_buffer), +// 3*context.width /*stride*/, &context)); +// +// desired_channels may be 0 to detected it from the input. + +bool CommonInitDecode(absl::string_view png_string, int desired_channels, + int desired_channel_bits, DecodeContext* context); + +bool CommonFinishDecode(png_bytep data, int row_bytes, DecodeContext* context); + +// Normally called automatically from CommonFinishDecode. If CommonInitDecode +// is called but not CommonFinishDecode, call this to clean up. Safe to call +// extra times. +void CommonFreeDecode(DecodeContext* context); + +// Sample usage for writing PNG: +// +// uint16* image_buffer = new uint16[width*height]; /* fill with pixels */ +// string png_string; +// CHECK(WriteImageToBuffer(image_buffer, width, height, 2*width /*stride*/, +// 1 /*gray*/, 16 /*uint16*/, &png_string, NULL)); +// +// compression is in [-1,9], where 0 is fast and weak compression, 9 is slow +// and strong, and -1 is the zlib default. + +template <typename T> +bool WriteImageToBuffer( + const void* image, int width, int height, int row_bytes, int num_channels, + int channel_bits, int compression, T* png_string, + const std::vector<std::pair<std::string, std::string> >* metadata); + +// Explicit instantiations defined in png_io.cc.
+extern template bool WriteImageToBuffer<std::string>( + const void* image, int width, int height, int row_bytes, int num_channels, + int channel_bits, int compression, std::string* png_string, + const std::vector<std::pair<std::string, std::string> >* metadata); +extern template bool WriteImageToBuffer<tstring>( + const void* image, int width, int height, int row_bytes, int num_channels, + int channel_bits, int compression, tstring* png_string, + const std::vector<std::pair<std::string, std::string> >* metadata); + +} // namespace png +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_PNG_PNG_IO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/distribution_sampler.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/distribution_sampler.h new file mode 100644 index 00000000..6218d899 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/distribution_sampler.h @@ -0,0 +1,47 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// DistributionSampler allows generating a discrete random variable with a given +// distribution. +// The values taken by the variable are [0, N) and relative weights for each +// value are specified using a vector of size N. +// +// The Algorithm takes O(N) time to precompute data at construction time and +// takes O(1) time (2 random number generation, 2 lookups) for each sample. +// The data structure takes O(N) memory. +// +// In contrast, util/random/weighted-picker.h provides O(lg N) sampling. +// The advantage of that implementation is that weights can be adjusted +// dynamically, while DistributionSampler doesn't allow weight adjustment. +// +// The algorithm used is Walker's Aliasing algorithm, described in Knuth, Vol 2. + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_ + +#include "xla/tsl/lib/random/distribution_sampler.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace random { +using tsl::random::DistributionSampler; // NOLINT(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/exact_uniform_int.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/exact_uniform_int.h new file mode 100644 index 00000000..cd511d43 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/exact_uniform_int.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
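distribution_sampler.h forwards the Walker-alias DistributionSampler described in the header comment above. A sketch of O(1) weighted sampling, assuming the companion simple_philox.h shim referenced by its include list is also vendored; weights and seeds are arbitrary:

```cpp
#include <vector>

#include "tensorflow/core/lib/random/distribution_sampler.h"
#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/lib/random/simple_philox.h"

// Draws an index in [0, 3) with probability proportional to the weights.
int SampleIndex() {
  const std::vector<float> weights = {0.1f, 0.3f, 0.6f};
  tensorflow::random::DistributionSampler sampler(weights);

  tensorflow::random::PhiloxRandom philox(/*seed_lo=*/301, /*seed_hi=*/17);
  tensorflow::random::SimplePhilox rnd(&philox);
  return sampler.Sample(&rnd);  // O(1) per draw (Walker's alias method)
}
```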
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Exact uniform integers using rejection sampling + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_EXACT_UNIFORM_INT_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_EXACT_UNIFORM_INT_H_ + +#include "xla/tsl/lib/random/exact_uniform_int.h" + +namespace tensorflow { +namespace random { +using tsl::random::ExactUniformInt; // NOLINT(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_EXACT_UNIFORM_INT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/philox_random.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/philox_random.h new file mode 100644 index 00000000..2fe4120f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/philox_random.h @@ -0,0 +1,35 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implement the Philox algorithm to generate random numbers in parallel. +// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ + +#include "xla/tsl/lib/random/philox_random.h" + +namespace tensorflow { +namespace random { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::random::Array; +using tsl::random::PhiloxRandom; +// NOLINTEND(misc-unused-using-decls) + +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/random.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/random.h new file mode 100644 index 00000000..78dedde0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/random.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_H_ + +#include "tensorflow/core/platform/random.h" + +#endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions.h new file mode 100644 index 00000000..57ce99a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions.h @@ -0,0 +1,41 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "xla/tsl/lib/random/random_distributions.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions_utils.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace random { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::random::BoxMullerDouble; +using tsl::random::NormalDistribution; +using tsl::random::SignedAdd; +using tsl::random::SingleSampleAdapter; +using tsl::random::TruncatedNormalDistribution; +using tsl::random::Uint16ToGfloat16; +using tsl::random::Uint16ToHalf; +using tsl::random::UniformDistribution; +using tsl::random::UniformFullIntDistribution; +// NOLINTEND(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions_utils.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions_utils.h new file mode 100644 index 00000000..4c268049 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/random_distributions_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_UTILS_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_UTILS_H_ + +#include + +#include + +#include "xla/tsl/lib/random/random_distributions_utils.h" +#include "tensorflow/core/lib/random/philox_random.h" + +namespace tensorflow { +namespace random { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::random::BoxMullerFloat; +using tsl::random::Uint32ToFloat; +using tsl::random::Uint64ToDouble; +// NOLINTEND(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/simple_philox.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/simple_philox.h new file mode 100644 index 00000000..7c94ca21 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/simple_philox.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_ + +#include "xla/tsl/lib/random/simple_philox.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" + +namespace tensorflow { +namespace random { +using tsl::random::SimplePhilox; // NOLINT(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/random/weighted_picker.h b/third_party/tflite-hdrs/tensorflow/core/lib/random/weighted_picker.h new file mode 100644 index 00000000..ae404814 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/random/weighted_picker.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An abstraction to pick from one of N elements with a specified +// weight per element. +// +// The weight for a given element can be changed in O(lg N) time +// An element can be picked in O(lg N) time. +// +// Uses O(N) bytes of memory. +// +// Alternative: distribution-sampler.h allows O(1) time picking, but no weight +// adjustment after construction. 
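To make the tradeoff with DistributionSampler concrete, here is a minimal usage sketch for the WeightedPicker re-exported by this header. It is illustrative only, not part of the patch, and assumes the upstream tsl::random::WeightedPicker and SimplePhilox interfaces (a constructor taking N, set_weight(), and Pick()):

```cpp
// Illustrative sketch, not part of the patch: O(lg N) weighted picking with
// weights that can be adjusted after construction, in contrast to the O(1)
// but fixed-weight DistributionSampler.
#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/random/weighted_picker.h"

int PickWeighted() {
  tensorflow::random::PhiloxRandom philox(/*seed=*/301);
  tensorflow::random::SimplePhilox rnd(&philox);

  tensorflow::random::WeightedPicker picker(/*N=*/4);  // assumed: weights start at 1
  picker.set_weight(2, 10);   // O(lg N) update: element 2 now has weight 10
  return picker.Pick(&rnd);   // O(lg N) sample: returns an index in [0, 4)
}
```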
+ +#ifndef TENSORFLOW_CORE_LIB_RANDOM_WEIGHTED_PICKER_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_WEIGHTED_PICKER_H_ + +#include + +#include "xla/tsl/lib/random/weighted_picker.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace random { +using tsl::random::WeightedPicker; // NOLINT(misc-unused-using-decls) +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_RANDOM_WEIGHTED_PICKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/base64.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/base64.h new file mode 100644 index 00000000..bb7cbfb3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/base64.h @@ -0,0 +1,21 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_BASE64_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_BASE64_H_ + +#include "tensorflow/core/platform/base64.h" + +#endif // TENSORFLOW_CORE_LIB_STRINGS_BASE64_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/numbers.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/numbers.h new file mode 100644 index 00000000..cbc53d47 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/numbers.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_ + +#include "tensorflow/core/platform/numbers.h" + +#endif // TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/ordered_code.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/ordered_code.h new file mode 100644 index 00000000..e7485bd5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/ordered_code.h @@ -0,0 +1,95 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This module provides routines for encoding a sequence of typed +// entities into a string. The resulting strings can be +// lexicographically compared to yield the same comparison value that +// would have been generated if the encoded items had been compared +// one by one according to their type. +// +// More precisely, suppose: +// 1. string A is generated by encoding the sequence of items [A_1..A_n] +// 2. string B is generated by encoding the sequence of items [B_1..B_n] +// 3. The types match; i.e., for all i: A_i was encoded using +// the same routine as B_i +// Then: +// Comparing A vs. B lexicographically is the same as comparing +// the vectors [A_1..A_n] and [B_1..B_n] lexicographically. +// +// Furthermore, if n < m, the encoding of [A_1..A_n] is a strict prefix of +// [A_1..A_m] (unless m = n+1 and A_m is the empty string encoded with +// WriteTrailingString, in which case the encodings are equal). +// +// This module is often useful when generating multi-part sstable +// keys that have to be ordered in a particular fashion. + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_ORDERED_CODE_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_ORDERED_CODE_H_ + +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace strings { + +class OrderedCode { + public: + // ------------------------------------------------------------------- + // Encoding routines: each one of the following routines append + // one item to "*dest" in an encoding where larger values are + // ordered lexicographically after smaller values. + static void WriteString(string* dest, absl::string_view str); + static void WriteNumIncreasing(string* dest, uint64 num); + static void WriteSignedNumIncreasing(string* dest, int64_t num); + + // ------------------------------------------------------------------- + // Decoding routines: these extract an item earlier encoded using + // the corresponding WriteXXX() routines above. The item is read + // from "*src"; "*src" is modified to point past the decoded item; + // and if "result" is non-NULL, "*result" is modified to contain the + // result. In case of string result, the decoded string is appended to + // "*result". Returns true if the next item was read successfully, false + // otherwise. + static bool ReadString(absl::string_view* src, string* result); + static bool ReadNumIncreasing(absl::string_view* src, uint64* result); + static bool ReadSignedNumIncreasing(absl::string_view* src, int64_t* result); + + // Helper for testing: corrupt "*str" by changing the kth item separator + // in the string. + static void TEST_Corrupt(string* str, int k); + + // Helper for testing. + // SkipToNextSpecialByte is an internal routine defined in the .cc file + // with the following semantics. Return a pointer to the first byte + // in the range "[start..limit)" whose value is 0 or 255. If no such + // byte exists in the range, returns "limit". 
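As a quick orientation to the Write/Read pairs declared above, the following round-trip sketch (illustrative only, not part of the patch) checks that encoded keys compare lexicographically in the same order as the original values:

```cpp
// Illustrative sketch, not part of the patch: encode two numbers with
// OrderedCode and verify ordering and round-trip decoding.
#include <string>

#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/strings/ordered_code.h"

bool OrderedCodeRoundTrip() {
  using tensorflow::strings::OrderedCode;

  std::string a, b;
  OrderedCode::WriteNumIncreasing(&a, 42);
  OrderedCode::WriteNumIncreasing(&b, 1000);
  const bool order_preserved = (a < b);  // same order as 42 < 1000

  // Decode `a` back; ReadNumIncreasing advances the view past the item.
  absl::string_view src(a);
  tensorflow::uint64 value = 0;
  const bool ok = OrderedCode::ReadNumIncreasing(&src, &value);
  return order_preserved && ok && value == 42 && src.empty();
}
```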
+ static const char* TEST_SkipToNextSpecialByte(const char* start, + const char* limit); + + private: + // This has only static methods, so disallow construction entirely + OrderedCode(); + OrderedCode(const OrderedCode&) = delete; + void operator=(const OrderedCode&) = delete; +}; + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_STRINGS_ORDERED_CODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_serialization.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_serialization.h new file mode 100644 index 00000000..e0c253f5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_serialization.h @@ -0,0 +1,29 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_ + +#include "xla/tsl/lib/strings/proto_serialization.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::AreSerializedProtosEqual; +using ::tsl::DeterministicProtoHash64; +using ::tsl::SerializeToBufferDeterministic; +using ::tsl::SerializeToStringDeterministic; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_text_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_text_util.h new file mode 100644 index 00000000..ef73108b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/proto_text_util.h @@ -0,0 +1,169 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_ + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/lib/strings/scanner.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/numbers.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/str_util.h" +#include "tensorflow/core/platform/strcat.h" + +namespace tensorflow { +namespace strings { + +static constexpr char kColonSeparator[] = ": "; + +// Helper functions for writing proto-text output. 
+// Used by the code generated from tools/proto_text/gen_proto_text_lib.cc. +class ProtoTextOutput { + public: + // Construct a ProtoTextOutput that writes to If short_debug is true, + // outputs text to match proto.ShortDebugString(); else matches + // proto.DebugString(). + ProtoTextOutput(string* output, bool short_debug) + : output_(output), + short_debug_(short_debug), + field_separator_(short_debug ? " " : "\n") {} + + // Writes opening of nested message and increases indent level. + void OpenNestedMessage(const char field_name[]) { + StrAppend(output_, level_empty_ ? "" : field_separator_, indent_, + field_name, " {", field_separator_); + if (!short_debug_) StrAppend(&indent_, " "); + level_empty_ = true; + } + + // Writes close of nested message and decreases indent level. + void CloseNestedMessage() { + if (!short_debug_) indent_.resize(indent_.size() - 2); + StrAppend(output_, level_empty_ ? "" : field_separator_, indent_, "}"); + level_empty_ = false; + } + + // Print the close of the top-level message that was printed. + void CloseTopMessage() { + if (!short_debug_ && !level_empty_) StrAppend(output_, "\n"); + } + + // Appends a numeric value, like my_field: 123 + template + void AppendNumeric(const char field_name[], T value) { + AppendFieldAndValue(field_name, StrCat(value)); + } + + // Appends a numeric value, like my_field: 123, but only if value != 0. + template + void AppendNumericIfNotZero(const char field_name[], T value) { + if (value != 0) AppendNumeric(field_name, value); + } + + // Appends a bool value, either my_field: true or my_field: false. + void AppendBool(const char field_name[], bool value) { + AppendFieldAndValue(field_name, value ? "true" : "false"); + } + + // Appends a bool value, as my_field: true, only if value is true. + void AppendBoolIfTrue(const char field_name[], bool value) { + if (value) AppendBool(field_name, value); + } + + // Appends a string value, like my_field: "abc123". + void AppendString(const char field_name[], const string& value) { + AppendFieldAndValue(field_name, StrCat("\"", absl::CEscape(value), "\"")); + } + + // Appends a string value, like my_field: "abc123", but only if value is not + // empty. + void AppendStringIfNotEmpty(const char field_name[], const string& value) { + if (!value.empty()) AppendString(field_name, value); + } + + // Appends the string name of an enum, like my_field: FIRST_ENUM. + void AppendEnumName(const char field_name[], const string& name) { + AppendFieldAndValue(field_name, name); + } + + private: + void AppendFieldAndValue(const char field_name[], + absl::string_view value_text) { + absl::StrAppend(output_, level_empty_ ? "" : field_separator_, indent_, + field_name, kColonSeparator, value_text); + level_empty_ = false; + } + + string* const output_; + const bool short_debug_; + const string field_separator_; + string indent_; + + // False when at least one field has been output for the message at the + // current deepest level of nesting. + bool level_empty_ = true; + + ProtoTextOutput(const ProtoTextOutput&) = delete; + void operator=(const ProtoTextOutput&) = delete; +}; + +inline void ProtoSpaceAndComments(Scanner* scanner) { + for (;;) { + scanner->AnySpace(); + if (scanner->Peek() != '#') return; + // Skip until newline. + while (scanner->Peek('\n') != '\n') scanner->One(Scanner::ALL); + } +} + +// Parse the next numeric value from , returning false if parsing +// failed. 
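For reference, here is a small usage sketch of the ProtoTextOutput helper defined above (illustrative only, not part of the patch); it emits DebugString-style text for a hypothetical two-field message:

```cpp
// Illustrative sketch, not part of the patch: emit DebugString-style text
// using the ProtoTextOutput class defined in this header.
#include <string>

#include "tensorflow/core/lib/strings/proto_text_util.h"

std::string ExampleProtoText() {
  std::string out;
  tensorflow::strings::ProtoTextOutput o(&out, /*short_debug=*/false);
  o.AppendNumeric("id", 123);
  o.OpenNestedMessage("payload");
  o.AppendString("name", "hello");
  o.AppendBoolIfTrue("ready", true);
  o.CloseNestedMessage();
  o.CloseTopMessage();
  // out == "id: 123\npayload {\n  name: \"hello\"\n  ready: true\n}\n"
  return out;
}
```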
+template +bool ProtoParseNumericFromScanner(Scanner* scanner, T* value) { + absl::string_view numeric_str; + scanner->RestartCapture(); + if (!scanner->Many(Scanner::LETTER_DIGIT_DOT_PLUS_MINUS) + .GetResult(nullptr, &numeric_str)) { + return false; + } + + // Special case to disallow multiple leading zeroes, to match proto parsing. + int leading_zero = 0; + for (size_t i = 0; i < numeric_str.size(); ++i) { + const char ch = numeric_str[i]; + if (ch == '0') { + if (++leading_zero > 1) return false; + } else if (ch != '-') { + break; + } + } + + ProtoSpaceAndComments(scanner); + return SafeStringToNumeric(numeric_str, value); +} + +// Parse the next boolean value from , returning false if parsing +// failed. +bool ProtoParseBoolFromScanner(Scanner* scanner, bool* value); + +// Parse the next string literal from , returning false if parsing +// failed. +bool ProtoParseStringLiteralFromScanner(Scanner* scanner, string* value); + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/scanner.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/scanner.h new file mode 100644 index 00000000..c41e4475 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/scanner.h @@ -0,0 +1,21 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_SCANNER_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_SCANNER_H_ + +#include "tensorflow/core/platform/scanner.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_STRINGS_SCANNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/str_util.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/str_util.h new file mode 100644 index 00000000..a20cbdb5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/str_util.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_ + +#include "tensorflow/core/platform/str_util.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/strcat.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/strcat.h new file mode 100644 index 00000000..d728231f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/strcat.h @@ -0,0 +1,25 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// #status: RECOMMENDED +// #category: operations on strings +// #summary: Merges strings or numbers with no delimiter. +// +#ifndef TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_ + +#include "tensorflow/core/platform/strcat.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/strings/stringprintf.h b/third_party/tflite-hdrs/tensorflow/core/lib/strings/stringprintf.h new file mode 100644 index 00000000..836632d7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/strings/stringprintf.h @@ -0,0 +1,28 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Printf variants that place their output in a C++ string. +// +// Usage: +// string result = strings::Printf("%d %s\n", 10, "hello"); +// strings::SPrintf(&result, "%d %s\n", 10, "hello"); +// strings::Appendf(&result, "%d %s\n", 20, "there"); + +#ifndef TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_ +#define TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_ + +#include "tensorflow/core/platform/stringprintf.h" + +#endif // TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/lib/wav/wav_io.h b/third_party/tflite-hdrs/tensorflow/core/lib/wav/wav_io.h new file mode 100644 index 00000000..99a3df50 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/lib/wav/wav_io.h @@ -0,0 +1,105 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Functions to write audio in WAV format. + +#ifndef TENSORFLOW_CORE_LIB_WAV_WAV_IO_H_ +#define TENSORFLOW_CORE_LIB_WAV_WAV_IO_H_ + +#include +#include + +#include "tensorflow/core/lib/core/coding.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace wav { + +// Encode the provided interleaved buffer of audio as a signed 16-bit PCM +// little-endian WAV file. +// +// Example usage for 4 frames of an 8kHz stereo signal: +// First channel is -1, 1, -1, 1. +// Second channel is 0, 0, 0, 0. +// +// float audio_buffer[] = { -1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f}; +// string wav_string; +// if (EncodeAudioAsS16LEWav(audio_buffer, 8000, 2, 4, &wav_string).ok()) { +// // Use wav_string. +// } +template +absl::Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate, + size_t num_channels, size_t num_frames, + T* wav_string); + +// Explicit instantiations defined in wav_io.cc. +extern template Status EncodeAudioAsS16LEWav( + const float* audio, size_t sample_rate, size_t num_channels, + size_t num_frames, std::string* wav_string); +extern template Status EncodeAudioAsS16LEWav(const float* audio, + size_t sample_rate, + size_t num_channels, + size_t num_frames, + tstring* wav_string); + +// Decodes the little-endian signed 16-bit PCM WAV file data (aka LIN16 +// encoding) into a float Tensor. The channels are encoded as the lowest +// dimension of the tensor, with the number of frames as the second. This means +// that a four frame stereo signal will have the shape [4, 2]. The sample rate +// is read from the file header, and an error is returned if the format is not +// supported. +// The results are output as floats within the range -1 to 1, +absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, + std::vector* float_values, + uint32* sample_count, + uint16* channel_count, + uint32* sample_rate); + +// Everything below here is only exposed publicly for testing purposes. + +// Handles moving the data index forward, validating the arguments, and avoiding +// overflow or underflow. +absl::Status IncrementOffset(int old_offset, int64_t increment, size_t max_size, + int* new_offset); + +// This function is only exposed in the header for testing purposes, as a +// template that needs to be instantiated. Reads a typed numeric value from a +// stream of data. 
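For reference, a minimal sketch of calling the EncodeAudioAsS16LEWav declaration from earlier in this header (illustrative only, not part of the patch; the tone parameters are arbitrary):

```cpp
// Illustrative sketch, not part of the patch: encode one second of a 440 Hz
// mono tone as a 16-bit PCM WAV string.
#include <cmath>
#include <string>
#include <vector>

#include "tensorflow/core/lib/wav/wav_io.h"

bool MakeTestWav(std::string* wav) {
  constexpr size_t kSampleRate = 16000;
  const double kPi = 3.14159265358979323846;
  std::vector<float> samples(kSampleRate);
  for (size_t i = 0; i < samples.size(); ++i) {
    samples[i] = static_cast<float>(0.5 * std::sin(2.0 * kPi * 440.0 * i / kSampleRate));
  }
  return tensorflow::wav::EncodeAudioAsS16LEWav(
             samples.data(), kSampleRate, /*num_channels=*/1,
             /*num_frames=*/samples.size(), wav)
      .ok();
}
```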
+template +absl::Status ReadValue(const std::string& data, T* value, int* offset) { + int new_offset; + TF_RETURN_IF_ERROR( + IncrementOffset(*offset, sizeof(T), data.size(), &new_offset)); + if (port::kLittleEndian) { + memcpy(value, data.data() + *offset, sizeof(T)); + } else { + *value = 0; + const uint8* data_buf = + reinterpret_cast(data.data() + *offset); + int shift = 0; + for (int i = 0; i < sizeof(T); ++i, shift += 8) { + *value = *value | (data_buf[i] << shift); + } + } + *offset = new_offset; + return absl::OkStatus(); +} + +} // namespace wav +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_WAV_WAV_IO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/nccl/collective_communicator.h b/third_party/tflite-hdrs/tensorflow/core/nccl/collective_communicator.h new file mode 100644 index 00000000..484f1100 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/nccl/collective_communicator.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_NCCL_COLLECTIVE_COMMUNICATOR_H_ +#define TENSORFLOW_CORE_NCCL_COLLECTIVE_COMMUNICATOR_H_ + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// Creates a NcclCommunicator if built with NCCL support (unless configured to +// use no GPU devices), otherwise it returns nullptr. +std::unique_ptr MaybeCreateNcclCommunicator( + const ConfigProto& config); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_NCCL_COLLECTIVE_COMMUNICATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/nccl/nccl_manager.h b/third_party/tflite-hdrs/tensorflow/core/nccl/nccl_manager.h new file mode 100644 index 00000000..0e620139 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/nccl/nccl_manager.h @@ -0,0 +1,283 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_ +#define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include + +// TODO(rmlarsen): Get rid of this workaround. "gpu_assert" is defined when +// setting EIGEN_USE_THREADS. But when defining EIGEN_USE_THREADS here, +// incAtomic and other CUDA specific symbols are no longer recognized. 
+#ifndef gpu_assert +#define gpu_assert(x) +#endif + +#include "absl/container/flat_hash_map.h" +#if GOOGLE_CUDA +#include "third_party/nccl/nccl.h" +#elif TENSORFLOW_USE_ROCM +#include "rocm/rocm_config.h" +#if (TF_ROCM_VERSION >= 50200) +#include "rocm/include/rccl/rccl.h" +#else +#include "rocm/include/rccl.h" +#endif +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#endif +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { + +// NCCL manager is used to make the asynchronous communicator calls and to +// manage the per-device streams used for communication. +// +// See nccl_ops.cc for example usage, including description of memory +// management and stream synchronization. +class NcclManager { + public: + typedef std::function DoneCallback; + NcclManager(); + ~NcclManager(); + + static NcclManager* instance(); + +#if TENSORFLOW_USE_ROCM + static int instance_count; +#endif + + // Calls `ncclGetUniqueId` and returns the id as a string. The returned value + // may be shared with other participants on different nodes and passed in to + // multi-node collective invocations. + string GenerateCommunicatorKey(); + + // A participant in a Collective. + struct Participant { + Participant(se::StreamExecutor* executor, se::Stream* tensor_stream, + const DeviceBase::AcceleratorDeviceInfo* info, + const Tensor* input, Tensor* output, int global_rank, + DoneCallback done_callback) + : executor(executor), + tensor_stream(tensor_stream), + event_mgr(info->event_mgr), + gpu_device_id(info->gpu_id), +#if TENSORFLOW_USE_ROCM + context(static_cast(info->default_context)), +#endif + input(input), + output(output), + global_rank(global_rank), + done_callback(std::move(done_callback)), + root(false) { + DCHECK(executor != nullptr); + DCHECK(event_mgr != nullptr); + DCHECK(tensor_stream != nullptr); + } + + // StreamExecutor for the device. Expected to be live for process lifetime. + se::StreamExecutor* const executor = nullptr; + + // `tensor_stream` is the stream that should be waited on to ensure + // `input`'s data is available on the GPU for the communication stream to + // access. It is also the stream that will use the produced data; + // `done_callback` is not called until the next kernel launched on `stream` + // would see the data. Owned by the caller, who must keep it live until + // `done_callback` is called. + se::Stream* const tensor_stream; + + // EventMgr which polls on executor. + // Owned by the caller, who must keep it live until `done_callback` is + // called. + EventMgr* const event_mgr; + + const int gpu_device_id; + +#if TENSORFLOW_USE_ROCM + GPUDeviceContext* const context; +#endif + + // Owned by the caller, who must keep it live until `done_callback` is + // called. Is NULL for participants that only receive data. + const Tensor* input; + + // Owned by the caller, who must keep it live until `done_callback` is + // called. Is NULL for participants that only send data. + Tensor* output; + + // Rank across all devices and all nodes. + // `global_rank` is not required for single-node collectives. + const int global_rank; + + // The callback which is called at the completion of the NCCL operation. + // When called, `output` has been set to the result of the operation. 
(note: + // the stream may not yet have been synced) + DoneCallback done_callback; + + // True if this is the root of the collective, e.g. source of broadcast. + bool root; + }; + + // Data that provides context for the collective operation, including the + // operation key, number of participants, and communicator key. + struct Context { + Context(const string& collective_key, int num_local_devices, + int num_global_devices, const string& communicator_key, + int source_rank) + : collective_key(collective_key), + num_local_devices(num_local_devices), + num_global_devices(num_global_devices), + communicator_key(communicator_key), + source_rank(source_rank) {} + + // Unique key for this collective instance + const string& collective_key; + + // Devices local to this node + int num_local_devices; + + // Devices across all nodes + int num_global_devices; + + // In order to use NCCL across nodes, the callee first has to generate a + // `communicator_key` via `GenerateCommunicatorKey()` function and share + // this with all the other nodes. Each node should pass in this + // `communicator_key` to the `NcclManager` functions. + // `communicator_key` is not required for single-node collectives and can be + // empty. + const string& communicator_key; + + // Rank of broadcast source. + int source_rank; + }; + + // Adds one participant to an all-reduce. + void AddToAllReduce(std::unique_ptr participant, + const Context& context, ncclRedOp_t reduction_op); + + // Adds one participant to an all-gather. + void AddToAllGather(std::unique_ptr participant, + const Context& context); + + // Adds one participant to a reduce-scatter. + void AddToReduceScatter(std::unique_ptr participant, + const Context& context, ncclRedOp_t reduction_op); + + // AddBroadcastSend and AddBroadcastRecv combine to send data from one sender + // to all receivers. + void AddBroadcastSend(std::unique_ptr participant, + const Context& context); + void AddBroadcastRecv(std::unique_ptr participant, + const Context& context); + + // AddReduceSend and AddReduceRecv combine to send data from all senders + // to one receiver. + void AddReduceSend(std::unique_ptr participant, + const Context& context, ncclRedOp_t reduction_op); + void AddReduceRecv(std::unique_ptr participant, + const Context& context, ncclRedOp_t reduction_op); + + // Adds one participant to an all-to-all. + void AddToAllToAll(std::unique_ptr participant, + const Context& context); + + // Signals that the `Collective` corresponding to `key` is ready to launch + // across all nodes participating in this multi-node collective operation. + // + // This should only be called for multi-node collectives; single-node + // collectives are implicitly ready when all participants have called Add* + // function. + void SignalMultiNodeReady(const string& collective_key); + + // Aborts all collectives. After abortion, no further collectives can be + // launched with this NcclManager. + void StartAbort(const Status& s); + + // Resets a previously aborted NcclManager, making it available for future + // collectives. + void Reset(); + + private: + enum CollectiveType { + kAllReduce = 1, + kBroadcast = 2, + kReduce = 3, + kAllGather = 4, + kReduceScatter = 5, + kAllToAll = 6, + }; + struct Collective; + struct Communicator; + struct CommunicatorMember; + struct NcclStream; + + // Gets the `Communicator` object that will be used to enqueue NCCL kernels + // for `collective`, and returns it via `communicator`. + // + // This may involve creating CUDA streams and NCCL initialization. 
If a NCCL + // or CUDA error occurs in the process, this returns an INTERNAL error with + // the corresponding NCCL/CUDA error string. + Status GetCommunicator(Collective* collective, Communicator** communicator); + + // Adds a participant device to the local `Collective` instance corresponding + // to `collective_key`. Launches the `Collective` if it is ready, which it + // checks by calling `CheckReady()`. Also performs consistency and sanity + // checks before launching. + void AddParticipant(std::unique_ptr participant, + const Context& context, CollectiveType collective_type, + ncclRedOp_t reduction_op); + + // If `collective` is ready to run, removes it from the `collectives_` map and + // returns true. Otherwise returns false. + // Assumes `collective_key` corresponds to `collective`. + // + // A collective is ready to run when all local participants have called Add* + // function, and the collective is signalled globally ready via + // `SetMultiNodeReady`. + bool CheckReady(const string& collective_key, Collective* collective) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Run . This calls takes ownership of . + void RunCollective(Collective* collective); + void LoopKernelLaunches(NcclStream* stream); + + mutex mu_; + + // Maps key to collectives currently being assembled or run. + absl::flat_hash_map collectives_ TF_GUARDED_BY(mu_); + + // Maps a device to the communication streams that make up its collective. + // This is used to share the stream across different communicators that + // include the same device. + absl::flat_hash_map> + device_to_comm_streams_ TF_GUARDED_BY(mu_); + + std::vector> communicators_ TF_GUARDED_BY(mu_); + + Status status_ TF_GUARDED_BY(mu_); + + NcclManager(const NcclManager&) = delete; + void operator=(const NcclManager&) = delete; +}; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/ops/compat/op_compatibility_lib.h b/third_party/tflite-hdrs/tensorflow/core/ops/compat/op_compatibility_lib.h new file mode 100644 index 00000000..776a6039 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/ops/compat/op_compatibility_lib.h @@ -0,0 +1,86 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_OPS_COMPAT_OP_COMPATIBILITY_LIB_H_ +#define TENSORFLOW_CORE_OPS_COMPAT_OP_COMPATIBILITY_LIB_H_ + +#include + +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class OpCompatibilityLib { + public: + // `ops_prefix` is a filename prefix indicating where to find the + // ops files. + // `history_version` is used to construct the ops history file name. + // `*stable_ops` has an optional list of ops that we care about. + // If stable_ops == nullptr, we use all registered ops. 
+ // Otherwise ValidateCompatible() ignores ops not in *stable_ops + // and require all ops in *stable_ops to exist. + OpCompatibilityLib(const string& ops_prefix, const string& history_version, + const std::set* stable_ops); + + // Name of the file that contains the checked-in versions of *all* + // ops, with docs. + const string& ops_file() const { return ops_file_; } + + // Name of the file that contains all versions of *stable* ops, + // without docs. Op history is in (alphabetical, oldest-first) + // order. + const string& op_history_file() const { return op_history_file_; } + + // Name of the directory that contains all versions of *stable* ops, + // without docs. Op history is one file per op, in oldest-first + // order within the file. + const string& op_history_directory() const { return op_history_directory_; } + + // Should match the contents of ops_file(). Run before calling + // ValidateCompatible(). + string OpsString() const { + string result; + google::protobuf::TextFormat::PrintToString(op_list_, &result); + return result; + } + + // Returns the number of ops in OpsString(), includes all ops, not + // just stable ops. + int num_all_ops() const { return op_list_.op_size(); } + + // pairs representing op history. + typedef std::vector> OpHistory; + + // Make sure the current version of the *stable* ops are compatible + // with the historical versions, and if out_op_history != nullptr, + // generate a new history adding all changed ops. Sets + // *changed_ops/*added_ops to the number of changed/added ops + // (ignoring doc changes). + absl::Status ValidateCompatible(Env* env, int* changed_ops, int* added_ops, + OpHistory* out_op_history); + + private: + const string ops_file_; + const string op_history_file_; + const string op_history_directory_; + const std::set* stable_ops_; + OpList op_list_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_OPS_COMPAT_OP_COMPATIBILITY_LIB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/abi.h b/third_party/tflite-hdrs/tensorflow/core/platform/abi.h new file mode 100644 index 00000000..8191011a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/abi.h @@ -0,0 +1,29 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ABI_H_ +#define TENSORFLOW_CORE_PLATFORM_ABI_H_ + +#include "tsl/platform/abi.h" + +namespace tensorflow { +namespace port { + +using ::tsl::port::MaybeAbiDemangle; // NOLINT(misc-unused-using-decls) + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ABI_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/base64.h b/third_party/tflite-hdrs/tensorflow/core/platform/base64.h new file mode 100644 index 00000000..126455fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/base64.h @@ -0,0 +1,32 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_BASE64_H_ +#define TENSORFLOW_CORE_PLATFORM_BASE64_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tsl/platform/base64.h" + +namespace tensorflow { + +using tsl::Base64Decode; // NOLINT +using tsl::Base64Encode; // NOLINT + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_BASE64_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/bfloat16.h b/third_party/tflite-hdrs/tensorflow/core/platform/bfloat16.h new file mode 100644 index 00000000..d6091aa2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/bfloat16.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_BFLOAT16_H_ +#define TENSORFLOW_CORE_PLATFORM_BFLOAT16_H_ + +// clang-format off +#include "tensorflow/core/platform/byte_order.h" +#include "tsl/platform/bfloat16.h" +// clang-format on + +namespace tensorflow { +typedef tsl::bfloat16 bfloat16; +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_BFLOAT16_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/blocking_counter.h b/third_party/tflite-hdrs/tensorflow/core/platform/blocking_counter.h new file mode 100644 index 00000000..4e629804 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/blocking_counter.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_BLOCKING_COUNTER_H_ +#define TENSORFLOW_CORE_PLATFORM_BLOCKING_COUNTER_H_ + +#include + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tsl/platform/blocking_counter.h" + +namespace tensorflow { +using tsl::BlockingCounter; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_BLOCKING_COUNTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/byte_order.h b/third_party/tflite-hdrs/tensorflow/core/platform/byte_order.h new file mode 100644 index 00000000..f6e1d172 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/byte_order.h @@ -0,0 +1,29 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ +#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ + +#include "tsl/platform/byte_order.h" + +namespace tensorflow { +namespace port { + +constexpr bool kLittleEndian = tsl::port::kLittleEndian; + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/casts.h b/third_party/tflite-hdrs/tensorflow/core/platform/casts.h new file mode 100644 index 00000000..791ac095 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/casts.h @@ -0,0 +1,21 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CASTS_H_ +#define TENSORFLOW_CORE_PLATFORM_CASTS_H_ + +#include "tsl/platform/casts.h" + +#endif // TENSORFLOW_CORE_PLATFORM_CASTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/auth_provider.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/auth_provider.h new file mode 100644 index 00000000..987cc39f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/auth_provider.h @@ -0,0 +1,32 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_ + +#include + +#include "xla/tsl/platform/cloud/auth_provider.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::AuthProvider; +using tsl::EmptyAuthProvider; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_metadata_client.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_metadata_client.h new file mode 100644 index 00000000..4c83d28a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_metadata_client.h @@ -0,0 +1,28 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_METADATA_CLIENT_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_METADATA_CLIENT_H_ + +#include "xla/tsl/platform/cloud/compute_engine_metadata_client.h" +#include "tensorflow/core/platform/cloud/http_request.h" +#include "tensorflow/core/platform/retrying_utils.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +using tsl::ComputeEngineMetadataClient; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_METADATA_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_zone_provider.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_zone_provider.h new file mode 100644 index 00000000..6b416481 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/compute_engine_zone_provider.h @@ -0,0 +1,27 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_ZONE_PROVIDER_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_ZONE_PROVIDER_H_
+
+#include "xla/tsl/platform/cloud/compute_engine_zone_provider.h"
+#include "tensorflow/core/platform/cloud/compute_engine_metadata_client.h"
+#include "tensorflow/core/platform/cloud/zone_provider.h"
+
+namespace tensorflow {
+using tsl::ComputeEngineZoneProvider;  // NOLINT(misc-unused-using-decls)
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_ZONE_PROVIDER_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/curl_http_request.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/curl_http_request.h
new file mode 100644
index 00000000..385091ff
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/curl_http_request.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_CURL_HTTP_REQUEST_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_CURL_HTTP_REQUEST_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <curl/curl.h>
+#include "xla/tsl/platform/cloud/curl_http_request.h"
+#include "tensorflow/core/platform/cloud/http_request.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+// NOLINTBEGIN(misc-unused-using-decls)
+using tsl::CurlHttpRequest;
+using tsl::LibCurl;
+// NOLINTEND(misc-unused-using-decls)
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_CURL_HTTP_REQUEST_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/expiring_lru_cache.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/expiring_lru_cache.h
new file mode 100644
index 00000000..03af7ee7
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_EXPIRING_LRU_CACHE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_EXPIRING_LRU_CACHE_H_ + +#include +#include +#include +#include + +#include "xla/tsl/platform/cloud/expiring_lru_cache.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::ExpiringLRUCache; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_EXPIRING_LRU_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/file_block_cache.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/file_block_cache.h new file mode 100644 index 00000000..4c907437 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/file_block_cache.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_ + +#include +#include +#include +#include +#include +#include + +#include "xla/tsl/platform/cloud/file_block_cache.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::FileBlockCache; +using tsl::FileBlockCacheStatsInterface; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_dns_cache.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_dns_cache.h new file mode 100644 index 00000000..813bcd0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_dns_cache.h @@ -0,0 +1,79 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
+
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "tensorflow/core/platform/cloud/http_request.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+const int64_t kDefaultRefreshRateSecs = 60;
+
+// DnsCache is a userspace DNS cache specialized for the GCS filesystem.
+//
+// Some environments have unreliable DNS resolvers. DnsCache ameliorates the
+// situation by radically reducing the number of DNS requests by performing
+// 2 DNS queries per minute (by default) on a background thread. Updated cache
+// entries are used to override curl's DNS resolution processes.
+class GcsDnsCache {
+ public:
+  // Default no-argument constructor.
+  GcsDnsCache() : GcsDnsCache(kDefaultRefreshRateSecs) {}
+
+  // Constructs a GcsDnsCache with the specified refresh rate.
+  GcsDnsCache(int64_t refresh_rate_secs)
+      : GcsDnsCache(Env::Default(), refresh_rate_secs) {}
+
+  GcsDnsCache(Env* env, int64_t refresh_rate_secs);
+
+  ~GcsDnsCache() {
+    mutex_lock l(mu_);
+    cancelled_ = true;
+    cond_var_.notify_one();
+  }
+
+  // Annotate the given HttpRequest with resolve overrides from the cache.
+  void AnnotateRequest(HttpRequest* request);
+
+ private:
+  static std::vector<string> ResolveName(const string& name);
+  static std::vector<std::vector<string>> ResolveNames(
+      const std::vector<string>& names);
+  void WorkerThread();
+
+  // Define a friend class for testing.
+  friend class GcsDnsCacheTest;
+
+  mutex mu_;
+  Env* env_;
+  condition_variable cond_var_;
+  std::default_random_engine random_ TF_GUARDED_BY(mu_);
+  bool started_ TF_GUARDED_BY(mu_) = false;
+  bool cancelled_ TF_GUARDED_BY(mu_) = false;
+  std::unique_ptr<Thread> worker_ TF_GUARDED_BY(mu_);  // After mutable vars.
+  const int64_t refresh_rate_secs_;
+
+  // Entries in this vector correspond to entries in kCachedDomainNames.
+  std::vector<std::vector<string>> addresses_ TF_GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_file_system.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_file_system.h
new file mode 100644
index 00000000..5545d2b2
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_ + +#include +#include +#include +#include + +#include "xla/tsl/platform/cloud/gcs_file_system.h" +#include "tensorflow/core/platform/cloud/auth_provider.h" +#include "tensorflow/core/platform/cloud/compute_engine_metadata_client.h" +#include "tensorflow/core/platform/cloud/compute_engine_zone_provider.h" +#include "tensorflow/core/platform/cloud/expiring_lru_cache.h" +#include "tensorflow/core/platform/cloud/file_block_cache.h" +#include "tensorflow/core/platform/cloud/gcs_dns_cache.h" +#include "tensorflow/core/platform/cloud/gcs_throttle.h" +#include "tensorflow/core/platform/cloud/http_request.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/retrying_file_system.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::GcsFileSystem; +using tsl::GcsStatsInterface; +using tsl::GetEnvVar; +using tsl::kBlockSize; +using tsl::kDefaultBlockSize; +using tsl::kDefaultMaxCacheSize; +using tsl::kDefaultMaxStaleness; +using tsl::kMaxCacheSize; +using tsl::kMaxStaleness; +using tsl::RetryingGcsFileSystem; +using tsl::UploadSessionHandle; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_throttle.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_throttle.h new file mode 100644 index 00000000..e4a33a38 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/gcs_throttle.h @@ -0,0 +1,29 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ + +#include "xla/tsl/platform/cloud/gcs_throttle.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::GcsThrottle; +using tsl::GcsThrottleConfig; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/google_auth_provider.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/google_auth_provider.h new file mode 100644 index 00000000..afefb308 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/google_auth_provider.h @@ -0,0 +1,32 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_ + +#include + +#include "xla/tsl/platform/cloud/google_auth_provider.h" +#include "tensorflow/core/platform/cloud/auth_provider.h" +#include "tensorflow/core/platform/cloud/compute_engine_metadata_client.h" +#include "tensorflow/core/platform/cloud/oauth_client.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +using tsl::GoogleAuthProvider; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request.h new file mode 100644 index 00000000..aae023b5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request.h @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_ + +#include +#include +#include + +#include "xla/tsl/platform/cloud/http_request.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::HttpRequest; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request_fake.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request_fake.h new file mode 100644 index 00000000..de1177ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/http_request_fake.h @@ -0,0 +1,43 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_ + +#include +#include +#include +#include + +#include +#include "xla/tsl/platform/cloud/http_request_fake.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/cloud/curl_http_request.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::FakeHttpRequest; +using tsl::FakeHttpRequestFactory; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/now_seconds_env.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/now_seconds_env.h new file mode 100644 index 00000000..395e563c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/now_seconds_env.h @@ -0,0 +1,28 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_NOW_SECONDS_ENV_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_NOW_SECONDS_ENV_H_ + +#include "xla/tsl/platform/cloud/now_seconds_env.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::NowSecondsEnv; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_NOW_SECONDS_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/oauth_client.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/oauth_client.h new file mode 100644 index 00000000..ca390c9f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/oauth_client.h @@ -0,0 +1,31 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_ + +#include + +#include "json/json.h" +#include "xla/tsl/platform/cloud/oauth_client.h" +#include "tensorflow/core/platform/cloud/http_request.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +using tsl::OAuthClient; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/ram_file_block_cache.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/ram_file_block_cache.h new file mode 100644 index 00000000..d4de2b42 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/ram_file_block_cache.h @@ -0,0 +1,40 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_ + +#include +#include +#include +#include +#include +#include + +#include "xla/tsl/platform/cloud/ram_file_block_cache.h" +#include "tensorflow/core/platform/cloud/file_block_cache.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::RamFileBlockCache; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/time_util.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/time_util.h new file mode 100644 index 00000000..7110d13c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/time_util.h @@ -0,0 +1,26 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_ + +#include "xla/tsl/platform/cloud/time_util.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +using tsl::ParseRfc3339Time; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cloud/zone_provider.h b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/zone_provider.h new file mode 100644 index 00000000..07ef0609 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cloud/zone_provider.h @@ -0,0 +1,29 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_ZONE_PROVIDER_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_ZONE_PROVIDER_H_ + +#include + +#include "xla/tsl/platform/cloud/zone_provider.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +using tsl::ZoneProvider; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_ZONE_PROVIDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/coding.h b/third_party/tflite-hdrs/tensorflow/core/platform/coding.h new file mode 100644 index 00000000..091d7544 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/coding.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef TENSORFLOW_CORE_PLATFORM_CODING_H_ +#define TENSORFLOW_CORE_PLATFORM_CODING_H_ + +#include "tensorflow/core/platform/raw_coding.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/coding.h" + +namespace tensorflow { +namespace core { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::core::EncodeFixed16; +using tsl::core::EncodeFixed32; +using tsl::core::EncodeFixed64; +using tsl::core::EncodeVarint32; +using tsl::core::EncodeVarint64; +using tsl::core::GetVarint32; +using tsl::core::GetVarint32Ptr; +using tsl::core::GetVarint32PtrFallback; +using tsl::core::GetVarint64; +using tsl::core::GetVarint64Ptr; +using tsl::core::kMaxVarint32Bytes; +using tsl::core::kMaxVarint64Bytes; +using tsl::core::PutFixed16; +using tsl::core::PutFixed32; +using tsl::core::PutFixed64; +using tsl::core::PutVarint32; +using tsl::core::PutVarint64; +using tsl::core::VarintLength; +// NOLINTEND(misc-unused-using-decls) +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/context.h b/third_party/tflite-hdrs/tensorflow/core/platform/context.h new file mode 100644 index 00000000..f93b5695 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/context.h @@ -0,0 +1,32 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CONTEXT_H_ +#define TENSORFLOW_CORE_PLATFORM_CONTEXT_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/context.h" + +namespace tensorflow { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::Context; +using tsl::ContextKind; +using tsl::WithContext; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cord.h b/third_party/tflite-hdrs/tensorflow/core/platform/cord.h new file mode 100644 index 00000000..fa7d2a5d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cord.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CORD_H_ +#define TENSORFLOW_CORE_PLATFORM_CORD_H_ + +#include "tsl/platform/cord.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_PLATFORM_CORD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cpu_feature_guard.h b/third_party/tflite-hdrs/tensorflow/core/platform/cpu_feature_guard.h new file mode 100644 index 00000000..3d7bfe95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cpu_feature_guard.h @@ -0,0 +1,32 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_ +#define TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_ + +namespace tensorflow { +namespace port { + +// Called by the framework when we expect heavy CPU computation and we want to +// be sure that the code has been compiled to run optimally on the current +// hardware. The first time it's called it will run lightweight checks of +// available SIMD acceleration features and log warnings about any that aren't +// used. +void InfoAboutUnusedCPUFeatures(); + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cpu_info.h b/third_party/tflite-hdrs/tensorflow/core/platform/cpu_info.h new file mode 100644 index 00000000..8e0b101b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cpu_info.h @@ -0,0 +1,94 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_ +#define TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_ + +#include + +// TODO(ahentz): This is not strictly required here but, for historical +// reasons, many people depend on cpu_info.h in order to use kLittleEndian. 
+#include "tensorflow/core/platform/byte_order.h" +#include "tsl/platform/cpu_info.h" + +namespace tensorflow { +namespace port { +using tsl::port::Aarch64CPU; +using tsl::port::ADX; +using tsl::port::AES; +using tsl::port::AMX_BF16; +using tsl::port::AMX_FP16; +using tsl::port::AMX_INT8; +using tsl::port::AMX_TILE; +using tsl::port::AVX; +using tsl::port::AVX2; +using tsl::port::AVX512_4FMAPS; +using tsl::port::AVX512_4VNNIW; +using tsl::port::AVX512_BF16; +using tsl::port::AVX512_FP16; +using tsl::port::AVX512_VNNI; +using tsl::port::AVX512BW; +using tsl::port::AVX512CD; +using tsl::port::AVX512DQ; +using tsl::port::AVX512ER; +using tsl::port::AVX512F; +using tsl::port::AVX512IFMA; +using tsl::port::AVX512PF; +using tsl::port::AVX512VBMI; +using tsl::port::AVX512VL; +using tsl::port::AVX_NE_CONVERT; +using tsl::port::AVX_VNNI; +using tsl::port::AVX_VNNI_INT8; +using tsl::port::BMI1; +using tsl::port::BMI2; +using tsl::port::CMOV; +using tsl::port::CMPXCHG16B; +using tsl::port::CMPXCHG8B; +using tsl::port::CPUFamily; +using tsl::port::CPUFeature; +using tsl::port::CPUIDNumSMT; +using tsl::port::CPUModelNum; +using tsl::port::CPUVendorIDString; +using tsl::port::F16C; +using tsl::port::FMA; +using tsl::port::GetCurrentCPU; +using tsl::port::HYPERVISOR; +using tsl::port::kUnknownCPU; +using tsl::port::MaxParallelism; +using tsl::port::MMX; +using tsl::port::NominalCPUFrequency; +using tsl::port::NumHyperthreadsPerCore; +using tsl::port::NumSchedulableCPUs; +using tsl::port::NumTotalCPUs; +using tsl::port::PCLMULQDQ; +using tsl::port::POPCNT; +using tsl::port::PREFETCHW; +using tsl::port::PREFETCHWT1; +using tsl::port::RDRAND; +using tsl::port::RDSEED; +using tsl::port::SMAP; +using tsl::port::SSE; +using tsl::port::SSE2; +using tsl::port::SSE3; +using tsl::port::SSE4_1; +using tsl::port::SSE4_2; +using tsl::port::SSSE3; +using tsl::port::TestAarch64CPU; +using tsl::port::TestCPUFeature; + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/crash_analysis.h b/third_party/tflite-hdrs/tensorflow/core/platform/crash_analysis.h new file mode 100644 index 00000000..c4555ee9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/crash_analysis.h @@ -0,0 +1,22 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CRASH_ANALYSIS_H_ +#define TENSORFLOW_CORE_PLATFORM_CRASH_ANALYSIS_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/crash_analysis.h" + +#endif // TENSORFLOW_CORE_PLATFORM_CRASH_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/ctstring.h b/third_party/tflite-hdrs/tensorflow/core/platform/ctstring.h new file mode 100644 index 00000000..3b9359d4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/ctstring.h @@ -0,0 +1,21 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CTSTRING_H_ +#define TENSORFLOW_CORE_PLATFORM_CTSTRING_H_ + +#include "tsl/platform/ctstring.h" + +#endif // TENSORFLOW_CORE_PLATFORM_CTSTRING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/ctstring_internal.h b/third_party/tflite-hdrs/tensorflow/core/platform/ctstring_internal.h new file mode 100644 index 00000000..c087dbca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/ctstring_internal.h @@ -0,0 +1,21 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CTSTRING_INTERNAL_H_ +#define TENSORFLOW_CORE_PLATFORM_CTSTRING_INTERNAL_H_ + +#include "tsl/platform/ctstring_internal.h" + +#endif // TENSORFLOW_CORE_PLATFORM_CTSTRING_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/cuda.h b/third_party/tflite-hdrs/tensorflow/core/platform/cuda.h new file mode 100644 index 00000000..d032f23a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/cuda.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CUDA_H_ +#define TENSORFLOW_CORE_PLATFORM_CUDA_H_ + +#include "tensorflow/core/platform/platform.h" // IWYU pragma: keep + +#endif // TENSORFLOW_CORE_PLATFORM_CUDA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/demangle.h b/third_party/tflite-hdrs/tensorflow/core/platform/demangle.h new file mode 100644 index 00000000..fd569122 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/demangle.h @@ -0,0 +1,28 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_ +#define TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/demangle.h" + +namespace tensorflow { +namespace port { +using tsl::port::Demangle; +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/denormal.h b/third_party/tflite-hdrs/tensorflow/core/platform/denormal.h new file mode 100644 index 00000000..47dcf75c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/denormal.h @@ -0,0 +1,35 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_DENORMAL_H_ +#define TENSORFLOW_CORE_PLATFORM_DENORMAL_H_ + +#include "tensorflow/core/platform/macros.h" +#include "tsl/platform/denormal.h" + +namespace tensorflow { +namespace port { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::port::DenormalState; +using tsl::port::GetDenormalState; +using tsl::port::ScopedDontFlushDenormal; +using tsl::port::ScopedFlushDenormal; +using tsl::port::ScopedRestoreFlushDenormalState; +using tsl::port::SetDenormalState; +// NOLINTEND(misc-unused-using-decls) +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_DENORMAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/dynamic_annotations.h b/third_party/tflite-hdrs/tensorflow/core/platform/dynamic_annotations.h new file mode 100644 index 00000000..795c978f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/dynamic_annotations.h @@ -0,0 +1,22 @@ +/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_ +#define TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/dynamic_annotations.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/enable_tf2_utils.h b/third_party/tflite-hdrs/tensorflow/core/platform/enable_tf2_utils.h new file mode 100644 index 00000000..856ee1f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/enable_tf2_utils.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ENABLE_TF2_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_ENABLE_TF2_UTILS_H_ + +namespace tensorflow { + +// Sets the tf2 execution state. This can be used to indicate whether the user +// has explicitly asked for tf2 execution. +void set_tf2_execution(bool enabled); + +// Returns true or false depending on whether the user flag for tf2 execution +// has been set. The default is false. +bool tf2_execution_enabled(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ENABLE_TF2_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/env.h b/third_party/tflite-hdrs/tensorflow/core/platform/env.h new file mode 100644 index 00000000..c88c758a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/env.h @@ -0,0 +1,61 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_ENV_H_
+#define TENSORFLOW_CORE_PLATFORM_ENV_H_
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/env_time.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
+#include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+#include "tsl/platform/env.h"
+
+namespace tensorflow {
+// NOLINTBEGIN(misc-unused-using-decls)
+using tsl::Env;
+using tsl::EnvWrapper;
+using tsl::FileSystemCopyFile;
+using tsl::ReadBinaryProto;
+using tsl::ReadFileToString;
+using tsl::ReadTextOrBinaryProto;
+using tsl::ReadTextProto;
+using tsl::setenv;
+using tsl::Thread;
+using tsl::ThreadOptions;
+using tsl::unsetenv;
+using tsl::WriteBinaryProto;
+using tsl::WriteStringToFile;
+using tsl::WriteTextProto;
+namespace register_file_system {
+using tsl::register_file_system::Register;
+}  // namespace register_file_system
+// NOLINTEND(misc-unused-using-decls)
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_ENV_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/env_time.h b/third_party/tflite-hdrs/tensorflow/core/platform/env_time.h
new file mode 100644
index 00000000..b2831965
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/env_time.h
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PLATFORM_ENV_TIME_H_
+#define TENSORFLOW_CORE_PLATFORM_ENV_TIME_H_
+
+#include <stdint.h>
+
+#include "tensorflow/core/platform/types.h"
+#include "tsl/platform/env_time.h"
+
+namespace tensorflow {
+using tsl::EnvTime;  // NOLINT
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_ENV_TIME_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/error_logging.h b/third_party/tflite-hdrs/tensorflow/core/platform/error_logging.h
new file mode 100644
index 00000000..378a0cb6
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/error_logging.h
@@ -0,0 +1,25 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ERROR_LOGGING_H_ +#define TENSORFLOW_CORE_PLATFORM_ERROR_LOGGING_H_ + +#include "tsl/platform/error_logging.h" + +namespace tensorflow { +using tsl::error_logging::Log; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ERROR_LOGGING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/error_payloads.h b/third_party/tflite-hdrs/tensorflow/core/platform/error_payloads.h new file mode 100644 index 00000000..7f1d8b61 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/error_payloads.h @@ -0,0 +1,50 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ERROR_PAYLOADS_H_ +#define TENSORFLOW_CORE_PLATFORM_ERROR_PAYLOADS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/core_platform_payloads.pb.h" +// This file contains macros and payload keys for the error counter in +// EagerClient. + +namespace tsl { + +// Proto: tensorflow::core::platform::ErrorSourceProto +// Location: tensorflow/core/protobuf/core_platform_payloads.proto +// Usage: Payload key for recording the error raised source. Payload value is +// retrieved to update counter in +// tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc. +constexpr char kErrorSource[] = + "type.googleapis.com/tensorflow.core.platform.ErrorSourceProto"; + +// Set payload when status is not ok and ErrorSource payload hasn't been set. +// The code below will be used at every place where we would like to catch +// the error for the error counter in EagerClient. + +void OkOrSetErrorCounterPayload( + const tensorflow::core::platform::ErrorSourceProto::ErrorSource& + error_source, + absl::Status& status); +} // namespace tsl + +namespace tensorflow { +using tsl::kErrorSource; // NOLINT +using tsl::OkOrSetErrorCounterPayload; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ERROR_PAYLOADS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/errors.h b/third_party/tflite-hdrs/tensorflow/core/platform/errors.h new file mode 100644 index 00000000..343edd91 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/errors.h @@ -0,0 +1,107 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ERRORS_H_ +#define TENSORFLOW_CORE_PLATFORM_ERRORS_H_ + +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/str_util.h" +#include "tensorflow/core/platform/strcat.h" +#include "tsl/platform/errors.h" + +namespace tensorflow { +namespace errors { + +// NOLINTBEGIN(misc-unused-using-decls) +// Maps UNIX errors into a Status. +using error::OK; +using tsl::errors::Aborted; +using tsl::errors::AbortedWithPayloads; +using tsl::errors::AlreadyExists; +using tsl::errors::AlreadyExistsWithPayloads; +using tsl::errors::AppendToMessage; +using tsl::errors::Cancelled; +using tsl::errors::CancelledWithPayloads; +using tsl::errors::CopyPayloads; +using tsl::errors::Create; +using tsl::errors::CreateWithUpdatedMessage; +using tsl::errors::DataLoss; +using tsl::errors::DataLossWithPayloads; +using tsl::errors::DeadlineExceeded; +using tsl::errors::DeadlineExceededWithPayloads; +using tsl::errors::FailedPrecondition; +using tsl::errors::FailedPreconditionWithPayloads; +using tsl::errors::FormatColocationNodeForError; +using tsl::errors::FormatFunctionForError; +using tsl::errors::FormatNodeNameForError; +using tsl::errors::FormatNodeNamesForError; +using tsl::errors::FormatOriginalNodeLocationForError; +using tsl::errors::GetPayloads; +using tsl::errors::InsertPayloads; +using tsl::errors::Internal; +using tsl::errors::InternalWithPayloads; +using tsl::errors::InvalidArgument; +using tsl::errors::InvalidArgumentWithPayloads; +using tsl::errors::IOError; +using tsl::errors::IsAborted; +using tsl::errors::IsAlreadyExists; +using tsl::errors::IsCancelled; +using tsl::errors::IsDataLoss; +using tsl::errors::IsDeadlineExceeded; +using tsl::errors::IsFailedPrecondition; +using tsl::errors::IsInternal; +using tsl::errors::IsInvalidArgument; +using tsl::errors::IsNotFound; +using tsl::errors::IsOutOfRange; +using tsl::errors::IsPermissionDenied; +using tsl::errors::IsResourceExhausted; +using tsl::errors::IsUnauthenticated; +using tsl::errors::IsUnavailable; +using tsl::errors::IsUnimplemented; +using tsl::errors::IsUnknown; +using tsl::errors::NotFound; +using tsl::errors::NotFoundWithPayloads; +using tsl::errors::OutOfRange; +using tsl::errors::OutOfRangeWithPayloads; +using tsl::errors::PermissionDenied; +using tsl::errors::PermissionDeniedWithPayloads; +using tsl::errors::ReplaceErrorFromNonCommunicationOps; +using tsl::errors::ResourceExhausted; +using tsl::errors::ResourceExhaustedWithPayloads; +using tsl::errors::Unauthenticated; +using tsl::errors::UnauthenticatedWithPayloads; +using tsl::errors::Unavailable; +using tsl::errors::UnavailableWithPayloads; +using tsl::errors::Unimplemented; +using tsl::errors::UnimplementedWithPayloads; +using tsl::errors::Unknown; +using tsl::errors::UnknownPayloads; +namespace internal { +using tsl::errors::internal::PrepareForStrCat; +} +// 
NOLINTEND(misc-unused-using-decls) + +} // namespace errors +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ERRORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/file_statistics.h b/third_party/tflite-hdrs/tensorflow/core/platform/file_statistics.h new file mode 100644 index 00000000..b9059288 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/file_statistics.h @@ -0,0 +1,26 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ +#define TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/file_statistics.h" + +namespace tensorflow { +using tsl::FileStatistics; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/file_system.h b/third_party/tflite-hdrs/tensorflow/core/platform/file_system.h new file mode 100644 index 00000000..14826a90 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/file_system.h @@ -0,0 +1,48 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_H_ +#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_H_ + +#include + +#include +#include +#include +#include +#include + +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/file_statistics.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/file_system.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::FileSystem; +using tsl::FileSystemRegistry; +using tsl::RandomAccessFile; +using tsl::ReadOnlyMemoryRegion; +using tsl::TransactionToken; +using tsl::WrappedFileSystem; +using tsl::WritableFile; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/file_system_helper.h b/third_party/tflite-hdrs/tensorflow/core/platform/file_system_helper.h new file mode 100644 index 00000000..01b3a92d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/file_system_helper.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_ +#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_ + +#include +#include + +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tsl/platform/file_system_helper.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::Env; +using tsl::FileSystem; + +namespace internal { +using tsl::internal::FileExists; +using tsl::internal::GetMatchingPaths; +} // namespace internal +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/fingerprint.h b/third_party/tflite-hdrs/tensorflow/core/platform/fingerprint.h new file mode 100644 index 00000000..d209799c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/fingerprint.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_ +#define TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_ + +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/fingerprint.h" + +namespace tensorflow { + +using Fprint128 = tsl::Fprint128; +using Fprint128Hasher = tsl::Fprint128Hasher; + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::Fingerprint128; +using tsl::Fingerprint32; +using tsl::Fingerprint64; +using tsl::FingerprintCat64; +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/float8.h b/third_party/tflite-hdrs/tensorflow/core/platform/float8.h new file mode 100644 index 00000000..e2cad449 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/float8.h @@ -0,0 +1,26 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_FLOAT8_H_ +#define TENSORFLOW_CORE_PLATFORM_FLOAT8_H_ + +#include "tsl/platform/ml_dtypes.h" + +namespace tensorflow { +typedef tsl::float8_e4m3fn float8_e4m3fn; +typedef tsl::float8_e5m2 float8_e5m2; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_FLOAT8_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/gif.h b/third_party/tflite-hdrs/tensorflow/core/platform/gif.h new file mode 100644 index 00000000..79af3822 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/gif.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_GIF_H_ +#define TENSORFLOW_CORE_PLATFORM_GIF_H_ + +#include "gif_lib.h" // from @gif + +#endif // TENSORFLOW_CORE_PLATFORM_GIF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/hash.h b/third_party/tflite-hdrs/tensorflow/core/platform/hash.h new file mode 100644 index 00000000..85364243 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/hash.h @@ -0,0 +1,35 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Simple hash functions used for internal data structures + +#ifndef TENSORFLOW_CORE_PLATFORM_HASH_H_ +#define TENSORFLOW_CORE_PLATFORM_HASH_H_ + +#include "tsl/platform/hash.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::hash; +using ::tsl::Hash32; +using ::tsl::Hash64; +using ::tsl::Hash64Combine; +using ::tsl::Hash64CombineUnordered; +using ::tsl::StringPieceHasher; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + + +#endif // TENSORFLOW_CORE_PLATFORM_HASH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/host_info.h b/third_party/tflite-hdrs/tensorflow/core/platform/host_info.h new file mode 100644 index 00000000..caab7ae3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/host_info.h @@ -0,0 +1,31 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_ +#define TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/host_info.h" + +namespace tensorflow { +namespace port { +using tsl::port::Hostname; +using tsl::port::IOStatistics; +using tsl::port::JobName; +using tsl::port::JobUid; +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/human_readable_json.h b/third_party/tflite-hdrs/tensorflow/core/platform/human_readable_json.h new file mode 100644 index 00000000..73cc5165 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/human_readable_json.h @@ -0,0 +1,28 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_ +#define TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_ + +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/human_readable_json.h" + +namespace tensorflow { +using tsl::HumanReadableJsonToProto; +using tsl::ProtoToHumanReadableJson; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/init_main.h b/third_party/tflite-hdrs/tensorflow/core/platform/init_main.h new file mode 100644 index 00000000..07b0620e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/init_main.h @@ -0,0 +1,27 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_ +#define TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_ + +#include "tsl/platform/init_main.h" + +namespace tensorflow { +namespace port { +using tsl::port::InitMain; +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/intrusive_ptr.h b/third_party/tflite-hdrs/tensorflow/core/platform/intrusive_ptr.h new file mode 100644 index 00000000..b46bf5d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/intrusive_ptr.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PLATFORM_INTRUSIVE_PTR_H_ +#define TENSORFLOW_CORE_PLATFORM_INTRUSIVE_PTR_H_ + +#include + +#include "tsl/platform/intrusive_ptr.h" + +namespace tensorflow { +namespace core { + +template +using IntrusivePtr = tsl::core::IntrusivePtr; + +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_INTRUSIVE_PTR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/jpeg.h b/third_party/tflite-hdrs/tensorflow/core/platform/jpeg.h new file mode 100644 index 00000000..68dadd18 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/jpeg.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_JPEG_H_ +#define TENSORFLOW_CORE_PLATFORM_JPEG_H_ + +#include +#include +#include +#include + +extern "C" { +#include "jerror.h" // from @libjpeg_turbo // IWYU pragma: export +#include "jpeglib.h" // from @libjpeg_turbo // IWYU pragma: export +} + +#endif // TENSORFLOW_CORE_PLATFORM_JPEG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/load_library.h b/third_party/tflite-hdrs/tensorflow/core/platform/load_library.h new file mode 100644 index 00000000..6bb4a416 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/load_library.h @@ -0,0 +1,33 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_ +#define TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_ + +#include "tsl/platform/load_library.h" + +namespace tensorflow { + +namespace internal { + +using ::tsl::internal::FormatLibraryFileName; +using ::tsl::internal::GetSymbolFromLibrary; +using ::tsl::internal::LoadDynamicLibrary; + +} // namespace internal + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/logging.h b/third_party/tflite-hdrs/tensorflow/core/platform/logging.h new file mode 100644 index 00000000..0a5b0205 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/logging.h @@ -0,0 +1,36 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_LOGGING_H_
+#define TENSORFLOW_CORE_PLATFORM_LOGGING_H_
+
+#include "tensorflow/core/platform/types.h"  // IWYU pragma: export
+#include "tsl/platform/logging.h"  // IWYU pragma: export
+
+// NOLINTBEGIN(misc-unused-using-decls)
+namespace tensorflow {
+namespace internal {
+using tsl::internal::LogString;
+}  // namespace internal
+using tsl::TFAddLogSink;
+using tsl::TFGetLogSinks;
+using tsl::TFLogEntry;
+using tsl::TFLogSink;
+using tsl::TFRemoveLogSink;
+using tsl::UpdateLogVerbosityIfDefined;
+}  // namespace tensorflow
+// NOLINTEND(misc-unused-using-decls)
+
+#endif  // TENSORFLOW_CORE_PLATFORM_LOGGING_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/macros.h b/third_party/tflite-hdrs/tensorflow/core/platform/macros.h
new file mode 100644
index 00000000..975f1c59
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/macros.h
@@ -0,0 +1,29 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_MACROS_H_
+#define TENSORFLOW_CORE_PLATFORM_MACROS_H_
+
+#include "tsl/platform/macros.h"  // IWYU pragma: export
+
+namespace tensorflow {
+namespace internal {
+template <typename T>
+constexpr auto remove_unused_variable_compiler_warning =
+    tsl::internal::remove_unused_variable_compiler_warning<T>;
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_MACROS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/mem.h b/third_party/tflite-hdrs/tensorflow/core/platform/mem.h
new file mode 100644
index 00000000..20acf859
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/platform/mem.h
@@ -0,0 +1,42 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_MEM_H_
+#define TENSORFLOW_CORE_PLATFORM_MEM_H_
+
+#include "tsl/platform/mem.h"
+// TODO(cwhipkey): remove this when callers use annotations directly.
+#include "tensorflow/core/platform/dynamic_annotations.h" + +namespace tensorflow { +namespace port { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::port::AlignedFree; +using ::tsl::port::AlignedMalloc; +using ::tsl::port::AvailableRam; +using ::tsl::port::Free; +using ::tsl::port::GetMemoryBandwidthInfo; +using ::tsl::port::GetMemoryInfo; +using ::tsl::port::Malloc; +using ::tsl::port::MallocExtension_GetAllocatedSize; +using ::tsl::port::MallocExtension_ReleaseToSystem; +using ::tsl::port::MemoryBandwidthInfo; +using ::tsl::port::MemoryInfo; +using ::tsl::port::Realloc; +// NOLINTEND(misc-unused-using-decls) +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_MEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/mutex.h b/third_party/tflite-hdrs/tensorflow/core/platform/mutex.h new file mode 100644 index 00000000..4a8d76c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/mutex.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_MUTEX_H_ +#define TENSORFLOW_CORE_PLATFORM_MUTEX_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/mutex.h" + +namespace tensorflow { + +using tsl::Condition; +using tsl::condition_variable; +using tsl::ConditionResult; +using tsl::kCond_MaybeNotified; +using tsl::kCond_Timeout; +using tsl::LINKER_INITIALIZED; +using tsl::LinkerInitialized; +using tsl::mutex; +using tsl::mutex_lock; +using tsl::tf_shared_lock; +using tsl::WaitForMilliseconds; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_MUTEX_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/net.h b/third_party/tflite-hdrs/tensorflow/core/platform/net.h new file mode 100644 index 00000000..4b9d51fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/net.h @@ -0,0 +1,27 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_NET_H_ +#define TENSORFLOW_CORE_PLATFORM_NET_H_ + +#include "tsl/platform/net.h" + +namespace tensorflow { +namespace internal { +using ::tsl::internal::PickUnusedPortOrDie; // NOLINT(misc-unused-using-decls) +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_NET_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/notification.h b/third_party/tflite-hdrs/tensorflow/core/platform/notification.h new file mode 100644 index 00000000..a2d48a63 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/notification.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_NOTIFICATION_H_ +#define TENSORFLOW_CORE_PLATFORM_NOTIFICATION_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/notification.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::Notification; +using tsl::WaitForNotificationWithTimeout; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_NOTIFICATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/null_file_system.h b/third_party/tflite-hdrs/tensorflow/core/platform/null_file_system.h new file mode 100644 index 00000000..3fc7d179 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/null_file_system.h @@ -0,0 +1,29 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_ +#define TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_ + +#include "tsl/platform/null_file_system.h" + +namespace tensorflow { +#ifndef SWIG +using ::tsl::NullFileSystem; // NOLINT(misc-unused-using-decls) +#endif + +// END_SKIP_DOXYGEN +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/numa.h b/third_party/tflite-hdrs/tensorflow/core/platform/numa.h new file mode 100644 index 00000000..6333c01f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/numa.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_NUMA_H_ +#define TENSORFLOW_CORE_PLATFORM_NUMA_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/numa.h" + +namespace tensorflow { +namespace port { +using tsl::port::kNUMANoAffinity; +using tsl::port::NUMAEnabled; +using tsl::port::NUMAFree; +using tsl::port::NUMAGetMemAffinity; +using tsl::port::NUMAGetThreadNodeAffinity; +using tsl::port::NUMAMalloc; +using tsl::port::NUMANumNodes; +using tsl::port::NUMASetThreadNodeAffinity; +} // namespace port +} // namespace tensorflow +#endif // TENSORFLOW_CORE_PLATFORM_NUMA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/numbers.h b/third_party/tflite-hdrs/tensorflow/core/platform/numbers.h new file mode 100644 index 00000000..3164aab4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/numbers.h @@ -0,0 +1,52 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_NUMBERS_H_ +#define TENSORFLOW_CORE_PLATFORM_NUMBERS_H_ + +#include + +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/numbers.h" + +namespace tensorflow { +namespace strings { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::strings::DoubleToBuffer; +using tsl::strings::FastInt32ToBufferLeft; +using tsl::strings::FastInt64ToBufferLeft; +using tsl::strings::FastUInt32ToBufferLeft; +using tsl::strings::FastUInt64ToBufferLeft; +using tsl::strings::FloatToBuffer; +using tsl::strings::FpToString; +using tsl::strings::HexStringToUint64; +using tsl::strings::HumanReadableElapsedTime; +using tsl::strings::HumanReadableNum; +using tsl::strings::HumanReadableNumBytes; +using tsl::strings::kFastToBufferSize; +using tsl::strings::ProtoParseNumeric; +using tsl::strings::safe_strto32; +using tsl::strings::safe_strto64; +using tsl::strings::safe_strtod; +using tsl::strings::safe_strtof; +using tsl::strings::safe_strtou32; +using tsl::strings::safe_strtou64; +using tsl::strings::SafeStringToNumeric; +// NOLINTEND(misc-unused-using-decls) +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_NUMBERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/path.h b/third_party/tflite-hdrs/tensorflow/core/platform/path.h new file mode 100644 index 00000000..ca13a99f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/path.h @@ -0,0 +1,47 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PATH_H_ +#define TENSORFLOW_CORE_PLATFORM_PATH_H_ + +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/path.h" + +// NOLINTBEGIN(misc-unused-using-decls) +namespace tensorflow { +namespace io { +namespace internal { +using tsl::io::internal::JoinPathImpl; +} +#ifndef SWIG // variadic templates +using tsl::io::JoinPath; +#endif /* SWIG */ +using tsl::io::Basename; +using tsl::io::BasenamePrefix; +using tsl::io::CleanPath; +using tsl::io::CommonPathPrefix; +using tsl::io::CreateURI; +using tsl::io::Dirname; +using tsl::io::Extension; +using tsl::io::GetTempFilename; +using tsl::io::GetTestUndeclaredOutputsDir; +using tsl::io::IsAbsolutePath; +using tsl::io::ParseURI; +} // namespace io +} // namespace tensorflow +// NOLINTEND(misc-unused-using-decls) + +#endif // TENSORFLOW_CORE_PLATFORM_PATH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/platform.h b/third_party/tflite-hdrs/tensorflow/core/platform/platform.h new file mode 100644 index 00000000..6d5d9879 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/platform.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_H_ +#define TENSORFLOW_CORE_PLATFORM_PLATFORM_H_ + +#include "tsl/platform/platform.h" + +#endif // TENSORFLOW_CORE_PLATFORM_PLATFORM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings.h b/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings.h new file mode 100644 index 00000000..a42f7c76 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings.h @@ -0,0 +1,362 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_ +#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_ + +// This header defines the macro TF_PLATFORM_STRINGS() which should be used +// once in each dynamically loadable TensorFlow module. It embeds static +// strings into the compilation unit that allow TensorFlow to determine what +// compilation options were in effect when the compilation unit was built. All +// compilation units within the same dynamically loadable library should be +// built with the same options (or at least, the strings should be embedded in +// the compilation unit built with the most restrictive options). + +// The platform strings embedded into a binary may be retrieved with the +// GetPlatformStrings function. + +// Rationale: +// We wish to load only those libraries that this CPU can execute. For +// example, we should not load a library compiled with avx256 instructions on a +// CPU that cannot execute them. +// +// One might think that one could dlopen() the library, and call a routine that +// would return which cpu type it was compiled for. Alas, this does not work, +// because at dlopen() time, a library containing C++ will execute constructors +// of class variables with static storage class. Even code that looks +// innocuous may use optional platform-specific instructions. For example, +// the fastest way to zero a region of memory might use optional instructions. +// +// One might think one could run a tool such as "objdump" to read flags from +// the libraries' headers, or perhaps disassemble each library to look for +// particular instructions. Unfortunately, the desired flags are not present +// in the headers, and disassembly can be prohibitively slow ("objdump -d" is +// very slow, for example). 
Moreover, a tool to examine the library may not
+// be present on the system unless the user has installed special packages (for
+// example, on Windows).
+//
+// Instead, we adopt a crude but straightforward solution: We require
+// developers to use the macro TF_PLATFORM_STRINGS() in their library, to
+// embed the compilation options as constant strings. The compiler's
+// predefined macros pick which strings are included. We then search for the
+// strings in the files, and then dlopen() only those libraries that have or
+// lack strings as needed.
+//
+// We adopt the approach of placing in the binary a fairly raw copy of the
+// predefined macros, rather than trying to interpret them in complex ways at
+// compile time. This allows the loading binary to alter its interpretation of
+// the strings without library developers having to recompile.
+
+#include <stdio.h>
+
+#include <string>
+#include <vector>
+
+// Aside from the header guard, the internal macros defined here have the form:
+//    TF_PLAT_STR_*
+
+// If a macro is removed from the list of tested macros, the major version in
+// the following version number should be incremented, and the minor version
+// set to zero. Otherwise, if a macro is added to the list of tested macros,
+// the minor number should be incremented.
+#define TF_PLAT_STR_VERSION_ "1.0"
+
+// Prefix of each option string indicator in the binary.
+// After the prefix, such strings have the form:
+//    [A-Za-z_0-9]=<value>
+// followed by a terminating nul. To simplify searching, this prefix is all
+// ASCII, starts with a nul, and contains no character twice.
+#define TF_PLAT_STR_MAGIC_PREFIX_ "\0S\\s\":^p*L}"
+
+// A helper macro for TF_PLAT_STR_AS_STR_().
+#define TF_PLAT_STR_STR_1_(x) #x
+
+// Yield a constant string corresponding to x, after macro expansion.
+#define TF_PLAT_STR_AS_STR_(x) TF_PLAT_STR_STR_1_(x)
+
+// An empty definition to make lists more uniform.
+#define TF_PLAT_STR_TERMINATOR_
+
+// TF_PLAT_STR_(x) introduces a constant string indicating whether a
+// particular compilation option has been turned on.
+//
+// In gcc and clang, we might imagine using something like
+//     #define TF_PLAT_STR_(x) \
+//         (sizeof (#x) != sizeof (TF_PLAT_STR_AS_STR_ (x))? \
+//          TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_ (x) : \
+//          TF_PLAT_STR_MAGIC_PREFIX_ #x "=0"),
+// but some compilers (notably MSVC) place both "foo" and "bar" in the binary
+// when presented with
+//     (true? "foo" : "bar")
+// so we must use #if to select the strings we need, which is rather verbose.
+#define TF_PLAT_STR_(x) TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_(x)
+
+// Include the #if machinery that sets the macros used below.
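+// [Editor's note, not part of the upstream header] A minimal worked example of
+// the TF_PLAT_STR_ machinery above together with the computed header included
+// below, assuming the compiler predefines __AVX__ to 1: the computed header
+// then defines
+//     #define TF_PLAT_STR___AVX__ TF_PLAT_STR_(__AVX__)
+// and TF_PLAT_STR_(__AVX__) expands to the string-literal concatenation
+//     TF_PLAT_STR_MAGIC_PREFIX_ "__AVX__" "=" "1"
+// i.e. the magic prefix followed by "__AVX__=1", which is what the loader can
+// later search for in the built library. When __AVX__ is not defined, the
+// computed header defines TF_PLAT_STR___AVX__ as empty, so no string is
+// embedded for that option.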
+// platform_strings_computed.h can be generated by filtering this header file +// through: +// awk ' +// header == "" { print; } +// /\*\// && header == "" { +// print "// Generated from platform_strings.h."; +// print ""; +// print "#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_"; +// print "#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_"; +// print ""; +// header = 1; +// } +// /^#define TF_PLAT_STR_LIST_[a-zA-Z0-9_]*\(\) *\\$/ { active = 1; } +// /TF_PLAT_STR_TERMINATOR_/ { active = 0; } +// /^ *TF_PLAT_STR_[A-Za-z0-9_]* *\\$/ && active { +// x = $0; +// sub(/^ *TF_PLAT_STR_/, "", x); +// sub(/ *\\$/, "", x); +// printf ("#if defined(%s)\n", x); +// printf ("#define TF_PLAT_STR_%s TF_PLAT_STR_(%s)\n", x, x); +// printf ("#else\n"); +// printf ("#define TF_PLAT_STR_%s\n", x); +// printf ("#endif\n"); +// } +// END { +// print ""; +// print "#endif // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_"; +// }' +#include "tensorflow/core/platform/platform_strings_computed.h" + +// clang-format butchers the following lines. +// clang-format off + +// x86_64 and x86_32 optional features. +#define TF_PLAT_STR_LIST___x86_64__() \ + TF_PLAT_STR__M_IX86_FP \ + TF_PLAT_STR__NO_PREFETCHW \ + TF_PLAT_STR___3dNOW_A__ \ + TF_PLAT_STR___3dNOW__ \ + TF_PLAT_STR___ABM__ \ + TF_PLAT_STR___ADX__ \ + TF_PLAT_STR___AES__ \ + TF_PLAT_STR___AVX2__ \ + TF_PLAT_STR___AVX512BW__ \ + TF_PLAT_STR___AVX512CD__ \ + TF_PLAT_STR___AVX512DQ__ \ + TF_PLAT_STR___AVX512ER__ \ + TF_PLAT_STR___AVX512F__ \ + TF_PLAT_STR___AVX512IFMA__ \ + TF_PLAT_STR___AVX512PF__ \ + TF_PLAT_STR___AVX512VBMI__ \ + TF_PLAT_STR___AVX512VL__ \ + TF_PLAT_STR___AVX__ \ + TF_PLAT_STR___BMI2__ \ + TF_PLAT_STR___BMI__ \ + TF_PLAT_STR___CLFLUSHOPT__ \ + TF_PLAT_STR___CLZERO__ \ + TF_PLAT_STR___F16C__ \ + TF_PLAT_STR___FMA4__ \ + TF_PLAT_STR___FMA__ \ + TF_PLAT_STR___FP_FAST_FMA \ + TF_PLAT_STR___FP_FAST_FMAF \ + TF_PLAT_STR___FSGSBASE__ \ + TF_PLAT_STR___FXSR__ \ + TF_PLAT_STR___LWP__ \ + TF_PLAT_STR___LZCNT__ \ + TF_PLAT_STR___MMX__ \ + TF_PLAT_STR___MWAITX__ \ + TF_PLAT_STR___PCLMUL__ \ + TF_PLAT_STR___PKU__ \ + TF_PLAT_STR___POPCNT__ \ + TF_PLAT_STR___PRFCHW__ \ + TF_PLAT_STR___RDRND__ \ + TF_PLAT_STR___RDSEED__ \ + TF_PLAT_STR___RTM__ \ + TF_PLAT_STR___SHA__ \ + TF_PLAT_STR___SSE2_MATH__ \ + TF_PLAT_STR___SSE2__ \ + TF_PLAT_STR___SSE_MATH__ \ + TF_PLAT_STR___SSE__ \ + TF_PLAT_STR___SSE3__ \ + TF_PLAT_STR___SSE4A__ \ + TF_PLAT_STR___SSE4_1__ \ + TF_PLAT_STR___SSE4_2__ \ + TF_PLAT_STR___SSSE3__ \ + TF_PLAT_STR___TBM__ \ + TF_PLAT_STR___XOP__ \ + TF_PLAT_STR___XSAVEC__ \ + TF_PLAT_STR___XSAVEOPT__ \ + TF_PLAT_STR___XSAVES__ \ + TF_PLAT_STR___XSAVE__ \ + TF_PLAT_STR_TERMINATOR_ + +// PowerPC (64- and 32-bit) optional features. 
+#define TF_PLAT_STR_LIST___powerpc64__() \ + TF_PLAT_STR__SOFT_DOUBLE \ + TF_PLAT_STR__SOFT_FLOAT \ + TF_PLAT_STR___ALTIVEC__ \ + TF_PLAT_STR___APPLE_ALTIVEC__ \ + TF_PLAT_STR___CRYPTO__ \ + TF_PLAT_STR___FLOAT128_HARDWARE__ \ + TF_PLAT_STR___FLOAT128_TYPE__ \ + TF_PLAT_STR___FP_FAST_FMA \ + TF_PLAT_STR___FP_FAST_FMAF \ + TF_PLAT_STR___HTM__ \ + TF_PLAT_STR___NO_FPRS__ \ + TF_PLAT_STR___NO_LWSYNC__ \ + TF_PLAT_STR___POWER8_VECTOR__ \ + TF_PLAT_STR___POWER9_VECTOR__ \ + TF_PLAT_STR___PPC405__ \ + TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ \ + TF_PLAT_STR___RECIPF__ \ + TF_PLAT_STR___RECIP_PRECISION__ \ + TF_PLAT_STR___RECIP__ \ + TF_PLAT_STR___RSQRTEF__ \ + TF_PLAT_STR___RSQRTE__ \ + TF_PLAT_STR___TM_FENCE__ \ + TF_PLAT_STR___UPPER_REGS_DF__ \ + TF_PLAT_STR___UPPER_REGS_SF__ \ + TF_PLAT_STR___VEC__ \ + TF_PLAT_STR___VSX__ \ + TF_PLAT_STR_TERMINATOR_ + +// aarch64 and 32-bit arm optional features +#define TF_PLAT_STR_LIST___aarch64__() \ + TF_PLAT_STR___ARM_ARCH \ + TF_PLAT_STR___ARM_FEATURE_CLZ \ + TF_PLAT_STR___ARM_FEATURE_CRC32 \ + TF_PLAT_STR___ARM_FEATURE_CRC32 \ + TF_PLAT_STR___ARM_FEATURE_CRYPTO \ + TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING \ + TF_PLAT_STR___ARM_FEATURE_DSP \ + TF_PLAT_STR___ARM_FEATURE_FMA \ + TF_PLAT_STR___ARM_FEATURE_IDIV \ + TF_PLAT_STR___ARM_FEATURE_LDREX \ + TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN \ + TF_PLAT_STR___ARM_FEATURE_QBIT \ + TF_PLAT_STR___ARM_FEATURE_QRDMX \ + TF_PLAT_STR___ARM_FEATURE_SAT \ + TF_PLAT_STR___ARM_FEATURE_SIMD32 \ + TF_PLAT_STR___ARM_FEATURE_UNALIGNED \ + TF_PLAT_STR___ARM_FP \ + TF_PLAT_STR___ARM_NEON_FP \ + TF_PLAT_STR___ARM_NEON__ \ + TF_PLAT_STR___ARM_WMMX \ + TF_PLAT_STR___IWMMXT2__ \ + TF_PLAT_STR___IWMMXT__ \ + TF_PLAT_STR___VFP_FP__ \ + TF_PLAT_STR_TERMINATOR_ + +// Generic features, including indication of architecture and OS. +// The _M_* macros are defined by Visual Studio. +// It doesn't define __LITTLE_ENDIAN__ or __BYTE_ORDER__; +// Windows is assumed to be little endian. 
+#define TF_PLAT_STR_LIST___generic__() \ + TF_PLAT_STR_TARGET_IPHONE_SIMULATOR \ + TF_PLAT_STR_TARGET_OS_IOS \ + TF_PLAT_STR_TARGET_OS_IPHONE \ + TF_PLAT_STR__MSC_VER \ + TF_PLAT_STR__M_ARM \ + TF_PLAT_STR__M_ARM64 \ + TF_PLAT_STR__M_ARM_ARMV7VE \ + TF_PLAT_STR__M_ARM_FP \ + TF_PLAT_STR__M_IX86 \ + TF_PLAT_STR__M_X64 \ + TF_PLAT_STR__WIN32 \ + TF_PLAT_STR__WIN64 \ + TF_PLAT_STR___ANDROID__ \ + TF_PLAT_STR___APPLE__ \ + TF_PLAT_STR___BYTE_ORDER__ \ + TF_PLAT_STR___CYGWIN__ \ + TF_PLAT_STR___FreeBSD__ \ + TF_PLAT_STR___LITTLE_ENDIAN__ \ + TF_PLAT_STR___NetBSD__ \ + TF_PLAT_STR___OpenBSD__ \ + TF_PLAT_STR_____MSYS__ \ + TF_PLAT_STR___aarch64__ \ + TF_PLAT_STR___alpha__ \ + TF_PLAT_STR___arm__ \ + TF_PLAT_STR___i386__ \ + TF_PLAT_STR___i686__ \ + TF_PLAT_STR___ia64__ \ + TF_PLAT_STR___linux__ \ + TF_PLAT_STR___mips32__ \ + TF_PLAT_STR___mips64__ \ + TF_PLAT_STR___powerpc64__ \ + TF_PLAT_STR___powerpc__ \ + TF_PLAT_STR___riscv___ \ + TF_PLAT_STR___s390x__ \ + TF_PLAT_STR___sparc64__ \ + TF_PLAT_STR___sparc__ \ + TF_PLAT_STR___x86_64__ \ + TF_PLAT_STR_TERMINATOR_ + +#if !defined(__x86_64__) && !defined(_M_X64) && \ + !defined(__i386__) && !defined(_M_IX86) +#undef TF_PLAT_STR_LIST___x86_64__ +#define TF_PLAT_STR_LIST___x86_64__() +#endif +#if !defined(__powerpc64__) && !defined(__powerpc__) +#undef TF_PLAT_STR_LIST___powerpc64__ +#define TF_PLAT_STR_LIST___powerpc64__() +#endif +#if !defined(__aarch64__) && !defined(_M_ARM64) && \ + !defined(__arm__) && !defined(_M_ARM) +#undef TF_PLAT_STR_LIST___aarch64__ +#define TF_PLAT_STR_LIST___aarch64__() +#endif + +// Macro to be used in each dynamically loadable library. +// +// The BSS global variable tf_cpu_option_global and the class +// instance tf_cpu_option_avoid_omit_class are needed to prevent +// compilers/linkers such as clang from omitting the static variable +// tf_cpu_option[], which would otherwise appear to be unused. We cannot make +// tf_cpu_option[] global, because we then might get multiply-defined symbols +// if TF_PLAT_STR() is used twice in the same library. +// (tf_cpu_option_global doesn't see such errors because it is +// defined in BSS, so multiple definitions are combined by the linker.) gcc's +// __attribute__((used)) is insufficient because it seems to be ignored by +// linkers. +#define TF_PLATFORM_STRINGS() \ + static const char tf_cpu_option[] = \ + TF_PLAT_STR_MAGIC_PREFIX_ "TF_PLAT_STR_VERSION=" TF_PLAT_STR_VERSION_ \ + TF_PLAT_STR_LIST___x86_64__() \ + TF_PLAT_STR_LIST___powerpc64__() \ + TF_PLAT_STR_LIST___aarch64__() \ + TF_PLAT_STR_LIST___generic__() \ + ; \ + const char *tf_cpu_option_global; \ + namespace { \ + class TFCPUOptionHelper { \ + public: \ + TFCPUOptionHelper() { \ + /* Compilers/linkers remove unused variables aggressively. The */ \ + /* following gyrations subvert most such optimizations. */ \ + tf_cpu_option_global = tf_cpu_option; \ + /* Nothing is printed because the string starts with a nul. */ \ + printf("%s%s", tf_cpu_option, ""); \ + } \ + } tf_cpu_option_avoid_omit_class; \ + } /* anonymous namespace */ +// clang-format on + +namespace tensorflow { + +// Retrieves the platform strings from the file at the given path and appends +// them to the given vector. If the returned int is non-zero, an error occurred +// reading the file and vector may or may not be modified. The returned error +// code is suitable for use with strerror(). 
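+//
+// [Editor's illustration, not part of the upstream header] A minimal usage
+// sketch. Assume a kernels library built from a source file that invokes the
+// macro once at namespace scope ("my_kernels.cc" and "my_kernels.so" are
+// hypothetical names):
+//
+//     // my_kernels.cc
+//     #include "tensorflow/core/platform/platform_strings.h"
+//     TF_PLATFORM_STRINGS()
+//
+// A loader can then read the embedded options back before deciding whether to
+// dlopen() the library:
+//
+//     std::vector<std::string> strings;
+//     if (tensorflow::GetPlatformStrings("my_kernels.so", &strings) != 0) {
+//       // Non-zero return: the file could not be read; see strerror().
+//     } else {
+//       for (const std::string& s : strings) {
+//         // Each entry records one predefined macro setting, e.g. "__AVX__=1";
+//         // compare against what the running CPU supports before loading.
+//       }
+//     }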
+int GetPlatformStrings(const std::string& path, + std::vector* found); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings_computed.h b/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings_computed.h new file mode 100644 index 00000000..6a17f3bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/platform_strings_computed.h @@ -0,0 +1,735 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Generated from platform_strings.h. + +#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_ +#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_ + +#if defined(_M_IX86_FP) +#define TF_PLAT_STR__M_IX86_FP TF_PLAT_STR_(_M_IX86_FP) +#else +#define TF_PLAT_STR__M_IX86_FP +#endif +#if defined(_NO_PREFETCHW) +#define TF_PLAT_STR__NO_PREFETCHW TF_PLAT_STR_(_NO_PREFETCHW) +#else +#define TF_PLAT_STR__NO_PREFETCHW +#endif +#if defined(__3dNOW_A__) +#define TF_PLAT_STR___3dNOW_A__ TF_PLAT_STR_(__3dNOW_A__) +#else +#define TF_PLAT_STR___3dNOW_A__ +#endif +#if defined(__3dNOW__) +#define TF_PLAT_STR___3dNOW__ TF_PLAT_STR_(__3dNOW__) +#else +#define TF_PLAT_STR___3dNOW__ +#endif +#if defined(__ABM__) +#define TF_PLAT_STR___ABM__ TF_PLAT_STR_(__ABM__) +#else +#define TF_PLAT_STR___ABM__ +#endif +#if defined(__ADX__) +#define TF_PLAT_STR___ADX__ TF_PLAT_STR_(__ADX__) +#else +#define TF_PLAT_STR___ADX__ +#endif +#if defined(__AES__) +#define TF_PLAT_STR___AES__ TF_PLAT_STR_(__AES__) +#else +#define TF_PLAT_STR___AES__ +#endif +#if defined(__AVX2__) +#define TF_PLAT_STR___AVX2__ TF_PLAT_STR_(__AVX2__) +#else +#define TF_PLAT_STR___AVX2__ +#endif +#if defined(__AVX512BW__) +#define TF_PLAT_STR___AVX512BW__ TF_PLAT_STR_(__AVX512BW__) +#else +#define TF_PLAT_STR___AVX512BW__ +#endif +#if defined(__AVX512CD__) +#define TF_PLAT_STR___AVX512CD__ TF_PLAT_STR_(__AVX512CD__) +#else +#define TF_PLAT_STR___AVX512CD__ +#endif +#if defined(__AVX512DQ__) +#define TF_PLAT_STR___AVX512DQ__ TF_PLAT_STR_(__AVX512DQ__) +#else +#define TF_PLAT_STR___AVX512DQ__ +#endif +#if defined(__AVX512ER__) +#define TF_PLAT_STR___AVX512ER__ TF_PLAT_STR_(__AVX512ER__) +#else +#define TF_PLAT_STR___AVX512ER__ +#endif +#if defined(__AVX512F__) +#define TF_PLAT_STR___AVX512F__ TF_PLAT_STR_(__AVX512F__) +#else +#define TF_PLAT_STR___AVX512F__ +#endif +#if defined(__AVX512IFMA__) +#define TF_PLAT_STR___AVX512IFMA__ TF_PLAT_STR_(__AVX512IFMA__) +#else +#define TF_PLAT_STR___AVX512IFMA__ +#endif +#if defined(__AVX512PF__) +#define TF_PLAT_STR___AVX512PF__ TF_PLAT_STR_(__AVX512PF__) +#else +#define TF_PLAT_STR___AVX512PF__ +#endif +#if defined(__AVX512VBMI__) +#define TF_PLAT_STR___AVX512VBMI__ TF_PLAT_STR_(__AVX512VBMI__) +#else +#define TF_PLAT_STR___AVX512VBMI__ +#endif +#if defined(__AVX512VL__) +#define TF_PLAT_STR___AVX512VL__ TF_PLAT_STR_(__AVX512VL__) +#else +#define 
TF_PLAT_STR___AVX512VL__ +#endif +#if defined(__AVX__) +#define TF_PLAT_STR___AVX__ TF_PLAT_STR_(__AVX__) +#else +#define TF_PLAT_STR___AVX__ +#endif +#if defined(__BMI2__) +#define TF_PLAT_STR___BMI2__ TF_PLAT_STR_(__BMI2__) +#else +#define TF_PLAT_STR___BMI2__ +#endif +#if defined(__BMI__) +#define TF_PLAT_STR___BMI__ TF_PLAT_STR_(__BMI__) +#else +#define TF_PLAT_STR___BMI__ +#endif +#if defined(__CLFLUSHOPT__) +#define TF_PLAT_STR___CLFLUSHOPT__ TF_PLAT_STR_(__CLFLUSHOPT__) +#else +#define TF_PLAT_STR___CLFLUSHOPT__ +#endif +#if defined(__CLZERO__) +#define TF_PLAT_STR___CLZERO__ TF_PLAT_STR_(__CLZERO__) +#else +#define TF_PLAT_STR___CLZERO__ +#endif +#if defined(__F16C__) +#define TF_PLAT_STR___F16C__ TF_PLAT_STR_(__F16C__) +#else +#define TF_PLAT_STR___F16C__ +#endif +#if defined(__FMA4__) +#define TF_PLAT_STR___FMA4__ TF_PLAT_STR_(__FMA4__) +#else +#define TF_PLAT_STR___FMA4__ +#endif +#if defined(__FMA__) +#define TF_PLAT_STR___FMA__ TF_PLAT_STR_(__FMA__) +#else +#define TF_PLAT_STR___FMA__ +#endif +#if defined(__FP_FAST_FMA) +#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA) +#else +#define TF_PLAT_STR___FP_FAST_FMA +#endif +#if defined(__FP_FAST_FMAF) +#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF) +#else +#define TF_PLAT_STR___FP_FAST_FMAF +#endif +#if defined(__FSGSBASE__) +#define TF_PLAT_STR___FSGSBASE__ TF_PLAT_STR_(__FSGSBASE__) +#else +#define TF_PLAT_STR___FSGSBASE__ +#endif +#if defined(__FXSR__) +#define TF_PLAT_STR___FXSR__ TF_PLAT_STR_(__FXSR__) +#else +#define TF_PLAT_STR___FXSR__ +#endif +#if defined(__LWP__) +#define TF_PLAT_STR___LWP__ TF_PLAT_STR_(__LWP__) +#else +#define TF_PLAT_STR___LWP__ +#endif +#if defined(__LZCNT__) +#define TF_PLAT_STR___LZCNT__ TF_PLAT_STR_(__LZCNT__) +#else +#define TF_PLAT_STR___LZCNT__ +#endif +#if defined(__MMX__) +#define TF_PLAT_STR___MMX__ TF_PLAT_STR_(__MMX__) +#else +#define TF_PLAT_STR___MMX__ +#endif +#if defined(__MWAITX__) +#define TF_PLAT_STR___MWAITX__ TF_PLAT_STR_(__MWAITX__) +#else +#define TF_PLAT_STR___MWAITX__ +#endif +#if defined(__PCLMUL__) +#define TF_PLAT_STR___PCLMUL__ TF_PLAT_STR_(__PCLMUL__) +#else +#define TF_PLAT_STR___PCLMUL__ +#endif +#if defined(__PKU__) +#define TF_PLAT_STR___PKU__ TF_PLAT_STR_(__PKU__) +#else +#define TF_PLAT_STR___PKU__ +#endif +#if defined(__POPCNT__) +#define TF_PLAT_STR___POPCNT__ TF_PLAT_STR_(__POPCNT__) +#else +#define TF_PLAT_STR___POPCNT__ +#endif +#if defined(__PRFCHW__) +#define TF_PLAT_STR___PRFCHW__ TF_PLAT_STR_(__PRFCHW__) +#else +#define TF_PLAT_STR___PRFCHW__ +#endif +#if defined(__RDRND__) +#define TF_PLAT_STR___RDRND__ TF_PLAT_STR_(__RDRND__) +#else +#define TF_PLAT_STR___RDRND__ +#endif +#if defined(__RDSEED__) +#define TF_PLAT_STR___RDSEED__ TF_PLAT_STR_(__RDSEED__) +#else +#define TF_PLAT_STR___RDSEED__ +#endif +#if defined(__RTM__) +#define TF_PLAT_STR___RTM__ TF_PLAT_STR_(__RTM__) +#else +#define TF_PLAT_STR___RTM__ +#endif +#if defined(__SHA__) +#define TF_PLAT_STR___SHA__ TF_PLAT_STR_(__SHA__) +#else +#define TF_PLAT_STR___SHA__ +#endif +#if defined(__SSE2_MATH__) +#define TF_PLAT_STR___SSE2_MATH__ TF_PLAT_STR_(__SSE2_MATH__) +#else +#define TF_PLAT_STR___SSE2_MATH__ +#endif +#if defined(__SSE2__) +#define TF_PLAT_STR___SSE2__ TF_PLAT_STR_(__SSE2__) +#else +#define TF_PLAT_STR___SSE2__ +#endif +#if defined(__SSE_MATH__) +#define TF_PLAT_STR___SSE_MATH__ TF_PLAT_STR_(__SSE_MATH__) +#else +#define TF_PLAT_STR___SSE_MATH__ +#endif +#if defined(__SSE__) +#define TF_PLAT_STR___SSE__ TF_PLAT_STR_(__SSE__) +#else +#define TF_PLAT_STR___SSE__ 
+#endif +#if defined(__SSE3__) +#define TF_PLAT_STR___SSE3__ TF_PLAT_STR_(__SSE3__) +#else +#define TF_PLAT_STR___SSE3__ +#endif +#if defined(__SSE4A__) +#define TF_PLAT_STR___SSE4A__ TF_PLAT_STR_(__SSE4A__) +#else +#define TF_PLAT_STR___SSE4A__ +#endif +#if defined(__SSE4_1__) +#define TF_PLAT_STR___SSE4_1__ TF_PLAT_STR_(__SSE4_1__) +#else +#define TF_PLAT_STR___SSE4_1__ +#endif +#if defined(__SSE4_2__) +#define TF_PLAT_STR___SSE4_2__ TF_PLAT_STR_(__SSE4_2__) +#else +#define TF_PLAT_STR___SSE4_2__ +#endif +#if defined(__SSSE3__) +#define TF_PLAT_STR___SSSE3__ TF_PLAT_STR_(__SSSE3__) +#else +#define TF_PLAT_STR___SSSE3__ +#endif +#if defined(__TBM__) +#define TF_PLAT_STR___TBM__ TF_PLAT_STR_(__TBM__) +#else +#define TF_PLAT_STR___TBM__ +#endif +#if defined(__XOP__) +#define TF_PLAT_STR___XOP__ TF_PLAT_STR_(__XOP__) +#else +#define TF_PLAT_STR___XOP__ +#endif +#if defined(__XSAVEC__) +#define TF_PLAT_STR___XSAVEC__ TF_PLAT_STR_(__XSAVEC__) +#else +#define TF_PLAT_STR___XSAVEC__ +#endif +#if defined(__XSAVEOPT__) +#define TF_PLAT_STR___XSAVEOPT__ TF_PLAT_STR_(__XSAVEOPT__) +#else +#define TF_PLAT_STR___XSAVEOPT__ +#endif +#if defined(__XSAVES__) +#define TF_PLAT_STR___XSAVES__ TF_PLAT_STR_(__XSAVES__) +#else +#define TF_PLAT_STR___XSAVES__ +#endif +#if defined(__XSAVE__) +#define TF_PLAT_STR___XSAVE__ TF_PLAT_STR_(__XSAVE__) +#else +#define TF_PLAT_STR___XSAVE__ +#endif +#if defined(_SOFT_DOUBLE) +#define TF_PLAT_STR__SOFT_DOUBLE TF_PLAT_STR_(_SOFT_DOUBLE) +#else +#define TF_PLAT_STR__SOFT_DOUBLE +#endif +#if defined(_SOFT_FLOAT) +#define TF_PLAT_STR__SOFT_FLOAT TF_PLAT_STR_(_SOFT_FLOAT) +#else +#define TF_PLAT_STR__SOFT_FLOAT +#endif +#if defined(__ALTIVEC__) +#define TF_PLAT_STR___ALTIVEC__ TF_PLAT_STR_(__ALTIVEC__) +#else +#define TF_PLAT_STR___ALTIVEC__ +#endif +#if defined(__APPLE_ALTIVEC__) +#define TF_PLAT_STR___APPLE_ALTIVEC__ TF_PLAT_STR_(__APPLE_ALTIVEC__) +#else +#define TF_PLAT_STR___APPLE_ALTIVEC__ +#endif +#if defined(__CRYPTO__) +#define TF_PLAT_STR___CRYPTO__ TF_PLAT_STR_(__CRYPTO__) +#else +#define TF_PLAT_STR___CRYPTO__ +#endif +#if defined(__FLOAT128_HARDWARE__) +#define TF_PLAT_STR___FLOAT128_HARDWARE__ TF_PLAT_STR_(__FLOAT128_HARDWARE__) +#else +#define TF_PLAT_STR___FLOAT128_HARDWARE__ +#endif +#if defined(__FLOAT128_TYPE__) +#define TF_PLAT_STR___FLOAT128_TYPE__ TF_PLAT_STR_(__FLOAT128_TYPE__) +#else +#define TF_PLAT_STR___FLOAT128_TYPE__ +#endif +#if defined(__FP_FAST_FMA) +#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA) +#else +#define TF_PLAT_STR___FP_FAST_FMA +#endif +#if defined(__FP_FAST_FMAF) +#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF) +#else +#define TF_PLAT_STR___FP_FAST_FMAF +#endif +#if defined(__HTM__) +#define TF_PLAT_STR___HTM__ TF_PLAT_STR_(__HTM__) +#else +#define TF_PLAT_STR___HTM__ +#endif +#if defined(__NO_FPRS__) +#define TF_PLAT_STR___NO_FPRS__ TF_PLAT_STR_(__NO_FPRS__) +#else +#define TF_PLAT_STR___NO_FPRS__ +#endif +#if defined(__NO_LWSYNC__) +#define TF_PLAT_STR___NO_LWSYNC__ TF_PLAT_STR_(__NO_LWSYNC__) +#else +#define TF_PLAT_STR___NO_LWSYNC__ +#endif +#if defined(__POWER8_VECTOR__) +#define TF_PLAT_STR___POWER8_VECTOR__ TF_PLAT_STR_(__POWER8_VECTOR__) +#else +#define TF_PLAT_STR___POWER8_VECTOR__ +#endif +#if defined(__POWER9_VECTOR__) +#define TF_PLAT_STR___POWER9_VECTOR__ TF_PLAT_STR_(__POWER9_VECTOR__) +#else +#define TF_PLAT_STR___POWER9_VECTOR__ +#endif +#if defined(__PPC405__) +#define TF_PLAT_STR___PPC405__ TF_PLAT_STR_(__PPC405__) +#else +#define TF_PLAT_STR___PPC405__ +#endif +#if 
defined(__QUAD_MEMORY_ATOMIC__) +#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ TF_PLAT_STR_(__QUAD_MEMORY_ATOMIC__) +#else +#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ +#endif +#if defined(__RECIPF__) +#define TF_PLAT_STR___RECIPF__ TF_PLAT_STR_(__RECIPF__) +#else +#define TF_PLAT_STR___RECIPF__ +#endif +#if defined(__RECIP_PRECISION__) +#define TF_PLAT_STR___RECIP_PRECISION__ TF_PLAT_STR_(__RECIP_PRECISION__) +#else +#define TF_PLAT_STR___RECIP_PRECISION__ +#endif +#if defined(__RECIP__) +#define TF_PLAT_STR___RECIP__ TF_PLAT_STR_(__RECIP__) +#else +#define TF_PLAT_STR___RECIP__ +#endif +#if defined(__RSQRTEF__) +#define TF_PLAT_STR___RSQRTEF__ TF_PLAT_STR_(__RSQRTEF__) +#else +#define TF_PLAT_STR___RSQRTEF__ +#endif +#if defined(__RSQRTE__) +#define TF_PLAT_STR___RSQRTE__ TF_PLAT_STR_(__RSQRTE__) +#else +#define TF_PLAT_STR___RSQRTE__ +#endif +#if defined(__TM_FENCE__) +#define TF_PLAT_STR___TM_FENCE__ TF_PLAT_STR_(__TM_FENCE__) +#else +#define TF_PLAT_STR___TM_FENCE__ +#endif +#if defined(__UPPER_REGS_DF__) +#define TF_PLAT_STR___UPPER_REGS_DF__ TF_PLAT_STR_(__UPPER_REGS_DF__) +#else +#define TF_PLAT_STR___UPPER_REGS_DF__ +#endif +#if defined(__UPPER_REGS_SF__) +#define TF_PLAT_STR___UPPER_REGS_SF__ TF_PLAT_STR_(__UPPER_REGS_SF__) +#else +#define TF_PLAT_STR___UPPER_REGS_SF__ +#endif +#if defined(__VEC__) +#define TF_PLAT_STR___VEC__ TF_PLAT_STR_(__VEC__) +#else +#define TF_PLAT_STR___VEC__ +#endif +#if defined(__VSX__) +#define TF_PLAT_STR___VSX__ TF_PLAT_STR_(__VSX__) +#else +#define TF_PLAT_STR___VSX__ +#endif +#if defined(__ARM_ARCH) +#define TF_PLAT_STR___ARM_ARCH TF_PLAT_STR_(__ARM_ARCH) +#else +#define TF_PLAT_STR___ARM_ARCH +#endif +#if defined(__ARM_FEATURE_CLZ) +#define TF_PLAT_STR___ARM_FEATURE_CLZ TF_PLAT_STR_(__ARM_FEATURE_CLZ) +#else +#define TF_PLAT_STR___ARM_FEATURE_CLZ +#endif +#if defined(__ARM_FEATURE_CRC32) +#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32) +#else +#define TF_PLAT_STR___ARM_FEATURE_CRC32 +#endif +#if defined(__ARM_FEATURE_CRC32) +#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32) +#else +#define TF_PLAT_STR___ARM_FEATURE_CRC32 +#endif +#if defined(__ARM_FEATURE_CRYPTO) +#define TF_PLAT_STR___ARM_FEATURE_CRYPTO TF_PLAT_STR_(__ARM_FEATURE_CRYPTO) +#else +#define TF_PLAT_STR___ARM_FEATURE_CRYPTO +#endif +#if defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING \ + TF_PLAT_STR_(__ARM_FEATURE_DIRECTED_ROUNDING) +#else +#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING +#endif +#if defined(__ARM_FEATURE_DSP) +#define TF_PLAT_STR___ARM_FEATURE_DSP TF_PLAT_STR_(__ARM_FEATURE_DSP) +#else +#define TF_PLAT_STR___ARM_FEATURE_DSP +#endif +#if defined(__ARM_FEATURE_FMA) +#define TF_PLAT_STR___ARM_FEATURE_FMA TF_PLAT_STR_(__ARM_FEATURE_FMA) +#else +#define TF_PLAT_STR___ARM_FEATURE_FMA +#endif +#if defined(__ARM_FEATURE_IDIV) +#define TF_PLAT_STR___ARM_FEATURE_IDIV TF_PLAT_STR_(__ARM_FEATURE_IDIV) +#else +#define TF_PLAT_STR___ARM_FEATURE_IDIV +#endif +#if defined(__ARM_FEATURE_LDREX) +#define TF_PLAT_STR___ARM_FEATURE_LDREX TF_PLAT_STR_(__ARM_FEATURE_LDREX) +#else +#define TF_PLAT_STR___ARM_FEATURE_LDREX +#endif +#if defined(__ARM_FEATURE_NUMERIC_MAXMIN) +#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN \ + TF_PLAT_STR_(__ARM_FEATURE_NUMERIC_MAXMIN) +#else +#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN +#endif +#if defined(__ARM_FEATURE_QBIT) +#define TF_PLAT_STR___ARM_FEATURE_QBIT TF_PLAT_STR_(__ARM_FEATURE_QBIT) +#else +#define TF_PLAT_STR___ARM_FEATURE_QBIT +#endif +#if 
defined(__ARM_FEATURE_QRDMX) +#define TF_PLAT_STR___ARM_FEATURE_QRDMX TF_PLAT_STR_(__ARM_FEATURE_QRDMX) +#else +#define TF_PLAT_STR___ARM_FEATURE_QRDMX +#endif +#if defined(__ARM_FEATURE_SAT) +#define TF_PLAT_STR___ARM_FEATURE_SAT TF_PLAT_STR_(__ARM_FEATURE_SAT) +#else +#define TF_PLAT_STR___ARM_FEATURE_SAT +#endif +#if defined(__ARM_FEATURE_SIMD32) +#define TF_PLAT_STR___ARM_FEATURE_SIMD32 TF_PLAT_STR_(__ARM_FEATURE_SIMD32) +#else +#define TF_PLAT_STR___ARM_FEATURE_SIMD32 +#endif +#if defined(__ARM_FEATURE_UNALIGNED) +#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED \ + TF_PLAT_STR_(__ARM_FEATURE_UNALIGNED) +#else +#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED +#endif +#if defined(__ARM_FP) +#define TF_PLAT_STR___ARM_FP TF_PLAT_STR_(__ARM_FP) +#else +#define TF_PLAT_STR___ARM_FP +#endif +#if defined(__ARM_NEON_FP) +#define TF_PLAT_STR___ARM_NEON_FP TF_PLAT_STR_(__ARM_NEON_FP) +#else +#define TF_PLAT_STR___ARM_NEON_FP +#endif +#if defined(__ARM_NEON__) +#define TF_PLAT_STR___ARM_NEON__ TF_PLAT_STR_(__ARM_NEON__) +#else +#define TF_PLAT_STR___ARM_NEON__ +#endif +#if defined(__ARM_WMMX) +#define TF_PLAT_STR___ARM_WMMX TF_PLAT_STR_(__ARM_WMMX) +#else +#define TF_PLAT_STR___ARM_WMMX +#endif +#if defined(__IWMMXT2__) +#define TF_PLAT_STR___IWMMXT2__ TF_PLAT_STR_(__IWMMXT2__) +#else +#define TF_PLAT_STR___IWMMXT2__ +#endif +#if defined(__IWMMXT__) +#define TF_PLAT_STR___IWMMXT__ TF_PLAT_STR_(__IWMMXT__) +#else +#define TF_PLAT_STR___IWMMXT__ +#endif +#if defined(__VFP_FP__) +#define TF_PLAT_STR___VFP_FP__ TF_PLAT_STR_(__VFP_FP__) +#else +#define TF_PLAT_STR___VFP_FP__ +#endif +#if defined(TARGET_IPHONE_SIMULATOR) +#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR \ + TF_PLAT_STR_(TARGET_IPHONE_SIMULATOR) +#else +#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR +#endif +#if defined(TARGET_OS_IOS) +#define TF_PLAT_STR_TARGET_OS_IOS TF_PLAT_STR_(TARGET_OS_IOS) +#else +#define TF_PLAT_STR_TARGET_OS_IOS +#endif +#if defined(TARGET_OS_IPHONE) +#define TF_PLAT_STR_TARGET_OS_IPHONE TF_PLAT_STR_(TARGET_OS_IPHONE) +#else +#define TF_PLAT_STR_TARGET_OS_IPHONE +#endif +#if defined(_MSC_VER) +#define TF_PLAT_STR__MSC_VER TF_PLAT_STR_(_MSC_VER) +#else +#define TF_PLAT_STR__MSC_VER +#endif +#if defined(_M_ARM) +#define TF_PLAT_STR__M_ARM TF_PLAT_STR_(_M_ARM) +#else +#define TF_PLAT_STR__M_ARM +#endif +#if defined(_M_ARM64) +#define TF_PLAT_STR__M_ARM64 TF_PLAT_STR_(_M_ARM64) +#else +#define TF_PLAT_STR__M_ARM64 +#endif +#if defined(_M_ARM_ARMV7VE) +#define TF_PLAT_STR__M_ARM_ARMV7VE TF_PLAT_STR_(_M_ARM_ARMV7VE) +#else +#define TF_PLAT_STR__M_ARM_ARMV7VE +#endif +#if defined(_M_ARM_FP) +#define TF_PLAT_STR__M_ARM_FP TF_PLAT_STR_(_M_ARM_FP) +#else +#define TF_PLAT_STR__M_ARM_FP +#endif +#if defined(_M_IX86) +#define TF_PLAT_STR__M_IX86 TF_PLAT_STR_(_M_IX86) +#else +#define TF_PLAT_STR__M_IX86 +#endif +#if defined(_M_X64) +#define TF_PLAT_STR__M_X64 TF_PLAT_STR_(_M_X64) +#else +#define TF_PLAT_STR__M_X64 +#endif +#if defined(_WIN32) +#define TF_PLAT_STR__WIN32 TF_PLAT_STR_(_WIN32) +#else +#define TF_PLAT_STR__WIN32 +#endif +#if defined(_WIN64) +#define TF_PLAT_STR__WIN64 TF_PLAT_STR_(_WIN64) +#else +#define TF_PLAT_STR__WIN64 +#endif +#if defined(__ANDROID__) +#define TF_PLAT_STR___ANDROID__ TF_PLAT_STR_(__ANDROID__) +#else +#define TF_PLAT_STR___ANDROID__ +#endif +#if defined(__APPLE__) +#define TF_PLAT_STR___APPLE__ TF_PLAT_STR_(__APPLE__) +#else +#define TF_PLAT_STR___APPLE__ +#endif +#if defined(__BYTE_ORDER__) +#define TF_PLAT_STR___BYTE_ORDER__ TF_PLAT_STR_(__BYTE_ORDER__) +#else +#define TF_PLAT_STR___BYTE_ORDER__ +#endif 
+#if defined(__CYGWIN__) +#define TF_PLAT_STR___CYGWIN__ TF_PLAT_STR_(__CYGWIN__) +#else +#define TF_PLAT_STR___CYGWIN__ +#endif +#if defined(__FreeBSD__) +#define TF_PLAT_STR___FreeBSD__ TF_PLAT_STR_(__FreeBSD__) +#else +#define TF_PLAT_STR___FreeBSD__ +#endif +#if defined(__LITTLE_ENDIAN__) +#define TF_PLAT_STR___LITTLE_ENDIAN__ TF_PLAT_STR_(__LITTLE_ENDIAN__) +#else +#define TF_PLAT_STR___LITTLE_ENDIAN__ +#endif +#if defined(__NetBSD__) +#define TF_PLAT_STR___NetBSD__ TF_PLAT_STR_(__NetBSD__) +#else +#define TF_PLAT_STR___NetBSD__ +#endif +#if defined(__OpenBSD__) +#define TF_PLAT_STR___OpenBSD__ TF_PLAT_STR_(__OpenBSD__) +#else +#define TF_PLAT_STR___OpenBSD__ +#endif +#if defined(____MSYS__) +#define TF_PLAT_STR_____MSYS__ TF_PLAT_STR_(____MSYS__) +#else +#define TF_PLAT_STR_____MSYS__ +#endif +#if defined(__aarch64__) +#define TF_PLAT_STR___aarch64__ TF_PLAT_STR_(__aarch64__) +#else +#define TF_PLAT_STR___aarch64__ +#endif +#if defined(__alpha__) +#define TF_PLAT_STR___alpha__ TF_PLAT_STR_(__alpha__) +#else +#define TF_PLAT_STR___alpha__ +#endif +#if defined(__arm__) +#define TF_PLAT_STR___arm__ TF_PLAT_STR_(__arm__) +#else +#define TF_PLAT_STR___arm__ +#endif +#if defined(__i386__) +#define TF_PLAT_STR___i386__ TF_PLAT_STR_(__i386__) +#else +#define TF_PLAT_STR___i386__ +#endif +#if defined(__i686__) +#define TF_PLAT_STR___i686__ TF_PLAT_STR_(__i686__) +#else +#define TF_PLAT_STR___i686__ +#endif +#if defined(__ia64__) +#define TF_PLAT_STR___ia64__ TF_PLAT_STR_(__ia64__) +#else +#define TF_PLAT_STR___ia64__ +#endif +#if defined(__linux__) +#define TF_PLAT_STR___linux__ TF_PLAT_STR_(__linux__) +#else +#define TF_PLAT_STR___linux__ +#endif +#if defined(__mips32__) +#define TF_PLAT_STR___mips32__ TF_PLAT_STR_(__mips32__) +#else +#define TF_PLAT_STR___mips32__ +#endif +#if defined(__mips64__) +#define TF_PLAT_STR___mips64__ TF_PLAT_STR_(__mips64__) +#else +#define TF_PLAT_STR___mips64__ +#endif +#if defined(__powerpc64__) +#define TF_PLAT_STR___powerpc64__ TF_PLAT_STR_(__powerpc64__) +#else +#define TF_PLAT_STR___powerpc64__ +#endif +#if defined(__powerpc__) +#define TF_PLAT_STR___powerpc__ TF_PLAT_STR_(__powerpc__) +#else +#define TF_PLAT_STR___powerpc__ +#endif +#if defined(__riscv___) +#define TF_PLAT_STR___riscv___ TF_PLAT_STR_(__riscv___) +#else +#define TF_PLAT_STR___riscv___ +#endif +#if defined(__s390x__) +#define TF_PLAT_STR___s390x__ TF_PLAT_STR_(__s390x__) +#else +#define TF_PLAT_STR___s390x__ +#endif +#if defined(__sparc64__) +#define TF_PLAT_STR___sparc64__ TF_PLAT_STR_(__sparc64__) +#else +#define TF_PLAT_STR___sparc64__ +#endif +#if defined(__sparc__) +#define TF_PLAT_STR___sparc__ TF_PLAT_STR_(__sparc__) +#else +#define TF_PLAT_STR___sparc__ +#endif +#if defined(__x86_64__) +#define TF_PLAT_STR___x86_64__ TF_PLAT_STR_(__x86_64__) +#else +#define TF_PLAT_STR___x86_64__ +#endif + +#endif // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/png.h b/third_party/tflite-hdrs/tensorflow/core/platform/png.h new file mode 100644 index 00000000..fc1a3421 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/png.h @@ -0,0 +1,30 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PNG_H_ +#define TENSORFLOW_CORE_PLATFORM_PNG_H_ + +#include "tensorflow/core/platform/platform.h" + +#if defined(PLATFORM_GOOGLE) && !defined(IS_MOBILE_PLATFORM) +#include "png.h" // from @png // IWYU pragma: export +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) || defined(IS_MOBILE_PLATFORM) +#include <png.h> // IWYU pragma: export +#else +#error Define the appropriate PLATFORM_ macro for this platform +#endif + +#endif // TENSORFLOW_CORE_PLATFORM_PNG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/prefetch.h b/third_party/tflite-hdrs/tensorflow/core/platform/prefetch.h new file mode 100644 index 00000000..019493f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/prefetch.h @@ -0,0 +1,32 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PREFETCH_H_ +#define TENSORFLOW_CORE_PLATFORM_PREFETCH_H_ + +#include "tsl/platform/prefetch.h" + +namespace tensorflow { +namespace port { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::port::prefetch; +using ::tsl::port::PREFETCH_HINT_NTA; +using ::tsl::port::PREFETCH_HINT_T0; +using ::tsl::port::PrefetchHint; +// NOLINTEND(misc-unused-using-decls) +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PREFETCH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h new file mode 100644 index 00000000..610f507c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h @@ -0,0 +1,40 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_ +#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_ + +#include + +#include "xla/tsl/platform/profile_utils/android_armv7a_cpu_utils_helper.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h" +#include "tensorflow/core/platform/types.h" + +#if defined(__ANDROID__) && (__ANDROID_API__ >= 21) && \ + (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) + +struct perf_event_attr; + +namespace tensorflow { +namespace profile_utils { +using tsl::profile_utils::AndroidArmV7ACpuUtilsHelper; // NOLINT +} // namespace profile_utils +} // namespace tensorflow + +#endif // defined(__ANDROID__) && (__ANDROID_API__ >= 21) && + // (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) + +#endif // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h new file mode 100644 index 00000000..da58a612 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h @@ -0,0 +1,30 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_ +#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_ + +#include + +#include "xla/tsl/platform/profile_utils/clock_cycle_profiler.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/profile_utils/cpu_utils.h" + +namespace tensorflow { +using tsl::ClockCycleProfiler; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/cpu_utils.h b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/cpu_utils.h new file mode 100644 index 00000000..fde59166 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +// This class is designed to get accurate profile for programs. + +#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ + +#include +#include + +#include "xla/tsl/platform/profile_utils/cpu_utils.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace profile_utils { +using tsl::profile_utils::CpuUtils; // NOLINT +} // namespace profile_utils + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h new file mode 100644 index 00000000..f9357c6c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_ +#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_ + +#include "xla/tsl/platform/profile_utils/i_cpu_utils_helper.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace profile_utils { +using tsl::profile_utils::ICpuUtilsHelper; // NOLINT +} // namespace profile_utils +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/protobuf.h b/third_party/tflite-hdrs/tensorflow/core/platform/protobuf.h new file mode 100644 index 00000000..d7dda8b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/protobuf.h @@ -0,0 +1,39 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_ +#define TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/protobuf.h" + +namespace tensorflow { +namespace protobuf = tsl::protobuf; // NOLINT(misc-unused-alias-decls) +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::kProtobufInt64Typename; +using tsl::kProtobufUint64Typename; +using tsl::ParseFromTString; +using tsl::ParseProtoUnlimited; +using tsl::protobuf_int64; +using tsl::protobuf_uint64; +using tsl::ProtobufStringToString; +using tsl::SerializeToTString; +using tsl::SetProtobufStringSwapAllowed; +using tsl::TStringOutputStream; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/protobuf_internal.h b/third_party/tflite-hdrs/tensorflow/core/platform/protobuf_internal.h new file mode 100644 index 00000000..b766b42b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/protobuf_internal.h @@ -0,0 +1,45 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_ +#define TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_ + +#include "google/protobuf/any.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Utility for parsing an Any value with full or lite protos. +template <typename T> +absl::Status ParseAny(const google::protobuf::Any& any, T* message, + const string& type_name) { + CHECK_EQ(type_name, message->GetTypeName()); + if (!any.Is<T>()) { + return errors::FailedPrecondition( + "Expected Any type_url for: ", message->GetTypeName(), + ". Got: ", string(any.type_url().data(), any.type_url().size()), "."); + } + if (!any.UnpackTo(message)) { + return errors::FailedPrecondition("Failed to unpack: ", any.DebugString()); + } + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/ram_file_system.h b/third_party/tflite-hdrs/tensorflow/core/platform/ram_file_system.h new file mode 100644 index 00000000..2043737b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/ram_file_system.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RAM_FILE_SYSTEM_H_ +#define TENSORFLOW_CORE_PLATFORM_RAM_FILE_SYSTEM_H_ + +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/ram_file_system.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::RamFileSystem; +using tsl::RamRandomAccessFile; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RAM_FILE_SYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/random.h b/third_party/tflite-hdrs/tensorflow/core/platform/random.h new file mode 100644 index 00000000..ceb54e4a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/random.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RANDOM_H_ +#define TENSORFLOW_CORE_PLATFORM_RANDOM_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/random.h" + +namespace tensorflow { +namespace random { +using tsl::random::New64; // NOLINT +using tsl::random::New64DefaultSeed; // NOLINT +} // namespace random +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RANDOM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/raw_coding.h b/third_party/tflite-hdrs/tensorflow/core/platform/raw_coding.h new file mode 100644 index 00000000..9b3c31d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/raw_coding.h @@ -0,0 +1,33 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RAW_CODING_H_ +#define TENSORFLOW_CORE_PLATFORM_RAW_CODING_H_ + +#include + +#include "tsl/platform/raw_coding.h" + +namespace tensorflow { +namespace core { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::core::DecodeFixed16; +using ::tsl::core::DecodeFixed32; +using ::tsl::core::DecodeFixed64; +// NOLINTEND(misc-unused-using-decls) +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RAW_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/refcount.h b/third_party/tflite-hdrs/tensorflow/core/platform/refcount.h new file mode 100644 index 00000000..9d8b21b7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/refcount.h @@ -0,0 +1,36 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_REFCOUNT_H_ +#define TENSORFLOW_CORE_PLATFORM_REFCOUNT_H_ + +#include "tensorflow/core/platform/mutex.h" +#include "tsl/platform/refcount.h" + +namespace tensorflow { +namespace core { +// NOLINTBEGIN(misc-unused-using-decls) +using ::tsl::core::RefCountDeleter; +using ::tsl::core::RefCounted; +using ::tsl::core::RefCountPtr; +using ::tsl::core::ScopedUnref; +using ::tsl::core::WeakNotifyFn; +using ::tsl::core::WeakPtr; +using ::tsl::core::WeakRefCounted; +// NOLINTEND(misc-unused-using-decls) +} // namespace core +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_REFCOUNT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/regexp.h b/third_party/tflite-hdrs/tensorflow/core/platform/regexp.h new file mode 100644 index 00000000..0c2025ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/regexp.h @@ -0,0 +1,20 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_REGEXP_H_ +#define TENSORFLOW_CORE_PLATFORM_REGEXP_H_ +#include "tsl/platform/regexp.h" + +#endif // TENSORFLOW_CORE_PLATFORM_REGEXP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/resource.h b/third_party/tflite-hdrs/tensorflow/core/platform/resource.h new file mode 100644 index 00000000..1088b388 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/resource.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RESOURCE_H_ +#define TENSORFLOW_CORE_PLATFORM_RESOURCE_H_ + +#include + +#include "tsl/platform/resource.h" + +namespace tensorflow { + +using ::tsl::ResourceTagger; // NOLINT(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RESOURCE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/resource_loader.h b/third_party/tflite-hdrs/tensorflow/core/platform/resource_loader.h new file mode 100644 index 00000000..e4d6d56e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/resource_loader.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Small helper library to access "data" dependencies defined in BUILD files. +// Requires the relative paths starting from tensorflow/... +// For example, to get this file, a user would call: +// GetDataDependencyFilepath("tensorflow/core/platform/resource_loadder.h") + +#ifndef TENSORFLOW_CORE_PLATFORM_RESOURCE_LOADER_H_ +#define TENSORFLOW_CORE_PLATFORM_RESOURCE_LOADER_H_ + +#include "tsl/platform/resource_loader.h" + +namespace tensorflow { + +using tsl::GetDataDependencyFilepath; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RESOURCE_LOADER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/retrying_file_system.h b/third_party/tflite-hdrs/tensorflow/core/platform/retrying_file_system.h new file mode 100644 index 00000000..c8eb328c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/retrying_file_system.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_ +#define TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/retrying_utils.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/retrying_file_system.h" + +namespace tensorflow { + +using tsl::RetryingFileSystem; // NOLINT(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/retrying_utils.h b/third_party/tflite-hdrs/tensorflow/core/platform/retrying_utils.h new file mode 100644 index 00000000..a42d02ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/retrying_utils.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_RETRYING_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_RETRYING_UTILS_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/retrying_utils.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::RetryConfig; +using tsl::RetryingUtils; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_RETRYING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/rocm.h b/third_party/tflite-hdrs/tensorflow/core/platform/rocm.h new file mode 100644 index 00000000..8fc0fa9d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/rocm.h @@ -0,0 +1,21 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ROCM_H_ +#define TENSORFLOW_CORE_PLATFORM_ROCM_H_ + +#include "tensorflow/core/platform/platform.h" // IWYU pragma: keep + +#endif // TENSORFLOW_CORE_PLATFORM_ROCM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/rocm_rocdl_path.h b/third_party/tflite-hdrs/tensorflow/core/platform/rocm_rocdl_path.h new file mode 100644 index 00000000..dc656131 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/rocm_rocdl_path.h @@ -0,0 +1,27 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ +#define TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/rocm_rocdl_path.h" + +namespace tensorflow { +using tsl::RocdlRoot; // NOLINT +using tsl::RocmRoot; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/scanner.h b/third_party/tflite-hdrs/tensorflow/core/platform/scanner.h new file mode 100644 index 00000000..edea0a65 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/scanner.h @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_SCANNER_H_ +#define TENSORFLOW_CORE_PLATFORM_SCANNER_H_ + +#include "tsl/platform/scanner.h" + +namespace tensorflow { +namespace strings { + +using ::tsl::strings::Scanner; // NOLINT(misc-unused-using-decls) + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_SCANNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/setround.h b/third_party/tflite-hdrs/tensorflow/core/platform/setround.h new file mode 100644 index 00000000..efd1f03e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/setround.h @@ -0,0 +1,29 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_SETROUND_H_ +#define TENSORFLOW_CORE_PLATFORM_SETROUND_H_ + +#include "tensorflow/core/platform/macros.h" +#include "tsl/platform/setround.h" + +namespace tensorflow { +namespace port { +using tsl::port::ScopedSetRound; // NOLINT + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_SETROUND_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/snappy.h b/third_party/tflite-hdrs/tensorflow/core/platform/snappy.h new file mode 100644 index 00000000..53fa5de6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/snappy.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_SNAPPY_H_ +#define TENSORFLOW_CORE_PLATFORM_SNAPPY_H_ + +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/snappy.h" + +#if !defined(PLATFORM_WINDOWS) +#include +#else +namespace tensorflow { +using tsl::iovec; +} // namespace tensorflow +#endif + +namespace tensorflow { +namespace port { +using tsl::port::Snappy_Compress; +using tsl::port::Snappy_CompressFromIOVec; +using tsl::port::Snappy_GetUncompressedLength; +using tsl::port::Snappy_Uncompress; +using tsl::port::Snappy_UncompressToIOVec; +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_SNAPPY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stack_frame.h b/third_party/tflite-hdrs/tensorflow/core/platform/stack_frame.h new file mode 100644 index 00000000..cd5c3ff1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stack_frame.h @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_ +#define TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_ + +#include "tsl/platform/stack_frame.h" + +namespace tensorflow { +typedef tsl::StackFrame StackFrame; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace.h b/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace.h new file mode 100644 index 00000000..b8aaf464 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace.h @@ -0,0 +1,30 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STACKTRACE_H_ +#define TENSORFLOW_CORE_PLATFORM_STACKTRACE_H_ + +#include "tensorflow/core/platform/platform.h" // IWYU pragma: export +#include "tsl/platform/stacktrace.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::CurrentStackTrace; +using tsl::DebugWriteToString; +using tsl::SavedStackTrace; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STACKTRACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace_handler.h b/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace_handler.h new file mode 100644 index 00000000..8a81a6a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stacktrace_handler.h @@ -0,0 +1,33 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_ +#define TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_ + +#include "tsl/platform/stacktrace_handler.h" + +namespace tensorflow { +namespace testing { + +// Installs signal handlers to print out stack trace. +// Although GoogleTest has support for generating stacktraces with abseil via +// https://github.com/google/googletest/pull/1653, this doesn't cover our use +// case of getting C++ stacktraces in our python tests. 
+using tsl::testing::InstallStacktraceHandler; + +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/status.h b/third_party/tflite-hdrs/tensorflow/core/platform/status.h new file mode 100644 index 00000000..99f66009 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/status.h @@ -0,0 +1,65 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STATUS_H_ +#define TENSORFLOW_CORE_PLATFORM_STATUS_H_ + +#include "absl/base/macros.h" +#include "absl/status/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stack_frame.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/status.h" + +#if !defined(ABSL_DEPRECATE_AND_INLINE) +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +#ifdef SWIG +using tsl::FromAbslStatus; +using tsl::OkStatus; +using tsl::Status; +using tsl::ToAbslStatus; +#else +ABSL_DEPRECATE_AND_INLINE() +inline ::absl::Status FromAbslStatus(const ::absl::Status& s) { return s; } +ABSL_DEPRECATE_AND_INLINE() +inline ::absl::Status ToAbslStatus(const ::absl::Status& s) { return s; } +ABSL_DEPRECATE_AND_INLINE() +inline ::absl::Status OkStatus() { return ::absl::OkStatus(); }; +using Status ABSL_DEPRECATE_AND_INLINE() = ::absl::Status; +#endif +using tsl::StatusCallback; +using tsl::StatusGroup; +using tsl::TfCheckOpHelper; +using tsl::TfCheckOpHelperOutOfLine; + +namespace errors { +#ifdef SWIG +using tsl::errors::Code; +#else +using Code ABSL_DEPRECATE_AND_INLINE() = ::absl::StatusCode; +#endif +using tsl::errors::GetStackTrace; +using tsl::errors::SetStackTrace; +} // namespace errors +// NOLINTEND(misc-unused-using-decls) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STATUS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/status_matchers.h b/third_party/tflite-hdrs/tensorflow/core/platform/status_matchers.h new file mode 100644 index 00000000..6fd5791f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/status_matchers.h @@ -0,0 +1,46 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PLATFORM_STATUS_MATCHERS_H_ +#define TENSORFLOW_CORE_PLATFORM_STATUS_MATCHERS_H_ + + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/platform/status_matchers.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) + +namespace testing { +namespace internal_status { +using tsl::testing::internal_status::GetStatus; +using tsl::testing::internal_status::IsOkAndHoldsMatcher; +using tsl::testing::internal_status::IsOkAndHoldsMatcherImpl; +using tsl::testing::internal_status::IsOkMatcher; +using tsl::testing::internal_status::MonoIsOkMatcherImpl; +using tsl::testing::internal_status::MonoStatusIsMatcherImpl; +using tsl::testing::internal_status::StatusIsMatcher; +using tsl::testing::internal_status::StatusIsMatcherCommonImpl; +} // namespace internal_status +using tsl::testing::IsOk; +using tsl::testing::IsOkAndHolds; +using tsl::testing::StatusIs; +// NOLINTEND(misc-unused-using-decls) +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STATUS_MATCHERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/statusor.h b/third_party/tflite-hdrs/tensorflow/core/platform/statusor.h new file mode 100644 index 00000000..1a5f77e8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/statusor.h @@ -0,0 +1,26 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STATUSOR_H_ +#define TENSORFLOW_CORE_PLATFORM_STATUSOR_H_ + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/statusor.h" +namespace tensorflow { +using tsl::StatusOr; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STATUSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/str_util.h b/third_party/tflite-hdrs/tensorflow/core/platform/str_util.h new file mode 100644 index 00000000..fbea09af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/str_util.h @@ -0,0 +1,61 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STR_UTIL_H_ +#define TENSORFLOW_CORE_PLATFORM_STR_UTIL_H_ + +#include +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/str_util.h" + +// Basic string utility routines +namespace tensorflow { +namespace str_util { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::str_util::AllowEmpty; +using tsl::str_util::ArgDefCase; +using tsl::str_util::CEscape; +using tsl::str_util::ConsumeLeadingDigits; +using tsl::str_util::ConsumeNonWhitespace; +using tsl::str_util::ConsumePrefix; +using tsl::str_util::ConsumeSuffix; +using tsl::str_util::CUnescape; +using tsl::str_util::EndsWith; +using tsl::str_util::Join; +using tsl::str_util::Lowercase; +using tsl::str_util::RemoveLeadingWhitespace; +using tsl::str_util::RemoveTrailingWhitespace; +using tsl::str_util::RemoveWhitespaceContext; +using tsl::str_util::SkipEmpty; +using tsl::str_util::SkipWhitespace; +using tsl::str_util::Split; +using tsl::str_util::StartsWith; +using tsl::str_util::StrContains; +using tsl::str_util::StringReplace; +using tsl::str_util::StripPrefix; +using tsl::str_util::StripSuffix; +using tsl::str_util::StripTrailingWhitespace; +using tsl::str_util::Strnlen; +using tsl::str_util::TitlecaseString; +using tsl::str_util::Uppercase; +// NOLINTEND(misc-unused-using-decls) +} // namespace str_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/strcat.h b/third_party/tflite-hdrs/tensorflow/core/platform/strcat.h new file mode 100644 index 00000000..9a11dd2d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/strcat.h @@ -0,0 +1,54 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
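The str_util.h shim above forwards to the tsl::str_util helpers. A short sketch of how a caller might combine them, assuming the upstream signatures (Split with a char delimiter plus a skip predicate, RemoveWhitespaceContext taking a string_view pointer); SplitCsvLine() is illustrative:

#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/str_util.h"

// Trims surrounding whitespace, then splits on commas, dropping empty fields.
std::vector<std::string> SplitCsvLine(absl::string_view line) {
  absl::string_view rest = line;
  tensorflow::str_util::RemoveWhitespaceContext(&rest);
  return tensorflow::str_util::Split(rest, ',',
                                     tensorflow::str_util::SkipEmpty());
}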
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STRCAT_H_ +#define TENSORFLOW_CORE_PLATFORM_STRCAT_H_ + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/numbers.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/strcat.h" + +namespace tensorflow { +namespace strings { + +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::strings::AlphaNum; +using tsl::strings::Hex; +using tsl::strings::kZeroPad10; +using tsl::strings::kZeroPad11; +using tsl::strings::kZeroPad12; +using tsl::strings::kZeroPad13; +using tsl::strings::kZeroPad14; +using tsl::strings::kZeroPad15; +using tsl::strings::kZeroPad16; +using tsl::strings::kZeroPad2; +using tsl::strings::kZeroPad3; +using tsl::strings::kZeroPad4; +using tsl::strings::kZeroPad5; +using tsl::strings::kZeroPad6; +using tsl::strings::kZeroPad7; +using tsl::strings::kZeroPad8; +using tsl::strings::kZeroPad9; +using tsl::strings::PadSpec; +using tsl::strings::StrAppend; +using tsl::strings::StrCat; +// NOLINTEND(misc-unused-using-decls) + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STRCAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor.h b/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor.h new file mode 100644 index 00000000..58acf8eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor.h @@ -0,0 +1,34 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_H_ +#define TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_H_ + +#include "xla/stream_executor/cuda/cuda_platform_id.h" +#include "xla/stream_executor/device_memory.h" +#include "xla/stream_executor/dnn.h" +#include "xla/stream_executor/event.h" +#include "xla/stream_executor/host/host_platform_id.h" +#include "xla/stream_executor/platform.h" +#include "xla/stream_executor/platform_manager.h" +#include "xla/stream_executor/rocm/rocm_platform_id.h" +#include "xla/stream_executor/scratch_allocator.h" +#include "xla/stream_executor/stream.h" +#include "xla/stream_executor/stream_executor.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/dso_loader.h" + +#endif // TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor_no_cuda.h b/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor_no_cuda.h new file mode 100644 index 00000000..e6013d76 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stream_executor_no_cuda.h @@ -0,0 +1,33 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
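The strcat.h shim above re-exports tsl::strings::StrCat/StrAppend and the Hex/PadSpec helpers. A minimal sketch; DescribeDevice() and its inputs are illustrative only:

#include <string>

#include "tensorflow/core/platform/strcat.h"

// Builds "device:<id> @0x<16-digit hex address>" using the re-exported helpers.
std::string DescribeDevice(int id, unsigned long long addr) {
  std::string out = tensorflow::strings::StrCat("device:", id);
  tensorflow::strings::StrAppend(
      &out, " @0x",
      tensorflow::strings::Hex(addr, tensorflow::strings::kZeroPad16));
  return out;
}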
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_NO_CUDA_H_ +#define TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_NO_CUDA_H_ + +#include "xla/stream_executor/cuda/cuda_platform_id.h" +#include "xla/stream_executor/device_memory.h" +#include "xla/stream_executor/dnn.h" +#include "xla/stream_executor/event.h" +#include "xla/stream_executor/host/host_platform_id.h" +#include "xla/stream_executor/platform.h" +#include "xla/stream_executor/platform_manager.h" +#include "xla/stream_executor/rocm/rocm_platform_id.h" +#include "xla/stream_executor/scratch_allocator.h" +#include "xla/stream_executor/stream.h" +#include "xla/stream_executor/stream_executor.h" +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/dso_loader.h" + +#endif // TENSORFLOW_CORE_PLATFORM_STREAM_EXECUTOR_NO_CUDA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stringpiece.h b/third_party/tflite-hdrs/tensorflow/core/platform/stringpiece.h new file mode 100644 index 00000000..43f3d4a9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stringpiece.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// StringPiece is a simple structure containing a pointer into some external +// storage and a size. The user of a StringPiece must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a StringPiece without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same StringPiece must use +// external synchronization. + +#ifndef TENSORFLOW_CORE_PLATFORM_STRINGPIECE_H_ +#define TENSORFLOW_CORE_PLATFORM_STRINGPIECE_H_ + +#include "absl/base/macros.h" +#include "tsl/platform/stringpiece.h" // IWYU pragma: export + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { + +using StringPiece ABSL_DEPRECATE_AND_INLINE() = absl::string_view; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STRINGPIECE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/stringprintf.h b/third_party/tflite-hdrs/tensorflow/core/platform/stringprintf.h new file mode 100644 index 00000000..27d30089 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/stringprintf.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Printf variants that place their output in a C++ string. +// +// Usage: +// string result = strings::Printf("%d %s\n", 10, "hello"); +// strings::Appendf(&result, "%d %s\n", 20, "there"); + +#ifndef TENSORFLOW_CORE_PLATFORM_STRINGPRINTF_H_ +#define TENSORFLOW_CORE_PLATFORM_STRINGPRINTF_H_ + +#include + +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/stringprintf.h" + +namespace tensorflow { +namespace strings { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::strings::Appendf; +using tsl::strings::Appendv; +using tsl::strings::Printf; +// NOLINTEND(misc-unused-using-decls) +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STRINGPRINTF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/strong_hash.h b/third_party/tflite-hdrs/tensorflow/core/platform/strong_hash.h new file mode 100644 index 00000000..c442103c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/strong_hash.h @@ -0,0 +1,45 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_STRONG_HASH_H_ +#define TENSORFLOW_CORE_PLATFORM_STRONG_HASH_H_ + +#include "highwayhash/sip_hash.h" // from @highwayhash +#include "highwayhash/state_helpers.h" // from @highwayhash +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// This is a strong keyed hash function interface for strings. +// The hash function is deterministic on the content of the string within the +// process. The key of the hash is an array of 2 uint64 elements. 
+// A strong hash makes it difficult, if not infeasible, to compute inputs that +// hash to the same bucket. +// +// Usage: +// uint64 key[2] = {123, 456}; +// string input = "input string"; +// uint64 hash_value = StrongKeyedHash(key, input); +// +inline uint64 StrongKeyedHash(const tensorflow::uint64 (&key)[2], + const string& s) { + return highwayhash::StringHasher()( + {key[0], key[1]}, s); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_STRONG_HASH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/subprocess.h b/third_party/tflite-hdrs/tensorflow/core/platform/subprocess.h new file mode 100644 index 00000000..0406f529 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/subprocess.h @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_ +#define TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_ + +#include "xla/tsl/platform/subprocess.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::ACTION_CLOSE; +using tsl::ACTION_DUPPARENT; +using tsl::ACTION_PIPE; +using tsl::CHAN_STDERR; +using tsl::CHAN_STDIN; +using tsl::CHAN_STDOUT; +using tsl::Channel; +using tsl::ChannelAction; +using tsl::CreateSubProcess; +using tsl::SubProcess; +} // namespace tensorflow + +#include "tensorflow/core/platform/platform.h" + + +#endif // TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/tensor_coding.h b/third_party/tflite-hdrs/tensorflow/core/platform/tensor_coding.h new file mode 100644 index 00000000..b024e143 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/tensor_coding.h @@ -0,0 +1,137 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper routines for encoding/decoding tensor contents. +#ifndef TENSORFLOW_CORE_PLATFORM_TENSOR_CODING_H_ +#define TENSORFLOW_CORE_PLATFORM_TENSOR_CODING_H_ + +#include + +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace port { + +// Store src contents in *out. 
If backing memory for src is shared with *out,
+// will ref obj during the call and will arrange to unref obj when no
+// longer needed.
+void AssignRefCounted(absl::string_view src, core::RefCounted* obj,
+                      std::string* out);
+
+// Copy contents of src to dst[0,src.size()-1].
+inline void CopyToArray(const std::string& src, char* dst) {
+  memcpy(dst, src.data(), src.size());
+}
+
+// Copy subrange [pos:(pos + n)) from src to dst. If pos >= src.size() the
+// result is empty. If pos + n > src.size() the subrange [pos, size()) is
+// copied.
+inline void CopySubrangeToArray(const std::string& src, size_t pos, size_t n,
+                                char* dst) {
+  if (pos >= src.size()) return;
+  memcpy(dst, src.data() + pos, std::min(n, src.size() - pos));
+}
+
+// Store encoding of strings[0..n-1] in *out.
+void EncodeStringList(const tstring* strings, int64_t n, std::string* out);
+
+// Decode n strings from src and store in strings[0..n-1].
+// Returns true if successful, false on parse error.
+bool DecodeStringList(const std::string& src, tstring* strings, int64_t n);
+
+// Assigns base[0..bytes-1] to *s
+void CopyFromArray(std::string* s, const char* base, size_t bytes);
+
+// Encodes sequences of strings and serialized protocol buffers into a string.
+// Normal usage consists of zero or more calls to Append() and a single call to
+// Finalize().
+class StringListEncoder {
+ public:
+  virtual ~StringListEncoder() = default;
+
+  // Encodes the given protocol buffer. This may not be called after Finalize().
+  virtual void Append(const protobuf::MessageLite& m) = 0;
+
+  // Encodes the given string. This may not be called after Finalize().
+  virtual void Append(const std::string& s) = 0;
+
+  // Signals end of the encoding process. No other calls are allowed after this.
+  virtual void Finalize() = 0;
+};
+
+// Decodes a string into sequences of strings (which may represent serialized
+// protocol buffers). Normal usage involves a single call to ReadSizes() in
+// order to retrieve the length of all the strings in the sequence. For each
+// size returned a call to Data() is expected and will return the actual
+// string.
+class StringListDecoder {
+ public:
+  virtual ~StringListDecoder() = default;
+
+  // Populates the given vector with the lengths of each string in the sequence
+  // being decoded. Upon returning the vector is guaranteed to contain as many
+  // elements as there are strings in the sequence.
+  virtual bool ReadSizes(std::vector<uint32>* sizes) = 0;
+
+  // Returns a pointer to the next string in the sequence, then prepares for the
+  // next call by advancing 'size' characters in the sequence.
+  virtual const char* Data(uint32 size) = 0;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// Store src contents in *out. If backing memory for src is shared with *out,
+// will ref obj during the call and will arrange to unref obj when no
+// longer needed.
+void AssignRefCounted(absl::string_view src, core::RefCounted* obj,
+                      absl::Cord* out);
+
+// TODO(kmensah): Macro guard this with a check for Cord support.
+inline void CopyToArray(const absl::Cord& src, char* dst) {
+  src.CopyToArray(dst);
+}
+
+// Copy n bytes of src to dst. If pos >= src.size() the result is empty.
+// If pos + n > src.size() the subrange [pos, size()) is copied.
+inline void CopySubrangeToArray(const absl::Cord& src, int64_t pos, int64_t n, + char* dst) { + src.Subcord(pos, n).CopyToArray(dst); +} + +// Store encoding of strings[0..n-1] in *out. +void EncodeStringList(const tstring* strings, int64_t n, absl::Cord* out); + +// Decode n strings from src and store in strings[0..n-1]. +// Returns true if successful, false on parse error. +bool DecodeStringList(const absl::Cord& src, std::string* strings, int64_t n); +bool DecodeStringList(const absl::Cord& src, tstring* strings, int64_t n); + +// Assigns base[0..bytes-1] to *c +void CopyFromArray(absl::Cord* c, const char* base, size_t bytes); + +std::unique_ptr NewStringListEncoder(absl::Cord* out); +std::unique_ptr NewStringListDecoder(const absl::Cord& in); +#endif // defined(TENSORFLOW_PROTOBUF_USES_CORD) + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TENSOR_CODING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/tensor_float_32_utils.h b/third_party/tflite-hdrs/tensorflow/core/platform/tensor_float_32_utils.h new file mode 100644 index 00000000..efcb9941 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/tensor_float_32_utils.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TENSOR_FLOAT_32_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_TENSOR_FLOAT_32_UTILS_H_ + +#include "tsl/platform/tensor_float_32_utils.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::enable_tensor_float_32_execution; +using tsl::tensor_float_32_execution_enabled; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TENSOR_FLOAT_32_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/test.h b/third_party/tflite-hdrs/tensorflow/core/platform/test.h new file mode 100644 index 00000000..d57a08f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/test.h @@ -0,0 +1,36 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
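A minimal round-trip sketch for the EncodeStringList/DecodeStringList helpers declared in tensor_coding.h above; the two sample strings are illustrative and error handling is reduced to the decoder's bool result:

#include <string>

#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/tstring.h"

// Encodes two tstrings into a flat std::string, then decodes them back.
bool RoundTrip() {
  tensorflow::tstring in[2] = {"alpha", "beta"};
  std::string encoded;
  tensorflow::port::EncodeStringList(in, /*n=*/2, &encoded);

  tensorflow::tstring out[2];
  return tensorflow::port::DecodeStringList(encoded, out, /*n=*/2);
}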
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TEST_H_ +#define TENSORFLOW_CORE_PLATFORM_TEST_H_ + +#include // IWYU pragma: export +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/test.h" + +namespace tensorflow { + +namespace testing { +using tsl::testing::PickUnusedPortOrDie; +using tsl::testing::RandomSeed; +using tsl::testing::TensorFlowSrcRoot; +using tsl::testing::TmpDir; + +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TEST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/test_benchmark.h b/third_party/tflite-hdrs/tensorflow/core/platform/test_benchmark.h new file mode 100644 index 00000000..ed964a89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/test_benchmark.h @@ -0,0 +1,30 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Simple benchmarking facility. +#ifndef TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_ +#define TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_ + +#include "tsl/platform/test_benchmark.h" + +namespace tensorflow { +namespace testing { +using tsl::testing::DoNotOptimize; // NOLINT +using tsl::testing::InitializeBenchmarks; // NOLINT +using tsl::testing::RunBenchmarks; // NOLINT +} // namespace testing +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/thread_annotations.h b/third_party/tflite-hdrs/tensorflow/core/platform/thread_annotations.h new file mode 100644 index 00000000..4178265a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/thread_annotations.h @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file contains the macro definitions for thread safety +// annotations that allow the developers to document the locking policies +// of their multi-threaded code. The annotations can also help program +// analysis tools to identify potential thread safety issues. 
+// +// The primary documentation on these annotations is external: +// http://clang.llvm.org/docs/ThreadSafetyAnalysis.html +// +// The annotations are implemented using compiler attributes. +// Using the macros defined here instead of the raw attributes allows +// for portability and future compatibility. +// +// When referring to mutexes in the arguments of the attributes, you should +// use variable names or more complex expressions (e.g. my_object->mutex_) +// that evaluate to a concrete mutex object whenever possible. If the mutex +// you want to refer to is not in scope, you may use a member pointer +// (e.g. &MyClass::mutex_) to refer to a mutex in some (unknown) object. +// + +#ifndef TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_ +#define TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_ + +// IWYU pragma: private, include "third_party/tensorflow/core/platform/thread_annotations.h" +// IWYU pragma: friend third_party/tensorflow/core/platform/thread_annotations.h + +#include "tsl/platform/thread_annotations.h" // IWYU pragma: export + +#endif // TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/threadpool.h b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool.h new file mode 100644 index 00000000..02129fd4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool.h @@ -0,0 +1,37 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_THREADPOOL_H_ +#define TENSORFLOW_CORE_PLATFORM_THREADPOOL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace thread { +using tsl::thread::EigenEnvironment; // NOLINT +using tsl::thread::ThreadPool; // NOLINT + +} // namespace thread +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_THREADPOOL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_interface.h b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_interface.h new file mode 100644 index 00000000..7e07e560 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_interface.h @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
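The threadpool.h shim above re-exports tsl::thread::ThreadPool. A small sketch of the usual Schedule pattern, assuming the upstream constructor (Env*, name, num_threads) and that the destructor joins outstanding work; ParallelFill() is illustrative:

#include <cstddef>
#include <vector>

#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/threadpool.h"

// Fans independent per-index work out over the re-exported ThreadPool.
void ParallelFill(std::vector<int>& values) {
  tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
                                      "example_pool", /*num_threads=*/4);
  for (size_t i = 0; i < values.size(); ++i) {
    pool.Schedule([&values, i]() { values[i] = static_cast<int>(i * i); });
  }
  // The pool joins its workers when it goes out of scope, so all scheduled
  // tasks have run by the time this function returns.
}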
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_THREADPOOL_INTERFACE_H_ +#define TENSORFLOW_CORE_PLATFORM_THREADPOOL_INTERFACE_H_ + +#include "tsl/platform/threadpool_interface.h" + +namespace tensorflow { +namespace thread { + +using ThreadPoolInterface = tsl::thread::ThreadPoolInterface; + +} // namespace thread +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_THREADPOOL_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_options.h b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_options.h new file mode 100644 index 00000000..c6237fa8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/threadpool_options.h @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_THREADPOOL_OPTIONS_H_ +#define TENSORFLOW_CORE_PLATFORM_THREADPOOL_OPTIONS_H_ + +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tsl/platform/threadpool_options.h" + +namespace tensorflow { +namespace thread { + +using tsl::thread::ThreadPoolOptions; // NOLINT + +} // namespace thread +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_THREADPOOL_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/tracing.h b/third_party/tflite-hdrs/tensorflow/core/platform/tracing.h new file mode 100644 index 00000000..24917a6d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/tracing.h @@ -0,0 +1,53 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TRACING_H_ +#define TENSORFLOW_CORE_PLATFORM_TRACING_H_ + +// Tracing interface + +#include + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/platform/tracing.h" + +namespace tensorflow { +namespace tracing { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::tracing::EventCategory; +using tsl::tracing::EventCollector; +using tsl::tracing::GetArgForName; +using tsl::tracing::GetEventCategoryName; +using tsl::tracing::GetEventCollector; +using tsl::tracing::GetLogDir; +using tsl::tracing::GetNumEventCategories; +using tsl::tracing::GetUniqueArg; +using tsl::tracing::RecordEvent; +using tsl::tracing::ScopedRegion; +using tsl::tracing::SetEventCollector; +// NOLINTEND(misc-unused-using-decls) +} // namespace tracing +} // namespace tensorflow + +#if defined(PLATFORM_GOOGLE) +#include "xla/tsl/platform/google/tracing_impl.h" +#else +#include "xla/tsl/platform/default/tracing_impl.h" +#endif + +#endif // TENSORFLOW_CORE_PLATFORM_TRACING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/tstring.h b/third_party/tflite-hdrs/tensorflow/core/platform/tstring.h new file mode 100644 index 00000000..7795811d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/tstring.h @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TSTRING_H_ +#define TENSORFLOW_CORE_PLATFORM_TSTRING_H_ + +#include "tensorflow/core/platform/cord.h" +#include "tensorflow/core/platform/ctstring.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tsl/platform/tstring.h" + +namespace tensorflow { + +using tstring = tsl::tstring; +} + +#endif // TENSORFLOW_CORE_PLATFORM_TSTRING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/types.h b/third_party/tflite-hdrs/tensorflow/core/platform/types.h new file mode 100644 index 00000000..a3159bfe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/types.h @@ -0,0 +1,63 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TYPES_H_ +#define TENSORFLOW_CORE_PLATFORM_TYPES_H_ + +#include "tensorflow/core/platform/bfloat16.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/tstring.h" +#include "tsl/platform/types.h" + +namespace tensorflow { + +// Alias tensorflow::string to std::string. +using tsl::string; + +using tsl::uint16; +using tsl::uint32; +using tsl::uint4; +using tsl::uint64; +using tsl::uint8; + +using tsl::int16; +using tsl::int32; +using tsl::int4; +using tsl::int64; +using tsl::int8; + +using tsl::float8_e4m3fn; +using tsl::float8_e5m2; + +static const uint8 kuint8max = tsl::kuint8max; +static const uint16 kuint16max = tsl::kuint16max; +static const uint32 kuint32max = tsl::kuint32max; +static const uint64 kuint64max = tsl::kuint64max; +static const int8_t kint8min = tsl::kint8min; +static const int8_t kint8max = tsl::kint8max; +static const int16_t kint16min = tsl::kint16min; +static const int16_t kint16max = tsl::kint16max; +static const int32_t kint32min = tsl::kint32min; +static const int32_t kint32max = tsl::kint32max; +static const int64_t kint64min = tsl::kint64min; +static const int64_t kint64max = tsl::kint64max; + +// A typedef for a uint64 used as a short fingerprint. +using tsl::bfloat16; +using tsl::Fprint; +using tsl::tstring; // NOLINT: suppress 'using decl 'tstring' is unused' +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/platform/unbounded_work_queue.h b/third_party/tflite-hdrs/tensorflow/core/platform/unbounded_work_queue.h new file mode 100644 index 00000000..cd6cdf97 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/platform/unbounded_work_queue.h @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ + +#include "tensorflow/core/platform/platform.h" +#include "tsl/platform/unbounded_work_queue.h" + +// An `UnboundedWorkQueue` feeds potentially-blocking work into a thread-pool +// whose size automatically increases with demand. + +namespace tensorflow { +using tsl::UnboundedWorkQueue; // NOLINT(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/compute_inference_latency.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/compute_inference_latency.h new file mode 100644 index 00000000..91632c90 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/compute_inference_latency.h @@ -0,0 +1,33 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_COMPUTE_INFERENCE_LATENCY_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_COMPUTE_INFERENCE_LATENCY_H_ + +#include +#include + +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/overview_page.pb.h" + +namespace tensorflow::profiler { + +// Compute the inference latency from inference stats proto. +OverviewInferenceLatency ComputeInferenceLatencyResult( + const InferenceStats& inference_stats); + +} // namespace tensorflow::profiler + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_COMPUTE_INFERENCE_LATENCY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_analysis.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_analysis.h new file mode 100644 index 00000000..cdff8177 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_analysis.h @@ -0,0 +1,225 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_DCN_ANALYSIS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_DCN_ANALYSIS_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/convert/dcn_utils.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// Structure representing a DcnMessage using two entries: +// One for the start of the message and one for the end. +struct TimestampEvent { + uint64_t timestamp_ns; // TraceMe logging timestamp + uint64_t duration_ns; // 0 for start of message, duration for end of message + int32_t message_diff; // +1/-1 for start/end of message. + // Makes handling 0-sized messages easier and is + // convenient for the burst generation algorithm. + size_t size_diff; // +size/-size for start/end of message. + int32_t src_slice_id; // Source slice for message, used for stragglers +}; + +// We use an multi map since TimestampEvents will be ordered and we +// need separate entries for possible events happening at exactly the +// same time. 
+typedef std::multimap<uint64_t, std::shared_ptr<TimestampEvent>> TimestampMap;
+typedef absl::flat_hash_map<std::string, TimestampMap> CollectiveTimestampMap;
+
+// Straggler messages. These are shown at the end of the bursts they belong to.
+struct Straggler {
+  uint64_t duration_ns;       // Message duration in ns
+  uint64_t end_timestamp_ns;  // End of the message. For the last straggler
+                              // this will be the end of the burst
+  size_t size_bytes;          // Size of the message in bytes
+  int32_t src_slice_id;       // Source slice of the message
+  // TODO(emizan) Add host info.
+};
+
+static constexpr uint32_t kMaxStragglersPerBurst = 4;
+
+// DCN Burst description.
+// A burst is defined as a period of time during which there is at least one
+// message in the network. Since DCN traffic is bursty this structure is
+// convenient to summarize 100K+ messages in a few 10s of bursts.
+// Burst scope is flexible. In this analysis we have per-host bursts, which
+// include messages arriving on a single host independent of sender/target TPU
+// and collective. We also have per collective/TPU bursts which include messages
+// for a single collective+TPU combination.
+struct DcnBurst {
+  uint64_t start_timestamp_ns;        // Beginning of burst in ns
+  uint64_t end_timestamp_ns;          // End of burst in ns
+  uint64_t burst_size_bytes;          // Total number of bytes in burst
+  uint64_t num_messages;              // Messages in burst
+  uint64_t max_overlapping_messages;  // Max overlapping messages in burst
+  // Buffer of stragglers in a burst. Contains the last few messages in a burst
+  std::array<Straggler, kMaxStragglersPerBurst> stragglers;
+};
+
+// Class with functionality to generate DcnBursts out of TimestampEvents.
+// Burst creation is a non-trivial state machine
+class DcnBurstManager {
+ public:
+  DcnBurstManager() = default;
+  uint64_t TotalLatency() const { return total_latency_; }
+  void SetToDisplay(bool to_display) { to_display_ = to_display; }
+  bool ToDisplay() const { return to_display_; }
+  const std::vector<DcnBurst> &GetBursts() const { return bursts_; }
+
+  // Run burst state machine creation out of timestamp map.
+  void CreateBursts(const TimestampMap &tm_events);
+  // For debugging purposes.
+  void PrintBursts() {
+    for (const auto &burst : bursts_) {
+      LOG(INFO) << burst.start_timestamp_ns << " " << burst.end_timestamp_ns
+                << " " << burst.num_messages << " " << burst.burst_size_bytes
+                << " " << burst.max_overlapping_messages;
+    }
+  }
+
+ private:
+  std::vector<DcnBurst> bursts_;  // Bursts created by this manager
+  uint64_t total_latency_ = 0;    // Total latency of all bursts created
+  // Used to see if bursts will be displayed
+  bool to_display_ = false;  // Set to true to enable burst display
+
+  int32_t active_burst_messages_;  // Used by burst creation state machine.
+  DcnBurst active_burst_;          // Active burst in creation
+  uint32_t straggler_idx_;
+
+  // Initializes state machine when new burst is detected.
+  void ResetBurstState();
+};
+
+typedef absl::flat_hash_map<std::string, DcnBurstManager>
+    CollectiveBurstManager;
+
+class DcnEventsProcessor {
+ public:
+  DcnEventsProcessor() = delete;
+  DcnEventsProcessor(uint32_t num_tpu_tensor_cores, bool is_megacore);
+
+  uint32_t NumTpuTensorCores() const { return num_tpu_tensor_cores_; }
+  bool IsMegacore() const { return is_megacore_; }
+
+  // Populates available megascale messages from event metadata.
+  void SetupMessageInfo(const tensorflow::profiler::XPlaneVisitor &plane);
+
+  std::optional<int32_t> MegaScaleMessageId(absl::string_view msg_name) const {
+    auto iter = megascale_msg_.find(msg_name);
+    if (iter != megascale_msg_.end()) {
+      return iter->second;
+    }
+    return std::nullopt;
+  }
+
+  uint32_t NumReceivedMessages() const { return received_messages_.size(); }
+  const tensorflow::profiler::DcnMessage &GetMessage(uint32_t i) const {
+    return received_messages_[i];
+  }
+
+  // Checks if messages with msg event name have been found in event metadata.
+  bool HasDcnMessages(absl::string_view msg_name) const {
+    return (megascale_msg_.find(msg_name) != megascale_msg_.end());
+  }
+
+  const TimestampMap &HostTsMap() const { return host_ts_map_; }
+  const std::vector<DcnBurst> &GetHostBursts() const {
+    return host_dcn_bursts_.GetBursts();
+  }
+
+  // Main function to process receive messages, and call other functions
+  // to generate timestamp events and bursts.
+  void ProcessReceiveMessages(const tensorflow::profiler::XPlaneVisitor &plane);
+
+  // Update XPlanes using DCN traffic info
+  void AddHostDcnTrafficToXPlane(tensorflow::profiler::XPlane *host_xplane);
+  void AddTpuCollectiveDcnTrafficToXPlane(
+      tensorflow::profiler::XPlane *device_xplane);
+
+ private:
+  // Tensor cores and megacore flag for this host. DCN messages are sent to a
+  // TPU chip, so we need to know the number of tensor cores and whether
+  // megacore is used to map DCN traffic to the proper tensor core.
+  const uint32_t num_tpu_tensor_cores_;
+  const bool is_megacore_;
+
+  // Used for visualization of BW and computation of BW utilization.
+  static constexpr float kLimitLowHostDcnBw = 4.17;
+  static constexpr float kLimitMedHostDcnBw = 8.34;
+  static constexpr float kMaxHostDcnBw = 12.5;
+
+  std::vector<absl::string_view> registered_dcn_messages_;
+
+  // Available megascale messages for this trace.
+  absl::flat_hash_map<absl::string_view, int32_t> megascale_msg_;
+
+  std::vector<tensorflow::profiler::DcnMessage> received_messages_;
+
+  // TimestampMaps for messages that arrive to this host
+  // and for messages of distinct collectives going to different TPUs.
+  TimestampMap host_ts_map_;
+  std::vector<CollectiveTimestampMap> tpu_collective_ts_map_;
+
+  // DcnBurstManagers for bursts that arrive to this host
+  // and for bursts from distinct collectives going to different TPUs.
+  DcnBurstManager host_dcn_bursts_;
+  std::vector<CollectiveBurstManager> tpu_collective_bursts_;
+
+  // Find the TPU index a DCN message goes to.
+  uint32_t FindTpuIdx(int tpu);
+
+  // Generates BW info to display in the trace viewer.
+  // This includes trace event BW level string, mean BW per burst and
+  // utilization.
+  absl::string_view GetBwInfo(bool is_per_tpu, const DcnBurst &burst,
+                              float &burst_mean_bw,
+                              float &burst_bw_utilization);
+
+  // Qualify collectives to display on trace viewer.
+  // Qualified collectives are given a dedicated line, while for the rest
+  // we share a single line for their stragglers.
+  uint32_t NumCollectivesQualified(const std::vector<uint64_t> &latencies);
+  void QualifyCollectives();
+  // Export collective DCN activity to trace viewer.
+  void AddQualifiedCollectivesToXPlane(
+      tensorflow::profiler::XPlaneBuilder &plane_builder, uint32_t tpu_idx);
+  void AddUnqualifiedCollectivesToXPlane(
+      tensorflow::profiler::XPlaneBuilder &plane_builder, uint32_t tpu_idx);
+
+  // Create timestamp events for every message
+  void GenerateTimestampEvents(
+      const tensorflow::profiler::DcnMessage &dcn_message);
+  // For debugging purposes
+  void PrintTimestampEvents();
+  // Generate bursts (host and TPU/collective) from timestamp events.
+ void GenerateBursts(); +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_DCN_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.h new file mode 100644 index 00000000..f0fc727a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_DCN_SLACK_ANALYSIS_COMBINER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_DCN_SLACK_ANALYSIS_COMBINER_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h" + +namespace tensorflow { +namespace profiler { + +using tensorflow::profiler::DcnSlackAnalysis; +using tensorflow::profiler::DcnSlackSummary; + +class DcnSlackAnalysisCombiner { + private: + absl::flat_hash_map slack_summary_; + + public: + // Combine the DCN Slack Summary in the DcnSlackAnalysis. + // The DcnSlackAnalysis consists of average durations, The combine phase, the + // summary consists of the total duration for all the occurrences. Finazile + // must be called to get the accurate value. + void Combine(const DcnSlackAnalysis& slack_analysis); + + // Finalize the DcnSlackSummary by converting total durations to averages. + DcnSlackAnalysis Finalize(); +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_DCN_SLACK_ANALYSIS_COMBINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_utils.h new file mode 100644 index 00000000..e0dd3a17 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/dcn_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
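A conceptual sketch only, not the DcnBurstManager implementation: it shows how burst boundaries fall out of the +1/-1 message_diff entries described for TimestampEvent above. The real CreateBursts() additionally tracks byte counts, overlap, and stragglers:

#include <cstdint>
#include <map>
#include <vector>

// A burst is open while at least one message is in flight.
struct Interval {
  uint64_t start_ns = 0;
  uint64_t end_ns = 0;
};

std::vector<Interval> FindBursts(
    const std::multimap<uint64_t, int32_t>& events) {  // timestamp -> +1/-1
  std::vector<Interval> bursts;
  int32_t active = 0;  // messages currently in flight
  for (const auto& [timestamp_ns, message_diff] : events) {
    if (active == 0 && message_diff > 0) {
      bursts.push_back({timestamp_ns, 0});  // first message opens a burst
    }
    active += message_diff;                 // +1 on start, -1 on end
    if (active == 0 && !bursts.empty()) {
      bursts.back().end_ns = timestamp_ns;  // last message closes the burst
    }
  }
  return bursts;
}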
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_DCN_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_DCN_UTILS_H_ + +#include + +#include "xla/tsl/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// DCN Message Validity +enum DcnMessageValidity { + // Valid message + DCN_MESSAGE_VALID = 1, + // Valid message, but should not go through DCN, so it should not use BW. + DCN_MESSAGE_VALID_LOOPBACK = 2, + // Invalid message with 0 duration due to clock skew. Should be ignored. + DCN_MESSAGE_INVALID_CLOCK_SKEW = 3, + // Message that cannot be decoded. Should be ignored. + DCN_MESSAGE_INVALID_BAD_KEY = 4 +}; + +// Structure representing a DCN event +struct DcnMessage { + // Unique collective that generated this message, format should be + // _, e.g. all_gather_34 + std::string collective_name = ""; + // Src info + // TODO(emizan) Add host info when you figure out how to get it from + // slice+tpu. + int32_t slice_src = -1; + int32_t tpu_src = -1; + // Dst info + int32_t slice_dst = -1; + int32_t tpu_dst = -1; + // Timing info in ns. Since MSXLA TraceMe's have us timestamps, we need to + // multiply by 1000 to get these timestamps. + uint64_t start_timestamp_ns = 0; + uint64_t end_timestamp_ns = 0; + uint64_t duration_us = 0; + // Size info + size_t size_bytes = 0; + // Chunk and Loop index + int32_t chunk_id = -1; + int32_t loop_index_id = -1; + // Is message valid/invalid and why + DcnMessageValidity validity_info = DCN_MESSAGE_INVALID_BAD_KEY; + // TBD: Add flow events in case you need to connect to other events pointed to + // by MSXLA TraceMe's +}; + +DcnMessage GetDcnMessageFromXEvent( + const tsl::profiler::XEventVisitor& event_visitor); + +// Check if the XEventVisitor is a DCN Message +bool IsDcnEvent(const tsl::profiler::XEventVisitor& event); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_DCN_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_graph_view.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_graph_view.h new file mode 100644 index 00000000..b3a3a7c4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_graph_view.h @@ -0,0 +1,101 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_GRAPH_VIEW_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_GRAPH_VIEW_H_ + +#include +#include +#include + +#include "xla/service/hlo.pb.h" +#include "xla/service/hlo_graph_dumper.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/tool_options.h" + +namespace tensorflow { +namespace profiler { + +// All the parameters for graph viewer. +struct GraphViewerParams { + // Whether to use GraphView or TxtView. 
+  std::string type;
+  // Parameters for GraphView.
+  std::string node_name;
+  int graph_width;
+  xla::HloRenderOptions render_options;
+  xla::RenderedGraphFormat format;
+  // Parameters for TxtView.
+  bool verbose;
+  bool show_metadata;
+};
+
+// Return mapping from style key word to op names separated by comma.
+// following hlo_graph_dumper styling
+absl::StatusOr<std::string> GetNodeStyles();
+
+// Parse tool options to get the parameters for graph viewer.
+absl::StatusOr<GraphViewerParams> ParseGraphViewerParams(
+    const ToolOptions& options);
+
+// Get graph render format.
+xla::RenderedGraphFormat GetRenderFormat(const std::string& format_string);
+
+// Convert `hlo_proto` to GraphView with the provided render options.
+absl::StatusOr<std::string> ConvertHloProtoToGraph(
+    const xla::HloProto& hlo_proto, const std::string& node_name,
+    int graph_width, const xla::HloRenderOptions& render_options,
+    const xla::RenderedGraphFormat& format);
+
+// Convert `hlo_proto` to ModelExplorer Graph JSON data.
+absl::StatusOr<std::string> ConvertHloProtoToMeGraph(
+    const xla::HloProto& hlo_proto, const std::string& node_name,
+    int graph_width);
+
+// Render graph with the provided render options.
+absl::StatusOr<std::string> RenderGraphView(
+    const xla::HloComputation& computation, absl::string_view label,
+    const xla::DebugOptions& debug_options, xla::RenderedGraphFormat format,
+    xla::HloRenderOptions hlo_render_options = {});
+
+// Render graph with centered node and depth
+absl::StatusOr<std::string> RenderGraphNeighborhoodAround(
+    const xla::HloInstruction& node, int radius,
+    xla::RenderedGraphFormat format,
+    xla::HloRenderOptions hlo_render_options = {},
+    const absl::flat_hash_set<const xla::HloInstruction*>& boundary = {});
+
+// Convert `hlo_proto` to StringView.
+absl::StatusOr<std::string> ConvertHloProtoToStringView(
+    const xla::HloProto& hlo_proto, bool verbose, bool metadata);
+
+// Convert dot into certain format
+absl::StatusOr<std::string> WrapDotInFormat(std::string dot,
+                                            xla::RenderedGraphFormat format);
+
+// Convert dot into visual graph in html
+std::string WrapDotInHtml(std::string dot);
+
+// Registers a function which implements RenderedGraphFormat::kUrl.
+// The input to the function is dot, and the output should be a URL or an error.
+// There can only be one active renderer, and the last call to this function
+// wins.
+void RegisterGraphvizURLRenderer(
+    std::function<absl::StatusOr<std::string>(absl::string_view dot)> renderer);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_GRAPH_VIEW_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.h
new file mode 100644
index 00000000..e7a681de
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.h
@@ -0,0 +1,44 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
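A minimal caller sketch for the graph-viewer entry points declared in hlo_proto_to_graph_view.h above; the node name, width, and kDot format are example inputs that real callers would normally obtain via ParseGraphViewerParams():

#include <string>

#include "absl/status/statusor.h"
#include "xla/service/hlo.pb.h"
#include "tensorflow/core/profiler/convert/hlo_proto_to_graph_view.h"

// Renders the neighborhood of one instruction from an HloProto as DOT text.
absl::StatusOr<std::string> RenderNode(const xla::HloProto& hlo_proto) {
  return tensorflow::profiler::ConvertHloProtoToGraph(
      hlo_proto, /*node_name=*/"fusion.1", /*graph_width=*/3,
      xla::HloRenderOptions(), xla::RenderedGraphFormat::kDot);
}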
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_MEMORY_VISUALIZATION_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_MEMORY_VISUALIZATION_UTILS_H_ + +#include + +#include "absl/status/statusor.h" +#include "xla/service/hlo.pb.h" +#include "tensorflow/core/profiler/protobuf/memory_viewer_preprocess.pb.h" + +namespace tensorflow { +namespace profiler { + +constexpr int kSmallBufferSize = 16 * 1024; + +// Convert HloProto to PreprocessResult proto for memory visualization. +// small_buffer_size sets the byte size within which we collapse buffer entries +// for the max-heap display. +// is the index of heap simulator trace to be +// displayed. By default it is -1, which means the profiler will infer the heap +// simulator trace id from . +// By default the memory color is 0, which is HBM. +absl::StatusOr ConvertHloProtoToPreprocessResult( + const xla::HloProto& hlo_proto, + int64_t small_buffer_size = kSmallBufferSize, int64_t memory_color = 0); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_HLO_PROTO_TO_MEMORY_VISUALIZATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_to_tools_data.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_to_tools_data.h new file mode 100644 index 00000000..b567c973 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/hlo_to_tools_data.h @@ -0,0 +1,41 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_HLO_TO_TOOLS_DATA_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_HLO_TO_TOOLS_DATA_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/convert/tool_options.h" + +namespace tensorflow { +namespace profiler { + +// Convert HLO proto to tool specific data. +// must provide a "module_name" field to identify which HLO proto +// is used for the conversion. +// Return the serialized string of tool specific data when the conversion is +// successful, else return an error status. +absl::StatusOr ConvertHloProtoToToolData( + const SessionSnapshot& session_snapshot, absl::string_view tool_name, + const ToolOptions& options); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_HLO_TO_TOOLS_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats.h new file mode 100644 index 00000000..2789694a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats.h @@ -0,0 +1,53 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/device_utils.h" +#include "xla/tsl/profiler/utils/group_events.h" +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" + +namespace tensorflow { +namespace profiler { + +// Generates PerHostInferenceStats from the given trace events. +// For TPU, get time breakdown from device_traces. For GPU, get time breakdown +// from nonoverlapped_step_events. +// Get batching parameters from TFstreamz xplane in . +void GenerateInferenceStats( + const std::vector& device_traces, + const tensorflow::profiler::StepEvents& nonoverlapped_step_events, + const tsl::profiler::GroupMetadataMap& group_metadata_map, + const tensorflow::profiler::XSpace& xspace, + tsl::profiler::DeviceType device_type, int32_t host_id, + tensorflow::profiler::InferenceStats* inference_stats); + +// Parses model name from TFstreamz. +// Returns whether the parsing is successful and the actual model name. If +// parsing failed, returns false and an empty string. +std::pair ParseModelName(absl::string_view param); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_combiner.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_combiner.h new file mode 100644 index 00000000..ceccc9cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_combiner.h @@ -0,0 +1,25 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_COMBINER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_COMBINER_H_ +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" + +namespace tensorflow::profiler { +void CombineInferenceStatsResult(int src_host_id, const InferenceStats& src, + InferenceStats* dst); +} // namespace tensorflow::profiler + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_COMBINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_grouping.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_grouping.h new file mode 100644 index 00000000..7d60da0f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_grouping.h @@ -0,0 +1,29 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_GROUPING_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_GROUPING_H_ + +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" + +namespace tensorflow::profiler { + +// Change inference stats from per host to per model_id by doing a regroup. +// Future analysis of inference_stats will be on a per model_id basis. +void RegroupInferenceStatsByModel( + tensorflow::profiler::InferenceStats* inference_stats); + +} // namespace tensorflow::profiler + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_GROUPING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_sampler.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_sampler.h new file mode 100644 index 00000000..2706c16a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/inference_stats_sampler.h @@ -0,0 +1,53 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_SAMPLER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_SAMPLER_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" + +namespace tensorflow::profiler { + +// Sampled inference stats of a model. +// The pointers of RequestDetail and BatchDetail point to the actual data stored +// in TfOpStats.InferenceStats. +struct SampledPerModelInferenceStats { + // Sampled requests and their percentile. + std::vector> + sampled_requests; + // Sampled batches and their percentile. + std::vector> + sampled_batches; +}; + +// All the sampled inference stats of a profile. +// TODO: Move to use SampledInferenceStatsProto if feasible. +using SampledInferenceStats = + absl::flat_hash_map; + +// Samples a subset of InferenceStats from based on sampling +// column and . +SampledInferenceStats SampleInferenceStats( + absl::string_view request_percentile_column, + absl::string_view batch_percentile_column, + const tensorflow::profiler::InferenceStats& inference_stats); + +} // namespace tensorflow::profiler + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_INFERENCE_STATS_SAMPLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xplanes_to_op_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xplanes_to_op_stats.h new file mode 100644 index 00000000..51348097 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xplanes_to_op_stats.h @@ -0,0 +1,38 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XPLANES_TO_OP_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XPLANES_TO_OP_STATS_H_ + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +// Converts and combines multiple XSpace protos into a single OpStats +// . +// Return the first error status during conversion, or return OkStatus() if +// there is no error. 
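// (In practice this is expected to be a two-phase combine: a StepIntersection
// is computed across the per-host OpStats to pick a common range of steps,
// and the per-host results are then merged into one; see op_stats_combiner.h
// later in this change for the ComputeStepIntersectionToMergeOpStats and
// CombineAllOpStats helpers this presumably builds on.)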
+absl::Status ConvertMultiXSpacesToCombinedOpStats( + const SessionSnapshot& session_snapshot, const OpStatsOptions& options, + OpStats* combined_op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XPLANES_TO_OP_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xspace_to_inference_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xspace_to_inference_stats.h new file mode 100644 index 00000000..3ea9af85 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/multi_xspace_to_inference_stats.h @@ -0,0 +1,27 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XSPACE_TO_INFERENCE_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XSPACE_TO_INFERENCE_STATS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/protobuf/inference_stats.pb.h" +namespace tensorflow::profiler { +absl::Status ConvertMultiXSpaceToInferenceStats( + const SessionSnapshot& session_snapshot, absl::string_view request_column, + absl::string_view batch_column, InferenceStats* inference_stats); +} + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_MULTI_XSPACE_TO_INFERENCE_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_db_combiner.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_db_combiner.h new file mode 100644 index 00000000..76019da8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_db_combiner.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_DB_COMBINER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_DB_COMBINER_H_ + +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" + +namespace tensorflow { +namespace profiler { + +// Copies OpMetrics metadata (e.g., category, provenance) from src to dst. 
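// Only descriptive fields are copied here; accumulating counters such as
// occurrences and time are combined separately by CombineOpMetrics below.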
+void CopyOpMetricsMetadata(const OpMetrics& src, OpMetrics* dst); + +// Combines OpMetrics data (e.g., occurrences, time) from src into dst. +// If is set to true, update the dst->num_cores to +// calculate the number of cores a certain op occurs. +void CombineOpMetrics(const OpMetrics& src, OpMetrics* dst, + bool update_num_cores); + +// Combines the memory access breakdown. +void CombineMemoryAccessedBreakdown( + const protobuf::RepeatedPtrField& src, + protobuf::RepeatedPtrField* dst); + +// Helper to combine op metrics databases. +class OpMetricsDbCombiner : public OpMetricsDbBuilder { + public: + explicit OpMetricsDbCombiner(OpMetricsDb* dst) : OpMetricsDbBuilder(dst) {} + + // Combine the OpMetrics in OpMetricsDb to current OpMetricsDbCombiner. + // If is set to true, update the OpMetrics.num_cores to + // calculate the number of cores a certain op occurs. + void Combine(const OpMetricsDb& src, bool update_num_cores = true); +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_DB_COMBINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_to_record.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_to_record.h new file mode 100644 index 00000000..37dfa14c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_metrics_to_record.h @@ -0,0 +1,343 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_TO_RECORD_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_TO_RECORD_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/device_utils.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/utils/math_utils.h" + +namespace tensorflow { +namespace profiler { + +std::vector SortedOpMetricsDb(const OpMetricsDb& metrics_db, + int max_records = -1); + +inline double GigaFlopsPerSecondPerCore(const OpMetrics& metrics) { + // flops and time_ps are accumulated across all occurrences on all cores. + // time_ps is used instead of self_time_ps because flops for an op includes + // the flops executed by children (nested) ops. + return tsl::profiler::SafeDivide( + metrics.flops(), tsl::profiler::PicoToNano(metrics.time_ps())); +} + +inline double GigaModelFlopsPerSecondPerCore(const OpMetrics& metrics) { + // flops and time_ps are accumulated across all occurrences on all cores. + // time_ps is used instead of self_time_ps because flops for an op includes + // the flops executed by children (nested) ops. 
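  // Dividing a raw FLOP count by a duration converted from picoseconds to
  // nanoseconds yields FLOPs per nanosecond, i.e. 1e9 FLOPs per second, so
  // the result is already in GigaFLOPs/s without any extra scaling factor.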
+ return tsl::profiler::SafeDivide( + metrics.model_flops(), tsl::profiler::PicoToNano(metrics.time_ps())); +} + +// Return ByteAccessed for memory_space and operation_type. +inline double BytesAccessedPerCore( + const OpMetrics& metrics, uint64_t memory_space, + OpMetrics::MemoryAccessed::OperationType operation_type) { + uint64_t bytes = 0; + if (memory_space == MemorySpace::MEMORY_SPACE_ALL) { + bytes = metrics.bytes_accessed(); + } else { + for (const auto& breakdown : metrics.memory_accessed_breakdown()) { + // Count either on-chip or off-chip bytes. + if ((breakdown.operation_type() != operation_type) && + (operation_type != OpMetrics::MemoryAccessed::UNKNOWN)) { + continue; + } + if (((memory_space == MemorySpace::MEMORY_SPACE_HBM) && + (breakdown.memory_space() == MemorySpace::MEMORY_SPACE_HBM)) || + ((memory_space == MemorySpace::MEMORY_SPACE_ON_CHIP) && + (breakdown.memory_space() != MemorySpace::MEMORY_SPACE_HBM))) { + bytes += breakdown.bytes_accessed(); + } + } + } + return bytes; +} + +inline double GigaBytesPerSecondPerCore( + const OpMetrics& metrics, uint64_t memory_space, + OpMetrics::MemoryAccessed::OperationType operation_type) { + // bytes_accessed and time_ps are accumulated across all occurrences on all + // cores. + // time_ps is used instead of self_time_ps because bytes_accessed for an op + // includes the bytes accessed by children (nested) ops. + return tsl::profiler::SafeDivide( + BytesAccessedPerCore(metrics, memory_space, operation_type), + tsl::profiler::PicoToNano(metrics.time_ps())); +} + +inline double GibiBytesPerSecondPerCore( + const OpMetrics& metrics, uint64_t memory_space, + OpMetrics::MemoryAccessed::OperationType op_type) { + return tsl::profiler::GigaToGibi( + GigaBytesPerSecondPerCore(metrics, memory_space, op_type)); +} + +template +inline void SetExecutionTimes(const OpMetrics& metrics, Record* record) { + record->set_occurrences(metrics.occurrences()); + record->set_total_time_in_us(tsl::profiler::PicoToMicro(metrics.time_ps())); + record->set_avg_time_in_us( + SafeDivide(record->total_time_in_us(), metrics.occurrences())); + record->set_total_self_time_in_us( + tsl::profiler::PicoToMicro(metrics.self_time_ps())); + record->set_avg_self_time_in_us( + SafeDivide(record->total_self_time_in_us(), metrics.occurrences())); +} + +template +inline void SetTpuUnitFractions(const OpMetrics& metrics, Record* record) { + record->set_dma_stall_fraction( + tsl::profiler::SafeDivide(metrics.dma_stall_ps(), metrics.time_ps())); +} + +template +inline void SetRankAndTimeFractions(double total_time_us, + const Record& prev_record, Record* record) { + record->set_rank(prev_record.rank() + 1); + record->set_total_self_time_as_fraction( + SafeDivide(record->total_self_time_in_us(), total_time_us)); + record->set_cumulative_total_self_time_as_fraction( + prev_record.cumulative_total_self_time_as_fraction() + + record->total_self_time_as_fraction()); +} + +template +inline void SetRankAndDeviceTimeFractions(double total_time_us, + const Record& prev_record, + Record* record) { + record->set_rank(prev_record.rank() + 1); + record->set_device_total_self_time_as_fraction( + SafeDivide(record->total_self_time_in_us(), total_time_us)); + record->set_device_cumulative_total_self_time_as_fraction( + prev_record.device_cumulative_total_self_time_as_fraction() + + record->device_total_self_time_as_fraction()); +} + +template +inline void SetRankAndHostTimeFractions(double total_time_us, + const Record& prev_record, + Record* record) { + record->set_rank(prev_record.rank() 
+ 1); + record->set_host_total_self_time_as_fraction( + SafeDivide(record->total_self_time_in_us(), total_time_us)); + record->set_host_cumulative_total_self_time_as_fraction( + prev_record.host_cumulative_total_self_time_as_fraction() + + record->host_total_self_time_as_fraction()); +} + +// Returns the memory bandwidth in GigaBytes/s in the PerfEnv. +// memory space is chosen by index following order in xplane_to_op_stats.cc +static inline double GetMemoryPeakBandwidth(const PerfEnv& perf_env, + const int index) { + if (perf_env.peak_bws_giga_bytes_per_second_size() > index) { + return perf_env.peak_bws_giga_bytes_per_second(index); + } + return perf_env.peak_hbm_bw_giga_bytes_per_second(); +} + +template +inline void SetRooflineMetrics(const OpMetrics& metrics, const PerfEnv perf_env, + const RunEnvironment& run_env, Record* record) { + using ::tensorflow::profiler::MemorySpace; + using ::tensorflow::profiler::PerformanceInfo; + using ::tensorflow::profiler::PicoToNano; + + // Set overall performance metrics. + record->set_measured_flop_rate(GigaFlopsPerSecondPerCore(metrics)); + record->set_model_flop_rate(GigaModelFlopsPerSecondPerCore(metrics)); + record->set_measured_memory_bw(GibiBytesPerSecondPerCore( + metrics, tensorflow::profiler::MemorySpace::MEMORY_SPACE_ALL, + OpMetrics::MemoryAccessed::UNKNOWN)); + record->set_flops(metrics.flops()); + record->set_bytes_accessed(metrics.bytes_accessed()); + record->set_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), metrics.bytes_accessed())); + // Set performance metrics per memory access type. + uint64_t hbm_bytes = 0; + uint64_t cmem_read_bytes = 0; + uint64_t cmem_write_bytes = 0; + uint64_t vmem_read_bytes = 0; + uint64_t vmem_write_bytes = 0; + for (const auto& memory_access : metrics.memory_accessed_breakdown()) { + if (memory_access.memory_space() == PerformanceInfo::MemoryAccessed::HBM) { + hbm_bytes += memory_access.bytes_accessed(); + } else if (memory_access.memory_space() == + PerformanceInfo::MemoryAccessed::CMEM) { + if (memory_access.operation_type() == OpMetrics::MemoryAccessed::READ) { + cmem_read_bytes += memory_access.bytes_accessed(); + } else if (memory_access.operation_type() == + OpMetrics::MemoryAccessed::WRITE) { + cmem_write_bytes += memory_access.bytes_accessed(); + } + } else if (memory_access.memory_space() == + PerformanceInfo::MemoryAccessed::VMEM) { + if (memory_access.operation_type() == OpMetrics::MemoryAccessed::READ) { + vmem_read_bytes += memory_access.bytes_accessed(); + } else if (memory_access.operation_type() == + OpMetrics::MemoryAccessed::WRITE) { + vmem_write_bytes += memory_access.bytes_accessed(); + } + } + } + if (metrics.memory_accessed_breakdown_size() == 0) { + // For legacy profiles without memory access breakdown, consider all memory + // access as HBM access. 
+ hbm_bytes = metrics.bytes_accessed(); + } + record->set_hbm_bw(tsl::profiler::GibibytesPerSecond( + hbm_bytes, tsl::profiler::PicoToNano(metrics.time_ps()))); + record->set_cmem_read_bw(tsl::profiler::GibibytesPerSecond( + cmem_read_bytes, tsl::profiler::PicoToNano(metrics.time_ps()))); + record->set_cmem_write_bw(tsl::profiler::GibibytesPerSecond( + cmem_write_bytes, tsl::profiler::PicoToNano(metrics.time_ps()))); + record->set_vmem_read_bw(tsl::profiler::GibibytesPerSecond( + vmem_read_bytes, tsl::profiler::PicoToNano(metrics.time_ps()))); + record->set_vmem_write_bw(tsl::profiler::GibibytesPerSecond( + vmem_write_bytes, tsl::profiler::PicoToNano(metrics.time_ps()))); + record->set_hbm_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), hbm_bytes)); + record->set_cmem_read_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), cmem_read_bytes)); + record->set_cmem_write_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), cmem_write_bytes)); + record->set_vmem_read_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), vmem_read_bytes)); + record->set_vmem_write_operational_intensity( + tsl::profiler::SafeDivide(metrics.flops(), vmem_write_bytes)); + // Resources considered for roofline analysis. + constexpr absl::string_view kUnknown = "Unknown"; + constexpr absl::string_view kCompute = "Compute"; + constexpr absl::string_view kHbm = "HBM"; + constexpr absl::string_view kCmemRead = "CMEM Read"; + constexpr absl::string_view kCmemWrite = "CMEM Write"; + constexpr absl::string_view kVmemRead = "VMEM Read"; + constexpr absl::string_view kVmemWrite = "VMEM Write"; + constexpr absl::string_view kShmL1 = "Shm/L1"; + // Compute the bound time assuming the peak capacity of each resource and + // choose the highest one as the bottleneck. See go/xprof-roofline-pxc for + // more details. + // NOTE: The roofline analysis result is the same for Megacore because every + // resource's capacity is doubled for Megacore so the comparison result is the + // same. 
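  // The selection below computes, for compute and for each memory resource, a
  // utilization ratio (achieved rate / peak rate) and declares the resource
  // with the highest utilization to be the bottleneck, recording its
  // operational intensity alongside it.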
+ absl::string_view bottleneck_resource = kUnknown; + double bottleneck_utilization = 0; + double bottleneck_operational_intensity = 0; + double peak_flops = + tsl::profiler::TeraToGiga(perf_env.peak_tera_flops_per_second()); + double flops_utilization = + SafeDivide(record->measured_flop_rate(), peak_flops); + if (bottleneck_utilization < flops_utilization) { + bottleneck_resource = kCompute; + bottleneck_utilization = flops_utilization; + bottleneck_operational_intensity = record->operational_intensity(); + } + double peak_hbm_bw = GetMemoryPeakBandwidth(perf_env, 0); + double hbm_bw_utilization = + SafeDivide(record->hbm_bw(), tsl::profiler::GigaToGibi(peak_hbm_bw)); + if (bottleneck_utilization < hbm_bw_utilization) { + bottleneck_resource = kHbm; + bottleneck_utilization = hbm_bw_utilization; + bottleneck_operational_intensity = record->hbm_operational_intensity(); + } + tensorflow::profiler::HardwareType hardware_type = run_env.hardware_type(); + if (hardware_type == tensorflow::profiler::HardwareType::TPU) { + if (cmem_read_bytes) { + double peak_cmem_read_bw = GetMemoryPeakBandwidth(perf_env, 3); + if (peak_cmem_read_bw) { + double cmem_read_bw_utilization = + SafeDivide(record->cmem_read_bw(), + tsl::profiler::GigaToGibi(peak_cmem_read_bw)); + if (bottleneck_utilization < cmem_read_bw_utilization) { + bottleneck_resource = kCmemRead; + bottleneck_utilization = cmem_read_bw_utilization; + bottleneck_operational_intensity = + record->cmem_read_operational_intensity(); + } + } + } + if (cmem_write_bytes) { + double peak_cmem_write_bw = GetMemoryPeakBandwidth(perf_env, 4); + if (peak_cmem_write_bw) { + double cmem_write_bw_utilization = + SafeDivide(record->cmem_write_bw(), + tsl::profiler::GigaToGibi(peak_cmem_write_bw)); + if (bottleneck_utilization < cmem_write_bw_utilization) { + bottleneck_resource = kCmemWrite; + bottleneck_utilization = cmem_write_bw_utilization; + bottleneck_operational_intensity = + record->cmem_write_operational_intensity(); + } + } + } + if (vmem_read_bytes) { + double peak_vmem_read_bw = GetMemoryPeakBandwidth(perf_env, 5); + if (peak_vmem_read_bw) { + double vmem_read_bw_utilization = + SafeDivide(record->vmem_read_bw(), + tsl::profiler::GigaToGibi(peak_vmem_read_bw)); + if (bottleneck_utilization < vmem_read_bw_utilization) { + bottleneck_resource = kVmemRead; + bottleneck_utilization = vmem_read_bw_utilization; + bottleneck_operational_intensity = + record->vmem_read_operational_intensity(); + } + } + } + if (vmem_write_bytes) { + double peak_vmem_write_bw = GetMemoryPeakBandwidth(perf_env, 6); + if (peak_vmem_write_bw) { + double vmem_write_bw_utilization = + SafeDivide(record->vmem_write_bw(), + tsl::profiler::GigaToGibi(peak_vmem_write_bw)); + if (bottleneck_utilization < vmem_write_bw_utilization) { + bottleneck_resource = kVmemWrite; + bottleneck_utilization = vmem_write_bw_utilization; + bottleneck_operational_intensity = + record->vmem_write_operational_intensity(); + } + } + } + } + if (hardware_type == tensorflow::profiler::HardwareType::GPU) { + double peak_shm_l1_bw = GetMemoryPeakBandwidth(perf_env, 2); + if (peak_shm_l1_bw) { + // Currently, we only have general read/write bandwidth in record. 
+ double shm_l1_bw_utilization = SafeDivide( + record->hbm_bw(), tsl::profiler::GigaToGibi(peak_shm_l1_bw)); + if (bottleneck_utilization < shm_l1_bw_utilization) { + bottleneck_resource = kShmL1; + bottleneck_utilization = shm_l1_bw_utilization; + bottleneck_operational_intensity = record->hbm_operational_intensity(); + } + } + } + record->set_bound_by(std::string(bottleneck_resource)); + record->set_bottleneck_operational_intensity( + bottleneck_operational_intensity); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_METRICS_TO_RECORD_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_profile_builder.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_profile_builder.h new file mode 100644 index 00000000..3d4e7abd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_profile_builder.h @@ -0,0 +1,157 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_PROFILE_BUILDER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_PROFILE_BUILDER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_profile.pb.h" + +namespace tensorflow { +namespace profiler { + +struct OpProfileOptions { + bool group_by_program = true; + bool group_by_deduplicated_name = true; + int children_per_node = 100; +}; + +// The structure of an op profile tree may looks like below: +// 1. group "by_program" +// - It starts from the root node, named as "by_program", and this node does +// not show up in op profile. +// - The children of root node is a list of hlo program node, named as the +// program/module name (eg. cluster.xx). +// - The children of a program node is hlo op category node, named as the +// category name (eg. data formatting). +// - The children of a category node is a list of op node or deduplicated +// group node: +// - For op that has duplicates, the child will be a deduplicated node, +// named like "copy.1111 and its deduplicate(s)". Its children will be all op +// nodes that are deduplicated. +// - For op that does not have duplicates, the child will be an op node +// under the op category (eg. copy.2222). +// +// Example path: "by_program" -> "main(...)" +// -> "data_formatting" -> "copy.12345 and its duplicate(s) -> "copy.12345" +// +// 2. group "by_category" +// Similarly to how the `by_program` op profile tree is constructed, +// `by_category` just removed the "program_node" layer: +// - It starts from the root node, named as "by_category", this node also does +// not show up in op profile. +// - The children of root node is a list of op category node, everything below +// is similar to above. 
+// - ... +// +// Example path: "by_category" -> "data_formatting" -> "copy.12345 and its +// duplicate(s) -> "copy.12345" +// +// How the op profile metrics are calculated: +// 1. For parent node in the nested structure like root node and program node: +// - time_ps will be accumulated from the self_time of all op nodes under it +// (might still be off a bit if the parent node has self_time, more details in +// b/333608397#comment5) +// - flops and memory access will only be accumulated from leaf op node under +// it to avoid double counting +// - unable to get occurrences of program executions now +// 2. For conceptual horizontal grouping node (eg.category, deduplicated) +// - all op_metris fields will be accumulated from leaf op node only in the +// group, to avoid double counting +class OpProfileBuilder { + public: + OpProfileBuilder(const OpProfileOptions& options, op_profile::Node* root, + const tensorflow::protobuf::Map* + program_name_map = nullptr); + + // Accumulate the op_metrics to the op_profile node tree + void AddOp(const OpMetrics& op_metrics); + + // Finalize the op_profile proto in a few steps (inter-dependent): + // 1. Reset time_ps for root node for more precise total time + // 2. Loop over the node to op_metrics map, populate corresponding op_metrics + // to the node.metrics + // 3. `SortAndPruneChildren` given query param `op_profile_limit` + // 4. `FinalizeDeduplicatedNodes` by coping the first op node data to the + // deduplicated node + void Finalize(double peak_gigaflops_per_second_per_core, + std::vector peak_mem_gibibytes_per_second_per_core, + uint64_t total_time_ps); + + private: + struct Category { + op_profile::Node* node; + absl::flat_hash_map deduplicated_nodes; + }; + + struct Program { + op_profile::Node* node; + absl::flat_hash_map categories; + }; + + std::string GenerateProgramName(uint64_t program_id) const; + + // Adds and returns a node for op_metrics. + // If op_metrics corresponds to a fusion, adds children to the node for the + // fused instructions. + // If deduplicated_node is not null, adds the node under it. + // Otherwise, if category is not null, adds the node under category. + // Otherwise, adds the node under root. + op_profile::Node* AddOpNode(const OpMetrics& op_metrics, + Category* category = nullptr, + op_profile::Node* deduplicated_node = nullptr); + + // Returns a node for op_metrics.deduplicated_name(). + // Adds a node to the tree if necessary. + op_profile::Node* LookupOrAddDeduplicatedNode(const OpMetrics& op_metrics, + Category* category); + + // Returns a node for op_metrics.category(). + // Adds a node to the tree if necessary. + // If program is not null, the category node is added under program. + // Otherwise, the category node is added under root. + Category* LookupOrAddCategoryNode(const OpMetrics& op_metrics, + Program* program); + + // Returns a node for op_metrics.hlo_module_id(). + // Adds a node to the Node tree if necessary. + Program* LookupOrAddProgramNode(const OpMetrics& op_metrics); + + OpProfileOptions options_; + op_profile::Node* root_; + + // Map to look up and aggregate OpMetrics. + absl::node_hash_map metrics_; + + // Maps to look up if a category / program / deduplicated node has + // already been added to the tree. + absl::flat_hash_map programs_map_; + absl::flat_hash_map category_map_; + + // Map to look up program names by id. 
+ const tensorflow::protobuf::Map* program_name_map_ = + nullptr; +}; +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_PROFILE_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stack.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stack.h new file mode 100644 index 00000000..6bfa4d77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stack.h @@ -0,0 +1,69 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace profiler { + +template +class OpStack { + public: + // Pushes an Op onto the stack. + void Push(uint32 op_id, std::unique_ptr op_info) { + stack_.emplace_back(op_id, std::move(op_info)); + } + + // Pops the Op with the given op_id from the stack. + std::unique_ptr Pop(uint32 op_id) { + // Pop until match or stack_ is empty. + std::unique_ptr result; + while (!stack_.empty()) { + auto back = std::move(stack_.back()); + stack_.pop_back(); + if (op_id == back.first) { + result = std::move(back.second); + break; + } + } + return result; + } + + // Returns the Op at the top of the stack. + OpInfo* Top() const { + return stack_.empty() ? nullptr : stack_.back().second.get(); + } + + // Returns true if the stack is empty. + bool Empty() const { return stack_.empty(); } + + // Clears the stack. + void Clear() { stack_.clear(); } + + private: + std::vector>> stack_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_combiner.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_combiner.h new file mode 100644 index 00000000..a8cb3c62 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_combiner.h @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/utils/step_intersection.h" + +namespace tensorflow { +namespace profiler { + +// Whether a host is a coordinator. +bool IsCoordinator(bool no_accelerator_in_system, HardwareType hardware_type); + +// Translates the core id from single host to the one for multiple-host. +// We need this translation because the device_ordinal was assigned when a +// single host response was given. Now, we need a global core_id to distinguish +// it with multiple hosts. +uint32 GlobalCoreId(int host_id, uint32 device_ordinal); + +// Combines the src map into the dst map. +// The src map keys are local core_ids. The src_host_id is used to convert them +// into global core_ids used as keys in the dst map. +// REQUIRED: cores from src_host_id are not already in dst. +template +void CombineCoreIdMap(int src_host_id, const CoreIdMap& src, CoreIdMap* dst) { + for (const auto& core_id_and_value : src) { + uint32 global_core_id = GlobalCoreId(src_host_id, core_id_and_value.first); + auto iter_and_inserted = + dst->insert({global_core_id, core_id_and_value.second}); + DCHECK(iter_and_inserted.second) + << "Duplicated core_id: " << iter_and_inserted.first->first; + } +} + +// A struct that contains all the information that is needed to combine OpStats. +struct OpStatsInfo { + OpStatsInfo(const OpStats* op_stats, HardwareType hardware_type, + int src_host_id) + : op_stats(op_stats), + hardware_type(hardware_type), + src_host_id(src_host_id) {} + const OpStats* op_stats; + HardwareType hardware_type; + int src_host_id; +}; + +// Returns true if there is no device (accelerator) in any of the hosts. +bool NoAcceleratorInSystem(const std::vector& all_op_stats_info); + +// Compute the StepIntersection to merge OpStats. +// Profiler will limit the number of steps to be at most . +StepIntersection ComputeStepIntersectionToMergeOpStats( + const std::vector& all_op_stats_info, + uint32 max_step_per_host); + +// Combine all the OpStats in using the steps in range +// . The result is stored in . +void CombineAllOpStats(const std::vector& all_op_stats_info, + const StepIntersection& step_intersection, + OpStats* combined_op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_hlo_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_hlo_stats.h new file mode 100644 index 00000000..1037ef19 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_hlo_stats.h @@ -0,0 +1,30 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_HLO_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_HLO_STATS_H_ + +#include "tensorflow/core/profiler/protobuf/hlo_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" + +namespace tensorflow { +namespace profiler { +tensorflow::profiler::hlo_stats::HloStatsDatabase ConvertOpStatsToHloStats( + const tensorflow::profiler::OpStats& op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_HLO_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h new file mode 100644 index 00000000..c9de162e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h @@ -0,0 +1,90 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_INPUT_PIPELINE_ANALYSIS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_INPUT_PIPELINE_ANALYSIS_H_ + +#include + +#include "google/protobuf/any.pb.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/util/stats_calculator.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" + +namespace tensorflow { +namespace profiler { + +StepSummary GetStepSummaryForSampleStats(const tsl::Stat& sample_stats); + +// If the percent of input-time spent on host-to-device transfer is greater than +// kHostToDeviceTimePercentAsSignificant, we should advise the +// user to optimize this transfer. +constexpr double kHostToDeviceTimePercentAsSignificant = 10.0; + +// If the percent of input-time spent on host-to-device transfer is greater than +// kHostToDeviceTimePercentAsDominant, we should ONLY advise the +// user to optimize this transfer; we won't bother to suggest optimization for +// tf.data. +constexpr double kHostToDeviceTimePercentAsDominant = 90.0; + +// Computes the summary of step time in milliseconds. 
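// Illustrative sketch (not part of the upstream header; the function name and
// the string labels are invented for illustration) of how the two thresholds
// above are typically combined when classifying the input pipeline. The real
// advice generation lives in the corresponding .cc file and is more nuanced.
inline const char* ClassifyHostToDeviceTransfer(double host_to_device_percent) {
  if (host_to_device_percent > kHostToDeviceTimePercentAsDominant) {
    // Transfer dominates the input time: only advise optimizing the transfer.
    return "optimize host-to-device transfer";
  }
  if (host_to_device_percent > kHostToDeviceTimePercentAsSignificant) {
    // Transfer is significant but not dominant: advise it alongside tf.data.
    return "optimize host-to-device transfer and tf.data";
  }
  return "focus on tf.data / other input-pipeline work";
}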
+StepSummary ComputeStepTimeSummaryInMs( + const ::tensorflow::protobuf::RepeatedPtrField& + grouped_by_step); + +void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db, + InputPipelineAnalysisResult* result); + +InputPipelineAnalysisRecommendation GenerateRecommendation(); + +// Returns the performance bottleneck of the program executed. +BottleneckAnalysis ComputeBottleneckAnalysis( + const InputTimeBreakdown& input_time_breakdown, + const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>& + any_step_details); + +InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( + const OpStats& op_stats); + +// Returns true if explanation for "All Others" time is also included in +// input_statement. +bool InputAnalysis(double input_percent, double all_other_percent, + std::string* input_classification, + std::string* input_statement); + +void OutputAnalysis(double output_percent, std::string* output_classification, + std::string* output_statement); + +string GetSummaryNextStep(absl::string_view input_classification, + const InputTimeBreakdown& breakdown); + +// Returns the percentage of the input time that is spent on transferring the +// data from host to device. +double HostToDeviceTransferAsPercentOfInputTime( + const InputTimeBreakdown& breakdown); + +void AddErrorMessages(const OpStats& op_stats, + InputPipelineAnalysisResult* result); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_INPUT_PIPELINE_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_op_profile.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_op_profile.h new file mode 100644 index 00000000..1fcfefb5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_op_profile.h @@ -0,0 +1,56 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OP_PROFILE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OP_PROFILE_H_ + +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/op_profile.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +// Assembles a hierarchical performance profile based on HLOs in the op metrics +// db. +// The node hierarchy is as following: +// by_category +// - combined_root +// - category 1 +// - category 2 +// - ... +// - idle +// by_program +// - program_1_root +// - category 1 +// - category 2 +// - ... +// - program_2_root +// - category 1 +// - ... +// - idle +// The nodes in the profile are sorted by time in decreasing order and pruned +// to reduce the profile size. Only 100 nodes are kept for level >= 3. +// See op_profile.proto for the detailed semantics of the returned profile. 
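// (The "100" above matches the default of the op_profile_limit parameter
// declared below; callers can raise or lower the pruning limit through it.)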
+void ConvertOpStatsToOpProfile( + const tensorflow::profiler::OpStats& op_stats, + tensorflow::profiler::HardwareType hardware_type, + tensorflow::profiler::op_profile::Profile& profile, + int op_profile_limit = 100); + +} // namespace profiler +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OP_PROFILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_overview_page.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_overview_page.h new file mode 100644 index 00000000..2911e956 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_overview_page.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/overview_page.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Reports tf-function optimization opportunity in the Overview Page if the +// expensive-call-time percentage is over this threshold for at least one of +// the tf-functions profiled. +const double kTfFunctionReportThresholdInPercent = 20; + +// Reports eager-mode optimization opportunity in the Overview Page if the +// percent of Op time on host (or device) that is spent on eager mode is over +// this threshold. +const double kEagerReportThresholdInPercent = 10; + +// Reports outside-compilation opportunity in the Overview Page if the +// percent of Op time on device that is for outside compilation is over +// this threshold. 
+const double kOutsideCompilationThresholdInPercent = 5; + +void SetCommonRecommendation( + absl::string_view input_classification, absl::string_view input_statement, + absl::string_view output_statement, HardwareType hardware_type, + absl::string_view tf_function_statement_html, + absl::string_view eager_statement_html, + absl::string_view outside_compilation_statement_html, + OverviewPageRecommendation* re); + +OverviewPageRecommendation ComputeGenericRecommendation( + const BottleneckAnalysis& bottleneck, + const PrecisionStats& precision_stats); + +OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats); + +OverviewPageRunEnvironment ComputeRunEnvironment( + const RunEnvironment& run_environment); + +OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats); + +// Returns a html which provides tf-function related recommendation. +std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db); + +// Returns a html which provides eager-mode related recommendation. +std::string EagerRecommendationHtml(double host_op_time_eager_percent, + double device_op_time_eager_percent); + +// Returns a html which provides outside-compilation related recommendation. +std::string OutsideCompilationRecommendationHtml( + double device_op_time_outside_compilation_percent); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_stats.h new file mode 100644 index 00000000..bd3d7406 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_stats.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_STATS_H_ + +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/pod_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +PodStatsDatabase ConvertOpStatsToPodStats(const OpStats& op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_viewer.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_viewer.h new file mode 100644 index 00000000..c45c9939 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_pod_viewer.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_VIEWER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_VIEWER_H_ + +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/pod_viewer.pb.h" + +namespace tensorflow { +namespace profiler { + +PodViewerDatabase ConvertOpStatsToPodViewer(const OpStats& op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_POD_VIEWER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_roofline_model.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_roofline_model.h new file mode 100644 index 00000000..d745b96f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_roofline_model.h @@ -0,0 +1,98 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
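Aside (not part of the patch): the pod-stats and pod-viewer converters above are single-call entry points that fan out from one OpStats proto. A hedged sketch of that fan-out, assuming the caller already has an OpStats in hand:

#include "tensorflow/core/profiler/convert/op_stats_to_pod_stats.h"
#include "tensorflow/core/profiler/convert/op_stats_to_pod_viewer.h"

namespace tensorflow {
namespace profiler {

// Hypothetical driver: each converter returns the proto consumed by the
// matching frontend tool.
void ConvertForPodTools(const OpStats& op_stats) {
  PodStatsDatabase pod_stats = ConvertOpStatsToPodStats(op_stats);
  PodViewerDatabase pod_viewer = ConvertOpStatsToPodViewer(op_stats);
  (void)pod_stats;
  (void)pod_viewer;
}

}  // namespace profiler
}  // namespace tensorflow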
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_ROOFLINE_MODEL_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_ROOFLINE_MODEL_H_ + +#include + +#include "tsl/platform/protobuf.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/roofline_model.pb.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" + +namespace tensorflow { +namespace profiler { + +using tensorflow::profiler::OpMetrics; +using tensorflow::profiler::roofline_model::RecordType; +using tensorflow::profiler::roofline_model::RooflineModelDatabase; +using tensorflow::profiler::roofline_model::RooflineModelRecord; + +RooflineModelRecord ConvertOpMetricsToRooflineModelRecord( + const OpStats& op_stats, const OpMetrics& metrics, RecordType record_type, + uint32_t step_num, uint64_t total_time_ps, + const RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed); + +RooflineModelRecord GenerateRooflineModelProgramRecord( + const OpStats& op_stats, const OpMetricsDb& db, RecordType record_type, + uint32_t step_num, const RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed); + +tsl::protobuf::RepeatedPtrField +ConvertOpMetricsDbToRooflineModelRecords( + const OpStats& op_stats, const OpMetricsDb& db, RecordType record_type, + uint32_t step_num, const RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed); + +tensorflow::profiler::roofline_model::RooflineModelDatabase +ConvertOpStatsToRooflineModel(const tensorflow::profiler::OpStats& tf_op_stats, + bool include_infeed_outfeed); + +tensorflow::profiler::roofline_model::RooflineModelDatabase +InitializeRooflineModelDatabaseFromOpStats(const OpStats& op_stats, + bool include_infeed_outfeed); +// Generate RooflineModelRecord for the HLO DB over the entire profiling +// duration including incomplete steps. +inline void AddRooflineModelRecordForProfileDuration( + const OpStats& op_stats, RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed) { + *roofline_model_db.mutable_roofline_model_record() = + ConvertOpMetricsDbToRooflineModelRecords( + op_stats, op_stats.device_op_metrics_db(), RecordType::ALL, + /*step_num=*/0, roofline_model_db, include_infeed_outfeed); +} + +// Generate RooflineModelRecord for the HLO DB over complete steps only. +inline void AddRooflineModelRecordsForCompleteSteps( + const OpStats& op_stats, RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed) { + if (op_stats.has_hlo_metrics_db_complete_steps_only()) { + *roofline_model_db.add_roofline_model_record() = + GenerateRooflineModelProgramRecord( + op_stats, op_stats.hlo_metrics_db_complete_steps_only(), + RecordType::AVERAGE_STEP, /*step_num=*/0, roofline_model_db, + include_infeed_outfeed); + } +} + +// Generate RooflineModelRecords for the per-step DBs. 
+inline void AddRooflineModelRecordsPerStep( + const OpStats& op_stats, RooflineModelDatabase& roofline_model_db, + bool include_infeed_outfeed) { + for (const auto& step_info : op_stats.step_db().step_sequence()) { + *roofline_model_db.add_roofline_model_record() = + GenerateRooflineModelProgramRecord( + op_stats, step_info.hlo_metrics_db(), RecordType::PER_STEP, + step_info.step_num(), roofline_model_db, include_infeed_outfeed); + } +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_ROOFLINE_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_tf_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_tf_stats.h new file mode 100644 index 00000000..3b8a06ef --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/op_stats_to_tf_stats.h @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_TF_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_TF_STATS_H_ + +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +TfStatsDatabase ConvertOpStatsToTfStats(const OpStats& op_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_TF_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/preprocess_single_host_xplane.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/preprocess_single_host_xplane.h new file mode 100644 index 00000000..4c86ed87 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/preprocess_single_host_xplane.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_PREPROCESS_SINGLE_HOST_XPLANE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_PREPROCESS_SINGLE_HOST_XPLANE_H_ + +#include "xla/tsl/profiler/utils/group_events.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Preprocess XSpaces before tools conversion. +// If step_grouping = true, perform events grouping for step tracking. 
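Aside (not part of the patch): the three inline helpers in op_stats_to_roofline_model.h compose into a full roofline database. The driver below is a hypothetical illustration of that composition, roughly what ConvertOpStatsToRooflineModel is expected to assemble; its actual implementation is not vendored here.

#include "tensorflow/core/profiler/convert/op_stats_to_roofline_model.h"

namespace tensorflow {
namespace profiler {

// Hypothetical: initialize the database from OpStats, then append records for
// the whole profiling duration, for complete steps only, and per step.
RooflineModelDatabase BuildRooflineModelForIllustration(
    const OpStats& op_stats, bool include_infeed_outfeed) {
  RooflineModelDatabase db = InitializeRooflineModelDatabaseFromOpStats(
      op_stats, include_infeed_outfeed);
  AddRooflineModelRecordForProfileDuration(op_stats, db,
                                           include_infeed_outfeed);
  AddRooflineModelRecordsForCompleteSteps(op_stats, db, include_infeed_outfeed);
  AddRooflineModelRecordsPerStep(op_stats, db, include_infeed_outfeed);
  return db;
}

}  // namespace profiler
}  // namespace tensorflow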
+// If derived_timeline, generate derived timeline (XLines). +// If group_metadata_map is not nullptr, populate the group metadata map. +void PreprocessSingleHostXSpace( + XSpace* space, bool step_grouping, bool derived_timeline, + tsl::profiler::GroupMetadataMap* group_metadata_map = nullptr); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_PREPROCESS_SINGLE_HOST_XPLANE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/process_megascale_dcn.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/process_megascale_dcn.h new file mode 100644 index 00000000..794c2bea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/process_megascale_dcn.h @@ -0,0 +1,29 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_PROCESS_MEGASCALE_DCN_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_PROCESS_MEGASCALE_DCN_H_ + +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Process Dcn Megascale TraceMe info. +void ProcessMegascaleDcn(XSpace* space); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_PROCESS_MEGASCALE_DCN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/profile_time_breakdown.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/profile_time_breakdown.h new file mode 100644 index 00000000..1e3379be --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/profile_time_breakdown.h @@ -0,0 +1,244 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_PROFILE_TIME_BREAKDOWN_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_PROFILE_TIME_BREAKDOWN_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/log/check.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/convert/xla_op_utils.h" +#include "xla/tsl/profiler/utils/math_utils.h" + +namespace tensorflow { +namespace profiler { + +// Allows accumulating time spent in different HLO instruction categories to +// breakdown the total profile time and compute metrics of interest. 
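Aside (not part of the patch): a minimal sketch of calling PreprocessSingleHostXSpace from preprocess_single_host_xplane.h above, assuming the XSpace was loaded elsewhere. The flag values and the use of the group metadata map are assumptions about a typical call site, not vendored code.

#include "xla/tsl/profiler/utils/group_events.h"
#include "tensorflow/core/profiler/convert/preprocess_single_host_xplane.h"

namespace tensorflow {
namespace profiler {

// Hypothetical helper: group events for step tracking and build derived
// timelines before handing the XSpace to any tool converter.
void PreprocessForToolsSketch(XSpace* space) {
  tsl::profiler::GroupMetadataMap group_metadata_map;
  PreprocessSingleHostXSpace(space, /*step_grouping=*/true,
                             /*derived_timeline=*/true, &group_metadata_map);
  // group_metadata_map is now populated with per-group metadata that
  // downstream converters can consult.
}

}  // namespace profiler
}  // namespace tensorflow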
+class ProfileTimeBreakdown { + public: + // Category should be the operator category disambiguated by xprof instead of + // the original category from XLA. + // For a correct time breakdown, we need to use the self time of operators, + // instead of total time to avoid double counting. Note that for leaf ops, + // self time and total time are the same. + void IncrementCategoryTimePs(absl::string_view category, + uint64_t self_time_ps) { + time_ps_by_category_[category] += self_time_ps; + total_time_ps_ += self_time_ps; + } + + // Profile time cannot be smaller than the total time in all categories. + // If combining profiles across multiple cores, profile time should be the + // profiling duration multiplied by the number of cores that were profiled. + // go/autograppler_profile_time + void SetProfileTimePs(uint64_t profile_time_ps) { + DCHECK_LE(total_time_ps_, profile_time_ps); + profile_time_ps_ = profile_time_ps; + } + + // Breaks down "sparsecorev0 infeed" into two components: + // 1) "sparsecorev0 infeed wait": Time spent waiting on the SparseCoreV0. + // 2) "sparsecorev0 infeed transform": Time spent transforming activations in + // SparseCoreV0 layout into XLA layout. + // Even though 2) is part of the overall embedding computation, it is time + // spent doing work on the TensorCore. + void BreakdownSparseCoreV0Infeed(); + + // Duty cycle is the fraction of time an accelerator is being actively used. + // go/accelerator-metrics-definitions#common-accelerator-metrics + // go/ag-tpu-duty-cycle + double DutyCycle() const { return TimeFraction(OnDutyTimePs()); } + + double IdleFraction() const { return TimeFraction(IdleTimePs()); } + + double InfeedFraction() const { + return CategoryFraction(tsl::profiler::kHloInfeed); + } + + double OutfeedFraction() const { + return CategoryFraction(tsl::profiler::kHloOutfeed); + } + + double SparseCoreV0InfeedFraction() const { + return CategoriesFraction({tsl::profiler::kHloSparseCoreV0Infeed, + tsl::profiler::kHloSparseCoreV0InfeedWait, + tsl::profiler::kHloSparseCoreV0InfeedTransform}); + } + + double SparseCoreV0OutfeedFraction() const { + return CategoryFraction(tsl::profiler::kHloSparseCoreV0Outfeed); + } + + double AllReduceFraction() const { + return CategoryFraction(tsl::profiler::kHloAllReduce); + } + + double AllReduceFusionFraction() const { + return CategoryFraction(tsl::profiler::kHloAllReduceFusion); + } + + double SendRecvFraction() const { + return CategoriesFraction( + {tsl::profiler::kHloSend, tsl::profiler::kHloSendDone, + tsl::profiler::kHloRecv, tsl::profiler::kHloRecvDone}); + } + + double HostSendRecvFraction() const { + return CategoriesFraction( + {tsl::profiler::kHloHostSend, tsl::profiler::kHloHostSendDone, + tsl::profiler::kHloHostRecv, tsl::profiler::kHloHostRecvDone}); + } + + double CategoriesFraction( + const std::initializer_list& categories) const { + return TimeFraction(CategoriesTimePs(categories)); + } + + double CategoryFraction(absl::string_view category) const { + return TimeFraction(CategoryTimePs(category)); + } + + uint64_t ProfileTimePs() const { return profile_time_ps_; } + + uint64_t TotalTimePs() const { return total_time_ps_; } + + uint64_t IdleTimePs() const { return profile_time_ps_ - total_time_ps_; } + + uint64_t OnDutyTimePs() const { return profile_time_ps_ - OffDutyTimePs(); } + + uint64_t OffDutyTimePs() const { + return IdleTimePs() + + CategoriesTimePs( + {tsl::profiler::kHloInfeed, tsl::profiler::kHloOutfeed, + tsl::profiler::kHloHostSend, tsl::profiler::kHloHostSendDone, + 
tsl::profiler::kHloHostRecv, tsl::profiler::kHloHostRecvDone, + tsl::profiler::kHloMegacoreFusion}); + } + + uint64_t InfeedTimePs() const { + return CategoryTimePs(tsl::profiler::kHloInfeed); + } + + uint64_t OutfeedTimePs() const { + return CategoryTimePs(tsl::profiler::kHloOutfeed); + } + + uint64_t SparseCoreV0InfeedWaitTimePs() const { + return CategoryTimePs(tsl::profiler::kHloSparseCoreV0InfeedWait); + } + + uint64_t SparseCoreV0InfeedTransformTimePs() const { + return CategoryTimePs(tsl::profiler::kHloSparseCoreV0InfeedTransform); + } + + uint64_t SparseCoreV0OutfeedTimePs() const { + return CategoryTimePs(tsl::profiler::kHloSparseCoreV0Outfeed); + } + + uint64_t AllReduceOrAllToAllTimePs() const { + return CategoriesTimePs({tsl::profiler::kHloAllReduce, + tsl::profiler::kHloAllReduceFusion, + tsl::profiler::kHloAllToAll}); + } + + uint64_t SendTimePs() const { + return CategoriesTimePs( + {tsl::profiler::kHloSend, tsl::profiler::kHloSendDone}); + } + + uint64_t RecvTimePs() const { + return CategoriesTimePs( + {tsl::profiler::kHloRecv, tsl::profiler::kHloRecvDone}); + } + + uint64_t HostSendTimePs() const { + return CategoriesTimePs( + {tsl::profiler::kHloHostSend, tsl::profiler::kHloHostSendDone}); + } + + uint64_t HostRecvTimePs() const { + return CategoriesTimePs( + {tsl::profiler::kHloHostRecv, tsl::profiler::kHloHostRecvDone}); + } + + // Megacore fusion runs different operations on each core, e.g., a convolution + // on one core and an all-reduce on the other core. In a trace, megacore + // fusion is the parent operation, and its self time is the time that the core + // executing the faster operation waits for the core executing the slower + // operation to reach the synchronization point. + uint64_t MegacoreFusionTimePs() const { + return CategoryTimePs(tsl::profiler::kHloMegacoreFusion); + } + + uint64_t HighFlopsComputeTimePs() const { + return CategoriesTimePs({tsl::profiler::kHloConvolution, + tsl::profiler::kHloConvolutionBaseDilated, + tsl::profiler::kHloConvolutionWindowDilated, + tsl::profiler::kHloConvolutionFusion, + tsl::profiler::kHloOutputFusion}); + } + + // Calculated according to the "TC busy time" defined in go/tpu_kpis + uint64_t TensorCoreBusyTimePs() const { + return profile_time_ps_ - OffDutyTimePs() - SparseCoreV0InfeedWaitTimePs(); + } + + uint64_t CategoriesTimePs( + const std::initializer_list& categories) const { + uint64_t time_ps = 0; + for (auto category : categories) { + time_ps += CategoryTimePs(category); + } + return time_ps; + } + + uint64_t CategoryTimePs(absl::string_view category) const { + auto iter = time_ps_by_category_.find(category); + return (iter == time_ps_by_category_.end()) ? 0 : iter->second; + } + + template + void ComputeCategoryFractions(Map& category_fractions) { + for (const auto& [category, time_ps] : time_ps_by_category_) { + category_fractions[category] = TimeFraction(time_ps); + } + } + + std::string DebugString() const; + + private: + // Overwrites the time attributed to the given category. + void SetCategoryTimePs(absl::string_view category, uint64_t time_ps); + + // Removes and returns the time attributed to the given category. + uint64_t PopCategoryTimePs(absl::string_view category); + + double TimeFraction(uint64_t time_ps) const { + return tsl::profiler::SafeDivide(time_ps, profile_time_ps_); + } + + absl::flat_hash_map time_ps_by_category_; + uint64_t total_time_ps_ = 0; // Sum of values in time_ps_by_category_. 
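Aside (not part of the patch): a usage sketch for ProfileTimeBreakdown, assuming per-category self times have already been extracted from an OpMetricsDb. The category strings and numbers are made up for illustration.

#include "tensorflow/core/profiler/convert/profile_time_breakdown.h"

namespace tensorflow {
namespace profiler {

// Illustrative values only: accumulate self times, set the profile duration,
// then read back the derived fractions.
double ExampleDutyCycle() {
  ProfileTimeBreakdown breakdown;
  breakdown.IncrementCategoryTimePs("convolution", /*self_time_ps=*/600'000);
  // "infeed" matches tsl::profiler::kHloInfeed, so it counts as off-duty time.
  breakdown.IncrementCategoryTimePs("infeed", /*self_time_ps=*/100'000);
  // Profile time must be at least the sum of per-category self times.
  breakdown.SetProfileTimePs(/*profile_time_ps=*/1'000'000);
  // Idle = 1'000'000 - 700'000 = 300'000 ps; off-duty adds the infeed time,
  // so DutyCycle() = (1'000'000 - 400'000) / 1'000'000 = 0.6.
  return breakdown.DutyCycle();
}

}  // namespace profiler
}  // namespace tensorflow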
+ uint64_t profile_time_ps_ = 0; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_PROFILE_TIME_BREAKDOWN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/repository.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/repository.h new file mode 100644 index 00000000..af990aa5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/repository.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_REPOSITORY_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_REPOSITORY_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/file_system_utils.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/utils/hlo_module_map.h" +#include "tsl/platform/env.h" +#include "tsl/platform/statusor.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +constexpr char kAllHostsIdentifier[] = "ALL_HOSTS"; +constexpr char kNoHostIdentifier[] = "NO_HOST"; + +enum StoredDataType { + DCN_COLLECTIVE_STATS, +}; + +static auto* kHostDataSuffixes = + new std::vector>( + {{StoredDataType::DCN_COLLECTIVE_STATS, ".dcn_collective_stats.pb"}}); + +// File system directory snapshot of a profile session. +class SessionSnapshot { + public: + // Performs validation and creates SessionSnapshot. + // are the file paths to XSpace protos. + // Optionally, can contain the XSpace protos pre-loaded by the + // profiler plugin. + static absl::StatusOr Create( + std::vector xspace_paths, + std::optional>> xspaces); + + // Returns the number of XSpaces in the profile session. + size_t XSpaceSize() const { return xspace_paths_.size(); } + + // Gets XSpace proto. + // The caller of this function will take ownership of the XSpace. + absl::StatusOr> GetXSpace(size_t index) const; + + // Gets XSpace proto. + // The caller of this function will take ownership of the XSpace. + absl::StatusOr> GetXSpaceByName( + absl::string_view name) const; + + // Gets host name. + std::string GetHostname(size_t index) const; + + // Gets the run directory of the profile session. + absl::string_view GetSessionRunDir() const { return session_run_dir_; } + + // Gets whether the session has an accessible run dir. If false, any + // path-based file read will be disabled in this mode. + bool HasAccessibleRunDir() const { return has_accessible_run_dir_; } + + // Gets the path of the fast file for a given tool. + std::optional GetFilePath(absl::string_view toolname, + absl::string_view host) const; + + // Gets the name of the host data file. 
+ absl::StatusOr GetHostDataFileName(StoredDataType data_type, + std::string host) const; + + // Gets the path of the host data file. + absl::StatusOr> GetHostDataFilePath( + StoredDataType data_type, std::string host) const; + + /* Gets whether the cache file is present in run dir. First value indicates + whether cache file is present or not. Second value indicates the path of cache + file. Possible cases are: + 1. : If no cache file is present + 2. : If cache file is present but file contains no data_type + events + 3. : If cache file is present and file contains data_type + events + */ + absl::StatusOr> HasCacheFile( + StoredDataType data_type) const; + + template + absl::Status WriteBinaryProto(const StoredDataType data_type, + const std::string host, T& proto) const { + // Gets name for host data file. + TF_ASSIGN_OR_RETURN(std::string filename, + GetHostDataFileName(data_type, host)); + + std::string filepath = + tsl::profiler::ProfilerJoinPath(GetSessionRunDir(), filename); + + return tensorflow::WriteBinaryProto(tsl::Env::Default(), filepath, proto); + } + + template + absl::Status ReadBinaryProto(const StoredDataType data_type, + const std::string host, T* proto) const { + // Gets file path for host data. + TF_ASSIGN_OR_RETURN(std::optional filepath, + GetHostDataFilePath(data_type, host)); + if (filepath) { + return tensorflow::ReadBinaryProto(tsl::Env::Default(), filepath.value(), + proto); + } + + return absl::NotFoundError( + absl::StrCat("No binary proto found for ", host, " and ", data_type)); + } + + private: + SessionSnapshot(std::vector xspace_paths, + std::optional>> xspaces) + : xspace_paths_(std::move(xspace_paths)), + // If the snapshot was initialized by xspaces, the file path and run dir + // is a path tensorflow can't read from or write to so any file IO + // encapsulated in this class will be disabled in this mode. + has_accessible_run_dir_(!xspaces.has_value()), + xspaces_(std::move(xspaces)) { + session_run_dir_ = tensorflow::io::Dirname(xspace_paths_.at(0)); + for (size_t i = 0; i < xspace_paths_.size(); ++i) { + std::string host_name = GetHostname(i); + hostname_map_[host_name] = i; + } + } + + // File paths to XSpace protos. + std::vector xspace_paths_; + // The run directory of the profile session. + absl::string_view session_run_dir_; + + absl::flat_hash_map + hostname_map_; + + const bool has_accessible_run_dir_; + + // XSpace protos pre-loaded by the profiler plugin. + // TODO(profiler): Use blobstore paths to initialize SessionSnapshot instead + // of using pre-loaded XSpaces. + mutable std::optional>> xspaces_; +}; + +// Writes binary proto format T for a host and data_type to a session. +template +absl::Status WriteBinaryProto(const SessionSnapshot& session_snapshot, + const StoredDataType data_type, + const std::string& host, T& proto) { + return session_snapshot.WriteBinaryProto(data_type, host, proto); +} + +// Reads binary proto format T for a host and data_type to a session. +template +absl::Status ReadBinaryProto(const SessionSnapshot& session_snapshot, + const StoredDataType data_type, + const std::string& host, T* proto) { + return session_snapshot.ReadBinaryProto(data_type, host, proto); +} + +// Process HloModuleMap from all XSpaces in a session. 
+inline absl::StatusOr ProcessHloModuleMap( + const SessionSnapshot& session_snapshot) { + HloModuleMap hlo_module_map; + for (int i = 0; i < session_snapshot.XSpaceSize(); i++) { + TF_ASSIGN_OR_RETURN(std::unique_ptr xspace, + session_snapshot.GetXSpace(i)); + ProcessHloModuleMapFromXSpace(hlo_module_map, xspace.get()); + } + return hlo_module_map; +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_REPOSITORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/step_events_to_steps_db.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/step_events_to_steps_db.h new file mode 100644 index 00000000..9764c46c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/step_events_to_steps_db.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_STEP_EVENTS_TO_STEPS_DB_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_STEP_EVENTS_TO_STEPS_DB_H_ + +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" + +namespace tensorflow { +namespace profiler { + +TF_CONST_INIT extern const uint32 kDefaultGpuLocalCoreId; + +// Converts from overlapped Step-Events to StepDatabaseResult. +StepDatabaseResult ConvertStepEventsToStepDb( + bool has_device, bool maybe_drop_incomplete_steps, + StepEvents& overlapped_step_events); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_STEP_EVENTS_TO_STEPS_DB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tool_options.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tool_options.h new file mode 100644 index 00000000..85f285e7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tool_options.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
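Aside (not part of the patch): a hedged sketch of the cache round-trip enabled by the WriteBinaryProto/ReadBinaryProto wrappers in repository.h above, using the DCN_COLLECTIVE_STATS suffix from kHostDataSuffixes. The proto type is a placeholder; the concrete message cached by the tools is not part of this diff.

#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/profiler/convert/repository.h"

namespace tensorflow {
namespace profiler {

// StatsProto stands in for whatever proto a tool caches under
// StoredDataType::DCN_COLLECTIVE_STATS.
template <typename StatsProto>
absl::Status CacheRoundTripSketch(const SessionSnapshot& session_snapshot,
                                  const std::string& host, StatsProto& stats) {
  // Writes "<host>.dcn_collective_stats.pb" into the session run directory.
  absl::Status status = WriteBinaryProto(
      session_snapshot, StoredDataType::DCN_COLLECTIVE_STATS, host, stats);
  if (!status.ok()) return status;
  // Reads it back; a NotFoundError is returned if the file is absent.
  StatsProto reread;
  return ReadBinaryProto(session_snapshot, StoredDataType::DCN_COLLECTIVE_STATS,
                         host, &reread);
}

}  // namespace profiler
}  // namespace tensorflow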
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TOOL_OPTIONS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TOOL_OPTIONS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_format.h" + +namespace tensorflow { +namespace profiler { + +using ToolOptions = + absl::flat_hash_map>; + +// Helper function to get parameter from tool options. +template +std::optional GetParam(const ToolOptions& options, const std::string& key) { + const auto iter = options.find(key); + if (iter == options.end()) { + return std::nullopt; + } + + const T* result = std::get_if(&iter->second); + if (!result) { + return std::nullopt; + } + return *result; +} + +// Helper function to get parameter from tool options with default value. +template +T GetParamWithDefault(const ToolOptions& options, const std::string& key, + const T& default_param) { + if (auto param = GetParam(options, key)) { + return *param; + } + return default_param; +} + +inline std::string DebugString(const ToolOptions& options) { + std::string output; + for (const auto& [k, v] : options) { + absl::StrAppend( + &output, k, ":", + std::visit([](const auto& value) { return absl::StrCat(value); }, v), + ":", v.index(), ";"); + } + return absl::StrCat("{", output, "}"); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TOOL_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tpu_input_pipeline_analysis_constants.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tpu_input_pipeline_analysis_constants.h new file mode 100644 index 00000000..352a2b77 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/tpu_input_pipeline_analysis_constants.h @@ -0,0 +1,30 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TPU_INPUT_PIPELINE_ANALYSIS_CONSTANTS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TPU_INPUT_PIPELINE_ANALYSIS_CONSTANTS_H_ + +#include "absl/strings/string_view.h" +#include "tsl/platform/macros.h" + +namespace tensorflow { +namespace profiler { + +TF_CONST_INIT extern const absl::string_view kProfileAllHostsDoc; +TF_CONST_INIT extern const absl::string_view kSparseCoreV0Name; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TPU_INPUT_PIPELINE_ANALYSIS_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_event_arguments_builder.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_event_arguments_builder.h new file mode 100644 index 00000000..73a0f81e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_event_arguments_builder.h @@ -0,0 +1,64 @@ +/* Copyright 2023 The TensorFlow Authors. 
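Aside (not part of the patch): a usage sketch for ToolOptions, GetParam, and GetParamWithDefault from tool_options.h above. The variant's alternatives were stripped in this vendored copy, so the example assumes it admits at least int and std::string; the option keys are placeholders, not names guaranteed by any tool.

#include <optional>
#include <string>

#include "tensorflow/core/profiler/convert/tool_options.h"

namespace tensorflow {
namespace profiler {

// Hypothetical keys "host" and "module_id", chosen only for illustration.
void ToolOptionsExample() {
  ToolOptions options;
  options["host"] = std::string("worker-0");
  options["module_id"] = 42;

  // std::nullopt if the key is missing or stores a different alternative.
  std::optional<std::string> host = GetParam<std::string>(options, "host");

  // Falls back to the default when the key is absent.
  int module_id =
      GetParamWithDefault<int>(options, "module_id", /*default_param=*/0);
  (void)host;
  (void)module_id;
}

}  // namespace profiler
}  // namespace tensorflow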
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENT_ARGUMENTS_BUILDER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENT_ARGUMENTS_BUILDER_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/protobuf/trace_events_raw.pb.h" + +namespace tensorflow { +namespace profiler { + +// Helper class for adding arguments to TraceEventsArguments. +class TraceEventArgumentsBuilder { + public: + explicit TraceEventArgumentsBuilder(TraceEventArguments* args) + : args_(args) {} + + void Append(absl::string_view key, absl::string_view value) { + auto* arg = args_->add_arg(); + arg->set_name(key.data(), key.size()); + arg->set_str_value(value.data(), value.size()); + } + + void Append(absl::string_view key, int64_t value) { + auto* arg = args_->add_arg(); + arg->set_name(key.data(), key.size()); + arg->set_int_value(value); + } + + void Append(absl::string_view key, uint64_t value) { + auto* arg = args_->add_arg(); + arg->set_name(key.data(), key.size()); + arg->set_uint_value(value); + } + + void Append(absl::string_view key, double value) { + auto* arg = args_->add_arg(); + arg->set_name(key.data(), key.size()); + arg->set_double_value(value); + } + + private: + TraceEventArguments* args_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENT_ARGUMENTS_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events.h new file mode 100644 index 00000000..0581aab2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events.h @@ -0,0 +1,513 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
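Aside (not part of the patch): a short sketch that exercises each Append overload of TraceEventArgumentsBuilder from trace_event_arguments_builder.h above; the argument names and values are illustrative.

#include <cstdint>

#include "tensorflow/core/profiler/convert/trace_viewer/trace_event_arguments_builder.h"

namespace tensorflow {
namespace profiler {

// Populates a TraceEventArguments message with one argument per overload.
void BuildExampleArguments(TraceEventArguments* args) {
  TraceEventArgumentsBuilder builder(args);
  builder.Append("kernel", "fusion.123");      // str_value
  builder.Append("occupancy_pct", 87.5);       // double_value
  builder.Append("bytes", uint64_t{1} << 20);  // uint_value
  builder.Append("step", int64_t{7});          // int_value
}

}  // namespace profiler
}  // namespace tensorflow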
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/optimization.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/functional/bind_front.h" +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "xla/tsl/lib/io/table.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_events_filter_interface.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_events_util.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_viewer_visibility.h" +#include "tensorflow/core/profiler/lib/context_types.h" +#include "tensorflow/core/profiler/protobuf/task.pb.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/file_system.h" +#include "tsl/platform/status.h" +#include "tsl/profiler/lib/context_types.h" + +namespace tensorflow { +namespace profiler { + +// A track of events in the trace-viewer. +using TraceEventTrack = std::vector; + +// Merge-sorts the given event tracks. Each track must be sorted. +std::vector MergeEventTracks( + const std::vector& event_tracks); + +absl::Status DoStoreAsLevelDbTable( + std::unique_ptr& file, const Trace& trace, + const std::vector>& events_by_level); + +absl::Status DoLoadFromLevelDbTable( + const std::string& filename, + std::unique_ptr filter, + std::unique_ptr visibility_filter, + int64_t filter_by_visibility_threshold, Trace& trace, + bool& filter_by_visibility, + const std::function& copy_event_to_arena, + const std::function& add_arena_event); + +// Reads the trace metadata from a file with given path +absl::Status ReadFileTraceMetadata(std::string& filepath, Trace* trace); + +std::vector> GetEventsByLevel( + const Trace& trace, std::vector& events); + +// Return the minimum duration an event can have in `level`. +uint64_t LayerResolutionPs(unsigned level); + +// Returns bounds (in picoseconds) for the level that an event +// with `duration_ps` would go into. (upper >= duration_ps > lower) +std::pair GetLevelBoundsForDuration(uint64_t duration_ps); + +struct EventFactory { + TraceEvent* Create() { + events.push_back(std::make_unique()); + return events.back().get(); + } + std::vector> events; +}; + +struct DefaultStdHash { + size_t operator()(absl::string_view input) { + return std::hash()(input); + } +}; + +template +class TraceEventsContainerBase { + public: + TraceEventsContainerBase() { + arenas_.insert(std::make_shared()); + } + + // Movable but non-copyable. + TraceEventsContainerBase(TraceEventsContainerBase&&) = default; + TraceEventsContainerBase& operator=(TraceEventsContainerBase&&) = default; + TraceEventsContainerBase(const TraceEventsContainerBase&) = delete; + TraceEventsContainerBase& operator=(const TraceEventsContainerBase&) = delete; + + // Creates a TraceEvent prefilled with the given values. 
+ void AddCompleteEvent(absl::string_view name, uint32_t resource_id, + uint32_t device_id, tsl::profiler::Timespan timespan, + RawData* raw_data = nullptr, + std::optional group_id = std::nullopt, + std::optional serial = std::nullopt) { + TraceEvent* event = CreateArenaEvent(); + MaybeInternEventName(event, name); + event->set_resource_id(resource_id); + event->set_device_id(device_id); + event->set_timestamp_ps(timespan.begin_ps()); + if (timespan.duration_ps() != 0) { + event->set_duration_ps(timespan.duration_ps()); + } + if (raw_data) { + MaybeInternTraceArgument(raw_data); + raw_data->SerializePartialToString(event->mutable_raw_data()); + if (event->raw_data().empty()) event->clear_raw_data(); + } + if (group_id) { + event->set_group_id(*group_id); + } + if (serial && *serial > 0) { + event->set_serial(static_cast(*serial)); + } + AddArenaEvent(event); + } + + // Similar to above, but the TraceEvent also has an associated flow_id and + // flow_entry_type, to make it part of a flow. + void AddFlowEvent(absl::string_view name, uint32_t resource_id, + uint32_t device_id, tsl::profiler::Timespan timespan, + uint64_t flow_id, TraceEvent::FlowEntryType flow_entry_type, + tsl::profiler::ContextType flow_category = + tsl::profiler::ContextType::kGeneric, + RawData* raw_data = nullptr, + std::optional group_id = std::nullopt, + std::optional serial = std::nullopt) { + TraceEvent* event = CreateArenaEvent(); + MaybeInternEventName(event, name); + event->set_resource_id(resource_id); + event->set_device_id(device_id); + event->set_timestamp_ps(timespan.begin_ps()); + if (timespan.duration_ps() != 0) { + event->set_duration_ps(timespan.duration_ps()); + } + event->set_flow_id(flow_id); + event->set_flow_entry_type(flow_entry_type); + event->set_flow_category(static_cast(flow_category)); + if (raw_data) { + MaybeInternTraceArgument(raw_data); + raw_data->SerializePartialToString(event->mutable_raw_data()); + if (event->raw_data().empty()) event->clear_raw_data(); + } + if (group_id) { + event->set_group_id(*group_id); + } + if (serial && *serial > 0) { + event->set_serial(static_cast(*serial)); + } + AddArenaEvent(event); + } + + // Similar to above, but the "async" TraceEvent don't have a resource id, its + // name is used as "async channel" which are used as "thread" name. It has an + // associated unique flow_id and flow_entry_type to signal asynchronous + // start and end events and match up between them. 
+ void AddAsyncEvent(absl::string_view name, uint32_t device_id, + tsl::profiler::Timespan timespan, uint64_t flow_id, + TraceEvent::FlowEntryType flow_entry_type, + tsl::profiler::ContextType flow_category = + tsl::profiler::ContextType::kGeneric, + RawData* raw_data = nullptr, + std::optional group_id = std::nullopt, + std::optional serial = std::nullopt) { + TraceEvent* event = CreateArenaEvent(); + MaybeInternEventName(event, name); + event->set_device_id(device_id); + event->set_timestamp_ps(timespan.begin_ps()); + if (timespan.duration_ps() != 0) { + event->set_duration_ps(timespan.duration_ps()); + } + event->set_flow_id(flow_id); + event->set_flow_entry_type(flow_entry_type); + event->set_flow_category(static_cast(flow_category)); + if (raw_data) { + MaybeInternTraceArgument(raw_data); + raw_data->SerializePartialToString(event->mutable_raw_data()); + if (event->raw_data().empty()) event->clear_raw_data(); + } + if (group_id) { + event->set_group_id(*group_id); + } + if (serial && *serial > 0) { + event->set_serial(static_cast(*serial)); + } + AddArenaEvent(event); + } + + // Similar to above, but the TraceEvent also has an associated counter name + // and value in RawData.args. Counter events are per device, so no resource_id + // is passed. + void AddCounterEvent(absl::string_view name, uint32_t device_id, + uint64_t timestamp_ps, const RawData& raw_data, + std::optional serial = std::nullopt) { + TraceEvent* event = CreateArenaEvent(); + event->set_name(name.data(), name.size()); + event->set_device_id(device_id); + // Do not set resource_id for counter events, they are per device. + event->set_timestamp_ps(timestamp_ps); + DCHECK(raw_data.has_args()); + DCHECK_EQ(raw_data.args().arg_size(), 1); + DCHECK(raw_data.args().arg(0).has_uint_value()); + raw_data.SerializePartialToString(event->mutable_raw_data()); + if (serial && *serial > 0) { + event->set_serial(static_cast(*serial)); + } + AddArenaEvent(event); + } + + // Returns a device descriptor. + Device* MutableDevice(uint32_t device_id) { + return &(*trace_.mutable_devices())[device_id]; + } + + // Returns a resource descriptor, + Resource* MutableResource(uint32_t resource_id, uint32_t device_id) { + Device* device = MutableDevice(device_id); + return &(*device->mutable_resources())[resource_id]; + } + + // Adds metadata events to set the name of each device and resource. + // The arguments are callbacks that return the names given ids. + // This must be called after all AddEvent calls, and no more AddEvent + // calls should be made after calling AddMetadataEvents. + void AddMetadataEvents( + const std::function& device_name, + const std::function& resource_name) { + for (const auto& id_and_device : events_by_device_) { + uint32_t device_id = id_and_device.first; + auto& device = (*trace_.mutable_devices())[device_id]; + device.set_device_id(device_id); + device.set_name(device_name(device_id)); + const DeviceEvents& device_events = id_and_device.second; + for (const auto& id_and_resource : device_events.events_by_resource) { + uint32_t resource_id = id_and_resource.first; + auto& resource = (*device.mutable_resources())[resource_id]; + resource.set_resource_id(resource_id); + resource.set_name(resource_name(device_id, resource_id)); + resource.set_num_events(id_and_resource.second.size()); + } + } + } + + // Adds task metadata for the given host. + void AddTask(int host_id, const Task& task) { + (*trace_.mutable_tasks())[host_id] = task; + } + + // Stores the contents of this container in a level-db sstable file. 
+ absl::Status StoreAsLevelDbTable( + std::unique_ptr file) const { + Trace trace = trace_; + trace.set_num_events(NumEvents()); + auto events_by_level = EventsByLevel(); + return DoStoreAsLevelDbTable(file, trace, events_by_level); + } + + std::vector> GetTraceEventsByLevel() const { + return EventsByLevel(); + } + + // Loads the contents of this container from a level-db sstable file. + // In order to be efficient, requires resolution__ to be set. + // If span_ is not set, it is initialized from the loaded trace_. + absl::Status LoadFromLevelDbTable( + const std::string& filename, + std::unique_ptr filter = nullptr, + std::unique_ptr visibility = nullptr, + int64_t filter_by_visibility_threshold = -1LL) { + return DoLoadFromLevelDbTable( + filename, std::move(filter), std::move(visibility), + filter_by_visibility_threshold, trace_, filter_by_visibility_, + absl::bind_front(&TraceEventsContainerBase::CopyEventToArena, this), + absl::bind_front(&TraceEventsContainerBase::AddArenaEvent, this)); + } + + // Calls 'callback' with all events stored in this container. + template + void ForAllEvents(Callback callback) const { + for (const auto& [device_id, device] : events_by_device_) { + for (const auto& [counter_name, events] : device.counter_events_by_name) { + for (auto* event : events) { + callback(*event); + } + } + for (const auto& [resource_id, events] : device.events_by_resource) { + for (auto* event : events) { + callback(*event); + } + } + } + } + + // Calls 'callback' with all event tracks stored in this container. + template + void ForAllTracks(Callback callback) const { + for (const auto& [device_id, device] : events_by_device_) { + for (const auto& [counter_name, events] : device.counter_events_by_name) { + if (!events.empty()) { + if (ABSL_PREDICT_FALSE(!callback(device_id, counter_name, events))) + return; + } + } + for (const auto& [resource_id, events] : device.events_by_resource) { + if (!events.empty()) { + if (ABSL_PREDICT_FALSE(!callback(device_id, resource_id, events))) + return; + } + } + } + } + + // Calls 'callback' with all event tracks stored in this container. + template + void ForAllMutableTracks(Callback callback) const { + for (auto& [device_id, device] : events_by_device_) { + for (auto& [counter_name, events] : device.counter_events_by_name) { + if (!events.empty()) { + callback(device_id, counter_name, &events); + } + } + for (auto& [resource_id, events] : device.events_by_resource) { + if (!events.empty()) { + callback(device_id, resource_id, &events); + } + } + } + } + + // Calls 'callback' with all event flows stored in this container. + template + void ForAllFlows(Callback callback) const { + absl::flat_hash_map flows; + for (const auto& [device_id, device] : events_by_device_) { + // Counter events are not flow events. + for (const auto& [resource_id, events] : device.events_by_resource) { + for (auto* event : events) { + if (event->has_flow_id()) flows[event->flow_id()].push_back(event); + } + } + } + for (auto& [flow_id, combined_flow] : flows) { + // If the flow_id is reused, split into individual flows. + for (auto& flow : SplitEventFlow(std::move(combined_flow))) { + callback(flow_id, flow); + } + } + } + + // Returns the metadata for this trace container. + const Trace& trace() const { return trace_; } + + // Returns the number of events. 
+ size_t NumEvents() const { + size_t count = 0; + for (const auto& [device_id, device] : events_by_device_) { + for (const auto& [counter_name, events] : device.counter_events_by_name) { + count += events.size(); + } + for (const auto& [resource_id, events] : device.events_by_resource) { + count += events.size(); + } + } + return count; + } + + // Returns the number of tracks. + size_t NumTracks() const { + return std::accumulate( + events_by_device_.begin(), events_by_device_.end(), 0, + [](const size_t tracks, const std::pair item) { + return tracks + item.second.counter_events_by_name.size() + + item.second.events_by_resource.size(); + }); + } + + bool FilterByVisibility() const { return filter_by_visibility_; } + + protected: + // Allocates an event in the first of the arenas_. + TraceEvent* CreateArenaEvent() { return (*arenas_.begin())->Create(); } + + // Copies event into arenas_. + TraceEvent* CopyEventToArena(const TraceEvent& event) { + TraceEvent* copy = CreateArenaEvent(); + *copy = event; + return copy; + } + + // Adds an event from arenas_ to events_by_device_. + void AddArenaEvent(TraceEvent* event) { + ExpandTraceSpan(EventSpan(*event), &trace_); + DeviceEvents& device_events = events_by_device_[event->device_id()]; + if (!event->has_resource_id()) { + device_events.counter_events_by_name[event->name()].push_back(event); + } else { + device_events.events_by_resource[event->resource_id()].push_back(event); + } + } + + // Returns all events grouped by visibility level. + std::vector> EventsByLevel() const { + std::vector events = SortedEvents(); + return GetEventsByLevel(trace_, events); + } + + // Returns all events sorted using TraceEventsComparator. + // Helper for EventsByLevel(). + // REQUIRED: All events have been added and SortTracks() has been called. + std::vector SortedEvents() const { + std::vector event_tracks; + event_tracks.reserve(NumTracks()); + ForAllMutableTracks( + [&event_tracks](uint32_t device_id, + std::variant resource_id, + TraceEventTrack* events) { + event_tracks.push_back(events); + }); + return MergeEventTracks(event_tracks); + } + + uint64_t MaybeInternString(absl::string_view name) { + uint64_t fp = hash_(name); + auto& it = (*trace_.mutable_name_table())[fp]; + if (it.empty()) { + it = name; + } + return fp; + } + + void MaybeInternEventName(TraceEvent* event, absl::string_view name) { + static constexpr size_t kNameInternThreshold = 32; + if (name.size() > kNameInternThreshold) { + event->set_name_ref(MaybeInternString(name)); + } else { + event->set_name(name.data(), name.size()); + } + } + + void MaybeInternTraceArgument(RawData* raw_data) { + if (raw_data->has_args()) { + for (auto& arg : *raw_data->mutable_args()->mutable_arg()) { + constexpr size_t kTraceArgInternThreshold = 16; + if (arg.has_str_value() && + arg.str_value().size() > kTraceArgInternThreshold) { + // Use name table to string intern the trace argument. + if (arg.name() == "long_name" || arg.name() == "hlo_text") { + // Also mark it as potential stack frame. + arg.set_ref_value(MaybeInternString("@@" + arg.str_value())); + } else { + arg.set_ref_value(MaybeInternString(arg.str_value())); + } + } + } + } + } + + // Events shown within a single device. + struct DeviceEvents { + // Counter events, which are per-device (don't have resource_id), and are + // plotted in different tracks for each counter name. + absl::flat_hash_map counter_events_by_name; + + // Complete events and flow events, mapped by resource_id. 
+ std::map events_by_resource; + }; + + // Events, mapped by device_id. + mutable std::map events_by_device_; + + // Indicator on if visibility filtering is applied or not + // Currently skip visibility filtering only applies to ssTable + bool filter_by_visibility_ = true; + + // The arenas containing events constructed in this container or in containers + // that have been merged into this container. + using Arenas = absl::flat_hash_set>; + Arenas arenas_; + + Trace trace_; + Hash hash_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_filter_interface.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_filter_interface.h new file mode 100644 index 00000000..24f63203 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_filter_interface.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_FILTER_INTERFACE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_FILTER_INTERFACE_H_ + +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" + +namespace tensorflow { +namespace profiler { + +// Trace event filter interface. +class TraceEventsFilterInterface { + public: + virtual ~TraceEventsFilterInterface() = default; + + // Allow sub-classes to set up filtering by processing the trace, e.g., by + // capturing the names of devices and resources that need to be filtered. + virtual void SetUp(const Trace& trace) = 0; + + // Returns true if event should not be added to a TraceEventsContainer. + virtual bool Filter(const TraceEvent& event) = 0; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_FILTER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h new file mode 100644 index 00000000..873a791d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h @@ -0,0 +1,610 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
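Aside (not part of the patch): a hedged end-to-end sketch for TraceEventsContainerBase from trace_events.h above: add one complete event, attach device/resource names, and count events. The concrete container alias used by the tools is defined elsewhere, so the function is templated on the container type, and the metadata callback signatures are assumed from their uses in the header.

#include <cstdint>
#include <string>

#include "xla/tsl/profiler/utils/timespan.h"
#include "tensorflow/core/profiler/convert/trace_viewer/trace_events.h"

namespace tensorflow {
namespace profiler {

// ContainerT is any instantiation of TraceEventsContainerBase; the alias the
// tools actually use is not part of this header.
template <typename ContainerT>
size_t BuildTinyTrace(ContainerT& container) {
  // One 5us complete event on device 0, resource 1, starting at t=1us.
  container.AddCompleteEvent(
      "MatMul", /*resource_id=*/1, /*device_id=*/0,
      tsl::profiler::Timespan(/*begin_ps=*/1'000'000,
                              /*duration_ps=*/5'000'000));
  // Name the device and resource tracks after all events have been added
  // (callback signatures assumed: name from id, and name from device+resource).
  container.AddMetadataEvents(
      [](uint32_t device_id) {
        return "/device:TPU:" + std::to_string(device_id);
      },
      [](uint32_t device_id, uint32_t resource_id) {
        return "stream " + std::to_string(resource_id);
      });
  return container.NumEvents();
}

}  // namespace profiler
}  // namespace tensorflow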
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_TO_JSON_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_TO_JSON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/macros.h" +#include "absl/container/fixed_array.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" +#include "absl/time/time.h" +#include "absl/types/optional.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_events_util.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_viewer_color.h" +#include "tensorflow/core/profiler/lib/context_types.h" +#include "tensorflow/core/profiler/protobuf/task.pb.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" +#include "tensorflow/core/profiler/protobuf/trace_events_raw.pb.h" +#include "tsl/platform/protobuf.h" +#include "tsl/profiler/lib/context_types.h" + +namespace tensorflow { +namespace profiler { + +// JSON generation options. +struct JsonTraceOptions { + using Details = std::vector>; + + // Options and values for filtering based on the "details" menu. + Details details; + + // If selected_device_ids is set, we add a field "selected_device_ids" + // in the Trace JSON. + std::optional> selected_device_ids; + + // Device IDs of devices whose resources should be sorted by name instead of + // by resource ID. + absl::flat_hash_set sort_resources_by_name; + + // Returns the color for an event. + TraceEventsColorerInterface* colorer = nullptr; + + bool generate_stack_frames = true; + bool use_new_backend = false; + std::string code_link; +}; + +// Counts generated JSON events by type. +class JsonEventCounter { + public: + JsonEventCounter() : event_count_(kNumEventTypes, 0) {} + ~JsonEventCounter() { LOG(INFO) << ToString(); } + + // Types of JSON events (bit.ly/trace-event-format) + enum EventType { + kCompleteEvent, + kCompleteEventWithFlow, + kCounterEvent, + kAsyncEvent, + }; + + void Inc(EventType e) { ++event_count_[e]; } + + std::string ToString() const { + std::string output = "Generated JSON events:"; + for (size_t i = 0; i < event_count_.size(); ++i) { + absl::StrAppend(&output, " ", kEventTypeName[i], ": ", event_count_[i]); + } + return output; + } + + private: + static constexpr absl::string_view kEventTypeName[] = { + "complete", + "complete+flow", + "counter", + "async", + }; + + static constexpr size_t kNumEventTypes = ABSL_ARRAYSIZE(kEventTypeName); + + absl::FixedArray event_count_; +}; + +// Adds a separator between elements of a JSON array or object. +template +class JsonSeparator { + public: + explicit JsonSeparator(IOBuffer* output) : output_(output) {} + + // Does nothing on the first call; adds a comma to the output on subsequent + // calls. + void Add() { + output_->Append(sep_); + sep_ = ","; + } + + private: + IOBuffer* output_; + absl::string_view sep_; +}; + +// Converts picoseconds to microseconds. +inline double PicosToMicros(uint64_t ps) { return ps / 1E6; } + +// Escapes the contents of "raw" in JSON style. +// Also adds double quotes to the beginning and end of the string. 
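Aside (not part of the patch): a small sketch of the JsonSeparator pattern from trace_events_to_json.h above, with a toy output buffer. The real IOBuffer type is supplied by callers of JsonEventWriter; the only interface assumed here is the variadic Append used throughout the header, and the stripped template parameter is assumed to be that buffer type.

#include <initializer_list>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h"

namespace tensorflow {
namespace profiler {

// Toy buffer for illustration only.
struct StringBuffer {
  template <typename... Ts>
  void Append(const Ts&... pieces) {
    absl::StrAppend(&data, pieces...);
  }
  std::string data;
};

// Emits "[1,2,3]": Add() is a no-op before the first element and appends ","
// before every subsequent one.
inline std::string JsonArrayExample() {
  StringBuffer out;
  JsonSeparator<StringBuffer> separator(&out);
  out.Append("[");
  for (int v : {1, 2, 3}) {
    separator.Add();
    out.Append(v);
  }
  out.Append("]");
  return out.data;
}

}  // namespace profiler
}  // namespace tensorflow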
+std::string JsonEscape(absl::string_view raw); + +std::string ProtoString(const tsl::protobuf::Message& pb); + +template +void WriteTpuData(const RawDataType& data, JsonSeparator* separator, + IOBuffer* output) {} + +// Writes JSON events from a TraceEvent. +template +class JsonEventWriter { + public: + JsonEventWriter(const TraceEventsColorerInterface* colorer, + const Trace& trace, + const std::map& references, + IOBuffer* output) + : colorer_(colorer), + trace_(trace), + references_(references), + output_(output) {} + + void WriteEvent(const TraceEvent& event) const { + std::optional async_event; + output_->Append(R"({"pid":)", event.device_id()); + if (event.has_resource_id()) { + output_->Append(R"(,"tid":)", event.resource_id()); + } + const std::string& event_name = + event.has_name_ref() ? trace_.name_table().at(event.name_ref()) + : event.name(); + output_->Append(R"(,"name":)", JsonEscape(event_name)); + tsl::profiler::Timespan span = EventSpan(event); + // "%.17g" is the default double format in proto2::util::JsonFormat. + absl::Format(output_, R"(,"ts":%.17g)", PicosToMicros(span.begin_ps())); + JsonEventCounter::EventType event_type = JsonEventCounter::kCounterEvent; + if (event.has_resource_id()) { + event_type = event.has_flow_id() + ? JsonEventCounter::kCompleteEventWithFlow + : JsonEventCounter::kCompleteEvent; + // A complete event must have a duration, otherwise trace-viewer will + // extend the event to the end of the trace and append "(Did Not Finish)" + // to its name. Make the minimum duration 1 picosecond. + uint64_t duration_ps = std::max(span.duration_ps(), uint64_t{1}); + absl::Format(output_, R"(,"dur":%.17g)", PicosToMicros(duration_ps)); + + if (std::optional color_id = colorer_->GetColor(event)) { + output_->Append(R"(,"cname":)", TraceViewerColorName(*color_id)); + } + + // FlowV2 + if (event_type == JsonEventCounter::kCompleteEventWithFlow) { + output_->Append(R"(,"bind_id":)", event.flow_id()); + if (event.has_flow_category()) { + tsl::profiler::ContextType type = + tsl::profiler::GetSafeContextType(event.flow_category()); + if (type != tsl::profiler::ContextType::kGeneric && + type != tsl::profiler::ContextType::kLegacy) { + const char* category = tsl::profiler::GetContextTypeString(type); + output_->Append(R"(,"cat":")", category, R"(")"); + } + } + switch (event.flow_entry_type()) { + case TraceEvent::FLOW_NONE: + // The caller prevents this case from happening. + break; + case TraceEvent::FLOW_START: + output_->Append(R"(,"flow_out":true)"); + break; + case TraceEvent::FLOW_MID: + output_->Append(R"(,"flow_in":true,"flow_out":true)"); + break; + case TraceEvent::FLOW_END: + output_->Append(R"(,"flow_in":true)"); + break; + } + } + output_->Append(R"(,"ph":"X")"); + } else { + event_type = event.has_flow_id() ? JsonEventCounter::kAsyncEvent + : JsonEventCounter::kCounterEvent; + if (event_type == JsonEventCounter::kCounterEvent) { + output_->Append(R"(,"ph":"C")"); + } else { // async events + output_->Append(R"(,"id":)", event.flow_id()); + if (event.has_flow_category()) { + tsl::profiler::ContextType type = + tsl::profiler::GetSafeContextType(event.flow_category()); + const char* category = tsl::profiler::GetContextTypeString(type); + output_->Append(R"(,"cat":")", category, R"(")"); + } + switch (event.flow_entry_type()) { + case TraceEvent::FLOW_NONE: + // The caller prevents this case from happening. 
+ break; + case TraceEvent::FLOW_START: + output_->Append(R"(,"ph":"b")"); + break; + case TraceEvent::FLOW_END: + output_->Append(R"(,"ph":"e")"); + break; + case TraceEvent::FLOW_MID: + output_->Append(R"(,"ph":"b")"); + async_event.emplace(event); + async_event->set_flow_entry_type(TraceEvent::FLOW_END); + async_event->set_timestamp_ps(event.timestamp_ps() + + event.duration_ps()); + async_event->clear_raw_data(); + break; + } + } + } + WriteArgs(event); + if (event.has_serial()) { + output_->Append(R"(,"z":)", event.serial()); + } + + output_->Append("}"); + counter_.Inc(event_type); + if (async_event) { + output_->Append(","); + WriteEvent(*async_event); + } + } + + private: + void WriteArgs(const TraceEvent& event) const { + if (!event.has_group_id() && !event.has_raw_data()) { + return; + } + output_->Append(R"(,"args":{)"); + std::optional stack_frames; + JsonSeparator separator(output_); + if (event.has_group_id()) { + separator.Add(); + output_->Append(R"("group_id":)", event.group_id()); + } + if (event.has_raw_data()) { + RawDataType data; + data.ParseFromString(event.raw_data()); + switch (data.raw_data_case()) { + case RawDataType::RAW_DATA_NOT_SET: + break; + case RawDataType::kTpuData: + WriteTpuData(data, &separator, output_); + break; + case RawDataType::kDmaActivity: + separator.Add(); + output_->Append(R"("DMA activity":)", + ProtoString(data.dma_activity())); + break; + case RawDataType::kArgs: + for (const auto& arg : data.args().arg()) { + switch (arg.value_case()) { + case TraceEventArguments::Argument::kStrValue: + separator.Add(); + WriteArg(arg.name(), arg.str_value()); + break; + case TraceEventArguments::Argument::kIntValue: + separator.Add(); + WriteArg(arg.name(), arg.int_value()); + break; + case TraceEventArguments::Argument::kUintValue: + separator.Add(); + WriteArg(arg.name(), arg.uint_value()); + break; + case TraceEventArguments::Argument::kDoubleValue: + separator.Add(); + WriteArg(arg.name(), arg.double_value()); + break; + case TraceEventArguments::Argument::kRefValue: { + const auto& it = trace_.name_table().find(arg.ref_value()); + if (it != trace_.name_table().end()) { + // Each event could only have one stack frame. + if (absl::StartsWith(it->second, "@@") && !stack_frames) { + stack_frames = arg.ref_value(); + } else { + separator.Add(); + WriteArg(arg.name(), it->second); + } + } + break; + } + case TraceEventArguments::Argument::VALUE_NOT_SET: + break; + } + } + break; + } + } + output_->Append("}"); + + // Write the optional stack frame. + if (stack_frames.has_value()) { + output_->Append(R"(,"sf":)", references_.at(*stack_frames), R"()"); + } + } + void WriteArg(absl::string_view name, absl::string_view value) const { + output_->Append(JsonEscape(name), ":", JsonEscape(value)); + } + void WriteArg(absl::string_view name, uint64_t value) const { + // Limit beyond which integers converted to 64-bit IEEE floating point may + // lose accuracy. JavaScript stores all numbers as doubles, quote the value + // to preserve accuracy. + // https://en.wikipedia.org/wiki/Double-precision_floating-point_format + constexpr uint64_t kIeeeLimit = 1ULL << 53; + if (value > kIeeeLimit) { + output_->Append(JsonEscape(name), ":\"", value, "\""); + } else { + output_->Append(JsonEscape(name), ":", value); + } + } + void WriteArg(absl::string_view name, int64_t value) const { + // Limit beyond which integers converted to 64-bit IEEE floating point may + // lose accuracy. JavaScript stores all numbers as doubles, quote the value + // to preserve accuracy. 
+ // https://en.wikipedia.org/wiki/Double-precision_floating-point_format + constexpr uint64_t kIeeeLimit = 1ULL << 53; + if (abs(value) > kIeeeLimit) { + output_->Append(JsonEscape(name), ":\"", value, "\""); + } else { + output_->Append(JsonEscape(name), ":", value); + } + } + void WriteArg(absl::string_view name, double value) const { + if (std::isfinite(value)) { + output_->Append(JsonEscape(name)); + // "%.17g" is the default double format in proto2::util::JsonFormat. + absl::Format(output_, ":%.17g", value); + } else if (std::isinf(value)) { + output_->Append(JsonEscape(name), R"(:"Infinity")"); + } else if (std::isinf(-value)) { + output_->Append(JsonEscape(name), R"(:"-Infinity")"); + } else { + output_->Append(JsonEscape(name), R"(:"NaN")"); + } + } + + const TraceEventsColorerInterface* colorer_; + const Trace& trace_; + const std::map& references_; + IOBuffer* output_; + mutable JsonEventCounter counter_; +}; + +template +void WriteTasks(const Trace& trace, IOBuffer* output) { + const auto& tasks = trace.tasks(); + if (tasks.empty()) return; + output->Append(R"("tasks":[)"); + JsonSeparator task_separator(output); + std::map ordered_tasks(tasks.begin(), tasks.end()); + for (const auto& entry : ordered_tasks) { + const uint32_t host_id = entry.first; + const auto& task = entry.second; + + task_separator.Add(); + output->Append("{"); + JsonSeparator field_separator(output); + field_separator.Add(); + output->Append(R"("host_id":)", host_id); + if (task.has_changelist()) { + field_separator.Add(); + output->Append(R"("changelist":)", task.changelist()); + } + if (task.has_clean_build()) { + field_separator.Add(); + output->Append(R"("clean_build":)", task.clean_build()); + } + if (task.has_build_time()) { + field_separator.Add(); + output->Append( + R"("build_time":)", + JsonEscape(absl::FormatTime(absl::FromUnixNanos(task.build_time()), + absl::UTCTimeZone()))); + } + if (task.has_build_target()) { + field_separator.Add(); + output->Append(R"("build_target":)", JsonEscape(task.build_target())); + } + if (task.has_command_line()) { + field_separator.Add(); + output->Append(R"("command_line":)", JsonEscape(task.command_line())); + } + if (task.has_start_time()) { + field_separator.Add(); + output->Append( + R"("start_time":)", + JsonEscape(absl::FormatTime(absl::FromUnixNanos(task.start_time()), + absl::UTCTimeZone()))); + } + if (task.has_gtc_freq_hz()) { + field_separator.Add(); + output->Append(R"("gtc_freq_hz":)", task.gtc_freq_hz()); + } + if (task.has_tensor_core_freq_hz()) { + field_separator.Add(); + output->Append(R"("tensor_core_freq_hz":)", task.tensor_core_freq_hz()); + } + if (task.has_sparse_core_freq_hz()) { + field_separator.Add(); + output->Append(R"("sparse_core_freq_hz":)", task.sparse_core_freq_hz()); + } + output->Append("}"); + } + output->Append("],"); +} + +template +void WriteStackFrames(const Trace& trace, + const std::map& references, + IOBuffer* output) { + const auto& name_table = trace.name_table(); + output->Append(R"("stackFrames":{)"); + JsonSeparator separator(output); + for (const auto& [fp, name] : name_table) { + if (!absl::StartsWith(name, "@@")) continue; + separator.Add(); + std::string_view name_view = name; + absl::ConsumePrefix(&name_view, "@@"); + output->Append(R"(")", references.at(fp), R"(":{"name":)", + JsonEscape(name_view), R"(})"); + } + output->Append("},"); +} + +template +void WriteDetails(const JsonTraceOptions::Details& details, IOBuffer* output) { + if (details.empty()) return; + output->Append(R"("details":[)"); + 
JsonSeparator separator(output); + for (const auto& detail : details) { + separator.Add(); + output->Append(R"({"name":)", JsonEscape(detail.first), R"(,"value":)", + detail.second ? "true" : "false", "}"); + } + output->Append("],"); +} + +template +void WriteSelectedDeviceIds( + const absl::optional>& selected_device_ids, + IOBuffer* output) { + if (!selected_device_ids.has_value()) return; + + output->Append(R"("selected_device_ids":[)"); + JsonSeparator separator(output); + for (const auto& device_id : selected_device_ids.value()) { + separator.Add(); + output->Append(device_id); + } + output->Append("],"); +} + +std::map BuildStackFrameReferences(const Trace& trace); + +template +void WriteReturnedEventsSize(const int events_size, IOBuffer* output) { + output->Append(R"("returnedEventsSize":)", events_size, R"(,)"); +} + +template +void WriteFilteredByVisibility(bool filtered_by_visibility, IOBuffer* output) { + absl::string_view filtered_by_visibility_str = + filtered_by_visibility ? "true" : "false"; + output->Append(R"("filteredByVisibility":)", filtered_by_visibility_str, + R"(,)"); +} + +template +void WriteTraceFullTimespan(const Trace* trace, IOBuffer* output) { + auto start_time_ms = trace->min_timestamp_ps() / 1000000000.0; + auto end_time_ms = trace->max_timestamp_ps() / 1000000000.0; + output->Append(R"("fullTimespan":[)", start_time_ms, R"(,)", end_time_ms, + R"(],)"); +} + +template +void TraceEventsToJson(const JsonTraceOptions& options, + const TraceEventsContainer& events, IOBuffer* output) { + // Set the displayTimeUnit to nanoseconds (default is milliseconds), so the UI + // uses higher-precision when manipulating event times. Note that the + // timestamps of trace events are always given in microseconds. + output->Append( + R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true}, "codeLink":")", + options.code_link, R"(",)"); + + output->Append(absl::StrFormat(R"("useNewBackend": %s,)", + options.use_new_backend ? "true" : "false")); + WriteDetails(options.details, output); + WriteSelectedDeviceIds(options.selected_device_ids, output); + WriteReturnedEventsSize(events.NumEvents(), output); + WriteFilteredByVisibility(events.FilterByVisibility(), output); + WriteTraceFullTimespan(&events.trace(), output); + + const Trace& trace = events.trace(); + + WriteTasks(trace, output); + + auto references = BuildStackFrameReferences(trace); + if (options.generate_stack_frames) { + WriteStackFrames(trace, references, output); + } + + output->Append(R"("traceEvents":[)"); + JsonSeparator separator(output); + // Write metadata events. 
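+  // Named devices emit "process_name" and every device emits "process_sort_index";
+  // named resources emit "thread_name" and, unless the device sorts its resources
+  // by name, "thread_sort_index". All metadata records use "ph":"M".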
+ std::map ordered_devices(trace.devices().begin(), + trace.devices().end()); + for (const auto& [device_id, device] : ordered_devices) { + if (device.has_name()) { + separator.Add(); + output->Append(R"({"args":{"name":)", JsonEscape(device.name()), + R"(},"name":"process_name","ph":"M","pid":)", device_id, + R"(,"thread_count":)", device.resources_size(), "}"); + } + separator.Add(); + output->Append(R"({"args":{"sort_index":)", device_id, + R"(},"name":"process_sort_index","ph":"M","pid":)", + device_id, "}"); + std::map ordered_resources(device.resources().begin(), + device.resources().end()); + for (const auto& [resource_id, resource] : ordered_resources) { + if (resource.has_name()) { + separator.Add(); + output->Append(R"({"args":{"name":)", JsonEscape(resource.name()), + R"(},"name":"thread_name","ph":"M","pid":)", device_id, + R"(,"tid":)", resource_id, "}"); + } + if (!options.sort_resources_by_name.count(device_id)) { + separator.Add(); + output->Append(R"({"args":{"sort_index":)", resource_id, + R"(},"name":"thread_sort_index","ph":"M","pid":)", + device_id, R"(,"tid":)", resource_id, "}"); + } + } + } + + TraceEventsColorerInterface* colorer = options.colorer; + DefaultTraceEventsColorer default_colorer; + if (colorer == nullptr) colorer = &default_colorer; + colorer->SetUp(trace); + + // Write events. + JsonEventWriter writer(colorer, trace, references, + output); + events.ForAllEvents([&](const TraceEvent& event) { + separator.Add(); + writer.WriteEvent(event); + }); + output->Append("]}"); +} + +class IOBufferAdapter { + public: + explicit IOBufferAdapter(std::string* output) : output_(output) {} + + template + inline void Append(AV&&... args) { + absl::StrAppend(output_, std::forward(args)...); + } + + // Support IOBufferAdapter as a sink object for absl::Format. + friend void AbslFormatFlush(IOBufferAdapter* buffer, absl::string_view s) { + absl::StrAppend(buffer->output_, s); + } + + private: + std::string* output_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_TO_JSON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_util.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_util.h new file mode 100644 index 00000000..832da3f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_events_util.h @@ -0,0 +1,168 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_UTIL_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_UTIL_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" + +namespace tensorflow { +namespace profiler { + +// Returns the resource name for the given (device_id, resource_id) in trace. +inline absl::string_view ResourceName(const Trace& trace, uint32_t device_id, + uint32_t resource_id) { + return trace.devices().at(device_id).resources().at(resource_id).name(); +} + +// Returns the resource name for the given event in trace. +inline absl::string_view ResourceName(const Trace& trace, + const TraceEvent& event) { + return ResourceName(trace, event.device_id(), event.resource_id()); +} + +// Functor that compares trace events for sorting. +// Trace events are sorted by timestamp_ps (ascending) and duration_ps +// (descending) so nested events are sorted from outer to innermost. +struct TraceEventsComparator { + bool operator()(const TraceEvent* a, const TraceEvent* b) const { + if (a->timestamp_ps() < b->timestamp_ps()) return true; + if (a->timestamp_ps() > b->timestamp_ps()) return false; + return (a->duration_ps() > b->duration_ps()); + } +}; + +// Creates a tsl::profiler::Timespan from a TraceEvent. +inline tsl::profiler::Timespan EventSpan(const TraceEvent& event) { + return tsl::profiler::Timespan(event.timestamp_ps(), event.duration_ps()); +} + +// Creates a tsl::profiler::Timespan from a Trace. +inline tsl::profiler::Timespan TraceSpan(const Trace& trace) { + return tsl::profiler::Timespan::FromEndPoints(trace.min_timestamp_ps(), + trace.max_timestamp_ps()); +} + +// A flow of events in the trace-viewer. +// All events in the flow have the same flow_id. +using TraceEventFlow = std::vector; + +// In case the flow_id was re-used, split into individual flows based on the +// flow_entry_type. +std::vector SplitEventFlow(TraceEventFlow&& flow); + +// Returns whether the flow is complete. +inline bool IsCompleteFlow(const TraceEventFlow& flow) { + DCHECK(!flow.empty()); + return flow.front()->flow_entry_type() == TraceEvent::FLOW_START && + flow.back()->flow_entry_type() == TraceEvent::FLOW_END; +} + +// Updates the timestamps of a Trace to ensure it includes the given +// tsl::profiler::Timespan. +void ExpandTraceSpan(const tsl::profiler::Timespan& span, Trace* trace); + +// Nway-merge implementation. + +// Reorders the elements of the range [first, last) to restore the heap +// condition (i.e. `std::is_heap(first, last, comp)`) following a change +// in the value of `*first`. +// +// REQUIRES: `first < last`, and [first, last) would be a valid heap if `*first` +// had a suitable value. +template +void push_down_root(RandIt first, RandIt last, Compare comp) { + size_t size = last - first; + size_t hole = 0; // root. 
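+  // Sift down: while a child compares greater than the saved root value (per
+  // comp), move that child up into the hole, then drop the value into place.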
+ auto value = std::move(*first); + while (true) { + size_t l_child = 2 * hole + 1; + size_t r_child = l_child + 1; + size_t max_child = l_child; + if (r_child < size && comp(first[l_child], first[r_child])) { + max_child = r_child; + } + if (max_child >= size) break; + if (!comp(value, first[max_child])) break; + first[hole] = std::move(first[max_child]); + hole = max_child; + } + first[hole] = std::move(value); +} + +// ContainerContainer could be a container of pointers to container. +template +Out nway_merge(const ContainerContainer& containers, Out out, Cmp cmp) { + using std::begin; + using std::end; + using In = decltype(begin(**begin(containers))); // The input iterator type. + using Range = std::pair; + std::vector sources; + for (const auto& container : containers) { + Range r(begin(*container), end(*container)); + if (r.first != r.second) { + sources.push_back(r); + } + } + if (sources.empty()) return out; + // Take a comparator for T and produce an inverse comparator + // for std::pair, In>, inverted so as to produce a min-heap. + auto heap_cmp = [&](const Range& a, const Range& b) { + // Compares b < a instead of a < b. + return cmp(*b.first, *a.first); + }; + std::make_heap(sources.begin(), sources.end(), heap_cmp); + while (true) { + Range& r = sources.front(); + *out = *r.first; + ++r.first; + ++out; + if (r.first == r.second) { + if (sources.size() == 1) return out; + r = std::move(sources.back()); + sources.pop_back(); + } + push_down_root(sources.begin(), sources.end(), heap_cmp); + } +} + +// Interface that allows defining classes that map XLines within a single XPlane +// to multiple virtual devices in trace viewer. +class ResourceGrouperInterface { + public: + virtual ~ResourceGrouperInterface() = default; + + virtual std::vector> + Devices() const = 0; + + virtual uint32_t GetDeviceId(uint32_t resource_id) const = 0; +}; + +std::unique_ptr CreateDefaultResourceGrouper( + uint32_t device_id, absl::string_view name); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_EVENTS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_color.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_color.h new file mode 100644 index 00000000..be2bb9f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_color.h @@ -0,0 +1,98 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_COLOR_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_COLOR_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" + +namespace tensorflow { +namespace profiler { + +// Pre-defined color names (excluding "black" and "white") from: +// https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html. +// Possible value of TraceEvent.color_id +enum TraceViewerColor { + kThreadStateUninterruptible, + kThreadStateIowait, + kThreadStateRunning, + kThreadStateRunnable, + kThreadStateUnknown, + kBackgroundMemoryDump, + kLightMemoryDump, + kDetailedMemoryDump, + kVsyncHighlightColor, + kGenericWork, + kGood, + kBad, + kTerrible, + kGrey, + kYellow, + kOlive, + kRailResponse, + kRailAnimation, + kRailIdle, + kRailLoad, + kStartup, + kHeapDumpStackFrame, + kHeapDumpObjectType, + kHeapDumpChildNodeArrow, + kCqBuildRunning, + kCqBuildPassed, + kCqBuildFailed, + kCqBuildAbandoned, + kCqBuildAttemptRunnig, + kCqBuildAttemptPassed, + kCqBuildAttemptFailed, +}; + +// Number of named colors in TraceViewer. +constexpr uint32_t kNumTraceViewerColors = + TraceViewerColor::kCqBuildAttemptFailed + 1; + +// Returns the color name for a given color id. +// Used to decode the value in TraceEvent.color_id. +absl::string_view TraceViewerColorName(uint32_t color_id); + +// Trace event colorer interface. +class TraceEventsColorerInterface { + public: + virtual ~TraceEventsColorerInterface() = default; + + // Allow sub-classes to set up coloring by processing the trace, e.g., by + // capturing the names of devices and resources that need to be colored. + virtual void SetUp(const Trace& trace) = 0; + + // Returns the color for a trace event. + virtual std::optional GetColor(const TraceEvent& event) const = 0; +}; + +class DefaultTraceEventsColorer : public TraceEventsColorerInterface { + public: + void SetUp(const Trace& trace) override {} + + std::optional GetColor(const TraceEvent& event) const override { + return std::nullopt; + } +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_COLOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_visibility.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_visibility.h new file mode 100644 index 00000000..13dfabe5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/trace_viewer/trace_viewer_visibility.h @@ -0,0 +1,179 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_VISIBILITY_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_VISIBILITY_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/profiler/convert/trace_viewer/trace_events_filter_interface.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" + +namespace tensorflow { +namespace profiler { + +// Determines whether an event will be visible in trace viewer within a visible +// tsl::profiler::Timespan at a certain resolution. +// Events must be evaluated in order by timestamp, because when an event is +// determined to be visible, the internal state of this class is updated. +class TraceViewerVisibility { + public: + // Create with visible timespan and resolution (in picoseconds). + // The visible timespan must have non-zero duration. + // If resolution is zero, no events are downsampled. + explicit TraceViewerVisibility(tsl::profiler::Timespan visible_span, + uint64_t resolution_ps = 0); + + // Returns true if the event overlaps the visible span and is distinguishable + // at resolution_ps. + bool Visible(const TraceEvent& event); + + // Returns true if the event is distinguishable at resolution_ps. + bool VisibleAtResolution(const TraceEvent& event); + + // Records that event is distinguishable at resolution_ps. + void SetVisibleAtResolution(const TraceEvent& event); + + tsl::profiler::Timespan VisibleSpan() const { return visible_span_; } + // TODO(tf-profiler) Rename ResolutionPs and resolution_ps to be more + // self-explanatory (eg. MinDurationPs) + uint64_t ResolutionPs() const { return resolution_ps_; } + + private: + // Identifier for one Trace Viewer row. + using RowId = std::pair; + using CounterRowId = std::pair; + + // Visibility for one Trace Viewer row. + class RowVisibility { + public: + // Returns the nesting depth for an event at begin_timestamp_ps. + size_t Depth(uint64_t begin_timestamp_ps) const; + + // Returns the end_timestamp_ps of the last visibile event at the given + // nesting depth. + std::optional LastEndTimestampPs(size_t depth) const { + std::optional result; + if (depth < last_end_timestamp_ps_.size()) { + result = last_end_timestamp_ps_[depth]; + } + return result; + } + + // Returns the arrow timestamp of the last visible flow event. + std::optional LastFlowTimestampPs() const { + return last_flow_timestamp_ps_; + } + + // Sets the last visible timestamp at the given nesting depth. + void SetLastEndTimestampPs(size_t depth, uint64_t timestamp_ps) { + last_end_timestamp_ps_.resize(depth); + last_end_timestamp_ps_.push_back(timestamp_ps); + } + + // Sets the last visible arrow timestamp. + void SetLastFlowTimestampPs(uint64_t timestamp_ps) { + last_flow_timestamp_ps_ = timestamp_ps; + } + + private: + // Stack of most recently visible event end times. A stack is used to handle + // nested events. + std::vector last_end_timestamp_ps_; + + // Timestamp of the arrow binding point of the last visible flow event. + std::optional last_flow_timestamp_ps_; + }; + + // Constructor arguments. + tsl::profiler::Timespan visible_span_; + uint64_t resolution_ps_; + + // Visibility data for all rows. + absl::flat_hash_map rows_; + + // Visibility of flows. + absl::flat_hash_map flows_; + + // Visibility data for counter events. 
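+  // Maps each counter row to the timestamp of its most recently kept (visible) counter event.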
+ absl::flat_hash_map last_counter_timestamp_ps_; +}; + +class TraceVisibilityFilter : public TraceEventsFilterInterface { + public: + // If visible_span.Instant(), all events are visible. + // If resolution is 0.0, events aren't downsampled. + TraceVisibilityFilter(tsl::profiler::Timespan visible_span, double resolution) + : resolution_(resolution), + visibility_(visible_span, ResolutionPs(visible_span.duration_ps())) {} + + tsl::profiler::Timespan VisibleSpan() const { + return visibility_.VisibleSpan(); + } + uint64_t ResolutionPs() const { return visibility_.ResolutionPs(); } + + void SetUp(const Trace& trace) override { + // Update visible_span with trace bounds and recompute the resolution in + // picoseconds. + tsl::profiler::Timespan visible_span = VisibleSpan(); + uint64_t start_time_ps = visible_span.begin_ps(); + uint64_t end_time_ps = visible_span.end_ps(); + if (end_time_ps == 0 && trace.has_max_timestamp_ps()) { + end_time_ps = trace.max_timestamp_ps(); + } + if (start_time_ps == 0 && trace.has_min_timestamp_ps()) { + start_time_ps = trace.min_timestamp_ps(); + } + visible_span = + tsl::profiler::Timespan::FromEndPoints(start_time_ps, end_time_ps); + visibility_ = TraceViewerVisibility( + visible_span, ResolutionPs(visible_span.duration_ps())); + } + + // Updates the visibility based on `resolution`. + void UpdateVisibility(double resolution) { + resolution_ = resolution; + visibility_ = TraceViewerVisibility( + visibility_.VisibleSpan(), + ResolutionPs(visibility_.VisibleSpan().duration_ps())); + } + + bool Filter(const TraceEvent& event) override { + return !visibility_.Visible(event); + } + + private: + // Returns the minimum duration in picoseconds that an event must have in + // order to be visible. + uint64_t ResolutionPs(uint64_t duration_ps) { + return (resolution_ == 0.0) ? 0 : std::llround(duration_ps / resolution_); + } + + double resolution_; // number of visible events per row + TraceViewerVisibility visibility_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_TRACE_VIEWER_TRACE_VIEWER_VISIBILITY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.h new file mode 100644 index 00000000..68e0b491 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.h @@ -0,0 +1,43 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_DCN_COLLECTIVE_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_DCN_COLLECTIVE_STATS_H_ + +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h" + +namespace tensorflow { +namespace profiler { + +// Converts multiple XSpaces to dcn collective stats. +// Stores the dcn collective stats as files in the same directory +// as the xspace files. +absl::StatusOr ConvertMultiXSpaceToDcnCollectiveStats( + const SessionSnapshot& session_snapshot); + +// Returns whether there are dcn collective stats in the profile. +absl::StatusOr HasDcnCollectiveStatsInMultiXSpace( + const SessionSnapshot& session_snapshot); + +// Gets DcnSlackAnalysis proto for a host. +absl::StatusOr GetDcnSlackAnalysisByHostName( + const SessionSnapshot& session_snapshot, std::string hostname); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_DCN_COLLECTIVE_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_hlo.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_hlo.h new file mode 100644 index 00000000..2361ba6e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_hlo.h @@ -0,0 +1,42 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_HLO_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_HLO_H_ + +#include + +#include "absl/strings/string_view.h" +#include "xla/service/hlo.pb.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/repository.h" + +namespace tensorflow { +namespace profiler { + +// Get HLO proto by module name. +absl::StatusOr GetHloProtoByModuleName( + const SessionSnapshot& session_snapshot, absl::string_view module_name); + +// Converts multiple XSpaces to HLO protos. +// Stores the HLO protos as files in the same directory as the xspace files. +// Returns whether there are HLO protos in this profile. +absl::StatusOr ConvertMultiXSpaceToHloProto( + const SessionSnapshot& session_snapshot); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_HLO_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h new file mode 100644 index 00000000..7cf9430c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_ + +#include + +#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/gpu_event_stats.h" +#include "tensorflow/core/profiler/utils/kernel_stats_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +void ConvertDeviceTraceXPlaneToKernelReports( + const XPlane& device_trace, + const std::function& + on_kernel_fn, + KernelReportMap* reports); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_memory_profile.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_memory_profile.h new file mode 100644 index 00000000..00f919d4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_memory_profile.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/memory_profile.pb.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Process the host threads XPlane and generate MemoryProfile result; at most +// max_num_snapshots will be displayed on the UI. +// REQUIRED: host_plane should have been grouped by calling GroupTfEvents(). 
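+// Illustrative call, assuming host_plane has already been grouped:
+//   MemoryProfile profile = ConvertXPlaneToMemoryProfile(host_plane, /*max_num_snapshots=*/500);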
+MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane, + int64_t max_num_snapshots = 1000); + +absl::Status ConvertXSpaceToMemoryProfileJson(const XSpace& xspace, + std::string* json_output); +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h new file mode 100644 index 00000000..c5d2a229 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h @@ -0,0 +1,59 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_METRICS_DB_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_METRICS_DB_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "xla/tsl/profiler/utils/tf_op_utils.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/op_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// Data per host thread for TensorFlow Op Metrics Database. +struct TfMetricsDbData { + // A database of TF-Op metrics for this core. + OpMetricsDb tf_metrics_db; + HostOpMetricsDbBuilder tf_metrics_db_builder{&tf_metrics_db}; +}; + +absl::flat_hash_map +CollectTfOpsFromHostThreadsXPlane(const XPlane& host_trace); + +TfMetricsDbData ConvertHostThreadsXLineToTfMetricsDbData( + const XLineVisitor& line, + const absl::flat_hash_map& tf_ops); + +void ConsumeTfMetricsDbData(TfMetricsDbData src, OpMetricsDbCombiner* dst); + +OpMetricsDb ConvertHostThreadsXPlaneToOpMetricsDb(const XPlane& host_trace); + +OpMetricsDb ConvertDeviceTraceXPlaneToOpMetricsDb(const XPlane& device_trace); + +// Convert TPU DeviceTrace XPlane to OpMetricDb +OpMetricsDb ConvertTpuDeviceTraceXPlaneToOpMetricsDb( + const XPlane& device_trace); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_METRICS_DB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_stats.h new file mode 100644 index 00000000..994efb03 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_op_stats.h @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_STATS_H_ + +#include + +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/utils/hlo_proto_map.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +struct OpStatsOptions { + bool maybe_drop_incomplete_steps = false; + bool generate_op_metrics_db = false; + bool generate_step_db = false; + bool generate_kernel_stats_db = false; +}; + +// NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated. +OpStats ConvertXSpaceToOpStats(const XSpace& space, + const OpStatsOptions& options); + +// Populates the program_id_to_name map in OpStats. +void SetProgramIdToNameMap(const HloProtoMap& hlo_proto_map, + tensorflow::profiler::OpStats& op_stats); + +// Populates the given RunEnvironment with data from XSpace. +void SetRunEnvironment(const XSpace& space, RunEnvironment* env); + +// Propagate and dedup the diagnostics in XSpace and add to OpStats. +void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space, + OpStats* op_stats); + +// Populates PerfEnv. +PerfEnv MakePerfEnv(double peak_tera_flops_per_second, + std::vector peak_bws); + +// Extracts PerfEnv from XPlane stats. +PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_OP_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_events.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_events.h new file mode 100644 index 00000000..acd84574 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_events.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_EVENTS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_EVENTS_H_ + +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// Convert the host threads in XLine format to StepEvents format. 
If +// device_step_events is non-null, we will filter out events that only happens +// on CPU. +StepEvents ConvertHostThreadsXLineToStepEvents( + const XLineVisitor& line, const StepEvents* device_step_events); + +// Convert the host threads in XPlane format to StepEvents format. If +// device_step_events is non-null, we will filter out events that only happens +// on CPU. +StepEvents ConvertHostThreadsXPlaneToStepEvents( + const XPlane& host_trace, const StepEvents* device_step_events); + +// Convert the device trace in XLine format to StepEvents. +StepEvents ConvertDeviceTraceXLineToStepEvents(const XLineVisitor& line); + +// Convert the device trace in XPlane format to StepEvents. +StepEvents ConvertDeviceTraceXPlaneToStepEvents(const XPlane& device_trace); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_EVENTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_stats.h new file mode 100644 index 00000000..5d5ff20c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_step_stats.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_STATS_H_ + +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Converts XSpace collected by profiling a GPU device to StepStats. +void ConvertGpuXSpaceToStepStats(const XSpace& xspace, StepStats* step_stats); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_STEP_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_data_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_data_stats.h new file mode 100644 index 00000000..f5f53488 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_data_stats.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_DATA_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_DATA_STATS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/tf_data_stats.pb.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +TF_CONST_INIT extern const int64_t kSlowCallThresholdPs; + +enum class BottleneckType { + kSlowSource, + kSlowDataService, + kSlowRemoteSource, + kSlowTransformationWithParallelVersion, + kSlowTransformationWithoutParallelVersion, + kOther, +}; + +BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name); + +class CombinedTfDataStatsBuilder { + public: + explicit CombinedTfDataStatsBuilder( + CombinedTfDataStats* combined_tf_data_stats, + bool generate_suggestion = true) + : combined_tf_data_stats_(combined_tf_data_stats), + generate_suggestion_(generate_suggestion) {} + + void Add(absl::string_view host_name, XPlane* host_plane); + + // Finalizes by populating TfDataBottleneckAnalysis. + void Finalize(); + + private: + CombinedTfDataStats* combined_tf_data_stats_; + bool generate_suggestion_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_DATA_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_functions.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_functions.h new file mode 100644 index 00000000..fbff7cce --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tf_functions.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_FUNCTIONS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_FUNCTIONS_H_ + +#include + +#include "tensorflow/core/profiler/protobuf/tf_function.pb.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// Converts from the given XLine to a TfFunctionDb. +TfFunctionDb ConvertHostThreadsXLineToTfFunctionDb(const XLineVisitor& line); + +// Returns a debugging string for the given TfFunctionDb. +std::string DebugString(TfFunctionDb tf_function_db); + +// Combines the tf-function statistics from src and dst into dst. 
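+// Illustrative aggregation (variable names assumed): CombineTfFunctionDb(per_host_db, &combined_db);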
+void CombineTfFunctionDb(const TfFunctionDb& src, TfFunctionDb* dst); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_FUNCTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tool_names.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tool_names.h new file mode 100644 index 00000000..a1e93694 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tool_names.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOL_NAMES_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOL_NAMES_H_ + +#include + +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/repository.h" + +namespace tensorflow { +namespace profiler { + +// Gets the names of the available tools given a session snapshot. +// Returns a comma separated list of tool names. +absl::StatusOr GetAvailableToolNames( + const SessionSnapshot& session_snapshot); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOL_NAMES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tools_data.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tools_data.h new file mode 100644 index 00000000..8a40e03a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_tools_data.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOLS_DATA_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOLS_DATA_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/profiler/convert/repository.h" +#include "tensorflow/core/profiler/convert/tool_options.h" + +namespace tensorflow { +namespace profiler { + +// Convert XSpace protos to a tool specific data. +// Return the serialized string of tool specific data when the conversion is +// successful, else return error status. 
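+// Illustrative call (variable names assumed; tool_name would typically be one of
+// the names listed by GetAvailableToolNames):
+//   auto tool_data = ConvertMultiXSpacesToToolData(session_snapshot, tool_name, options);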
+absl::StatusOr ConvertMultiXSpacesToToolData( + const SessionSnapshot& session_snapshot, absl::string_view tool_name, + const ToolOptions& options); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TOOLS_DATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_trace_container.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_trace_container.h new file mode 100644 index 00000000..cdf3a72f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xplane_to_trace_container.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TRACE_CONTAINER_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TRACE_CONTAINER_H_ + +#include "tensorflow/core/profiler/convert/trace_viewer/trace_events.h" +#include "tensorflow/core/profiler/protobuf/trace_events_raw.pb.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +using TraceEventsContainer = TraceEventsContainerBase; + +// Converts XEvents within the XSpace into trace_viewer events container. +void ConvertXSpaceToTraceEventsContainer(absl::string_view hostname, + const XSpace& xspace, + TraceEventsContainer* container); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TRACE_CONTAINER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h new file mode 100644 index 00000000..2f9e5551 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h @@ -0,0 +1,167 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XSPACE_TO_DCN_SLACK_ANALYSIS_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_XSPACE_TO_DCN_SLACK_ANALYSIS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/ir/hlo_opcode.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "xla/tsl/profiler/utils/xplane_visitor.h" +#include "tensorflow/core/profiler/protobuf/dcn_collective_info.pb.h" +#include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h" +#include "tensorflow/core/profiler/protobuf/topology.pb.h" +#include "tensorflow/core/profiler/utils/hlo_proto_map.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +using tensorflow::profiler::DcnSlackAnalysis; + +namespace dcn_analysis_internal { + +struct DcnOpState { + uint64_t start_time = 0; + uint64_t end_time = 0; + + // Duration of containing send/send-done/recv/recv-done ops that needs to be + // subtracted from the total duration + uint64_t overlapping_duration = 0; + std::string rendezvous_name; + std::string transfer_type; + uint64_t stall_duration_ns = 0; + std::string send_op_name; + int replica_group_size = 0; + + OpInstance send; + OpInstance send_done; + OpInstance recv; + OpInstance recv_done; +}; + +// Structure to extract and store the DcnHostEvents. +struct DcnHostEvent { + std::string rendezvous_name; + tsl::profiler::Timespan timespan; + int multi_slice_device_id; +}; + +// When visiting DcnHostEvents from the megascale planes, The events are stored +// in separate lines in an ascending (by time) order. The List allows insertion +// of multiple arrays of sorted events. +class DcnHostEventList { + public: + // Insert the event into the sorted list. + void insert(DcnHostEvent event); + + // Pop the events from the front that is included within the timestamp when + // available. + std::optional pop(const tsl::profiler::Timespan& timespan); + + // Number of events. 
+ int size() const { return events_.size(); } + + private: + std::list events_; + std::list::iterator iter_ = events_.begin(); +}; + +struct InstrMetadata { + xla::HloOpcode opcode; + uint64_t channel_id; + std::optional rendezvous_name; + int64_t size = 0; + std::optional transfer_type; +}; + +class DcnTracker { + public: + explicit DcnTracker(const tensorflow::profiler::HloProtoMap& hlo_proto_map, + bool is_megacore) + : hlo_proto_map_(hlo_proto_map), is_megacore_(is_megacore) {} + + absl::StatusOr GetInstructionMetadata(std::string_view module, + std::string_view instr); + + DcnSlackAnalysis Finalize(); + + void DebugString(); + + void VisitOp(const InstrMetadata& instr, + const tsl::profiler::XEventVisitor& visitor); + + void VisitHostEvent(const DcnHostEvent& event); + + void ProcessTopology(const tensorflow::profiler::Topology& topology); + + private: + DcnSlackAnalysis slack_analysis_; + absl::flat_hash_map rendezvous_to_op_map_; + absl::flat_hash_map channel_id_to_rendezvous_map_; + absl::flat_hash_map instruction_metadata_map_; + absl::flat_hash_map core_id_to_host_event_map_; + const tensorflow::profiler::HloProtoMap& hlo_proto_map_; + absl::flat_hash_map global_chip_id_to_local_index_map_; + absl::flat_hash_map> + hlo_module_cache_; + absl::flat_hash_map rendezvous_to_replica_group_size_map_; + bool is_megacore_ = true; + + absl::StatusOr GetInstrMetadataFromHloModule( + std::string_view module, std::string_view instr); + + void UpdateActiveOps(uint64_t duration); + + void SummarizeDcnSlackAnalysis(); + + std::optional GetCollectiveHostEvent( + int core_id, std::string_view rendezvous_name, + tsl::profiler::Timespan timespan); + + // GetLocalIndex when available, else return the global_device_id itself. + int GetLocalIndex(int dcn_device_id); + + // Get number of replica group + int GetReplicaGroupSize(const std::string& rendezvous_name, + const tsl::profiler::XEventVisitor& visitor); + + // Compute data transmitted size based on number of replica groups + uint64_t ComputeTransmittedDataSize(int64_t buffer_size, int group_size, + const std::string& transfer_type); +}; + +} // namespace dcn_analysis_internal + +// Convert Hlo Events in XSpace to Dcn Slack analysis. +DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis( + const tensorflow::profiler::XSpace& xspace, + const tensorflow::profiler::XPlane* dcn_host_plane, + const tensorflow::profiler::Topology* topology, bool is_megacore = true); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_XSPACE_TO_DCN_SLACK_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h new file mode 100644 index 00000000..9ea9bdac --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h @@ -0,0 +1,113 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This checker checks the accelerator's utilization. +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_ + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/core/profiler/internal/advisor/checker.h" + +namespace tensorflow { +namespace tfprof { + +struct ExecStats { + public: + // Earliest start time of a step. + int64_t start_micros; + // Latest finish time of a step. + int64_t end_micros; + // The duration spent on running a kernel during a step. + int64_t exec_micros; +}; + +class AcceleratorUtilizationChecker : public Checker { + public: + string name() const override { return kCheckers[0]; } + + private: + AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options, + const TFStats* stats) override { + if (!stats) { + absl::FPrintF( + stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n", name()); + return reports_; + } + for (const auto& n : stats->nodes()) { + BuildExecStats(n.second.get()); + } + return CheckInternal(); + } + + AdviceProto::Checker CheckInternal() { + for (const auto& s : accelerator_exec_stats_) { + const ExecStats& stat = s.second; + int64_t total_micros = stat.end_micros - stat.start_micros; + if (total_micros <= 0) continue; + double utilization = 1.0 * stat.exec_micros / total_micros; + if (utilization >= 0.5) { + reports_.add_reports(absl::StrFormat("device: %s utilization: %.2f", + s.first, utilization)); + } else if (utilization < 0.5 && utilization > 0.2) { + reports_.add_reports(absl::StrFormat("device: %s low utilization: %.2f", + s.first, utilization)); + } else if (utilization <= 0.2) { + reports_.add_reports(absl::StrFormat("device: %s low utilization: %.2f", + s.first, utilization)); + } + } + return reports_; + } + + void BuildExecStats(const TFGraphNode* node) { + const auto& execs = node->all_op_execs(); + if (execs.empty()) { + return; + } + if (!IsPlacedOnAccelerator(node->canonical_device())) { + return; + } + + if (accelerator_exec_stats_.find(node->canonical_device()) == + accelerator_exec_stats_.end()) { + accelerator_exec_stats_.insert( + std::pair(node->canonical_device(), ExecStats())); + } + ExecStats& stats = accelerator_exec_stats_.at(node->canonical_device()); + + // TODO(xpan): Use multiple steps? 
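+// (Worked example of the utilization arithmetic in CheckInternal() above,
+//  with made-up numbers: start_micros = 0, end_micros = 10,000,000 and
+//  exec_micros = 3,000,000 give utilization = 3,000,000 / 10,000,000 = 0.30,
+//  which lands in the "low utilization" bucket; 0.20 or below is likewise
+//  reported as low utilization, while 0.50 and above is reported as plain
+//  "utilization".)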
+ const ExecStep& exec = execs.rbegin()->second; + + if (stats.start_micros == 0) { + stats.start_micros = exec.all_start_micros(); + } else if (exec.all_start_micros() != 0) { + stats.start_micros = + std::min(stats.start_micros, exec.all_start_micros()); + } + stats.end_micros = std::max(stats.end_micros, exec.latest_end_micros()); + stats.exec_micros += exec.accelerator_exec_micros(); + } + + std::map accelerator_exec_stats_; + std::map ps_placement_; + AdviceProto::Checker reports_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/checker.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/checker.h new file mode 100644 index 00000000..3fc345cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/checker.h @@ -0,0 +1,51 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_ + +#include "tensorflow/core/profiler/internal/tfprof_stats.h" +#include "tensorflow/core/profiler/tfprof_options.pb.h" + +namespace tensorflow { +namespace tfprof { + +// Append only. +static const char* const kCheckers[] = { + "AcceleratorUtilizationChecker", "OperationChecker", + "ExpensiveOperationChecker", + "JobChecker", // Internal checker. +}; + +class Checker { + public: + virtual ~Checker() = default; + + virtual string name() const = 0; + + AdviceProto::Checker Run(const AdvisorOptionsProto::CheckerOption& options, + const TFStats* stats) { + return Check(options, stats); + } + + protected: + virtual AdviceProto::Checker Check( + const AdvisorOptionsProto::CheckerOption& options, + const TFStats* stats) = 0; +}; +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h new file mode 100644 index 00000000..4ec0cb57 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h @@ -0,0 +1,143 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This checker checks the most expensive operations. +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_ + +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/profiler/internal/advisor/checker.h" + +namespace tensorflow { +namespace tfprof { + +class ExpensiveOperationChecker : public Checker { + public: + string name() const override { return kCheckers[2]; } + + private: + AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options, + const TFStats* stats) override { + if (!stats) { + absl::FPrintF( + stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n", name()); + return reports_; + } + if (stats->steps().empty()) { + absl::FPrintF(stderr, "Missing RunMetadata info. Skip %s\n", name()); + } + CheckOpView(stats); + CheckScopeView(stats); + CheckCodeView(stats); + return reports_; + } + + void CheckOpView(const TFStats* stats) { + if (stats->steps().empty()) { + absl::FPrintF(stderr, "Missing run_meta for %s\n", name()); + return; + } + Options opts(3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -1, "micros", {".*"}, {".*"}, + {}, {".*"}, {}, false, {"micros", "occurrence"}, "none", {}); + const MultiGraphNodeProto root = stats->ShowMultiGraphNode("op", opts); + if (root.children_size() == 0) { + return; + } + const MultiGraphNodeProto* node = &root; + std::vector outputs; + for (int i = 0; i < 3 && node->children_size() > 0; ++i) { + node = &node->children(0); + outputs.push_back(absl::StrFormat( + "top %d operation type: %s, " + "cpu: %s, accelerator: %s, total: %s (%.2f%%)", + i + 1, node->name(), FormatTime(node->cpu_exec_micros()), + FormatTime(node->accelerator_exec_micros()), + FormatTime(node->exec_micros()), + 100.0 * node->exec_micros() / (root.total_exec_micros() + 1e-10))); + } + reports_.add_reports(absl::StrJoin(outputs, "\n")); + } + + void CheckCodeView(const TFStats* stats) { + if (!stats->has_code_traces()) { + absl::FPrintF(stderr, "Missing op_log (code traces) for %s\n", name()); + return; + } + Options opts(100, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -1, "micros", {".*"}, + {".*"}, {}, {".*"}, {}, false, {"micros"}, "none", {}); + const MultiGraphNodeProto root = stats->ShowMultiGraphNode("code", opts); + const MultiGraphNodeProto* node = &root; + // A trick here is: Usually, codes in library file are usually referenced + // only once, while user's own code are referenced multiple times. 
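+// (Illustration with hypothetical frame names: for a stack tree like
+//  root -> train.py:main -> model.py:build -> {Conv2D ops, MatMul ops},
+//  the outer frames each have exactly one child, so the loop below skips
+//  down that single-child chain and reporting starts at the first fan-out,
+//  which is usually the user's own code rather than shared library wrappers.)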
+ while (node->children_size() == 1) { + node = &node->children(0); + } + if (node->children_size() == 0) { + return; + } + + std::vector outputs; + CodeViewHelper(node, 0, &outputs); + reports_.add_reports(absl::StrJoin(outputs, "\n")); + } + + void CheckScopeView(const TFStats* stats) { + Options opts(100, 0, 0, 0, 0, 100, 0, 0, 0, 0, 0, -1, "micros", {".*"}, + {".*"}, {}, {".*"}, {}, false, {"micros"}, "none", {}); + const GraphNodeProto root = stats->ShowGraphNode("scope", opts); + if (root.children_size() == 0) { + return; + } + std::vector outputs; + for (int i = 0; i < 3 && i < root.children_size(); ++i) { + const GraphNodeProto& node = root.children(i); + outputs.push_back(absl::StrFormat( + "top %d graph node: %s, cpu: %s, accelerator: %s, total: %s", i + 1, + node.name(), FormatTime(node.cpu_exec_micros()), + FormatTime(node.accelerator_exec_micros()), + FormatTime(node.exec_micros()))); + } + reports_.add_reports(absl::StrJoin(outputs, "\n")); + } + + void CodeViewHelper(const MultiGraphNodeProto* node, int depth, + std::vector* outputs) { + if (node->children_size() <= 1 || depth > 3) { + return; + } + for (int j = 0; j < 3 && j < node->children_size(); ++j) { + const MultiGraphNodeProto* c = &node->children(j); + if (c->total_exec_micros() < 1000) { + continue; + } + outputs->push_back( + absl::StrFormat("%s%s, cpu: %s, accelerator: %s, total: %s", + std::string(depth * 2, ' '), c->name(), + FormatTime(c->total_cpu_exec_micros()), + FormatTime(c->total_accelerator_exec_micros()), + FormatTime(c->total_exec_micros()))); + CodeViewHelper(c, depth + 1, outputs); + } + } + + AdviceProto::Checker reports_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h new file mode 100644 index 00000000..6fc16cf9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h @@ -0,0 +1,34 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_ + +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.pb.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class TFStats; + +AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options, + const TFStats* stats); + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/operation_checker.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/operation_checker.h new file mode 100644 index 00000000..5142639f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/operation_checker.h @@ -0,0 +1,78 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This checker checks common wrong configurations of operations. +// +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_ + +#include "absl/strings/str_format.h" +#include "tensorflow/core/profiler/internal/advisor/checker.h" + +namespace tensorflow { +namespace tfprof { + +class OperationChecker : public Checker { + public: + string name() const override { return kCheckers[1]; } + + private: + AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options, + const TFStats* stats) override { + if (!stats) { + absl::FPrintF( + stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n", name()); + return reports_; + } + bool use_batch_norm = false; + bool use_fused_batch_norm = false; + bool recommend_nchw = false; + for (const auto& n : stats->nodes()) { + const TFGraphNode* node = n.second.get(); + if (node->name().find("BatchNorm") != node->name().npos) { + use_batch_norm = true; + } + if (node->op_types().find("FusedBatchNorm") != node->op_types().end()) { + use_fused_batch_norm = true; + } + + const AttrValue* attr = node->op_attrs("data_format"); + if (attr) { + if (attr->s() == "NHWC" && + IsPlacedOnAccelerator(node->canonical_device())) { + recommend_nchw = true; + } + } + } + if (use_batch_norm && !use_fused_batch_norm) { + reports_.add_reports( + "Maybe use faster FusedBatchNorm instead of BatchNorm"); + } + if (recommend_nchw) { + // TODO(xpan): Maybe print which Op supports NCHW. + reports_.add_reports( + "Found operation using NHWC data_format on GPU. 
Maybe " + "NCHW is faster."); + } + return reports_; + } + + private: + AdviceProto::Checker reports_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h new file mode 100644 index 00000000..e1db57cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h @@ -0,0 +1,84 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_ + +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h" +#include "tensorflow/core/profiler/internal/advisor/checker.h" +#include "tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h" +#include "tensorflow/core/profiler/internal/advisor/internal_checker_runner.h" +#include "tensorflow/core/profiler/internal/advisor/operation_checker.h" +#include "tensorflow/core/profiler/tfprof_options.pb.h" + +namespace tensorflow { +namespace tfprof { + +// The Advisor runs a list of Checkers, each checks a specific area. +class Advisor { + public: + Advisor(const TFStats* stats) : stats_(stats) {} + + static AdvisorOptionsProto DefaultOptions() { + AdvisorOptionsProto options; + std::vector checkers( + kCheckers, kCheckers + sizeof(kCheckers) / sizeof(*kCheckers)); + for (const string& checker : checkers) { + (*options.mutable_checkers())[checker]; + } + return options; + } + + AdviceProto Advise(const AdvisorOptionsProto& options) { + // Note: Release a checker's memory ASAP. 
+ AdviceProto ret = RunInternalCheckers(options, stats_); + + if (options.checkers().find(kCheckers[0]) != options.checkers().end()) { + AcceleratorUtilizationChecker au_checker; + (*ret.mutable_checkers())[kCheckers[0]].MergeFrom( + au_checker.Run(options.checkers().at(kCheckers[0]), stats_)); + } + if (options.checkers().find(kCheckers[1]) != options.checkers().end()) { + OperationChecker op_checker; + (*ret.mutable_checkers())[kCheckers[1]].MergeFrom( + op_checker.Run(options.checkers().at(kCheckers[1]), stats_)); + } + if (options.checkers().find(kCheckers[2]) != options.checkers().end()) { + ExpensiveOperationChecker expensive_op_checker; + (*ret.mutable_checkers())[kCheckers[2]].MergeFrom( + expensive_op_checker.Run(options.checkers().at(kCheckers[2]), + stats_)); + } + for (const auto& checker : ret.checkers()) { + absl::FPrintF(stdout, "\n%s:\n", checker.first); + for (const string& r : checker.second.reports()) { + absl::FPrintF(stdout, "%s\n", r); + } + } + fflush(stdout); + return ret; + } + + private: + const TFStats* stats_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/print_model_analysis.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/print_model_analysis.h new file mode 100644 index 00000000..ab1887a8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/print_model_analysis.h @@ -0,0 +1,66 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_ + +#include + +namespace tensorflow { +namespace tfprof { +struct Options; + +// ********************** +// APIs in this file are only for swig. +// Talk to xpan@ if you want to call it directly! +// ********************* + +// Multi-step Profiler. +// +bool NewProfiler(const std::string* graph, const std::string* op_log); + +void DeleteProfiler(); + +double AddStep(int64_t step, const std::string* graph, + const std::string* run_meta, const std::string* op_log); + +// Write the profiler's profile to a proto buffer. +void WriteProfile(const std::string* filename); + +// Load the profile to profiler from a proto buffer file. +void ProfilerFromFile(const std::string* filename); + +// Returns a binary string that represents the serialized ProfileProto. +std::string SerializeToString(); + +std::string Profile(const std::string* command, const std::string* options); + +// Single-step Profiler. +// +// Interface defined for Python API swig. Calls the tfprof core API. +// 'graph', 'run_meta', 'op_log' are serialized GraphDef, RunMetadata, +// OpLogProto strings, respectively. +// 'graph', 'command' and 'options' are required. Others can be nullptr +// if not available. 
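+// (Illustrative sketch; the command "scope" and the serialized strings below
+//  are assumptions, prepared by the caller from GraphDef, RunMetadata,
+//  OpLogProto and option protos respectively:
+//    std::string graph = ..., run_meta = ..., op_log = ..., options = ...;
+//    std::string command = "scope";
+//    std::string result =
+//        PrintModelAnalysis(&graph, &run_meta, &op_log, &command, &options);
+//  result carries the analysis output as a string.)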
+std::string PrintModelAnalysis(const std::string* graph, + const std::string* run_meta, + const std::string* op_log, + const std::string* command, + const std::string* options); + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_code.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_code.h new file mode 100644 index 00000000..5664fb0c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_code.h @@ -0,0 +1,96 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Build a tree structure based on the TensorFlow model's python code stacks. +// Stats are aggregated from descendants to ancestors. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_show_multi.h" +#include "tensorflow/core/profiler/internal/tfprof_timeline.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/profile.pb.h" +#include "tensorflow/core/profiler/tfprof_log.pb.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class PprofProfile { + public: + virtual ~PprofProfile() = default; + + virtual uint64 AddLocation(const CodeNode* callee, + const CodeNode* caller) = 0; + + virtual void AddSample(const CodeNode* leaf, + std::vector* call_ids) = 0; + + virtual absl::Status WritePprofProfile(const string& filename) = 0; +}; + +class TFCode : public TFMultiShow { + public: + TFCode() = default; + ~TFCode() override = default; + + // Add nodes to the code view. Called before Build() + void AddNode(TFGraphNode* node) override; + + // Build the code view structure. Called after all nodes + // are added via AddNode(). 
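+// (Usage sketch, assuming `nodes` is a caller-owned map from graph-node name
+//  to std::unique_ptr<TFGraphNode>:
+//    TFCode code_view;
+//    for (auto& n : nodes) code_view.AddNode(n.second.get());
+//    code_view.Build();
+//  After Build(), the view is rendered through the TFMultiShow interface,
+//  which dispatches to ShowInternal() below.)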
+ void Build() override; + + private: + const ShowMultiNode* ShowInternal(const Options& opts, + Timeline* timeline) override; + + std::vector SearchRoot(std::vector roots, + const std::vector& regexes); + + std::vector PrintScope(std::vector roots, + const Options& opts, int depth, + int last_ident); + + std::vector Account(const std::vector& roots, + const Options& opts); + + void Format(const CodeNode* root, const std::vector& nodes, + const Options& opts, string* display_str, + MultiGraphNodeProto* proto, std::vector* call_ids); + + string FormatNode(CodeNode* node, const Options& opts, int64_t indent) const; + string FormatNodeMemory(CodeNode* node, int64_t bytes, + int64_t total_bytes) const; + + std::unique_ptr root_; + std::unique_ptr graph_root_; + std::unique_ptr pprof_profile_; + std::map> grad_nodes_; + std::map forward_nodes_; +}; +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_constants.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_constants.h new file mode 100644 index 00000000..d4a47931 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_constants.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_ + +namespace tensorflow { +namespace tfprof { + +// Op name of root of everything. Aggregates all stats. +static const char* const kTFProfRoot = "_TFProfRoot"; +// Op type for nodes that doesn't represent a physical node in the +// TensorFlow model. Only exist as a placehold to aggregate children. +// For example, kTFProfRoot belongs to this type. +static const char* const kTFGraphParent = "_TFGraphParent"; +static const char* const kTFScopeParent = "_kTFScopeParent"; +// Op type for tf.trainable_variables(). +static const char* const kTrainableVarType = "_trainable_variables"; +// Op type for tensors in the checkpoint file. +static const char* const kCkptVarType = "_checkpoint_variables"; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_graph.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_graph.h new file mode 100644 index 00000000..89ae0b37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_graph.h @@ -0,0 +1,87 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Build a graph structure based on op inputs/outputs. The graph is a directed +// acyclic graph pointing *from outputs to inputs*. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_show.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +// Organize tensorflow ops in a graph structure, pointing from output ops +// to input ops. +class TFGraph : public TFShow { + public: + explicit TFGraph(checkpoint::CheckpointReader* ckpt_reader) + : TFShow(ckpt_reader), root_(nullptr) {} + ~TFGraph() override = default; + + void AddNode(TFGraphNode* node) override; + + void Build() override; + + private: + const ShowNode* ShowInternal(const Options& opts, + Timeline* timeline) override; + + bool ShouldShowIfExtra(const ShowNode* node, const Options& opts, + int depth) const override { + return true; + } + + GraphNode* CreateParentNode(const string& name); + + std::vector SearchRoot(const std::vector& roots, + const std::vector& regexes, + std::set* visited); + + std::vector PrintGraph(std::vector roots, + const Options& opts, int depth, + int last_ident, std::set* visits); + + std::vector Account(const std::vector& roots, + const Options& opts, + std::set* visits); + + void Format(std::vector roots, string* display_str, + GraphNodeProto* proto); + + MemoryTracker memory_tracker_; + GraphNode* root_; + std::vector> node_defs_; + std::map> parent_nodes_; + std::map> nodes_map_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node.h new file mode 100644 index 00000000..e0645654 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node.h @@ -0,0 +1,920 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/framework/tensor_description.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/regexp.h" +#include "tensorflow/core/profiler/tfprof_log.pb.h" +#include "tensorflow/core/profiler/tfprof_options.h" + +namespace tensorflow { +namespace tfprof { +std::vector ShapeProtoToVec(const TensorShapeProto& shape_pb); + +TensorShapeProto VecToShapeProto(const std::vector& shape_vec); + +class TFGraphNode; + +class CallStack { + public: + class Trace { + public: + Trace(const CodeDef::Trace* trace, + const std::map* id_to_string) + : trace_(trace), id_to_string_(id_to_string) {} + + int32 lineno() const { return trace_->lineno(); } + string file() const { + // Backward compatible with old proto files. + if (!trace_->file().empty()) return trace_->file(); + return id_to_string_->at(trace_->file_id()); + } + string function() const { + // Backward compatible with old proto files. + if (!trace_->function().empty()) return trace_->function(); + return id_to_string_->at(trace_->function_id()); + } + int32 func_start_line() const { return trace_->func_start_line(); } + + private: + const CodeDef::Trace* trace_; + const std::map* id_to_string_; + }; + + CallStack(const CodeDef& def, const std::map* id_to_string) + : def_(def) { + traces_.reserve(def.traces_size()); + for (const auto& t : def_.traces()) { + traces_.emplace_back(&t, id_to_string); + } + } + + const CodeDef& code_def() const { return def_; } + const std::vector& traces() const { return traces_; } + + private: + std::vector traces_; + CodeDef def_; +}; + +class ExecStep { + public: + ExecStep() = default; + + void AddTimeStats(const string& dev, const NodeExecStats& step_stat); + + void AddMemoryStats(const string& dev, const NodeExecStats& step_stat); + + int64_t run_count() const { return exec_.run_count(); } + // The execution time of an op. If it runs on accelerator, then it's + // accelerator_exec_micros(). Otherwise, it's CPU time. + int64_t exec_micros() const; + // The accelerator execution time of an op. 0 if not run on accelerator. + int64_t accelerator_exec_micros() const; + // The cpu execution time of an op. 
+ int64_t cpu_exec_micros() const; + + const std::map>>& op_execs() + const { + return op_execs_; + } + const std::map>>& cpu_execs() + const { + return cpu_execs_; + } + int64_t all_start_micros() const { return exec_.all_start_micros(); } + int64_t latest_end_micros() const { return exec_.latest_end_micros(); } + int64_t lastest_schedule_end_micros() const { + int64_t ret = 0; + for (const auto& exec : cpu_execs_) { + for (const auto& pair : exec.second) { + ret = std::max(ret, pair.first + pair.second); + } + } + return ret; + } + int64_t requested_bytes() const { + int64_t requested_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + requested_bytes += exec.requested_bytes(); + } + return requested_bytes; + } + int64_t peak_bytes() const { + int64_t peak_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + peak_bytes += exec.peak_bytes(); + } + return peak_bytes; + } + int64_t residual_bytes() const { + int64_t residual_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + residual_bytes += exec.residual_bytes(); + } + return residual_bytes; + } + int64_t output_bytes() const { + int64_t output_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + output_bytes += exec.output_bytes(); + } + return output_bytes; + } + int64_t accelerator_temp_bytes() const { + int64_t accelerator_temp_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + accelerator_temp_bytes += exec.accelerator_temp_bytes(); + } + return accelerator_temp_bytes; + } + int64_t host_temp_bytes() const { + int64_t host_temp_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + host_temp_bytes += exec.host_temp_bytes(); + } + return host_temp_bytes; + } + int64_t accelerator_persistent_bytes() const { + int64_t accelerator_persistent_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + accelerator_persistent_bytes += exec.accelerator_persistent_bytes(); + } + return accelerator_persistent_bytes; + } + int64_t host_persistent_bytes() const { + int64_t host_persistent_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + host_persistent_bytes += exec.host_persistent_bytes(); + } + return host_persistent_bytes; + } + std::map allocator_bytes_in_use() const { + std::map bytes_in_use; + for (const ExecMemory& exec : memory_execs_) { + bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use(); + } + return bytes_in_use; + } + + const std::vector& allocations() const { + return allocations_; + } + + const ExecProfile& ToProto() { + exec_.mutable_accelerator_execs()->clear(); + for (const auto& e : accelerator_execs_) { + auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first]; + for (const auto& p : e.second) { + auto* t = exec_time.mutable_times()->Add(); + t->add_int64_values(p.first); + t->add_int64_values(p.second); + } + } + + exec_.mutable_cpu_execs()->clear(); + for (const auto& e : cpu_execs_) { + auto& exec_time = (*exec_.mutable_cpu_execs())[e.first]; + for (const auto& p : e.second) { + auto* t = exec_time.mutable_times()->Add(); + t->add_int64_values(p.first); + t->add_int64_values(p.second); + } + } + + exec_.mutable_devices()->Clear(); + exec_.mutable_devices()->Reserve(devices_.size()); + for (const string& d : devices_) { + exec_.add_devices(d); + } + exec_.mutable_allocations()->Clear(); + for (const auto& r : allocations_) { + exec_.add_allocations()->MergeFrom(r); + } + + exec_.mutable_memory_execs()->Clear(); + for (const auto& m : memory_execs_) { + exec_.add_memory_execs()->MergeFrom(m); + } + return exec_; + } + + void 
FromProto(const ExecProfile& exec) { + exec_.Clear(); + exec_.MergeFrom(exec); + + devices_.clear(); + devices_.insert(exec.devices().begin(), exec.devices().end()); + + accelerator_execs_.clear(); + cpu_execs_.clear(); + op_execs_.clear(); + + allocations_.clear(); + memory_execs_.clear(); + + for (const auto& exec_time : exec_.accelerator_execs()) { + auto& exec = accelerator_execs_[exec_time.first]; + auto& op_exec = op_execs_[exec_time.first]; + for (const auto& p : exec_time.second.times()) { + exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + } + } + for (const auto& exec_time : exec_.cpu_execs()) { + auto& exec = cpu_execs_[exec_time.first]; + auto& op_exec = op_execs_[exec_time.first]; + for (const auto& p : exec_time.second.times()) { + exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + } + } + for (const auto& r : exec_.allocations()) { + allocations_.push_back(r); + } + for (const auto& m : exec_.memory_execs()) { + memory_execs_.push_back(m); + } + } + + private: + ExecProfile exec_; + // device -> vector of {op_start_micros, op_exec_micros} pairs. + // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros} + // For accelerator, vector size can be larger than 1, multiple kernel fires + // or in tf.while_loop. + std::map>> accelerator_execs_; + // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros} + // For cpu, vector size can be larger than 1 if in tf.while_loop. + std::map>> cpu_execs_; + // combines accelerator_execs_ and cpu_execs_. + std::map>> op_execs_; + // Each ExecMemory corresponds to one scheduling of the op. Normally, + // there are multiple schedulings in while_loop. + std::vector memory_execs_; + // All devices the op is associated with (e.g. gpu:0 (scheduling), + // gpu:0:stream:xx (kernel exec), cpu:0 host) + std::set devices_; + + // The history of accelerator allocations and deallocations of this step. 
+ std::vector allocations_; +}; + +#define GRAPH_NODE_BYTES(type) \ + do { \ + if (execs_.empty()) { \ + return 0; \ + } \ + if (step >= 0) { \ + auto exec = execs_.find(step); \ + if (exec == execs_.end()) return 0; \ + return exec->second.type##_bytes(); \ + } \ + \ + int64_t bytes = 0; \ + for (const auto& exec : execs_) { \ + bytes += exec.second.type##_bytes(); \ + } \ + return bytes / execs_.size(); \ + } while (0) + +class TFGraphNode { + public: + TFGraphNode(const ProfileNode& node, const ProfileProto& profile, + const std::map* id_to_string, + const std::map>* nodes_map) { + nodes_map_ = nodes_map; + FromProto(node, profile, id_to_string); + } + + TFGraphNode(const NodeDef* node, int64_t id, + const std::map>* nodes_map) { + nodes_map_ = nodes_map; + node_.set_id(id); + node_.set_name(node->name()); + node_.set_op(node->op()); + node_.set_float_ops(0); + + for (const auto& attr : node->attr()) { + (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second); + if (attr.first == "shape" && attr.second.has_shape()) { + if (!shape_.empty()) { + absl::FPrintF(stderr, "Found duplicated shapes!\n"); + continue; + } + shape_ = ShapeProtoToVec(attr.second.shape()); + } else if (attr.first == "_output_shapes" && attr.second.has_list()) { + if (!output_shapes_.empty()) { + absl::FPrintF(stderr, "Found duplicated output shapes!\n"); + continue; + } + for (int i = 0; i < attr.second.list().shape_size(); ++i) { + output_shapes_[i] = ShapeProtoToVec(attr.second.list().shape(i)); + } + } + } + op_types_.insert(node->op()); + } + + void AddInput(const string& input, int64_t output_index, int input_idx) { + inputs_[input_idx] = input; + src_output_idx_[input] = output_index; + } + + void AddOpType(const string& op_type) { op_types_.insert(op_type); } + + void AddStepStat(int64_t step, const string& device, + const NodeExecStats& step_stat); + + void AddFloatOps(int64_t float_ops) { node_.set_float_ops(float_ops); } + + // TODO(xpan): This could take a lot of memory. 
+ void AddCode(const CodeDef& code, + const std::map* id_to_string) { + if (!call_stack_) { + call_stack_ = std::make_unique(code, id_to_string); + } + } + + const string& name() const { return node_.name(); } + int64_t id() const { return node_.id(); } + const string& op() const { return node_.op(); } + const ProfileNode& node() { return node_; } + + bool trackable(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) return false; + + if (exec->second.all_start_micros() == 0) return false; + if (node_.canonical_device().empty() || node_.host_device().empty()) { + return false; + } + return true; + } + + const ProfileNode& ToProto( + const std::map>& nodes_map) { + node_.clear_shape(); + node_.mutable_shape()->Reserve(shape().size()); + for (int64_t s : shape()) { + node_.add_shape(s); + } + + node_.clear_op_types(); + node_.mutable_op_types()->Reserve(op_types().size()); + for (const string& t : op_types()) { + node_.add_op_types(t); + } + + node_.clear_execs(); + for (auto& exec : execs_) { + auto& exec_pb = (*node_.mutable_execs())[exec.first]; + exec_pb.MergeFrom(exec.second.ToProto()); + } + + node_.clear_inputs(); + for (const auto& inp : inputs_) { + (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id(); + } + + node_.clear_input_shapes(); + for (const auto& s : input_shapes_) { + auto& shape = (*node_.mutable_input_shapes())[s.first]; + for (int64_t d : s.second) { + shape.add_int64_values(d); + } + } + + node_.clear_output_shapes(); + for (const auto& s : output_shapes_) { + auto& shape = (*node_.mutable_output_shapes())[s.first]; + for (int64_t d : s.second) { + shape.add_int64_values(d); + } + } + + node_.clear_src_output_index(); + for (const auto& s : src_output_idx_) { + int64_t id = nodes_map.at(s.first)->id(); + (*node_.mutable_src_output_index())[id] = s.second; + } + + if (call_stack_) { + node_.clear_trace(); + node_.mutable_trace()->MergeFrom(call_stack_->code_def()); + } + return node_; + } + + void FromProto(const ProfileNode& node, const ProfileProto& profile, + const std::map* id_to_string) { + node_.Clear(); + node_.MergeFrom(node); + + call_stack_ = std::make_unique(node.trace(), id_to_string); + + op_types_.clear(); + op_types_.insert(node_.op_types().begin(), node_.op_types().end()); + + shape_.clear(); + for (int64_t s : node_.shape()) { + shape_.push_back(s); + } + + execs_.clear(); + for (const auto& exec_pb : node.execs()) { + auto& exec = execs_[exec_pb.first]; + exec.FromProto(exec_pb.second); + } + + inputs_.clear(); + for (const auto& inp : node.inputs()) { + inputs_[inp.first] = profile.nodes().at(inp.second).name(); + } + + input_shapes_.clear(); + for (const auto& s : node.input_shapes()) { + auto& shape = input_shapes_[s.first]; + for (const int64_t d : s.second.int64_values()) { + shape.push_back(d); + } + } + + output_shapes_.clear(); + for (const auto& s : node.output_shapes()) { + auto& shape = output_shapes_[s.first]; + for (const int64_t d : s.second.int64_values()) { + shape.push_back(d); + } + } + + src_output_idx_.clear(); + for (const auto& s : node.src_output_index()) { + src_output_idx_[profile.nodes().at(s.first).name()] = s.second; + } + } + + const std::map& inputs() const { return inputs_; } + + // Number of times the graph node is executed. When step < 0, the + // average number of times executed across all steps. 
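+// (Worked example with assumed counts: if the node ran 2, 4 and 6 times in
+//  three recorded steps, run_count(-1) returns (2 + 4 + 6) / 3 = 4, a
+//  specific recorded step returns that step's own count, and a step with no
+//  recorded execution returns 0.)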
+ int64_t run_count(int64_t step) const { + if (execs_.empty()) { + return 0; + } + if (step >= 0) { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.run_count(); + } + int64_t total_run_count = 0; + for (const auto& exec : execs_) { + total_run_count += exec.second.run_count(); + } + return total_run_count / execs_.size(); + } + // This is overall computation time, including both cpu and accelerator. + // Note, cpu and accelerator might or might not run in parallel. + int64_t exec_micros(int64_t step) const { + // Empty when no RunMetadata is provided. + if (execs_.empty()) { + return 0; + } + if (step >= 0) { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.exec_micros(); + } + + int64_t total_micros = 0; + for (const auto& exec : execs_) { + total_micros += exec.second.exec_micros(); + } + return total_micros / execs_.size(); + } + + // This is accelerator computation time of a step, or average of + // multiple step, when step < 0. + int64_t accelerator_exec_micros(int64_t step) const { + // Empty when no RunMetadata is provided. + if (execs_.empty()) { + return 0; + } + if (step >= 0) { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.accelerator_exec_micros(); + } + + int64_t total_micros = 0; + for (const auto& exec : execs_) { + total_micros += exec.second.accelerator_exec_micros(); + } + return total_micros / execs_.size(); + } + + // This is cpu computation time of a step, or average of + // multiple step, when step < 0. + int64_t cpu_exec_micros(int64_t step) const { + // Empty when no RunMetadata is provided. + if (execs_.empty()) { + return 0; + } + if (step >= 0) { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.cpu_exec_micros(); + } + + int64_t total_micros = 0; + for (const auto& exec : execs_) { + total_micros += exec.second.cpu_exec_micros(); + } + return total_micros / execs_.size(); + } + + int64_t requested_bytes(int64_t step) const { GRAPH_NODE_BYTES(requested); } + int64_t peak_bytes(int64_t step) const { GRAPH_NODE_BYTES(peak); } + int64_t residual_bytes(int64_t step) const { GRAPH_NODE_BYTES(residual); } + int64_t output_bytes(int64_t step) const { GRAPH_NODE_BYTES(output); } + + int64_t all_start_micros(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.all_start_micros(); + } + + int64_t latest_end_micros(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.latest_end_micros(); + } + + int64_t lastest_schedule_end_micros(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.lastest_schedule_end_micros(); + } + + const std::map>>& op_execs( + int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.op_execs(); + } + const std::map>>& cpu_execs( + int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.cpu_execs(); + } + + const std::map& all_op_execs() const { return execs_; } + + int64_t accelerator_temp_bytes(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.accelerator_temp_bytes(); + } + int64_t host_temp_bytes(int64_t step) 
const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.host_temp_bytes(); + } + int64_t accelerator_persistent_bytes() const { + int64_t persistent_bytes = 0; + for (const auto& exec : execs_) { + persistent_bytes = std::max(persistent_bytes, + exec.second.accelerator_persistent_bytes()); + } + return persistent_bytes; + } + std::map allocator_bytes_in_use(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_bytes_in_use_; + } + return exec->second.allocator_bytes_in_use(); + } + + const std::vector& allocations(int64_t step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_allocations_; + } + return exec->second.allocations(); + } + + int64_t parameters() const { + if (!shape().empty()) { + int64_t params = 1; + bool complete_shape = true; + for (int64_t d : shape()) { + // Sometimes parameters could be <0 when a dim is unknown. + if (d < 0) { + complete_shape = false; + break; + } + params *= d; + } + if (complete_shape) { + return params; + } else { + LOG(INFO) << "Incomplete shape.\n"; + } + } + return 0; + } + + int64_t float_ops(int64_t step) const { + // If not run, return static analysis. + if (execs_.empty()) { + return node_.float_ops(); + } + // Otherwise, return dynamic float_ops. + return node_.float_ops() * run_count(step); + } + const CallStack* call_stack() { return call_stack_.get(); } + string canonical_device() const { return node_.canonical_device(); } + string host_device() const { return node_.host_device(); } + const std::set& op_types() const { return op_types_; } + + const AttrValue* op_attrs(const string& name) const { + const auto it = node_.attrs().find(name); + if (it == node_.attrs().end()) { + return nullptr; + } + return &it->second; + } + + const std::vector& shape() const { return shape_; } + + const std::map>& output_shapes() const { + return output_shapes_; + } + + std::map> input_shapes() const { + std::map> input_shapes; + for (const auto& inp : inputs_) { + // Always create an empty vec even if the shape info might be missing. + std::vector& shape_vec = input_shapes[inp.first]; + if (!nodes_map_) continue; + auto input_it = nodes_map_->find(inp.second); + if (input_it == nodes_map_->end()) continue; + auto output_it = src_output_idx_.find(inp.second); + if (output_it == src_output_idx_.end()) continue; + + const TFGraphNode* input_node = input_it->second.get(); + if (!input_node) continue; + const auto& output_shapes = input_node->output_shapes(); + const auto& output_shape = output_shapes.find(output_it->second); + if (output_shape == output_shapes.end()) continue; + + if (output_shape != input_node->output_shapes().end()) { + shape_vec.assign(output_shape->second.begin(), + output_shape->second.end()); + } + } + return input_shapes; + } + + private: + // maps graph node name to TFGraphNode. Not owned. + const std::map>* nodes_map_; + // inputs to the node. input index -> input node name. + std::map inputs_; + // The output index of the source node. + std::map src_output_idx_; + // proto for serialize/deserialized representation of the node. + ProfileNode node_; + // Python call stack that creates the name. + std::unique_ptr call_stack_; + // Shape of the node (e.g. Variable) if available. + std::vector shape_; + // Won't missing input_idx. But some shapes might be empty (unknown). + std::map> input_shapes_; + // Could miss output_idx if no _output_shapes attr. some shapes can also + // be empty. 
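+// (Regarding input_shapes() above, a sketch of the lookup with a made-up node
+//  name: for an edge recorded as inputs_[0] = "conv1" and
+//  src_output_idx_["conv1"] = 0, the shape is copied from
+//  nodes_map_->at("conv1")->output_shapes() at output index 0; if any link in
+//  that chain (node, source index, or the producer's output shapes) is
+//  missing, the input shape simply stays an empty vector.)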
+ std::map> output_shapes_; + + std::set op_types_; + + std::map execs_; + + // Placeholder for empty cases. + std::map empty_bytes_in_use_; + std::map>> empty_execs_; + std::vector empty_allocations_; +}; + +class TFMultiGraphNode { + public: + TFMultiGraphNode(const string& name) + : name_(name), + step_(-1), + run_count_(0), + exec_micros_(0), + accelerator_exec_micros_(0), + cpu_exec_micros_(0), + requested_bytes_(0), + peak_bytes_(0), + residual_bytes_(0), + output_bytes_(0), + float_ops_(0), + parameters_(0) {} + + bool SnapshotNodes(int64_t step, const std::vector& type_regexes) { + run_count_ = 0; + exec_micros_ = 0; + accelerator_exec_micros_ = 0; + cpu_exec_micros_ = 0; + + requested_bytes_ = 0; + peak_bytes_ = 0; + residual_bytes_ = 0; + output_bytes_ = 0; + + float_ops_ = 0; + parameters_ = 0; + op_types_.clear(); + shapes_.clear(); + devices_.clear(); + snapshot_nodes_.clear(); + + step_ = step; + std::vector nodes = pick_nodes(type_regexes); + + if (nodes.empty()) { + return (type_regexes.size() == 1 && type_regexes[0] == ".*"); + } + + for (const TFGraphNode* node : nodes) { + op_types_.insert(node->op_types().begin(), node->op_types().end()); + + run_count_ += node->run_count(step); + exec_micros_ += node->exec_micros(step); + accelerator_exec_micros_ += node->accelerator_exec_micros(step); + cpu_exec_micros_ += node->cpu_exec_micros(step); + + requested_bytes_ += node->requested_bytes(step); + peak_bytes_ += node->peak_bytes(step); + residual_bytes_ += node->residual_bytes(step); + output_bytes_ += node->output_bytes(step); + + float_ops_ += node->float_ops(step); + parameters_ += node->parameters(); + if (!node->shape().empty()) { + shapes_.push_back(node->shape()); + } + devices_.insert(node->canonical_device()); + snapshot_nodes_[node->name()] = node; + } + return true; + } + + int64_t step() const { return step_; } + + void AddGraphNode(const TFGraphNode* node) { + if (nodes_.find(node->name()) != nodes_.end()) { + return; + } + nodes_[node->name()] = node; + } + + const std::map& graph_nodes() const { + return snapshot_nodes_; + } + + const string& name() const { return name_; } + + int64_t run_count() const { return run_count_; } + int64_t exec_micros() const { return exec_micros_; } + int64_t accelerator_exec_micros() const { return accelerator_exec_micros_; } + int64_t cpu_exec_micros() const { return cpu_exec_micros_; } + + int64_t requested_bytes() const { return requested_bytes_; } + int64_t peak_bytes() const { return peak_bytes_; } + int64_t residual_bytes() const { return residual_bytes_; } + int64_t output_bytes() const { return output_bytes_; } + + int64_t float_ops() const { return float_ops_; } + + int64_t parameters() const { return parameters_; } + + const std::set& devices() const { return devices_; } + + const std::set& op_types() const { return op_types_; } + + const std::vector>& shapes() const { return shapes_; } + + private: + std::vector pick_nodes( + const std::vector& type_regexes) { + if (type_regexes.empty()) { + return {}; + } + std::vector ret; + if (type_regexes.size() == 1 && type_regexes[0] == ".*") { + for (const auto& n : nodes_) { + ret.push_back(n.second); + } + return ret; + } + + for (const string& regex : type_regexes) { + for (const auto& n : nodes_) { + for (const string& type : n.second->op_types()) { + if (RE2::FullMatch(type, regex)) { + ret.push_back(n.second); + break; + } + } + } + } + return ret; + } + + const string name_; + int64_t step_; + // Snapshot based on type_regexes + std::set op_types_; + int64_t run_count_; + 
int64_t exec_micros_; + int64_t accelerator_exec_micros_; + int64_t cpu_exec_micros_; + + int64_t requested_bytes_; + int64_t peak_bytes_; + int64_t residual_bytes_; + int64_t output_bytes_; + int64_t float_ops_; + int64_t parameters_; + std::set devices_; + std::vector> shapes_; + std::map snapshot_nodes_; + + // Overall data held by the TFMultiGraphNode. + std::map nodes_; +}; + +bool IsPlacedOnCPU(const string& device); +bool IsPlacedOnAccelerator(const string& device); +bool CountAsAcceleratorTime(const string& device); +bool CountAsCPUTime(const string& device); +bool IsCanonicalDevice(const string& device); + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node_show.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node_show.h new file mode 100644 index 00000000..e3d4b86a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_node_show.h @@ -0,0 +1,160 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Node classes used for different views. They are wrappers with "show" +// methods. +// +// ScopeNode is for scope view. GraphNode is for graph view, CodeNode +// is for code view and OpNode for op view. +// ScopeNode and GraphNode each maps to one TFGraphNode. +// CodeNode and OpNode each maps to one TFMultiGraphNode. 
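The pick_nodes() helper above selects nodes whose op types match any of the requested regexes, with the single pattern ".*" short-circuiting to "take everything". A standalone sketch of that selection logic follows; `FakeNode` is a toy stand-in for TFGraphNode, and std::regex replaces the RE2 dependency purely to keep the example self-contained:

```cpp
#include <iostream>
#include <map>
#include <regex>
#include <set>
#include <string>
#include <vector>

// Hypothetical stand-in for TFGraphNode: just a name plus its op types.
struct FakeNode {
  std::string name;
  std::set<std::string> op_types;
};

std::vector<const FakeNode*> PickNodes(
    const std::map<std::string, const FakeNode*>& nodes,
    const std::vector<std::string>& type_regexes) {
  std::vector<const FakeNode*> picked;
  if (type_regexes.empty()) return picked;
  // ".*" alone means "every node", without running the regex engine at all.
  if (type_regexes.size() == 1 && type_regexes[0] == ".*") {
    for (const auto& n : nodes) picked.push_back(n.second);
    return picked;
  }
  for (const std::string& pattern : type_regexes) {
    std::regex re(pattern);
    for (const auto& n : nodes) {
      for (const std::string& type : n.second->op_types) {
        if (std::regex_match(type, re)) {  // full match, like RE2::FullMatch
          picked.push_back(n.second);
          break;  // One matching op type is enough for this node.
        }
      }
    }
  }
  return picked;
}

int main() {
  FakeNode a{"conv1", {"Conv2D", "_trainable_variables"}};
  FakeNode b{"loss", {"Mean"}};
  std::map<std::string, const FakeNode*> nodes = {{a.name, &a}, {b.name, &b}};
  for (const FakeNode* n : PickNodes(nodes, {"Conv.*"})) {
    std::cout << n->name << "\n";  // prints: conv1
  }
}
```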
+ +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_constants.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class ShowNode { + public: + explicit ShowNode(const TFGraphNode* node); + virtual ~ShowNode() = default; + + const string& name() const { return node->name(); } + GraphNodeProto* mutable_proto(); + const GraphNodeProto& proto() const; + + void ReInit(int64_t step); + + void AggregateTotalStats(ShowNode* node); + + void AddSelfToTotalStats(); + + void ResetTotalStats(); + + const TFGraphNode* node; + bool account; + string formatted_str; + + protected: + GraphNodeProto proto_; +}; + +class GraphNode : public ShowNode { + public: + explicit GraphNode(TFGraphNode* node) : ShowNode(node) {} + + bool Trackable(int64_t step) const { return node->trackable(step); } + + std::vector children; + std::vector show_children; +}; + +class ScopeNode : public ShowNode { + public: + explicit ScopeNode(const TFGraphNode* node) : ShowNode(node) {} + ~ScopeNode() override = default; + + std::vector children; + std::vector show_children; +}; + +class ShowMultiNode { + public: + explicit ShowMultiNode(TFMultiGraphNode* node); + virtual ~ShowMultiNode() = default; + + bool ReInit(int64_t step, const std::vector& type_regexes); + + const string& name() const { return node->name(); } + MultiGraphNodeProto* mutable_proto(); + const MultiGraphNodeProto& proto() const; + + void AggregateTotalStats(ShowMultiNode* node); + + void AddSelfToTotalStats(); + + void ResetTotalStats(); + + TFMultiGraphNode* node; + bool account; + bool show; + string formatted_str; + + protected: + MultiGraphNodeProto proto_; +}; + +class CodeNode : public ShowMultiNode { + public: + CodeNode(TFMultiGraphNode* node, const CallStack::Trace* trace, + const string& suffix) + : ShowMultiNode(node), trace_(trace), suffix_(suffix) {} + ~CodeNode() override = default; + + CodeNode* AddChildren(const string& name, const CallStack::Trace* trace, + const string suffix) { + auto it = children_.find(name); + if (it != children_.end()) { + return it->second.get(); + } + + graph_children_.push_back(std::make_unique(name)); + auto child = &children_[name]; + *child = + std::make_unique(graph_children_.back().get(), trace, suffix); + children.push_back(child->get()); + return child->get(); + } + + bool has_trace() const { return trace_ != nullptr; } + int32 lineno() const { return trace_->lineno(); } + string file() const { return trace_->file(); } + string function() const { return trace_->function() + suffix_; } + int32 func_start_line() const { return trace_->func_start_line(); } + + std::vector children; + std::vector show_children; + + private: + const CallStack::Trace* trace_; + string suffix_; + std::vector> graph_children_; + std::map> children_; +}; + +class OpNode : public ShowMultiNode { + public: + explicit OpNode(TFMultiGraphNode* node) : ShowMultiNode(node) {} + ~OpNode() override = default; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_ diff --git 
a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_op.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_op.h new file mode 100644 index 00000000..0aa4887e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_op.h @@ -0,0 +1,77 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Build a flat structure of ops. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_show_multi.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +// Organize tensorflow ops in a graph structure, pointing from output ops +// to input ops. +class TFOp : public TFMultiShow { + public: + explicit TFOp() : TFMultiShow() {} + ~TFOp() override = default; + + void AddNode(TFGraphNode* node) override; + + void Build() override; + + private: + const ShowMultiNode* ShowInternal(const Options& opts, + Timeline* timeline) override; + + int64_t SearchRoot(std::vector nodes, + const std::vector& regexes); + + bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts, + int depth) const override { + const int max_num_graph_nodes = node->node->graph_nodes().size(); + if (opts.min_occurrence > max_num_graph_nodes) { + return false; + } + return true; + } + + string FormatNode(OpNode* node, OpNode* root, const Options& opts) const; + string FormatMemoryNode(int64_t node_total_bytes, int64_t root_total_bytes, + int64_t node_bytes) const; + + std::unique_ptr root_; + std::map> cnodes_map_; + std::map> tfcnodes_map_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_scope.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_scope.h new file mode 100644 index 00000000..ede6d633 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_scope.h @@ -0,0 +1,76 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Build a tree structure based on the TensorFlow op names. +// For example, 'name1/name2' is a child of 'name1'. +// Stats are aggregated from descendants to ancestors. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/checkpoint_reader.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_show.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class TFScope : public TFShow { + public: + explicit TFScope(checkpoint::CheckpointReader* ckpt_reader) + : TFShow(ckpt_reader), root_(nullptr) {} + ~TFScope() override = default; + + void AddNode(TFGraphNode* node) override; + + void Build() override; + + private: + const ShowNode* ShowInternal(const Options& opts, + Timeline* timeline) override; + + ScopeNode* CreateParentNode(const string& name); + + std::vector SearchRoot(std::vector roots, + const std::vector& regexes); + + std::vector PrintScope(std::vector roots, + const Options& opts, int depth, + int last_ident); + + std::vector Account(const std::vector& roots, + const Options& opts); + + void Format(std::vector roots, string* display_str, + GraphNodeProto* proto); + + ScopeNode* root_; + std::vector> node_defs_; + std::map> parent_nodes_; + std::map> nodes_map_; +}; +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show.h new file mode 100644 index 00000000..ef713cbe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show.h @@ -0,0 +1,157 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Parent class and utilities for tfprof_graph and tfprof_scope. 
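The scope view declared above (TFScope) derives its tree purely from the `/`-separated op names, so 'name1/name2' hangs under 'name1', and then rolls stats up from descendants to ancestors. A small standalone sketch of that idea, with a toy `Scope` struct rather than the ScopeNode/TFGraphNode types:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Toy scope node: children keyed by the next path component.
struct Scope {
  int64_t self_micros = 0;   // stats recorded directly on this name
  int64_t total_micros = 0;  // self plus everything below it
  std::map<std::string, std::unique_ptr<Scope>> children;
};

// Insert "name1/name2/..." into the tree, creating parent scopes as needed.
void Insert(Scope* root, const std::string& name, int64_t micros) {
  Scope* cur = root;
  std::size_t start = 0;
  while (start <= name.size()) {
    std::size_t slash = name.find('/', start);
    std::string part = name.substr(
        start, slash == std::string::npos ? std::string::npos : slash - start);
    auto& child = cur->children[part];
    if (!child) child = std::make_unique<Scope>();
    cur = child.get();
    if (slash == std::string::npos) break;
    start = slash + 1;
  }
  cur->self_micros += micros;
}

// Aggregate stats from descendants to ancestors (post-order walk).
int64_t Aggregate(Scope* node) {
  node->total_micros = node->self_micros;
  for (auto& child : node->children) {
    node->total_micros += Aggregate(child.second.get());
  }
  return node->total_micros;
}

int main() {
  Scope root;
  Insert(&root, "name1/name2", 30);
  Insert(&root, "name1/name3", 20);
  Insert(&root, "name1", 5);
  Aggregate(&root);
  std::cout << root.children["name1"]->total_micros << "\n";  // 55
}
```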
+ +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/checkpoint_reader.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_constants.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_node_show.h" +#include "tensorflow/core/profiler/internal/tfprof_tensor.h" +#include "tensorflow/core/profiler/internal/tfprof_timeline.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { +class TFShow { + public: + explicit TFShow(checkpoint::CheckpointReader* ckpt_reader) + : ckpt_reader_(ckpt_reader) {} + virtual ~TFShow() = default; + virtual void AddNode(TFGraphNode* node) = 0; + virtual void Build() = 0; + virtual const GraphNodeProto& Show(const string& prefix, + const Options& opts) final; + + protected: + virtual const ShowNode* ShowInternal(const Options& opts, + Timeline* timeline) = 0; + + bool LookUpCheckPoint(const string& name, + std::unique_ptr* tensor); + + // Overridden by subclass if extra requirements need to be met. + virtual bool ShouldShowIfExtra(const ShowNode* node, const Options& opts, + int depth) const { + return true; + } + + bool ShouldShow(const ShowNode* node, const Options& opts, int depth) const; + + bool ShouldTrim(const ShowNode* node, + const std::vector& regexes) const; + + bool ReAccount(ShowNode* node, const Options& opts); + + string FormatNode(ShowNode* node, const Options& opts) const; + string FormatNodeMemory(ShowNode* node, int64_t bytes, + int64_t total_bytes) const; + + string FormatLegend(const Options& opts) const; + + template + std::vector SortNodes(const std::vector& nodes, const Options& opts) { + if (opts.order_by.empty() || nodes.empty()) { + return nodes; + } + std::vector sorted_nodes = nodes; + std::stable_sort(sorted_nodes.begin(), sorted_nodes.end(), + [&opts](const T* n1, const T* n2) { + if (n1->name() == kTFProfRoot) return true; + if (n2->name() == kTFProfRoot) return false; + bool name_cmp = n1->name() < n2->name(); + if (opts.order_by == kOrderBy[0]) { + return name_cmp; + } else if (opts.order_by == kOrderBy[1]) { + return n1->proto().total_requested_bytes() > + n2->proto().total_requested_bytes(); + } else if (opts.order_by == kOrderBy[2]) { + return n1->proto().total_peak_bytes() > + n2->proto().total_peak_bytes(); + } else if (opts.order_by == kOrderBy[3]) { + return n1->proto().total_residual_bytes() > + n2->proto().total_residual_bytes(); + } else if (opts.order_by == kOrderBy[4]) { + return n1->proto().total_output_bytes() > + n2->proto().total_output_bytes(); + } else if (opts.order_by == kOrderBy[5]) { + return n1->proto().total_exec_micros() > + n2->proto().total_exec_micros(); + } else if (opts.order_by == kOrderBy[6]) { + return n1->proto().total_accelerator_exec_micros() > + n2->proto().total_accelerator_exec_micros(); + } else if (opts.order_by == kOrderBy[7]) { + return n1->proto().total_cpu_exec_micros() > + n2->proto().total_cpu_exec_micros(); + } else if (opts.order_by == kOrderBy[8]) { + return n1->proto().total_parameters() > + n2->proto().total_parameters(); + } else if (opts.order_by == kOrderBy[9]) { + return n1->proto().total_float_ops() > + n2->proto().total_float_ops(); + } + return name_cmp; + }); + return 
sorted_nodes; + } + + checkpoint::CheckpointReader* ckpt_reader_; +}; + +template +string FormatTotalExecTime(const T* node, const Options& opts) { + string time = FormatTime(node->proto().total_exec_micros()); + if (node->account) { + time = FormatTime(node->proto().exec_micros()) + "/" + time; + } else { + time = "--/" + time; + } + return time; +} +template +string FormatCPUExecTime(const T* node, const Options& opts) { + string time = FormatTime(node->proto().total_cpu_exec_micros()); + if (node->account) { + time = FormatTime(node->proto().cpu_exec_micros()) + "/" + time; + } else { + time = "--/" + time; + } + return time; +} +template +string FormatAcceleratorExecTime(const T* node, const Options& opts) { + string time = FormatTime(node->proto().total_accelerator_exec_micros()); + if (node->account) { + time = FormatTime(node->proto().accelerator_exec_micros()) + "/" + time; + } else { + time = "--/" + time; + } + return time; +} +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show_multi.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show_multi.h new file mode 100644 index 00000000..1f424dd0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_show_multi.h @@ -0,0 +1,127 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Parent class and utilities for tfprof_code. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_constants.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_node_show.h" +#include "tensorflow/core/profiler/internal/tfprof_show.h" +#include "tensorflow/core/profiler/internal/tfprof_tensor.h" +#include "tensorflow/core/profiler/internal/tfprof_timeline.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class TFMultiShow { + public: + explicit TFMultiShow() = default; + virtual ~TFMultiShow() = default; + virtual void AddNode(TFGraphNode* node) = 0; + virtual void Build() = 0; + const MultiGraphNodeProto& Show(const string& prefix, const Options& opts); + + protected: + virtual const ShowMultiNode* ShowInternal(const Options& opts, + Timeline* timeline) = 0; + + bool LookUpCheckPoint(const string& name, + std::unique_ptr* tensor); + + // Overridden by subclass if extra requirements need to be met. 
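The SortNodes() template above is essentially a stable sort whose comparator is picked by the `-order_by` option: the profiler root is pinned to the front, the metric columns sort descending, and name order is both the `name` key and the fallback. A condensed standalone sketch of that dispatch, with `ToyNode` standing in for the proto-backed ShowNode/ShowMultiNode and only two of the keys:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ToyNode {
  std::string name;
  int64_t exec_micros = 0;
  int64_t requested_bytes = 0;
};

const char* const kRoot = "_TFProfRoot";

std::vector<ToyNode> SortNodes(std::vector<ToyNode> nodes,
                               const std::string& order_by) {
  std::stable_sort(
      nodes.begin(), nodes.end(),
      [&order_by](const ToyNode& a, const ToyNode& b) {
        if (a.name == kRoot) return true;   // root always sorts first
        if (b.name == kRoot) return false;
        if (order_by == "micros") return a.exec_micros > b.exec_micros;
        if (order_by == "bytes") return a.requested_bytes > b.requested_bytes;
        return a.name < b.name;             // default: ascending by name
      });
  return nodes;
}

int main() {
  std::vector<ToyNode> nodes = {{"b", 10, 100}, {kRoot, 0, 0}, {"a", 50, 10}};
  for (const ToyNode& n : SortNodes(nodes, "micros")) std::cout << n.name << " ";
  std::cout << "\n";  // _TFProfRoot a b
}
```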
+ virtual bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts, + int depth) const { + return true; + } + + bool ShouldShow(const ShowMultiNode* node, const Options& opts, + int depth) const; + + bool ShouldTrim(const ShowMultiNode* node, + const std::vector& regexes) const; + + bool ReAccount(ShowMultiNode* node, const Options& opts); + + string FormatLegend(const Options& opts) const; + string FormatInputShapes(const MultiGraphNodeProto& proto) const; + std::vector FormatTimes(const ShowMultiNode* node, + const Options& opts) const; + + template + std::vector SortNodes(const std::vector& nodes, const Options& opts) { + if (opts.order_by.empty() || nodes.empty()) { + return nodes; + } + std::vector sorted_nodes = nodes; + std::stable_sort(sorted_nodes.begin(), sorted_nodes.end(), + [&opts](const T* n1, const T* n2) { + if (n1->name() == kTFProfRoot) return true; + if (n2->name() == kTFProfRoot) return false; + bool name_cmp = n1->name() < n2->name(); + if (opts.order_by == kOrderBy[0]) { + return name_cmp; + } else if (opts.order_by == kOrderBy[1]) { + return n1->proto().total_requested_bytes() > + n2->proto().total_requested_bytes(); + } else if (opts.order_by == kOrderBy[2]) { + return n1->proto().total_peak_bytes() > + n2->proto().total_peak_bytes(); + } else if (opts.order_by == kOrderBy[3]) { + return n1->proto().total_residual_bytes() > + n2->proto().total_residual_bytes(); + } else if (opts.order_by == kOrderBy[4]) { + return n1->proto().total_output_bytes() > + n2->proto().total_output_bytes(); + } else if (opts.order_by == kOrderBy[5]) { + return n1->proto().total_exec_micros() > + n2->proto().total_exec_micros(); + } else if (opts.order_by == kOrderBy[6]) { + return n1->proto().total_accelerator_exec_micros() > + n2->proto().total_accelerator_exec_micros(); + } else if (opts.order_by == kOrderBy[7]) { + return n1->proto().total_cpu_exec_micros() > + n2->proto().total_cpu_exec_micros(); + } else if (opts.order_by == kOrderBy[8]) { + return n1->proto().total_parameters() > + n2->proto().total_parameters(); + } else if (opts.order_by == kOrderBy[9]) { + return n1->proto().total_float_ops() > + n2->proto().total_float_ops(); + } else if (opts.order_by == kOrderBy[10]) { + return n1->node->graph_nodes().size() > + n2->node->graph_nodes().size(); + } + return name_cmp; + }); + return sorted_nodes; + } +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_stats.h new file mode 100644 index 00000000..67cbdf56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_stats.h @@ -0,0 +1,127 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Core API of tfprof. +// 1. 
Load protos generated from a tensorflow model. +// 2. Build in-memory representations of the tensorflow model, annotate the +// representation with various stats, such as params,times,memory,etc. +// 3. Accept command and options to selectively aggregate stats for analysis +// and print out the results. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/checkpoint_reader.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/internal/tfprof_code.h" +#include "tensorflow/core/profiler/internal/tfprof_graph.h" +#include "tensorflow/core/profiler/internal/tfprof_node.h" +#include "tensorflow/core/profiler/internal/tfprof_op.h" +#include "tensorflow/core/profiler/internal/tfprof_scope.h" +#include "tensorflow/core/profiler/internal/tfprof_show.h" +#include "tensorflow/core/profiler/internal/tfprof_utils.h" +#include "tensorflow/core/profiler/tfprof_log.pb.h" +#include "tensorflow/core/profiler/tfprof_options.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { +namespace tfprof { + +class TFStats { + public: + TFStats(std::unique_ptr graph, + std::unique_ptr run_meta, + std::unique_ptr op_log, + std::unique_ptr ckpt_reader); + + TFStats(const string& filename, + std::unique_ptr ckpt_reader); + + ~TFStats() = default; + + const std::map>& nodes() const { + return nodes_map_; + } + const std::set& steps() const { return steps_; } + bool has_code_traces() const { return has_code_traces_; } + double run_coverage() const { + return covered_nodes_.size() / (nodes_map_.size() + 1e-10); + } + + void BuildView(const string& cmd); + void BuildAllViews(); + + // Note: Must first BuildView(view_foo) before ShowXXX(view_foo) methods. + // + // Organize the TensorFlow model as different types of views, and generate + // outputs for profiling. + // TODO(xpan): Should it return reference here? + const GraphNodeProto& ShowGraphNode(const string& cmd, + const Options& opts) const; + const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd, + const Options& opts) const; + + // Add a (partial) graph to existing graph. + void AddGraph(std::unique_ptr graph); + + // Add a step of run time meta data. + void AddRunMeta(int64_t step, std::unique_ptr run_meta); + // Add tfprof operation meta data, such as customized op type, float_ops, + // and code traces. + void AddOpLogProto(std::unique_ptr op_log); + + void SerializeToString(string* content); + void WriteProfile(const string& filename); + + // For test purpose only. + void AddNodeForTest(int64_t step, std::unique_ptr node); + + private: + bool Validate(const Options& opts) const; + string MaybeReportMissingTrace() const; + + std::set steps_; + bool has_code_traces_; + bool miss_accelerator_stream_; + std::unique_ptr scope_view_; + std::unique_ptr graph_view_; + std::unique_ptr code_view_; + std::unique_ptr op_view_; + std::unique_ptr ckpt_reader_; + // TODO(xpan): Store TFGraphNode instead of TFGraphNode* to avoid large + // number of dynamic alloc. + // Maps from graph node name to TFGraphNode. 
+ std::map> nodes_map_; + GraphNodeProto empty_graph_node_; + MultiGraphNodeProto empty_multi_graph_node_; + + std::map id_to_string_; + // Graph nodes covered by RunMetadata, that is traced with run time stats. + std::set covered_nodes_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_tensor.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_tensor.h new file mode 100644 index 00000000..4a04b005 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_tensor.h @@ -0,0 +1,175 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// TFProf representation of a Tensor's value. +// 1. Multi-dimension tensor is flattened in row major, and stored in proto. +// 2. integer are up-casted to int64. floats are up-casted to double. string +// is not supported by TensorFlow CheckPointReader library, though it is +// supported in current code. + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_ + +#include +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/profiler/tfprof_output.pb.h" + +namespace tensorflow { +namespace tfprof { + +class TFProfTensor { + public: + explicit TFProfTensor(std::unique_ptr tensor) + : tensor_(std::move(tensor)) { + Build(); + } + + // If pointers are provided, they are filled by the method. + void Display(string* formatted_str, TFProfTensorProto* tfprof_tensor_pb); + + private: + // Max length of tensor value displayed to CLI. + const int64_t kTFProfTenosrMaxDisplayLen = 10000; + // Max length after which a latency warning will be printed. 
+ const int64_t kTFProfTensorMaxWarnLen = 100000; + + void Build(); + + template + bool AddValue(const T& value, TFProfTensorProto* dim) { + std::ostringstream sstream; + sstream << value; + if (typeid(value) == typeid(double)) { + double double_val = 0.0; + CHECK(absl::SimpleAtod(sstream.str(), &double_val)); // Crash OK + dim->add_value_double(double_val); + absl::StrAppendFormat(&formatted_str_, "%.2f ", + dim->value_double(dim->value_double_size() - 1)); + } else if (typeid(value) == typeid(int64_t)) { + int64_t int64_val = 0; + CHECK(absl::SimpleAtoi(sstream.str(), &int64_val)); // Crash OK + dim->add_value_int64(int64_val); + absl::StrAppendFormat(&formatted_str_, "%d ", + dim->value_int64(dim->value_int64_size() - 1)); + } else if (typeid(value) == typeid(string)) { + dim->add_value_str(sstream.str()); + absl::StrAppend(&formatted_str_, "'", + dim->value_str(dim->value_str_size() - 1), "' "); + } else { + CHECK(false) << "Unsupported type: " << typeid(value).name(); + } + } + + // It assumes the flatten values are stored in row-major, which is mentioned + // indirectly at various places: + // TODO(xpan): Further verifying it. + template + int64_t BuildOutput(int64_t start, int depth, const std::vector& values, + TFProfTensorProto* dim) { + formatted_str_ += "["; + int64_t nstart = start; + if (tensor_->dims() == 0 && values.size() == 1) { + std::ostringstream sstream; + sstream << values[nstart]; + + if (typeid(values[nstart]) == typeid(double)) { + double double_val = 0.0; + CHECK(absl::SimpleAtod(sstream.str(), &double_val)); // Crash OK + dim->add_value_double(double_val); + absl::StrAppendFormat(&formatted_str_, "%.2f ", + dim->value_double(dim->value_double_size() - 1)); + } else if (typeid(values[nstart]) == typeid(int64_t)) { + int64_t int64_val = 0; + CHECK(absl::SimpleAtoi(sstream.str(), &int64_val)); // Crash OK + dim->add_value_int64(int64_val); + absl::StrAppendFormat(&formatted_str_, "%d ", + dim->value_int64(dim->value_int64_size() - 1)); + } else if (typeid(values[nstart]) == typeid(string)) { + dim->add_value_str(sstream.str()); + absl::StrAppend(&formatted_str_, "'", + dim->value_str(dim->value_str_size() - 1), "' "); + } else { + CHECK(false) << "Unsupported type: " << typeid(values[nstart]).name(); + } + } else { + for (int i = 0; i < tensor_->dim_size(depth); i++) { + // Last dimension, pull the values. + if (depth == tensor_->dims() - 1) { + std::ostringstream sstream; + sstream << values[nstart]; + + if (typeid(values[nstart]) == typeid(double)) { + double double_val = 0.0; + CHECK(absl::SimpleAtod(sstream.str(), &double_val)); // Crash OK + dim->add_value_double(double_val); + absl::StrAppendFormat( + &formatted_str_, "%.2f ", + dim->value_double(dim->value_double_size() - 1)); + } else if (typeid(values[nstart]) == typeid(int64_t)) { + int64_t int64_val = 0; + CHECK(absl::SimpleAtoi(sstream.str(), &int64_val)); // Crash OK + dim->add_value_int64(int64_val); + absl::StrAppendFormat( + &formatted_str_, "%d ", + dim->value_int64(dim->value_int64_size() - 1)); + } else if (typeid(values[nstart]) == typeid(string)) { + dim->add_value_str(sstream.str()); + absl::StrAppend(&formatted_str_, "'", + dim->value_str(dim->value_str_size() - 1), "' "); + } else { + CHECK(false) << "Unsupported type: " + << typeid(values[nstart]).name(); + } + ++nstart; + } else { + // Not-last dimension. Drill deeper. 
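BuildOutput() walks the flattened, row-major value buffer with one recursion level per tensor dimension: the last dimension consumes values, every other dimension drills one level deeper, as the comment above notes. A stripped-down standalone version of that recursion (plain int64 values instead of the proto and typeid machinery, and assuming rank >= 1 for brevity):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Recursively format a row-major buffer as nested brackets.
// dims: tensor shape; depth: current dimension; start: next unread index.
int64_t BuildOutput(int64_t start, int depth, const std::vector<int64_t>& values,
                    const std::vector<int64_t>& dims, std::string* out) {
  *out += "[";
  int64_t next = start;
  for (int64_t i = 0; i < dims[depth]; ++i) {
    if (depth == static_cast<int>(dims.size()) - 1) {
      *out += std::to_string(values[next]) + " ";  // last dimension: emit value
      ++next;
    } else {
      next = BuildOutput(next, depth + 1, values, dims, out);  // drill deeper
    }
  }
  *out += "]";
  return next;
}

int main() {
  // A 2x3 tensor stored row-major: {{1,2,3},{4,5,6}}.
  std::vector<int64_t> values = {1, 2, 3, 4, 5, 6};
  std::string out;
  BuildOutput(0, 0, values, {2, 3}, &out);
  std::cout << out << "\n";  // [[1 2 3 ][4 5 6 ]]
}
```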
+ nstart = BuildOutput(nstart, depth + 1, values, dim); + } + } + } + if (formatted_str_.length() > kTFProfTenosrMaxDisplayLen) { + formatted_str_ = formatted_str_.substr(0, kTFProfTenosrMaxDisplayLen); + } + formatted_str_ += "],\n"; + return nstart; + } + + template + void GetValueVec(std::vector* value_vec) { + // TODO(xpan): Address the huge tensor problem. + if (tensor_->NumElements() > kTFProfTensorMaxWarnLen) { + absl::FPrintF(stderr, "Showing huge tensor, the tool might halt...\n"); + } + auto values = tensor_->flat(); + for (int64_t i = 0; i < tensor_->NumElements(); i++) { + value_vec->push_back(static_cast(values(i))); + } + } + + TFProfTensorProto tfprof_tensor_pb_; + std::unique_ptr tensor_; + string formatted_str_; +}; +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_timeline.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_timeline.h new file mode 100644 index 00000000..b50c5633 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_timeline.h @@ -0,0 +1,197 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_ + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "json/json.h" +#include "tensorflow/core/profiler/internal/tfprof_node_show.h" + +namespace tensorflow { +namespace tfprof { + +typedef std::map Event; + +// Class for generating timeline json output. +class ChromeTraceFormatter { + public: + ChromeTraceFormatter() = default; + // The following methods creates timeline nodes. See chrome tracing format + // document for details. + Json::Value CreateEvent(const string& ph, const string& category, + const string& name, int64_t pid, int64_t tid, + int64_t ts); + + void EmitPID(const string& name, int64_t pid); + + void EmitRegion(int64_t ts, int64_t duration, int64_t pid, int64_t tid, + const string& category, const string& name, Json::Value args); + + void EmitFlowStart(const string& name, int64_t ts, int64_t pid, int64_t tid, + int64_t flow_id); + + void EmitFlowEnd(const string& name, int64_t ts, int64_t pid, int64_t tid, + int64_t flow_id); + + void EmitCounter(const string& category, const string& name, int64_t pid, + int64_t ts, const string& device, int64_t bytes, + const std::map>& tensor_mem); + + string Format(); + + private: + // A event is a visualization unit in timeline. + std::vector events_; + std::vector metadata_; +}; + +// A process (time series of events) in the timeline. +class Process { + public: + Process(const string& device, int64_t pid) : device(device), pid(pid) {} + + // Each lane is a map from start_time to end_time. + std::vector> lanes; + // device for the time series. 
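The ChromeTraceFormatter above emits events in the Chrome tracing format (viewable in chrome://tracing or Perfetto): each event carries a phase `ph`, a `pid`/`tid` lane, a microsecond timestamp `ts`, and, for complete events, a duration `dur`. The real class builds these records with jsoncpp `Json::Value` objects; the dependency-free sketch below only illustrates the shape of one such record:

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

// Build one Chrome-trace "complete" event ("ph":"X") as a JSON string.
// Assumes name/category contain no characters that need JSON escaping.
std::string CompleteEvent(const std::string& category, const std::string& name,
                          int64_t pid, int64_t tid, int64_t ts_micros,
                          int64_t dur_micros) {
  std::ostringstream os;
  os << "{\"ph\":\"X\",\"cat\":\"" << category << "\",\"name\":\"" << name
     << "\",\"pid\":" << pid << ",\"tid\":" << tid << ",\"ts\":" << ts_micros
     << ",\"dur\":" << dur_micros << "}";
  return os.str();
}

int main() {
  // A 120us op on process 1 (a device lane), thread 0, starting at t=1000us.
  std::cout << "{\"traceEvents\":["
            << CompleteEvent("Op", "conv1", 1, 0, 1000, 120) << "]}\n";
}
```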
+ string device; + // unique id for the time series. + int64_t pid; +}; + +class TimeNode { + public: + TimeNode(Process* process, GraphNode* node, int64_t start_micros, + int64_t exec_micros) + : process(process), + node(node), + start_micros(start_micros), + exec_micros(exec_micros), + tid(-1) {} + virtual ~TimeNode() = default; + + const string& name() { return node->name(); } + + Process* process; + GraphNode* node; + int64_t start_micros; + int64_t exec_micros; + int64_t tid; + std::vector next_tnodes; +}; + +// Tracking the memory based on the op input/output, temporary bytes and +// persistent bytes. +// Currently, we calculate a "predicted" memory, but do not use it for display. +// The displayed memory timeline is directly from the TensorFlow allocator, +// which is the groundtruth. +class MemoryTracker { + public: + class Device { + public: + // map from tensor name to a pair of . + std::map> tensor_allocs; + // ground truth memory stats. time->bytes. + std::map allocations; + // tracked allocations, might miss some bytes. + std::map tracked_allocations; + }; + + void TrackNode(int64_t step, const GraphNode* node); + + const std::map& devices() const { return devices_; } + + private: + std::map devices_; +}; + +class Timeline { + public: + Timeline(int64_t step, const string& outfile) + : step_(step), outfile_(outfile) {} + ~Timeline() = default; + + int64_t step() const { return step_; } + void SetStep(int64_t step) { step_ = step; } + + void GenerateGraphTimeline(const std::vector& gnodes); + + void GenerateScopeTimeline(const ScopeNode* node); + + void GenerateCodeTimeline(const CodeNode* node); + + private: + void TrackNode(const GraphNode* node) { mem_tracker_.TrackNode(step_, node); } + + void OutputTimeline(); + + template + void EmitTreeNode(const Node* node, int64_t start_time, int64_t duration, + int64_t depth, std::set* visited_depth) { + if (visited_depth->find(depth) == visited_depth->end()) { + chrome_formatter_.EmitPID(absl::StrCat("Scope:", depth), depth); + visited_depth->insert(depth); + } + + Json::Value args(Json::objectValue); + args["name"] = Json::Value(node->name()); + args["op"] = Json::Value(node->name()); + chrome_formatter_.EmitRegion(start_time, duration, depth, 0, "Op", + node->name(), args); + + int64_t total_micros = 0; + int64_t c_start_time = start_time; + for (const Node* child : node->show_children) { + int64_t total_exec_micros = child->proto().total_exec_micros(); + if (total_exec_micros <= 0) { + continue; + } + EmitTreeNode(child, c_start_time, total_exec_micros, depth + 1, + visited_depth); + c_start_time += total_exec_micros; + total_micros += total_exec_micros; + } + CHECK(total_micros <= duration) << node->name() << " parent:" << duration + << " children:" << total_micros; + } + + void AllocateTimeNodes(GraphNode* gnode); + + void AllocateLanes(); + + int64_t AllocatePID(); + + int64_t step_; + const string outfile_; + int64_t next_pid_ = 0; + MemoryTracker mem_tracker_; + ChromeTraceFormatter chrome_formatter_; + std::map device_pids_; + + std::map> process_; + std::map>> + alloc_nodes_; + std::map>> tnodes_; +}; + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_utils.h new file mode 100644 index 00000000..7f4e49ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/internal/tfprof_utils.h @@ -0,0 
+1,73 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_ + +#include +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/profiler/tfprof_options.h" + +namespace tensorflow { +namespace tfprof { +string FormatNumber(int64_t n); + +string FormatTime(int64_t micros); + +string FormatMemory(int64_t bytes); + +string FormatShapes(const std::vector& shapes); + +absl::Status ParseCmdLine(const string& line, string* cmd, + tensorflow::tfprof::Options* opts); + +string StringReplace(const string& str, const string& oldsub, + const string& newsub); + +template +absl::Status ReadProtoFile(Env* env, const string& fname, T* proto, + bool binary_first) { + string out; + absl::Status s = ReadFileToString(env, fname, &out); + if (!s.ok()) return s; + + if (binary_first) { + if (ReadBinaryProto(tensorflow::Env::Default(), fname, proto).ok()) { + return absl::Status(); + } else if (protobuf::TextFormat::ParseFromString(out, proto)) { + return absl::Status(); + } + } else { + if (protobuf::TextFormat::ParseFromString(out, proto)) { + return absl::Status(); + } else if (ReadBinaryProto(tensorflow::Env::Default(), fname, proto).ok()) { + return absl::Status(); + } + } + return errors::InvalidArgument("Cannot parse proto file."); +} + +void PrintHelp(); + +// Generate helper message based on the command and options. +string QueryDoc(const string& cmd, const Options& opts); + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/annotated_traceme.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/annotated_traceme.h new file mode 100644 index 00000000..150b8097 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/annotated_traceme.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/profiler/lib/scoped_annotation.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +namespace tensorflow { +namespace profiler { + +// Combination of TraceMe and ScopedAnnotation which share the same label. +// Optimization are done to ensure the label generation are done once. +class AnnotatedTraceMe { + public: + template + explicit AnnotatedTraceMe(NameGeneratorT&& name_generator, int level = 1) { + DCHECK_GE(level, 1); + bool annotation_enabled = tsl::profiler::ScopedAnnotation::IsEnabled(); + bool traceme_enabled = tsl::profiler::TraceMe::Active(level); + if (TF_PREDICT_TRUE(!annotation_enabled && !traceme_enabled)) { + return; + } + std::string name = name_generator(); + if (annotation_enabled) { + scoped_annotation_.emplace(name); + } + if (TF_PREDICT_TRUE(traceme_enabled)) { + trace_me_.emplace([&name] { return std::move(name); }, level); + } + } + + private: + std::optional trace_me_; + std::optional scoped_annotation_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/connected_traceme.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/connected_traceme.h new file mode 100644 index 00000000..e696cdaf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/connected_traceme.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_CONNECTED_TRACEME_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_CONNECTED_TRACEME_H_ + +#include +#include + +#include "absl/base/macros.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/profiler/lib/context_types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" +#include "tsl/profiler/lib/connected_traceme.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
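The AnnotatedTraceMe constructor above only pays for building the label when at least one of its two consumers (the annotation stack or TraceMe) is active, and it generates the label exactly once even if both are. The pattern, reduced to a standalone sketch with plain bools and a callable in place of the real enabled checks:

```cpp
#include <iostream>
#include <string>

// Stand-ins for the two "is anyone listening?" checks.
bool AnnotationEnabled() { return false; }
bool TraceMeEnabled() { return true; }

// Calls name_generator at most once, and only if some consumer is enabled.
template <typename NameGeneratorT>
void Record(NameGeneratorT&& name_generator) {
  const bool annotate = AnnotationEnabled();
  const bool trace = TraceMeEnabled();
  if (!annotate && !trace) return;      // fast path: the label is never built
  std::string name = name_generator();  // built once, shared by both sinks
  if (annotate) std::cout << "annotate: " << name << "\n";
  if (trace) std::cout << "trace:    " << name << "\n";
}

int main() {
  int step = 42;
  // The lambda runs only because TraceMeEnabled() returned true above.
  Record([&] { return "train_step#" + std::to_string(step); });
}
```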
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using TraceMeConsumer ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::TraceMeConsumer; // NOLINT +using TraceMeProducer ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::TraceMeProducer; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_CONNECTED_TRACEME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/context_types.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/context_types.h new file mode 100644 index 00000000..dbb7fc2e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/context_types.h @@ -0,0 +1,48 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_CONTEXT_TYPES_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_CONTEXT_TYPES_H_ + +#include + +#include "absl/base/macros.h" +#include "tsl/profiler/lib/context_types.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using ContextType ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ContextType; // NOLINT + +ABSL_DEPRECATE_AND_INLINE() +inline const char* GetContextTypeString( + tsl::profiler::ContextType context_type) { + return tsl::profiler::GetContextTypeString(context_type); +} + +ABSL_DEPRECATE_AND_INLINE() +inline tsl::profiler::ContextType GetSafeContextType(uint32_t context_type) { + return tsl::profiler::GetSafeContextType(context_type); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_CONTEXT_TYPES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/device_profiler_session.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/device_profiler_session.h new file mode 100644 index 00000000..179a3795 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/device_profiler_session.h @@ -0,0 +1,83 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_DEVICE_PROFILER_SESSION_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_DEVICE_PROFILER_SESSION_H_ + +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/status.h" + +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/profiler/convert/xplane_to_step_stats.h" +#include "tensorflow/core/profiler/lib/profiler_session.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#endif +#include "tsl/profiler/protobuf/profiler_options.pb.h" + +namespace tensorflow { + +// Wraps a ProfilerSession configured to collect only device traces. +// Returns data in StepStats format. +class DeviceProfilerSession { + public: + // Creates a DeviceProfilerSession and starts tracing. + // Traces GPU devices if present. + // Does not trace TPU devices (not supported). + static std::unique_ptr Create() { +#if !defined(IS_MOBILE_PLATFORM) + ProfileOptions options = tsl::ProfilerSession::DefaultOptions(); + options.set_host_tracer_level(0); + options.set_device_type(ProfileOptions::GPU); + return absl::WrapUnique(new DeviceProfilerSession(options)); +#else + return nullptr; +#endif + } + + // Stops tracing and converts the data to StepStats format. + // Should be called at most once. + absl::Status CollectData(StepStats* step_stats) { +#if defined(IS_MOBILE_PLATFORM) + return errors::Unimplemented("Profiling not supported on mobile platform."); +#else + profiler::XSpace space; + TF_RETURN_IF_ERROR(profiler_session_->CollectData(&space)); + profiler::ConvertGpuXSpaceToStepStats(space, step_stats); + return absl::OkStatus(); +#endif + } + + private: + // Constructs an instance of the class and starts profiling + explicit DeviceProfilerSession(const ProfileOptions& options) +#if !defined(IS_MOBILE_PLATFORM) + : profiler_session_(tsl::ProfilerSession::Create(options)) +#endif + { + } + + // DeviceProfilerSession is neither copyable nor movable. + DeviceProfilerSession(const DeviceProfilerSession&) = delete; + DeviceProfilerSession& operator=(const DeviceProfilerSession&) = delete; + +#if !defined(IS_MOBILE_PLATFORM) + // TODO(b/256013238) + std::unique_ptr profiler_session_; +#endif +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_PROFILER_LIB_DEVICE_PROFILER_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_controller.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_controller.h new file mode 100644 index 00000000..21936dcd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_controller.h @@ -0,0 +1,40 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_CONTROLLER_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_CONTROLLER_H_ + +#include + +#include "absl/base/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/profiler/lib/profiler_interface.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tsl/profiler/lib/profiler_controller.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using ProfilerController ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ProfilerController; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_CONTROLLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_factory.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_factory.h new file mode 100644 index 00000000..ebba761b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_factory.h @@ -0,0 +1,64 @@ +/* Copyright 2019 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_FACTORY_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_FACTORY_H_ + +#include +#include +#include +#include + +#include "absl/base/macros.h" +#include "tensorflow/core/profiler/lib/profiler_interface.h" +#include "tsl/profiler/lib/profiler_factory.h" +#include "tsl/profiler/protobuf/profiler_options.pb.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +// A ProfilerFactory returns an instance of ProfilerInterface if ProfileOptions +// require it. Otherwise, it might return nullptr. +using ProfilerFactor ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ProfilerFactory; // NOLINT + +// Registers a profiler factory. Should be invoked at most once per factory. +ABSL_DEPRECATE_AND_INLINE() +inline void RegisterProfilerFactory(tsl::profiler::ProfilerFactory factory) { + tsl::profiler::RegisterProfilerFactory(std::move(factory)); +} + +// Invokes all registered profiler factories with the given options, and +// returns the instantiated (non-null) profiler interfaces. +ABSL_DEPRECATE_AND_INLINE() +inline std::vector> +CreateProfilers(const tensorflow::ProfileOptions& options) { + return tsl::profiler::CreateProfilers(options); +} + +// For testing only. 
+ABSL_DEPRECATE_AND_INLINE() +inline void ClearRegisteredProfilersForTest() { + tsl::profiler::ClearRegisteredProfilersForTest(); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_interface.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_interface.h new file mode 100644 index 00000000..11423c1a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_interface.h @@ -0,0 +1,37 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_INTERFACE_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_INTERFACE_H_ + +#include "absl/base/macros.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tsl/profiler/lib/profiler_interface.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using ProfilerInterface ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ProfilerInterface; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_lock.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_lock.h new file mode 100644 index 00000000..7480df58 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_lock.h @@ -0,0 +1,36 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_LOCK_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_LOCK_H_ + +#include "absl/base/macros.h" +#include "tensorflow/core/platform/statusor.h" +#include "tsl/profiler/lib/profiler_lock.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
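The profiler_factory.h shim above forwards registration to the TSL registry. The sketch below shows how a backend could hook in, assuming tsl::profiler::ProfilerFactory is callable with the ProfileOptions proto and returns a unique_ptr to ProfilerInterface; the device_tracer_level() check and the nullptr placeholder are illustrative, not part of this diff.

#include <memory>

#include "tensorflow/core/profiler/lib/profiler_factory.h"

void RegisterMyDeviceProfiler() {
  tensorflow::profiler::RegisterProfilerFactory(
      [](const tensorflow::ProfileOptions& options)
          -> std::unique_ptr<tsl::profiler::ProfilerInterface> {
        // Only participate when device tracing was requested (assumed field).
        if (options.device_tracer_level() == 0) return nullptr;
        return nullptr;  // placeholder: construct the backend's profiler here
      });
}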
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using ProfilerLock ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ProfilerLock; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_LOCK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_session.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_session.h new file mode 100644 index 00000000..76099cc1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/profiler_session.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_ + +#include "absl/base/macros.h" +#include "tsl/profiler/lib/profiler_session.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { + +using ProfilerSession ABSL_DEPRECATE_AND_INLINE() = + tsl::ProfilerSession; // NOLINT + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_annotation.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_annotation.h new file mode 100644 index 00000000..8fa9fd67 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_annotation.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_SCOPED_ANNOTATION_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_SCOPED_ANNOTATION_H_ + +#include + +#include +#include +#include +#include + +#include "absl/base/macros.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/profiler/lib/scoped_annotation.h" + +#if !defined(IS_MOBILE_PLATFORM) +#include "xla/tsl/profiler/backends/cpu/annotation_stack.h" +#endif + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
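For the ProfilerSession alias in profiler_session.h above, a sketch mirroring how DeviceProfilerSession drives the same API earlier in this diff: build options, create a session, run the workload, then collect an XSpace. RunWorkload() and the chosen tracer level are assumptions.

#include <memory>

#include "absl/status/status.h"
#include "tensorflow/core/profiler/lib/profiler_session.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"

void RunWorkload();  // hypothetical, assumed to exist elsewhere

void CaptureTrace() {
  tensorflow::ProfileOptions options = tsl::ProfilerSession::DefaultOptions();
  options.set_host_tracer_level(2);  // assumed level; 0 would disable host tracing
  std::unique_ptr<tsl::ProfilerSession> session =
      tsl::ProfilerSession::Create(options);
  RunWorkload();
  tensorflow::profiler::XSpace space;
  absl::Status status = session->CollectData(&space);
  // `space` now holds raw XPlane data; converters (e.g. xplane_to_step_stats)
  // turn it into tool-specific formats.
  if (!status.ok()) return;
}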
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using ScopedAnnotation ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ScopedAnnotation; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_SCOPED_ANNOTATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h new file mode 100644 index 00000000..e44cdb3c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_SCOPED_MEMORY_DEBUG_ANNOTATION_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_SCOPED_MEMORY_DEBUG_ANNOTATION_H_ + +#include +#include +#include +#include + +#include "absl/base/macros.h" +#include "tsl/profiler/lib/scoped_memory_debug_annotation.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using MemoryDebugAnnotation ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::MemoryDebugAnnotation; // NOLINT +using ScopedMemoryDebugAnnotation ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::ScopedMemoryDebugAnnotation; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_SCOPED_MEMORY_DEBUG_ANNOTATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme.h new file mode 100644 index 00000000..23e48948 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
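A short sketch for the ScopedAnnotation alias in scoped_annotation.h above; the annotation label and the launch helpers are illustrative.

#include "tensorflow/core/profiler/lib/scoped_annotation.h"

void LaunchKernelA();  // hypothetical launch helpers
void LaunchKernelB();

void ForwardPass() {
  // Everything launched while `annotation` is alive is labelled with this
  // string; on GPU builds the label is propagated to device events via the
  // annotation stack included above.
  tsl::profiler::ScopedAnnotation annotation("MyModel/forward_pass");
  LaunchKernelA();
  LaunchKernelB();
}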
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ + +#include "absl/base/macros.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" // IWYU pragma: export +#include "tsl/profiler/lib/traceme.h" + +#if !defined(IS_MOBILE_PLATFORM) +#include "xla/tsl/profiler/utils/time_utils.h" +#endif + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. +#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::kInfo; // NOLINT +using TraceMe ABSL_DEPRECATE_AND_INLINE() = tsl::profiler::TraceMe; // NOLINT +using TraceMeLevel ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::TraceMeLevel; // NOLINT + +ABSL_DEPRECATE_AND_INLINE() +inline int GetTFTraceMeLevel(bool is_expensive) { + return tsl::profiler::GetTFTraceMeLevel(is_expensive); +} + +ABSL_DEPRECATE_AND_INLINE() +inline bool TfOpDetailsEnabled() { return tsl::profiler::TfOpDetailsEnabled(); } + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme_encode.h b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme_encode.h new file mode 100644 index 00000000..0ebd2051 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/lib/traceme_encode.h @@ -0,0 +1,100 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ + +#include + +#include +#include +#include + +#include "absl/base/macros.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tsl/profiler/lib/traceme_encode.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
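For the TraceMe alias and helpers in traceme.h above, a hedged sketch of the common pattern: a lambda name generator plus GetTFTraceMeLevel(). The op name, metadata, and DoWork() are illustrative.

#include "tensorflow/core/profiler/lib/traceme.h"

void DoWork();  // hypothetical workload

void Compute() {
  // The lambda is only evaluated while a profiler session is active, so the
  // encoding cost is not paid on the fast path.
  tsl::profiler::TraceMe trace(
      [] {
        return tsl::profiler::TraceMeEncode("MyOp:Compute", {{"batch", 64}});
      },
      tsl::profiler::GetTFTraceMeLevel(/*is_expensive=*/false));
  DoWork();
}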
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +using TraceMeArg ABSL_DEPRECATE_AND_INLINE() = + tsl::profiler::TraceMeArg; // NOLINT + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeEncode( + std::string name, std::initializer_list args) { + return tsl::profiler::TraceMeEncode(std::move(name), args); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeEncode( + absl::string_view name, + std::initializer_list args) { + return tsl::profiler::TraceMeEncode(name, args); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeEncode( + const char* name, std::initializer_list args) { + return tsl::profiler::TraceMeEncode(name, args); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeEncode( + std::initializer_list args) { + return tsl::profiler::TraceMeEncode(args); +} + +ABSL_DEPRECATE_AND_INLINE() +// Concatenates op_name and op_type. +inline std::string TraceMeOp(absl::string_view op_name, + absl::string_view op_type) { + return tsl::profiler::TraceMeOp(op_name, op_type); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeOp(const char* op_name, const char* op_type) { + return tsl::profiler::TraceMeOp(op_name, op_type); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeOp(std::string&& op_name, absl::string_view op_type) { + return tsl::profiler::TraceMeOp(op_name, op_type); +} + +ABSL_DEPRECATE_AND_INLINE() +// Concatenates op_name and op_type. +inline std::string TraceMeOpOverride(absl::string_view op_name, + absl::string_view op_type) { + return tsl::profiler::TraceMeOpOverride(op_name, op_type); +} + +ABSL_DEPRECATE_AND_INLINE() +inline std::string TraceMeOpOverride(const char* op_name, const char* op_type) { + return tsl::profiler::TraceMeOpOverride(op_name, op_type); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/profiler_client.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/profiler_client.h new file mode 100644 index 00000000..73563d1f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/profiler_client.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
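A sketch combining the TraceMeOp and TraceMeEncode helpers declared above; the op/type names and metadata keys are illustrative.

#include <cstdint>
#include <string>

#include "tensorflow/core/profiler/lib/traceme_encode.h"

std::string BuildTraceName(const std::string& op_name, int64_t allocated_bytes) {
  // TraceMeOp joins op name and type; TraceMeEncode appends key/value metadata
  // in the format the trace viewer understands.
  return tensorflow::profiler::TraceMeEncode(
      tensorflow::profiler::TraceMeOp(op_name, "MatMul"),
      {{"allocated_bytes", allocated_bytes}, {"source", "example"}});
}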
+==============================================================================*/ +// GRPC client to perform on-demand profiling + +#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_PROFILER_CLIENT_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_PROFILER_CLIENT_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/time/time.h" +#include "xla/tsl/profiler/rpc/client/profiler_client.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/profiler/protobuf/profiler_analysis.grpc.pb.h" +#include "tsl/profiler/protobuf/profiler_service.grpc.pb.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::MonitorGrpc; // NOLINT +using tsl::profiler::NewSessionGrpc; // NOLINT +using tsl::profiler::ProfileGrpc; // NOLINT +using tsl::profiler::RemoteProfilerSession; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_PROFILER_CLIENT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/remote_profiler_session_manager.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/remote_profiler_session_manager.h new file mode 100644 index 00000000..3d0b9f58 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/remote_profiler_session_manager.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_REMOTE_PROFILER_SESSION_MANAGER_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_REMOTE_PROFILER_SESSION_MANAGER_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/rpc/client/remote_profiler_session_manager.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/rpc/client/profiler_client.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::AddressResolver; // NOLINT +using tsl::profiler::RemoteProfilerSessionManager; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_REMOTE_PROFILER_SESSION_MANAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/save_profile.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/save_profile.h new file mode 100644 index 00000000..1de60aeb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/client/save_profile.h @@ -0,0 +1,40 @@ +/* Copyright 2017 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ + +#include +#include + +#include "xla/tsl/profiler/rpc/client/save_profile.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tsl/profiler/protobuf/profiler_service.pb.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::GetCurrentTimeStampAsString; // NOLINT +using tsl::profiler::GetTensorBoardProfilePluginDir; // NOLINT +using tsl::profiler::SaveGzippedToolData; // NOLINT +using tsl::profiler::SaveProfile; // NOLINT +using tsl::profiler::SaveXSpace; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/grpc.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/grpc.h new file mode 100644 index 00000000..d37c535d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/grpc.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// GRPC utilities + +#ifndef TENSORFLOW_CORE_PROFILER_RPC_GRPC_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_GRPC_H_ + +#include + +#include "grpcpp/security/credentials.h" +#include "grpcpp/security/server_credentials.h" + +namespace tensorflow { +namespace profiler { + +// Returns default credentials for use when creating a gRPC server. +std::shared_ptr<::grpc::ServerCredentials> GetDefaultServerCredentials(); + +// Returns default credentials for use when creating a gRPC channel. +std::shared_ptr<::grpc::ChannelCredentials> GetDefaultChannelCredentials(); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_GRPC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_server.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_server.h new file mode 100644 index 00000000..dec0a235 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_server.h @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
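For the credential helpers in grpc.h above, a hedged wiring sketch using the standard grpcpp ServerBuilder and CreateChannel calls; the service pointer, address format, and port handling are assumptions, not part of this diff.

#include <memory>
#include <string>

#include "grpcpp/grpcpp.h"
#include "tensorflow/core/profiler/rpc/grpc.h"

std::unique_ptr<::grpc::Server> StartServer(::grpc::Service* service, int port) {
  ::grpc::ServerBuilder builder;
  builder.AddListeningPort("0.0.0.0:" + std::to_string(port),
                           tensorflow::profiler::GetDefaultServerCredentials());
  builder.RegisterService(service);
  return builder.BuildAndStart();
}

std::shared_ptr<::grpc::Channel> Connect(const std::string& target) {
  return ::grpc::CreateChannel(
      target, tensorflow::profiler::GetDefaultChannelCredentials());
}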
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_ + +#include + +#include "grpcpp/grpcpp.h" +#include "xla/tsl/profiler/rpc/profiler_server.h" +#include "tensorflow/core/platform/types.h" +#include "tsl/profiler/protobuf/profiler_service.grpc.pb.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::ProfilerServer; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_service_impl.h b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_service_impl.h new file mode 100644 index 00000000..f3b6a293 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/rpc/profiler_service_impl.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_ +#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_ + +#include + +#include "xla/tsl/profiler/rpc/profiler_service_impl.h" +#include "tsl/profiler/protobuf/profiler_service.grpc.pb.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::CreateProfilerService; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/tfprof_options.h b/third_party/tflite-hdrs/tensorflow/core/profiler/tfprof_options.h new file mode 100644 index 00000000..61143b49 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/tfprof_options.h @@ -0,0 +1,186 @@ +/* Copyright 2016 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_ +#define TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tfprof { +static const char* const kOptions[] = { + "-max_depth", + "-min_bytes", + "-min_peak_bytes", + "-min_residual_bytes", + "-min_output_bytes", + "-min_micros", + "-min_accelerator_micros", + "-min_cpu_micros", + "-min_params", + "-min_float_ops", + "-min_occurrence", + "-step", + "-order_by", + "-account_type_regexes", + "-start_name_regexes", + "-trim_name_regexes", + "-show_name_regexes", + "-hide_name_regexes", + "-account_displayed_op_only", + "-select", + "-output", +}; + +static const char* const kOrderBy[] = { + "name", "bytes", "peak_bytes", "residual_bytes", + "output_bytes", "micros", "accelerator_micros", "cpu_micros", + "params", "float_ops", "occurrence", +}; + +// Append Only. +// TODO(xpan): As we are adding more fields to be selected, we +// need to have a way to tell users what fields are available in which view. +static const char* const kShown[] = {"bytes", "micros", + "params", "float_ops", + "tensor_value", "device", + "op_types", "occurrence", + "input_shapes", "accelerator_micros", + "cpu_micros", "peak_bytes", + "residual_bytes", "output_bytes"}; + +static const char* const kCmds[] = { + "scope", "graph", "code", "op", "advise", "set", "help", +}; + +static const char* const kOutput[] = {"timeline", "stdout", "file", "pprof", + "none"}; + +static const char* const kTimelineOpts[] = { + "outfile", +}; + +static const char* const kTimelineRequiredOpts[] = {"outfile"}; + +static const char* const kFileOpts[] = { + "outfile", +}; + +static const char* const kFileRequiredOpts[] = { + "outfile", +}; + +static const char* const kPprofOpts[] = { + "outfile", +}; + +static const char* const kPprofRequiredOpts[] = { + "outfile", +}; + +struct Options { + public: + static absl::Status FromProtoStr(const string& opts_proto_str, Options* opts); + + virtual ~Options() {} + Options() + : Options(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", {}, {}, {}, {}, {}, + false, {}, "", {}) {} + + Options(int max_depth, int64_t min_bytes, int64_t min_peak_bytes, + int64_t min_residual_bytes, int64_t min_output_bytes, + int64_t min_micros, int64_t min_accelerator_micros, + int64_t min_cpu_micros, int64_t min_params, int64_t min_float_ops, + int64_t min_occurrence, int64_t step, const string& order_by, + const std::vector& account_type_regexes, + const std::vector& start_name_regexes, + const std::vector& trim_name_regexes, + const std::vector& show_name_regexes, + const std::vector& hide_name_regexes, + bool account_displayed_op_only, const std::vector& select, + const string& output_type, + const std::map& output_options) + : max_depth(max_depth), + min_bytes(min_bytes), + min_peak_bytes(min_peak_bytes), + min_residual_bytes(min_residual_bytes), + min_output_bytes(min_output_bytes), + min_micros(min_micros), + min_accelerator_micros(min_accelerator_micros), + min_cpu_micros(min_cpu_micros), + min_params(min_params), + min_float_ops(min_float_ops), + min_occurrence(min_occurrence), + step(step), + order_by(order_by), + account_type_regexes(account_type_regexes), + start_name_regexes(start_name_regexes), + 
trim_name_regexes(trim_name_regexes), + show_name_regexes(show_name_regexes), + hide_name_regexes(hide_name_regexes), + account_displayed_op_only(account_displayed_op_only), + select(select.begin(), select.end()), + output_type(output_type), + output_options(output_options) {} + + string ToString() const; + + int max_depth; + int64_t min_bytes; + int64_t min_peak_bytes; + int64_t min_residual_bytes; + int64_t min_output_bytes; + int64_t min_micros; + int64_t min_accelerator_micros; + int64_t min_cpu_micros; + int64_t min_params; + int64_t min_float_ops; + int64_t min_occurrence; + int64_t step; + string order_by; + + std::vector account_type_regexes; + std::vector start_name_regexes; + std::vector trim_name_regexes; + std::vector show_name_regexes; + std::vector hide_name_regexes; + bool account_displayed_op_only; + + std::set select; + + string output_type; + std::map output_options; +}; + +// Parse the -output option. +// 'output_opt': User input string with format: output_type:key=value,key=value. +// 'output_type' and 'output_options' are extracted from 'output_opt'. +absl::Status ParseOutput(const string& output_opt, string* output_type, + std::map* output_options); + +} // namespace tfprof +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/cost_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/cost_utils.h new file mode 100644 index 00000000..7ea14fe9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/cost_utils.h @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_COST_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_COST_UTILS_H_ + +#include + +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// This is a wrapper of tensorflow::grappler::OpLevelCostEstimator and use +// tracing time information to estimate the roof line stats for each traced +// tensorflow op. +class TfOpRoofLineCostEstimator + : public tensorflow::grappler::OpLevelCostEstimator { + public: + TfOpRoofLineCostEstimator() = default; + ~TfOpRoofLineCostEstimator() override; + + grappler::DeviceInfo GetDeviceInfo( + const DeviceProperties& device) const override; + + struct OpRoofLineStats { + uint64 flops = 0LL; + uint64 bytes_accessed = 0LL; + bool inaccurate = false; + }; + OpRoofLineStats Predict(const XEventVisitor& event); + + private: + absl::flat_hash_set + unsupported_ops_; // summary for unsupported ops. 
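For the tfprof Options/ParseOutput API declared in tfprof_options.h above, a small sketch of parsing an "-output" value in the documented output_type:key=value,key=value form; the concrete option string is illustrative.

#include <map>
#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/profiler/tfprof_options.h"

void ParseTimelineOutput() {
  std::string output_type;
  std::map<std::string, std::string> output_options;
  // Expected result on success: output_type == "timeline",
  // output_options["outfile"] == "/tmp/timeline.json".
  absl::Status status = tensorflow::tfprof::ParseOutput(
      "timeline:outfile=/tmp/timeline.json", &output_type, &output_options);
  if (!status.ok()) {
    // e.g. unknown output type or a missing required option.
  }
}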
+ + TfOpRoofLineCostEstimator(const TfOpRoofLineCostEstimator&) = delete; + void operator=(const TfOpRoofLineCostEstimator&) = delete; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_COST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/derived_timeline.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/derived_timeline.h new file mode 100644 index 00000000..6d2b5e5b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/derived_timeline.h @@ -0,0 +1,202 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_DERIVED_TIMELINE_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_DERIVED_TIMELINE_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "xla/tsl/profiler/utils/group_events.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +// Store the mapping from child scope range id to parent scope range id, which +// logically form a scope range call stack tree/forest. +typedef absl::flat_hash_map + ScopeRangeIdTree; + +// Helper for deriving XEvents. +class DerivedXEventBuilder { + public: + DerivedXEventBuilder(XEventBuilder event, std::optional group_id, + std::optional scope_range_id = std::nullopt); + + bool ShouldExpand(const XEventMetadata& event_metadata, + std::optional group_id, + std::optional scope_range_id = std::nullopt) const; + + void Expand(tsl::profiler::Timespan event_span); + tsl::profiler::Timespan GetTimespan() const { return event_.GetTimespan(); } + void SetTimespan(tsl::profiler::Timespan event_span) { + event_.SetTimespan(event_span); + } + + template + void SetOrAddStatValue(const XStatMetadata& metadata, ValueT&& value) { + event_.SetOrAddStatValue(metadata, std::forward(value)); + } + + private: + XEventBuilder event_; + std::optional group_id_; + std::optional scope_range_id_; +}; + +// Helper for deriving an XLine from events in another XLine. +class DerivedXLineBuilder { + public: + DerivedXLineBuilder(XPlaneBuilder* plane, int64_t line_id, + absl::string_view name, int64_t timestamp_ns, + std::vector dependent_lines); + + XLineBuilder& Line() { return line_; } + + // Either merges event with the last event or creates a new event on this + // XLine. group_id and low_level_event_name may be passed to separate + // consecutive invocations of the same event, depending on the XEvent type: + // TF-op, TF name scope: both group_id and low_level_event_name are used. + // HLO-op, step: only group_id is used. 
+ // HLO module, source: both group_id and low_level_event_name are NOT used. + // If scope_range_id is provided, it will be compared with the one in the + // event which is to be merged with. If they are different, merging is not + // allowed. + void ExpandOrAddEvent(const XEventMetadata& event_metadata, + tsl::profiler::Timespan event_span, + std::optional group_id, + std::optional scope_range_id = std::nullopt); + + // The multi-level version of ExpandOrAddEvent. Here, the XEvents at different + // levels all share the same group_id and low_level_event_name. + // Conceptually, the scope_range_ids should be of same length as the + // events_metadata_per_level. However, if it is shorter, this function will + // assume the missing elements at the end of scope_range_ids vector with the + // value of std::nullopt; and if it is longer, the extra elements in + // scope_range_ids will be ignored. + void ExpandOrAddEvents( + const std::vector& events_metadata_per_level, + tsl::profiler::Timespan event_span, std::optional group_id, + absl::Span> scope_range_ids = {}); + + // Reset the last events lower than or equal to the given level. + void ResetLastEvents(int level = 0); + + // To avoid using templates while need hide its implementation in .cc file, + // use two functions to set stat value for int64_t and uint64_t here. + void AddStatToLevelEvent(int level, const XStatMetadata& metadata, + int64_t value); + + void AddStatToLevelEvent(int level, const XStatMetadata& metadata, + uint64_t value); + + const XStatMetadata* GetCorrelationIdMetadata() const { + return correlation_id_metadata_; + } + + const XStatMetadata* GetCudaGraphIdMetadata() const { + return cuda_graph_id_metadata_; + } + + private: + // If the last event of the given level has the same metadata, expands it to + // include the time until the given event's end time. + // Otherwise, adds a new event and clears last_event_by_level_ for the levels + // below the given level and all levels of the dependent lines. Clearing + // last_event_by_level_ prevents a nested event from growing larger than the + // parent event(s). + void ExpandOrAddLevelEvent(const XEventMetadata& event_metadata, + tsl::profiler::Timespan event_span, + std::optional group_id, + std::optional scope_range_id, int level); + void AdjustDurationForTraceViewer(int level); + + const XStatMetadata* group_id_stat_metadata_ = nullptr; + const XStatMetadata* correlation_id_metadata_ = nullptr; + const XStatMetadata* cuda_graph_id_metadata_ = nullptr; + + XLineBuilder line_; + absl::flat_hash_map> + last_event_by_level_; + std::vector dependent_lines_; + bool is_gpu_plane_ = false; +}; + +struct Symbol { + absl::string_view tf_op_name; + std::string source_info; + std::string hlo_text; +}; + +using SymbolResolver = std::function program_id, + absl::string_view hlo_module_name, + absl::string_view hlo_op)>; + +// Derives TF name scope and op events from the TF op's fully qualified name +// with the name of the originating low-level event. +void ProcessTfOpEvent(absl::string_view tf_op_full_name, + tsl::profiler::Timespan event_span, + std::optional group_id, + XPlaneBuilder& plane_builder, + DerivedXLineBuilder& tf_name_scope_line_builder, + DerivedXLineBuilder& tf_op_line_builder); + +// Derives "Steps" line from group_id XStat in XEvents. 
+void DeriveStepEventsFromGroups( + const tsl::profiler::GroupMetadataMap& group_metadata_map, + XPlane* device_trace); + +// Derives "TensorFlow Ops", "TensorFlow Name Scope", "XLA Ops" and "XLA Module" +// lines in an NVIDIA_GPU device trace from data passed as ScopedAnnotations and +// stored as XStats in XEvents corresponding to GPU Kernels. Consecutive +// annotations with the same value are merged into a single event except for XLA +// modules. The device_trace is both input and output. +void DeriveEventsFromAnnotations( + const SymbolResolver& symbol_resolver, XPlane* device_trace, + const ScopeRangeIdTree* scope_range_id_tree = nullptr); + +// Derives "Launch Activities Summary" line from host trace. +void DeriveEventsFromHostTrace( + const XPlane* host_trace, + const tsl::profiler::GroupMetadataMap& group_metadata_map, + std::vector device_traces); + +// Loops through XPlanes of input XSpace, if it is "device" XPlane, generating +// derived timelines for the plane by calling DeriveEventsFromAnnotations. +void GenerateDerivedTimeLines( + const tsl::profiler::GroupMetadataMap& group_metadata_map, XSpace* space); + +// Derives `Tensorflow Ops`, `Tensorflow Name Scope` and `Source Code` lines +// from device_trace. +void DeriveLinesFromStats(tensorflow::profiler::XPlane* device_trace); + +// Devices Framework Op and Module lines for XLA:CPU ops. +void DeriveLinesForXlaCpuOps(tensorflow::profiler::XPlane* host_trace); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_DERIVED_TIMELINE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/device_caps_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/device_caps_utils.h new file mode 100644 index 00000000..db6bf44e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/device_caps_utils.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_DEVICE_CAPS_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_DEVICE_CAPS_UTILS_H_ + +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +void SetDeviceCaps(const DeviceCapabilities& caps, XPlane* plane); +DeviceCapabilities GetDeviceCaps(const XPlane& plane); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_DEVICE_CAPS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/diagnostics.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/diagnostics.h new file mode 100644 index 00000000..e5c41751 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/diagnostics.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
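A round-trip sketch for SetDeviceCaps/GetDeviceCaps in device_caps_utils.h above. The DeviceCapabilities field names (num_cores, clock_rate_in_ghz) are assumptions about hardware_types.proto and only serve to show the flow.

#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/device_caps_utils.h"

void TagPlaneWithCaps(tensorflow::profiler::XPlane* device_plane) {
  tensorflow::profiler::DeviceCapabilities caps;
  caps.set_num_cores(80);            // assumed proto field names, shown only
  caps.set_clock_rate_in_ghz(1.53);  // to illustrate the round trip
  tensorflow::profiler::SetDeviceCaps(caps, device_plane);

  // Later consumers recover the same capabilities from the plane's stats.
  tensorflow::profiler::DeviceCapabilities read_back =
      tensorflow::profiler::GetDeviceCaps(*device_plane);
  (void)read_back;
}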
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_DIAGNOSTICS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_DIAGNOSTICS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +// Error message that the visualization is based on incomplete step. +TF_CONST_INIT extern const absl::string_view kErrorIncompleteStep; + +// Error message that no step marker is seen and visualization contains no +// step info. +TF_CONST_INIT extern const absl::string_view kErrorNoStepMarker; + +TF_CONST_INIT extern const absl::string_view kNoDeviceTraceCollected; + +TF_CONST_INIT extern const absl::string_view kStepsDropped; + +void PopulateStepDiagnostics(const OpStats& op_stats, Diagnostics* diag); + +void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_DIAGNOSTICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/event_span.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/event_span.h new file mode 100644 index 00000000..f1e3a5b7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/event_span.h @@ -0,0 +1,268 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_EVENT_SPAN_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_EVENT_SPAN_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" + +namespace tensorflow { +namespace profiler { + +// The various event types. Enumerations are numbered such that a bigger number +// has a higher priority than a smaller number when used in execution-time +// breakdown. +enum EventType { + // No event associated with the time. It could be that the machine was idle or + // executing some events which were not traced. + UNKNOWN_TIME = 0, + // Host is computing. 
+ HOST_COMPUTE = 10, + // Host is preprocessing the data before the execution on device. + HOST_PREPROCESS = 20, + // Host is postprocessing the data after the execution on device. + HOST_POSTPROCESS = 30, + // Host is batching data (for inference). + HOST_BATCH_FORMATION = 40, + // Host runtime, like memory allocation and etc. + HOST_RUNTIME = 50, + // Host is compiling. + HOST_COMPILE = 60, + // Host-to-host communication. + HOST_TO_HOST = 70, + // Host-to-device communication. + HOST_TO_DEVICE = 80, + // Host is preparing to launch a computation on device. + HOST_PREPARE = 90, + // Assigns a smaller priority to DEVICE_COLLECTIVES than HOST_WAIT_INPUT, + // because if an all-reduce event is overlapped with an host-wait-input event, + // we want to count it as waiting for input. + // Collective Ops such as All-Reduce. + DEVICE_COLLECTIVES = 100, + // Host is waiting for input. + HOST_WAIT_INPUT = 110, + // Device-to-device communication. + DEVICE_TO_DEVICE = 120, + // Device-to-host communication. + DEVICE_TO_HOST = 130, + // Device is computing with 32-bit precision. + DEVICE_COMPUTE_32 = 140, + // Device is computing with 16-bit precision. + DEVICE_COMPUTE_16 = 150, + // Device is waiting for another device. + DEVICE_WAIT_DEVICE = 160, + // Device is waiting for host. + DEVICE_WAIT_HOST = 170, + LAST_EVENT_TYPE = DEVICE_WAIT_HOST +}; + +// Generic event types that shown to the user. +enum GenericEventType { + kFirstGenericEventType = 1, + // Device is computing. + kDeviceCompute = kFirstGenericEventType, + // Device-to-device communication. + kDeviceToDevice, + // Collective Ops such as All-Reduce and NCCL. + kDeviceCollectives, + // Host is computing. + kHostCompute, + // Host is preparing to launch a computation on device. + kHostPrepare, + // Device waiting for input from the host. + kInput, + // Device sending output to the host. + kOutput, + // Host is compling. + kCompile, + // No recognized event associated with the time. + kAllOthers, + kLastGenericEventType = kAllOthers, +}; + +// Contains the type and timespan of an event. +struct EventTypeSpan { + EventType type; // type of this event. + tsl::profiler::Timespan span; // timespan of this event. + EventTypeSpan(EventType t, tsl::profiler::Timespan s) : type(t), span(s) {} + // Equality test. + bool operator==(const EventTypeSpan& other) const { + return type == other.type && span == other.span; + } + // Inequality test. + bool operator!=(const EventTypeSpan& other) const { + return !(*this == other); + } +}; + +enum class StepMarkerType { + // "TraceContext" TraceMe events. + kExplicitHostStepMarker, + // Identified by group_events (e.g., FunctionRun, SessionRun). + kImplicitHostStepMarker, + // Derived from the result of group_events. A device step marker starts with + // the first device event of the group and ends with the last event of the + // group. + kDeviceStepMarker, +}; + +// Record of an event that is used as a step marker. +struct StepMarker { + StepMarkerType type; + std::string event_name; // name of this event. + std::string step_name; + tsl::profiler::Timespan span; // timespan of this event. + StepMarker(StepMarkerType step_marker_type, absl::string_view name, + tsl::profiler::Timespan s) + : type(step_marker_type), event_name(name), span(s) {} + // Equality test. + bool operator==(const StepMarker& other) const { + return type == other.type && event_name == other.event_name && + span == other.span; + } + // Inequality test. 
+ bool operator!=(const StepMarker& other) const { return !(*this == other); } +}; + +// Details of a step. Note that this could be the result of combining the +// StepDetails of the same step executed on different cores. +class StepDetails { + public: + StepDetails() : device_memory_transfers_(3) {} + + const std::vector& Markers() const { return markers_; } + const std::vector& Events() const { return events_; } + + const absl::flat_hash_map& Collectives() const { + return collectives_; + } + const std::vector& DeviceMemoryTransfers() const { + return device_memory_transfers_; + } + + absl::flat_hash_map& PerCoreOpMetricsDb() { + return per_core_op_metrics_db_; + } + // Returns the step time. + tsl::profiler::Timespan StepTime() const; + // Adds a step-marker to this step. + void AddMarker(const StepMarker& m); + // Adds an EventTypeSpan to this step. + void AddEvent(const EventTypeSpan& e); + // Adds a collective op to this step. + void AddCollectiveOpEvent(uint64 core_id, const AllReduceInfo& e); + // Appends device memory transfer events to this step. + // Only event type of HOST_TO_DEVICE/DEVICE_TO_DEVICE/DEVICE_TO_HOST are + // allowed. + void AddDeviceMemoryTransferEvent(EventType event_type, + const tsl::profiler::Timespan& time_span, + uint64 bytes); + // Returns the step name. + std::string StepName() const { return step_name_; } + // Sets the name of this step. + void SetStepName(std::string step_name) { step_name_ = step_name; } + + // Converts from overlapped events to non-overlapped events. + StepDetails ToNonOverlapped() const; + + // Combines other. + void Combine(const StepDetails& other); + + // Equality test. + bool operator==(const StepDetails& other) const; + // Inequality test. + bool operator!=(const StepDetails& other) const { return !(*this == other); } + + // Returns a string that prints the content of this object. + std::string DebugString() const; + + void SetPerCoreOpMetricsDb(OpMetricsDb db, uint32 core_id) { + per_core_op_metrics_db_[core_id] = db; + } + + private: + // Accumulates the device memory transfers from another step to this step. + void AggregateDeviceMemoryTransfers( + const std::vector& device_memory_transfers); + + // All step-markers found for marking this step in the traces. There could be + // multiple step-markers for a single step for different reasons. One such + // reason is that there may be one step-marker for the same step on each core; + // so after combining the StepDetails from multiple cores, there would be + // multiple step-markers for the same step. + std::vector markers_; + // All events belonging to this step. + std::vector events_; + // Collective operation related events such as all-reduce etc. + absl::flat_hash_map collectives_; + // Device memory transfers (including time and bytes involved). + // TODO(jiesun): Consider to use IntervalSet instead of just sum up the event + // durations. + std::vector device_memory_transfers_; + std::string step_name_; + + absl::flat_hash_map per_core_op_metrics_db_; +}; + +// Map from step_id to the events happened in that step. +using StepEvents = absl::flat_hash_map; + +// Equality test for StepEvents. +bool operator==(const StepEvents& a, const StepEvents& b); + +// Returns the name of the given EventType. +std::string PrintEventType(EventType event_type); + +// Returns the string of the given GenericEventType. +absl::string_view GetGenericEventTypeStr(GenericEventType event_type); + +// Returns a string that prints the given EventTypeSpan. 
+std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span); + +// Returns a string that prints the given StepMarker. +std::string PrintStepMarker(const StepMarker& step_marker); + +// Returns a string that prints the given StepEvents. +std::string PrintStepEvents(const StepEvents& step_events); + +// Unions the map of StepEvents and combines the src StepEvents into dst. +void UnionCombineStepEvents(const StepEvents& src, StepEvents* dst); + +// Intersects the map of StepEvents and combines the src StepEvents into dst. +void IntersectCombineStepEvents(const StepEvents& src, StepEvents* dst); + +// Converts from overlapped events to non-overlapped events. +std::vector ToNonOverlappedEvents( + const std::vector& overlapped_events); + +// Converts from overlapped step-events to non-overlapped step events. +StepEvents ToNonOverlappedStepEvents(const StepEvents& overlapped_step_events); + +// Returns the precision stats of the given non-overlapped step events. +PrecisionStats ComputePrecisionStats( + const StepEvents& nonoverlapped_step_events); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_EVENT_SPAN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/gpu_event_stats.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/gpu_event_stats.h new file mode 100644 index 00000000..1c711249 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/gpu_event_stats.h @@ -0,0 +1,83 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_GPU_EVENT_STATS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_GPU_EVENT_STATS_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +// Stats from a GPU stream XEvent. +struct GpuEventStats { + explicit GpuEventStats(const XEventVisitor* event); + + bool IsKernel() const { return !kernel_details.empty(); } + bool IsMemCpy() const { return !memcpy_details.empty(); } + bool IsCudaGraphExecution() const { return cuda_graph_exec_id.has_value(); } + + bool IsXlaOp() const { return !hlo_op_names.empty(); } + bool IsTfOp() const { return !tf_op_fullname.empty(); } + + // Stats from TensorFlow. + absl::string_view tf_op_fullname; + absl::string_view equation; + absl::string_view tensor_shapes; + + // Stats from XLA. + std::vector hlo_op_names; + absl::string_view hlo_module_name; + std::optional program_id; + + // Stats from CUPTI. + absl::string_view kernel_details; + absl::string_view memcpy_details; + std::optional correlation_id; + std::optional scope_range_id; + + // Stats derived by grouping. 
+ std::optional group_id; + bool is_eager = false; + std::optional cuda_graph_exec_id; + std::optional cuda_graph_id_for_inner_node; +}; + +// Stats for a host-side GPU launch XEvent. +struct LaunchEventStats { + explicit LaunchEventStats(const XEventVisitor* event); + + bool IsLaunch() const { + return device_id.has_value() && correlation_id.has_value(); + } + + // Stats from CUPTI. + std::optional device_id; + std::optional correlation_id; + + // Stat derived by grouping. + std::optional group_id; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_GPU_EVENT_STATS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hardware_type_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hardware_type_utils.h new file mode 100644 index 00000000..41b1bd4b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hardware_type_utils.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" + +namespace tensorflow { +namespace profiler { + +struct GpuFlopCapabilities { + struct FlopCapabilityOnPrecisions { + double fp64_tflops = 0; + double fp32_tflops = 0; // also for tf32 for nvidia tensor core + double bf16_tflops = 0; + double fp16_tflops = 0; + double fp8_tflops = 0; + double int8_tops = 0; + double fp4_tflops = 0; + double int4_tops = 0; + + void ScaleWith(double scale) { + fp64_tflops *= scale; + fp32_tflops *= scale; + bf16_tflops *= scale; + fp16_tflops *= scale; + fp8_tflops *= scale; + int8_tops *= scale; + fp4_tflops *= scale; + int4_tops *= scale; + } + }; + + FlopCapabilityOnPrecisions cuda_core; + FlopCapabilityOnPrecisions tensor_core; + bool has_tensor_core_sparsity_support = false; + + void ScaleWith(double scale) { + cuda_core.ScaleWith(scale); + tensor_core.ScaleWith(scale); + } +}; + +// Get peak single precision throughput of the GPU in GFLOPS per +// streaming multiprocessor. +// TODO: Need design on how to use the sparsity capability of FLOPs. +double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap); + +// for Nvidia GPU, return shared memory bandwidth in Bytes Per Second on +// one single SM given the GPU core freq in device_cap. +double GetSharedMemoryBandwidthPerSM(const DeviceCapabilities& device_cap); + +// Returns the GPU model name from the given DeviceCapabilities. +// For nvidia GPUs, the name is like "Nvidia GPU (Kepler)" or "Nvidia GPU +// (Turing)". For AMD GPUs, the name is like "AMD GPU - gfx-10XX series". +// The model name here for Nvidia GPU in fact refers to its microarchitecture +// name. 
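A sketch of how the per-SM peak declared above might be scaled to a whole device. This assumes DeviceCapabilities exposes the SM count as num_cores(), which is not shown in this diff; the helper name DevicePeakGflops is illustrative.

#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

namespace tensorflow {
namespace profiler {

// Whole-device peak in GFLOPS: per-SM peak times the assumed SM count.
inline double DevicePeakGflops(const DeviceCapabilities& device_cap) {
  return GetFlopMaxThroughputPerSM(device_cap) * device_cap.num_cores();
}

}  // namespace profiler
}  // namespace tensorflow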
+absl::string_view GpuModelName(const DeviceCapabilities& device_cap); + +HardwareType ParseHardwareType(absl::string_view device_type); + +// Returns true if the given hardware type has a device. +bool HasDevice(HardwareType x); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HARDWARE_TYPE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_map.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_map.h new file mode 100644 index 00000000..1ea242f6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_map.h @@ -0,0 +1,212 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_MAP_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_MAP_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/ir/hlo_opcode.h" +#include "xla/service/hlo.pb.h" +#include "xla/service/hlo_cost_analysis.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +class HloInstructionInterface { + public: + virtual ~HloInstructionInterface() = default; + virtual absl::string_view Name() const = 0; + virtual xla::HloOpcode HloOpcode() const = 0; + virtual absl::string_view Category() const = 0; + virtual std::string HloOpcodeString() const = 0; + virtual const xla::OpMetadata& Metadata() const = 0; + virtual size_t flops() const = 0; + virtual size_t bytes_accessed() const = 0; + virtual std::string_view op_full_name() const = 0; + virtual std::string source_info() const = 0; + virtual bool isRoot() const = 0; + virtual bool IsFusion() const = 0; + virtual const std::string& Expression() const = 0; + + virtual void ProcessXlaCostAnalysis( + const xla::HloCostAnalysis* cost_analysis) = 0; +}; + +// This wrapper allows caching the results of HloInstruction methods. +// This wrapper is not thread safe. 
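A short usage sketch for the hardware-type helpers declared above; IsDeviceHardware is a hypothetical wrapper, not something declared in the vendored header.

#include "absl/strings/string_view.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

namespace tensorflow {
namespace profiler {

// True when the device-type string parses to a hardware type that has a
// device behind it (see HasDevice above).
inline bool IsDeviceHardware(absl::string_view device_type) {
  return HasDevice(ParseHardwareType(device_type));
}

}  // namespace profiler
}  // namespace tensorflow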
+class HloInstructionWrapper : public HloInstructionInterface { + public: + explicit HloInstructionWrapper( + const xla::HloInstruction* instr, + const xla::HloCostAnalysis* cost_analysis = nullptr); + + // Non copyable + HloInstructionWrapper(const HloInstructionWrapper&) = delete; + HloInstructionWrapper& operator=(const HloInstructionWrapper&) = delete; + // Movable. + HloInstructionWrapper(HloInstructionWrapper&&) = default; + HloInstructionWrapper& operator=(HloInstructionWrapper&&) = default; + + absl::string_view Name() const override { return instr_->name(); } + + xla::HloOpcode HloOpcode() const override { return instr_->opcode(); } + + absl::string_view Category() const override { return category_; } + + std::string HloOpcodeString() const override { + return std::string(xla::HloOpcodeString(instr_->opcode())); + } + + const xla::OpMetadata& Metadata() const override { + return instr_->metadata(); + } + + size_t flops() const override { return flops_; } + size_t bytes_accessed() const override { return bytes_accessed_; } + + std::string_view op_full_name() const override { return op_full_name_; } + std::string source_info() const override; + + bool isRoot() const override { return instr_->IsRoot(); } + bool IsFusion() const override { return !fused_children_.empty(); }; + + void ProcessXlaCostAnalysis( + const xla::HloCostAnalysis* cost_analysis) override { + if (cost_analysis == nullptr) return; + flops_ = cost_analysis->flop_count(*instr_); + bytes_accessed_ = cost_analysis->bytes_accessed(*instr_); + } + + const std::string& Expression() const override { return expression_; } + + void AddFusedChild(const HloInstructionWrapper* child) { + fused_children_.push_back(child); + }; + + const std::vector& FusedChildren() const { + return fused_children_; + } + + private: + const xla::HloInstruction* instr_; + std::vector fused_children_; + std::string op_full_name_; + size_t flops_ = 0; + size_t bytes_accessed_ = 0; + std::string category_; + std::string expression_; +}; + +// Helper class for accessing HloModule. +class HloModuleInterface { + public: + virtual ~HloModuleInterface() = default; + + // If the module contains no instructions. + virtual bool Empty() const = 0; + virtual absl::string_view Name() const = 0; + // Function to populated nested childs= instructions in a fusion. + virtual void GatherFusionInstructions(xla::HloInstruction* inst) = 0; +}; + +// Wraps HLO module and provides an interface that maps HLO names to +// HloInstructionWrappers. +class HloModuleWrapper : public HloModuleInterface { + public: + explicit HloModuleWrapper( + const xla::HloProto& hlo_proto, + std::function shape_func = nullptr); + + explicit HloModuleWrapper( + std::unique_ptr module, + std::function shape_func); + + const HloInstructionWrapper* GetHloInstruction( + absl::string_view hlo_name) const; + HloInstructionWrapper* GetMutableHloInstruction(absl::string_view hlo_name); + + bool Empty() const override { return instructions_by_name_.empty(); } + + absl::string_view Name() const override { return module_->name(); } + void GatherFusionInstructions(xla::HloInstruction* inst) override; + + private: + std::unique_ptr module_; + + // Map of HloInstructionWrappers by name. + using HloInstructionMap = + absl::flat_hash_map; + HloInstructionMap instructions_by_name_; +}; + +// Map of HloModuleWrappers by program_id. 
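An illustrative lookup through the wrapper classes above. The shape-function argument is left at its default, so flops() may be zero unless a cost analysis was attached when the wrapper was built; FlopsForInstruction is a hypothetical helper.

#include "absl/strings/string_view.h"
#include "tensorflow/core/profiler/utils/hlo_module_map.h"

namespace tensorflow {
namespace profiler {

// Builds a wrapper around an HLO proto and reads the flop count cached for
// one instruction, or 0 if the instruction is not present.
inline size_t FlopsForInstruction(const xla::HloProto& hlo_proto,
                                  absl::string_view hlo_name) {
  HloModuleWrapper module(hlo_proto);
  const HloInstructionWrapper* instr = module.GetHloInstruction(hlo_name);
  return instr != nullptr ? instr->flops() : 0;
}

}  // namespace profiler
}  // namespace tensorflow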
+using HloModuleMap = + absl::flat_hash_map; + +void AddHloProto(HloModuleMap& hlo_module_map, uint64_t program_id, + const xla::HloProto& hlo_proto); + +// Process HloModuleMap from single XSpace. +void ProcessHloModuleMapFromXSpace(HloModuleMap& hlo_module_map, + const XSpace* space); + +// WARNING: The returned pointer will be invalidated if HloModuleMap is mutated. +inline const HloModuleWrapper* GetHloModule(const HloModuleMap* hlo_module_map, + uint64_t program_id) { + if (hlo_module_map == nullptr) return nullptr; + auto iter = hlo_module_map->find(program_id); + if (iter == hlo_module_map->end()) return nullptr; + return &iter->second; +} + +inline const HloInstructionWrapper* GetHloInstruction( + const HloModuleMap& hlo_module_map, std::optional program_id, + absl::string_view hlo_name) { + if (!program_id.has_value()) return nullptr; + const auto* hlo_module = GetHloModule(&hlo_module_map, *program_id); + if (hlo_module == nullptr) return nullptr; + return hlo_module->GetHloInstruction(hlo_name); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_MAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_utils.h new file mode 100644 index 00000000..100671de --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_module_utils.h @@ -0,0 +1,81 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_UTILS_H_ + +#include +#include + +#include "xla/hlo/ir/hlo_computation.h" +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/hlo/ir/hlo_module.h" + +namespace tensorflow { +namespace profiler { + +// Sometimes HLO produce a huge string (>100MB). Limit the name size to 1MB. +static constexpr size_t kMaxHlolNameSize = 1000000; + +inline const xla::HloInstruction* FindInstruction(const xla::HloModule& module, + std::string node_name) { + if (absl::StartsWith(node_name, "%")) { + node_name.erase(node_name.begin()); + } + for (const xla::HloComputation* computation : module.computations()) { + auto instrs = computation->instructions(); + auto it = absl::c_find_if(instrs, [&](const xla::HloInstruction* instr) { + // Try with and without "%" at the beginning of the node name. 
+ return absl::EqualsIgnoreCase(instr->name(), node_name) || + absl::EqualsIgnoreCase(instr->name(), + absl::StrCat("%", node_name)); + }); + if (it != instrs.end()) { + return *it; + } + } + return nullptr; +} + +inline const xla::HloComputation* FindComputation( + const xla::HloModule& module, const std::string& comp_name) { + for (const xla::HloComputation* computation : module.computations()) { + if (absl::EqualsIgnoreCase(computation->name(), comp_name)) { + return computation; + } + } + return nullptr; +} + +inline std::string UncachedExpression(const xla::HloInstruction* instr, + bool skip_expression, size_t max_size) { + if (skip_expression) { + return ""; + } + static const auto* hlo_print_options = + new xla::HloPrintOptions(xla::HloPrintOptions() + .set_print_metadata(false) + .set_print_backend_config(false) + .set_print_infeed_outfeed_config(false)); + std::string expression = instr->ToString(*hlo_print_options); + if (expression.size() > max_size) { + expression.resize(max_size); + } + return expression; +} +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HLO_MODULE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_map.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_map.h new file mode 100644 index 00000000..cb376966 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_map.h @@ -0,0 +1,84 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_MAP_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_MAP_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "xla/service/hlo.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +absl::flat_hash_map> +ParseHloProtosFromXSpace(const XSpace& space); + +class HloProtoMap { + public: + void AddHloProtosFromXSpace(const XSpace& space); + + void AddHloProto(uint64_t program_id, + std::unique_ptr hlo_proto); + // Returns whether is new to HloProtoMap. + bool AddHloProto(uint64_t program_id, const xla::HloProto* hlo_proto); + + size_t size() const { return hlo_protos_by_program_id_.size(); } + + auto begin() const { return hlo_protos_by_program_id_.begin(); } + auto end() const { return hlo_protos_by_program_id_.end(); } + + bool contains(absl::string_view name) const { + return hlo_protos_by_name_.contains(name); + } + + bool contains(uint64_t program_id) const { + return hlo_protos_by_program_id_.contains(program_id); + } + + // Returns a list of module names (not sorted). + std::vector GetModuleList() const; + + // Returns a list of module names sorted alphabetically. 
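A sketch combining the lookup and printing helpers above, assuming an xla::HloModule is available from elsewhere in the profiler; InstructionExpressionOrEmpty is a hypothetical caller, not part of the header.

#include <string>

#include "tensorflow/core/profiler/utils/hlo_module_utils.h"

namespace tensorflow {
namespace profiler {

// Returns the size-capped text form of the named instruction, or an empty
// string when the module does not contain it.
inline std::string InstructionExpressionOrEmpty(const xla::HloModule& module,
                                                const std::string& node_name) {
  const xla::HloInstruction* instr = FindInstruction(module, node_name);
  if (instr == nullptr) return "";
  return UncachedExpression(instr, /*skip_expression=*/false,
                            kMaxHlolNameSize);
}

}  // namespace profiler
}  // namespace tensorflow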
+ std::vector GetSortedModuleList() const; + + // Returns a list of hlo module names sorted first by heap trace size and then + // by hlo module name alphabetically. + std::vector GetSortedModuleListByHeapTraceSize() const; + + absl::StatusOr GetHloProtoByModuleName( + absl::string_view module_name) const; + + absl::StatusOr GetHloProtoByProgramId( + uint64_t program_id) const; + + private: + absl::flat_hash_map hlo_protos_by_program_id_; + absl::flat_hash_map hlo_protos_by_name_; + std::vector> owned_hlo_protos_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_MAP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_to_module.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_to_module.h new file mode 100644 index 00000000..d89b919d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/hlo_proto_to_module.h @@ -0,0 +1,36 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_TO_MODULE_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_TO_MODULE_H_ + +#include + +#include "xla/hlo/ir/hlo_module.h" +#include "xla/service/hlo.pb.h" + +namespace tensorflow { +namespace profiler { + +absl::StatusOr> ConvertHloProtoToModule( + const xla::HloProto& hlo_proto); + +std::unique_ptr ConvertHloProtoToModuleIgnoringErrors( + const xla::HloProto& hlo_proto); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HLO_PROTO_TO_MODULE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/host_offload_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/host_offload_utils.h new file mode 100644 index 00000000..4bb96f2e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/host_offload_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HOST_OFFLOAD_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HOST_OFFLOAD_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "xla/layout.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" +#include "tsl/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +struct LineBuilderAndEventEndTimeFrontier { + XLineBuilder line_builder; + uint64_t event_end_time_frontier_ns; +}; + +class HostOffloadEventProcessor { + public: + HostOffloadEventProcessor(XPlaneBuilder* plane_builder, + uint64_t start_timestamp_ns) + : plane_builder_(plane_builder), + start_timestamp_ns_(start_timestamp_ns) {} + ~HostOffloadEventProcessor() = default; + + void ProcessHostOffloadOpEvent(const XEventVisitor& event, + std::optional group_id); + + bool IsHostOffloadOpName(const XEventVisitor& event) const; + + private: + std::string GetOffloadInstructionID(absl::string_view op_name) const; + std::string GetOffloadInstructionName(absl::string_view op_name) const; + + absl::flat_hash_map> + seen_events_; + std::string host_memory_label_ = + absl::StrCat("S(", xla::Layout::kHostMemorySpace, ")"); + + XPlaneBuilder* plane_builder_; + uint64_t start_timestamp_ns_; + + std::vector + host_offload_op_line_builders_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HOST_OFFLOAD_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/html_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/html_utils.h new file mode 100644 index 00000000..215d9f51 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/html_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_ + +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" + +namespace tensorflow { +namespace profiler { + +// Creates a html that links to the given url with the given text. 
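A sketch of how the host-offload processor declared above might be driven per event. The group id is passed as std::nullopt here because its exact integer type is not visible in this diff; MaybeProcessOffloadEvent is a hypothetical helper.

#include <optional>

#include "tensorflow/core/profiler/utils/host_offload_utils.h"

namespace tensorflow {
namespace profiler {

// Routes an event to the processor only when its name matches a known
// host-offload op; other events are left untouched.
inline void MaybeProcessOffloadEvent(const XEventVisitor& event,
                                     HostOffloadEventProcessor& processor) {
  if (processor.IsHostOffloadOpName(event)) {
    processor.ProcessHostOffloadOpEvent(event, std::nullopt);
  }
}

}  // namespace profiler
}  // namespace tensorflow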
+inline std::string AnchorElement(absl::string_view url, + absl::string_view text) { + return absl::StrCat("", text, ""); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/kernel_stats_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/kernel_stats_utils.h new file mode 100644 index 00000000..ee6f56d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/kernel_stats_utils.h @@ -0,0 +1,135 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +// Populates kernel launch information from a kKernelDetails XStat. +void ParseKernelLaunchParams(absl::string_view xstat_kernel_details, + KernelReport* kernel); + +// Returns true if kernel uses TensorCores. +bool IsKernelUsingTensorCore(absl::string_view kernel_name); + +// Returns true if operation is eligible to use TensorCores. +bool IsOpTensorCoreEligible(absl::string_view tf_op_name); + +// Returns true if Einsum equation is eligible to use TensorCores. +bool IsEinsumTensorCoreEligible(absl::string_view equation); + +// Less than comparator for Kernel Reports. +struct KernelReportLessThanComparator { + bool operator()(const KernelReport& lhs, const KernelReport& rhs) const; +}; + +// Equal to comparator for Kernel Reports. +struct KernelReportEqualToComparator { + bool operator()(const KernelReport& lhs, const KernelReport& rhs) const; +}; + +// Sorts kernel reorts by total duration descendingly. +// Keeps only the top kernel reports with long kernel duration in the given +// KernelStatsDb. Kernel reports with shorter kernel duration are dropped. +void SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb* kernel_stats_db); + +struct KernelReportValue { + uint64 total_duration_ns = 0; + uint64 min_duration_ns = 0; + uint64 max_duration_ns = 0; + uint64 occurrences = 0; +}; + +struct KernelKeyWrap { + const KernelReport* key; + template + friend H AbslHashValue(H h, KernelKeyWrap wrap) { + // Kernel reports are grouped by these fields, hence they are used as + // hashing criteria. 
+ // clang-format off + return H::combine( + std::move(h), + wrap.key->is_kernel_using_tensor_core(), + wrap.key->is_op_tensor_core_eligible(), + wrap.key->block_dim(0), + wrap.key->block_dim(1), + wrap.key->block_dim(2), + wrap.key->grid_dim(0), + wrap.key->grid_dim(1), + wrap.key->grid_dim(2), + wrap.key->registers_per_thread(), + wrap.key->static_shmem_bytes(), + wrap.key->dynamic_shmem_bytes(), + wrap.key->name(), + wrap.key->op_name()); + // clang-format on + } +}; + +struct KernelHash { + size_t operator()(const KernelReport& key) const { + return absl::Hash()(KernelKeyWrap{&key}); + } +}; + +using KernelReportMap = + absl::flat_hash_map; + +// Copies the top kernel reports with long kernel duration into the given +// KernelStatsDb. +void CopyTopKDurationKernelReportsToDb(const KernelReportMap& reports, + KernelStatsDb* dst); + +// Inserts or aggregates KernelReports into the given KernelReportMap. +void InsertOrUpdateKernelReport(const KernelReport& kernel, + const KernelReportValue& value, + KernelReportMap* dst); + +// Aggregates values from one KernelReportMap into another. +void MergeKernelReports(const KernelReportMap& reports, KernelReportMap* dst); + +// Kernel stats aggregated at TF operation level. +struct OpLevelKernelStats { + // Whether op is eligible to use TensorCore. + bool is_op_tensor_core_eligible = false; + // The accumulated duration of all the kernels launched in this op. + uint64 total_duration_ns = 0; + // The accumulated duration of all the kernels using TensorCore in this op. + // If this value is not 0, at least one of the kernels launched by this op + // is using TensorCore. + uint64 tensor_core_duration_ns = 0; +}; + +using KernelStatsByOpName = + absl::flat_hash_map; + +// Groups KernelReport in by tensorflow operation name. +KernelStatsByOpName GroupKernelReportsByOpName( + const KernelStatsDb& kernel_stats_db); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/math_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/math_utils.h new file mode 100644 index 00000000..380884ee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/math_utils.h @@ -0,0 +1,120 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_MATH_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_MATH_UTILS_H_ + +#include + +#include "absl/base/macros.h" +#include "xla/tsl/profiler/utils/math_utils.h" + +// TODO: b/323943471 - This macro should eventually be provided by Abseil. 
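A small derived metric built from the per-op kernel aggregation declared above; TensorCoreTimeFraction is illustrative and not part of the vendored header.

#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

namespace tensorflow {
namespace profiler {

// Fraction of this op's kernel time that ran on Tensor Cores, in [0, 1].
inline double TensorCoreTimeFraction(const OpLevelKernelStats& stats) {
  if (stats.total_duration_ns == 0) return 0.0;
  return static_cast<double>(stats.tensor_core_duration_ns) /
         static_cast<double>(stats.total_duration_ns);
}

}  // namespace profiler
}  // namespace tensorflow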
+#ifndef ABSL_DEPRECATE_AND_INLINE +#define ABSL_DEPRECATE_AND_INLINE() +#endif + +namespace tensorflow { +namespace profiler { + +ABSL_DEPRECATE_AND_INLINE() +inline double CyclesToSeconds(double cycles, double frequency_hz) { + return tsl::profiler::CyclesToSeconds(cycles, frequency_hz); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double GibibytesPerSecond(double gigabytes, double ns) { + return tsl::profiler::GibibytesPerSecond(gigabytes, ns); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double GibiToGiga(double gibi) { + return tsl::profiler::GibiToGiga(gibi); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double GigaToGibi(double giga) { + return tsl::profiler::GigaToGibi(giga); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double GigaToTera(double giga) { + return tsl::profiler::GigaToTera(giga); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double GigaToUni(double giga) { return tsl::profiler::GigaToUni(giga); } + +ABSL_DEPRECATE_AND_INLINE() +inline double MicroToMilli(double u) { return tsl::profiler::MicroToMilli(u); } + +ABSL_DEPRECATE_AND_INLINE() +inline double MicroToNano(double u) { return tsl::profiler::MicroToNano(u); } + +ABSL_DEPRECATE_AND_INLINE() +inline uint64_t MilliToNano(double m) { return tsl::profiler::MilliToNano(m); } + +ABSL_DEPRECATE_AND_INLINE() +inline uint64_t MilliToPico(double m) { return tsl::profiler::MilliToPico(m); } + +ABSL_DEPRECATE_AND_INLINE() +inline double MilliToUni(double m) { return tsl::profiler::MilliToUni(m); } + +ABSL_DEPRECATE_AND_INLINE() +inline double NanoToMicro(uint64_t n) { return tsl::profiler::NanoToMicro(n); } + +ABSL_DEPRECATE_AND_INLINE() +inline double NanoToMilli(uint64_t n) { return tsl::profiler::NanoToMilli(n); } + +ABSL_DEPRECATE_AND_INLINE() +inline uint64_t NanoToPico(uint64_t n) { return tsl::profiler::NanoToPico(n); } + +ABSL_DEPRECATE_AND_INLINE() +inline double PicoToMicro(uint64_t p) { return tsl::profiler::PicoToMicro(p); } + +ABSL_DEPRECATE_AND_INLINE() +inline double PicoToMilli(uint64_t p) { return tsl::profiler::PicoToMilli(p); } + +ABSL_DEPRECATE_AND_INLINE() +inline double PicoToNano(uint64_t p) { return tsl::profiler::PicoToNano(p); } + +ABSL_DEPRECATE_AND_INLINE() +inline double PicoToUni(uint64_t p) { return tsl::profiler::PicoToUni(p); } + +ABSL_DEPRECATE_AND_INLINE() +inline double SafeDivide(double dividend, double divisor) { + return tsl::profiler::SafeDivide(dividend, divisor); +} +ABSL_DEPRECATE_AND_INLINE() +inline double TeraToGiga(double tera) { + return tsl::profiler::TeraToGiga(tera); +} + +ABSL_DEPRECATE_AND_INLINE() +inline double UniToGiga(double uni) { return tsl::profiler::UniToGiga(uni); } + +ABSL_DEPRECATE_AND_INLINE() +inline double UniToMicro(double uni) { return tsl::profiler::UniToMicro(uni); } + +ABSL_DEPRECATE_AND_INLINE() +inline uint64_t UniToNano(double uni) { return tsl::profiler::UniToNano(uni); } + +ABSL_DEPRECATE_AND_INLINE() +inline uint64_t UniToPico(double uni) { return tsl::profiler::UniToPico(uni); } + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_MATH_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_metrics_db_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_metrics_db_utils.h new file mode 100644 index 00000000..e3ff3fcc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_metrics_db_utils.h @@ -0,0 +1,138 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_OP_METRICS_DB_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_OP_METRICS_DB_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "xla/tsl/profiler/utils/xplane_visitor.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" + +namespace tensorflow { +namespace profiler { + +// The name of OpMetrics to represent the idle time. +TF_CONST_INIT extern const absl::string_view kIdle; +// The core index to add to sparse core index in op metrics. +TF_CONST_INIT extern const uint32_t kSparseCoreIndexStart; + +// Helps build an op metrics database (borrowed). +// Enables fast lookup of existing ops and prevents the creation of duplicate +// ops. It is the user's responsibility to ensure an op metrics database +// outlives its builder, and that no ops are added to the database outside of +// the builder. +class OpMetricsDbBuilder { + public: + // Create with a borrowed op database. + // REQUIRED: The op database must be empty. + explicit OpMetricsDbBuilder(OpMetricsDb* db); + + protected: + // Looks up the given OP name. If it is already in the database, + // return its OpMetrics; otherwise, insert a new one. + OpMetrics* LookupOrInsertNewOpMetrics(uint64 hlo_module_id, + absl::string_view name, + uint64_t fingerprint); + + OpMetricsDb* db() { return db_; } + + private: + // Map op (hlo_module_id, name) to the corresponding metrics in the op + // database. + absl::flat_hash_map> + op_metrics_map_; + + // The op database. + OpMetricsDb* db_; +}; + +// Helps build an op metrics database (borrowed) from XEvents, +class XEventsOpMetricsDbBuilder { + public: + // Add OpMetric from XEventVisitor. + void AddOpMetric(const tsl::profiler::XEventVisitor& xevent); + + // Finalize OpMetricDb and add total time and Idle op. + OpMetricsDb Finalize(uint64_t total_time); + + // Finalize OpMetricDb, but the total time is unknown at the moment, So ignore + // the total time and Idle Op and will be handled by the caller. + OpMetricsDb Finalize(); + + private: + using OpMetricBySymbol = + absl::flat_hash_map; + absl::flat_hash_map + flat_op_metric_; +}; + +// Sets the total time for OpMetricsDb, ensuring idle time is not negative. +inline void SetTotalTimePs(OpMetricsDb& db, uint64_t total_time_ps) { + db.set_total_time_ps(std::max(db.total_op_time_ps(), total_time_ps)); +} + +// Returns the total time in OpMetricsDb, optionally excluding the idle time. +inline uint64_t TotalTimePs(const OpMetricsDb& db, bool exclude_idle = false) { + return exclude_idle ? db.total_op_time_ps() : db.total_time_ps(); +} + +// Returns the ratio of time that is idle (no op execution) over total time. 
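A sketch using only the two inline helpers declared above: record the profile's wall time, then read back the implied idle time. ImpliedIdleTimePs is a hypothetical helper name.

#include <cstdint>

#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"

namespace tensorflow {
namespace profiler {

// Sets the db's total time (clamped so idle can never be negative) and
// returns total time minus op time, i.e. the idle portion.
inline uint64_t ImpliedIdleTimePs(OpMetricsDb& db, uint64_t wall_time_ps) {
  SetTotalTimePs(db, wall_time_ps);
  return TotalTimePs(db) - TotalTimePs(db, /*exclude_idle=*/true);
}

}  // namespace profiler
}  // namespace tensorflow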
+double IdleTimeRatio(const OpMetricsDb& db); + +// Returns the idle time in picoseconds. +uint64 IdleTimePs(const OpMetricsDb& db); + +// Populates an OpMetrics record representing idle time, i.e., the amount of +// time spent without any op execution. +void SetIdleOp(uint64_t idle_time_ps, OpMetrics& metrics); + +// Adds an OpMetrics record representing idle time, i.e., the amount of time +// spent without any op execution. +// REQUIRED: All ops must have been added to the database and the total time +// must have been set. +void AddIdleOp(OpMetricsDb& db); + +// Returns true if the given metrics represents idle time. +inline bool IsIdleOp(const OpMetrics& metrics) { + return metrics.category() == kIdle; +} + +// Returns the time spent in children (nested) ops. +inline uint64_t ChildrenTimePs(const OpMetrics& metrics) { + return metrics.time_ps() - metrics.self_time_ps(); +} + +// Returns the ratio of time spent sending data from the host to the device +// relative to the total time the host was active. +std::optional HostInfeedEnqueueRatio(const OpMetricsDb& db); + +// Converts from the device op metrics to Tf-op metrics. +OpMetricsDb CreateTfMetricsDbFromDeviceOpMetricsDb( + const OpMetricsDb& device_op_metrics_db, bool with_idle = true); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_OP_METRICS_DB_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_utils.h new file mode 100644 index 00000000..b3329b08 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/op_utils.h @@ -0,0 +1,106 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "xla/tsl/profiler/utils/timespan.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/hlo_module_map.h" +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" + +namespace tensorflow { +namespace profiler { + +// Annotate the op_metrics with the metadata from the instr_wrapper. +void EnterOpMetadata(OpMetrics* op_metrics, + const HloInstructionWrapper* instr_wrapper); +void EnterOpMetadataFromHloModuleMap(OpMetrics* op_metrics, + const HloModuleMap& hlo_module_map); + +void AddFusionChildrenToOpMetricsFromHloInstruction( + OpMetrics* op_metrics, const HloInstructionWrapper* instr_wrapper); + +class HostOpMetricsDbBuilder : public OpMetricsDbBuilder { + public: + explicit HostOpMetricsDbBuilder(OpMetricsDb* db) : OpMetricsDbBuilder(db) {} + + // A function that will be called when the end of an OP is + // observed on a trace, where: + // name = the OP name. 
+ // category = the OP category. + // is_eager = whether this OP is eagerly executed. + // time_ps = the total execution time of the OP in picoseconds, including + // the execution time of its children. + // children_time_ps = the execution time of the children of this OP in + // picoseconds + void EnterOp(absl::string_view name, absl::string_view category, + bool is_eager, uint64 time_ps, uint64 children_time_ps); + + // Updates total_host_infeed_enq_duration_ps_ and + // total_host_infeed_enq_duration_ps_. + void EnterHostInfeedEnqueue(tsl::profiler::Timespan host_infeed_enqueue); + + private: + // The tsl::profiler::Timespan of the last InfeedEnqueue op on this thread. + tsl::profiler::Timespan last_host_infeed_enqueue_; +}; + +class DeviceOpMetricsDbBuilder : public OpMetricsDbBuilder { + public: + explicit DeviceOpMetricsDbBuilder(OpMetricsDb* db) : OpMetricsDbBuilder(db) {} + + // A function that will be called when the end of an OP is + // observed on a trace, where: + // program_id = the ID of the program that contains this OP. + // name = the OP name. + // category = the OP category. + // provenance = the provenance of this OP (e.g. original TF OP). + // is_eager = whether this OP is eagerly executed. + // occurrences = the number of occurrences of this OP. + // time_ps = the total execution time of the OP in picoseconds, including + // the execution time of its children. + // children_time_ps = the execution time of the children of this OP in + // picoseconds. + // flops = the number of floating-point operations computed. + // bytes_accessed = the sum of bytes read and bytes written by this OP. + // memory_accessed_breakdown = the breakdown of memory accessed by operation + // type and memory space. + void EnterOp(uint64 program_id, absl::string_view name, + absl::string_view category, absl::string_view provenance, + absl::string_view deduplicated_name, bool is_eager, + uint64 occurrences, uint64 time_ps, uint64 children_time_ps, + int64_t flops, int64_t bytes_accessed, + const protobuf::RepeatedPtrField& + memory_accessed_breakdown = {}, + int64_t model_flops = 0); + + void EnterOpMetadata(uint64 program_id, absl::string_view program_name, + absl::string_view category, absl::string_view provenance, + absl::string_view deduplicated_name, bool is_eager, + absl::string_view long_name = ""); + + void EnterOpMetadataFromHloModuleMap(uint64 program_id, + absl::string_view op_name, + const HloModuleMap& hlo_module_map); +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/step_intersection.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/step_intersection.h new file mode 100644 index 00000000..cf2961ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/step_intersection.h @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_STEP_INTERSECTION_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_STEP_INTERSECTION_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" + +namespace tensorflow { +namespace profiler { + +// Description of how two step sequences are aligned. +struct StepsAlignment { + uint32 begin_subordinate_idx; // where the alignment begins on the + // subordinate steps. + uint32 begin_chief_idx; // where the alignment begins on the chief steps. + uint32 num_steps; // aligned for how many steps. +}; + +class StepIntersection { + public: + StepIntersection( + uint32 max_steps, + const absl::flat_hash_map& + perhost_stepdb); + + // Returns the number of steps in the intersection. + uint32 NumSteps() const { return end_chief_idx_ - begin_chief_idx_; } + + // Returns the value of empty_intersect_ (see the explanation of + // empty_intersect_ below). + bool EmptyIntersect() const { return empty_intersect_; } + + // Returns the step numbers for the destination (i.e. the intersection + // result). + std::vector DstStepNumbers() const; + + // Returns the index to the step in the given host that corresponds to the + // first step in the intersection. + uint32 FirstStepIndex(uint32 host_id) const; + + // Returns the number of steps dropped due to the max_steps constraint + // specified in the constructor. + uint32 StepsDropped() const { return steps_dropped_; } + + std::string DebugString() const; + + private: + absl::flat_hash_map perhost_alignment_; + uint32 + chief_host_id_; // the host whose step sequence is selected as the chief. + uint32 steps_dropped_; // number of steps dropped. + // If NumSteps() is 0, empty_intersect indicates one of two possible reasons: + // (i) At least one host has some steps, but the intersection over all hosts + // is empty. In this case, empty_intersect is true, + // (ii) None of the hosts has any steps. In this case, empty_intersect is + // false. + // If NumSteps() > 0, empty_intersect is don't care. + bool empty_intersect_; + // The begin and end indices to the chief step sequence for this step + // intersection. Note that the begin index is inclusive but the end index is + // exclusive. + uint32 begin_chief_idx_; + uint32 end_chief_idx_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_STEP_INTERSECTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tfstreamz_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tfstreamz_utils.h new file mode 100644 index 00000000..25b7436c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tfstreamz_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_TFSTREAMZ_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_TFSTREAMZ_UTILS_H_ + +#include +#include + +#include "tensorflow/core/lib/monitoring/collected_metrics.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" + +namespace tensorflow { +namespace profiler { + +struct TfStreamzSnapshot { + std::unique_ptr metrics; + uint64 start_time_ns; // time before collection. + uint64 end_time_ns; // time after collection. +}; + +absl::Status SerializeToXPlane(const std::vector& snapshots, + XPlane* plane, uint64 line_start_time_ns); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_TFSTREAMZ_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_breakdown_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_breakdown_utils.h new file mode 100644 index 00000000..731481a4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_breakdown_utils.h @@ -0,0 +1,75 @@ +/* Copyright 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_BREAKDOWN_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_BREAKDOWN_UTILS_H_ + +#include + +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" + +namespace tensorflow { +namespace profiler { + +// Total duration of infeed from host or SparseCoreV0 to TensorCore. +inline uint64_t InfeedDurationPs(const TpuStepBreakdown& tpu) { + return tpu.infeed_duration_ps() + tpu.wait_for_scv0_duration_ps() + + tpu.scv0_infeed_transform_ps(); +} + +// Total duration of outfeed from TensorCore to host or SparseCoreV0. +inline uint64_t OutfeedDurationPs(const TpuStepBreakdown& tpu) { + return tpu.host_outfeed_ps() + tpu.scv0_outfeed_ps(); +} + +// Total duration of infeed from host to SparseCoreV0. +inline uint64_t ScV0InfeedDurationPs(const TpuStepBreakdown& tpu) { + return tpu.wait_for_scv0_duration_ps() * tpu.scv0_infeed_percent() / 100.0; +} + +// Total duration of SparseCoreV0 compute. +inline uint64_t ScV0ComputeDurationPs(const TpuStepBreakdown& tpu) { + return tpu.wait_for_scv0_duration_ps() - ScV0InfeedDurationPs(tpu); +} + +// Total duration of infeed from host to TensorCore or SparseCoreV0. +inline uint64_t TcPlusScV0InfeedDurationPs(const TpuStepBreakdown& tpu) { + return tpu.infeed_duration_ps() + ScV0InfeedDurationPs(tpu); +} + +// Total duration of send and recv ops. +inline uint64_t SendRecvDurationPs(const TpuStepBreakdown& tpu) { + return tpu.send_duration_ps() + tpu.recv_duration_ps(); +} + +// Total duration of host send and host recv ops. 
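A sketch of a derived ratio built from the TPU step-breakdown helpers above; the step duration is assumed to come from the surrounding step database, and InfeedFraction is an illustrative helper, not part of the header.

#include <cstdint>

#include "tensorflow/core/profiler/utils/tpu_step_breakdown_utils.h"

namespace tensorflow {
namespace profiler {

// Fraction of a TPU step spent blocked on any infeed path, in [0, 1].
inline double InfeedFraction(const TpuStepBreakdown& tpu,
                             uint64_t step_duration_ps) {
  if (step_duration_ps == 0) return 0.0;
  return static_cast<double>(InfeedDurationPs(tpu)) /
         static_cast<double>(step_duration_ps);
}

}  // namespace profiler
}  // namespace tensorflow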
+inline uint64_t HostSendRecvDurationPs(const TpuStepBreakdown& tpu) { + return tpu.host_send_duration_ps() + tpu.host_recv_duration_ps(); +} + +// Total duration TensorCore spends waiting for host. +inline uint64_t WaitForHostDurationPs(const TpuStepBreakdown& tpu) { + return tpu.infeed_duration_ps() + tpu.host_outfeed_ps() + + HostSendRecvDurationPs(tpu) + tpu.tc_idle_ps(); +} + +// Total duration TensorCore spends waiting for host or SparseCoreV0. +inline uint64_t WaitForHostOrScV0DurationPs(const TpuStepBreakdown& tpu) { + return WaitForHostDurationPs(tpu) + tpu.wait_for_scv0_duration_ps(); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_BREAKDOWN_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_details_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_details_utils.h new file mode 100644 index 00000000..d26e4973 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/tpu_step_details_utils.h @@ -0,0 +1,51 @@ +/* Copyright 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_DETAILS_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_DETAILS_UTILS_H_ + +#include + +#include "tensorflow/core/profiler/protobuf/tpu_input_pipeline.pb.h" + +namespace tensorflow { +namespace profiler { + +inline double ComputeTimeMs(const PerTpuStepDetails& details) { + return details.tc_compute_time_ms() + details.scv0_compute_time_ms(); +} + +inline double InfeedTimeMs(const PerTpuStepDetails& details) { + return details.tc_infeed_time_ms() + details.scv0_infeed_time_ms(); +} + +inline double AllReduceTimeMs(const PerTpuStepDetails& details) { + return details.all_reduce_compute_time_ms() + + details.all_reduce_sync_time_ms(); +} + +inline double NonIdleTimeMs(const PerTpuStepDetails& details) { + return ComputeTimeMs(details) + InfeedTimeMs(details) + + AllReduceTimeMs(details) + details.tc_outfeed_time_ms(); +} + +// Time spent by a training step on TPU. +inline double StepTimeMs(const PerTpuStepDetails& details) { + return NonIdleTimeMs(details) + details.tc_idle_time_ms(); +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_TPU_STEP_DETAILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/trace_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/trace_utils.h new file mode 100644 index 00000000..89e2b4cd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/trace_utils.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_TRACE_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_TRACE_UTILS_H_ + +#include "xla/tsl/profiler/utils/trace_utils.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::IsDerivedThreadId; // NOLINT +using tsl::profiler::kFirstDeviceId; // NOLINT +using tsl::profiler::kHostThreadsDeviceId; // NOLINT +using tsl::profiler::kLastDeviceId; // NOLINT +using tsl::profiler::kThreadIdDerivedMax; // NOLINT +using tsl::profiler::kThreadIdDerivedMin; // NOLINT +using tsl::profiler::kThreadIdHloModule; // NOLINT +using tsl::profiler::kThreadIdHloOp; // NOLINT +using tsl::profiler::kThreadIdHostOffloadOpEnd; // NOLINT +using tsl::profiler::kThreadIdHostOffloadOpStart; // NOLINT +using tsl::profiler::kThreadIdKernelLaunch; // NOLINT +using tsl::profiler::kThreadIdOverhead; // NOLINT +using tsl::profiler::kThreadIdSource; // NOLINT +using tsl::profiler::kThreadIdStepInfo; // NOLINT +using tsl::profiler::kThreadIdTfNameScope; // NOLINT +using tsl::profiler::kThreadIdTfOp; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_TRACE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_builder.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_builder.h new file mode 100644 index 00000000..c0e2c39b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_builder.h @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_BUILDER_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_BUILDER_H_ + +#include + +#include +#include +#include +#include + +#include "xla/tsl/profiler/utils/xplane_builder.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::XEventBuilder; // NOLINT +using tsl::profiler::XLineBuilder; // NOLINT +using tsl::profiler::XPlaneBuilder; // NOLINT +using tsl::profiler::XStatsBuilder; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_schema.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_schema.h new file mode 100644 index 00000000..cfa748bf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_schema.h @@ -0,0 +1,79 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ + +#include "xla/tsl/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::FindHostEventType; // NOLINT +using tsl::profiler::FindStatType; // NOLINT +using tsl::profiler::FindTfOpEventType; // NOLINT +using tsl::profiler::GetHostEventTypeStr; // NOLINT +using tsl::profiler::GetStatTypeStr; // NOLINT +using tsl::profiler::GpuPlaneName; // NOLINT +using tsl::profiler::HostEventType; // NOLINT +using tsl::profiler::IsHostEventType; // NOLINT +using tsl::profiler::IsInternalEvent; // NOLINT +using tsl::profiler::IsInternalStat; // NOLINT +using tsl::profiler::IsStatType; // NOLINT +using tsl::profiler::kCuptiDriverApiPlaneName; // NOLINT +using tsl::profiler::kCustomPlanePrefix; // NOLINT +using tsl::profiler::kDeviceVendorAMD; // NOLINT +using tsl::profiler::kDeviceVendorNvidia; // NOLINT +using tsl::profiler::kGpuPlanePrefix; // NOLINT +using tsl::profiler::kHostOffloadOpLineName; // NOLINT +using tsl::profiler::kHostThreadsPlaneName; // NOLINT +using tsl::profiler::kKernelLaunchLineName; // NOLINT +using tsl::profiler::kMegaScaleBarrier; // NOLINT +using tsl::profiler::kMegaScaleD2HTransferFinished; // NOLINT +using tsl::profiler::kMegaScaleD2HTransferStart; // NOLINT +using tsl::profiler::kMegaScaleDcnReceive; // NOLINT +using tsl::profiler::kMegaScaleDcnSend; // NOLINT +using tsl::profiler::kMegaScaleDcnSendFinished; // NOLINT +using tsl::profiler::kMegaScaleH2DTransferFinished; // NOLINT +using tsl::profiler::kMegaScaleH2DTransferStart; // NOLINT +using tsl::profiler::kMegaScaleHostCommand; // NOLINT +using tsl::profiler::kMegaScaleTopologyDiscovery; // NOLINT +using tsl::profiler::kMetadataPlaneName; // NOLINT +using tsl::profiler::kPythonTracerPlaneName; // NOLINT +using 
tsl::profiler::kRoctracerApiPlaneName; // NOLINT +using tsl::profiler::kSourceLineName; // NOLINT +using tsl::profiler::kSparseCorePlaneRegex; // NOLINT +using tsl::profiler::kStepLineName; // NOLINT +using tsl::profiler::kTensorFlowNameScopeLineName; // NOLINT +using tsl::profiler::kTensorFlowOpLineName; // NOLINT +using tsl::profiler::kTFStreamzPlaneName; // NOLINT +using tsl::profiler::kTpuPlanePrefix; // NOLINT +using tsl::profiler::kTpuPlaneRegex; // NOLINT +using tsl::profiler::kTpuRuntimePlaneName; // NOLINT +using tsl::profiler::kXlaAsyncOpLineName; // NOLINT +using tsl::profiler::kXlaModuleLineName; // NOLINT +using tsl::profiler::kXlaOpLineName; // NOLINT +using tsl::profiler::kXProfMetadataBufferSize; // NOLINT +using tsl::profiler::kXProfMetadataFlow; // NOLINT +using tsl::profiler::kXProfMetadataKey; // NOLINT +using tsl::profiler::kXProfMetadataTransfers; // NOLINT +using tsl::profiler::StatType; // NOLINT +using tsl::profiler::TpuPlaneName; // NOLINT +using tsl::profiler::XFlow; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_test_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_test_utils.h new file mode 100644 index 00000000..c2619394 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_test_utils.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ + +#include + +#include "absl/strings/string_view.h" +#include "absl/types/variant.h" +#include "xla/tsl/profiler/utils/xplane_test_utils.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::CreateTfFunctionCallEvent; // NOLINT +using tsl::profiler::CreateXEvent; // NOLINT +using tsl::profiler::GetOrCreateGpuXPlane; // NOLINT +using tsl::profiler::GetOrCreateHostXPlane; // NOLINT +using tsl::profiler::GetOrCreateTpuXPlane; // NOLINT +using tsl::profiler::XStatValue; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_utils.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_utils.h new file mode 100644 index 00000000..9292ed6a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_utils.h @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_UTILS_H_ + +#include +#include +#include +#include + +#include "xla/tsl/profiler/utils/xplane_utils.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::AddFlowsToXplane; // NOLINT +using tsl::profiler::AggregateXPlane; // NOLINT +using tsl::profiler::FindLinesWithId; // NOLINT +using tsl::profiler::FindLineWithId; // NOLINT +using tsl::profiler::FindLineWithName; // NOLINT +using tsl::profiler::FindMutablePlanes; // NOLINT +using tsl::profiler::FindMutablePlanesWithPrefix; // NOLINT +using tsl::profiler::FindMutablePlaneWithName; // NOLINT +using tsl::profiler::FindOrAddMutablePlaneWithName; // NOLINT +using tsl::profiler::FindOrAddMutableStat; // NOLINT +using tsl::profiler::FindPlanes; // NOLINT +using tsl::profiler::FindPlanesWithNames; // NOLINT +using tsl::profiler::FindPlanesWithPrefix; // NOLINT +using tsl::profiler::FindPlaneWithName; // NOLINT +using tsl::profiler::GetDevicePlaneFingerprint; // NOLINT +using tsl::profiler::GetSortedEvents; // NOLINT +using tsl::profiler::GetStartTimestampNs; // NOLINT +using tsl::profiler::IsEmpty; // NOLINT +using tsl::profiler::MergePlanes; // NOLINT +using tsl::profiler::NormalizeTimestamps; // NOLINT +using tsl::profiler::RemoveEmptyLines; // NOLINT +using tsl::profiler::RemoveEmptyPlanes; // NOLINT +using tsl::profiler::RemoveEvents; // NOLINT +using tsl::profiler::RemoveLine; // NOLINT +using tsl::profiler::RemovePlane; // NOLINT +using tsl::profiler::RemovePlanes; // NOLINT +using tsl::profiler::SortPlanesById; // NOLINT +using tsl::profiler::SortXLinesBy; // NOLINT +using tsl::profiler::SortXPlane; // NOLINT +using tsl::profiler::SortXSpace; // NOLINT +using tsl::profiler::XEventContextTracker; // NOLINT +using tsl::profiler::XEventsComparator; // NOLINT +using tsl::profiler::XEventTimespan; // NOLINT +using tsl::profiler::XLinesComparatorByName; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_visitor.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_visitor.h new file mode 100644 index 00000000..81db4a4f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xplane_visitor.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ + +#include "xla/tsl/profiler/utils/xplane_visitor.h" + +namespace tensorflow { +namespace profiler { + +using tsl::profiler::TypeGetter; // NOLINT +using tsl::profiler::TypeGetterList; // NOLINT +using tsl::profiler::XEventMetadataVisitor; // NOLINT +using tsl::profiler::XEventVisitor; // NOLINT +using tsl::profiler::XLineVisitor; // NOLINT +using tsl::profiler::XPlaneVisitor; // NOLINT +using tsl::profiler::XStatsOwner; // NOLINT +using tsl::profiler::XStatVisitor; // NOLINT + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xprof_gpu_cost_analysis.h b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xprof_gpu_cost_analysis.h new file mode 100644 index 00000000..6977295c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/profiler/utils/xprof_gpu_cost_analysis.h @@ -0,0 +1,53 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPROF_GPU_COST_ANALYSIS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPROF_GPU_COST_ANALYSIS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/service/gpu/model/gpu_hlo_cost_analysis.h" +#include "xla/service/hlo_cost_analysis.h" + +namespace tensorflow { +namespace profiler { + +// XProfGpuCostAnalysis provides additional cost analysis for XProf, which +// normalizes the flops to the device flops based on input bit widths. +class XProfGpuCostAnalysis : public xla::gpu::GpuHloCostAnalysis { + public: + explicit XProfGpuCostAnalysis(const xla::HloCostAnalysis::Options& options) + : xla::gpu::GpuHloCostAnalysis(options) {} + + absl::Status Postprocess(const xla::HloInstruction* hlo) override; + + int64_t GetDeviceFlopsAdjustment(const xla::HloInstruction& hlo); + + protected: + std::unique_ptr CreateNestedCostAnalysis() override; + + private: + static inline constexpr absl::string_view kDeviceFlopsAdjustment = + "device_flops_adjustment"; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPROF_GPU_COST_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/public/session.h b/third_party/tflite-hdrs/tensorflow/core/public/session.h new file mode 100644 index 00000000..b16a5955 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/public/session.h @@ -0,0 +1,362 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PUBLIC_SESSION_H_ +#define TENSORFLOW_CORE_PUBLIC_SESSION_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/threadpool_options.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +class DeviceMgr; + +/// \brief A Session instance lets a caller drive a TensorFlow graph +/// computation. +/// +/// When a Session is created with a given target, a new Session object +/// is bound to the universe of resources specified by that target. +/// Those resources are available to this session to perform +/// computation described in the GraphDef. After extending the session +/// with a graph, the caller uses the Run() API to perform the +/// computation and potentially fetch outputs as Tensors. +/// +/// Example: +/// +/// ```c++ +/// +/// tensorflow::GraphDef graph; +/// // ... Create or load graph into "graph". +/// +/// // This example uses the default options which connects +/// // to a local runtime. +/// tensorflow::SessionOptions options; +/// std::unique_ptr<tensorflow::Session> +/// session(tensorflow::NewSession(options)); +/// +/// // Create the session with this graph. +/// tensorflow::Status s = session->Create(graph); +/// if (!s.ok()) { ... } +/// +/// // Run the graph and fetch the first output of the "output" +/// // operation, and also run to but do not return anything +/// // for the "update_state" operation. +/// std::vector<tensorflow::Tensor> outputs; +/// s = session->Run({}, {"output:0"}, {"update_state"}, &outputs); +/// if (!s.ok()) { ... } +/// +/// // Map the output as a flattened float tensor, and do something +/// // with it. +/// auto output_tensor = outputs[0].flat<float>(); +/// if (output_tensor(0) > 0.5) { ... } +/// +/// // Close the session to release the resources associated with +/// // this session. +/// session->Close(); +/// +/// ``` +/// +/// A Session allows concurrent calls to Run(), though a Session must +/// be created / extended by a single thread. +/// +/// Only one thread must call Close(), and Close() must only be called +/// after all other calls to Run() have returned. +class Session { + public: + Session(); + virtual ~Session(); + + /// \brief Create the graph to be used for the session. + /// + /// Returns an error if this session has already been created with a + /// graph. To re-use the session with a different graph, the caller + /// must Close() the session first. + virtual absl::Status Create(const GraphDef& graph) = 0; +#ifndef SWIG + virtual absl::Status Create(GraphDef&& graph) { return Create(graph); } +#endif + + /// \brief Adds operations to the graph that is already registered with the + /// Session.
+ /// + /// The names of new operations in "graph" must not exist in the + /// graph that is already registered. + virtual absl::Status Extend(const GraphDef& graph) = 0; +#ifndef SWIG + virtual absl::Status Extend(GraphDef&& graph) { return Extend(graph); } +#endif + + /// \brief Runs the graph with the provided input tensors and fills + /// `outputs` for the endpoints specified in `output_tensor_names`. + /// Runs to but does not return Tensors for the nodes in + /// `target_tensor_names`. + /// + /// The order of tensors in `outputs` will match the order provided + /// by `output_tensor_names`. + /// + /// If `Run` returns `OK()`, then `outputs->size()` will be equal to + /// `output_tensor_names.size()`. If `Run` does not return `OK()`, the + /// state of `outputs` is undefined. + /// + /// REQUIRES: The name of each Tensor of the input or output must + /// match a "Tensor endpoint" in the `GraphDef` passed to `Create()`. + /// + /// REQUIRES: At least one of `output_tensor_names` and + /// `target_tensor_names` must be non-empty. + /// + /// REQUIRES: outputs is not nullptr if `output_tensor_names` is non-empty. + virtual absl::Status Run( + const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_tensor_names, + std::vector* outputs) = 0; + + /// \brief Implementations which support `RunOptions`. + // + /// NOTE: This API is still experimental and may change. + virtual absl::Status Create(const RunOptions& run_options, + const GraphDef& graph) { + return absl::UnimplementedError( + "Create(const RunOptions& run_options, const GraphDef& graph) is not " + "supported for this session."); + } + virtual absl::Status Extend(const RunOptions& run_options, + const GraphDef& graph) { + return absl::UnimplementedError( + "Extend(const RunOptions& run_options, const GraphDef& graph) is not " + "supported for this session."); + } +#ifndef SWIG + virtual absl::Status Create(const RunOptions& run_options, GraphDef&& graph) { + return Create(run_options, graph); + } + virtual absl::Status Extend(const RunOptions& run_options, GraphDef&& graph) { + return Extend(run_options, graph); + } +#endif + virtual absl::Status Close(const RunOptions& run_options) { + return absl::UnimplementedError( + "Close(const RunOptions& run_options) is not supported for this " + "session."); + } + + /// \brief Like `Run`, but allows users to pass in a `RunOptions` proto and + /// to retrieve non-Tensor metadata output via a `RunMetadata` proto for this + /// step. `run_metadata` may be nullptr, in which case any metadata output is + /// discarded. + /// NOTE: This API is still experimental and may change. + virtual absl::Status Run( + const RunOptions& run_options, + const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_tensor_names, + std::vector* outputs, RunMetadata* run_metadata); + + /// \brief Like `Run` with `RunOptions` proto, but allows user to provide + /// custom threadpool implementation via ThreadPoolOptions. + /// NOTE: This API is still experimental and may change. + virtual absl::Status Run( + const RunOptions& run_options, + const std::vector >& inputs, + const std::vector& output_tensor_names, + const std::vector& target_tensor_names, + std::vector* outputs, RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) { + return absl::UnimplementedError( + "Run with threadpool is not supported for this session."); + } + + /// \brief Sets up a graph for partial execution. 
All future feeds and + /// fetches are specified by `input_names` and `output_names`. Returns + /// `handle` that can be used to perform a sequence of partial feeds and + /// fetches. + /// NOTE: This API is still experimental and may change. + virtual absl::Status PRunSetup(const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes, + std::string* handle); + + /// \brief Continues the pending execution specified by `handle` with the + /// provided input tensors and fills `outputs` for the endpoints specified + /// in `output_names`. + /// NOTE: This API is still experimental and may change. + virtual absl::Status PRun( + const std::string& handle, + const std::vector >& inputs, + const std::vector& output_names, + std::vector* outputs); + + /// \brief List devices in the session. + /// + /// Retrieves the list of available devices within the session, and populates + /// *response. This API is optional. If it is unimplemented, Status will + /// return a corresponding error message, and *response will be unmodified. + virtual absl::Status ListDevices(std::vector* response) = 0; + + /// \brief Closes this session. + /// + /// Closing a session releases the resources used by this session + /// on the TensorFlow runtime (specified during session creation by + /// the `SessionOptions::target` field). + virtual absl::Status Close() = 0; + + // NOTE(ashankar): As of July 2017, this method was added to facilitate some + // experimentation. Reconsider/re-evaluate after September 2017. + // + // Sets `*output` to the `DeviceMgr` that owns accessible devices in the + // address-space of the caller. + virtual absl::Status LocalDeviceManager(const DeviceMgr** output) { + return absl::UnimplementedError( + "LocalDeviceManager is not supported for this session."); + } + + /// \brief A handle to a subgraph, created with `Session::MakeCallable()`. + typedef int64_t CallableHandle; + + /// \brief Creates a `handle` for invoking the subgraph defined by + /// `callable_options`. + /// NOTE: This API is still experimental and may change. + virtual absl::Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) { + return absl::UnimplementedError( + "MakeCallable is not supported for this session."); + } + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. + /// + /// The order of tensors in `feed_tensors` must and `fetch_tensors` will + /// match the order of names in `CallableOptions::feed()` and + /// `CallableOptions::fetch()` when this subgraph was created. + /// NOTE: This API is still experimental and may change. + virtual absl::Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) { + return absl::UnimplementedError( + "RunCallable is not supported for this session."); + } + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. User can provide custom threadpool implementation via + /// threadpool_options. + /// + /// The order of tensors in `feed_tensors` must and `fetch_tensors` will + /// match the order of names in `CallableOptions::feed()` and + /// `CallableOptions::fetch()` when this subgraph was created. + /// NOTE: This API is still experimental and may change. 
+ virtual absl::Status RunCallable( + CallableHandle handle, const std::vector& feed_tensors, + std::vector* fetch_tensors, RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) { + return absl::UnimplementedError( + "RunCallable with threadpool is not supported for this session."); + } + + /// \brief Releases resources associated with the given `handle` in this + /// session. + /// NOTE: This API is still experimental and may change. + virtual absl::Status ReleaseCallable(CallableHandle handle) { + return absl::UnimplementedError( + "ReleaseCallable is not supported for this session."); + } + + /// \brief Release global graph-related state in this session. + /// + /// After calling `this->Finalize()`, calls to `this->Run()` with previously + /// unseen feeds and fetches, and calls to `this->MakeCallable()` will fail. + /// Using `MakeCallable()` and `RunCallable()` is recommended, because + /// explicit callable creation makes it clearer where the `Finalize()` call + /// should be placed. + /// + /// This API can be used in conjunction with a "warmup" phase to reduce the + /// memory consumed by the session: + /// + /// 1. Call `Session::Create()`. + /// 2. Call `Session::MakeCallable()` for all subgraphs that you will execute + /// in the session. + /// 3. Call `Session::Finalize()` to release global graph-related state. + /// 4. Call `Session::RunCallable()` with the handle(s) created in step 2. + /// + /// NOTE: This API is still experimental and may change. + virtual absl::Status Finalize() { + return absl::UnimplementedError( + "Finalize is not supported for this session."); + } +}; + +/// \brief Create a new session with the given options. +/// +/// If session creation succeeds, the new `Session` will be stored in +/// `*out_session`, the caller will take ownership of the returned +/// `*out_session`, and this function will return `OK()`. Otherwise, this +/// function will return an error status and set *out_session to nullptr. +absl::Status NewSession(const SessionOptions& options, Session** out_session); + +/// \brief Resets resource containers associated with a target. +/// +/// Reset() allows misbehaving or slow sessions to be aborted and closed, and +/// causes their resources eventually to be released. Reset() does not wait +/// for the computations in old sessions to cease; it merely starts the +/// process of tearing them down. However, if a new session is started after +/// a Reset(), the new session is isolated from changes that old sessions +/// (started prior to the Reset()) may continue to make to resources, provided +/// all those resources are in containers listed in "containers". +/// +/// Old sessions may continue to have side-effects on resources not in +/// containers listed in "containers", and thus may affect future +/// sessions' results in ways that are hard to predict. Thus, if well-defined +/// behavior is desired, it is recommended that all containers be listed in +/// "containers". +/// +/// `containers` is a vector of string representation of resource container +/// names. When a resource container is reset, the resources held by the +/// container will be released. In particular, all Variables in the container +/// will become undefined. If the "containers" vector is empty, the default +/// container is assumed. If the "containers" vector is non-empty, the +/// default container should be listed explicitly. +/// +/// If Reset succeeds, this function will return `OK()`. Otherwise, this +/// function will return an error status. 
+absl::Status Reset(const SessionOptions& options, + const std::vector& containers); + +/// \brief Create a new session with the given options. +/// +/// If a new `Session` object could not be created, this function will +/// return nullptr. +/// +/// *Strongly prefer* the version of NewSession that returns Status, +/// which contains more helpful error information. +Session* NewSession(const SessionOptions& options); + +/// \brief Export the metric that indicates the session is created. +void SetSessionCreatedMetric(); + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_PUBLIC_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/public/session_options.h b/third_party/tflite-hdrs/tensorflow/core/public/session_options.h new file mode 100644 index 00000000..92134528 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/public/session_options.h @@ -0,0 +1,67 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PUBLIC_SESSION_OPTIONS_H_ +#define TENSORFLOW_CORE_PUBLIC_SESSION_OPTIONS_H_ + +#include + +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tsl { +class Env; +} // namespace tsl +namespace tensorflow { + +/// Configuration information for a Session. +struct SessionOptions { + /// The environment to use. + tsl::Env* env; + + /// \brief The TensorFlow runtime to connect to. + /// + /// If 'target' is empty or unspecified, the local TensorFlow runtime + /// implementation will be used. Otherwise, the TensorFlow engine + /// defined by 'target' will be used to perform all computations. + /// + /// "target" can be either a single entry or a comma separated list + /// of entries. Each entry is a resolvable address of the + /// following format: + /// local + /// ip:port + /// host:port + /// ... other system-specific formats to identify tasks and jobs ... + /// + /// NOTE: at the moment 'local' maps to an in-process service-based + /// runtime. + /// + /// Upon creation, a single session affines itself to one of the + /// remote processes, with possible load balancing choices when the + /// "target" resolves to a list of possible processes. + /// + /// If the session disconnects from the remote process during its + /// lifetime, session calls may fail immediately. + std::string target; + + /// Configuration options. + ConfigProto config; + + SessionOptions(); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_PUBLIC_SESSION_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/public/version.h b/third_party/tflite-hdrs/tensorflow/core/public/version.h new file mode 100644 index 00000000..72ec42a5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/public/version.h @@ -0,0 +1,127 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PUBLIC_VERSION_H_ +#define TENSORFLOW_CORE_PUBLIC_VERSION_H_ + +// TensorFlow uses semantic versioning, see http://semver.org/. + +// Also update tensorflow/tensorflow.bzl and +// tensorflow/tools/pip_package/setup.py +#define TF_MAJOR_VERSION 2 +#define TF_MINOR_VERSION 19 +#define TF_PATCH_VERSION 0 + +// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", +// "-beta", "-rc", "-rc.1") +#define TF_VERSION_SUFFIX "" + +#define TF_STR_HELPER(x) #x +#define TF_STR(x) TF_STR_HELPER(x) + +// e.g. "0.5.0" or "0.6.0-alpha". +#define TF_VERSION_STRING \ + (TF_STR(TF_MAJOR_VERSION) "." TF_STR(TF_MINOR_VERSION) "." TF_STR( \ + TF_PATCH_VERSION) TF_VERSION_SUFFIX) + +// GraphDef compatibility versions (the versions field in graph.proto). +// +// Each graph has producer and min_consumer versions, and each +// consumer has its own version and a min_producer. In addition, graphs can +// mark specific consumer versions as bad (to prevent bugs from executing). +// A consumer will execute a graph if the consumer's version is at least the +// graph's min_consumer, the graph's producer version is at least the consumer's +// min_producer, and the consumer version isn't specifically disallowed by the +// graph. +// +// By default, newly created graphs have producer version TF_GRAPH_DEF_VERSION +// min_consumer TF_GRAPH_DEF_MIN_CONSUMER, and no other bad consumer versions. +// +// Version history: +// +// 0. Graphs created before GraphDef versioning +// 1. First real version (2dec2015) +// 2. adjust_contrast only takes float, doesn't perform clamping (11dec2015) +// 3. Remove TileGrad, since it was equivalent to reduce_sum (30dec2015) +// 4. When support for this version is removed, we can safely make AttrValue +// parsing more strict with respect to empty list values (see +// 111635679, 7jan2016). +// 5. Graphs are wholly-validated during Session::Create() (7jan2016). +// 6. TensorFlow is scalar strict within Google (27jan2016). +// 7. Remove TopK in favor of TopKV2 (5feb2016). +// 8. Replace RandomCrop from C++ with pure Python (5feb2016). +// 9. Deprecate batch_norm_with_global_normalization (16feb2016). +// 10. Deprecate conv3d_backprop_{filter,input} (10jun2016). +// 11. Deprecate {batch}_self_adjoint_eig (3aug2016). +// 12. Graph consumers understand the node_def field of FunctionDef (22aug2016). +// 13. Deprecate multiple batch linear algebra ops (9sep2016). +// 14. Deprecate batch_matrix_* ops. (10sep2016). +// 15. Deprecate batch_fft_* ops. (14sep2016). +// 16. Deprecate tensor_array (v1) ops in favor of v2 (10nov2016). +// 17. Deprecate inv (11nov2016). +// 17. Expose reverse_v2 (10nov2016) +// 18. Add VariableV2 (30nov2016) +// 19. Deprecated ops created by models moved out of core SkipGram, NegTrain. +// (08dec2016) +// 20. Catch all version 1.0 changes to Python API generation. 
SplitV is now +// used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is +// now used by tf.concat. Graphs use flooring +// division and mod semantics. TensorArrayV3. (12dec2016) +// Also considered the version for when it is required for reduction +// ops' indices to be scalar or vector, and not higher rank. +// Some earlier graph def versions allowed this. +// 21. Dropped FunctionDef.Node support, switched to node_def introduced +// in version 12. (11jan2017) +// 22. Placeholder now can specify and enforce scalar and partial +// shapes, particularly when restoring a graph from GraphDef +// produced at version 22 or later. (04/10/2016) +// 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2. +// 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017) +// 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15). +// 25. Deprecate RandomPoisson (v1) ops in favor of v2 (2017/10/25). +// 26. Add a bool 'stripped_default_attrs' to MetaInfoDef indicating +// whether default-valued attrs have been stripped from the nodes in the +// GraphDef. (7dec2017) +// 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops +// deprecated in favor of V2 ops. (2018/01/23) +// 28. Deprecate MatrixExponential op in favor of Python implementation. +// (2018/08/21). +// (2019/02/15). Added `control_ret` field to FunctionDef proto, and +// `control_output` field to OpDef proto. +// 29. Deprecate StatefulStandardNormal op in favor of StatefulStandardNormalV2. +// (2019/03/25). +// (2019/04/17). Added `arg_attr` field to FunctionDefProto. +// 30. (2019/05/09) First date based GraphDef version. GraphDef +// versions advance by 1 each day after this point. + +#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 +#define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 +#define TF_GRAPH_DEF_VERSION 2102 // Updated: 2025/1/9 + +// Checkpoint compatibility versions (the versions field in SavedSliceMeta). +// +// The checkpoint versions have the same semantics as GraphDef versions, but the +// numbering scheme is separate. We have no plans to ever deprecate checkpoint +// versions, but it's good to have this in place in case we ever need to. +// +// Version history: +// +// 0. Checkpoints saved before checkpoint versioning. +// 1. First real version (10feb2015). +#define TF_CHECKPOINT_VERSION_MIN_PRODUCER 0 +#define TF_CHECKPOINT_VERSION_MIN_CONSUMER 0 +#define TF_CHECKPOINT_VERSION 1 + +#endif // TENSORFLOW_CORE_PUBLIC_VERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/bef_executor_flags.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/bef_executor_flags.h new file mode 100644 index 00000000..eccc43de --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/bef_executor_flags.h @@ -0,0 +1,51 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_BEF_EXECUTOR_FLAGS_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_BEF_EXECUTOR_FLAGS_H_ + +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "tfrt/bef_executor_driver/bef_executor_driver.h" // from @tf_runtime + +namespace tfrt { +ABSL_CONST_INIT extern const char kDefaultInputFilename[]; + +struct HostAllocatorTypeWrapper { + HostAllocatorTypeWrapper(HostAllocatorType type) : type(type) {} + operator HostAllocatorType() { return type; } + HostAllocatorType type; +}; + +} // namespace tfrt + +ABSL_DECLARE_FLAG(std::string, input_filename); +ABSL_DECLARE_FLAG(std::string, shared_libs); +ABSL_DECLARE_FLAG(std::string, functions); +ABSL_DECLARE_FLAG(std::string, test_init_function); +ABSL_DECLARE_FLAG(std::string, work_queue_type); +ABSL_DECLARE_FLAG(tfrt::HostAllocatorTypeWrapper, host_allocator_type); + +namespace tfrt { + +bool AbslParseFlag(absl::string_view text, + tfrt::HostAllocatorTypeWrapper* host_allocator_type, + std::string* error); + +std::string AbslUnparseFlag(tfrt::HostAllocatorTypeWrapper host_allocator_type); + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_BEF_EXECUTOR_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/conversion/conversion.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/conversion/conversion.h new file mode 100644 index 00000000..c31855e2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/conversion/conversion.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements conversion function between RuntimeFallback and +// KernelFallback. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_CONVERSION_CONVERSION_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_CONVERSION_CONVERSION_H_ + +namespace tfrt { + +class TensorConversionFnRegistry; + +} + +namespace tensorflow { +namespace tfd { +void RegisterRuntimeFallbackTensorToKernelFallbackConversionFn( + tfrt::TensorConversionFnRegistry* registry); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_CONVERSION_CONVERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/attr_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/attr_util.h new file mode 100644 index 00000000..4abbb4f8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/attr_util.h @@ -0,0 +1,54 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_ATTR_UTIL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_ATTR_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "llvm/ADT/StringMap.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/runtime_fallback/util/attr_util.h" +#include "tensorflow/core/util/padding.h" +#include "tfrt/core_runtime/op_attrs.h" // from @tf_runtime +#include "tfrt/host_context/kernel_utils.h" // from @tf_runtime + +namespace tensorflow { + +// Map from attribute name to a string value representation. +typedef llvm::StringMap AttrMap; + +// Parse value from the given string input. +absl::Status ParseValue(absl::string_view input, bool* value); +absl::Status ParseValue(absl::string_view input, int32* value); +absl::Status ParseValue(absl::string_view input, DataType* value); +absl::Status ParseValue(absl::string_view input, std::string* value); +absl::Status ParseValue(absl::string_view input, std::vector* value); +absl::Status ParseValue(absl::string_view input, Padding* value); + +absl::Status AddOpAttr(const std::string& name, const std::string& attr_value, + tfrt::OpAttrs* opattrs); + +absl::Status FillOpAttrs(tfrt::RemainingAttributes attrs, + tfrt::OpAttrs* opattrs); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_ATTR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/conversion/conversion.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/conversion/conversion.h new file mode 100644 index 00000000..782e31f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/conversion/conversion.h @@ -0,0 +1,42 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements conversion function between KernelFallback and Host +// Tensor. 
+ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_CONVERSION_CONVERSION_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_CONVERSION_CONVERSION_H_ + +#include "tfrt/support/forward_decls.h" // from @tf_runtime +namespace tfrt { + +class TensorConversionFnRegistry; +class DenseHostTensor; +class CpuDevice; +class Device; +class ExecutionContext; +} + +namespace tensorflow { +class KernelFallbackTensor; +namespace tfd { + +void RegisterKernelFallbackTensorConversionFn( + tfrt::TensorConversionFnRegistry* registry); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_CONVERSION_CONVERSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h new file mode 100644 index 00000000..6cfbf88c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h @@ -0,0 +1,249 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_COMPAT_REQUEST_STATE_H__ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_COMPAT_REQUEST_STATE_H__ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/core/tfrt/graph_executor/config.h" +#include "tensorflow/core/tfrt/utils/fallback_tensor.h" +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/pointer_util.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +// FallbackResourceArray holds the tensors that are computed only once during +// initialization and read-only afterwards. +class FallbackResourceArray { + public: + // Sets `tensor` in the array at `index`. `index` should be dense and + // duplicate indices are not allowed. + void SetResource(int index, tfrt_stub::ImmutableTensor tensor); + + // Returns the resource tensor wrapped in AsyncValue value at `index`. + tfrt::AsyncValuePtr GetResource(int index) const { + return resource_async_values_.at(index).AsPtr(); + } + + // Returns the resource tensor at `index`. 
+ const tfrt_stub::FallbackTensor& GetResourceAsFallbackTensor( + int index) const { + return GetResource(index).get(); + } + + private: + // `resources_` holds the ownership of all the resource tensors. Note that it + // may not be a one-to-one mapping between `resources_` and + // `resource_async_values_`. + std::vector> resources_; + + // Storage for async values with manually managed lifetime. + std::vector>> + resource_storage_; + + // `resource_async_values_` holds the UnRefCountedAsyncValue of the fallback + // tensors that can be directly used by fallback kernels in the graph. + std::vector> + resource_async_values_; +}; + +// Per-request state in kernel falllback compat mode. +class KernelFallbackCompatRequestState { + public: + // NOTE: This is the constructor for training. + KernelFallbackCompatRequestState( + std::function)>* runner, + const tensorflow::DeviceMgr* device_manager, int64_t step_id, + tfrt::OwnedOrUnownedPtr step_container, + std::unique_ptr collective_executor, + core::RefCountPtr rendezvous, + tfrt_stub::OpKernelRunnerTable* runner_table, + FallbackResourceArray* resource_array, + tensorflow::thread::ThreadPoolInterface* user_intra_op_threadpool, + const absl::optional& model_metadata, + const tensorflow::ProcessFunctionLibraryRuntime* pflr); + + // NOTE: This is the constructor for inference. + KernelFallbackCompatRequestState( + std::function)>* runner, + const tensorflow::DeviceMgr* device_manager, int64_t step_id, + tfrt_stub::OpKernelRunnerTable* runner_table, + FallbackResourceArray* resource_array, + tensorflow::thread::ThreadPoolInterface* user_intra_op_threadpool, + const absl::optional& model_metadata, + const tensorflow::ProcessFunctionLibraryRuntime* pflr); + + int64_t step_id() const { return step_id_; } + + // Returns the user-specified custom device corresponding to the given device. + // It is currently only used for configure per-request intra op threadpool. 
+ tensorflow::Device* custom_device(const tensorflow::Device* device) const { + auto it = custom_device_.find(device); + if (it == custom_device_.end()) return nullptr; + return it->second.get(); + } + + tensorflow::Device* cpu_device() const { return cpu_device_; } + tensorflow::FunctionLibraryRuntime* cpu_function_library_runtime() const { + return cpu_function_library_runtime_; + } + + ScopedStepContainer* step_container() const { return step_container_.get(); } + + const tensorflow::DeviceMgr& device_manager() const { + return *device_manager_; + } + + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime() const { + return *pflr_; + } + + CollectiveExecutor* collective_executor() const { + return collective_executor_; + } + + tfrt_stub::OpKernelRunnerTable* runner_table() const { return runner_table_; } + + FallbackResourceArray* resource_array() const { return resource_array_; } + + std::function)>* runner() const { return runner_; } + + CancellationManager* cancellation_manager() const { + return cancellation_manager_; + } + void set_cancellation_manager(CancellationManager* cancellation_manager) { + cancellation_manager_ = cancellation_manager; + } + + RendezvousInterface* rendezvous() const { return rendezvous_.get(); } + + void set_log_device_placement(bool log) { log_device_placement_ = log; } + bool log_device_placement() const { return log_device_placement_; } + + tensorflow::thread::ThreadPoolInterface* intra_op_threadpool() const { + return intra_op_threadpool_; + } + + const SessionMetadata& session_metadata() const { return session_metadata_; } + + // Nullable. + tensorflow::tfrt_stub::CostRecorder* cost_recorder() const { + return cost_recorder_; + } + void set_cost_recorder(tensorflow::tfrt_stub::CostRecorder* cost_recorder) { + cost_recorder_ = cost_recorder; + } + + // Nullable. + tfrt::ResourceContext* client_graph_resource_context() const { + return client_graph_resource_context_; + } + void set_client_graph_resource_context( + tfrt::ResourceContext* client_graph_resource_context) { + client_graph_resource_context_ = client_graph_resource_context; + } + + void set_runtime_config( + const tensorflow::tfrt_stub::RuntimeConfig* runtime_config) { + runtime_config_ = runtime_config; + } + + const tensorflow::tfrt_stub::RuntimeConfig* runtime_config() const { + return runtime_config_; + } + + private: + int64_t step_id_ = 0; + // Below are resources needed by current tensorflow. + std::function)>* runner_ = nullptr; + ::tfrt::OwnedOrUnownedPtr step_container_; + absl::flat_hash_map> + custom_device_; + std::unique_ptr custom_cpu_device_; + tensorflow::Device* cpu_device_ = nullptr; + tensorflow::FunctionLibraryRuntime* cpu_function_library_runtime_ = nullptr; + std::unique_ptr collective_executor_handle_; + CollectiveExecutor* collective_executor_ = nullptr; + core::RefCountPtr rendezvous_; + CancellationManager* cancellation_manager_ = nullptr; + + const tensorflow::DeviceMgr* device_manager_ = nullptr; + + // `runner_table` holds the prepopulated tensorflow::OpKernel instances for + // kernel fallback compat mode. + tfrt_stub::OpKernelRunnerTable* runner_table_ = nullptr; + + // Resource array is used for keeping static values in the runtime. It is + // accessed through tfrt_fallback_async.set_resource and + // tfrt_fallback_async.get_resource kernels. + FallbackResourceArray* resource_array_ = nullptr; + + tensorflow::thread::ThreadPoolInterface* intra_op_threadpool_ = nullptr; + + // Model metadata used for monitoring and tracing purpose. 
+ SessionMetadata session_metadata_; + + const tensorflow::ProcessFunctionLibraryRuntime* pflr_ = nullptr; + + bool log_device_placement_ = false; + + // Records the cost per op. + tensorflow::tfrt_stub::CostRecorder* cost_recorder_ = nullptr; + + tfrt::ResourceContext* client_graph_resource_context_ = nullptr; + + const tensorflow::tfrt_stub::RuntimeConfig* runtime_config_ = nullptr; +}; + +// Set up fallback context with common tensorflow states such as devices, +// function library runtime. They will be forwarded to tensorflow::OpKernel as +// in tensorflow::Executor. If `runner` is nullptr, internally it will use a +// default runner that executes tasks in the caller thread. +absl::Status SetUpKernelFallbackCompatRequestContext( + tfrt::RequestContextBuilder* builder, + const tensorflow::DeviceMgr* device_manager, + const tensorflow::ProcessFunctionLibraryRuntime* pflr, + tfrt_stub::OpKernelRunnerTable* runner_table, + FallbackResourceArray* resource_array, + tensorflow::thread::ThreadPoolInterface* user_intra_op_threadpool, + const std::optional& model_metadata, + std::function)>* runner, + tfrt_stub::CostRecorder* cost_recorder, + tfrt::ResourceContext* client_graph_resource_context, + tensorflow::CancellationManager* cancellation_manager, + const tensorflow::tfrt_stub::RuntimeConfig* runtime_config); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_COMPAT_REQUEST_STATE_H__ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute.h new file mode 100644 index 00000000..f0c6359b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute.h @@ -0,0 +1,51 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Provides a way to execute a TensorFlow kernel using TFRT kernel fallback. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "tfrt/core_runtime/op_attrs.h" // from @tf_runtime +#include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tfrt { +class AsyncKernelFrame; +} // namespace tfrt + +namespace tensorflow { +namespace tfd { + +enum KernelFallbackOutputType { + TENSOR = 0, // Output type is tensorflow::Tensor + KERNEL_FALLBACK_TENSOR = 1 // Output type is KernelFallbackTensor +}; + +// Runs kernel asynchronously. +// `frame` must contain tensorflow::Tensor inputs and pre-allocated +// tensorflow::Tensor or tfrt::KernelFallbackTensor outputs. 
+bool KernelFallbackExecute( + const tfrt::ExecutionContext& exec_ctx, tfrt::string_view op_name, + llvm::ArrayRef arguments, + llvm::MutableArrayRef> results, + const tfrt::OpAttrsRef& attrs, KernelFallbackOutputType output_type); +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.h new file mode 100644 index 00000000..a3888486 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tfrt/core_runtime/op_attrs.h" // from @tf_runtime +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime +#include "tfrt/host_context/chain.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/kernel_utils.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime + +namespace tfrt { +class SyncKernelFrame; +} // namespace tfrt + +namespace tensorflow { +namespace tfd { + +ABSL_CONST_INIT extern const char kOpKernelRunnerCacheResourceName[]; + +// The CoreRuntime dispatch function to run a TF kernel in kernel fallback +// compat mode. 
+tfrt::AsyncValueRef KernelFallbackExecuteCompatCoreRuntimeDispatch( + const tfrt::ExecutionContext& exec_ctx, tfrt::string_view op_name, + tfrt::string_view device_name, llvm::ArrayRef arguments, + llvm::MutableArrayRef> results, + const KernelFallbackCompatRequestState& fallback_request_state, + const tfrt_stub::OpKernelRunner& op_kernel_runner); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat_eager.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat_eager.h new file mode 100644 index 00000000..cf3e0014 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat_eager.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_EAGER_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_EAGER_H_ + +#include + +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tfrt/host_context/execution_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +// Runner_table can be nullptr. In that case, kernel_fallback will use +// the default runner_table. +absl::Status SetUpKernelFallbackCompatRequestContext( + tfrt::RequestContextBuilder* builder, + tfrt_stub::OpKernelRunnerTable* runner_table, + tensorflow::EagerContext* eager_context, + tensorflow::thread::ThreadPoolInterface* user_intra_op_threadpool = nullptr, + const absl::optional& model_metadata = std::nullopt); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_EXECUTE_COMPAT_EAGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.h new file mode 100644 index 00000000..b003c4e9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This file declares KernelFallbackOpHandler, responsible for running TFRT ops +// on Tensorflow. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_OP_HANDLER_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_OP_HANDLER_H_ + +#include "llvm/Support/Error.h" +#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/core_runtime/op_handler.h" // from @tf_runtime +#include "tfrt/host_context/device.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +llvm::Expected CreateKernelFallbackOpHandler( + tfrt::CoreRuntime* runtime, tfrt::RCReference device); + +} // namespace tfd +} // namespace tensorflow +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_OP_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h new file mode 100644 index 00000000..8ade7d00 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h @@ -0,0 +1,66 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares TF kernel fallback tensor. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_TENSOR_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_TENSOR_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tfrt/dtype/dtype.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime +#include "tfrt/tensor/tensor_shape.h" // from @tf_runtime + +namespace tensorflow { + +class BaseKernelFallbackTensor : public tfrt::Tensor { + public: + explicit BaseKernelFallbackTensor(::tensorflow::Tensor tensor); + BaseKernelFallbackTensor(const tfrt::TensorShape& shape, tfrt::DType dtype, + ::tensorflow::Tensor tensor); + + void Print(tfrt::raw_ostream& os) const override; + + const ::tensorflow::Tensor* GetTensor() const { return &tensor_; } + + private: + ::tensorflow::Tensor tensor_; + bool is_valid_type_; +}; + +class KernelFallbackTensor final + : public BaseKernelFallbackTensor, + public tfrt::TensorTraits { + public: + explicit KernelFallbackTensor(::tensorflow::Tensor tensor) + : BaseKernelFallbackTensor(std::move(tensor)) {} + KernelFallbackTensor(const tfrt::TensorShape& shape, tfrt::DType dtype, + ::tensorflow::Tensor tensor) + : BaseKernelFallbackTensor(shape, dtype, std::move(tensor)) {} + + static KernelFallbackTensor Create(const tensorflow::Tensor& tensor) { + return KernelFallbackTensor(tensor); + } + + // Tensor type name for KernelFallbackTensor. 
+ static const char* name() { return "KernelFallback"; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_utils.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_utils.h new file mode 100644 index 00000000..b3879716 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/kernel_fallback_utils.h @@ -0,0 +1,55 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_UTILS_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_UTILS_H_ + +#include + +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/core/tfrt/utils/fallback_tensor.h" +#include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/sync_kernel_utils.h" // from @tf_runtime +#include "tfrt/host_context/value.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/support/variant.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +std::function)>* GetDefaultRunner(); + +using TfInputs = + tfrt::Variant, + tfrt::RepeatedSyncArguments&>; + +// Sets up the OpKernelcontext::Params in `run_state` with the objects and data +// in `runner`, `fallback_request_state` and `device`. +void SetUpParams(const tensorflow::tfrt_stub::OpKernelRunner& runner, + const KernelFallbackCompatRequestState& fallback_request_state, + tensorflow::Device* device, + tensorflow::tfrt_stub::OpKernelRunState& run_state); + +// Return the device to be used for the fallback kernel execution. The device is +// guaranteed to be alive during the graph execution. +tensorflow::Device* GetDeviceFromFallbackState( + const KernelFallbackCompatRequestState& fallback_request_state, + const tfrt_stub::OpKernelRunner& kernel_runner); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_KERNEL_FALLBACK_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tensor_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tensor_util.h new file mode 100644 index 00000000..6126f104 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tensor_util.h @@ -0,0 +1,132 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TENSOR_UTIL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TENSOR_UTIL_H_ + +#include + +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/framework/device.h" +#include "tfrt/host_context/async_dispatch.h" // from @tf_runtime +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime + +namespace tfrt { +class Device; +} // namespace tfrt + +namespace tensorflow { +class KernelFallbackTensor; +namespace tfd { + +// Transfers tensor `src` from `src_device` to `dst_device`. +// Returns the transferred tensor on `dst_device` wrapped as +// `TensorWrapperType`. +template +tfrt::AsyncValueRef TransferTensorToDevice( + const tfrt::ExecutionContext& exec_ctx, const Tensor& src, + Device* src_device, Device* dst_device) { + const bool is_same_device = + (src_device == dst_device) || (src_device->name() == dst_device->name()); + + // Note: source and destination CPU devices are expected to be on the same + // host. Currently TFRT doesn't support checking if a CPU is remote CPU, + // we may consider adding a remote CPU device type in the future. + const bool src_cpu = + src_device->tensorflow_accelerator_device_info() == nullptr; + const bool dst_cpu = + dst_device->tensorflow_accelerator_device_info() == nullptr; + const bool is_between_cpu_devices = dst_cpu && src_cpu; + + if (is_same_device || is_between_cpu_devices) { + return tfrt::MakeAvailableAsyncValueRef(src); + } + + if (!dst_cpu && (src.dtype() != tensorflow::DT_VARIANT && + !tensorflow::DataTypeCanUseMemcpy(src.dtype()))) { + return tfrt::MakeErrorAsyncValueRef(absl::InternalError(tfrt::StrCat( + "Can't copy Tensor with type ", tensorflow::DataTypeString(src.dtype()), + " to device ", dst_device->name(), "."))); + } + tensorflow::AllocatorAttributes attr; + if (src.dtype() == tensorflow::DT_VARIANT) { + attr.set_on_host(true); + } + tensorflow::Tensor dst(dst_device->GetAllocator(attr), src.dtype(), + src.shape()); + if (src.shape().num_elements() == 0) { + return tfrt::MakeAvailableAsyncValueRef(dst); + } + + auto result = tfrt::MakeUnconstructedAsyncValueRef(); + bool enqueued = tfrt::EnqueueBlockingWork( + exec_ctx.host(), [result = result.CopyRef(), src_cpu, dst_cpu, src_device, + dst_device, src, dst = std::move(dst)]() mutable { + tensorflow::DeviceContext* src_device_context = nullptr; + if (!src_cpu) { + src_device_context = + src_device->tensorflow_accelerator_device_info()->default_context; + } + tensorflow::DeviceContext* dst_device_context = nullptr; + if (!dst_cpu) { + dst_device_context = + dst_device->tensorflow_accelerator_device_info()->default_context; + } + // TODO(tfrt-devs): The Sync() call below may be more aggressive than + // necessary. It is based on knowledge of implementation details - that + // GPU devices are implemented using 3 streams - one for host->device + // copies, one for device->host copies and one for sending operations to + // the GPU. 
With that setup, Sync()ing across all 3 streams should be + // sufficient but more than necessary (since it waits for operations + // that might have nothing to do with this tensor to complete). + absl::Status s = src_device->Sync(); + if (!s.ok()) { + result.SetError(absl::InternalError(s.message())); + return; + } + tensorflow::Notification n; + absl::Status status; + tensorflow::CopyTensor::ViaDMA( + "copy", src_device_context, dst_device_context, src_device, + dst_device, tensorflow::AllocatorAttributes(), + tensorflow::AllocatorAttributes(), &src, &dst, + 0 /*dev_to_dev_stream_index*/, + [&status, &n](const absl::Status& s) { + status = s; + n.Notify(); + }); + n.WaitForNotification(); + if (status.ok()) { + result.emplace(std::move(dst)); + } + }); + + if (!enqueued) { + return tfrt::MakeErrorAsyncValueRef(absl::InternalError( + "Failed to enqueue blocking task to transfer tensor.")); + } + return result; +} + +tfrt::AsyncValueRef TransferTensorToDevice( + const tfrt::ExecutionContext& exec_ctx, const KernelFallbackTensor& tensor, + const tfrt::Device& src_device, const tfrt::Device& dst_device); + +llvm::Expected GetTfDevice(const tfrt::ExecutionContext& exec_ctx, + const tfrt::Device& device); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TENSOR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h new file mode 100644 index 00000000..e370fde5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h @@ -0,0 +1,317 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Compatibility layer for calling directly into a TensorFlow kernel via TFRT, +// bypassing the existing TensorFlow runtime. This file defines: +// +// TFRTOpKernel +// TFRTOpKernelConstruction +// TFRTOpKernelContext +// +// Note that these are standalone objects that do not share a base class with +// TF's corresponding OpKernel, OpKernelConstruction, and OpKernelContext types. +// There is no common base class to avoid virtual call overhead. Kernels that +// support these fallback types must be templated: see +// core/kernels/aggregate_ops.h for an example. 
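To make the templating requirement above concrete, here is a minimal, hypothetical sketch (not part of the vendored header): a kernel written once against interchangeable kernel/construction/context template parameters, so the same body can be instantiated with TF's regular OpKernel types or with the TFRT* fallback types declared in this header.

// Illustrative sketch only; ExampleIdentityOp is a hypothetical name and the
// code is assumed to live inside namespace tensorflow (or be fully qualified).
template <typename OpKernelT, typename OpKernelConstructionT,
          typename OpKernelContextT>
class ExampleIdentityOp : public OpKernelT {
 public:
  explicit ExampleIdentityOp(OpKernelConstructionT* ctx) : OpKernelT(ctx) {}

  void Compute(OpKernelContextT* ctx) override {
    // Both context flavors expose input(int) and set_output(int, Tensor), so
    // forwarding input 0 to output 0 works unchanged for either runtime.
    ctx->set_output(0, ctx->input(0));
  }
};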
+ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TFRT_OP_KERNEL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TFRT_OP_KERNEL_H_ + +#include +#include +#include +#include + +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ManagedStatic.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/runtime_fallback/kernel/attr_util.h" +#include "tensorflow/core/runtime_fallback/util/attr_util.h" +#include "tfrt/common/compat/eigen/thread_pool_device.h" // from @tf_runtime +#include "tfrt/core_runtime/op_attrs.h" // from @tf_runtime + +namespace tfrt { +class AsyncKernelFrame; +} // namespace tfrt + +namespace tensorflow { + +class TFRTOpKernel; +class TFRTOpMeta; +class Tensor; +class TensorShape; + +////////////////////////////////////////////////////////////////////// +// OpKernel interface. +////////////////////////////////////////////////////////////////////// +class TFRTOpKernelConstruction { + public: + explicit TFRTOpKernelConstruction(const tfrt::OpAttrsRef& attributes); + + template + absl::Status GetAttr(absl::string_view attr_name, T* value) const; + + void CtxFailure(const absl::Status& s); + void CtxFailureWithWarning(const absl::Status& s); + void CtxFailure(const char* file, int line, const absl::Status& s); + void CtxFailureWithWarning(const char* file, int line, const absl::Status& s); + + absl::Status MatchSignature(const DataTypeSlice expected_inputs, + const DataTypeSlice expected_outputs) { + // TODO(annarev): Move MatchSignatureHelper out of op_kernel.h + // and call it here. + return absl::OkStatus(); + } + + const std::optional& error(); + + private: + const tfrt::OpAttrsRef& attributes_; + // If an error occurs, the error message is stored here. + std::optional error_; +}; + +template <> +absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, + std::string* value) const; + +template <> +absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, + DataType* value) const; + +template <> +absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, + Padding* value) const; + +template <> +absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, + std::vector* value) const; + +absl::Status MissingAttributeError(absl::string_view attr_name); + +template +absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, + T* value) const { + bool success = attributes_.Get( + llvm::StringRef(attr_name.data(), attr_name.size()), value); + if (!success) { + return MissingAttributeError(attr_name); + } + return absl::OkStatus(); +} + +// An implementation of OpKernelContext that fetches inputs from a +// tfrt::AsyncKernelFrame. Outputs and errors are stored internally. +class TFRTOpKernelContext { + public: + explicit TFRTOpKernelContext( + llvm::ArrayRef> inputs, + int num_outputs, const TFRTOpMeta* op_meta, tfrt::HostContext* host); + const Tensor& output(int index); + const std::optional& error(); + + // OpKernelContext interface implementation. 
+ bool ValidateInputsAreSameShape(TFRTOpKernel* op); + const Tensor& input(int index); + int num_inputs() const; + void set_output(int index, const Tensor& tensor); + int num_outputs() const; + bool forward_input_to_output_with_shape(int input_index, int output_index, + const TensorShape& output_shape, + Tensor** output) { + return false; + } + absl::Status allocate_temp(DataType type, const TensorShape& shape, + Tensor* out_temp); + absl::Status allocate_output(int index, const TensorShape& shape, + Tensor** tensor); + DataType expected_output_dtype(int i) const; + + template + const EigenDeviceType& eigen_device() const; + + void CtxFailure(const absl::Status& s); + void CtxFailureWithWarning(const absl::Status& s); + void CtxFailure(const char* file, int line, const absl::Status& s); + void CtxFailureWithWarning(const char* file, int line, const absl::Status& s); + + private: + llvm::ArrayRef> inputs_; + const TFRTOpMeta* op_meta_; + + // The kernel's outputs are kept here. We can't directly store outputs in the + // AsyncKernelFrame because we must temporarily store allocate_output's Tensor + // somewhere until the Tensor is initialized. If we stored the allocated + // Tensor directly in the AsyncKernelFrame, the frame's output becomes + // available and downstream kernels may use the allocated (but uninitialized) + // Tensor. + std::vector outputs_; + + // If an error occurs, the error message is stored here. + std::optional error_; + + tfrt::compat::EigenHostContext eigen_host_context_; +}; + +class TFRTOpKernel { + public: + explicit TFRTOpKernel(TFRTOpKernelConstruction* context) {} + virtual ~TFRTOpKernel() = default; + virtual void Compute(TFRTOpKernelContext* context) = 0; +}; + +inline void CheckNotInComputeAsync(TFRTOpKernelConstruction*, const char*) {} +inline void CheckNotInComputeAsync(TFRTOpKernelContext*, const char*) {} + +////////////////////////////////////////////////////////////////////// +// Forwarding op metadata. +////////////////////////////////////////////////////////////////////// + +// Op metadata. For now TFRTOpMeta only stores the op's output types. +class TFRTOpMeta { + public: + explicit TFRTOpMeta(std::vector output_types); + DataType output_type(int index) const; + + private: + std::vector output_types_; +}; + +// Construct a TFRTOpMeta from .Input(), .Output(), and .Attr() +// specifications. This supports the same syntax as TF's REGISTER_OP macro, but +// this implementation only supports a subset of the full language. +// +// Currently, this only supports single-tensor outputs with fixed type. +// TODO(lauj) Support attribute outputs and compound attribute types as used by +// AddN. +class TFRTOpMetaBuilder { + public: + explicit TFRTOpMetaBuilder(absl::string_view op_name); + TFRTOpMetaBuilder& Output(absl::string_view output_spec); + TFRTOpMetaBuilder& Input(absl::string_view input_spec); + TFRTOpMetaBuilder& Attr(absl::string_view attr_spec); + + const string& op_name() const; + TFRTOpMeta BuildMeta() const; + + private: + string op_name_; + std::vector output_types_; +}; + +// Map from op name to TFRTOpMeta. +class TFRTOpMetaMap { + public: + TFRTOpMetaMap(); + void RegisterOpMeta(const TFRTOpMetaBuilder& op_builder); + + // Returns nullptr if there is no metadata for op_name. + const TFRTOpMeta* GetOpMeta(absl::string_view op_name) const; + + private: + llvm::StringMap op_metas_; +}; + +extern llvm::ManagedStatic tfrt_forwarding_op_meta_map; + +// Implementation detail for REGISTER_KERNEL_FALLBACK_OP. 
This helps with +// evaluating the .Input()/.Output()/.Attr() clauses in the REGISTER_OP syntax +// before calling BuildMeta(). +class TFRTOpRegisterer { + public: + TFRTOpRegisterer( // NOLINT(google-explicit-constructor) + const TFRTOpMetaBuilder& op_builder); +}; + +#define REGISTER_KERNEL_FALLBACK_OP(name) \ + REGISTER_KERNEL_FALLBACK_OP_UNIQ_HELPER(__COUNTER__, name) + +#define REGISTER_KERNEL_FALLBACK_OP_UNIQ_HELPER(ctr, name) \ + REGISTER_KERNEL_FALLBACK_OP_UNIQ(ctr, name) + +#define REGISTER_KERNEL_FALLBACK_OP_UNIQ(ctr, name) \ + static TFRTOpRegisterer global_tfrt_forwarding_op_meta_builder_##ctr##_ = \ + TFRTOpMetaBuilder(name) + +////////////////////////////////////////////////////////////////////// +// Forwarding kernel registration. +////////////////////////////////////////////////////////////////////// + +// Represents Kernel Fallback kernel registration information. +struct TFRTOpKernelReg { + using CallbackT = + std::unique_ptr (*)(TFRTOpKernelConstruction*); + + explicit TFRTOpKernelReg(CallbackT callback) : callback(callback) {} + + // Callback that creates a kernel. + CallbackT callback; + // Map from attribute names to type it must match. + // For e.g. foo: DT_FLOAT indicates that foo attribute + // must be a tfdtype attribute with type float. + llvm::StringMap type_constraints; +}; + +class TFRTOpKernelFactories { + public: + TFRTOpKernelFactories(); + void RegisterFactory(absl::string_view kernel_class_name, + TFRTOpKernelReg kernel_info); + + // Creates a kernel with the given name and passes op_kernel_construction + // to kernel constructor. + // Returns the constructed kernel on success. + // In case of failure, returns a nullptr. Kernel creation can fail in one + // of the following cases: + // 1. Kernel with the given name is not found. + // 2. Attributes in op_kernel_construction don't match type constraints + // for any of the kernels with this name. + // Note that we consider a constraint to be "not matched" if attribute + // it applies to is not in op_kernel_construction. + std::unique_ptr CreateKernel( + absl::string_view kernel_class_name, + TFRTOpKernelConstruction* op_kernel_construction) const; + + private: + llvm::StringMap> factories_; +}; + +// TODO(lauj) Should we move these kernel registrations to tfrt::KernelRegistry? +extern llvm::ManagedStatic + tfrt_forwarding_kernel_factories; + +#define REGISTER_KERNEL_FALLBACK_KERNEL(name, ...) \ + REGISTER_KERNEL_FALLBACK_KERNEL_UNIQ_HELPER(__COUNTER__, name, __VA_ARGS__) + +#define REGISTER_KERNEL_FALLBACK_KERNEL_UNIQ_HELPER(ctr, name, ...) \ + REGISTER_KERNEL_FALLBACK_KERNEL_UNIQ(ctr, name, __VA_ARGS__) + +#define REGISTER_KERNEL_FALLBACK_KERNEL_UNIQ(ctr, name, ...) \ + static bool global_tfrt_forwarding_kernel_##ctr##_registered_ = []() { \ + ::tensorflow::tfrt_forwarding_kernel_factories->RegisterFactory( \ + name, TFRTOpKernelReg([](TFRTOpKernelConstruction* construction) \ + -> std::unique_ptr { \ + return std::make_unique<__VA_ARGS__>(construction); \ + })); \ + return true; \ + }(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_KERNEL_TFRT_OP_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/conversion_function.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/conversion_function.h new file mode 100644 index 00000000..d8537e6c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/conversion_function.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
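Usage illustration only (not part of the vendored headers): how the registration macros defined above are typically exercised, reusing the hypothetical ExampleIdentityOp sketched earlier.

// Declare the op's metadata (REGISTER_OP-like syntax), then bind a kernel
// factory to it. "ExampleIdentity" and ExampleIdentityOp are hypothetical.
REGISTER_KERNEL_FALLBACK_OP("ExampleIdentity").Output("out: float");

REGISTER_KERNEL_FALLBACK_KERNEL(
    "ExampleIdentity",
    ExampleIdentityOp<TFRTOpKernel, TFRTOpKernelConstruction,
                      TFRTOpKernelContext>);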
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements TFRuntimeFallback tensor conversion function for +// converting to host tensor. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_CONVERSION_FUNCTION_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_CONVERSION_FUNCTION_H_ + +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tfrt { + +class TensorConversionFnRegistry; +class CpuDevice; +class ExecutionContext; +class DenseHostTensor; +} + +namespace tensorflow { +namespace tfd { +class RuntimeFallbackTensor; + +tfrt::Expected +ConvertRuntimeFallbackTensorToDenseHostTensor( + const RuntimeFallbackTensor &tensor, const tfrt::CpuDevice &src, + const tfrt::CpuDevice &dst, const tfrt::ExecutionContext &exec_ctx); + +// Register conversion functions for TFRuntimeFallbackTensors. +void RegisterTFRuntimeFallbackTensorToHostConversionFn( + tfrt::TensorConversionFnRegistry* registry); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_CONVERSION_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h new file mode 100644 index 00000000..ef45282a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h @@ -0,0 +1,285 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_FALLBACK_BATCH_KERNEL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_FALLBACK_BATCH_KERNEL_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/batch_kernels.h" +#include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_resource_base.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" +#include "tensorflow/core/kernels/batching_util/batch_stats.h" +#include "tensorflow/core/kernels/batching_util/warmup.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/random.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/status.h" +#include "tsl/platform/statusor.h" +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +class BatchFunctionFallbackKernelBase : public AsyncOpKernel { + public: + explicit BatchFunctionFallbackKernelBase(OpKernelConstruction* c); + + protected: + // Validates 'allowed_batch_sizes_'. The entries must increase monotonically, + // and the last one must equal 'max_batch_size_'. + absl::Status ValidateAllowedBatchSizes() const; + + // Initialize vars by reading from op-kernel-construction. + // Vars + // - enable_adaptive_batch_threads_ + // true if value of attribute `kEnableAdaptiveSchedulerAttr` is true, or + // if `num_batch_threads` is not positive. + // - adaptive_batch_scheduler_options_ + // Read from corresponding attributes as long as they are set. + void SetAdaptiveBatchSchedulerOptions(OpKernelConstruction* c, + int32_t num_batch_threads); + + static int32 NumBatchThreadsFromEnvironmentWithDefault( + int default_num_batch_threads); + static thread::ThreadPool* GetOrCreateBatchThreadsPool(); + static constexpr int64_t kBatchThreadPoolSize = 128; + + std::string container_; + std::string shared_name_; + std::string batcher_queue_; + int32_t num_batch_threads_; + int32_t max_batch_size_; + int32_t batch_timeout_micros_; + int32_t max_enqueued_batches_; + std::vector allowed_batch_sizes_; + int32 low_priority_max_batch_size_; + int32 low_priority_batch_timeout_micros_; + int32 low_priority_max_enqueued_batches_; + std::vector low_priority_allowed_batch_sizes_; + std::string mixed_priority_policy_; + bool enable_large_batch_splitting_; + bool has_attribute_enable_large_batch_splitting_; + bool disable_padding_; + std::string batch_padding_policy_; + + // Parameters for adaptive batch scheduler only. + // Note 'num_batch_threads_' above is shared by two implementations of batch + // scheduler. + // Per-model inflight batches parameters. 
+ static constexpr int64_t kMinInflightBatches = 16; + static constexpr int64_t kInitialInflightBatches = 16; + static constexpr int64_t kBatchesToAverageOver = 10; + static constexpr int64_t kMaxInflightBatches = 64; + bool enable_adaptive_batch_threads_ = false; + struct AdaptiveBatchSchedulerOptions { + int32 min_in_flight_batches_limit = kMinInflightBatches; + int32 initial_in_flight_batches_limit = kInitialInflightBatches; + int32 max_in_flight_batches_limit = kMaxInflightBatches; + int32 batches_to_average_over = kBatchesToAverageOver; + }; + std::optional + adaptive_batch_scheduler_options_ = std::nullopt; +}; + +// Legacy TF kernel which is a variant of tf.BatchFunction. +template +class BatchFunctionFallbackKernel : public BatchFunctionFallbackKernelBase { + public: + using BatchFunctionType = typename BatchResourceType::BatchFunctionType; + + explicit BatchFunctionFallbackKernel(OpKernelConstruction* c) + : BatchFunctionFallbackKernelBase(c) { + int64_t handle; + OP_REQUIRES_OK(c, c->GetAttr("opaque_function_handle", &handle)); + batch_function_ = BatchResourceType::CastHandleToFunction(handle); + } + + void ComputeAsync(OpKernelContext* c, DoneCallback done) final; + + private: + BatchFunctionType batch_function_; +}; + +template +void BatchFunctionFallbackKernel::ComputeAsync( + OpKernelContext* c, DoneCallback done) { + RecordBatchSplitUsage(has_attribute_enable_large_batch_splitting_ + ? std::make_optional(enable_large_batch_splitting_) + : std::nullopt, + GetModelName(c)); + RecordBatchParamNumBatchThreads(num_batch_threads_, GetModelName(c)); + OP_REQUIRES_VALUE(tfrt::ResourceContext * client_graph_resource_context, c, + BatchResourceType::GetClientGraphResourceContext(c)); + OP_REQUIRES_ASYNC( + c, client_graph_resource_context != nullptr, + errors::FailedPrecondition("client graph resource context not found"), + done); + std::function< + absl::StatusOr>()> + creator; + if (adaptive_batch_scheduler_options_ != std::nullopt) { + creator = [this, c]() + -> absl::StatusOr> { + serving::AdaptiveSharedBatchScheduler< + serving::BatchResourceBase::BatchTask>::Options + adaptive_shared_batch_scheduler_options; + adaptive_shared_batch_scheduler_options.thread_pool_name = + "adaptive_batch_threads"; + adaptive_shared_batch_scheduler_options.num_batch_threads = + adaptive_batch_scheduler_options_->max_in_flight_batches_limit; + adaptive_shared_batch_scheduler_options.thread_pool = + GetOrCreateBatchThreadsPool(); + + // When we explicitly specify 'thread_pool', you'd think ASBS would ignore + // 'num_batch_threads', but in fact ASBS still uses num_batch_threads as + // the max number of in-flight batches. It makes no sense to have more + // in-flight batches than threads (it would result in strictly bad + // batching decisions), so we cap this parameter (which otherwise comes + // from the saved model) to the actual number of batch threads (which + // comes from a process-wide environment variable). + // + // We have to apply the same capping to min_ and initial_ + // in_flight_batches_limit below to produce valid configurations. + adaptive_shared_batch_scheduler_options.num_batch_threads = std::min( + NumBatchThreadsFromEnvironmentWithDefault(kBatchThreadPoolSize), + adaptive_batch_scheduler_options_->max_in_flight_batches_limit); + + // adaptive_shared_batch_scheduler_options.full_batch_scheduling_boost_micros + // is 0 (default value) intentionally, so tasks are scheduled in a FIFO + // way. 
+ // Two rationales to use default value (zero) for + // `full_batch_scheduling_boost_micros` + // 1) In this way, tasks scheduling policy is FIFO. Compared with round + // robin (what shared batch scheduler does), FIFO ensures that model + // with low QPS (i.e., models enqueue fewer tasks in the shared queue) + // will be processed timely. + // 2) If set, `full_batch_scheduling_boost_micros` should be of order + // the batch processing latency (which varies on a model basis). + // If a non-zero value is not set properly, it harms tail latency. + adaptive_shared_batch_scheduler_options.min_in_flight_batches_limit = + std::min( + NumBatchThreadsFromEnvironmentWithDefault(kBatchThreadPoolSize), + adaptive_batch_scheduler_options_->min_in_flight_batches_limit); + adaptive_shared_batch_scheduler_options + .initial_in_flight_batches_limit = std::min( + NumBatchThreadsFromEnvironmentWithDefault(kBatchThreadPoolSize), + adaptive_batch_scheduler_options_->initial_in_flight_batches_limit); + adaptive_shared_batch_scheduler_options.batches_to_average_over = + adaptive_batch_scheduler_options_->batches_to_average_over; + adaptive_shared_batch_scheduler_options.fifo_scheduling = true; + + std::unique_ptr new_resource; + auto status = BatchResourceType::Create( + c, adaptive_shared_batch_scheduler_options, max_batch_size_, + batch_timeout_micros_, max_enqueued_batches_, allowed_batch_sizes_, + batch_function_, disable_padding_, &new_resource); + if (!status.ok()) return status; + if (c->session_metadata() != nullptr) { + new_resource->set_session_metadata(*c->session_metadata()); + } + return tensorflow::core::RefCountPtr( + new_resource.release()); + }; + } else { + creator = [this, c]() + -> absl::StatusOr> { + serving::BatchResourceOptions batch_resource_options; + TF_ASSIGN_OR_RETURN( + batch_resource_options.mixed_priority_batching_policy, + serving::GetMixedPriorityBatchingPolicy(mixed_priority_policy_)); + batch_resource_options.num_batch_threads = num_batch_threads_; + batch_resource_options.max_batch_size = max_batch_size_; + batch_resource_options.batch_timeout_micros = batch_timeout_micros_; + batch_resource_options.max_enqueued_batches = max_enqueued_batches_; + batch_resource_options.allowed_batch_sizes = allowed_batch_sizes_; + batch_resource_options.batch_padding_policy = batch_padding_policy_; + batch_resource_options.low_priority_max_batch_size = + low_priority_max_batch_size_; + batch_resource_options.low_priority_batch_timeout_micros = + low_priority_batch_timeout_micros_; + batch_resource_options.low_priority_max_enqueued_batches = + low_priority_max_enqueued_batches_; + batch_resource_options.low_priority_allowed_batch_sizes = + low_priority_allowed_batch_sizes_; + + serving::ModelBatchStats& model_batch_stats = + serving::GlobalBatchStatsRegistry().model( + /* model_name= */ std::string(GetModelName(c)), + /* op_name= */ c->op_kernel().name()); + model_batch_stats.SetBatchTimeoutMicros(batch_timeout_micros_); + model_batch_stats.SetNumBatchThreads(num_batch_threads_); + + std::unique_ptr new_resource; + auto status = BatchResourceType::Create( + c, batch_resource_options, batch_function_, + enable_large_batch_splitting_, disable_padding_, &new_resource); + if (!status.ok()) return status; + if (c->session_metadata() != nullptr) { + new_resource->set_session_metadata(*c->session_metadata()); + } + return tensorflow::core::RefCountPtr( + new_resource.release()); + }; + } + + auto br = client_graph_resource_context->GetOrCreateResource< + tensorflow::core::RefCountPtr>(shared_name_, 
creator); + if (!br.ok()) OP_REQUIRES_OK_ASYNC(c, br.status(), done); + auto expected_name = BatchResourceType::GetBatchFunctionName(batch_function_); + auto received_name = + BatchResourceType::GetBatchFunctionName((*br)->get()->batch_function()); + + // TODO(b/187173237): When we can guarantee only 1 copy of BEF function is + // generated for the batched function, we can assert the pointers are equal + OP_REQUIRES_ASYNC( + c, expected_name == received_name, + errors::InvalidArgument(absl::StrCat( + "Provided BEF function doesn't match with BatchResource. Expected:", + expected_name, " Received:", received_name)), + done); + const uint64_t guid = random::New64(); + auto create_batch_task_fn = [c]() { + return BatchResourceType::CreateBatchTask(c); + }; + absl::Status status; + if (serving::ShouldWarmupAllBatchSizes(c)) { + status = (*br)->get()->RegisterWarmupInputs(guid, c, batcher_queue_, + create_batch_task_fn, done); + } else { + status = (*br)->get()->RegisterInput(guid, c, batcher_queue_, + create_batch_task_fn, done); + } + OP_REQUIRES_OK_ASYNC(c, status, done); + // Assume br calls done, so nothing to do here. +} + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_FALLBACK_BATCH_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/kernel_utils.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/kernel_utils.h new file mode 100644 index 00000000..e4978b80 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/kernel_utils.h @@ -0,0 +1,161 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares kernel utils. 
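A small self-contained illustration (not part of the vendored header) of the device-name helper declared below; the full device name is only an example value.

#include <cstdio>

// Sketch: ConvertTfDeviceNameToTfrtDefault keeps only the trailing
// "<TYPE>:<ordinal>" suffix (the last five characters) of a full TF device
// name, so the same device can be looked up under either naming scheme.
void PrintSimplifiedDeviceName() {
  const char* full = "/job:localhost/replica:0/task:0/device:GPU:0";
  std::printf("%s\n", tensorflow::tfd::ConvertTfDeviceNameToTfrtDefault(full));
  // Prints "GPU:0".
}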
+ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_KERNEL_UTILS_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_KERNEL_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ErrorHandling.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tfrt/core_runtime/core_runtime_op.h" // from @tf_runtime +#include "tfrt/dtype/dtype.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime +#include "tfrt/support/error_util.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/tensor_shape.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +template +struct AutoReleaser { + void operator()(T* p) const { p->Release(); } +}; +template +using AutoReleasePtr = std::unique_ptr>; + +using OwnedEagerContext = AutoReleasePtr; +using OwnedEagerOperation = AutoReleasePtr; +using OwnedTensorHandle = AutoReleasePtr; +using OwnedAbstractTensorInterface = AutoReleasePtr; + +// Check if a TensorHandle physically resides on GPU. +inline bool IsGpuTensorHandle(const tensorflow::TensorHandle& handle) { + absl::Status dummy_status; + // BackingDeviceName is where the tensor is physically located, not where the + // op that produces the tensor is. + // Note that dummy_status is never set in TensorHandle::BackingDeviceName. + absl::string_view device_name = handle.BackingDeviceName(&dummy_status); + return absl::StrContains(device_name, "GPU"); +} + +// TODO(zhangqiaorjc): Allowlist more dtypes as tfrt GPU supports more. +// RuntimeFallbackTensor of supported dtypes below will be eagerly converted to +// tfrt::DenseGpuTensor after each RuntimeFallbackOpHandler::Execute. +inline bool IsSupportedByTFRTGpu(DataType dtype) { + switch (dtype) { + default: + return false; + case DataType::DT_FLOAT: + case DataType::DT_DOUBLE: + case DataType::DT_INT32: + return true; + } +} + +// TODO(b/165872892): Remove this method. +// This method is needed because we use different device name in TF-TFRT +// integration and mlir test. In TF-TFRT integration, we reuse the device full +// name (e.g. /job:localhost/replica:0/task:0/device:GPU:0) from TF. But in mlir +// test, we use simplified device name "GPU:0". And lot of things in fallback +// need to be used in both cases. As a result, we need to look up the device +// with both device names. +inline const char* ConvertTfDeviceNameToTfrtDefault(const char* device_name) { + assert(strlen(device_name) >= 5); + return &device_name[strlen(device_name) - 5]; +} + +// Create and initialize EagerContext. +tfrt::Expected InitEagerContext(); + +tfrt::Expected InitEagerContext( + DynamicDeviceMgr* device_mgr, const SessionOptions& session_opts, + ContextDevicePlacementPolicy default_device_placement_policy, + bool is_async); + +// Obtain EagerContext from ExecutionContext. +tfrt::Expected GetEagerContext(tfrt::ExecutionContext exec_ctx); + +// Return the CoreRuntimeOp for `op_name` using fallback op_handler. 
+llvm::Expected GetFallbackOp(tfrt::string_view op_name, + tfrt::HostContext* host); + +constexpr char kEagerContextResourceName[] = "EagerContextResourceName"; + +class EagerContextResource { + public: + explicit EagerContextResource() + : device_mgr_(std::make_unique()), + ctx_{InitEagerContext( + device_mgr_.get(), tensorflow::SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + /*is_async=*/false)} {} + explicit EagerContextResource( + const SessionOptions& session_opts, + ContextDevicePlacementPolicy default_device_placement_policy, + bool is_async) + : device_mgr_(std::make_unique()), + ctx_{InitEagerContext(device_mgr_.get(), session_opts, + default_device_placement_policy, is_async)} {} + + tfrt::Expected GetTFEagerContext() { + if (!ctx_) return ctx_.takeError(); + return ctx_.get().get(); + } + + DynamicDeviceMgr* GetDeviceMgr() { return device_mgr_.get(); } + + llvm::Error AddDevices(std::vector> devices) { + if (!ctx_) return ctx_.takeError(); + absl::Status s = dynamic_cast( + ctx_.get()->local_device_mgr()) + ->AddDevices(std::move(devices)); + if (!s.ok()) return tfrt::MakeStringError(s.message()); + ctx_.get()->InitPrioritizedDeviceTypeList(); + ctx_.get()->pflr()->InitializeDeviceAndFlr(); + return llvm::Error::success(); + } + + private: + // EagerContext uses this device_mgr_ as local_device_mgr. We manage the + // device_mgr_ here to allow TFRT to add new devices after EagerContext + // initialization. + // Today, TFRT only adds TPU devices after EagerContext initialization. + std::unique_ptr device_mgr_; + + tfrt::Expected ctx_; +}; + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_KERNEL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/op_logger.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/op_logger.h new file mode 100644 index 00000000..c920715d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/op_logger.h @@ -0,0 +1,64 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines a logger for op names.
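Illustrative sketch only (not part of the vendored header): recording an op name through the OpLogger shared context declared below, assuming TFRT's HostContext::GetOrCreateSharedContext<T>() accessor is available.

// `host` is assumed to be a live tfrt::HostContext*; the logger is created on
// first use and shared for the lifetime of the host context.
void LogFallbackOp(tfrt::HostContext* host, tfrt::string_view op_name) {
  auto& logger =
      host->GetOrCreateSharedContext<tensorflow::tfd::OpLogger>();
  logger.LogOp(op_name);
}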
+ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_OP_LOGGER_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_OP_LOGGER_H_ + +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "tfrt/host_context/shared_context.h" // from @tf_runtime +#include "tfrt/support/concurrent_vector.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tfrt { +class HostContext; +} + +namespace tensorflow { +namespace tfd { + +class OpLogger : public tfrt::SharedContext { + public: + explicit OpLogger(tfrt::HostContext* host) + : op_names_(std::make_unique>(8)) {} + + void LogOp(tfrt::string_view op_name) { + op_names_->emplace_back(op_name.str()); + } + + tfrt::ArrayRef GetLoggedOps() const { + absl::Span span = op_names_->ToConstSpan(); + return tfrt::ArrayRef(span.data(), span.size()); + } + + // Cannot be called concurrently with any API in this class. + void Clear() { + op_names_ = std::make_unique>(8); + } + + private: + std::unique_ptr> op_names_; +}; + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_OP_LOGGER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_kernels.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_kernels.h new file mode 100644 index 00000000..833b92f7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_kernels.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares kernels for running TFRT ops/kernels via TF runtime +// fallback. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_KERNELS_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_KERNELS_H_ + +#include + +#include "llvm/Support/Error.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/runtime_fallback/runtime/kernel_utils.h" +#include "tfrt/core_runtime/op_attrs.h" // from @tf_runtime +#include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/chain.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/shared_context.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +// Create an EagerOperation to run the op, taking tensorflow::TensorHandle and +// returning tensorflow::AbstractTensorHandle*. +absl::Status CallEagerExecute( + const tfrt::ExecutionContext& exec_ctx, EagerContext* eager_ctx, + const char* op_name, const char* device_name, + llvm::ArrayRef input_tensor_handles, + const tfrt::OpAttrsRef& attrs, + llvm::MutableArrayRef + result_tensor_handles); + +// Take and return RuntimeFallbackTensors. 
+tfrt::AsyncValueRef RuntimeFallbackExecute( + const tfrt::ExecutionContext& exec_ctx, const char* op_name, + const char* device_name, tfrt::ArrayRef arguments, + const tfrt::OpAttrsRef& attrs, + tfrt::MutableArrayRef> results); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_op_handler.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_op_handler.h new file mode 100644 index 00000000..54d404fe --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_op_handler.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares RuntimeFallbackOpHandler, responsible for running TFRT ops +// on Tensorflow. + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_OP_HANDLER_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_OP_HANDLER_H_ + +#include + +#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/core_runtime/op_handler.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +llvm::Expected CreateRuntimeFallbackOpHandler( + tfrt::CoreRuntime* runtime, tfrt::string_view tf_device_name); +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_OP_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_tensor.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_tensor.h new file mode 100644 index 00000000..53c6ab69 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/runtime/runtime_fallback_tensor.h @@ -0,0 +1,80 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares TF runtime fallback tensor. 
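For orientation, a hedged sketch (not part of the vendored headers) of wiring the CreateRuntimeFallbackOpHandler factory declared just above into a CoreRuntime. It assumes the Expected payload is a tfrt::OpHandler* and that CoreRuntime::RegisterOpHandler is available, as in upstream TensorFlow/TFRT; the handler name "tf_fallback" is an arbitrary example.

// Sketch: create the fallback handler for the CPU device and register it under
// a name so TFRT ops can be dispatched through the TF runtime fallback.
llvm::Error RegisterTfFallbackHandler(tfrt::CoreRuntime* corert) {
  auto handler = tensorflow::tfd::CreateRuntimeFallbackOpHandler(
      corert, "/job:localhost/replica:0/task:0/device:CPU:0");
  if (!handler) return handler.takeError();
  corert->RegisterOpHandler("tf_fallback", handler.get());
  return llvm::Error::success();
}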
+ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_TENSOR_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_TENSOR_H_ + +#include "llvm/ADT/STLExtras.h" +#include "tensorflow/core/runtime_fallback/runtime/kernel_utils.h" +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/dense_host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/string_host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +class RuntimeFallbackTensor final + : public tfrt::Tensor, + public tfrt::TensorTraits { + public: + explicit RuntimeFallbackTensor(const tfrt::TensorShape& shape, + tfrt::DType dtype, OwnedTensorHandle th); + + void Print(tfrt::raw_ostream& os) const override; + + // Note that this method does not add ref to the return tensor_handle. + TensorHandle* GetTensorHandle() const { return tensor_handle_.get(); } + + // Tensor type name for RuntimeFallbackTensor. + static const char* name() { return "RuntimeFallback"; } + + private: + template + static void PrintTensorValues(void* data, ssize_t size, + llvm::raw_ostream& os) { + llvm::ArrayRef elements = llvm::ArrayRef(static_cast(data), size); + llvm::interleaveComma(elements, os); + } + + OwnedTensorHandle tensor_handle_; +}; + +llvm::SmallVector GetShape( + AbstractTensorInterface* tensor_interface); + +tfrt::Expected CopyTfStringTensorToStringHostTensor( + AbstractTensorInterface* tensor_interface, tfrt::HostContext* host); + +tfrt::Expected +CreateRuntimeFallbackTensorFromTfTensorHandle(OwnedTensorHandle owned_th, + tfrt::HostContext* host); + +RuntimeFallbackTensor MoveDHTToRuntimeFallbackTensor( + tfrt::DenseHostTensor&& dht, tfrt::HostContext* host); + +RuntimeFallbackTensor CopyRefDHTToRuntimeFallbackTensor( + const tfrt::DenseHostTensor& dht, tfrt::HostContext* host); + +RuntimeFallbackTensor CopySHTToRuntimeFallbackTensor( + const tfrt::StringHostTensor& sht, tfrt::HostContext* host); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_RUNTIME_RUNTIME_FALLBACK_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/test/coreruntime_driver.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/test/coreruntime_driver.h new file mode 100644 index 00000000..00ea4b0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/test/coreruntime_driver.h @@ -0,0 +1,79 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_TEST_CORERUNTIME_DRIVER_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_TEST_CORERUNTIME_DRIVER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime +#include "tfrt/host_context/chain.h" // from @tf_runtime +#include "tfrt/host_context/location.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tfrt { + +class OpHandle; +class OpHandler; +class OpAttrsRef; +class TensorHandle; + +class CoreRuntimeDriver final : public tfrt::LocationHandler { + public: + explicit CoreRuntimeDriver(); + + void Execute(string_view op_name, + tfrt::MutableArrayRef args, + const tfrt::OpAttrsRef& attrs, + tfrt::MutableArrayRef results, + tfrt::string_view filename, int line); + + ExecutionContext CreateExecutionContext(tfrt::string_view filename, int line); + + void InitializeCpuRuntimeFallbackOpHandler(); + + void InitializeGpuRuntimeFallbackOpHandler(int gpu_ordinal); + + void InitializeCpuKernelFallbackOpHandler(); + + HostContext* GetHost() const; + + CoreRuntimeOp MakeOp(string_view op_name); + + void WaitForHostContextQuiesce(); + + DecodedLocation DecodeLocation(Location loc) const override; + + private: + explicit CoreRuntimeDriver(std::unique_ptr corert); + + std::unique_ptr corert_; + tfrt::OpHandler* op_handler_; + tfrt::AsyncValueRef chain_; + tfrt::ResourceContext resource_context_; + + // `location_map_` is a map from (filename, line) to the opaque location data, + // which is the index in `locations_`. + absl::flat_hash_map, int> location_map_; + std::vector> locations_; +}; + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_TEST_CORERUNTIME_DRIVER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/attr_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/attr_util.h new file mode 100644 index 00000000..2bb7f137 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/attr_util.h @@ -0,0 +1,99 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_ATTR_UTIL_H_
+#define TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_ATTR_UTIL_H_
+
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "llvm/ADT/StringRef.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/status.h"
+#include "tfrt/bef/bef_encoding.h"  // from @tf_runtime
+#include "tfrt/core_runtime/op_attr_type.h"  // from @tf_runtime
+#include "tfrt/core_runtime/op_attrs.h"  // from @tf_runtime
+#include "tfrt/host_context/host_context.h"  // from @tf_runtime
+#include "tfrt/host_context/kernel_utils.h"  // from @tf_runtime
+#include "tfrt/support/forward_decls.h"  // from @tf_runtime
+
+namespace tensorflow {
+namespace tfd {
+
+// Converts a TFRT string_view to the Abseil version.
+inline absl::string_view ToAbslStringView(tfrt::string_view sv) {
+  return absl::string_view(sv.data(), sv.size());
+}
+
+// Parses the string representation of the DataType in `dtype` into `data_type`.
+// Aborts the program for unsupported dtypes.
+absl::Status ParseTfDataType(absl::string_view dtype, DataType* data_type);
+
+// The following 2 functions convert between Tensorflow DataTypes and
+// OpAttrTypes. The mapping between OpAttrType and DataType is defined in
+// attr_type.def. Aborts on unsupported types.
+DataType ConvertToTfDataType(tfrt::OpAttrType op_attr_type);
+tfrt::OpAttrType ConvertFromTfDataType(DataType data_type);
+
+// The following 2 functions convert between BEF attribute types and Tensorflow
+// DataTypes. Aborts on unsupported datatypes.
+DataType ConvertBefAttrTypeToTfDataType(tfrt::DType attr_type);
+tfrt::DType ConvertTfDataTypeToBefAttrType(DataType data_type);
+
+// Parses the tensor valued `attr_value` and constructs the tensor with its
+// contents in `tensor`. Returns OK status on success, INVALID_ARGUMENT on
+// failure.
+absl::Status ParseTensorAttrValue(absl::string_view attr_value,
+                                  tensorflow::Tensor* tensor);
+
+// Parses a string of the form "[1,2,3,...]" in `attr_value` and returns the
+// constituent dimension sizes (shape) in `shape_val`. Returns
+// INVALID_ARGUMENT on invalid input.
+absl::Status ParseTensorShapeAttrValue(absl::string_view attr_value,
+                                       std::vector<int64_t>* shape_val);
+
+// Parses a boolean from `attr_value` into `bool_val` and returns OK status on
+// success. Returns INVALID_ARGUMENT on invalid input.
+absl::Status ParseBoolAttrValue(absl::string_view attr_value, bool* bool_val);
+
+// Parses an int64_t from `attr_value` into `int_val` and returns OK status on
+// success. Returns INVALID_ARGUMENT on invalid input.
+absl::Status ParseIntAttrValue(absl::string_view attr_value, int64_t* int_val);
+
+inline std::vector<absl::string_view> AttrValueSplit(absl::string_view str) {
+  return absl::StrSplit(str, absl::MaxSplits('$', 1));
+}
+
+// Returns true if `attr_name` is an attribute that is not required by TFRT
+// (usually added by stages higher in the lowering process).
+bool IsUnusedAttribute(absl::string_view attr_name);
+
+// Fills in the passed in AttrValueMap `attr_value_map` with attributes from
+// `attrs`.
+llvm::Error FillAttrValueMap(const tfrt::OpAttrsRef& attrs,
+                             tfrt::HostContext* host,
+                             AttrValueMap* attr_value_map);
+
+// Fills in the passed in AttrValueMap `attr_value_map`.
+absl::Status SetUpAttrValueMap(tfrt::AggregateAttr op_attr_array, + tfrt::AggregateAttr op_func_attr_array, + tensorflow::AttrValueMap* attr_value_map); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_ATTR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/fallback_test_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/fallback_test_util.h new file mode 100644 index 00000000..cdfa6331 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/fallback_test_util.h @@ -0,0 +1,33 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_FALLBACK_TEST_UTIL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_FALLBACK_TEST_UTIL_H_ + +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +tfrt::ExecutionContext CreateFallbackTestExecutionContext( + tfrt::HostContext* host, tfrt::ResourceContext* resource_context, + tensorflow::thread::ThreadPoolInterface* user_intra_op_threadpool = + nullptr); + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_FALLBACK_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_metadata.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_metadata.h new file mode 100644 index 00000000..f192ace8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_metadata.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_METADATA_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_METADATA_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/runtime_fallback/util/type_util.h" +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/tensor_metadata.h" // from @tf_runtime + +namespace tensorflow::tfd { + +// Retrieves TFRT TensorMetadata from a tensorflow::Tensor. 
+inline tfrt::TensorMetadata GetTensorMetadata( + const tensorflow::Tensor& tf_tensor) { + auto dtype = tfd::GetTfrtDtype(tf_tensor.dtype()); + auto dim_sizes = tf_tensor.shape().dim_sizes(); + static_assert(sizeof(tfrt::Index) == sizeof(dim_sizes.front()), + "Invalid dimension type size"); + auto shape = llvm::ArrayRef(reinterpret_cast(dim_sizes.data()), + dim_sizes.size()); + return tfrt::TensorMetadata(dtype, shape); +} + +} // namespace tensorflow::tfd + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_METADATA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_util.h new file mode 100644 index 00000000..f974edf2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/tensor_util.h @@ -0,0 +1,68 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_UTIL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_UTIL_H_ + +#include +#include + +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/runtime_fallback/util/tensor_metadata.h" // IWYU pragma: export +#include "tfrt/dtype/dtype.h" // from @tf_runtime +#include "tfrt/host_context/host_buffer.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/tensor/string_host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/tensor_shape.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +struct TFTensorDeleter { + void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); } +}; +using OwnedTFTensor = std::unique_ptr; + +// Moves one ref on HostBuffer to tensorflow::Tensor. +tensorflow::Tensor MoveHostBufferToTfTensor( + tfrt::RCReference host_buffer, tfrt::DType dtype, + const tfrt::TensorShape& shape); + +// Creates a tensorflow::Tensor based on StringHostTensor. +tensorflow::Tensor CopyShtToTfTensor(const tfrt::StringHostTensor& sht); + +// Converts tfrt shape to tensorflow shape. +inline tensorflow::TensorShape GetTfShape(const tfrt::TensorShape& shape) { + llvm::SmallVector dimensions; + shape.GetDimensions(&dimensions); + llvm::SmallVector dims(dimensions.begin(), dimensions.end()); + return tensorflow::TensorShape(dims); +} + +inline void CheckBoolCompatibility() { + // sizeof(bool) is implementation defined. The following may only work when + // sizeof(bool) is 1. + // + // TODO(tfrt-devs): It is still undefined behavior to directly cast char* + // between bool* and access the data. Consider allocating target objects and + // using memcpy instead. 
+ static_assert(sizeof(bool) == 1, "Only support when bool is 1 byte."); +} + +} // namespace tfd +} // namespace tensorflow +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TENSOR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/type_util.h b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/type_util.h new file mode 100644 index 00000000..32a859c7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/runtime_fallback/util/type_util.h @@ -0,0 +1,59 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TYPE_UTIL_H_ +#define TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TYPE_UTIL_H_ + +#include "llvm/Support/ErrorHandling.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/logging.h" +#include "tfrt/dtype/dtype.h" // from @tf_runtime + +namespace tensorflow { +namespace tfd { + +// Map tfrt::Dtype to TF_DataType. +inline DataType GetTfDataType(tfrt::DType dtype) { + switch (dtype) { + case tfrt::DType::Invalid: + case tfrt::DType::Unsupported: + case tfrt::DType::Resource: + DCHECK(false) << "invalid dtype"; + return DataType::DT_INVALID; +#define DTYPE(TFRT_ENUM, DT_ENUM) \ + case tfrt::DType::TFRT_ENUM: \ + return DataType::DT_ENUM; +#include "tensorflow/core/runtime_fallback/util/dtype.def" // NOLINT + } +} + +inline tfrt::DType GetTfrtDtype(DataType dtype) { + switch (dtype) { + default: + return tfrt::DType(tfrt::DType::Unsupported); + case DataType::DT_INVALID: + return tfrt::DType(); + case DataType::DT_RESOURCE: + return tfrt::DType(tfrt::DType::Resource); +#define DTYPE(TFRT_ENUM, DT_ENUM) \ + case DataType::DT_ENUM: \ + return tfrt::DType(tfrt::DType::TFRT_ENUM); +#include "tensorflow/core/runtime_fallback/util/dtype.def" // NOLINT + } +} + +} // namespace tfd +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_RUNTIME_FALLBACK_UTIL_TYPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/summary/schema.h b/third_party/tflite-hdrs/tensorflow/core/summary/schema.h new file mode 100644 index 00000000..dc13bbfb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/summary/schema.h @@ -0,0 +1,34 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_SUMMARY_SCHEMA_H_ +#define TENSORFLOW_CORE_SUMMARY_SCHEMA_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/db/sqlite.h" + +namespace tensorflow { + +constexpr uint32 kTensorboardSqliteApplicationId = 0xfeedabee; + +/// \brief Creates TensorBoard SQLite tables and indexes. +/// +/// If they are already created, this has no effect. If schema +/// migrations are necessary, they will be performed with logging. +absl::Status SetupTensorboardSqliteDb(Sqlite* db); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_SUMMARY_SCHEMA_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/summary/summary_converter.h b/third_party/tflite-hdrs/tensorflow/core/summary/summary_converter.h new file mode 100644 index 00000000..ab196692 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/summary/summary_converter.h @@ -0,0 +1,39 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_ +#define TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// TODO(jart): Delete these methods in favor of new Python implementation. +absl::Status AddTensorAsScalarToSummary(const Tensor& t, const string& tag, + Summary* s); +absl::Status AddTensorAsHistogramToSummary(const Tensor& t, const string& tag, + Summary* s); +absl::Status AddTensorAsImageToSummary(const Tensor& tensor, const string& tag, + int max_images, const Tensor& bad_color, + Summary* s); +absl::Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag, + int max_outputs, float sample_rate, + Summary* s); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/summary/summary_db_writer.h b/third_party/tflite-hdrs/tensorflow/core/summary/summary_db_writer.h new file mode 100644 index 00000000..545f849e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/summary/summary_db_writer.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_ +#define TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/db/sqlite.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +/// \brief Creates SQLite SummaryWriterInterface. +/// +/// This can be used to write tensors from the execution graph directly +/// to a database. The schema must be created beforehand. Entries in +/// Users, Experiments, and Runs tables will be created automatically +/// if they don't already exist. +/// +/// Please note that the type signature of this function may change in +/// the future if support for other DBs is added to core. +/// +/// The result holds a new reference to db. +absl::Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name, + const string& run_name, + const string& user_name, Env* env, + SummaryWriterInterface** result); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/summary/summary_file_writer.h b/third_party/tflite-hdrs/tensorflow/core/summary/summary_file_writer.h new file mode 100644 index 00000000..847e7cb8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/summary/summary_file_writer.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_ +#define TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +/// \brief Creates SummaryWriterInterface which writes to a file. +/// +/// The file is an append-only records file of tf.Event protos. That +/// makes this summary writer suitable for file systems like GCS. +/// +/// It will enqueue up to max_queue summaries, and flush at least every +/// flush_millis milliseconds. The summaries will be written to the +/// directory specified by logdir and with the filename suffixed by +/// filename_suffix. The caller owns a reference to result if the +/// returned status is ok. The Env object must not be destroyed until +/// after the returned writer. 
+absl::Status CreateSummaryFileWriter(int max_queue, int flush_millis, + const string& logdir, + const string& filename_suffix, Env* env, + SummaryWriterInterface** result); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/async_value_tensor.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/async_value_tensor.h new file mode 100644 index 00000000..06e99f8f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/async_value_tensor.h @@ -0,0 +1,72 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_ASYNC_VALUE_TENSOR_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_ASYNC_VALUE_TENSOR_H_ + +#include +#include + +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" +#include "tfrt/support/forward_decls.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tensorflow { + +// The implementation of a Tensor for an AsyncValue and PjRtBuffer. We used it +// to integrate TF with TFRT. +// TODO(b/243983834) After the migration of using PjRt for data transfer is +// completed, GetAsyncRef and SetAsyncRef will be removed and this class will be +// renamed to PjRtBufferTensor. +class AsyncValueTensor { + public: + // Downcast from a Tensor to an AsyncValueTensor. Return nullptr if the + // downcast fails. + static AsyncValueTensor* FromTensor(const Tensor* tensor); + + const tfrt::RCReference& GetAsyncRef(); + + void SetAsyncRef(tfrt::RCReference av_ref); + + std::shared_ptr GetBuffer(); + + void SetBuffer(std::shared_ptr buffer); + + // Convert from a raw pointer to an AsyncValueTensor, removing the pointer + // tag. + static AsyncValueTensor* FromOpaquePointer(void* ptr); + + // Convert to a raw pointer from an AsyncValueTensor, adding the pointer tag. + static void* ToOpaquePointer(AsyncValueTensor* tensor); + + private: + tfrt::RCReference av_ref_; + std::shared_ptr buffer_; +}; + +class AsyncValueAllocator : public Allocator { + public: + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + void DeallocateRaw(void* ptr) override; + + bool AllocatesOpaqueHandle() const override { return true; } + string Name() override { return "async-value"; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_ASYNC_VALUE_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/create_pjrt_client_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/create_pjrt_client_util.h new file mode 100644 index 00000000..fe8dfbb8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/create_pjrt_client_util.h @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_CREATE_PJRT_CLIENT_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_CREATE_PJRT_CLIENT_UTIL_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// Gets PJRT client from TFGlobalResourceManager. If it is not found, creates a +// PJRT client and adds it to TFGlobalResourceManager. Different `DeviceType` +// can choose to create the PJRT client explicitly (e.g. in ops) and add it to +// TFGlobalResourceManager, or create a PJRT client on the first use implicitly +// in this method. +// The inputs are the device_type of the caller, and an optional +// set of device IDs `allowed_devices` for which the stream executor will be +// created. `allowed_devices` is only used for GPU. +// TODO(b/260802979): consider passing `XlaPlatformInfo` for the options to +// create a client, or creating a class similar to `LocalClientOptions`. +// TODO(b/280111106): make PjrtClientFactoryOptions an input of +// GetOrCreatePjRtClient. +absl::StatusOr GetOrCreatePjRtClient( + const DeviceType& device_type, + std::optional> allowed_devices = std::nullopt); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_CREATE_PJRT_CLIENT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/global_state.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/global_state.h new file mode 100644 index 00000000..117a4365 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/global_state.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_GLOBAL_STATE_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_GLOBAL_STATE_H_ + +#include "tensorflow/core/framework/resource_mgr.h" +#include "tfrt/host_context/host_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_global { + +class GlobalHostContext { + public: + static void Set(::tfrt::HostContext* host_ctx); + static ::tfrt::HostContext* Get(); + + private: + static ::tfrt::HostContext* host_ctx_; +}; + +// A global resource manager in TF core framework. It can be used to store +// resources that are per host. 
+ResourceMgr* GetTFGlobalResourceMgr(); + +} // namespace tfrt_global +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_GLOBAL_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/metrics.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/metrics.h new file mode 100644 index 00000000..e8486b68 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/metrics.h @@ -0,0 +1,37 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_COMMON_METRICS_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_METRICS_H_ + +#include +#include + +#include "xla/tsl/lib/monitoring/sampler.h" + +namespace tensorflow { +namespace tfrt_metrics { + +tsl::monitoring::SamplerCell* GetTfrtGraphExecutorLatencySampler( + const std::string& model_name, int64_t model_version, + const std::string& graph_name); + +tsl::monitoring::SamplerCell* GetTfrtDeviceExecutionLatency( + const std::string& model_name, int64_t model_version); + +} // namespace tfrt_metrics +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_options.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_options.h new file mode 100644 index 00000000..70e3092c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_options.h @@ -0,0 +1,42 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_OPTIONS_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_OPTIONS_H_ + +#include +#include +#include + +namespace xla { +// PjrtClientFactoryOptions store arguments to create PJRT client. +// Caller is responsible to set option value for corresponding PJRT client +// factory. 
+struct PjrtClientFactoryOptions { + struct GpuClientCreateOptions { + bool asynchronous = false; + int node_id = 0; + std::optional> allowed_devices = std::nullopt; + std::optional platform_name = std::nullopt; + }; + + struct CpuClientCreateOptions { + bool asynchronous = false; + }; + GpuClientCreateOptions gpu_options; + CpuClientCreateOptions cpu_options; +}; +} // namespace xla + +#endif // TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h new file mode 100644 index 00000000..2a04e9af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_client_factory_registry.h @@ -0,0 +1,73 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_REGISTRY_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_REGISTRY_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "xla/pjrt/pjrt_client.h" +#include "xla/tsl/framework/device_type.h" +#include "tensorflow/core/framework/registration/registration.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/tfrt/common/pjrt_client_factory_options.h" +#include "tsl/platform/thread_annotations.h" + +namespace xla { + +using PjrtClientFactory = + std::function>( + const PjrtClientFactoryOptions&)>; + +// The Pjrt client factory registry holds all the registered client factories. +class PjrtClientFactoryRegistry { + public: + explicit PjrtClientFactoryRegistry() = default; + + // Registers PjrtClientFactory with DeviceType as key. + tensorflow::InitOnStartupMarker RegisterPjrtClientFactory( + const tsl::DeviceType& device_type, + const PjrtClientFactory& client_factory); + + // Given the device type, finds related PjrtClientFactory function which takes + // factory option and returns PjrtClient if succeeds. + absl::StatusOr> GetPjrtClient( + const tsl::DeviceType& device_type, + const PjrtClientFactoryOptions& options); + + // Returns singleton instance of PjrtClientFactoryRegistry class. + static PjrtClientFactoryRegistry& Get(); + + private: + absl::flat_hash_map registry_ + TF_GUARDED_BY(mu_); + + mutable ::tensorflow::mutex mu_; +}; + +// The `REGISTER_PJRT_CLIENT_FACTORY()` calls RegisterPjrtClientFactory on +// program startup. 
+#define REGISTER_PJRT_CLIENT_FACTORY(pjrt_client, device_type, client_factory) \ + static ::tensorflow::InitOnStartupMarker const register_##pjrt_client = \ + ::tensorflow::InitOnStartupMarker{} \ + << ::xla::PjrtClientFactoryRegistry::Get().RegisterPjrtClientFactory( \ + tsl::DeviceType(device_type), client_factory) + +} // namespace xla + +#endif // TENSORFLOW_CORE_TFRT_COMMON_PJRT_CLIENT_FACTORY_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_state.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_state.h new file mode 100644 index 00000000..c3df6806 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_state.h @@ -0,0 +1,90 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_PJRT_STATE_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_PJRT_STATE_H_ + +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/local_device_state.h" +#include "xla/pjrt/pjrt_client.h" +#include "xla/stream_executor/integrations/tf_allocator_adapter.h" +#include "xla/tsl/framework/allocator.h" +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +const char kPjRtStateResourceName[] = "pjrt_state"; +using PjRtClientsMap = std::map>; + +// Information needed to create a PjRt GPU Client which is used when creating +// a client after after information about remote devices is available. +struct PjRtGpuClientCreationInfo { + std::set allowed_devices; + std::unique_ptr allocator; + std::unique_ptr host_memory_allocator; + std::map> local_device_states; + xla::LocalClient* local_client; +}; + +// The class for the state related to PjRt. It contains a map from `DeviceType` +// to `PjRtClient`. It will be stored in the global `ResourceManager`. +class PjRtState : public ResourceBase { + public: + static PjRtState* Create(); + absl::StatusOr GetPjRtClient(const DeviceType& device_type); + absl::StatusOr GetOrCreatePjRtClient( + const DeviceType& device_type); + absl::Status SetPjRtClient(const DeviceType& device_type, + std::unique_ptr client); + // Moves PJRT client to `unused_`. The PJRT client moved to `unused_` will not + // be returned by `GetPjRtClient`. + absl::Status MovePjRtClientToUnused(const DeviceType& device_type); + string DebugString() const override; + + // Saves information needed to create a PJRT client (to enable creating a + // client with remote devices). + absl::Status SetPjRtGpuClientCreationInfo( + std::unique_ptr info); + + // Retrieves information needed to create a PJRT client (for creating a + // client with remote devices). 
+ PjRtGpuClientCreationInfo* GetPjRtGpuClientCreationInfo(); + + private: + explicit PjRtState() {} + absl::Mutex mu_; + PjRtClientsMap clients_ ABSL_GUARDED_BY(mu_); + // Store the PJRT clients that are no longer used to guarantee that PJRT + // clients outlive PJRT buffers. + std::vector> unused_ ABSL_GUARDED_BY(mu_); + + std::unique_ptr pjrt_gpu_client_creation_info_ + ABSL_GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_PJRT_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_util.h new file mode 100644 index 00000000..aaba7ad9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/common/pjrt_util.h @@ -0,0 +1,45 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_COMMON_PJRT_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_COMMON_PJRT_UTIL_H_ + +#include + +#include "absl/status/statusor.h" +#include "xla/pjrt/pjrt_client.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/common/pjrt_state.h" + +namespace tensorflow { + +// Sets PJRT client for device_type in TFGlobalResourceManager. If a PJRT client +// for this device_type already exists, the existing PJRT client will not be +// destroyed, and will be kept alive in an "unused client" vector. PJRT API +// semantics require the PJRT client to outlive PJRT buffers. +absl::Status SetPjRtClientInTFGlobalResourceManager( + const DeviceType& device_type, std::unique_ptr client); + +// Gets (the most recent) PJRT client for device_type from +// TFGlobalResourceManager. +absl::StatusOr GetPjRtClient(const DeviceType& device_type); + +absl::Status SetPjRtGpuClientCreationInfoInTFGlobalResourceManager( + std::unique_ptr info); +absl::StatusOr GetPjRtGpuClientCreationInfo(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_COMMON_PJRT_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/cost_recorder.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/cost_recorder.h new file mode 100644 index 00000000..e1d1b7f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/cost_recorder.h @@ -0,0 +1,69 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This file defines a recorder for op cost measurement. + +#ifndef TENSORFLOW_CORE_TFRT_FALLBACK_COST_RECORDER_H_ +#define TENSORFLOW_CORE_TFRT_FALLBACK_COST_RECORDER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace tfrt_stub { + +// Thread-safe. +// Maintains the execution durations by `op_key`. Note that `op_key` is only +// unique within a model. +class CostRecorder { + public: + // Records an execution duration for the op keyed by `op_key`. + void RecordCost(int64_t op_key, uint64_t execution_time); + + // Returns the normalized average execution duration of the op keyed by + // `op_key`. If there is no record for `op_key`, returns the uint32_t::max to + // avoid stream merging. Note that we don't use uint64_t::max because + // otherwise adding op costs would cause overflow. + uint64_t GetCost(int64_t op_key) const; + + // Writes the op cost map (in format of `OpCostMapProto`) to a file specified + // by the env var name `MesuredCostPathEnvVarName()`. + // TODO(b/263837451): Fix the op_key unstableness during serialization. + absl::Status WriteToFile() const; + + size_t size() const; + + static const char* MesuredCostPathEnvVarName() { + return "TF_TFRT_MEASURED_COST_PATH"; + } + + private: + mutable tensorflow::mutex op_cost_map_mutex_; + // Map op key to {sum of op execution duration, #occurences of the op}. + absl::flat_hash_map> op_cost_map_ + TF_GUARDED_BY(op_cost_map_mutex_); +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_FALLBACK_COST_RECORDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/device_with_custom_allocator.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/device_with_custom_allocator.h new file mode 100644 index 00000000..f04e95f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/device_with_custom_allocator.h @@ -0,0 +1,101 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_FALLBACK_DEVICE_WITH_CUSTOM_ALLOCATOR_H_ +#define TENSORFLOW_CORE_TFRT_FALLBACK_DEVICE_WITH_CUSTOM_ALLOCATOR_H_ + +#include + +#include "xla/tsl/framework/allocator.h" +#include "tensorflow/core/framework/device.h" + +namespace tensorflow { +namespace tfrt_stub { + +class DeviceWithCustomAllocator : public tensorflow::Device { + public: + DeviceWithCustomAllocator(tensorflow::Device* device, + tensorflow::Allocator* allocator) + : Device(device->env(), device->attributes()), + device_(device), + allocator_(allocator) { + DCHECK(device_); + DCHECK(allocator_); + } + + Allocator* GetAllocator(AllocatorAttributes attr) override { + return allocator_; + } + + const DeviceBase* UnderlyingDevice() const override { + return device_->UnderlyingDevice(); + } + DeviceBase* UnderlyingDevice() override { + return device_->UnderlyingDevice(); + } + + const CpuWorkerThreads* tensorflow_cpu_worker_threads() const override { + return device_->tensorflow_cpu_worker_threads(); + } + + Allocator* GetScopedAllocator(AllocatorAttributes attr, + int64_t step_id) override { + return device_->GetScopedAllocator(attr, step_id); + } + + ScopedAllocatorMgr* GetScopedAllocatorMgr() const override { + return device_->GetScopedAllocatorMgr(); + } + + const Eigen::ThreadPoolDevice* eigen_cpu_device() override { + return device_->eigen_cpu_device(); + } + + thread::ThreadPool* tensorflow_device_thread_pool() override { + return device_->tensorflow_device_thread_pool(); + } + + bool has_eigen_cpu_device() const override { + return device_->has_eigen_cpu_device(); + } + + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override { + return device_->MakeTensorFromProto(tensor_proto, alloc_attrs, tensor); + } + + void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, + const DeviceContext* device_context, + StatusCallback done) override { + device_->CopyTensorInSameDevice(input_tensor, output_tensor, device_context, + std::move(done)); + } + + absl::Status Sync() override { return device_->Sync(); } + + // Returns the resource manager associated w/ this device. + ResourceMgr* resource_manager() override { + return device_->resource_manager(); + } + + private: + tensorflow::Device* device_ = nullptr; + tensorflow::Allocator* allocator_ = nullptr; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_FALLBACK_DEVICE_WITH_CUSTOM_ALLOCATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/fallback_state.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/fallback_state.h new file mode 100644 index 00000000..ffbf0695 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/fallback_state.h @@ -0,0 +1,105 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_FALLBACK_FALLBACK_STATE_H_ +#define TENSORFLOW_CORE_TFRT_FALLBACK_FALLBACK_STATE_H_ + +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/graph_execution_state.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +namespace tfrt_stub { + +// FallbackState contains the necessary runtime states (eg. Devices) used in +// current tensorflow. It also provides methods used in current tensorflow. +class FallbackState { + public: + // The FunctionDefLibrary is passed in to initialize the + // ProcessFunctionLibraryRuntime member of this class + static absl::StatusOr> Create( + const SessionOptions &session_options, + const tensorflow::FunctionDefLibrary &fdef_lib); + + static absl::StatusOr> CreateWithCpuDevice( + const SessionOptions &session_options, + const tensorflow::FunctionDefLibrary &fdef_lib); + + static absl::StatusOr> CreateWithMockGpuDevice( + const SessionOptions &session_options, + const tensorflow::FunctionDefLibrary &fdef_lib); + + static absl::StatusOr> CreateWithDeviceMgr( + const SessionOptions &session_options, + const tensorflow::FunctionDefLibrary &fdef_lib, + absl::Nonnull device_mgr); + + FallbackState(const SessionOptions &session_options, + std::variant>, + absl::Nonnull> + device_mgr, + const tensorflow::FunctionDefLibrary &fdef_lib); + + // Create GraphExecutionState from the `graph_def`. The result will contain a + // preprocessed graph with runtime information such as devices. + absl::StatusOr> + CreateGraphExecutionState(GraphDef graph_def, bool run_placer = true, + bool enable_tf2xla_mlir_bridge = true) const; + + // Adds `func_def` to the function library. + absl::Status AddFunctionDef(const FunctionDef &func_def); + + const SessionOptions &session_options() const { return session_options_; } + + const DeviceMgr &device_manager() const { return *device_manager_ptr_; } + + DeviceMgr &device_manager() { return *device_manager_ptr_; } + + const DeviceSet &device_set() const { return device_set_; } + + const ProcessFunctionLibraryRuntime &process_function_library_runtime() + const { + return pflr_; + } + + const FunctionLibraryDefinition &func_lib_def() const { + return func_lib_def_; + } + + private: + SessionOptions session_options_; + DynamicDeviceMgr device_manager_; + absl::Nonnull device_manager_ptr_; + DeviceSet device_set_; + FunctionLibraryDefinition func_lib_def_; + ProcessFunctionLibraryRuntime pflr_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_FALLBACK_FALLBACK_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner.h new file mode 100644 index 00000000..317d0956 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner.h @@ -0,0 +1,236 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_H_ +#define TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_H_ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/inlined_vector.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tfrt_stub { + +class OpKernelRunner { + public: + static absl::StatusOr Create( + absl::string_view op_name, absl::string_view node_name, + absl::string_view device_name, int num_args, + const std::function& + attr_builder, + const tensorflow::DeviceMgr& device_manager, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime); + + ABSL_DEPRECATED("Please use the Create() method that takes node_name.") + static absl::StatusOr Create( + absl::string_view op_name, absl::string_view device_name, int num_args, + const std::function& + attr_builder, + const tensorflow::DeviceMgr& device_manager, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime) { + return Create(op_name, /*node_name=*/op_name, device_name, num_args, + attr_builder, device_manager, + process_function_library_runtime); + } + + static absl::StatusOr Create( + absl::string_view op_name, absl::string_view node_name, int num_args, + const std::function& + attr_builder, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime, + tensorflow::Device* device); + + ABSL_DEPRECATED("Please use the Create() method that takes node_name.") + static absl::StatusOr Create( + absl::string_view op_name, int num_args, + const std::function& + attr_builder, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime, + tensorflow::Device* device) { + return Create(op_name, /*node_name=*/op_name, num_args, attr_builder, + process_function_library_runtime, device); + } + + OpKernelRunner() = default; + + explicit operator bool() const { return op_kernel_ != nullptr; } + + void Run(OpKernelContext* context) const { + DVLOG(1) << "KernelFallbackExecuteCompat Running Op: " + << op_kernel_->def().DebugString() + << ", on Device: " << context->device()->name(); + + // For TFRT GPU or TPU, we currently only run xla 
clusters on GPU or TPU, + // and all other ops are run on CPU. + + op_kernel_->Compute(context); + } + + void RunAsync(OpKernelContext* context, + AsyncOpKernel::DoneCallback done_callback) const; + + bool IsAsync() const { return info_->is_async; } + + tensorflow::OpKernel* op_kernel() const { return op_kernel_.get(); } + tensorflow::Device* device() const { return info_->device; } + tensorflow::FunctionLibraryRuntime* function_library_runtime() const { + return info_->function_library_runtime; + } + tensorflow::ResourceMgr* resource_manager() const { + return info_->resource_manager; + } + + absl::Span input_alloc_attrs() const { + return input_alloc_attrs_; + } + absl::Span output_alloc_attrs() const { + return output_alloc_attrs_; + } + + private: + explicit OpKernelRunner( + tensorflow::Device* device, + tensorflow::FunctionLibraryRuntime* function_library_runtime, + std::unique_ptr op_kernel); + + std::unique_ptr op_kernel_; + absl::Span input_alloc_attrs_; + absl::Span output_alloc_attrs_; + + struct Info { + tensorflow::Device* device = nullptr; + tensorflow::FunctionLibraryRuntime* function_library_runtime = nullptr; + tensorflow::ResourceMgr* resource_manager = nullptr; + bool is_async = false; + absl::InlinedVector input_alloc_attrs; + absl::InlinedVector output_alloc_attrs; + }; + std::unique_ptr info_; +}; + +// OpKernelRunState keeps the states needed for per-kernel execution. +struct OpKernelRunState { + std::vector tensor_buffers; + std::vector input_tf_tensor_values; + OpKernelContext::Params params; + absl::InlinedVector input_tf_tensors; + + OpKernelRunState() = default; + OpKernelRunState(absl::Span tensor_values, + const OpKernelContext::Params& p, + tensorflow::DeviceBase* device = nullptr) { + // `input_tf_tensor_values` contains the reference to all tensor used, + // while `input_tf_tensors` only contains those needs ownership so their + // sizes may not match. For this copy assignment, we conservatively copy all + // tensors. + input_tf_tensors.reserve(tensor_values.size()); + for (const auto& tensor_value : tensor_values) { + input_tf_tensors.push_back(*tensor_value.tensor); + } + for (auto& tensor : input_tf_tensors) { + input_tf_tensor_values.emplace_back(&tensor); + } + + // Since `input_tf_tensor_values` and `params` contains pointers to + // `input_tf_tensors`, we need to change those pointers to the correct ones + // after copying. + params = p; + params.inputs = input_tf_tensor_values; + // Clear eigen_gpu_device to ensure OpKernelContext constructor will make a + // new eigen GPU device. + params.eigen_gpu_device = nullptr; + if (device != nullptr) params.device = device; + } + + OpKernelRunState(const OpKernelRunState& other) = delete; + OpKernelRunState& operator=(const OpKernelRunState& other) = delete; + + ~OpKernelRunState() = default; +}; + +// OpKernelRunnerTable for keeping OpKernelRunner instances to avoid expensive +// reinstantiation of OpKernel and other repeated setup per kernel execution. +// OpKernelRunnerTable is thread-compatible. +class OpKernelRunnerTable { + public: + OpKernelRunnerTable() = default; + + // Return true if it successfully inserts `runner`. `index` is supposed to be + // dense. + bool Insert(int64_t index, OpKernelRunner runner) { + if (runners_.size() <= index) runners_.resize(index + 1); + if (runners_[index]) return false; + runners_[index] = std::move(runner); + return true; + } + + // Return the OpKernelRunner at the corresponding `index` in the table. The + // result can never be nullptr. 
It is a fatal error to use an index that is + // not in the table. Note that the returned pointer will be invalidated if + // Insert() is called. + const OpKernelRunner* Get(int64_t index) const { + // Out of bounds vector access will throw an exception and anyway will crash + // the binary, prefer a more readable error message. + CHECK_GT(runners_.size(), index) // Crash OK + << "runner index is out of bounds: index=" << index + << " size=" << runners_.size(); + CHECK(runners_[index]) // Crash OK + << "runner is not available: index=" << index; + return GetUnsafe(index); + } + + const OpKernelRunner* GetUnsafe(int64_t index) const { + DCHECK_GT(runners_.size(), index); + auto& result = runners_[index]; + DCHECK(result); + return &result; + } + + private: + std::vector runners_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h new file mode 100644 index 00000000..64f1060e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h @@ -0,0 +1,74 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_CACHE_H_ +#define TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_CACHE_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tfrt/host_context/location.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +class OpLocationKey { + public: + explicit OpLocationKey(tfrt::Location loc) : loc_(loc) {} + + template + friend H AbslHashValue(H h, const OpLocationKey& key) { + // NOTE: Each BEF file has its own LocationHandler. Using LocationHandler + // as part of cache key here can avoid cache collision between different + // BEF file. + return H::combine(std::move(h), key.loc_.data, key.loc_.GetHandler()); + } + + friend bool operator==(const OpLocationKey& x, const OpLocationKey& y) { + return x.loc_.data == y.loc_.data && + x.loc_.GetHandler() == y.loc_.GetHandler(); + } + + private: + tfrt::Location loc_; +}; + +// OpKernelRunnerCache is similar to OpKernelRunnerTable but thread-safe. 
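For context, a minimal usage sketch of the OpKernelRunnerTable declared above (not part of the patch): populate the table once at load time with a dense index per kernel, then fetch the cached runner on the hot path. The names `CacheRunner`, `RunCached`, and `kernel_index` are illustrative, and the snippet assumes a TensorFlow/TFRT source tree where this header builds.

#include <cstdint>
#include <utility>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h"

namespace example {

using tensorflow::tfrt_stub::OpKernelRunner;
using tensorflow::tfrt_stub::OpKernelRunnerTable;

// Returns false if a runner was already cached at `kernel_index`.
bool CacheRunner(OpKernelRunnerTable& table, int64_t kernel_index,
                 OpKernelRunner runner) {
  return table.Insert(kernel_index, std::move(runner));
}

void RunCached(const OpKernelRunnerTable& table, int64_t kernel_index,
               tensorflow::OpKernelContext* ctx) {
  // Get() CHECK-fails on a missing or out-of-range index, so the hot path can
  // assume a valid runner here.
  table.Get(kernel_index)->Run(ctx);
}

}  // namespace example

The OpKernelRunnerCache declared next serves the same purpose, but it is keyed by op location and guarded by a mutex, so it can be populated lazily from concurrent requests.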
+class OpKernelRunnerCache { + public: + OpKernelRunnerCache() = default; + + absl::StatusOr GetOrCreate( + tfrt::Location loc, absl::string_view op_name, + absl::string_view device_name, int num_args, + const std::function& + attr_builder, + const tensorflow::DeviceMgr& device_manager, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime); + + private: + mutable mutex mu_; + absl::flat_hash_map> map_ + TF_GUARDED_BY(mu_); +}; + +} // namespace tfrt_stub +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TFRT_FALLBACK_OP_KERNEL_RUNNER_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/gpu_runner.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/gpu_runner.h new file mode 100644 index 00000000..5c51b8d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/gpu_runner.h @@ -0,0 +1,73 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GPU_KERNEL_GPU_RUNNER_H_ +#define TENSORFLOW_CORE_TFRT_GPU_KERNEL_GPU_RUNNER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "llvm/ADT/SmallVector.h" +#include "xla/tsl/framework/serving_device_selector.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/tfrt/utils/fallback_tensor.h" +#include "tensorflow/core/tfrt/utils/gpu_variables_table.h" +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tensorflow { +namespace gpu { + +constexpr char kGpuRunnerResourceName[] = "GpuRunnerResource"; + +struct GpuRunInputs { + std::vector args; + int num_outputs; + std::vector resource_indices; + std::vector used_output_indices; + std::string func_name; + Device* cpu_device; + absl::flat_hash_map gpu_devices; + const tfd::KernelFallbackCompatRequestState* fallback_request_state; + tfrt::HostContext* host_ctx; +}; + +class GpuRunner { + public: + explicit GpuRunner(tsl::ServingDeviceSelector* serving_device_selector) + : serving_device_selector_(serving_device_selector) {} + + // This compiles the given program and runs the given input tensors in + // `run_inputs`, and returns the output tensor AsyncValues. 
+ absl::StatusOr< + llvm::SmallVector>> + Run(GpuRunInputs run_inputs); + + private: + tsl::ServingDeviceSelector* serving_device_selector_; + tfrt::gpu::GpuVariablesTable vars_table_; +}; + +} // namespace gpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GPU_KERNEL_GPU_RUNNER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/tfrt_gpu_init.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/tfrt_gpu_init.h new file mode 100644 index 00000000..f36356b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/gpu/kernel/tfrt_gpu_init.h @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GPU_KERNEL_TFRT_GPU_INIT_H_ +#define TENSORFLOW_CORE_TFRT_GPU_KERNEL_TFRT_GPU_INIT_H_ +#include "absl/status/status.h" +#include "xla/tsl/framework/serving_device_selector_policies.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" + +namespace tensorflow { +namespace gpu { + +struct GpuRunnerOptions { + int num_gpu_streams = 1; + tsl::ServingDeviceSelectorPolicy serving_selector_policy = + tsl::ServingDeviceSelectorPolicy::kRoundRobin; +}; + +absl::Status InitTfrtGpu(const GpuRunnerOptions& options, + tensorflow::tfrt_stub::Runtime& runtime); + +} // namespace gpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GPU_KERNEL_TFRT_GPU_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/config.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/config.h new file mode 100644 index 00000000..b0e3fbf1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/config.h @@ -0,0 +1,84 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
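For context, a minimal sketch (not part of the patch) of enabling the TFRT GPU runner through the init API in tfrt_gpu_init.h above. The stream count is an illustrative value, and `runtime` is assumed to be a live Runtime owned by the caller.

#include "tensorflow/core/tfrt/gpu/kernel/tfrt_gpu_init.h"

absl::Status EnableTfrtGpu(tensorflow::tfrt_stub::Runtime& runtime) {
  tensorflow::gpu::GpuRunnerOptions options;
  options.num_gpu_streams = 2;  // default is 1
  options.serving_selector_policy =
      tsl::ServingDeviceSelectorPolicy::kRoundRobin;  // same as the default
  return tensorflow::gpu::InitTfrtGpu(options, runtime);
}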
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_CONFIG_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_CONFIG_H_ + +#include + +#include "google/protobuf/any.pb.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/tfrt/graph_executor/config.pb.h" + +namespace tensorflow { +namespace tfrt_stub { + +// The helper class for building RuntimeConfigProto and retrieving configs of +// certain types from the RuntimeConfigProto. +class RuntimeConfig { + public: + RuntimeConfig() = default; + + static absl::StatusOr CreateFromProto( + RuntimeConfigProto proto); + + template + absl::Status Add(const ConcreteProto& config) { + const auto& full_name = config.GetDescriptor()->full_name(); + if (map_.contains(full_name)) { + return absl::AlreadyExistsError( + absl::StrCat(full_name, " already exists in ModelConfig.")); + } + + size_t id = proto_.config_size(); + if (!proto_.add_config()->PackFrom(config)) { + return absl::InvalidArgumentError( + absl::StrCat("Failed to pack proto to Any: ", full_name)); + } + map_[full_name] = id; + return absl::OkStatus(); + } + + template + absl::StatusOr Get() const { + const auto& full_name = ConcreteProto::GetDescriptor()->full_name(); + auto iter = map_.find(full_name); + + if (iter == map_.end()) { + return absl::NotFoundError( + absl::StrCat(full_name, " not found in ModelConfig.")); + } + + ConcreteProto config; + if (!proto_.config().at(iter->second).UnpackTo(&config)) { + return absl::DataLossError( + absl::StrCat("Failed to unpack proto: ", full_name)); + } + return config; + } + + const RuntimeConfigProto& ToProto() const { return proto_; } + + private: + RuntimeConfigProto proto_; + absl::flat_hash_map map_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_CONFIG_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/executable_context.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/executable_context.h new file mode 100644 index 00000000..fb02ab34 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/executable_context.h @@ -0,0 +1,65 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
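For context, a sketch (not part of the patch) of the Add/Get round trip on the RuntimeConfig helper above. `MyTuningProto` is a hypothetical protobuf message standing in for a real model-specific config proto; it is not defined anywhere in this change.

#include "tensorflow/core/tfrt/graph_executor/config.h"

absl::Status AttachTuning(tensorflow::tfrt_stub::RuntimeConfig& runtime_config,
                          const MyTuningProto& tuning) {
  // Add() packs the message into the underlying RuntimeConfigProto as a
  // google.protobuf.Any keyed by the message's full name, so adding the same
  // message type twice yields AlreadyExistsError.
  return runtime_config.Add(tuning);
}

absl::StatusOr<MyTuningProto> ReadTuning(
    const tensorflow::tfrt_stub::RuntimeConfig& runtime_config) {
  // Get<T>() looks the entry up by T's full name and unpacks it, returning an
  // error if the message type was never added.
  return runtime_config.Get<MyTuningProto>();
}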
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXECUTABLE_CONTEXT_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXECUTABLE_CONTEXT_H_ + +#include +#include + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/bef_executor/bef_file.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// Stores executable-related data. +struct ExecutableContext { + ExecutableContext(mlrt::bc::Buffer bytecode_buffer, + std::unique_ptr bytecode_executable) + : bytecode_buffer(std::move(bytecode_buffer)), + bytecode_executable(std::move(bytecode_executable)) {} + + ExecutableContext(tfrt::BefBuffer bef, + tfrt::RCReference bef_file) + : bef(std::move(bef)), bef_file(std::move(bef_file)) {} + + bool IsForMlrt() const { return bytecode_executable != nullptr; } + + // Only one set of values will be filled. + + // For the MLRT path. + mlrt::bc::Buffer bytecode_buffer; + std::unique_ptr bytecode_executable; + + // For the TFRT path. + tfrt::BefBuffer bef; + tfrt::RCReference bef_file; + + // There are some resources that need re-creating when the executable is + // re-created, so a resource context is stored along with the executable. + // This resource context is meant to be passed to the op kernels for their + // references. See the comment above `GraphExecutor::resource_context_` + // about the todo to merge that resource context with this one. + tfrt::ResourceContext resource_context; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXECUTABLE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/export_mlir.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/export_mlir.h new file mode 100644 index 00000000..ac687711 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/export_mlir.h @@ -0,0 +1,64 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
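For context, a small sketch (not part of the patch): callers of the ExecutableContext struct above dispatch on which of its two member sets is populated.

#include "tensorflow/core/tfrt/graph_executor/executable_context.h"

const char* ExecutableFlavor(
    const tensorflow::tfrt_stub::ExecutableContext& context) {
  // IsForMlrt() is true when the MLRT bytecode executable is set; otherwise
  // the BEF buffer/file members are the active ones.
  return context.IsForMlrt() ? "mlrt-bytecode" : "tfrt-bef";
}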
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXPORT_MLIR_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXPORT_MLIR_H_ + +#include +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace tensorflow { +namespace tfrt_stub { + +class XsymbolUploader { + public: + virtual ~XsymbolUploader() = default; + + virtual std::string MaybeUploadMlirToXsymbol(mlir::ModuleOp module) { + return ""; + } +}; + +class XsymbolUploaderRegistry { + public: + XsymbolUploaderRegistry() + : xsymbol_uploader_(std::make_unique()) {} + + void Register(std::unique_ptr xsymbol_uploader) { + xsymbol_uploader_ = std::move(xsymbol_uploader); + } + + XsymbolUploader &Get() const { return *xsymbol_uploader_; } + + private: + std::unique_ptr xsymbol_uploader_; +}; + +inline XsymbolUploaderRegistry &GetGlobalXsymbolUploaderRegistry() { + static auto *const registry = new XsymbolUploaderRegistry; + return *registry; +} + +inline std::string MaybeUploadMlirToXsymbol(mlir::ModuleOp module) { + return GetGlobalXsymbolUploaderRegistry().Get().MaybeUploadMlirToXsymbol( + module); +} + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_EXPORT_MLIR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_execution_options.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_execution_options.h new file mode 100644 index 00000000..32d0a007 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_execution_options.h @@ -0,0 +1,162 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTION_OPTIONS_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTION_OPTIONS_H_ + +#include +#include +#include +#include + +#include "absl/time/time.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/tfrt/graph_executor/config.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" + +namespace tensorflow { +namespace tfrt_stub { + +class Runtime; + +// General options for graph execution. +struct GraphExecutionOptions { + explicit GraphExecutionOptions(const tensorflow::tfrt_stub::Runtime* rt) + : runtime(rt) { + DCHECK(runtime); + } + + // If true, when creating an optimized subgraph, Placer and Grappler will + // also run on the functions. + bool run_placer_grappler_on_functions = false; + + // If true, the function optimizer in the grappler will be enabled, and + // optimizations like function inlining will be applied. 
+ bool enable_grappler_function_optimizer = false; + + // Whether to enable TFRT GPU. + bool enable_tfrt_gpu = false; + + // The number of virtual GPUs to create on a physical GPU. + int tfrt_gpu_parallelism = 1; + + // if not zero, override the reserved memory space for gpu system. + int gpu_system_memory_size_in_mb = 0; + + // Whether to use gpurt.compile_and_execute for GPU. + // TODO(b/294895431): Remove the flag and default to the fused op. + bool tfrt_use_fused_gpu_op = false; + + // Runtime configuration. Refer to tensorflow::tfrt_stub::Runtime class for + // more details. It must not be nullptr; + const tensorflow::tfrt_stub::Runtime* runtime = nullptr; + + // Model metadata used for monitoring and tracing. + tensorflow::SessionMetadata model_metadata; + + // The model-specific runtime configurations. + tensorflow::tfrt_stub::RuntimeConfig runtime_config; + + // TODO(b/266251216): Maybe flip the default value. + [[deprecated( + "Use CostAnalysisOptions's `CostAnalysisOptions::ONCE` instead")]] bool + enable_online_cost_analysis = false; + + // Determines how often op costs are recorded, and how often these costs + // are used to re-compile the executable. Note to users: CostAnalysisOptions + // is overwritten when `enable_online_cost_analysis = true`. + struct CostAnalysisOptions { + enum CostAnalysisVersion { + kDisabled, + kOnce, // Cost recording and recompilation occurs on the first run only. + kPeriodic, // This is experimental. + }; + CostAnalysisVersion version = kDisabled; + + // Time between resets in Op cost estimates. Upon reset, the executable + // will be recompiled. + // However, a reset always occurs after the first execution. + absl::Duration reset_interval = absl::ZeroDuration(); + + // Number of times to record costs before resetting Op cost estimates. + // However, a reset always occurs after the first execution. + int updates_per_interval = 1; + }; + + CostAnalysisOptions cost_analysis_options; + + // If true, the MLRT interpreter will be used instead of the BEF executor. + // This option is experimental. + bool enable_mlrt = false; + + // If true, the IFRT will be used instead of the TPU Runner. + // This option is experimental. + bool use_ifrt = false; + + tensorflow::TfrtCompileOptions compile_options; +}; + +std::ostream& operator<<(std::ostream& os, + const GraphExecutionOptions& options); + +// Per-request options for graph execution. +struct GraphExecutionRunOptions { + std::optional deadline; + + // Priority of the request. Larger number means higher priority. + int priority = 0; + + // If true, the input specs will be checked before running, and an error + // will be raised upon mismatch. + bool validate_input_specs = false; + + // TODO(b/279197040) Remove after b/279197040 is fixed. + // If true, the input specs will be checked before running, and an error + // will be logged upon mismatch. + bool validate_input_specs_dry_run = false; + + // The thread pool used for this run. If it is nullptr, a default one set + // in the tensorflow::tfrt_stub::Runtime will be used. + tensorflow::tfrt_stub::WorkQueueInterface* work_queue = nullptr; + + // If true, just-in-time host compilation is disabled, and then if the + // specified graph is not compiled, the execution will return an error. + bool disable_compilation = false; + + std::function)> + streamed_output_callback; + + // The optional name for debugging purposes. If empty, the runtime will pick a + // name e.g. the joined string of input names and output names. 
+ std::string name; +}; + +// Creates the default `SessionOptions` from a `GraphExecutionOptions`. +// The created `SessionOptions` contains the Grappler configs. +tensorflow::SessionOptions CreateDefaultSessionOptions( + const GraphExecutionOptions& options); + +// Updates TPU target to fallback if bridge uncompatible, otherwise TPU runtime. +void UpdateTpuTargetByBridgeCompatibility( + tensorflow::tfrt_stub::GraphExecutionOptions& options, + const tensorflow::GraphDef& graph_def); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTION_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_executor.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_executor.h new file mode 100644 index 00000000..18375802 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/graph_executor.h @@ -0,0 +1,394 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/log/check.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tfrt/backend_compiler.h" +#include "xla/tsl/concurrency/ref_count.h" +#include "xla/tsl/lib/monitoring/sampler.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/core/tfrt/graph_executor/executable_context.h" +#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" +#include "tensorflow/core/tfrt/graph_executor/sync_resource_state.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/function.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" 
+#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tensorflow/core/tfrt/runtime/stream.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" +#include "tensorflow/core/tfrt/utils/tfrt_graph_execution_state.h" +#include "tsl/platform/thread_annotations.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/bef_executor/bef_file.h" // from @tf_runtime +#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/function.h" // from @tf_runtime +#include "tfrt/host_context/request_deadline_tracker.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// Contains request related info. +struct RequestInfo { + tfrt::RCReference tfrt_request_context; + // If this request needs to create a new queue, it is stored here. Otherwise, + // it can be nullptr. + std::unique_ptr request_queue_owner; + // The inter-op thread pool to be used for this request, and it must not be + // nullptr. If `request_queue_owner` is not nullptr, then `request_queue` is + // the raw pointer inside `request_queue_owner`. + WorkQueueInterface* request_queue = nullptr; + // The task runner used by tensorflow::OpKernel. + std::function)> runner; + + tensorflow::CancellationManager cancellation_manager; +}; + +struct SymbolUids { + std::string tf_symbol_uid; + std::string tfrt_symbol_uid; +}; + +// Creates a `RequestInfo` given relative data. +// Note: `resource_context` is per-graph-executor and +// `client_graph_resource_context` is per-loaded-client-graph. See the comment +// above `GraphExecutor::resource_context_` about the todo to merge these two. +absl::StatusOr> CreateRequestInfo( + const GraphExecutionOptions& options, + const GraphExecutionRunOptions& run_options, + tensorflow::tfrt_stub::WorkQueueInterface* work_queue, + tfrt::ResourceContext* resource_context, + tfrt::ResourceContext* client_graph_resource_context, + OpKernelRunnerTable* runner_table, + tfd::FallbackResourceArray* resource_array, FallbackState& fallback_state, + const ProcessFunctionLibraryRuntime& process_function_library_runtime, + CostRecorder* cost_recorder = nullptr); + +// Runs on a function given input/output and other info. +// Note: `resource_context` is per-graph-executor and +// `client_graph_resource_context` is per-loaded-client-graph. See the comment +// above `GraphExecutor::resource_context_` about the todo to merge these two. +// +// TODO(chky): Refactor this function to take `LoadedClientGraph` instead of +// having a long list of parameters. 
+absl::Status GraphExecutionRunOnFunction( + const GraphExecutionOptions& options, + const GraphExecutionRunOptions& run_options, + absl::string_view signature_name, const SymbolUids& symbol_uids, + const tfrt::Function* func, const mlrt::LoadedExecutable* loaded_executable, + absl::Span inputs, + std::vector* outputs, + tfrt::ResourceContext* resource_context, + tfrt::ResourceContext* client_graph_resource_context, + OpKernelRunnerTable* runner_table, + tfd::FallbackResourceArray* resource_array, const Runtime& runtime, + FallbackState& fallback_state, + const tensorflow::ProcessFunctionLibraryRuntime& + process_function_library_runtime, + tfrt::RequestDeadlineTracker* req_deadline_tracker, + std::optional stream_callback_id, + CostRecorder* cost_recorder = nullptr); + +// Runs a MLRT function for executing tensorflow graphs. +absl::Status RunMlrtFunction( + mlrt::bc::Function function, + const mlrt::LoadedExecutable& loaded_executable, + const tsl::RCReference& request_context, + tfrt::ConcurrentWorkQueue& work_queue, + absl::Span inputs, + std::vector* outputs, + SyncResourceState* sync_resource_state); + +// Loads (if not yet) and runs a subgraph in a graph as per each request. +class GraphExecutor { + public: + using Options = GraphExecutionOptions; + using RunOptions = GraphExecutionRunOptions; + + // The loading result of a `ClientGraph`. + class LoadedClientGraph { + public: + LoadedClientGraph(std::string name, SymbolUids symbol_uids, + GraphExecutor* graph_executor, + std::unique_ptr mlir_context, + mlir::OwningOpRef tf_mlir_with_op_keys, + mlir::OwningOpRef tfrt_mlir, + std::shared_ptr executable_context, + std::optional stream_callback_id, + bool is_restore, FunctionLibraryDefinition flib_def, + tsl::monitoring::SamplerCell* latency_sampler); + + // Returns this instance's CostRecorder if it is time to update costs, + // else returns nullptr. Only allows one non-null return value at a time + // in order to provide thread-safety. If do_recompilation becomes `true`, + // then recompiles using updated costs occurs. + CostRecorder* MaybeGetCostRecorder(absl::Time now, bool* do_recompilation); + // Updates the op cost values in this `LoadedClientGraph` with records from + // `cost_recorder`. + absl::Status UpdateCost(const CostRecorder& cost_recorder, + const Runtime& runtime); + // Updates `cost_analysis_data_` to make it accurate for the next execution. + // Assumes a cost update occurred this cycle. + void UpdateCostAnalysisData(absl::Time now, bool do_recompilation); + // Getters. + std::shared_ptr executable_context() const { + tensorflow::mutex_lock lock(executable_context_mu_); + return executable_context_; + } + absl::string_view name() const { return name_; } + const SymbolUids& symbol_uids() const { return symbol_uids_; } + + OpKernelRunnerTable& runner_table() { return runner_table_; } + tfd::FallbackResourceArray& resource_array() { return resource_array_; } + SyncResourceState& sync_resource_state() { return sync_resource_state_; } + + std::optional stream_callback_id() const { + return stream_callback_id_; + } + + bool is_restore() const { return is_restore_; } + + const ProcessFunctionLibraryRuntime& process_function_library_runtime() + const { + return pflr_; + } + tsl::monitoring::SamplerCell* latency_sampler() { return latency_sampler_; } + + private: + std::string name_; + SymbolUids symbol_uids_; + GraphExecutor* graph_executor_ = nullptr; + + // `mlir_context_` is declared here because the resources declared later may + // hold references to the MLIR objects. 
+ std::unique_ptr mlir_context_; + + struct CostAnalysisData { + mutable tensorflow::mutex mu; + // Ensures only one GraphExecutor thread updates costs at a time. + bool is_available TF_GUARDED_BY(mu) = false; + // Maintains the book-keeping of op costs. + std::unique_ptr cost_recorder; + // For recompilation in MLRT, TFRT respectively. + mlir::OwningOpRef tf_mlir_with_op_keys; + mlir::OwningOpRef tfrt_mlir; + // Start of current cost measurement cycle. + absl::Time start_time TF_GUARDED_BY(mu) = absl::Now(); + // Cost recordings within the current measurement cycle. + int num_cost_updates TF_GUARDED_BY(mu) = 0; + }; + CostAnalysisData cost_analysis_data_; + + OpKernelRunnerTable runner_table_; + tfd::FallbackResourceArray resource_array_; + mutable tensorflow::mutex executable_context_mu_; + // Can be updated if online cost analysis is enabled. + std::shared_ptr executable_context_ + TF_GUARDED_BY(executable_context_mu_); + SyncResourceState sync_resource_state_; + + std::optional stream_callback_id_; + bool is_restore_; + FunctionLibraryDefinition flib_def_; + ProcessFunctionLibraryRuntime pflr_; + tsl::monitoring::SamplerCell* latency_sampler_; + }; + + // A subgraph constructed by specifying input/output tensors. + struct ClientGraph { + // The human-readable name for the graph, e.g. the signature_name in the + // saved model. + std::string name; + // The feed nodes for the corresponding inputs, but they might not be in the + // original order and if there are more than one original inputs mapped to + // the same feed node, only one is picked here. + tensorflow::GraphImportConfig::InputArrays input_nodes; + // The fetch nodes for the outputs, which should be in the original order. + std::vector output_nodes; + // The target nodes that should be run but not returned as outputs. + std::vector target_nodes; + }; + + // Creates a `GraphExecutor` given the args. + static absl::StatusOr> Create( + Options options, std::unique_ptr fallback_state, + std::unique_ptr resource_context, + tensorflow::GraphDef graph_def, + std::unique_ptr kernel_registry, + tensorflow::tfrt_stub::RuntimeConfig* runtime_config = nullptr); + + // Ctor. Public for `Create()`. Do not use directly. + GraphExecutor(Options options, std::unique_ptr fallback_state, + std::unique_ptr resource_context, + std::unique_ptr + graph_execution_state, + std::unique_ptr kernel_registry, + tensorflow::tfrt_stub::RuntimeConfig* runtime_config = nullptr); + + // Runs on the graph according to given input/output. + absl::Status Run( + const RunOptions& run_options, + absl::Span> inputs, + absl::Span output_tensor_names, + absl::Span target_tensor_names, + std::vector* outputs); + + // Runs the graph identified by `graph_name` using the input `inputs` and + // stores the output of the execution in `outputs`. It is the client's + // responsibility to ensure `graph_name` corresponds to logically different + // graphs, since this name is used to lookup compiled graphs in the cache. The + // graph is run synchronously with the TFRT interpreter. + absl::Status RunWithSyncInterpreter( + const std::string& graph_name, absl::Span input_values, + absl::Span input_names, + absl::Span input_dtypes, + absl::Span output_tensor_names, + absl::Span target_tensor_names, + absl::Span outputs); + + // Extends the current graph by `graph`. + absl::Status Extend(const GraphDef& graph); + + tensorflow::tfrt_stub::TfrtGraphExecutionState& graph_execution_state() + const { + return *graph_execution_state_; + } + + // Returns the underlying runtime. 
+ const tensorflow::tfrt_stub::Runtime& runtime() const { + DCHECK(options_.runtime); + return *options_.runtime; + } + + tfrt::ResourceContext& resource_context() { return *resource_context_; } + + const Options& options() const { return options_; } + const FallbackState& fallback_state() const { return *fallback_state_; } + FallbackState& fallback_state() { return *fallback_state_; } + + // Compiles graph for `graph_name` and runs any initializers. + absl::Status CompileGraph( + const std::string& graph_name, + absl::Span input_tensor_names, + absl::Span input_tensor_dtypes, + absl::Span output_tensor_names, + absl::Span target_tensor_names); + + const mlrt::KernelRegistry& kernel_registry() const { + return *kernel_registry_; + } + + private: + // A set of methods to load a client graph. + absl::StatusOr> + LoadClientGraph( + const GraphExecutor::ClientGraph& client_graph, + tensorflow::tfrt_stub::WorkQueueInterface* work_queue, + absl::Span> inputs); + absl::StatusOr> + ImportAndCompileClientGraph( + const GraphExecutor::ClientGraph& client_graph, + absl::Span> inputs); + absl::StatusOr< + std::pair>> + ImportClientGraphToMlirModule(const GraphExecutor::ClientGraph& client_graph, + mlir::MLIRContext* context) const; + absl::StatusOr CompileMlirModuleToBef( + mlir::ModuleOp module) const; + + absl::Status InitBef(LoadedClientGraph* loaded_client_graph, + tensorflow::tfrt_stub::WorkQueueInterface* work_queue); + + absl::Status InitBytecode(LoadedClientGraph* loaded_graph); + + // Returns a `LoadedClientGraph` given input/output tensor info. If there is + // no existing one yet, creates one first. + absl::StatusOr> + GetOrCreateLoadedClientGraph( + const RunOptions& run_options, + absl::Span input_tensor_names, + absl::Span input_tensor_dtypes, + absl::Span output_tensor_names, + absl::Span target_tensor_names, + tensorflow::tfrt_stub::WorkQueueInterface* work_queue, + absl::string_view graph_name = "", + absl::Span> inputs = {}) + TF_LOCKS_EXCLUDED(loaded_client_graphs_mu_); + + Options options_; + std::unique_ptr fallback_state_; + + std::unique_ptr + graph_execution_state_; + + tfrt::RequestDeadlineTracker req_deadline_tracker_; + + tensorflow::mutex loaded_client_graphs_mu_; + // Caches `LoadedClientGraph` by the joined name. + // For pointer stability of values in `absl::flat_hash_map<>`, additional + // `std::unique_ptr<>` is necessary. (See https://abseil.io/tips/136.) + absl::flat_hash_map> + loaded_client_graphs_ TF_GUARDED_BY(loaded_client_graphs_mu_); + + std::unique_ptr kernel_registry_; + + std::unique_ptr resource_context_; + + protected: + // For testing basic Cost Analysis functionality. + absl::Duration simulated_duration_ = absl::ZeroDuration(); + tensorflow::mutex num_recompilations_mu_; + int num_recompilations_ TF_GUARDED_BY(num_recompilations_mu_) = 0; +}; + +void RegisterMlirDialect(mlir::DialectRegistry& registry, + tensorflow::BackendCompiler* backend_compiler); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/sync_resource_state.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/sync_resource_state.h new file mode 100644 index 00000000..9bb9bb58 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/graph_executor/sync_resource_state.h @@ -0,0 +1,66 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
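For context, a minimal sketch (not part of the patch) of building the Options (an alias of GraphExecutionOptions from graph_execution_options.h earlier in this patch) that a GraphExecutor is created with, using only fields declared there. `runtime` is assumed to be a live tensorflow::tfrt_stub::Runtime owned elsewhere.

#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h"

tensorflow::tfrt_stub::GraphExecutionOptions MakeOptions(
    const tensorflow::tfrt_stub::Runtime* runtime) {
  // The constructor DCHECKs that `runtime` is non-null.
  tensorflow::tfrt_stub::GraphExecutionOptions options(runtime);
  options.enable_mlrt = true;  // opt into the experimental MLRT interpreter
  // Record op costs on the first run only, then recompile with them.
  options.cost_analysis_options.version = tensorflow::tfrt_stub::
      GraphExecutionOptions::CostAnalysisOptions::kOnce;
  return options;
}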
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_SYNC_RESOURCE_STATE_H_ +#define TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_SYNC_RESOURCE_STATE_H_ + +#include +#include + +#include "tensorflow/core/tfrt/utils/any_ptr.h" +#include "tfrt/tensor/dense_host_tensor.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +class SyncResourceState { + public: + // Sets `dht` in the array at `index`. `index` should be dense and + // duplicate indices are not allowed. + void SetResourceDht(int index, tfrt::DenseHostTensor dht) { + if (resource_dht_.size() <= index) { + resource_dht_.resize(index + 1); + } + + resource_dht_[index] = std::move(dht); + } + + tfrt::DenseHostTensor GetResourceDht(int index) const { + return resource_dht_.at(index).CopyRef(); + } + + template + void Set(int index, T* resource) { + if (resources_.size() <= index) { + resources_.resize(index + 1); + } + + resources_[index] = tfrt::AnyPtr(resource); + } + + template + T* Get(int index) const { + return resources_.at(index).get(); + } + + private: + std::vector resource_dht_; + // TODO(b/288899457): Consider provide a simpler solution than forking AnyPtr. + std::vector resources_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_GRAPH_EXECUTOR_SYNC_RESOURCE_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/checkpoint_loader.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/checkpoint_loader.h new file mode 100644 index 00000000..e47c78bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/checkpoint_loader.h @@ -0,0 +1,83 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
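For context, a minimal sketch (not part of the patch) of the typed Set/Get pair on SyncResourceState above. `Counters` and index 0 are illustrative; the stored pointer is not owned by SyncResourceState and must outlive it.

#include <cstdint>

#include "tensorflow/core/tfrt/graph_executor/sync_resource_state.h"

struct Counters {
  int64_t num_runs = 0;
};

void BumpRunCount(tensorflow::tfrt_stub::SyncResourceState& state,
                  Counters* counters) {
  state.Set(/*index=*/0, counters);              // indices should stay dense
  ++state.Get<Counters>(/*index=*/0)->num_runs;  // typed lookup at that index
}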
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_IFRT_CHECKPOINT_LOADER_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_CHECKPOINT_LOADER_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/kernel/context.h" +#include "tensorflow/core/tfrt/utils/fallback_tensor.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace tensorflow { +namespace ifrt_serving { + +// TODO(b/352551302) Move the unit test in ifrt_ops_kernel for restore to test +// this class's APIs. +// Implement the `CheckpointLoaderInterface` by using RestoreV2. +class CheckpointLoader { + public: + struct PrepareRestoreArgs { + mlir::MLIRContext* context; + tensorflow::MetaGraphDef meta_graph_def; + tfrt_stub::FallbackState* fallback_state; + std::string saved_model_dir; + bool run_placer_grappler_on_functions; + }; + + explicit CheckpointLoader( + IfrtRestoreTensorRegistry* ifrt_restore_tensor_registry, + tfrt::ConcurrentWorkQueue* checkpoint_loader_work_queue, + bool use_async_restore = true) + : ifrt_restore_tensor_registry_(ifrt_restore_tensor_registry), + checkpoint_loader_work_queue_(checkpoint_loader_work_queue), + use_async_restore_(use_async_restore) {} + virtual ~CheckpointLoader() = default; + + // Called before `Load` to do some preparation work. + virtual absl::Status PrepareRestore(const PrepareRestoreArgs& args); + + // Load the checkpoint. This API is designed to be compatible with the + // `tf_mlrt.ifrt_restore_variable` kernel. + virtual absl::Status Load( + const tensorflow::tfrt_stub::FallbackTensor& prefix, + const std::vector& var_handles, + const tensorflow::tfrt_stub::FallbackTensor& tensor_names, + const tensorflow::tfrt_stub::FallbackTensor& shape_and_slices, + absl::Span restored_dtypes, + const std::vector& truncate_in_cast, tf_mlrt::Context& context); + + protected: + IfrtRestoreTensorRegistry* ifrt_restore_tensor_registry_; + tfrt::ConcurrentWorkQueue* checkpoint_loader_work_queue_; + bool use_async_restore_ = true; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_CHECKPOINT_LOADER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/grid.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/grid.h new file mode 100644 index 00000000..28e52809 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/grid.h @@ -0,0 +1,77 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
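For context, a minimal sketch (not part of the patch) of constructing the CheckpointLoader declared above. Both pointers are assumed to outlive the loader; the last argument selects asynchronous restore.

#include <memory>

#include "tensorflow/core/tfrt/ifrt/checkpoint_loader.h"

std::unique_ptr<tensorflow::ifrt_serving::CheckpointLoader> MakeLoader(
    tensorflow::ifrt_serving::IfrtRestoreTensorRegistry* restore_registry,
    tfrt::ConcurrentWorkQueue* restore_work_queue) {
  // Per the comments above, PrepareRestore() is the step to call before Load().
  return std::make_unique<tensorflow::ifrt_serving::CheckpointLoader>(
      restore_registry, restore_work_queue, /*use_async_restore=*/true);
}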
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_GRID_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_GRID_H_ + +#include +#include + +#include "absl/log/check.h" +#include "absl/strings/str_format.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Coordinates that identify a particular point in a 4-d grid (usually a TPU +// topology). +struct GridCoords { + int dim[4]; + + constexpr GridCoords(int d0, int d1, int d2, int d3) : dim{d0, d1, d2, d3} {} + GridCoords() : GridCoords(0, 0, 0, 0) {} + + static GridCoords Zeroes() { return GridCoords(0, 0, 0, 0); } + static GridCoords Ones() { return GridCoords(1, 1, 1, 1); } + + int operator[](int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, 4); + return dim[i]; + } + + int& operator[](int i) { + DCHECK_GE(i, 0); + DCHECK_LT(i, 4); + return dim[i]; + } + + int Product() const { return dim[0] * dim[1] * dim[2] * dim[3]; } + + std::string ToString() const; + + template + friend void AbslStringify(Sink& sink, const GridCoords& value) { + absl::Format(&sink, "%s", value.ToString()); + } + + friend bool operator==(const GridCoords& a, const GridCoords& b) { + return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]; + } + + friend std::ostream& operator<<(std::ostream& os, const GridCoords& c) { + return os << c.ToString(); + } + + template + friend H AbslHashValue(H h, const GridCoords& c) { + return H::combine(std::move(h), c[0], c[1], c[2], c[3]); + } +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_GRID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_device_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_device_utils.h new file mode 100644 index 00000000..f779aa62 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_device_utils.h @@ -0,0 +1,63 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
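For context, a minimal sketch (not part of the patch) exercising GridCoords as declared above; the coordinate values are illustrative.

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/tfrt/ifrt/grid.h"

bool GridCoordsExample() {
  using tensorflow::ifrt_serving::GridCoords;
  constexpr GridCoords kTopology(2, 2, 1, 1);
  // Product() gives the number of grid points: 2 * 2 * 1 * 1 = 4.
  const int num_points = kTopology.Product();
  // AbslHashValue and operator== make GridCoords usable as a hash-set key.
  absl::flat_hash_set<GridCoords> seen;
  seen.insert(kTopology);
  return num_points == 4 && seen.contains(GridCoords(2, 2, 1, 1));
}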
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_DEVICE_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_DEVICE_UTILS_H_ + +#include +#include + +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/executable.h" +#include "xla/python/ifrt/host_callback.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_config.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Returns the assigned IFRT devices based on the device assignment attribute. +// +// params: +// ifrt_client: The ifrt client. +// num_replicas: The number of replicas. +// num_cores_per_replica: The number of cores per replica. +// +// device_assignment: The device assignment array encoded as +// [x0,y0,z0,core0,x1,y1,z1,core1, ...]. Optional. If not provided, the +// devices will be assigned based on the default order returned by the IFRT +// client. +// +// returns: +// The assigned devices. +absl::StatusOr> GetAssignedIfrtDevices( + const xla::ifrt::Client& ifrt_client, int num_replicas, + int num_cores_per_replica, + std::optional> device_assignment); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_DEVICE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h new file mode 100644 index 00000000..25b9e6c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h @@ -0,0 +1,104 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_EXECUTABLE_REGISTRY_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_EXECUTABLE_REGISTRY_H_ + +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Maintains a process-wide map from program ids to executables. Used by the +// `IfrtCall` TensorFlow op to look up executables and invoke them. +// +// Invoking a TPU program inside a `IfrtCall` TF op requires being +// able to retrieve an executable for the given program. 
Since there's no easy +// way to pass non-serializable attributes to TF ops, we encode a program id +// (that is unique within a process) as an attribute of a `IfrtCall` op and +// use this registry class to let the `IfrtCall` op look up an executable +// during TF op execution. +class ServingExecutableRegistry { + public: + // RAII handle for registered executables. + class Handle { + public: + Handle(); // Constructs an empty handle. + + // Move only. + Handle(Handle&& other); + Handle& operator=(Handle&& other); + Handle(const Handle&) = delete; + Handle& operator=(const Handle&) = delete; + + ~Handle(); + + // Returns the program id that the handle represents, or `std::nullopt` if + // the handle is empty. + std::optional program_id() const { return program_id_; } + + // Unregisters the owned executable, if any, early (before the destructor). + // Calling this method multiple times is a no-op. + void Release(); + + // Freezes the program's compilation. After Freeze() is called, no new model + // signature will be compiled. Using a signature or an input shape that + // wasn't compiled before the freeze will lead to an error. + absl::Status Freeze(); + + private: + friend class ServingExecutableRegistry; + + // Can only be constructed by `ServingExecutableRegistry::Register()`. + explicit Handle(int64_t program_id); + + // Program id. `std::nullopt` if the handle is already released. + std::optional program_id_; + }; + + // Registers an executable under the given program id. Returns an RAII handle + // that unregisters the program at its destruction. + static absl::StatusOr Register( + int64_t program_id, std::unique_ptr executable); + + // Looks up an executable registered under the given program id, or returns + // nullptr if there's no such program. + static IfrtServingExecutable* Lookup(int64_t program_id); + + private: + friend class Handle; + friend class IfrtBackendCompilerTest; + + static absl::Mutex mu_; + + // Mapping from program ids to executables. + static absl::flat_hash_map>* const + executables_ ABSL_GUARDED_BY(&mu_); +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_EXECUTABLE_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h new file mode 100644 index 00000000..d488d936 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h @@ -0,0 +1,98 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
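For context, a minimal sketch (not part of the patch) of the register/lookup flow above. The template arguments lost in extraction are assumed here to be absl::StatusOr<Handle> for Register() and std::unique_ptr<IfrtServingExecutable> for its second parameter, consistent with the surrounding declarations.

#include <cstdint>
#include <memory>
#include <utility>

#include "tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h"

namespace ifrt = tensorflow::ifrt_serving;

absl::StatusOr<ifrt::ServingExecutableRegistry::Handle> RegisterProgram(
    int64_t program_id,
    std::unique_ptr<ifrt::IfrtServingExecutable> executable) {
  // The returned RAII Handle unregisters the program when it is destroyed
  // (or when Release() is called explicitly).
  return ifrt::ServingExecutableRegistry::Register(program_id,
                                                   std::move(executable));
}

ifrt::IfrtServingExecutable* FindProgram(int64_t program_id) {
  // Lookup() returns nullptr when nothing is registered under `program_id`.
  return ifrt::ServingExecutableRegistry::Lookup(program_id);
}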
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_REGISTRY_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_REGISTRY_H_ + +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/functional/any_invocable.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/synchronization/mutex.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/future.h" +#include "xla/tsl/concurrency/ref_count.h" + +namespace tensorflow { +namespace ifrt_serving { + +// This class is thread safe. +class IfrtLoadedVariableRegistry { + public: + // The key is per variable tensor per device assignment. For single -device + // program, variables can be loaded on multiple devices with core selection. + // For SPMD program, we currently assume all devices will be used, so we use + // vector to make it compatible with SPMD. + struct Key { + // We use a vector to make it compatible with SPMD because the order of the + // devices used for sharding must match the order of the devices used for + // xla compilation. + std::vector device_ids; + std::string input_name; + xla::HloSharding hlo_sharding; + template + friend H AbslHashValue(H h, const Key& key) { + h = H::combine(std::move(h), key.input_name, key.device_ids, + key.hlo_sharding); + return h; + } + + friend bool operator==(const Key& x, const Key& y) { + return x.input_name == y.input_name && x.device_ids == y.device_ids && + x.hlo_sharding == y.hlo_sharding; + } + + std::string ToString() const { + return absl::StrCat(input_name, ":", absl::StrJoin(device_ids, ","), ":", + hlo_sharding.ToString()); + } + }; + + struct LoadedVariable { + xla::ifrt::Future> array; + }; + using LoadedVariableConstructor = + absl::AnyInvocable() const>; + + // Tries to register a loaded variable with the given name. + // Returns an error if the named array does not already exists and + // loaded_variable_constructor failed to create an array. Note that it returns + // OK if the named array already exists. + // loaded_variable_constructor is invoked in the caller thread. + absl::Status TryRegisterLoadedVariable( + const Key& key, LoadedVariableConstructor&& loaded_variable_constructor) + ABSL_LOCKS_EXCLUDED(mutex_); + + absl::StatusOr GetLoadedVariable(const Key& key) const + ABSL_LOCKS_EXCLUDED(mutex_); + + private: + mutable absl::Mutex mutex_; + absl::flat_hash_map loaded_variable_map_ + ABSL_GUARDED_BY(mutex_); +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_utils.h new file mode 100644 index 00000000..6fea3a57 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
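For context, a sketch (not part of the patch) of building the registry Key above and querying it. The element type of `device_ids` and the GetLoadedVariable return type were lost in extraction; they are assumed here to be int and absl::StatusOr<LoadedVariable>. The variable name and device order are illustrative.

#include "xla/hlo/ir/hlo_sharding.h"
#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h"

absl::StatusOr<
    tensorflow::ifrt_serving::IfrtLoadedVariableRegistry::LoadedVariable>
FindLoadedVariable(
    const tensorflow::ifrt_serving::IfrtLoadedVariableRegistry& registry) {
  tensorflow::ifrt_serving::IfrtLoadedVariableRegistry::Key key{
      /*device_ids=*/{0, 1},
      /*input_name=*/"my_model/dense/kernel",
      /*hlo_sharding=*/xla::HloSharding::Replicate()};
  // Fails until TryRegisterLoadedVariable() has run for this key.
  return registry.GetLoadedVariable(key);
}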
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_UTILS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/python/ifrt/client.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_config.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" +#include "tsl/platform/threadpool.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace tensorflow { +namespace ifrt_serving { + +// An index to indicate a non per-core executable bundle cache. +inline constexpr int kNoCoreSelectedIndex = -1; + +// TODO(b/352551302) Delete VariableDeviceShardingConfigProto. +struct VariableDeviceShardingConfig { + std::vector device_ids; + xla::HloSharding hlo_sharding; +}; + +absl::StatusOr GetDtypeAndShape( + const ResourceHandle& resource_handle); + +// Returns the runtime name from the resource handle. The name will be concat of +// handle's container name and handle's name. +std::string GetRuntimeNameFromVarHandle(const ResourceHandle& handle); + +// Loads a restored tensor as an IFRT loaded variable and set the restored +// tensor in the `restored_tensor_promise` as output. It is an async loading. We +// look for the restored tensor in `ifrt_restore_tensor_registry` and save a +// future of IFRT loaded variable in `ifrt_loaded_variable_registry`. The caller +// can look for the actual loaded variable value in +// `ifrt_loaded_variable_registry`. +absl::Status AsyncLoadRestoredTensorAsIfrtLoadedVariable( + absl::string_view runtime_name, + std::shared_ptr ifrt_client, + const tsl::thread::ThreadPool& thread_pool, + const ifrt_serving::IfrtRestoreTensorRegistry& ifrt_restore_tensor_registry, + ifrt_serving::IfrtLoadedVariableRegistry& ifrt_loaded_variable_registry, + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue, + const VariableDeviceShardingConfig& sharding_config); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_LOADED_VARIABLE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_context.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_context.h new file mode 100644 index 00000000..7c41a947 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_context.h @@ -0,0 +1,191 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_CONTEXT_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_CONTEXT_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/executable.h" +#include "xla/python/ifrt/topology.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_config.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h" +#include "tsl/platform/protobuf.h" +#include "tsl/platform/threadpool.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace tensorflow { +namespace ifrt_serving { + +inline constexpr absl::string_view kIfrtModelContextName = "IfrtModelContext"; + +// Device specific configuration not available through ifrt. This should be +// rare. +struct DeviceConfig { + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn = + tensorflow::IdentityShapeRepresentationFn(); +}; + +// The runtime context for ifrt to be used in TFRT serving. +// +// This class is thread compatible. 
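+// Illustrative construction sketch (not part of the upstream header;
+// `client`, `core_selector`, `thread_pool` and `handle` are placeholders):
+//
+//   IfrtModelContext model_context(client, core_selector, thread_pool,
+//                                  /*compilation_environment_proto=*/nullptr);
+//   // Handles returned by ServingExecutableRegistry::Register() are stored
+//   // here so the executables stay registered for the model's lifetime.
+//   model_context.RegisterHandle(std::move(handle));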
+class IfrtModelContext { + public: + explicit IfrtModelContext( + std::shared_ptr client, + IfrtServingCoreSelector* ifrt_serving_core_selector, + tsl::thread::ThreadPool* thread_pool, + std::unique_ptr compilation_environment_proto) + : client_(std::move(client)), + ifrt_serving_core_selector_(ifrt_serving_core_selector), + thread_pool_(*thread_pool), + compilation_environment_proto_( + std::move(compilation_environment_proto)) {} + IfrtModelContext( + std::shared_ptr client, + IfrtServingCoreSelector* ifrt_serving_core_selector, + tsl::thread::ThreadPool* thread_pool, tensorflow::DeviceMgr* device_mgr, + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + std::unique_ptr compilation_environment_proto, + std::shared_ptr topology, TfToHloCompiler* tf_to_hlo_compiler, + IfrtPersistentCompilationCache* persistent_compilation_cache = nullptr) + : client_(std::move(client)), + topology_(topology), + ifrt_serving_core_selector_(ifrt_serving_core_selector), + thread_pool_(*thread_pool), + device_mgr_(device_mgr), + shape_representation_fn_(shape_representation_fn), + compilation_environment_proto_( + std::move(compilation_environment_proto)), + tf_to_hlo_compiler_(tf_to_hlo_compiler), + persistent_compilation_cache_(persistent_compilation_cache) {} + + void RegisterHandle(ServingExecutableRegistry::Handle handle) { + handles_.push_back(std::move(handle)); + } + + std::shared_ptr GetClient() const { return client_; } + + const tensorflow::XlaHelpers::ShapeRepresentationFn& + GetShapeRepresentationFn() const { + return shape_representation_fn_; + } + + tsl::thread::ThreadPool& GetThreadPool() const; + + const IfrtLoadedVariableRegistry& GetLoadedVariableRegistry() const { + return loaded_variable_registry_; + } + IfrtLoadedVariableRegistry& GetLoadedVariableRegistry() { + return loaded_variable_registry_; + } + + const IfrtRestoreTensorRegistry& GetRestoreTensorRegistry() const { + return restore_tensor_registry_; + } + IfrtRestoreTensorRegistry& GetRestoreTensorRegistry() { + return restore_tensor_registry_; + } + + IfrtPersistentCompilationCache* GetPersistentCompilationCache() const { + return persistent_compilation_cache_; + } + + tensorflow::DeviceMgr* GetDeviceMgr() const { return device_mgr_; } + IfrtServingCoreSelector* GetIfrtServingCoreSelector() const { + return ifrt_serving_core_selector_; + } + + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue() const { + return checkpoint_loader_queue_; + } + void set_checkpoint_loader_queue(tfrt::ConcurrentWorkQueue* work_queue) { + checkpoint_loader_queue_ = work_queue; + } + + void set_default_signature_inputs( + const DefaultSignatureInputConfig& default_signature_inputs) { + default_signature_inputs_ = default_signature_inputs; + } + + const DefaultSignatureInputConfig& default_signature_inputs() const { + return default_signature_inputs_; + } + + tsl::protobuf::Message* GetCompilationEnvironmentProto() const { + return compilation_environment_proto_.get(); + } + + TfToHloCompiler* GetTfToHloCompiler() const { return tf_to_hlo_compiler_; } + + // Freeze the model: release the resources such as host tensors that are used + // by the device only. The caller guarantees all resources released in this + // function is no longer in use in regular execution path. + // After Freeze() is called, no new model signature will be compiled. Using a + // signature or an input shape that wasn't compiled before the freeze will + // leads to an error. 
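+  // Illustrative call order (a sketch, not from the upstream header): freeze
+  // only once every signature that will ever be served has been compiled.
+  //
+  //   TF_RETURN_IF_ERROR(model_context.Freeze());
+  //   DCHECK(model_context.IsFrozen());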
+ absl::Status Freeze(); + + bool IsFrozen() const { return frozen_; } + + private: + std::shared_ptr client_; + // Keep hardware specific topology info alive. This is currently used for + // shape determination. + std::shared_ptr topology_; + + IfrtServingCoreSelector* ifrt_serving_core_selector_; // May be nullptr + tsl::thread::ThreadPool& thread_pool_; + + tensorflow::DeviceMgr* device_mgr_ = nullptr; // Not owned. + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn_ = + tensorflow::IdentityShapeRepresentationFn(); + std::unique_ptr compilation_environment_proto_ = + nullptr; + + // Dedicated work queue for heavy task such as variable tensor restoration. + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue_ = nullptr; + + std::vector handles_; + + DefaultSignatureInputConfig default_signature_inputs_; + + IfrtLoadedVariableRegistry loaded_variable_registry_; + IfrtRestoreTensorRegistry restore_tensor_registry_; + TfToHloCompiler* tf_to_hlo_compiler_ = nullptr; + IfrtPersistentCompilationCache* persistent_compilation_cache_ = nullptr; + bool frozen_ = false; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_restore_context.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_restore_context.h new file mode 100644 index 00000000..da9528ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_model_restore_context.h @@ -0,0 +1,50 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_RESTORE_CONTEXT_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_RESTORE_CONTEXT_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/tfrt/ifrt/checkpoint_loader.h" + +namespace tensorflow { +namespace ifrt_serving { + +inline constexpr absl::string_view kIfrtModelRestoreContextName = + "IfrtModelRestoreContext"; + +// A resource context that holds the `CheckpointLoader` for a model. We need a +// different context than `IfrtModelContext` because `IfrtModelContext` is too +// large to be a dependency of other libraries. 
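+// Illustrative sketch (not part of the upstream header; `loader` stands for a
+// std::unique_ptr<CheckpointLoader> built elsewhere):
+//
+//   auto restore_context =
+//       std::make_unique<IfrtModelRestoreContext>(std::move(loader));
+//   CheckpointLoader* checkpoint_loader = restore_context->checkpoint_loader();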
+class IfrtModelRestoreContext { + public: + explicit IfrtModelRestoreContext( + std::unique_ptr checkpoint_loader) + : checkpoint_loader_(std::move(checkpoint_loader)) {} + + CheckpointLoader* checkpoint_loader() const { + return checkpoint_loader_.get(); + } + + private: + std::unique_ptr checkpoint_loader_; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_MODEL_RESTORE_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h new file mode 100644 index 00000000..56c76329 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h @@ -0,0 +1,75 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_PERSISTENT_COMPILATION_CACHE_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_PERSISTENT_COMPILATION_CACHE_H_ + +#include +#include + +#include "absl/functional/any_invocable.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h" +#include "xla/pjrt/pjrt_executable.h" +#include "xla/python/ifrt/device_list.h" +#include "xla/python/ifrt/executable.h" +#include "xla/python/ifrt/hlo/hlo_program.h" +#include "xla/python/ifrt/host_callback.h" +#include "xla/python/ifrt/program.h" +#include "xla/tsl/concurrency/ref_count.h" +namespace tensorflow { +namespace ifrt_serving { + +class IfrtPersistentCompilationCache { + public: + IfrtPersistentCompilationCache() = default; + virtual ~IfrtPersistentCompilationCache() = default; + + // The implementation of this API should be thread-safe. It generates a key + // for looking up the executable in the persistent cache and it will return + // the LoadedExecutable if hits cache. Otherwise, it will call the `value_fn` + // to generate and return the LoadedExecutable. + virtual absl::StatusOr> + LookupLoadedExecutableOrCreate( + std::unique_ptr hlo_program, + tsl::RCReference device_list, + const xla::CompileOptions& xla_compile_options, + const std::vector>& + loaded_host_callbacks, + xla::ifrt::Client* client, + absl::AnyInvocable< + absl::StatusOr>( + std::unique_ptr program, + std::unique_ptr options)> + value_fn); + + // The implementation of this API should be thread-safe. It generates a key + // for looking up the Tf2HloResult in the persistent cache and it will return + // the Tf2HloResult if hits cache. Otherwise, it will call the `value_fn` to + // generate and return the Tf2HloResult. 
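+  // Illustrative call sketch (not part of the upstream header; `cache`,
+  // `arg` and `compiler` are placeholders):
+  //
+  //   TF_ASSIGN_OR_RETURN(Tf2HloResult result,
+  //                       cache->LookupTf2HloResultOrCreate(std::move(arg),
+  //                                                         compiler));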
+ virtual absl::StatusOr LookupTf2HloResultOrCreate( + Tf2HloArg tf2hlo_arg, TfToHloCompiler* tf_to_hlo_compiler); + + virtual bool IsXlaCompilationCacheEnabled() const { return false; } + virtual bool IsTf2HloCompilationCacheEnabled() const { return false; } +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_PERSISTENT_COMPILATION_CACHE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h new file mode 100644 index 00000000..73b7fec3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h @@ -0,0 +1,74 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_RESTORE_TENSOR_REGISTRY_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_RESTORE_TENSOR_REGISTRY_H_ + +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "xla/python/ifrt/future.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +// This class is thread safe. +class IfrtRestoreTensorRegistry { + public: + struct RestoredTensorInfo { + bool used_by_host = false; + DtypeAndShape dtype_and_shape; + xla::ifrt::Future tensor_future; + }; + // Tries to register a loaded variable with the given name. + // Returns an error if the named tensor already exists. + absl::Status TryRegister(absl::string_view name, + RestoredTensorInfo restored_tensor_info) + ABSL_LOCKS_EXCLUDED(mutex_); + + xla::ifrt::Future GetRestoredTensor( + absl::string_view name) const ABSL_LOCKS_EXCLUDED(mutex_); + + // Sets the tensor as used by the host. To ensure a tensor's host memory + // is released, this function must be called at least once before the Freeze. + absl::Status SetUsedByHost(absl::string_view name) + ABSL_LOCKS_EXCLUDED(mutex_); + + absl::StatusOr GetDtypeAndShape(absl::string_view name) const + ABSL_LOCKS_EXCLUDED(mutex_); + + // Part of freezing the model is to release the host tensors that are used by + // the device only. The caller guarantees those tensors are already loaded to + // the device. 
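+  // Illustrative lifecycle sketch (not part of the upstream header; `info`
+  // is a placeholder RestoredTensorInfo):
+  //
+  //   TF_RETURN_IF_ERROR(registry.TryRegister("dense/kernel", info));
+  //   // Mark tensors that the host still needs so Freeze() keeps them.
+  //   TF_RETURN_IF_ERROR(registry.SetUsedByHost("dense/kernel"));
+  //   registry.Freeze();  // Releases host copies that are device-only.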
+ void Freeze() ABSL_LOCKS_EXCLUDED(mutex_); + + private: + mutable absl::Mutex mutex_; + absl::flat_hash_map restored_tensors_ + ABSL_GUARDED_BY(mutex_); +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_RESTORE_TENSOR_REGISTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h new file mode 100644 index 00000000..a4505cba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h @@ -0,0 +1,53 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_CORE_SELECTOR_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_CORE_SELECTOR_H_ + +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/synchronization/mutex.h" +#include "xla/tsl/framework/serving_device_selector.h" +namespace tensorflow { +namespace ifrt_serving { + +// A wrapper of a `tsl::ServingDeviceSelector` that will be responsible for the +// core selection during Ifrt TPU execution. +class IfrtServingCoreSelector { + public: + explicit IfrtServingCoreSelector(tsl::ServingDeviceSelector* device_selector, + int num_cores); + // Reserves a device for the given `program_id`. The `program_id` is used to + // identify an IFRT executable and should be the key of + // `tensorflow::ifrt_serving::ServingExecutableRegistry `. + tsl::DeviceReservation ReserveDevice(int64_t program_id); + + private: + tsl::ServingDeviceSelector* device_selector_; + + absl::Mutex mu_; + // A counter of the number of runs for each program. For a given program, it + // is used to determine if the core selector should treat the incoming request + // as a warmup request to warm up a core. + absl::flat_hash_map run_counter_ ABSL_GUARDED_BY(mu_); + int num_cores_; +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_CORE_SELECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h new file mode 100644 index 00000000..b9402d25 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h @@ -0,0 +1,254 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_H_ + +#include + +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/device_list.h" +#include "xla/python/ifrt/executable.h" +#include "xla/python/ifrt/future.h" +#include "xla/python/ifrt/shape.h" +#include "xla/python/ifrt/sharding.h" +#include "xla/tsl/concurrency/ref_count.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h" +#include "tensorflow/core/tfrt/ifrt/tf_host_callback.h" +#include "tsl/platform/threadpool.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace tensorflow { +namespace ifrt_serving { + +class IfrtServingExecutable { + public: + static absl::StatusOr> Create( + int64_t program_id, absl::string_view model_name, + absl::string_view signature_name, + mlir::OwningOpRef module, + std::shared_ptr client, + tsl::thread::ThreadPool* thread_pool, + IfrtLoadedVariableRegistry* ifrt_loaded_variable_registry, + const IfrtRestoreTensorRegistry* ifrt_restore, + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue, + tensorflow::DeviceMgr* device_mgr, + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + IfrtServingCoreSelector* ifrt_serving_core_selector, + tsl::protobuf::Message* compilation_environment_proto, + TfToHloCompiler* tf_to_hlo_compiler, + IfrtPersistentCompilationCache* persistent_compilation_cache); + + // Movable but not copyable. + IfrtServingExecutable(IfrtServingExecutable&& other) = default; + IfrtServingExecutable& operator=(IfrtServingExecutable&& other) = default; + IfrtServingExecutable(const IfrtServingExecutable& other) = delete; + IfrtServingExecutable& operator=(const IfrtServingExecutable& other) = delete; + + absl::string_view model_name() const { return model_name_; } + absl::string_view signature_name() const { return signature_name_; } + + // Executes the computation. + // variable_arg_indices are in sorted order. + absl::StatusOr> Execute( + absl::Span inputs, + absl::Span variable_arg_indices); + + // Freezes the model. 
After the Freeze(), JIT compile is not supported and + // Execute() will return error if inputs contain uncompiled shapes. + void Freeze(); + + int num_executables() const { + absl::MutexLock lock(&mutex_); + return executable_bundles_.size(); + } + + private: + friend class IfrtBackendCompilerTest; + // In memory cache key. + struct Key { + std::vector input_shapes; + template + friend H AbslHashValue(H h, const Key& key) { + for (const auto& shape : key.input_shapes) { + for (auto size : shape.dim_sizes()) { + h = H::combine(std::move(h), size); + } + } + return h; + } + + friend bool operator==(const Key& x, const Key& y) { + return x.input_shapes == y.input_shapes; + } + }; + + struct CachedExecutableBundle { + std::unique_ptr ifrt_executable; + tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + std::vector> host_callbacks; + + CachedExecutableBundle() = default; + // Move only + CachedExecutableBundle(CachedExecutableBundle&& other) = default; + CachedExecutableBundle& operator=(CachedExecutableBundle&& other) = default; + CachedExecutableBundle(const CachedExecutableBundle& other) = delete; + CachedExecutableBundle& operator=(const CachedExecutableBundle& other) = + delete; + }; + + IfrtServingExecutable( + int64_t program_id, absl::string_view model_name, + absl::string_view signature_name, + mlir::OwningOpRef module, + std::shared_ptr client, + tsl::thread::ThreadPool* thread_pool, + IfrtLoadedVariableRegistry* ifrt_loaded_variable_registry, + const IfrtRestoreTensorRegistry* ifrt_restore_tensor_registry, + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue, + tensorflow::DeviceMgr* device_mgr, + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + IfrtServingCoreSelector* ifrt_serving_core_selector, + tensorflow::tpu::TPUCompileMetadataProto original_compile_metadata, + tsl::RCReference assigned_device_list, + tsl::protobuf::Message* compilation_environment_proto, + TfToHloCompiler* tf_to_hlo_compiler, + IfrtPersistentCompilationCache* persistent_compilation_cache) + : program_id_(program_id), + model_name_(std::string(model_name)), + signature_name_(std::string(signature_name)), + module_(std::move(module)), + original_compile_metadata_(std::move(original_compile_metadata)), + assigned_device_list_(std::move(assigned_device_list)), + ifrt_client_(std::move(client)), + thread_pool_(*thread_pool), + ifrt_loaded_variable_registry_(*ifrt_loaded_variable_registry), + ifrt_restore_tensor_registry_(*ifrt_restore_tensor_registry), + checkpoint_loader_queue_(checkpoint_loader_queue), + device_mgr_(device_mgr), + shape_representation_fn_(std::move(shape_representation_fn)), + ifrt_serving_core_selector_(std::move(ifrt_serving_core_selector)), + compilation_environment_proto_(compilation_environment_proto), + tf_to_hlo_compiler_(tf_to_hlo_compiler), + persistent_compilation_cache_(persistent_compilation_cache) {} + + int64_t program_id_; + using SharedCachedExecutableBundle = std::shared_ptr; + + std::string model_name_; + std::string signature_name_; + + mlir::OwningOpRef module_ ABSL_GUARDED_BY(mutex_); + // The original compile metadata. We need to keep it around to be able to + // test portable execution condition even if the Module itself is already + // released. 
+ tensorflow::tpu::TPUCompileMetadataProto original_compile_metadata_; + const tsl::RCReference assigned_device_list_; + + std::shared_ptr ifrt_client_; + tsl::thread::ThreadPool& thread_pool_; + + IfrtLoadedVariableRegistry& ifrt_loaded_variable_registry_; + const IfrtRestoreTensorRegistry& ifrt_restore_tensor_registry_; + tfrt::ConcurrentWorkQueue* checkpoint_loader_queue_; + tensorflow::DeviceMgr* device_mgr_; // Not owned. For host callback. + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn_; + IfrtServingCoreSelector* ifrt_serving_core_selector_; + + tsl::protobuf::Message* + compilation_environment_proto_; // NOT OWNED. can be nullptr. + + mutable absl::Mutex mutex_; + absl::flat_hash_map> + executable_bundles_ ABSL_GUARDED_BY(mutex_); + + bool is_frozen_ ABSL_GUARDED_BY(mutex_) = false; + + // The tf_to_hlo_compiler_ is not owned by this executable. It is expected to + // be alive during the lifetime of the executable. + TfToHloCompiler* tf_to_hlo_compiler_; + + // The persistent compilation cache is a global cache and is not owned by + // this executable. When it is nullptr, the persistent compilation cache is + // disabled at ifrt serving level. + IfrtPersistentCompilationCache* persistent_compilation_cache_; + + // Asynchronously load the restored variable tensors to Ifrt array. + absl::Status AsyncLoadIfrtArray( + absl::Span inputs, + absl::Span variable_arg_indices, + const CachedExecutableBundle& executable_bundle, + const tsl::RCReference& devices); + + absl::StatusOr> ConvertTensorToArray( + const tensorflow::Tensor& tensor, + const tsl::RCReference& device_list, + const xla::OpSharding& sharding); + + xla::ifrt::Future LookUpOrCreateExecutable( + const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, + absl::Span dtypes_and_shapes, + absl::Span variable_arg_indices); + absl::StatusOr + CreateExecutableSynchronously( + mlir::OwningOpRef module_copy, + const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, + absl::Span dtypes_and_shapes, + absl::Span variable_arg_indices); + + absl::StatusOr> CreateSharding( + int num_devices, const xla::ifrt::Shape& arg_xla_shape, + const xla::ifrt::Shape& sharded_shapes); + + std::vector GetArgShape( + int arg_index, const CachedExecutableBundle& entry); + + bool UsePortableExecution( + const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata); +}; + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test_util.h new file mode 100644 index 00000000..9f527765 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test_util.h @@ -0,0 +1,89 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_TEST_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_TEST_UTIL_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/test_util.h" +#include "xla/tsl/framework/test_util/mock_serving_device_selector.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h" +#include "tsl/platform/threadpool.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace tensorflow { +namespace ifrt_serving { +namespace test_utils { + +// A test helper class to create and IfrtServingExecutable. +class IfrtServingExecutableTestHelper { + public: + explicit IfrtServingExecutableTestHelper( + tsl::test_util::MockServingDeviceSelector* device_selector); + + // Creates an IfrtServingExecutable with the given program id. + // Note the instance of this class must outlive the returned + // IfrtServingExecutable. + std::unique_ptr MakeExecutable( + int64_t program_id, std::string mlir_module_path); + + IfrtRestoreTensorRegistry* ifrt_restore_tensor_registry() { + return &ifrt_restore_tensor_registry_; + } + + int num_cores() const { return client_->addressable_device_count(); } + + private: + static constexpr int kThreadPoolNumThreads = 16; + + tsl::test_util::MockServingDeviceSelector* device_selector_; // Not owned. + std::unique_ptr core_selector_; + std::shared_ptr client_; + std::unique_ptr thread_pool_; + IfrtLoadedVariableRegistry ifrt_loaded_variable_registry_; + IfrtRestoreTensorRegistry ifrt_restore_tensor_registry_; + std::unique_ptr work_queue_; + std::unique_ptr device_mgr_; + + mlir::DialectRegistry registry_; + std::unique_ptr context_; + std::unique_ptr + ifrt_persistent_compilation_cache_; + TfToHloCompiler tf_to_hlo_compiler_; +}; + +// Returns the path to the MLIR module for the given module name. +std::string GetMlirModulePath(absl::string_view module_name); + +} // namespace test_utils +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_SERVING_EXECUTABLE_TEST_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_tensor_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_tensor_utils.h new file mode 100644 index 00000000..6235e414 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/ifrt_tensor_utils.h @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_IFRT_TENSOR_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_IFRT_TENSOR_UTILS_H_ + +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/statusor.h" +#include "xla/python/ifrt/dtype.h" +#include "xla/python/ifrt/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +absl::StatusOr ToTensorDataType( + xla::ifrt::DType ifrt_dtype); + +absl::StatusOr ToIfrtDType(tensorflow::DataType tensor_dtype); + +xla::ifrt::Shape ToIfrtShape(const tensorflow::TensorShape& shape); + +tensorflow::TensorShape ToTensorShape(const xla::ifrt::Shape& shape); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_IFRT_TENSOR_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/sharding_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/sharding_utils.h new file mode 100644 index 00000000..a777a822 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/sharding_utils.h @@ -0,0 +1,80 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ + +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/device_list.h" +#include "xla/python/ifrt/future.h" +#include "xla/tsl/concurrency/ref_count.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Create a tensor from the given host tensor based on given device ids and +// sharding information. +absl::StatusOr> MakeArrayFromTensor( + xla::ifrt::Client& ifrt_client, const tensorflow::Tensor& input_tensor, + absl::Span device_ids, const xla::HloSharding& hlo_sharding, + const tsl::thread::ThreadPool& thread_pool); + +// A variant of the above api. 
The difference is that the user passes in +// device_list directly instead of a list of device_ids. +absl::StatusOr> MakeArrayFromTensor( + xla::ifrt::Client& ifrt_client, const tensorflow::Tensor& input_tensor, + const tsl::RCReference& device_list, + const xla::HloSharding& hlo_sharding, + const tsl::thread::ThreadPool& thread_pool); + +// Reshard an disassembled array list back to one single tensor +// based on given sharding spec. +// +// input_array: the input device buffers. +// +// hlo_sharding: sharding spec that describes how the input device buffers are +// sharded. +// +// device_list: list of devices that is aligned with the order of device buffers +// in the `input_array`. +// +xla::ifrt::Future MakeTensorFromArray( + xla::ifrt::Client& ifrt_client, xla::ifrt::Array& input_array, + const xla::HloSharding& hlo_sharding, + const tsl::RCReference& device_list, + tsl::thread::ThreadPool& thread_pool); + +// A wrapper around xla::ShapeUtil::ByteStrides to get the byte strides of a +// TensorFlow tensor. +std::optional> GetByteStrides( + tensorflow::DataType dtype, const tensorflow::TensorShape& shape); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/tf_host_callback.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/tf_host_callback.h new file mode 100644 index 00000000..2b19f239 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/ifrt/tf_host_callback.h @@ -0,0 +1,85 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_TF_HOST_CALLBACK_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_TF_HOST_CALLBACK_H_ + +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +// A host callback implementation to run a TF graph. +// TODO(b/332774825): Use TFRT executor for host callback. +class TfHostCallback { + public: + // Creates a TfHostCallback instance. `device_mgr` ptr is guaranteed to be + // alive throughout the lifetime of model. 
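+  // Illustrative sketch (not part of the upstream header; `functions`,
+  // "host_fn", the shape specs and the buffers are placeholders):
+  //
+  //   TF_ASSIGN_OR_RETURN(
+  //       auto callback,
+  //       TfHostCallback::Create(functions, "host_fn", operand_specs,
+  //                              result_specs, device_mgr.get()));
+  //   // `inputs`/`outputs` point at host buffers laid out according to the
+  //   // operand/result specs and must stay alive for the duration of Call().
+  //   TF_RETURN_IF_ERROR(callback->Call(inputs, outputs));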
+ static absl::StatusOr> Create( + absl::Span functions, + absl::string_view entry_function_name, + absl::Span operand_type_and_shapes, + absl::Span result_type_and_shapes, + tensorflow::DeviceMgr* device_mgr); + + // The host callback function takes two pointer arrays, each element of which + // points to allocated host buffer in host layout according to corresponding + // operand or result's shape. The buffers are only guaranteed to be alive + // during the call. + absl::Status Call(void** inputs, void** outputs); + + private: + TfHostCallback(absl::string_view entry_function_name, + absl::Span operand_type_and_shapes, + absl::Span result_type_and_shape, + tensorflow::EagerContextPtr ctx) + : ctx_(std::move(ctx)), + entry_function_name_(entry_function_name), + operand_type_and_shapes_(operand_type_and_shapes.begin(), + operand_type_and_shapes.end()), + result_type_and_shapes_(result_type_and_shape.begin(), + result_type_and_shape.end()) {} + + // Per-callback TF Eager context. + tensorflow::EagerContextPtr ctx_; + + // Entry function name to be called on invocation. + std::string entry_function_name_; + + std::vector operand_type_and_shapes_; + std::vector result_type_and_shapes_; +}; + +absl::StatusOr> +CreateTfDynamicDeviceMgr(); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_TF_HOST_CALLBACK_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/ifrt_program_ops.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/ifrt_program_ops.h new file mode 100644 index 00000000..463647ec --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/ifrt_program_ops.h @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ +#define TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ + +#include + +#include +#include + +#include "absl/base/call_once.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h" + +namespace tensorflow { +namespace tfrt_stub { + +// TensorFlow op that calls a Ifrt program registered in `ProgramRegistry`. +class IfrtCallOp : public tensorflow::OpKernel { + public: + explicit IfrtCallOp(tensorflow::OpKernelConstruction* ctx); + + IfrtCallOp(const IfrtCallOp& other) = delete; + IfrtCallOp& operator=(const IfrtCallOp& other) = delete; + + void Compute(tensorflow::OpKernelContext* ctx) override; + + private: + // Op attributes. + int64_t program_id_; + + std::vector variable_names_; + std::vector variable_arg_indices_; + + // Ifrt program to be called. Cached after the first call. + absl::once_flag init_once_; + tensorflow::ifrt_serving::IfrtServingExecutable* executable_; // Not owned. 
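+
+  // Sketch of how the cached executable can be resolved on first use (the
+  // actual Compute() implementation lives in the .cc file and may differ):
+  //
+  //   absl::call_once(init_once_, [&]() {
+  //     executable_ =
+  //         tensorflow::ifrt_serving::ServingExecutableRegistry::Lookup(
+  //             program_id_);
+  //   });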
+}; + +} // namespace tfrt_stub +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops.h new file mode 100644 index 00000000..bef61d92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops.h @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_H_ +#define TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/tfrt/runtime/stream.h" + +namespace tensorflow { +namespace tfrt_stub { + +// TensorFlow op that immediately sends results back to the serving controller. +class PwStreamResultsOp : public tensorflow::OpKernel { + public: + explicit PwStreamResultsOp(tensorflow::OpKernelConstruction* ctx); + + PwStreamResultsOp(const PwStreamResultsOp& other) = delete; + PwStreamResultsOp& operator=(const PwStreamResultsOp& other) = delete; + + void Compute(tensorflow::OpKernelContext* ctx) override; + + private: + // Op attributes. + std::string controller_address_; + std::string model_name_; + StreamCallbackId callback_id_; + std::vector names_; + + std::unique_ptr stream_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util.h new file mode 100644 index 00000000..b6fa0223 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace tfrt_stub { + +// Unbatches `tensors` according to the step ids and returns a list of (step_id, +// unbatched_tensors) pairs. 
+// +// If `step_ids` is a scalar, each tensor in `tensors` is treated as if they are +// not batched and the entire tensor is associated with the single step id. +// +// If `step_ids` is a 1-D tensor, this tensor represents the step id of each +// example in the batch. Tensors in `tensors` are "unbatched" along the leading +// dimension according to the step id tensor and the unbatched tensors are +// associated with the corresponding step ids. +absl::StatusOr>>> +UnbatchStreamResults(const tensorflow::Tensor& step_ids, + absl::Span tensors); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util_constants.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util_constants.h new file mode 100644 index 00000000..ef8bad04 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/kernels/stream_ops_util_constants.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_CONSTANTS_H_ +#define TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_CONSTANTS_H_ + +#include + +namespace tensorflow { +namespace tfrt_stub { + +// Step id and batch id are packed together to a 64 bit integer in the stream +// callback. Step id takes the MSB 32 bit. +inline constexpr size_t kStepIdBitSize = 32; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_KERNELS_STREAM_OPS_UTIL_CONSTANTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_test_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_test_utils.h new file mode 100644 index 00000000..c445e537 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_test_utils.h @@ -0,0 +1,47 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ + +// This file contains stub implementations for Google internal MLA APIs. 
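+// The stubs below return empty values or tensorflow::errors::Unimplemented in
+// OSS builds, so callers are expected to check the returned Status before
+// relying on any MLA functionality.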
+ +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace tfrt_stub { + +inline std::string CopySavedModelFromTestDataToTempDir( + absl::string_view tf_dir, absl::string_view saved_model_name) { + return ""; +} + +inline Status ConvertSavedModelAndAddToMla( + absl::string_view saved_model_path, const int saved_model_version, + const std::unordered_set& tags, + const std::vector& entry_points, + absl::string_view mla_module_name) { + return tensorflow::errors::Unimplemented("Not supported in OSS"); +} + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_utils.h new file mode 100644 index 00000000..79965f4c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mla/mla_utils.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ + +// This file contains stub implementations for Google internal MLA APIs. + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace tfrt_stub { + +inline StatusOr GetSavedModelDirFromMlaDir( + absl::string_view mla_dir) { + return tensorflow::errors::Unimplemented("Not supported in OSS"); +} + +inline bool IsMlarchive(absl::string_view saved_model_dir) { return false; } + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLA_MLA_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/attribute/attribute.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/attribute/attribute.h new file mode 100644 index 00000000..f27118ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/attribute/attribute.h @@ -0,0 +1,128 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_ATTRIBUTE_ATTRIBUTE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_ATTRIBUTE_ATTRIBUTE_H_ + +#include + +#include "absl/status/statusor.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace tensorflow { +namespace tf_mlrt { + +class ShapeAttr { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(uint8_t, unranked); + DEFINE_BYTECODE_FIELD(mlrt::bc::Vector, dims); + }; + + class Constructor { + public: + Constructor(mlrt::bc::Allocator* allocator, mlrt::bc::BcAddr_t address) + : allocator_(allocator), address_(address) {} + + void set_unranked(bool unranked) { + StorageType::construct_unranked(allocator_, address_, unranked); + } + + template + auto construct_shape(Args&&... args) { + return StorageType::construct_dims(allocator_, address_, + std::forward(args)...); + } + + mlrt::bc::BcAddr_t address() const { return address_; } + + private: + mlrt::bc::Allocator* allocator_; + mlrt::bc::BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + explicit ShapeAttr(const char* p) : p_(p) {} + + bool unranked() const { return StorageType::read_unranked(p_); } + mlrt::bc::Vector dims() const { return StorageType::read_dims(p_); } + + private: + const char* p_ = nullptr; +}; + +class TensorAttr { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(tensorflow::DataType, dtype); + DEFINE_BYTECODE_FIELD(uint64_t, num_elements); + DEFINE_BYTECODE_FIELD(mlrt::bc::Vector, shape); + DEFINE_BYTECODE_FIELD(mlrt::bc::Vector, data); + }; + + class Constructor { + public: + Constructor(mlrt::bc::Allocator* allocator, mlrt::bc::BcAddr_t address, + tensorflow::DataType dtype) + : allocator_(allocator), address_(address) { + StorageType::construct_dtype(allocator_, address_, dtype); + } + + void set_num_elements(size_t num) { + StorageType::construct_num_elements(allocator_, address_, num); + } + + template + auto construct_shape(Args&&... args) { + return StorageType::construct_shape(allocator_, address_, + std::forward(args)...); + } + template + auto construct_data(Args&&... 
args) { + return StorageType::construct_data(allocator_, address_, + std::forward(args)...); + } + + mlrt::bc::BcAddr_t address() const { return address_; } + + private: + mlrt::bc::Allocator* allocator_; + mlrt::bc::BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + explicit TensorAttr(const char* p) : p_(p) {} + + tensorflow::DataType dtype() const { return StorageType::read_dtype(p_); } + mlrt::bc::Vector shape() const { + return StorageType::read_shape(p_); + } + mlrt::bc::Vector data() const { return StorageType::read_data(p_); } + + private: + const char* p_ = nullptr; +}; + +absl::StatusOr EncodeTensorflowAttribute( + const mlrt::ModuleEmitterContext& module_context, mlir::Attribute attr); + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_ATTRIBUTE_ATTRIBUTE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h new file mode 100644 index 00000000..f82666f1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h @@ -0,0 +1,526 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_BYTECODE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_BYTECODE_H_ + +// This file defines bytecode primitives that can be used to build bytecode +// structures. This library is C++17 compliant and portable for different +// platforms. It should be also as effcient as plain C++ structs on common +// platforms. +// +// Usage: +// +// class CustomStruct { +// public: +// // The actual storage of this CustomStruct should be defined as a member +// // struct of this class. Defining storage struct is almost as simple as +// // defining a plain C++ struct; +// struct Storage { +// using Self = Storage; +// // DEFINE_BYTECODE_FIELD will generate helpers for reading and +// constructing +// // the field in bytecode. +// DEFINE_BYTECODE_FIELD(uint32_t, x); +// DEFINE_BYTECODE_FIELD(bc::Vector, y); +// }; +// +// // If the storage involves indirection like std::vector, a member class +// // Constructor should be also provided. +// class Constructor { +// public: +// // The Constructor will use `allocator` to allocate indirect storage, +// // though the direct storage is assumed to be already allocated using +// // the same allocator starting at `address`. +// explicit Constructor(Allocator* allocator, BcAddr_t address) +// : allocator_(allocator), address_(address) {} +// +// // Setting trivial fields only need to call construct_ +// // provided by DEFINE_BYTECODE_FIELD. +// void set_x(uint32_t x) { +// Storage::construct_x(allocator_, address_, x); +// } +// +// // Setting non-trivial fields only need to call construct_ +// // provided by DEFINE_BYTECODE_FIELD and also return the field's +// constructor. 
bc::Vector::Constructor construct_y(size_t +// y_size) { +// return Storage::construct_y(allocator_, address_, y_size); +// } +// +// BcAddr_t address() const { return address_; } +// +// private: +// bc::Allocator* allocator_; +// BcAddr_t address_; +// }; +// using NonTrivialConstructorType = Constructor; +// +// explicit CustomStruct(const char* p) : p_(p) {} +// +// // Reading fields needs only calling read_ methods provided by +// // DEFINE_BYTECODE_FIELD. +// uint32_t x() const { return Storage::read_x(p_); } +// bc::Vector y() const { return Storage::read_y(p_); } +// +// private: +// // The CustomStruct can contain only the pointer to the actual memory +// // blob. So fields need not be touched if not necessary, which would +// // otherwise incurs overhead. +// const char* p_; +// }; + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/strings/string_view.h" + +namespace mlrt { +namespace bc { + +using BcAddr_t = uint64_t; + +class Buffer { + public: + char* Get(BcAddr_t address) { + DCHECK_LT(address, buffer_.size()); + return &buffer_.at(address); + } + + char* data() { return buffer_.data(); } + const char* data() const { return buffer_.data(); } + size_t size() const { return buffer_.size(); } + bool empty() const { return buffer_.empty(); } + + void shrink_to_fit() { buffer_.shrink_to_fit(); } + + private: + static_assert(alignof(std::max_align_t) >= 8, + "The bytecode buffer needs to be at least 8-byte aligned."); + std::vector buffer_; + + friend class Allocator; +}; + +class Allocator { + public: + explicit Allocator(Buffer* buffer) : buffer_(buffer) { + DCHECK(buffer != nullptr); + } + + BcAddr_t Allocate(size_t size, size_t alignment) { + DCHECK_LE(alignment, 8); + + // Calculate the next buffer size that is greater or equal to the previous + // buffer size, and is also aligned to `alignment`. + size_t next_align = + (buffer_->buffer_.size() + alignment - 1) / alignment * alignment; + + buffer_->buffer_.resize(next_align + size); + + return next_align; + } + + template + BcAddr_t Allocate() { + static_assert(std::is_trivial::value, "T must be trivial."); + return Allocate(sizeof(T), alignof(T)); + } + + size_t size() const { return buffer_->size(); } + + char* raw(BcAddr_t address) { return buffer_->Get(address); } + + private: + Buffer* buffer_; +}; + +// AccessTraits encapsulates the fundamental Read() and Construct() methods for +// reading and constructing bytecode data structures. + +// AccessTraits specialized for trivial types. +template +struct AccessTraits { + using StorageType = T; + static_assert(std::is_trivial::value, + "StorageType must be trivial."); + + using ConstructorType = void; + + static T Read(const char* p) { + // To be compliant with C++ standard on object lifetime and strict aliasing + // rules, we have to copy the data from memory to construct a new object. + // This is fine on most platforms as the copy can be optimized away, + // assuming `p` is sufficiently aligned. + T value; + std::memcpy(&value, p, sizeof(T)); + return value; + } + + template + static BcAddr_t Construct(Allocator* allocator, BcAddr_t address, + Args&&... args) { + // Similar to Read(), memcpy is used to serialize data to bytecode. + T value(std::forward(args)...); + std::memcpy(allocator->raw(address), &value, sizeof(T)); + return address; + } + + // Place the bytes directly for this trivial type T. It also supports placing + // bytes for a contiguous array of T. 
The number of bytes, `size` must not be + // greater than `num` * sizeof(T). + static void Place(Allocator* allocator, BcAddr_t address, const char* data, + size_t size, size_t num = 1) { + CHECK_LE(size, num * sizeof(T)); // Crash Ok + std::memcpy(allocator->raw(address), data, size); + } +}; + +// AccessTraits specialized for non-trivial types. +template +struct AccessTraits> { + // Non-trivial types should provide a member struct `StorageType` to + // specify the storage layout. + using StorageType = typename T::StorageType; + static_assert(std::is_trivial::value, + "StorageType must be trivial."); + + // Non-trivial types should provide a member type `NonTrivialConstructorType` + // for constructing storages. + using ConstructorType = typename T::NonTrivialConstructorType; + + static T Read(const char* p) { + // Reading non-trivial types is simply constructing the bytecode type with + // the pointer to the memory blob. All reading methods are encapsulated in + // `T`. + return T(p); + } + + template + static ConstructorType Construct(Allocator* allocator, BcAddr_t address, + Args&&... args) { + // Constructing non-trivial types is simply creating the corresponding + // constructor. + return ConstructorType(allocator, address, std::forward(args)...); + } +}; + +// The bytecode counterparts of malloc() and operator new() are also provided. +template +BcAddr_t Allocate(Allocator* allocator) { + return allocator->Allocate::StorageType>(); +} +template +auto New(Allocator* allocator, Args&&... args) { + auto address = Allocate(allocator); + return AccessTraits::Construct(allocator, address, + std::forward(args)...); +} + +// The iterator for reading bytecode data. It uses AccessTraits::Read() for +// reading the data. It is an input iterator as we cannot return the type-safe +// reference to the data in bytecode in a C++ compliant way due to object +// lifetime and strict aliasing rule. 
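+//
+// For illustration only, a minimal sketch (hypothetical variable names,
+// assuming the Buffer/Allocator/AccessTraits helpers above) of writing a
+// trivial value into a bytecode buffer and reading it back:
+//
+//   bc::Buffer buffer;
+//   bc::Allocator alloc(&buffer);
+//   // For a trivial T, New<T>() returns the address of the constructed value.
+//   bc::BcAddr_t addr = bc::New<uint32_t>(&alloc, 42);
+//   uint32_t v = bc::AccessTraits<uint32_t>::Read(buffer.Get(addr));  // v == 42
+//
+// ReadIterator<T>, defined below, re-reads elements the same way when stepping
+// over a packed array of T.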
+template +class ReadIterator { + using StorageType = typename AccessTraits::StorageType; + + public: + using difference_type = std::ptrdiff_t; + using value_type = std::remove_cv_t; + using pointer = void; + using reference = value_type; + using iterator_category = std::input_iterator_tag; + + explicit ReadIterator(const char* data) : data_(data) {} + + const char* data() const { return data_; } + + value_type operator*() const { return AccessTraits::Read(data_); } + + ReadIterator& operator++() { + data_ += sizeof(StorageType); + return *this; + } + + ReadIterator operator++(int) { + ReadIterator r = *this; + data_ += sizeof(StorageType); + return r; + } + + ReadIterator& operator+=(difference_type offset) { + data_ += offset * sizeof(StorageType); + return *this; + } + + ReadIterator operator+(difference_type offset) const { + ReadIterator r = *this; + r += offset; + return r; + } + + ReadIterator& operator--() { + data_ -= sizeof(StorageType); + return *this; + } + + ReadIterator operator--(int) { + ReadIterator r = *this; + data_ -= sizeof(StorageType); + return r; + } + + ReadIterator& operator-=(difference_type offset) { + data_ -= offset * sizeof(StorageType); + return *this; + } + + ReadIterator operator-(difference_type offset) const { + ReadIterator r = *this; + r -= offset; + return r; + } + + difference_type operator-(const ReadIterator& other) const { + DCHECK_EQ((data_ - other.data_) % sizeof(StorageType), 0); + return (data_ - other.data_) / sizeof(StorageType); + } + + friend bool operator==(const ReadIterator& a, const ReadIterator& b) { + return a.data_ == b.data_; + } + + friend bool operator!=(const ReadIterator& a, const ReadIterator& b) { + return !(a == b); + } + + friend bool operator<(const ReadIterator& a, const ReadIterator& b) { + return a.data_ < b.data_; + } + + friend bool operator<=(const ReadIterator& a, const ReadIterator& b) { + return a.data_ <= b.data_; + } + + friend bool operator>(const ReadIterator& a, const ReadIterator& b) { + return a.data_ > b.data_; + } + + friend bool operator>=(const ReadIterator& a, const ReadIterator& b) { + return a.data_ >= b.data_; + } + + private: + const char* data_ = nullptr; +}; + +// DEFINE_BYTECODE_FIELD provides helper functions for reading and constructing +// member fields in bytecode. +#define DEFINE_BYTECODE_FIELD(Type, name) \ + typename ::mlrt::bc::AccessTraits::StorageType name; \ + static const char* name##_pointer(const char* base) { \ + return base + offsetof(Self, name); \ + } \ + static ::mlrt::bc::BcAddr_t name##_address(::mlrt::bc::BcAddr_t base) { \ + return base + offsetof(Self, name); \ + } \ + static Type read_##name(const char* base) { \ + return ::mlrt::bc::AccessTraits::Read(name##_pointer(base)); \ + } \ + template \ + static auto construct_##name(::mlrt::bc::Allocator* allocator, \ + ::mlrt::bc::BcAddr_t base, Args&&... args) { \ + return ::mlrt::bc::AccessTraits::Construct( \ + allocator, name##_address(base), std::forward(args)...); \ + } \ + static_assert( \ + std::is_trivial< \ + typename ::mlrt::bc::AccessTraits::StorageType>::value, \ + "Bytecode storage types must be trivial.") + +// Defines a bytecode vector. 
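+//
+// For illustration only, a minimal sketch (hypothetical variable names) of
+// constructing a bc::Vector<uint32_t> and reading it back through a view:
+//
+//   bc::Buffer buffer;
+//   bc::Allocator alloc(&buffer);
+//   auto ctor = bc::New<bc::Vector<uint32_t>>(&alloc, /*size=*/3);
+//   ctor.Assign({1, 2, 3});
+//
+//   bc::Vector<uint32_t> view(buffer.Get(ctor.address()));
+//   uint32_t first = view[0];  // first == 1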
+template +class Vector { + public: + struct Storage { + using Self = Storage; + DEFINE_BYTECODE_FIELD(SizeType, size); + DEFINE_BYTECODE_FIELD(SizeType, offset); + }; + static_assert(std::is_trivial::value, "StorageType is trivial"); + static_assert(std::is_standard_layout::value, + "StorageType has standard layout"); + static_assert(sizeof(Storage) == 2 * sizeof(SizeType)); + static_assert(alignof(Storage) == alignof(SizeType)); + + using StorageType = Storage; + using ElementStorageType = typename AccessTraits::StorageType; + + using value_type = T; + using iterator = ReadIterator; + using const_iterator = iterator; + + class Constructor { + public: + Constructor(Allocator* allocator, BcAddr_t address, size_t size) + : allocator_(allocator), address_(address) { + DCHECK_GE(allocator->size(), address + sizeof(StorageType)); + size_t data_start = allocator->Allocate(size * sizeof(ElementStorageType), + alignof(ElementStorageType)); + + CHECK_LT(size, std::numeric_limits::max()); // Crash Ok + CHECK_LT(data_start - address, // Crash Ok + std::numeric_limits::max()); + storage_.size = size; + storage_.offset = data_start - address; + AccessTraits::Construct(allocator, address, storage_); + } + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + Constructor(Allocator* allocator, BcAddr_t address, + const std::vector& vec) + : Constructor(allocator, address, vec.size()) { + Assign(vec.begin(), vec.end()); + } + + template + auto ConstructAt(size_t index, Args&&... args) { + DCHECK_LT(index, size()); + return AccessTraits::Construct(allocator_, GetElementAddress(index), + std::forward(args)...); + } + + template + void Assign(std::initializer_list ilist) { + DCHECK_EQ(ilist.size(), size()); + Assign(ilist.begin(), ilist.end()); + } + + template + void Assign(const Range& range) { + DCHECK_EQ(std::distance(std::begin(range), std::end(range)), size()); + Assign(std::begin(range), std::end(range)); + } + + template + void Assign(Iter begin, Iter end) { + size_t i = 0; + for (; begin != end; ++begin) { + ConstructAt(i++, *begin); + } + DCHECK_EQ(i, size()); + } + + // If T is a trivial inplace type like int32_t, we can place the bytes for + // this vector directly instead of constructing the elements one by one. + template < + typename U = T, + typename std::enable_if< + std::is_same_v::ConstructorType, void>, + int>::type = 0> + void Place(const char* data, size_t size) { + AccessTraits::Place(allocator_, address_ + storage_.offset, data, size, + storage_.size); + } + + // TODO(chky): Implement iterators for construction. 
+ + size_t size() const { return storage_.size; } + BcAddr_t address() const { return address_; } + + private: + BcAddr_t GetElementAddress(size_t index) const { + return address_ + storage_.offset + index * sizeof(ElementStorageType); + } + + Allocator* allocator_; + BcAddr_t address_; + Vector::Storage storage_; + }; + using NonTrivialConstructorType = Constructor; + + explicit Vector(const char* p) : p_(p) { + static_assert(!std::is_trivial_v); + DCHECK(p_ != nullptr); + } + Vector() { + static_assert(!std::is_trivial_v); + static Storage kEmptyStorage{0, 0}; + p_ = reinterpret_cast(&kEmptyStorage); + } + + const char* data() const { return p_ + offset(); } + + size_t size() const { return StorageType::read_size(p_); } + bool empty() const { return size() == 0; } + + iterator begin() const { return iterator(data()); } + iterator end() const { + return iterator(data() + size() * sizeof(ElementStorageType)); + } + + T operator[](size_t index) const { + DCHECK_LT(index, size()); + auto iter = begin(); + iter += index; + return *iter; + } + + private: + SizeType offset() const { return StorageType::read_offset(p_); } + + const char* p_; +}; + +class String : public Vector { + public: + using Base = Vector; + using Base::Base; + + class Constructor : public Base::Constructor { + public: + using Base::Constructor::Assign; + + Constructor(Allocator* allocator, BcAddr_t address, absl::string_view str) + : Base::Constructor(allocator, address, str.size()) { + Assign(str.begin(), str.end()); + } + }; + using NonTrivialConstructorType = Constructor; + + using Base::data; + using Base::size; + + std::string str() const { return std::string(data(), size()); } + absl::string_view Get() const { return absl::string_view(data(), size()); } + + operator absl::string_view() const { // NOLINT + return absl::string_view(data(), size()); + } + + friend bool operator==(String x, absl::string_view y) { return x.Get() == y; } + friend bool operator==(absl::string_view x, String y) { return x == y.Get(); } +}; + +} // namespace bc +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_BYTECODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/executable.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/executable.h new file mode 100644 index 00000000..2f6f9c0e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/executable.h @@ -0,0 +1,90 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_EXECUTABLE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_EXECUTABLE_H_ + +#include "tensorflow/core/tfrt/mlrt/bytecode/function.h" + +namespace mlrt { +namespace bc { + +// Defines the bytecode format for the executable, which contains the following +// section: +// 1) kernel_names: an ordered list of strings for kernel names that appear in +// this file. 
The `code` fields of kernels in `functions` will be indices to +// this list. +// +// 2) attributes: an ordered list of strings that are raw bytes. It is kernel +// implementations' resposiblity to decode the bytes properly. The `attributes` +// field of kernels in `functions` will be indices to this list. +// +// 3) functions: an order list of functions, which contains kernels and other +// metadata. Please refer to function.h for its detailed format. +class Executable { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(Vector, kernel_names); + DEFINE_BYTECODE_FIELD(Vector, functions); + DEFINE_BYTECODE_FIELD(Vector, attributes); + }; + + class Constructor { + public: + Constructor(Allocator* allocator, BcAddr_t address) + : allocator_(allocator), address_(address) {} + + template + auto construct_kernel_names(Args&&... args) { + return StorageType::construct_kernel_names(allocator_, address_, + std::forward(args)...); + } + + template + auto construct_attributes(Args&&... args) { + return StorageType::construct_attributes(allocator_, address_, + std::forward(args)...); + } + + template + auto construct_functions(Args&&... args) { + return StorageType::construct_functions(allocator_, address_, + std::forward(args)...); + } + + BcAddr_t address() const { return address_; } + + private: + Allocator* allocator_; + BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + explicit Executable(const char* p) : p_(p) {} + + Vector kernel_names() const { + return StorageType::read_kernel_names(p_); + } + Vector functions() const { return StorageType::read_functions(p_); } + Vector attributes() const { return StorageType::read_attributes(p_); } + + private: + const char* p_; +}; + +} // namespace bc +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_EXECUTABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/function.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/function.h new file mode 100644 index 00000000..c85fc40d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/function.h @@ -0,0 +1,110 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_FUNCTION_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_FUNCTION_H_ + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/kernel.h" + +namespace mlrt { +namespace bc { + +class Function { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(String, name); + DEFINE_BYTECODE_FIELD(uint32_t, num_regs); + DEFINE_BYTECODE_FIELD(Vector, input_regs); + DEFINE_BYTECODE_FIELD(Vector, output_regs); + DEFINE_BYTECODE_FIELD(Vector, output_last_uses); + DEFINE_BYTECODE_FIELD(Vector, kernels); + }; + + class Constructor { + public: + Constructor(Allocator* allocator, BcAddr_t address) + : allocator_(allocator), address_(address) {} + + template + auto construct_name(Args&&... args) { + return StorageType::construct_name(allocator_, address_, + std::forward(args)...); + } + + void set_num_regs(uint32_t num_regs) { + StorageType::construct_num_regs(allocator_, address_, num_regs); + } + + template + auto construct_input_regs(Args&&... args) { + return StorageType::construct_input_regs(allocator_, address_, + std::forward(args)...); + } + + template + auto construct_output_regs(Args&&... args) { + return StorageType::construct_output_regs(allocator_, address_, + std::forward(args)...); + } + + template + auto construct_output_last_uses(Args&&... args) { + return StorageType::construct_output_last_uses( + allocator_, address_, std::forward(args)...); + } + + template + auto construct_kernels(Args&&... args) { + return StorageType::construct_kernels(allocator_, address_, + std::forward(args)...); + } + + BcAddr_t address() const { return address_; } + + private: + Allocator* allocator_; + BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + Function() = default; + // NOLINTNEXTLINE(google-explicit-constructor) + Function(std::nullptr_t) : p_(nullptr) {} + explicit Function(const char* p) : p_(p) {} + + String name() const { return StorageType::read_name(p_); } + uint32_t num_regs() const { return StorageType::read_num_regs(p_); } + Vector input_regs() const { + return StorageType::read_input_regs(p_); + } + Vector output_regs() const { + return StorageType::read_output_regs(p_); + } + Vector output_last_uses() const { + return StorageType::read_output_last_uses(p_); + } + Vector kernels() const { return StorageType::read_kernels(p_); } + + explicit operator bool() const { return p_ != nullptr; } + + private: + const char* p_ = nullptr; +}; + +} // namespace bc +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_FUNCTION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/kernel.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/kernel.h new file mode 100644 index 00000000..b4e6f53b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/kernel.h @@ -0,0 +1,90 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_KERNEL_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_KERNEL_H_ + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace mlrt { +namespace bc { + +class Kernel { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(uint32_t, code); + DEFINE_BYTECODE_FIELD(bc::Vector, arguments); + DEFINE_BYTECODE_FIELD(bc::Vector, results); + DEFINE_BYTECODE_FIELD(bc::Vector, attributes); + DEFINE_BYTECODE_FIELD(bc::Vector, last_uses); + }; + + class Constructor { + public: + Constructor(Allocator* allocator, BcAddr_t address) + : allocator_(allocator), address_(address) {} + + void set_code(uint32_t code) { + StorageType::construct_code(allocator_, address_, code); + } + + template + auto construct_arguments(Args&&... args) { + return StorageType::construct_arguments(allocator_, address_, + std::forward(args)...); + } + template + auto construct_results(Args&&... args) { + return StorageType::construct_results(allocator_, address_, + std::forward(args)...); + } + template + auto construct_attributes(Args&&... args) { + return StorageType::construct_attributes(allocator_, address_, + std::forward(args)...); + } + template + auto construct_last_uses(Args&&... args) { + return StorageType::construct_last_uses(allocator_, address_, + std::forward(args)...); + } + + BcAddr_t address() const { return address_; } + + private: + Allocator* allocator_; + BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + explicit Kernel(const char* p) : p_(p) {} + Kernel() : p_(nullptr) {} + + uint32_t code() const { return StorageType::read_code(p_); } + Vector arguments() const { return StorageType::read_arguments(p_); } + Vector results() const { return StorageType::read_results(p_); } + Vector attributes() const { + return StorageType::read_attributes(p_); + } + Vector last_uses() const { return StorageType::read_last_uses(p_); } + + private: + const char* p_; +}; + +} // namespace bc +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/span.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/span.h new file mode 100644 index 00000000..bf8ce7eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/bytecode/span.h @@ -0,0 +1,86 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_SPAN_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_SPAN_H_ + +#include +#include + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace mlrt { +namespace bc { + +// Span is a range view of contiguous byte region like bc::Vector. 
It reads the +// array size and start pointer eagerly, so that the range can be adapted. +template +class Span { + public: + using value_type = T; + using iterator = ReadIterator; + using const_iterator = iterator; + + Span() = default; + Span(const char* data, size_t size) : data_(data), size_(size) {} + + template + Span(const Vector& vec) // NOLINT(google-explicit-constructor) + : Span(vec.data(), vec.size()) {} + Span(const String& vec) // NOLINT(google-explicit-constructor) + : Span(vec.data(), vec.size()) {} + Span(const std::vector& vec) // NOLINT(google-explicit-constructor) + : Span(reinterpret_cast(vec.data()), vec.size()) {} + + const char* data() const { return data_; } + const char* data(size_t index) const { return data_ + index * sizeof(T); } + + iterator begin() const { return iterator(data_); } + iterator end() const { return iterator(data_ + size_ * sizeof(T)); } + T back() const { + DCHECK_GT(size_, 0); + return *iterator(data_ + (size_ - 1) * sizeof(T)); + } + + T operator[](size_t index) const { + DCHECK_LT(index, size()); + auto iter = begin(); + iter += index; + return *iter; + } + + size_t size() const { return size_; } + bool empty() const { return size_ == 0; } + + Span drop_front(size_t num = 1) const { + auto beg = begin(); + beg += num; + DCHECK_GE(size(), num); + return Span(beg.data(), size() - num); + } + + Span drop_back(size_t num = 1) const { + DCHECK_GE(size(), num); + return Span(data(), size() - num); + } + + private: + const char* data_ = nullptr; + size_t size_ = 0; +}; + +} // namespace bc +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_BYTECODE_SPAN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/async_handle.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/async_handle.h new file mode 100644 index 00000000..3f033492 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/async_handle.h @@ -0,0 +1,177 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ASYNC_HANDLE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ASYNC_HANDLE_H_ + +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/future.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime +#include "tfrt/concurrency/chain.h" // from @tf_runtime + +namespace mlrt { + +// mlrt::AsyncHandle is a specialized future for mananging context of an async +// execution. +// +// Example usage: +// +// // Create the context the async execution by copying the current context. +// auto [promise, handle] = AsyncHandle::Allocate(current_context); +// +// // Set up completion signal through the `promise` created. 
+// handle.execution_context().set_exit_handler( +// [promise = std::move(promise)]() { promise.Finish(); }); +// +// // Launch execution. +// thread_pool.Schedule([&execution_context = handle.execution_context()](){ +// execution_context.Call(...); +// Execute(execution_context); +// }); +// +// // Pass `handle` to places that need to wait for the execution. +// other_execution_context.Await(std::move(handle)); +// +class AsyncHandle { + public: + class Promise { + public: + Promise(const Promise&) = delete; + Promise& operator=(const Promise&) = delete; + Promise(Promise&&) = default; + Promise& operator=(Promise&&) = default; + + ~Promise() { + DCHECK(!shared_state_ || shared_state_.IsAvailable()) + << "A non-empty promise must be fulfilled."; + } + + void Finish(absl::Status status) && { + if (status.ok()) { + shared_state_.SetStateConcrete(); + } else { + shared_state_.SetError(std::move(status)); + } + } + + // We don't need HandleError() method for AsyncHandle::Promise because it is + // managed by the framework internally and should never be placed in the + // register. + + private: + explicit Promise(tsl::AsyncValueRef shared_state) + : shared_state_(std::move(shared_state)) {} + tsl::AsyncValueRef shared_state_; + + friend class AsyncHandle; + }; + + // Allocate an AsyncHandle and the corresponding promise. + static std::pair Allocate( + const ExecutionContext& current); + + AsyncHandle(const AsyncHandle&) = delete; + AsyncHandle& operator=(const AsyncHandle&) = delete; + AsyncHandle(AsyncHandle&&) = default; + AsyncHandle& operator=(AsyncHandle&&) = default; + + ~AsyncHandle() { + CHECK(!shared_state_ || shared_state_.IsAvailable()) // Crash OK + << "A non-empty AsyncHandle must be awaited."; + } + + // Then() enqueues a callback which will be called when the future is + // fulfilled with either an error or a value. + // + // The following Then() overloads accept a callback with the following + // signatures: + // + // 1) void(absl::Status) + // The argument is the status of this future in ready state. + // + // 2) void() + // There is no argument. The callback will be called whenever it is ready. + + template >> + typename std::enable_if, void>::type Then( + F then) && { + CHECK(shared_state_); // Crash OK + auto* shared_state_ptr = shared_state_.GetAsyncValue(); + shared_state_ptr->AndThen([shared_state = std::move(shared_state_), + execution_context = + std::move(execution_context_), + then = std::move(then)]() mutable { + future_internal::InvokeThen(std::move(then), shared_state.GetAsyncValue(), + future_internal::ArgTag()); + }); + } + + template >> + typename std::enable_if, void>::type Then(F then) && { + CHECK(shared_state_); // Crash OK + auto* shared_state_ptr = shared_state_.GetAsyncValue(); + shared_state_ptr->AndThen( + [shared_state = std::move(shared_state_), + execution_context = std::move(execution_context_), + then = std::move(then)]() mutable { std::move(then)(); }); + } + + void HandleError(Value* arg) { + if (!shared_state_ || shared_state_.IsAvailable()) { + // This is an empty handle or it is already finished. 
+ return; + } + + auto& execution_context = *arg->Get(); + execution_context.LogError(absl::InternalError(absl::StrCat( + "UnwindOnError: unwind AsyncHandle of context ", + absl::Hex(reinterpret_cast(execution_context_.get())), + " from context ", + absl::Hex(reinterpret_cast(&execution_context)), + " of state ", execution_context.state_))); + execution_context.Await(std::move(*this)); + } + + bool IsReady() const { return shared_state_.IsAvailable(); } + bool IsError() const { return shared_state_.IsError(); } + + const absl::Status& GetError() const { return shared_state_.GetError(); } + + ExecutionContext& execution_context() { return *execution_context_; } + + private: + AsyncHandle(std::unique_ptr execution_context, + tsl::AsyncValueRef shared_state) + : execution_context_(std::move(execution_context)), + shared_state_(std::move(shared_state)) { + DCHECK(execution_context_); + DCHECK(shared_state_); + } + + std::unique_ptr execution_context_; + tsl::AsyncValueRef shared_state_; +}; + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ASYNC_HANDLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h new file mode 100644 index 00000000..485aeceb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h @@ -0,0 +1,87 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ATTRIBUTE_SPAN_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ATTRIBUTE_SPAN_H_ + +#include +#include + +#include "absl/log/check.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/span.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/iterator.h" + +namespace mlrt { +namespace attribute_internal { + +// LINT.IfChange(mlrt_attributes) +template +inline constexpr bool kCanAttributeBeInlined = + (std::is_integral_v || + std::is_floating_point_v)&&(sizeof(T) <= sizeof(uint32_t)); +// LINT.ThenChange(../../../../compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc:mlrt_attributes) + +} // namespace attribute_internal + +class AttributeSpan { + class Iterator + : public iterator_internal::IteratorBase> { + public: + using IteratorBase>::IteratorBase; + }; + + public: + using value_type = bc::String; + using iterator = Iterator; + using const_iterator = iterator; + + AttributeSpan(bc::Span attr_indices, + bc::Span attributes) + : attr_indices_(attr_indices), attributes_(attributes) {} + + bc::String operator[](size_t id) const { + return attributes_[attr_indices_[id]]; + } + + template + T GetAs(size_t id) const { + if constexpr (std::is_same_v) { + return attributes_[attr_indices_[id]]; + } + + if constexpr (attribute_internal::kCanAttributeBeInlined) { + return bc::AccessTraits::Read(attr_indices_.data(id)); + } + + return bc::AccessTraits::Read(attributes_[attr_indices_[id]].data()); + } + + size_t size() const { return attr_indices_.size(); } + + iterator begin() const { + return iterator(attr_indices_.begin(), attributes_); + } + iterator end() const { return iterator(attr_indices_.end(), attributes_); } + + private: + bc::Span attr_indices_; + bc::Span attributes_; +}; + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ATTRIBUTE_SPAN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/builtin_kernels.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/builtin_kernels.h new file mode 100644 index 00000000..11273211 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/builtin_kernels.h @@ -0,0 +1,73 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_BUILTIN_KERNELS_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_BUILTIN_KERNELS_H_ + +#include + +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/future.h" +#include "tsl/profiler/lib/traceme.h" + +namespace mlrt { + +void CallOp(KernelFrame& frame); +void ReturnOp(KernelFrame& frame); + +void AsyncOp(KernelFrame& frame); +void AwaitHandleOp(KernelFrame& frame); + +// The base class for the PromiseReturnOp. 
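+//
+// For illustration only, a hypothetical kernel derived from it (the names
+// `MyPromiseReturnOp` and `"test.promise_return"` are made up, and the
+// argument layout is only an assumption for this sketch):
+//
+//   class MyPromiseReturnOp
+//       : public PromiseReturnOpBase<MyPromiseReturnOp> {
+//    public:
+//     using PromiseReturnOpBase::PromiseReturnOpBase;
+//     static constexpr char kName[] = "test.promise_return";
+//
+//     mlrt::Promise& promise() const {
+//       return arguments()[0].Get<mlrt::Promise>();
+//     }
+//     int32_t value() const { return arguments()[1].Get<int32_t>(); }
+//     bool value_last_use() const { return last_uses()[1]; }
+//   };
+//
+//   // Registration then picks up kName and Invoke():
+//   registry.Register<MyPromiseReturnOp>();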
+template +class PromiseReturnOpBase : public KernelFrame { + public: + using KernelFrame::KernelFrame; + + Promise& promise() const { + return static_cast(this)->promise(); + } + + decltype(auto) value() const { + return static_cast(this)->value(); + } + + bool value_last_use() const { + return static_cast(this)->value_last_use(); + } + + void Invoke() { + tsl::profiler::TraceMe trace_me(Derived::kName); + + // Set the execution context to kReturn state so that the callbacks in the + // futures, which may invoke Resume(), knows we are exiting. + execution_context().Return({}); + auto& p = promise(); + + using ValueType = std::decay_t; + + decltype(auto) value = this->value(); + if (value_last_use()) { + std::move(p).template Set(std::move(value)); + } else { + std::move(p).template Set(value); + } + } +}; + +void RegisterBuiltinKernels(KernelRegistry& registry); + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_BUILTIN_KERNELS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/context.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/context.h new file mode 100644 index 00000000..35329ced --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/context.h @@ -0,0 +1,595 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_CONTEXT_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_CONTEXT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" +#include "absl/functional/any_invocable.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/function.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/kernel.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/span.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/register_span.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime + +namespace mlrt { + +class KernelFrame; +class ExecutionContext; + +class Future; +template +Future AwaitAll(FutureLikeContainer futures, ResultRefContainer results); +template +Future AwaitAll(FutureLikeContainer futures); + +using KernelImplementation = void (*)(KernelFrame); + +class KernelRegistry { + public: + void Register(absl::string_view name, KernelImplementation kernel); + + KernelImplementation Get(absl::string_view name) const; + + template + void Register(absl::string_view name); + + template + void Register() { + Register(KernelClass::kName); + } + + void Merge(const KernelRegistry& other); + + private: + absl::flat_hash_map map_; +}; + +class LoadedExecutable { + public: + LoadedExecutable(bc::Executable executable, + const KernelRegistry& kernel_registry); + + absl::Span kernels() const { return kernels_; } + + bc::Function GetFunction(absl::string_view name) const { + if (auto iter = functions_.find(name); iter != functions_.end()) { + return iter->second; + } + + return nullptr; + } + + bc::Executable executable() const { return executable_; } + + private: + bc::Executable executable_; + + absl::flat_hash_map functions_; + std::vector kernels_; +}; + +// A helper structure that holds states for a kernel. Typical usuage is that a +// control kernel wants to call a function and then come back to the same +// kernel, e.g. WhileOp. +struct KernelContext { + // Any non-zero value indicates the kernel just reentered. + int reenter = 0; + // Registers for callee. + std::vector registers; +}; + +namespace execute_internal { + +void UnwindOnError(ExecutionContext& context, int64_t pc); + +} + +class FunctionContext { + public: + FunctionContext(bc::Function function, ExecutionContext* execution_context) + : pc_(0), + registers_(function.num_regs()), + function_object_(function), + execution_context_(execution_context) { + DCHECK(execution_context); + } + + FunctionContext(const FunctionContext&) = delete; + FunctionContext& operator=(const FunctionContext&) = delete; + FunctionContext(FunctionContext&&) = default; + FunctionContext& operator=(FunctionContext&&) = default; + + ExecutionContext& execution_context() { return *execution_context_; } + + const bc::Function& function_object() const { return function_object_; } + + absl::Span regs() { return absl::MakeSpan(registers_); } + + // Argument passing is via either copy or move. 
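+  // For illustration (hypothetical values): with last_uses = {true, false},
+  // args[0] is moved into its input register while args[1] is copied, so the
+  // caller may still read args[1] after the call, e.g.
+  //
+  //   function_context.Call(last_uses, absl::MakeSpan(args),
+  //                         absl::MakeSpan(results));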
+ template + void Call(bc::Span last_uses, Args args, Results results) { + auto idx_iter = function_object_.input_regs().begin(); + + DCHECK_EQ(function_object_.input_regs().size(), args.size()); + + DCHECK_EQ(args.size(), last_uses.size()); + auto last_use_iter = last_uses.begin(); + for (auto& arg : args) { + if (*last_use_iter) { + registers_[*idx_iter] = std::move(arg); + } else { + registers_[*idx_iter] = arg; + } + ++idx_iter; + ++last_use_iter; + } + + results_.reserve(results.size()); + for (auto& result : results) { + results_.push_back(&result); + } + } + + // Argument passing is via move. + template + void CallByMove(Args args, Results results) { + auto idx_iter = function_object_.input_regs().begin(); + + DCHECK_EQ(function_object_.input_regs().size(), args.size()); + + for (auto& arg : args) { + registers_[*idx_iter] = std::move(arg); + ++idx_iter; + } + + results_.reserve(results.size()); + for (auto& result : results) { + results_.push_back(&result); + } + } + + // The return operation copies or moves (if not a ref) the results. + void Return(RegisterSpan results) { + DCHECK_EQ(results.size(), function_object_.output_regs().size()); + auto result_iter = results.begin(); + auto output_last_uses = function_object_.output_last_uses(); + + for (int i = 0; i < results_.size(); ++i) { + auto* result = results_[i]; + + if (!output_last_uses.empty() && output_last_uses[i]) { + // We only move the result only if it is the last use. + *result = std::move(*result_iter); + } else { + *result = *result_iter; + } + ++result_iter; + } + } + + const KernelContext& kernel_context() const { return kernel_context_; } + KernelContext& kernel_context() { return kernel_context_; } + + private: + int64_t pc_; + std::vector registers_; + std::vector results_; + bc::Function function_object_; + KernelContext kernel_context_; + + ExecutionContext* execution_context_ = nullptr; + + friend class ExecutionContext; + friend void Execute(ExecutionContext& context); + friend void execute_internal::UnwindOnError(ExecutionContext& context, + int64_t pc); +}; + +namespace context_internal { + +inline std::atomic& GetNextId() { + static std::atomic next_id = 0; + return next_id; +} + +class UserContextBase { + public: + virtual ~UserContextBase(); + + virtual std::unique_ptr Copy() const = 0; +}; + +} // namespace context_internal + +// Every user context should inherit from this class. Internally it generates a +// unique id for each user context type for internal management. 
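+//
+// For illustration only, a minimal sketch (the type name is hypothetical) of a
+// user context and how it is attached to and fetched from an ExecutionContext:
+//
+//   class MyUserContext : public mlrt::UserContext<MyUserContext> {
+//    public:
+//     explicit MyUserContext(int value) : value_(value) {}
+//     int value() const { return value_; }
+//
+//    private:
+//     int value_ = 0;
+//   };
+//
+//   execution_context.AddUserContext(std::make_unique<MyUserContext>(42));
+//   int v = execution_context.GetUserContext<MyUserContext>().value();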
+template +class UserContext : public context_internal::UserContextBase { + public: + using Base = context_internal::UserContextBase; + + static int id() { return id_; } + + std::unique_ptr Copy() const final { + return std::make_unique(*static_cast(this)); + } + + private: + inline static int id_ = context_internal::GetNextId()++; +}; + +class ExecutionContext { + public: + explicit ExecutionContext(const LoadedExecutable* loaded_executable) + : user_contexts_(context_internal::GetNextId().load()), + loaded_executable_(loaded_executable) {} + + ExecutionContext( + const LoadedExecutable* loaded_executable, + std::vector> + user_contexts, + const std::vector>& user_error_loggers) + : user_contexts_(std::move(user_contexts)), + user_error_loggers_(user_error_loggers), + loaded_executable_(loaded_executable) {} + + void set_exit_handler(absl::AnyInvocable exit_handler) { + exit_handler_ = std::move(exit_handler); + } + + tfrt::ConcurrentWorkQueue* work_queue() const { return work_queue_; } + + void set_work_queue(tfrt::ConcurrentWorkQueue* work_queue) { + work_queue_ = work_queue; + } + + template + void Call(bc::Function function_object, bc::Span last_uses, + Args args, Results results) { + auto& function_context = + function_stack_.emplace_back(function_object, this); + function_context.Call(last_uses, args, results); + state_ = State::kReady; + } + + template + void CallByMove(bc::Function function_object, Args args, Results results) { + auto& function_context = + function_stack_.emplace_back(function_object, this); + function_context.CallByMove(args, results); + state_ = State::kReady; + } + + void Return(RegisterSpan results) { + auto& function_context = function_stack_.back(); + function_context.Return(results); + state_ = State::kReturn; + } + + size_t function_stack_size() const { return function_stack_.size(); } + FunctionContext& function_context() { return function_stack_.back(); } + + // Enqueues the current execution to the wait list of the `future`. Once the + // `future` is ready, the execution will be resumed. And the value will be + // populated in `result` if it is not an error. 
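+  // For illustration (hypothetical future and value type): awaiting a Future
+  // that will eventually hold an int32_t might look like:
+  //
+  //   Value result;
+  //   execution_context.Await<int32_t>(std::move(future), &result);
+  //   // On success `result` is populated; on error the context transitions
+  //   // to an error state.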
+ template + void Await(FutureLike future, Value* result) { + if (future.IsReady()) { + if (future.IsError()) { + Fail(future.GetError()); + } else { + std::move(future).Then( + [result](T value) { result->Set(std::move(value)); }); + } + return; + } + + state_ = State::kSuspended; + suspend_handler_ = [this, result, future = std::move(future)]( + absl::AnyInvocable resume) mutable { + std::move(future).Then([this, result, resume = std::move(resume)]( + absl::StatusOr value) mutable { + if (!value.ok()) { + Fail(std::move(value).status()); + } else { + result->Set(*std::move(value)); + state_ = State::kRunning; + } + + std::move(resume)(); + }); + }; + } + + template + void Await(FutureLike future) { + if (future.IsReady()) { + if (future.IsError()) { + Fail(future.GetError()); + } + return; + } + + state_ = State::kSuspended; + suspend_handler_ = [this, future = std::move(future)]( + absl::AnyInvocable resume) mutable { + std::move(future).Then( + [this, resume = std::move(resume)](absl::Status status) mutable { + if (!status.ok()) { + Fail(std::move(status)); + } else { + state_ = State::kRunning; + } + + std::move(resume)(); + }); + }; + } + + template + ABSL_ATTRIBUTE_ALWAYS_INLINE void AwaitAll(FutureLikeContainer futures, + ResultRefContainer results) { + auto future = mlrt::AwaitAll(futures, results); + + if (future.IsReady()) { + if (future.IsError()) { + Fail(future.GetError()); + } + return; + } + + state_ = State::kSuspended; + suspend_handler_ = [this, future = std::move(future)]( + absl::AnyInvocable resume) mutable { + std::move(future).Then( + [this, resume = std::move(resume)](absl::Status status) mutable { + state_ = State::kRunning; + + if (!status.ok()) { + Fail(status); + } + + std::move(resume)(); + }); + }; + } + + template + ABSL_ATTRIBUTE_ALWAYS_INLINE void AwaitAll(FutureLikeContainer futures) { + auto future = mlrt::AwaitAll(futures); + + if (future.IsReady()) { + if (future.IsError()) { + Fail(future.GetError()); + } + return; + } + + state_ = State::kSuspended; + suspend_handler_ = [this, future = std::move(future)]( + absl::AnyInvocable resume) mutable { + std::move(future).Then( + [this, resume = std::move(resume)](absl::Status status) mutable { + state_ = State::kRunning; + + if (!status.ok()) { + Fail(status); + } + + std::move(resume)(); + }); + }; + } + + const LoadedExecutable& loaded_executable() const { + return *loaded_executable_; + } + + void Fail(absl::Status status) { + state_ = State::kError; + status_ = std::move(status); + } + + void FailOnCancellation() { Fail(absl::CancelledError()); } + + const absl::Status& status() const { return status_; } + + // Add an instance of user context to the execution context. + template + void AddUserContext(std::unique_ptr user_context) { + static_assert(std::is_base_of_v, T>); + DCHECK_LT(T::id(), user_contexts_.size()); + user_contexts_[T::id()] = std::move(user_context); + } + + // Return an reference to the user context. 
+ template + T& GetUserContext() const { + static_assert(std::is_base_of_v, T>); + DCHECK_LT(T::id(), user_contexts_.size()); + return *static_cast(user_contexts_[T::id()].get()); + } + + std::vector> + CopyUserContexts() const { + std::vector> + user_contexts; + user_contexts.reserve(user_contexts_.size()); + for (const auto& user_context : user_contexts_) { + if (user_context) { + user_contexts.push_back(user_context->Copy()); + } else { + user_contexts.push_back(nullptr); + } + } + return user_contexts; + } + + void AddUserErrorLogger(std::function error_logger) { + user_error_loggers_.push_back(error_logger); + } + + const std::vector>& user_error_loggers() + const { + return user_error_loggers_; + } + + void LogError(absl::Status status) { + for (auto& error_logger : user_error_loggers_) { + error_logger(status); + } + } + + enum class State { + // The function is pushed to the stack, and ready for execution. + kReady = 0, + + // The function is being executed and has not reached the return op yet. + kRunning, + + // The function finished executing the return op, and ready for being popped + // from the stack. + kReturn, + + // The function is suspended from execution due to context switches. + kSuspended, + + // The execution reports an error in the current thread, and the execution + // will be aborted by cleaning the states. + kError + }; + State state() const { return state_; } + + private: + absl::InlinedVector function_stack_; + + State state_ = State::kReady; + + absl::Status status_; + + // The `suspend_handler_` is a callable whose argument is another callable + // that resumes the execution (or error handling). + absl::AnyInvocable resume) &&> + suspend_handler_; + absl::AnyInvocable exit_handler_; + + tfrt::ConcurrentWorkQueue* work_queue_ = nullptr; + + std::vector> + user_contexts_; + + std::vector> user_error_loggers_; + + const LoadedExecutable* loaded_executable_ = nullptr; + + friend class AsyncHandle; + friend void Execute(ExecutionContext& context); + friend void execute_internal::UnwindOnError(ExecutionContext& context, + int64_t pc); +}; + +class KernelFrame { + public: + struct State { + State(absl::Span regs, bc::Span attrs, + ExecutionContext* execution_context) + : regs(regs), attrs(attrs), execution_context(execution_context) { + DCHECK(execution_context); + } + + explicit State(FunctionContext* function_context) + : State(function_context->regs(), + function_context->execution_context() + .loaded_executable() + .executable() + .attributes(), + &function_context->execution_context()) {} + + bc::Kernel kernel; + absl::Span regs; + bc::Span attrs; + ExecutionContext* execution_context = nullptr; + }; + + explicit KernelFrame(State* state) : state_(state) { DCHECK(state_); } + + template + operator T() const { // NOLINT + return T(state_); + } + + RegisterSpan arguments() const { + return RegisterSpan(kernel().arguments(), regs()); + } + + RegisterSpan results() const { + return RegisterSpan(kernel().results(), regs()); + } + + AttributeSpan attributes() const { + return AttributeSpan(kernel().attributes(), attrs()); + } + + bc::Span last_uses() const { return kernel().last_uses(); } + + ExecutionContext& execution_context() { return *state_->execution_context; } + const ExecutionContext& execution_context() const { + return *state_->execution_context; + } + + void set_kernel(bc::Kernel kernel) { this->kernel() = kernel; } + + private: + bc::Kernel& kernel() { return state_->kernel; } + const bc::Kernel& kernel() const { return state_->kernel; } + + absl::Span 
regs() const { return state_->regs; } + bc::Span attrs() const { return state_->attrs; } + + State* state_ = nullptr; + + friend void Execute(ExecutionContext& context); +}; + +template +inline void KernelRegistry::Register(absl::string_view name) { + Register( + name, +[](KernelFrame frame) { KernelClass(frame).Invoke(); }); +} + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/execute.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/execute.h new file mode 100644 index 00000000..7492d44a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/execute.h @@ -0,0 +1,26 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_EXECUTE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_EXECUTE_H_ + +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" + +namespace mlrt { + +void Execute(ExecutionContext& context); + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_EXECUTE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/future.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/future.h new file mode 100644 index 00000000..fd32214c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/future.h @@ -0,0 +1,348 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_FUTURE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_FUTURE_H_ + +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/log/check.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tfrt/concurrency/async_value.h" // from @tf_runtime +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime + +namespace mlrt { +namespace future_internal { + +// The overloads of GetArgumentType() are used to get the argument type of a +// callable. 
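+//
+// For illustration (a sketch): for a callback such as `[](absl::Status s) {}`,
+// ArgumentType<F> below deduces absl::Status, which selects the InvokeThen()
+// overload that forwards the future's status:
+//
+//   auto cb = [](absl::Status s) {};
+//   static_assert(std::is_same_v<ArgumentType<decltype(cb)>, absl::Status>);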
+void GetArgumentType(void (*)()); +template +void GetArgumentType(void (F::*)()); +template +void GetArgumentType(void (F::*)() const); +template +Arg GetArgumentType(void (*)(Arg)); +template +Arg GetArgumentType(void (F::*)(Arg)); +template +Arg GetArgumentType(void (F::*)(Arg) const); +template +decltype(GetArgumentType(&F::operator())) GetArgumentType(F); + +template +using ArgumentType = decltype(GetArgumentType(std::declval())); + +template +struct ArgTag {}; + +// The overloads of InvokeThen() are used to invoke different implementation +// according to `then`'s argument type. +template +ABSL_ATTRIBUTE_ALWAYS_INLINE void InvokeThen(F&& then, + tsl::AsyncValue* shared_state, + ArgTag) { + auto& arg = shared_state->get(); + if (shared_state->IsUnique()) { + std::forward(then)(std::move(arg)); + } else { + std::forward(then)(arg); + } +} + +template +ABSL_ATTRIBUTE_ALWAYS_INLINE void InvokeThen(F&& then, + tsl::AsyncValue* shared_state, + ArgTag) { + if (shared_state->IsError()) { + std::forward(then)(shared_state->GetError()); + } else { + std::forward(then)(absl::OkStatus()); + } +} + +template +ABSL_ATTRIBUTE_ALWAYS_INLINE void InvokeThen(F&& then, + tsl::AsyncValue* shared_state, + ArgTag>) { + if (shared_state->IsError()) { + std::forward(then)(shared_state->GetError()); + } else { + InvokeThen(std::forward(then), shared_state, ArgTag()); + } +} + +} // namespace future_internal + +struct Control {}; + +// mlrt::Future is similar to std::shared_future but type-erased. +class Future { + public: + // Constructs a mlrt::Future directly from tsl::AsyncValue. This is used to + // integrate MLRT with existing systems that uses AsyncValue directly. For new + // use cases, creating mlrt::Future through mlrt::Promise is preferred. + template + explicit Future(tsl::AsyncValueRef async_value) + : shared_state_(std::move(async_value)) {} + + Future(const Future& other) = default; + Future& operator=(const Future& other) = default; + Future(Future&& other) = default; + Future& operator=(Future&& other) = default; + + explicit operator bool() const { return shared_state_ != nullptr; } + + bool IsReady() const { + DCHECK(shared_state_); + return shared_state_->IsAvailable(); + } + + bool IsError() const { + DCHECK(shared_state_); + return shared_state_->IsError(); + } + + template + const T& Get() const { + DCHECK(shared_state_); + return shared_state_->get(); + } + + const absl::Status& GetError() const { + DCHECK(shared_state_); + return shared_state_->GetError(); + } + + // Then() enqueues a callback which will be called when the future is + // fulfilled with either an error or a value. + // + // The following Then() overloads accept a callback with the following + // signatures: + // + // 1) void(absl::StatusOr) + // The argument can be either the error or the value. + // + // 2) void(absl::Status) + // The argument is the status of this future in ready state. + // + // 3) void(T) + // The argument is the fulfilled value. It is undefined behavior if there + // is an error. + // + // 4) void() + // There is no argument. The callback will be called whenever it is ready. 
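// [Editorial note, not part of the vendored header] An illustrative usage
// sketch for the Then() overload forms documented above. It assumes only the
// mlrt::Promise / mlrt::Future API declared in this file; ThenOverloadsExample
// is a hypothetical helper.
#include <utility>

#include "absl/status/statusor.h"
#include "tensorflow/core/tfrt/mlrt/interpreter/future.h"

inline void ThenOverloadsExample() {
  auto promise = mlrt::Promise::Allocate<int>();
  mlrt::Future future = promise.GetFuture();

  // Form (1): the callback receives either the value or the error.
  mlrt::Future(future).Then(
      [](absl::StatusOr<int> v) { /* use *v or v.status() */ });

  // Form (4): no argument; runs once the future is ready. Then() is
  // &&-qualified, hence the move of the last copy.
  std::move(future).Then([]() { /* ready */ });

  // Fulfilling the promise runs every callback queued above.
  std::move(promise).Set<int>(42);
}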
+ + template >> + typename std::enable_if_t, void> Then(F then) && { + DCHECK(shared_state_); + auto* shared_state_ptr = shared_state_.get(); + shared_state_ptr->AndThen([shared_state = std::move(shared_state_), + then = std::move(then)]() mutable { + future_internal::InvokeThen(std::move(then), shared_state.get(), + future_internal::ArgTag()); + }); + } + + template >> + typename std::enable_if_t, void> Then(F then) && { + DCHECK(shared_state_); + auto* shared_state_ptr = shared_state_.get(); + shared_state_ptr->AndThen( + [shared_state = std::move(shared_state_), + then = std::move(then)]() mutable { std::move(then)(); }); + } + + size_t UseCount() const { + DCHECK(shared_state_); + return shared_state_->NumRef(); + } + + // We don't need HandleError() method for Future because + // AsyncHandle::HandleError() is enough for error handling for async + // execution. + + private: + friend class Promise; + + explicit Future(tsl::RCReference shared_state) + : shared_state_(std::move(shared_state)) {} + + tsl::RCReference shared_state_; +}; + +// mlrt::Promise is similar to std::promise but type-erased. +class Promise { + public: + template + static Promise Allocate() { + return Promise(tsl::MakeUnconstructedAsyncValueRef().ReleaseRCRef()); + } + + ~Promise() { + DCHECK(!shared_state_ || shared_state_->IsAvailable()) + << "A non-empty promise must be fulfilled."; + } + + Promise(const Promise&) = delete; + Promise& operator=(const Promise&) = delete; + Promise(Promise&&) = default; + Promise& operator=(Promise&&) = default; + + Future GetFuture() const { return Future(shared_state_); } + + template + void Set(Args&&... args) && { + DCHECK(shared_state_); + + auto shared_state = std::move(shared_state_); + auto* shared_state_ptr = shared_state.get(); + + // Since each waiter will hold a reference to the shared state, we can drop + // the reference in mlrt::Promise::Set() in order to trigger passing by move + // for the last waiter. + if (!shared_state->IsUnique()) { + shared_state.reset(); + } + + shared_state_ptr->emplace(std::forward(args)...); + } + + void SetError(absl::Status status) && { + DCHECK(shared_state_); + + DCHECK(!status.ok()); + shared_state_->SetError(std::move(status)); + shared_state_.reset(); + } + + void HandleError(Value* arg) && { + if (!shared_state_ || shared_state_->IsAvailable()) { + // This is an empty promise or it is already fulfilled. + return; + } + + auto& execution_context = *arg->Get(); + DCHECK(!execution_context.status().ok()); + + std::move(*this).SetError(execution_context.status()); + } + + explicit operator bool() const { return shared_state_ != nullptr; } + + private: + explicit Promise(tsl::RCReference shared_state) + : shared_state_(std::move(shared_state)) {} + + tsl::RCReference shared_state_; +}; + +namespace future_internal { + +struct State { + State(int size, mlrt::Promise promise) + : count(size), promise(std::move(promise)) {} + + std::atomic count; + mlrt::Promise promise; + + absl::Mutex mu; + absl::Status status; + + void SetError(absl::Status status) { + absl::MutexLock lock(&mu); + this->status = std::move(status); + } + + // Returns true if it is the last consumer of the state. If this method + // returns false, *this object might be destroyed anytime so the data can no + // longer be accessed after it returns false. 
+ bool DecrementCount() { + if (count.fetch_sub(1, std::memory_order_acq_rel) == 1) { + if (status.ok()) { + std::move(promise).Set(Control()); + } else { + std::move(promise).SetError(std::move(status)); + } + return true; + } + return false; + } +}; + +} // namespace future_internal + +template +ABSL_ATTRIBUTE_ALWAYS_INLINE Future AwaitAll(FutureLikeContainer futures, + ResultRefContainer results) { + DCHECK(!futures.empty()); + + auto promise = Promise::Allocate(); + auto await_all = promise.GetFuture(); + auto* state = new future_internal::State(futures.size(), std::move(promise)); + + DCHECK_EQ(futures.size(), results.size()); + for (int i = 0; i < futures.size(); ++i) { + auto& future = futures[i]; + std::move(future).Then( + [state, result = &results[i]](absl::StatusOr value) { + if (value.ok()) { + result->Set(std::move(*value)); + } else { + state->SetError(std::move(value).status()); + } + + if (state->DecrementCount()) { + delete state; + } + }); + } + + return await_all; +} + +template +ABSL_ATTRIBUTE_ALWAYS_INLINE Future AwaitAll(FutureLikeContainer futures) { + DCHECK(!futures.empty()); + + auto promise = Promise::Allocate(); + auto await_all = promise.GetFuture(); + auto* state = new future_internal::State(futures.size(), std::move(promise)); + + for (int i = 0; i < futures.size(); ++i) { + auto& future = futures[i]; + std::move(future).Then([state](absl::Status status) { + if (!status.ok()) { + state->SetError(std::move(status)); + } + + if (state->DecrementCount()) { + delete state; + } + }); + } + + return await_all; +} + +// TODO(chky): Implement type-safe version of Future and Promise. + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_FUTURE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h new file mode 100644 index 00000000..2b1d967a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h @@ -0,0 +1,126 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_INTERPRETER_TESTUTIL_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_INTERPRETER_TESTUTIL_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/functional/function_ref.h" +#include "absl/log/check.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h" + +namespace mlrt { +namespace testing { + +class SymbolTable { + public: + int Def(absl::string_view name) { + auto iter = reg_names_.find(name); + if (iter != reg_names_.end()) { + return iter->second; + } + + int& id = reg_names_[name]; + id = next_reg_id_++; + + return id; + } + + std::vector Def(absl::Span names) { + return DefOrUse(names, + [this](absl::string_view name) { return Def(name); }); + } + + int Use(absl::string_view name) const { + DCHECK(reg_names_.contains(name)); + return reg_names_.at(name); + } + + std::vector Use(absl::Span names) { + return DefOrUse(names, + [this](absl::string_view name) { return Use(name); }); + } + + size_t size() const { return reg_names_.size(); } + + private: + std::vector DefOrUse( + absl::Span names, + absl::FunctionRef def_or_use) { + std::vector ids; + ids.reserve(names.size()); + for (const auto& name : names) { + ids.push_back(def_or_use(name)); + } + return ids; + } + + absl::flat_hash_map reg_names_; + int next_reg_id_ = 0; +}; + +class AttributeTable { + public: + explicit AttributeTable(bc::Vector::Constructor attributes_ctor) + : ctor_(attributes_ctor) {} + + void Add(absl::string_view name, absl::string_view value) { + handles_[name] = next_id_; + ctor_.ConstructAt(next_id_++, value); + } + + void Add(absl::string_view name, const char* value) { + Add(name, absl::string_view(value)); + } + + void AddInline(absl::string_view name, absl::string_view value) { + DCHECK_LE(value.size(), sizeof(uint32_t)); + std::memcpy(&handles_[name], value.data(), value.size()); + } + + template , int> = 0> + void Add(absl::string_view name, T value) { + AddInline(name, absl::string_view(reinterpret_cast(&value), + sizeof(value))); + } + + template && + !attribute_internal::kCanAttributeBeInlined, + int> = 0> + void Add(absl::string_view name, T value) { + Add(name, absl::string_view(reinterpret_cast(&value), + sizeof(value))); + } + + uint32_t GetHandle(absl::string_view name) { return handles_.at(name); } + + private: + bc::Vector::Constructor ctor_; + int next_id_ = 0; + absl::flat_hash_map handles_; +}; + +} // namespace testing +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_INTERPRETER_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/iterator.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/iterator.h new file mode 100644 index 00000000..582e7def --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/iterator.h @@ -0,0 +1,131 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ITERATOR_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ITERATOR_H_ + +#include + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace mlrt { +namespace iterator_internal { + +template +class IteratorBase { + const Iter& self() const { return static_cast(*this); } + Iter& self() { return static_cast(*this); } + + public: + using difference_type = std::ptrdiff_t; + using value_type = ValueType; + using pointer = ValueType*; + using reference = ValueType&; + using iterator_category = std::random_access_iterator_tag; + + explicit IteratorBase(bc::ReadIterator index_iter, + ValueRangeType values) + : index_iter_(index_iter), values_(values) {} + + reference operator*() const { return values_[*index_iter_]; } + + pointer operator->() const { return &values_[*index_iter_]; } + + reference operator[](difference_type i) const { + return values_[*(index_iter_ + i)]; + } + + Iter& operator+=(difference_type d) { + index_iter_ += d; + return self(); + } + + Iter& operator-=(difference_type d) { + index_iter_ -= d; + return self(); + } + + Iter& operator++() { + ++index_iter_; + return self(); + } + + Iter operator++(int) { + Iter r = self(); + ++index_iter_; + return r; + } + + Iter& operator--() { + --index_iter_; + return self(); + } + + Iter operator--(int) { + Iter r = self(); + --index_iter_; + return r; + } + + Iter operator+(difference_type d) const { + Iter r = self(); + r += d; + return r; + } + + friend Iter operator+(difference_type d, const Iter& i) { return i + d; } + + Iter operator-(difference_type d) const { + Iter r = self(); + r -= d; + return r; + } + + difference_type operator-(const Iter& other) const { + return index_iter_ - other.index_iter_; + } + + friend bool operator==(const Iter& a, const Iter& b) { + return a.index_iter_ == b.index_iter_; + } + + friend bool operator!=(const Iter& a, const Iter& b) { + return a.index_iter_ != b.index_iter_; + } + + friend bool operator<(const Iter& a, const Iter& b) { + return a.index_iter_ < b.index_iter_; + } + + friend bool operator<=(const Iter& a, const Iter& b) { + return a.index_iter_ <= b.index_iter_; + } + + friend bool operator>(const Iter& a, const Iter& b) { + return a.index_iter_ > b.index_iter_; + } + + friend bool operator>=(const Iter& a, const Iter& b) { + return a.index_iter_ >= b.index_iter_; + } + + private: + bc::ReadIterator index_iter_; + ValueRangeType values_; +}; + +} // namespace iterator_internal +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_ITERATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/register_span.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/register_span.h new file mode 100644 index 00000000..1fd575ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/register_span.h @@ -0,0 +1,225 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_REGISTER_SPAN_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_REGISTER_SPAN_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/span.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/iterator.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" + +namespace mlrt { + +class RegisterIterator + : public iterator_internal::IteratorBase> { + public: + using IteratorBase>::IteratorBase; +}; + +class ConstRegisterIterator + : public iterator_internal::IteratorBase> { + using IteratorBase>::IteratorBase; +}; + +class RegisterSpan { + public: + using value_type = Value; + using size_type = size_t; + using difference_type = std::ptrdiff_t; + using reference = Value&; + using const_reference = const Value&; + using pointer = Value*; + using const_pointer = const Value*; + using iterator = RegisterIterator; + using const_iterator = ConstRegisterIterator; + + RegisterSpan() = default; + RegisterSpan(bc::Span reg_indices, absl::Span regs) + : reg_indices_(reg_indices), regs_(regs) {} + + Value& operator[](size_t idx) { return regs_[reg_indices_[idx]]; } + const Value& operator[](size_t idx) const { return regs_[reg_indices_[idx]]; } + Value& back() const { return regs_[reg_indices_.back()]; } + + size_t size() const { return reg_indices_.size(); } + + iterator begin() const { return iterator(reg_indices_.begin(), regs_); } + iterator end() const { return iterator(reg_indices_.end(), regs_); } + + RegisterSpan drop_front(int num = 1) { + return RegisterSpan(reg_indices_.drop_front(num), regs_); + } + + RegisterSpan drop_back(int num = 1) { + return RegisterSpan(reg_indices_.drop_back(num), regs_); + } + + private: + bc::Span reg_indices_; + absl::Span regs_; +}; + +template +class RegisterValueIterator { + using Iter = RegisterValueIterator; + + public: + using difference_type = std::ptrdiff_t; + using value_type = T; + using pointer = T*; + using reference = T&; + using iterator_category = std::random_access_iterator_tag; + + explicit RegisterValueIterator(RegisterIterator reg_iter) + : reg_iter_(reg_iter) {} + + reference operator*() const { return (*reg_iter_).Get(); } + + pointer operator->() const { return &(*reg_iter_).Get(); } + + reference operator[](difference_type i) const { + return (*(reg_iter_ + i)).Get(); + } + + Iter& operator+=(difference_type d) { + reg_iter_ += d; + return *this; + } + + Iter& operator-=(difference_type d) { + reg_iter_ -= d; + return *this; + } + + Iter& operator++() { + ++reg_iter_; + return *this; + } + + Iter operator++(int) { + Iter r = *this; + ++reg_iter_; + return r; + } + + Iter& operator--() { + --reg_iter_; + return *this; + } + + Iter operator--(int) { + Iter r = *this; + --reg_iter_; + return r; + } + + Iter operator+(difference_type d) const { + Iter r = *this; + r += d; + return r; + } + + friend 
Iter operator+(difference_type d, const Iter& i) { return i + d; } + + Iter operator-(difference_type d) const { + Iter r = *this; + r -= d; + return r; + } + + difference_type operator-(const Iter& other) const { + return reg_iter_ - other.reg_iter_; + } + + friend bool operator==(const Iter& a, const Iter& b) { + return a.reg_iter_ == b.reg_iter_; + } + + friend bool operator!=(const Iter& a, const Iter& b) { + return a.reg_iter_ != b.reg_iter_; + } + + friend bool operator<(const Iter& a, const Iter& b) { + return a.reg_iter_ < b.reg_iter_; + } + + friend bool operator<=(const Iter& a, const Iter& b) { + return a.reg_iter_ <= b.reg_iter_; + } + + friend bool operator>(const Iter& a, const Iter& b) { + return a.reg_iter_ > b.reg_iter_; + } + + friend bool operator>=(const Iter& a, const Iter& b) { + return a.reg_iter_ >= b.reg_iter_; + } + + private: + RegisterIterator reg_iter_; +}; + +template +class RegisterValueSpan { + public: + using value_type = T; + using size_type = size_t; + using difference_type = std::ptrdiff_t; + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + using iterator = RegisterValueIterator; + using const_iterator = RegisterValueIterator; + + RegisterValueSpan(bc::Span reg_indices, absl::Span regs) + : reg_span_(reg_indices, regs) {} + + // NOLINTNEXTLINE(google-explicit-constructor) + RegisterValueSpan(RegisterSpan reg_span) : reg_span_(reg_span) {} + + T& operator[](size_t idx) { return reg_span_[idx].Get(); } + const T& operator[](size_t idx) const { return reg_span_[idx].Get(); } + + void Destroy(size_t idx) { reg_span_[idx].Destroy(); } + + size_t size() const { return reg_span_.size(); } + + iterator begin() const { return iterator(reg_span_.begin()); } + iterator end() const { return iterator(reg_span_.end()); } + + bool empty() const { return size() == 0; } + + RegisterValueSpan drop_front(int num = 1) { + return reg_span_.drop_front(num); + } + + RegisterValueSpan drop_back(int num = 1) { return reg_span_.drop_back(num); } + + RegisterSpan reg_span() const { return reg_span_; } + + private: + RegisterSpan reg_span_; +}; + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_REGISTER_SPAN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/value.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/value.h new file mode 100644 index 00000000..a7113a7c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/interpreter/value.h @@ -0,0 +1,419 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_VALUE_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_VALUE_H_ + +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/log/log.h" + +namespace mlrt { + +class Value; + +namespace value_internal { + +struct InPlaceStorageT { + // Many tensor implementations like tensorflow::Tensor requires multiple + // words, and we'd like to keep these values inplace. + // + // TODO(chky): Consider a better size for inplace storage. + alignas(8) char data[56]; +}; + +template +using IsInPlaceStorage = + std::integral_constant::value>; + +// Since we type-erase the value to be put in class Value, we need to an enum +// value to select the operation that should be applied on the type-erased +// value. +enum class Action { + kDestroy = 0, // Destructor + kCopy, // Copy constructor/assignment + kMove, // Move constructor/assignment + kError, // Error handler + kTypeInfo // Get type info +}; + +struct TypeInfo {}; + +using HandlerFuncPtr = TypeInfo* (*)(Action, Value*, Value*); + +template +class InPlaceHandler; +template +class OutOfPlaceHandler; + +template +using Handler = std::conditional_t::value, + InPlaceHandler, OutOfPlaceHandler>; + +template +struct HasHandleError : std::false_type {}; + +template +struct HasHandleError< + T, std::void_t().HandleError(nullptr))>> + : std::true_type {}; + +} // namespace value_internal + +// A container for type-erased value. The value should be at least copy +// constructable to be put into this container. This container has both move and +// copy semantics, but if the concrete value does not support copy, calling the +// copy operations on this class will result in undefined behavior. +class alignas(64) Value { + public: + // Value is default constructible. The payload is unset in the default + // constructed Value. + Value() = default; + + Value(const Value&); + Value& operator=(const Value&); + Value(Value&&) noexcept; + Value& operator=(Value&&) noexcept; + + // Construct Value and store `t` as the payload. + template , Value>, + int>::type = 0> + explicit Value(T&& t); + + template , Value>, + int>::type = 0> + Value& operator=(T&& value) { + Set(std::forward(value)); + return *this; + } + + ~Value(); + + // Get() function returns the payload of the Value object in the requested + // type. + // + // Dynamic type checking is performed in the debug mode. + template + T& Get(); + + template + const T& Get() const; + + // Emplace() constructs the payload object of type T in place with the given + // args. If the value is already initialized, the original value will be + // destroyed. + template + void Emplace(Args&&... args); + + // Construct() constructs the payload object of type T in place with the given + // args. The value should be uninitialized before calling this method. + // Otherwise the behavior is undefined. + template + void Construct(Args&&... args); + + // Destroy() destroys the payload object of type T. The value must be already + // initialized with a value of type T. Otherwise the behavior is undefined. + template + void Destroy(); + + // Set() stores the argument `t` as the payload of Value. + template + void Set(T&& t); + + // Reset the Value object to empty. + void Reset(); + + // Call T::HandleError() method on the underlying value of type T. If T does + // not have a HandleError() method, this method does nothing. + void HandleError(Value& arg); + + // Check if Value contains a payload. 
+ bool HasValue() const { return handler_ != nullptr; } + + // Check if Value contains object of type T. + template + bool IsType() const; + + // Check if object of type T is stored in place. + template + static constexpr bool IsInPlace() { + return value_internal::IsInPlaceStorage::value; + } + + private: + union { + value_internal::InPlaceStorageT storage_{}; + void* value_; + }; + value_internal::HandlerFuncPtr handler_ = nullptr; + + template + friend class value_internal::InPlaceHandler; + template + friend class value_internal::OutOfPlaceHandler; +}; + +// We only optimize the code for 64-bit architectures for now. +static_assert(sizeof(Value) == 64 || sizeof(void*) != 8); + +// ----------------------------------------------------------- +// Implementation details. + +namespace value_internal { + +template +TypeInfo* GetTypeInfo(); + +template ::value, int> = 0> +void HandleErrorInternal(Value* self, Value* arg) { + std::move(self->Get()).HandleError(arg); +} + +template ::value, int> = 0> +static void HandleErrorInternal(Value* self, Value* arg) {} + +template +struct InPlaceHandler { + template + static void Construct(Value* self, Args&&... args) { + new (&self->storage_) T(std::forward(args)...); + self->handler_ = &Handle; + } + + static TypeInfo* Handle(Action action, Value* self, Value* other) { + switch (action) { + case Action::kDestroy: + Destroy(self); + return nullptr; + case Action::kCopy: + Copy(self, other); + return nullptr; + case Action::kMove: + Move(self, other); + return nullptr; + case Action::kError: + HandleError(self, other); + return nullptr; + case Action::kTypeInfo: + return GetTypeInfo(); + } + } + + static void Destroy(Value* self) { + DCHECK(self->HasValue()); + auto* p = std::launder(reinterpret_cast(&self->storage_)); + p->~T(); + self->handler_ = nullptr; + } + + template ::value, int> = 0> + static void CopyInternal(Value* self, Value* dest) { + DCHECK(self->HasValue() && !dest->HasValue()); + Construct(dest, *std::launder(reinterpret_cast(&self->storage_))); + } + + template ::value, int> = 0> + static void CopyInternal(Value* self, Value* dest) { + LOG(FATAL) << "Copying a mlrt::Value whose underlying type is " // Crash Ok + "not copyable is a runtime error."; + } + + static void Copy(Value* self, Value* dest) { CopyInternal(self, dest); } + + static void Move(Value* self, Value* dest) { + DCHECK(self->HasValue() && !dest->HasValue()); + Construct(dest, + std::move(*std::launder(reinterpret_cast(&self->storage_)))); + Destroy(self); + } + + static void HandleError(Value* self, Value* arg) { + HandleErrorInternal(self, arg); + } +}; + +template +struct OutOfPlaceHandler { + template + static void Construct(Value* self, Args&&... 
args) { + self->value_ = new T(std::forward(args)...); + self->handler_ = &Handle; + } + + static TypeInfo* Handle(Action action, Value* self, Value* other) { + switch (action) { + case Action::kDestroy: + Destroy(self); + return nullptr; + case Action::kCopy: + Copy(self, other); + return nullptr; + case Action::kMove: + Move(self, other); + return nullptr; + case Action::kError: + HandleError(self, other); + return nullptr; + case Action::kTypeInfo: + return GetTypeInfo(); + } + } + + static void Destroy(Value* self) { + DCHECK(self->HasValue()); + delete static_cast(self->value_); + self->handler_ = nullptr; + } + + template ::value, int> = 0> + static void CopyInternal(Value* self, Value* dest) { + DCHECK(self->HasValue() && !dest->HasValue()); + Construct(dest, *static_cast(self->value_)); + } + + template ::value, int> = 0> + static void CopyInternal(Value* self, Value* dest) { + LOG(FATAL) << "Copying a mlrt::Value whose underlying type is " // Crash Ok + "not copyable is a runtime error."; + } + + static void Copy(Value* self, Value* dest) { CopyInternal(self, dest); } + + static void Move(Value* self, Value* dest) { + DCHECK(self->HasValue() && !dest->HasValue()); + dest->value_ = self->value_; + dest->handler_ = &Handle; + self->handler_ = nullptr; + } + + static void HandleError(Value* self, Value* arg) { + HandleErrorInternal(self, arg); + } +}; + +template +__attribute__((noinline)) TypeInfo* GetTypeInfo() { + static TypeInfo kTypeInfo; + return &kTypeInfo; +} + +} // namespace value_internal + +template , Value>, int>::type> +Value::Value(T&& t) { + Construct>(std::forward(t)); +} + +inline Value::Value(const Value& v) { + if (v.HasValue()) + v.handler_(value_internal::Action::kCopy, const_cast(&v), this); +} + +inline Value& Value::operator=(const Value& v) { + Reset(); + if (v.HasValue()) + v.handler_(value_internal::Action::kCopy, const_cast(&v), this); + return *this; +} + +inline Value::Value(Value&& v) noexcept { + if (v.HasValue()) v.handler_(value_internal::Action::kMove, &v, this); +} + +inline Value& Value::operator=(Value&& v) noexcept { + Reset(); + if (v.HasValue()) v.handler_(value_internal::Action::kMove, &v, this); + return *this; +} + +inline void Value::HandleError(Value& arg) { + if (HasValue()) handler_(value_internal::Action::kError, this, &arg); +} + +inline Value::~Value() { Reset(); } + +template +T& Value::Get() { + return const_cast(static_cast(this)->Get()); +} + +template +const T& Value::Get() const { + DCHECK(IsType()); + + if constexpr (IsInPlace()) { + return *std::launder(reinterpret_cast(&storage_)); + } + + return *static_cast(value_); +} + +// Emplace() constructs the payload object of type T in place with the given +// args. +template +void Value::Emplace(Args&&... args) { + Reset(); + Construct>(std::forward(args)...); +} + +// Set() stores the argument `t` as the payload of Value. +template +void Value::Set(T&& t) { + Emplace(std::forward(t)); +} + +template +void Value::Construct(Args&&... args) { + DCHECK(!HasValue()); + static_assert(!std::is_same_v); + value_internal::Handler::Construct(this, std::forward(args)...); +} + +template +void Value::Destroy() { + DCHECK(HasValue()); + DCHECK(IsType()); + static_assert(!std::is_same_v); + value_internal::Handler::Destroy(this); +} + +// Reset the Value object to empty. 
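// [Editorial note, not part of the vendored header] An illustrative usage
// sketch that exercises only the mlrt::Value API declared above; ValueExample
// is a hypothetical helper.
#include <string>

#include "tensorflow/core/tfrt/mlrt/interpreter/value.h"

inline void ValueExample() {
  mlrt::Value v(std::string("hello"));  // type-erased payload
  v.Get<std::string>() += " world";     // Get<T>() is type-checked in debug builds

  v.Emplace<int>(42);  // destroys the string, then stores an int
  // Small payloads such as int fit into the in-place storage.
  static_assert(mlrt::Value::IsInPlace<int>());

  v.Reset();  // back to the empty state; v.HasValue() is now false
}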
+inline void Value::Reset() { + if (handler_ == nullptr) return; + handler_(value_internal::Action::kDestroy, this, nullptr); +} + +template +bool Value::IsType() const { + return handler_(value_internal::Action::kTypeInfo, const_cast(this), + nullptr) == value_internal::GetTypeInfo(); +} + +} // namespace mlrt + +#endif // TENSORFLOW_CORE_TFRT_MLRT_INTERPRETER_VALUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.h new file mode 100644 index 00000000..a7b8e5f1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.h @@ -0,0 +1,28 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_KERNEL_BATCH_KERNEL_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_KERNEL_BATCH_KERNEL_H_ + +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" + +namespace tensorflow { +namespace tf_mlrt { + +void RegisterTfMlrtBatchKernels(mlrt::KernelRegistry& registry); + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_KERNEL_BATCH_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/context.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/context.h new file mode 100644 index 00000000..fa682f22 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/context.h @@ -0,0 +1,132 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_KERNEL_CONTEXT_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_KERNEL_CONTEXT_H_ + +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tf_mlrt { + +// The context for tensorflow::OpKernel. 
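// [Editorial note, not part of the vendored header] An illustrative sketch of
// how a kernel body typically reaches this per-request Context through
// mlrt::ExecutionContext::GetUserContext<T>() declared earlier in this diff.
// ExampleKernel and its kName are hypothetical; IsCancelled(), Fail() and
// KernelRegistry::Register<KernelClass>() are the APIs shown elsewhere in
// these headers.
#include "absl/status/status.h"
#include "tensorflow/core/tfrt/mlrt/interpreter/context.h"
#include "tensorflow/core/tfrt/mlrt/kernel/context.h"

struct ExampleKernel : mlrt::KernelFrame {
  using KernelFrame::KernelFrame;
  static constexpr char kName[] = "tf_mlrt.example_kernel";  // hypothetical

  void Invoke() {
    auto& ctx =
        execution_context().GetUserContext<tensorflow::tf_mlrt::Context>();
    if (ctx.IsCancelled()) {
      execution_context().Fail(absl::CancelledError("request cancelled"));
      return;
    }
    // ... operate on arguments() / results() ...
  }
};

// Registration would then look roughly like:
//   registry.Register<ExampleKernel>(ExampleKernel::kName);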
+class Context : public mlrt::UserContext { + public: + explicit Context( + const tfd::KernelFallbackCompatRequestState* fallback_request_state, + tfrt::ResourceContext* resource_context, + const tfrt::CancellationContext* cancellation_context = nullptr) + : fallback_request_state_(fallback_request_state), + op_kernel_context_(fallback_request_state_), + resource_context_(resource_context), + cancellation_context_(cancellation_context) { + DCHECK(resource_context_); + } + + Context(const Context&) = default; + Context& operator=(const Context&) = default; + + const tfd::KernelFallbackCompatRequestState& fallback_request_state() const { + return *fallback_request_state_; + } + void set_fallback_request_state( + const tfd::KernelFallbackCompatRequestState* fallback_request_state) { + DCHECK(fallback_request_state); + fallback_request_state_ = fallback_request_state; + } + + OpKernelContext::Params& params() { return op_kernel_context_.params; } + OpKernelContext& op_kernel_context() { + return op_kernel_context_.op_kernel_context; + } + + tfrt::ResourceContext& resource_context() const { return *resource_context_; } + + const tfrt::CancellationContext* cancellation_context() const { + return cancellation_context_; + } + + tfrt_stub::OpKernelRunState& run_state() { + // Keep states needed by kernel execution in a thread local storage to avoid + // repeated reallocation and destruction of them. + thread_local tfrt_stub::OpKernelRunState run_state; + return run_state; + } + + // Return true if there is a cancellation request. + bool IsCancelled() { + return cancellation_context_ != nullptr && + cancellation_context_->IsCancelled(); + } + + private: + const tfd::KernelFallbackCompatRequestState* fallback_request_state_ = + nullptr; + + struct CopyableOpKernelContext { + OpKernelContext::Params params; + OpKernelContext op_kernel_context; + + explicit CopyableOpKernelContext( + const tfd::KernelFallbackCompatRequestState* fallback_request_state) + : params(), + op_kernel_context( + [this, fallback_request_state]() { + DCHECK(fallback_request_state); + params.step_id = fallback_request_state->step_id(); + auto* device = fallback_request_state->cpu_device(); + params.device = device; + // Still use original device's resource_manager. + params.resource_manager = device->resource_manager(); + params.step_container = + fallback_request_state->step_container(); + // Following two parameters are used to support executing + // tf.data via fallback. 
+ params.function_library = + fallback_request_state->cpu_function_library_runtime(); + params.runner = fallback_request_state->runner(); + params.collective_executor = + fallback_request_state->collective_executor(); + params.rendezvous = fallback_request_state->rendezvous(); + params.session_metadata = + &fallback_request_state->session_metadata(); + params.cancellation_manager = + fallback_request_state->cancellation_manager(); + return ¶ms; + }(), + 0) {} + CopyableOpKernelContext(const CopyableOpKernelContext& other) + : params(other.params), + op_kernel_context(¶ms, other.op_kernel_context.num_outputs()) {} + CopyableOpKernelContext& operator=(const CopyableOpKernelContext& other) { + params = other.params; + op_kernel_context.ResetOutputs(other.op_kernel_context.num_outputs()); + return *this; + } + ~CopyableOpKernelContext() { op_kernel_context.ResetOutputs(); } + }; + CopyableOpKernelContext op_kernel_context_; + + tfrt::ResourceContext* resource_context_ = nullptr; + const tfrt::CancellationContext* cancellation_context_; +}; + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_KERNEL_CONTEXT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel.h new file mode 100644 index 00000000..36ee01d1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel.h @@ -0,0 +1,30 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_H_ + +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" + +namespace tensorflow { +namespace tf_mlrt { + +mlrt::KernelRegistry& GetTfMlrtOptionalKernelRegistry(); + +void RegisterTfMlrtKernels(mlrt::KernelRegistry& registry); + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel_runner_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel_runner_utils.h new file mode 100644 index 00000000..daecf14a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/kernel_runner_utils.h @@ -0,0 +1,150 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_RUNNER_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_RUNNER_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/base/optimization.h" +#include "absl/cleanup/cleanup.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_utils.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/register_span.h" +#include "tensorflow/core/tfrt/mlrt/kernel/context.h" + +namespace tensorflow { +namespace tf_mlrt { + +void LaunchAsyncOpKernel(const tfrt_stub::OpKernelRunner& kernel_runner, + const tfrt_stub::OpKernelRunState& run_state, + const OpKernelContext::Params& params, + mlrt::RegisterSpan results, + std::shared_ptr custom_device); + +inline void SetUpParams(const tfrt_stub::OpKernelRunner& kernel_runner, + absl::Span input_tf_tensor_values, + OpKernelContext::Params& params) { + params.inputs = input_tf_tensor_values; + params.op_kernel = kernel_runner.op_kernel(); + params.input_alloc_attrs = kernel_runner.input_alloc_attrs(); + params.output_attr_array = kernel_runner.output_alloc_attrs().data(); +} + +template +void ExecuteKernelRunner( + Frame& frame, Context& context, + const tfd::KernelFallbackCompatRequestState& fallback_request_state, + const tfrt_stub::OpKernelRunner& kernel_runner) { + tsl::profiler::TraceMe trace_me([&]() -> std::string { + return tsl::profiler::TraceMeOp( + kernel_runner.op_kernel()->name_view(), + kernel_runner.op_kernel()->type_string_view()); + }); + + auto args = frame.args(); + auto last_uses = frame.last_uses(); + + auto& run_state = context.run_state(); + auto& tensor_buffers = run_state.tensor_buffers; + + auto clean_up_inputs = absl::MakeCleanup([&]() { + for (const auto* buffer : tensor_buffers) { + DCHECK(buffer); + buffer->Unref(); + } + tensor_buffers.clear(); + }); + + // Prepare the input tensors. + auto& input_tf_tensor_values = run_state.input_tf_tensor_values; + input_tf_tensor_values.resize(args.size()); + for (int i = 0; i < args.size(); ++i) { + auto& fallback_tensor = args[i]; + // If the argument is immutable or it is the last use in the current scope, + // we can just keep the reference without copying that invovles expensive + // atomic reference counting. And if it is the last use, it can enable + // buffer forwarding optimization in many tensorflow OpKernels. + if (!fallback_tensor.is_immutable() && !last_uses[i]) { + if (const auto* buffer = fallback_tensor.buffer()) { + buffer->Ref(); + tensor_buffers.push_back(buffer); + } + } + input_tf_tensor_values[i].tensor = &fallback_tensor.tensor(); + } + + auto& params = context.params(); + SetUpParams(kernel_runner, input_tf_tensor_values, params); + + auto results = frame.results(); + + if constexpr (!IsAsync) { + tensorflow::DeviceBase* device = nullptr; + if constexpr (Frame::kUseCustomDevice) { + // If the kernel is using custom device, save the current device and + // change to the custom device. 
+ device = params.device; + params.device = frame.device().get(); + } + + auto& op_kernel_context = context.op_kernel_context(); + op_kernel_context.ResetOutputs(results.size()); + + kernel_runner.Run(&op_kernel_context); + + if constexpr (Frame::kUseCustomDevice) { + // We need to restore the device as params will be reused by kernels + // invoked later. + params.device = device; + } + + if (ABSL_PREDICT_FALSE(!op_kernel_context.status().ok())) { + frame.execution_context().Fail(op_kernel_context.status()); + return; + } + + for (int i = 0; i < op_kernel_context.num_outputs(); ++i) { + DCHECK(op_kernel_context.mutable_output(i)); + results[i].template Emplace( + std::move(*op_kernel_context.mutable_output(i))); + } + } else { + std::shared_ptr custom_device = nullptr; + if constexpr (Frame::kUseCustomDevice) { + custom_device = frame.device(); + } + + LaunchAsyncOpKernel(kernel_runner, run_state, params, results, + std::move(custom_device)); + } + + auto reg_span = args.reg_span(); + for (int i = 0; i < last_uses.size(); ++i) { + if (last_uses[i]) { + reg_span[i].template Destroy(); + } + } +} + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_KERNEL_KERNEL_RUNNER_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/shard_restore_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/shard_restore_util.h new file mode 100644 index 00000000..d194b687 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/mlrt/kernel/shard_restore_util.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_MLRT_KERNEL_SHARD_RESTORE_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_MLRT_KERNEL_SHARD_RESTORE_UTIL_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/types/span.h" + +namespace tensorflow { +namespace tf_mlrt { + +// Shard variables into cluster of roughly the same size. +// +// `num_shards` is the number of shards to create. +// `variable_sizes` is the sizes of the variables. +// +// Returns a list of clusters, each of which is represented +// as a vector of variable indices. +std::vector> ShardVariables( + int num_shards, absl::Span variable_sizes); + +} // namespace tf_mlrt +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_MLRT_KERNEL_SHARD_RESTORE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler.h new file mode 100644 index 00000000..87baccab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler.h @@ -0,0 +1,458 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_H_ +#define TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/platform/context.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" +#include "tfrt/host_context/task_function.h" // from @tf_runtime +namespace Eigen { +struct ThreadPoolDevice; +} + +namespace tfrt { +namespace tf { + +class RunHandler; + +// Options for RunHanler. +struct RunHandlerOptions { + RunHandlerOptions() : priority(0) {} + + // Request priority. + int priority; +}; + +// RunHandlerPool is a fixed size pool of pre-allocated RunHandlers +// that can be used for tracking op work for a given inference request. +// RunHandler(s) in the pool are initially 'inactive'. A RunHandler becomes +// 'active' when its unique_ptr is returned by Get() and is being used by a +// client. It becomes 'inactive' once more when its unique_ptr gets destroyed. +// +// Expected usage: +// +// * Create a single RunHandlerPool (say run_handler_pool_). +// +// * When an inference request is invoked, obtain a handler by: +// auto handler = run_handler_pool_->Get(); +// +// * Use handler for scheduling all inter-op work by: +// handler->ScheduleInterOpClosure(closure); +// +// This class is thread safe. +class RunHandlerPool { + public: + struct Options { + // The number of main threads. + int num_inter_op_threads = 1; + + // The number of complimentary threads. + int num_intra_op_threads = 1; + + // The number of max concurrent handlers. + int max_concurrent_handler = 128; + + // The number of sub thread pool configed. + int num_sub_thread_pool = 1; + + // The number of threads in each sub thread pool. The length of the vector + // should equal to num_sub_thread_pool. + std::vector num_threads_in_sub_thread_pool = {1}; + + // The percentage of requests the first N sub thread pool handles. The + // length of the vector should equal to num_sub_thread_pool. For example, + // {0.5, 1} means the first sub thread pool will handle the first 50% + // requests based on priority and the second thread pool will handle the + // second 50% requests based on priority. + std::vector sub_thread_request_percentage = {1.0}; + + // Sleep time for non blocking threads if there is no pending task. + int non_blocking_threads_sleep_time_micro_sec = 1000; + + // Max sleep time for blocking threads if there is no pending task and no + // new task wakes up the thread. + int blocking_threads_max_sleep_time_micro_sec = 1000; + + // If true, use adaptive waiting time. + bool use_adaptive_waiting_time = true; + + // If true, threads won't wake itself up if there is no active requests. 
+ bool wait_if_no_active_request = true; + + // If true, threads will be waken up by new tasks. + bool enable_wake_up = true; + }; + explicit RunHandlerPool(Options options); + ~RunHandlerPool(); + + // Returns an inactive RunHandler from the pool. + // + // RunHandlers in RunHandlerPool are initially 'inactive'. + // A RunHandler becomes 'active' when its unique_ptr its returned by Get() + // and is being used by a client. It becomes 'inactive' once more when the + // unique_ptr is destroyed. + // + // Will block unless there is an inactive handler. + std::unique_ptr Get( + int64_t step_id = 0, int64_t timeout_in_ms = 0, + const RunHandlerOptions& options = RunHandlerOptions()); + + // Get the priorities for active handlers. The return result is with the same + // order of the active handler list. + std::vector GetActiveHandlerPrioritiesForTesting() const; + + // Block until the system is quiescent (no pending work and no inflight work). + void Quiesce() const; + + private: + class Impl; + friend class RunHandler; + + std::unique_ptr impl_; +}; + +// RunHandler can be used to schedule inter/intra-op closures to run on a global +// pool shared across all Session::Run(s). The closures are enqueued to a +// handler specific queue, from which the work is stolen in a priority order +// (time of the Get() call). +// +// It can only be created via RunHandlerPool::Get(). +// +// This class can be used instead of directly scheduling closures on a global +// pool since it maintains a global view across all sessions and optimizes pool +// scheduling to improve (median and tail) latency. +// +// This class is thread safe. +class RunHandler { + public: + void ScheduleInterOpClosure(TaskFunction fn); + void ScheduleIntraOpClosure(TaskFunction fn); + + tensorflow::thread::ThreadPoolInterface* AsIntraThreadPoolInterface() const; + + int NumThreads() const; + + int64_t step_id() const; + + ~RunHandler(); + + private: + class Impl; + friend class RunHandlerPool::Impl; + + explicit RunHandler(Impl* impl); + + Impl* impl_; // NOT OWNED. +}; + +namespace internal { + +// TODO(azaks): Refactor with thread:ThreadPool +class RunHandlerEnvironment { + public: + typedef tensorflow::Thread EnvThread; + struct TaskImpl { + TaskFunction f; + tensorflow::Context context; + uint64_t trace_id; + }; + tensorflow::Env* const env_; + const tensorflow::ThreadOptions thread_options_; + const std::string name_; + + public: + struct Task { + std::unique_ptr f; + }; + + RunHandlerEnvironment(tensorflow::Env* env, + const tensorflow::ThreadOptions& thread_options, + const std::string& name); + + EnvThread* CreateThread(std::function f); + + Task CreateTask(TaskFunction f); + + void ExecuteTask(const Task& t); +}; + +typedef typename RunHandlerEnvironment::Task Task; +typedef Eigen::RunQueue Queue; + +// To reduce cache misses, we use a doubly-linked list of Waiter structs and +// queue them in LIFO order rather than the FIFO order used by a single +// condition variable. 
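// [Editorial note, not part of the vendored header] A standalone sketch of the
// intrusive circular-list technique described above, with hypothetical names:
// waiters are pushed at the head, so the most recently parked thread (with the
// warmest cache) is the first one woken.
struct WaiterNode {
  WaiterNode() { next = prev = this; }
  WaiterNode* next;
  WaiterNode* prev;
};

// Link `w` right after the sentinel `head`, i.e. LIFO order.
inline void PushFront(WaiterNode* head, WaiterNode* w) {
  w->next = head->next;
  w->prev = head;
  head->next->prev = w;
  head->next = w;
}

// Unlink `w` and make it a singleton list again.
inline void Unlink(WaiterNode* w) {
  w->next->prev = w->prev;
  w->prev->next = w->next;
  w->next = w;
  w->prev = w;
}

// To wake in LIFO order, take head->next (when head->next != head), notify its
// condition variable, and Unlink() it while holding the list mutex.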
+struct Waiter { + Waiter() { + next = this; + prev = this; + } + tensorflow::condition_variable cv; + int num_waiting_threads = 0; + tensorflow::mutex mu; + Waiter* next; + Waiter* prev; +}; + +class ThreadWorkSource { + public: + ThreadWorkSource(); + + ~ThreadWorkSource(); + + Task EnqueueTask(Task t, bool is_blocking, bool enable_wake_up); + + Task PopBlockingTask(); + + Task PopNonBlockingTask(int start_index, bool search_from_all_queue); + + int TaskQueueSize(bool is_blocking); + + int64_t GetTracemeId(); + + void SetTracemeId(int64_t value); + + void SetWaiter(uint64_t version, Waiter* waiter, tensorflow::mutex* mutex); + + int64_t GetInflightTaskCount(bool is_blocking); + + void IncrementInflightTaskCount(bool is_blocking); + + void DecrementInflightTaskCount(bool is_blocking); + + int64_t GetPendingTaskCount(); + + void IncrementPendingTaskCount(); + + void DecrementPendingTaskCount(); + + unsigned NonBlockingWorkShardingFactor(); + + std::string ToString(); + + private: + struct NonBlockingQueue { + tensorflow::mutex queue_op_mu; + char pad[128]; + Queue queue; + }; + + int32_t non_blocking_work_sharding_factor_; + Eigen::MaxSizeVector non_blocking_work_queues_; + + // The number of tasks that are executing now. + std::atomic blocking_inflight_; + std::atomic non_blocking_inflight_; + + // The number of tasks that are enqueued and not finished. + std::atomic pending_tasks_; + + Queue blocking_work_queue_; + tensorflow::mutex blocking_queue_op_mu_; + char pad_[128]; + tensorflow::mutex waiters_mu_; + Waiter queue_waiters_ TF_GUARDED_BY(waiters_mu_); + std::atomic traceme_id_; + + tensorflow::mutex run_handler_waiter_mu_; + uint64_t version_ TF_GUARDED_BY(run_handler_waiter_mu_); + tensorflow::mutex* sub_thread_pool_waiter_mu_ + TF_GUARDED_BY(run_handler_waiter_mu_); + Waiter* sub_thread_pool_waiter_ TF_GUARDED_BY(run_handler_waiter_mu_); +}; + +class RunHandlerThreadPool { + public: + struct Options { + int num_blocking_threads; + int num_non_blocking_threads; + bool wait_if_no_active_request; + int non_blocking_threads_sleep_time_micro_sec; + int blocking_threads_max_sleep_time_micro_sec; + bool use_adaptive_waiting_time; + bool enable_wake_up; + int max_concurrent_handler; + std::vector num_threads_in_sub_thread_pool; + std::vector sub_thread_request_percentage; + Options(int num_blocking_threads, int num_non_blocking_threads, + bool wait_if_no_active_request, + int non_blocking_threads_sleep_time_micro_sec, + int blocking_threads_max_sleep_time_micro_sec, + bool use_adaptive_waiting_time, bool enable_wake_up, + int max_concurrent_handler, + const std::vector& num_threads_in_sub_thread_pool, + const std::vector& sub_thread_request_percentage) + : num_blocking_threads(num_blocking_threads), + num_non_blocking_threads(num_non_blocking_threads), + wait_if_no_active_request(wait_if_no_active_request), + non_blocking_threads_sleep_time_micro_sec( + non_blocking_threads_sleep_time_micro_sec), + blocking_threads_max_sleep_time_micro_sec( + blocking_threads_max_sleep_time_micro_sec), + use_adaptive_waiting_time(use_adaptive_waiting_time), + enable_wake_up(enable_wake_up), + max_concurrent_handler(max_concurrent_handler), + num_threads_in_sub_thread_pool(num_threads_in_sub_thread_pool), + sub_thread_request_percentage(sub_thread_request_percentage) {} + }; + struct PerThread { + constexpr PerThread() : pool(nullptr), thread_id(-1) {} + RunHandlerThreadPool* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. 
+ }; + + RunHandlerThreadPool(Options options, tensorflow::Env* env, + const tensorflow::ThreadOptions& thread_options, + const std::string& name, + Eigen::MaxSizeVector* waiters_mu, + Eigen::MaxSizeVector* queue_waiters); + + ~RunHandlerThreadPool(); + + void Start(); + + void StartOneThreadForTesting(); + + void AddWorkToQueue(ThreadWorkSource* tws, bool is_blocking, TaskFunction fn); + + // Set work queues from which the thread 'tid' can steal its work. + void SetThreadWorkSources( + int tid, uint64_t version, + const Eigen::MaxSizeVector& thread_work_sources); + + PerThread* GetPerThread(); + + int CurrentThreadId() const; + + int NumThreads() const; + + int NumBlockingThreads() const; + + int NumNonBlockingThreads() const; + + void WorkerLoop(int thread_id, bool may_steal_blocking_work); + + // Search tasks from the request range searching_range_start to + // searching_range_end. If there are no tasks in the search range and + // may_steal_blocking_work is true, then search from all requests. + Task FindTask( + int searching_range_start, int searching_range_end, int thread_id, + int sub_thread_pool_id, int max_blocking_inflight, + bool may_steal_blocking_work, + const Eigen::MaxSizeVector& thread_work_sources, + bool* task_from_blocking_queue, ThreadWorkSource** tws); + + void WaitForWorkInSubThreadPool(int thread_id, bool is_blocking, + int sub_thread_pool_id); + + private: + struct ThreadData { + ThreadData(); + tensorflow::mutex mu; + uint64_t new_version; + tensorflow::condition_variable sources_not_empty; + std::unique_ptr thread; + int current_index; + std::unique_ptr> + new_thread_work_sources TF_GUARDED_BY(mu); + + uint64_t current_version; + // Should only be accessed by one thread. + std::unique_ptr> + current_thread_work_sources; + + int sub_thread_pool_id; + }; + + const int num_threads_; + const int num_blocking_threads_; + const int num_non_blocking_threads_; + const bool adaptive_sleep_time_; + const bool wait_if_no_active_request_; + const int non_blocking_thread_sleep_time_; + const int blocking_thread_max_waiting_time_; + const bool enable_wake_up_; + Eigen::MaxSizeVector thread_data_; + internal::RunHandlerEnvironment env_; + std::atomic cancelled_; + std::string name_; + Eigen::MaxSizeVector* waiters_mu_; + Eigen::MaxSizeVector* queue_waiters_; + + std::vector num_threads_in_sub_thread_pool_; + + // Threads in each sub thread pool will search tasks from + // the end_request_percentage of the previous sub thread pool to its own + // end_request_percentage in a round-robin fashion.
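The round-robin search-range scheme described in the comment above (and the `sub_thread_pool_end_request_percentage_` member that follows) is hard to picture from the declarations alone. The standalone sketch below is only my reading of that comment, not code from the upstream implementation:

```cpp
// Illustration only: derives the [start, end) slice of active requests that a
// given sub thread pool scans, from cumulative end-request percentages.
#include <utility>
#include <vector>

std::pair<int, int> SubThreadPoolSearchRange(
    const std::vector<double>& end_request_percentage,
    int num_active_requests, int sub_thread_pool_id) {
  const double start_percentage =
      sub_thread_pool_id == 0
          ? 0.0
          : end_request_percentage[sub_thread_pool_id - 1];
  const double end_percentage = end_request_percentage[sub_thread_pool_id];
  return {static_cast<int>(start_percentage * num_active_requests),
          static_cast<int>(end_percentage * num_active_requests)};
}
```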
+ std::vector sub_thread_pool_end_request_percentage_; +}; + +} // namespace internal + +class RunHandlerWorkQueue : public tensorflow::tfrt_stub::WorkQueueInterface { + public: + explicit RunHandlerWorkQueue(std::unique_ptr run_handler) + : WorkQueueInterface(run_handler->step_id(), + run_handler->AsIntraThreadPoolInterface()), + run_handler_(std::move(run_handler)) { + DCHECK(run_handler_); + } + ~RunHandlerWorkQueue() override = default; + + std::string name() const override { return "run_handler"; } + + int GetParallelismLevel() const override; + + void AddTask(TaskFunction work) override; + + std::optional AddBlockingTask(TaskFunction work, + bool allow_queuing) override; + + void Await( + llvm::ArrayRef> values) override; + + bool IsInWorkerThread() const override; + + void Quiesce() override { + LOG(FATAL) << "RunHandlerWorkQueue::Quiesce() is not " // Crash OK + "implemented, and supposed to be removed."; + } + + private: + std::unique_ptr run_handler_; +}; + +} // end namespace tf +} // end namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.h new file mode 100644 index 00000000..23dd6c86 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.h @@ -0,0 +1,142 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_CONCURRENT_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_CONCURRENT_WORK_QUEUE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/tfrt/run_handler_thread_pool/run_handler.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/support/thread_environment.h" // from @tf_runtime +#include "third_party/concurrent_work_queue/lib/blocking_work_queue.h" +#include "third_party/concurrent_work_queue/lib/non_blocking_work_queue.h" + +namespace tfrt { +namespace tf { + +// Concurrent Work Queue based on Run Handler thread Pool. All tasks are queued +// based on requests. +class RunHandlerThreadWorkQueue + : public tensorflow::tfrt_stub::WorkQueueInterface { + public: + struct Options { + // The number of threads used for the main thread pool. + int num_main_threads; + + // The number of threads used for complementary thread pool. + int num_complementary_threads; + + // Timeout for InitRequest(). + // The timeout may trigger as the work queue limits the number of concurrent + // in-flight requests for better latency. 
+ int64_t init_timeout_ms; + + // The maximum number of concurrent handlers. + int max_concurrent_handler = 128; + + // The number of sub thread pools configured. + int num_sub_thread_pool = 1; + + // The number of threads in each sub thread pool. The length of the vector + // should equal num_sub_thread_pool. + std::vector num_threads_in_sub_thread_pool = {1}; + + // The percentage of requests that the first N sub thread pools handle. The + // length of the vector should equal num_sub_thread_pool. + std::vector sub_thread_request_percentage = {1.0}; + + // Sleep time for non-blocking threads if there is no pending task. + int non_blocking_threads_sleep_time_micro_sec = 1000; + + // Max sleep time for blocking threads if there is no pending task and no + // new task wakes up the thread. + int blocking_threads_max_sleep_time_micro_sec = 1000; + + // If true, use adaptive waiting time. + bool use_adaptive_waiting_time = true; + + // If true, threads won't wake themselves up if there are no active requests. + bool wait_if_no_active_request = true; + + // If true, threads will be woken up by new tasks. + bool enable_wake_up = true; + }; + + explicit RunHandlerThreadWorkQueue(const Options& options); + ~RunHandlerThreadWorkQueue() override = default; + + std::string name() const override { + return tensorflow::strings::StrCat( + "RunHandlerThreadWorkQueue C++ work queue (", options_.num_main_threads, + " main threads, ", options_.num_complementary_threads, + " complementary threads)"); + } + + absl::StatusOr> + InitializeRequest(int64_t request_id) const override; + + int GetParallelismLevel() const override { + return options_.num_main_threads + options_.num_complementary_threads; + } + + void AddTask(TaskFunction work) override; + + std::optional AddBlockingTask(TaskFunction work, + bool allow_queuing) override; + + void Quiesce() override; + + void Await(ArrayRef> values) override; + + bool IsInWorkerThread() const override; + + private: + Options options_; + + // Handler pool. + // Each request acquires a handler from the pool and releases it back to the + // pool once it is done. + std::unique_ptr handler_pool_; + + // An id assigned to each request for tracing purposes. + static std::atomic_int_fast64_t step_id_counter_; + + // QuiescingState for non_blocking_work_queue_ and blocking_work_queue_. + std::unique_ptr<::tfrt::internal::QuiescingState> quiescing_state_; + + // Nonblocking queue used for cases without execution context. + ::tfrt::internal::NonBlockingWorkQueue + non_blocking_work_queue_; + + // Blocking queue used for cases without execution context. + ::tfrt::internal::BlockingWorkQueue + blocking_work_queue_; +}; + +std::ostream& operator<<(std::ostream& strm, + const RunHandlerThreadWorkQueue::Options& options); +} // namespace tf +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_CONCURRENT_WORK_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_util.h new file mode 100644 index 00000000..acf15a70 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_util.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_UTIL_H_ + +#include +#include +#include + +namespace tfrt { +namespace tf { + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. Return 'default_value' otherwise. +double ParamFromEnvWithDefault(const char* var_name, double default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. The value must be in format val1,val2... Return +// 'default_value' otherwise. +std::vector ParamFromEnvWithDefault(const char* var_name, + std::vector default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. The value must be in format val1,val2... Return +// 'default_value' otherwise. +std::vector ParamFromEnvWithDefault(const char* var_name, + std::vector default_value); + +// Look up environment variable named 'var_name' and return the value if it +// exist and can be parsed. Return 'default_value' otherwise. +bool ParamFromEnvBoolWithDefault(const char* var_name, bool default_value); + +} // namespace tf +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_RUN_HANDLER_THREAD_POOL_RUN_HANDLER_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/runtime.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/runtime.h new file mode 100644 index 00000000..1a6925c1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/runtime.h @@ -0,0 +1,248 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_RUNTIME_H_ +#define TENSORFLOW_CORE_TFRT_RUNTIME_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" +#include "tsl/platform/errors.h" +#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// ModelRuntimeContext provides model contexts for injected backends to +// initialize their per-model states. +class ModelRuntimeContext { + public: + ModelRuntimeContext(const GraphExecutionOptions* graph_execution_options, + std::string export_dir, + tfrt::ResourceContext* resource_context) + : graph_execution_options_(graph_execution_options), + export_dir_(std::move(export_dir)), + resource_context_(resource_context) { + DCHECK(graph_execution_options_); + DCHECK(resource_context_); + } + + absl::string_view name() const { + return graph_execution_options_->model_metadata.name(); + } + int64_t version() const { + return graph_execution_options_->model_metadata.version(); + } + + absl::string_view export_dir() const { return export_dir_; } + + const GraphDef* graph_def() const { return graph_def_; } + void set_graph_def(const GraphDef* graph_def) { graph_def_ = graph_def; } + + const CallableOptions* callable_options() const { return callable_options_; } + void set_callable_options(const CallableOptions* callable_options) { + callable_options_ = callable_options; + } + + FunctionLibraryDefinition* function_library_definition() const { + return flib_def_; + } + void set_function_library_definition(FunctionLibraryDefinition* flib_def) { + flib_def_ = flib_def; + } + + tensorflow::DeviceMgr* device_mgr() const { return device_mgr_; } + void set_device_mgr(tensorflow::DeviceMgr* device_mgr) { + device_mgr_ = device_mgr; + } + + bool is_local_session() const { return is_local_session_; } + + void set_is_local_session(bool is_local_session) { + is_local_session_ = is_local_session; + } + + tfrt::ResourceContext& resource_context() { return *resource_context_; } + + const GraphExecutionOptions& graph_execution_options() const { + return *graph_execution_options_; + } + + absl::string_view checkpoint_path() const { return checkpoint_path_; } + + void set_checkpoint_path(absl::string_view checkpoint_path) { + checkpoint_path_ = checkpoint_path; + } + + private: + const GraphExecutionOptions* graph_execution_options_ = nullptr; + + std::string export_dir_; + const GraphDef* graph_def_ = nullptr; + const CallableOptions* callable_options_ = nullptr; + tfrt::ResourceContext* resource_context_ = nullptr; + tensorflow::DeviceMgr* device_mgr_ = nullptr; + + FunctionLibraryDefinition* flib_def_ = nullptr; + + bool is_local_session_ = false; + std::string checkpoint_path_; +}; + +// This defines the runtime 
abstraction in tensorflow for TFRT. It is supposed +// to provide tensorflow specific functionalities that are implemented using +// TFRT. Currently, the only intended uses for this class are: +// 1) Creating the runtime instance with user specified dependencies (eg. +// thread pool). +// 2) Creating tensors that can be used by the runtime. +// +// It is temporary and will be replaced by the official +// tensorflow::experimental::cc::Runtime when it lands. +class Runtime { + public: + // Creates a runtime instance with specified threading configuration. Returns + // null upon creation error. + static std::unique_ptr Create(int num_inter_op_threads, + int num_intra_op_threads = 0); + + // Creates a runtime instance with the specified work_queue. Returns null upon + // creation error. + static std::unique_ptr Create( + std::unique_ptr work_queue); + + ~Runtime(); + Runtime(Runtime&&) = default; + Runtime& operator=(Runtime&&) = default; + + // TODO(tfrt-devs): Add methods for creating TFRT tensors. + + // TODO(chky): Make this method private as it should be only used by + // tfrt::SavedModel. Simply making tfrt::SavedModel a friend class does not + // work because the it resides in a different namespace. But we should + // consider moving it to the same namespace. + tfrt::CoreRuntime* core_runtime() const { return core_runtime_.get(); } + WorkQueueInterface* work_queue() const { return work_queue_; } + + // `AddCreateRuntimeResourceFn` allows the client to inject per model + // resources that are related to system-wide concepts, such as devices, when + // loading a SavedModel. + // + // A longer term plan is to use a Device concept for this purpose, so that + // Runtime contains a vector of Devices. Since it will take some time to + // iterate on the Device concept and integrate with the existing + // `tfrt::Device` class, we use the callback function as a temporary solution. + // + // The argument `fn` should be thread-safe. + void AddCreateRuntimeResourceFn( + std::function fn) { + runtime_resource_fns_.emplace_back( + [fn = std::move(fn)](ModelRuntimeContext& model_context) { + fn(&model_context.resource_context()); + return absl::OkStatus(); + }); + } + + void AddCreateRuntimeResourceFn( + std::function fn) { + runtime_resource_fns_.emplace_back(std::move(fn)); + } + + // `CreateRuntimeResources` populates `resource_ctx` with runtime-related + // resources. + // + // This function is thread-safe. + absl::Status CreateRuntimeResources( + ModelRuntimeContext& model_context) const { + for (auto& fn : runtime_resource_fns_) { + TF_RETURN_IF_ERROR(fn(model_context)); + } + return absl::OkStatus(); + } + + ABSL_DEPRECATED("Use the overload that take ModelRuntimeContext instead.") + void CreateRuntimeResources(const GraphExecutionOptions& options, + tfrt::ResourceContext* resource_ctx) const { + ModelRuntimeContext model_context( + &options, options.compile_options.saved_model_dir, resource_ctx); + for (auto& fn : runtime_resource_fns_) { + auto status = fn(model_context); + if (!status.ok()) { + LOG(ERROR) << "Failed to create runtime resource: " << status; + return; + } + } + } + + void SetCreateRequestQueueFn( + std::function< + absl::StatusOr>(int64_t)> + create_request_queue_fn) { + create_request_queue_fn_ = std::move(create_request_queue_fn); + } + + // Creates a work queue for a request. 
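As an orientation aid for this vendored runtime API, here is a minimal, hypothetical setup sketch (not part of runtime.h). It assumes only the declarations above: `Runtime::Create(int, int)` returning `std::unique_ptr<Runtime>` and the `ModelRuntimeContext`-based overload of `AddCreateRuntimeResourceFn`.

```cpp
// Hypothetical usage sketch, not upstream code.
#include <memory>

#include "absl/status/status.h"
#include "tensorflow/core/tfrt/runtime/runtime.h"

std::unique_ptr<tensorflow::tfrt_stub::Runtime> MakeRuntime() {
  // Create a runtime backed by a default work queue with 4 inter-op threads.
  auto runtime = tensorflow::tfrt_stub::Runtime::Create(
      /*num_inter_op_threads=*/4);
  if (!runtime) return nullptr;  // Create() returns null on error.

  // Register a thread-safe callback that populates per-model resources when a
  // SavedModel is loaded.
  runtime->AddCreateRuntimeResourceFn(
      [](tensorflow::tfrt_stub::ModelRuntimeContext& model_context)
          -> absl::Status {
        // Per-model resources would be created via
        // model_context.resource_context() here.
        return absl::OkStatus();
      });
  return runtime;
}
```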
+ absl::StatusOr> CreateRequestQueue( + int64_t request_id) const { + if (create_request_queue_fn_) { + return create_request_queue_fn_(request_id); + } + + return work_queue_->InitializeRequest(request_id); + } + + private: + explicit Runtime(std::unique_ptr core_runtime, + WorkQueueInterface* work_queue); + + std::unique_ptr core_runtime_; + std::function>(int64_t)> + create_request_queue_fn_; + WorkQueueInterface* work_queue_ = nullptr; + std::vector> + runtime_resource_fns_; +}; + +// Get a singleton instance of tfrt_stub::Runtime. Returns nullptr until +// SetGlobalRuntime has been called. +// Not thread safe. +Runtime* GetGlobalRuntime(); + +// Instantiates the singleton instance of tfrt_stub::Runtime by transferring +// an instance of tfrt_stub::Runtime. +// Not thread safe. +void SetGlobalRuntime(std::unique_ptr runtime); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_RUNTIME_RUNTIME_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/step_id.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/step_id.h new file mode 100644 index 00000000..f9de1a7d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/step_id.h @@ -0,0 +1,110 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_STEP_ID_H_ +#define TENSORFLOW_CORE_TFRT_RUNTIME_STEP_ID_H_ + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/core/tfrt/kernels/stream_ops_util_constants.h" + +namespace tensorflow { +namespace tfrt_stub { + +// A base template for common utilities for a type safe id. +template +struct SafeId { + SafeId() : id(0) {} + explicit constexpr SafeId(int64_t id) : id(id) {} + + using Base = SafeId; + + int64_t id; + + friend bool operator==(const Derived& x, const Derived& y) { + return x.id == y.id; + } + + template + friend void AbslStringify(Sink& sink, const Derived& x) { + absl::Format(&sink, "%d", x.id); + } + + template + friend H AbslHashValue(H h, const Derived& x) { + return H::combine(std::move(h), x.id); + } +}; + +// A type-safe step id. +struct StepId : SafeId { + using Base::Base; + + bool valid() const { return id != 0; } + static constexpr StepId GetInvalidStepId() { return StepId(0); } +}; + +// The initial value of the step id. +std::atomic& GetGlobalInitialStepId(); + +// StepIdGenerator provides the utility to generate a monotonically increasing +// step id. And the number of bits can be configured at compile time. The step +// id is positive and the maximum value is 2^(kStepIdBitSize)-1. +class StepIdGenerator { + public: + StepIdGenerator() : next_id_(GetGlobalInitialStepId().load()) {} + + StepIdGenerator(const StepIdGenerator&) = delete; + StepIdGenerator& operator=(const StepIdGenerator&) = delete; + + // Generates a positive step id that is within the bit-range specified by + // `kStepIdBitSize`. 
+ StepId GetNextStepId() { + uint64_t next_id = next_id_.fetch_add(1, std::memory_order_relaxed); + // Use kStepIdBitSize bits because we need to pack it with batch id if batch + // function is used. + static_assert(kStepIdBitSize <= 32); + next_id = (next_id & ((1ull << kStepIdBitSize) - 1)); + + if (next_id == 0) { + return GetNextStepId(); + } + + return StepId(static_cast(next_id)); + } + + private: + std::atomic next_id_{0}; +}; + +// Set up the initial step_id used by StepIdGenerator. This class is +// test-only. +class TEST_ScopedInitialStepId { + public: + explicit TEST_ScopedInitialStepId(uint64_t step_id); + ~TEST_ScopedInitialStepId(); + + TEST_ScopedInitialStepId(const TEST_ScopedInitialStepId&) = delete; + TEST_ScopedInitialStepId& operator=(const TEST_ScopedInitialStepId&) = delete; + + private: + uint64_t step_id_ = 0; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_RUNTIME_STEP_ID_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/stream.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/stream.h new file mode 100644 index 00000000..03b0784b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/stream.h @@ -0,0 +1,285 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and limitations +under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_STREAM_H_ +#define TENSORFLOW_CORE_TFRT_RUNTIME_STREAM_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/functional/any_invocable.h" +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/tfrt/runtime/step_id.h" +#include "tsl/platform/env.h" +#include "tsl/platform/threadpool_interface.h" + +namespace tensorflow { +namespace tfrt_stub { + +struct StreamedResult { + absl::flat_hash_map tensors; + absl::Time enqueued_time; +}; + +struct StreamCallbackId : SafeId { + using Base::Base; +}; + +// An interface that abstracts communication between the +// `StreamCallbackRegistry` and the stream controller backend. 
+class StreamControllerInterface { + public: + explicit StreamControllerInterface(std::string controller_address) + : controller_address_(std::move(controller_address)) {} + virtual ~StreamControllerInterface() = default; + + absl::string_view controller_address() const { return controller_address_; } + + virtual void RecordDequeueLatency(absl::string_view model_name, + absl::Duration latency) {} + + virtual void RecordCallbackLatency(absl::string_view model_name, + absl::Duration latency) {} + + private: + std::string controller_address_; +}; + +// An interface that abstracts the communication from the `PwStreamResultsOp` +// worker to the controller. +class StreamWorkerInterface { + public: + explicit StreamWorkerInterface(std::string controller_address) + : controller_address_(std::move(controller_address)) {} + virtual ~StreamWorkerInterface() = default; + + absl::string_view controller_address() const { return controller_address_; } + + virtual void RecordSendLatency(absl::string_view model_name, + absl::Duration latency) {} + virtual absl::Status InvokeStreamCallback( + const StreamCallbackId& callback_id, + const std::vector& names, + const std::vector>>& + responses) = 0; + + private: + std::string controller_address_; +}; + +class ScopedStreamCallback; + +class StreamInterfaceFactory { + public: + using CreateWorkerStreamInterfaceFn = + std::function>( + absl::string_view)>; + + void RegisterController( + absl::AnyInvocable< + absl::StatusOr>() const> + interface_factory) { + absl::MutexLock lock(&mu_); + controller_interface_factory_ = std::move(interface_factory); + } + + absl::StatusOr> + CreateControllerStreamInterface() const { + absl::MutexLock lock(&mu_); + return controller_interface_factory_(); + } + + void RegisterWorker(CreateWorkerStreamInterfaceFn interface_factory) { + absl::MutexLock lock(&mu_); + worker_interface_factory_ = std::move(interface_factory); + } + + CreateWorkerStreamInterfaceFn CreateWorkerStreamInterface() const { + absl::MutexLock lock(&mu_); + return worker_interface_factory_; + } + + private: + mutable absl::Mutex mu_; + absl::AnyInvocable< + absl::StatusOr>() const> + controller_interface_factory_ ABSL_GUARDED_BY(mu_) = []() { + return absl::InternalError( + "The factory for StreamControllerInterface is not registered."); + }; + + CreateWorkerStreamInterfaceFn worker_interface_factory_ ABSL_GUARDED_BY(mu_) = + [](absl::string_view) { + return absl::InternalError( + "The factory for StreamWorkerInterface is not registered."); + }; +}; + +// Returns the global factory for the stream interface. The factory for the +// stream interface must be registered first before calling +// GetGlobalStreamCallbackRegistry(). +StreamInterfaceFactory& GetGlobalStreamInterfaceFactory(); + +// Mapping from tuples of (callback_id, step_id) to callback states. The mapping +// is stored in a global variable so that it can be shared between +// `ScopedStreamCallback` and `InvokeStreamCallbackOp`. +// +// This class is thread-safe. +class StreamCallbackRegistry { + public: + explicit StreamCallbackRegistry( + std::unique_ptr interface) + : interface_(std::move(interface)) { + DCHECK(interface_); + } + + // Registers a callback under the given id. A stream callback is uniquely + // identified by a tuple of a callback id (unique to each executable) and a + // step id (unique to each invocation of a given executable). Returns an RAII + // object that removes the callback from the registry on its deallocation, or + // an error if the id already exists in the registry. 
+ // + // If a program runs `tf.PwStreamResults` with a matching callback/step id, + // `callback` will be called with the arguments of `tf.PwStreamResults`. + // + // All invocations to `callback` are handled serially by a single thread, so + // `callback` doesn't need to be thread-safe even if multiple + // `tf.PwStreamResults` ops may run concurrently. + absl::StatusOr Register( + absl::string_view model_name, StreamCallbackId callback_id, + StepId step_id, + absl::AnyInvocable< + void(absl::flat_hash_map)> + callback); + + absl::Status Invoke(tsl::thread::ThreadPoolInterface* thread_pool, + StreamCallbackId callback_id, StepId step_id, + StreamedResult result); + + StreamControllerInterface& stream_interface() const { return *interface_; } + + private: + friend class ScopedStreamCallback; + + class CallbackState { + public: + CallbackState(StreamCallbackRegistry* registry, + absl::string_view model_name, StreamCallbackId callback_id, + StepId step_id, + absl::AnyInvocable)> + callback) + : registry_(registry), + model_name_(model_name), + callback_id_(callback_id), + step_id_(step_id), + callback_(std::move(callback)) { + DCHECK(registry_); + } + + // Invokes the callback in `thread_pool` with `result`. + absl::Status Invoke(tsl::thread::ThreadPoolInterface* thread_pool, + StreamedResult result); + + // Closes the callback so that it can no longer be invoked. This method also + // waits for outstanding results to finish. + void Close(); + + private: + StreamControllerInterface& interface() { + return registry_->stream_interface(); + } + void InvokeCallback(StreamedResult result); + + StreamCallbackRegistry* registry_ = nullptr; + std::string model_name_; + StreamCallbackId callback_id_; + StepId step_id_; + absl::AnyInvocable)> + callback_; + + absl::Mutex mu_; + bool closed_ ABSL_GUARDED_BY(mu_) = false; + int num_outstanding_ ABSL_GUARDED_BY(mu_) = 0; + }; + + std::unique_ptr Unregister(StreamCallbackId callback_id, + StepId step_id); + + std::unique_ptr interface_; + + mutable absl::Mutex mu_; + absl::flat_hash_map, + std::unique_ptr> + stream_callbacks_ ABSL_GUARDED_BY(mu_); +}; + +// Returns the global registry for the stream callbacks. The stream interface +// must have been registered through GetGlobalStreamInterfaceFactory() before +// calling this function. +StreamCallbackRegistry& GetGlobalStreamCallbackRegistry(); + +// Creates a new stream callback id and rewrites the given module with +// information required to trigger this callback remotely. Returns the callback +// id, or `std::nullopt` if the module has no stream outputs. +absl::StatusOr> CreateStreamCallbackId( + absl::string_view model_name, mlir::ModuleOp module); + +// Implements an RAII object that registers a callback to be called on receiving +// streamed tensors. +class ScopedStreamCallback { + public: + ScopedStreamCallback() = default; + + // Moveable but not copyable. 
+ ScopedStreamCallback(ScopedStreamCallback&& other); + ScopedStreamCallback& operator=(ScopedStreamCallback&& other); + + ~ScopedStreamCallback() { Unregister(); } + + private: + friend class StreamCallbackRegistry; + + explicit ScopedStreamCallback(StreamCallbackRegistry* registry, + StreamCallbackId callback_id, StepId step_id) + : registry_(registry), callback_id_(callback_id), step_id_(step_id) {} + + void Unregister(); + + StreamCallbackRegistry* registry_ = nullptr; + std::optional callback_id_; + StepId step_id_ = StepId::GetInvalidStepId(); +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_RUNTIME_STREAM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/tf_threadpool_concurrent_work_queue.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/tf_threadpool_concurrent_work_queue.h new file mode 100644 index 00000000..be7acaee --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/tf_threadpool_concurrent_work_queue.h @@ -0,0 +1,90 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_TF_THREADPOOL_CONCURRENT_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_TFRT_RUNTIME_TF_THREADPOOL_CONCURRENT_WORK_QUEUE_H_ + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/tfrt/runtime/work_queue_interface.h" +#include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/task_function.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// This class defines a work queue based on the WorkQueueInterface that uses the +// Tensorflow threadpools to execute inter-op and intra-op closures. 
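Reviewer note: the work queue declared next is typically wired into the Runtime from runtime.h above. The sketch below is hypothetical, not upstream code; it relies on `CreateDefaultTfThreadPoolWorkQueue`, declared at the end of this header, and on the `Runtime::Create` overload that takes a work queue.

```cpp
// Hypothetical wiring sketch, not upstream code.
#include <memory>
#include <utility>

#include "tensorflow/core/tfrt/runtime/runtime.h"
#include "tensorflow/core/tfrt/runtime/tf_threadpool_concurrent_work_queue.h"

std::unique_ptr<tensorflow::tfrt_stub::Runtime> MakeRuntimeWithTfThreadPools() {
  // Both thread counts must be larger than zero, per the factory's contract.
  auto work_queue = tensorflow::tfrt_stub::CreateDefaultTfThreadPoolWorkQueue(
      /*num_inter_op_threads=*/4, /*num_intra_op_threads=*/4);
  return tensorflow::tfrt_stub::Runtime::Create(std::move(work_queue));
}
```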
+class TfThreadPoolWorkQueue : public WorkQueueInterface { + public: + TfThreadPoolWorkQueue( + tensorflow::thread::ThreadPoolInterface* intra_op_threadpool, + tensorflow::thread::ThreadPoolInterface* inter_op_threadpool) + : TfThreadPoolWorkQueue(/*id=*/0, intra_op_threadpool, + inter_op_threadpool) {} + + TfThreadPoolWorkQueue( + int64_t id, tensorflow::thread::ThreadPoolInterface* intra_op_threadpool, + tensorflow::thread::ThreadPoolInterface* inter_op_threadpool) + : WorkQueueInterface(id, intra_op_threadpool), + intra_op_threadpool_(intra_op_threadpool), + inter_op_threadpool_(inter_op_threadpool) {} + + absl::StatusOr> InitializeRequest( + int64_t request_id) const override; + + int GetParallelismLevel() const override { + return inter_op_threadpool_->NumThreads(); + } + std::string name() const override { return "TfThreadPoolWorkQueue"; } + + void AddTask(tfrt::TaskFunction work) override; + + std::optional AddBlockingTask( + tfrt::TaskFunction work, bool allow_queuing) override; + + ABSL_DEPRECATED("Use the destructor instead.") + void Quiesce() override; + + void Await( + tfrt::ArrayRef<::tfrt::RCReference<::tfrt::AsyncValue>> values) override; + + bool IsInWorkerThread() const override; + + private: + tensorflow::thread::ThreadPoolInterface* intra_op_threadpool_ = nullptr; + tensorflow::thread::ThreadPoolInterface* inter_op_threadpool_ = nullptr; +}; + +// Create a default TfThreadPoolWorkQueue that is implemented by +// tensorflow::thread::ThreadPool. `num_inter_op_threads` and +// `num_intra_op_threads` must be larger than zero. +std::unique_ptr CreateDefaultTfThreadPoolWorkQueue( + int num_inter_op_threads, int num_intra_op_threads); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_RUNTIME_TF_THREADPOOL_CONCURRENT_WORK_QUEUE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/work_queue_interface.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/work_queue_interface.h new file mode 100644 index 00000000..4eca1d72 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/runtime/work_queue_interface.h @@ -0,0 +1,113 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_WORK_QUEUE_INTERFACE_H_ +#define TENSORFLOW_CORE_TFRT_RUNTIME_WORK_QUEUE_INTERFACE_H_ + +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/context.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/threadpool_interface.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime +#include "tfrt/support/error_util.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// This is an intermediate interface in tensorflow for injecting thread pool +// implementation into TFRT. We can add savedmodel/tensorflow specific +// methods (eg. create an intra op thread pool) without changing TFRT core. +class WorkQueueInterface : public tfrt::ConcurrentWorkQueue { + public: + WorkQueueInterface() = default; + explicit WorkQueueInterface(int64_t id) : id_(id) {} + explicit WorkQueueInterface(int64_t id, + thread::ThreadPoolInterface* intra_op_threadpool) + : id_(id), intra_op_threadpool_(intra_op_threadpool) {} + ~WorkQueueInterface() override = 0; + + int64_t id() const { return id_; } + + thread::ThreadPoolInterface* GetIntraOpThreadPool() const { + return intra_op_threadpool_; + } + + // Returns per-request work queue if possible. A nullptr should be returned if + // the implementation does not implement the per-request work queue. + // + // TODO(b/198671794): Remove per-request concepts from the work queue + // interface so that the interface is more composable. Per-request logic + // should be handled separately. + ABSL_DEPRECATED("Create the instance directly instead.") + virtual absl::StatusOr> InitializeRequest( + int64_t request_id) const { + return {nullptr}; + } + + private: + int64_t id_ = 0; + thread::ThreadPoolInterface* intra_op_threadpool_ = nullptr; +}; + +inline WorkQueueInterface::~WorkQueueInterface() = default; + +// Creates a WorkQueueInterface from a ConcurrentWorkQueue. The returned +// WorkQueueInterface simply delegates all its public methods to the specified +// ConcurrentWorkQueue. +std::unique_ptr WrapDefaultWorkQueue( + std::unique_ptr work_queue); + +// Creates a WorkQueueInterface from a ConcurrentWorkQueue. The returned +// WorkQueueInterface simply delegates all its public methods to the specified +// ConcurrentWorkQueue. The `intra_thread_pool` is stored and will be passed out +// when `InitializeRequest()` is called. +std::unique_ptr WrapDefaultWorkQueue( + std::unique_ptr work_queue, + thread::ThreadPoolInterface* intra_thread_pool); + +// A helper function that wraps tasks with traceme events. 
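The helper announced by the comment above (and defined immediately below) is most easily understood from a call site. The one here is hypothetical, assuming `WrapWork` deduces the closure type and that `AddTask` on `WorkQueueInterface` accepts the returned `tfrt::TaskFunction`:

```cpp
// Hypothetical call site, not upstream code.
#include <cstdint>
#include <utility>

#include "tensorflow/core/tfrt/runtime/work_queue_interface.h"

void SubmitTracedTask(tensorflow::tfrt_stub::WorkQueueInterface& queue,
                      int64_t step_id) {
  // Wraps the closure so matching producer/consumer trace events are emitted
  // around its execution.
  tfrt::TaskFunction task = tensorflow::tfrt_stub::WrapWork(
      step_id, /*name=*/"example_task", [] { /* actual work */ });
  queue.AddTask(std::move(task));
}
```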
+template +tfrt::TaskFunction WrapWork(int64_t id, absl::string_view name, + Callable&& work) { + tensorflow::Context context(tensorflow::ContextKind::kThread); + tsl::profiler::TraceMeProducer producer( + [&]() { return absl::StrCat("producer_", name); }, + tsl::profiler::ContextType::kTfrtExecutor); + return tfrt::TaskFunction([traceme_id = producer.GetContextId(), + name = std::string(name), + context = std::move(context), + work = std::forward(work)]() mutable { + tsl::profiler::TraceMeConsumer consumer( + [&]() { return absl::StrCat("consumer_", name); }, + tsl::profiler::ContextType::kTfrtExecutor, traceme_id, + tsl::profiler::TraceMeLevel::kInfo); + tensorflow::WithContext wc(context); + std::forward(work)(); + }); +} + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_RUNTIME_WORK_QUEUE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/python/saved_model_load_and_run.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/python/saved_model_load_and_run.h new file mode 100644 index 00000000..cddf15cc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/python/saved_model_load_and_run.h @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_PYTHON_SAVED_MODEL_LOAD_AND_RUN_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_PYTHON_SAVED_MODEL_LOAD_AND_RUN_H_ + +#include + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" +#include "tensorflow/core/tfrt/saved_model/saved_model.h" + +namespace tensorflow::tfrt_stub { + +absl::StatusOr> LoadSavedModel( + absl::string_view saved_model_dir, + const std::unordered_set& tags); + +std::vector RunConvertor(PyObject* args); + +absl::Status Run( + SavedModel* saved_model, + const tensorflow::tfrt_stub::GraphExecutionRunOptions& run_options, + absl::string_view name, const std::vector& inputs, + std::vector* outputs); +} // namespace tensorflow::tfrt_stub + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_PYTHON_SAVED_MODEL_LOAD_AND_RUN_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model.h new file mode 100644 index 00000000..b4c050aa --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model.h @@ -0,0 +1,353 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" +#include "tensorflow/core/tfrt/graph_executor/graph_executor.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tensorflow/core/tfrt/saved_model/saved_model_util.h" +#include "tsl/platform/protobuf.h" +#include "tfrt/host_context/function.h" // from @tf_runtime +#include "tfrt/host_context/request_deadline_tracker.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime + +namespace tfrt { +class BEFFile; +class HostContext; +} // namespace tfrt + +namespace tensorflow { +namespace tfrt_stub { + +class FunctionMetadata { + public: + explicit FunctionMetadata(const internal::Signature* signature) + : signature_(signature) { + assert(signature); + } + + const std::vector& GetInputNames() const { + return signature_->input_names; + } + + const std::vector& GetInputSpecs() const { + return signature_->input_specs; + } + + const std::vector& GetOutputNames() const { + return signature_->output_names; + } + + const std::vector& GetOutputSpecs() const { + return signature_->output_specs; + } + + const protobuf::Map& GetDefaultInputs() const { + return signature_->default_inputs; + } + + private: + friend class SavedModelImpl; + + const internal::Signature* signature_ = nullptr; +}; + +// SavedModel represents the in-memory states (graphs and variables) loaded from +// a tensorflow saved model directory. +class SavedModel { + public: + struct Options { + explicit Options(const Runtime* rt) : graph_execution_options(rt) {} + + // If true, the loading of any signature (or signature combination) will be + // deferred until the first corresponding invocationof running. Otherwise, + // the individual signatures will be loaded along with the saved model. + bool enable_lazy_loading = false; + + // If true, we'll attempt to find MLArchive within the given loading path. + // If not found, will use the path as a normal SavedModel directory. + // + // This field is deprecated. + bool maybe_load_from_mla = false; + + // If true, the lazy loading path will use tfrt_stub::GraphExecutor. + // + // TODO(b/216379787): Remove this option once b/279197040 is unblocked. + bool lazy_loading_use_graph_executor = false; + + // True if and only if SavedModel is being loaded to generate AOT results. + bool aot_generation = false; + + // Make a best-effort guess at the model type and emit a metric. 
E.g. + // detecting JAX models by looking for the `XlaCallModule` op in the + // MetaGraphDef. + bool emit_model_type_metric = false; + + GraphExecutionOptions graph_execution_options; + }; + + // Per-request options. + using RunOptions = GraphExecutionRunOptions; + + explicit SavedModel(const Runtime* runtime) : options_(runtime) { + DCHECK(runtime); + } + explicit SavedModel(Options options, + std::unique_ptr graph_executor) + : options_(std::move(options)), + graph_executor_(std::move(graph_executor)) {} + virtual ~SavedModel(); + + const SessionMetadata& model_metadata() const { + return options_.graph_execution_options.model_metadata; + } + + const Runtime& runtime() const { + DCHECK(options_.graph_execution_options.runtime); + return *options_.graph_execution_options.runtime; + } + tfrt::HostContext* GetHostContext() const; + + GraphExecutor& graph_executor() const { return *graph_executor_; } + + // Returns meta graph def. Note that the graph_def field in the MetaGraphDef + // has already been removed. + // + // TODO(b/191931702): Change the method to return SignatureDefs instead. + virtual const tensorflow::MetaGraphDef& GetMetaGraphDef() const = 0; + + // Returns all the function names. + virtual std::vector GetFunctionNames() const = 0; + + // Returns the `FunctionMetadata` for a function. If the function is not + // found, returns nullopt instead. + virtual std::optional GetFunctionMetadata( + absl::string_view func_name) const = 0; + + // Runs the signature specified by `name`. Both `inputs` and `outputs` + // are all host tensors. The `outputs` must be non-null. If the returned + // status is non-OK, the `outputs` are invalid. + virtual absl::Status Run(const RunOptions& run_options, + absl::string_view name, + absl::Span inputs, + std::vector* outputs) = 0; + + // Runs the signatures specified by `names`. Both `inputs` and `outputs` are + // all host tensors. The `outputs` must be non-null. If the returned status is + // non-OK, the `outputs` are invalid. + // + // NOTE: If the given signatures have overlapping input nodes, the input + // tensors for these overlapping nodes must be the same. Having different + // input tensors for overlapping nodes results UNDEFINED BEHAVIOR. + // + // NOTE: The input/output tensors can only be dense tensors (as opposed to + // sparse tensors or composite tensors). + virtual absl::Status RunMultipleSignatures( + const RunOptions& run_options, absl::Span names, + absl::Span> multi_inputs, + std::vector>* multi_outputs) = 0; + + // Runs the graphs specified by the tensor names terminal tensors (eg. feed + // tensors, fetch tesnors) in the graph. + virtual absl::Status RunByTensorNames( + const RunOptions& run_options, + absl::Span> inputs, + absl::Span output_tensor_names, + absl::Span target_node_names, + std::vector* outputs) = 0; + + protected: + const FallbackState& fallback_state() const { + return graph_executor_->fallback_state(); + } + FallbackState& fallback_state() { return graph_executor_->fallback_state(); } + + const Options options_; + std::unique_ptr graph_executor_; +}; + +using SignatureMap = absl::flat_hash_map; +using ::tensorflow::StatusOr; + +class SavedModelImpl final : public SavedModel { + public: + struct JoinedSignature; + + // Loads all SignatureDefs in a MetaGraphDef that matches the `tags` in the + // tensorflow saved model from `saved_model_dir`. Refer to + // http://g3doc/learning/serving/g3doc/saved_model/overview.md + // for explanations on SavedModel. 
+ // + // If `options.maybe_load_from_mla` is true, tries opening `saved_model_dir` + // as an MLA. If it's not an MLA, uses it as a normal SavedModel directory. + static absl::StatusOr> LoadSavedModel( + Options options, absl::string_view saved_model_dir, + const std::unordered_set& tags); + + // Loads all SignatureDefs in `meta_graph_def`. Refer to + // http://g3doc/learning/serving/g3doc/saved_model/overview.md + // for explanations on SavedModel. + static absl::StatusOr> LoadSavedModel( + Options options, tensorflow::MetaGraphDef meta_graph_def, + absl::string_view saved_model_dir); + + SavedModelImpl( + Options options, SymbolUids symbol_uids, + tensorflow::MetaGraphDef meta_graph_def, tfrt::BefBuffer bef, + tfrt::RCReference bef_file, mlrt::bc::Buffer bytecode, + std::optional loaded_executable, + absl::flat_hash_map signatures, + std::unique_ptr runner_table, + std::unique_ptr resource_array, + std::unique_ptr graph_executor); + + ~SavedModelImpl() override = default; + + SavedModelImpl(const SavedModelImpl&) = delete; + SavedModelImpl& operator=(const SavedModelImpl&) = delete; + + const tensorflow::MetaGraphDef& GetMetaGraphDef() const override; + + std::vector GetFunctionNames() const override; + + std::optional GetFunctionMetadata( + absl::string_view func_name) const override; + + absl::Status Run(const RunOptions& run_options, absl::string_view name, + absl::Span inputs, + std::vector* outputs) override; + + absl::Status RunMultipleSignatures( + const RunOptions& run_options, absl::Span names, + absl::Span> multi_inputs, + std::vector>* multi_outputs) override; + + absl::Status RunByTensorNames( + const RunOptions& run_options, + absl::Span> inputs, + absl::Span output_tensor_names, + absl::Span target_node_names, + std::vector* outputs) override; + + private: + // The result of loading signature(s). + struct LoadingResult { + std::string name; + SymbolUids symbol_uids; + + // For the MLRT path. + mlrt::bc::Buffer bytecode_buffer; + std::unique_ptr bytecode_executable; + + // For the TFRT path. + tfrt::BefBuffer bef; + tfrt::RCReference bef_file; + + std::unique_ptr runner_table; + std::unique_ptr resource_array; + + // There are some resources that need re-creating when the executable is + // re-created, so a resource context is stored along with the executable. + // This resource context is meant to be passed to the op kernels for their + // references. See the comment above `GraphExecutor::resource_context_` + // about the todo to merge that resource context with this one. + std::unique_ptr resource_context; + }; + + // Imports a subgraph as an MLIR module with the specified `input_nodes`, + // `output_nodes`. + absl::StatusOr> ImportSubgraph( + mlir::MLIRContext* context, absl::string_view name, + const tensorflow::GraphImportConfig::InputArrays& input_nodes, + const std::vector& output_nodes, + const std::vector& target_nodes); + + // Given the joined signature, loads the subgraph and returns loading result. + absl::StatusOr> + LoadJoinedSignature(const JoinedSignature& joined_signature) + TF_EXCLUSIVE_LOCKS_REQUIRED(loading_result_cache_mu_); + + // Returns the loading result given the signature names. + absl::StatusOr> + GetOrCreateLoadingResult(const RunOptions& run_options, + absl::Span names) + TF_LOCKS_EXCLUDED(loading_result_cache_mu_); + + SymbolUids symbol_uids_; + // `meta_graph_def_` only contains metadata of the model. The graph_def field + // is removed. + // + // TODO(b/191931702): We should only keep content that are actually used + // (eg. 
SignatureDefs), instead of keeping the whole saved model, to avoid + // unnecessary memory usage. + tensorflow::MetaGraphDef meta_graph_def_; + tfrt::BefBuffer bef_; + tfrt::RCReference bef_file_; + + mlrt::bc::Buffer bytecode_; + std::optional loaded_executable_; + + tfrt::RequestDeadlineTracker req_deadline_tracker_; + absl::flat_hash_map signatures_; + std::unique_ptr runner_table_; + std::unique_ptr resource_array_; + tensorflow::mutex loading_result_cache_mu_; + // For pointer stability of values in `absl::flat_hash_map<>`, additional + // `std::unique_ptr<>` is necessary. (See https://abseil.io/tips/136.) + absl::flat_hash_map> + loading_result_cache_ TF_GUARDED_BY(loading_result_cache_mu_); +}; + +class SavedModelMiraImpl; + +} // namespace tfrt_stub +} // namespace tensorflow + +namespace tfrt { + +using SavedModel = ::tensorflow::tfrt_stub::SavedModel; +using SavedModelImpl = ::tensorflow::tfrt_stub::SavedModelImpl; +using SavedModelMiraImpl = ::tensorflow::tfrt_stub::SavedModelMiraImpl; +using TensorSpec = ::tensorflow::tfrt_stub::TensorSpec; +using FunctionMetadata = ::tensorflow::tfrt_stub::FunctionMetadata; + +namespace internal { +using Signature = ::tensorflow::tfrt_stub::internal::Signature; +} + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h new file mode 100644 index 00000000..27db2c92 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h @@ -0,0 +1,105 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_AOT_COMPILE_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_AOT_COMPILE_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "tensorflow/compiler/jit/device_compilation_cluster_signature.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/pjrt/pjrt_executable.h" +#include "xla/service/compiler.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime + +namespace tensorflow::tfrt_stub { +struct AotOptions { + AotOptions(); + std::unordered_set tags = {}; + std::shared_ptr graph_execution_options; + // TODO(b/296466237): support compiling for multiple signature functions. 
+ // The signature name to be AOT compiled. + std::string signature_name; +}; + +struct AotResult { + using ExecutableMap = + absl::flat_hash_map; + std::variant buffer; + // TODO(b/296466237): Investigate whether the whole FunctionDefLibrary should + // be put here. + // XLA cluster functions corresponding to `XlaLaunch` op, generated during + // bridge. + std::vector xla_functions; +}; + +// AOT compiles saved_model in input_model_dir and returns AotResult, otherwise +// returns error. +absl::StatusOr AotCompileSavedModel( + absl::string_view input_model_dir, AotOptions aot_options = {}); + +// TODO(b/296466237): Add unit test. +// Runs bridge and compiles the generated XLA functions corresponding to the +// signature function with name `siganture_name` in MetaGraphDef. +// `input_shapes` maps input signature node name to its tensor shape, and is +// used to make up for the missing input shape information in the graph if any +// so that shape inference pass in bridge can proceed correctly. Returns +// AotResult::ExecutableMap as compilation result, which maps function +// signatures to serialized executables. +absl::StatusOr AotCompileXlaFunctionsInMetaGraphDef( + const MetaGraphDef& meta_graph_def, const std::string& signature_name, + const absl::flat_hash_map& + input_shapes, + const tensorflow::FunctionDefLibrary& fdef_lib, + const tensorflow::SessionOptions& session_options, + const mlir::DialectRegistry& registry, const AotOptions& aot_options, + absl::string_view input_model_dir, ModelRuntimeContext& model_context); + +// TODO(b/296466237): make this function general for all devices. +// AOT compiles `function` into PjRtExecutable. It is the counterpart of the JIT +// version `CompileToPjRtLoadedExecutable`. `compilation_result` contains the +// generated XLA computation. +absl::StatusOr> +AotCompileToGpuPjRtExecutable( + const FunctionLibraryDefinition* flib_def, const NameAttrList& function, + int graph_def_version, const std::vector& args, + bool has_ref_vars, bool may_alias_resource_update, + const stream_executor::GpuTargetConfigProto& gpu_target_config, + XlaCompiler::CompilationResult** compilation_result); + +// Returns serialized PJRT loaded GPU executable. This function requires GPU +// device to be present during compilation. +absl::StatusOr AotCompileToGpuPjRtLoadedExecutableWithDevice( + const FunctionLibraryDefinition* flib_def, const NameAttrList& function, + int graph_def_version, const std::vector& args, + bool has_ref_vars, bool may_alias_resource_update, + XlaCompiler::CompilationResult** compilation_result); +} // namespace tensorflow::tfrt_stub + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_AOT_COMPILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_import_input.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_import_input.h new file mode 100644 index 00000000..5ff375a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_import_input.h @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_IMPORT_INPUT_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_IMPORT_INPUT_H_ + +#include +#include + +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/graph_executor/config.h" +#include "tensorflow/core/tfrt/utils/tfrt_graph_execution_state.h" + +namespace tensorflow { +namespace tfrt_stub { + +// TfrtSavedModelMLIRImportInput implements SavedModelMLIRImportInput, so that +// it can perform customization (eg. Placer and Grappler) on the input graph to +// the MLIR importer. +class TfrtSavedModelMLIRImportInput : public SavedModelMLIRImportInput { + public: + static absl::StatusOr Create( + const FallbackState& fallback_state, const MetaGraphDef* meta_graph_def, + const GraphDebugInfo& debug_info, + bool run_placer_grappler_on_nested_functions = false, + tensorflow::tfrt_stub::RuntimeConfig* runtime_config = nullptr); + + TfrtSavedModelMLIRImportInput( + const MetaGraphDef* meta_graph_def, const GraphDebugInfo& debug_info, + std::unique_ptr graph_execution_state); + + absl::StatusOr GetSubGraph( + absl::string_view name, GraphImportConfig& graph_import_config) override; + + // Return the time used by grappler. + absl::Duration GetGrapplerDuration() const { return grappler_duration_; } + + // Return the time used by functionalization. + absl::Duration GetFunctionalizationDuration() const { + return functionalization_duration_; + } + + private: + std::unique_ptr graph_execution_state_; + absl::flat_hash_map> + optimized_graphs_; + + absl::Duration functionalization_duration_; + absl::Duration grappler_duration_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_IMPORT_INPUT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_testutil.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_testutil.h new file mode 100644 index 00000000..c0a69cd9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_testutil.h @@ -0,0 +1,127 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_TESTUTIL_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_TESTUTIL_H_ + +#include + +#include +#include +#include +#include +#include + +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tensorflow/core/tfrt/saved_model/saved_model.h" +#include "tfrt/host_context/host_context.h" // from @tf_runtime + +#if defined(PLATFORM_GOOGLE) +ABSL_DECLARE_FLAG(bool, enable_optimizer); +ABSL_DECLARE_FLAG(std::string, force_data_format); +#endif + +namespace tensorflow { +namespace tfrt_stub { + +std::unique_ptr DefaultTfrtRuntime( + int num_threads); + +struct UserSavedModelOptions { + bool enable_mlrt = false; + bool enable_optimizer = false; + bool enable_grappler = false; + std::string force_data_format = ""; +}; + +SavedModel::Options DefaultSavedModelOptions( + tensorflow::tfrt_stub::Runtime* runtime, + std::optional user_options = std::nullopt); + +class TFRTSavedModelTest { + public: + explicit TFRTSavedModelTest(const std::string& saved_model_dir); + TFRTSavedModelTest(const std::string& saved_model_dir, + std::unique_ptr runtime); + + SavedModel* GetSavedModel() { return saved_model_.get(); } + + tfrt::HostContext* GetHostContext() const { + return saved_model_->GetHostContext(); + } + + private: + std::unique_ptr runtime_; + std::unique_ptr saved_model_; +}; + +template +tensorflow::Tensor CreateTfTensor(absl::Span shape, + absl::Span data) { + tensorflow::Tensor tensor(tensorflow::DataTypeToEnum::value, + tensorflow::TensorShape(shape)); + auto flat = tensor.flat(); + for (int i = 0; i < data.size(); ++i) { + flat(i) = data[i]; + } + return tensor; +} + +template +std::vector GetTfTensorData(const tensorflow::Tensor& tensor) { + return std::vector(tensor.flat().data(), + tensor.flat().data() + tensor.NumElements()); +} + +inline tensorflow::Tensor CreateTfStringTensor( + absl::Span shape, absl::Span data) { + return CreateTfTensor(shape, data); +} + +void ComputeCurrentTFResult(const std::string& saved_model_dir, + const std::string& signature_name, + const std::vector& input_names, + const std::vector& inputs, + const std::vector& output_names, + std::vector* outputs, + bool enable_mlir_bridge = false, + bool disable_grappler = false); + +// Compute the results using TF1 session loaded from the saved model. In +// addition to returning the result tensors, it also fills `bundle` with the +// loaded savedmodel. This is useful as sometimes the result tensors may only be +// valid when the bundle is alive. 
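// A minimal call sketch for the overload declared below; the model path,
// signature name, and tensor names are illustrative placeholders (and
// `input_tensor` is assumed to be a tensorflow::Tensor already in scope),
// not values taken from this header.
//
//   tensorflow::SavedModelBundle bundle;
//   std::vector<tensorflow::Tensor> outputs;
//   ComputeCurrentTFResult("/tmp/saved_model", "serving_default",
//                          /*input_names=*/{"x:0"}, /*inputs=*/{input_tensor},
//                          /*output_names=*/{"y:0"}, &outputs, &bundle);
//   // `outputs` remains valid for as long as `bundle` stays alive.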
+void ComputeCurrentTFResult(const std::string& saved_model_dir, + const std::string& signature_name, + const std::vector& input_names, + const std::vector& inputs, + const std::vector& output_names, + std::vector* outputs, + tensorflow::SavedModelBundle* bundle, + bool enable_mlir_bridge = false, + bool disable_grappler = false); + +void ExpectTensorEqual(const tensorflow::Tensor& x, const tensorflow::Tensor& y, + std::optional error = std::nullopt); + +SavedModel::Options DefaultTpuModelOptions( + tensorflow::tfrt_stub::Runtime* runtime, + tensorflow::TfrtDeviceInfraTarget device_target); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_TESTUTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_util.h new file mode 100644 index 00000000..409a31a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/saved_model_util.h @@ -0,0 +1,154 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/graph_executor/config.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tsl/platform/protobuf.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// Filename for serialized BEF Buffer. +inline constexpr char kBefBufferFileName[] = "serialized_bef.mlir.bef"; + +// Filename for serialized MLRT bytecode Buffer. +inline constexpr char kMlrtBufferFileName[] = "serialized_mlrt.mlir.mlrt"; + +// Filename for serialized MLIR_MODULE. +inline constexpr char kMlirModuleFilename[] = "serialized_mlir.mlir"; + +// Subdirectory where AoT Packages are saved +inline constexpr char kAotPackagesDirectory[] = "aot_packages"; + +// TODO(tfrt-dev): Replace tfrt::TensorSpec with tensorflow::TensorSpec once the +// latter is checked in. 
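// A small construction sketch for the struct below; the dtype and shape are
// illustrative values only.
//
//   TensorSpec ranked_spec(tensorflow::DT_FLOAT,
//                          tensorflow::PartialTensorShape({-1, 128}));
//   TensorSpec dtype_only_spec(tensorflow::DT_INT64);  // shape left unknown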
+struct TensorSpec { + tensorflow::DataType dtype; + tensorflow::PartialTensorShape shape; + + explicit TensorSpec(tensorflow::DataType dtype) : dtype(dtype) {} + TensorSpec(tensorflow::DataType dtype, tensorflow::PartialTensorShape shape) + : dtype(dtype), shape(std::move(shape)) {} +}; + +inline bool operator==(const TensorSpec& a, const TensorSpec& b) { + return a.dtype == b.dtype && a.shape.IsIdenticalTo(b.shape); +} + +namespace internal { + +struct Signature { + // The following three fields should have the same size. + std::vector input_names; + std::vector input_specs; + std::vector input_devices; + + // The following two fields should have the same size. + std::vector output_names; + std::vector output_specs; + protobuf::Map default_inputs; +}; + +} // namespace internal + +// If `import_signature_names` is non-empty, this function only imports the +// graph that corresponds to this list. +absl::StatusOr> ImportSavedModel( + mlir::MLIRContext* context, const tensorflow::MetaGraphDef& meta_graph_def, + const FallbackState& fallback_state, std::string saved_model_dir, + bool import_user_signatures, bool run_placer_grappler_on_functions, + const std::vector& import_signature_names = {}, + tensorflow::tfrt_stub::RuntimeConfig* runtime_config = nullptr); + +absl::StatusOr ReadSavedModel( + absl::string_view saved_model_dir, + const std::unordered_set& tags); + +using SignatureMap = absl::flat_hash_map; +using ::tensorflow::StatusOr; + +struct Initializer { + std::string name; + std::vector inputs; +}; + +struct InitializersAndSignatures { + // Initializers are kept in a certain order as they need to be executed in + // that order. + std::vector initializers; + SignatureMap signature_map; +}; + +// If `saved_model_dir` is non-empty, this function fills in the Initializer's +// inputs in the returned result. +absl::StatusOr GetInitializersAndSignatures( + mlir::ModuleOp module, absl::string_view saved_model_dir = ""); + +std::string GetAotPackagePath(absl::string_view saved_model_dir); + +std::string GetBefFilePath(std::string aot_package_directory); + +std::string GetMlirFilePath(const std::string& aot_package_directory); + +// TODO(b/295241000): Implement MLIR deserialization to skip it AoT and remove +// redundant steps +absl::StatusOr LoadBefAndMlir( + const TfrtCompileOptions& options, mlir::ModuleOp mlir_module, + const std::string& saved_model_dir, + tfrt_stub::FallbackState* fallback_state); + +absl::StatusOr LoadMlrtAndMlir( + const TfrtCompileOptions& options, mlir::ModuleOp mlir_module, + const std::string& saved_model_dir, + tfrt_stub::FallbackState* fallback_state); + +absl::Status DeserializeAoTMlirModule( + absl::string_view saved_model_dir, mlir::MLIRContext* context, + mlir::OwningOpRef* mlir_module); + +CallableOptions CombineSignatureDefs( + const google::protobuf::Map& signature_defs); + +void RegisterTfrtDialectsForAot(mlir::DialectRegistry& registry); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_SAVED_MODEL_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/utils/serialize_utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/utils/serialize_utils.h new file mode 100644 index 00000000..6708b44a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/saved_model/utils/serialize_utils.h @@ -0,0 +1,54 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_SAVED_MODEL_UTILS_SERIALIZE_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_SAVED_MODEL_UTILS_SERIALIZE_UTILS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" +#include "tsl/platform/env.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime + +namespace tensorflow { +namespace tfrt_stub { + +// Serializes the BefBuffer into a file. +absl::Status SerializeBEF(const tfrt::BefBuffer &bef, + const std::string &filepath); + +// Deserializes BEF file from filepath into a BEFBuffer. +absl::StatusOr DeserializeBEFBuffer( + const std::string &filepath); + +// Serializes the MLRTBytecodeBuffer into a file. +absl::Status SerializeMLRTBytecode(const mlrt::bc::Buffer &byteCode, + const std::string &filepath); + +// Deserializes byte code from the given filepath into a MLRTBytecodeBuffer. +absl::StatusOr DeserializeMlrtBytecodeBuffer( + const std::string &filepath); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_SAVED_MODEL_UTILS_SERIALIZE_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/model_config_stub.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/model_config_stub.h new file mode 100644 index 00000000..6518fd21 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/model_config_stub.h @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_STUBS_MODEL_CONFIG_STUB_H_ +#define TENSORFLOW_CORE_TFRT_STUBS_MODEL_CONFIG_STUB_H_ + +#include + +#include "absl/log/log.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tensorflow/core/tfrt/saved_model/saved_model_util.h" + +namespace tensorflow { +namespace tfrt_stub { + +// TODO(b/299140515): Deprecate this stub and OSS the implementation. +// The tfrt model config stub that provides interface for internal and OSS +// with different impls. 
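// A registration sketch for the stub declared below; `MyModelConfigStub` is a
// hypothetical subclass used only for illustration.
//
//   class MyModelConfigStub : public ModelConfigStub {
//     void GetDefaultInputsFromModelConfig(ModelRuntimeContext& context,
//                                          SignatureMap& signatures) override {
//       // Populate `signatures` with environment-specific default inputs.
//     }
//   };
//   static bool registered =
//       RegisterModelConfigStub(std::make_unique<MyModelConfigStub>());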
+class ModelConfigStub { + public: + virtual ~ModelConfigStub() = default; + + virtual void GetDefaultInputsFromModelConfig(ModelRuntimeContext& context, + SignatureMap& signatures) { + LOG(INFO) << "Unimplemented in non internal env"; + } +}; + +// The return value is to facilitate the global registration. +bool RegisterModelConfigStub(std::unique_ptr stub); + +void GetDefaultInputsFromModelConfig(ModelRuntimeContext& context, + SignatureMap& signatures); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_STUBS_MODEL_CONFIG_STUB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/tfrt_native_lowering_stub.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/tfrt_native_lowering_stub.h new file mode 100644 index 00000000..d27fe02d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/stubs/tfrt_native_lowering_stub.h @@ -0,0 +1,59 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_STUBS_TFRT_NATIVE_LOWERING_STUB_H_ +#define TENSORFLOW_CORE_TFRT_STUBS_TFRT_NATIVE_LOWERING_STUB_H_ + +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/tfrt/graph_executor/executable_context.h" +#include "tensorflow/core/tfrt/graph_executor/sync_resource_state.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime + +namespace tfrt { + +// The tfrt native lowering stub that provides interface for internal and OSS +// with different impls. +class TfrtNativeLoweringStub { + public: + virtual ~TfrtNativeLoweringStub() = default; + virtual void AddSyncContext( + mlrt::ExecutionContext& execution_context, HostContext& host_context, + tensorflow::tfrt_stub::SyncResourceState* sync_state) {} + virtual absl::StatusOr< + std::shared_ptr> + BuildExecutableContext(mlir::ModuleOp module, + const mlrt::KernelRegistry& kernel_registry) { + return absl::UnimplementedError(""); + } +}; + +void RegisterTfrtNativeLoweringStub( + std::unique_ptr stub); + +void AddSyncContext(mlrt::ExecutionContext& execution_context, + tfrt::HostContext& host_context, + tensorflow::tfrt_stub::SyncResourceState* sync_state); + +absl::StatusOr> +BuildExecutableContext(mlir::ModuleOp module, + const mlrt::KernelRegistry& kernel_registry); +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_STUBS_TFRT_NATIVE_LOWERING_STUB_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session.h new file mode 100644 index 00000000..84de49eb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session.h @@ -0,0 +1,121 @@ +/* Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_H_ +#define TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "tensorflow/compiler/mlir/tfrt/backend_compiler.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/session_factory.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tsl/platform/thread_annotations.h" + +namespace tensorflow { + +// Struct exposing a few threadpool configuration options. These +// correspond to the options in RunHandlerThreadWorkQueue::Options. +struct TfrtThreadpoolOptions { + // Number of threads used for running graphs. + int32_t num_main_threads = port::MaxParallelism(); + + // Time to wait for the init function to complete. + absl::Duration init_timeout = absl::Milliseconds(100); + + // Maximum number of concurrent RunHandlers. + int32_t max_concurrent_handler = 128; + + // Number of sub thread pools. + int32_t num_sub_thread_pool = 1; +}; + +struct TfrtSessionOptions { + TfrtThreadpoolOptions threadpool_options; + tensorflow::tfrt_stub::Runtime* runtime = nullptr; + bool enable_mlrt = false; + // Should only set one of `use_tpu` and `use_gpu` and `backend_compiler`. + bool use_tpu = false; + bool use_gpu = false; + tensorflow::BackendCompiler* backend_compiler = nullptr; +}; + +// Factory class to create `TfrtSession` instances. +class TfrtSessionFactory : public tensorflow::SessionFactory { + public: + TfrtSessionFactory(); + + bool AcceptsOptions(const SessionOptions& options) override; + + absl::Status NewSession(const SessionOptions& options, + Session** out_session) override + TF_LOCKS_EXCLUDED(mutex_); + + // This should only be used for the sake initializing resources for + // Python executables. It should only be called before main. + // + // Due to lack of applications and a concern for the ordering of initializers, + // this may only be called once. + using RuntimeInitializer = absl::Status (*)(tfrt_stub::Runtime*); + static void RegisterInitializer(RuntimeInitializer initializer); + + // May not be called within code holding mutex_. 
+ static tfrt_stub::Runtime* GetRuntime(); + + private: + class ThreadPoolManager; + friend absl::Status InitializeTfrtSession(const TfrtSessionOptions& options); + friend absl::Status UpdateTfrtSessionOptionsLocked( + const TfrtSessionOptions& options); + absl::Status InitializeLocked(const TfrtSessionOptions& options) + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + bool IsInitialized() const TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { + return runtime_ != nullptr; + } + + mutable absl::Mutex mutex_; + mutable absl::Mutex runtime_mutex_; + tensorflow::tfrt_stub::Runtime* runtime_ TF_GUARDED_BY(mutex_) = nullptr; + std::unique_ptr owned_runtime_ + TF_GUARDED_BY(mutex_); + + TfrtDeviceInfraTarget device_target_ TF_GUARDED_BY(mutex_) = + TfrtDeviceInfraTarget::kCpu; + bool tpu_use_tpu_runner_ TF_GUARDED_BY(mutex_) = false; + bool use_gpu_ TF_GUARDED_BY(mutex_) = false; + std::unique_ptr thread_pool_manager_ TF_GUARDED_BY(mutex_); + bool enable_mlrt_ TF_GUARDED_BY(mutex_) = false; + tensorflow::BackendCompiler* backend_compiler_ TF_GUARDED_BY(mutex_); + std::unique_ptr device_manager_; +}; + +// Configures the TfrtSessionFactory according to `options`. Should not be +// called within functions that are passed into +// `TfrtSessionFactory::RegisterInitializer`, because it acquires `mutex_`. +absl::Status InitializeTfrtSession(const TfrtSessionOptions& options); + +// Version of `InitializeTfrtSession` that can be used within functions passed +// into `TfrtSessionFactory::RegisterInitializer`. +absl::Status UpdateTfrtSessionOptionsLocked(const TfrtSessionOptions& options); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session_init.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session_init.h new file mode 100644 index 00000000..7891a0a8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/tfrt_session/tfrt_session_init.h @@ -0,0 +1,34 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_INIT_H_ +#define TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_INIT_H_ + +#include "tensorflow/core/common_runtime/local_session_selection.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// Use TfrtSession as the Session implementation for local session. +// +// TODO(jingdong): Merge this function with the InitializeTfrtSession() in +// tfrt_session.h after we decouple TPU logic from TfrtSession. 
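// A startup sketch for the function defined below; the surrounding code is
// assumed to run inside a function returning absl::Status, and error handling
// is otherwise elided.
//
//   TF_RETURN_IF_ERROR(tensorflow::InitializeTfrtSession());
//   // Subsequent NewSession(SessionOptions()) calls now create TfrtSession
//   // instances, because the default local session implementation is TFRT.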
+inline absl::Status InitializeTfrtSession() { + SetDefaultLocalSessionImpl(LocalSessionImpl::kTfrtSession); + return absl::OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_TFRT_SESSION_TFRT_SESSION_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/any_ptr.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/any_ptr.h new file mode 100644 index 00000000..8b1a496c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/any_ptr.h @@ -0,0 +1,170 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_UTILS_ANY_PTR_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_ANY_PTR_H_ + +#include +#include + +namespace tfrt { + +/// A (sort of) type-safe void*. Appears as null if a caller attempts to use it +/// as the wrong type. +/// +/// Example use: +/// +/// // A function that returns an AnyPtr: +/// AnyPtr StringOrInt() { +/// if (use_string) { +/// return AnyPtr(&some_string); +/// } else { +/// return AnyPtr(&some_int); +/// } +/// } +/// +/// // Use an AnyPtr at the correct type: +/// AnyPtr ptr = StringOrInt(); +/// if (ptr.get() != nullptr) { +/// DoSomethingWithInt(*ptr.get()); +/// } else if (ptr.get() != nullptr) { +/// DoSomethingWithString(*ptr.get()); +/// } else { +/// // Handle error. +/// } +/// +/// Typical best practice for this class is to use it when two disjoint pieces +/// of code must agree on type, but intermediate code is type agnostic. Large +/// chains of conditionals that handle a multitude of types is discouraged as an +/// anti-pattern. +/// +/// Note that this will appear null even if T is somewhere on the underlying +/// type's inheritance hierarchy, if you must use the object at some other type +/// you must do so explicitly when constructing an AnyPtr, like so: +/// +/// SomeObject object; +/// AnyPtr any_ptr(static_cast(&object)); +/// SomeInterface* interface = any_ptr.get(); +/// +/// This class is a value type; It can be copied or assigned. It performs no +/// internal allocations and should be relatively cheap to copy or return by +/// value. +class AnyPtr { + public: + /// AnyPtr is void and null by default. + AnyPtr() : type_id_(FastTypeId()), ptr_(nullptr) {} + + /// Implicit construction from nullptr. + AnyPtr(std::nullptr_t) : AnyPtr() {} // NOLINT + + /// Construct from a pointer to any type. + template + AnyPtr(T* ptr) // NOLINT + : type_id_(FastTypeId()), + // We need a double cast here, first to drop the type, and second to + // drop constness. We always cast back to the appropriate type and + // constness in get<>(), since FastTypeId is different for a const and + // non-const T. + ptr_(const_cast(reinterpret_cast(ptr))) {} + + /// Accessor for the underlying pointer if it is of type T, otherwise null. 
+ template + T* get() const { + if (type_id_ != FastTypeId()) { + return nullptr; + } + return reinterpret_cast(ptr_); + } + + private: + template + static size_t FastTypeId() { + // Use a static variable to get a unique per-type address. + static int dummy; + return reinterpret_cast(&dummy); + } + + // The code for the type of 'ptr_'. + std::size_t type_id_; + + // The underlying pointer. + void* ptr_; +}; + +/// Like AnyPtr, but owns the pointed-to object (calls delete upon destruction). +/// This class is move-only, like std::unique_ptr. +class UniqueAnyPtr { + public: + /// UniqueAnyPtr is void and null by default. + UniqueAnyPtr() = default; + UniqueAnyPtr(std::nullptr_t) : UniqueAnyPtr() {} // NOLINT + + /// Construct from a unique pointer to any type. + template + explicit UniqueAnyPtr(std::unique_ptr ptr) + : ptr_(ptr.release()), deleter_(DeleterForType()) {} + + ~UniqueAnyPtr() { deleter_(ptr_); } + + // Disable copy. + UniqueAnyPtr(const UniqueAnyPtr& other) = delete; + UniqueAnyPtr& operator=(const UniqueAnyPtr& other) = delete; + + // Allow move. + UniqueAnyPtr(UniqueAnyPtr&& other) noexcept { swap(other); } + + UniqueAnyPtr& operator=(UniqueAnyPtr&& other) noexcept { + swap(other); + return *this; + } + + /// Accessor for the underlying pointer if it is of type T, otherwise null. + template + T* get() const { + return ptr_.get(); + } + + /// Accessor for the underlying pointer as an AnyPtr. + const AnyPtr& as_any_ptr() const { return ptr_; } + + void swap(UniqueAnyPtr& other) noexcept { + using ::std::swap; + swap(ptr_, other.ptr_); + swap(deleter_, other.deleter_); + } + + private: + // We use a raw function pointer. This eliminates the copy and calling + // overhead of std::function. + using Deleter = void (*)(AnyPtr ptr); + + // Returns a 'Deleter' that will delete it's argument as an instance of 'T'. + // Always returns the same value for the same 'T'. + template + static Deleter DeleterForType() { + return [](AnyPtr ptr) { delete ptr.get(); }; + } + + static Deleter NoOpDeleter() { + return [](AnyPtr ptr) {}; + } + + AnyPtr ptr_ = nullptr; + Deleter deleter_ = NoOpDeleter(); +}; + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_ANY_PTR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/bridge_graph_analysis.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/bridge_graph_analysis.h new file mode 100644 index 00000000..3fe4d67d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/bridge_graph_analysis.h @@ -0,0 +1,35 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_BRIDGE_GRAPH_ANALYSIS_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_BRIDGE_GRAPH_ANALYSIS_H_ + +#include "tensorflow/core/platform/status.h" + +namespace tfrt { + +inline tensorflow::Status CheckTpuMlirBridgeCompatibility( + const tensorflow::GraphDef& graph_def) { + return tensorflow::OkStatus(); +} + +inline tensorflow::Status CheckSpmdGraph( + const tensorflow::GraphDef& graph_def) { + return tensorflow::OkStatus(); +} + +} // namespace tfrt + + +#endif // TENSORFLOW_CORE_TFRT_UTILS_BRIDGE_GRAPH_ANALYSIS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/debug/node_io_dump_rewriter.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/debug/node_io_dump_rewriter.h new file mode 100644 index 00000000..068c19ba --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/debug/node_io_dump_rewriter.h @@ -0,0 +1,44 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_DEBUG_NODE_IO_DUMP_REWRITER_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_DEBUG_NODE_IO_DUMP_REWRITER_H_ + +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace tfrt_stub { + +// Rewrites `graph` by inserting dump nodes for `nodes_to_dump`. During graph +// execution, the inputs and outputs of `nodes_to_dump` will be dumped to the +// folder specified by env var `TF_DUMP_GRAPH_PREFIX`. +absl::Status InsertDumpOps( + Graph& graph, const absl::flat_hash_set& nodes_to_dump, + absl::string_view dump_dir = ""); +// Similar to the above, but rewrites a `meta_graph_def`. +absl::Status InsertDumpOps( + MetaGraphDef& meta_graph_def, + const absl::flat_hash_set& nodes_to_dump, + absl::string_view dump_dir = ""); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_UTILS_DEBUG_NODE_IO_DUMP_REWRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/device_variables_table.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/device_variables_table.h new file mode 100644 index 00000000..1b1a742e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/device_variables_table.h @@ -0,0 +1,98 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_DEVICE_VARIABLES_TABLE_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_DEVICE_VARIABLES_TABLE_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/synchronization/mutex.h" +#include "llvm/ADT/FunctionExtras.h" +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime + +namespace tfrt { + +// A variable table that keeps track of the device copies of host tensors. +// The same variable can have multiple copies on devices (e.g., on different TPU +// cores), and hence they are differenticated via `copy_index`. +// The table maps from to device tensor. +template +class DeviceVariablesTable { + public: + virtual ~DeviceVariablesTable() { ClearDeviceVariablesTable(); } + + void AddOrUpdateDeviceVariable( + const HostTensorType& host_tensor, int copy_index, + AsyncValueRef device_tensor) { + absl::MutexLock lock(&device_variables_mu_); + device_variables_table_.insert_or_assign( + std::make_pair(GetHostTensorDataPtr(host_tensor), copy_index), + std::move(device_tensor)); + } + + AsyncValueRef GetDeviceVariable( + const HostTensorType& host_tensor, int copy_index) { + absl::ReaderMutexLock lock(&device_variables_mu_); + auto it = device_variables_table_.find( + std::make_pair(GetHostTensorDataPtr(host_tensor), copy_index)); + return it != device_variables_table_.end() + ? it->second.CopyRef() + : AsyncValueRef(); + } + + AsyncValueRef GetOrAddDeviceVariable( + const HostTensorType& host_tensor, int copy_index, + llvm::unique_function)> creator) { + absl::ReleasableMutexLock lock(&device_variables_mu_); + auto it = device_variables_table_.find( + std::make_pair(GetHostTensorDataPtr(host_tensor), copy_index)); + if (it != device_variables_table_.end()) return it->second.CopyRef(); + + auto device_tensor = MakeUnconstructedAsyncValueRef(); + device_variables_table_.insert( + {std::make_pair(GetHostTensorDataPtr(host_tensor), copy_index), + device_tensor.CopyRef()}); + lock.Release(); + creator(device_tensor.CopyRef()); + return device_tensor; + } + + void ClearDeviceVariablesTable() { + absl::MutexLock lock(&device_variables_mu_); + device_variables_table_.clear(); + } + + int size() { + absl::ReaderMutexLock lock(&device_variables_mu_); + return device_variables_table_.size(); + } + + protected: + // Get the host tensor data pointer, which is used as a part of the table key. + virtual const void* GetHostTensorDataPtr( + const HostTensorType& host_tensor) = 0; + + private: + absl::Mutex device_variables_mu_; + + // A map from to device tensor. + absl::flat_hash_map, + AsyncValueRef> + device_variables_table_ ABSL_GUARDED_BY(device_variables_mu_); +}; + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_DEVICE_VARIABLES_TABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/error_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/error_util.h new file mode 100644 index 00000000..229b854a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/error_util.h @@ -0,0 +1,80 @@ +/* Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_ERROR_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_ERROR_UTIL_H_ + +#include + +#include "tensorflow/core/platform/status.h" +#include "tfrt/support/error_util.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tfrt { +class DecodedDiagnostic; + +tfrt::ErrorCode ConvertTfErrorCodeToTfrtErrorCode(const absl::Status& status); + +absl::Status CreateTfErrorStatus(const DecodedDiagnostic& error); + +absl::Status ToTfStatus(const AsyncValue* av); + +inline std::string MakeStatusString(absl::Status status) { + switch (static_cast(status.code())) { + case absl::StatusCode::kOk: + return "OK"; + case absl::StatusCode::kCancelled: + return absl::StrCat("Cancelled: ", status.message()); + case absl::StatusCode::kUnknown: + return absl::StrCat("Unknown: ", status.message()); + case absl::StatusCode::kInvalidArgument: + return absl::StrCat("Invalid argument: ", status.message()); + case absl::StatusCode::kDeadlineExceeded: + return absl::StrCat("Deadline exceeded: ", status.message()); + case absl::StatusCode::kNotFound: + return absl::StrCat("Not found: ", status.message()); + case absl::StatusCode::kAlreadyExists: + return absl::StrCat("Already exists: ", status.message()); + case absl::StatusCode::kPermissionDenied: + return absl::StrCat("Permission denied: ", status.message()); + case absl::StatusCode::kUnauthenticated: + return absl::StrCat("Unauthenticated: ", status.message()); + case absl::StatusCode::kResourceExhausted: + return absl::StrCat("Resource exhausted: ", status.message()); + case absl::StatusCode::kFailedPrecondition: + return absl::StrCat("Failed precondition: ", status.message()); + case absl::StatusCode::kAborted: + return absl::StrCat("Aborted: ", status.message()); + case absl::StatusCode::kOutOfRange: + return absl::StrCat("Out of range: ", status.message()); + case absl::StatusCode::kUnimplemented: + return absl::StrCat("Unimplemented: ", status.message()); + case absl::StatusCode::kInternal: + return absl::StrCat("Internal: ", status.message()); + case absl::StatusCode::kUnavailable: + return absl::StrCat("Unavailable: ", status.message()); + case absl::StatusCode::kDataLoss: + return absl::StrCat("Data loss: ", status.message()); + default: + return absl::StrCat("Unknown code: ", status.message()); + } +} + +inline llvm::Error MakeStatusError(absl::Status status) { + return MakeStringError(MakeStatusString(status)); +} + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_ERROR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/fallback_tensor.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/fallback_tensor.h new file mode 100644 index 00000000..c5b81f36 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/fallback_tensor.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_FALLBACK_TENSOR_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_FALLBACK_TENSOR_H_ + +#include + +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/tensor.h" +#include "tsl/profiler/lib/traceme.h" + +namespace tensorflow { +namespace tfrt_stub { + +// A special tensor wrapper for immutable tensors that live a long time and are +// reused across steps in a program, eg. weights. +class ImmutableTensor { + public: + ImmutableTensor() = default; + // Create an ImmutableTensor by copying the content in `tensor`. + static ImmutableTensor Create(tensorflow::Tensor tensor); + + // Accessors for this underlying tensor. Users must not modify its content. It + // is guaranteed that RefCountIsOne() always return false for the tensor. + tensorflow::Tensor& tensor() { return tensor_; } + const tensorflow::Tensor& tensor() const { return tensor_; } + + private: + explicit ImmutableTensor(tensorflow::Tensor tensor) + : tensor_(std::move(tensor)) { + DCHECK(!tensor_.RefCountIsOne()) + << "Immutable tensors' buffers cannot be forwarded."; + } + + tensorflow::Tensor tensor_; +}; + +// A wrapper class over normal tensors and immutable tensors. This class is used +// as the currency type in TFRT fallback execution. Note that this class does +// not own the underlying tensor if it is an immutable tensor. +class FallbackTensor { + public: + FallbackTensor() = default; + + explicit FallbackTensor(const tensorflow::Tensor& tensor) : tensor_(tensor) {} + explicit FallbackTensor(tensorflow::Tensor&& tensor) + : tensor_(std::move(tensor)) {} + + explicit FallbackTensor(ImmutableTensor* immutable_tensor) + : tensor_(immutable_tensor->tensor()), is_immutable_(true) {} + + FallbackTensor(const FallbackTensor& other) { *this = other; } + FallbackTensor& operator=(const FallbackTensor& other) { + tsl::profiler::TraceMe trace_me("FallbackTensor::Copy"); + if (!other.is_immutable() && other.buffer() != nullptr) { + // Create a new TensorBuffer which contains a new atomic counter for each + // result, to avoid downstream threads contending the original atomic + // counter. + tensor_ = std::move( + tensorflow::tfrt_stub::ImmutableTensor::Create(other.tensor()) + .tensor()); + } else { + // For immutable tensors or empty tensors, we just need to copy the + // pointer as they don't incur atomic operations when they are referenced. 
+ tensor_ = other.tensor(); + } + is_immutable_ = true; + return *this; + } + + FallbackTensor(FallbackTensor&&) noexcept = default; + FallbackTensor& operator=(FallbackTensor&&) noexcept = default; + + const TensorBuffer* buffer() const { + return tensorflow::DMAHelper::buffer(&tensor()); + } + TensorBuffer* buffer() { return tensorflow::DMAHelper::buffer(&tensor()); } + + bool is_immutable() const { return is_immutable_; } + + tensorflow::Tensor& tensor() { return tensor_; } + const tensorflow::Tensor& tensor() const { return tensor_; } + + private: + tensorflow::Tensor tensor_; + bool is_immutable_ = false; +}; + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_UTILS_FALLBACK_TENSOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/gpu_variables_table.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/gpu_variables_table.h new file mode 100644 index 00000000..7e413f56 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/gpu_variables_table.h @@ -0,0 +1,42 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_GPU_VARIABLES_TABLE_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_GPU_VARIABLES_TABLE_H_ + +#include "tensorflow/core/tfrt/utils/device_variables_table.h" +#include "tensorflow/core/tfrt/utils/fallback_tensor.h" + +namespace tfrt { +namespace gpu { + +// This is for creating/getting GpuVariablesTable object in the execution +// context at runtime. +constexpr char kGpuVariablesTableResourceName[] = "GpuVariablesTableResource"; + +// A variable table that keeps track of the device copies of GPU host tensors. +class GpuVariablesTable + : public DeviceVariablesTable { + private: + const void* GetHostTensorDataPtr( + const tensorflow::tfrt_stub::FallbackTensor& host_tensor) override { + return host_tensor.tensor().data(); + } +}; + +} // namespace gpu +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_GPU_VARIABLES_TABLE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/graph_partition.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/graph_partition.h new file mode 100644 index 00000000..4f5cedd2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/graph_partition.h @@ -0,0 +1,67 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_GRAPH_PARTITION_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_GRAPH_PARTITION_H_ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tfrt_stub { + +// Inserts send/recv ops to `graph` if nodes are assigned to multiple devices. +// Specifically, nodes on the same device will be wrapped in a function and +// invoked by a PartitionedCall op. All PartitionedCall ops are connected to a +// StatefulPartitionedCall op (which behaves as a 'stateful IdentityN') to +// protect them from being pruned in the subsequent MLIR lowering passes +// (b/232026253). +// +// The following shows a simple example of using this method. +// +// The original graph has four nodes that are placed on different devices. +// +// -----> op1(host) ------ +// / \ +// input(host) output(host) +// \ / +// -----> op2(device) ------ +// +// Calling this method will return the following graph, where `op1` is wrapped +// in the function invoked by `PartitionedCall_1`, and `op2` is wrapped in the +// function invoked by `PartitionedCall_2`. Both of them have a data dependency +// with the `StatefulPartitionedCall` op. +// +// input ---> PartitionedCall_1 ---- +// \ +// StatefulPartitionedCall ---> output +// / +// PartitionedCall_2 ---- +// +absl::StatusOr> InsertTransferOps( + const std::string& graph_func_name, const DeviceSet& device_set, + const Device* host_device, const std::vector& inputs, + const std::vector& outputs, + const std::vector& control_outputs, + std::unique_ptr graph); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_UTILS_GRAPH_PARTITION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tensor_util.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tensor_util.h new file mode 100644 index 00000000..358d7604 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tensor_util.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_TENSOR_UTIL_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_TENSOR_UTIL_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/statusor.h" +#include "tfrt/core_runtime/tensor_handle.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime +#include "tfrt/tensor/dense_host_tensor.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime + +namespace tfrt { + +// Converts a tfrt::Tensor to tensorflow::Tensor. +llvm::Expected TFRTTensorToTFTensor(const Tensor& tensor); + +// Converts a tensorflow::Tensor to tfrt::TensorHandle. 
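// A conversion sketch for the declaration below; `tf_tensor` and `host_ctx`
// are assumed to be a tensorflow::Tensor and a valid tfrt::HostContext*
// already in scope, and the element type of the returned AsyncValueRef is
// assumed to be tfrt::TensorHandle.
//
//   AsyncValueRef<TensorHandle> handle =
//       TFTensorToTFRTTensorHandle(tf_tensor, host_ctx);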
+AsyncValueRef TFTensorToTFRTTensorHandle( + const tensorflow::Tensor& tf_tensor, HostContext* host_ctx); + +// Creates a TFRT TensorHandle using the shape and data in a tensorflow tensor. +absl::StatusOr CreateTensorHandleFromTFTensor( + const tensorflow::Tensor& tensor, HostContext* host); + +// Creates a tensorflow tensor using the shape and data in a TFRT tensorhandle. +absl::StatusOr CreateTFTensorFromTensorHandle( + const TensorHandle& tensor_handle); + +// Converts a tensorflow::Tensor to tfrt::DenseHostTensor. +// TODO(tfrt-devs): consider generalize to TFTensorToTFRTTensor +Expected ConvertTfTensorToDHT(tensorflow::Tensor tf_tensor); + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_TENSOR_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tfrt_graph_execution_state.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tfrt_graph_execution_state.h new file mode 100644 index 00000000..2912c2ca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/tfrt_graph_execution_state.h @@ -0,0 +1,145 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_TFRT_GRAPH_EXECUTION_STATE_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_TFRT_GRAPH_EXECUTION_STATE_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v1/mlir_bridge_config_v1.pb.h" +#include "tensorflow/core/common_runtime/graph_execution_state.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/graph_executor/config.h" + +namespace tensorflow { +namespace tfrt_stub { + +// This is a TFRT variant of `tensorflow::GraphExecutionState`. It wraps +// `tensorflow::GraphExecutionState` and adds TFRT-specific adjustments. +// +// Responsible for generating an executable `Graph` from the original `GraphDef` +// that specifies the complete graph and from `GraphImportConfig` that specifies +// input/output nodes. +// +// Thread-safe. +class TfrtGraphExecutionState { + public: + struct OptimizationResult { + std::unique_ptr graph; + absl::Duration functionalization_duration; + absl::Duration grappler_duration; + }; + + struct Options { + bool run_placer_grappler_on_functions = false; + bool run_placer_on_graph = true; + }; + + // Creates a `GraphExecutionState` given `graph_def` and `fallback_state`. + static absl::StatusOr> Create( + const Options& options, tensorflow::GraphDef graph_def, + const FallbackState& fallback_state, + tensorflow::tfrt_stub::RuntimeConfig* runtime_config = nullptr); + + // Ctor. 
Do not use directly. Public only for `std::make_unique<>()`. + TfrtGraphExecutionState( + const Options& options, + std::unique_ptr graph_execution_state, + const FallbackState& fallback_state, + absl::flat_hash_set functions_to_optimize) + : options_(options), + graph_execution_state_(std::move(graph_execution_state)), + fallback_state_(fallback_state), + functions_to_optimize_(std::move(functions_to_optimize)) {} + + // Creates an optimized graph by pruning with `graph_import_config` and + // best-effort Grappler run. + absl::StatusOr CreateOptimizedGraph( + tensorflow::GraphImportConfig& graph_import_config); + + // Extends the current graph by `graph`. + absl::Status Extend(const GraphDef& graph); + + // Return the preprocessed full graph. Note that it does not contain the + // function library in the original graph. + const tensorflow::Graph& graph() const { + absl::MutexLock lock(&graph_execution_state_mu_); + DCHECK(graph_execution_state_->full_graph()); + return *graph_execution_state_->full_graph(); + } + + // The original graph. + const GraphDef* original_graph_def() const { + absl::MutexLock lock(&graph_execution_state_mu_); + return graph_execution_state_->original_graph_def(); + } + + // Return the function library in the original graph. + const FunctionLibraryDefinition& flib_def() const { + absl::MutexLock lock(&graph_execution_state_mu_); + return graph_execution_state_->flib_def(); + } + + private: + absl::StatusOr> OptimizeGraph( + const tensorflow::Graph& graph, + const tensorflow::BuildGraphOptions& build_graph_options); + + Options options_; + + std::unique_ptr graph_execution_state_ + ABSL_GUARDED_BY(graph_execution_state_mu_); + // We need this mutex even thought `GraphExecutionState` is thread-safe, + // because `swap()` is not thread-safe. + mutable absl::Mutex graph_execution_state_mu_; + + const FallbackState& fallback_state_; + // Only valid if `options_.run_placer_grappler_on_functions` is true. + absl::flat_hash_set functions_to_optimize_ + ABSL_GUARDED_BY(graph_execution_state_mu_); +}; + +// Prunes the `graph_def` using the feed/fetch nodes specified in +// `callable_options`. It is a TFRT-specific version that it performs more +// pruning (e.g., prunes the input edges to the feed nodes) than +// `ComputeTransitiveFanin()` so that the graph can be functionalized properly +// later. +absl::Status PruneGraphDef(GraphDef& graph_def, + const CallableOptions& callable_options); + +// Eliminates ref variables in V1 control flow, which is required for +// functionalization. Current strategy is to insert an identity node between +// each ref node and its ref input and in-place update the ref node to its +// non-ref counterpart. +absl::Status EliminateRefVariablesFromV1ControlFlow(GraphDef& graph_def); + +// Removes the "_input_shapes" attribute of functions in the graph. +void RemoveInputShapesInFunctions(tensorflow::GraphDef& graph_def); + +} // namespace tfrt_stub +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_UTILS_TFRT_GRAPH_EXECUTION_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/thread_pool.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/thread_pool.h new file mode 100644 index 00000000..0efe9133 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/thread_pool.h @@ -0,0 +1,61 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
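Stepping back to the TfrtGraphExecutionState API declared above: a minimal usage sketch, assuming Create() returns absl::StatusOr<std::unique_ptr<TfrtGraphExecutionState>> and that `graph_def` (a tensorflow::GraphDef) and `fallback_state` have been prepared by the caller; the enclosing function is assumed to return absl::Status. Names here are illustrative, not taken from the diff.

// Hedged sketch: create the execution state, then build an optimized graph.
TfrtGraphExecutionState::Options state_options;
state_options.run_placer_grappler_on_functions = false;

TF_ASSIGN_OR_RETURN(
    std::unique_ptr<TfrtGraphExecutionState> execution_state,
    TfrtGraphExecutionState::Create(state_options, std::move(graph_def),
                                    fallback_state));

tensorflow::GraphImportConfig import_config;
// Feeds/fetches would be filled in from the request before this call.
TF_ASSIGN_OR_RETURN(TfrtGraphExecutionState::OptimizationResult optimized,
                    execution_state->CreateOptimizedGraph(import_config));
// optimized.graph holds the pruned, Grappler-optimized graph;
// optimized.functionalization_duration / grappler_duration carry timings.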
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_TFRT_UTILS_THREAD_POOL_H_
+#define TENSORFLOW_CORE_TFRT_UTILS_THREAD_POOL_H_
+
+#include <functional>
+#include <string>
+#include <utility>
+
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/threadpool.h"
+#include "tensorflow/core/platform/threadpool_interface.h"
+
+namespace tensorflow {
+namespace tfrt_stub {
+
+class TfThreadPool : public thread::ThreadPoolInterface {
+ public:
+  explicit TfThreadPool(const std::string& name, int num_threads)
+      : underlying_threadpool_(tensorflow::Env::Default(), name, num_threads) {}
+
+  void Schedule(std::function<void()> fn) override {
+    underlying_threadpool_.Schedule(std::move(fn));
+  }
+
+  void ScheduleWithHint(std::function<void()> fn, int start, int end) override {
+    underlying_threadpool_.ScheduleWithHint(std::move(fn), start, end);
+  }
+
+  void Cancel() override {
+    underlying_threadpool_.AsEigenThreadPool()->Cancel();
+  }
+
+  int NumThreads() const override {
+    return underlying_threadpool_.NumThreads();
+  }
+
+  int CurrentThreadId() const override {
+    return underlying_threadpool_.CurrentThreadId();
+  }
+
+ private:
+  tensorflow::thread::ThreadPool underlying_threadpool_;
+};
+
+} // namespace tfrt_stub
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CORE_TFRT_UTILS_THREAD_POOL_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/utils.h b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/utils.h
new file mode 100644
index 00000000..970de920
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/tfrt/utils/utils.h
@@ -0,0 +1,136 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
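For the TfThreadPool wrapper above, a small hypothetical usage sketch, assuming Schedule() takes a std::function<void()> work item; the pool name and thread count are illustrative.

// Wrap a TF thread pool behind the Eigen-style ThreadPoolInterface so it can
// be handed to code that expects that interface.
tensorflow::tfrt_stub::TfThreadPool intra_op_pool(/*name=*/"tf_intra",
                                                  /*num_threads=*/4);
intra_op_pool.Schedule([] {
  // ... work item ...
});
// NumThreads() and CurrentThreadId() simply forward to the wrapped
// tensorflow::thread::ThreadPool.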
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TFRT_UTILS_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_UTILS_UTILS_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime +#include "tfrt/dtype/dtype.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime + +namespace tensorflow { +class Device; +} // namespace tensorflow + +namespace tfrt { + +class BEFFile; +class ExecutionContext; +class HostContext; + +typedef absl::InlinedVector TfrtDataTypeVector; +typedef absl::Span TfrtDataTypeSlice; + +DType ConvertTfDTypeToTfrtDType(tensorflow::DataType dtype); + +// Runs the runtime initialization function. A runtime initialization function +// is added by runtime/compiler workflow and is not present in the original +// savedmodel. +// +// TODO(b/178714905): We should avoid special handling on initialization by +// letting compiler to handle it. +absl::Status RunRuntimeInitializer(const tfrt::ExecutionContext& exec_ctx, + tfrt::BEFFile* bef_file, + absl::string_view fallback_init_func); + +// Creates dummy TF devices from the input device names. Currently this method +// is used to create the TPU_SYSTEM device for worker server. +void CreateDummyTfDevices( + const std::vector& device_names, + std::vector>* dummy_tf_devices); + +// Creates and add dummy TFRT devices from the input device names. Currently +// this method is used to create the TPU_SYSTEM device for worker server. +void AddDummyTfrtDevices(const std::vector& device_names, + tfrt::HostContext* host_ctx); + +// Creates a BEF file from a BEF buffer. `runtime` is used to provide host +// context for opening `bef`. +absl::StatusOr> CreateBefFileFromBefBuffer( + const tensorflow::tfrt_stub::Runtime& runtime, const tfrt::BefBuffer& bef); + +// Returns a unique integer within this process. +int64_t GetUniqueInt(); + +// Returns current CPU time. +uint64_t GetCpuClockCycle(); + +// A list of macros similar to `TF_RETURN_IF_ERROR`, with additional model +// loading stage info. +#define RETURN_IF_ERROR_IN_IMPORT(...) \ + RETURN_IF_ERROR_WITH_STAGE_INFO("GraphDef proto -> MLIR", __VA_ARGS__) + +#define RETURN_IF_ERROR_IN_COMPILE(...) \ + RETURN_IF_ERROR_WITH_STAGE_INFO( \ + "TF dialect -> TFRT dialect, compiler issue, please contact the TFRT " \ + "team", \ + __VA_ARGS__) + +#define RETURN_IF_ERROR_IN_INIT(...) \ + RETURN_IF_ERROR_WITH_STAGE_INFO("Initialize TFRT", __VA_ARGS__) + +#define RETURN_IF_ERROR_WITH_STAGE_INFO(stage, ...) \ + do { \ + ::tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) { \ + return ::tensorflow::errors::CreateWithUpdatedMessage( \ + _status, \ + ::tensorflow::strings::StrCat(stage, ": ", _status.message())); \ + } \ + } while (0) + +// A list of macros similar to `TF_ASSIGN_OR_RETURN`, with additional model +// loading stage info. 
+#define ASSIGN_OR_RETURN_IN_IMPORT(lhs, rexpr) \ + ASSIGN_OR_RETURN_WITH_STAGE_INFO("GraphDef proto -> MLIR", lhs, rexpr) + +#define ASSIGN_OR_RETURN_IN_COMPILE(lhs, rexpr) \ + ASSIGN_OR_RETURN_WITH_STAGE_INFO( \ + "TF dialect -> TFRT dialect, compiler issue, please contact the TFRT " \ + "team", \ + lhs, rexpr) + +#define ASSIGN_OR_RETURN_IN_INIT(lhs, rexpr) \ + ASSIGN_OR_RETURN_WITH_STAGE_INFO("Initialize TFRT", lhs, rexpr) + +#define ASSIGN_OR_RETURN_WITH_STAGE_INFO(stage, lhs, rexpr) \ + ASSIGN_OR_RETURN_WITH_STAGE_INFO_IMPL( \ + TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), stage, lhs, \ + rexpr) + +#define ASSIGN_OR_RETURN_WITH_STAGE_INFO_IMPL(statusor, stage, lhs, rexpr) \ + auto statusor = (rexpr); \ + if (TF_PREDICT_FALSE(!statusor.ok())) { \ + const auto& _status = statusor.status(); \ + return ::tensorflow::errors::CreateWithUpdatedMessage( \ + _status, \ + ::tensorflow::strings::StrCat(stage, ": ", _status.message())); \ + } \ + lhs = std::move(statusor.value()) + +} // namespace tfrt + +#endif // TENSORFLOW_CORE_TFRT_UTILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/combine_tpu_embedding_load_retrieve_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/combine_tpu_embedding_load_retrieve_pass.h new file mode 100644 index 00000000..746ec93d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/combine_tpu_embedding_load_retrieve_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COMBINE_TPU_EMBEDDING_LOAD_RETRIEVE_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COMBINE_TPU_EMBEDDING_LOAD_RETRIEVE_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Merges per-table TPUEmbedding load and retrieve operators into global +// operators. +class CombineTPUEmbeddingLoadRetrievePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COMBINE_TPU_EMBEDDING_LOAD_RETRIEVE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/cond_builder.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/cond_builder.h new file mode 100644 index 00000000..dd827a3b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/cond_builder.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
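For the stage-info macros defined in utils.h above, a hedged sketch of the intended usage. `ImportGraphDefToMlir`, `CompileMlirToBef`, and `RunInitializers` are hypothetical helpers standing in for the real import, compile, and initialization steps; only the wrapping pattern is meant to be taken from the header.

// On failure, the returned status message is prefixed with the loading stage,
// e.g. "GraphDef proto -> MLIR: <original error message>".
tensorflow::Status LoadModelSketch() {
  ASSIGN_OR_RETURN_IN_IMPORT(auto mlir_module, ImportGraphDefToMlir());
  ASSIGN_OR_RETURN_IN_COMPILE(auto bef_buffer, CompileMlirToBef(mlir_module));
  RETURN_IF_ERROR_IN_INIT(RunInitializers(bef_buffer));
  return absl::OkStatus();
}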
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COND_BUILDER_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COND_BUILDER_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Conditional builder. +// Convenience builder to make it easy to construct a conditional. E.g., +// Node* pred = ...; +// CondBuilder cb("cond", g); +// auto switch_var = cb.AddInput("var", DT_RESOURCE); +// g->AddEdge(pred, 0, cb.pred(), 0); +// Will create the nodes of a conditional that takes as input a resource +// variable ("var") as input and that switches on pred. +// +// This currently only handles the case needed by distributed_tpu_rewrite_pass +// and is not completely general. +class CondBuilder { + public: + enum Branch { kElseBranch = 0, kThenBranch = 1 }; + + CondBuilder(std::string name, std::string device, const NodeDebugInfo& debug, + Graph* graph); + + // Returns node corresponding to the predicate input. + Node* pred(); + + // Returns node corresponding to switch_f branch of predicate switch. + Node* switch_f(); + + // Returns node corresponding to switch_t branch of predicate switch. + Node* switch_t(); + + // Returns node corresponding to control successor. + Node* control_successor(); + + // Returns the Switch node to feed a value of the given type into the + // conditional. + absl::Status AddInput(const std::string& input_name, const DataType& type, + const std::string& device, const NodeDebugInfo& debug, + Node** input); + + private: + Node* control_successor_; + Node* switch_f_; + Node* switch_t_; + Node* pred_; + Graph* const graph_; + const std::string name_; + const std::string device_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_COND_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/configure_tpu_embedding_rewrite_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/configure_tpu_embedding_rewrite_pass.h new file mode 100644 index 00000000..977447f2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/configure_tpu_embedding_rewrite_pass.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Rewrites ConfigureTPUEmbedding Op into nodes which set up TPUEmbedding. 
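Supplementing the usage outline in the CondBuilder comment above, a hedged sketch that follows the declared constructor and AddInput signatures; `pred` and `graph` are assumed to exist already, and the device string is illustrative.

// Build a conditional that switches a resource-variable input on `pred`.
NodeDebugInfo debug(pred->def());
CondBuilder cb(/*name=*/"cond", /*device=*/"/job:worker/device:CPU:0", debug,
               graph);
Node* switch_var = nullptr;
TF_RETURN_IF_ERROR(cb.AddInput("var", DT_RESOURCE,
                               "/job:worker/device:CPU:0", debug,
                               &switch_var));
graph->AddEdge(pred, 0, cb.pred(), 0);
// cb.switch_t() / cb.switch_f() expose the then/else sides of the switch, and
// cb.control_successor() can be used to sequence downstream nodes.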
+ +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_CONFIGURE_TPU_EMBEDDING_REWRITE_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_CONFIGURE_TPU_EMBEDDING_REWRITE_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// TODO(shizhiw): Clean up embedding related code from +// distributed_tpu_configuration_rewrite_pass.cc. +// Replaces dummy ConfigureTPUEmbedding Ops assigned to TPU_SYSTEM +// devices with nodes which will set up TPU Embedding. +class ConfigureTPUEmbeddingRewritePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_CONFIGURE_TPU_EMBEDDING_REWRITE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_configuration_rewrite_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_configuration_rewrite_pass.h new file mode 100644 index 00000000..ecde017f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_configuration_rewrite_pass.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Rewrites ConfigureDistributedTPU Op into a graph that configures each host. +// +// See the comment at the top of +// third_party/tensorflow/core/ops/tpu_configuration_ops.cc to see the +// sequence of Ops used to configure a distributed TPU system. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_CONFIGURATION_REWRITE_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_CONFIGURATION_REWRITE_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Replaces dummy ConfigureDistributedTPU Ops assigned to TPU_SYSTEM +// devices with _ConfigureDistributedTPU and _WaitForDistributedTPU +// Ops on TPU_SYSTEM, and _InitializeHostForDistributedTPU on the CPU +// device of each host in the same job as the given TPU_SYSTEM device. +class DistributedTPUConfigurationRewritePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +// Replaces dummy ShutdownDistributedTPU Ops assigned to TPU_SYSTEM +// devices with _ShutdownDistributedTPU Ops on TPU_SYSTEM and +// _DisconnectHostFromDistributedTPUSystem on the CPU device of each +// host in the same job as the given TPU_SYSTEM device. 
+class DistributedTPUShutdownRewritePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_CONFIGURATION_REWRITE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_helpers.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_helpers.h new file mode 100644 index 00000000..ae4bfc8b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_helpers.h @@ -0,0 +1,106 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper functions for TPU rewrite passes. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_HELPERS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_HELPERS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "xla/status_macros.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +class DistributedTPURewriteHelpers { + public: + // Given a user-assigned device string, system_spec_string, parse it into + // system_spec. Verify that the device type is either TPU_SYSTEM or + // unassigned, and in the latter case set it to TPU_SYSTEM:0. Having set the + // type, verify that the spec matches a unique device in device_set, and + // return that device in system_device. The normal use case is for + // system_spec_string to identify the TPU_SYSTEM on replica 0, task 0 of the + // job that contains the TPU hardware. + // TODO(b/110910013): Possibly remove the tpu system device. + static absl::Status GetSystemDevice(const string& system_spec_string, + const DeviceSet& device_set, + DeviceNameUtils::ParsedName* system_spec, + Device** system_device); + + // Given a parsed system spec (e.g., the one returned above from + // GetSystemDeviceName), return in host_devices the TPU_SYSTEM:0 device on + // every host in the spec's job. If the spec does not include an explicit job, + // "localhost" is used. Returns an error if system_spec matches devices from + // a multiple jobs or replicas. + static absl::Status GetHostSystemDevices( + const DeviceNameUtils::ParsedName& system_spec, + const DeviceSet& device_set, std::vector* host_system_devices); + + // Given a parsed system spec (e.g., the one returned above from + // GetSystemDeviceName), sets `*tpu_devices` to a per-host vector of the TPU + // devices on every host in the spec's job. 
If the spec does not include an + // explicit job, "localhost" is used. Sets `*num_tpus_per_host` to the number + // of TPU devices in each host, and verifies that each host in the job has + // the same number of TPU devices. + // Returns an error if system_spec matches devices from a multiple jobs or + // replicas. + static absl::Status GetTPUDevices( + const DeviceNameUtils::ParsedName& system_spec, + const DeviceSet& device_set, int* num_tpus_per_host, + std::vector>* tpu_devices); + + // Perform 'action' on every node in 'graph' of type + // 'node_type'. This function is designed for use with configuration + // Ops that have no inputs or outputs. The arguments passed to 'action' are: + // 'configuration_node_name': the name of the node that matched + // 'configuration_device_name': the name of the device that the + // matching node is placed on + // 'host_devices': the set of TPU_SYSTEM devices on hosts with TPUs that are + // in the same system as the node that matched. + // 'input_dependencies': the set of nodes that have control edges to + // the matching node. + // 'output_dependencies': the set of output port, destination node, input port + // triples that have edges from the matching node. Input port is + // Graph::kControlSlot for a control edge. + // 'graph': the graph being mutated. + struct OutputDependency { + int src_output; + Node* dst; + int dst_input; + }; + static absl::Status ForConfigurationNodeMatchingType( + const string& node_type, Graph* graph, const DeviceSet& device_set, + const std::function< + absl::Status(const NodeDef& configuration_node_def, + const string& configuration_device_name, + const std::vector& host_devices, + const std::vector& input_dependencies, + const std::vector& output_dependencies, + Graph* graph)>& action); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_HELPERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.h new file mode 100644 index 00000000..2c31b6d8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.h @@ -0,0 +1,619 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Rewrites TPUReplicate nodes into replicated computations on TPU. +// +// To represent a distributed TPU computation, we use the +// TPUReplicate operator, that describes a subgraph (represented as a +// Tensorflow function) to replicate across a TPU pod. +// +// Model parallelism and data parallelism: +// --------------------------------------- +// We support two different kinds of parallelism on TPU: +// * data parallelism (replication), or parallelization across batches, and +// * model parallelism, or parallelization within a batch. 
+// +// The function passed to a TPUReplicate operator is replicated many +// times across a TPU pod (data parallelism). The `num_replicas` attribute +// controls how many replicas of the computation to create. Replicas are mostly +// independent; replicas can only communicate using the CrossReplicaSum +// operator, which is typically used to communicate gradients during training. +// +// Each replica may optionally use more than one TPU core (model +// parallelism). The `num_cores_per_replica` attribute controls how many cores +// there are per replica. For each core, there is a virtual TPU_REPLICATED_CORE +// device that is only valid within replicated TPU computations (e.g., +// TPU_REPLICATED_CORE:0, TPU_REPLICATED_CORE:1, etc.); each TPU_REPLICATED_CORE +// device corresponds to one TPU core in every replica. +// Each replica has runs its own copy of the computation assigned to each +// TPU_REPLICATED_CORE device. +// +// The Python code is responsible for providing a device_assignment that +// describes how the replicated logical cores map to physical cores on the TPU +// topology. +// +// Inputs to TPUReplicate: +// ------------------------------ +// The TPUReplicate operator takes three kinds of inputs, in the +// following order: +// * per-replica inputs. If there are three per-replica inputs (A, B, C) and two +// replicas, the first six arguments to TPUReplicate will be: +// A0 B0 C0 A1 B1 C1 +// where Ai is the A input to the i-th replica. +// * distributed inputs. These inputs follow the per-replica inputs. +// If there are two distributed inputs (E, F) and two replicas, the following +// arguments to TPUReplicate will be: E F. +// But there is local E and F on each replica. +// * broadcast inputs. These inputs follow the distributed inputs. All +// replicas receive a copy of each of these inputs. +// * variables. Resource variables accessed by the computation follow the +// broadcast inputs. +// +// For example, for a computation with two replicas, three per-replica inputs +// (A, B, C), two distributed inputs(E, F), two broadcast inputs (X, Y), and two +// variables (V, W), the arguments to TPUReplicate will be: +// A0 B0 C0 A1 B1 C1 E F X Y V W +// and each replica will receive the following arguments: +// A B C E F X Y V W +// +// Distributed TPU compilation requires that the shapes of all operators +// be known statically at compilation time, before any nodes have executed. +// Shapes are determined using shape information emitted by InferShapes. It +// is not possible to replicate Tensorflow operators with unknown or dynamic +// shapes for TPU at present. +// +// Graph rewrite: +// -------------- +// Compilation replaces TPUReplicate operators with: +// * a single TPUCompile node that compiles the computations, +// * one TPUExecute node for each TPU device in the system that +// executes the relevant computation, +// * one ReadVariableOp for each variable accessed by the replicated +// computation, +// * one AssignVariableOp for each variable accessed by the replicated +// computation. An assignment is built even if a variable is only read by the +// computation. We do not know which variables are written until we apply the +// XlaCompiler to the computation, but that does not happen until after the +// rewrite. Conservatively, we write back the values of all variables after +// the computation completes. +// TODO(phawkins): only write back variables that the computation may write. 
+// * one Shape node for each Tensor or Variable input to the computation whose +// shape is not statically known at rewrite time. The input shapes are fed +// to the TPUCompile node. +// +// To ensure that the reads and writes seem to happen at the right time in the +// graph execution, we add control edges from all predecessors of the original +// TPUReplicate operator to each of the ReadVariableOp operators. +// Similarly, we add control edges from all of the AssignVariableOp operators to +// all of the successors of the TPUReplicate operator. +// +// The TPUReplicate rewrite must run before placement, since resource +// variable inputs will have DT_RESOURCE, which cannot be sent across devices, +// leading to objections from the placer. The rewrite rewrites the resource +// accesses into explicit ReadVariableOp and AssignVariableOp operators that the +// placer is free to colocate with the variables. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "xla/service/computation_placer.h" +#include "xla/stream_executor/tpu/tpu_topology.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +// Replaces clusters assigned to TPU_SYSTEM devices with +// TPUCompile and TPUExecute nodes assigned to the corresponding +// TPU devices. +class DistributedTPURewritePass : public GraphOptimizationPass { + public: + static void SetDistributedTpuRewritePassOptions( + bool distribute_vars, bool allow_xla_spmd_partition, + bool replicate_inputs_outputs_by_default_for_xla_spmd, + bool enable_cross_replica_sharding_mirrored_variables, + bool enable_automatic_model_parallelism, bool enable_xla_param_broadcast, + bool enable_multicore_locking, bool use_nd_sharding_ops); + + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + // The following methods are public only for the use of unit tests. + + // See comment at the top of the file for how the inputs are ordered. + // Encapsulates the different TPU replicated node input and output + // information, and provide common APIs over them. 
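Before the ParameterInfo class itself (defined just below), a hedged worked example of the bookkeeping it provides, using the two-replica example from the file comment above: per-replica inputs (A, B, C), distributed inputs (E, F), broadcast inputs (X, Y), variables (V, W), no guaranteed constants, and an assumed single return value per replica.

// The host-side argument list is A0 B0 C0 A1 B1 C1 E F X Y V W.
DistributedTPURewritePass::ParameterInfo params(
    /*num_replicas=*/2, /*num_per_replica_args=*/3,
    /*num_distributed_args=*/2, /*num_broadcast_args=*/2,
    /*num_variables=*/2, /*num_guaranteed_constants=*/0,
    /*num_retvals_per_replica=*/1);
CHECK_EQ(params.NumInputsFromHost(), 12);         // A0..C1, E, F, X, Y, V, W
CHECK_EQ(params.NumInputsToEachReplica(), 9);     // A B C E F X Y V W
CHECK_EQ(params.FirstBroadcastArgFromHost(), 8);  // index of X in the flat list
// In the per-replica argument view: indices 0-2 are per-replica args,
// 3-4 distributed, 5-6 broadcast, 7-8 variables.
CHECK(params.IsPerReplicaArg(0));
CHECK(params.IsDistributedArg(3));
CHECK(params.IsBroadcastArg(5));
CHECK(params.IsVariableArg(7));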
+ class ParameterInfo { + public: + ParameterInfo() = default; + ParameterInfo(int64_t num_replicas, int64_t num_per_replica_args, + int64_t num_distributed_args, int64_t num_broadcast_args, + int64_t num_variables, int64_t num_guaranteed_constants, + int64_t num_retvals_per_replica) + : num_replicas_(num_replicas), + num_per_replica_args_(num_per_replica_args), + num_distributed_args_(num_distributed_args), + num_broadcast_args_(num_broadcast_args), + num_variables_(num_variables), + num_guaranteed_constants_(num_guaranteed_constants), + num_retvals_per_replica_(num_retvals_per_replica) {} + + int64_t NumReplicas() const { return num_replicas_; } + + int64_t NumPerReplicaArgs() const { return num_per_replica_args_; } + + int64_t NumDistributedArgs() const { return num_distributed_args_; } + + int64_t NumBroadcastArgs() const { return num_broadcast_args_; } + + int64_t NumVariables() const { return num_variables_; } + + int64_t NumGuaranteedConstants() const { return num_guaranteed_constants_; } + + int64_t NumRetvalsPerReplica() const { return num_retvals_per_replica_; } + + bool IsPerReplicaArg(int64_t index) const { + return index < num_per_replica_args_; + } + + bool IsDistributedArg(int64_t index) const { + return index >= num_per_replica_args_ && + index < (num_per_replica_args_ + num_distributed_args_); + } + + bool IsBroadcastArg(int64_t index) const { + return (index >= num_per_replica_args_ + num_distributed_args_) && + index < (num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_); + } + + bool IsVariableArg(int64_t index) const { + return index >= (num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_) && + index < (num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_ + num_variables_); + } + + bool IsConstantArg(int64_t index) const { + return index >= (num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_ + num_variables_) && + index < (num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_ + num_variables_ + + num_guaranteed_constants_); + } + + // Returns the number of inputs which has been received by the host. + int64_t NumInputsFromHost() const { + return num_replicas_ * num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_ + num_variables_ + num_guaranteed_constants_; + } + + // Returns the number of inputs which will be sent to each replica. + int64_t NumInputsToEachReplica() const { + return num_per_replica_args_ + num_distributed_args_ + + num_broadcast_args_ + num_variables_ + num_guaranteed_constants_; + } + + // Returns the total number of output values returned to the host (for all + // replicas). + int64_t NumOutputsToHost() const { + return num_replicas_ * num_retvals_per_replica_; + } + + // Returns the position of the first per-replica argument, within the set + // of all hosts arguments. + // Broadcast arguments follow the distributed arguments. + int64_t FirstBroadcastArgFromHost() const { + return num_replicas_ * num_per_replica_args_ + num_distributed_args_; + } + + // Indices of mirrored variables across replicas, which should be + // categorized as per_replica_args. 
+ const std::set& mirrored_variable_indices() const { + return mirrored_variable_indices_; + } + std::set* mutable_mirrored_variable_indices() { + return &mirrored_variable_indices_; + } + + private: + int64_t num_replicas_ = 1; + int64_t num_per_replica_args_ = 0; + int64_t num_distributed_args_ = 0; + int64_t num_broadcast_args_ = 0; + int64_t num_variables_ = 0; + int64_t num_guaranteed_constants_ = 0; + int64_t num_retvals_per_replica_ = 0; + std::set mirrored_variable_indices_; + }; + + // Mapping from TPUReplicate cluster name to tpu device names. Value is a + // mapping from [replica][core] to a TF device name. + typedef absl::flat_hash_map>> + TPUReplicateDeviceNamesMapping; + + // Determines which devices to use to run the computation. + // Inputs: + // * num_tpus_per_task: the number of TPU devices attached to each task + // * tpu_devices: a [task][device] collection of TPU devices + // * num_replicas: the number of replicas requested + // * num_cores_per_replica: the number of cores in each computation instance + // * topology_attr: the topology TPUReplicate attribute + // * device_assignment_attr: the device_assignment TPUReplicate attribute + // Outputs: + // * tf_device_assignment: a mapping from [replica][core] to a TF device name + // * devices_to_lock: a flat array of integer indices corresponding to devices + // that are used in this computation. They will be locked before the + // TPUExecute kernels are run, to ensure that the kernels from concurrent + // multi-core executions are enqueued consistently, i.e., all kernels from + // computation A before any kernel from computation B, thus preventing + // deadlock. + // * xla_device_assignment: a mapping from [replica][core] to a linearized TPU + // coordinate. + // TODO(phawkins): change tf_device_assignment to an xla::Array2D. + static absl::Status BuildDeviceAssignment( + const tpu::TpuTopologyExternal& topology, int num_tpus_per_task, + const std::vector>& tpu_devices, int num_replicas, + int num_cores_per_replica, const std::string& topology_attr, + absl::Span device_assignment_attr, + std::vector>* tf_device_assignment, + std::vector* devices_to_lock, + std::unique_ptr* xla_device_assignment); + + // Returns the `computation` graph attached to TPUReplicate operator + // `node`. `flr` is a FunctionLibraryRuntime to use when + // instantiating the function body. Sets `*arg_types` and + // `*retval_types` to the argument/return types of the function. + static absl::Status GetComputationForTPUReplicateOp( + const NameAttrList& function, FunctionLibraryRuntime* flr, + Graph* computation, DataTypeVector* arg_types, + DataTypeVector* retval_types); + + // Returns the shapes of the argument tensors and return values of the + // TPUReplicate operator `node` using the _output_shapes, + // _output_handle_shapes, and _output_handle_types annotations on the input + // nodes. Expects inputs in the following order (see comment at top of file): + // * num_replicas * num_per_replica_args per-replica inputs, + // * num_broadcast_args broadcast inputs, + // * num_variables variable inputs. + // Returns an error if the input shapes to `node` are not statically known. + // Also verifies that all replicas have identical input shapes for their + // per-replica inputs. + static absl::Status GetArgAndRetvalShapes( + const GraphShapeInfo& shape_info, const Node& node, + const ParameterInfo& params_info, std::vector* arg_shapes, + std::vector* retval_shapes); + + // Assigns arguments and return values to cores. 
The assignment is represented + // as an XLA op sharding, so that an argument can be replicated across cores. + // `arg_sharding` and `retval_sharding` are vectors of shardings indexed by + // argument/retval number. + // `arg_fast_mem` is vector of fast_mem indication which is indexed by + // argument number. + static absl::Status AssignArgsAndRetvalsToCores( + int num_cores_per_replica, const ParameterInfo& params_info, + const DataTypeVector& arg_types, + const std::vector& arg_shapes, + const DataTypeVector& retval_types, + const std::vector& retval_shapes, const Graph& graph, + const Node* replicate_node, FunctionLibraryRuntime* flr, + bool allow_parameter_replication_for_spmd, + std::vector<::xla::OpSharding>* arg_sharding, + std::vector* arg_fast_mem, + std::vector<::xla::OpSharding>* retval_sharding, + std::vector* arg_names); + + // Populates `*variables` with the "variables" inputs to `index`-th output of + // `node`. + struct VariableInput { + Node* node; + int index; + + // Type of the variable's value. Note that this is different to the type of + // the output of 'variable', which is always DT_RESOURCE. + DataType dtype; + }; + static absl::Status FindVariableInputs(const Node& node, + const NameRangeMap& input_range_map, + std::vector* variables); + + // Populates '*guaranteed_constants' with the "guaranteed_constants" inputs + // to 'node'. + static absl::Status FindGuaranteedConstantInputs( + const Node& node, const NameRangeMap& input_range_map, + std::vector* guaranteed_constants); + + // Builds Shape nodes that compute the shapes of arguments whose shapes are + // not statically known. + static absl::Status BuildDynamicShapeNodes( + const Node& replicate_node, const std::vector& arg_shapes, + const ParameterInfo& params_info, + const std::vector& variable_reads, Graph* graph, + std::vector* dynamic_shape_nodes); + + // Builds a TPUCompile node that compiles the computation in + // `function_names`. calls `nodes`. + // TODO(b/33943292): at present, for model parallelism with Send/Recv to work + // the `nodes` must correspond to the computations assigned to TPU:0, + // TPU:1, ... in order since XLA hard-codes the chip IDs in the generated + // executables. + static absl::Status BuildCompileNode( + const Node* replicate_node, const NameAttrList& function, + uint64_t library_fingerprint, const ParameterInfo& params_info, + const std::vector& arg_shapes, + const DataTypeVector& arg_types, + const std::vector& guaranteed_constant_nodes, + const std::string& session_handle, + const std::vector<::xla::OpSharding>& arg_sharding, + const std::vector& arg_fast_mem, + const std::vector& arg_names, + const std::vector<::xla::OpSharding>& retval_sharding, + int num_cores_per_replica, const std::string& compile_device, + const xla::DeviceAssignment* xla_device_assignment, + const std::vector& dynamic_shape_nodes, Graph* graph, + Node** compile_node, int64_t autotuner_thresh); + + // Builds a TPUCompileSucceededAssert node that verifies that compilation + // succeeded and replaces the TPUCompilationStatus node in the graph. + static absl::Status BuildCompilationStatusReturnNodes( + Node* replicate_node, Node* compile_node, + absl::Span devices_to_lock, Node** control_after_compilation, + Node** multilock_acquire, Graph* graph); + + // Builds ReadVariableOp nodes that read `variables`, with a control + // edges that ensure they happen after `control_predecessor`. 
+ static absl::Status BuildVariableReads( + absl::Span variables, Node* control_predecessor, + Graph* graph, std::vector* variable_reads); + + // Returns true if graph or functions contain resource write op, otherwise + // return false. + // TODO(b/137048563): Recognize unused resource rewrite op. + static bool ContainsResourceWriteOp(const Graph& graph, + const FunctionLibraryDefinition& fld); + // Struct that describes a variable value to be written back from TPUExecute. + struct VariableWrite { + // A node:output pair containing a boolean tensor that determines whether + // the value should be written back. + Node* predicate; + int predicate_output; + + // A node:output pair containing the value to be written back. + Node* value; + int value_output; + }; + + // Builds AssignVariableOp nodes that write `variables` with the values from + // `variable_writes`, with control edges that ensure the writes happen before + // `control_successor`. + static absl::Status BuildVariableWrites( + absl::Span variables, Node* control_successor, + absl::Span variable_writes, Graph* graph); + + // Builds TPUExecute operators assigned to each TPU device + // involved in the computation. + // Arguments: + // * `params_info` is the structure containing the information about the + // TPUReplicate node inputs and outputs. + // * `num_tasks` is the number of TensorFlow tasks in the slice. + // * `num_cores_per_replica` is the number of cores which are dedicated to + // each replica. + // * `replicate_node` is the original TPUReplicate node. + // * `arg_names` are the names of the arguments to the computation function + // passed as argument to TPUReplicate, including per-replica, + // broadcast, and variable arguments. + // * `arg_types` are the corresponding types of the arguments. + // * `arg_shapes` are the corresponding shapes (and handle types/shapes, if + // applicable). + // * `arg_shardings` and `retval_shardings` are mappings from + // arguments/return indices to shardings, as returned by + // `AssignArgsAndRetvalsToCores`. + // * `pod_devices` lists the devices to assign to each core of each replica. + // * `variable_reads` is a vectors of ReadVariableOp operators, one for each + // variable argument to the computation. + // * The execute operators will have a control edge from + // `control_predecessor` and another control edge to `control_successor`. + // Populates '*variable_writes' with information about variable values to + // write back. + static absl::Status BuildExecuteNodes( + const ParameterInfo& params_info, int num_tasks, + int num_cores_per_replica, const Node& replicate_node, + const std::vector& arg_names, + const DataTypeVector& arg_types, + const std::vector& arg_shapes, + const DataTypeVector& retval_types, + const std::vector<::xla::OpSharding>& arg_shardings, + const std::vector<::xla::OpSharding>& retval_shardings, + const std::vector>& tpu_device_names, + Node* compile_node, const std::vector& variable_reads, + Node* control_predecessor, Node* control_successor, + Node* multilock_acquire, std::vector* variable_writes, + Graph* graph); + + // Connects the compile node to all the host transfer nodes, and removes the + // key placeholder node that was previously standing in for it. + // Arguments: + // * `compile_node` is the TPUCompile node that has been added to the graph. + // * `key_placeholder_node` is the placeholder node to send the key to all the + // host + // * transfer nodes in the original graph. + // * `graph` is the graph being rewritten. 
+ static absl::Status ConnectHostComputeNodes(Node* compile_node, + Node* key_placeholder_node, + Graph* graph); + + // Map from a Node in an outside_compilation cluster in the original graph to + // the list of Nodes, one for each replica, that it is expanded into during + // replication. + typedef absl::node_hash_map> NodeToNodeReplicasMap; + + // Map from the name of an outside_compilation cluster to the model-parallel + // core index that the HostCompute Op should be placed on in that cluster. + typedef std::map HostComputeCoreMap; + + // Map from the name of an outside_compilation cluster to the list of Nodes + // that should run on the host for that cluster. + typedef std::map> OutsideCompilationNodeMap; + + // Copies the outside_compilation nodes in a cluster to create replica + // replica_index. + static absl::Status CopyOutsideCompilationNodes( + int replica_index, const std::vector& outside_compilation_nodes, + const DeviceNameUtils::ParsedName& tpu_device, + const DeviceNameUtils::ParsedName& partial_device, + NodeToNodeReplicasMap* node_images, Graph* graph); + + // Replicates all the nodes in outside_compilation clusters in a compiled + // computation. + static absl::Status ReplicateOutsideCompilationNodes( + const std::vector>& tf_device_assignment, + const HostComputeCoreMap& host_compute_core, + const OutsideCompilationNodeMap& outside_compilation_nodes, + NodeToNodeReplicasMap* node_images, Graph* graph); + + // Lifts the edges between original outside_compilation nodes in a cluster + // onto their replicas. + static absl::Status CopyOutsideCompilationEdges( + const std::vector& outside_compilation_nodes, + const NodeToNodeReplicasMap& node_images, + std::unordered_map outside_compilation_inputs, + Graph* graph); + + // Lifts all the edges in outside_compilation clusters in a compiled + // computation to their replicas. + static absl::Status ReplicateOutsideCompilationEdges( + const OutsideCompilationNodeMap& outside_compilation_nodes, + const NodeToNodeReplicasMap& node_images, + std::unordered_map outside_compilation_inputs, + Graph* graph); + + // Removes all the original outside_compilation nodes from the graph, + // following replication. + static absl::Status RemoveOutsideCompilationNodes( + const NodeToNodeReplicasMap& node_images, Graph* graph); + + // Lowers outside compilation functional nodes (If/While/function call). + // Otherwise, when we have multiple workers, device placer will not be able to + // place nodes if outside compilation has DT_RESOURCE inputs (e.g. a + // DT_RESOURCE input fed into multiple While nodes on different devices). + static absl::Status LowerOutsideCompilationFunctionalNodes( + Graph* g, FunctionLibraryDefinition& flib_def, + const TPUReplicateDeviceNamesMapping& tpu_replicate_device_names_mapping); + + // Parses the 'host_compute_core' attribute on replicate_node to get the + // replicated core id of each outside_compilation cluster. + static absl::Status ParseHostComputeCores( + const Node& replicate_node, + const OutsideCompilationNodeMap& outside_compilation_nodes, + HostComputeCoreMap* host_compute_core); + + // Gets the physical topology information about the TPU system. 
+ static absl::Status GetDeviceTopology( + const DeviceSet& device_set, const Node& replicate_node, + int* num_replicas, int* num_cores_per_replica, int* num_tasks, + std::vector>* tf_device_assignment, + std::vector* devices_to_lock, + std::unique_ptr* xla_device_assignment, + std::string* tpu_compilation_device); + + // Gets the types of args, retvals, and parameters. + static absl::Status GetIOTypes( + int num_replicas, const Node& replicate_node, FunctionLibraryRuntime* flr, + Graph* graph, NameRangeMap* input_name_map, const NameAttrList** function, + std::unique_ptr* computation, DataTypeVector* arg_types, + DataTypeVector* retval_types, ParameterInfo* params_info); + + // Find known constants and deals with variable reads. + static absl::Status DealWithConstantsAndVariables( + const Node& replicate_node, const NameRangeMap& input_name_map, + Graph* graph, Node* host_transfer_sequencer, Node* control_before, + Node* control_after, absl::Span variable_nodes, + std::vector* guaranteed_constant_nodes, + std::vector* variable_reads); + + // Adds NoOp nodes for sequencing computation and variable reads/writes. + static absl::Status BuildSequencingNodes( + const std::string& tpu_compilation_device, const Node& replicate_node, + Graph* graph, Node** host_transfer_sequencer, Node** control_before, + Node** control_after); + + // Performs the pass's rewrite on a TPUReplicate node `node`. + static absl::Status RewriteTPUReplicateNode( + const std::string& session_handle, const DeviceSet& device_set, + Node* replicate_node, FunctionLibraryDefinition* flib_def, + FunctionLibraryRuntime* flr, Node* host_compute_key_placeholder_node, + const OutsideCompilationNodeMap& outside_compilation_nodes, + const std::vector& head_tail_outside_compilation_nodes, + NodeToNodeReplicasMap* outside_compilation_node_images, Graph* graph, + const GraphShapeInfo& shape_info, + TPUReplicateDeviceNamesMapping* tpu_replicate_device_names_mapping, + int64_t autotuner_thresh); + + // Performs host training loop optimization. For example, when TPUExecute + // node is inside a while loop, then model weight variables can be sharded + // in XLA preferred layout and then unsharded only at the very last iteration + // to reduce the number of all_gather. + static absl::Status PerformHostTrainingLoopOptimization( + Graph* graph, FunctionLibraryDefinition* flib_def, + FunctionLibraryRuntime* flr); + + // Heuristically place some nodes with unassigned devices on TPUs for + // performance reasons. + static absl::Status PlaceUnassignedDeviceNodesOnTPUIfPossible(Graph* graph); + + // Updates the head and tail outside compiled nodes so that nodes have the + // correct device and removes the replication and outside compilation + // attributes so that these nodes do not trigger further graph optimization + // passes. 
+ static absl::Status UpdateHeadTailOutsideCompilation( + const std::vector>& tf_device_assignment, + const std::vector& head_tail_outside_compilation_nodes); + + private: + static bool distribute_vars_; + static bool allow_xla_spmd_partition_; + static bool replicate_inputs_outputs_by_default_for_xla_spmd_; + static bool enable_cross_replica_sharding_mirrored_variables_; + static bool enable_automatic_model_parallelism_; + static bool enable_xla_param_broadcast_; + static bool enable_multicore_locking_; + static bool use_nd_sharding_ops_; + absl::Status InternalRun(const GraphOptimizationPassOptions& options); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass_internal.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass_internal.h new file mode 100644 index 00000000..ad4d74c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass_internal.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_INTERNAL_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_INTERNAL_H_ + +#include + +namespace tensorflow { + +// Implementation details of distributed_tpu_rewrite_pass.cc, please DO NOT +// depend on these. +namespace internal { + +// When set to a value >= 0, overrides the node_id. Used for getting +// deterministic node_ids during testing. +void OverrideNodeIdForTesting(int64_t node_id); + +// Retrieves the node id, used to make some node names unique in the rewrite +// pass. +uint64_t GetNodeId(); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_DISTRIBUTED_TPU_REWRITE_PASS_INTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.h new file mode 100644 index 00000000..37ba029c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.h @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Rewrites computations generated by the tpu.replicate() Python code into +// TPUReplicate operators. +// +// The tpu.replicate() does two main things: +// a) marks operators that make up a TPU computation with the attribute +// _tpu_replicate=XYZ, where XYZ is a unique key. +// b) adds TPUReplicatedInput and TPUReplicatedOutput nodes to represent +// replicated inputs. These nodes are not marked with the _tpu_replicate +// attribute. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_ENCAPSULATE_TPU_COMPUTATIONS_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_ENCAPSULATE_TPU_COMPUTATIONS_PASS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/jit/encapsulate_util.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Encapsulates nodes marked with the _tpu_replicate attribute into +// TPUReplicate operators. +class EncapsulateTPUComputationsPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + // The following methods are public only for unit tests. + + // This pass has two stages: + // a) first, we call the EncapsulateSubgraphsPass to encapsulate all nodes + // marked with the same _tpu_replicate attribute into functions. These + // functions contain the computations to be passed to TPUReplicate. During + // encapsulation, we sort the arguments into the order expected by + // TPUReplicate. + static absl::Status Encapsulate(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def); + + // b) we rewrite the function calls generated in phase (a) into TPUReplicate + // operators. We also flatten the TPUReplicatedInput and + // TPUReplicatedOutput replicated input and output nodes of the function + // call into the replicated input and outputs of the TPUReplicate operator. + static absl::Status BuildTPUReplicateOps(Graph* graph); +}; + +// Graph optimization pass that calls `ExtractOutsideCompilation` for all XLA +// computation nodes. +class ExtractOutsideCompilationPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + static absl::Status ProcessHeadTailOutsideCompilation( + const std::string& outside_compilation_attr_name, int* lifted_arg_count, + std::unordered_map* clusters, Graph* g, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_ENCAPSULATE_TPU_COMPUTATIONS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/host_training_loop_optimization_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/host_training_loop_optimization_util.h new file mode 100644 index 00000000..a3d2e01f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/host_training_loop_optimization_util.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_HOST_TRAINING_LOOP_OPTIMIZATION_UTIL_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_HOST_TRAINING_LOOP_OPTIMIZATION_UTIL_H_ + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tpu { + +struct LoopArgInfo { + std::string enter_node_name; + // Exit nodes are optional for loop invariant while loop args. + std::optional exit_node_name; +}; + +struct HostTrainingLoopInfo { + // Name and attribute information about the function in which + // host training loop is included. If host training loop is not + // inside a function call, then `function_name` and `function_attrs` + // are nullopt. + std::optional encapsulating_function_name; + std::optional encapsulating_function_attrs; + + // TPU Compile node as within a host training loop. + std::string compile_node_name; + + // Name of the while loop in which TPU compile op is located. + std::string while_loop_name; + + // Name of the node that represents loop condition. + std::string loop_cond_node_name; + + // Exit and Enter node names for each loop arguments. + std::vector loop_arguments; + + std::unordered_set loop_nodes; // NOLINT +}; + +// Walks through the `graph`, recursively if functional nodes exist, and +// identifies all host training loops. Host training loops are the inner +// most while loops that encapsulates TPUCompileOp node. This would be +// later used/analyzed to introduce host loop specific optimizations such +// as adding sharded weight update. +absl::Status DetectHostTrainingLoop( + const std::string* current_function_name, + const AttrValueMap* current_function_attr, + const FunctionLibraryDefinition* library, Graph* graph, + FunctionLibraryRuntime* flr, + std::vector* host_training_loops_info); + +// Injects VariableReshardOps to before and after TPUExecute op inside +// host training loop body. This effectively applies sharded weight update +// on model weight variables. +absl::Status AddReshardOp(Graph* graph, + const HostTrainingLoopInfo& host_loop_info); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_HOST_TRAINING_LOOP_OPTIMIZATION_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/incomplete_nodedef_builder.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/incomplete_nodedef_builder.h new file mode 100644 index 00000000..27304087 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/incomplete_nodedef_builder.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_INCOMPLETE_NODEDEF_BUILDER_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_INCOMPLETE_NODEDEF_BUILDER_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Convenience builder to build NodeDefs without specifying the inputs. This is +// similar to NodeDefBuilder except inputs are not specified. +// TODO(jpienaar): Clean up NodeDefBuilder and remove this class. +class IncompleteNodeDefBuilder { + public: + IncompleteNodeDefBuilder(const string& name, const string& op, + const NodeDebugInfo& debug); + + IncompleteNodeDefBuilder& AddAttr(const string& attr, const DataType& type); + IncompleteNodeDefBuilder& AddAttr(const string& attr, int val); + + IncompleteNodeDefBuilder& Device(const string& device); + + absl::Status Build(Graph* graph, Node** n); + + static IncompleteNodeDefBuilder Identity(const string& name, + const DataType& type, + const NodeDebugInfo& debug); + static IncompleteNodeDefBuilder Merge(const string& name, + const DataType& type, + const NodeDebugInfo& debug, int n); + static IncompleteNodeDefBuilder Switch(const string& name, + const DataType& type, + const NodeDebugInfo& debug); + + private: + NodeDef nodedef_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_INCOMPLETE_NODEDEF_BUILDER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_rewrite_pass_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_rewrite_pass_utils.h new file mode 100644 index 00000000..3589dd83 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_rewrite_pass_utils.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_REWRITE_PASS_UTILS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_REWRITE_PASS_UTILS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Adds a new TensorFlow graph node, with the output convention matching most TF +// code rather than the order used by Graph::AddNode(). +absl::Status AddNode(const NodeDef& n_def, Node** n, Graph* graph); + +// Replaces one TensorFlow graph node with another (specified by a NodeDef), +// moving all the edges. +absl::Status ReplaceNode(const NodeDef& to_def, Node* from, Node** to, + Graph* graph); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_REWRITE_PASS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_software_deduplication_rewrite_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_software_deduplication_rewrite_pass.h new file mode 100644 index 00000000..7be458b0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/tpu_embedding_software_deduplication_rewrite_pass.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_SOFTWARE_DEDUPLICATION_REWRITE_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_SOFTWARE_DEDUPLICATION_REWRITE_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Rewrites the graph and function defs in the specified +// GraphOptimizationPassOptions object for software deduplication. +// +// For the graph, groups the RecvTPUEmbeddingActivations and +// SendTPUEmbeddingGradients nodes by their _tpu_replicate attribute. For each +// such group: +// 1. Inserts a XlaRecvTPUEmbeddingDeduplicationData node into the graph. +// 2. Replaces the public RecvTPUEmbeddingActivations node (if present) with the +// internal XlaRecvTPUEmbeddingActivations node. +// 3. Replaces the public SendTPUEmbeddingGradients node (if present) with the +// internal XlaSendTPUEmbeddingGradients node. +// 4. Connects the outputs of the XlaRecvTPUEmbeddingDeduplicationData node with +// the inputs of the XlaRecvTPUEmbeddingActivations and +// XlaSendTPUEmbeddingGradients nodes. +// +// Iterates through the list of functions in the specified +// GraphOptimizationPassOptions object. Performs the same steps 1-4 specified +// above for each function. 
+// +// If multiple RecvTPUEmbeddingActivations nodes or SendTPUEmbeddingGradients +// nodes are present in the same function or in the same _tpu_replicate group, +// an InvalidArgument error is returned to the caller. +class TPUEmbeddingSoftwareDeduplicationRewritePass : + public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_TPU_EMBEDDING_SOFTWARE_DEDUPLICATION_REWRITE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/update_tpu_embedding_ops_passes.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/update_tpu_embedding_ops_passes.h new file mode 100644 index 00000000..1c1dd7bb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/update_tpu_embedding_ops_passes.h @@ -0,0 +1,55 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Rewrites ConfigureTPUEmbedding Op into nodes which set up TPUEmbedding. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_UPDATE_TPU_EMBEDDING_OPS_PASSES_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_UPDATE_TPU_EMBEDDING_OPS_PASSES_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class UpdateTPUEmbeddingEnqueueOrdinalPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +class UpdateTPUEmbeddingModePass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; + + static absl::Status GetEnqueueOpsFromGraph( + Graph* graph, absl::flat_hash_map* enqueue); + static absl::Status UpdateGraphEnqueueOp(bool training, Graph* graph, + Node* enqueue); + static absl::Status GetEnqueueOpsFromFunctionDef( + FunctionDef* function, std::map* enqueue); + static absl::Status UpdateFunctionDefEnqueueOp(int enqueue, bool training, + FunctionDef* function, + bool* updated); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_UPDATE_TPU_EMBEDDING_OPS_PASSES_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/variable_merger_pass.h b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/variable_merger_pass.h new file mode 100644 index 00000000..eaaaf1cf --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/graph_rewrite/variable_merger_pass.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Optimization pass that merges VarHandleOps and ReadVariableOps into their +// fused forms. +// +// The goal of this pass is to fix a latency problem sometimes observed in +// inference benchmarks. Often a inference step starts by reading the value of +// many weights. Reading a resource variable requires a VarHandleOp and a +// ReadVariableOp per variable. Running hundreds of trivial ops can add hundreds +// of microseconds of latency to the critical path of an inference step. The +// inter-op latency of the executor can be easily hundreds of nanoseconds, which +// rapidly adds up over many inexpensive ops. +// +// This pass merges VarHandleOps that have only the graph source node as a +// predecessor into a single VarHandlesOp that reads all at once. +// It then merges ReadVariablesOp that have no control inputs and originate from +// the same handle op into a single large ReadVariablesOp. + +#ifndef TENSORFLOW_CORE_TPU_GRAPH_REWRITE_VARIABLE_MERGER_PASS_H_ +#define TENSORFLOW_CORE_TPU_GRAPH_REWRITE_VARIABLE_MERGER_PASS_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +class VariableMergerPass : public GraphOptimizationPass { + public: + absl::Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_GRAPH_REWRITE_VARIABLE_MERGER_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/compiled_subgraph.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/compiled_subgraph.h new file mode 100644 index 00000000..bec973c6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/compiled_subgraph.h @@ -0,0 +1,173 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_COMPILED_SUBGRAPH_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_COMPILED_SUBGRAPH_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +// Forward declaration to avoid circular dependency. +class TpuCompilationCacheInterface; + +// Cache for compiled TPU program. 
+// +// Each key identifies a unique subgraph, and the value is the vector of +// protos that are emitted by compiling the subgraph. +// +// When a subgraph is considered for compilation, the client calls +// +// auto subgraph_key = ; +// auto compile_function = ; +// auto per_step_ref_holder = ; +// int64 uid; +// std::vector proto_key; +// CompileIfKeyAbsent(subgraph_key, per_step_ref_holder, &uid, &proto_key, +// compile_function); +// +// where subgraph_key is the key computed for the subgraph. On success, +// proto_key contains a vector of keys, where proto_key[i] can be used to look +// up the ith proto compiled from the subgraph, and uid contains an identifier +// that can be used in place of key for clients that require cheap +// serializable handles. If the compiled protos were not present in the cache, +// compile_function would be called to generate them. per_step_ref_holder +// extends the lifetime of cached results: it is guaranteed that the protos +// indicated in proto_key will be available for lookup for at least as long as +// per_step_ref_holder is not deleted. +// +// If the caller passes nullptr instead of a per_step_ref_holder then the +// caller is responsible for calling Release(subgraph_key) once for every call +// to CompileIfKeyAbsent(subgraph_key, ...) to discard the reference to the +// compilation results, after the caller is sure it will not look up the +// compiled executables again. +// +// Subsequently the client can call +// +// std::unique_ptr entry; +// Lookup(proto_key, &entry); +// auto proto = entry->get(); +// +// or +// +// std::unique_ptr entry; +// Lookup(uid, proto_index, &entry); +// auto proto = entry->get(); +// +// to access a cached proto. +// TODO(misard) Switch the existing TPU ops to use uid+proto_index instead of +// string keys for proto lookups. +// +// +// Usage details within the system: +// +// This cache lives in the resource manager of the TPU_SYSTEM device where the +// compiler runs, typically worker 0 of the system. The cache is discarded and +// a new one created whenever the system is reinitialized. +// +// A compiled subgraph is placed into the cache using a key that is a +// combination of the function name, guaranteed_constants, the shapes of the +// dynamic inputs to the subgraph, and the function library in use at the time +// of execution. +// +// Whenever a compile Op is run, it looks to see if there is already an entry +// in the cache corresponding to that Op and the current dynamic shapes, and +// creates one if not. The entry is marked as most recently used in the cache +// by the compile Op. The entry is reference counted. The cache owns one entry +// , and each step that has executed a compile Op referring to the entry owns +// a reference until that step completes. +// +// If the cache exceeds a configured storage limit, entries are marked for +// eviction in order of least recently used. An entry is not evicted until all +// references to it are discarded, so an entry that is marked for eviction can +// still be looked up by the execute Ops in a running step. If another Compile +// Op looks up an entry that is marked for eviction, the entry will be +// unmarked and set to most recently used. +// +struct CompiledSubgraph : public core::RefCounted { + TpuCompilationCacheInterface* parent = nullptr; // Not owned. + + bool initialized = false; + + // The Status returned by the compilation function when the entry is + // initialized. This status will be returned to any client that requests the + // entry. 
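+  // For example, if the compile function fails, the error is recorded here
+  // and every later lookup of this entry observes that same error.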
+ absl::Status initialization_status; + + // Counter to keep track of LRU entries for the eviction policy. + int64_t last_use = -1; + + // The unique key describing this entry. + std::string subgraph_key; + + // The uid describing this entry. + int64_t uid; + + // Compilation cache proto key to identify the cache entry. + std::vector proto_key; + + // Fingerprints of sharding programs if there is any. + std::vector sharding_key; + + // The number of 'external' client-held references to the entry. + int external_references = 0; + + // The sum of the SpaceUsed of each of the elements of programs; an estimate + // of how much RAM the entry consumes, used to determine when entries must + // be marked for eviction. + int64_t total_size = 0; + + // Debug info in case we miss. + std::string cache_entry_debug_string; + + // Entries representing the associated sharding and unsharding programs, + // which share the same life time of the owning main entry, so we always use + // the main entry's ref count. + std::unique_ptr sharding_entry; + std::unique_ptr unsharding_entry; + + // Only used for the nested sharding/unsharding entries to point to the + // owning main entry. + CompiledSubgraph* main_entry = nullptr; + + // Compiled TPU program group. + std::unique_ptr tpu_program_group; + + // Computes total program size. + size_t ComputeTotalSize() const { + CHECK_EQ(total_size, 0); + int64_t size = tpu_program_group->program_size(); + + if (sharding_entry != nullptr) { + size += sharding_entry->total_size; + } + if (unsharding_entry != nullptr) { + size += unsharding_entry->total_size; + } + return size; + } +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_COMPILED_SUBGRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/infeed_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/infeed_ops.h new file mode 100644 index 00000000..d6e24cf4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/infeed_ops.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_ + +#include +#include + +#include "xla/shape.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/transfer_ops.h" + +namespace tensorflow { + +// TODO(b/65200690): Rework this when there is a callback based infeed API to +// StreamExecutor. + +// The InfeedEnqueue op is used to deliver data to the device infeed queue. 
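+// Enqueue here is host-to-device: the op linearizes a host tensor into the
+// device's infeed queue (the prelinearized-buffer variant below skips the
+// linearization step).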
+class TpuInfeedEnqueueOp : public TpuTransferAsyncOpKernel { + public: + explicit TpuInfeedEnqueueOp( + OpKernelConstruction* ctx, + std::unique_ptr transfer_op); + absl::Status DoWork(OpKernelContext* ctx, int device_ordinal) override; + + private: + TensorShape shape_; + DataType dtype_; + xla::Shape xla_shape_; + + TpuInfeedEnqueueOp(const TpuInfeedEnqueueOp&) = delete; + TpuInfeedEnqueueOp& operator=(const TpuInfeedEnqueueOp&) = delete; +}; + +// The InfeedEnqueueTuple op is used on the host to deliver multiple tensors to +// the device infeed queue as an XLA tuple. +class TpuInfeedEnqueueTupleOp : public TpuTransferAsyncOpKernel { + public: + explicit TpuInfeedEnqueueTupleOp( + OpKernelConstruction* ctx, + std::unique_ptr transfer_op); + absl::Status DoWork(OpKernelContext* ctx, int device_ordinal) override; + + private: + std::vector shapes_; + DataTypeVector dtypes_; + xla::Shape tuple_shape_; + + TpuInfeedEnqueueTupleOp(const TpuInfeedEnqueueTupleOp&) = delete; + TpuInfeedEnqueueTupleOp& operator=(const TpuInfeedEnqueueTupleOp&) = delete; +}; + +// The InfeedEnqueuePrelinearizedBufferOp op is used to transfer prelinearized +// buffers to the device infeed queue. +class InfeedEnqueuePrelinearizedBufferOp : public TpuTransferAsyncOpKernel { + public: + explicit InfeedEnqueuePrelinearizedBufferOp( + OpKernelConstruction* ctx, + std::unique_ptr transfer_op); + + absl::Status DoWork(OpKernelContext* ctx, int device_ordinal) override; + + private: + InfeedEnqueuePrelinearizedBufferOp( + const InfeedEnqueuePrelinearizedBufferOp&) = delete; + InfeedEnqueuePrelinearizedBufferOp& operator=( + const InfeedEnqueuePrelinearizedBufferOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/outfeed_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/outfeed_ops.h new file mode 100644 index 00000000..8f1562e8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/outfeed_ops.h @@ -0,0 +1,139 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/literal_util.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "xla/literal.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/transfer_ops.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep + +namespace tensorflow { + +// The OutfeedDequeue op is used to retrieve a single tensor from the device +// outfeed queue. +template +class TpuOutfeedDequeueOp : public T { + public: + explicit TpuOutfeedDequeueOp( + OpKernelConstruction* ctx, + std::unique_ptr transfer_op) + : T(ctx, "outfeed_dequeue", 1, std::move(transfer_op)) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape_, &xla_shape_)); + } + + absl::Status DoWork(OpKernelContext* ctx, int device_ordinal) override { + Tensor* output; + TF_RETURN_IF_ERROR(ctx->allocate_output(0, shape_, &output)); + + // Transfer from the outfeed interface of the device. + xla::MutableBorrowingLiteral literal; + TF_RETURN_IF_ERROR( + HostTensorToMutableBorrowingLiteral(xla_shape_, output, &literal)); + + VLOG(1) << "TransferLiteralFromOutfeed " + << xla::ShapeUtil::HumanStringWithLayout(xla_shape_); + + TF_RETURN_IF_ERROR( + T::transfer_op_->TransferLiteralFromOutfeed(device_ordinal, literal)); + + VLOG(1) << "TransferLiteralFromOutfeed complete."; + + return absl::OkStatus(); + } + + private: + TensorShape shape_; + DataType dtype_; + xla::Shape xla_shape_; + + TpuOutfeedDequeueOp(const TpuOutfeedDequeueOp&) = delete; + TpuOutfeedDequeueOp& operator=(const TpuOutfeedDequeueOp&) = delete; +}; + +// The OutfeedDequeueTuple op is used to retrieve multiple tensors from the +// device outfeed queue. +template +class TpuOutfeedDequeueTupleOp : public T { + public: + explicit TpuOutfeedDequeueTupleOp( + OpKernelConstruction* ctx, + std::unique_ptr transfer_op) + : T(ctx, "outfeed_dequeue", 1, std::move(transfer_op)) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &shapes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_)); + OP_REQUIRES( + ctx, shapes_.size() == dtypes_.size(), + errors::InvalidArgument("shapes and dtypes must be the same length.")); + // The `dtypes` list is inferred from the supplied inputs, so it + // is always the correct length. 
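+    // Precompute the per-output XLA shapes (and, below, the aggregate tuple
+    // shape) once at construction time so DoWork() can reuse them.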
+ for (int i = 0; i < shapes_.size(); i++) { + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, + TensorShapeToXLAShape(dtypes_[i], shapes_[i], &xla_shape)); + xla_shapes_.push_back(xla_shape); + } + tuple_shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes_); + } + + absl::Status DoWork(OpKernelContext* ctx, int device_ordinal) override { + VLOG(1) << "TransferLiteralFromOutfeed " + << xla::ShapeUtil::HumanStringWithLayout(tuple_shape_); + + for (int i = 0; i < shapes_.size(); ++i) { + Tensor* output; + TF_RETURN_IF_ERROR(ctx->allocate_output(i, shapes_[i], &output)); + + xla::MutableBorrowingLiteral literal; + TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral(xla_shapes_[i], + output, &literal)); + TF_RETURN_IF_ERROR( + T::transfer_op_->TransferLiteralFromOutfeed(device_ordinal, literal)); + } + return absl::OkStatus(); + } + + private: + std::vector shapes_; + DataTypeVector dtypes_; + std::vector xla_shapes_; + xla::Shape tuple_shape_; + + TpuOutfeedDequeueTupleOp(const TpuOutfeedDequeueTupleOp&) = delete; + TpuOutfeedDequeueTupleOp& operator=(const TpuOutfeedDequeueTupleOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sharding_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sharding_utils.h new file mode 100644 index 00000000..e557c5dd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sharding_utils.h @@ -0,0 +1,455 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/macros.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace sharding_internal { +absl::Status ValidateShapesForSlice(absl::string_view input_name, + const Tensor* input, + const std::vector& num_splits, + const std::vector& paddings); +template +Eigen::DSizes TF_ATTRIBUTE_NOINLINE +ShapeAsEigenDSizes(const TensorShape& shape); +template +Eigen::DSizes ShapeAsEigenDSizes( + const TensorShape& shape) { + return shape.AsEigenDSizes(); +} + +} // namespace sharding_internal + +// Converts flatten index to start indices (subscript scaled with slice shape) +// for determining where to start a slice in the input tensor. 
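+// For example, assuming the last dimension varies fastest (an illustrative
+// sketch, not taken verbatim from the implementation): with
+// num_partitions = {2, 3} and slice_shape = {4, 5}, flat index 5 decomposes
+// into subscripts {5 / 3, 5 % 3} = {1, 2}, giving start indices
+// {1 * 4, 2 * 5} = {4, 10}.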
+template +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); + +template +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, + const int index) { + return Eigen::DSizes(); +} + +// Shared base class to save code space +template +class XlaNDSplitter { + public: + static absl::StatusOr> Create( + const std::vector& num_splits, int num_slices, + const std::vector& paddings, bool has_paddings) { + if (num_splits.size() != paddings.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_splits size ", num_splits.size(), + " mismatch with paddings size ", paddings.size(), ".")); + } + + int splits_cnt = 1; + for (auto split : num_splits) { + splits_cnt *= split; + } + + if (num_slices != splits_cnt) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect num_slices ", splits_cnt, " but got ", num_slices)); + } + + return XlaNDSplitter(num_splits, num_slices, paddings, + has_paddings); + } + + // Split the given input. + // + // The splitted outputs are stored into tensors allocated by + // `allocate_output_fn`. In the simple case of pass through (no split and no + // padding), the output is stored through the fast path by + // `assign_or_copy_value_fn`. + absl::Status Split( + const Tensor* input, absl::string_view input_name, + const std::function& assign_or_copy_value_fn, + const std::function& allocate_output_fn, + const Device& device) { + if (num_splits_.size() != paddings_.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_splits size ", num_splits_.size(), + " mismatch with paddings size ", paddings_.size(), ".")); + } + + const int rank = input->shape().dims(); + const auto& input_shape = input->shape().dim_sizes(); + + TF_RETURN_IF_ERROR(sharding_internal::ValidateShapesForSlice( + input_name, input, num_splits_, paddings_)); + + TensorShape output_slice_shape; + for (int i = 0; i < rank; ++i) { + output_slice_shape.AddDim((input_shape[i] + paddings_[i]) / + ((num_slices_ == 1) ? 
1 : num_splits_[i])); + } + if (num_slices_ == 1 && !has_paddings_) { + // Handle simple case first + TF_RETURN_IF_ERROR(assign_or_copy_value_fn(*input)); + } else { + std::vector output_slices(num_slices_); + for (int i = 0; i < num_slices_; i++) { + TF_RETURN_IF_ERROR(allocate_output_fn( + /*index=*/i, output_slice_shape, &output_slices[i])); + } + + if (rank == 1) { + SliceAndMaybePad<1>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 2) { + SliceAndMaybePad<2>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 3) { + SliceAndMaybePad<3>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 4) { + SliceAndMaybePad<4>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 5) { + SliceAndMaybePad<5>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 6) { + SliceAndMaybePad<6>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 7) { + SliceAndMaybePad<7>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 8) { + SliceAndMaybePad<8>(device, input, input_shape, output_slice_shape, + output_slices); + } + } + return absl::OkStatus(); + } + + private: + template + class SliceAndMaybePadState { + public: + int num_complete_pad_dims_; + int num_partial_pad_dims_; + TensorShape non_padded_slice_shape_; + Eigen::array, Rank> slice_paddings_; + Eigen::DSizes slice_indices_; + Eigen::DSizes output_slice_shape_dsizes_; + Eigen::DSizes non_padded_slice_shape_dsizes_; + + TF_ATTRIBUTE_NOINLINE SliceAndMaybePadState( + absl::Span num_splits, + const absl::Span input_shape, + const TensorShape& output_slice_shape, int slice_index) { + output_slice_shape_dsizes_ = + sharding_internal::ShapeAsEigenDSizes(output_slice_shape); + num_complete_pad_dims_ = 0; + num_partial_pad_dims_ = 0; + slice_indices_ = GetSliceIndices( + num_splits, output_slice_shape_dsizes_, slice_index); + + // Calculate paddings necessary for slice instead of padding input and + // slicing subsequently to reduce temporary memory allocation. + for (int dim = 0; dim < Rank; ++dim) { + const int64_t dim_size = input_shape[dim]; + const int64_t out_dim = output_slice_shape_dsizes_[dim]; + int64_t non_padded_dim = 0; + if (slice_indices_[dim] >= dim_size) { + // Complete padding. + slice_indices_[dim] = dim_size; + non_padded_dim = 0; + slice_paddings_[dim] = {0, out_dim}; + num_complete_pad_dims_++; + } else if (slice_indices_[dim] + out_dim > dim_size) { + // Partial padding. 
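+          // Only the first (dim_size - slice_index) elements in this
+          // dimension come from the input; the remaining positions of the
+          // output slice are filled with padding.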
+ non_padded_dim = dim_size - slice_indices_[dim]; + slice_paddings_[dim] = {0, out_dim - non_padded_dim}; + num_partial_pad_dims_++; + } else { + non_padded_dim = out_dim; + } + non_padded_slice_shape_.AddDim(non_padded_dim); + } + non_padded_slice_shape_dsizes_ = + sharding_internal::ShapeAsEigenDSizes(non_padded_slice_shape_); + } + }; + + std::vector num_splits_; + int num_slices_; + std::vector paddings_; + bool has_paddings_; + + explicit XlaNDSplitter(const std::vector& num_splits, int num_slices, + const std::vector& paddings, + bool has_paddings) + : num_splits_(num_splits), + num_slices_(num_slices), + paddings_(paddings), + has_paddings_(has_paddings) {} + + void TF_ATTRIBUTE_NOINLINE SetToConstant(Tensor* output_slice, + const Device& device) { + auto output_flat = output_slice->flat(); + output_flat.device(device) = output_flat.constant(T()); + } + + template + void TF_ATTRIBUTE_NOINLINE AssignFromInput( + Tensor* output_slice, const Device& device, const Tensor* input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& output_slice_shape_dsizes) { + output_slice->tensor().device(device) = + input->tensor().slice(slice_indices, + output_slice_shape_dsizes); + } + + template + void TF_ATTRIBUTE_NOINLINE + SliceAndMaybePad(const Device& device, const Tensor* input, + const absl::Span input_shape, + const TensorShape& output_slice_shape, + const std::vector& output_slices) { + const auto& input_tensor = input->tensor(); + // Slice shape with optional padding. + for (int i = 0; i < num_slices_; ++i) { + Tensor* output_slice = output_slices[i]; + SliceAndMaybePadState r(num_splits_, input_shape, + output_slice_shape, i); + if (r.num_complete_pad_dims_ == Rank || + (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0)) { + // Need to init padding + SetToConstant(output_slice, device); + } + if (r.num_complete_pad_dims_ == Rank) { + // Done + } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { + output_slice->tensor() + .slice(Eigen::DSizes(), + r.non_padded_slice_shape_dsizes_) + .device(device) = input_tensor.slice( + r.slice_indices_, r.non_padded_slice_shape_dsizes_); + } else { + AssignFromInput(output_slice, device, input, r.slice_indices_, + r.output_slice_shape_dsizes_); + } + } + } +}; + +// Shared base class to save code space +template +class XlaNDConcatenator { + public: + static absl::StatusOr> Create( + const std::vector& num_concats, int num_slices, + const std::vector& paddings, bool has_paddings) { + if (num_concats.size() != paddings.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_concats size ", num_concats.size(), + " mismatch with paddings size ", paddings.size(), ".")); + } + + int concats_cnt = 1; + for (auto concat : num_concats) { + concats_cnt *= concat; + } + + if (num_slices != concats_cnt) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect num_slices ", concats_cnt, " but got ", num_slices)); + } + + return XlaNDConcatenator(num_concats, num_slices, paddings, + has_paddings); + } + absl::Status ComputeInternal( + absl::Span inputs, + const std::function& assign_or_copy_value_fn, + const std::function()>& get_output_fn, + const Device& device) { + const int rank = inputs[0].shape().dims(); + + if (rank < 1 || rank > 8) { + return absl::InvalidArgumentError(absl::StrCat( + "'inputs' tensors must have rank in range (0, 8], but got ", rank, + ".")); + } + + if (num_slices_ == 1 && !has_paddings_) { + // Simple case + return assign_or_copy_value_fn(inputs[0]); + } + + 
TF_ASSIGN_OR_RETURN(Tensor * output, get_output_fn()); + + if (rank == 1) { + MaybeUnpadAndAssign<1>(device, inputs, output); + } else if (rank == 2) { + MaybeUnpadAndAssign<2>(device, inputs, output); + } else if (rank == 3) { + MaybeUnpadAndAssign<3>(device, inputs, output); + } else if (rank == 4) { + MaybeUnpadAndAssign<4>(device, inputs, output); + } else if (rank == 5) { + MaybeUnpadAndAssign<5>(device, inputs, output); + } else if (rank == 6) { + MaybeUnpadAndAssign<6>(device, inputs, output); + } else if (rank == 7) { + MaybeUnpadAndAssign<7>(device, inputs, output); + } else if (rank == 8) { + MaybeUnpadAndAssign<8>(device, inputs, output); + } + return absl::OkStatus(); + } + + private: + template + class MaybeUnpadAndAssignState { + public: + int num_complete_pad_dims_; + int num_partial_pad_dims_; + TensorShape non_padded_slice_shape_; + Eigen::DSizes slice_shape_dsizes_; + Eigen::array, Rank> slice_paddings_; + Eigen::DSizes slice_indices_; + Eigen::DSizes output_slice_shape_dsizes_; + Eigen::DSizes non_padded_slice_shape_dsizes_; + + TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssignState( + absl::Span num_concats, const Tensor& input0, + Tensor* output, int slice_index) { + slice_shape_dsizes_ = input0.shape().AsEigenDSizes(); + slice_indices_ = + GetSliceIndices(num_concats, slice_shape_dsizes_, slice_index); + num_complete_pad_dims_ = 0; + num_partial_pad_dims_ = 0; + // Calculate paddings necessary to strip from slice. + for (int dim = 0; dim < Rank; ++dim) { + const int64_t dim_size = output->shape().dim_size(dim); + int64_t non_padded_dim = 0; + if (slice_indices_[dim] >= dim_size) { + // Complete padding. + slice_indices_[dim] = dim_size; + non_padded_dim = 0; + num_complete_pad_dims_++; + } else if (slice_indices_[dim] + slice_shape_dsizes_[dim] > dim_size) { + // Partial padding. 
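+          // This input slice extends past the output boundary; only its
+          // leading (dim_size - slice_index) elements are kept and the
+          // padded tail is dropped.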
+ non_padded_dim = dim_size - slice_indices_[dim]; + num_partial_pad_dims_++; + } else { + non_padded_dim = slice_shape_dsizes_[dim]; + } + non_padded_slice_shape_.AddDim(non_padded_dim); + } + non_padded_slice_shape_dsizes_ = + non_padded_slice_shape_.AsEigenDSizes(); + } + }; + + std::vector num_concats_; + int num_slices_; + std::vector paddings_; + bool has_paddings_; + + explicit TF_ATTRIBUTE_NOINLINE XlaNDConcatenator( + const std::vector& num_concats, int num_slices, + const std::vector& paddings, bool has_paddings) + : num_concats_(num_concats), + num_slices_(num_slices), + paddings_(paddings), + has_paddings_(has_paddings) {} + + template + void TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssign( + const Device& device, absl::Span inputs, Tensor* output) { + for (int i = 0; i < num_slices_; ++i) { + MaybeUnpadAndAssignState r(num_concats_, inputs[0], output, i); + if (r.num_complete_pad_dims_ == Rank) { + continue; + } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { + output->tensor() + .slice(r.slice_indices_, r.non_padded_slice_shape_dsizes_) + .device(device) = inputs[i].tensor().slice( + Eigen::DSizes(), + r.non_padded_slice_shape_dsizes_); + } else { + output->tensor() + .slice(r.slice_indices_, r.slice_shape_dsizes_) + .device(device) = inputs[i].tensor(); + } + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_layout.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_layout.h new file mode 100644 index 00000000..9f4697c2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_layout.h @@ -0,0 +1,132 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_LAYOUT_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_LAYOUT_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/btree_map.h" +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/tpu/kernels/sparse_core_layout.pb.h" + +namespace tensorflow::tpu { + +// A class to figure out which tables to stack. +class SparseCoreLayoutStacker { + public: + // Constructor. Arguments: + // num_partitions: How many shards the sparse core shards are concatenated + // into (usually one per TPU chip). + // NOTE: As of Q4 2023, SPMD is not supported by the sparse core python + // libraries so we don't support it here. + // sparse_cores_per_partition: Number of sparsecore per partition + // disable_table_stacking: Should not stack tables. + explicit SparseCoreLayoutStacker(int num_partitions, + bool disable_table_stacking = false, + int sparse_cores_per_partition = 4); + + // Change various limits. You must call these before calling Addtable. 
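+  // A hypothetical call sequence (the table name, group, and limit values
+  // below are illustrative only, not taken from a real configuration):
+  //
+  //   SparseCoreLayoutStacker stacker(/*num_partitions=*/4);
+  //   stacker.SetStackingRowLimit(1 << 20);  // must precede AddTable()
+  //   TF_RETURN_IF_ERROR(stacker.AddTable("table_a", /*table_height=*/1000,
+  //                                       /*table_width=*/16, "group_0",
+  //                                       /*output_samples=*/128));
+  //   TF_ASSIGN_OR_RETURN(auto layouts, stacker.GetLayouts());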
+ void SetActivationMemoryBytesLimit(int64_t activation_mem_bytes_limit) { + CHECK(stacks_by_group_.empty()) << "must call before AddTable"; + activation_mem_bytes_limit_ = activation_mem_bytes_limit; + } + void SetVariableShardBytesLimit(int64_t variable_shard_bytes_limit) { + CHECK(stacks_by_group_.empty()) << "must call before AddTable"; + variable_shard_bytes_limit_ = variable_shard_bytes_limit; + } + void SetStackingEnabled(bool stacking_enabled) { + CHECK(stacks_by_group_.empty()) << "must call before AddTable"; + stacking_enabled_ = stacking_enabled; + } + void SetStackingRowLimit(int64_t row_limit) { + CHECK(stacks_by_group_.empty()) << "must call before AddTable"; + row_limit_ = row_limit; + } + void SetStackingTableLimit(int table_limit) { + CHECK(stacks_by_group_.empty()) << "must call before AddTable"; + table_limit_ = table_limit; + } + + // Add a new table. Arguments: + // table_name: How this table will be referred to. + // table_height: The number of rows. + // table_width: The number of columns in the input layer. For storage, this + // will be rounded up to a multiple of eight, but the padding columns will + // be stripped off when fed into the rest of the model. + // group: An arbitrary identifier that should be derived from the optimizer + // and hyperparameters. Only tables with the same group and rounded + // table_width can be stacked. The actual contents of this field are not + // particularly meaningful except they are used to construct the + // stack_name field in the SparseCoreTableLayout. + // output_samples: How many times a row from this table will have to be + // returned per batch. This is ordinarily the batch size unless we lay out + // several values from the same example in a sequence, or if multiple + // features share the same table. + // + // Be sure you call AddTable in a deterministic order; the details of the + // stacking will depend on the order you call AddTable. + absl::Status AddTable(absl::string_view table_name, int64_t table_height, + int64_t table_width, absl::string_view group, + int64_t output_samples); + + // Get the information about each table out. + absl::StatusOr GetLayouts(); + + private: + struct TableStack { + // A name we give the stack while we're constructing it. The name will be + // overridden later to be equal to the names of the tables. + std::string temporary_name; + int64_t padded_width = 0; + int64_t unsharded_height = 0; + int64_t total_activation_mem_bytes = 0; + int64_t total_variable_shard_bytes = 0; + + // While we're filling out this structure, we can't fill out all the fields + // in the SparseCoreTableLayout; we fill out as many of them as we can. + std::vector incomplete_tables; + }; + + const int num_partitions_; + const int sparse_cores_per_partition_; + const int num_sparse_cores_; + + bool stacking_enabled_ = true; + int64_t activation_mem_bytes_limit_ = 0; + int64_t variable_shard_bytes_limit_ = 0; + // Sparse core ops use signed int for row numbers so we had better not stack + // beyond this limit. + int64_t row_limit_ = (1LL << 31) - 1; + + // The maximum number of tables in any stack. + int table_limit_ = std::numeric_limits::max(); + + // All the stacks that we currently know about. Note that we use a btree_map + // rather than a flat_hash_map so the resulting order is deterministic as long + // as we are called in a deterministic order. Key is (padded_width, group). 
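+  // For example (group names purely illustrative), (16, "adam_g0") and
+  // (24, "adam_g0") are distinct keys, so tables with different padded
+  // widths are never stacked together even within the same group.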
+ absl::btree_map, std::vector> + stacks_by_group_; +}; + +} // namespace tensorflow::tpu + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_LAYOUT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_stats_handler.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_stats_handler.h new file mode 100644 index 00000000..8667d49a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_stats_handler.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_STATS_HANDLER_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_STATS_HANDLER_H_ + +#include +#include + +enum class StatsType { + NUM_MINIBATCHES_PER_SC, + MAX_IDS_PER_PARTITION, + MAX_UNIQUE_IDS_PER_PARTITION, + IDS_PER_PARTITION, + UNIQUE_IDS_PER_PARTITION, + DROPPED_ID_COUNT, +}; + +class SparseCoreOpsStatsHandler { + public: + virtual ~SparseCoreOpsStatsHandler() = default; + virtual void Record( + StatsType type, int64_t value, std::string device_name, + std::string table_name) { /* Default implementation does nothing */ + } +}; + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_STATS_HANDLER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h new file mode 100644 index 00000000..dc9b028e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h @@ -0,0 +1,75 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Pad value used for SparseCore mini batching logic. 
+const int32_t kXlaPadValue = std::numeric_limits::max(); + +std::vector ConvertBinarySplitsToBucketSplits(int64 split, + int max_division_level); + +int64 ConvertBucketSplitsToBinarySplits(std::vector bucket_splits, + int max_division_level); + +absl::Status ValidateInputCombiner(const std::string& combiner); + +std::function GetCombinerScaleContributionFunction( + absl::string_view combiner); + +std::function GetCombinerScaleTransformFunction( + absl::string_view combiner); + +// Stacks tables, so long as table have the same 'group' index. We assume that +// all tables with a given group index have the same width. Returns a list of +// list of table names, in alphabetical order. +std::vector> GetTableStacks( + const std::vector& table_height, + const std::vector& table_width, + const std::vector& table_num_samples, + const std::vector& table_group, + const std::vector& table_names, int64_t num_tpu_chips); + +int GetMinibatchMaxDivisionLevel(); + +bool GetDisableTableStacking(); + +int64_t GetXlaSparseCoreStackingMemLimit(); + +int64_t GetXlaSparseCoreStackingTableShardLimit(); + +absl::Status GetMaxIdsAndUniquesExternal(const std::string& program_key, + const std::string& table_name, + int64_t num_samples_per_sparse_core, + int64_t feature_width, + int64_t* max_ids_per_partition, + int64_t* max_unique_ids_per_partition); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h new file mode 100644 index 00000000..96b39458 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h @@ -0,0 +1,261 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_PREPROCESS_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_PREPROCESS_OPS_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/tpu/kernels/sparse_core_ops_stats_handler.h" + +namespace tensorflow { + +// Struct to describe an embedding lookup input data. +struct EmbeddingLookupInput { + // Which replica it belongs. + int32 replica_id; + // Token id. + int32 token_id; + // Sample id. + int32 sample_id; + // Gain. 
+ float gain; + + EmbeddingLookupInput(int32 replica_id, int32 token_id, int32 sample_id, + float gain) + : replica_id(replica_id), + token_id(token_id), + sample_id(sample_id), + gain(gain) {} +}; + +absl::Status ValidateInputs(const Tensor& indices_or_row_splits, + const Tensor& values, const Tensor& weights, + int sample_count); + +// Compute the row id list before padding. +absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, + int32 total_id_count, + int32 sample_count, + int32* row_ids_before_padding); + +class GetMinibatchesInCsrWithPhysicalReplicaOp : public OpKernel { + public: + explicit GetMinibatchesInCsrWithPhysicalReplicaOp(OpKernelConstruction* ctx); + ~GetMinibatchesInCsrWithPhysicalReplicaOp() override = default; + GetMinibatchesInCsrWithPhysicalReplicaOp( + const GetMinibatchesInCsrWithPhysicalReplicaOp&) = delete; + GetMinibatchesInCsrWithPhysicalReplicaOp& operator=( + const GetMinibatchesInCsrWithPhysicalReplicaOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + protected: + int sample_count_ = 1; + int feature_width_ = 1; + int64_t num_sc_per_chip_; + std::string table_name_; + std::unique_ptr sparse_core_ops_stats_handler_; + + bool allow_id_dropping_for_minibatching_ = false; + + private: + int num_replica_ = 1; + int max_minibatches_per_sc_ = 1; + int max_ids_per_chip_per_sample_ = 1; + int table_vocab_size_ = 1; + std::string device_name_; +}; + +class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel { + public: + explicit GetMinibatchSplitsWithPhysicalReplicaOp(OpKernelConstruction* ctx); + ~GetMinibatchSplitsWithPhysicalReplicaOp() override = default; + GetMinibatchSplitsWithPhysicalReplicaOp( + const GetMinibatchSplitsWithPhysicalReplicaOp&) = delete; + GetMinibatchSplitsWithPhysicalReplicaOp& operator=( + const GetMinibatchSplitsWithPhysicalReplicaOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + protected: + virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques, + tstring program_key, + int64_t max_ids_per_partition, + int64_t max_unique_ids_per_partition, + int32_t dropped_id_count) {} + virtual inline int32_t CalculateBucketIdWithHashing(int32_t col_id, + int32_t num_buckets) { + // TODO(pineapplejuice233): Add a proper hashing function here. 
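The bucketing hook above is left as a TODO, and the default implementation that follows simply takes col_id % num_buckets. A hypothetical alternative (not TensorFlow code) would mix the bits of the column id with a published public-domain 32-bit integer hash before reducing it to a bucket, so consecutive ids do not land in consecutive buckets:

#include <cstdint>

// Hypothetical sketch only. Assumes num_buckets > 0.
inline int32_t ExampleBucketIdWithMixing(int32_t col_id, int32_t num_buckets) {
  uint32_t x = static_cast<uint32_t>(col_id);
  x ^= x >> 16;
  x *= 0x7feb352dU;  // Mixing constants from a public-domain 32-bit hash.
  x ^= x >> 15;
  x *= 0x846ca68bU;
  x ^= x >> 16;
  return static_cast<int32_t>(x % static_cast<uint32_t>(num_buckets));
}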
+ return col_id % num_buckets; + } + + std::string device_name_; + std::string table_name_; + std::unique_ptr sparse_core_ops_stats_handler_; + bool allow_id_dropping_for_minibatching_ = false; + bool allow_id_shuffling_for_minibatching_ = false; + + private: + int num_replica_ = 1; + int sample_count_ = 1; + int table_vocab_size_ = 1; + int feature_width_ = 1; + int64_t num_sc_per_chip_; +}; + +class StoreMinibatchStatisticsInFdoOp : public OpKernel { + public: + explicit StoreMinibatchStatisticsInFdoOp(OpKernelConstruction* ctx); + ~StoreMinibatchStatisticsInFdoOp() override = default; + StoreMinibatchStatisticsInFdoOp(const StoreMinibatchStatisticsInFdoOp&) = + delete; + StoreMinibatchStatisticsInFdoOp& operator=( + const StoreMinibatchStatisticsInFdoOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + protected: + virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques, + tstring program_key, + int64_t max_ids_per_partition, + int64_t max_unique_ids_per_partition) {} + std::string device_name_; + std::string table_name_; + + private: + int num_replica_ = 1; + int sample_count_ = 1; + int feature_width_ = 1; + int64_t num_sc_per_chip_; +}; + +// TODO(pineapplejuice233): Unify this op with ConvertToListOfCooTensorsV2Op. +class ConvertToListOfSparseCoreCooTensorsOp : public OpKernel { + public: + explicit ConvertToListOfSparseCoreCooTensorsOp(OpKernelConstruction* ctx); + ~ConvertToListOfSparseCoreCooTensorsOp() override = default; + ConvertToListOfSparseCoreCooTensorsOp( + const ConvertToListOfSparseCoreCooTensorsOp&) = delete; + ConvertToListOfSparseCoreCooTensorsOp& operator=( + const ConvertToListOfSparseCoreCooTensorsOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + private: + void WriteToOutputTensor(int32* row_ids, int32* col_ids, float* gains, + int32* row_ids_tensor_ptr, int32* col_ids_tensor_ptr, + float* gains_tensor_ptr, int32_t begin_index, + int32_t end_index, int32_t sc_id, + std::optional> gains_rescale); + int sample_count_; + int num_sc_per_chip_; + int per_sc_sample_count_; + int row_offset_; + int col_offset_; + int col_shift_; + int num_sc_shards_; + int stacked_table_sample_count_; + int num_sc_shards_bit_mod_; + int num_sc_shards_bit_mod_inv_; + int per_sc_row_offset_; + int per_sc_stacked_table_sample_count_; + std::string combiner_; +}; + +class SortListOfSparseCoreCooTensorsOp : public OpKernel { + public: + explicit SortListOfSparseCoreCooTensorsOp(OpKernelConstruction* ctx); + ~SortListOfSparseCoreCooTensorsOp() override = default; + SortListOfSparseCoreCooTensorsOp(const SortListOfSparseCoreCooTensorsOp&) = + delete; + SortListOfSparseCoreCooTensorsOp& operator=( + const SortListOfSparseCoreCooTensorsOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + private: + int32_t num_sc_per_chip_; + int32_t feature_width_; + int32_t num_replica_; + int32_t num_physical_replica_; + int32_t num_physical_replica_bit_; + int32_t max_ids_per_sparse_core_; + int32_t max_unique_ids_per_sparse_core_; + std::string table_name_; + std::vector sample_count_list_; + std::vector col_offset_list_; + std::map> col_offset_to_feature_id_; +}; + +class ConvertToSparseCoreCsrWrappedCooTensorOp : public OpKernel { + public: + explicit ConvertToSparseCoreCsrWrappedCooTensorOp(OpKernelConstruction* ctx); + ~ConvertToSparseCoreCsrWrappedCooTensorOp() override = default; + ConvertToSparseCoreCsrWrappedCooTensorOp( + const ConvertToSparseCoreCsrWrappedCooTensorOp&) = delete; + ConvertToSparseCoreCsrWrappedCooTensorOp& operator=( 
+ const ConvertToSparseCoreCsrWrappedCooTensorOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + private: + int32_t num_sc_per_chip_; + int32_t table_vocab_size_; + int32_t feature_width_; + int32_t num_replica_; + int32_t sample_count_per_sc_; + int32_t max_minibatches_per_sc_; + int32_t max_ids_per_chip_per_sample_; + bool allow_id_dropping_; + std::string table_name_; + std::string device_name_; +}; + +class GetStatsFromListOfSparseCoreCooTensorsOp : public OpKernel { + public: + explicit GetStatsFromListOfSparseCoreCooTensorsOp(OpKernelConstruction* ctx); + ~GetStatsFromListOfSparseCoreCooTensorsOp() override = default; + GetStatsFromListOfSparseCoreCooTensorsOp( + const GetStatsFromListOfSparseCoreCooTensorsOp&) = delete; + GetStatsFromListOfSparseCoreCooTensorsOp& operator=( + const GetStatsFromListOfSparseCoreCooTensorsOp&) = delete; + + void Compute(OpKernelContext* ctx) override; + + private: + int32_t num_sc_per_chip_; + int32_t feature_width_; + int32_t num_replica_; + int32_t num_physical_replica_; + int32_t num_physical_replica_bit_; + std::string table_name_; + std::vector sample_count_list_; + std::vector col_offset_list_; + std::map> col_offset_to_feature_id_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_PREPROCESS_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h new file mode 100644 index 00000000..42a25836 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_ + +#include + +namespace tensorflow { + +constexpr int kDefaultSparseCoreMinibatchMaxDivisionLevel = 6; +constexpr bool kDefaultDisableTableStacking = false; +constexpr int64_t kDefaultXlaSparseCoreStackingMemLimit = 2097152; +constexpr int64_t kDefaultXlaSparseCoreStackingTableShardLimit = 2147483648; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h new file mode 100644 index 00000000..71995cb9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h @@ -0,0 +1,47 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_ + +#include "xla/hlo/builder/xla_builder.h" +#include "xla/xla_data.pb.h" +#include "tsl/platform/macros.h" + +// RAII helper to set the frontend attribute for the target chip to the SC. +// Automatically restores the frontend attributes on exit. +class UseSparseCoreFrontendAttributes { + public: + explicit UseSparseCoreFrontendAttributes(xla::XlaBuilder* builder) + : builder_(builder), + original_frontend_attributes_(builder->frontend_attributes()) { + xla::FrontendAttributes sc_attributes = original_frontend_attributes_; + (*sc_attributes.mutable_map())["_xla_compute_type"] = "sparse"; + builder_->SetFrontendAttributes(sc_attributes); + } + + ~UseSparseCoreFrontendAttributes() { + builder_->SetFrontendAttributes(original_frontend_attributes_); + } + + private: + xla::XlaBuilder* builder_; + const xla::FrontendAttributes original_frontend_attributes_; + + UseSparseCoreFrontendAttributes(const UseSparseCoreFrontendAttributes&) = + delete; + void operator=(const UseSparseCoreFrontendAttributes&) = delete; +}; + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h new file mode 100644 index 00000000..52d2d8b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_H_ + +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +// Cache entry to hold a `TpuProgramGroupInterface` object that can be used to +// fetch a TPU program for a given TPU core index. +class TpuCompilationCacheEntry { + public: + explicit TpuCompilationCacheEntry( + const TpuProgramGroupInterface* tpu_program_group, int core_index) + : tpu_program_group_(tpu_program_group), core_index_(core_index) {} + + // Constructor for an empty entry. 
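As a usage note for the UseSparseCoreFrontendAttributes guard declared in sparse_core_xla_ops.h above: it is a scoped RAII helper, so SparseCore-targeted ops are emitted inside a block and the builder's previous frontend attributes come back automatically. A minimal sketch; the builder name and the constant op are placeholders, not part of this change:

#include "xla/hlo/builder/xla_builder.h"

void BuildSparseCoreExample() {
  xla::XlaBuilder builder("sparse_core_example");
  {
    UseSparseCoreFrontendAttributes sc_scope(&builder);
    // Ops emitted in this scope carry "_xla_compute_type" = "sparse".
    xla::XlaOp one = xla::ConstantR0<float>(&builder, 1.0f);
    (void)one;
  }
  // The builder's previous frontend attributes are restored here.
}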
+ TpuCompilationCacheEntry() : tpu_program_group_(nullptr), core_index_(-1) {} + + const TpuProgramGroupInterface* tpu_program_group() const { + return tpu_program_group_; + } + + int core_index() const { return core_index_; } + + private: + const TpuProgramGroupInterface* tpu_program_group_; + int core_index_; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h new file mode 100644 index 00000000..c85376a7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h @@ -0,0 +1,78 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_UNLOADER_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_UNLOADER_H_ + +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_set.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep +#include "tsl/platform/macros.h" + +namespace tensorflow { +namespace tpu { + +class TpuCompilationCacheEntryUnloader : public ResourceBase { + public: + explicit TpuCompilationCacheEntryUnloader(TpuCompilationCacheInterface* cache) + : cache_(cache) { + // Hold a reference to the cache until the unloader is destroyed. + cache_->Ref(); + VLOG(1) << "Will unload compilation cache entries when session closes."; + } + + ~TpuCompilationCacheEntryUnloader() override { + absl::MutexLock lock(&mu_); + for (int64_t uid : cache_entry_uids_) { + absl::Status s = cache_->MarkEntryForEviction(uid); + if (!s.ok()) { + LOG(WARNING) << "MarkEntryForEviction in " + "~CompilationCacheEntryUnloader fails with error " + << s; + } + } + // Release our reference to the cache. + cache_->Unref(); + } + + // Add cache entry uid to be unloaded in destructor. + void AddCacheEntryUid(int64_t uid) { + absl::MutexLock lock(&mu_); + cache_entry_uids_.insert(uid); + } + + std::string DebugString() const override { + return "CompilationCacheEntryUnloader"; + } + + private: + TpuCompilationCacheEntryUnloader(const TpuCompilationCacheEntryUnloader&) = + delete; + void operator=(const TpuCompilationCacheEntryUnloader&) = delete; + mutable absl::Mutex mu_; + TpuCompilationCacheInterface* cache_; // Not owned. 
+ absl::flat_hash_set cache_entry_uids_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_ENTRY_UNLOADER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h new file mode 100644 index 00000000..d415feae --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_EXTERNAL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_EXTERNAL_H_ + +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/compiled_subgraph.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +class TpuCompilationCacheExternal : public TpuCompilationCacheInterface { + public: + explicit TpuCompilationCacheExternal(int64_t max_cache_size) + : TpuCompilationCacheInterface(max_cache_size) {} + + std::string DebugString() const override { + return "TpuCompilationCacheExternal"; + } + + private: + // Creates a new entry by running initialize_programs and places it in the + // cache to be looked up by key. The new entry is in the 'marked for eviction' + // state (not present in entries_by_last_use_) and the caller is expected to + // call LookupEntryMarkedForEviction after InitializeEntry. + // + // **InitializeEntry releases mu_ during the call to initialize_programs.** + CompiledSubgraph* InitializeEntry( + const std::string& key, + const std::function& + initialize_program, + const TpuCompilationCacheKey& subgraph_key) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(TpuCompilationCacheInterface::mu_) override; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_EXTERNAL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_factory.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_factory.h new file mode 100644 index 00000000..4710f916 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_factory.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
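A lifecycle sketch for the TpuCompilationCacheEntryUnloader above, assuming `cache` is a live TpuCompilationCacheInterface* and `uid` identifies a compiled subgraph; resource-manager registration and error handling are omitted:

void TrackForUnload(tensorflow::tpu::TpuCompilationCacheInterface* cache,
                    int64_t uid) {
  // The constructor takes its own reference on the cache.
  auto* unloader =
      new tensorflow::tpu::TpuCompilationCacheEntryUnloader(cache);
  unloader->AddCacheEntryUid(uid);  // Record each entry compiled in the session.
  // When the owning session closes, dropping the last reference runs the
  // destructor, which calls MarkEntryForEviction(uid) and releases the cache.
  unloader->Unref();
}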
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_FACTORY_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_FACTORY_H_ + +#include + +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +namespace tpu { + +std::function GetCompilationCacheCreateFn(); + +void SetCompilationCacheCreateFn( + std::function fn); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_FACTORY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h new file mode 100644 index 00000000..b7c6b7c3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h @@ -0,0 +1,236 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Copied from auto-generated gRPC code in order to enable using grpc_call.h +// for raw message handling. 
+#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_GRPC_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_GRPC_H_ + +#include +#include + +#include "grpcpp/generic/async_generic_service.h" +#include "grpcpp/impl/codegen/async_stream.h" +#include "grpcpp/impl/codegen/async_unary_call.h" +#include "grpcpp/impl/codegen/client_callback.h" +#include "grpcpp/impl/codegen/client_context.h" +#include "grpcpp/impl/codegen/completion_queue.h" +#include "grpcpp/impl/codegen/method_handler.h" +#include "grpcpp/impl/codegen/proto_utils.h" +#include "grpcpp/impl/codegen/rpc_method.h" +#include "grpcpp/impl/codegen/server_callback.h" +#include "grpcpp/impl/codegen/server_context.h" +#include "grpcpp/impl/codegen/service_type.h" +#include "grpcpp/impl/codegen/status.h" +#include "grpcpp/impl/codegen/stub_options.h" +#include "grpcpp/impl/codegen/sync_stream.h" + +#if defined(LIBTPU_ON_GCE) +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache.pb.h" +#else +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache.pb.h" // copybara" +#endif +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" + +namespace tensorflow { +namespace tpu { +namespace grpc { +class TpuCompilationCacheService final { + public: + using RequestType = ::tensorflow::tpu::GetTpuProgramRequest; +#if defined(LIBTPU_ON_GCE) + using ResponseType = ::tensorflow::tpu::GetTpuProgramResponseExternal; +#else + using ResponseType = ::tensorflow::tpu::GetTpuProgramResponse; +#endif + + // N.B. This must be synchronized with the method order in + // tpu_compilation_cache.proto. + enum class MethodId { kGetTpuProgram = 0 }; + + static constexpr char const* service_full_name() { +#if defined(LIBTPU_ON_GCE) + return "tensorflow.tpu.TpuCompilationCacheServiceExternal"; +#else + return "tensorflow.tpu.TpuCompilationCacheService"; +#endif + } + class StubInterface { + public: + virtual ~StubInterface() = default; + // This method requests the cached proto that the TPU execute op has + // been instructed to execute. 
+ virtual ::grpc::Status GetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ResponseType* response) = 0; + std::unique_ptr<::grpc::ClientAsyncResponseReaderInterface> + AsyncGetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) { + return std::unique_ptr< + ::grpc::ClientAsyncResponseReaderInterface>( + AsyncGetTpuProgramRaw(context, request, cq)); + } + std::unique_ptr<::grpc::ClientAsyncResponseReaderInterface> + PrepareAsyncGetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) { + return std::unique_ptr< + ::grpc::ClientAsyncResponseReaderInterface>( + PrepareAsyncGetTpuProgramRaw(context, request, cq)); + } + + private: + virtual ::grpc::ClientAsyncResponseReaderInterface* + AsyncGetTpuProgramRaw(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) = 0; + virtual ::grpc::ClientAsyncResponseReaderInterface* + PrepareAsyncGetTpuProgramRaw(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) = 0; + }; + class Stub final : public StubInterface { + public: + explicit Stub(const std::shared_ptr<::grpc::ChannelInterface>& channel); + ::grpc::Status GetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ResponseType* response) override; + std::unique_ptr<::grpc::ClientAsyncResponseReader> + AsyncGetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) { + return std::unique_ptr<::grpc::ClientAsyncResponseReader>( + AsyncGetTpuProgramRaw(context, request, cq)); + } + std::unique_ptr<::grpc::ClientAsyncResponseReader> + PrepareAsyncGetTpuProgram(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) { + return std::unique_ptr<::grpc::ClientAsyncResponseReader>( + PrepareAsyncGetTpuProgramRaw(context, request, cq)); + } + + private: + std::shared_ptr<::grpc::ChannelInterface> channel_; + ::grpc::ClientAsyncResponseReader* AsyncGetTpuProgramRaw( + ::grpc::ClientContext* context, const RequestType& request, + ::grpc::CompletionQueue* cq) override; + ::grpc::ClientAsyncResponseReader* + PrepareAsyncGetTpuProgramRaw(::grpc::ClientContext* context, + const RequestType& request, + ::grpc::CompletionQueue* cq) override; + const ::grpc::internal::RpcMethod rpcmethod_get_tpu_program_; + }; + static std::unique_ptr NewStub( + const std::shared_ptr<::grpc::ChannelInterface>& channel, + const ::grpc::StubOptions& options = ::grpc::StubOptions()); + + class Service : public ::grpc::Service { + public: + Service(); + ~Service() override; + // This method requests the cached proto that the TPU execute op has + // been instructed to execute. 
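For context, the hand-copied stub above is driven from the client side like any generated gRPC stub. A hedged sketch of a blocking lookup follows; the channel target is a placeholder, insecure credentials are for illustration only (a CreateChannelCredentials() helper is declared later in this diff), and the request fields are not filled in because they live in tpu_compilation_cache_common.proto rather than this header:

#include "grpcpp/create_channel.h"
#include "grpcpp/security/credentials.h"

void LookupProgramOverRpc() {
  using tensorflow::tpu::grpc::TpuCompilationCacheService;

  auto channel = ::grpc::CreateChannel("dns:///tpu-compile-master:8470",
                                       ::grpc::InsecureChannelCredentials());
  auto stub = TpuCompilationCacheService::NewStub(channel);

  ::grpc::ClientContext context;
  TpuCompilationCacheService::RequestType request;   // GetTpuProgramRequest
  TpuCompilationCacheService::ResponseType response;
  // Populate the request's key / fetch-target fields as defined in
  // tpu_compilation_cache_common.proto (not shown in this header).
  ::grpc::Status status = stub->GetTpuProgram(&context, request, &response);
  if (!status.ok()) {
    // Handle or propagate the RPC error.
  }
}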
+ virtual ::grpc::Status GetTpuProgram(::grpc::ServerContext* context, + const RequestType* request, + ResponseType* response); + }; + template + class WithAsyncMethod_GetTpuProgram : public BaseClass { + private: + void BaseClassMustBeDerivedFromService(const Service* service) {} + + public: + WithAsyncMethod_GetTpuProgram() { ::grpc::Service::MarkMethodAsync(0); } + ~WithAsyncMethod_GetTpuProgram() override { + BaseClassMustBeDerivedFromService(this); + } + // disable synchronous version of this method + ::grpc::Status GetTpuProgram(::grpc::ServerContext* context, + const RequestType* request, + ResponseType* response) override { + abort(); + return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, ""); + } + void RequestGetTpuProgram( + ::grpc::ServerContext* context, RequestType* request, + ::grpc::ServerAsyncResponseWriter* response, + ::grpc::CompletionQueue* new_call_cq, + ::grpc::ServerCompletionQueue* notification_cq, void* tag) { + ::grpc::Service::RequestAsyncUnary(0, context, request, response, + new_call_cq, notification_cq, tag); + } + + // Make RequestAsyncUnary accessible to grpc_call.h + using ::grpc::Service::RequestAsyncUnary; + }; + typedef WithAsyncMethod_GetTpuProgram AsyncService; + template + class WithGenericMethod_GetTpuProgram : public BaseClass { + private: + void BaseClassMustBeDerivedFromService(const Service* service) {} + + public: + WithGenericMethod_GetTpuProgram() { ::grpc::Service::MarkMethodGeneric(0); } + ~WithGenericMethod_GetTpuProgram() override { + BaseClassMustBeDerivedFromService(this); + } + // disable synchronous version of this method + ::grpc::Status GetTpuProgram(::grpc::ServerContext* context, + const RequestType* request, + ResponseType* response) override { + abort(); + return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, ""); + } + }; + template + class WithStreamedUnaryMethod_GetTpuProgram : public BaseClass { + private: + void BaseClassMustBeDerivedFromService(const Service* service) {} + + public: + WithStreamedUnaryMethod_GetTpuProgram() { + ::grpc::Service::MarkMethodStreamed( + 0, + new ::grpc::internal::StreamedUnaryHandler( + std::bind(&WithStreamedUnaryMethod_GetTpuProgram< + BaseClass>::StreamedGetTpuProgram, + this, std::placeholders::_1, std::placeholders::_2))); + } + ~WithStreamedUnaryMethod_GetTpuProgram() override { + BaseClassMustBeDerivedFromService(this); + } + // disable regular version of this method + ::grpc::Status GetTpuProgram(::grpc::ServerContext* context, + const RequestType* request, + ResponseType* response) override { + abort(); + return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, ""); + } + // replace default version of method with streamed unary + virtual ::grpc::Status StreamedGetTpuProgram( + ::grpc::ServerContext* context, + ::grpc::ServerUnaryStreamer* + server_unary_streamer) = 0; + }; + typedef WithStreamedUnaryMethod_GetTpuProgram StreamedUnaryService; + typedef Service SplitStreamedService; + typedef WithStreamedUnaryMethod_GetTpuProgram StreamedService; +}; +} // namespace grpc +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_GRPC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h new file mode 100644 index 00000000..eca894ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h @@ -0,0 +1,335 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_INTERFACE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "absl/strings/str_cat.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "xla/util.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/tpu/kernels/compiled_subgraph.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_metrics.h" +#include "tensorflow/core/tpu/kernels/trace_util.h" + +namespace tensorflow { +namespace tpu { + +// Base class that holds references to compiled protos so that the protos are +// not garbage-collected before being used by execute ops. Use +// TpuCompilationCache::MakePerStepRefHolder to create an instance of a concrete +// ref holder object. +class CompilationRefHolder : public ResourceBase { + public: + ~CompilationRefHolder() override = default; +}; + +// Wrapper for a cache entry returned by all the TpuCompilationCacheInterface +// `Lookup` methods, and ensures the underlying proto is not garbage-collected +// until the client discards the ptr. +class CompilationCacheEntryRef { + public: + CompilationCacheEntryRef(); + CompilationCacheEntryRef(TpuCompilationCacheInterface* parent, + CompiledSubgraph* entry, int index); + + virtual ~CompilationCacheEntryRef(); + + // Returns a TpuCompilationCacheEntry that should not be used beyond the + // lifetime of the CompilationCacheEntryRef. + virtual TpuCompilationCacheEntry get(); + + // Mutates this ref to point to the entry's subentry (for + // sharding/unsharding) or main entry (unchanged) as specified by + // fetch_target. The refcount is kept unchanged, since we only track the + // refcount of the main entry. The entry ref needs to point to the main + // entry before this call. + // + // If the requested subentry does not exist, the ref will point to a nullptr + // entry, and the original entry will be unref'ed. + virtual absl::Status ToSubEntryRef(CompilationCacheFetchTarget fetch_target); + + protected: + TpuCompilationCacheInterface* parent_; // Not owned. + // A reference to entry_ is acquired in the constructor and released via + // parent->DiscardEntryRefs in the destructor. 
+ CompiledSubgraph* entry_; + // The index of the program in entry_ that is returned by the get method. + int index_; +}; + +class TpuCompilationCacheInterface : public ResourceBase { + public: + explicit TpuCompilationCacheInterface(int64_t max_cache_size); + ~TpuCompilationCacheInterface() override; + + // Ensures there is an entry for key present in the cache. By the time + // CompileIfKeyAbsent returns there is guaranteed to be an entry in the cache + // for key, and that entry will remain valid at least until + // per_step_ref_holder is deleted. The first call to CompileIfKeyAbsent with a + // key that is not in the cache will evaluate compile_function to compute the + // value to use in the entry. Subsequent calls with the same key will block + // until compile_function completes. Other cache reads and inserts may proceed + // on other threads while compile_function is executing. If + // per_step_ref_holder is nullptr then the caller is responsible for calling + // Release(subgraph_key) to manually discard its reference to the compiled + // program, once the caller will not look up the compiled program again. + // + // compile_function should compile the subgraph represented by key and fill in + // one TPUExecutableProto per model-parallel core into its passed argument. It + // should return OK if and only if compilation succeeds. The executable proto + // vector will be discarded on non-OK status. + absl::Status CompileIfKeyAbsent( + const TpuCompilationCacheKey& subgraph_key, + const SessionMetadata* session_metadata, + CompilationRefHolder* per_step_ref_holder, int64_t* uid, + std::vector* proto_key, + std::vector* sharding_key, + std::vector* may_modify_variables, + absl::Span* hlo_metadatas, + const std::function& + compile_function); + + // Differences between MarkEntryForEviction and Release: + // There are two modes of managing cache entries: + // 1) LRU eviction + pinning; 2) manual. + // We use mode 1) if CompilationRefHolder is provided to CompileIfKeyAbsent. + // Otherwise it is manual mode (mainly used by XRT). + // MarkEntryForEviction should only be used in mode 1) to eagerly evict cache + // entries when callers know that they do not need them anymore. + // Release should only be used in mode 2) to explicitly remove an entry. + + // Mark the entry indexed by `subgraph_uid` for eviction. This should only be + // called if per_step_ref_holder was NOT nullptr in the corresponding call to + // CompileIfKeyAbsent(subgraph_key, ...). Otherwise, use Release(int64 + // subgraph_uid). + absl::Status MarkEntryForEviction(int64_t subgraph_uid); + + // Manually discards a reference to the compiled subgraph. This should only be + // called if per_step_ref_holder was nullptr in the corresponding call to + // CompileIfKeyAbsent(subgraph_key, ...). + absl::Status Release(int64_t subgraph_uid); + + // Looks up an executable corresponding to the model-parallel core index of + // the subgraph represented by key. On success a pointer to an EntryRef + // holding the program is returned in entry. + absl::Status Lookup(const std::string& proto_key, + std::unique_ptr* entry); + + // Looks up an executable corresponding to the model-parallel core index of + // the subgraph represented by uid. On success a pointer to an EntryRef + // holding the program is returned in entry. + absl::Status Lookup(int64_t uid, int proto_index, + std::unique_ptr* entry); + + // Looks up the subgraph represented by uid, and returns the vector of keys, + // one per core, corresponding to that subgraph. 
+ absl::Status GetKeysFromUid(int64_t uid, std::vector* keys); + + // Makes a reference holder for this cache, that can be stored in the per-step + // resource manager and will ensure that compiled entries persist until the + // end of a step. + CompilationRefHolder* MakePerStepRefHolder(); + + // Convenience method called by ~RefHolder without mu_ held. Calls + // DiscardEntryRef on every element of entries. + void DiscardEntryRefs(absl::Span entries); + + std::string DebugString() const override { return "TpuCompilationCacheBase"; } + + protected: + std::string ConstructCompilationCacheKey(const TpuCompilationCacheKey& key) { + if (!key.has_guaranteed_const) { + return key.prefix; + } + return absl::StrCat(key.prefix, "|", key.session_handle, "|", + key.guaranteed_const_fingerprint()); + } + + // Private implementation of the generic CompilationRefHolder that knows about + // CompiledSubgraph entries. + class RefHolder : public CompilationRefHolder { + public: + explicit RefHolder(TpuCompilationCacheInterface* parent); + ~RefHolder() override; + + // Adds entry to the list of entries that will be released when the + // RefHolder is destroyed. Each entry is released via a call to + // parent_->DiscardEntryRefs. + void AddRef(CompiledSubgraph* entry); + + std::string DebugString() const override; + + private: + TpuCompilationCacheInterface* parent_; // Not owned. + std::vector entries_; + }; + + // The bulk of implementation of CompileIfKeyAbsent() with the exception + // of unloading programs that corresponds to possibly removed cache + // entries. The split helps to manage locking since we prefer to perform + // unloading without holding extra locks. + absl::Status CompileIfKeyAbsentHelper( + const TpuCompilationCacheKey& subgraph_key, + const SessionMetadata* session_metadata, + CompilationRefHolder* per_step_ref_holder, int64_t* uid, + std::vector* proto_key, + std::vector* sharding_key, + std::vector* may_modify_variables, + std::vector* removed_entries, + absl::Span* hlo_metadatas, + const std::function& + compile_function); + + // This is called by the cache when entry is marked for eviction; by + // a RefHolder (via DiscardEntryRefs) when a step completes; and by + // an EntryRefImpl when it is destroyed. Releases one reference to entry + // if more than 1 remains. If only one reference is left, the entry is removed + // from cache_ and is returned to the caller; which must eventually call + // UnloadAndDestroy(). We do not call UnloadAndDestroy within DiscardEntryRef + // to avoid holding the lock during program unloading. + ABSL_MUST_USE_RESULT CompiledSubgraph* DiscardEntryRef( + CompiledSubgraph* entry) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Marks the oldest unmarked entry for eviction. Requires that there is at + // least one such entry. In case the evicted entry had only 1 reference it + // is removed from the cache and returned to the caller which must eventually + // call UnloadAndDestroy. + ABSL_MUST_USE_RESULT CompiledSubgraph* MarkOldestEntryForEviction() + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Updates datastructures to indicate that entry, which had been marked for + // eviction, has been looked up. This is called by CompileIfKeyAbsent when an + // entry is newly created, or an entry that has been marked for eviction but + // not yet evicted is looked up. + // + // First the entry is unmarked for eviction, i.e. 
the cache gains a reference + // to entry, entry's last_use field is set to be the most recent value of + // use_counter_ and entries_by_last_use_ is updated accordingly. + // + // Next, the size of the cache is examined to see if any other entries need to + // be marked for eviction now that entry has been unmarked. While the total + // size of unmarked cached entries is greater than max_cache_size_, entries + // are marked for eviction in LRU order. The most recently used entry is never + // marked for eviction, so an entry larger than the max cache size will remain + // in the cache until it is replaced by something else. In case some entries + // actually were removed from the cache, they are a returned to the caller via + // removed_entries. The caller must eventually delete them by calling + // UnloadAndDestroy. + void LookupEntryMarkedForEviction( + CompiledSubgraph* entry, std::vector* removed_entries) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Removes the entry with given key from cache. + size_t RemoveEntry(const std::string& key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Inserts the given key and entry to cache. + void InsertEntry(const std::string& key, CompiledSubgraph* entry) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Returns the cache key matching given subgraph_key. + std::string FindCacheKey(const TpuCompilationCacheKey& subgraph_key) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Creates a new entry by running initialize_programs and places it in the + // cache to be looked up by key. The new entry is in the 'marked for eviction' + // state (not present in entries_by_last_use_) and the caller is expected to + // call LookupEntryMarkedForEviction after InitializeEntry. + // + // **InitializeEntry releases mu_ during the call to initialize_programs.** + virtual CompiledSubgraph* InitializeEntry( + const std::string& key, + const std::function& + initialize_programs, + const TpuCompilationCacheKey& subgraph_key) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) = 0; + + // Unloads the program associated with the entry from all local devices + // and deletes the entry itself. It is assumed no one else has a reference + // to it and all related keys had already been removed from the cache. + // The call can perform device IO so no locks should be held while calling it. + void UnloadAndDestroy(CompiledSubgraph* entry) ABSL_LOCKS_EXCLUDED(mu_); + + // The maximum size of entries that are stored in the cache before entries are + // marked for eviction. + const int64_t max_cache_size_; + // Mutex to protect access to shared resources under multi-threading + // environment. + absl::Mutex mu_; + // The total size of entries that are stored and not marked for eviction. + int64_t cache_size_ ABSL_GUARDED_BY(mu_) = 0; + // The total size of entries that are marked for eviction. + int64_t marked_for_eviction_size_ ABSL_GUARDED_BY(mu_) = 0; + // The value to assign to the last_use field of the next entry that is looked + // up. + int64_t use_counter_ ABSL_GUARDED_BY(mu_) = 0; + // session_key_map_ and fingerprint_key_map_ are used for looking up the + // cache_ key matching a given subgraph key. When doing a lookup, check + // session_key_map_ first to avoid unnecessay fingerprint computation. + // Map from key prefix + session_handle to a cache_ key. + absl::node_hash_map session_key_map_ + ABSL_GUARDED_BY(mu_); + // Map from key prefix + fingerprint to a cache_ key. 
+ absl::node_hash_map fingerprint_key_map_ + ABSL_GUARDED_BY(mu_); + // All the subgraph entries that can be looked up in the cache. An entry is + // marked for eviction iff it is present in cache_ and not in + // entries_by_last_use_. + std::unordered_map cache_ + ABSL_GUARDED_BY(mu_); + // All the subgraph entries that can be looked up in the cache, indexed by + // uid. + absl::flat_hash_map entries_by_uid_ + ABSL_GUARDED_BY(mu_); + // All the protos that can be looked up in the cache, indexed by proto + // key. The value of the map is a subgraph and the index of the proto compiled + // for that subgraph. + std::unordered_map> + entries_by_proto_key_ ABSL_GUARDED_BY(mu_); + // Map from last_use to entry, used to mark entries for eviction in LRU + // order. If an entry's last_use counter is not present as a key in + // entries_by_last_use_ then the entry has been marked for eviction. + std::map entries_by_last_use_ + ABSL_GUARDED_BY(mu_); + + TpuCompilationMetrics tpu_compilation_metrics_; + + private: + TpuCompilationCacheInterface(const TpuCompilationCacheInterface&) = delete; + TpuCompilationCacheInterface& operator=(const TpuCompilationCacheInterface&) = + delete; +}; +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h new file mode 100644 index 00000000..59086f84 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_KEY_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_KEY_H_ + +#include +#include +#include + +#include "absl/strings/str_cat.h" + +namespace tensorflow { +namespace tpu { + +struct TpuCompilationCacheKey { + // Prefix of the key. + std::string prefix; + + // A boolean flag to specify if `guaranteed_const` is used. Guarantee const is + // normally used in TPU inference to avoid re-copying unchanged variables onto + // the TPU device. It promises the value is identical for every execution in + // the same session even if the actual value changes in later executions. + bool has_guaranteed_const = false; + + // Unique session identifier. It is set when `has_guaranteed_const` is true. + std::string session_handle; + + // Unique session identifier for TPU compilation; it should be a 64 bit + // positive integer, which can uniquely distinguish a live session. + // TPU compiler may use this information to choose dynamically provided + // compilation options without hurting reproducibility for debugging. + uint64_t session_id; + + // Fingerprint of `guaranteed_const` value. 
It is set when the value of the + // `has_guaranteed_const` is true. Produce the value when necessary. + std::function guaranteed_const_fingerprint; + + // A more verbose key for debugging purpose. + std::string debug_string; + + // Constructs the TPU compilation cache key by concatenating the `prefix`, + // `session_handle` and `guaranteed_const_fingerprint`. + std::string ToString() const { + if (!has_guaranteed_const) { + return prefix; + } + return absl::StrCat(prefix, "|", session_handle, "|", + guaranteed_const_fingerprint()); + } + + explicit TpuCompilationCacheKey() = default; + explicit TpuCompilationCacheKey(const std::string& p) : prefix(p) {} +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_KEY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_local_lookup.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_local_lookup.h new file mode 100644 index 00000000..40b4f862 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_local_lookup.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOCAL_LOOKUP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOCAL_LOOKUP_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h" + +namespace tensorflow { +namespace tpu { + +// Class for looking up TPU programs when the execute and compile Op are in the +// same address space. The proto is simply looked up in the compilation cache, +// without any serialization taking place. +class TpuCompilationCacheLocalLookup : public TpuCompilationCacheLookup { + public: + explicit TpuCompilationCacheLocalLookup(TpuCompilationCacheInterface* cache); + ~TpuCompilationCacheLocalLookup() override; + + absl::Status Lookup(const std::string& proto_key, + std::unique_ptr* entry, + CompilationCacheFetchTarget fetch_target) override; + + absl::Status Lookup(int64_t uid, int proto_index, + std::unique_ptr* entry, + CompilationCacheFetchTarget fetch_target) override; + + std::string DebugString() const override; + + private: + // The subgraph compilation cache, in the same process address space where the + // lookups are happening. 
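Putting the TpuCompilationCacheKey fields above together: with guaranteed constants, the effective cache key is prefix|session_handle|fingerprint, and the fingerprint is only computed when ToString() needs it. A sketch, assuming the elided std::function member returns the fingerprint as a string; the values are placeholders:

#include <string>

std::string BuildExampleKey() {
  tensorflow::tpu::TpuCompilationCacheKey key("subgraph_prefix");
  key.has_guaranteed_const = true;
  key.session_handle = "session-42";
  key.guaranteed_const_fingerprint = [] {
    // Real code would hash the guaranteed-const inputs; fixed for illustration.
    return std::string("deadbeef");
  };
  return key.ToString();  // "subgraph_prefix|session-42|deadbeef"
}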
+ TpuCompilationCacheInterface* cache_; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOCAL_LOOKUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h new file mode 100644 index 00000000..0cdfe64e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOOKUP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOOKUP_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/resource_base.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +namespace tpu { + +// TODO(b/162241759): consider merging TpuCompilationCacheLookup and +// TpuCompilationCacheInterface. +// Base class allowing Execute Ops to look up TPU programs. Different subclasses +// are used when the execute Op is in the same address space as the compile Op, +// and when they need to communicate over RPC. +class TpuCompilationCacheLookup : public ResourceBase { + public: + ~TpuCompilationCacheLookup() override = default; + + // Looks up an executable corresponding to the model-parallel core index of + // the subgraph represented by key. On success a wrapper for the proto is + // returned in program. The wrapper is guaranteed to be valid only during the + // execution of the Op requesting the proto. + // + // Only one of the main, sharding, unsharding entries is fetched, as specified + // in fetch_target. + // + // If the compilation does not create sharding/unsharding programs, but the + // fetch_target requests one of them, then after this call + // (*entry)->get().get_executable() will return nullptr. + virtual absl::Status Lookup(const std::string& proto_key, + std::unique_ptr* entry, + CompilationCacheFetchTarget fetch_target) = 0; + + virtual absl::Status Lookup( + const std::string& proto_key, + std::unique_ptr* entry) { + return Lookup(proto_key, std::move(entry), + CompilationCacheFetchTarget::MAIN); + } + + // Looks up an executable corresponding to the model-parallel core index of + // the subgraph represented by uid. On success a wrapper for the proto is + // returned in program. The wrapper is guaranteed to be valid only during the + // execution of the Op requesting the proto. 
+ virtual absl::Status Lookup(int64_t uid, int proto_index, + std::unique_ptr* entry, + CompilationCacheFetchTarget fetch_target) = 0; + + virtual absl::Status Lookup( + int64_t uid, int proto_index, + std::unique_ptr* entry) { + return Lookup(uid, proto_index, std::move(entry), + CompilationCacheFetchTarget::MAIN); + } +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_LOOKUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h new file mode 100644 index 00000000..e8666ec6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_LOOKUP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_LOOKUP_H_ + +#include +#include +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +// Class for looking up and caching TPU program via RPC. +class TpuCompilationCacheRpcLookup : public TpuCompilationCacheLookup { + public: + using StubType = tpu::grpc::TpuCompilationCacheService::Stub; + + TpuCompilationCacheRpcLookup(const string& server_address, + int64_t max_cache_size); + ~TpuCompilationCacheRpcLookup() override = default; + + absl::Status Lookup(const string& proto_key, + std::unique_ptr* entry, + tpu::CompilationCacheFetchTarget fetch_target) override; + + absl::Status Lookup(int64_t uid, int proto_index, + std::unique_ptr* entry, + tpu::CompilationCacheFetchTarget fetch_target) override; + + string DebugString() const override; + + private: + // Helper method to make the RPC request to the central cache. + absl::Status RemoteLookupLocked(const string& local_proto_key, + const tpu::GetTpuProgramRequest& request, + std::shared_ptr* cache_entry) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Helper method to adjust datastructures after a cache lookup. + // We use `removed_entries` so that actual CacheEntry destruction happens + // outside the lock. + void PostLookupLocked( + std::shared_ptr* cache_entry, + std::unique_ptr* entry, + std::vector>* removed_entries) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // The maximum size of entries that are stored in the cache before entries are + // evicted. 
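A usage sketch for the lookup interface above, using the local flavor (the RPC-backed lookup in this diff exposes the same surface). It assumes the elided template argument of the unique_ptr out-parameter is CompilationCacheEntryRef, as the interface header earlier in the diff suggests, and that `cache` and `proto_key` come from a prior compilation:

#include <memory>
#include <string>

void LookupExample(tensorflow::tpu::TpuCompilationCacheInterface* cache,
                   const std::string& proto_key) {
  using tensorflow::tpu::CompilationCacheEntryRef;
  using tensorflow::tpu::CompilationCacheFetchTarget;

  tensorflow::tpu::TpuCompilationCacheLookup* lookup =
      new tensorflow::tpu::TpuCompilationCacheLocalLookup(cache);

  std::unique_ptr<CompilationCacheEntryRef> entry;
  absl::Status s =
      lookup->Lookup(proto_key, &entry, CompilationCacheFetchTarget::MAIN);
  if (s.ok()) {
    // The wrapper is only valid while `entry` (and the requesting op) lives.
    tensorflow::tpu::TpuCompilationCacheEntry cache_entry = entry->get();
    // cache_entry.tpu_program_group(), cache_entry.core_index(), ...
  }
  lookup->Unref();  // Drop the initial reference taken at construction.
}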
+ const int64_t max_cache_size_; + + std::unique_ptr stub_; + + // Protect concurrent access to member variables below. + mutable absl::Mutex mu_; + + // The total size of entries in the cache. + int64_t cache_size_ ABSL_GUARDED_BY(mu_) = 0; + // The value to assign to the last_use field of the next entry that is looked + // up. + int64_t use_counter_ ABSL_GUARDED_BY(mu_) = 0; + // The entries that can be looked up in the cache. An entry is deleted from + // the cache as soon as it is evicted, but the underlying shared_ptr won't be + // freed until any wrappers holding it go out of scope. + std::unordered_map> cache_ + ABSL_GUARDED_BY(mu_); + // Map from last_use to entry, used to evict entries in LRU order. + std::map entries_by_last_use_ ABSL_GUARDED_BY(mu_); +}; +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_LOOKUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.h new file mode 100644 index 00000000..7a2f25f0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.h @@ -0,0 +1,99 @@ +#include "absl/status/statusor.h" +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_SUPPORT_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_SUPPORT_H_ + +#include +#include +#include +#include +#include + +#include "grpcpp/security/credentials.h" +#include "grpcpp/support/slice.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +// A cache entry for remote TPU compilation. +struct CacheEntry { + CacheEntry() : size(0), last_use(-1) {} + virtual ~CacheEntry() { + if (tpu_program_group != nullptr) { + tpu_program_group->UnloadAndDestroyPrograms(); + } + } + std::unique_ptr tpu_program_group; + std::string key; + int64_t size; + + // An integer-based monotonically increasing counter used by the TPU + // compilation cache to sort and evict the least recently used entry when the + // cache size exceeded the maximum size limit. The value is initialized to + // `-1` as an initial value. + int64_t last_use; +}; + +// Implementation of `CompilationCacheEntryRef` that holds a shared_ptr to the +// local cache entry until the wrapper is destroyed. 
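The eviction bookkeeping above (a use counter, a running size total, a key-indexed map, and a second map ordered by last use) is a classic LRU-by-counter scheme, and the CacheWrapper declared next is what keeps an evicted entry alive until its last user finishes. A self-contained sketch of that scheme, with hypothetical names and none of the locking or RPC details, looks like this:

    #include <cstdint>
    #include <map>
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Entry {
      std::string key;
      int64_t size = 0;
      int64_t last_use = -1;
    };

    struct LruCache {
      int64_t max_size;
      int64_t current_size = 0;
      int64_t use_counter = 0;
      std::unordered_map<std::string, std::shared_ptr<Entry>> by_key;
      std::map<int64_t, Entry*> by_last_use;

      // Inserts `e`, marks it most recently used, and evicts in LRU order
      // while over budget. Evicted shared_ptrs are returned so the caller can
      // destroy them outside any lock, mirroring the `removed_entries`
      // pattern in the class above.
      std::vector<std::shared_ptr<Entry>> Insert(std::shared_ptr<Entry> e) {
        e->last_use = use_counter++;
        by_last_use[e->last_use] = e.get();
        current_size += e->size;
        by_key[e->key] = e;
        std::vector<std::shared_ptr<Entry>> removed;
        while (current_size > max_size && by_last_use.size() > 1) {
          Entry* victim = by_last_use.begin()->second;
          if (victim == e.get()) break;  // never evict the entry just added
          by_last_use.erase(by_last_use.begin());
          current_size -= victim->size;
          removed.push_back(by_key[victim->key]);
          by_key.erase(victim->key);
        }
        return removed;
      }
    };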
+class CacheWrapper : public CompilationCacheEntryRef { + public: + explicit CacheWrapper(std::shared_ptr entry) + : cache_entry_(std::move(entry)) {} + ~CacheWrapper() override = default; + + TpuCompilationCacheEntry get() override { + if (cache_entry_->size == 0) { + // Create an empty entry if the size is 0. This corresponds to + // non-existing sharding/unsharding entries. + return TpuCompilationCacheEntry(); + } + return TpuCompilationCacheEntry(cache_entry_->tpu_program_group.get(), + /*core_index=*/0); + } + + absl::Status ToSubEntryRef( + CompilationCacheFetchTarget fetch_target) override { + LOG(FATAL) << "Not implemented by designed."; + } + + private: + std::shared_ptr cache_entry_; +}; + +// Creates gRPC channel credentials for the current runtime env. +std::shared_ptr<::grpc::ChannelCredentials> CreateChannelCredentials(); + +// Fills an uinitialized `CacheEntry` from `GetTpuProgramResponse` proto. The +// `cache_entry` will be instantiated by the function. +template +absl::Status DeserializeRpcResponseToCacheEntry( + absl::string_view local_proto_key, ResponseType* response, + std::shared_ptr* cache_entry); + +// Serializes `TpuCompilationCacheEntry` to gRPC bufer slices. +absl::StatusOr> SerializeCacheEntryToBufferSlices( + const TpuCompilationCacheEntry& cache_entry); +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_RPC_SUPPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_service.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_service.h new file mode 100644 index 00000000..6dd644d3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_cache_service.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_SERVICE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_SERVICE_H_ + +#include +#include + +#include "grpcpp/server_builder.h" +#include "xla/tsl/distributed_runtime/rpc/grpc_call.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +// gRPC service for handling CompilationCache requests. +// To avoid OOMs during execution, this service using the asynchronous raw gRPC +// interface to serialize cache results directly to gRPC byte buffers. This +// allows us to control serialization concurrency and avoids making an extra +// copy of the program cache for each worker. 
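The comment above describes the standard asynchronous (completion-queue) gRPC serving pattern that the Start()/HandleRPCsLoop() members below implement. A generic, heavily simplified sketch of that polling loop follows; PendingCall here is a hypothetical stand-in for the tsl::Call tag type used by the real service, and this is not code from this change.

    #include <atomic>
    #include "grpcpp/grpcpp.h"

    // Hypothetical per-request state machine; each pending RPC registers its
    // own address with the completion queue as the tag.
    struct PendingCall {
      // Advance this call's request -> serialize -> finish state machine.
      // A real server would write into a grpc::ByteBuffer and call Finish();
      // here the call is simply retired.
      void Proceed(bool /*ok*/) { delete this; }
    };

    void HandleRpcsLoop(grpc::ServerCompletionQueue* cq,
                        std::atomic<bool>* running) {
      void* tag = nullptr;
      bool ok = false;
      // Next() blocks until some RPC event completes; the tag identifies the
      // pending call it belongs to.
      while (cq->Next(&tag, &ok)) {
        auto* call = static_cast<PendingCall*>(tag);
        if (running->load()) {
          call->Proceed(ok);
        } else {
          delete call;  // draining during shutdown
        }
      }
    }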
+class TpuCompilationCacheService { + public: + using ServiceType = ::tensorflow::tpu::grpc::TpuCompilationCacheService; + using AsyncService = ServiceType::AsyncService; + + TpuCompilationCacheService(::grpc::ServerBuilder* server_builder, + tpu::TpuCompilationCacheInterface* cache); + ~TpuCompilationCacheService(); + + void Start(); + bool Shutdown(int timeout_sec); + void SetMemoryQuota(size_t max_bytes); + + private: + void HandleRPCsLoop(); + + using GetTpuProgramCall = + tsl::Call; + + // Schedule the cache fetch into the serving thread pool. + void HandleGetTpuProgram(GetTpuProgramCall* call); + + // Performs the actual cache fetch and serialization. + void GetTpuProgram(GetTpuProgramCall* call); + + std::atomic running_; + tpu::TpuCompilationCacheInterface* cache_; + ::grpc::ServerBuilder* server_builder_; + std::unique_ptr<::grpc::Server> server_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_; + std::unique_ptr thread_pool_; + std::unique_ptr polling_thread_; + AsyncService service_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_CACHE_SERVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_metrics.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_metrics.h new file mode 100644 index 00000000..f201fd27 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compilation_metrics.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_METRICS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_METRICS_H_ + +#include + +#include "absl/strings/string_view.h" + +namespace tensorflow { +namespace tpu { + +// Tracks Tpu compilation and cache metrics. +class TpuCompilationMetrics { + public: + // Increments the number of cache lookup count. + static void IncrementCacheLookupCount(bool is_cache_hit, + absl::string_view session_name); + + // Sets the total count of cache entries. + static void SetCacheEntryCount(int64_t count); + + // Increments number of compilation. + static void IncrementCompilationCount(absl::string_view session_name); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILATION_METRICS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op.h new file mode 100644 index 00000000..4b8956f9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op.h @@ -0,0 +1,79 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_common.h" + +namespace tensorflow { +namespace tpu { + +// The TPUCompile operator compiles a Tensorflow function into a +// TPU executable to be run by TPUExecute. +// +class TpuCompileOp : public OpKernel { + public: + explicit TpuCompileOp(OpKernelConstruction* ctx); + + TpuCompileOp(const TpuCompileOp&) = delete; + TpuCompileOp& operator=(const TpuCompileOp&) = delete; + + ~TpuCompileOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + private: + std::unique_ptr impl_; +}; + +// The TPUCompile operator compiles a MLIR module into a +// TPU executable to be run by TPUExecute. +// +class TpuCompileMlirOp : public OpKernel { + public: + explicit TpuCompileMlirOp(OpKernelConstruction* ctx); + + TpuCompileMlirOp(const TpuCompileMlirOp&) = delete; + TpuCompileMlirOp& operator=(const TpuCompileMlirOp&) = delete; + + ~TpuCompileMlirOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + private: + std::unique_ptr impl_; +}; + +class TpuCompileSucceededAssertOp : public OpKernel { + public: + explicit TpuCompileSucceededAssertOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + TpuCompileSucceededAssertOp(const TpuCompileSucceededAssertOp&) = delete; + TpuCompileSucceededAssertOp& operator=(const TpuCompileSucceededAssertOp&) = + delete; + + ~TpuCompileSucceededAssertOp() override = default; + + void Compute(OpKernelContext* ctx) override; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_common.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_common.h new file mode 100644 index 00000000..4c4dfdd0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_common.h @@ -0,0 +1,207 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_COMMON_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_COMMON_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "xla/stream_executor/tpu/c_api_decl.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" +#include "tsl/platform/env.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep + +namespace tensorflow { +namespace tpu { + +// Forward declaration, defined below. +class TpuCompileOpKernelCommon; + +// A base factory class for creating a `TpuCompileOpKernelImpl` variant. +// By design, the actual factory can only be set once. +class CompileOpImplFactory { + public: + virtual ~CompileOpImplFactory() = default; + + virtual absl::StatusOr> + CreateNonMlirImpl(OpKernelConstruction* ctx) = 0; + + virtual absl::StatusOr> + CreateMlirImpl(OpKernelConstruction* ctx) = 0; + + static CompileOpImplFactory* Get(); + static void Register(CompileOpImplFactory* factory); + + private: + static CompileOpImplFactory* factory_; +}; + +// Abstract base class for TpuCompileOpKernel implementation. +class TpuCompileOpKernelCommon { + public: + TpuCompileOpKernelCommon(const std::string& mlir_module, + const tpu::TPUCompileMetadataProto metadata, + int num_computations, bool return_hlo_protos, + bool unload_cache_on_session_close) + : metadata_(metadata), + use_mlir_(true), + mlir_module_(mlir_module), + num_computations_(num_computations), + return_hlo_protos_(return_hlo_protos), + unload_cache_entry_on_session_close_(unload_cache_on_session_close), + persistent_cache_(nullptr) { + mlir_module_fingerprint_ = tensorflow::Fingerprint64(mlir_module_); + } + + TpuCompileOpKernelCommon( + const NameAttrList& function, const tpu::TPUCompileMetadataProto metadata, + int num_computations, bool return_hlo_protos, + bool unload_cache_on_session_close, + std::unique_ptr persistent_cache) + : metadata_(metadata), + use_mlir_(false), + function_(function), + num_computations_(num_computations), + return_hlo_protos_(return_hlo_protos), + unload_cache_entry_on_session_close_(unload_cache_on_session_close), + persistent_cache_(std::move(persistent_cache)) {} + + TpuCompileOpKernelCommon(const TpuCompileOpKernelCommon&) = delete; + TpuCompileOpKernelCommon& operator=(const TpuCompileOpKernelCommon&) = delete; + + virtual ~TpuCompileOpKernelCommon() = default; + + void Compute(OpKernelContext* ctx); + + // Lowers Mlir or TF Function computation into HLO IR and using XLA compiler + // compiles into TPU programs ready for execution. 
+ virtual absl::Status Compile( + const std::variant& computation, + const XLA_TpuMeshState* mesh_state, + const std::vector& arg_shapes, + const TpuCompilationCacheKey* key, + TpuProgramGroupInterface* tpu_program_group) = 0; + + // Performs shape inference on `computation`, filling shape_info with operator + // shapes. The shapes of the _Arg nodes are taken from `arg_shapes`. + static absl::Status RunShapeInferenceOnComputation( + const tpu::TPUCompileMetadataProto& metadata, + const std::vector& arg_shapes, Graph* graph, + FunctionLibraryRuntime* flr, GraphShapeInfo* shape_info); + + protected: + absl::Status ComputeInternal(OpKernelContext* ctx); + + // Compile TPU program locally and populate the host compilation cache. + absl::Status CompileLocallyAndFillHostCache( + FunctionLibraryRuntime* flib_runtime, + const SessionMetadata* session_metadata, + const TpuMeshStateInterface* mesh_state, + const std::vector& dynamic_shapes, + const OpInputList& guaranteed_constants, + const tpu::TpuCompilationCacheKey& key, + TpuProgramGroupInterface* tpu_program_group); + + absl::Status CompileLocallyAndFillHostCacheInternal( + FunctionLibraryRuntime* flib_runtime, + const SessionMetadata* session_metadata, + const TpuMeshStateInterface* mesh_state, + const std::vector& dynamic_shapes, + const OpInputList& guaranteed_constants, + const tpu::TpuCompilationCacheKey& key, + TpuProgramGroupInterface* tpu_program_group); + + // Lookup from persistent compilation cache and populate both host cache and + // persistent cache. + virtual absl::Status LookupPersistentCompilationCacheAndFillCaches( + FunctionLibraryRuntime* flib_runtime, + const SessionMetadata* session_metadata, + const TpuMeshStateInterface* mesh_state, + const std::vector& dynamic_shapes, + const OpInputList& guaranteed_constants, + TpuPersistentCompilationCacheInterface* persistent_cache, + const tpu::TpuCompilationCacheKey& key, + TpuProgramGroupInterface* tpu_program_group) { + LOG(FATAL) << "Lookup from a persistent cache is NOT supported."; + } + + // Sleeps for `kSleepSeconds` seconds to give time for TPUCompileOp to finish + // before terminating peacefully. + static void ExitCountdown(tsl::Env* env, + std::shared_ptr> done); + + // Converts the `dynamic_shapes` arguments to the compile operator into + // TensorShapes. + static absl::Status GetDynamicShapes(OpKernelContext* ctx, + std::vector* shapes); + + tpu::TPUCompileMetadataProto metadata_; + + // Whether to compile given MLIR module in `mlir_module` instead of + // TensorFlow function referenced in `function_`. + bool use_mlir_; + + // Function containing the computation to compile. + NameAttrList function_; + + // A serialized MLIR ModuleOp. + std::string mlir_module_; + // Fingerprint of the MLIR Module created once on construction to avoid paying + // the cost on each invocation. + uint64 mlir_module_fingerprint_ = 0; + + // Number of different programs to compile. This maps to number of cores in + // each replica. + int num_computations_; + + // A flag to populate HLO protos field in CompilationResultProto. The HLO + // metadata could be large so default to not populating it unless explicitly + // requested. + bool return_hlo_protos_; + + // If enabled, DirectSession::Close will unload cache entries created during + // the lifetime of the session. + bool unload_cache_entry_on_session_close_; + + // Persistent cache for compiled TPU program for inference. 
+ std::unique_ptr persistent_cache_; + + absl::Status RegisterXLAFingerprints( + const std::vector& arg_shapes, + TpuProgramGroupInterface* tpu_program_group, uint64 fingerprint); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_COMMON_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_impl.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_impl.h new file mode 100644 index 00000000..1f5fdb52 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_impl.h @@ -0,0 +1,68 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_IMPL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_IMPL_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_common.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +// Base class for TpuCompileOp and TpuCompileMlirOp. +// Depends on whether it is given a computation in the form of serialized MLIR +// module or a Tensorflow function, TpuCompileOpKernelImpl converts computation +// into XLA HLO and then into a TPU execuable binary. 
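The Compile() override declared below takes a std::variant whose template arguments are elided in this header copy; judging by tpu_compile_op_support.h later in this diff, they are MlirToHloArgs and FunctionToHloArgs. A minimal, self-contained illustration of dispatching on such a two-alternative variant, using stand-in types rather than the real ones:

    #include <string>
    #include <variant>

    struct MlirArgs { std::string serialized_module; };
    struct FunctionArgs { std::string function_name; };

    // Returns a label for whichever lowering path the computation selects.
    const char* LoweringPath(const std::variant<MlirArgs, FunctionArgs>& c) {
      return std::holds_alternative<MlirArgs>(c) ? "mlir-to-hlo"
                                                 : "function-to-hlo";
    }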
+class TpuCompileOpKernelImpl : public TpuCompileOpKernelCommon { + public: + TpuCompileOpKernelImpl(const std::string& mlir_module, + const tpu::TPUCompileMetadataProto& metadata, + int num_computations, bool return_hlo_protos, + bool unload_cache_on_session_close) + : TpuCompileOpKernelCommon(mlir_module, metadata, num_computations, + return_hlo_protos, + unload_cache_on_session_close) {} + + TpuCompileOpKernelImpl(const NameAttrList& function, + const tpu::TPUCompileMetadataProto& metadata, + int num_computations, bool return_hlo_protos, + bool unload_cache_on_session_close) + : TpuCompileOpKernelCommon( + function, metadata, num_computations, return_hlo_protos, + unload_cache_on_session_close, /*persistent_cache=*/nullptr) {} + + absl::Status Compile( + const std::variant& computation, + const XLA_TpuMeshState* mesh_state, + const std::vector& arg_shapes, + const TpuCompilationCacheKey* key, + TpuProgramGroupInterface* tpu_program_group) override; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_options.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_options.h new file mode 100644 index 00000000..b81fe4a3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_options.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_OPTIONS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_OPTIONS_H_ + +#include + +namespace tensorflow { +namespace internal { + +// Setter and getter that determine how TPUCompile responds to cancelled +// compilation. By default this is true, meaning cancelled compilation will +// abort the process, since that's the only mechanism we have available. +// +// Setting this to false allows the process to remain alive, and should only be +// used in tests. +void SetTpuCompilationCancellationTerminatesProcess(bool b); +bool TpuCompilationCancellationTerminatesProcess(); + +// Setter and getter that determine whether TPU compilation failure will cause +// chips to close. By default this is true, it is suitable for training. For +// inference, we never want servers to die and thus chips will keep alive. +// See b/109873767. 
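These option pairs are plain process-wide toggles; their definitions live in the corresponding .cc files, which this header-only import does not carry. As a guess at the shape of that unseen implementation (an assumption, not its contents), such a setter/getter pair is commonly backed by a single atomic:

    #include <atomic>

    namespace {
    // Defaults to true, matching the documented behavior above.
    std::atomic<bool> compilation_failure_closes_chips{true};
    }  // namespace

    void SetTpuCompilationFailureClosesChips(bool value) {
      compilation_failure_closes_chips.store(value);
    }

    bool TpuCompilationFailureClosesChips() {
      return compilation_failure_closes_chips.load();
    }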
+void SetTpuCompilationFailureClosesChips(bool value); +bool TpuCompilationFailureClosesChips(); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_support.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_support.h new file mode 100644 index 00000000..e7ec4ac6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_compile_op_support.h @@ -0,0 +1,168 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_SUPPORT_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_SUPPORT_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/service/computation_placer.h" +#include "xla/service/hlo_module_config.h" +#include "xla/shape.h" +#include "xla/shape_tree.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile.pb.h" + +namespace tensorflow { +namespace tpu { + +// List of parameters for lowering Mlir to HLO IR. +// If mlir_module_op is set, it will be used instead of mlir_module. +struct MlirToHloArgs { + absl::string_view mlir_module; + ConfigProto::Experimental::MlirBridgeRollout rollout_state = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED; + std::optional mlir_module_op; +}; + +// Variant of guaranteed constant tensors types. +using GuaranteedConsts = std::variant, + const OpInputList* const>; + +// List of parameters for lowering function library definition to HLO IR. +struct FunctionToHloArgs { + const NameAttrList* const function; + const FunctionLibraryDefinition* flib_def; + int graph_def_version; + GuaranteedConsts guaranteed_constants; +}; + +// Persistent cache for compiled TPU program and the related compiler metadata +// intended for TPU inference. +// TODO(henrytan): there is an opportunity to consolidate the interface with the +// `TpuCompilationCacheInterface` once `TpuPersistentCompilationCache` is +// converted into a ref count based class. 
+class TpuPersistentCompilationCacheInterface { + public: + virtual ~TpuPersistentCompilationCacheInterface() = default; + + // Returns the location where cache entries are stored. + virtual std::string cache_location() const = 0; +}; + +// Describes the position of an argument or return value after the computation +// has been partitioned into cores. +struct ShardingAndIndex { + // Sharding across cores. + ::xla::OpSharding sharding; + // Argument/return value number. If sharding is single-core, `indices` has a + // single element; otherwise, it has num_cores elements. + std::vector indices; +}; + +// TODO(b/158279168): Dedup with internal version. +// Return the per-device shape for a `shape` with a given `sharding`. +xla::Shape GetPerDeviceShape(const xla::Shape& shape, + const xla::HloSharding& sharding, int64_t device); + +absl::StatusOr> CreateModuleConfig( + const xla::ProgramShape& program_shape, + absl::Span argument_shapes, + std::optional result_layout, + std::optional device_assignment, + int replica_count, int num_partitions, + const xla::DebugOptions* debug_options, const int* seed, + const int* launch_id, const bool* alias_passthrough_params, + const xla::FusionConfigCollection* fusion_config_collection, + const std::vector>* fusion_config); + +absl::StatusOr> CreateModuleConfig( + const xla::ProgramShape& program_shape, + absl::Span argument_shapes, + std::optional result_layout, + std::optional device_assignment, + int replica_count, int num_partitions, + const xla::DebugOptions* debug_options); + +xla::ShapeTree GetSubtree( + const xla::ShapeTree& tuple_shape_tree, + int element_index); + +xla::Shape GetPerDeviceShape(const xla::Shape& shape, + const xla::HloSharding& sharding, int64_t device); + +absl::Status AddVariableUpdatesToCores( + const TPUCompileMetadataProto& metadata, + const XlaCompiler::CompilationResult& compilation_result, + const std::vector& arg_core_mapping, + std::vector* may_modify_variables, + std::vector>* per_core_output_shapes, + std::vector>>* per_core_variable_indices); + +absl::Status ComputeOutputShapesForEachCore( + const tpu::TPUCompileMetadataProto& metadata, + const XlaCompiler::CompilationResult& compilation_result, + std::vector>* per_core_output_shapes); + +absl::Status CreateHloModules( + const TPUCompileMetadataProto& metadata, + const XlaCompiler::CompilationResult& compilation_result, + const std::optional& device_assignment, + std::vector>* hlo_modules); + +absl::StatusOr CreateTpuCompilationRequest( + const std::variant& computation, + const TPUCompileMetadataProto& metadata, + const std::vector& arg_shapes); + +absl::Status CompileOpMetadataFromContext(OpKernelConstruction* ctx, + TPUCompileMetadataProto* metadata, + NameAttrList* function_name, + std::string* mlir_module); + +// Computes shapes for each argument. Uses both the static shape from the +// metadata, and the dynamic shapes where the static shape is not +// defined. There must be one dynamic_shape for each argument with a +// partially defined shape, in index order. 
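The contract in the comment above can be made concrete with a small, self-contained sketch: fully defined static shapes are used as-is, and each partially defined argument consumes the next dynamic shape in index order. Shapes are modeled here as plain dim vectors with -1 for an unknown dimension; this is not the TensorShape-based implementation behind ComputeArgumentShapes.

    #include <cstdint>
    #include <vector>

    using DimVec = std::vector<int64_t>;

    bool MergeArgumentShapes(const std::vector<DimVec>& static_shapes,
                             const std::vector<DimVec>& dynamic_shapes,
                             std::vector<DimVec>* arg_shapes) {
      size_t next_dynamic = 0;
      arg_shapes->clear();
      for (const DimVec& s : static_shapes) {
        bool fully_defined = true;
        for (int64_t d : s) fully_defined &= (d >= 0);
        if (fully_defined) {
          arg_shapes->push_back(s);
        } else {
          if (next_dynamic >= dynamic_shapes.size()) return false;  // missing shape
          arg_shapes->push_back(dynamic_shapes[next_dynamic++]);
        }
      }
      // Every supplied dynamic shape should have been consumed.
      return next_dynamic == dynamic_shapes.size();
    }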
+absl::Status ComputeArgumentShapes( + const TPUCompileMetadataProto& metadata, + const std::vector& dynamic_shapes, + std::vector* arg_shapes); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_SUPPORT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_configuration_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_configuration_ops.h new file mode 100644 index 00000000..fe5eeb22 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_configuration_ops.h @@ -0,0 +1,176 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_CONFIGURATION_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_CONFIGURATION_OPS_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { + +absl::Status CreateTpuCompilationCache( + ResourceMgr* rmgr, tpu::TpuCompilationCacheInterface** compilation_cache); + +absl::StatusOr> ConstructDevicesPerHost( + OpKernelContext* ctx); + +// The ConfigureDistributedTpu op is used to start an TPUDriver from +// TensorFlow. It should be run on a TPU_SYSTEM device and returns the +// connection host:port for the CompilationCacheServer. The +// CompilationCacheServer will remain live until the device's Resource Manager +// is cleared or a ShutdownDistributedTpuOp is run on the same device. +class ConfigureDistributedTpuOp : public OpKernel { + public: + explicit ConfigureDistributedTpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES(ctx, ctx->num_inputs() > 0, + absl::InternalError( + "_ConfigureDistributedTPU needs at least one input")); + } + void Compute(OpKernelContext* ctx) override; + ~ConfigureDistributedTpuOp() override = default; + + private: + // ConfigureDistributedTpuOp is neither copyable nor movable. + ConfigureDistributedTpuOp(const ConfigureDistributedTpuOp&) = delete; + ConfigureDistributedTpuOp& operator=(const ConfigureDistributedTpuOp&) = + delete; +}; + +// The WaitForDistributedTpuOp op is used to block execution until +// the distributed Tpu system has started up. It must be run on +// the same TPU_SYSTEM device that ConfigureDistributedTpuOp was run +// on, after all of the InitializeHostForDistributedTpuOp Ops have +// completed. 
+class WaitForDistributedTpuOp : public OpKernel { + public: + explicit WaitForDistributedTpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, + ctx->GetAttr("startup_timeout_sec", &startup_timeout_sec_)); + OP_REQUIRES( + ctx, startup_timeout_sec_ > 0, + absl::InvalidArgumentError(absl::StrCat( + "startup_timeout_sec ", startup_timeout_sec_, " must be >0"))); + } + void Compute(OpKernelContext* ctx) override; + ~WaitForDistributedTpuOp() override = default; + + private: + // The time to wait for all hosts to start up. + int startup_timeout_sec_; + + // WaitForDistributedTpuOp is neither copyable nor movable. + WaitForDistributedTpuOp(const WaitForDistributedTpuOp&) = delete; + WaitForDistributedTpuOp& operator=(const WaitForDistributedTpuOp&) = delete; +}; + +// The ShutdownDistributedTpu op is used to stop a running TPUDriver from +// TensorFlow. It should be run on the TPU_SYSTEM device where +// ConfigureDistributedTpuOp was run. +class ShutdownDistributedTpuOp : public OpKernel { + public: + explicit ShutdownDistributedTpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; + + ~ShutdownDistributedTpuOp() override = default; + + private: + // ShutdownDistributedTpuOp is neither copyable nor movable. + ShutdownDistributedTpuOp(const ShutdownDistributedTpuOp&) = delete; + ShutdownDistributedTpuOp& operator=(const ShutdownDistributedTpuOp&) = delete; +}; + +// The InitializeHostForDistributedTpu op is used to initialize the +// TPUPlatform on a host in a distributed TPU system. It should be +// run on every host containing TPU devices before any other Ops that use +// TPU are run. +class InitializeHostForDistributedTpuOp : public OpKernel { + public: + explicit InitializeHostForDistributedTpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + ctx->GetAttr("enable_whole_mesh_compilations", + &enable_whole_mesh_compilations_) + .IgnoreError(); + ctx->GetAttr("tpu_cancellation_closes_chips", + &tpu_cancellation_closes_chips_) + .IgnoreError(); + } + + void Compute(OpKernelContext* ctx) override; + + ~InitializeHostForDistributedTpuOp() override = default; + + private: + // InitializeHostForDistributedTpuOp is neither copyable nor movable. + InitializeHostForDistributedTpuOp(const InitializeHostForDistributedTpuOp&) = + delete; + InitializeHostForDistributedTpuOp& operator=( + const InitializeHostForDistributedTpuOp&) = delete; + + bool enable_whole_mesh_compilations_ = false; + int tpu_cancellation_closes_chips_ = 0; +}; + +// The SetGlobalTPUArray op is used to initialize the TPUPlatform on a +// host in a distributed TPU system. It should be run on every host +// containing TPU devices before any other Ops that use TPU are run. +class SetGlobalTPUArrayOp : public OpKernel { + public: + explicit SetGlobalTPUArrayOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; + + ~SetGlobalTPUArrayOp() override = default; + + private: + // SetGlobalTPUArrayOp is neither copyable nor movable. + SetGlobalTPUArrayOp(const SetGlobalTPUArrayOp&) = delete; + SetGlobalTPUArrayOp& operator=(const SetGlobalTPUArrayOp&) = delete; +}; + +// The DisconnectDistributedTpuChips op is used to disconnect all the chips on a +// host from a running TPUDriver instance. It should be run on every host +// containing TPU devices before the ShutdownDistributedTpuOp is run on +// the TPU_SYSTEM. 
+class DisconnectDistributedTpuChipsOp : public OpKernel { + public: + explicit DisconnectDistributedTpuChipsOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override; + + ~DisconnectDistributedTpuChipsOp() override = default; + + private: + // DisconnectDistributedTpuChipsOp is neither copyable nor movable. + DisconnectDistributedTpuChipsOp(const DisconnectDistributedTpuChipsOp&) = + delete; + DisconnectDistributedTpuChipsOp& operator=( + const DisconnectDistributedTpuChipsOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_CONFIGURATION_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h new file mode 100644 index 00000000..73b0a492 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENGINE_STATE_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENGINE_STATE_INTERFACE_H_ + +#include + +#include "xla/stream_executor/tpu/tpu_api.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +class TpuEmbeddingEngineState; + +namespace tpu { + +const char kTpuEmbeddingEngineStateInterfaceResourceName[] = + "tpu_embedding_engine_state"; + +class TpuEmbeddingEngineStateInterface : public ResourceBase { + public: + explicit TpuEmbeddingEngineStateInterface(XLA_TpuEmbeddingEngineState* handle) + : engine_state_(handle) {} + + ~TpuEmbeddingEngineStateInterface() override { + if (engine_state_ != nullptr) { + stream_executor::tpu::OpsApiFn()->TpuEmbeddingEngineState_FreeFn( + engine_state_); + } + } + + tensorflow::TpuEmbeddingEngineState* GetState() const { + if (engine_state_ == nullptr) { + return nullptr; + } + return static_cast( + stream_executor::tpu::OpsApiFn()->TpuEmbeddingEngineState_GetStateFn( + engine_state_)); + } + + static TpuEmbeddingEngineStateInterface* Create() { + XLA_TpuEmbeddingEngineState* state = nullptr; + if (stream_executor::tpu::OpsApiFn()->TpuEmbeddingEngineState_CreateFn != + nullptr) { + state = + stream_executor::tpu::OpsApiFn()->TpuEmbeddingEngineState_CreateFn(); + } + return new TpuEmbeddingEngineStateInterface(state); + } + + string DebugString() const override { + return "TpuEmbeddingEngineStateInterface"; + } + + private: + XLA_TpuEmbeddingEngineState* engine_state_; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENGINE_STATE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h 
b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h new file mode 100644 index 00000000..e06c02c9 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENQUEUE_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENQUEUE_OPS_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow { + +// Validates that all the combiners passed are one of the following: sum, mean, +// or sqrtn. +absl::Status ValidateCombiners(absl::Span combiners); + +// Validates the `mode_override` input of the TPUEnqueue* ops, and, if correct, +// sets the `mode` to pass on to the TPU Embedding manager. +absl::Status GetValidatedModeOverride( + const string& mode_override, tpu::TPUEmbeddingConfiguration::Mode* mode); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENQUEUE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h new file mode 100644 index 00000000..51459c6a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h @@ -0,0 +1,99 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Ops to load and retrieve embeddings for TPU Embedding. + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_LOAD_RETRIEVE_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_LOAD_RETRIEVE_OPS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" +#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h" + +namespace tensorflow { + +// The LoadAllTPUEmbeddingParameters op is used to load initial embedding +// table parameters onto a host that has already been configured using +// ConfigureTPUEmbeddingHost. 
This Op should be used when TPUEmbedding is part +// of a training loop. The Op takes four input lists of tensors. Each list has +// one entry per embedding table, but some entries are ignored based on the +// particular optimization algorithm used for each table. parameters is the +// initial values of the embedding tables, and auxiliary[1-3] are the initial +// values of the auxiliary parameters. +class LoadAllTPUEmbeddingParametersOp : public OpKernel { + public: + explicit LoadAllTPUEmbeddingParametersOp(OpKernelConstruction* ctx); + ~LoadAllTPUEmbeddingParametersOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + protected: + void GetStateVariables( + OpKernelContext* ctx, + std::array>, + tpu::kMaxAuxiliaryParameterCount + 1>& state_variable_vector); + + private: + tpu::TPUEmbeddingConfiguration config_; + std::vector table_shapes_; + + LoadAllTPUEmbeddingParametersOp(const LoadAllTPUEmbeddingParametersOp&) = + delete; + void operator=(const LoadAllTPUEmbeddingParametersOp&) = delete; +}; + +// The RetrieveAllTPUEmbeddingParameters op is used to retrieve updated +// embedding table parameters from a TPU that has already been +// configured using ConfigureTPUEmbeddingHostOp. This Op should be used when +// TPUEmbedding is part of a training loop. The Op returns four output lists of +// tensors. Each list has one entry per embedding table, but entries are empty +// when the relevant table does not have that number of auxiliary parameters. +// The parameters output is the updated values of the embedding tables, and +// auxiliary[1-3] are the updated values of the auxiliary parameters. + +// Currently, this op is the only method to make sure that the TPUEmbedding has +// completed execution of the mini-batches enqueued so far. +// TODO(misard, b/34936670): Add a TensorFlow op that waits till all +// minibatches have been processed by the TPUEmbedding on the current host. +class RetrieveAllTPUEmbeddingParametersOp : public OpKernel { + public: + explicit RetrieveAllTPUEmbeddingParametersOp(OpKernelConstruction* ctx); + ~RetrieveAllTPUEmbeddingParametersOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + protected: + void GetStateVariables( + OpKernelContext* ctx, + std::array>, + tpu::kMaxAuxiliaryParameterCount + 1>& state_variable_vector, + std::vector& num_state_variables); + + tpu::TPUEmbeddingConfiguration config_; + std::vector table_shapes_; + + RetrieveAllTPUEmbeddingParametersOp( + const RetrieveAllTPUEmbeddingParametersOp&) = delete; + void operator=(const RetrieveAllTPUEmbeddingParametersOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_LOAD_RETRIEVE_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op.h new file mode 100644 index 00000000..d0e70dbc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_H_ + +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// Op that executes a precompiled TPU computation. +class TPUExecuteOp : public AsyncOpKernel { + public: + explicit TPUExecuteOp(OpKernelConstruction* context); + ~TPUExecuteOp() override; + + AsyncOpKernel* AsAsync() override; + + void Compute(OpKernelContext* context) override; + void ComputeAsync(OpKernelContext* context, DoneCallback done) override; + + protected: + // Used by TPUExecuteAndUpdateVariablesOp to set the fused variable reads and + // updates indices in the XLA computation. The two vectors must have the same + // size, and a pair of read index and write index represents a variable's + // input to the program and its updated value from the program. If the + // variable is not updated, use -1 as the output index. + std::vector fused_device_var_reads_in_computation_inputs_; + std::vector fused_device_var_updates_in_computation_outputs_; + + private: + absl::Status DoWork(OpKernelContext* context); + + TPUExecuteOp(const TPUExecuteOp&) = delete; + void operator=(const TPUExecuteOp&) = delete; +}; + +// A variant of TPUExecuteOp that contains fused device variable reads and +// updates. +class TPUExecuteAndUpdateVariablesOp : public TPUExecuteOp { + public: + explicit TPUExecuteAndUpdateVariablesOp(OpKernelConstruction* context); + ~TPUExecuteAndUpdateVariablesOp() override = default; + + private: + TPUExecuteAndUpdateVariablesOp(const TPUExecuteAndUpdateVariablesOp&) = + delete; + void operator=(const TPUExecuteAndUpdateVariablesOp&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op_options.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op_options.h new file mode 100644 index 00000000..950fb884 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_execute_op_options.h @@ -0,0 +1,40 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_OPTIONS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_OPTIONS_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace internal { + +enum class TpuCancellationClosesChipsMode : int { + kUnset = 0, // fallback to other configuration, e.g. 
absl flag + kEnabled = 1, // Close TPU chips when cancellation happens + kDisabled = 2, // Do not close TPU chips when cancellation happens +}; + +// Set TPU cancellation closing chips mode from an integer. See the enum +// definition of `TpuCancellationClosesChipsConfig` above for valid values. +absl::Status SetTpuCancellationClosesChips(int val); + +// Get whether to close chips when TPUExecutionOp is cancelled. If unset, return +// the value specified by the `default_value` argument. +bool TpuCancellationClosesChipsGetOrDefault(bool default_value); +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EXECUTE_OP_OPTIONS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_fingerprint_lookup.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_fingerprint_lookup.h new file mode 100644 index 00000000..fe98817c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_fingerprint_lookup.h @@ -0,0 +1,95 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_FINGERPRINT_LOOKUP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_FINGERPRINT_LOOKUP_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace tpu { + +// A class that holds the key-value pair of fingerprints. By calling the +// Register method, this class can map the key to the value. Note that this +// class holds invariant key-value pairs. That is, it does not allow updating +// key-value pairs, nor N-key-to-1-value and 1-key-to-M-value pairs. If such +// cases occur, the class keeps the earliest registered pairs and discards any +// violating pairs. +// +// Example: +// TpuFingerprintLookup fingerprint_lookup; +// +// // Register key-intermediate pair. +// fingerprint_lookup.RegisterKeyValuePair("key1", "intermediate1"); +// // Register intermediate-value pair. +// fingerprint_lookup.RegisterKeyValuePair("intermediate1", "value1"); +// +// // Lookup fingerprint with key. +// std::string fingerprint = fingerprint_lookup.Lookup("key1"); +// +// TODO(chiachenc): use templates and add Unregister methods. +class TpuFingerprintLookup : public ResourceBase { + public: + // Creates an instance of TpuFingerprintLookup. + static TpuFingerprintLookup* Create(); + + // Register key-intermediate pair + void RegisterKeyAndIntermediatePair(uint64 key, uint64 intermediate); + + // Register intermediate-value pair. A successful registration requires a + // preceding RegisterKeyAndIntermediatePair. Return true if successfully + // registering a key-value pair; otherwise, return false. 
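Note that the class comment's usage example registers string keys, while the API declared below is keyed by uint64 fingerprints. Under that reading, the two-phase registration might be exercised roughly as follows; this is an illustrative sketch with made-up values, and ref-counting/ownership of the ResourceBase is omitted.

    // Assumed headers: tpu_fingerprint_lookup.h plus <optional>/<string>.
    void RegisterAndLookupExample() {
      tensorflow::tpu::TpuFingerprintLookup* lookup =
          tensorflow::tpu::TpuFingerprintLookup::Create();
      // Phase 1: tie the key to an intermediate fingerprint.
      lookup->RegisterKeyAndIntermediatePair(/*key=*/0x1234,
                                             /*intermediate=*/0x9999);
      // Phase 2: tie the intermediate to the final value; returns false if no
      // matching phase-1 registration was seen or the pair would conflict.
      bool registered = lookup->RegisterIntermediateAndValuePair(
          /*intermediate=*/0x9999, "program-fingerprint");
      std::optional<tensorflow::StringPiece> fp = lookup->Lookup(/*key=*/0x1234);
      if (registered && fp.has_value()) {
        // *fp is a view into storage owned by the lookup table.
      }
    }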
+ bool RegisterIntermediateAndValuePair(uint64 intermediate, std::string value); + + // Look up fingerprint with key. + // Return std::nullopt if not found. + std::optional<::tensorflow::StringPiece> Lookup(uint64 key); + + size_t num_valid() { + absl::MutexLock lock(&mu_); + return key_to_value_.size(); + } + + std::string DebugString() const override { return "TpuFingerprintLookup"; } + + private: + explicit TpuFingerprintLookup() {} + + absl::Mutex mu_; + // Main storage for lookup + absl::node_hash_map key_to_value_ ABSL_GUARDED_BY(mu_); + + // An auxiliary storage to ensure 1-to-1 and invariant key-value pair + absl::node_hash_map value_to_key_ ABSL_GUARDED_BY(mu_); + + // An auxiliary storage to keep intermediate-key pairs. + absl::flat_hash_map intermediate_to_key_ ABSL_GUARDED_BY(mu_); + + TpuFingerprintLookup(const TpuFingerprintLookup&) = delete; + TpuFingerprintLookup& operator=(const TpuFingerprintLookup&) = delete; +}; +} // namespace tpu +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_FINGERPRINT_LOOKUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_functional_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_functional_ops.h new file mode 100644 index 00000000..3c8287af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_functional_ops.h @@ -0,0 +1,383 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_FUNCTIONAL_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_FUNCTIONAL_OPS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/call_once.h" +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_set.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "xla/stream_executor/tpu/tpu_api.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tpu/topology.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_ordinal_selector.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/reffed_status_callback.h" +#include "absl/container/flat_hash_map.h" + +namespace tensorflow { +// Holds node's shape information for Concat/Split. +using EdgeShapes = absl::flat_hash_map>; +using GroupedEdges = + absl::flat_hash_map>; + +// Contains attrs "T", "sharding", "_tpu_replicate" for each XlaSharding op that +// we find as part of searching for inputs to models that are replicated. +using XlaShardingInfoMap = absl::flat_hash_map< + std::string, std::tuple>; + +// Contains attrs "T", and a pointer to tpu_replicated_metadata for ctrl dep +// for each TpuReplicatedInput op that we find as part of searching for inputs +// to models that are replicated. +using TpuReplicatedInputInfoMap = + absl::flat_hash_map>; + +namespace tpu_functional_internal { + +// Helper functions for graph rewrites. 
+GroupedEdges GroupTensorsForInputPacking( + const EdgeShapes& tpu_input_shapes, + const absl::flat_hash_map& tpu_input_dtypes, + bool input_shape_opt, bool group_tensors_for_packing); +GroupedEdges GroupTensorsForOutputPacking(Graph* graph, + EdgeShapes& tpu_output_shapes, + GraphShapeInfo* shape_info); + +absl::Status CreateConcatAndSplitNodesForInputTensor( + Graph* graph, const string& cluster_name, EdgeShapes* tpu_input_shapes, + const absl::flat_hash_map>& + grouped_input_edges, + int32_t minimum_input_tensors_packing, bool xla_spmd_input_sharded, + const XlaShardingInfoMap& xla_sharding_info, + const TpuReplicatedInputInfoMap& tpu_replicated_input_info); +absl::Status CreateConcatAndSplitNodesForOutputTensor( + Graph* graph, const string& cluster_name, EdgeShapes* tpu_output_shapes, + GraphShapeInfo* tpu_inferred_info, GroupedEdges shape_to_output, + int32_t minimum_output_tensors_packing); + +absl::Status InsertReshapeNodePairs(Graph* graph, const string& cluster_name, + EdgeShapes* tpu_input_shapes, + int num_cores_per_replica); + +} // namespace tpu_functional_internal + +typedef FunctionLibraryRuntime::Handle FHandle; + +// A `TPUPartitionedCallOp` asynchronously executes a function on exactly one +// TPU core and potentially across multiple other devices, but within a single +// process. The kernel places and partitions the function's underlying graph, +// executing each of the partitioned subgraphs as a function. +// +// The core on which the TPU computation is executed must be specified via the +// `device_ordinal` input. Different invocations of this op may specify +// different device ordinals, making it possible to map TPU computations to +// different cores at runtime. Currently, macro-substitution of device ordinals +// is only supported for the following whitelisted ops: +// * TPUExecute +// * InfeedEnqueue +// * InfeedEnqueueTuple +// +// Attempting to compute a TPUPartitionedCallOp whose function body has a +// non-whitelisted node bearing an attribute named "device_ordinal" will result +// in an error. +// +// TODO(akshayka): This class duplicates most of the logic of +// `PartitionedCallOp`; once that class and this one have evolved to stable +// states, and if at that time they remain sufficiently similar, either unify +// them in one op or set up an inheritance structure that allows for code reuse. +class TPUPartitionedCallOp : public AsyncOpKernel { + public: + explicit TPUPartitionedCallOp(OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), + pool_(ctx->env(), "InitializeVarOnTPUPool", 1), + library_runtime_(nullptr) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_)); + // If the importer has set the original function name, it means the function + // attribute is referring to a rewritten function, but we need to use the + // original function name in order to find it in the function library. + std::string orig_f; + if (ctx->GetAttr("_orig_f", &orig_f).ok()) { + func_.set_name(orig_f); + } + auto status = ctx->GetAttr("autotuner_thresh", &autotuner_thresh_); + if (!status.ok()) { + autotuner_thresh_ = 0; + } + stream_executor::tpu::OpsApiFn()->TfTpu_GetTpuPartitionedCallParamsFn( + &runtime_params_); + } + + ~TPUPartitionedCallOp() override = default; + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + private: + struct DeviceAndFHandle { + std::string device; + FHandle handle; + + // The FLD passed to `library_runtime_` as an overlay function library for + // instantiation of function `handle`. 
This is a snapshot of the currrent + // `flib_def_`. Since `flib_def_` can be changed concurrently by another + // graph rewrite when executing `handle`, we need to make sure each + // `handle` uses a different FLD to avoid races. See b/181149591. + std::unique_ptr flib_def; + }; + + struct TPUMetadata { + tpu::TopologyProto topology; + int num_cores_per_replica = 1; + std::vector device_assignment; + }; + + // This method is thread-safe. + absl::Status GetTpuCoreOrdinal(OpKernelContext* ctx, uint64 input_hash, + int64_t* ordinal_selector_req_id, + int32_t* core_ordinal); + + // Helper to create and initialize a TPU variable given a CPU variable + // var: the CPU variable created by the user + // ndef: the node def of the corresponding TPU var handle that we created + // device_ordinal: TPU device ordinal on which to initialize this variable + absl::Status InitializeVarOnTPU(OpKernelContext* ctx, + const core::RefCountPtr& var, + NodeDef* ndef, int device_ordinal, + bool fast_mem) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Helper to create and initialize partitioned TPU variables given a CPU + // variable with XLA sharding annotation. + // var: the CPU variable created by the user. + // ndefs: the node def of the corresponding TPU var handle on all the logical + // cores. + // split_dim: the partition dimension of the variable. If -1, the variable is + // replicated. + // device_ordinal: The index of the TPU core that is scheduled to run + // the computation. In the case of XLA SPMD, it is the "primary" core, which + // is the smallest index of all the cores. + absl::Status InitializeShardedVarOnTPU(OpKernelContext* ctx, + const core::RefCountPtr& var, + std::vector& ndefs, + int split_dim, + const std::vector& tpu_devices) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Check if any of the immediate successors of node has attribute + // "_tpu_replicate". + bool IsInputToTPUReplicate(Node* node) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Replace an _Arg node of type DT_RESOURCE by a VarHandleOp on TPU + absl::Status ReplaceResourceArgsWithVarHandleOps( + Graph* graph, OpKernelContext* ctx, int device_ordinal, + bool enable_spmd_xla_partitioning, const TPUMetadata& tpu_metadata) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Replace a _Arg node indicates a variable on CPU host by sharded/replicated + // variables on all logical TPU devices. + absl::Status ReplaceAndPartitionXLAShardingVariable( + Graph* graph, OpKernelContext* ctx, int device_ordinal, + ResourceHandle& handle, Node* variable, const TPUMetadata& tpu_metadata) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Status ShardInputsWithXlaSharding(Graph* graph, + const std::string& cluster_name, + int num_cores_per_replica, + OpKernelContext* ctx) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Rewrite the graph for input and output optimiazations. + // TODO(ylc): Move this function to Graph optimization pass. + absl::Status OptimizeTpuInputOutputTensors( + Graph* graph, bool enable_spmd_xla_partitioning, + int num_cores_per_replica, + std::map>& named_input_shapes, + OpKernelContext* ctx) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Status InferShapesWithResourceVar( + Graph* graph, OpKernelContext* ctx, + std::map& arg_shapes, + GraphShapeInfo* tpu_inferred_info); + + // Copies the graph backing `func_` into `graph`. 
+ absl::Status GetGraphFromFunction(Graph* graph, int device_ordinal, + bool* use_spmd_for_xla_partitioning, + TPUMetadata* tpu_metadata) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Places the graph carried by `optimization_options` and runs graph + // optimization passes (pre-placement, post-placement, and post-rewrite). + absl::Status PlacementHelper( + const DeviceSet& device_set, + const GraphOptimizationPassOptions& optimization_options, + const string& function_name); + // Partitions `graph`, populates `subgraphs` with the partitions, and runs + // the post-partitioning graph optimization passes. + absl::Status PartitionHelper( + const DeviceSet& device_set, + const GraphOptimizationPassOptions& optimization_options, Graph* graph, + std::unordered_map>* subgraphs); + + // Adds and instantiates a function backed by `graph` with name + // `function_name` on device `target_device`, storing the handle in `handle`. + // If `out_flib_def` is not null, it will be set to a copy of `flib_def_` and + // used for instantiation. + absl::Status InstantiatePartition( + const Graph& graph, const string& function_name, + const string& target_device, FHandle* handle, + std::unique_ptr* out_flib_def) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Adds and instantiates functions for each subgraph in `subgraphs` after + // rewriting nodes' `device_ordinal` attributes to match `replica_id` when + // num_cores_per_replica == 1. + absl::Status InstantiateFunctionsFromSubgraphs( + const DeviceSet& device_set, int replica_id, uint64 cache_hash, + int num_cores_per_replica, + std::unordered_map> subgraphs) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Rewrites `graph` such that the device ordinal attributes of all whitelisted + // nodes (see `IsSupportedTPUOp`) are set to `device_ordinal`; + // `*modified` is set to true if the graph is modified in the process (i.e., + // if it contains a whitelisted node), otherwise is unmodified. + // + // Returns an error if + // (1) the graph contains a non-whitelisted node that carries an attribute + // with name "device_ordinal", or + // (2) the set of device ordinals found among the graph's nodes has + // cardinality greater than 1. 
+ absl::Status SetDeviceOrdinal(const DeviceSet& device_set, int device_ordinal, + Graph* graph, bool* modified) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void ExecuteRemoteFunction(const FunctionLibraryRuntime::Options& opts, + FHandle handle, OpKernelContext* ctx, + ReffedStatusCallback* done) + ABSL_LOCKS_EXCLUDED(mu_); + void ExecuteLocalFunction(const FunctionLibraryRuntime::Options& opts, + const OpInputList& arguments, FHandle handle, + OpKernelContext* ctx, ReffedStatusCallback* done) + ABSL_LOCKS_EXCLUDED(mu_); + void ExecuteFunctions(const std::vector& functions, + OpKernelContext* ctx, int device_ordinal, + int64_t ordinal_selector_req_id, DoneCallback done) + ABSL_LOCKS_EXCLUDED(mu_); + + absl::Status ShouldUseRemoteExecutionForFn(const std::string& target_device, + bool* remote_execution) { + DeviceNameUtils::ParsedName target_device_parsed; + DeviceNameUtils::ParsedName local_device_parsed; + + if (!DeviceNameUtils::ParseFullOrLocalName(target_device, + &target_device_parsed)) { + return errors::InvalidArgument("Cannot parse target device ", + target_device); + } + if (!DeviceNameUtils::ParseFullOrLocalName(local_device_name_, + &local_device_parsed)) { + return errors::InvalidArgument("Cannot parse local device ", + local_device_name_); + } + + if (DeviceNameUtils::AreCompatibleDevNames(target_device_parsed, + local_device_parsed)) { + *remote_execution = false; + } else { + *remote_execution = true; + } + return absl::OkStatus(); + } + + // Init once flagas. + absl::once_flag once_; + absl::once_flag ordinal_selector_once_; + + // Device manager and device set. + const DeviceMgr* device_mgr_; + DeviceSet device_set_; + + // Threadpool. + thread::ThreadPool pool_; + + // `func_` is the original function supplied to this OpKernel. + NameAttrList func_; + string local_device_name_; + // Maps from cache key to their corresponding functions, which are + // represented as (device, handle) pairs. + gtl::FlatMap> partition_cache_ + ABSL_GUARDED_BY(mu_); + + // A set contains seen ordinals. Used by variable initialization on TPU. + absl::flat_hash_set seen_ordinals_; + + // Record the indices of the _Arg with type DT_RESOURCE that goes + // into a TPU Op. + std::vector replaced_input_indices_; + + absl::Mutex mu_; + // Function shards are added to the `flib_def_`, and later on it'll create + // a copy of `flib_def_` to pass to `library_runtime_` as an overlay function + // library for instantiation. + std::unique_ptr flib_def_; + FunctionLibraryRuntime* library_runtime_; + + // Used to uniquify function names in `flib_def_`. + uint32 suffix_ = 0; + + // Minimum number of run steps (batches) necessary to trigger xla autotuner. + int autotuner_thresh_ = 0; + + // TPU core selection. + std::shared_ptr ordinal_selector_; + + // Maps input hash to TF fingerprint. + absl::flat_hash_map inputs_to_fingerprint_; + + // List of TPU devices + std::vector tpu_devices_; + + TpuPartitionedCall_Params runtime_params_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_FUNCTIONAL_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h new file mode 100644 index 00000000..6e84dde2 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_INTERFACE_H_ + +#include + +#include "xla/stream_executor/tpu/tpu_api.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace tensorflow { + +class TpuMeshCommonState; + +namespace tpu { + +const char kTpuMeshStateInterfaceResourceName[] = "tpu_mesh_common_state"; + +class TpuMeshStateInterface : public tensorflow::ResourceBase { + public: + explicit TpuMeshStateInterface(XLA_TpuMeshState* handle) + : mesh_state_(handle) {} + + ~TpuMeshStateInterface() override { + if (mesh_state_ != nullptr) { + stream_executor::tpu::OpsApiFn()->TpuMeshState_FreeFn(mesh_state_); + } + } + + static TpuMeshStateInterface* Create() { + XLA_TpuMeshState* state = nullptr; + if (stream_executor::tpu::OpsApiFn()->TpuMeshState_CreateFn != nullptr) { + state = stream_executor::tpu::OpsApiFn()->TpuMeshState_CreateFn(); + } + return new TpuMeshStateInterface(state); + } + + const XLA_TpuMeshState* data() const { return mesh_state_; } + + tensorflow::TpuMeshCommonState* mesh_common_state() const { + if (mesh_state_ == nullptr) { + return nullptr; + } + return static_cast( + stream_executor::tpu::OpsApiFn()->TpuMeshState_MeshCommonStateFn( + mesh_state_)); + } + + // Returns whether we should include the device assignment as a static field + // to the TPU program. This also determines whether we should include the + // device assignment as part of the compilation cache key. + bool NeedsStaticDeviceAssignment(const TPUCompileMetadataProto& metadata, + TpuCoreTypeEnum tpu_core_type) const { + if (mesh_state_ == nullptr) { + return false; + } + // Static device assignment enables XLA to perform certain optimization when + // all cores are used in the replicated computation. + return metadata.num_cores_per_replica() * metadata.num_replicas() == + stream_executor::tpu::OpsApiFn()->TpuTopology_AvailableCoreCountFn( + mesh_state_, tpu_core_type); + } + + string DebugString() const override { return "TpuMeshStateInterface"; } + + private: + XLA_TpuMeshState* mesh_state_; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_consts.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_consts.h new file mode 100644 index 00000000..cbf2c994 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_consts.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ + +#include "absl/base/attributes.h" + +namespace tensorflow { +namespace tpu { + +// Resource names in the ResourceMgr. +// +// Name of cache for compiled TPU ISA protos. CompilationCache is created by +// ConfigureDistributedTpuOp, so only the master has a CompilationCache. +ABSL_CONST_INIT extern const char kCompilationCacheResourceName[]; +// Name of base class allowing Execute Ops to look up ISA protos. +// CompiledProtoCache is created by InitializeHostForDistributedTpuOp, so each +// tpu_worker has a CompiledProtoCache. +ABSL_CONST_INIT extern const char kCompiledProtoCacheResourceName[]; +// Name of cache unloader for compiled TPU ISA protos. Cache unloader should be +// put into TPU_SYSTEM device resource manager. Inference may use it to unload +// cache entries created during lifetime of a DirectSession. +ABSL_CONST_INIT extern const char kCompilationCacheUnloaderResourceName[]; +// TBD +ABSL_CONST_INIT extern const char kFingerprintLookupResourceName[]; + +} // namespace tpu +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_util.h new file mode 100644 index 00000000..d0ca805f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_op_util.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ + +#include + +#include "absl/strings/string_view.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" + +namespace tensorflow { +namespace tpu { +// Creates a fingerprint given the name and the vector of shapes. +uint64 CreateFingerprintWithNameAndShapes( + uint64 name, const std::vector& shapes); + +// Creates a unique compilation cache `key`. 
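As a quick illustration of the fingerprint helper declared above (CreateCompilationCacheKey, declared just below, takes many more inputs and is omitted here), the following hedged sketch fingerprints a function name together with its argument shapes. The shapes are invented, and obtaining the name fingerprint via TpuCompileInterface::Get()->FingerprintString() (declared in tpu_compile_interface.h later in this change) is an assumption about typical usage, not a documented contract.

    // Hypothetical: combine a name fingerprint with argument shapes.
    uint64_t name_fp =
        TpuCompileInterface::Get()->FingerprintString("my_tpu_function");
    std::vector<tensorflow::TensorShape> arg_shapes = {
        tensorflow::TensorShape({128, 64}), tensorflow::TensorShape({64})};
    uint64_t shape_aware_fp =
        tensorflow::tpu::CreateFingerprintWithNameAndShapes(name_fp, arg_shapes);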
+TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + uint64 mlir_module_fingerprint, const OpInputList& guaranteed_constants, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state, uint64_t session_id = 0, + ResourceMgr* resource_mgr = nullptr); +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h new file mode 100644 index 00000000..9ea689b3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h @@ -0,0 +1,62 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_H_ + +#include + +#include "xla/stream_executor/tpu/tpu_api.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h" + +namespace tensorflow { +namespace tpu { + +// A reserved ID for deferred core selection. Intentionally set at a number +// that is more than the number of cores available in a future system. +constexpr int32_t kDeferredCoreSelectionReserved = -8193; + +class TPUOrdinalSelector : TPUOrdinalSelectorInterface { + public: + explicit TPUOrdinalSelector(int num_cores_per_replica = 1) { + stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_CreateFn( + &ordinal_selector_, num_cores_per_replica); + } + ~TPUOrdinalSelector() override { + stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_DestroyFn( + ordinal_selector_); + } + int64_t GetOrdinal(std::optional key, int64_t* req_id) override { + int64_t ordinal; + stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_GetOrdinalFn( + ordinal_selector_, key, req_id, &ordinal); + return ordinal; + } + void DequeueFromCoreSelector(int32_t device_ordinal, + int64_t req_id) override { + stream_executor::tpu::OpsApiFn() + ->TfTpuOrdinalSelector_DequeueFromCoreSelectorFn( + ordinal_selector_, device_ordinal, req_id); + } + + private: + TfTpuOrdinalSelector* ordinal_selector_; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h new file mode 100644 index 00000000..040959d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_INTERFACE_H_ + +#include + +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace tpu { + +class TPUOrdinalSelectorInterface { + public: + virtual ~TPUOrdinalSelectorInterface() = default; + virtual int64_t GetOrdinal(std::optional key, int64_t* req_id) = 0; + virtual void DequeueFromCoreSelector(int32_t device_ordinal, + int64_t req_id) = 0; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_ORDINAL_SELECTOR_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_pod_state.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_pod_state.h new file mode 100644 index 00000000..b24a512d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_pod_state.h @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_POD_STATE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_POD_STATE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_service.h" + +namespace tensorflow { + +// Name of tpu pod state. +ABSL_CONST_INIT extern const char kTpuPodStateResourceName[]; + +// Wrapper to hold centralized state for the distributed TPU in the TPU_SYSTEM +// device's resource manager. +class TpuPodState : public ResourceBase { + public: + // The port number given by isa_cache_port will be freed with + // RecycleUnusedPort in the destructor if it is non-negative. + TpuPodState(int service_port, + std::unique_ptr cache_service); + + ~TpuPodState() override; + + string DebugString() const override; + + private: + std::unique_ptr cache_service_; + int service_port_; +}; + +// Returns the TPU pod state or an error. +absl::Status GetTPUPodState(const ResourceMgr* rmgr, TpuPodState** pod_state); + +// Checks whether the TPU POD state configuration is present within the resource +// manager. +bool HasTPUPodState(const ResourceMgr* rmgr); + +// Construct TpuPodState. 
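A short sketch of how the two query helpers declared here might be used alongside the ConstructTpuPodState declaration that follows. The helper name is invented, and the reference-counting comment reflects the usual ResourceMgr convention; it is an assumption, not something this header guarantees.

    // Hypothetical helper: log the pod state if it has already been configured.
    absl::Status MaybeLogPodState(const tensorflow::ResourceMgr* rmgr) {
      if (!tensorflow::HasTPUPodState(rmgr)) {
        return absl::OkStatus();  // Distributed TPU has not been configured yet.
      }
      tensorflow::TpuPodState* pod_state = nullptr;
      TF_RETURN_IF_ERROR(tensorflow::GetTPUPodState(rmgr, &pod_state));
      LOG(INFO) << pod_state->DebugString();
      pod_state->Unref();  // Assumed: the lookup returns a referenced resource.
      return absl::OkStatus();
    }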
+absl::Status ConstructTpuPodState( + ResourceMgr* rmgr, const std::vector& num_devices_per_host, + tpu::TpuCompilationCacheInterface* compilation_cache, + std::string* host_config_proto); + +absl::Status GetServerAddressAndPort(std::string* server_address, + int* serving_port); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_POD_STATE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group.h new file mode 100644 index 00000000..1b82d17b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group.h @@ -0,0 +1,189 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/compile_only_client.h" +#include "xla/service/computation_placer.h" +#include "xla/service/hlo.pb.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "xla/stream_executor/tpu/tpu_platform_interface.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/tpu/kernels/tpu_executable_info.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" + +namespace tensorflow { +namespace tpu { + +class TpuAotCompilationOptions : public xla::AotCompilationOptions { + public: + explicit TpuAotCompilationOptions(int64_t replica_count) + : num_cores_(0), replica_count_(replica_count) {} + + // Returns the ID of the platform to which these options apply. + se::Platform::Id PlatformId() const override { + LOG(FATAL) << "Not implemented."; + return nullptr; + }; + + void set_num_cores(int64_t tpu_cores) { num_cores_ = tpu_cores; } + int64_t replica_count() const override { return replica_count_; } + int64_t num_cores() const override { return num_cores_; } + + void set_allow_separate_sharding_programs(bool allow) { + allow_separate_sharding_programs_ = allow; + } + bool allow_separate_sharding_programs() const { + return allow_separate_sharding_programs_; + } + + std::vector + shardable_value_update_pairs() const { + return shardable_value_update_pairs_; + } + void set_shardable_value_update_pairs( + std::vector pairs) { + shardable_value_update_pairs_ = std::move(pairs); + } + + private: + int64_t num_cores_; + int64_t replica_count_; + + // Whether to allow the compiler to create separte sharding and unsharding + // programs, and modify the original program's input/output sharded size. 
This + // is used for XLA-chosen sharding on parameters without an on-device loop: + // the caller can invoke sharding first, then (repeatedly) invoke the sharded + // main program, and finally invoke the unsharding program when it needs the + // full output. + bool allow_separate_sharding_programs_ = false; + + // The list of input/output pairs in the main program that could be sharded. + std::vector + shardable_value_update_pairs_; +}; + +class TpuProgramGroup : public TpuProgramGroupInterface { + public: + using Status = absl::Status; + + // Compiles Mlir or TF function computation by lowering into HLO IR and + // returns TPU programs ready for execution. + static Status CompileAndBuild( + const TpuCompilationRequestProto& compilation_request, + const XLA_TpuMeshState* mesh_state, + TpuProgramGroupInterface* tpu_program_group_interface); + + + // Initializes `TpuProgramGroup` object with `xla_tpu_programs`. + void Initialize(absl::Span xla_tpu_programs); + + TpuProgramGroup() = default; + TpuProgramGroup(TpuProgramGroup&& other); + TpuProgramGroup& operator=(TpuProgramGroup&&) = delete; + + bool has_sharding_program() const override; + + size_t program_count() const override; + + int64_t program_size() const override; + + bool LogProgramMemorySummary() override; + + void UnloadAndDestroyPrograms() override; + + const std::vector& may_modify_variables_list() const override; + void set_may_modify_variables(const std::vector& may_modify_variables); + bool may_modify_variables(int index) const override; + + const std::vector& fingerprints() const; + void set_fingerprints(); + + const std::string& fingerprint(int index) const override; + + const std::vector& tpu_programs() const; + std::vector tpu_programs(TpuProgramShardingType type) const; + const XLA_TpuProgram* tpu_program(int index) const override; + void set_tpu_programs(absl::Span tpu_programs); + + const TPUExecutableInfoProto& executable_info(int index) const override; + + const TPUHostTransferInfoProto& host_transfer_info(int index) const override; + void set_hlo_metadatas(absl::Span hlo_metadatas); + const xla::HloProto* hlo_metadata(int index) const; + absl::Span hlo_metadatas() const override; + + // Deserializes `GetTpuProgramResponse` protos from remote cache. + Status DeserializeFromRpcResponseProtos( + const std::vector& rpc_response_protos); + + // Serializes executable proto from the TPU program for the given core + // `index`. + Status SerializeExecutable(int index, + TpuExecutableSerializedProto* executable) const; + + // Serializes compiler metadata of the TPU program for the given core `index`. + Status SerializeCompilerMetadata( + int index, CompilerMetadataSerializedProto* compiler_metadata) const; + + // Serializes host compute metadata of the TPU program for the given core + // `index`. + Status SerializeHostComputeMetadata( + int index, + HostComputeMetadataSerializedProto* host_compute_metadata) const; + + private: + TPUExecutableInfoProto ConstructExecutableInfo( + const XLA_TpuProgram* tpu_program); + TPUHostTransferInfoProto ConstructHostTransferInfo( + const XLA_TpuProgram* tpu_program); + xla::HloProto ConstructHloMetadata(const XLA_TpuProgram* tpu_program); + + // Update `hlo_metadatas__ptrs_` array from `hlo_metadatas_`. This needs to be + // called on `hlo_metadatas_` change(s). + void RefreshHloMetadatasPtrs(); + + std::vector may_modify_variables_; + std::vector tpu_program_fingerprints_; + + std::vector tpu_programs_; // Not owned. 
+ std::vector executable_infos_; + std::vector host_transfer_infos_; + + // To be consistent with the TpuProgramGroupInterface::hlo_metadatas() + // signature, we store HloProto values in hlo_metadatas_ when + // set_hlo_metadata(...) is called, and return their pointers from + // hlo_metadatas_ptrs_ when hlo_metadatas() is called. hlo_metadata_ptrs_ is + // refreshed whenever hlo_metadatas_ is set or the object is moved. + std::vector hlo_metadatas_; // Owned. + std::vector hlo_metadatas_ptrs_; + + TpuProgramGroup(const TpuProgramGroup&) = delete; + void operator=(const TpuProgramGroup&) = delete; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group_interface.h new file mode 100644 index 00000000..02e91f5b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_program_group_interface.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_INTERFACE_H_ + +#include +#include +#include +#include + +#include "absl/types/span.h" +#include "xla/service/hlo.pb.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_executable_info.pb.h" + +namespace tensorflow { +namespace tpu { + +// An interface to holds all the programs and metadatas generated by the +// compiler, including those for the sharding/unsharding programs. +class TpuProgramGroupInterface { + public: + virtual ~TpuProgramGroupInterface() = default; + + // Check if whether sharding/unsharding program exists. + virtual bool has_sharding_program() const = 0; + + // Computes program count. + virtual size_t program_count() const = 0; + + // Computes total program size. + virtual int64_t program_size() const = 0; + + // Unloads and destroys safely TPU programs. + virtual void UnloadAndDestroyPrograms() = 0; + + // Logs program memory summary. + virtual bool LogProgramMemorySummary() = 0; + + // Program fingerprints. + virtual const std::string& fingerprint(int index) const = 0; + + // Hlo metadatas. The pointers can only be used as long as the cache entry is + // referenced. + virtual absl::Span hlo_metadatas() const = 0; + + // Boolean array to indicate if the modification of variables are + // allowed. + virtual const std::vector& may_modify_variables_list() const = 0; + + // Gets may modify variables value of the TPU program for the given core + // `index`. 
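To make the shape of this interface concrete, here is a hedged consumer sketch that only uses the accessors declared above and immediately below; it assumes program_count() and the per-core index space line up, which is how the comments describe the API. The function name is invented for illustration.

    // Hypothetical: log a one-line summary per compiled TPU core program.
    void LogProgramGroupSummary(
        const tensorflow::tpu::TpuProgramGroupInterface& group) {
      for (size_t i = 0; i < group.program_count(); ++i) {
        const int index = static_cast<int>(i);
        LOG(INFO) << "core " << index
                  << " fingerprint=" << group.fingerprint(index)
                  << " may_modify_variables=" << group.may_modify_variables(index)
                  << " has_program=" << (group.tpu_program(index) != nullptr);
      }
    }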
+ virtual bool may_modify_variables(int index) const = 0; + + // Get Executable Info Proto + virtual const TPUExecutableInfoProto& executable_info(int index) const = 0; + + // Get HostTransferInfo Proto + virtual const TPUHostTransferInfoProto& host_transfer_info( + int index) const = 0; + + // Get XLA_TpuProgram Proto + virtual const XLA_TpuProgram* tpu_program(int index) const = 0; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_GROUP_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op.h new file mode 100644 index 00000000..1d6c7bab --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_H_ + +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" + +namespace tensorflow { + +// Op that changes the sharding state for a set of on-device variables. The +// sharding state is represented as the key of the compilation that generated +// the sharding/unsharding programs along with the main program. The op checks +// if the current sharding state matches the desired one, and if not, uses the +// sharding/unsharding programs to transform the variables to the desired state. +class TPUReshardVariablesOpKernel : public AsyncOpKernel { + public: + explicit TPUReshardVariablesOpKernel(OpKernelConstruction* context); + ~TPUReshardVariablesOpKernel() override; + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override; + + private: + absl::Status DoWork(OpKernelContext* context); + absl::Status DoTpuExecute(OpKernelContext* context, const Tensor& format_key, + tpu::CompilationCacheFetchTarget fetch_target); + + int64_t num_vars_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h new file mode 100644 index 00000000..c731cc10 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h @@ -0,0 +1,63 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_UTIL_H_ + +#include +#include + +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +namespace tpu { +namespace reshard_variables { + +absl::Status FlushProgramMemory(se::Platform* platform, int device_ordinal); + +absl::Status CheckIsValidKey(const Tensor& key); + +bool IsDefaultKey(const Tensor& key); + +absl::Status GetComputationCacheEntry( + const Tensor& key, string* rendezvous_key_base, + std::unique_ptr* entry, + tpu::CompilationCacheFetchTarget fetch_target); + +absl::StatusOr> BuildInputBuffers( + OpKernelContext* context, const std::vector& variables, + const xla::Shape& input_host_shape, xla::Backend* backend, + int device_ordinal, se::Stream* stream); + +absl::Status PerformCompaction(stream_executor::Stream* stream); + +absl::Status UpdateOutputVariables( + OpKernelContext* context, xla::ScopedShapedBuffer result_buffers, + absl::Span output_tensor_shape_protos, + xla::Backend* backend, se::Stream* stream, int device_ordinal, + const std::vector& variables, + const std::shared_ptr& definition_event); + +} // namespace reshard_variables +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_RESHARD_VARIABLES_OP_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_util.h new file mode 100644 index 00000000..0b0aedd4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/tpu_util.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_H_ + +#include +#include +#include + +#include "grpcpp/server_builder.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/compile_only_client.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" + +namespace tensorflow { +namespace tpu { + +// Utility to get session_name from `SessionMetadata`. `SessionMetadata` may +// be null. +std::string SessionNameFromMetadata(const SessionMetadata* session_metadata); + +// Generates cache proto key for a given computation on a TPU core. +std::string ProtoKeyForComputation(const std::string& key, int core); + +// Returns a TpuCompilationCacheKey parsed from given key or an error. +absl::StatusOr ParseCompilationCacheKey( + const std::string& key); + +xla::CompileOnlyClient::AotXlaComputationInstance +BuildAotXlaComputationInstance( + const XlaCompiler::CompilationResult& compilation_result); + +// Returns true if TPU compilation is enabled. +bool IsTpuCompilationEnabled(); + +// Converts an int64 host memory `tensor` to a `shape`. +absl::Status ShapeTensorToTensorShape(const Tensor& tensor, TensorShape* shape); + +absl::Status DynamicShapesToTensorShapes(const OpInputList& dynamic_shapes, + std::vector* shapes); +absl::Status DynamicShapesToTensorShapes(const InputList& dynamic_shapes, + std::vector* shapes); + +// Creates gRPC ServerBuilder. +absl::StatusOr> CreateServerBuilder( + int serving_port); +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/trace_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/trace_util.h new file mode 100644 index 00000000..e1c96233 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/trace_util.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TRACE_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TRACE_UTIL_H_ + +#ifdef PLATFORM_GOOGLE +#include "base/tracer.h" // IWYU pragma: export +#else +#undef TRACESTRING +#define TRACESTRING(x) +#undef TRACELITERAL +#define TRACELITERAL(x) +#endif + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TRACE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/transfer_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/transfer_ops.h new file mode 100644 index 00000000..3c12d22f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/kernels/transfer_ops.h @@ -0,0 +1,140 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_ + +#include +#include +#include + +#include "xla/literal.h" +#include "xla/stream_executor/stream_executor.h" +#include "xla/stream_executor/tpu/noncopyable_buffer.h" +#include "xla/stream_executor/tpu/tpu_platform_interface.h" +#include "xla/stream_executor/tpu/tpu_transfer_manager_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/threadpool.h" + +namespace tensorflow { + +class TpuTransferOpInterface { + public: + virtual ~TpuTransferOpInterface() = default; + virtual void Cancel() = 0; + virtual absl::StatusOr GetDeviceOrdinal(OpKernelContext* ctx) = 0; + + virtual absl::Status TransferBuffersToInfeed( + int device_ordinal, + const std::deque& buffers) = 0; + virtual absl::Status TransferLiteralToInfeed( + int device_ordinal, const xla::LiteralSlice& literal) = 0; + virtual absl::Status TransferLiteralFromOutfeed( + int device_ordinal, xla::MutableBorrowingLiteral literal) = 0; +}; + +// Base class providing common functionality for async ops that transfer from +// host to TPU. +class TpuTransferAsyncOpKernelBase : public AsyncOpKernel { + public: + explicit TpuTransferAsyncOpKernelBase( + OpKernelConstruction* ctx, const std::string& transfer_type, + int number_of_threads, + std::unique_ptr transfer_op); + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + protected: + virtual absl::Status DoWork(OpKernelContext* context, int device_ordinal) = 0; + + absl::Status RunTransferWithOrdinal(OpKernelContext* ctx, int device_ordinal); + std::string transfer_type_; + std::unique_ptr transfer_op_; + + private: + virtual absl::Status RunTransfer(OpKernelContext* ctx) = 0; + + std::unique_ptr thread_pool_; + mutex mu_; + + // TpuTransferAsyncOpKernelBase is neither copyable nor movable. + TpuTransferAsyncOpKernelBase(const TpuTransferAsyncOpKernelBase&) = delete; + TpuTransferAsyncOpKernelBase& operator=(const TpuTransferAsyncOpKernelBase&) = + delete; +}; + +class TpuTransferAsyncOpKernel : public TpuTransferAsyncOpKernelBase { + public: + explicit TpuTransferAsyncOpKernel( + OpKernelConstruction* ctx, const std::string& transfer_type, + int number_of_threads, + std::unique_ptr transfer_op); + + private: + absl::Status RunTransfer(OpKernelContext* ctx) override; + int device_ordinal_; + + // TpuTransferAsyncOpKernel is neither copyable nor movable. 
+ TpuTransferAsyncOpKernel(const TpuTransferAsyncOpKernel&) = delete; + TpuTransferAsyncOpKernel& operator=(const TpuTransferAsyncOpKernel&) = delete; +}; + +class TpuTransferAsyncDynamicOrdinalOpKernel + : public TpuTransferAsyncOpKernelBase { + public: + explicit TpuTransferAsyncDynamicOrdinalOpKernel( + OpKernelConstruction* ctx, const std::string& transfer_type, + int number_of_threads, + std::unique_ptr transfer_op); + + private: + absl::Status RunTransfer(OpKernelContext* ctx) override; + + // TpuTransferAsyncDynamicOpKernel is neither copyable nor movable. + TpuTransferAsyncDynamicOrdinalOpKernel( + const TpuTransferAsyncDynamicOrdinalOpKernel&) = delete; + TpuTransferAsyncDynamicOrdinalOpKernel& operator=( + const TpuTransferAsyncDynamicOrdinalOpKernel&) = delete; +}; + +class StreamExecutorTransferOpImpl : public TpuTransferOpInterface { + public: + explicit StreamExecutorTransferOpImpl(); + ~StreamExecutorTransferOpImpl() override = default; + void Cancel() override; + absl::StatusOr GetDeviceOrdinal(OpKernelContext* ctx) override; + + absl::Status TransferBuffersToInfeed( + int device_ordinal, + const std::deque& buffers) override; + absl::Status TransferLiteralToInfeed( + int device_ordinal, const xla::LiteralSlice& literal) override; + + absl::Status TransferLiteralFromOutfeed( + int device_ordinal, xla::MutableBorrowingLiteral literal) override; + + private: + absl::StatusOr GetStreamExecutor( + int device_ordinal); + xla::TpuTransferManagerInterface* transfer_manager_; + tpu::TpuPlatformInterface* tpu_platform_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_ops.h b/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_ops.h new file mode 100644 index 00000000..324f2b4e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_ops.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_OPS_H_ +#define TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_OPS_H_ + +#include +#include + +namespace tensorflow { +// Get the names of the LoadTPUEmbedding*Parameters ops. +std::vector GetPerTableLoadOptimizationParametersOps(); + +// Get the names of the RetrieveTPUEmbedding*Parameters ops. +std::vector GetPerTableRetrieveOptimizationParametersOps(); + +// Type enum of elements in deduplication data tuple. 
+enum DedupTupleElementType { + kInteger = 0, + kFloat = 1, +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_OPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_shape_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_shape_util.h new file mode 100644 index 00000000..1d1e9138 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/ops/tpu_embedding_shape_util.h @@ -0,0 +1,70 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_SHAPE_UTIL_H_ +#define TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_SHAPE_UTIL_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow { +namespace tpu { + +// Utility class for inferring TpuEmbedding shape information. +class TpuEmbeddingShapeUtil { + public: + // Compute the shape of one embedding table stored on the + // TpuEmbeddingEngine. The table descriptor from the TpuEmbedding + // configuration is supplied in config. On success, shape is populated with + // the shape of the embedding table that will be loaded or retrieved using + // Ops such as {Load,Retrieve}TpuEmbedding*Parameters. + static absl::Status ComputeOneTableShape(int64_t vocabulary_size, + int table_dimension, int shard_id, + int num_shards, + TensorShapeProto* shape); + + // Compute the shapes of the embedding tables stored on the + // TpuEmbeddingEngine. The TpuEmbedding configuration is supplied in + // config. On success, shapes is populated with the shape of each embedding + // table that will be loaded or retrieved using Ops such as + // {Load,Retrieve}AllTpuEmbeddingParameters. + static absl::Status ComputeTableShapes( + absl::Span vocabulary_sizes, + absl::Span table_dimensions, int shard_id, int num_shards, + std::vector* shapes); + + static absl::Status ComputeTableShapes( + const tensorflow::tpu::TPUEmbeddingConfiguration& config, int shard_id, + int num_shards, std::vector* shapes); + + static TensorShapeProto MakeEmpty2DShape(); + + private: + // Compute the number of embedding IDs per embedding table shard. + // There are as many shards as the number of hosts in the job. 
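The shard sizing described in the comment above is essentially a block partition of the vocabulary across hosts. The snippet below is illustrative only: it shows one plausible convention (ceiling division, with the final shard taking the remainder) and makes no claim about what the ComputeNumEmbeddingIdsPerShard helper declared just below actually returns; the function name is invented.

    #include <algorithm>
    #include <cstdint>

    // Illustrative block partition of vocabulary_size ids over num_shards.
    int64_t ApproxIdsPerShard(int64_t vocabulary_size, int shard_id,
                              int num_shards) {
      const int64_t per_shard = (vocabulary_size + num_shards - 1) / num_shards;
      const int64_t begin = static_cast<int64_t>(shard_id) * per_shard;
      const int64_t end = std::min(vocabulary_size, begin + per_shard);
      return std::max<int64_t>(0, end - begin);
    }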
+ static absl::StatusOr ComputeNumEmbeddingIdsPerShard( + int64_t vocabulary_size, int shard_id, int num_shards); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_OPS_TPU_EMBEDDING_SHAPE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile.h new file mode 100644 index 00000000..f606c7e5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_COMPILE_H_ +#define TENSORFLOW_CORE_TPU_TPU_COMPILE_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "xla/client/compile_only_client.h" +#include "xla/shape.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" + +namespace tensorflow { +namespace tpu { +namespace internal { + +// Performs shape inference on the body of `graph`. Shapes for arguments +// are taken from `metadata` and `arg_shapes`. +absl::Status RunShapeInferenceOnComputation( + const tpu::TPUCompileMetadataProto& metadata, + const std::vector& arg_shapes, Graph* graph, + FunctionLibraryRuntime* flr, GraphShapeInfo* shape_info); +} // namespace internal + +// Converts a TF Function into XLA HLO, stores generated HLO module and +// accompanying metadata in CompilationResult. +absl::Status CompileTFFunctionToHlo( + const FunctionLibraryDefinition& flib_def, int graph_def_version, + const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + const std::vector& arg_shapes, const DeviceType& device_type, + const GuaranteedConsts& guaranteed_constants, const NameAttrList& function, + const tpu::TPUCompileMetadataProto& metadata, + xla::CompileOnlyClient* client, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + bool use_tuple_args, XlaCompiler::CompilationResult* compilation_result); + +// Gets information regarding how input arguments are sharded across multiple +// cores. 
+absl::Status GetShardingInfo( + const tpu::TPUCompileMetadataProto& metadata, + absl::Span arg_shapes, + const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_COMPILE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile_interface.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile_interface.h new file mode 100644 index 00000000..a97e721b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_compile_interface.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_TPU_COMPILE_INTERFACE_H_ +#define TENSORFLOW_CORE_TPU_TPU_COMPILE_INTERFACE_H_ + +#include + +#include "absl/strings/string_view.h" + +// Some legacy code requires different implementations for operations like +// fingerprint/hashing during compilation and/or graph rewriting. These +// alternate implementations can be registered (via a module initializer) to +// change the default behavior. +class TpuCompileInterface { + public: + virtual ~TpuCompileInterface() {} + static TpuCompileInterface* Get(); + static bool RegisterImplementation(TpuCompileInterface* impl); + + virtual uint64_t FingerprintString(absl::string_view str) = 0; + + // Proto: tensorflow::tpu::CompilationResultProto + // Location: tensorflow/core/protobuf/tpu/compilation_result.proto + static inline constexpr char kTpuCompileErrorPayloadKey[] = + "type.googleapis.com/tensorflow.tpu.CompilationResultProto"; + + // Unique string added to the error message for permanent errors during + // XLA:TPU compilation. This can be used by TensorFlow models to distinguish + // compilation errors from transient errors created by TPU worker preemptions + // and restarts. + static inline constexpr char kTpuCompileErrorMessage[] = + "XLA:TPU compile permanent error"; +}; + +#endif // TENSORFLOW_CORE_TPU_TPU_COMPILE_INTERFACE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_configuration.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_configuration.h new file mode 100644 index 00000000..0fbdb0f3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_configuration.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
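TpuCompileInterface above is an explicit seam: its comment says an alternate fingerprint/hash implementation can be swapped in through a module initializer. The sketch below shows what such a registration could look like; the std::hash based fingerprint is a stand-in chosen for the example, not the hash used by the default implementation.

```cpp
#include <cstdint>
#include <functional>
#include <string>

#include "absl/strings/string_view.h"
#include "tensorflow/core/tpu/tpu_compile_interface.h"

namespace {

// Alternate implementation that fingerprints with std::hash. Purely
// illustrative; any deterministic 64-bit hash could be plugged in here.
class StdHashCompileInterface : public TpuCompileInterface {
 public:
  uint64_t FingerprintString(absl::string_view str) override {
    return std::hash<std::string>{}(std::string(str));
  }
};

// Module initializer: installs the alternate implementation at load time.
[[maybe_unused]] const bool kRegistered =
    TpuCompileInterface::RegisterImplementation(new StdHashCompileInterface());

}  // namespace
```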
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_CONFIGURATION_H_ +#define TENSORFLOW_CORE_TPU_TPU_CONFIGURATION_H_ + +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +void MaybeInitializeTPUSystemForTests(); + +// Returns a process-wide global ResourceMgr. +ResourceMgr* GetTPUConfigResourceMgr(bool initialize_first = true); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_CONFIGURATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_defs.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_defs.h new file mode 100644 index 00000000..b5c36680 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_defs.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Common definitions related to TPUs. + +#ifndef TENSORFLOW_CORE_TPU_TPU_DEFS_H_ +#define TENSORFLOW_CORE_TPU_TPU_DEFS_H_ + +#include + +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +// Name of the TPU device, which corresponds to a single core. +extern const char* const DEVICE_TPU_NODE; // "TPU"; + +// The TPU_REPLICATED_CORE device is a virtual device corresponding to one core +// of a replicated TPU computation. Only valid within the body of a +// TPUReplicate computation. +extern const char* const DEVICE_TPU_REPLICATED_CORE; + +// DEVICE_TPU_SYSTEM is now defined in tensorflow/core/framework/types.h/.cc + +// Name of the XLA_TPU_JIT compilation device, which is an internal device to +// compile graphs for TPU. Not registered as a device; no operators can be +// assigned to this device by a user. +extern const char* const DEVICE_TPU_XLA_JIT; // "XLA_TPU_JIT"; + +// Attribute used internally to pass "is_mirrored_variable" attribute on +// TPUReplicatedInput nodes to _TPUReplicate. +extern const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR; + +// Attribute used internally to annotate ops which might consume TPU FastMem +// variable. +extern const char* const TPU_FAST_MEM_ATTR; // "_TPU_FAST_MEM" + +extern const char* const kTPUReplicateAttr; +extern const char* const kOutsideCompilationAttr; + +// Supported types for TPUs. 
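GetTPUConfigResourceMgr above hands back a process-wide ResourceMgr, which is how TPU configuration state gets shared across sessions in one process. Below is a hedged sketch of stashing and re-fetching a resource in it; it assumes the standard ResourceMgr::LookupOrCreate API from resource_mgr.h, and the CounterResource type plus the container/name strings are invented for the example.

```cpp
#include <cstdint>
#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/tpu/tpu_configuration.h"

namespace tensorflow {

// A trivial refcounted resource used only for illustration.
class CounterResource : public ResourceBase {
 public:
  std::string DebugString() const override { return "CounterResource"; }
  int64_t value = 0;
};

absl::Status BumpSharedCounter() {
  ResourceMgr* rmgr = GetTPUConfigResourceMgr();

  CounterResource* counter = nullptr;
  TF_RETURN_IF_ERROR(rmgr->LookupOrCreate<CounterResource>(
      "example_container", "shared_counter", &counter,
      [](CounterResource** out) {
        *out = new CounterResource();
        return absl::OkStatus();
      }));

  ++counter->value;
  counter->Unref();  // LookupOrCreate hands back an owned reference.
  return absl::OkStatus();
}

}  // namespace tensorflow
```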
+inline constexpr std::array kTpuAllTypes = { + {DT_INT32, DT_UINT32, DT_FLOAT8_E4M3FN, DT_FLOAT8_E5M2, DT_HALF, + DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL, DT_COMPLEX64, + DT_INT64, DT_UINT64, DT_QINT8, DT_QUINT8, DT_QINT32, + DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT4, + DT_UINT4}}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_DEFS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_proto_rewrite.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_proto_rewrite.h new file mode 100644 index 00000000..063f6668 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_proto_rewrite.h @@ -0,0 +1,44 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_PROTO_REWRITE_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_PROTO_REWRITE_H_ + +#include "absl/status/status.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow { + +// Validates the TPU embedding configuration has been populated correctly and +// fills in missing fields. The user model is expected to fill in exactly one of +// the following: +// +// (1) batch_size_per_tensor_core and TableDescriptor.num_features, or +// (2) feature_descriptor. +// +// (1) If the user model fills in batch_size_per_tensor_core and +// TableDescriptor.num_features, this function validates that the +// feature_descriptor has not been filled in, and then populates +// feature_descriptor with appropriate values. +// +// (2) If the user model fills in feature_descriptor, this function validates +// that batch_size_per_tensor_core and TableDescriptor.num_features have not +// been filled in, and then populated them with appropriate values. +absl::Status PopulateMissingFieldsInTPUEmbeddingConfig( + tpu::TPUEmbeddingConfiguration* config); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_PROTO_REWRITE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_utils.h new file mode 100644 index 00000000..3ac55d17 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_configuration_utils.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
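PopulateMissingFieldsInTPUEmbeddingConfig, declared in the header above, fills in whichever half of the batch/feature description the user model left out. The sketch below walks case (1): batch_size_per_tensor_core and TableDescriptor.num_features are set and feature_descriptor is derived. The name/vocabulary_size/dimension setters are assumed fields of the TPUEmbeddingConfiguration proto and are not spelled out in the header text above.

```cpp
#include <iostream>

#include "absl/status/status.h"
#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
#include "tensorflow/core/tpu/tpu_embedding_configuration_proto_rewrite.h"

int main() {
  tensorflow::tpu::TPUEmbeddingConfiguration config;

  // Case (1): describe the batch size and per-table feature counts only.
  config.set_batch_size_per_tensor_core(128);
  auto* table = config.add_table_descriptor();
  table->set_name("watched_videos");    // assumed field
  table->set_vocabulary_size(1 << 20);  // assumed field
  table->set_dimension(64);             // assumed field
  table->set_num_features(3);

  // The rewrite validates the config and derives feature_descriptor entries.
  const absl::Status status =
      tensorflow::PopulateMissingFieldsInTPUEmbeddingConfig(&config);
  if (!status.ok()) {
    std::cerr << status << "\n";
    return 1;
  }
  std::cout << "feature descriptors: " << config.feature_descriptor_size()
            << "\n";
}
```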
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_UTILS_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_UTILS_H_ + +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow { +namespace tpu { + +// Returns the total number of unique dynamic input tags used in optimizers. If +// the tag specific is erroneous, returns an invalid argument error. For correct +// tag specification, see the comment next to the OptimizerDynamicInput proto in +// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto. +absl::StatusOr ComputeTotalTagCountForOptimizerDynamicInputs( + const tensorflow::tpu::TPUEmbeddingConfiguration& tpu_embedding_config); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_CONFIGURATION_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_errors.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_errors.h new file mode 100644 index 00000000..42a91124 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_errors.h @@ -0,0 +1,70 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_ERRORS_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_ERRORS_H_ + +#include + +#include "absl/strings/cord.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow::tpu { + +// The payload URL for TPU embedding initialization permanent errors. +constexpr absl::string_view kTpuEmbeddingErrorUrl = + "type.googleapis.com/tensorflow.tpu.TPUEmbeddingError"; + +constexpr absl::string_view kTpuEmbeddingErrorMessage = + "TPUEmbedding permanent error"; + +// Appends a payload of type tensorflow::tpu::kTpuEmbeddingErrorUrl to the +// tensorflow::Status obj if the status is NOT OK. Returns the +// tensorflow::Status obj unchanged if the status is OK. +absl::Status AppendTpuEmbeddingErrorPayload(absl::Status obj); + +// Appends a payload of type tensorflow::tpu::kTpuEmbeddingErrorUrl to the +// tensorflow::Status obj if the status is NOT OK. Returns obj.value() if the +// status is OK. +template +StatusOr AppendTpuEmbeddingErrorPayload(StatusOr obj) { + if (obj.ok()) { + return std::move(obj.value()); + } else { + const std::string error_message = + absl::StrCat(kTpuEmbeddingErrorMessage, ". 
", obj.status().message()); + absl::Status status(obj.status().code(), error_message); + TPUEmbeddingError error_payload; + status.SetPayload(kTpuEmbeddingErrorUrl, + absl::Cord(error_payload.SerializeAsString())); + return status; + } +} + +// Returns true if the tensorflow::Status obj has a payload of type +// tensorflow::tpu::kTpuEmbeddingErrorUrl. +bool HasTpuEmbeddingErrorPayload(const absl::Status& status); + +// Returns true if the tensorflow::Status obj error message contains +// tensorflow::tpu::kTpuEmbeddingErrorMessage as a substring. +bool HasTpuEmbeddingErrorMessage(const absl::Status& status); + +} // namespace tensorflow::tpu + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_ERRORS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h new file mode 100644 index 00000000..43643fbd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h @@ -0,0 +1,136 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_ + +#include +#include +#include + +#include "absl/base/casts.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/protobuf/tpu/optimization_parameters.pb.h" + +namespace tensorflow { +namespace tpu { + +using OptimizationAlgorithm = OptimizationParameters::ParametersCase; + +// Returns the name of the optimization algorithm. +std::string GetOptimizationAlgorithmName(OptimizationAlgorithm alg); + +// Returns a user-friendly name for the optimization algorithm. +std::string GetOptimizationAlgorithmFriendlyName(OptimizationAlgorithm alg); + +// Returns all supported optimization algorithms. +std::vector GetOptimizationAlgorithms(); + +enum class GradientAccumulationSupport { + // Accumulation cannot be used with this optimizer. + kNotSupported, + + // Accumulation is allowed and changes optimizer behavior. + kSupported, +}; + +// Returns the number of optimization parameter vectors used by the optimization +// algorithm, excluding the weights themselves and assuming no gradient +// accumulation. +absl::Status GetBaseAuxiliaryParameterCount( + const OptimizationParameters ¶ms, int *count); + +// Returns whether (and how) an optimization algorithm supports gradient +// accumulation. +absl::Status GetGradientAccumulationSupport( + const OptimizationParameters ¶ms, GradientAccumulationSupport *support); + +// Returns whether both the given set of optimization parameters has gradient +// accumulation turned on and that the algorithm used supports it or should +// ignore that setting. 
Returns an error if gradient accumulation is enabled and +// the algorithm does not support it. +absl::Status UseGradientAccumulation(const OptimizationParameters ¶ms, + bool *use_gradient_accumulation); + +// Returns the parameter specifications for the optimization algorithm (the main +// parameters first, followed by any auxiliary parameters such as Adagrad +// accumulators). +absl::Status GetOptimizationAlgorithmStateVariables( + const OptimizationParameters ¶ms, + std::vector *state_variables); + +// Returns the set of dynamic input tags used by the optimization algorithm. +// This includes both dynamic learning rates and other hyperparameters (e.g., +// step counters for the frequency aware Adagrad optimizer). +absl::flat_hash_set GetOptimizerDynamicInputTags( + const OptimizationParameters ¶ms); + +// Returns the set of dynamic hyperparameter tags used by the optimization +// algorithm. This includes other hyperparameters used by the optimization +// algorithm (e.g., step counters for the frequency aware Adagrad optimizer). It +// excludes the dynamic learning rate tag. +absl::flat_hash_set GetOptimizerHyperParameterTags( + const OptimizationParameters ¶ms); + +// Returns true if the optimization algorithm uses dynamic inputs in its +// computation. +bool UsesDynamicInputsInOptimizer(const OptimizationParameters ¶ms); + +// Maximum value of auxiliary_parametery_count for any optimization algorithm. +// This count is used by TPU embedding load/retrieve and needs to be independent +// of any particular TPU version and hence, we take the maximum across all TPU +// versions. +static constexpr int kMaxAuxiliaryParameterCount = 7; + +// Fill value for gradient accumulators. This is a denormal so that it will be +// flushed to zero on the current TPU platforms and needs to continue to have +// the following properties in the future: +// +// 1. Does not have the same bit pattern as a zero and can be distinguished from +// it using integer operations. +// 2. Treated as zero by floating-point arithmetic operations (at least addition +// and subtraction). +// 3. Cannot be produced by any floating-point arithmetic operation, including +// those involving itself. +// +// It does not need to compare equal or not equal to zero in floating point. We +// need to use a non-zero value here because some optimization algorithms are +// not no-ops on zero gradients, so we need to distinguish an accumulated +// gradient of zero from one that has been cleared after its gradients have +// already been applied to the parameters and accumulators. +inline float GradientAccumulatorInitialValue() { + return absl::bit_cast(1); +} + +// Generic shape function for per-optimization-algorithm load ops. +class LoadOpShapeFunction { + public: + // Computes resulting shape and does parameter checking. + absl::Status operator()(shape_inference::InferenceContext *c) const; +}; + +// Generic shape function for per-optimization-algorithm retrieve ops. +class RetrieveOpShapeFunction { + public: + // Computes resulting shape and does parameter checking. 
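GradientAccumulatorInitialValue above relies on absl::bit_cast<float>(1) being the smallest positive denormal: TPU arithmetic flushes it to zero, yet its bit pattern still distinguishes a freshly initialized accumulator from one holding a true zero. The snippet below demonstrates only the bit-level property; ordinary CPU arithmetic does not necessarily flush denormals, so the flush-to-zero behavior is described but not asserted.

```cpp
#include <cstdint>
#include <iostream>

#include "absl/base/casts.h"

int main() {
  const float fill = absl::bit_cast<float>(1);  // smallest positive denormal
  const float zero = 0.0f;

  // Property 1: the bit patterns differ, so integer comparisons can tell the
  // fill value apart from a real zero even though it "acts like" zero.
  const uint32_t fill_bits = absl::bit_cast<uint32_t>(fill);  // == 1
  const uint32_t zero_bits = absl::bit_cast<uint32_t>(zero);  // == 0
  std::cout << std::boolalpha << (fill_bits != zero_bits) << "\n";  // true

  // Property 2 (on TPU): with flush-to-zero arithmetic, grad + fill == grad,
  // so the sentinel never perturbs accumulated gradients. On a host CPU this
  // depends on the FPU's denormal handling, so it is not checked here.
  std::cout << fill << "\n";  // ~1.4e-45 where denormals are preserved
}
```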
+ absl::Status operator()(shape_inference::InferenceContext *c) const; +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h new file mode 100644 index 00000000..f05b774b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_ + +#include + +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h" + +namespace tensorflow { +namespace tpu { + +// Computes the shape of the output tensors from an embedding configuration. +absl::Status ComputeOutputTensorShapes( + const tensorflow::tpu::TPUEmbeddingConfiguration& config, + std::vector* shapes); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_spmd_sharding_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_spmd_sharding_utils.h new file mode 100644 index 00000000..957de62b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_embedding_spmd_sharding_utils.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_SPMD_SHARDING_UTILS_H_ +#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_SPMD_SHARDING_UTILS_H_ + +#include "xla/hlo/builder/xla_builder.h" +#include "xla/shape.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/platform/statusor.h" + +namespace tensorflow { +namespace tpu { + +// Gets SPMD manual sharding annotation from the input shape. If the shape is a +// scalar (rank = 0), the tensor is replicated across all the cores within the +// replica. 
If the shape is a non-scalar (rank >= 1), the tensor is sharded on +// dimension `0' across all the cores within the same replica. +absl::StatusOr SpmdShardingAnnotationOnFirstDim( + const xla::Shape& shape, int core_count_per_replica, + xla::XlaBuilder* builder); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_SPMD_SHARDING_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_execute.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_execute.h new file mode 100644 index 00000000..bfa177d6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_execute.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_EXECUTE_H_ +#define TENSORFLOW_CORE_TPU_TPU_EXECUTE_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "xla/service/computation_placer.h" +#include "xla/service/executable.h" +#include "xla/service/hlo.pb.h" +#include "xla/stream_executor/stream.h" +#include "xla/stream_executor/tpu/tpu_node_context.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/tpu/kernels/tpu_executable_info.pb.h" + +namespace tensorflow { + +// Runs a TPU executable. `input_allocations` and `output_allocations` are +// non-owning pointers to the root buffers of each argument/result tuple. +// `output_shape` is the output shape of the XLA computation from which +// `program` was derived. If `session_module` is not nullptr, it will be filled +// with the input and output literals of the execution. +absl::StatusOr TPUExecute( + const TPUExecutableInfoProto& executable, + const TPUHostTransferInfoProto& host_transfers, + const xla::HloProto& hlo_metadata, + std::vector arguments, + const std::string& rendezvous_key_base, uint32 rng_seed, + tpu::TpuNodeContext* node_context, xla::DeviceAssignment* device_assignment, + CancellationManager* cancellation_manager, OpKernelContext* ctx, + stream_executor::Stream* stream, + stream_executor::Stream* host_to_device_stream, + const XLA_TpuProgram* tpu_program); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_EXECUTE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_fingerprint_utils.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_fingerprint_utils.h new file mode 100644 index 00000000..c7cb99db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_fingerprint_utils.h @@ -0,0 +1,30 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_FINGERPRINT_UTILS_H_ +#define TENSORFLOW_CORE_TPU_TPU_FINGERPRINT_UTILS_H_ + +#include + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +// Computes a fingerprint of the contents of `library`. +absl::Status FingerprintFunctionLibrary( + const FunctionLibraryDefinition& library, uint64_t& fingerprint); +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_FINGERPRINT_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_global_init.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_global_init.h new file mode 100644 index 00000000..4d6dd064 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_global_init.h @@ -0,0 +1,78 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_ +#define TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/tpu/topology.pb.h" + +namespace tensorflow { + +// Initializes the TPU system globally. The state of initialization can then be +// shared by different sessions running on these TPUs, on the same process. This +// API is provided for multi-tenant usecases where multiple sessions in a +// process are using the same set of TPUs. +// +// Returns status errors if initialization is unsuccessful and returns the TPU +// TopologyProto as an output parameter. +// +// REQUIRES: +// * Call this API before any sessions using TPUs are run. +// * If you are using this API for initialization, please don't use the TPU +// configuration ops within your graph. This will cause errors to be returned +// from the API which is called second. +// +// DISTRIBUTED SETUP: +// To properly initialize a TPU topology that is beyond donut level, caller is +// required to provide correct following arguments: +// +// 1. job_name +// The name of the job under distributed settings. For example, if the job is +// '/job:tpu_worker/replica:0/task:0/...', the "tpu_worker" is the desired +// job_name here. +// +// 2. session_target +// The target string that will be used to create a Session and run the +// distributed TPU initialization graph. Generally this would be the master +// session from the cluster. 
+// +// 3.device_set +// The GLOBAL set of devices in the distributed setting, including proper +// "TPU_SYSTEM" devices across all tasks. +// For example, device_set should contain two "TPU_SYSTEM" devices on 2 tasks +// for a 4x2 (2 TPU workers) setup, and other non "TPU_SYSTEM" devices. +absl::Status InitializeTPUSystemGlobally(absl::string_view job_name, + absl::string_view session_target, + const DeviceSet& device_set, Env* env, + tpu::TopologyProto* tpu_topology); + +absl::Status InitializeTPUSystemGlobally(Env* env, + tpu::TopologyProto* tpu_topology); + +absl::Status InitializeTPUSystemGlobally(); + +} // namespace tensorflow + +// Many clients rely on ADL to lookup InitializeTPUSystemGlobally, now that Env +// moved to namespace tsl they are all broken without these forwarding +// declarations. +namespace tsl { +using tensorflow::InitializeTPUSystemGlobally; // NOLINT +} + +#endif // TENSORFLOW_CORE_TPU_TPU_GLOBAL_INIT_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_init_mode.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_init_mode.h new file mode 100644 index 00000000..0f8ad389 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_init_mode.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ +#define TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +enum class TPUInitMode : int { kNone, kGlobal, kRegular }; + +// Sets the TPU initialization mode appropriately. +// +// Requires that mode is not kNone, and mode doesn't transition kGlobal +// <-> kRegular. +// +// IMPLEMENTATION DETAILS: +// Used internally to record the current mode and type of API used for TPU +// initialization in a global static variable. +absl::Status SetTPUInitMode(TPUInitMode mode); + +// Returns the current TPUInitMode. +TPUInitMode GetTPUInitMode(); + +namespace test { + +// Forces the tpu init mode to be changed. +void ForceSetTPUInitMode(TPUInitMode mode); + +} // namespace test + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_model_server_initializer.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_model_server_initializer.h new file mode 100644 index 00000000..7ebafaea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_model_server_initializer.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
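For the single-host case, the Env-only overload above is sufficient; the distributed arguments (job_name, session_target, device_set) only matter beyond one worker. A hedged sketch of the simple path follows. The TopologyProto accessors used below (num_tasks, num_tpu_devices_per_task) are assumed from the topology proto and are not defined in this header.

```cpp
#include <iostream>

#include "absl/status/status.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/protobuf/tpu/topology.pb.h"
#include "tensorflow/core/tpu/tpu_global_init.h"

int main() {
  tensorflow::tpu::TopologyProto topology;

  // Single-process initialization: run before any TPU session, and do not
  // also use the TPU configuration ops in the graph, per the comment above.
  const absl::Status status = tensorflow::InitializeTPUSystemGlobally(
      tensorflow::Env::Default(), &topology);
  if (!status.ok()) {
    std::cerr << "TPU init failed: " << status << "\n";
    return 1;
  }

  std::cout << "tasks: " << topology.num_tasks()                    // assumed
            << ", TPUs per task: " << topology.num_tpu_devices_per_task()
            << "\n";
}
```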
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_MODEL_SERVER_INITIALIZER_H_ +#define TENSORFLOW_CORE_TPU_TPU_MODEL_SERVER_INITIALIZER_H_ + +#include "xla/stream_executor/tpu/libtftpu.h" +#include "xla/stream_executor/tpu/tpu_executor_c_api.h" +#include "xla/stream_executor/tpu/tpu_ops_c_api.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tpu {} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_MODEL_SERVER_INITIALIZER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_node_device_util.h b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_node_device_util.h new file mode 100644 index 00000000..b784727f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/tpu_node_device_util.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_UTIL_H_ +#define TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_UTIL_H_ + +#include "tensorflow/core/framework/kernel_def.pb.h" + +namespace tensorflow { + +// This is a BackendOpFilter. (see tensorflow/compiler/tf2xla/xla_op_registry.h) +// It returns true if the op should be registered on the device, it may +// optionally modify the KernelDef. +bool TpuOpFilter(KernelDef* kdef); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/tpu/virtual_device.h b/third_party/tflite-hdrs/tensorflow/core/tpu/virtual_device.h new file mode 100644 index 00000000..08233ece --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/tpu/virtual_device.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
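TpuOpFilter above is a BackendOpFilter in the sense of tf2xla's xla_op_registry.h, so the usual wiring is a backend registration that pairs the filter with the compilation device name and its supported types. The sketch below assumes the REGISTER_XLA_BACKEND macro from xla_op_registry.h together with the DEVICE_TPU_XLA_JIT and kTpuAllTypes constants from tpu_defs.h earlier in this patch; treat it as an illustration of the hookup, not as the exact registration this repository performs.

```cpp
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/core/tpu/tpu_defs.h"
#include "tensorflow/core/tpu/tpu_node_device_util.h"

namespace tensorflow {
namespace {

// Registers the XLA_TPU_JIT backend: ops are considered for this compilation
// device only if TpuOpFilter approves (and possibly rewrites) their KernelDef.
REGISTER_XLA_BACKEND(DEVICE_TPU_XLA_JIT, kTpuAllTypes, TpuOpFilter);

}  // namespace
}  // namespace tensorflow
```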
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_VIRTUAL_DEVICE_H_ +#define TENSORFLOW_CORE_TPU_VIRTUAL_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" + +namespace tensorflow { + +// A dummy device that exists primarily for operator placement, without +// corresponding directly to a piece of hardware. +class VirtualDevice : public Device { + public: + VirtualDevice(Env* env, const DeviceAttributes& device_attributes); + + absl::Status Sync() override; + Allocator* GetAllocator(AllocatorAttributes attr) override; + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + const AllocatorAttributes alloc_attrs, + Tensor* tensor) override; + absl::Status TryGetDeviceContext(DeviceContext** out_context) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_VIRTUAL_DEVICE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/cf_sink/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/cf_sink/pass.h new file mode 100644 index 00000000..6db168c5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/cf_sink/pass.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_CF_SINK_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_CF_SINK_PASS_H_ + +#include + +namespace mlir { +class Pass; + +namespace tfg { + +std::unique_ptr CreateControlFlowSinkPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_CF_SINK_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/consolidate_attrs/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/consolidate_attrs/pass.h new file mode 100644 index 00000000..e945c18a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/consolidate_attrs/pass.h @@ -0,0 +1,29 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_CONSOLIDATE_ATTRS_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_CONSOLIDATE_ATTRS_PASS_H_ + +#include + +namespace mlir { +class Pass; +namespace tfg { +std::unique_ptr CreateConsolidateAttributesPass(); +std::unique_ptr CreatePrepareAttributesForExportPass(); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_CONSOLIDATE_ATTRS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/const_dedupe_hoist/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/const_dedupe_hoist/pass.h new file mode 100644 index 00000000..4b9b8133 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/const_dedupe_hoist/pass.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_CONST_DEDUPE_HOIST_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_CONST_DEDUPE_HOIST_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_DEDUPEANDHOISTCONSTANT +#include "tensorflow/core/transforms/passes.h.inc" + +std::unique_ptr CreateDedupeAndHoistConstantPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_CONST_DEDUPE_HOIST_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/constant_folding/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/constant_folding/pass.h new file mode 100644 index 00000000..99603d4f --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/constant_folding/pass.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TRANSFORMS_CONSTANT_FOLDING_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_CONSTANT_FOLDING_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_CONSTANTFOLDINGPASS +#include "tensorflow/core/transforms/passes.h.inc" + +// Create a constant folding pass. 
+std::unique_ptr CreateConstantFoldingPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_CONSTANT_FOLDING_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/cse/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/cse/pass.h new file mode 100644 index 00000000..65f1d24b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/cse/pass.h @@ -0,0 +1,29 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_CSE_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_CSE_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { +std::unique_ptr CreateCSEPass(); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_CSE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/drop_unregistered_attribute/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/drop_unregistered_attribute/pass.h new file mode 100644 index 00000000..b0b46bca --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/drop_unregistered_attribute/pass.h @@ -0,0 +1,34 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_DROP_UNREGISTERED_ATTRIBUTE_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_DROP_UNREGISTERED_ATTRIBUTE_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_DROPOUTPUTSHAPESATTR +#include "tensorflow/core/transforms/passes.h.inc" + +std::unique_ptr CreateDropOutputShapesAttrPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_DROP_UNREGISTERED_ATTRIBUTE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.h new file mode 100644 index 00000000..186ab2d5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
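The pass constructors declared in these headers (CreateConstantFoldingPass, CreateCSEPass, CreateDropOutputShapesAttrPass, and the others) are ordinary MLIR pass factories, so they compose through a standard mlir::PassManager. A sketch of a small TFG cleanup pipeline follows; it assumes the module already holds tfg dialect IR and that the factories return std::unique_ptr<mlir::Pass>, which the stripped template arguments above do not show.

```cpp
#include "mlir/IR/BuiltinOps.h"          // from @llvm-project
#include "mlir/Pass/PassManager.h"       // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/core/transforms/constant_folding/pass.h"
#include "tensorflow/core/transforms/cse/pass.h"
#include "tensorflow/core/transforms/drop_unregistered_attribute/pass.h"

// Runs constant folding, CSE, and output-shapes-attr dropping on `module`.
mlir::LogicalResult RunTfgCleanup(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::tfg::CreateConstantFoldingPass());
  pm.addPass(mlir::tfg::CreateCSEPass());
  pm.addPass(mlir::tfg::CreateDropOutputShapesAttrPass());
  return pm.run(module);
}
```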
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the constructor for the eliminate passthrough iteration +// arguments pass. + +#ifndef TENSORFLOW_CORE_TRANSFORMS_ELIMINATE_PASSTHROUGH_ITER_ARGS_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_ELIMINATE_PASSTHROUGH_ITER_ARGS_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { +// Creates a pass that eliminates passthrough iteration arguments from +// region-based loop operations. +std::unique_ptr CreateEliminatePassthroughIterArgsPass(); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_ELIMINATE_PASSTHROUGH_ITER_ARGS_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/func_to_graph.h b/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/func_to_graph.h new file mode 100644 index 00000000..5cab621b --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/func_to_graph.h @@ -0,0 +1,33 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_FUNC_TO_GRAPH_H_ +#define TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_FUNC_TO_GRAPH_H_ + +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace tfg { + +// Lowers a lifted graph func back to the graph. The uses of function arguments +// will be replaced with the associated value according to +// `tfg.lifted_value_attr` attribute. +absl::Status FuncToGraph(GraphFuncOp func); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_FUNC_TO_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/pass.h new file mode 100644 index 00000000..498aabc6 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/func_to_graph/pass.h @@ -0,0 +1,33 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +// Creates a pass which turns the function to a graph. Note that only the +// function which is lifted from graph is valid. +std::unique_ptr CreateFuncToGraphPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_FUNC_TO_GRAPH_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/impl.h b/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/impl.h new file mode 100644 index 00000000..023843d4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/impl.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_IMPL_H_ +#define TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_IMPL_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +void PopulateFunctionalToRegionPatterns(RewritePatternSet &patterns, + SymbolTable &table); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/pass.h new file mode 100644 index 00000000..362f11b8 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/functional_to_region/pass.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_PASS_H_ + +#include + +namespace mlir { +class Pass; + +namespace tfg { + +std::unique_ptr CreateFunctionalToRegionPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_FUNCTIONAL_TO_REGION_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/graph_compactor/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_compactor/pass.h new file mode 100644 index 00000000..6767bb59 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_compactor/pass.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_GRAPH_COMPACTOR_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_GRAPH_COMPACTOR_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { +std::unique_ptr CreateNameCompressPass(); +std::unique_ptr CreateStripDefaultAttrsPass(); +std::unique_ptr CreateAddDefaultAttrsPass(); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_GRAPH_COMPACTOR_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/graph_to_func.h b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/graph_to_func.h new file mode 100644 index 00000000..94723c96 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/graph_to_func.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_GRAPH_TO_FUNC_H_ +#define TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_GRAPH_TO_FUNC_H_ + +#include + +#include "tensorflow/core/ir/ops.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace tfg { + +// Lifts a graph into a function, using the provided array of `feeds` for +// function arguments, `fetches` for function returned values, and +// `control_rets` for returned control values. The Graph op is replaced in-place +// by a GraphFuncOp with a name defined in the dialect. 
+absl::Status GraphToFunc(GraphOp graph, ArrayRef feeds, + ArrayRef fetches, ArrayRef control_rets); + +// Lifts a graph into a function, using the provided array of `feeds` for +// function arguments, `fetches` for function returned values, and +// `control_rets` for returned control values. The Graph op is replaced in-place +// by a GraphFuncOp with a name defined in the dialect. +absl::Status GraphToFunc(GraphOp graph, ArrayRef feeds_names, + ArrayRef fetches_names, + ArrayRef control_rets); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_GRAPH_TO_FUNC_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/pass.h new file mode 100644 index 00000000..798f5c95 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_to_func/pass.h @@ -0,0 +1,41 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_PASS_H_ + +#include +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_GRAPHTOFUNC +#include "tensorflow/core/transforms/passes.h.inc" + +// Returns a pass that runs on a Module and expects to find a single GraphOp +// to transform into a function. The provided feeds and fetches are used to form +// the function arguments and returned values. +std::unique_ptr CreateGraphToFuncPass( + ArrayRef feeds = {}, ArrayRef fetches = {}, + ArrayRef control_rets = {}); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_GRAPH_TO_FUNC_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/graph_transform_wrapper.h b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_transform_wrapper.h new file mode 100644 index 00000000..030f428c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/graph_transform_wrapper.h @@ -0,0 +1,46 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_GRAPH_TRANSFORM_WRAPPER_H_ +#define TENSORFLOW_CORE_TRANSFORMS_GRAPH_TRANSFORM_WRAPPER_H_ + +#include +#include + +#include "llvm/ADT/STLFunctionalExtras.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace tfg { + +// Runs a sequence of passes over Graph* and attached function library. The +// Graph* is converted to TFG, provided passes executed and the passed in Graph* +// replaced. If the pass fails, then graph is not modified. +// +// This is meant for simple interop where there is a Graph* currently. Passes +// created here are constrained to run on Module ops. +absl::Status RunTransformOnGraph( + tensorflow::Graph* graph, + const std::initializer_list< + llvm::function_ref()>>& passes, + const tensorflow::GraphDebugInfo& debug_info = {}); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_GRAPH_TRANSFORM_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/legacy_call/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/legacy_call/pass.h new file mode 100644 index 00000000..95faae4d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/legacy_call/pass.h @@ -0,0 +1,29 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_LEGACY_CALL_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_LEGACY_CALL_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { +std::unique_ptr CreateLiftLegacyCallPass(); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_LEGACY_CALL_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/pass_registration.h b/third_party/tflite-hdrs/tensorflow/core/transforms/pass_registration.h new file mode 100644 index 00000000..55aa88ed --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/pass_registration.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_PASS_REGISTRATION_H_ +#define TENSORFLOW_CORE_TRANSFORMS_PASS_REGISTRATION_H_ + +#include + +#include "tensorflow/core/transforms/cf_sink/pass.h" +#include "tensorflow/core/transforms/consolidate_attrs/pass.h" +#include "tensorflow/core/transforms/const_dedupe_hoist/pass.h" +#include "tensorflow/core/transforms/constant_folding/pass.h" +#include "tensorflow/core/transforms/cse/pass.h" +#include "tensorflow/core/transforms/drop_unregistered_attribute/pass.h" +#include "tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.h" +#include "tensorflow/core/transforms/func_to_graph/pass.h" +#include "tensorflow/core/transforms/functional_to_region/pass.h" +#include "tensorflow/core/transforms/graph_compactor/pass.h" +#include "tensorflow/core/transforms/graph_to_func/pass.h" +#include "tensorflow/core/transforms/legacy_call/pass.h" +#include "tensorflow/core/transforms/region_to_functional/pass.h" +#include "tensorflow/core/transforms/remapper/pass.h" +#include "tensorflow/core/transforms/shape_inference/pass.h" +#include "tensorflow/core/transforms/toposort/pass.h" + +namespace mlir { +namespace tfg { + +// Generate the code for registering passes for command-line parsing. +#define GEN_PASS_REGISTRATION +#include "tensorflow/core/transforms/passes.h.inc" + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_PASS_REGISTRATION_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/impl.h b/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/impl.h new file mode 100644 index 00000000..77c3ec91 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/impl.h @@ -0,0 +1,34 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_IMPL_H_ +#define TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_IMPL_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +// Populate the patterns to convert region ops to functional ops. Please refer +// to `tfg-region-to-functional` pass description. 
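// Illustrative sketch (not part of the vendored headers): how pattern
// population entry points such as PopulateRegionToFunctionalPatterns below
// are typically consumed with MLIR's greedy rewrite driver. The driver entry
// point applyPatternsAndFoldGreedily is assumed to match the LLVM/MLIR
// revision vendored here; `module` is a placeholder.
#include <utility>
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "tensorflow/core/transforms/region_to_functional/impl.h"

mlir::LogicalResult ConvertRegionsToFunctional(mlir::ModuleOp module) {
  mlir::SymbolTable table(module);  // receives functions outlined by the rewrites
  mlir::RewritePatternSet patterns(module.getContext());
  mlir::tfg::PopulateRegionToFunctionalPatterns(patterns, table,
                                                /*force_control_capture=*/true);
  // Apply the patterns to a fixpoint over the whole module.
  return mlir::applyPatternsAndFoldGreedily(module, std::move(patterns));
}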
+void PopulateRegionToFunctionalPatterns(RewritePatternSet &patterns, + SymbolTable &table, + bool force_control_capture = false); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_IMPL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/pass.h new file mode 100644 index 00000000..e1dc6fc5 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/region_to_functional/pass.h @@ -0,0 +1,38 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_REGIONTOFUNCTIONAL +#include "tensorflow/core/transforms/passes.h.inc" + +// Creates a conversion pass from region control-flow to functional +// control-flow. If `force_control_capture` is set, then all region control-flow +// ops are guaranteed to be converted to functional form by capturing implicit +// control tokens using a `Const` node. +std::unique_ptr CreateRegionToFunctionalPass( + bool force_control_capture = false); +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_REGION_TO_FUNCTIONAL_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/pass.h new file mode 100644 index 00000000..aadb124a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/pass.h @@ -0,0 +1,37 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_REMAPPER_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_REMAPPER_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_REMAPPER +#include "tensorflow/core/transforms/passes.h.inc" + +// Creates a remapper pass to remap the operations onto other opreations which +// decrease the amount of operations to perform a computation. 
+std::unique_ptr CreateRemapperPass(bool enable_onednn_patterns = false, + bool xla_auto_clustering = false); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_REMAPPER_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/remapping_helper.h b/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/remapping_helper.h new file mode 100644 index 00000000..1d8db8fc --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/remapper/remapping_helper.h @@ -0,0 +1,245 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TRANSFORMS_REMAPPER_REMAPPING_HELPER_H_ +#define TENSORFLOW_CORE_TRANSFORMS_REMAPPER_REMAPPING_HELPER_H_ + +#include + +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/transforms/utils/op_cat_helper.h" +#include "tensorflow/core/transforms/utils/utils.h" + +namespace mlir { +namespace tfg { + +// The following structures store info of the operations to be fused. These +// are mainly used for combining operands info and attributes for a fused +// operation. They are also used for some predicate functions like +// `IsCpuCompatible` and `IsGpuCompatible` to check if the relevant fusion is +// supported on CPU and GPU, respectively. Another reason to keep these +// structures is to follow similar logics in current grappler-remapper. +// TODO(intel-tf): Remove redundancies once the similar functionality is +// achieved by tfg-remapper. +struct ContractionBiasAdd { + Operation* contraction; + Operation* bias_add; +}; + +struct ContractionBiasAddActivation { + Operation* contraction; + Operation* bias_add; + Operation* activation; +}; + +struct ContractionBiasAddAdd { + Operation* contraction; + Operation* bias_add; + Operation* add; +}; + +struct ContractionBiasAddAddActivation { + Operation* contraction; + Operation* bias_add; + Operation* add; + Operation* activation; +}; + +struct FusedBatchNormEx { + Operation* fused_batch_norm; + Value side_input; + Operation* activation; +}; + +class OpPropertyHelper : public OpCatHelper { + public: + OpPropertyHelper() = default; + explicit OpPropertyHelper(TFGraphDialect* dialect, + bool onednn_enabled = false, + bool xla_auto_clustering = false) + : OpCatHelper(dialect), + is_onednn_enabled_(onednn_enabled), + is_xla_auto_clustering_enabled_(xla_auto_clustering) {} + + bool HasControlOperandsOrResultUsers(Operation* op) const { + TFOp wrapper_op(op); + bool has_ctl_operands = !(wrapper_op.getControlOperands().empty()); + bool has_ctl_ret_users = !(wrapper_op.controlRet().getUsers().empty()); + if (has_ctl_operands || has_ctl_ret_users) + return true; + else + return false; + } + + // This function is to be used for an operation that has at least 1 + // non-control result. 
+ bool HasAtMostOneUserOfResult0(Operation* op) const { + // All tfg operation has 1 control result. When the operation has at least 1 + // non-control result, the number of results should be at least 2. + return op->getNumResults() > 1 && + (op->getResult(0).hasOneUse() || op->getResult(0).use_empty()); + } + + bool IsContraction(Operation* op) const { + return dialect_->IsConv2D(op) || dialect_->IsConv3D(op) || + dialect_->IsDepthwiseConv2dNative(op) || dialect_->IsMatMul(op); + } + + bool HaveSameDataType(Operation* lhs_op, Operation* rhs_op, + StringRef attr_name = "T") const { + auto lhs_attr = lhs_op->getAttrOfType(attr_name); + auto rhs_attr = rhs_op->getAttrOfType(attr_name); + if (!lhs_attr || !rhs_attr) return false; + return lhs_attr == rhs_attr; + } + + // This function is currently used by contraction ops. + bool IsGpuCompatibleDataType(Operation* contraction_op, + StringRef attr_name = "T") const { + auto attr = contraction_op->getAttrOfType(attr_name); + if (!attr) return false; + Type dtype = attr.getValue(); + if (dialect_->IsConv2D(contraction_op)) { + return mlir::isa(dtype); + } else if (dialect_->IsMatMul(contraction_op)) { + return mlir::isa(dtype); + } else { + return false; + } + } + + // This function is currently used by contraction ops. + bool IsCpuCompatibleDataType(Operation* contraction_op, + StringRef attr_name = "T") const { + auto attr = contraction_op->getAttrOfType(attr_name); + if (!attr) return false; + Type dtype = attr.getValue(); + if (is_onednn_enabled_) { + // Only contraction ops (MatMul, Conv2D, Conv3D, and + // DepthwiseConv2dNative) and BatchMatMul are supported. BatchMatMul + // fusions are handled differently than contraction ops. + bool is_supported = IsContraction(contraction_op) || + dialect_->IsAnyBatchMatMul(contraction_op); + return is_supported && mlir::isa(dtype); + } + + if (dialect_->IsConv2D(contraction_op)) { + return mlir::isa(dtype); + } else if (dialect_->IsMatMul(contraction_op)) { + return mlir::isa(dtype); + } else { + return false; + } + } + + // This function is currently used by convolution type op + bool IsGpuCompatibleDataFormat(Operation* conv_op, + StringRef attr_name = "data_format") const { + StringRef data_format; + if (auto attr = conv_op->getAttrOfType(attr_name)) { + data_format = attr.getValue(); + } else { + return false; + } + if (dialect_->IsConv2D(conv_op)) { + return data_format == "NHWC" || data_format == "NCHW"; + } else { + return false; + } + } + + // This function is currently used by convolution type op + bool IsCpuCompatibleDataFormat(Operation* conv_op, + StringRef attr_name = "data_format") const { + StringRef data_format; + if (auto attr = conv_op->getAttrOfType(attr_name)) { + data_format = attr.getValue(); + } else { + return false; + } + if (dialect_->IsConv2D(conv_op)) { + return data_format == "NHWC" || + (is_onednn_enabled_ && data_format == "NCHW"); + } else if (dialect_->IsConv3D(conv_op)) { + return data_format == "NDHWC" || + (is_onednn_enabled_ && data_format == "NCDHW"); + } else { + return false; + } + } + + bool IsGpuCompatible(const ContractionBiasAddActivation& pattern) const { +#if TENSORFLOW_USE_ROCM + // ROCm does not support _FusedConv2D. Does it suppport _FusedMatMul? + return false; +#endif + // The TF->XLA bridge does not support `_FusedMatMul` so we avoid creating + // this op. Furthermore, XLA already does this fusion internally so there + // is no true benefit from doing this optimization if XLA is going to + // compile the unfused operations anyway. 
+ if (is_xla_auto_clustering_enabled_) return false; + if (!util::OpHasDevice(pattern.contraction, tensorflow::DEVICE_GPU)) + return false; + if (!dialect_->IsRelu(pattern.activation)) return false; + if (dialect_->IsMatMul(pattern.contraction)) { + return IsGpuCompatibleDataType(pattern.contraction); + } else { + // TODO(intel-tf): Add spatial convolution support on GPU + return false; + } + } + + // Currently GPU does not supprt contraction + bias_add + bool IsGpuCompatible(const ContractionBiasAdd&) const { return false; } + + bool IsCpuCompatible(Operation* contraction_op) const { + if (!util::OpHasDevice(contraction_op, tensorflow::DEVICE_CPU)) + return false; + if (dialect_->IsConv2D(contraction_op) || + dialect_->IsConv3D(contraction_op)) { + return IsCpuCompatibleDataType(contraction_op) && + IsCpuCompatibleDataFormat(contraction_op); + } else if (dialect_->IsMatMul(contraction_op) || + dialect_->IsAnyBatchMatMul(contraction_op) || + dialect_->IsDepthwiseConv2dNative(contraction_op)) { + return IsCpuCompatibleDataType(contraction_op); + } else { + return false; + } + } + + template + bool IsDeviceCompatible(const Pattern& pattern) const { + // Currently, this function is used by contraction based fussion. + if constexpr (!std::is_same::value && + !std::is_same::value && + !std::is_same::value && + !std::is_same::value) { + return false; + } + return IsGpuCompatible(pattern) || IsCpuCompatible(pattern.contraction); + } + + bool isOneDNNEnabled() const { return is_onednn_enabled_; } + + private: + bool is_onednn_enabled_; + bool is_xla_auto_clustering_enabled_; +}; + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_REMAPPER_REMAPPING_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/shape_inference/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/shape_inference/pass.h new file mode 100644 index 00000000..046d6556 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/shape_inference/pass.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_SHAPE_INFERENCE_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_SHAPE_INFERENCE_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +#define GEN_PASS_DECL_SHAPEINFERENCE +#include "tensorflow/core/transforms/passes.h.inc" + +// Pass that infers the output shape of operations. 
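// Illustrative sketch (not part of the vendored headers): composing a few of
// the pass factories declared in these transforms headers into a pipeline.
// Each factory is assumed to return std::unique_ptr<mlir::Pass> as in the
// upstream tree; the particular pass ordering here is arbitrary.
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/core/transforms/remapper/pass.h"
#include "tensorflow/core/transforms/shape_inference/pass.h"
#include "tensorflow/core/transforms/toposort/pass.h"

mlir::LogicalResult RunExamplePipeline(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::tfg::CreateTopoSortPass());
  pm.addPass(mlir::tfg::CreateShapeInferencePass());
  pm.addPass(mlir::tfg::CreateRemapperPass(/*enable_onednn_patterns=*/false,
                                           /*xla_auto_clustering=*/false));
  return pm.run(module);
}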
+std::unique_ptr CreateShapeInferencePass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_SHAPE_INFERENCE_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/toposort/pass.h b/third_party/tflite-hdrs/tensorflow/core/transforms/toposort/pass.h new file mode 100644 index 00000000..84760adb --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/toposort/pass.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_TOPOSORT_PASS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_TOPOSORT_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" + +namespace mlir { +namespace tfg { + +// Sort topologically (following SSA defs-uses edges) the given block. +// The sort is stable. Optionally accepts an instance of the TFG dialect for +// virtually breaking NextIteration -> Merge cycles. +void SortTopologically(Block *block, TFGraphDialect *dialect = nullptr); + +// Programmatically create a pass that topologically sort graphs. +std::unique_ptr CreateTopoSortPass(); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_TOPOSORT_PASS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/utils/eval_utils.h b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/eval_utils.h new file mode 100644 index 00000000..28128938 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/eval_utils.h @@ -0,0 +1,73 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_UTILS_EVAL_UTILS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_UTILS_EVAL_UTILS_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/ir/tf_op_wrapper.h" + +namespace Eigen { +class ThreadPoolDevice; +} // namespace Eigen + +namespace mlir { +namespace tfg { +namespace util { + +// A simple CPU device for operation evaluation. +class SimpleDevice : public tensorflow::DeviceBase { + public: + SimpleDevice(); + ~SimpleDevice() override; + + absl::Status MakeTensorFromProto( + const tensorflow::TensorProto& tensor_proto, + const tensorflow::AllocatorAttributes alloc_attrs, + tensorflow::Tensor* tensor) override; + + tensorflow::Allocator* GetAllocator( + tensorflow::AllocatorAttributes attr) override; + + const std::string& device_type() const override { return device_type_; } + + private: + std::unique_ptr eigen_worker_; + tensorflow::DeviceBase::CpuWorkerThreads eigen_worker_threads_; + std::unique_ptr eigen_device_; + const std::string device_type_ = tensorflow::DEVICE_CPU; +}; + +// Attempts to evaluates an MLIR Operation with the op registered kernel. The op +// is always executed on the local host CPU irrespective of the device attribute +// of the given op. The results will be filled in the results vector. +LogicalResult EvaluateOperation(tensorflow::DeviceBase* cpu_device, + tensorflow::ResourceMgr* resource_mgr, TFOp op, + ArrayRef operands, + SmallVectorImpl& results); +} // namespace util +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_UTILS_EVAL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/utils/op_cat_helper.h b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/op_cat_helper.h new file mode 100644 index 00000000..8cb212bd --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/op_cat_helper.h @@ -0,0 +1,54 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TRANSFORMS_UTILS_OP_CAT_HELPER_H_ +#define TENSORFLOW_CORE_TRANSFORMS_UTILS_OP_CAT_HELPER_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" +#include "tensorflow/core/ir/tf_op_wrapper.h" + +namespace mlir { +namespace tfg { +// A Helper class to identify if an op belongs to certain op category. 
+class OpCatHelper { + public: + OpCatHelper() = default; + explicit OpCatHelper(TFGraphDialect *dialect) : dialect_(dialect) {} + + bool IsAggregate(TFOp op); + bool IsCommutative(TFOp op); + + // Returns true if it's a splat tensor type and has the splat value 1. + bool IsOnes(TFOp op); + // Returns true if it's a splat tensor type and has the splat value 0. + bool IsZeros(TFOp op); + + // Returns true if the op is known to use persistent memory to store its + // value. + bool IsPersistent(TFOp op); + + // Returns true if the op belongs to the NC_DATASET class (see graph/graph.h). + bool IsDataset(TFOp op); + + TFGraphDialect *getDialect() const { return dialect_; } + + protected: + TFGraphDialect *dialect_; +}; +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_UTILS_OP_CAT_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/utils/pdll/utils.h b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/pdll/utils.h new file mode 100644 index 00000000..75f33509 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/pdll/utils.h @@ -0,0 +1,30 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_UTILS_PDLL_UTILS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_UTILS_PDLL_UTILS_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace tfg { + +// Register the common utils. +void RegisterPDLLUtils(RewritePatternSet &patterns); + +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_UTILS_PDLL_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/transforms/utils/utils.h b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/utils.h new file mode 100644 index 00000000..9b4b2e2a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/transforms/utils/utils.h @@ -0,0 +1,82 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TRANSFORMS_UTILS_UTILS_H_ +#define TENSORFLOW_CORE_TRANSFORMS_UTILS_UTILS_H_ + +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/core/ir/dialect.h" + +namespace mlir { + +class Operation; +class NamedAttrList; + +namespace tfg { +namespace util { + +// Returns true if the op has the requested device attribute. +bool OpHasDevice(Operation *op, const char *device_name); + +// Erase the attribute starts with "_". +void EraseRegularNodeAttributes(NamedAttrList &attr_list); + +// When rewriting an operation 1-to-1, intrinsic attributes are manually +// forwarded, modified, or dropped. For example, when `If` is rewritten to +// `IfRegion`, +// +// 1. `Tout` is forwarded as is, +// 2. `then_branch` is changed to `then_attrs` which contain the attribute +// dictionary part of the `#tf_type.func`, and +// 3. `Tin` is dropped. +// +// Non-intrinsic attributes, e.g. `_tpu_cluster`, are blindly forwarded to the +// new operation. +void ForwardNonIntrinsicAttributes(Operation *src, Operation *dst); + +// Add an argument to a loop region. This inserts the new data argument and +// control argument at the correct positions and returns them. Also, this +// function updates any preserved argument attributes by inserting a null. +struct LoopRegionArgumentUpdate { + BlockArgument data, ctl; +}; +LoopRegionArgumentUpdate LoopRegionAddArgument(Region ®ion, Type type); + +// Erase an argument from a loop region. This erases the corresponding control +// argument. Also, this function updates any preserved argument attributes by +// deleting them. +void LoopRegionEraseArgument(Region ®ion, unsigned index); + +// Indicate that a result has been added to a loop region. Call this function to +// update the preserved result attributes. +void LoopRegionResultAdded(Region ®ion, unsigned num = 1); + +// Indicate that a result has been erased from a loop region. Call this function +// to update the preserved result attributes. +void LoopRegionResultErased(Region ®ion, unsigned index); + +// Erase operands from an op that might have an `operand_segment_sizes` , +// updating the attribute in-place if present. +void SizedOperandSegmentsEraseOperands(Operation *op, + ArrayRef indices); +void SizedOperandSegmentsEraseOperands(Operation *op, + const llvm::BitVector &erase); + +} // namespace util +} // namespace tfg +} // namespace mlir + +#endif // TENSORFLOW_CORE_TRANSFORMS_UTILS_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/activation_mode.h b/third_party/tflite-hdrs/tensorflow/core/util/activation_mode.h new file mode 100644 index 00000000..2c2e6476 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/activation_mode.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_ +#define TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_ + +// This file contains helper routines to deal with activation mode in various +// ops and kernels. + +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +// ActivationMode: the activation function we apply to the input tensor: +enum ActivationMode { + NONE = 0, + SIGMOID = 1, + RELU = 2, + RELU6 = 3, + RELUX = 4, + TANH = 5, + BANDPASS = 6, +}; + +// Specialization to parse an attribute directly into a ActivationMode enum. +absl::Status GetActivationModeFromString(const string& str_value, + ActivationMode* value); + +inline absl::string_view ToString(ActivationMode mode) { + switch (mode) { + case NONE: + return "NONE"; + case SIGMOID: + return "SIGMOID"; + case RELU: + return "RELU"; + case RELU6: + return "RELU6"; + case RELUX: + return "RELUX"; + case TANH: + return "TANH"; + case BANDPASS: + return "BANDPASS"; + } +} + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/autotune_serialize.h b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/autotune_serialize.h new file mode 100644 index 00000000..745eb1ad --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/autotune_serialize.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// For Google-internal use only. +// +// Supports serializing the autotune maps to string +// (SerializeAutotuneMaps), as well as deserializing them from +// string and injecting them into TF runtime +// (LoadSerializedAutotuneMaps). +// +// Aims to speed up the warmup time of neural nets. + +#ifndef TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_AUTOTUNE_SERIALIZE_H_ +#define TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_AUTOTUNE_SERIALIZE_H_ + +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// TODO(b/189530096) Support autotune maps for more ops. +// Loads autotune maps from string output by SerializeAutotuneMaps and uses +// them to update the runtime autotune maps. +absl::Status LoadSerializedAutotuneMaps(absl::string_view s); + +// Serializes all the autotune maps into a string that can be decoded by +// LoadSerializedAutotuneMaps. +absl::Status SerializeAutotuneMaps(std::string* output); + +// Resets all autotune maps. For test use only. 
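// Illustrative sketch (not part of the vendored headers): a minimal use of
// the serialize/load pair declared above to carry autotune results across
// processes. Persisting the blob is left to the caller; the function names
// here are placeholders.
#include <string>
#include "absl/status/status.h"
#include "tensorflow/core/util/autotune_maps/autotune_serialize.h"

// After representative graphs have run once, snapshot the autotune maps.
absl::Status SnapshotAutotuneState(std::string* blob) {
  return tensorflow::SerializeAutotuneMaps(blob);
}

// On the next startup, inject the saved results so warmup skips autotuning.
absl::Status RestoreAutotuneState(const std::string& blob) {
  return tensorflow::LoadSerializedAutotuneMaps(blob);
}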
+void ResetAutotuneMaps(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_AUTOTUNE_SERIALIZE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_autotune_maps.h b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_autotune_maps.h new file mode 100644 index 00000000..7c00348a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_autotune_maps.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// For Google-internal use only. +// +// This file defines the map data structure for storing autotuning results for +// fused_conv2d_bias_activation_op_kernels. +// +// The key of the map uniquely identifies a convolution operation that runs on a +// particular device model while the value might be the autotuned algorithm we +// choose for the conv. +// +// This map will be merged after fused_conv2d_bias_activation_op_kernels is +// merged into conv_ops_fused_impl.h (b/177365158, b/189530096) + +#ifndef TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_AUTOTUNE_MAPS_H_ +#define TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_AUTOTUNE_MAPS_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include + +#include "tensorflow/core/kernels/gpu_utils.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/autotune_maps/conv_parameters.h" + +namespace tensorflow { + +// A dummy type to group forward convolution autotune results together. +struct ConvAutotuneGroup { + static string name() { return "Conv"; } +}; + +using ConvAutotuneMap = AutotuneSingleton>; + +// A dummy type to group fused convolution autotune results together. +struct ConvFusedAutotuneGroup { + static string name() { return "FusedConv"; } +}; + +using FusedConvAutotuneMap = + AutotuneSingleton>; + +} // namespace tensorflow +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_AUTOTUNE_MAPS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_map_wrapper.h b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_map_wrapper.h new file mode 100644 index 00000000..39ce9845 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_map_wrapper.h @@ -0,0 +1,66 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_MAP_WRAPPER_H_ +#define TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_MAP_WRAPPER_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "tensorflow/core/util/autotune_maps/autotune_map.pb.h" + +namespace tensorflow { + +// This class is a thin wrapper around `ConvMapProto::Entry`. It is used to +// provide opaque accessors to an entry's key and value without exposing the +// internal structure of the entry. +class ConvMapWrapper { + public: + using OpaqueKey = std::string; + using OpaqueValue = std::string; + + // Creates an `ConvMapWrapper` from a key and value. The provided key and + // value must be ones that were previously returned by calls to `Key()` and + // `Value()`. + static absl::StatusOr FromKeyAndValue(OpaqueKey key, + OpaqueValue value); + + // An opaque string that can be used as a key for this autotuning result. + // Do not rely on the format of this string. + OpaqueKey Key() const; + + // An opaque string that encodes the autotuning result. + // Do not rely on the format of this string. + OpaqueValue Value() const; + + static std::vector ConvMapToWrappers( + const ConvMapProto& autotune_results); + + // Returns the `ConvMapProto` proto that corresponds to the provided + // wrappers. + static absl::StatusOr ConvMapFromWrappers( + const std::vector& wrappers); + + private: + explicit ConvMapWrapper(const ConvMapProto::Entry& entry) + : conv_map_entry_(entry) {} + + ConvMapProto::Entry conv_map_entry_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_MAP_WRAPPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_parameters.h b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_parameters.h new file mode 100644 index 00000000..6658fa6e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/autotune_maps/conv_parameters.h @@ -0,0 +1,137 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_PARAMETERS_H_ +#define TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_PARAMETERS_H_ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "absl/types/optional.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/autotune_maps/conv_parameters.pb.h" + +namespace tensorflow { +// Uniquely identifies a convolution operation that runs on a particular device +// model. +// +// This can serve as a hashtable key, where the value might be the autotuned +// algorithm we choose for the conv. +// +// All of the data in this class other than the device_id is stored in the +// ConvParametersProto, so it can be easily serialized (for the purposes of +// ahead-of-time autotuning). 
+// +// When using the cudnn frontend API, two autotuning results for two different +// GPUs of the same model are not interchangeable, because an autotuning result +// includes a cudnn execution plan, which is tied to the GPU. As a result, we +// need to create separate ConvParameters objects for them. +class ConvParameters { + public: + struct FusionInfo { + // For some implementations (e.g. cuDNN new backend) these scales are part + // of the algorithm, not part of the parameters an algorithm take. They need + // to be used to distinguish different algorithms. + double conv_scale; + double side_input_scale; + double leakyrelu_alpha; + stream_executor::dnn::ActivationMode activation_mode; + bool is_contrib; + }; + + // LINT.IfChange(conv_parameters_version) + // A positive number that denotes the version of this class. Should be + // incremented everytime this class or ConvParametersProto are updated in a + // way that may invalidate autotune results. + static constexpr int kVersion = 3; + // LINT.ThenChange() + + // We have three kinds of convolutions today. Vanilla unfused convolutions, + // fused convolutions, and fused convolutions as implemented in the `contrib` + // directory. The two fused convolutions ultimately correspond to the same + // cudnn calls, but have slightly different semantics (e.g. they interpret + // padding differently). + ConvParameters( + se::StreamExecutor* stream_exec, int64_t batch, int64_t in_depths, + absl::Span in, int data_format, int64_t out_depths, + absl::Span filter, absl::Span dilation, + absl::Span stride, absl::Span padding, + DataType dtype, int group_count, + absl::optional fusion_info = absl::optional(), + // This argument should be set only for test use. + int version = kVersion); + + ConvParameters(int device_id, const ConvParametersProto& proto); + + ConvParameters(se::StreamExecutor* stream_exec, + const ConvParametersProto& proto) + : ConvParameters(stream_exec->device_ordinal(), proto) {} + + bool operator==(const ConvParameters& other) const; + + bool operator!=(const ConvParameters& other) const { + return !(*this == other); + } + uint64 hash() const { return hash_code_; } + + string ToString() const; + + const ConvParametersProto& proto() const { return proto_; } + + private: + int device_id_; + ConvParametersProto proto_; + uint64 hash_code_; +}; + +class MatmulParameters { + public: + // LINT.IfChange(matmul_parameters_version) + // A positive number that denotes the version of this class. Should be + // incremented everytime this class or ConvParametersProto are updated in a + // way that may invalidate autotune results. + static constexpr int kVersion = 2; + // LINT.ThenChange() + + MatmulParameters(se::StreamExecutor* stream_exec, DataType ab_dtype, + DataType c_dtype, bool trans_a, bool trans_b, uint64_t m, + uint64_t n, uint64_t k, int64_t lda, int64_t ldb, + int64_t ldc, + stream_executor::dnn::ActivationMode activation_mode, + // This argument should be set only for test use. 
+ int version = kVersion); + + MatmulParameters(se::StreamExecutor* stream_exec, + const MatmulParametersProto& proto); + + bool operator==(const MatmulParameters& other) const; + + bool operator!=(const MatmulParameters& other) const { + return !(*this == other); + } + uint64 hash() const { return hash_code_; } + + string ToString() const; + + const MatmulParametersProto& proto() const { return proto_; } + + private: + int device_id_; + MatmulParametersProto proto_; + uint64 hash_code_; +}; + +} // namespace tensorflow +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_UTIL_AUTOTUNE_MAPS_CONV_PARAMETERS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/bad_indices_policy.h b/third_party/tflite-hdrs/tensorflow/core/util/bad_indices_policy.h new file mode 100644 index 00000000..ee8f4a89 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/bad_indices_policy.h @@ -0,0 +1,39 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_BAD_INDICES_POLICY_H_ +#define TENSORFLOW_CORE_UTIL_BAD_INDICES_POLICY_H_ + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" + +namespace tensorflow { +enum class BadIndicesPolicy { + // Default behavior: return an error on CPU and ignore on GPU. This is because + // we handle bad indices differently on CPU and GPU before this policy is + // introduced. + kDefault, + // Return an error. + kError, + // Ignore bad indices. + kIgnore, +}; + +absl::StatusOr BadIndicesPolicyFromString( + absl::string_view str); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_BAD_INDICES_POLICY_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/batch_util.h b/third_party/tflite-hdrs/tensorflow/core/util/batch_util.h new file mode 100644 index 00000000..176c229a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/batch_util.h @@ -0,0 +1,74 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_ +#define TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace batch_util { + +// Copies element into the index^th slice of parent (in the 0th dimension). 
+// +// NOTE(mrry): The `element` argument is taken by value. Use `std::move()` +// to move the `element` argument into this function, and the implementation +// may be able to optimize the copy to a move. This is particularly important +// for DT_STRING tensors. +absl::Status CopyElementToSlice(Tensor element, Tensor* parent, int64_t index); + +// Copies the index^th slice of parent (in the 0th dimension) into element. +absl::Status CopySliceToElement(const Tensor& parent, Tensor* element, + int64_t index); + +// Copies 'num_slices' contiguous slices from 'src' tensor starting from index +// 'src_offset' into target tensor 'dst', and places them into slices +// starting from 'dst_offset'. +// +// This function requires 'src' and 'dst' to have compatible shapes. That is it +// requires cum_prod(src.shape[1:] == cum_prod(dst->shape[1:]). For example if +// source is of shape [x, 2, 1] and dst is a tensor of shape [y, 1, 2], this +// function can still proceed successfully. +absl::Status CopyContiguousSlices(const Tensor& src, int64_t src_offset, + int64_t dst_offset, int64_t num_slices, + Tensor* dst); + +// Copies the index^th slice of parent (in the 0th dimension) into element. +// +// NOTE(mrry): The implementation may be able to optimize the copy to a move. +// This is particularly important for DT_STRING tensors. +absl::Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, + int64_t index); + +// Moves `src` Tensor's data in [src_offset, src_offset+num_slices) along +// the first dimension if possible. Otherwise, copy them into `dst`. +absl::Status MaybeMoveContiguousSlices(Tensor& src, int64_t src_offset, + int64_t dst_offset, int64_t num_slices, + Tensor* dst); + +// Zero-initializes the tensor `element` using the scalar stored in `padding`. +// Both `element` and `padding` must have matching `dtype`. +absl::Status SetElementZero(Tensor* element, const Tensor& padding); + +// Copies `element` into a (0th dimension) slice of `parent`, assuming +// the shape of `element` is strictly not larger along any axis than a +// slice. +absl::Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, + int index); + +} // namespace batch_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/bcast.h b/third_party/tflite-hdrs/tensorflow/core/util/bcast.h new file mode 100644 index 00000000..61d1fb5a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/bcast.h @@ -0,0 +1,427 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_BCAST_H_ +#define TENSORFLOW_CORE_UTIL_BCAST_H_ + +#include +#include + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Returns the mapping from the output batch indices to the corresponding +// input's batch indices, given the input's "reshape" and "bcast" shapes as +// returned by the BCastList helper class. The i'th element denotes the +// (flattened) batch index of the input that must be used to compute the i'th +// batch output. +// +inline void ComputeBatchIndices( + const int64_t output_batch_size, + const absl::InlinedVector& reshape, + const absl::InlinedVector& bcast, + std::vector* out_indices) { + // Populates the mapping in out_indices. This algorithm is identical to + // the following steps: + // - Reshape {0, 1, ..., input_batch_size - 1} to the input shape. + // - Broadcast to the output shape. + // - Reshape back to a flat 1D vector. + out_indices->resize(output_batch_size); + int64_t num_output_elements = 1; + int64_t num_input_elements = 1; + for (int64_t i = reshape.size() - 1; i >= 0; --i) { + // Replicate the already populated mapping an additional (dim - 1) times. + // If we are broadcasting, just copy the existing mapping. + // Otherwise, add another dimension from the input shape. + const int64_t dim = std::max(reshape[i], bcast[i]); + const int64_t incr = bcast[i] > 1 ? 0 : num_input_elements; + for (int64_t k = 0; k < (dim - 1) * num_output_elements; ++k) { + (*out_indices)[num_output_elements + k] = (*out_indices)[k] + incr; + } + num_output_elements *= dim; + num_input_elements *= reshape[i]; + } +} + +template +class BCastList { + public: + // A vector of int64 representing the shape of tensor. The 0-th + // element is the outer-most dimension and the last element is the + // inner-most dimension. Note that we do not use TensorShape since + // it's more convenient to manipulate Vec directly for this module. + typedef absl::InlinedVector Vec; + + // Constructs all helper shapes, following the aforementioned rules. + // + // If "fewer_dims_optimization" is set to true (the default), the + // implementation tries to reduce intermediate dimensions needed to be more + // efficient. This is transparent to the caller. + // + // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have + // the same number of dimensions as the larger of the two inputs. + // + // If return_flattened_batch_indices is true, the implementation will compute + // for each output member of the flattened output, which batch indices of + // each input correspond to it. This is disabled by default. + explicit BCastList(const Vec (&x)[N], bool fewer_dims_optimization = true, + bool return_flattened_batch_indices = false); + ~BCastList() = default; + + // Returns true iff two operands are compatible according to the + // broadcasting rule. + bool IsValid() const { return valid_; } + bool IsBroadcastingRequired() const { return broadcasting_required_; } + + // If and only if IsValid(), the following fields can be used in + // implementing a broadcasted binary tensor operation according to + // the broadcasting rule. 
+ const Vec& reshape(int i) const { return reshape_[i]; } + const Vec& bcast(int i) const { return bcast_[i]; } + const Vec& result_shape() const { return result_; } + const Vec& output_shape() const { return output_; } + const Vec& grad_reduce_idx(int i) const { return grad_reduce_idx_[i]; } + int64_t output_batch_size() const { return output_batch_size_; } + + // Returns the mapping from the flattened output batch indices to x's + // flattened batch indices. The result is a vector of length + // output_batch_size(). To compute the i'th batch output, a binary matmul-like + // operation should use the `x_batch_indices()[i]`th batch index of `x`. + // Note: Returns an empty vector if broadcasting is not required. Callers + // should only use this when IsBroadcastingRequired() returns true. + const std::vector& batch_indices(int i) const { + return batch_indices_[i]; + } + + protected: + bool valid_ = true; + bool broadcasting_required_ = true; + Vec reshape_[N]; + Vec bcast_[N]; + Vec result_; + Vec output_; + Vec grad_reduce_idx_[N]; + + int64_t output_batch_size_; + std::vector batch_indices_[N]; + + static void Reverse(Vec* shape) { + std::reverse(shape->begin(), shape->end()); + } + + BCastList(const BCastList&) = delete; + void operator=(const BCastList&) = delete; +}; + +template +BCastList::BCastList(const BCastList::Vec (&x)[N], + const bool fewer_dims_optimization, + const bool return_flattened_batch_indices) { + typedef BCastList::Vec Vec; + + // Safely multiplies dimensions taking into account symbolic shapes. + auto mul_dims = [](int64_t dim1, int64_t dim2) -> int64_t { + return dim1 != 0 && dim2 != 0 && (dim1 < 0 || dim2 < 0) ? -1 : dim1 * dim2; + }; + + bool all_equal = true; + size_t largest_rank = 0; + output_batch_size_ = 1; + for (int i = 0; i < N; ++i) { + if (x[i] != x[0]) { + all_equal = false; + } + if (x[i].size() > largest_rank) { + largest_rank = x[i].size(); + } + } + if (all_equal) { + broadcasting_required_ = false; + } + if (all_equal && TF_PREDICT_TRUE(fewer_dims_optimization)) { + // Fast path for common case of identical shapes. + int64_t elements = 1; + const int rank = x[0].size(); + output_.resize(rank); + for (int i = 0; i < rank; i++) { + const int64_t dim = x[0][i]; + elements = mul_dims(elements, dim); + output_[i] = dim; + } + result_.push_back(elements); + output_batch_size_ = elements; + for (int i = 0; i < N; ++i) { + reshape_[i].push_back(elements); + bcast_[i].push_back(1); + } + // grad_reduce_ is left as empty + return; + } + + // Reverse all the shapes for convenience + // After the reverse, 0-th is the inner-most dimension. + Vec copy[N]; + for (int i = 0; i < N; ++i) { + copy[i] = x[i]; + Reverse(©[i]); + } + + // 1-extend and align all vectors. + for (int i = 0; i < N; ++i) { + if (copy[i].size() < largest_rank) { + copy[i].resize(largest_rank, 1); + } + } + // Going through each dimension starting from the inner-most + // dimension, compares dimension of x and y. They are compatible if + // they are equal or either is 1. + + // indices of j-th component of each input. + bool prev_is_one[N]; + bool current_is_one[N]; + for (int i = 0; i < N; ++i) { + prev_is_one[i] = false; + current_is_one[i] = false; + } + bool output_dim_set = false; + int64_t output_dim = -1; + bool none_is_one = true; + bool set_one = false; + for (int j = 0; j < largest_rank; ++j) { + output_dim = -1; + output_dim_set = false; + none_is_one = true; + // Find which indices are 1. + for (int i = 0; i < N; ++i) { + // Keep track of which indices are 1. 
+ if (copy[i][j] == 1) { + current_is_one[i] = true; + none_is_one = false; + } else { + current_is_one[i] = false; + if (!output_dim_set || copy[i][j] == output_dim) { + output_dim = copy[i][j]; + output_dim_set = true; + } else { + valid_ = false; + return; + } + } + } + output_.push_back(output_dim_set ? output_dim : 1); + output_batch_size_ = mul_dims(output_batch_size_, output_.back()); + // All dimensions are 1. + if (!output_dim_set) { + if (!TF_PREDICT_TRUE(fewer_dims_optimization)) { + for (int i = 0; i < N; ++i) { + bcast_[i].push_back(1); + reshape_[i].push_back(1); + } + result_.push_back(1); + } + for (int i = 0; i < N; ++i) { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + // This will skip updating the previous state to the current one. We'll + // explain why this is safe below. + // Consider the previous state P, current state C and the next state N. + // In the case where N also is all ones (N == C), we'll do the same + // optimization here (push back one dimensions if we need to), which is + // safe and is expected. + // + // When N != C, we'll continue as usual. However, we might trigger the + // next block if N == P (because we didn't update the previous state). + // We trigger the next block if `fewer_dims_optimization` is true. + // This means that we did not modify and broadcast / reshapes in this + // block (we skipped updating, since the one dimensions can be ignored). + // In essence, we only need to check whether the previous non-one state is + // equal to the current non-one state. + + continue; + } else if (TF_PREDICT_TRUE(fewer_dims_optimization) && + std::equal(current_is_one, current_is_one + N, prev_is_one) && + set_one) { + // It is a run of the same broadcasting case as last time. + // We can reshape the input so that fewer dimensions + // are involved in the intermediate computation. + result_.back() = mul_dims(result_.back(), output_dim); + for (int i = 0; i < N; ++i) { + reshape_[i].back() = mul_dims(reshape_[i].back(), copy[i][j]); + bcast_[i].back() = + mul_dims(bcast_[i].back(), current_is_one[i] ? output_dim : 1); + if (current_is_one[i] && !none_is_one) { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + } + } else { + result_.push_back(output_dim); + for (int i = 0; i < N; ++i) { + reshape_[i].push_back(copy[i][j]); + bcast_[i].push_back(current_is_one[i] ? output_dim : 1); + if (current_is_one[i] && !none_is_one) { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + } + } + set_one = true; + for (int i = 0; i < N; ++i) { + prev_is_one[i] = current_is_one[i]; + } + } + if (result_.empty()) { + result_.push_back(1); + for (int i = 0; i < N; ++i) { + reshape_[i].push_back(1); + bcast_[i].push_back(1); + } + } + // Do something about batches. + for (int i = 0; i < N; ++i) { + Reverse(&reshape_[i]); + Reverse(&bcast_[i]); + Reverse(&grad_reduce_idx_[i]); + } + Reverse(&result_); + Reverse(&output_); + // Only compute batch indices when we need broadcasting, and we aren't doing + // needless work (when the output size is 0 or the + // return_flattened_batch_indices isn't enabled). + if (return_flattened_batch_indices && broadcasting_required_ && + output_batch_size_ > 0) { + for (int i = 0; i < N; ++i) { + ComputeBatchIndices(output_batch_size_, reshape_[i], bcast_[i], + &batch_indices_[i]); + } + } +} + +// BCast is a helper for broadcasting binary tensor operation. +// TensorFlow's broadcasting rule follows that of numpy (See +// http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). 
+// +// The rule has the following properties: +// +// 1. suffix matching: the rule starts with the right-most +// dimension, and works towards the left-most dimension. Since +// TensorFlow is row-major, the right-most dimension (the last +// element in the shape of a tensor) is the inner-most, a.k.a. +// the fastest changing, dimension. +// +// 2. Two dimensions are compatible for broadcasting if both are the +// same or either is 1. +// +// BCast takes the shape of two tensors and computes a few vectors of +// int32 that are useful for the caller to reshape the tensors, apply +// the right broadcasts to them, compute the broadcasted operation, +// and possibly the gradients. In a nutshell, the caller is expected +// to compute the broadcasted operation as following: +// +// BCast b(x.shape(), y.shape()); +// output = x.reshape(b.x_reshape()).broadcast(b.x_bcast()) +// _op_ +// y.reshape(b.y_reshape()).broadcast(b.y_bcast()) +// +// For the gradient computation, +// grad_x = sum(grad * backprop_x(x, y), grad_x_reduce_idx) +// .reshape(x.shape()) +// grad_y = sum(grad * backprop_y(x, y), grad_y_reduce_idx) +// .reshape(y.shape()) +// backprop_x and backprop_y are functionals of the binary function "op", +// e.g., +// for +, backprop_x(x, y) = backprop_y(x, y) = 1; +// for *, backprop_x(x, y) = y, backprop_y(x, y) = x; +// for /, backprop_x(x, y) = 1/y, backprop_y(x, y) = -x/y^2; +// +// The multiplication in the grad * backprop_x itself is also +// broadcasting following the same rule. +class BCast : public BCastList<2> { + public: + // Constructs all helper shapes, following the aforementioned rules. + // + // If "fewer_dims_optimization" is set to true (the default), the + // implementation tries to reduce intermediate dimensions needed to be more + // efficient. This is transparent to the caller. + // + // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have + // the same number of dimensions as the larger of the two inputs. + typedef absl::InlinedVector Vec; + + BCast(const Vec& x, const Vec& y, const bool fewer_dims_optimization = true, + const bool return_flattened_batch_indices = false) + : BCastList<2>({x, y}, fewer_dims_optimization, + return_flattened_batch_indices) {} + + ~BCast() = default; + + // If and only if IsValid(), the following fields can be used in + // implementing a broadcasted binary tensor operation according to + // the broadcasting rule. + const Vec& x_reshape() const { return reshape_[0]; } + const Vec& x_bcast() const { return bcast_[0]; } + const Vec& y_reshape() const { return reshape_[1]; } + const Vec& y_bcast() const { return bcast_[1]; } + const Vec& result_shape() const { return result_; } + const Vec& output_shape() const { return output_; } + const Vec& grad_x_reduce_idx() const { return grad_reduce_idx_[0]; } + const Vec& grad_y_reduce_idx() const { return grad_reduce_idx_[1]; } + + // Returns the mapping from the flattened output batch indices to x's + // flattened batch indices. The result is a vector of length + // output_batch_size(). To compute the i'th batch output, a binary matmul-like + // operation should use the `x_batch_indices()[i]`th batch index of `x`. + // Note: Returns an empty vector if broadcasting is not required. Callers + // should only use this when IsBroadcastingRequired() returns true. + const std::vector& x_batch_indices() const { + return batch_indices_[0]; + } + // Returns the mapping from the flattened output batch indices to y's + // flattened batch indices. Similar to x_batch_indices(). 
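+  // Illustrative example (worked out by hand, not taken from the library's
+  // tests): for x of shape [2, 1] and y of shape [2, 3] the helper produces
+  // x_reshape() = [2, 1], x_bcast() = [1, 3], y_reshape() = [2, 3],
+  // y_bcast() = [1, 1], and output_batch_size() == 6. With
+  // return_flattened_batch_indices enabled this yields
+  //   x_batch_indices() = {0, 0, 0, 1, 1, 1}
+  //   y_batch_indices() = {0, 1, 2, 3, 4, 5}
+  // i.e. every output batch reads the matching y batch, while each of x's two
+  // flattened batches is reused for three consecutive outputs.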
+ // Note: Returns an empty vector if broadcasting is not required. Callers + // should only use this when IsBroadcastingRequired() returns true. + const std::vector& y_batch_indices() const { + return batch_indices_[1]; + } + + template + static Eigen::array ToIndexArrayType( + const BCast::Vec& vec) { + CHECK_EQ(vec.size(), NDIMS); + Eigen::array ret; + for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i]; + return ret; + } + + template + static Eigen::array ToIndexArray( + const BCast::Vec& vec) { + return ToIndexArrayType(vec); + } + + // Static helpers. + static Vec FromShape(const TensorShape& shape); + static TensorShape ToShape(const Vec& vec); + + private: + BCast(const BCast&) = delete; + void operator=(const BCast&) = delete; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_BCAST_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/command_line_flags.h b/third_party/tflite-hdrs/tensorflow/core/util/command_line_flags.h new file mode 100644 index 00000000..ebc58f7e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/command_line_flags.h @@ -0,0 +1,31 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H_ +#define TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H_ + +#include +#include +#include + +#include "xla/tsl/util/command_line_flags.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using tsl::Flag; // NOLINT +using tsl::Flags; // NOLINT +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_entry.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_entry.h new file mode 100644 index 00000000..c8a23036 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_entry.h @@ -0,0 +1,154 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +// LINT.IfChange + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_ENTRY_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_ENTRY_H_ + +#include +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/ctc/ctc_loss_util.h" + +namespace tensorflow { +namespace ctc { + +// The ctc_beam_search namespace holds several classes meant to be accessed only +// in case of extending the CTCBeamSearch decoder to allow custom scoring +// functions. +// +// BeamEntry is exposed through template arguments BeamScorer and BeamComparer +// of CTCBeamSearch (ctc_beam_search.h). +namespace ctc_beam_search { + +struct EmptyBeamState {}; + +template +struct BeamProbability { + BeamProbability() + : total(kLogZero()), blank(kLogZero()), label(kLogZero()) {} + void Reset() { + total = kLogZero(); + blank = kLogZero(); + label = kLogZero(); + } + T total; + T blank; + T label; +}; + +template +class BeamRoot; + +template +struct BeamEntry { + // BeamRoot::AddEntry() serves as the factory method. + friend BeamEntry* BeamRoot::AddEntry( + BeamEntry* p, int l); + inline bool Active() const { return newp.total != kLogZero(); } + // Return the child at the given index, or construct a new one in-place if + // none was found. + BeamEntry& GetChild(int ind) { + auto entry = children.emplace(ind, nullptr); + auto& child_entry = entry.first->second; + // If this is a new child, populate the BeamEntry*. + if (entry.second) { + child_entry = beam_root->AddEntry(this, ind); + } + return *child_entry; + } + std::vector LabelSeq(bool merge_repeated) const { + std::vector labels; + int prev_label = -1; + const BeamEntry* c = this; + while (c->parent != nullptr) { // Checking c->parent to skip root leaf. + if (!merge_repeated || c->label != prev_label) { + labels.push_back(c->label); + } + prev_label = c->label; + c = c->parent; + } + std::reverse(labels.begin(), labels.end()); + return labels; + } + + BeamEntry* parent; + int label; + // All instances of child BeamEntry are owned by *beam_root. + gtl::FlatMap*> children; + BeamProbability oldp; + BeamProbability newp; + CTCBeamState state; + + private: + // Constructor giving parent, label, and the beam_root. + // The object pointed to by p cannot be copied and should not be moved, + // otherwise parent will become invalid. + // This private constructor is only called through the factory method + // BeamRoot::AddEntry(). + BeamEntry(BeamEntry* p, int l, BeamRoot* beam_root) + : parent(p), label(l), beam_root(beam_root) {} + BeamRoot* beam_root; + BeamEntry(const BeamEntry&) = delete; + void operator=(const BeamEntry&) = delete; +}; + +// This class owns all instances of BeamEntry. This is used to avoid recursive +// destructor call during destruction. 
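+// A minimal usage sketch, assuming template parameters
+// <class T, class CTCBeamState> instantiated as <float, EmptyBeamState>:
+//
+//   BeamRoot<float, EmptyBeamState> root(nullptr, -1);
+//   BeamEntry<float, EmptyBeamState>& a = root.RootEntry()->GetChild(0);
+//   BeamEntry<float, EmptyBeamState>& ab = a.GetChild(1);
+//   std::vector<int> labels = ab.LabelSeq(/*merge_repeated=*/false);  // {0, 1}
+//
+// Every BeamEntry created through GetChild() is allocated via AddEntry() and
+// is therefore owned by the BeamRoot, so tearing down the root releases the
+// whole prefix tree without recursive destructor calls.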
+template +class BeamRoot { + public: + BeamRoot(BeamEntry* p, int l) { + root_entry_ = AddEntry(p, l); + } + BeamRoot(const BeamRoot&) = delete; + BeamRoot& operator=(const BeamRoot&) = delete; + + BeamEntry* AddEntry(BeamEntry* p, int l) { + auto* new_entry = new BeamEntry(p, l, this); + beam_entries_.emplace_back(new_entry); + return new_entry; + } + BeamEntry* RootEntry() const { return root_entry_; } + + private: + BeamEntry* root_entry_ = nullptr; + std::vector>> beam_entries_; +}; + +// BeamComparer is the default beam comparer provided in CTCBeamSearch. +template +class BeamComparer { + public: + virtual ~BeamComparer() {} + virtual bool inline operator()(const BeamEntry* a, + const BeamEntry* b) const { + return a->newp.total > b->newp.total; + } +}; + +} // namespace ctc_beam_search + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_ENTRY_H_ +// LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_beam_entry.h) diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_scorer.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_scorer.h new file mode 100644 index 00000000..1ea370f4 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_scorer.h @@ -0,0 +1,77 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// LINT.IfChange + +// Collection of scoring classes that can be extended and provided to the +// CTCBeamSearchDecoder to incorporate additional scoring logic (such as a +// language model). +// +// To build a custom scorer extend and implement the pure virtual methods from +// BeamScorerInterface. The default CTC decoding behavior is implemented +// through BaseBeamScorer. + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SCORER_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SCORER_H_ + +#include "tensorflow/core/util/ctc/ctc_beam_entry.h" + +namespace tensorflow { +namespace ctc { + +// Base implementation of a beam scorer used by default by the decoder that can +// be subclassed and provided as an argument to CTCBeamSearchDecoder, if complex +// scoring is required. Its main purpose is to provide a thin layer for +// integrating language model scoring easily. +template +class BaseBeamScorer { + public: + virtual ~BaseBeamScorer() {} + // State initialization. + virtual void InitializeState(CTCBeamState* root) const {} + // ExpandState is called when expanding a beam to one of its children. + // Called at most once per child beam. In the simplest case, no state + // expansion is done. + virtual void ExpandState(const CTCBeamState& from_state, int from_label, + CTCBeamState* to_state, int to_label) const {} + // ExpandStateEnd is called after decoding has finished. Its purpose is to + // allow a final scoring of the beam in its current state, before resorting + // and retrieving the TopN requested candidates. Called at most once per beam. 
+ virtual void ExpandStateEnd(CTCBeamState* state) const {} + // GetStateExpansionScore should be an inexpensive method to retrieve the + // (cached) expansion score computed within ExpandState. The score is + // multiplied (log-addition) with the input score at the current step from + // the network. + // + // The score returned should be a log-probability. In the simplest case, as + // there's no state expansion logic, the expansion score is zero. + virtual T GetStateExpansionScore(const CTCBeamState& state, + T previous_score) const { + return previous_score; + } + // GetStateEndExpansionScore should be an inexpensive method to retrieve the + // (cached) expansion score computed within ExpandStateEnd. The score is + // multiplied (log-addition) with the final probability of the beam. + // + // The score returned should be a log-probability. + virtual T GetStateEndExpansionScore(const CTCBeamState& state) const { + return T(0); + } +}; + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SCORER_H_ +// LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_beam_scorer.h) diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_search.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_search.h new file mode 100644 index 00000000..a592d7a3 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_beam_search.h @@ -0,0 +1,437 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// LINT.IfChange + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_ + +#include +#include +#include +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/top_n.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/ctc/ctc_beam_entry.h" +#include "tensorflow/core/util/ctc/ctc_beam_scorer.h" +#include "tensorflow/core/util/ctc/ctc_decoder.h" +#include "tensorflow/core/util/ctc/ctc_loss_util.h" + +namespace tensorflow { +namespace ctc { + +template > +class CTCBeamSearchDecoder : public CTCDecoder { + // Beam Search + // + // Example (GravesTh Fig. 7.5): + // a - + // P = [ 0.3 0.7 ] t = 0 + // [ 0.4 0.6 ] t = 1 + // + // Then P(l = -) = P(--) = 0.7 * 0.6 = 0.42 + // P(l = a) = P(a-) + P(aa) + P(-a) = 0.3*0.4 + ... = 0.58 + // + // In this case, Best Path decoding is suboptimal. + // + // For Beam Search, we use the following main recurrence relations: + // + // Relation 1: + // ---------------------------------------------------------- Eq. 1 + // P(l=abcd @ t=7) = P(l=abc @ t=6) * P(d @ 7) + // + P(l=abcd @ t=6) * (P(d @ 7) + P(- @ 7)) + // where P(l=? @ t=7), ? 
= a, ab, abc, abcd are all stored and + // updated recursively in the beam entry. + // + // Relation 2: + // ---------------------------------------------------------- Eq. 2 + // P(l=abc? @ t=3) = P(l=abc @ t=2) * P(? @ 3) + // for ? in a, b, d, ..., (not including c or the blank index), + // and the recurrence starts from the beam entry for P(l=abc @ t=2). + // + // For this case, the length of the new sequence equals t+1 (t + // starts at 0). This special case can be calculated as: + // P(l=abc? @ t=3) = P(a @ 0)*P(b @ 1)*P(c @ 2)*P(? @ 3) + // but we calculate it recursively for speed purposes. + typedef ctc_beam_search::BeamEntry BeamEntry; + typedef ctc_beam_search::BeamRoot BeamRoot; + typedef ctc_beam_search::BeamProbability BeamProbability; + + public: + typedef BaseBeamScorer DefaultBeamScorer; + + // The beam search decoder is constructed specifying the beam_width (number of + // candidates to keep at each decoding timestep) and a beam scorer (used for + // custom scoring, for example enabling the use of a language model). + // The ownership of the scorer remains with the caller. The default + // implementation, CTCBeamSearchDecoder<>::DefaultBeamScorer, generates the + // standard beam search. + CTCBeamSearchDecoder(int num_classes, int beam_width, + BaseBeamScorer* scorer, + int batch_size = 1, bool merge_repeated = false) + : CTCDecoder(num_classes, batch_size, merge_repeated), + beam_width_(beam_width), + leaves_(beam_width), + beam_scorer_(CHECK_NOTNULL(scorer)) { + Reset(); + } + + ~CTCBeamSearchDecoder() override {} + + // Run the hibernating beam search algorithm on the given input. + absl::Status Decode(const typename CTCDecoder::SequenceLength& seq_len, + const std::vector::Input>& input, + std::vector::Output>* output, + typename CTCDecoder::ScoreOutput* scores) override; + + // Calculate the next step of the beam search and update the internal state. + template + void Step(const Vector& log_input_t); + + template + T GetTopK(const int K, const Vector& input, std::vector* top_k_logits, + std::vector* top_k_indices); + + // Retrieve the beam scorer instance used during decoding. + BaseBeamScorer* GetBeamScorer() const { + return beam_scorer_; + } + + // Set label selection parameters for faster decoding. + // See comments for label_selection_size_ and label_selection_margin_. + void SetLabelSelectionParameters(int label_selection_size, + T label_selection_margin) { + label_selection_size_ = label_selection_size; + label_selection_margin_ = label_selection_margin; + } + + // Reset the beam search + void Reset(); + + // Extract the top n paths at current time step + absl::Status TopPaths(int n, std::vector>* paths, + std::vector* log_probs, bool merge_repeated) const; + + private: + int beam_width_; + + // Label selection is designed to avoid possibly very expensive scorer calls, + // by pruning the hypotheses based on the input alone. + // Label selection size controls how many items in each beam are passed + // through to the beam scorer. Only items with top N input scores are + // considered. + // Label selection margin controls the difference between minimal input score + // (versus the best scoring label) for an item to be passed to the beam + // scorer. This margin is expressed in terms of log-probability. + // Default is to do no label selection. + // For more detail: https://research.google.com/pubs/pub44823.html + int label_selection_size_ = 0; // zero means unlimited + T label_selection_margin_ = -1; // -1 means unlimited. 
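+  // For example (illustrative values, not defaults):
+  //   decoder.SetLabelSelectionParameters(40, T(9.0));
+  // restricts each expansion step to the 40 highest-scoring labels and skips
+  // any label whose logit is more than 9.0 below the best logit at that step,
+  // before the beam scorer is ever consulted.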
+ + gtl::TopN leaves_; + std::unique_ptr beam_root_; + BaseBeamScorer* beam_scorer_; + + CTCBeamSearchDecoder(const CTCBeamSearchDecoder&) = delete; + void operator=(const CTCBeamSearchDecoder&) = delete; +}; + +template +absl::Status CTCBeamSearchDecoder::Decode( + const typename CTCDecoder::SequenceLength& seq_len, + const std::vector::Input>& input, + std::vector::Output>* output, + typename CTCDecoder::ScoreOutput* scores) { + // Storage for top paths. + std::vector> beams; + std::vector beam_log_probabilities; + int top_n = output->size(); + if (std::any_of(output->begin(), output->end(), + [this](const typename CTCDecoder::Output& output) -> bool { + return output.size() < this->batch_size_; + })) { + return errors::InvalidArgument( + "output needs to be of size at least (top_n, batch_size)."); + } + if (scores->rows() < this->batch_size_ || scores->cols() < top_n) { + return errors::InvalidArgument( + "scores needs to be of size at least (batch_size, top_n)."); + } + + for (int b = 0; b < this->batch_size_; ++b) { + int seq_len_b = seq_len[b]; + Reset(); + + for (int t = 0; t < seq_len_b; ++t) { + // Pass log-probabilities for this example + time. + Step(input[t].row(b)); + } // for (int t... + + // O(n * log(n)) + std::unique_ptr> branches(leaves_.Extract()); + leaves_.Reset(); + for (int i = 0; i < branches->size(); ++i) { + BeamEntry* entry = (*branches)[i]; + beam_scorer_->ExpandStateEnd(&entry->state); + entry->newp.total += + beam_scorer_->GetStateEndExpansionScore(entry->state); + leaves_.push(entry); + } + + absl::Status status = + TopPaths(top_n, &beams, &beam_log_probabilities, this->merge_repeated_); + if (!status.ok()) { + return status; + } + + CHECK_EQ(top_n, beam_log_probabilities.size()); + CHECK_EQ(beams.size(), beam_log_probabilities.size()); + + for (int i = 0; i < top_n; ++i) { + // Copy output to the correct beam + batch + (*output)[i][b].swap(beams[i]); + (*scores)(b, i) = -beam_log_probabilities[i]; + } + } // for (int b... + return absl::OkStatus(); +} + +template +template +T CTCBeamSearchDecoder::GetTopK( + const int K, const Vector& input, std::vector* top_k_logits, + std::vector* top_k_indices) { + // Find Top K choices, complexity nk in worst case. The array input is read + // just once. + CHECK_EQ(this->num_classes_, input.size()); + top_k_logits->clear(); + top_k_indices->clear(); + top_k_logits->resize(K, -INFINITY); + top_k_indices->resize(K, -1); + for (int j = 0; j < this->num_classes_ - 1; ++j) { + const T logit = input(j); + if (logit > (*top_k_logits)[K - 1]) { + int k = K - 1; + while (k > 0 && logit > (*top_k_logits)[k - 1]) { + (*top_k_logits)[k] = (*top_k_logits)[k - 1]; + (*top_k_indices)[k] = (*top_k_indices)[k - 1]; + k--; + } + (*top_k_logits)[k] = logit; + (*top_k_indices)[k] = j; + } + } + // Return max value which is in 0th index or blank character logit + return std::max((*top_k_logits)[0], input(this->num_classes_ - 1)); +} + +template +template +void CTCBeamSearchDecoder::Step( + const Vector& raw_input) { + std::vector top_k_logits; + std::vector top_k_indices; + const bool top_k = + (label_selection_size_ > 0 && label_selection_size_ < raw_input.size()); + // Number of character classes to consider in each step. + const int max_classes = + top_k ? label_selection_size_ : (this->num_classes_ - 1); + // Get max coefficient and remove it from raw_input later. 
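+  // The per-step log-probabilities used below are
+  //   log p(j) = raw_input(j) - norm_offset, with
+  //   norm_offset = max_coeff + log(sum_j exp(raw_input(j) - max_coeff)),
+  // i.e. a numerically stable log-softmax; subtracting max_coeff first keeps
+  // the exponentials from overflowing.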
+ T max_coeff; + if (top_k) { + max_coeff = GetTopK(label_selection_size_, raw_input, &top_k_logits, + &top_k_indices); + } else { + max_coeff = raw_input.maxCoeff(); + } + // Get normalization term of softmax: log(sum(exp(logit[j]-max_coeff))). + T logsumexp = T(0.0); + for (int j = 0; j < raw_input.size(); ++j) { + logsumexp += Eigen::numext::exp(raw_input(j) - max_coeff); + } + logsumexp = Eigen::numext::log(logsumexp); + // Final normalization offset to get correct log probabilities. + T norm_offset = max_coeff + logsumexp; + + const T label_selection_input_min = + (label_selection_margin_ >= 0) ? (max_coeff - label_selection_margin_) + : -std::numeric_limits::infinity(); + + // Extract the beams sorted in decreasing new probability + CHECK_EQ(this->num_classes_, raw_input.size()); + + std::unique_ptr> branches(leaves_.Extract()); + leaves_.Reset(); + + for (BeamEntry* b : *branches) { + // P(.. @ t) becomes the new P(.. @ t-1) + b->oldp = b->newp; + } + + for (BeamEntry* b : *branches) { + if (b->parent != nullptr) { // if not the root + if (b->parent->Active()) { + // If last two sequence characters are identical: + // Plabel(l=acc @ t=6) = (Plabel(l=acc @ t=5) + // + Pblank(l=ac @ t=5)) + // else: + // Plabel(l=abc @ t=6) = (Plabel(l=abc @ t=5) + // + P(l=ab @ t=5)) + T previous = (b->label == b->parent->label) ? b->parent->oldp.blank + : b->parent->oldp.total; + b->newp.label = + LogSumExp(b->newp.label, + beam_scorer_->GetStateExpansionScore(b->state, previous)); + } + // Plabel(l=abc @ t=6) *= P(c @ 6) + b->newp.label += raw_input(b->label) - norm_offset; + } + // Pblank(l=abc @ t=6) = P(l=abc @ t=5) * P(- @ 6) + b->newp.blank = b->oldp.total + raw_input(this->blank_index_) - norm_offset; + // P(l=abc @ t=6) = Plabel(l=abc @ t=6) + Pblank(l=abc @ t=6) + b->newp.total = LogSumExp(b->newp.blank, b->newp.label); + + // Push the entry back to the top paths list. + // Note, this will always fill leaves back up in sorted order. + leaves_.push(b); + } + + // we need to resort branches in descending oldp order. + + // branches is in descending oldp order because it was + // originally in descending newp order and we copied newp to oldp. + + // Grow new leaves + for (BeamEntry* b : *branches) { + // A new leaf (represented by its BeamProbability) is a candidate + // iff its total probability is nonzero and either the beam list + // isn't full, or the lowest probability entry in the beam has a + // lower probability than the leaf. + auto is_candidate = [this](const BeamProbability& prob) { + return (prob.total > kLogZero() && + (leaves_.size() < beam_width_ || + prob.total > leaves_.peek_bottom()->newp.total)); + }; + + if (!is_candidate(b->oldp)) { + continue; + } + + for (int ind = 0; ind < max_classes; ind++) { + const int label = top_k ? top_k_indices[ind] : ind; + const T logit = top_k ? top_k_logits[ind] : raw_input(ind); + // Perform label selection: if input for this label looks very + // unpromising, never evaluate it with a scorer. + // We may compare logits instead of log probabilities, + // since the difference is the same in both cases. 
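+      // (Both sides of the comparison omit the same norm_offset, so comparing
+      // raw logits against max_coeff - label_selection_margin_ is equivalent
+      // to comparing log-probabilities against the margin.)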
+ if (logit < label_selection_input_min) { + continue; + } + BeamEntry& c = b->GetChild(label); + if (!c.Active()) { + // Pblank(l=abcd @ t=6) = 0 + c.newp.blank = kLogZero(); + // If new child label is identical to beam label: + // Plabel(l=abcc @ t=6) = Pblank(l=abc @ t=5) * P(c @ 6) + // Otherwise: + // Plabel(l=abcd @ t=6) = P(l=abc @ t=5) * P(d @ 6) + beam_scorer_->ExpandState(b->state, b->label, &c.state, c.label); + T previous = (c.label == b->label) ? b->oldp.blank : b->oldp.total; + c.newp.label = logit - norm_offset + + beam_scorer_->GetStateExpansionScore(c.state, previous); + // P(l=abcd @ t=6) = Plabel(l=abcd @ t=6) + c.newp.total = c.newp.label; + + if (is_candidate(c.newp)) { + // Before adding the new node to the beam, check if the beam + // is already at maximum width. + if (leaves_.size() == beam_width_) { + // Bottom is no longer in the beam search. Reset + // its probability; signal it's no longer in the beam search. + BeamEntry* bottom = leaves_.peek_bottom(); + bottom->newp.Reset(); + } + leaves_.push(&c); + } else { + // Deactivate child. + c.oldp.Reset(); + c.newp.Reset(); + } + } + } + } // for (BeamEntry* b... +} + +template +void CTCBeamSearchDecoder::Reset() { + leaves_.Reset(); + + // This beam root, and all of its children, will be in memory until + // the next reset. + beam_root_.reset(new BeamRoot(nullptr, -1)); + beam_root_->RootEntry()->newp.total = T(0.0); // ln(1) + beam_root_->RootEntry()->newp.blank = T(0.0); // ln(1) + + // Add the root as the initial leaf. + leaves_.push(beam_root_->RootEntry()); + + // Call initialize state on the root object. + beam_scorer_->InitializeState(&beam_root_->RootEntry()->state); +} + +template +absl::Status CTCBeamSearchDecoder::TopPaths( + int n, std::vector>* paths, std::vector* log_probs, + bool merge_repeated) const { + CHECK_NOTNULL(paths)->clear(); + CHECK_NOTNULL(log_probs)->clear(); + if (n > beam_width_) { + return errors::InvalidArgument("requested more paths than the beam width."); + } + if (n > leaves_.size()) { + return errors::InvalidArgument( + "Less leaves in the beam search than requested."); + } + + gtl::TopN top_branches(n); + + // O(beam_width_ * log(n)), space complexity is O(n) + for (auto it = leaves_.unsorted_begin(); it != leaves_.unsorted_end(); ++it) { + top_branches.push(*it); + } + // O(n * log(n)) + std::unique_ptr> branches(top_branches.Extract()); + + for (int i = 0; i < n; ++i) { + BeamEntry* e((*branches)[i]); + paths->push_back(e->LabelSeq(merge_repeated)); + log_probs->push_back(e->newp.total); + } + return absl::OkStatus(); +} + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_ +// LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_beam_search.h) diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_decoder.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_decoder.h new file mode 100644 index 00000000..8e6b3477 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_decoder.h @@ -0,0 +1,122 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// LINT.IfChange + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ + +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace ctc { + +// The CTCDecoder is an abstract interface to be implemented when providing a +// decoding method on the timestep output of a RNN trained with CTC loss. +// +// The two types of decoding available are: +// - greedy path, through the CTCGreedyDecoder +// - beam search, through the CTCBeamSearchDecoder +template +class CTCDecoder { + public: + typedef Eigen::Map SequenceLength; + typedef Eigen::Map> + Input; + typedef std::vector> Output; + typedef Eigen::Map> + ScoreOutput; + + CTCDecoder(int num_classes, int batch_size, bool merge_repeated) + : num_classes_(num_classes), + blank_index_(num_classes - 1), + batch_size_(batch_size), + merge_repeated_(merge_repeated) {} + + virtual ~CTCDecoder() {} + + // Dimensionality of the input/output is expected to be: + // - seq_len[b] - b = 0 to batch_size_ + // - input[t].rows(b) - t = 0 to timesteps; b = 0 t batch_size_ + // - output.size() specifies the number of beams to be returned. + // - scores(b, i) - b = 0 to batch_size; i = 0 to output.size() + virtual absl::Status Decode(const SequenceLength& seq_len, + const std::vector& input, + std::vector* output, + ScoreOutput* scores) = 0; + + int batch_size() { return batch_size_; } + int num_classes() { return num_classes_; } + + protected: + int num_classes_; + int blank_index_; + int batch_size_; + bool merge_repeated_; +}; + +// CTCGreedyDecoder is an implementation of the simple best path decoding +// algorithm, selecting at each timestep the most likely class at each timestep. 
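+// For example (illustrative): with num_classes = 3 (so the blank index is 2)
+// and per-timestep argmax classes {1, 1, 2, 1}:
+//   merge_repeated = false  ->  output {1, 1, 1}  (blanks are always dropped)
+//   merge_repeated = true   ->  output {1, 1}     (the repeat at t = 1 is
+//                                                  merged; the blank at t = 2
+//                                                  separates the final 1)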
+template +class CTCGreedyDecoder : public CTCDecoder { + public: + typedef CTCDecoder Decoder; + CTCGreedyDecoder(int num_classes, int batch_size, bool merge_repeated) + : CTCDecoder(num_classes, batch_size, merge_repeated) {} + + absl::Status Decode(const typename CTCDecoder::SequenceLength& seq_len, + const std::vector::Input>& input, + std::vector::Output>* output, + typename CTCDecoder::ScoreOutput* scores) override { + if (output->empty() || (*output)[0].size() < Decoder::batch_size_) { + return errors::InvalidArgument( + "output needs to be of size at least (1, batch_size)."); + } + if (scores->rows() < Decoder::batch_size_ || scores->cols() == 0) { + return errors::InvalidArgument( + "scores needs to be of size at least (batch_size, 1)."); + } + // For each batch entry, identify the transitions + for (int b = 0; b < Decoder::batch_size_; ++b) { + int seq_len_b = seq_len[b]; + // Only writing to beam 0 + std::vector& output_b = (*output)[0][b]; + + int prev_class_ix = -1; + (*scores)(b, 0) = 0; + for (int t = 0; t < seq_len_b; ++t) { + auto row = input[t].row(b); + int max_class_ix; + (*scores)(b, 0) += -row.maxCoeff(&max_class_ix); + if (max_class_ix != Decoder::blank_index_ && + !(Decoder::merge_repeated_ && max_class_ix == prev_class_ix)) { + output_b.push_back(max_class_ix); + } + prev_class_ix = max_class_ix; + } + } + return absl::OkStatus(); + } +}; + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ +// LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_decoder.h) diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_calculator.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_calculator.h new file mode 100644 index 00000000..12c4ac0a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_calculator.h @@ -0,0 +1,544 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_CALCULATOR_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_CALCULATOR_H_ + +#include + +#include "Eigen/Core" // from @eigen_archive +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/ctc/ctc_loss_util.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +namespace ctc { + +template +class CTCLossCalculator { + // Connectionist Temporal Classification Loss + // + // Implementation by kanishkarao@, posenhuang@, and ebrevdo@. + // + // The CTC Loss layer learns a *transition* probability value for each + // input time step. The transitions are on the class alphabet + // {0, 1, ..., N-2} + // where N is the depth of the input layer (the size of the alphabet is N-1). 
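+  // For example (illustrative, using the default convention described here):
+  // with a 27-symbol alphabet the input layer must have depth N = 28, training
+  // labels take values in {0, ..., 26}, and index 27 is the reserved blank.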
+ // Note: The token N-1 is reserved for the "no transition" output, so + // make sure that your input layer has a depth that's one larger than + // the set of classes you're training on. Also make sure that your + // training labels do not have a class value of N-1, as training will skip + // these examples. + // + // Reference materials: + // GravesTh: Alex Graves, "Supervised Sequence Labeling with Recurrent + // Neural Networks" (PhD Thesis), Technische Universit¨at M¨unchen. + public: + typedef std::vector> LabelSequences; + using Matrix = Eigen::Matrix; + // typedef Eigen::MatrixXd Matrix; + using Array = Eigen::Array; + // typedef Eigen::ArrayXd Array; + using InputMap = Eigen::Map; + // typedef Eigen::Map InputMap; + using OutputMap = Eigen::Map; + // typedef Eigen::Map OutputMap; + + CTCLossCalculator(int blank_index, int output_delay) + : blank_index_(blank_index), output_delay_(output_delay) {} + + template + absl::Status CalculateLoss( + const VectorIn& seq_len, const LabelSequences& labels, + const std::vector& inputs, bool preprocess_collapse_repeated, + bool ctc_merge_repeated, bool ignore_longer_outputs_than_inputs, + VectorOut* loss, std::vector* gradients, + DeviceBase::CpuWorkerThreads* workers = nullptr) const; + + private: + void CalculateForwardVariables(const std::vector& l_prime, + const Matrix& y, bool ctc_merge_repeated, + Matrix* log_alpha) const; + + void CalculateBackwardVariables(const std::vector& l_prime, + const Matrix& y, bool ctc_merge_repeated, + Matrix* log_beta) const; + + void CalculateGradient(const std::vector& l_prime, const Matrix& y, + const Matrix& log_alpha, const Matrix& log_beta, + T log_p_z_x, Matrix* dy) const; + + void GetLPrimeIndices(const std::vector& l, + std::vector* l_prime) const; + + // Helper function that calculates the l_prime indices for all + // batches at the same time, and identifies errors for any given + // batch. Return value: + // max_{b in batch_size} l_primes[b].size() + template + absl::Status PopulateLPrimes(bool preprocess_collapse_repeated, + bool ignore_longer_outputs_than_inputs, + int batch_size, int num_classes, + const Vector& seq_len, + const LabelSequences& labels, + size_t* max_u_prime, + LabelSequences* l_primes) const; + + // Utility indices for the CTC algorithm. + int blank_index_; + + // Delay for target labels in time steps. + // The delay in time steps before the output sequence. 
+ const int output_delay_; +}; + +template +template +absl::Status CTCLossCalculator::CalculateLoss( + const VectorIn& seq_len, const LabelSequences& labels, + const std::vector& inputs, bool preprocess_collapse_repeated, + bool ctc_merge_repeated, bool ignore_longer_outputs_than_inputs, + VectorOut* loss, std::vector* gradients, + DeviceBase::CpuWorkerThreads* workers) const { + using Eigen::numext::log; + + auto num_time_steps = inputs.size(); + + if (loss == nullptr) { + return errors::InvalidArgument("loss == nullptr"); + } + + bool requires_backprop = (gradients != nullptr); + + auto batch_size = inputs[0].rows(); + auto num_classes = inputs[0].cols(); + + if (loss->size() != batch_size) { + return errors::InvalidArgument("loss.size() != batch_size"); + } + loss->setZero(); + + for (int t = 1; t < num_time_steps; ++t) { + if (inputs[t].rows() != batch_size) { + return errors::InvalidArgument("Expected batch size at t: ", t, + " to be: ", batch_size, + " but got: ", inputs[t].rows()); + } + if (inputs[t].cols() != num_classes) { + return errors::InvalidArgument("Expected class count at t: ", t, + " to be: ", num_classes, + " but got: ", inputs[t].cols()); + } + } + + // Check validity of sequence_length array values. + auto max_seq_len = seq_len(0); + for (int b = 0; b < batch_size; b++) { + if (seq_len(b) < 0) { + return errors::InvalidArgument("seq_len(", b, ") < 0"); + } + if (seq_len(b) > num_time_steps) { + return errors::InvalidArgument("seq_len(", b, ") > num_time_steps"); + } + max_seq_len = std::max(seq_len(b), max_seq_len); + } + + // Calculate the modified label sequence l' for each batch element, + // and calculate the maximum necessary allocation size. + LabelSequences l_primes(batch_size); + size_t max_u_prime = 0; + absl::Status l_p_ret = PopulateLPrimes( + preprocess_collapse_repeated, ignore_longer_outputs_than_inputs, + batch_size, num_classes, seq_len, labels, &max_u_prime, &l_primes); + if (!l_p_ret.ok()) { + return l_p_ret; + } + + // Process each item in a batch in parallel, using at most kMaxThreads. + auto ComputeLossAndGradients = [this, num_classes, &labels, &l_primes, + &seq_len, &inputs, requires_backprop, + ctc_merge_repeated, + ignore_longer_outputs_than_inputs, &loss, + &gradients](int64_t start_row, + int64_t limit_row) { + for (int b = start_row; b < limit_row; b++) { + // Return zero gradient for empty sequences or sequences with labels + // longer than input, which is not supported by CTC. + if (seq_len(b) == 0 || + (ignore_longer_outputs_than_inputs && + labels[b].size() > seq_len(b) - this->output_delay_)) { + VLOG(1) << "The sequence length is either zero or shorter than the " + "target output (CTC works only with shorter target sequence " + "than input sequence). You can turn this into a warning by " + "using the flag ignore_longer_outputs_than_inputs - " + << b << ": " << absl::StrJoin(labels[b], " "); + continue; + } + + // For each batch element, log(alpha) and log(beta). + // row size is: u_prime == l_prime.size() + // col size is: seq_len[b] - output_delay_ + const std::vector& l_prime = l_primes[b]; + + Matrix log_alpha_b(l_prime.size(), seq_len(b) - this->output_delay_); + Matrix log_beta_b(l_prime.size(), seq_len(b) - this->output_delay_); + + // Work matrices, pre-allocated to the size required by this batch item. + Matrix y(num_classes, seq_len(b)); + Matrix dy; + if (requires_backprop) { + dy = Matrix::Zero(y.rows(), y.cols()); + } + + // For this batch, we'll only work with this shortened sequence_length. 
+ Matrix y_b = y.leftCols(seq_len(b)); + + // Convert label from DistBelief + // y, prob are in num_classes x seq_len(b) + // Output activations. + Array y_b_col; + for (int t = 0; t < seq_len(b); t++) { + // Calculate the softmax of y_b. Use original precision + // arithmetic for the sum. + T max_coeff = inputs[t].row(b).maxCoeff(); + y_b_col = (inputs[t].row(b).array() - max_coeff).exp(); + y_b.col(t) = y_b_col / y_b_col.sum(); + } + + // Compute forward, backward. + // Forward variables. + CalculateForwardVariables(l_prime, y_b, ctc_merge_repeated, &log_alpha_b); + // Backward variables. + CalculateBackwardVariables(l_prime, y_b, ctc_merge_repeated, &log_beta_b); + + // The loss is computed as the log(p(z|x)) between the target and + // prediction. Do lazy evaluation of log_prob here. + T log_p_z_x = kLogZero(); + for (int u = 0; u < l_prime.size(); ++u) { + // (GravesTh) Eq 7.26, sum over all paths for t = 0. + log_p_z_x = LogSumExp(log_p_z_x, log_alpha_b(u, 0) + log_beta_b(u, 0)); + } + + (*loss)(b) = -log_p_z_x; // Use negative log loss for display. + + // We compute the derivative if needed. + if (requires_backprop) { + // Gradients with respect to input activations. + // Calculate gradient. + dy.setZero(); + CalculateGradient(l_prime, y_b, log_alpha_b, log_beta_b, log_p_z_x, + &dy); + + // Convert gradient for current sample to DistBelief. + for (int t = 0; t < seq_len(b); t++) { + (*gradients)[t].row(b).array() = dy.col(t); + } + } + } // for (int b = ... + }; + if (workers) { + // *Rough* estimate of the cost for one item in the batch. + // Forward, Backward: O(T * U (= 2L + 1)), Gradients: O(T * (U + L)). + // + // softmax: T * L * (Cost(Exp) + Cost(Div))softmax + + // fwd,bwd: T * 2 * (2*L + 1) * (Cost(LogSumExp) + Cost(Log)) + + // grad: T * ((2L + 1) * Cost(LogSumExp) + L * (Cost(Expf) + Cost(Add)). + const int64_t cost_exp = Eigen::internal::functor_traits< + Eigen::internal::scalar_exp_op>::Cost; + const int64_t cost_log = Eigen::internal::functor_traits< + Eigen::internal::scalar_log_op>::Cost; + const int64_t cost_log_sum_exp = + Eigen::TensorOpCost::AddCost() + cost_exp + cost_log; + const int64_t cost = + max_seq_len * num_classes * + (cost_exp + Eigen::TensorOpCost::DivCost()) + + max_seq_len * 2 * (2 * num_classes + 1) * + (cost_log_sum_exp + cost_log) + + max_seq_len * + ((2 * num_classes + 1) * cost_log_sum_exp + + num_classes * (cost_exp + Eigen::TensorOpCost::AddCost())); + Shard(workers->num_threads, workers->workers, batch_size, cost, + ComputeLossAndGradients); + } else { + ComputeLossAndGradients(0, batch_size); + } + return absl::OkStatus(); +} + +template +template +absl::Status CTCLossCalculator::PopulateLPrimes( + bool preprocess_collapse_repeated, bool ignore_longer_outputs_than_inputs, + int batch_size, int num_classes, const Vector& seq_len, + const LabelSequences& labels, size_t* max_u_prime, + LabelSequences* l_primes) const { + // labels is a Label array of size batch_size + if (labels.size() != batch_size) { + return errors::InvalidArgument( + "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size); + } + + *max_u_prime = 0; // keep track of longest l' modified label sequence. + for (int b = 0; b < batch_size; b++) { + // Assume label is in Label proto + const std::vector& label = labels[b]; + if (label.size() == 0) { + return errors::InvalidArgument("Labels length is zero in batch ", b); + } + + // If debugging: output the labels coming into training. 
+ // + VLOG(2) << "label for batch: " << b << ": " << absl::StrJoin(label, " "); + + // Target indices, length = U. + std::vector l; + + // Convert label from DistBelief + bool finished_sequence = false; + for (int i = 0; i < label.size(); ++i) { + if (i == 0 || !preprocess_collapse_repeated || label[i] != label[i - 1]) { + if (label[i] >= num_classes - 1) { + finished_sequence = true; + } else { + if (finished_sequence) { + // Saw an invalid sequence with non-null following null + // labels. + return errors::InvalidArgument( + "Saw a non-null label (index >= num_classes - 1) " + "following a ", + "null label, batch: ", b, " num_classes: ", num_classes, + " labels: ", absl::StrJoin(label, ","), + " labels seen so far: ", absl::StrJoin(l, ",")); + } + l.push_back(label[i]); + } + } + } + + for (int l_i : l) { + if (l_i < 0) { + return errors::InvalidArgument( + "All labels must be nonnegative integers, batch: ", b, + " labels: ", absl::StrJoin(l, ",")); + } else if (l_i >= num_classes) { + return errors::InvalidArgument( + "No label may be greater than num_classes. ", + "num_classes: ", num_classes, ", batch: ", b, + " labels: ", absl::StrJoin(l, ",")); + } + } + if (!ignore_longer_outputs_than_inputs) { + // Make sure there is enough time to output the target indices. + int time = seq_len(b) - output_delay_; + int required_time = label.size(); + if (required_time > time) { + return errors::InvalidArgument( + "Not enough time for target transition sequence (" + "required: ", + required_time, ", available: ", time, ")", b, + "You can turn this error into a warning by using the flag " + "ignore_longer_outputs_than_inputs"); + } + } + // Target indices with blanks before each index and a blank at the end. + // Length U' = 2U + 1. + // Convert l to l_prime + GetLPrimeIndices(l, &l_primes->at(b)); + *max_u_prime = std::max(*max_u_prime, l_primes->at(b).size()); + } + return absl::OkStatus(); +} + +// Calculates the alpha(t, u) as described in (GravesTh) Section 7.3. +// Starting with t = 0 instead of t = 1 used in the text. +// Based on Kanishka's CTC. +template +void CTCLossCalculator::CalculateForwardVariables( + const std::vector& l_prime, const Matrix& y, bool ctc_merge_repeated, + Matrix* log_alpha) const { + using Eigen::numext::log; + + // Number of cols is the number of time steps = number of cols in target + // after the output delay. + log_alpha->setConstant(kLogZero()); + + int U = l_prime.size(); + int T = log_alpha->cols(); + + CHECK_EQ(U, log_alpha->rows()); + + // Initial alpha values in (GravesTh) Eq 7.5 and Eq 7.6. + log_alpha->coeffRef(0, 0) = log(y(blank_index_, output_delay_)); + // Below, l_prime[1] == labels[0] + auto label_0 = (l_prime.size() > 1) ? l_prime[1] : blank_index_; + log_alpha->coeffRef(1, 0) = log(y(label_0, output_delay_)); + + for (int t = 1; t < T; ++t) { + // If there is not enough time to output the remaining labels or + // some labels have been skipped, then let log_alpha(u, t) continue to + // be kLogZero. + for (int u = std::max(0, U - (2 * (T - t))); u < std::min(U, 2 * (t + 1)); + ++u) { + // Begin (GravesTh) Eq 7.9 + // Add in the u, t - 1 term. + auto sum_log_alpha = kLogZero(); + if (ctc_merge_repeated || l_prime[u] == blank_index_) { + sum_log_alpha = log_alpha->coeff(u, t - 1); + } + + // Add in the u - 1, t - 1 term. + if (u > 0) { + sum_log_alpha = + LogSumExp(sum_log_alpha, log_alpha->coeff(u - 1, t - 1)); + } + + // Add in the u - 2, t - 1 term if l_prime(u) != blank or l_prime(u-2). 
+ if (u > 1) { + const bool matching_labels_merge = + ctc_merge_repeated && (l_prime[u] == l_prime[u - 2]); + if (l_prime[u] != blank_index_ && !matching_labels_merge) { + sum_log_alpha = + LogSumExp(sum_log_alpha, log_alpha->coeff(u - 2, t - 1)); + } + } + // Multiply the summed alphas with the activation log probability. + log_alpha->coeffRef(u, t) = + log(y(l_prime[u], output_delay_ + t)) + sum_log_alpha; + } // End (GravesTh) Eq 7.9. + } +} + +// Calculates the beta(t, u) as described in (GravesTh) Section 7.3. +template +void CTCLossCalculator::CalculateBackwardVariables( + const std::vector& l_prime, const Matrix& y, bool ctc_merge_repeated, + Matrix* log_beta) const { + // Number of cols is the number of time steps = number of cols in target. + // Matrix log_beta = + // Matrix::Constant(l_prime.size(), y.cols() - output_delay_, + // kLogZero); + using Eigen::numext::log; + + log_beta->setConstant(kLogZero()); + int T = log_beta->cols(); + int U = l_prime.size(); + CHECK_EQ(U, log_beta->rows()); + + // Initial beta values in (GravesTh) Eq 7.13: log of probability 1. + for (int u = U - 2; u < U; ++u) log_beta->coeffRef(u, T - 1) = 0; + + for (int t = T - 1 - 1; t >= 0; --t) { + // If there is not enough time to output the remaining labels or + // some labels have been skipped, then let log_beta(u, t) continue to + // be kLogZero. + for (int u = std::max(0, U - (2 * (T - t))); u < std::min(U, 2 * (t + 1)); + ++u) { + // Begin (GravesTh) Eq 7.15 + // Add in the u, t + 1 term. + if (ctc_merge_repeated || l_prime[u] == blank_index_) { + log_beta->coeffRef(u, t) = + LogSumExp(log_beta->coeff(u, t), + log_beta->coeff(u, t + 1) + + log(y(l_prime[u], output_delay_ + t + 1))); + } + + // Add in the u + 1, t + 1 term. + if (u + 1 < U) { + log_beta->coeffRef(u, t) = + LogSumExp(log_beta->coeff(u, t), + log_beta->coeff(u + 1, t + 1) + + log(y(l_prime[u + 1], output_delay_ + t + 1))); + } + + // Add in the u + 2, t + 1 term if l_prime(u) != blank or l_prime(u+2). + if (u + 2 < U) { + const bool matching_labels_merge = + ctc_merge_repeated && (l_prime[u] == l_prime[u + 2]); + if (l_prime[u] != blank_index_ && !matching_labels_merge) { + // Add in u + 2 term. + log_beta->coeffRef(u, t) = + LogSumExp(log_beta->coeff(u, t), + log_beta->coeff(u + 2, t + 1) + + log(y(l_prime[u + 2], output_delay_ + t + 1))); + } + } // End (GravesTh) Eq. 7.15 + } + } +} + +// Using (GravesTh) Eq 7.26 & 7.34. +template +void CTCLossCalculator::CalculateGradient(const std::vector& l_prime, + const Matrix& y, + const Matrix& log_alpha, + const Matrix& log_beta, + TT log_p_z_x, Matrix* dy) const { + // Only working with the leftmost part of dy for this batch element. + auto dy_b = dy->leftCols(y.cols()); + + // It is possible that no valid path is found if the activations for the + // targets are zero. + if (log_p_z_x == kLogZero()) { + LOG(WARNING) << "No valid path found."; + dy_b = y; + return; + } + + int L = y.rows(); + int T = y.cols(); + int U = l_prime.size(); + + for (int t = 0; t < T - output_delay_; ++t) { + Array prob_sum(L); + prob_sum.setConstant(kLogZero()); + + for (int u = 0; u < U; ++u) { + int l = l_prime[u]; + prob_sum[l] = LogSumExp(prob_sum[l], log_alpha(u, t) + log_beta(u, t)); + } + + for (int l = 0; l < L; ++l) { + // Negative term in (GravesTh) Eq 7.28. 
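+      // Here prob_sum[l] = log(sum over u with l_prime[u] == l of
+      // alpha(u, t) * beta(u, t)), so the term below equals
+      // (sum_u alpha * beta) / P(z|x), exponentiated only after the
+      // subtraction in log space to avoid underflow.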
+ auto negative_term = expf(prob_sum[l] - log_p_z_x); + + dy_b(l, output_delay_ + t) = y(l, output_delay_ + t) - negative_term; + } + } +} + +template +void CTCLossCalculator::GetLPrimeIndices(const std::vector& l, + std::vector* l_prime) const { + // Assumption is that l_prime is empty. + l_prime->reserve(2 * l.size() + 1); + + for (auto label : l) { + l_prime->push_back(blank_index_); + l_prime->push_back(label); + } + // Add final blank to l'. + l_prime->push_back(blank_index_); +} + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_CALCULATOR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_util.h b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_util.h new file mode 100644 index 00000000..e9fc99af --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/ctc/ctc_loss_util.h @@ -0,0 +1,55 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// LINT.IfChange + +#ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_UTIL_H_ +#define TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_UTIL_H_ + +#include +#include + +namespace tensorflow { +namespace ctc { + +template +constexpr T kLogZero() { + return -std::numeric_limits::infinity(); // NOLINT +} + +// Add logarithmic probabilities using: +// ln(a + b) = ln(a) + ln(1 + exp(ln(b) - ln(a))) +// The two inputs are assumed to be log probabilities. +// (GravesTh) Eq. 7.18 +template +inline T LogSumExp(T log_prob_1, T log_prob_2) { + // const T kLogZero = -std::numeric_limits::infinity(); + // Always have 'b' be the smaller number to avoid the exponential from + // blowing up. + if (log_prob_1 == kLogZero()) { + return log_prob_2; + } else if (log_prob_2 == kLogZero()) { + return log_prob_1; + } else { + return (log_prob_1 > log_prob_2) + ? log_prob_1 + log1pf(expf(log_prob_2 - log_prob_1)) + : log_prob_2 + log1pf(expf(log_prob_1 - log_prob_2)); + } +} + +} // namespace ctc +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_UTIL_H_ +// LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_loss_util.h) diff --git a/third_party/tflite-hdrs/tensorflow/core/util/cuda_sparse.h b/third_party/tflite-hdrs/tensorflow/core/util/cuda_sparse.h new file mode 100644 index 00000000..ca3ac8ff --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/cuda_sparse.h @@ -0,0 +1,722 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_CUDA_SPARSE_H_ +#define TENSORFLOW_CORE_UTIL_CUDA_SPARSE_H_ + +// This header declares the class GpuSparse, which contains wrappers of +// cuSparse libraries for use in TensorFlow kernels. + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include +#include + +#if GOOGLE_CUDA + +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cusparse.h" + +using gpusparseStatus_t = cusparseStatus_t; +using gpusparseOperation_t = cusparseOperation_t; +using gpusparseMatDescr_t = cusparseMatDescr_t; +using gpusparseAction_t = cusparseAction_t; +using gpusparseHandle_t = cusparseHandle_t; +using gpuStream_t = cudaStream_t; +#if CUDA_VERSION >= 10020 +using gpusparseDnMatDescr_t = cusparseDnMatDescr_t; +using gpusparseSpMatDescr_t = cusparseSpMatDescr_t; +using gpusparseSpMMAlg_t = cusparseSpMMAlg_t; +#endif + +#define GPUSPARSE(postfix) CUSPARSE_##postfix +#define gpusparse(postfix) cusparse##postfix + +#elif TENSORFLOW_USE_ROCM + +#include "rocm/rocm_config.h" +#include "xla/stream_executor/rocm/hipsparse_wrapper.h" + +using gpusparseStatus_t = hipsparseStatus_t; +using gpusparseOperation_t = hipsparseOperation_t; +using gpusparseMatDescr_t = hipsparseMatDescr_t; +using gpusparseAction_t = hipsparseAction_t; +using gpusparseHandle_t = hipsparseHandle_t; +using gpuStream_t = hipStream_t; +#if TF_ROCM_VERSION >= 40200 +using gpusparseDnMatDescr_t = hipsparseDnMatDescr_t; +using gpusparseSpMatDescr_t = hipsparseSpMatDescr_t; +using gpusparseSpMMAlg_t = hipsparseSpMMAlg_t; +#endif +#define GPUSPARSE(postfix) HIPSPARSE_##postfix +#define gpusparse(postfix) hipsparse##postfix + +#endif + +#include "xla/stream_executor/data_type.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/public/version.h" + +#if GOOGLE_CUDA +#include "xla/stream_executor/cuda/cuda_blas_utils.h" +#endif + +// Macro that specializes a sparse method for all 4 standard +// numeric types. 
+// TODO: reuse with cuda_solvers +#define TF_CALL_LAPACK_TYPES(m) \ + m(float, S) m(double, D) m(std::complex, C) m(std::complex, Z) + +namespace tensorflow { + +inline std::string ConvertGPUSparseErrorToString( + const gpusparseStatus_t status) { + switch (status) { +#define STRINGIZE(q) #q +#define RETURN_IF_STATUS(err) \ + case err: \ + return STRINGIZE(err); + +#if GOOGLE_CUDA + + RETURN_IF_STATUS(CUSPARSE_STATUS_SUCCESS) + RETURN_IF_STATUS(CUSPARSE_STATUS_NOT_INITIALIZED) + RETURN_IF_STATUS(CUSPARSE_STATUS_ALLOC_FAILED) + RETURN_IF_STATUS(CUSPARSE_STATUS_INVALID_VALUE) + RETURN_IF_STATUS(CUSPARSE_STATUS_ARCH_MISMATCH) + RETURN_IF_STATUS(CUSPARSE_STATUS_MAPPING_ERROR) + RETURN_IF_STATUS(CUSPARSE_STATUS_EXECUTION_FAILED) + RETURN_IF_STATUS(CUSPARSE_STATUS_INTERNAL_ERROR) + RETURN_IF_STATUS(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED) + + default: + return strings::StrCat("Unknown CUSPARSE error: ", + static_cast(status)); +#elif TENSORFLOW_USE_ROCM + + RETURN_IF_STATUS(HIPSPARSE_STATUS_SUCCESS) + RETURN_IF_STATUS(HIPSPARSE_STATUS_NOT_INITIALIZED) + RETURN_IF_STATUS(HIPSPARSE_STATUS_ALLOC_FAILED) + RETURN_IF_STATUS(HIPSPARSE_STATUS_INVALID_VALUE) + RETURN_IF_STATUS(HIPSPARSE_STATUS_ARCH_MISMATCH) + RETURN_IF_STATUS(HIPSPARSE_STATUS_MAPPING_ERROR) + RETURN_IF_STATUS(HIPSPARSE_STATUS_EXECUTION_FAILED) + RETURN_IF_STATUS(HIPSPARSE_STATUS_INTERNAL_ERROR) + RETURN_IF_STATUS(HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED) + RETURN_IF_STATUS(HIPSPARSE_STATUS_ZERO_PIVOT) + + default: + return strings::StrCat("Unknown hipSPARSE error: ", + static_cast(status)); +#endif + +#undef RETURN_IF_STATUS +#undef STRINGIZE + } +} + +#if GOOGLE_CUDA + +#define TF_RETURN_IF_GPUSPARSE_ERROR(expr) \ + do { \ + auto status = (expr); \ + if (TF_PREDICT_FALSE(status != CUSPARSE_STATUS_SUCCESS)) { \ + return errors::Internal(__FILE__, ":", __LINE__, " (", TF_STR(expr), \ + "): cuSparse call failed with status ", \ + ConvertGPUSparseErrorToString(status)); \ + } \ + } while (0) + +#elif TENSORFLOW_USE_ROCM + +#define TF_RETURN_IF_GPUSPARSE_ERROR(expr) \ + do { \ + auto status = (expr); \ + if (TF_PREDICT_FALSE(status != HIPSPARSE_STATUS_SUCCESS)) { \ + return errors::Internal(__FILE__, ":", __LINE__, " (", TF_STR(expr), \ + "): hipSPARSE call failed with status ", \ + ConvertGPUSparseErrorToString(status)); \ + } \ + } while (0) + +#endif + +inline gpusparseOperation_t TransposeAndConjugateToGpuSparseOp(bool transpose, + bool conjugate, + Status* status) { +#if GOOGLE_CUDA + if (transpose) { + return conjugate ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE + : CUSPARSE_OPERATION_TRANSPOSE; + } else { + if (conjugate) { + DCHECK(status != nullptr); + *status = errors::InvalidArgument( + "Conjugate == True and transpose == False is not supported."); + } + return CUSPARSE_OPERATION_NON_TRANSPOSE; + } +#elif TENSORFLOW_USE_ROCM + if (transpose) { + return conjugate ? 
HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE + : HIPSPARSE_OPERATION_TRANSPOSE; + } else { + if (conjugate) { + DCHECK(status != nullptr); + *status = errors::InvalidArgument( + "Conjugate == True and transpose == False is not supported."); + } + return HIPSPARSE_OPERATION_NON_TRANSPOSE; + } +#endif +} + +#if GOOGLE_CUDA && (CUDA_VERSION >= 12000) + +template +struct ToGpuSparseIndexType; +template <> +struct ToGpuSparseIndexType { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; +template <> +struct ToGpuSparseIndexType { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I; +}; + +class GpuSparseSpGEMMDescr { + public: + GpuSparseSpGEMMDescr() : initialized_(false) {} + ~GpuSparseSpGEMMDescr() { + if (initialized_) { + cusparseSpGEMM_destroyDescr(descr_); + } + } + Status Initialize() { + if (initialized_) { + return errors::Internal("Double initializion of GpuSparseSpGEMMDescr."); + } + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseSpGEMM_createDescr(&descr_)); + initialized_ = true; + return OkStatus(); + } + cusparseSpGEMMDescr_t& get() { return descr_; } + + private: + bool initialized_; + cusparseSpGEMMDescr_t descr_; + + GpuSparseSpGEMMDescr(const GpuSparseSpGEMMDescr&) = delete; + void operator=(const GpuSparseSpGEMMDescr&) = delete; +}; + +class GpuSparseSpMatDescr { + public: + GpuSparseSpMatDescr() : initialized_(false) {} + ~GpuSparseSpMatDescr() { + if (initialized_) { + cusparseDestroySpMat(descr_); + } + } + template + Status InitializeCsr(int64_t rows, int64_t cols, int64_t nnz, + IndexType* csrRowOffsets, IndexType* csrColInd, + FloatType* csrValues) { + if (initialized_) { + return errors::Internal("Double initializion of gpusparseSpMatDescr."); + } + using stream_executor::cuda::AsCudaDataType; + using stream_executor::dnn::ToDataType; + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsr( + &descr_, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, + ToGpuSparseIndexType::value, + ToGpuSparseIndexType::value, CUSPARSE_INDEX_BASE_ZERO, + AsCudaDataType(ToDataType::value))); + initialized_ = true; + return OkStatus(); + } + gpusparseSpMatDescr_t& get() { return descr_; } + + private: + bool initialized_; + cusparseSpMatDescr_t descr_; + GpuSparseSpMatDescr(const GpuSparseSpMatDescr&) = delete; + void operator=(const GpuSparseSpMatDescr&) = delete; +}; + +class GpuSparseConstSpMatDescr { + public: + GpuSparseConstSpMatDescr() : initialized_(false) {} + ~GpuSparseConstSpMatDescr() { + if (initialized_) { + cusparseDestroySpMat(descr_); + } + } + template + Status InitializeCsr(int64_t rows, int64_t cols, int64_t nnz, + const IndexType* csrRowOffsets, + const IndexType* csrColInd, const FloatType* csrValues) { + if (initialized_) { + return errors::Internal("Double initializion of gpusparseSpMatDescr."); + } + using stream_executor::cuda::AsCudaDataType; + using stream_executor::dnn::ToDataType; + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateConstCsr( + &descr_, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, + ToGpuSparseIndexType::value, + ToGpuSparseIndexType::value, CUSPARSE_INDEX_BASE_ZERO, + AsCudaDataType(ToDataType::value))); + initialized_ = true; + return OkStatus(); + } + cusparseConstSpMatDescr_t& get() { return descr_; } + + private: + bool initialized_; + cusparseConstSpMatDescr_t descr_; + GpuSparseConstSpMatDescr(const GpuSparseConstSpMatDescr&) = delete; + void operator=(const GpuSparseConstSpMatDescr&) = delete; +}; + +#endif + +// The GpuSparse class provides a simplified templated API for cuSparse +// 
(http://docs.nvidia.com/cuda/cusparse/index.html). +// An object of this class wraps static cuSparse instances, +// and will launch Cuda kernels on the stream wrapped by the GPU device +// in the OpKernelContext provided to the constructor. +// +// Notice: All the computational member functions are asynchronous and simply +// launch one or more Cuda kernels on the Cuda stream wrapped by the GpuSparse +// object. + +class GpuSparse { + public: + // This object stores a pointer to context, which must outlive it. + explicit GpuSparse(OpKernelContext* context); + virtual ~GpuSparse() {} + + // This initializes the GpuSparse class if it hasn't + // been initialized yet. All following public methods require the + // class has been initialized. Can be run multiple times; all + // subsequent calls after the first have no effect. + Status Initialize(); // Move to constructor? + + // ==================================================================== + // Wrappers for cuSparse start here. + // + + // Solves tridiagonal system of equations. + // See: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2 + template + Status Gtsv2(int m, int n, const Scalar* dl, const Scalar* d, + const Scalar* du, Scalar* B, int ldb, void* pBuffer) const; + + // Computes the size of a temporary buffer used by Gtsv2. + // See: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2_bufferSize + template + Status Gtsv2BufferSizeExt(int m, int n, const Scalar* dl, const Scalar* d, + const Scalar* du, const Scalar* B, int ldb, + size_t* bufferSizeInBytes) const; + + // Solves tridiagonal system of equations without partial pivoting. + // See: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2_nopivot + template + Status Gtsv2NoPivot(int m, int n, const Scalar* dl, const Scalar* d, + const Scalar* du, Scalar* B, int ldb, + void* pBuffer) const; + + // Computes the size of a temporary buffer used by Gtsv2NoPivot. + // See: + // https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2_nopivot_bufferSize + template + Status Gtsv2NoPivotBufferSizeExt(int m, int n, const Scalar* dl, + const Scalar* d, const Scalar* du, + const Scalar* B, int ldb, + size_t* bufferSizeInBytes) const; + + // Solves a batch of tridiagonal systems of equations. Doesn't support + // multiple right-hand sides per each system. Doesn't do pivoting. + // See: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2stridedbatch + template + Status Gtsv2StridedBatch(int m, const Scalar* dl, const Scalar* d, + const Scalar* du, Scalar* x, int batchCount, + int batchStride, void* pBuffer) const; + + // Computes the size of a temporary buffer used by Gtsv2StridedBatch. + // See: + // https://docs.nvidia.com/cuda/cusparse/index.html#gtsv2stridedbatch_bufferSize + template + Status Gtsv2StridedBatchBufferSizeExt(int m, const Scalar* dl, + const Scalar* d, const Scalar* du, + const Scalar* x, int batchCount, + int batchStride, + size_t* bufferSizeInBytes) const; + + // Compresses the indices of rows or columns. It can be interpreted as a + // conversion from COO to CSR sparse storage format. See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csr2coo. + Status Csr2coo(const int* CsrRowPtr, int nnz, int m, int* cooRowInd) const; + + // Uncompresses the indices of rows or columns. It can be interpreted as a + // conversion from CSR to COO sparse storage format. See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-coo2csr. 
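// A minimal usage sketch for the conversion wrapper declared above. The
// OpKernelContext and the device-resident buffers are hypothetical, and the
// sketch assumes a TensorFlow GPU build in which this header and the usual
// status macros are available.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
inline absl::Status Csr2cooSketch(tensorflow::OpKernelContext* ctx,
                                  const int* csr_row_ptr,  // device, m + 1 entries
                                  int* coo_row_ind,        // device, nnz entries
                                  int nnz, int m) {
  tensorflow::GpuSparse gpu_sparse(ctx);
  TF_RETURN_IF_ERROR(gpu_sparse.Initialize());
  // Convert between the compressed row-pointer form and per-nonzero row
  // indices; the call is asynchronous on the context's GPU stream.
  TF_RETURN_IF_ERROR(gpu_sparse.Csr2coo(csr_row_ptr, nnz, m, coo_row_ind));
  return absl::OkStatus();
}
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// The companion conversion wrapper described just above, Coo2csr, is declared
// next: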
+ Status Coo2csr(const int* cooRowInd, int nnz, int m, int* csrRowPtr) const; + +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || \ + (TENSORFLOW_USE_ROCM && TF_ROCM_VERSION < 40200) + // Sparse-dense matrix multiplication C = alpha * op(A) * op(B) + beta * C, + // where A is a sparse matrix in CSR format, B and C are dense tall + // matrices. This routine allows transposition of matrix B, which + // may improve performance. See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2 + // + // **NOTE** Matrices B and C are expected to be in column-major + // order; to make them consistent with TensorFlow they + // must be transposed (or the matmul op's pre/post-processing must take this + // into account). + // + // **NOTE** This is an in-place operation for data in C. + template + Status Csrmm(gpusparseOperation_t transA, gpusparseOperation_t transB, int m, + int n, int k, int nnz, const Scalar* alpha_host, + const gpusparseMatDescr_t descrA, const Scalar* csrSortedValA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const Scalar* B, int ldb, const Scalar* beta_host, Scalar* C, + int ldc) const; +#else // CUDA_VERSION >=10200 || TF_ROCM_VERSION >= 40200 + // Workspace size query for sparse-dense matrix multiplication. Helper + // function for SpMM which computes y = alpha * op(A) * op(B) + beta * C, + // where A is a sparse matrix in CSR format, B and C are dense matricies in + // column-major format. Returns needed workspace size in bytes. + template + Status SpMMBufferSize(gpusparseOperation_t transA, + gpusparseOperation_t transB, const Scalar* alpha, + const gpusparseSpMatDescr_t matA, + const gpusparseDnMatDescr_t matB, const Scalar* beta, + gpusparseDnMatDescr_t matC, gpusparseSpMMAlg_t alg, + size_t* bufferSize) const; + + // Sparse-dense matrix multiplication y = alpha * op(A) * op(B) + beta * C, + // where A is a sparse matrix in CSR format, B and C are dense matricies in + // column-major format. Buffer is assumed to be at least as large as the + // workspace size returned by SpMMBufferSize(). + // + // **NOTE** This is an in-place operation for data in C. + template + Status SpMM(gpusparseOperation_t transA, gpusparseOperation_t transB, + const Scalar* alpha, const gpusparseSpMatDescr_t matA, + const gpusparseDnMatDescr_t matB, const Scalar* beta, + gpusparseDnMatDescr_t matC, gpusparseSpMMAlg_t alg, + int8* buffer) const; +#endif + + // Sparse-dense vector multiplication y = alpha * op(A) * x + beta * y, + // where A is a sparse matrix in CSR format, x and y are dense vectors. See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv_mergepath + // + // **NOTE** This is an in-place operation for data in y. +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM + template + Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz, + const Scalar* alpha_host, const gpusparseMatDescr_t descrA, + const Scalar* csrSortedValA, const int* csrSortedRowPtrA, + const int* csrSortedColIndA, const Scalar* x, + const Scalar* beta_host, Scalar* y) const; +#else + template + Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz, + const Scalar* alpha_host, const Scalar* csrSortedValA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const Scalar* x, const Scalar* beta_host, Scalar* y) const; +#endif // CUDA_VERSION < 10020 + + // Computes workspace size for sparse - sparse matrix addition of matrices + // stored in CSR format. 
+ template + Status CsrgeamBufferSizeExt( + int m, int n, const Scalar* alpha, const gpusparseMatDescr_t descrA, + int nnzA, const Scalar* csrSortedValA, const int* csrSortedRowPtrA, + const int* csrSortedColIndA, const Scalar* beta, + const gpusparseMatDescr_t descrB, int nnzB, const Scalar* csrSortedValB, + const int* csrSortedRowPtrB, const int* csrSortedColIndB, + const gpusparseMatDescr_t descrC, Scalar* csrSortedValC, + int* csrSortedRowPtrC, int* csrSortedColIndC, size_t* bufferSize); + + // Computes sparse-sparse matrix addition of matrices + // stored in CSR format. This is part one: calculate nnz of the + // output. csrSortedRowPtrC must be preallocated on device with + // m + 1 entries. See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgeam. + Status CsrgeamNnz(int m, int n, const gpusparseMatDescr_t descrA, int nnzA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, + const int* csrSortedRowPtrB, const int* csrSortedColIndB, + const gpusparseMatDescr_t descrC, int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, void* workspace); + + // Computes sparse - sparse matrix addition of matrices + // stored in CSR format. This is part two: perform sparse-sparse + // addition. csrValC and csrColIndC must be allocated on the device + // with nnzTotalDevHostPtr entries (as calculated by CsrgeamNnz). See: + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgeam. + template + Status Csrgeam(int m, int n, const Scalar* alpha, + const gpusparseMatDescr_t descrA, int nnzA, + const Scalar* csrSortedValA, const int* csrSortedRowPtrA, + const int* csrSortedColIndA, const Scalar* beta, + const gpusparseMatDescr_t descrB, int nnzB, + const Scalar* csrSortedValB, const int* csrSortedRowPtrB, + const int* csrSortedColIndB, const gpusparseMatDescr_t descrC, + Scalar* csrSortedValC, int* csrSortedRowPtrC, + int* csrSortedColIndC, void* workspace); + + // Computes sparse-sparse matrix multiplication of matrices + // stored in CSR format. +#if TENSORFLOW_USE_ROCM + // Part one: calculate nnz of the output. + // csrSortedRowPtrC must be preallocated on device with m + 1 entries. + Status CsrgemmNnz(gpusparseOperation_t transA, gpusparseOperation_t transB, + int m, int k, int n, const gpusparseMatDescr_t descrA, + int nnzA, const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, + const int* csrSortedRowPtrB, const int* csrSortedColIndB, + const gpusparseMatDescr_t descrC, int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr); + // Part two: perform sparse-sparse matmul. + // csrValC and csrColIndC must be allocated on the device with + // nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). + template + Status Csrgemm(gpusparseOperation_t transA, gpusparseOperation_t transB, + int m, int k, int n, const gpusparseMatDescr_t descrA, + int nnzA, const Scalar* csrSortedValA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, + const Scalar* csrSortedValB, const int* csrSortedRowPtrB, + const int* csrSortedColIndB, const gpusparseMatDescr_t descrC, + Scalar* csrSortedValC, int* csrSortedRowPtrC, + int* csrSortedColIndC); +#elif CUDA_VERSION < 12000 + // Part zero: calculate required workspace size. 
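// Taken together, the Csrgeam* entry points above follow the usual staged
// cuSPARSE protocol for computing C = alpha * A + beta * B in CSR form:
//   1. CsrgeamBufferSizeExt(...)  -> required scratch-workspace size in bytes;
//      allocate a temporary buffer of that size.
//   2. CsrgeamNnz(...)            -> number of nonzeros of C plus its row
//      pointer (csrSortedRowPtrC, preallocated with m + 1 entries); then
//      allocate csrSortedValC / csrSortedColIndC with that many entries.
//   3. Csrgeam(...)               -> fill in C's values and column indices,
//      reusing the same workspace.
// The sparse-sparse matmul (Csrgemm) declarations that follow are staged the
// same way, starting with the workspace-size query below: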
+ template + Status CsrgemmBufferSize( + int m, int n, int k, const gpusparseMatDescr_t descrA, int nnzA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, const int* csrSortedRowPtrB, + const int* csrSortedColIndB, csrgemm2Info_t info, size_t* workspaceBytes); + // Part one: calculate nnz of the output. + // csrSortedRowPtrC must be preallocated on device with m + 1 entries. + Status CsrgemmNnz(int m, int n, int k, const gpusparseMatDescr_t descrA, + int nnzA, const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, + const int* csrSortedRowPtrB, const int* csrSortedColIndB, + const gpusparseMatDescr_t descrC, int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, csrgemm2Info_t info, + void* workspace); + // Part two: perform sparse-sparse matmul. + // csrValC and csrColIndC must be allocated on the device with + // nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). + template + Status Csrgemm(int m, int n, int k, const gpusparseMatDescr_t descrA, + int nnzA, const Scalar* csrSortedValA, + const int* csrSortedRowPtrA, const int* csrSortedColIndA, + const gpusparseMatDescr_t descrB, int nnzB, + const Scalar* csrSortedValB, const int* csrSortedRowPtrB, + const int* csrSortedColIndB, const gpusparseMatDescr_t descrC, + Scalar* csrSortedValC, int* csrSortedRowPtrC, + int* csrSortedColIndC, const csrgemm2Info_t info, + void* workspace); +#else // CUDA_VERSION >= 12000 + template + Status SpGEMM_workEstimation(GpuSparseConstSpMatDescr& matA, + GpuSparseConstSpMatDescr& matB, + GpuSparseSpMatDescr& matC, + GpuSparseSpGEMMDescr& spgemmDescr, + size_t* bufferSize1, void* externalBuffer1); + template + Status SpGEMM_compute(GpuSparseConstSpMatDescr& matA, + GpuSparseConstSpMatDescr& matB, + GpuSparseSpMatDescr& matC, + GpuSparseSpGEMMDescr& spgemmDescr, size_t* bufferSize2, + void* externalBuffer2); + template + Status SpGEMM_copy(GpuSparseConstSpMatDescr& matA, + GpuSparseConstSpMatDescr& matB, GpuSparseSpMatDescr& matC, + GpuSparseSpGEMMDescr& spgemmDescr); +#endif + + // In-place reordering of unsorted CSR to sorted CSR. + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csru2csr + template + Status Csru2csr(int m, int n, int nnz, const gpusparseMatDescr_t descrA, + Scalar* csrVal, const int* csrRowPtr, int* csrColInd); + + // Converts from CSR to CSC format (equivalently, transpose). + // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-csr2cscEx + template + Status Csr2csc(int m, int n, int nnz, const Scalar* csrVal, + const int* csrRowPtr, const int* csrColInd, Scalar* cscVal, + int* cscRowInd, int* cscColPtr, + const gpusparseAction_t copyValues); + + private: + bool initialized_; + OpKernelContext* context_; // not owned. + gpuStream_t gpu_stream_; + gpusparseHandle_t* gpusparse_handle_; // not owned. + + GpuSparse(const GpuSparse&) = delete; + void operator=(const GpuSparse&) = delete; +}; + +// A wrapper class to ensure that a CUDA sparse matrix descriptor is initialized +// only once. 
For more details on the descriptor (gpusparseMatDescr_t), see: +// https://docs.nvidia.com/cuda/cusparse/index.html#cusparsematdescrt +class GpuSparseMatrixDescriptor { + public: + explicit GpuSparseMatrixDescriptor() : initialized_(false) {} + + GpuSparseMatrixDescriptor(GpuSparseMatrixDescriptor&& rhs) + : initialized_(rhs.initialized_), descr_(std::move(rhs.descr_)) { + rhs.initialized_ = false; + } + + GpuSparseMatrixDescriptor& operator=(GpuSparseMatrixDescriptor&& rhs) { + if (this == &rhs) return *this; + Release(); + initialized_ = rhs.initialized_; + descr_ = std::move(rhs.descr_); + rhs.initialized_ = false; + return *this; + } + + ~GpuSparseMatrixDescriptor() { Release(); } + + // Initializes the underlying descriptor. Will fail on the second call if + // called more than once. + Status Initialize() { + DCHECK(!initialized_); +#if GOOGLE_CUDA + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateMatDescr(&descr_)); +#elif TENSORFLOW_USE_ROCM + TF_RETURN_IF_GPUSPARSE_ERROR(se::wrap::hipsparseCreateMatDescr(&descr_)); +#endif + initialized_ = true; + return OkStatus(); + } + + gpusparseMatDescr_t& descr() { + DCHECK(initialized_); + return descr_; + } + + const gpusparseMatDescr_t& descr() const { + DCHECK(initialized_); + return descr_; + } + + private: + void Release() { + if (initialized_) { +#if GOOGLE_CUDA + cusparseDestroyMatDescr(descr_); +#elif TENSORFLOW_USE_ROCM + se::wrap::hipsparseDestroyMatDescr(descr_); +#endif + initialized_ = false; + } + } + + bool initialized_; + gpusparseMatDescr_t descr_; + + GpuSparseMatrixDescriptor(const GpuSparseMatrixDescriptor&) = delete; + void operator=(const GpuSparseMatrixDescriptor&) = delete; +}; + +#if GOOGLE_CUDA + +// A wrapper class to ensure that an unsorted/sorted CSR conversion information +// struct (csru2csrInfo_t) is initialized only once. See: +// https://docs.nvidia.com/cuda/cusparse/index.html#csru2csr +class GpuSparseCsrSortingConversionInfo { + public: + explicit GpuSparseCsrSortingConversionInfo() : initialized_(false) {} + + GpuSparseCsrSortingConversionInfo(GpuSparseCsrSortingConversionInfo&& rhs) + : initialized_(rhs.initialized_), info_(std::move(rhs.info_)) { + rhs.initialized_ = false; + } + + GpuSparseCsrSortingConversionInfo& operator=( + GpuSparseCsrSortingConversionInfo&& rhs) { + if (this == &rhs) return *this; + Release(); + initialized_ = rhs.initialized_; + info_ = std::move(rhs.info_); + rhs.initialized_ = false; + return *this; + } + + ~GpuSparseCsrSortingConversionInfo() { Release(); } + + // Initializes the underlying info. Will fail on the second call if called + // more than once. 
+ Status Initialize() { + DCHECK(!initialized_); + TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsru2csrInfo(&info_)); + initialized_ = true; + return OkStatus(); + } + + csru2csrInfo_t& info() { + DCHECK(initialized_); + return info_; + } + + const csru2csrInfo_t& info() const { + DCHECK(initialized_); + return info_; + } + + private: + void Release() { + if (initialized_) { + cusparseDestroyCsru2csrInfo(info_); + initialized_ = false; + } + } + + bool initialized_; + csru2csrInfo_t info_; + + GpuSparseCsrSortingConversionInfo(const GpuSparseCsrSortingConversionInfo&) = + delete; + void operator=(const GpuSparseCsrSortingConversionInfo&) = delete; +}; + +#endif // GOOGLE_CUDA + +} // namespace tensorflow + +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#endif // TENSORFLOW_CORE_UTIL_CUDA_SPARSE_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/debug_data_dumper.h b/third_party/tflite-hdrs/tensorflow/core/util/debug_data_dumper.h new file mode 100644 index 00000000..44eee52c --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/debug_data_dumper.h @@ -0,0 +1,138 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_DEBUG_DATA_DUMPER_H_ +#define TENSORFLOW_CORE_UTIL_DEBUG_DATA_DUMPER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/platform/mutex.h" + +#define DEBUG_DATA_DUMPER() ::tensorflow::DebugDataDumper::Global() + +inline constexpr const char* kDebugGroupMain = "main"; +inline constexpr const char* kDebugGroupOpStacktrace = "op_stacktrace"; +inline constexpr const char* kDebugGroupGraphOptPass = "graph_opt_pass"; +inline constexpr const char* kDebugGroupBridgePhase1Clustering = + "bridge_phase1_clustering"; +inline constexpr const char* kDebugGroupRuntimeLowering = "runtime_lowering"; +inline constexpr const char* kDebugGroupBridgePhase1ExecutorExport = + "bridge_phase1_executor_export"; +inline constexpr const char* kDebugGroupBridgePhase2 = "bridge_phase2"; +inline constexpr const char* kDebugGroupDTensorMlir = "dtensor_mlir"; +inline constexpr const char* kDebugGroupDTensorGraph = "dtensor_graph"; +inline constexpr const char* kDebugGroupDTensorLayout = "dtensor_layout"; + +namespace tensorflow { + +class FunctionLibraryDefinition; +class Graph; + +//////////////////////////////////////////////////////////////////////////////// +// This class is responsible for dumping debugging data (e.g., GraphDef, MLIR). +// +// To dump GraphDef/MLIRs, take the following steps: +// * Set envvar TF_DUMP_GRAPH_PREFIX to your target dump directory. +// * Set envvar TF_DUMP_GRAPH_NAME_FILTER to '*' to dump all graphs, +// or a name filter to dump graphs with a name containing it. +// * Set envvar TF_DUMP_GRAPH_GROUPS to your dump groups (comma-separated). +// +// The dumped graphs then can be found in your target dump directory. 
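// For example, to dump only graphs whose name contains "train_step", for the
// "main" group (the values here are illustrative):
//
//   TF_DUMP_GRAPH_PREFIX=/tmp/tf_graph_dumps
//   TF_DUMP_GRAPH_NAME_FILTER=train_step
//   TF_DUMP_GRAPH_GROUPS=main
//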
+// The filename of the dump looks like this: +// ... +// +// This is what each field means: +// * : The name of your dump. +// * : The order of dumps of a specific name. +// Lower orders are executed before higher orders. +// * : The group of your dump, e.g., main. +// * : The tag of your dump, e.g., your pass name. +// +// Example dump files are: +// __inference_train_step_441.0.main.before_pre_placement_passes.pbtxt +// __inference_train_step_441.1.main.before_placer.pbtxt +// __inference_train_step_441.2.main.before_post_placement_passes.pbtxt +// __inference_train_step_441.3.main.before_graph_optimization.pbtxt +// __inference_train_step_441.4.main.after_graph_optimization.pbtxt +// __inference_train_step_441.5.main.before_post_rewrite_for_exec_passes.pbtxt +//////////////////////////////////////////////////////////////////////////////// +class DebugDataDumper { + public: + // Get the singleton instance. + static DebugDataDumper* Global(); + + // Initialize the debug data dumper. + void LoadEnvvars(); + + // Check if we should dump debug data. + // We should dump debug data only if the followings are true: + // 1. Envvar TF_DUMP_GRAPH_PREFIX is set to your target dump directory. + // 2. This condition is true if one of the followings is true. + // 2.1. TF_DUMP_GRAPH_NAME_FILTER is set to '*' + // 2.2. TF_DUMP_GRAPH_NAME_FILTER is set to a name filter + // which is a substr of name. + // 3. The group is defined in TF_DUMP_GRAPH_GROUPS. + bool ShouldDump(const std::string& name, const std::string& group) const; + + // Dump op creation callstacks, if ShouldDump returns true. + void DumpOpCreationStackTraces(const std::string& name, + const std::string& group, + const std::string& tag, const Graph* graph); + + // Dump a graph, if ShouldDump returns true. + void DumpGraph(const std::string& name, const std::string& group, + const std::string& tag, const Graph* graph, + const FunctionLibraryDefinition* func_lib_def, + bool bypass_filter = false); + + // Get the dump file basename. Dump file basenames are in this format: + // ... + // + // What each field means is explained on the class level comment. + std::string GetDumpFilename(const std::string& name, const std::string& group, + const std::string& tag); + + private: + DebugDataDumper(); + + // Get next dump id for a name. + int GetNextDumpId(const std::string& name) { + // Use a lock to make sure this is thread safe. + const mutex_lock lock(lock_); + return dump_order_ids_[name]++; + } + + // A dict to maintain the mapping from dump name to its current dump id. + absl::flat_hash_map dump_order_ids_; + + // A mutex to make sure this is thread safe. + tensorflow::mutex lock_; + + // The name filter. + std::optional name_filter_; + + // The groups filter. + std::set groups_filter_; + + // A flag indicating whether to dump wrapped graphs. + bool dump_wrapped_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_DEBUG_DATA_DUMPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/debug_events_writer.h b/third_party/tflite-hdrs/tensorflow/core/util/debug_events_writer.h new file mode 100644 index 00000000..7b104279 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/debug_events_writer.h @@ -0,0 +1,277 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ +#define TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/debug_event.pb.h" + +namespace tensorflow { +namespace tfdbg { + +// The set of files generated by a debugged TensorFlow program. +enum DebugEventFileType { + METADATA, + SOURCE_FILES, + STACK_FRAMES, + GRAPHS, + EXECUTION, + GRAPH_EXECUTION_TRACES, +}; + +// Helper class for DebugEventsWriter. +// This class manages the writing of data to a single TFRecord file. +// Each object of the DebugEventsWriter class below involves multiple +// TFRecord files, and hence utilizes multiple objects of this helper class. +class SingleDebugEventFileWriter { + public: + explicit SingleDebugEventFileWriter(const string& file_path); + + absl::Status Init(); + + void WriteSerializedDebugEvent(absl::string_view debug_event_str); + + absl::Status Flush(); + absl::Status Close(); + + const string FileName(); + + private: + Env* env_; + const string file_path_; + std::atomic_int_fast32_t num_outstanding_events_; + + std::unique_ptr writable_file_; + std::unique_ptr record_writer_ TF_PT_GUARDED_BY(writer_mu_); + mutex writer_mu_; +}; + +// The DebugEvents writer class. +class DebugEventsWriter { + public: +#ifndef SWIG + // Prefix of version string present in the first entry of every event file. + // Default size of each circular buffer (unit: number of DebugEvent protos). + static constexpr const int64_t kDefaultCyclicBufferSize = 1000; + + static constexpr const char* kFileNamePrefix = "tfdbg_events"; + static constexpr const char* kMetadataSuffix = "metadata"; + static constexpr const char* kSourceFilesSuffix = "source_files"; + static constexpr const char* kStackFramesSuffix = "stack_frames"; + static constexpr const char* kGraphsSuffix = "graphs"; + static constexpr const char* kExecutionSuffix = "execution"; + static constexpr const char* kGraphExecutionTracesSuffix = + "graph_execution_traces"; + + static constexpr const char* kVersionPrefix = "debug.Event:"; + static constexpr const int kCurrentFormatVersion = 1; +#endif + + // Get the DebugEventsWriter for the given dump_root. + // For a given dump_root value, it is a singleton. tfdbg event files come in + // sets of six. The singleton pattern avoids storing multiple sets in a single + // folder, which might cause confusion. + // + // If an instance of DebugEventsWriter has already been created at a + // `dump_root`, calling this method with the same `dump_root` will return + // the existing instance. + // + // Args: + // dump_root: Dump root directory. If it doesn't exist, will be created. + // tfdbg_run_id: Debugging run ID of the writer. 
+ // circular_buffer_size: Circular buffer size (in number of DebugEvent + // protos). If set to a value <=0, will abolish the circular-buffer + // behavior. + // Returns: + // A pointer to a DebugEventsWriter object: a per-dump_root singleton. + static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root, + const string& tfdbg_run_id, + int64_t circular_buffer_size); + // Look up existing events writer by dump_root. + // If no DebugEventsWriter has been created at the dump_root, a non-OK + // Status will be returned. Else an OK status will be returned, with + // the pointer to the existing instance provided by reference. + static absl::Status LookUpDebugEventsWriter( + const string& dump_root, DebugEventsWriter** debug_events_writer); + ~DebugEventsWriter(); + + // Sets the debug event filenames and opens file for writing. + // All files (see the DebugEventFileType enum) share the same prefix and + // differ only in their suffixes. If not called by user, will be invoked + // automatically by a call to FileName() or any of the Write*() methods(). + // Idempotent: if the metadata file exists and is open, this is a no-op. + // If on the other hand the file was opened, but has since disappeared (e.g. + // deleted by another process), this will open a new file. + absl::Status Init(); + + // The four DebugEvent fields below are written _without_ the circular + // buffer. Source file contents are written to the *.source_files file. + // Takes ownership of source_file. + absl::Status WriteSourceFile(SourceFile* source_file); + // Stack frames are written to the *.code_locations file. + // Takes ownership of stack_frame_with_id. + absl::Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); + // Graph op creation events are written to the *.graphs file. + // Takes ownership of graph_op_creation. + absl::Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation); + // Debugged graphs are written to the *.graphs file. + // Takes ownership of debugged_graph. + absl::Status WriteDebuggedGraph(DebuggedGraph* debugged_graph); + + // The two DebugEvent fields below are written to the circular buffer + // and saved to disk only at the FlushExecutionFiles() call. + // Execution events (eager execution of an op or a tf.function) are written + // to the *.execution file. Takes ownership of execution. + absl::Status WriteExecution(Execution* execution); + // Graph execution traces (graph-internal tensor values or their summaries) + // are written to the *.graph_execution_traces file. + // Takes ownership of graph_execution_trace. + absl::Status WriteGraphExecutionTrace( + GraphExecutionTrace* graph_execution_trace); + + // Write a graph execution trace without using a protocol buffer. + // Instead, pass the raw values related to the graph execution trace. + // Args: + // tfdbg_context_id: A unique ID for the context of interest, e.g., a + // concreted compiled tf.function that the op of interest belongs to. + // op_name: Name of the op that this graph execution trace is concerned + // with. Applicable only to the single-tensor trace case. For cases in + // which the trace concerns multiple tensors, this is an empty string. + // output_slot: Output slot index of the op that this trace is concerned + // with. + // tensor_debug_mode: An integer that represents the tensor-debug mode + // enum. tensor_value: The value of the tensor that describes the + // tensor(s) + // that this trace is concerned with. 
The semantics of this tensor value + // depends on the value of `tensor_debug_mode`. + absl::Status WriteGraphExecutionTrace(const string& tfdbg_context_id, + const string& device_name, + const string& op_name, + int32_t output_slot, + int32_t tensor_debug_mode, + const Tensor& tensor_value); + + // Writes a serialized DebugEvent to one of the debug-events files + // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES + // and GRAPHS files. + // NOTE: Actually used in the Python binding, to avoid overhead of + // serializing and parsing protos at the language interface. + void WriteSerializedNonExecutionDebugEvent(const string& debug_event_str, + DebugEventFileType type); + + // Writes a serialized DebugEvent to one of the debug-events files + // concerned with the execution-related events: the EXECUTION and + // GRAPH_EXECUTION_TRACES files. This involves the cyclic-buffer behavior if + // circular_buffer_size is configured to be >0. + // NOTE: Actually used in the Python binding, to avoid overhead of + // serializing and parsing protos at the language interface. + void WriteSerializedExecutionDebugEvent(const string& debug_event_str, + DebugEventFileType type); + + // Given name of the device, retrieve a unique integer ID. As a side effect, + // if this is the first time this object encounters the device name, + // writes a DebuggedDevice proto to the .graphs file in the file set. + int RegisterDeviceAndGetId(const string& device_name); + + // EventWriter automatically flushes and closes on destruction, but + // this method is provided for users who want to write to disk sooner + // and/or check for success. + // FlushNonExecutionFiles() pushes outstanding DebugEvents not written + // events to the circular buffer to their respective files. + absl::Status FlushNonExecutionFiles(); + + // Writes current contents of the circular buffers to their respective + // debug event files and clears the circular buffers. + absl::Status FlushExecutionFiles(); + + // Close() calls FlushNonExecutionFiles() and FlushExecutionFiles() + // and then closes the current debug events files. + absl::Status Close(); + + private: + static std::unordered_map>* + + // Get a static map from dump-root path to DebugEventsWriter objects. + // This helps the per-dump-root singletone pattern. + GetDebugEventsWriterMap(); + + // Guards calls to the GetDebugEventsWriter() method. + static mutex factory_mu_; + + DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id, + int64_t circular_buffer_size); + + // Get the path prefix. The same for all files, which differ only in the + // suffix. + string FileName(DebugEventFileType type); + + // Initialize the TFRecord writer for non-metadata file type. 
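// A minimal call-sequence sketch for the public API above; the dump root and
// run id are hypothetical, and the Execution proto is left unpopulated here.
inline absl::Status WriteOneExecutionEventSketch() {
  namespace tfdbg = tensorflow::tfdbg;
  // Per-dump_root singleton; keep a circular buffer of 1000 DebugEvent protos.
  tfdbg::DebugEventsWriter* writer =
      tfdbg::DebugEventsWriter::GetDebugEventsWriter(
          "/tmp/tfdbg_dump", /*tfdbg_run_id=*/"run_0",
          /*circular_buffer_size=*/1000);
  TF_RETURN_IF_ERROR(writer->Init());
  // WriteExecution takes ownership of the proto.
  auto* execution = new tensorflow::Execution();  // populate fields as needed
  TF_RETURN_IF_ERROR(writer->WriteExecution(execution));
  // Execution events stay in the circular buffer until explicitly flushed.
  TF_RETURN_IF_ERROR(writer->FlushExecutionFiles());
  return writer->Close();
}
// The remaining private declarations below are the implementation helpers
// behind this flow.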
+ absl::Status InitNonMetadataFile(DebugEventFileType type); + + absl::Status SerializeAndWriteDebugEvent(DebugEvent* debug_event, + DebugEventFileType type); + + void SelectWriter(DebugEventFileType type, + std::unique_ptr** writer); + const string GetSuffix(DebugEventFileType type); + string GetFileNameInternal(DebugEventFileType type); + + Env* env_; + const string dump_root_; + const string tfdbg_run_id_; + + string file_prefix_; + bool is_initialized_ TF_GUARDED_BY(initialization_mu_); + mutex initialization_mu_; + + const int64_t circular_buffer_size_; + std::deque execution_buffer_ TF_GUARDED_BY(execution_buffer_mu_); + mutex execution_buffer_mu_; + std::deque graph_execution_trace_buffer_ + TF_GUARDED_BY(graph_execution_trace_buffer_mu_); + mutex graph_execution_trace_buffer_mu_; + + absl::flat_hash_map device_name_to_id_ TF_GUARDED_BY(device_mu_); + mutex device_mu_; + + std::unique_ptr metadata_writer_; + std::unique_ptr source_files_writer_; + std::unique_ptr stack_frames_writer_; + std::unique_ptr graphs_writer_; + std::unique_ptr execution_writer_; + std::unique_ptr graph_execution_traces_writer_; + + DebugEventsWriter(const DebugEventsWriter&) = delete; + void operator=(const DebugEventsWriter&) = delete; + + friend class DebugEventsWriterTest; +}; + +} // namespace tfdbg +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/determinism.h b/third_party/tflite-hdrs/tensorflow/core/util/determinism.h new file mode 100644 index 00000000..136534ea --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/determinism.h @@ -0,0 +1,29 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_DETERMINISM_H_ +#define TENSORFLOW_CORE_UTIL_DETERMINISM_H_ + +#include "xla/tsl/util/determinism.h" + +namespace tensorflow { + +using tsl::EnableOpDeterminism; +using tsl::OpDeterminismRequired; +using tsl::OpOrderDeterminismRequired; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_DETERMINISM_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/device_name_utils.h b/third_party/tflite-hdrs/tensorflow/core/util/device_name_utils.h new file mode 100644 index 00000000..28b5b0f1 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/device_name_utils.h @@ -0,0 +1,27 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_ +#define TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_ + +#include "xla/tsl/util/device_name_utils.h" + +namespace tensorflow { +// NOLINTBEGIN(misc-unused-using-decls) +using tsl::DeviceNameUtils; +// NOLINTEND(misc-unused-using-decls) +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/dump_graph.h b/third_party/tflite-hdrs/tensorflow/core/util/dump_graph.h new file mode 100644 index 00000000..0d0c5575 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/dump_graph.h @@ -0,0 +1,88 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for +// debugging. + +#ifndef TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_ +#define TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Dumps 'graph_def' to a file, as a GraphDef text or binary proto. Returns the +// file name chosen. The format is determined by the TF_DUMP_GRAPH_FMT +// environment variable (TXT or BIN). +// +// If the TF_DUMP_GRAPH_PREFIX environment variable is "-", then instead the +// GraphDef will be logged (using the LOG() macro). +// +// Automatically picks a file name. Prefixes 'name' with the value of the +// TF_DUMP_GRAPH_PREFIX environment variable if 'dirname' is empty, and suffixes +// 'name' with '.pbtxt' or '.pb'. If a graph has already been dumped by +// this process with the same name, suffixes with "_n.pb(txt)", where 'n' is a +// sequence number. +string DumpGraphDefToFile(const string& name, GraphDef const& graph_def, + const string& dirname = ""); + +// Similar to DumpGraphDefToFile, use CostGraphDef instead of GraphDef. +string DumpCostGraphDefToFile(const string& name, CostGraphDef const& graph_def, + const string& dirname = ""); + +// Similar to DumpGraphDefToFile, but builds the GraphDef to dump from a 'graph' +// and an optional function library 'flib_def'. Returns the file name chosen. +string DumpGraphToFile(const string& name, Graph const& graph, + const FunctionLibraryDefinition* flib_def = nullptr, + const string& dirname = ""); + +// Similar to DumpGraphDefToFile, but dumps a function as a FunctionDef text +// proto. Returns the file name chosen. +string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef, + const string& dirname = ""); + +// Similar to DumpGraphDefToFile, but dumps a proto of any type. 
Returns the +// file name chosen. +string DumpProtoToFile(const string& name, + tensorflow::protobuf::Message const& proto, + const string& dirname = ""); + +// Sets a custom Graph dumper. If set, this dumper will be used to dump graphs +// instead via DumpGraphToFile. As the custom dumper may not produce protobufs, +// allow specifying a file suffix/extension too. +void SetGraphDumper( + std::function + dumper, + string suffix = ".pbtxt"); + +// Dump data to a file. +// This function will create a WritableFile and pass it to the dumper. +// The dumper callback will be responsible for writing data to the file. +string DumpToFile(const string& name, const string& dirname, + const string& suffix, absl::string_view type_name, + std::function dumper); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/einsum_op_util.h b/third_party/tflite-hdrs/tensorflow/core/util/einsum_op_util.h new file mode 100644 index 00000000..6155b8a0 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/einsum_op_util.h @@ -0,0 +1,72 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_UTIL_EINSUM_OP_UTIL_H_ +#define TENSORFLOW_CORE_UTIL_EINSUM_OP_UTIL_H_ + +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace tensorflow { + +using Labels = absl::InlinedVector; +using OperandLabels = absl::InlinedVector; +using LabelCounts = absl::InlinedVector; +using OperandLabelCounts = absl::InlinedVector; + +// Dummy axis label used to denote an ellipsis in an input or output subscript. +constexpr int kEllipsisLabel = -1; + +// Each dimension is categorized into exactly one of five types based on +// whether its corresponding label is present in the input and/or the output +// subscripts. +enum EinsumDimensionType { + // Batch dimensions are those present in two inputs as well as the output. + // They are part of the batch dimensions during Tensor contraction. Such + // dimensions may be broadcasting dimensions (those mapping to ellipsis) + // or explicit batch dimensions corresponding to named axis labels. + kBroadcasting = 0, + kBatch = 1, + // Free dimensions are present in exactly one of the inputs, and also the + // output. These are non-contracted axes in the Tensor contraction. + kFree = 2, + // Contract dimensions are present in two inputs, but not the output. These + // dimensions are contracted in Tensor contraction. + kContract = 3, + // Reduce dimensions are present in exactly one input; and not in the output + // and are summed over prior to Tensor contraction. + kReduce = 4, +}; + +// Parses and validates an einsum equation in explicit form. 
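// Concretely, given the categories above:
//   "bij,bjk->bik"       : 'b' is kBatch, 'i' and 'k' are kFree,
//                          'j' is kContract.
//   "...ij,...jk->...ik" : the axes covered by "..." are kBroadcasting.
//   "ab,bc->c"           : 'a' is kReduce (summed out before the
//                          contraction), 'b' is kContract, 'c' is kFree.
// The helpers below parse and validate an equation into exactly this
// labeling: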
+absl::Status ValidateEinsumEquation( + const string& equation, absl::InlinedVector* input_subscripts, + string* output_subscript); + +// Parses and validates the equation and the input shapes. Single character +// labels are integerized and we populate input and output label subscripts +// and corresponding counts. Also create the mapping from (named) labels to +// their EinsumDimensionType. +absl::Status ParseEinsumEquation( + const string& equation, OperandLabels* input_labels, Labels* output_labels, + std::vector* label_types, + OperandLabelCounts* input_label_counts, LabelCounts* output_label_counts, + absl::InlinedVector* input_has_ellipsis, + bool* output_has_ellipsis); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_EINSUM_OP_UTIL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/env_var.h b/third_party/tflite-hdrs/tensorflow/core/util/env_var.h new file mode 100644 index 00000000..faad6153 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/env_var.h @@ -0,0 +1,34 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_ENV_VAR_H_ +#define TENSORFLOW_CORE_UTIL_ENV_VAR_H_ + +#include "xla/tsl/util/env_var.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +using tsl::ReadBoolFromEnvVar; +using tsl::ReadFloatFromEnvVar; +using tsl::ReadInt64FromEnvVar; +using tsl::ReadStringFromEnvVar; +using tsl::ReadStringsFromEnvVar; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_ENV_VAR_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/equal_graph_def.h b/third_party/tflite-hdrs/tensorflow/core/util/equal_graph_def.h new file mode 100644 index 00000000..9803b2db --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/equal_graph_def.h @@ -0,0 +1,100 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_EQUAL_GRAPH_DEF_H_ +#define TENSORFLOW_CORE_UTIL_EQUAL_GRAPH_DEF_H_ + +#include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class GraphDef; +class NodeDef; + +struct EqualGraphDefOptions { + // Should internal attributes (attribute names that start with '_') be + // ignored? + bool ignore_internal_attrs = true; +}; + +// Determines if actual and expected are equal, ignoring versions and ordering +// of nodes, attrs, and control inputs. If the GraphDefs are different and +// diff != nullptr, *diff is set to an explanation of the difference. Note that +// we use node names to match up nodes between the graphs, and so the naming of +// nodes must be consistent. +bool EqualGraphDef(const GraphDef& actual, const GraphDef& expected, + string* diff, const EqualGraphDefOptions& options = {}); + +// Returns a hash of `gdef` that is consistent with EqualGraphDef. In other +// words, if two graph defs compare equal according to EqualGraphDef, +// GraphDefHash will return the same value for both of them when called +// with the same `options` that was used in the call to EqualGraphDef. +// Similarly to protobuf deterministic serialization, hash value is +// guaranteed to be stable only for a given binary. In particular, one should +// probably not persist the returned value. +uint64 GraphDefHash(const GraphDef& gdef, + const EqualGraphDefOptions& options = {}); + +// Determines if actual and expected are equal, ignoring: ordering of +// attrs, internal attributes (if set in `options`), and control inputs. +// +// If the NodeDefs are different and +// diff != nullptr, *diff is set to an explanation of the difference. +bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff, + const EqualGraphDefOptions& options = {}); + +// Returns a hash of `ndef` that is consistent with EqualNodeDef. In other +// words, if two node defs compare equal according to EqualNodeDef, NodeDefHash +// will return the same value for both of them when called with the same +// `options` that was used in the call to EqualNodeDef. +// Similarly to protobuf deterministic serialization, hash value is +// guaranteed to be stable only for a given binary. In particular, one should +// probably not persist the returned value. +uint64 NodeDefHash(const NodeDef& ndef, + const EqualGraphDefOptions& options = {}); + +// Determines if actual and expected are equal, ignoring ordering. If they're +// different and diff != nullptr, *diff is set to an explanation of the +// difference. +bool EqualRepeatedNodeDef(const protobuf::RepeatedPtrField& actual, + const protobuf::RepeatedPtrField& expected, + string* diff, + const EqualGraphDefOptions& options = {}); + +// Returns a hash of `ndefs` that is consistent with EqualRepeatedNodeDef. +// In other words, if two ndefs compare equal according to +// EqualRepeatedNodeDef, RepeatedNodeDefHash will return the same value for +// both of them when called with the same `options` that was used in +// the call to EqualRepeatedNodeDef. +// Similarly to protobuf deterministic serialization, hash value is +// guaranteed to be stable only for a given binary. In particular, one should +// probably not persist the returned value. 
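// A test-style sketch of how the equality helpers and hashes above are
// typically used together (graph construction elided):
//
//   GraphDef expected = ...;
//   GraphDef actual = ...;
//   string diff;
//   EXPECT_TRUE(EqualGraphDef(actual, expected, &diff)) << diff;
//   EXPECT_EQ(GraphDefHash(expected), GraphDefHash(actual));
//
// The TF_EXPECT_GRAPH_EQ macro defined at the end of this header wraps the
// EqualGraphDef check for use in tests.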
+uint64 RepeatedNodeDefHash(const protobuf::RepeatedPtrField& ndefs, + const EqualGraphDefOptions& options = {}); + +#define TF_EXPECT_GRAPH_EQ(expected, actual) \ + do { \ + string diff; \ + EXPECT_TRUE(EqualGraphDef(actual, expected, &diff)) \ + << diff << "\nExpected:\n" \ + << SummarizeGraphDef(expected) << "\nActual:\n" \ + << SummarizeGraphDef(actual); \ + } while (false) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_EQUAL_GRAPH_DEF_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/events_writer.h b/third_party/tflite-hdrs/tensorflow/core/util/events_writer.h new file mode 100644 index 00000000..a06eac7d --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/events_writer.h @@ -0,0 +1,103 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_ +#define TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_ + +#include +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { + +class EventsWriter { + public: +#ifndef SWIG + // Prefix of version string present in the first entry of every event file. + static constexpr const char* kVersionPrefix = "brain.Event:"; + static constexpr const int kCurrentVersion = 2; + static constexpr const char* kWriterSourceMetadata = + "tensorflow.core.util.events_writer"; +#endif + + // Events files typically have a name of the form + // '/some/file/path/my.file.out.events.[timestamp].[hostname][suffix]' + // To create and EventWriter, the user should provide file_prefix = + // '/some/file/path/my.file' + // The EventsWriter will append '.out.events.[timestamp].[hostname][suffix]' + // to the ultimate filename once Init() is called. + // Note that it is not recommended to simultaneously have two + // EventWriters writing to the same file_prefix. + explicit EventsWriter(const std::string& file_prefix); + ~EventsWriter(); + + // Sets the event file filename and opens file for writing. If not called by + // user, will be invoked automatically by a call to FileName() or Write*(). + // Returns false if the file could not be opened. Idempotent: if file exists + // and is open this is a no-op. If on the other hand the file was opened, + // but has since disappeared (e.g. deleted by another process), this will open + // a new file with a new timestamp in its filename. + absl::Status Init(); + absl::Status InitWithSuffix(const std::string& suffix); + + // Returns the filename for the current events file: + // filename_ = [file_prefix_].out.events.[timestamp].[hostname][suffix] + std::string FileName(); + + // Append "event" to the file. The "tensorflow::" part is for swig happiness. 
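// A minimal write path, as a sketch (the file prefix and event fields are
// illustrative):
//
//   EventsWriter writer("/tmp/logs/my.file");
//   Event event;
//   event.set_wall_time(Env::Default()->NowMicros() / 1.0e6);
//   event.set_step(42);
//   writer.WriteEvent(event);
//   TF_CHECK_OK(writer.Flush());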
+ void WriteEvent(const tensorflow::Event& event); + + // Append "event_str", a serialized Event, to the file. + // Note that this function does NOT check that de-serializing event_str + // results in a valid Event proto. The tensorflow:: bit makes SWIG happy. + void WriteSerializedEvent(absl::string_view event_str); + + // EventWriter automatically flushes and closes on destruction, but + // these two methods are provided for users who want to write to disk sooner + // and/or check for success. + // Flush() pushes outstanding events to disk. Returns false if the + // events file could not be created, or if the file exists but could not + // be written too. + // Close() calls Flush() and then closes the current events file. + // Returns true only if both the flush and the closure were successful. + absl::Status Flush(); + absl::Status Close(); + + private: + absl::Status FileStillExists(); // OK if event_file_path_ exists. + absl::Status InitIfNeeded(); + + Env* env_; + const std::string file_prefix_; + std::string file_suffix_; + std::string filename_; + std::unique_ptr recordio_file_; + std::unique_ptr recordio_writer_; + int num_outstanding_events_; +#ifndef SWIG + EventsWriter(const EventsWriter&) = delete; + void operator=(const EventsWriter&) = delete; +#endif +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/example_proto_fast_parsing.h b/third_party/tflite-hdrs/tensorflow/core/util/example_proto_fast_parsing.h new file mode 100644 index 00000000..6ba6d89a --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/example_proto_fast_parsing.h @@ -0,0 +1,172 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ +#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +namespace tensorflow { +namespace example { + +// FastParseExampleConfig defines how to parse features in Example. +// Each sub-config is responsible for one feature identified with feature_name. +// FastParseExampleConfig can't have two sub-configs with the same feature_name. +// dtype identifies the type of output vector and the kind of Feature expected +// in Example. 
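// For example (a sketch; feature names, shapes and defaults are illustrative),
// a config with one fixed-length float feature "age" of shape [1] and one
// variable-length string feature "tags" could be built as:
//
//   FastParseExampleConfig config;
//   config.dense.emplace_back("age", DT_FLOAT, PartialTensorShape({1}),
//                             Tensor(),  // empty default => feature required
//                             /*variable_length=*/false,
//                             /*elements_per_stride=*/1);
//   config.sparse.emplace_back("tags", DT_STRING);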
+struct FastParseExampleConfig { + struct Dense { + Dense(absl::string_view feature_name, DataType dtype, + PartialTensorShape shape, Tensor default_value, bool variable_length, + std::size_t elements_per_stride) + : feature_name(feature_name), // TODO(mrry): Switch to preallocated + // tstring when this is available. + dtype(dtype), + shape(std::move(shape)), + default_value(std::move(default_value)), + variable_length(variable_length), + elements_per_stride(elements_per_stride) {} + Dense() = default; + + tstring feature_name; + DataType dtype; + // These 2 fields correspond exactly to dense_shapes and dense_defaults in + // ParseExample op. + // Documentation is available in: tensorflow/core/ops/parsing_ops.cc + PartialTensorShape shape; + Tensor default_value; + bool variable_length; + std::size_t elements_per_stride; + }; + + struct Sparse { + Sparse(absl::string_view feature_name, DataType dtype) + : feature_name(feature_name), // TODO(mrry): Switch to preallocated + // tstring when this is available. + dtype(dtype) {} + Sparse() = default; + + tstring feature_name; + DataType dtype; + }; + + struct Ragged { + Ragged(absl::string_view feature_name, DataType dtype, + DataType splits_dtype) + : feature_name(feature_name), // TODO(mrry): Switch to preallocated + // tstring when this is available. + dtype(dtype), + splits_dtype(splits_dtype) {} + Ragged() = default; + + tstring feature_name; + DataType dtype; + DataType splits_dtype; + }; + + std::vector dense; + std::vector sparse; + std::vector ragged; + + // If `true`, `Result::feature_stats` will contain one + // `PerExampleFeatureStats` for each serialized example in the input. + bool collect_feature_stats = false; +}; + +// Statistics about the features in each example passed to +// `FastParse[Single]Example()`. +// +// TODO(b/111553342): The gathered statistics currently have two limitations: +// * Feature names that appear more than once will be counted multiple times. +// * The feature values count only represents the counts for features that were +// requested in the `FastParseExampleConfig`. +// These could be addressed with additional work at runtime. +struct PerExampleFeatureStats { + // The number of feature names in an example. + size_t features_count = 0; + + // The sum of the number of values in each feature that is parsed. + size_t feature_values_count = 0; +}; + +// This is exactly the output of TF's ParseExample Op. +// Documentation is available in: tensorflow/core/ops/parsing_ops.cc +struct Result { + std::vector sparse_indices; + std::vector sparse_values; + std::vector sparse_shapes; + std::vector dense_values; + std::vector ragged_values; + std::vector ragged_splits; + std::vector ragged_outer_splits; // For SequenceExamples + + // This vector will be populated with one element per example if + // `FastParseExampleConfig::collect_feature_stats` is set to `true`. + std::vector feature_stats; +}; + +// Parses a batch of serialized Example protos and converts them into result +// according to given config. +// Given example names have to either be empty or the same size as serialized. +// example_names are used only for error messages. +absl::Status FastParseExample(const FastParseExampleConfig& config, + absl::Span serialized, + absl::Span example_names, + thread::ThreadPool* thread_pool, Result* result); + +// TODO(mrry): Move the hash table construction into the config object. 
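// Illustrative call (a sketch; the inputs are placeholders):
//
//   std::vector<tstring> serialized = ...;  // serialized Example protos
//   thread::ThreadPool* pool = ...;         // worker pool used for parsing
//   Result result;
//   TF_RETURN_IF_ERROR(FastParseExample(config, serialized,
//                                       /*example_names=*/{}, pool, &result));
//   // Outputs in `result` follow the order of config.dense / config.sparse /
//   // config.ragged.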
+typedef FastParseExampleConfig FastParseSingleExampleConfig; + +absl::Status FastParseSingleExample(const FastParseSingleExampleConfig& config, + absl::string_view serialized, + Result* result); + +// Parses a batch of serialized SequenceExample protos and converts them into +// result according to given config. +// Given example names have to either be empty or the same size as serialized. +// example_names are used only for error messages. +// (If batch=true, then this parses a single SequenceExample.) +absl::Status FastParseSequenceExample( + const example::FastParseExampleConfig& context_config, + const example::FastParseExampleConfig& sequence_config, + absl::Span serialized, + absl::Span example_names, thread::ThreadPool* thread_pool, + example::Result* context_result, example::Result* sequence_result, + std::vector* dense_feature_lengths, bool is_batch = true); + +// This function parses serialized Example and populates given example. +// It uses the same specialized parser as FastParseExample which is efficient. +// But then constructs Example which is relatively slow. +// It is exported here as a convenient API to test parser part separately. +bool TestFastParse(const string& serialized, Example* example); + +} // namespace example +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/example_proto_helper.h b/third_party/tflite-hdrs/tensorflow/core/util/example_proto_helper.h new file mode 100644 index 00000000..801aae37 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/example_proto_helper.h @@ -0,0 +1,369 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_ +#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_ + +#include +#include +#include + +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +// This is a set of helper methods that will make it possible to share +// tensorflow::Example proto Tensor conversion code inside the ExampleParserOp +// OpKernel as well as in external code. +namespace tensorflow { + +// "Dense" feature configuration. +struct FixedLenFeature { + string key; + DataType dtype; + TensorShape shape; + Tensor default_value; + string values_output_tensor_name; +}; + +// "Sparse" feature configuration. 
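// For example (a sketch; the key and output tensor names are illustrative), a
// variable-length string feature emitted as a SparseTensor could be
// configured as:
//
//   VarLenFeature tags;
//   tags.key = "tags";
//   tags.dtype = DT_STRING;
//   tags.values_output_tensor_name = "tags_values";
//   tags.indices_output_tensor_name = "tags_indices";
//   tags.shapes_output_tensor_name = "tags_shape";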
+struct VarLenFeature { + string key; + DataType dtype; + string values_output_tensor_name; + string indices_output_tensor_name; + string shapes_output_tensor_name; +}; + +// Given a single tensorflow::Example, with an optional example name +// at a particular index within a batch, and dense and sparse feature +// configurations from fixed_len_features, var_len_features, this method +// updates the dense value tensor and the sparse values temporary vector +// of tensors. The indexing of the output vectors correspond 1:1 to the +// indexing of the feature configuration vectors. +// +// The fixed_len_features and var_len_features maps are assume to be +// have disjoint key fields from the Feature map in the tensorflow.Example +// proto. +// +// For each sparse feature, the sparse values temporary vector holds a +// tensor for each Example. Each tensor is either empty or filled, depending +// on if the sparse feature value is set for the Example. This +// temporary structure is needed because we need to know the total number +// of filled elements in the batch to get the proper final sparse tensor +// shapes allocated. After the entire batch is processed, +// GetSparseTensorShape can be used to calculate the final shapes and +// CopyIntoSparseTensor can be used to copy from the temporary vector +// into the final allocated tensors. +absl::Status SingleExampleProtoToTensors( + const Example& example, const string& name, int batch_index, + const std::vector& fixed_len_features, + const std::vector& var_len_features, + std::vector* output_dense_values_tensor, + std::vector>* output_sparse_values_tmp); + +// The shape of the indices and values tensors associated with a SparseTensor +// are dependent on the contents of the batch. +struct VarLenFeatureBatchShapes { + TensorShape indices_shape; + TensorShape values_shape; + int max_num_features; +}; + +// Get the shape of the sparse values and indices tensors for the batch, +// given how many of the tensors in the temporary sparse values vector +// are actually filled. +absl::Status GetSparseTensorShapes(const VarLenFeature& var_len_feature, + const std::vector& sparse_values_tmp, + int batch_size, + VarLenFeatureBatchShapes* output_shapes); + +// A method to convert a batch of tensorflow::Example protos into output +// tensors. This method is useful if there already is a batch of deserialized +// Example protos in memory (such as a serving use-case) and we do not wish +// to incur an extraneous serialize/deserialize. It is intended +// as an outside of OpKernel compatible replacement for the functionality of +// ExampleParserOp. In a serving setting, this method could be used to produce +// a feed_dict of Tensors that could bypass the ExampleParserOp. +// +// Note that unlike SingleExampleProtoToTensors, output tensors are +// allocated using a provided Allocator within this method. +absl::Status BatchExampleProtoToTensors( + const std::vector& examples, + const std::vector& names, + const std::vector& fixed_len_features, + const std::vector& var_len_features, Allocator* allocator, + std::vector* output_dense_values_tensor, + std::vector* output_sparse_indices_tensor, + std::vector* output_sparse_values_tensor, + std::vector* output_sparse_shapes_tensor); + +// Check that the given dtype is one that is compatible with +// tensorflow::Example protocol buffer feature values. +absl::Status CheckValidType(const DataType& dtype); + +// Check that the provided Feature proto message's oneof value +// matches that of the provided dtype. 
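// For example, a Feature whose oneof holds a float_list matches DT_FLOAT, an
// int64_list matches DT_INT64, and a bytes_list matches DT_STRING; any other
// combination reports no match via *match.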
+absl::Status CheckTypesMatch(const Feature& feature, const DataType& dtype, + bool* match); + +// For a single Example, copy a dense feature value into an output +// dense value tensor Out at the provided out_index offset. +absl::Status FeatureDenseCopy(std::size_t out_index, const string& name, + const string& key, const DataType& dtype, + const TensorShape& shape, const Feature& feature, + Tensor* out); + +// Copy the value a provided Tensor into an output dense_value tensor Out +// at the provided out_index offset. +void RowDenseCopy(const std::size_t& out_index, const DataType& dtype, + const Tensor& in, Tensor* out); + +// For a single Example, and given sparse feature return a temporary output +// Tensor suitable for being collected in the temporary sparse value vector. +Tensor FeatureSparseCopy(std::size_t batch, const string& key, + const DataType& dtype, const Feature& feature); + +// Copy a temporary Tensor into the final sparse indices and values +// tensor at a given batch index and element offset. This method +// assumes that the indices/values Tensors have been properly allocated +// for the batch. +int64_t CopyIntoSparseTensor(const Tensor& in, int batch, int64_t offset, + Tensor* indices, Tensor* values); + +// Check that each dense_shape has known rank and inner dimensions; and +// update variable_length (whether the outer dimension is None) and +// elements_per_stride for each denes_shape. +absl::Status GetDenseShapes(const std::vector& dense_shapes, + std::vector* variable_length, + std::vector* elements_per_stride); + +// Parses the attributes passed to ParseExample. +// REQUIRES: Init must be called after construction. +struct ParseExampleAttrs { + public: + template + absl::Status Init(ContextType* ctx, int op_version = 1) { + TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_types", &sparse_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tdense", &dense_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("dense_shapes", &dense_shapes)); + TF_RETURN_IF_ERROR( + GetDenseShapes(dense_shapes, &variable_length, &elements_per_stride)); + switch (op_version) { + case 1: + TF_RETURN_IF_ERROR(ctx->GetAttr("Nsparse", &num_sparse)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ndense", &num_dense)); + break; + case 2: + TF_RETURN_IF_ERROR( + ctx->GetAttr("ragged_value_types", &ragged_value_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("num_sparse", &num_sparse)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("ragged_split_types", &ragged_split_types)); + break; + default: + return errors::InvalidArgument("Unexpected op_version", op_version); + } + return FinishInit(op_version); + } + + int64_t num_sparse; + int64_t num_dense; + int64_t num_ragged; + std::vector sparse_types; + std::vector dense_types; + std::vector ragged_value_types; + std::vector ragged_split_types; + std::vector dense_shapes; + std::vector variable_length; + std::vector elements_per_stride; + + private: + absl::Status FinishInit( + int op_version); // for context-independent parts of Init. +}; + +// Parses the attributes passed to ParseSingleExample. +// REQUIRES: Init must be called after construction. 
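// Typical use, as a sketch, inside the constructor of a parsing kernel (the
// kernel class name here is illustrative):
//
//   explicit MyParseSingleExampleOp(OpKernelConstruction* ctx)
//       : OpKernel(ctx) {
//     OP_REQUIRES_OK(ctx, attrs_.Init(ctx));
//   }
//   ...
//   ParseSingleExampleAttrs attrs_;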
+struct ParseSingleExampleAttrs { + public: + template + absl::Status Init(ContextType* ctx) { + TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_keys", &sparse_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_types", &sparse_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("dense_keys", &dense_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tdense", &dense_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("dense_shapes", &dense_shapes)); + + int num_sparse; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_sparse", &num_sparse)); + if (num_sparse != sparse_keys.size() || num_sparse != sparse_types.size()) { + return errors::InvalidArgument( + "num_sparse (", num_sparse, ") must match the size of sparse_keys (", + sparse_keys.size(), ") and sparse_types (", sparse_types.size(), ")"); + } + + TF_RETURN_IF_ERROR( + GetDenseShapes(dense_shapes, &variable_length, &elements_per_stride)); + return FinishInit(); + } + + std::vector sparse_keys; + std::vector sparse_types; + std::vector dense_keys; + std::vector dense_types; + std::vector dense_shapes; + std::vector variable_length; + std::vector elements_per_stride; + + private: + absl::Status FinishInit(); // for context-independent parts of Init. +}; + +// Parses the attributes passed to ParseSequenceExample. +// REQUIRES: Init must be called after construction. +struct ParseSequenceExampleAttrs { + public: + template + absl::Status Init(ContextType* ctx, int op_version = 1) { + switch (op_version) { + case 1: { + std::vector missing_empty_vector; + TF_RETURN_IF_ERROR(ctx->GetAttr( + "feature_list_dense_missing_assumed_empty", &missing_empty_vector)); + for (const string& feature : missing_empty_vector) { + feature_list_dense_missing_assumed_empty.insert(feature); + } + } + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_sparse_keys", &context_sparse_keys)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_dense_keys", &context_dense_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("feature_list_sparse_keys", + &feature_list_sparse_keys)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_keys", &feature_list_dense_keys)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_dense", &num_context_dense)); + break; + case 2: + TF_RETURN_IF_ERROR(ctx->GetAttr("context_ragged_value_types", + &context_ragged_value_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("context_ragged_split_types", + &context_ragged_split_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("feature_list_ragged_value_types", + &feature_list_ragged_value_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("feature_list_ragged_split_types", + &feature_list_ragged_split_types)); + break; + default: + return errors::InvalidArgument("Unexpected op_version", op_version); + } + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_sparse_types", &context_sparse_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_dense", &num_feature_list_dense)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_sparse", &num_context_sparse)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tcontext_dense", &context_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_sparse_types", &feature_list_sparse_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_types", &feature_list_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_sparse", &num_feature_list_sparse)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_dense_shapes", &context_dense_shapes)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_shapes", &feature_list_dense_shapes)); + return FinishInit(op_version); + } + + std::unordered_set feature_list_dense_missing_assumed_empty; + int64_t 
num_context_sparse; + int64_t num_context_dense; + int64_t num_context_ragged; + int64_t num_feature_list_sparse; + int64_t num_feature_list_dense; + int64_t num_feature_list_ragged; + std::vector context_sparse_keys; + std::vector context_dense_keys; + std::vector feature_list_sparse_keys; + std::vector feature_list_dense_keys; + std::vector context_sparse_types; + std::vector context_dense_types; + std::vector context_dense_shapes; + std::vector feature_list_sparse_types; + std::vector feature_list_dense_types; + std::vector feature_list_dense_shapes; + std::vector context_ragged_value_types; + std::vector context_ragged_split_types; + std::vector feature_list_ragged_value_types; + std::vector feature_list_ragged_split_types; + + private: + absl::Status FinishInit( + int op_version); // for context-independent parts of Init. +}; + +// Parses the attributes passed to ParseSingleSequenceExample. +// REQUIRES: Init must be called after construction. +struct ParseSingleSequenceExampleAttrs { + public: + template + absl::Status Init(ContextType* ctx) { + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_sparse_types", &context_sparse_types)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_dense", &num_context_dense)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_dense", &num_feature_list_dense)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Ncontext_sparse", &num_context_sparse)); + TF_RETURN_IF_ERROR(ctx->GetAttr("Tcontext_dense", &context_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_sparse_types", &feature_list_sparse_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_types", &feature_list_dense_types)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("Nfeature_list_sparse", &num_feature_list_sparse)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("context_dense_shapes", &context_dense_shapes)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("feature_list_dense_shapes", &feature_list_dense_shapes)); + return FinishInit(); + } + + int64_t num_context_sparse; + int64_t num_context_dense; + int64_t num_feature_list_sparse; + int64_t num_feature_list_dense; + std::vector context_sparse_types; + std::vector context_dense_types; + std::vector context_dense_shapes; + std::vector feature_list_sparse_types; + std::vector feature_list_dense_types; + std::vector feature_list_dense_shapes; + + private: + absl::Status FinishInit(); // for context-independent parts of Init. +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/exec_on_stall.h b/third_party/tflite-hdrs/tensorflow/core/util/exec_on_stall.h new file mode 100644 index 00000000..d4a6c552 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/exec_on_stall.h @@ -0,0 +1,89 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_ +#define TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_ + +#include + +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// An object that executes a particular function only if it +// is not deleted within the allotted number of seconds. +// +// This can be useful in diagnosing deadlocks, stalls and memory leaks +// without logging too aggressively. +class ExecuteOnStall { + public: + // delay_secs: If the object still exists after this many seconds, + // execute f. + // f: The function to be executed, for example a detailed log of the + // the state of an object to which this is attached. + // poll_microseconds: The spawned thread will wake and test whether + // the destructor has been invoked this frequently. + ExecuteOnStall(int delay_secs, std::function f, + int32_t poll_microseconds = 100) + : disabled_(false), + joined_(false), + env_(Env::Default()), + f_(f), + poll_microseconds_(poll_microseconds) { + deadline_ = env_->NowMicros() + 1000000 * delay_secs; + env_->SchedClosure([this]() { + while (env_->NowMicros() < deadline_) { + { + mutex_lock l(mu_); + if (disabled_) { + break; + } + } + env_->SleepForMicroseconds(poll_microseconds_); + } + { + mutex_lock l(mu_); + if (!disabled_) { + f_(); + } + joined_ = true; + cond_var_.notify_all(); + } + }); + } + + ~ExecuteOnStall() { + // Wait for spawned thread to terminate. + mutex_lock l(mu_); + disabled_ = true; + if (!joined_) { + cond_var_.wait(l); + } + } + + private: + mutex mu_; + condition_variable cond_var_; + bool disabled_ TF_GUARDED_BY(mu_); + bool joined_ TF_GUARDED_BY(mu_); + Env* env_; + std::function f_; + int64_t deadline_; + int32 poll_microseconds_; +}; + +} // namespace tensorflow +#endif // TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/fake_clock_env.h b/third_party/tflite-hdrs/tensorflow/core/util/fake_clock_env.h new file mode 100644 index 00000000..2ded1708 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/fake_clock_env.h @@ -0,0 +1,58 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_FAKE_CLOCK_ENV_H_ +#define TENSORFLOW_CORE_UTIL_FAKE_CLOCK_ENV_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// An Env implementation with a fake clock for NowMicros(). +// The clock doesn't advance on its own. It advances +// via an explicit AdvanceByMicroseconds() method. 
All other Env virtual methods +// pass through to a wrapped Env. +class FakeClockEnv : public EnvWrapper { + public: + explicit FakeClockEnv(Env* wrapped); + ~FakeClockEnv() override = default; + + // Advance the clock by a certain number of microseconds. + void AdvanceByMicroseconds(int64_t micros); + + // Returns the current time of FakeClockEnv in microseconds. + uint64 NowMicros() const override; + + private: + mutable mutex mu_; + uint64 current_time_ TF_GUARDED_BY(mu_) = 0; + + FakeClockEnv(const FakeClockEnv&) = delete; + void operator=(const FakeClockEnv&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_FAKE_CLOCK_ENV_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/gpu_cuda_alias.h b/third_party/tflite-hdrs/tensorflow/core/util/gpu_cuda_alias.h new file mode 100644 index 00000000..0a15d15e --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/gpu_cuda_alias.h @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ +#define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ + +// Several forwarding macros are defined in this file to serve for backward +// compatibility usage as we migrating from CUDA prefixed function to GPU +// prefixed functions. Both Cuda and ROCm can unify under the new GPU prefix +// naming scheme. In the migration period, we provide equivalent CUDA* and GPU* +// function. Over time, all CUDA* functions will be deprecated. + +namespace tensorflow { + +// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its CUDA Alias. +#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \ + template \ + auto cuda_alias(Args&&... args) \ + ->decltype(func(std::forward(args)...)) { \ + return func(std::forward(args)...); \ + } +#else +#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) +#endif + +// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its CUDA +// Alias. +#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \ + template \ + __device__ auto cuda_alias(Args&&... args) \ + ->decltype(func(std::forward(args)...)) { \ + return func(std::forward(args)...); \ + } +#else +#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) +#endif + +// CREATE_CUDA_TYPE_ALIAS forward the type to its CUDA Alias. 
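// For example (illustrative names), given a host helper
//   absl::Status GpuDoThing(int n);
// the invocation
//   CREATE_CUDA_HOST_FUNCTION_ALIAS(GpuDoThing, CudaDoThing);
// generates a forwarding CudaDoThing(...) that simply calls GpuDoThing(...),
// and
//   CREATE_CUDA_TYPE_ALIAS(GpuLaunchConfig, CudaLaunchConfig);
// expands to "using CudaLaunchConfig = GpuLaunchConfig;". In ROCm builds all
// three macros expand to nothing, so only the Gpu-prefixed names exist.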
+#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type; +#else +#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) +#endif +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ diff --git a/third_party/tflite-hdrs/tensorflow/core/util/gpu_device_functions.h b/third_party/tflite-hdrs/tensorflow/core/util/gpu_device_functions.h new file mode 100644 index 00000000..bb9ff8c7 --- /dev/null +++ b/third_party/tflite-hdrs/tensorflow/core/util/gpu_device_functions.h @@ -0,0 +1,1002 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_ +#define TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_ + +/** + * Wrappers and helpers for CUDA device code. + * + * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide + * backwards compatibility, see go/volta-porting for details. + * Provides atomic operations on types that aren't natively supported. + * Defines a number of macros and types providing a shared interface + * to either CUDA or ROCm APIs, depending on the build. + */ + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" +#else +#include "rocm/include/hip/hip_complex.h" +#endif + +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_cuda_alias.h" + +#if GOOGLE_CUDA +using gpuStream_t = cudaStream_t; +using gpuEvent_t = cudaEvent_t; +#define gpuEventRecord cudaEventRecord +#define gpuEventSynchronize cudaEventSynchronize +#define gpuEventDestroy cudaEventDestroy +#define gpuEventCreate cudaEventCreate +#define gpuEventCreateWithFlags cudaEventCreateWithFlags +#define gpuEventDisableTiming cudaEventDisableTiming +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuFree cudaFree +#elif TENSORFLOW_USE_ROCM +using gpuStream_t = hipStream_t; +using gpuEvent_t = hipEvent_t; +using cudaError = int; +using cudaError_t = int; +#define cudaSuccess 0 +#define cudaGetLastError hipGetLastError +#define gpuEventRecord hipEventRecord +#define gpuEventDestroy hipEventDestroy +#define gpuEventSynchronize hipEventSynchronize +#define gpuEventCreate hipEventCreate +#define gpuEventCreateWithFlags hipEventCreateWithFlags +#define gpuEventDisableTiming hipEventDisableTiming +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuFree hipFree +static std::string cudaGetErrorString(int err) { return std::to_string(err); } +#endif + +#define TF_RETURN_IF_CUDA_ERROR(result) \ + do { \ + cudaError_t error(result); \ + if (!TF_PREDICT_TRUE(error == cudaSuccess)) { \ + return absl::InternalError( \ + absl::StrCat("Cuda call failed with ", cudaGetErrorString(error))); \ + } \ + } while (0) + +#define TF_OP_REQUIRES_CUDA_SUCCESS(context, result) \ + do { \ + cudaError_t error(result); \ + if 
(!TF_PREDICT_TRUE(error == cudaSuccess)) { \ + context->SetStatus(absl::InternalError( \ + absl::StrCat("Cuda call failed with", cudaGetErrorString(error)))); \ + return; \ + } \ + } while (0) + +namespace tensorflow { +// According to HIP developer guide at +// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#assert +// assert is not supported by HIP. While we are waiting for assert support in +// hip kernels, the assert call should be macroed to NOP so that it does not +// block us from creating a debug build +#if TENSORFLOW_USE_ROCM +#undef assert +#define assert(x) \ + {} +#endif + +namespace detail { + +// Helper for range-based for loop using 'delta' increments. +// Usage: see GpuGridRange?() functions below. +template +class GpuGridRange { + struct Iterator { + __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {} + __device__ T operator*() const { return index_; } + __device__ Iterator& operator++() { + index_ += delta_; + return *this; + } + __device__ bool operator!=(const Iterator& other) const { + bool greater = index_ > other.index_; + bool less = index_ < other.index_; + // Anything past an end iterator (delta_ == 0) is equal. + // In range-based for loops, this optimizes to 'return less'. + if (!other.delta_) { + return less; + } + if (!delta_) { + return greater; + } + return less || greater; + } + + private: + T index_; + const T delta_; + }; + + public: + __device__ GpuGridRange(T begin, T delta, T end) + : begin_(begin), delta_(delta), end_(end) {} + + __device__ Iterator begin() const { return Iterator{begin_, delta_}; } + __device__ Iterator end() const { return Iterator{end_, 0}; } + + private: + T begin_; + T delta_; + T end_; +}; + +#ifndef TENSORFLOW_USE_ROCM +template +using CudaGridRange = GpuGridRange; +#endif +} // namespace detail + +// Helper to visit indices in the range 0 <= i < count, using the x-coordinate +// of the global thread index. That is, each index i is visited by all threads +// with the same x-coordinate. +// Usage: for(int i : GpuGridRangeX(count)) { visit(i); } +template +__device__ detail::GpuGridRange GpuGridRangeX(T count) { + return detail::GpuGridRange( + /*begin=*/blockIdx.x * blockDim.x + threadIdx.x, + /*delta=*/gridDim.x * blockDim.x, /*end=*/count); +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeX, CudaGridRangeX); + +// Helper to visit indices in the range 0 <= i < count using the y-coordinate. +// Usage: for(int i : GpuGridRangeY(count)) { visit(i); } +template +__device__ detail::GpuGridRange GpuGridRangeY(T count) { + return detail::GpuGridRange( + /*begin=*/blockIdx.y * blockDim.y + threadIdx.y, + /*delta=*/gridDim.y * blockDim.y, /*end=*/count); +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeY, CudaGridRangeY); + +// Helper to visit indices in the range 0 <= i < count using the z-coordinate. +// Usage: for(int i : GpuGridRangeZ(count)) { visit(i); } +template +__device__ detail::GpuGridRange GpuGridRangeZ(T count) { + return detail::GpuGridRange( + /*begin=*/blockIdx.z * blockDim.z + threadIdx.z, + /*delta=*/gridDim.z * blockDim.z, /*end=*/count); +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuGridRangeZ, CudaGridRangeZ); + +// Mask for all 32 threads in a warp. +__device__ const unsigned kCudaWarpAll = 0xffffffff; +// ROCM TODO add ROCM implementation +// Mask for all 64 threads in a wavefront. 
+__device__ const unsigned kGpuWarpAll = 0xffffffff; + +// Returns the warp lane ID of the calling thread +__device__ inline unsigned GpuLaneId() { + unsigned int lane_id; +#if GOOGLE_CUDA +#if __clang__ + return __nvvm_read_ptx_sreg_laneid(); +#else // __clang__ + asm("mov.u32 %0, %%laneid;" : "=r"(lane_id)); +#endif // __clang__ +#elif TENSORFLOW_USE_ROCM + lane_id = __lane_id(); +#endif + return lane_id; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLaneId, CudaLaneId); + +namespace detail { +// Returns true if mask is a valid parameter for __shfl*sync to return a well +// defined value, assuming the calling lane will read from src_lane as part of +// the shuffle operation. +// +// Specifically, returns true iff mask has the calling lane bit and the src_lane +// bit set, and the src_lane calls this function with the same mask value +// (required for the two threads to wait for each other). +// +// On Volta, for some invalid masks, this function hangs or returns false +// positives, because the implementation shuffles with the same mask that +// we are validating. Run on Pascal if you suspect that the mask is incorrect. +__device__ inline bool GpuValidateShuffleSyncMask(unsigned mask, + unsigned src_lane) { + unsigned src_dst_mask = 1u << GpuLaneId() | 1u << src_lane; +#if GOOGLE_CUDA + unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane); +#else // TENSORFLOW_USE_ROCM + unsigned src_lane_mask = + __shfl(static_cast(mask), static_cast(src_lane)); +#endif + return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuValidateShuffleSyncMask, + CudaValidateShuffleSyncMask); + +// Returns the actual source lane for shuffle. +__device__ inline unsigned GpuShuffleGetSrcLane(int src_lane, int width) { + int lane_id = GpuLaneId(); + int lane_base = lane_id & ~width + 1; + int lane_offset = src_lane & width - 1; + return lane_base + lane_offset; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleGetSrcLane, CudaShuffleGetSrcLane); + +// Returns the source lane for shuffle up. +__device__ inline unsigned GpuShuffleUpGetSrcLane(unsigned delta, int width) { + unsigned lane_id = GpuLaneId(); + if ((lane_id & width - 1) < delta) { + return lane_id; + } + return lane_id - delta; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpGetSrcLane, + CudaShuffleUpGetSrcLane); + +// Returns the source lane for shuffle down. +__device__ inline unsigned GpuShuffleDownGetSrcLane(unsigned delta, int width) { + unsigned lane_id = GpuLaneId(); + if ((lane_id & width - 1) + delta >= width) { + return lane_id; + } + return lane_id + delta; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownGetSrcLane, + CudaShuffleDownGetSrcLane); + +// Returns the source lane for shuffle xor. +__device__ inline unsigned GpuShuffleXorGetSrcLane(int lane_mask, int width) { + int lane_id = GpuLaneId(); + int src_lane = lane_id ^ lane_mask; + if (src_lane > (lane_id | width - 1)) { + return lane_id; + } + return src_lane; +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorGetSrcLane, + CudaShuffleXorGetSrcLane); +} // namespace detail + +// For all *_sync wrappers below, it is illegal to synchronize threads from +// different program locations, because that is not supported before sm_70. +// In other words, all threads in 'mask' must call the functions in convergence. +// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly. +// +// It is also illegal to shuffle with a mask that produces an undefined result +// for any of the threads. 
Specifically, all source threads of the shuffle +// must have their corresponding bit in 'mask' set. + +// Wrapper for __syncwarp. No-op for CUDA 8 and earlier. +__device__ inline void GpuSyncWarp(unsigned mask = kCudaWarpAll) { + assert(mask & 1u << GpuLaneId()); +#if GOOGLE_CUDA + __syncwarp(mask); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuSyncWarp, CudaSyncWarp); + +// Wrapper for __ballot_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. +__device__ inline unsigned GpuBallotSync(unsigned mask, int pred) { + assert(mask & 1u << GpuLaneId()); +#if GOOGLE_CUDA + return __ballot_sync(mask, pred); +#else // TENSORFLOW_USE_ROCM + return __ballot(pred) & mask; // Apply mask to match __ballot_sync's spec. +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuBallotSync, CudaBallotSync); + +// Wrapper for __any_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. +__device__ inline int GpuAnySync(unsigned mask, int pred) { + assert(mask & 1u << GpuLaneId()); +#if GOOGLE_CUDA + return __any_sync(mask, pred); +#else // TENSORFLOW_USE_ROCM + return __any(pred); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAnySync, CudaAnySync); + +// Wrapper for __all_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. +__device__ inline int GpuAllSync(unsigned mask, int pred) { + assert(mask & 1u << GpuLaneId()); +#if GOOGLE_CUDA + return __all_sync(mask, pred); +#else // TENSORFLOW_USE_ROCM + return __all(pred); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAllSync, CudaAllSync); + +// Wrapper for __shfl_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. +template +__device__ T GpuShuffleSync(unsigned mask, T value, int src_lane, + int width = warpSize) { + assert(!(width & width - 1)); + assert(detail::GpuValidateShuffleSyncMask( + mask, detail::GpuShuffleGetSrcLane(src_lane, width))); +#if GOOGLE_CUDA + return __shfl_sync(mask, value, src_lane, width); +#else // TENSORFLOW_USE_ROCM + return __shfl(value, src_lane, width); +#endif +} + +// Variant of the (undocumented) version from the CUDA SDK, but using unsigned +// instead of float for lo and hi (which is incorrect with ftz, for example). +// See b/69446944. +__device__ inline double GpuShuffleSync(unsigned mask, double value, + int src_lane, int width = warpSize) { +#if GOOGLE_CUDA + auto tmp = __double_as_longlong(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = GpuShuffleSync(mask, hi, src_lane, width); + lo = GpuShuffleSync(mask, lo, src_lane, width); + return __longlong_as_double(static_cast(hi) << 32 | lo); +#elif TENSORFLOW_USE_ROCM + auto tmp = static_cast(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = __shfl(static_cast(hi), src_lane, width); + lo = __shfl(static_cast(lo), src_lane, width); + return static_cast(static_cast(hi) << 32 | + static_cast(lo)); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleSync, CudaShuffleSync); + +// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. 
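// For example (a sketch, assuming a fully populated warp), an inclusive
// prefix sum across the warp:
//
//   float v = ...;                         // one value per lane
//   for (int d = 1; d < warpSize; d *= 2) {
//     float up = GpuShuffleUpSync(kCudaWarpAll, v, d);
//     if (GpuLaneId() >= d) v += up;
//   }
//   // Lane i now holds the sum of the values of lanes 0..i.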
+template +__device__ inline T GpuShuffleUpSync(unsigned mask, T value, unsigned delta, + int width = warpSize) { + assert(!(width & width - 1)); + assert(detail::GpuValidateShuffleSyncMask( + mask, detail::GpuShuffleUpGetSrcLane(delta, width))); +#if GOOGLE_CUDA + return __shfl_up_sync(mask, value, delta, width); +#else // TENSORFLOW_USE_ROCM + return __shfl_up(value, delta, width); +#endif +} + +// Variant of the (undocumented) version from the CUDA SDK, but using unsigned +// instead of float for lo and hi (which is incorrect with ftz, for example). +// See b/69446944. +__device__ inline double GpuShuffleUpSync(unsigned mask, double value, + unsigned delta, + int width = warpSize) { +#if GOOGLE_CUDA + auto tmp = __double_as_longlong(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = GpuShuffleUpSync(mask, hi, delta, width); + lo = GpuShuffleUpSync(mask, lo, delta, width); + return __longlong_as_double(static_cast(hi) << 32 | lo); +#elif TENSORFLOW_USE_ROCM + auto tmp = static_cast(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = __shfl_up(static_cast(hi), delta, width); + lo = __shfl_up(static_cast(lo), delta, width); + return static_cast(static_cast(hi) << 32 | + static_cast(lo)); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleUpSync, CudaShuffleUpSync); + +// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function +// in convergence, see comment above for details. +template +__device__ inline T GpuShuffleDownSync(unsigned mask, T value, unsigned delta, + int width = warpSize) { + assert(!(width & width - 1)); + assert(detail::GpuValidateShuffleSyncMask( + mask, detail::GpuShuffleDownGetSrcLane(delta, width))); +#if GOOGLE_CUDA + return __shfl_down_sync(mask, value, delta, width); +#else // TENSORFLOW_USE_ROCM + return __shfl_down(value, delta, width); +#endif +} + +// Variant of the (undocumented) version from the CUDA SDK, but using unsigned +// instead of float for lo and hi (which is incorrect with ftz, for example). +// See b/69446944. +__device__ inline double GpuShuffleDownSync(unsigned mask, double value, + unsigned delta, + int width = warpSize) { +#if GOOGLE_CUDA + auto tmp = __double_as_longlong(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = GpuShuffleDownSync(mask, hi, delta, width); + lo = GpuShuffleDownSync(mask, lo, delta, width); + return __longlong_as_double(static_cast(hi) << 32 | lo); +#elif TENSORFLOW_USE_ROCM + auto tmp = static_cast(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = __shfl_down(static_cast(hi), delta, width); + lo = __shfl_down(static_cast(lo), delta, width); + return static_cast(static_cast(hi) << 32 | + static_cast(lo)); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleDownSync, CudaShuffleDownSync); + +// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in +// convergence, see comment above for details. 
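// For example (a sketch, assuming a fully populated warp), a butterfly sum
// reduction that leaves the warp-wide total in every lane:
//
//   float v = ...;                         // one value per lane
//   for (int m = warpSize / 2; m > 0; m /= 2) {
//     v += GpuShuffleXorSync(kCudaWarpAll, v, m);
//   }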
+template +__device__ T GpuShuffleXorSync(unsigned mask, T value, int lane_mask, + int width = warpSize) { + assert(!(width & width - 1)); + assert(detail::GpuValidateShuffleSyncMask( + mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width))); +#if GOOGLE_CUDA + return __shfl_xor_sync(mask, value, lane_mask, width); +#elif TENSORFLOW_USE_ROCM + // ROCM TODO: check if HIP should be changed to cope with more types + return __shfl_xor(static_cast(value), lane_mask, width); +#endif +} + +#if TENSORFLOW_USE_ROCM +__device__ inline Eigen::half GpuShuffleXorSync(unsigned mask, + Eigen::half value, + int lane_mask, + int width = warpSize) { + assert(!(width & width - 1)); + assert(detail::GpuValidateShuffleSyncMask( + mask, detail::GpuShuffleXorGetSrcLane(lane_mask, width))); + // TODO(rocm): This doesn't preserve NaN payload and flushes denorms to zero, + // maybe this should be implemented differently? + return static_cast( + __shfl_xor(static_cast(value), lane_mask, width)); +} +#endif + +// Variant of the (undocumented) version from the CUDA SDK, but using unsigned +// instead of float for lo and hi (which is incorrect with ftz, for example). +// See b/69446944. +__device__ inline double GpuShuffleXorSync(unsigned mask, double value, + int lane_mask, + int width = warpSize) { +#if GOOGLE_CUDA + auto tmp = __double_as_longlong(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = GpuShuffleXorSync(mask, hi, lane_mask, width); + lo = GpuShuffleXorSync(mask, lo, lane_mask, width); + return __longlong_as_double(static_cast(hi) << 32 | lo); +#elif TENSORFLOW_USE_ROCM + auto tmp = static_cast(value); + auto lo = static_cast(tmp); + auto hi = static_cast(tmp >> 32); + hi = __shfl_xor(static_cast(hi), lane_mask, width); + lo = __shfl_xor(static_cast(lo), lane_mask, width); + return static_cast(static_cast(hi) << 32 | + static_cast(lo)); +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuShuffleXorSync, CudaShuffleXorSync); + +// Wrapper for __ldg. +template +__host__ __device__ T GpuLdg(const T* address) { +#if __CUDA_ARCH__ >= 350 + return __ldg(address); +#else + return *address; +#endif +} + +__host__ __device__ inline bool GpuLdg(const bool* address) { + return GpuLdg(reinterpret_cast(address)) != 0; +} + +__host__ __device__ inline std::complex GpuLdg( + const std::complex* address) { +#if __CUDA_ARCH__ >= 350 + float2 mem = __ldg(reinterpret_cast(address)); + return std::complex(mem.x, mem.y); +#else + return *address; +#endif +} + +__host__ __device__ inline std::complex GpuLdg( + const std::complex* address) { +#if __CUDA_ARCH__ >= 350 + double2 mem = __ldg(reinterpret_cast(address)); + return std::complex(mem.x, mem.y); +#else + return *address; +#endif +} +CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuLdg, CudaLdg); + +// Zeroes count elements starting at ptr using all threads of a 1-D grid. +// Note: this function does not synchronize, and therefore the memory range is +// not guaranteed to be zero until the next kernel launch. +template +__global__ void SetZero(const int count, T* __restrict__ ptr) { + // Check that the grid is one dimensional and index doesn't overflow. + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x); + for (int i : GpuGridRangeX(count)) { + ptr[i] = T(0); + } +} + +// Helper to set all tensor entries to a specific value. +template +__global__ void SetToValue(const int count, T* __restrict__ ptr, Tvalue value) { + // Check that the grid is one dimensional and index doesn't overflow. 
+  assert(blockDim.y == 1);
+  assert(blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : GpuGridRangeX(count)) {
+    ptr[i] = static_cast<T>(value);
+  }
+}
+
+namespace detail {
+// Helper function for atomic accumulation implemented as CAS.
+template <typename T, typename F>
+__device__ T GpuAtomicCasHelper(T* ptr, F accumulate) {
+  T old = *ptr;
+  T assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(ptr, assumed, accumulate(assumed));
+  } while (assumed != old);
+  return old;
+}
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicCasHelper, CudaAtomicCasHelper);
+
+// Overload for floating point (using integer comparison to handle NaN
+// correctly).
+template <typename F>
+__device__ float GpuAtomicCasHelper(float* ptr, F accumulate) {
+  return __int_as_float(
+      GpuAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
+        return __float_as_int(accumulate(__int_as_float(a)));
+      }));
+}
+template <typename F>
+__device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
+#if TENSORFLOW_USE_ROCM
+  // FIXME: remove the workaround below once bug is fixed.
+  // HIP has a bug in the implementation of __longlong_as_double,
+  // so work around it by using reinterpret_cast instead.
+  uint64_t result =
+      GpuAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
+                         [accumulate](tensorflow::uint64 a) {
+                           return __double_as_longlong(
+                               accumulate(*(reinterpret_cast<double*>(&a))));
+                         });
+  return *(reinterpret_cast<double*>(&result));
+#else
+  return __longlong_as_double(GpuAtomicCasHelper(
+      reinterpret_cast<tensorflow::uint64*>(ptr),
+      [accumulate](tensorflow::uint64 a) {
+        return __double_as_longlong(accumulate(__longlong_as_double(a)));
+      }));
+#endif
+}
+
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow under high concurrency, since most
+// threads will be spinning on failing their compare-and-swap tests. (The fact
+// that we get false sharing on the neighboring fp16 makes this even worse.)
+// If you are doing a large reduction, you are much better off doing the
+// intermediate steps in fp32 and then switching to fp16 as late as you can in
+// the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half GpuAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+  assert(!(intptr & 0x1));  // should be 2-aligned.
+  if (intptr & 0x2) {
+    // The half is in the second part of the uint32 (upper 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short high = static_cast<unsigned short>(arg >> 16);
+      Eigen::half acc = accumulate(Eigen::numext::bit_cast<Eigen::half>(high));
+      return (static_cast<uint32>(Eigen::numext::bit_cast<uint16>(acc)) << 16) |
+             (arg & 0xffff);
+    });
+    return Eigen::numext::bit_cast<Eigen::half>(
+        static_cast<uint16>(result >> 16));
+  } else {
+    // The half is in the first part of the uint32 (lower 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr);
+    uint32 result = GpuAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+      Eigen::half acc = accumulate(Eigen::numext::bit_cast<Eigen::half>(low));
+      return (arg & 0xffff0000) |
+             static_cast<uint32>(Eigen::numext::bit_cast<uint16>(acc));
+    });
+    return Eigen::numext::bit_cast<Eigen::half>(
+        static_cast<uint16>(result & 0xffff));
+  }
+}
+
+template <typename F>
+__device__ Eigen::bfloat16 GpuAtomicCasHelper(Eigen::bfloat16* ptr,
+                                              F accumulate) {
+  Eigen::half ret = detail::GpuAtomicCasHelper(
+      reinterpret_cast<Eigen::half*>(ptr), [accumulate](Eigen::half a) {
+        Eigen::bfloat16 acc =
+            accumulate(Eigen::numext::bit_cast<Eigen::bfloat16>(a));
+        return Eigen::numext::bit_cast<Eigen::half>(acc);
+      });
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(ret);
+}
+
+template <typename F>
+__device__ long long GpuAtomicCasHelper(long long* ptr, F accumulate) {
+  return static_cast<long long>(
+      GpuAtomicCasHelper(reinterpret_cast<unsigned long long*>(ptr),
+                         [accumulate](unsigned long long a) {
+                           return static_cast<unsigned long long>(
+                               accumulate(static_cast<long long>(a)));
+                         }));
+}
+
+template <typename From, typename To>
+using ToTypeIfConvertible =
+    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
+
+template <typename T>
+struct CudaSupportedTypeImpl {
+  using type = T;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long long> {
+  using type = unsigned long long;
+};
+
+template <>
+struct CudaSupportedTypeImpl<unsigned long> {
+  using type =
+      typename std::conditional<sizeof(unsigned long) == sizeof(unsigned int),
+                                unsigned int, unsigned long long>::type;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long> {
+  // This cast should be safe since modular (wrap-around) addition works fine.
+  // However, signed overflow is not handled correctly since it's undefined
+  // behavior.
+  using type = typename CudaSupportedTypeImpl<unsigned long>::type;
+};
+
+template <typename T>
+using CudaSupportedType = typename CudaSupportedTypeImpl<T>::type;
+
+template <typename T>
+__device__ CudaSupportedType<T>* ToCudaSupportedPtr(T* ptr) {
+  return reinterpret_cast<CudaSupportedType<T>*>(ptr);
+}
+
+}  // namespace detail
+
+// CUDA provides atomic ops, but not for all types. We provide wrappers
+// for some ops and provide implementation for all reasonable types.
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
+  return atomicAdd(detail::ToCudaSupportedPtr(ptr), value);
+}
+
+__device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
+                                           Eigen::half value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a + value; });
+}
+
+__device__ inline Eigen::bfloat16 GpuAtomicAdd(Eigen::bfloat16* ptr,
+                                               Eigen::bfloat16 value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::bfloat16 a) { return a + value; });
+}
+
+#if (__CUDA_ARCH__ < 600) || TENSORFLOW_USE_ROCM
+__device__ inline double GpuAtomicAdd(double* ptr, double value) {
+  return detail::GpuAtomicCasHelper(ptr,
+                                    [value](double a) { return a + value; });
+}
+#endif
+
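+// Illustrative note (added in review, not part of the upstream TensorFlow
+// header): the fp16/bf16 overloads above fall back to the CAS loop in
+// detail::GpuAtomicCasHelper, so large reductions are usually better off
+// accumulating per-thread partial sums in float and converting once at the
+// end. A minimal sketch with a hypothetical SumToHalf kernel:
+//
+//   __global__ void SumToHalf(const float* in, Eigen::half* out, int n) {
+//     float partial = 0.f;
+//     for (int i : GpuGridRangeX(n)) partial += in[i];
+//     GpuAtomicAdd(out, static_cast<Eigen::half>(partial));
+//   }
+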
+// GpuAtomicAdd
+// Specializations of GpuAtomicAdd for complex types, which GpuAtomicAdd does
+// not support. We treat a std::complex<T>* as a T* (the C++ standard section
+// 26.4.4 allows this explicitly) and atomic add the real and imaginary
+// components individually. The operation as a whole is not atomic, but we can
+// safely treat the components independently for the purpose of accumulating.
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+__device__ inline std::complex<float> GpuAtomicAdd(std::complex<float>* ptr,
+                                                   std::complex<float> value) {
+  auto ptr_scalar = reinterpret_cast<float*>(ptr);
+  return std::complex<float>(GpuAtomicAdd(ptr_scalar, value.real()),
+                             GpuAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+__device__ inline std::complex<double> GpuAtomicAdd(
+    std::complex<double>* ptr, std::complex<double> value) {
+  auto ptr_scalar = reinterpret_cast<double*>(ptr);
+  return std::complex<double>(GpuAtomicAdd(ptr_scalar, value.real()),
+                              GpuAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+#endif
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicAdd, CudaAtomicAdd);
+
+// GpuAtomicSub
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicSub(T* ptr, U value) {
+  return atomicSub(ptr, value);
+}
+
+// Specializations of subtraction which add the negative value.
+__device__ inline float GpuAtomicSub(float* ptr, float value) {
+  return GpuAtomicAdd(ptr, -value);
+}
+
+__device__ inline double GpuAtomicSub(double* ptr, double value) {
+  return GpuAtomicAdd(ptr, -value);
+}
+
+__device__ inline int64_t GpuAtomicSub(int64_t* ptr, int64_t value) {
+  return GpuAtomicAdd(ptr, -value);
+}
+
+__device__ inline tensorflow::uint64 GpuAtomicSub(tensorflow::uint64* ptr,
+                                                  tensorflow::uint64 value) {
+  return GpuAtomicAdd(ptr, -static_cast<int64_t>(value));
+}
+
+__device__ inline Eigen::half GpuAtomicSub(Eigen::half* ptr,
+                                           Eigen::half value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a - value; });
+}
+
+__device__ inline Eigen::bfloat16 GpuAtomicSub(Eigen::bfloat16* ptr,
+                                               Eigen::bfloat16 value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::bfloat16 a) { return a - value; });
+}
+
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);
+
+// GpuAtomicMax
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
+  return atomicMax(detail::ToCudaSupportedPtr(ptr), value);
+}
+
+#if TENSORFLOW_USE_ROCM
+
+/*
+ * CUDA runtime headers have the following defined
+ *   __device__ int max(int, int)
+ *   __device__ float max(float, float)
+ *   __device__ double max(double, double)
+ *
+ * and many others, whereas HIP runtime headers only have the "int" version.
+ *
+ * Therefore we need to special-case the ROCm version to call the correct
+ * underlying routines for float and double types.
+ *
+ */
+
+__device__ inline float GpuAtomicMax(float* ptr, float value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](float a) { return fmaxf(a, value); });
+}
+
+__device__ inline double GpuAtomicMax(double* ptr, double value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](double a) { return fmax(a, value); });
+}
+
+#else
+
+__device__ inline float GpuAtomicMax(float* ptr, float value) {
+  return detail::GpuAtomicCasHelper(ptr,
+                                    [value](float a) { return max(a, value); });
+}
+
+__device__ inline double GpuAtomicMax(double* ptr, double value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](double a) { return max(a, value); });
+}
+
+#endif
+
+__device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
+                                           Eigen::half value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return max(a, value); });
+}
+
+__device__ inline Eigen::bfloat16 GpuAtomicMax(Eigen::bfloat16* ptr,
+                                               Eigen::bfloat16 value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::bfloat16 a) { return max(a, value); });
+}
+
+#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320)
+__device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
+                                                  tensorflow::uint64 value) {
+  return detail::GpuAtomicCasHelper(
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return max(a, value); });
+}
+
+__device__ inline int64_t GpuAtomicMax(int64_t* ptr, int64_t value) {
+  return detail::GpuAtomicCasHelper(
+      detail::ToCudaSupportedPtr(ptr),
+      [value](int64_t a) { return max(a, value); });
+}
+#endif
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);
+
+// GpuAtomicMin
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
+  return atomicMin(detail::ToCudaSupportedPtr(ptr), value);
+}
+
+#if TENSORFLOW_USE_ROCM
+
+/*
+ * CUDA runtime headers have the following defined
+ *   __device__ int min(int, int)
+ *   __device__ float min(float, float)
+ *   __device__ double min(double, double)
+ *
+ * and many others, whereas HIP runtime headers only have the "int" version.
+ *
+ * Therefore we need to special-case the ROCm version to call the correct
+ * underlying routines for float and double types.
+ *
+ */
+
+__device__ inline float GpuAtomicMin(float* ptr, float value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](float a) { return fminf(a, value); });
+}
+
+__device__ inline double GpuAtomicMin(double* ptr, double value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](double a) { return fmin(a, value); });
+}
+
+#else
+
+__device__ inline float GpuAtomicMin(float* ptr, float value) {
+  return detail::GpuAtomicCasHelper(ptr,
+                                    [value](float a) { return min(a, value); });
+}
+
+__device__ inline double GpuAtomicMin(double* ptr, double value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](double a) { return min(a, value); });
+}
+
+#endif
+
+__device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
+                                           Eigen::half value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return min(a, value); });
+}
+
+__device__ inline Eigen::bfloat16 GpuAtomicMin(Eigen::bfloat16* ptr,
+                                               Eigen::bfloat16 value) {
+  return detail::GpuAtomicCasHelper(
+      ptr, [value](Eigen::bfloat16 a) { return min(a, value); });
+}
+
+#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320)
+__device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
+                                                  tensorflow::uint64 value) {
+  return detail::GpuAtomicCasHelper(
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return min(a, value); });
+}
+
+__device__ inline int64_t GpuAtomicMin(int64_t* ptr, int64_t value) {
+  return detail::GpuAtomicCasHelper(
+      detail::ToCudaSupportedPtr(ptr),
+      [value](int64_t a) { return min(a, value); });
+}
+#endif
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMin, CudaAtomicMin);
+
+// GpuAtomicMul
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMul(T* ptr, U value) {
+  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a * value; });
+}
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMul, CudaAtomicMul);
+
+// GpuAtomicDiv
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> GpuAtomicDiv(T* ptr, U value) {
+  return detail::GpuAtomicCasHelper(ptr, [value](T a) { return a / value; });
+}
+CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicDiv, CudaAtomicDiv);
+
+// Import all specialized std::complex device operators in namespace tensorflow.
+#if GOOGLE_CUDA && defined(EIGEN_USING_STD_COMPLEX_OPERATORS)
+EIGEN_USING_STD_COMPLEX_OPERATORS
+#endif  // GOOGLE_CUDA
+
+namespace functor {
+// Import all specialized std::complex device operators in namespace functor.
+#if GOOGLE_CUDA && defined(EIGEN_USING_STD_COMPLEX_OPERATORS)
+EIGEN_USING_STD_COMPLEX_OPERATORS
+#endif  // GOOGLE_CUDA
+
+// ROCm hcc (clang) has severe difficulties dealing with std::complex directly
+// due to a header issue. This template assists in casting std::complex into
+// the corresponding internal ROCm types.
+template <typename T>
+struct MapComplexToHipComplex {
+  typedef T TM;
+};
+
+#if TENSORFLOW_USE_ROCM
+template <>
+struct MapComplexToHipComplex<std::complex<float> > {
+  typedef hipFloatComplex TM;
+};
+
+template <>
+struct MapComplexToHipComplex<std::complex<double> > {
+  typedef hipDoubleComplex TM;
+};
+#endif
+};  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#endif  // TENSORFLOW_CORE_UTIL_GPU_DEVICE_FUNCTIONS_H_
diff --git a/third_party/tflite-hdrs/tensorflow/core/util/gpu_kernel_helper.h b/third_party/tflite-hdrs/tensorflow/core/util/gpu_kernel_helper.h
new file mode 100644
index 00000000..ae9894cc
--- /dev/null
+++ b/third_party/tflite-hdrs/tensorflow/core/util/gpu_kernel_helper.h
@@ -0,0 +1,524 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
+#define TENSORFLOW_CORE_UTIL_GPU_KERNEL_HELPER_H_
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
+#include <type_traits>
+
+#if GOOGLE_CUDA
+#include "third_party/gpus/cuda/include/cuda_fp16.h"
+#endif
+#include "tensorflow/core/util/gpu_cuda_alias.h"
+#include "tensorflow/core/util/gpu_device_functions.h"
+#include "tensorflow/core/util/gpu_launch_config.h"
+
+#if GOOGLE_CUDA
+#define TF_RED_WARPSIZE 32
+#elif TENSORFLOW_USE_ROCM
+#define TF_RED_WARPSIZE 64
+#endif
+
+// Deprecated, use 'for(int i : GpuGridRangeX(n))' instead.
+#define GPU_1D_KERNEL_LOOP(i, n) \
+  for (int i : ::tensorflow::GpuGridRangeX(n))
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+  for (int i : ::tensorflow::GpuGridRangeX(n))
+
+// Deprecated, use 'for(int i : GpuGridRange?(n))' instead.
+#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
+  for (int i : ::tensorflow::GpuGridRange##axis(n))
+#define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
+  for (int i : ::tensorflow::GpuGridRange##axis(n))
+
+#if GOOGLE_CUDA
+#define gpuSuccess cudaSuccess
+using gpuStream_t = cudaStream_t;
+using gpuError_t = cudaError_t;
+#elif TENSORFLOW_USE_ROCM
+#define gpuSuccess hipSuccess
+using gpuStream_t = hipStream_t;
+using gpuError_t = hipError_t;
+#endif
+
+// macro wrapper to declare dynamic shared memory
+#if GOOGLE_CUDA
+
+#define GPU_DYNAMIC_SHARED_MEM_DECL(ALIGN, TYPE, NAME) \
+  extern __shared__ __align__(ALIGN)                   \
+      TYPE NAME[]
+
+#elif TENSORFLOW_USE_ROCM
+
+#define GPU_DYNAMIC_SHARED_MEM_DECL(ALIGN, TYPE, NAME) \
+  HIP_DYNAMIC_SHARED(TYPE, NAME)
+
+#endif
+
+namespace tensorflow {
+
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+}
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+}
+#endif
+
+// Returns a raw reference to the current cuda stream. Required by a
+// number of kernel calls (for which StreamInterface* does not work),
+// e.g. CUB and certain cuBLAS primitives.
+inline gpuStream_t GetGpuStream(OpKernelContext* context) {
+  void* opaque_stream = CHECK_NOTNULL(context->op_device_context()
+                                          ->stream()
+                                          ->platform_specific_handle()
+                                          .stream);
+  return reinterpret_cast<gpuStream_t>(opaque_stream);
+}
+
+// Launches a GPU kernel through cudaLaunchKernel in CUDA environment, or
+// hipLaunchKernel in ROCm environment with the given arguments.
+//
+// The kernel parameters 'Ts' must be constructible from the arguments 'Args'.
+template <typename... Ts, typename... Args>
+Status GpuLaunchKernel(void (*function)(Ts...), dim3 grid_dim, dim3 block_dim,
+                       size_t shared_memory_size_bytes, gpuStream_t stream,
+                       Args... arguments) {
+  static_assert(detail::NoneIsReference<Ts...>(),
+                "Kernels with reference arguments have undefined behaviour.");
+  if (grid_dim.x * grid_dim.y * grid_dim.z > 0 &&
+      block_dim.x * block_dim.y * block_dim.z > 0) {
+#if GOOGLE_CUDA
+    auto func_ptr = absl::bit_cast<const void*>(function);
+    // Cast arguments and forward them as an array of pointers.
+    auto args_tuple = std::tuple<Ts...>(arguments...);
+    auto arg_ptrs = detail::GetArrayOfElementPointers(&args_tuple);
+    auto result =
+        cudaLaunchKernel(func_ptr, grid_dim, block_dim, arg_ptrs.data(),
+                         shared_memory_size_bytes, stream);
+    if (result != cudaSuccess) {
+      return errors::Internal(cudaGetErrorString(result));
+    }
+#elif TENSORFLOW_USE_ROCM
+    hipLaunchKernelGGL(function, grid_dim, block_dim, shared_memory_size_bytes,
+                       stream, std::forward<Args>(arguments)...);
+    TF_RETURN_IF_CUDA_ERROR(hipGetLastError());
+#endif
+  }
+  return OkStatus();
+}
+
+// Perfect forwarding to make CudaLaunchKernel available to both ROCm and CUDA
+// builds
+template <typename... Args>
+auto CudaLaunchKernel(Args&&... args)
+    -> decltype(GpuLaunchKernel(std::forward<Args>(args)...)) {
+  return GpuLaunchKernel(std::forward<Args>(args)...);
+}
+
+__host__ __device__ inline tensorflow::bfloat16 GpuLdg(
+    const tensorflow::bfloat16* address) {
+  return Eigen::numext::bit_cast<tensorflow::bfloat16>(
+      GpuLdg(reinterpret_cast<const uint16_t*>(address)));
+}
+// Already aliased in gpu_device_functions.h
+
+template <typename T>
+__host__ __device__ inline T ldg(const T* ptr) {
+  return GpuLdg(ptr);
+}
+
+template <typename T>
+__host__ __device__ inline const T& tf_min(const T& x, const T& y) {
+  return x < y ? x : y;
+}
+
+template <typename T>
+__host__ __device__ inline const T& tf_max(const T& x, const T& y) {
+  return x < y ? y : x;
+}
+
+// Overloads of the above functions for float and double.
+__host__ __device__ inline float tf_min(float x, float y) {
+  return fminf(x, y);
+}
+__host__ __device__ inline double tf_min(double x, double y) {
+  return fmin(x, y);
+}
+__host__ __device__ inline float tf_max(float x, float y) {
+  return fmaxf(x, y);
+}
+__host__ __device__ inline double tf_max(double x, double y) {
+  return fmax(x, y);
+}
+
+#ifdef _MSC_VER
+#if _MSC_VER >= 1930
+using std::max;
+using std::min;
+__host__ __device__ inline int tf_min(int x, int y) { return min(x, y); }
+__host__ __device__ inline int tf_max(int x, int y) { return max(x, y); }
+#endif
+#endif
+
+// ROCM TODO re-enable them after adding fp16 support logic
+#if GOOGLE_CUDA
+__device__ inline Eigen::half GpuShuffleSync(unsigned mask, Eigen::half value,
+                                             int src_lane,
+                                             int width = warpSize) {
+  return Eigen::half(
+      GpuShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
+}
+// Aliased in gpu_device_functions.h
+
+__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleUpSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      GpuShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
+}
+// Aliased in gpu_device_functions.h
+
+__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleDownSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      GpuShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
+}
+// Aliased in gpu_device_functions.h
+
+__device__ EIGEN_ALWAYS_INLINE Eigen::half GpuShuffleXorSync(
+    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
+  return Eigen::half(
+      GpuShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
+}
+// Aliased in gpu_device_functions.h
+#endif
+
+#ifdef __CUDA_ARCH__
+#define UNROLL_ON_DEVICE _Pragma("unroll")
+#else
+#define UNROLL_ON_DEVICE
+#endif
+
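+// Illustrative note (added in review, not part of the upstream TensorFlow
+// header): the AlignedVector class below is typically used by reinterpreting
+// suitably aligned pointers so the compiler emits vectorized loads/stores.
+// A sketch with hypothetical 'in', 'out', and 'n' (n a multiple of 4, both
+// pointers 16-byte aligned), inside a __global__ kernel:
+//
+//   using Vec4 = AlignedVector<float, 4>;
+//   const Vec4* in4 = reinterpret_cast<const Vec4*>(in);
+//   Vec4* out4 = reinterpret_cast<Vec4*>(out);
+//   for (int i : GpuGridRangeX(n / 4)) out4[i] = in4[i];
+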
+// Represents an aligned array of N elements of T. Data pointers can be
+// reinterpreted as this type to generate vectorized loads/stores in a kernel.
+template <typename T, int N>
+class alignas(alignof(T) * N) AlignedVector {
+ public:
+  typedef T value_type;
+  static constexpr const int kSize = N;
+
+  AlignedVector() = default;
+
+  // Uniform initialization.
+  __host__ __device__ explicit AlignedVector(value_type uniform) {
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = uniform; }
+  }
+  // Uniform initialization with explicit conversion.
+  // Note: This is required for T=Eigen::half because it only supports explicit
+  // conversions from other types and its template constructor is too relaxed
+  // to be able to use std::is_constructible.
+  template <typename U, typename std::enable_if<std::is_arithmetic<U>::value,
+                                                int>::type = 0>
+  __host__ __device__ explicit AlignedVector(U uniform_u) {
+    value_type uniform(uniform_u);
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = uniform; }
+  }
+  // Implicit conversion.
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value, int>::type = 0>
+  __host__ __device__ AlignedVector(const AlignedVector<U, N>& other) {
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] = other[i]; }
+  }
+  // Explicit conversion.
+  template <typename U, typename std::enable_if<
+                            !std::is_convertible<U, T>::value &&
+                                std::is_constructible<T, U>::value,
+                            int>::type = 0>
+  __host__ __device__ explicit AlignedVector(const AlignedVector<U, N>& other) {
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) {
+      values_[i] = T(other[i]);
+    }
+  }
+
+  __host__ __device__ value_type& operator[](int i) { return values_[i]; }
+  __host__ __device__ const value_type& operator[](int i) const {
+    return values_[i];
+  }
+
+#define DEFINE_BINARY_UPDATE_OPERATOR(op)                                      \
+  __host__ __device__ AlignedVector& operator op(const AlignedVector& rhs) {   \
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) { values_[i] op rhs[i]; } \
+    return *this;                                                              \
+  }
+  DEFINE_BINARY_UPDATE_OPERATOR(+=)
+  DEFINE_BINARY_UPDATE_OPERATOR(-=)
+  DEFINE_BINARY_UPDATE_OPERATOR(*=)
+  DEFINE_BINARY_UPDATE_OPERATOR(/=)
+#undef DEFINE_BINARY_UPDATE_OPERATOR
+
+#define DEFINE_BINARY_OPERATOR(op)                          \
+  friend __host__ __device__ AlignedVector operator op(     \
+      const AlignedVector& lhs, const AlignedVector& rhs) { \
+    AlignedVector ret;                                      \
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) {      \
+      ret[i] = lhs[i] op rhs[i];                            \
+    }                                                       \
+    return ret;                                             \
+  }
+  DEFINE_BINARY_OPERATOR(+)
+  DEFINE_BINARY_OPERATOR(-)
+  DEFINE_BINARY_OPERATOR(*)
+  DEFINE_BINARY_OPERATOR(/)
+#undef DEFINE_BINARY_OPERATOR
+
+#define DEFINE_BINARY_FUNCTION(func)                                        \
+  friend __host__ __device__ AlignedVector func(const AlignedVector& lhs,   \
+                                                const AlignedVector& rhs) { \
+    AlignedVector ret;                                                      \
+    UNROLL_ON_DEVICE for (int i = 0; i < kSize; ++i) {                      \
+      ret[i] = func(lhs[i], rhs[i]);                                        \
+    }                                                                       \
+    return ret;                                                             \
+  }
+  DEFINE_BINARY_FUNCTION(min)
+  DEFINE_BINARY_FUNCTION(max)
+#undef DEFINE_BINARY_FUNCTION
+
+ private:
+  value_type values_[N];
+};
+
+#undef UNROLL_ON_DEVICE
+
+// Returns the maximum power-of-two alignment (in units of elements, not bytes)
+// of a stride or pointer value.
+inline int64_t alignment_of(int64_t element_stride) {
+  // A zero/nullptr value means that the stride/pointer is not used, so it
+  // effectively has infinite alignment.
+  constexpr int64_t kMaxAlignment = 512;
+  if (element_stride == 0) return kMaxAlignment;
+  return element_stride & -element_stride;
+}
+
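+// Illustrative note (added in review, not part of the upstream TensorFlow
+// header): the alignment helpers here (alignment_of above, MinAlignmentOf
+// below) are typically combined to choose the widest safe vector width,
+// e.g. with hypothetical 'in_ptr' and 'out_ptr':
+//
+//   int64_t vec_size = std::min<int64_t>(4, MinAlignmentOf(in_ptr, out_ptr));
+//   // vec_size is now 1, 2 or 4 depending on the pointer alignments.
+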
+template <typename T>
+inline int64_t alignment_of(T* ptr) {
+  const intptr_t ptr_val = reinterpret_cast<intptr_t>(ptr);
+  // Pointers should always be aligned to sizeof(T) bytes.
+  DCHECK_EQ(ptr_val % sizeof(T), 0);
+  // Note that we want the alignment in elements, not bytes.
+  return alignment_of(ptr_val / sizeof(T));
+}
+
+template <typename... Args>
+int64_t MinAlignmentOf(Args... args) {
+  return std::min({alignment_of(args)...});
+}
+
+namespace detail {
+
+template <int VecSize, template <int vec_size> class Functor>
+struct DispatchToVectorizedHelper {
+  template <typename... Args>
+  Status operator()(int64_t max_vec_size, Args&&... args) const {
+    if (max_vec_size >= VecSize) {
+      return Functor<VecSize>()(std::forward<Args>(args)...);
+    }
+    return DispatchToVectorizedHelper<VecSize / 2, Functor>()(
+        max_vec_size, std::forward<Args>(args)...);
+  }
+};
+template